Diffstat (limited to 'src/spdk/lib')
268 files changed, 152758 insertions, 0 deletions
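The largest functional addition below is the accel framework in src/spdk/lib/accel. As a quick orientation before the per-file diffs, here is a minimal caller sketch built only from the public functions this diff declares and exports (spdk_accel_engine_get_io_channel, spdk_accel_submit_copy, and the symbols in spdk_accel.map). The header name spdk/accel_engine.h, the function names, and the logging are illustrative assumptions and not part of the change; a real caller must also keep the io_channel alive until the completion callback fires.

/* Hypothetical usage sketch (not part of this diff): submit one copy through
 * the accel framework added in lib/accel/accel_engine.c. If a hardware engine
 * (e.g. ioat) has registered, the copy is offloaded; otherwise the framework
 * falls back to its built-in software memcpy path.
 */
#include "spdk/stdinc.h"
#include "spdk/accel_engine.h"   /* assumed public header for these prototypes */
#include "spdk/log.h"

static void
example_copy_done(void *cb_arg, int status)
{
	/* status is 0 on success for the copy operation */
	SPDK_NOTICELOG("accel copy finished, status %d\n", status);
}

/* accel_ch must come from spdk_accel_engine_get_io_channel() on an SPDK thread
 * and must remain held until example_copy_done() runs.
 */
static int
example_submit_copy(struct spdk_io_channel *accel_ch, void *dst, void *src, uint64_t nbytes)
{
	/* Returns -ENOMEM when the internal accel task pool is exhausted. */
	return spdk_accel_submit_copy(accel_ch, dst, src, nbytes, example_copy_done, NULL);
}

The same pattern applies to the fill, compare, dualcast, and crc32c submit calls exported from the same file.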
diff --git a/src/spdk/lib/Makefile b/src/spdk/lib/Makefile new file mode 100644 index 000000000..4c0c383eb --- /dev/null +++ b/src/spdk/lib/Makefile @@ -0,0 +1,65 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.lib_deps.mk + +DIRS-y += bdev blob blobfs conf accel event json jsonrpc \ + log log_rpc lvol net rpc sock thread trace util nvme vmd nvmf scsi \ + ioat ut_mock iscsi notify +ifeq ($(OS),Linux) +DIRS-y += nbd ftl +endif + +DIRS-$(CONFIG_OCF) += env_ocf +DIRS-$(CONFIG_IDXD) += idxd +DIRS-$(CONFIG_VHOST) += vhost +DIRS-$(CONFIG_VIRTIO) += virtio +DIRS-$(CONFIG_REDUCE) += reduce +DIRS-$(CONFIG_VHOST_INTERNAL_LIB) += rte_vhost +DIRS-$(CONFIG_RDMA) += rdma + +# If CONFIG_ENV is pointing at a directory in lib, build it. +# Out-of-tree env implementations must be built separately by the user. +ENV_NAME := $(notdir $(CONFIG_ENV)) +ifeq ($(abspath $(CONFIG_ENV)),$(SPDK_ROOT_DIR)/lib/$(ENV_NAME)) +DIRS-y += $(ENV_NAME) +endif + +.PHONY: all clean $(DIRS-y) + +all: $(DIRS-y) +clean: $(DIRS-y) + +include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk diff --git a/src/spdk/lib/accel/Makefile b/src/spdk/lib/accel/Makefile new file mode 100644 index 000000000..0d41104de --- /dev/null +++ b/src/spdk/lib/accel/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 +SO_SUFFIX := $(SO_VER).$(SO_MINOR) + +LIBNAME = accel +C_SRCS = accel_engine.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_accel.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/accel/accel_engine.c b/src/spdk/lib/accel/accel_engine.c new file mode 100644 index 000000000..03a405439 --- /dev/null +++ b/src/spdk/lib/accel/accel_engine.c @@ -0,0 +1,1044 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/accel_engine.h" + +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/crc32.h" + +/* Accelerator Engine Framework: The following provides a top level + * generic API for the accelerator functions defined here. 
Modules, + * such as the one in /module/accel/ioat, supply the implemention of + * with the exception of the pure software implemention contained + * later in this file. + */ + +#define ALIGN_4K 0x1000 +#define SPDK_ACCEL_NUM_TASKS 0x4000 + +static struct spdk_mempool *g_accel_task_pool; + +/* Largest context size for all accel modules */ +static size_t g_max_accel_module_size = 0; + +static struct spdk_accel_engine *g_hw_accel_engine = NULL; +static struct spdk_accel_engine *g_sw_accel_engine = NULL; +static struct spdk_accel_module_if *g_accel_engine_module = NULL; +static spdk_accel_fini_cb g_fini_cb_fn = NULL; +static void *g_fini_cb_arg = NULL; + +/* Global list of registered accelerator modules */ +static TAILQ_HEAD(, spdk_accel_module_if) spdk_accel_module_list = + TAILQ_HEAD_INITIALIZER(spdk_accel_module_list); + +struct accel_io_channel { + struct spdk_accel_engine *engine; + struct spdk_io_channel *ch; +}; + +/* Forward declarations of software implementations used when an + * engine has not implemented the capability. + */ +static int sw_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg); +static int sw_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, + uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, + void *cb_arg); + +/* Registration of hw modules (currently supports only 1 at a time) */ +void +spdk_accel_hw_engine_register(struct spdk_accel_engine *accel_engine) +{ + if (g_hw_accel_engine == NULL) { + g_hw_accel_engine = accel_engine; + } else { + SPDK_NOTICELOG("Hardware offload engine already enabled\n"); + } +} + +/* Registration of sw modules (currently supports only 1) */ +static void +accel_sw_register(struct spdk_accel_engine *accel_engine) +{ + assert(g_sw_accel_engine == NULL); + g_sw_accel_engine = accel_engine; +} + +static void +accel_sw_unregister(void) +{ + g_sw_accel_engine = NULL; +} + +/* Common completion routine, called only by the accel framework */ +static void +_accel_engine_done(void *ref, int status) +{ + struct spdk_accel_task *req = (struct spdk_accel_task *)ref; + + req->cb(req->cb_arg, status); + spdk_mempool_put(g_accel_task_pool, req); +} + +uint64_t +spdk_accel_get_capabilities(struct spdk_io_channel *ch) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + + /* All engines are required to implement this API. */ + return accel_ch->engine->get_capabilities(); +} + +/* Accel framework public API for copy function */ +int +spdk_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + /* If the engine does not support it, fallback to the sw implementation. 
*/ + if (accel_ch->engine->copy) { + return accel_ch->engine->copy(accel_ch->ch, dst, src, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } else { + return sw_accel_submit_copy(accel_ch->ch, dst, src, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } +} + +/* Accel framework public API for dual cast copy function */ +int +spdk_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + /* If the engine does not support it, fallback to the sw implementation. */ + if (accel_ch->engine->dualcast) { + return accel_ch->engine->dualcast(accel_ch->ch, dst1, dst2, src, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } else { + return sw_accel_submit_dualcast(accel_ch->ch, dst1, dst2, src, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } +} + +/* Accel framework public API for batch_create function. All engines are + * required to implement this API. + */ +struct spdk_accel_batch * +spdk_accel_batch_create(struct spdk_io_channel *ch) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + + return accel_ch->engine->batch_create(accel_ch->ch); +} + +/* Accel framework public API for batch_submit function. All engines are + * required to implement this API. + */ +int +spdk_accel_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_submit(accel_ch->ch, batch, _accel_engine_done, + accel_req->offload_ctx); +} + +/* Accel framework public API for getting max batch. All engines are + * required to implement this API. + */ +uint32_t +spdk_accel_batch_get_max(struct spdk_io_channel *ch) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + + return accel_ch->engine->batch_get_max(); +} + +/* Accel framework public API for for when an app is unable to complete a batch sequence, + * it cancels with this API. + */ +int +spdk_accel_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + + return accel_ch->engine->batch_cancel(accel_ch->ch, batch); +} + +/* Accel framework public API for batch prep_copy function. All engines are + * required to implement this API. 
+ */ +int +spdk_accel_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst, + void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_prep_copy(accel_ch->ch, batch, dst, src, nbytes, + _accel_engine_done, accel_req->offload_ctx); +} + +/* Accel framework public API for batch prep_dualcast function. All engines are + * required to implement this API. + */ +int +spdk_accel_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst1, void *dst2, void *src, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_prep_dualcast(accel_ch->ch, batch, dst1, dst2, src, + nbytes, _accel_engine_done, accel_req->offload_ctx); +} + +/* Accel framework public API for batch prep_compare function. All engines are + * required to implement this API. + */ +int +spdk_accel_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *src1, void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, + void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_prep_compare(accel_ch->ch, batch, src1, src2, nbytes, + _accel_engine_done, accel_req->offload_ctx); +} + +/* Accel framework public API for batch prep_fill function. All engines are + * required to implement this API. + */ +int +spdk_accel_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst, + uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_prep_fill(accel_ch->ch, batch, dst, fill, nbytes, + _accel_engine_done, accel_req->offload_ctx); +} + +/* Accel framework public API for batch prep_crc32c function. All engines are + * required to implement this API. 
+ */ +int +spdk_accel_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + return accel_ch->engine->batch_prep_crc32c(accel_ch->ch, batch, dst, src, seed, nbytes, + _accel_engine_done, accel_req->offload_ctx); +} + +/* Accel framework public API for compare function */ +int +spdk_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + /* If the engine does not support it, fallback to the sw implementation. */ + if (accel_ch->engine->compare) { + return accel_ch->engine->compare(accel_ch->ch, src1, src2, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } else { + return sw_accel_submit_compare(accel_ch->ch, src1, src2, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } +} + +/* Accel framework public API for fill function */ +int +spdk_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, uint64_t nbytes, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + /* If the engine does not support it, fallback to the sw implementation. */ + if (accel_ch->engine->fill) { + return accel_ch->engine->fill(accel_ch->ch, dst, fill, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } else { + return sw_accel_submit_fill(accel_ch->ch, dst, fill, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } +} + +/* Accel framework public API for CRC-32C function */ +int +spdk_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, uint32_t seed, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool); + + if (accel_req == NULL) { + SPDK_ERRLOG("Unable to get an accel task.\n"); + return -ENOMEM; + } + + accel_req->cb = cb_fn; + accel_req->cb_arg = cb_arg; + + /* If the engine does not support it, fallback to the sw implementation. */ + if (accel_ch->engine->crc32c) { + return accel_ch->engine->crc32c(accel_ch->ch, dst, src, seed, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } else { + return sw_accel_submit_crc32c(accel_ch->ch, dst, src, seed, nbytes, + _accel_engine_done, accel_req->offload_ctx); + } +} + +/* Helper function when when accel modules register with the framework. 
*/ +void spdk_accel_module_list_add(struct spdk_accel_module_if *accel_module) +{ + TAILQ_INSERT_TAIL(&spdk_accel_module_list, accel_module, tailq); + if (accel_module->get_ctx_size && accel_module->get_ctx_size() > g_max_accel_module_size) { + g_max_accel_module_size = accel_module->get_ctx_size(); + } +} + +/* Framework level channel create callback. */ +static int +accel_engine_create_cb(void *io_device, void *ctx_buf) +{ + struct accel_io_channel *accel_ch = ctx_buf; + + if (g_hw_accel_engine != NULL) { + accel_ch->ch = g_hw_accel_engine->get_io_channel(); + if (accel_ch->ch != NULL) { + accel_ch->engine = g_hw_accel_engine; + return 0; + } + } + + /* No hw engine enabled, use sw. */ + accel_ch->ch = g_sw_accel_engine->get_io_channel(); + assert(accel_ch->ch != NULL); + accel_ch->engine = g_sw_accel_engine; + return 0; +} + +/* Framework level channel destroy callback. */ +static void +accel_engine_destroy_cb(void *io_device, void *ctx_buf) +{ + struct accel_io_channel *accel_ch = ctx_buf; + + spdk_put_io_channel(accel_ch->ch); +} + +struct spdk_io_channel * +spdk_accel_engine_get_io_channel(void) +{ + return spdk_get_io_channel(&spdk_accel_module_list); +} + +static void +accel_engine_module_initialize(void) +{ + struct spdk_accel_module_if *accel_engine_module; + char task_pool_name[30]; + + TAILQ_FOREACH(accel_engine_module, &spdk_accel_module_list, tailq) { + accel_engine_module->module_init(); + } + + snprintf(task_pool_name, sizeof(task_pool_name), "accel_task_pool"); + g_accel_task_pool = spdk_mempool_create(task_pool_name, + SPDK_ACCEL_NUM_TASKS, + g_max_accel_module_size, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + assert(g_accel_task_pool); + +} + +int +spdk_accel_engine_initialize(void) +{ + SPDK_NOTICELOG("Accel engine initialized to use software engine.\n"); + accel_engine_module_initialize(); + + /* + * We need a unique identifier for the accel engine framework, so use the + * spdk_accel_module_list address for this purpose. + */ + spdk_io_device_register(&spdk_accel_module_list, accel_engine_create_cb, accel_engine_destroy_cb, + sizeof(struct accel_io_channel), "accel_module"); + + return 0; +} + +static void +accel_engine_module_finish_cb(void) +{ + spdk_accel_fini_cb cb_fn = g_fini_cb_fn; + + cb_fn(g_fini_cb_arg); + g_fini_cb_fn = NULL; + g_fini_cb_arg = NULL; +} + +void +spdk_accel_write_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_accel_module_if *accel_engine_module; + + /* + * The accel engine has no config, there may be some in + * the modules though. 
+ */ + spdk_json_write_array_begin(w); + TAILQ_FOREACH(accel_engine_module, &spdk_accel_module_list, tailq) { + if (accel_engine_module->write_config_json) { + accel_engine_module->write_config_json(w); + } + } + spdk_json_write_array_end(w); +} + +void +spdk_accel_engine_module_finish(void) +{ + if (!g_accel_engine_module) { + g_accel_engine_module = TAILQ_FIRST(&spdk_accel_module_list); + } else { + g_accel_engine_module = TAILQ_NEXT(g_accel_engine_module, tailq); + } + + if (!g_accel_engine_module) { + accel_engine_module_finish_cb(); + return; + } + + if (g_accel_engine_module->module_fini) { + spdk_thread_send_msg(spdk_get_thread(), g_accel_engine_module->module_fini, NULL); + } else { + spdk_accel_engine_module_finish(); + } +} + +void +spdk_accel_engine_finish(spdk_accel_fini_cb cb_fn, void *cb_arg) +{ + assert(cb_fn != NULL); + + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + spdk_io_device_unregister(&spdk_accel_module_list, NULL); + spdk_accel_engine_module_finish(); + spdk_mempool_free(g_accel_task_pool); +} + +void +spdk_accel_engine_config_text(FILE *fp) +{ + struct spdk_accel_module_if *accel_engine_module; + + TAILQ_FOREACH(accel_engine_module, &spdk_accel_module_list, tailq) { + if (accel_engine_module->config_text) { + accel_engine_module->config_text(fp); + } + } +} + +/* + * The SW Accelerator module is "built in" here (rest of file) + */ + +#define SW_ACCEL_BATCH_SIZE 2048 + +enum sw_accel_opcode { + SW_ACCEL_OPCODE_MEMMOVE = 0, + SW_ACCEL_OPCODE_MEMFILL = 1, + SW_ACCEL_OPCODE_COMPARE = 2, + SW_ACCEL_OPCODE_CRC32C = 3, + SW_ACCEL_OPCODE_DUALCAST = 4, +}; + +struct sw_accel_op { + struct sw_accel_io_channel *sw_ch; + void *cb_arg; + spdk_accel_completion_cb cb_fn; + void *src; + union { + void *dst; + void *src2; + }; + void *dst2; + uint32_t seed; + uint64_t fill_pattern; + enum sw_accel_opcode op_code; + uint64_t nbytes; + TAILQ_ENTRY(sw_accel_op) link; +}; + +/* The sw accel engine only supports one outstanding batch at a time. */ +struct sw_accel_io_channel { + TAILQ_HEAD(, sw_accel_op) op_pool; + TAILQ_HEAD(, sw_accel_op) batch; +}; + +static uint64_t +sw_accel_get_capabilities(void) +{ + return ACCEL_COPY | ACCEL_FILL | ACCEL_CRC32C | ACCEL_COMPARE | + ACCEL_DUALCAST | ACCEL_BATCH; +} + +static uint32_t +sw_accel_batch_get_max(void) +{ + return SW_ACCEL_BATCH_SIZE; +} + +/* The sw engine plug-in does not ahve a public API, it is only callable + * from the accel fw and thus does not need to have its own struct definition + * of a batch, it just simply casts the address of the single supported batch + * as the struct spdk_accel_batch pointer. 
+ */ +static struct spdk_accel_batch * +sw_accel_batch_start(struct spdk_io_channel *ch) +{ + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + if (!TAILQ_EMPTY(&sw_ch->batch)) { + SPDK_ERRLOG("SW accel engine only supports one batch at a time.\n"); + return NULL; + } + + return (struct spdk_accel_batch *)&sw_ch->batch; +} + +static struct sw_accel_op * +_prep_op(struct sw_accel_io_channel *sw_ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + + if ((struct spdk_accel_batch *)&sw_ch->batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return NULL; + } + + if (!TAILQ_EMPTY(&sw_ch->op_pool)) { + op = TAILQ_FIRST(&sw_ch->op_pool); + TAILQ_REMOVE(&sw_ch->op_pool, op, link); + } else { + SPDK_ERRLOG("Ran out of operations for batch\n"); + return NULL; + } + + op->cb_arg = cb_arg; + op->cb_fn = cb_fn; + op->sw_ch = sw_ch; + + return op; +} + +static int +sw_accel_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->src = src; + op->dst = dst; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_MEMMOVE; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst1, + void *dst2, + void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->src = src; + op->dst = dst1; + op->dst2 = dst2; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_DUALCAST; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *src1, + void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->src = src1; + op->src2 = src2; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_COMPARE; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst, + uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. 
*/ + op->dst = dst; + op->fill_pattern = fill; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_MEMFILL; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + +static int +sw_accel_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + uint32_t *dst, + void *src, uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + op = _prep_op(sw_ch, batch, cb_fn, cb_arg); + if (op == NULL) { + return -EINVAL; + } + + /* Command specific. */ + op->dst = (void *)dst; + op->src = src; + op->seed = seed; + op->nbytes = nbytes; + op->op_code = SW_ACCEL_OPCODE_CRC32C; + TAILQ_INSERT_TAIL(&sw_ch->batch, op, link); + + return 0; +} + + +static int +sw_accel_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + + if ((struct spdk_accel_batch *)&sw_ch->batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return -EINVAL; + } + + /* Cancel the batch items by moving them back to the op_pool. */ + while ((op = TAILQ_FIRST(&sw_ch->batch))) { + TAILQ_REMOVE(&sw_ch->batch, op, link); + TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link); + } + + return 0; +} + +static int +sw_accel_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, + spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct sw_accel_op *op; + struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch); + struct spdk_accel_task *accel_req; + int batch_status = 0, cmd_status = 0; + + if ((struct spdk_accel_batch *)&sw_ch->batch != batch) { + SPDK_ERRLOG("Invalid batch\n"); + return -EINVAL; + } + + /* Complete the batch items. */ + while ((op = TAILQ_FIRST(&sw_ch->batch))) { + TAILQ_REMOVE(&sw_ch->batch, op, link); + accel_req = (struct spdk_accel_task *)((uintptr_t)op->cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + + switch (op->op_code) { + case SW_ACCEL_OPCODE_MEMMOVE: + memcpy(op->dst, op->src, op->nbytes); + break; + case SW_ACCEL_OPCODE_DUALCAST: + memcpy(op->dst, op->src, op->nbytes); + memcpy(op->dst2, op->src, op->nbytes); + break; + case SW_ACCEL_OPCODE_COMPARE: + cmd_status = memcmp(op->src, op->src2, op->nbytes); + break; + case SW_ACCEL_OPCODE_MEMFILL: + memset(op->dst, op->fill_pattern, op->nbytes); + break; + case SW_ACCEL_OPCODE_CRC32C: + *(uint32_t *)op->dst = spdk_crc32c_update(op->src, op->nbytes, ~op->seed); + break; + default: + assert(false); + break; + } + + batch_status |= cmd_status; + op->cb_fn(accel_req, cmd_status); + TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link); + } + + /* Now complete the batch request itself. 
*/ + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, batch_status); + + return 0; +} + +static int +sw_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_req; + + memcpy(dst, src, (size_t)nbytes); + + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, 0); + return 0; +} + +static int +sw_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, + void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_req; + + memcpy(dst1, src, (size_t)nbytes); + memcpy(dst2, src, (size_t)nbytes); + + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, 0); + return 0; +} + +static int +sw_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_req; + int result; + + result = memcmp(src1, src2, (size_t)nbytes); + + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, result); + + return 0; +} + +static int +sw_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, + uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_req; + + memset(dst, fill, nbytes); + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, 0); + + return 0; +} + +static int +sw_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, + uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_accel_task *accel_req; + + *dst = spdk_crc32c_update(src, nbytes, ~seed); + accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_accel_task, offload_ctx)); + cb_fn(accel_req, 0); + + return 0; +} + +static struct spdk_io_channel *sw_accel_get_io_channel(void); + +static struct spdk_accel_engine sw_accel_engine = { + .get_capabilities = sw_accel_get_capabilities, + .copy = sw_accel_submit_copy, + .dualcast = sw_accel_submit_dualcast, + .batch_get_max = sw_accel_batch_get_max, + .batch_create = sw_accel_batch_start, + .batch_cancel = sw_accel_batch_cancel, + .batch_prep_copy = sw_accel_batch_prep_copy, + .batch_prep_dualcast = sw_accel_batch_prep_dualcast, + .batch_prep_compare = sw_accel_batch_prep_compare, + .batch_prep_fill = sw_accel_batch_prep_fill, + .batch_prep_crc32c = sw_accel_batch_prep_crc32c, + .batch_submit = sw_accel_batch_submit, + .compare = sw_accel_submit_compare, + .fill = sw_accel_submit_fill, + .crc32c = sw_accel_submit_crc32c, + .get_io_channel = sw_accel_get_io_channel, +}; + +static int +sw_accel_create_cb(void *io_device, void *ctx_buf) +{ + struct sw_accel_io_channel *sw_ch = ctx_buf; + struct sw_accel_op *op; + int i; + + TAILQ_INIT(&sw_ch->batch); + + TAILQ_INIT(&sw_ch->op_pool); + for (i = 0 ; i < SW_ACCEL_BATCH_SIZE ; i++) { + op = calloc(1, sizeof(struct sw_accel_op)); + if (op == NULL) { + SPDK_ERRLOG("Failed to allocate operation for batch.\n"); + while ((op = TAILQ_FIRST(&sw_ch->op_pool))) { + TAILQ_REMOVE(&sw_ch->op_pool, op, link); + free(op); + } + return -ENOMEM; + } + 
TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link); + } + + return 0; +} + +static void +sw_accel_destroy_cb(void *io_device, void *ctx_buf) +{ + struct sw_accel_io_channel *sw_ch = ctx_buf; + struct sw_accel_op *op; + + while ((op = TAILQ_FIRST(&sw_ch->op_pool))) { + TAILQ_REMOVE(&sw_ch->op_pool, op, link); + free(op); + } +} + +static struct spdk_io_channel *sw_accel_get_io_channel(void) +{ + return spdk_get_io_channel(&sw_accel_engine); +} + +static size_t +sw_accel_engine_get_ctx_size(void) +{ + return sizeof(struct spdk_accel_task); +} + +static int +sw_accel_engine_init(void) +{ + accel_sw_register(&sw_accel_engine); + spdk_io_device_register(&sw_accel_engine, sw_accel_create_cb, sw_accel_destroy_cb, + sizeof(struct sw_accel_io_channel), "sw_accel_engine"); + + return 0; +} + +static void +sw_accel_engine_fini(void *ctxt) +{ + spdk_io_device_unregister(&sw_accel_engine, NULL); + accel_sw_unregister(); + + spdk_accel_engine_module_finish(); +} + +SPDK_ACCEL_MODULE_REGISTER(sw_accel_engine_init, sw_accel_engine_fini, + NULL, NULL, sw_accel_engine_get_ctx_size) diff --git a/src/spdk/lib/accel/spdk_accel.map b/src/spdk/lib/accel/spdk_accel.map new file mode 100644 index 000000000..bfccf0a90 --- /dev/null +++ b/src/spdk/lib/accel/spdk_accel.map @@ -0,0 +1,33 @@ +{ + global: + + # public functions + spdk_accel_engine_initialize; + spdk_accel_engine_finish; + spdk_accel_engine_config_text; + spdk_accel_engine_module_finish; + spdk_accel_engine_get_io_channel; + spdk_accel_get_capabilities; + spdk_accel_batch_get_max; + spdk_accel_batch_create; + spdk_accel_batch_prep_copy; + spdk_accel_batch_prep_dualcast; + spdk_accel_batch_prep_compare; + spdk_accel_batch_prep_fill; + spdk_accel_batch_prep_crc32c; + spdk_accel_batch_submit; + spdk_accel_batch_cancel; + spdk_accel_submit_copy; + spdk_accel_submit_dualcast; + spdk_accel_submit_compare; + spdk_accel_submit_fill; + spdk_accel_submit_crc32c; + spdk_accel_write_config_json; + + # functions needed by modules + spdk_accel_hw_engine_register; + spdk_accel_module_list_add; + + + local: *; +}; diff --git a/src/spdk/lib/bdev/Makefile b/src/spdk/lib/bdev/Makefile new file mode 100644 index 000000000..ca0bf992a --- /dev/null +++ b/src/spdk/lib/bdev/Makefile @@ -0,0 +1,50 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +ifeq ($(CONFIG_VTUNE),y) +CFLAGS += -I$(CONFIG_VTUNE_DIR)/include -I$(CONFIG_VTUNE_DIR)/sdk/src/ittnotify +endif + +C_SRCS = bdev.c bdev_rpc.c bdev_zone.c part.c scsi_nvme.c +C_SRCS-$(CONFIG_VTUNE) += vtune.c +LIBNAME = bdev + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_bdev.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/bdev.c b/src/spdk/lib/bdev/bdev.c new file mode 100644 index 000000000..af8c05aaa --- /dev/null +++ b/src/spdk/lib/bdev/bdev.c @@ -0,0 +1,6763 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" + +#include "spdk/config.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/nvme_spec.h" +#include "spdk/scsi_spec.h" +#include "spdk/notify.h" +#include "spdk/util.h" +#include "spdk/trace.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" + +#include "bdev_internal.h" + +#ifdef SPDK_CONFIG_VTUNE +#include "ittnotify.h" +#include "ittnotify_types.h" +int __itt_init_ittlib(const char *, __itt_group_id); +#endif + +#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) +#define SPDK_BDEV_IO_CACHE_SIZE 256 +#define SPDK_BDEV_AUTO_EXAMINE true +#define BUF_SMALL_POOL_SIZE 8191 +#define BUF_LARGE_POOL_SIZE 1023 +#define NOMEM_THRESHOLD_COUNT 8 +#define ZERO_BUFFER_SIZE 0x100000 + +#define OWNER_BDEV 0x2 + +#define OBJECT_BDEV_IO 0x2 + +#define TRACE_GROUP_BDEV 0x3 +#define TRACE_BDEV_IO_START SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0) +#define TRACE_BDEV_IO_DONE SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1) + +#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 +#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 +#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 +#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 +#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) +#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX +#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 + +#define SPDK_BDEV_POOL_ALIGNMENT 512 + +static const char *qos_conf_type[] = {"Limit_IOPS", + "Limit_BPS", "Limit_Read_BPS", "Limit_Write_BPS" + }; +static const char *qos_rpc_type[] = {"rw_ios_per_sec", + "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" + }; + +TAILQ_HEAD(spdk_bdev_list, spdk_bdev); + +struct spdk_bdev_mgr { + struct spdk_mempool *bdev_io_pool; + + struct spdk_mempool *buf_small_pool; + struct spdk_mempool *buf_large_pool; + + void *zero_buffer; + + TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; + + struct spdk_bdev_list bdevs; + + bool init_complete; + bool module_init_complete; + + pthread_mutex_t mutex; + +#ifdef SPDK_CONFIG_VTUNE + __itt_domain *domain; +#endif +}; + +static struct spdk_bdev_mgr g_bdev_mgr = { + .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), + .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), + .init_complete = false, + .module_init_complete = false, + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +typedef void (*lock_range_cb)(void *ctx, int status); + +struct lba_range { + uint64_t offset; + uint64_t length; + void *locked_ctx; + struct spdk_bdev_channel *owner_ch; + TAILQ_ENTRY(lba_range) tailq; +}; + +static struct spdk_bdev_opts g_bdev_opts = { + .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, + .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, + .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, +}; + +static spdk_bdev_init_cb g_init_cb_fn = NULL; +static void *g_init_cb_arg = NULL; + +static spdk_bdev_fini_cb g_fini_cb_fn = NULL; +static void *g_fini_cb_arg = NULL; +static struct spdk_thread *g_fini_thread = NULL; + +struct spdk_bdev_qos_limit { + /** IOs or bytes allowed per second (i.e., 1s). */ + uint64_t limit; + + /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). + * For remaining bytes, allowed to run negative if an I/O is submitted when + * some bytes are remaining, but the I/O is bigger than that amount. The + * excess will be deducted from the next timeslice. + */ + int64_t remaining_this_timeslice; + + /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
*/ + uint32_t min_per_timeslice; + + /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ + uint32_t max_per_timeslice; + + /** Function to check whether to queue the IO. */ + bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); + + /** Function to update for the submitted IO. */ + void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); +}; + +struct spdk_bdev_qos { + /** Types of structure of rate limits. */ + struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; + + /** The channel that all I/O are funneled through. */ + struct spdk_bdev_channel *ch; + + /** The thread on which the poller is running. */ + struct spdk_thread *thread; + + /** Queue of I/O waiting to be issued. */ + bdev_io_tailq_t queued; + + /** Size of a timeslice in tsc ticks. */ + uint64_t timeslice_size; + + /** Timestamp of start of last timeslice. */ + uint64_t last_timeslice; + + /** Poller that processes queued I/O commands each time slice. */ + struct spdk_poller *poller; +}; + +struct spdk_bdev_mgmt_channel { + bdev_io_stailq_t need_buf_small; + bdev_io_stailq_t need_buf_large; + + /* + * Each thread keeps a cache of bdev_io - this allows + * bdev threads which are *not* DPDK threads to still + * benefit from a per-thread bdev_io cache. Without + * this, non-DPDK threads fetching from the mempool + * incur a cmpxchg on get and put. + */ + bdev_io_stailq_t per_thread_cache; + uint32_t per_thread_cache_count; + uint32_t bdev_io_cache_size; + + TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; + TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; +}; + +/* + * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device + * will queue here their IO that awaits retry. It makes it possible to retry sending + * IO to one bdev after IO from other bdev completes. + */ +struct spdk_bdev_shared_resource { + /* The bdev management channel */ + struct spdk_bdev_mgmt_channel *mgmt_ch; + + /* + * Count of I/O submitted to bdev module and waiting for completion. + * Incremented before submit_request() is called on an spdk_bdev_io. + */ + uint64_t io_outstanding; + + /* + * Queue of IO awaiting retry because of a previous NOMEM status returned + * on this channel. + */ + bdev_io_tailq_t nomem_io; + + /* + * Threshold which io_outstanding must drop to before retrying nomem_io. + */ + uint64_t nomem_threshold; + + /* I/O channel allocated by a bdev module */ + struct spdk_io_channel *shared_ch; + + /* Refcount of bdev channels using this resource */ + uint32_t ref; + + TAILQ_ENTRY(spdk_bdev_shared_resource) link; +}; + +#define BDEV_CH_RESET_IN_PROGRESS (1 << 0) +#define BDEV_CH_QOS_ENABLED (1 << 1) + +struct spdk_bdev_channel { + struct spdk_bdev *bdev; + + /* The channel for the underlying device */ + struct spdk_io_channel *channel; + + /* Per io_device per thread data */ + struct spdk_bdev_shared_resource *shared_resource; + + struct spdk_bdev_io_stat stat; + + /* + * Count of I/O submitted to the underlying dev module through this channel + * and waiting for completion. + */ + uint64_t io_outstanding; + + /* + * List of all submitted I/Os including I/O that are generated via splitting. + */ + bdev_io_tailq_t io_submitted; + + /* + * List of spdk_bdev_io that are currently queued because they write to a locked + * LBA range. 
+ */ + bdev_io_tailq_t io_locked; + + uint32_t flags; + + struct spdk_histogram_data *histogram; + +#ifdef SPDK_CONFIG_VTUNE + uint64_t start_tsc; + uint64_t interval_tsc; + __itt_string_handle *handle; + struct spdk_bdev_io_stat prev_stat; +#endif + + bdev_io_tailq_t queued_resets; + + lba_range_tailq_t locked_ranges; +}; + +struct media_event_entry { + struct spdk_bdev_media_event event; + TAILQ_ENTRY(media_event_entry) tailq; +}; + +#define MEDIA_EVENT_POOL_SIZE 64 + +struct spdk_bdev_desc { + struct spdk_bdev *bdev; + struct spdk_thread *thread; + struct { + bool open_with_ext; + union { + spdk_bdev_remove_cb_t remove_fn; + spdk_bdev_event_cb_t event_fn; + }; + void *ctx; + } callback; + bool closed; + bool write; + pthread_mutex_t mutex; + uint32_t refs; + TAILQ_HEAD(, media_event_entry) pending_media_events; + TAILQ_HEAD(, media_event_entry) free_media_events; + struct media_event_entry *media_events_buffer; + TAILQ_ENTRY(spdk_bdev_desc) link; + + uint64_t timeout_in_sec; + spdk_bdev_io_timeout_cb cb_fn; + void *cb_arg; + struct spdk_poller *io_timeout_poller; +}; + +struct spdk_bdev_iostat_ctx { + struct spdk_bdev_io_stat *stat; + spdk_bdev_get_device_stat_cb cb; + void *cb_arg; +}; + +struct set_qos_limit_ctx { + void (*cb_fn)(void *cb_arg, int status); + void *cb_arg; + struct spdk_bdev *bdev; +}; + +#define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) +#define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) + +static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); +static void bdev_write_zero_buffer_next(void *_bdev_io); + +static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i); +static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); + +static int +bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, + uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg); +static int +bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg); + +static int +bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, + uint64_t offset, uint64_t length, + lock_range_cb cb_fn, void *cb_arg); + +static int +bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, + uint64_t offset, uint64_t length, + lock_range_cb cb_fn, void *cb_arg); + +static inline void bdev_io_complete(void *ctx); + +static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); +static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort); + +void +spdk_bdev_get_opts(struct spdk_bdev_opts *opts) +{ + *opts = g_bdev_opts; +} + +int +spdk_bdev_set_opts(struct spdk_bdev_opts *opts) +{ + uint32_t min_pool_size; + + /* + * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem + * initialization. A second mgmt_ch will be created on the same thread when the application starts + * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
+ */ + min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); + if (opts->bdev_io_pool_size < min_pool_size) { + SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 + " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, + spdk_thread_get_count()); + SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); + return -1; + } + + g_bdev_opts = *opts; + return 0; +} + +struct spdk_bdev_examine_item { + char *name; + TAILQ_ENTRY(spdk_bdev_examine_item) link; +}; + +TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); + +struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( + g_bdev_examine_allowlist); + +static inline bool +bdev_examine_allowlist_check(const char *name) +{ + struct spdk_bdev_examine_item *item; + TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { + if (strcmp(name, item->name) == 0) { + return true; + } + } + return false; +} + +static inline bool +bdev_in_examine_allowlist(struct spdk_bdev *bdev) +{ + struct spdk_bdev_alias *tmp; + if (bdev_examine_allowlist_check(bdev->name)) { + return true; + } + TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { + if (bdev_examine_allowlist_check(tmp->alias)) { + return true; + } + } + return false; +} + +static inline bool +bdev_ok_to_examine(struct spdk_bdev *bdev) +{ + if (g_bdev_opts.bdev_auto_examine) { + return true; + } else { + return bdev_in_examine_allowlist(bdev); + } +} + +static void +bdev_examine(struct spdk_bdev *bdev) +{ + struct spdk_bdev_module *module; + uint32_t action; + + TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (module->examine_config && bdev_ok_to_examine(bdev)) { + action = module->internal.action_in_progress; + module->internal.action_in_progress++; + module->examine_config(bdev); + if (action != module->internal.action_in_progress) { + SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", + module->name); + } + } + } + + if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { + if (bdev->internal.claim_module->examine_disk) { + bdev->internal.claim_module->internal.action_in_progress++; + bdev->internal.claim_module->examine_disk(bdev); + } + return; + } + + TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (module->examine_disk && bdev_ok_to_examine(bdev)) { + module->internal.action_in_progress++; + module->examine_disk(bdev); + } + } +} + +struct spdk_bdev * +spdk_bdev_first(void) +{ + struct spdk_bdev *bdev; + + bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_next(struct spdk_bdev *prev) +{ + struct spdk_bdev *bdev; + + bdev = TAILQ_NEXT(prev, internal.link); + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +static struct spdk_bdev * +_bdev_next_leaf(struct spdk_bdev *bdev) +{ + while (bdev != NULL) { + if (bdev->internal.claim_module == NULL) { + return bdev; + } else { + bdev = TAILQ_NEXT(bdev, internal.link); + } + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_first_leaf(void) +{ + struct spdk_bdev *bdev; + + bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); + + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_next_leaf(struct spdk_bdev 
*prev) +{ + struct spdk_bdev *bdev; + + bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); + + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_get_by_name(const char *bdev_name) +{ + struct spdk_bdev_alias *tmp; + struct spdk_bdev *bdev = spdk_bdev_first(); + + while (bdev != NULL) { + if (strcmp(bdev_name, bdev->name) == 0) { + return bdev; + } + + TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { + if (strcmp(bdev_name, tmp->alias) == 0) { + return bdev; + } + } + + bdev = spdk_bdev_next(bdev); + } + + return NULL; +} + +void +spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) +{ + struct iovec *iovs; + + if (bdev_io->u.bdev.iovs == NULL) { + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovcnt = 1; + } + + iovs = bdev_io->u.bdev.iovs; + + assert(iovs != NULL); + assert(bdev_io->u.bdev.iovcnt >= 1); + + iovs[0].iov_base = buf; + iovs[0].iov_len = len; +} + +void +spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) +{ + assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); + bdev_io->u.bdev.md_buf = md_buf; +} + +static bool +_is_buf_allocated(const struct iovec *iovs) +{ + if (iovs == NULL) { + return false; + } + + return iovs[0].iov_base != NULL; +} + +static bool +_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) +{ + int i; + uintptr_t iov_base; + + if (spdk_likely(alignment == 1)) { + return true; + } + + for (i = 0; i < iovcnt; i++) { + iov_base = (uintptr_t)iovs[i].iov_base; + if ((iov_base & (alignment - 1)) != 0) { + return false; + } + } + + return true; +} + +static void +_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) +{ + int i; + size_t len; + + for (i = 0; i < iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_len); + memcpy(buf, iovs[i].iov_base, len); + buf += len; + buf_len -= len; + } +} + +static void +_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len) +{ + int i; + size_t len; + + for (i = 0; i < iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_len); + memcpy(iovs[i].iov_base, buf, len); + buf += len; + buf_len -= len; + } +} + +static void +_bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) +{ + /* save original iovec */ + bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; + bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; + /* set bounce iov */ + bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; + bdev_io->u.bdev.iovcnt = 1; + /* set bounce buffer for this operation */ + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = len; + /* if this is write path, copy data from original buffer to bounce buffer */ + if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); + } +} + +static void +_bdev_io_set_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) +{ + /* save original md_buf */ + bdev_io->internal.orig_md_buf = bdev_io->u.bdev.md_buf; + /* set bounce md_buf */ + bdev_io->u.bdev.md_buf = md_buf; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + memcpy(md_buf, bdev_io->internal.orig_md_buf, len); + } +} + +static void +bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, void *buf, bool status) +{ + struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); + + if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 
+ bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); + bdev_io->internal.get_aux_buf_cb = NULL; + } else { + assert(bdev_io->internal.get_buf_cb != NULL); + bdev_io->internal.buf = buf; + bdev_io->internal.get_buf_cb(ch, bdev_io, status); + bdev_io->internal.get_buf_cb = NULL; + } +} + +static void +_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + bool buf_allocated; + uint64_t md_len, alignment; + void *aligned_buf; + + if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { + bdev_io_get_buf_complete(bdev_io, buf, true); + return; + } + + alignment = spdk_bdev_get_buf_align(bdev); + buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); + aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); + + if (buf_allocated) { + _bdev_io_set_bounce_buf(bdev_io, aligned_buf, len); + } else { + spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); + } + + if (spdk_bdev_is_md_separate(bdev)) { + aligned_buf = (char *)aligned_buf + len; + md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; + + assert(((uintptr_t)aligned_buf & (alignment - 1)) == 0); + + if (bdev_io->u.bdev.md_buf != NULL) { + _bdev_io_set_bounce_md_buf(bdev_io, aligned_buf, md_len); + } else { + spdk_bdev_io_set_md_buf(bdev_io, aligned_buf, md_len); + } + } + bdev_io_get_buf_complete(bdev_io, buf, true); +} + +static void +_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_mempool *pool; + struct spdk_bdev_io *tmp; + bdev_io_stailq_t *stailq; + struct spdk_bdev_mgmt_channel *ch; + uint64_t md_len, alignment; + + md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; + alignment = spdk_bdev_get_buf_align(bdev); + ch = bdev_io->internal.ch->shared_resource->mgmt_ch; + + if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + + SPDK_BDEV_POOL_ALIGNMENT) { + pool = g_bdev_mgr.buf_small_pool; + stailq = &ch->need_buf_small; + } else { + pool = g_bdev_mgr.buf_large_pool; + stailq = &ch->need_buf_large; + } + + if (STAILQ_EMPTY(stailq)) { + spdk_mempool_put(pool, buf); + } else { + tmp = STAILQ_FIRST(stailq); + STAILQ_REMOVE_HEAD(stailq, internal.buf_link); + _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len); + } +} + +static void +bdev_io_put_buf(struct spdk_bdev_io *bdev_io) +{ + assert(bdev_io->internal.buf != NULL); + _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); + bdev_io->internal.buf = NULL; +} + +void +spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) +{ + uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + + assert(buf != NULL); + _bdev_io_put_buf(bdev_io, buf, len); +} + +static void +_bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io) +{ + if (spdk_likely(bdev_io->internal.orig_iovcnt == 0)) { + assert(bdev_io->internal.orig_md_buf == NULL); + return; + } + + /* if this is read path, copy data from bounce buffer to original buffer */ + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && + bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + _copy_buf_to_iovs(bdev_io->internal.orig_iovs, + bdev_io->internal.orig_iovcnt, + bdev_io->internal.bounce_iov.iov_base, + bdev_io->internal.bounce_iov.iov_len); + } + /* set original buffer for this io */ + bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; + bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; + /* disable bouncing buffer for this io */ + 
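Aside: the pointer arithmetic in _bdev_io_set_buf() above rounds the pool buffer up to the bdev's required alignment before deciding whether it can be handed out directly or must serve as a bounce buffer. A minimal standalone sketch of that align-up trick, assuming the alignment is a power of two (the align_up() helper and the test values are illustrative, not SPDK code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Round addr up to the next multiple of align; align must be a power of two. */
static void *
align_up(void *addr, uintptr_t align)
{
	return (void *)(((uintptr_t)addr + (align - 1)) & ~(align - 1));
}

int
main(void)
{
	char raw[1024];
	void *aligned = align_up(raw + 1, 512);   /* deliberately misaligned input */

	assert(((uintptr_t)aligned & 511) == 0);  /* now 512-byte aligned */
	assert((char *)aligned >= raw + 1);       /* never rounds down */
	printf("raw %p -> aligned %p\n", (void *)(raw + 1), aligned);
	return 0;
}

When the caller already supplied a data buffer of its own, the same function switches to _bdev_io_set_bounce_buf() instead, which is why unaligned application buffers still work, at the cost of the extra copy done in _copy_iovs_to_buf()/_copy_buf_to_iovs().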
bdev_io->internal.orig_iovcnt = 0; + bdev_io->internal.orig_iovs = NULL; + + /* do the same for metadata buffer */ + if (spdk_unlikely(bdev_io->internal.orig_md_buf != NULL)) { + assert(spdk_bdev_is_md_separate(bdev_io->bdev)); + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && + bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + memcpy(bdev_io->internal.orig_md_buf, bdev_io->u.bdev.md_buf, + bdev_io->u.bdev.num_blocks * spdk_bdev_get_md_size(bdev_io->bdev)); + } + + bdev_io->u.bdev.md_buf = bdev_io->internal.orig_md_buf; + bdev_io->internal.orig_md_buf = NULL; + } + + /* We want to free the bounce buffer here since we know we're done with it (as opposed + * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). + */ + bdev_io_put_buf(bdev_io); +} + +static void +bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_mempool *pool; + bdev_io_stailq_t *stailq; + struct spdk_bdev_mgmt_channel *mgmt_ch; + uint64_t alignment, md_len; + void *buf; + + alignment = spdk_bdev_get_buf_align(bdev); + md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; + + if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + + SPDK_BDEV_POOL_ALIGNMENT) { + SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n", + len + alignment); + bdev_io_get_buf_complete(bdev_io, NULL, false); + return; + } + + mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; + + bdev_io->internal.buf_len = len; + + if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + + SPDK_BDEV_POOL_ALIGNMENT) { + pool = g_bdev_mgr.buf_small_pool; + stailq = &mgmt_ch->need_buf_small; + } else { + pool = g_bdev_mgr.buf_large_pool; + stailq = &mgmt_ch->need_buf_large; + } + + buf = spdk_mempool_get(pool); + if (!buf) { + STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); + } else { + _bdev_io_set_buf(bdev_io, buf, len); + } +} + +void +spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + uint64_t alignment; + + assert(cb != NULL); + bdev_io->internal.get_buf_cb = cb; + + alignment = spdk_bdev_get_buf_align(bdev); + + if (_is_buf_allocated(bdev_io->u.bdev.iovs) && + _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { + /* Buffer already present and aligned */ + cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); + return; + } + + bdev_io_get_buf(bdev_io, len); +} + +void +spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) +{ + uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + + assert(cb != NULL); + assert(bdev_io->internal.get_aux_buf_cb == NULL); + bdev_io->internal.get_aux_buf_cb = cb; + bdev_io_get_buf(bdev_io, len); +} + +static int +bdev_module_get_max_ctx_size(void) +{ + struct spdk_bdev_module *bdev_module; + int max_bdev_module_size = 0; + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { + max_bdev_module_size = bdev_module->get_ctx_size(); + } + } + + return max_bdev_module_size; +} + +void +spdk_bdev_config_text(FILE *fp) +{ + struct spdk_bdev_module *bdev_module; + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (bdev_module->config_text) { + bdev_module->config_text(fp); + } + } +} + +static void 
+bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + int i; + struct spdk_bdev_qos *qos = bdev->internal.qos; + uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; + + if (!qos) { + return; + } + + spdk_bdev_get_qos_rate_limits(bdev, limits); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] > 0) { + spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); + } + } + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +void +spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_bdev_module *bdev_module; + struct spdk_bdev *bdev; + + assert(w != NULL); + + spdk_json_write_array_begin(w); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_set_options"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); + spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); + spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (bdev_module->config_json) { + bdev_module->config_json(w); + } + } + + pthread_mutex_lock(&g_bdev_mgr.mutex); + + TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { + if (bdev->fn_table->write_config_json) { + bdev->fn_table->write_config_json(bdev, w); + } + + bdev_qos_config_json(bdev, w); + } + + pthread_mutex_unlock(&g_bdev_mgr.mutex); + + spdk_json_write_array_end(w); +} + +static int +bdev_mgmt_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_mgmt_channel *ch = ctx_buf; + struct spdk_bdev_io *bdev_io; + uint32_t i; + + STAILQ_INIT(&ch->need_buf_small); + STAILQ_INIT(&ch->need_buf_large); + + STAILQ_INIT(&ch->per_thread_cache); + ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; + + /* Pre-populate bdev_io cache to ensure this thread cannot be starved. 
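The cache pre-population described in the comment above follows a common pattern: keep a small per-thread free list in front of a shared pool so the hot allocation path usually avoids contended pool operations. A rough, self-contained illustration of that pattern (not SPDK's implementation; the thread_cache/io_obj names and the malloc/free fallback below are made up for the sketch):

#include <stdlib.h>

#define CACHE_SIZE 4

struct io_obj { int id; };

struct thread_cache {
	struct io_obj *slot[CACHE_SIZE];
	size_t count;
};

/* Allocation prefers the per-thread cache, then falls back to the shared
 * allocator (modeled here with malloc; error handling omitted). */
static struct io_obj *
cache_get(struct thread_cache *c)
{
	if (c->count > 0) {
		return c->slot[--c->count];
	}
	return malloc(sizeof(struct io_obj));
}

/* Frees go back into the cache while it has room, otherwise to the pool. */
static void
cache_put(struct thread_cache *c, struct io_obj *obj)
{
	if (c->count < CACHE_SIZE) {
		c->slot[c->count++] = obj;
		return;
	}
	free(obj);
}

int
main(void)
{
	struct thread_cache c = { .count = 0 };
	struct io_obj *o = cache_get(&c);

	cache_put(&c, o);    /* stays cached for the next cache_get() */
	free(cache_get(&c)); /* same object comes back without touching the pool */
	return 0;
}

bdev_channel_get_io() and spdk_bdev_free_io() later in this file have the same get-from-cache-first, put-back-if-room shape, with the extra twist that freed entries first service any queued waiters.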
*/ + ch->per_thread_cache_count = 0; + for (i = 0; i < ch->bdev_io_cache_size; i++) { + bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); + assert(bdev_io != NULL); + ch->per_thread_cache_count++; + STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); + } + + TAILQ_INIT(&ch->shared_resources); + TAILQ_INIT(&ch->io_wait_queue); + + return 0; +} + +static void +bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_mgmt_channel *ch = ctx_buf; + struct spdk_bdev_io *bdev_io; + + if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { + SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); + } + + if (!TAILQ_EMPTY(&ch->shared_resources)) { + SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); + } + + while (!STAILQ_EMPTY(&ch->per_thread_cache)) { + bdev_io = STAILQ_FIRST(&ch->per_thread_cache); + STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); + ch->per_thread_cache_count--; + spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); + } + + assert(ch->per_thread_cache_count == 0); +} + +static void +bdev_init_complete(int rc) +{ + spdk_bdev_init_cb cb_fn = g_init_cb_fn; + void *cb_arg = g_init_cb_arg; + struct spdk_bdev_module *m; + + g_bdev_mgr.init_complete = true; + g_init_cb_fn = NULL; + g_init_cb_arg = NULL; + + /* + * For modules that need to know when subsystem init is complete, + * inform them now. + */ + if (rc == 0) { + TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (m->init_complete) { + m->init_complete(); + } + } + } + + cb_fn(cb_arg, rc); +} + +static void +bdev_module_action_complete(void) +{ + struct spdk_bdev_module *m; + + /* + * Don't finish bdev subsystem initialization if + * module pre-initialization is still in progress, or + * the subsystem been already initialized. + */ + if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { + return; + } + + /* + * Check all bdev modules for inits/examinations in progress. If any + * exist, return immediately since we cannot finish bdev subsystem + * initialization until all are completed. + */ + TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (m->internal.action_in_progress > 0) { + return; + } + } + + /* + * Modules already finished initialization - now that all + * the bdev modules have finished their asynchronous I/O + * processing, the entire bdev layer can be marked as complete. 
+ */ + bdev_init_complete(0); +} + +static void +bdev_module_action_done(struct spdk_bdev_module *module) +{ + assert(module->internal.action_in_progress > 0); + module->internal.action_in_progress--; + bdev_module_action_complete(); +} + +void +spdk_bdev_module_init_done(struct spdk_bdev_module *module) +{ + bdev_module_action_done(module); +} + +void +spdk_bdev_module_examine_done(struct spdk_bdev_module *module) +{ + bdev_module_action_done(module); +} + +/** The last initialized bdev module */ +static struct spdk_bdev_module *g_resume_bdev_module = NULL; + +static void +bdev_init_failed(void *cb_arg) +{ + struct spdk_bdev_module *module = cb_arg; + + module->internal.action_in_progress--; + bdev_init_complete(-1); +} + +static int +bdev_modules_init(void) +{ + struct spdk_bdev_module *module; + int rc = 0; + + TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { + g_resume_bdev_module = module; + if (module->async_init) { + module->internal.action_in_progress = 1; + } + rc = module->module_init(); + if (rc != 0) { + /* Bump action_in_progress to prevent other modules from completion of modules_init + * Send message to defer application shutdown until resources are cleaned up */ + module->internal.action_in_progress = 1; + spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); + return rc; + } + } + + g_resume_bdev_module = NULL; + return 0; +} + +void +spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) +{ + struct spdk_conf_section *sp; + struct spdk_bdev_opts bdev_opts; + int32_t bdev_io_pool_size, bdev_io_cache_size; + int cache_size; + int rc = 0; + char mempool_name[32]; + + assert(cb_fn != NULL); + + sp = spdk_conf_find_section(NULL, "Bdev"); + if (sp != NULL) { + spdk_bdev_get_opts(&bdev_opts); + + bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize"); + if (bdev_io_pool_size >= 0) { + bdev_opts.bdev_io_pool_size = bdev_io_pool_size; + } + + bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize"); + if (bdev_io_cache_size >= 0) { + bdev_opts.bdev_io_cache_size = bdev_io_cache_size; + } + + if (spdk_bdev_set_opts(&bdev_opts)) { + bdev_init_complete(-1); + return; + } + + assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0); + } + + g_init_cb_fn = cb_fn; + g_init_cb_arg = cb_arg; + + spdk_notify_type_register("bdev_register"); + spdk_notify_type_register("bdev_unregister"); + + snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); + + g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, + g_bdev_opts.bdev_io_pool_size, + sizeof(struct spdk_bdev_io) + + bdev_module_get_max_ctx_size(), + 0, + SPDK_ENV_SOCKET_ID_ANY); + + if (g_bdev_mgr.bdev_io_pool == NULL) { + SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); + bdev_init_complete(-1); + return; + } + + /** + * Ensure no more than half of the total buffers end up local caches, by + * using spdk_env_get_core_count() to determine how many local caches we need + * to account for. 
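To make the sizing rule in the comment above concrete before it is applied right below, a worked example with made-up numbers (the real totals come from BUF_SMALL_POOL_SIZE and BUF_LARGE_POOL_SIZE): with 8192 buffers and 16 cores, cache_size = 8192 / (2 * 16) = 256, so even if every core's local cache is completely full the caches pin 16 * 256 = 4096 buffers, i.e. no more than half of the pool, and the rest stays available in the shared ring. The same arithmetic as a tiny snippet:

#include <stdio.h>

int
main(void)
{
	/* Illustrative numbers only, not SPDK's actual defaults. */
	unsigned int pool_size = 8192, cores = 16;
	unsigned int cache_size = pool_size / (2 * cores);

	printf("per-core cache %u, worst-case cached %u of %u buffers\n",
	       cache_size, cache_size * cores, pool_size);
	return 0;
}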
+ */ + cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); + snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); + + g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, + BUF_SMALL_POOL_SIZE, + SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + + SPDK_BDEV_POOL_ALIGNMENT, + cache_size, + SPDK_ENV_SOCKET_ID_ANY); + if (!g_bdev_mgr.buf_small_pool) { + SPDK_ERRLOG("create rbuf small pool failed\n"); + bdev_init_complete(-1); + return; + } + + cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); + snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); + + g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, + BUF_LARGE_POOL_SIZE, + SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + + SPDK_BDEV_POOL_ALIGNMENT, + cache_size, + SPDK_ENV_SOCKET_ID_ANY); + if (!g_bdev_mgr.buf_large_pool) { + SPDK_ERRLOG("create rbuf large pool failed\n"); + bdev_init_complete(-1); + return; + } + + g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, + NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!g_bdev_mgr.zero_buffer) { + SPDK_ERRLOG("create bdev zero buffer failed\n"); + bdev_init_complete(-1); + return; + } + +#ifdef SPDK_CONFIG_VTUNE + g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); +#endif + + spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, + bdev_mgmt_channel_destroy, + sizeof(struct spdk_bdev_mgmt_channel), + "bdev_mgr"); + + rc = bdev_modules_init(); + g_bdev_mgr.module_init_complete = true; + if (rc != 0) { + SPDK_ERRLOG("bdev modules init failed\n"); + return; + } + + bdev_module_action_complete(); +} + +static void +bdev_mgr_unregister_cb(void *io_device) +{ + spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; + + if (g_bdev_mgr.bdev_io_pool) { + if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { + SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", + spdk_mempool_count(g_bdev_mgr.bdev_io_pool), + g_bdev_opts.bdev_io_pool_size); + } + + spdk_mempool_free(g_bdev_mgr.bdev_io_pool); + } + + if (g_bdev_mgr.buf_small_pool) { + if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) { + SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", + spdk_mempool_count(g_bdev_mgr.buf_small_pool), + BUF_SMALL_POOL_SIZE); + assert(false); + } + + spdk_mempool_free(g_bdev_mgr.buf_small_pool); + } + + if (g_bdev_mgr.buf_large_pool) { + if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) { + SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", + spdk_mempool_count(g_bdev_mgr.buf_large_pool), + BUF_LARGE_POOL_SIZE); + assert(false); + } + + spdk_mempool_free(g_bdev_mgr.buf_large_pool); + } + + spdk_free(g_bdev_mgr.zero_buffer); + + cb_fn(g_fini_cb_arg); + g_fini_cb_fn = NULL; + g_fini_cb_arg = NULL; + g_bdev_mgr.init_complete = false; + g_bdev_mgr.module_init_complete = false; + pthread_mutex_destroy(&g_bdev_mgr.mutex); +} + +static void +bdev_module_finish_iter(void *arg) +{ + struct spdk_bdev_module *bdev_module; + + /* FIXME: Handling initialization failures is broken now, + * so we won't even try cleaning up after successfully + * initialized modules. 
if module_init_complete is false, + * just call spdk_bdev_mgr_unregister_cb + */ + if (!g_bdev_mgr.module_init_complete) { + bdev_mgr_unregister_cb(NULL); + return; + } + + /* Start iterating from the last touched module */ + if (!g_resume_bdev_module) { + bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); + } else { + bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, + internal.tailq); + } + + while (bdev_module) { + if (bdev_module->async_fini) { + /* Save our place so we can resume later. We must + * save the variable here, before calling module_fini() + * below, because in some cases the module may immediately + * call spdk_bdev_module_finish_done() and re-enter + * this function to continue iterating. */ + g_resume_bdev_module = bdev_module; + } + + if (bdev_module->module_fini) { + bdev_module->module_fini(); + } + + if (bdev_module->async_fini) { + return; + } + + bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, + internal.tailq); + } + + g_resume_bdev_module = NULL; + spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); +} + +void +spdk_bdev_module_finish_done(void) +{ + if (spdk_get_thread() != g_fini_thread) { + spdk_thread_send_msg(g_fini_thread, bdev_module_finish_iter, NULL); + } else { + bdev_module_finish_iter(NULL); + } +} + +static void +bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) +{ + struct spdk_bdev *bdev = cb_arg; + + if (bdeverrno && bdev) { + SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", + bdev->name); + + /* + * Since the call to spdk_bdev_unregister() failed, we have no way to free this + * bdev; try to continue by manually removing this bdev from the list and continue + * with the next bdev in the list. + */ + TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); + } + + if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n"); + /* + * Bdev module finish need to be deferred as we might be in the middle of some context + * (like bdev part free) that will use this bdev (or private bdev driver ctx data) + * after returning. + */ + spdk_thread_send_msg(spdk_get_thread(), bdev_module_finish_iter, NULL); + return; + } + + /* + * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem + * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity + * to detect clean shutdown as opposed to run-time hot removal of the underlying + * base bdevs. + * + * Also, walk the list in the reverse order. + */ + for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); + bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { + if (bdev->internal.claim_module != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Skipping claimed bdev '%s'(<-'%s').\n", + bdev->name, bdev->internal.claim_module->name); + continue; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name); + spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); + return; + } + + /* + * If any bdev fails to unclaim underlying bdev properly, we may face the + * case of bdev list consisting of claimed bdevs only (if claims are managed + * correctly, this would mean there's a loop in the claims graph which is + * clearly impossible). Warn and unregister last bdev on the list then. 
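The unregister walk above (and the fallback loop that follows) goes from the tail of the bdev list with TAILQ_LAST()/TAILQ_PREV() so that virtual bdevs stacked on top of claimed base bdevs are torn down first, and only then the claimed bdevs themselves. A reduced, self-contained sketch of that traversal order, assuming a BSD-style <sys/queue.h> and using a plain claimed flag in place of internal.claim_module:

#include <stdio.h>
#include <sys/queue.h>

struct node {
	const char *name;
	int claimed;                 /* stand-in for internal.claim_module != NULL */
	TAILQ_ENTRY(node) link;
};

TAILQ_HEAD(node_list, node);

int
main(void)
{
	struct node_list list;
	struct node nodes[3] = {
		{ .name = "base",  .claimed = 1 },
		{ .name = "part0", .claimed = 0 },
		{ .name = "part1", .claimed = 0 },
	};
	struct node *n;
	int i;

	TAILQ_INIT(&list);
	for (i = 0; i < 3; i++) {
		TAILQ_INSERT_TAIL(&list, &nodes[i], link);
	}

	/* Remove from the tail, skipping claimed nodes while unclaimed ones remain. */
	while (!TAILQ_EMPTY(&list)) {
		for (n = TAILQ_LAST(&list, node_list); n != NULL;
		     n = TAILQ_PREV(n, node_list, link)) {
			if (!n->claimed) {
				break;
			}
		}
		if (n == NULL) {
			n = TAILQ_LAST(&list, node_list);   /* only claimed nodes left */
		}
		printf("unregister %s\n", n->name);
		TAILQ_REMOVE(&list, n, link);
	}
	return 0;
}

In the real code the unregister step is asynchronous, which is why bdev_finish_unregister_bdevs_iter() re-enters itself as the completion callback instead of looping in place.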
+ */ + for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); + bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { + SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); + spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); + return; + } +} + +void +spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) +{ + struct spdk_bdev_module *m; + + assert(cb_fn != NULL); + + g_fini_thread = spdk_get_thread(); + + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (m->fini_start) { + m->fini_start(); + } + } + + bdev_finish_unregister_bdevs_iter(NULL, 0); +} + +struct spdk_bdev_io * +bdev_channel_get_io(struct spdk_bdev_channel *channel) +{ + struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; + struct spdk_bdev_io *bdev_io; + + if (ch->per_thread_cache_count > 0) { + bdev_io = STAILQ_FIRST(&ch->per_thread_cache); + STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); + ch->per_thread_cache_count--; + } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { + /* + * Don't try to look for bdev_ios in the global pool if there are + * waiters on bdev_ios - we don't want this caller to jump the line. + */ + bdev_io = NULL; + } else { + bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); + } + + return bdev_io; +} + +void +spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev_mgmt_channel *ch; + + assert(bdev_io != NULL); + assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); + + ch = bdev_io->internal.ch->shared_resource->mgmt_ch; + + if (bdev_io->internal.buf != NULL) { + bdev_io_put_buf(bdev_io); + } + + if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { + ch->per_thread_cache_count++; + STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); + while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { + struct spdk_bdev_io_wait_entry *entry; + + entry = TAILQ_FIRST(&ch->io_wait_queue); + TAILQ_REMOVE(&ch->io_wait_queue, entry, link); + entry->cb_fn(entry->cb_arg); + } + } else { + /* We should never have a full cache with entries on the io wait queue. 
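The waiter hand-off above (a freed bdev_io immediately wakes the first entry on io_wait_queue) is the consumer side of the spdk_bdev_queue_io_wait() contract used elsewhere in this file. A hedged usage sketch from a caller's point of view, compiled against the SPDK headers; the read_ctx struct and the retry_read()/read_done() helpers are illustrative, and spdk_bdev_read_blocks() is assumed here to take its usual desc, channel, buffer, offset_blocks, num_blocks, callback, cb_arg arguments:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include "spdk/bdev.h"

/* Illustrative caller context; not part of SPDK. */
struct read_ctx {
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *ch;
	void *buf;
	uint64_t offset_blocks;
	uint64_t num_blocks;
	struct spdk_bdev_io_wait_entry wait;
};

static void
read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);   /* may wake the next queued waiter */
}

void
retry_read(void *arg)
{
	struct read_ctx *ctx = arg;
	int rc;

	rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf,
				   ctx->offset_blocks, ctx->num_blocks,
				   read_done, ctx);
	if (rc == -ENOMEM) {
		/* bdev_io pool exhausted: park ourselves until one is freed. */
		ctx->wait.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
		ctx->wait.cb_fn = retry_read;
		ctx->wait.cb_arg = ctx;
		spdk_bdev_queue_io_wait(ctx->wait.bdev, ctx->ch, &ctx->wait);
	}
}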
*/ + assert(TAILQ_EMPTY(&ch->io_wait_queue)); + spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); + } +} + +static bool +bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) +{ + assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); + + switch (limit) { + case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: + return true; + case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: + case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: + case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: + return false; + case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: + default: + return false; + } +} + +static bool +bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + return true; + case SPDK_BDEV_IO_TYPE_ZCOPY: + if (bdev_io->u.bdev.zcopy.start) { + return true; + } else { + return false; + } + default: + return false; + } +} + +static bool +bdev_is_read_io(struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + /* Bit 1 (0x2) set for read operation */ + if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { + return true; + } else { + return false; + } + case SPDK_BDEV_IO_TYPE_READ: + return true; + case SPDK_BDEV_IO_TYPE_ZCOPY: + /* Populate to read from disk */ + if (bdev_io->u.bdev.zcopy.populate) { + return true; + } else { + return false; + } + default: + return false; + } +} + +static uint64_t +bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + return bdev_io->u.nvme_passthru.nbytes; + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + return bdev_io->u.bdev.num_blocks * bdev->blocklen; + case SPDK_BDEV_IO_TYPE_ZCOPY: + /* Track the data in the start phase only */ + if (bdev_io->u.bdev.zcopy.start) { + return bdev_io->u.bdev.num_blocks * bdev->blocklen; + } else { + return 0; + } + default: + return 0; + } +} + +static bool +bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { + return true; + } else { + return false; + } +} + +static bool +bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + if (bdev_is_read_io(io) == false) { + return false; + } + + return bdev_qos_rw_queue_io(limit, io); +} + +static bool +bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + if (bdev_is_read_io(io) == true) { + return false; + } + + return bdev_qos_rw_queue_io(limit, io); +} + +static void +bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + limit->remaining_this_timeslice--; +} + +static void +bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); +} + +static void +bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + if (bdev_is_read_io(io) == false) { + return; + } + + return bdev_qos_rw_bps_update_quota(limit, io); +} + +static void +bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +{ + if (bdev_is_read_io(io) == true) { + return; + } + + return bdev_qos_rw_bps_update_quota(limit, io); +} + +static void +bdev_qos_set_ops(struct 
spdk_bdev_qos *qos) +{ + int i; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + qos->rate_limits[i].queue_io = NULL; + qos->rate_limits[i].update_quota = NULL; + continue; + } + + switch (i) { + case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: + qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; + qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; + break; + case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: + qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; + qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; + break; + case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: + qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; + qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; + break; + case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: + qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; + qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; + break; + default: + break; + } + } +} + +static void +_bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, + struct spdk_bdev_io *bdev_io, + enum spdk_bdev_io_status status) +{ + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + + bdev_io->internal.in_submit_request = true; + bdev_ch->io_outstanding++; + shared_resource->io_outstanding++; + spdk_bdev_io_complete(bdev_io, status); + bdev_io->internal.in_submit_request = false; +} + +static inline void +bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_io_channel *ch = bdev_ch->channel; + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + + if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { + struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; + struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; + + if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || + bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || + bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { + _bdev_io_complete_in_submit(bdev_ch, bdev_io, + SPDK_BDEV_IO_STATUS_SUCCESS); + return; + } + } + + if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { + bdev_ch->io_outstanding++; + shared_resource->io_outstanding++; + bdev_io->internal.in_submit_request = true; + bdev->fn_table->submit_request(ch, bdev_io); + bdev_io->internal.in_submit_request = false; + } else { + TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); + } +} + +static int +bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) +{ + struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; + int i, submitted_ios = 0; + + TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { + if (bdev_qos_io_to_limit(bdev_io) == true) { + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (!qos->rate_limits[i].queue_io) { + continue; + } + + if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], + bdev_io) == true) { + return submitted_ios; + } + } + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (!qos->rate_limits[i].update_quota) { + continue; + } + + qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); + } + } + + TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); + bdev_io_do_submit(ch, bdev_io); + submitted_ios++; + } + + return submitted_ios; +} + +static void +bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) +{ + int 
rc; + + bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; + bdev_io->internal.waitq_entry.cb_fn = cb_fn; + bdev_io->internal.waitq_entry.cb_arg = bdev_io; + rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), + &bdev_io->internal.waitq_entry); + if (rc != 0) { + SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } +} + +static bool +bdev_io_type_can_split(uint8_t type) +{ + assert(type != SPDK_BDEV_IO_TYPE_INVALID); + assert(type < SPDK_BDEV_NUM_IO_TYPES); + + /* Only split READ and WRITE I/O. Theoretically other types of I/O like + * UNMAP could be split, but these types of I/O are typically much larger + * in size (sometimes the size of the entire block device), and the bdev + * module can more efficiently split these types of I/O. Plus those types + * of I/O do not have a payload, which makes the splitting process simpler. + */ + if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) { + return true; + } else { + return false; + } +} + +static bool +bdev_io_should_split(struct spdk_bdev_io *bdev_io) +{ + uint64_t start_stripe, end_stripe; + uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; + + if (io_boundary == 0) { + return false; + } + + if (!bdev_io_type_can_split(bdev_io->type)) { + return false; + } + + start_stripe = bdev_io->u.bdev.offset_blocks; + end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; + /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ + if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { + start_stripe >>= spdk_u32log2(io_boundary); + end_stripe >>= spdk_u32log2(io_boundary); + } else { + start_stripe /= io_boundary; + end_stripe /= io_boundary; + } + return (start_stripe != end_stripe); +} + +static uint32_t +_to_next_boundary(uint64_t offset, uint32_t boundary) +{ + return (boundary - (offset % boundary)); +} + +static void +bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); + +static void +_bdev_io_split(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + uint64_t current_offset, remaining; + uint32_t blocklen, to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; + struct iovec *parent_iov, *iov; + uint64_t parent_iov_offset, iov_len; + uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt; + void *md_buf = NULL; + int rc; + + remaining = bdev_io->u.bdev.split_remaining_num_blocks; + current_offset = bdev_io->u.bdev.split_current_offset_blocks; + blocklen = bdev_io->bdev->blocklen; + parent_iov_offset = (current_offset - bdev_io->u.bdev.offset_blocks) * blocklen; + parent_iovcnt = bdev_io->u.bdev.iovcnt; + + for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { + parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; + if (parent_iov_offset < parent_iov->iov_len) { + break; + } + parent_iov_offset -= parent_iov->iov_len; + } + + child_iovcnt = 0; + while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { + to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary); + to_next_boundary = spdk_min(remaining, to_next_boundary); + to_next_boundary_bytes = to_next_boundary * blocklen; + iov = &bdev_io->child_iov[child_iovcnt]; + iovcnt = 0; + + if (bdev_io->u.bdev.md_buf) { + assert((parent_iov_offset % blocklen) > 0); + md_buf = (char *)bdev_io->u.bdev.md_buf + (parent_iov_offset / blocklen) * 
+ spdk_bdev_get_md_size(bdev_io->bdev); + } + + while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && + child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { + parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; + iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset); + to_next_boundary_bytes -= iov_len; + + bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; + bdev_io->child_iov[child_iovcnt].iov_len = iov_len; + + if (iov_len < parent_iov->iov_len - parent_iov_offset) { + parent_iov_offset += iov_len; + } else { + parent_iovpos++; + parent_iov_offset = 0; + } + child_iovcnt++; + iovcnt++; + } + + if (to_next_boundary_bytes > 0) { + /* We had to stop this child I/O early because we ran out of + * child_iov space. Ensure the iovs to be aligned with block + * size and then adjust to_next_boundary before starting the + * child I/O. + */ + assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV); + to_last_block_bytes = to_next_boundary_bytes % blocklen; + if (to_last_block_bytes != 0) { + uint32_t child_iovpos = child_iovcnt - 1; + /* don't decrease child_iovcnt so the loop will naturally end */ + + to_last_block_bytes = blocklen - to_last_block_bytes; + to_next_boundary_bytes += to_last_block_bytes; + while (to_last_block_bytes > 0 && iovcnt > 0) { + iov_len = spdk_min(to_last_block_bytes, + bdev_io->child_iov[child_iovpos].iov_len); + bdev_io->child_iov[child_iovpos].iov_len -= iov_len; + if (bdev_io->child_iov[child_iovpos].iov_len == 0) { + child_iovpos--; + if (--iovcnt == 0) { + return; + } + } + to_last_block_bytes -= iov_len; + } + + assert(to_last_block_bytes == 0); + } + to_next_boundary -= to_next_boundary_bytes / blocklen; + } + + bdev_io->u.bdev.split_outstanding++; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + iov, iovcnt, md_buf, current_offset, + to_next_boundary, + bdev_io_split_done, bdev_io); + } else { + rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + iov, iovcnt, md_buf, current_offset, + to_next_boundary, + bdev_io_split_done, bdev_io); + } + + if (rc == 0) { + current_offset += to_next_boundary; + remaining -= to_next_boundary; + bdev_io->u.bdev.split_current_offset_blocks = current_offset; + bdev_io->u.bdev.split_remaining_num_blocks = remaining; + } else { + bdev_io->u.bdev.split_outstanding--; + if (rc == -ENOMEM) { + if (bdev_io->u.bdev.split_outstanding == 0) { + /* No I/O is outstanding. Hence we should wait here. */ + bdev_queue_io_wait_with_cb(bdev_io, _bdev_io_split); + } + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + if (bdev_io->u.bdev.split_outstanding == 0) { + spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, + (uintptr_t)bdev_io, 0); + TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } + } + + return; + } + } +} + +static void +bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + /* If any child I/O failed, stop further splitting process. 
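For intuition about the arithmetic driving this split loop, a worked example with made-up numbers: with optimal_io_boundary = 128, an 80-block I/O starting at block 100 has start_stripe = 0 and end_stripe = 1, so it must be split; the first child carries _to_next_boundary(100, 128) = 28 blocks and the second the remaining 52. The same calculation as a standalone sketch:

#include <stdint.h>
#include <stdio.h>

/* Blocks from offset to the next multiple of boundary (same formula as above). */
static uint32_t
to_next_boundary(uint64_t offset, uint32_t boundary)
{
	return boundary - (offset % boundary);
}

int
main(void)
{
	/* Illustrative values: an 80-block I/O at block 100, 128-block boundary. */
	uint64_t offset = 100, remaining = 80;
	uint32_t boundary = 128;

	while (remaining > 0) {
		uint64_t child = to_next_boundary(offset, boundary);

		if (child > remaining) {
			child = remaining;
		}
		printf("child I/O: offset %llu, %llu blocks\n",
		       (unsigned long long)offset, (unsigned long long)child);
		offset += child;
		remaining -= child;
	}
	return 0;
}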
*/ + parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; + parent_io->u.bdev.split_remaining_num_blocks = 0; + } + parent_io->u.bdev.split_outstanding--; + if (parent_io->u.bdev.split_outstanding != 0) { + return; + } + + /* + * Parent I/O finishes when all blocks are consumed. + */ + if (parent_io->u.bdev.split_remaining_num_blocks == 0) { + assert(parent_io->internal.cb != bdev_io_split_done); + spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0, + (uintptr_t)parent_io, 0); + TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); + parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, + parent_io->internal.caller_ctx); + return; + } + + /* + * Continue with the splitting process. This function will complete the parent I/O if the + * splitting is done. + */ + _bdev_io_split(parent_io); +} + +static void +bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); + +static void +bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + assert(bdev_io_type_can_split(bdev_io->type)); + + bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; + bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; + bdev_io->u.bdev.split_outstanding = 0; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + + if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { + _bdev_io_split(bdev_io); + } else { + assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); + spdk_bdev_io_get_buf(bdev_io, bdev_io_split_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + } +} + +static void +bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + bdev_io_split(ch, bdev_io); +} + +/* Explicitly mark this inline, since it's used as a function pointer and otherwise won't + * be inlined, at least on some compilers. 
+ */ +static inline void +_bdev_io_submit(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + uint64_t tsc; + + tsc = spdk_get_ticks(); + bdev_io->internal.submit_tsc = tsc; + spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type); + + if (spdk_likely(bdev_ch->flags == 0)) { + bdev_io_do_submit(bdev_ch, bdev_io); + return; + } + + if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { + _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); + } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { + if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && + bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { + _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); + bdev_qos_io_submit(bdev_ch, bdev->internal.qos); + } + } else { + SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); + _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +bool +bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); + +bool +bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) +{ + if (range1->length == 0 || range2->length == 0) { + return false; + } + + if (range1->offset + range1->length <= range2->offset) { + return false; + } + + if (range2->offset + range2->length <= range1->offset) { + return false; + } + + return true; +} + +static bool +bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) +{ + struct spdk_bdev_channel *ch = bdev_io->internal.ch; + struct lba_range r; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + /* Don't try to decode the NVMe command - just assume worst-case and that + * it overlaps a locked range. + */ + return true; + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + case SPDK_BDEV_IO_TYPE_ZCOPY: + r.offset = bdev_io->u.bdev.offset_blocks; + r.length = bdev_io->u.bdev.num_blocks; + if (!bdev_lba_range_overlapped(range, &r)) { + /* This I/O doesn't overlap the specified LBA range. */ + return false; + } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { + /* This I/O overlaps, but the I/O is on the same channel that locked this + * range, and the caller_ctx is the same as the locked_ctx. This means + * that this I/O is associated with the lock, and is allowed to execute. 
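The locked-range check above ultimately rests on bdev_lba_range_overlapped(): two block ranges conflict unless one of them is empty or ends at or before the other begins. A standalone restatement with a couple of sanity checks (the struct name and the sample offsets are illustrative):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct range {
	uint64_t offset;
	uint64_t length;
};

/* Mirrors the logic of bdev_lba_range_overlapped(): empty ranges never overlap,
 * otherwise the intervals [offset, offset + length) must intersect. */
static bool
ranges_overlap(const struct range *a, const struct range *b)
{
	if (a->length == 0 || b->length == 0) {
		return false;
	}
	if (a->offset + a->length <= b->offset) {
		return false;
	}
	if (b->offset + b->length <= a->offset) {
		return false;
	}
	return true;
}

int
main(void)
{
	struct range locked = { .offset = 100, .length = 50 };   /* blocks 100..149 */
	struct range touching = { .offset = 150, .length = 10 }; /* starts where locked ends */
	struct range inside = { .offset = 120, .length = 4 };

	assert(!ranges_overlap(&locked, &touching));
	assert(ranges_overlap(&locked, &inside));
	return 0;
}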
+ */ + return false; + } else { + return true; + } + default: + return false; + } +} + +void +bdev_io_submit(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); + struct spdk_bdev_channel *ch = bdev_io->internal.ch; + + assert(thread != NULL); + assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); + + if (!TAILQ_EMPTY(&ch->locked_ranges)) { + struct lba_range *range; + + TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { + if (bdev_io_range_is_locked(bdev_io, range)) { + TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); + return; + } + } + } + + TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); + + if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bdev_io)) { + bdev_io->internal.submit_tsc = spdk_get_ticks(); + spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, + (uintptr_t)bdev_io, bdev_io->type); + bdev_io_split(NULL, bdev_io); + return; + } + + if (ch->flags & BDEV_CH_QOS_ENABLED) { + if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { + _bdev_io_submit(bdev_io); + } else { + bdev_io->internal.io_submit_ch = ch; + bdev_io->internal.ch = bdev->internal.qos->ch; + spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); + } + } else { + _bdev_io_submit(bdev_io); + } +} + +static void +bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + struct spdk_io_channel *ch = bdev_ch->channel; + + assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); + + bdev_io->internal.in_submit_request = true; + bdev->fn_table->submit_request(ch, bdev_io); + bdev_io->internal.in_submit_request = false; +} + +void +bdev_io_init(struct spdk_bdev_io *bdev_io, + struct spdk_bdev *bdev, void *cb_arg, + spdk_bdev_io_completion_cb cb) +{ + bdev_io->bdev = bdev; + bdev_io->internal.caller_ctx = cb_arg; + bdev_io->internal.cb = cb; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; + bdev_io->internal.in_submit_request = false; + bdev_io->internal.buf = NULL; + bdev_io->internal.io_submit_ch = NULL; + bdev_io->internal.orig_iovs = NULL; + bdev_io->internal.orig_iovcnt = 0; + bdev_io->internal.orig_md_buf = NULL; + bdev_io->internal.error.nvme.cdw0 = 0; + bdev_io->num_retries = 0; + bdev_io->internal.get_buf_cb = NULL; + bdev_io->internal.get_aux_buf_cb = NULL; +} + +static bool +bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) +{ + return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); +} + +bool +spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) +{ + bool supported; + + supported = bdev_io_type_supported(bdev, io_type); + + if (!supported) { + switch (io_type) { + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + /* The bdev layer will emulate write zeroes as long as write is supported. 
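Because spdk_bdev_io_type_supported() folds in the emulation cases handled in this switch, a caller can branch on it without knowing whether support is native. A small hedged sketch using only the function and constants visible in this file (the helper names below are made up, and whether an application needs such wrappers at all is its own choice):

#include <stdbool.h>
#include "spdk/bdev.h"

/* Illustrative helpers, not SPDK API. */
bool
can_write_zeroes(struct spdk_bdev *bdev)
{
	/* True when the module supports it natively or the bdev layer can
	 * emulate it on top of plain writes (see the switch above). */
	return spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES);
}

bool
can_unmap(struct spdk_bdev *bdev)
{
	/* UNMAP has no emulation path here, so this reflects native support only. */
	return spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP);
}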
*/ + supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); + break; + case SPDK_BDEV_IO_TYPE_ZCOPY: + /* Zero copy can be emulated with regular read and write */ + supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ) && + bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); + break; + default: + break; + } + } + + return supported; +} + +int +spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + if (bdev->fn_table->dump_info_json) { + return bdev->fn_table->dump_info_json(bdev->ctxt, w); + } + + return 0; +} + +static void +bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) +{ + uint32_t max_per_timeslice = 0; + int i; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + qos->rate_limits[i].max_per_timeslice = 0; + continue; + } + + max_per_timeslice = qos->rate_limits[i].limit * + SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; + + qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, + qos->rate_limits[i].min_per_timeslice); + + qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; + } + + bdev_qos_set_ops(qos); +} + +static int +bdev_channel_poll_qos(void *arg) +{ + struct spdk_bdev_qos *qos = arg; + uint64_t now = spdk_get_ticks(); + int i; + + if (now < (qos->last_timeslice + qos->timeslice_size)) { + /* We received our callback earlier than expected - return + * immediately and wait to do accounting until at least one + * timeslice has actually expired. This should never happen + * with a well-behaved timer implementation. + */ + return SPDK_POLLER_IDLE; + } + + /* Reset for next round of rate limiting */ + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + /* We may have allowed the IOs or bytes to slightly overrun in the last + * timeslice. remaining_this_timeslice is signed, so if it's negative + * here, we'll account for the overrun so that the next timeslice will + * be appropriately reduced. + */ + if (qos->rate_limits[i].remaining_this_timeslice > 0) { + qos->rate_limits[i].remaining_this_timeslice = 0; + } + } + + while (now >= (qos->last_timeslice + qos->timeslice_size)) { + qos->last_timeslice += qos->timeslice_size; + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + qos->rate_limits[i].remaining_this_timeslice += + qos->rate_limits[i].max_per_timeslice; + } + } + + return bdev_qos_io_submit(qos->ch, qos); +} + +static void +bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) +{ + struct spdk_bdev_shared_resource *shared_resource; + struct lba_range *range; + + while (!TAILQ_EMPTY(&ch->locked_ranges)) { + range = TAILQ_FIRST(&ch->locked_ranges); + TAILQ_REMOVE(&ch->locked_ranges, range, tailq); + free(range); + } + + spdk_put_io_channel(ch->channel); + + shared_resource = ch->shared_resource; + + assert(TAILQ_EMPTY(&ch->io_locked)); + assert(TAILQ_EMPTY(&ch->io_submitted)); + assert(ch->io_outstanding == 0); + assert(shared_resource->ref > 0); + shared_resource->ref--; + if (shared_resource->ref == 0) { + assert(shared_resource->io_outstanding == 0); + TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); + spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); + free(shared_resource); + } +} + +/* Caller must hold bdev->internal.mutex. 
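bdev_channel_poll_qos() above is a timeslice-based rate limiter: each expired timeslice adds max_per_timeslice worth of quota, quota left unused is discarded, and an overrun (remaining going negative) is paid back out of the next refill. A reduced standalone model of that accounting, with illustrative numbers and none of the per-limit-type plumbing:

#include <stdint.h>
#include <stdio.h>

struct limiter {
	int64_t remaining;        /* quota left this timeslice; may go negative */
	int64_t max_per_timeslice;
	uint64_t last_timeslice;  /* in ticks */
	uint64_t timeslice_size;  /* in ticks */
};

/* Refill modeled on bdev_channel_poll_qos(): discard unused quota, then add
 * one timeslice worth of quota per elapsed timeslice. */
static void
limiter_poll(struct limiter *lim, uint64_t now)
{
	if (now < lim->last_timeslice + lim->timeslice_size) {
		return;   /* called early; nothing to account for yet */
	}
	if (lim->remaining > 0) {
		lim->remaining = 0;   /* unused quota does not carry over */
	}
	while (now >= lim->last_timeslice + lim->timeslice_size) {
		lim->last_timeslice += lim->timeslice_size;
		lim->remaining += lim->max_per_timeslice;   /* overruns are repaid here */
	}
}

int
main(void)
{
	struct limiter lim = { .remaining = -300, .max_per_timeslice = 1000,
			       .last_timeslice = 0, .timeslice_size = 10 };

	limiter_poll(&lim, 25);   /* two timeslices have elapsed */
	printf("remaining after refill: %lld\n", (long long)lim.remaining);
	return 0;
}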
*/ +static void +bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) +{ + struct spdk_bdev_qos *qos = bdev->internal.qos; + int i; + + /* Rate limiting on this bdev enabled */ + if (qos) { + if (qos->ch == NULL) { + struct spdk_io_channel *io_ch; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, + bdev->name, spdk_get_thread()); + + /* No qos channel has been selected, so set one up */ + + /* Take another reference to ch */ + io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); + assert(io_ch != NULL); + qos->ch = ch; + + qos->thread = spdk_io_channel_get_thread(io_ch); + + TAILQ_INIT(&qos->queued); + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (bdev_qos_is_iops_rate_limit(i) == true) { + qos->rate_limits[i].min_per_timeslice = + SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; + } else { + qos->rate_limits[i].min_per_timeslice = + SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; + } + + if (qos->rate_limits[i].limit == 0) { + qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; + } + } + bdev_qos_update_max_quota_per_timeslice(qos); + qos->timeslice_size = + SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; + qos->last_timeslice = spdk_get_ticks(); + qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, + qos, + SPDK_BDEV_QOS_TIMESLICE_IN_USEC); + } + + ch->flags |= BDEV_CH_QOS_ENABLED; + } +} + +struct poll_timeout_ctx { + struct spdk_bdev_desc *desc; + uint64_t timeout_in_sec; + spdk_bdev_io_timeout_cb cb_fn; + void *cb_arg; +}; + +static void +bdev_desc_free(struct spdk_bdev_desc *desc) +{ + pthread_mutex_destroy(&desc->mutex); + free(desc->media_events_buffer); + free(desc); +} + +static void +bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) +{ + struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_bdev_desc *desc = ctx->desc; + + free(ctx); + + pthread_mutex_lock(&desc->mutex); + desc->refs--; + if (desc->closed == true && desc->refs == 0) { + pthread_mutex_unlock(&desc->mutex); + bdev_desc_free(desc); + return; + } + pthread_mutex_unlock(&desc->mutex); +} + +static void +bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) +{ + struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); + struct spdk_bdev_desc *desc = ctx->desc; + struct spdk_bdev_io *bdev_io; + uint64_t now; + + pthread_mutex_lock(&desc->mutex); + if (desc->closed == true) { + pthread_mutex_unlock(&desc->mutex); + spdk_for_each_channel_continue(i, -1); + return; + } + pthread_mutex_unlock(&desc->mutex); + + now = spdk_get_ticks(); + TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { + /* Exclude any I/O that are generated via splitting. */ + if (bdev_io->internal.cb == bdev_io_split_done) { + continue; + } + + /* Once we find an I/O that has not timed out, we can immediately + * exit the loop. 
+ */ + if (now < (bdev_io->internal.submit_tsc + + ctx->timeout_in_sec * spdk_get_ticks_hz())) { + goto end; + } + + if (bdev_io->internal.desc == desc) { + ctx->cb_fn(ctx->cb_arg, bdev_io); + } + } + +end: + spdk_for_each_channel_continue(i, 0); +} + +static int +bdev_poll_timeout_io(void *arg) +{ + struct spdk_bdev_desc *desc = arg; + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct poll_timeout_ctx *ctx; + + ctx = calloc(1, sizeof(struct poll_timeout_ctx)); + if (!ctx) { + SPDK_ERRLOG("failed to allocate memory\n"); + return SPDK_POLLER_BUSY; + } + ctx->desc = desc; + ctx->cb_arg = desc->cb_arg; + ctx->cb_fn = desc->cb_fn; + ctx->timeout_in_sec = desc->timeout_in_sec; + + /* Take a ref on the descriptor in case it gets closed while we are checking + * all of the channels. + */ + pthread_mutex_lock(&desc->mutex); + desc->refs++; + pthread_mutex_unlock(&desc->mutex); + + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_channel_poll_timeout_io, + ctx, + bdev_channel_poll_timeout_io_done); + + return SPDK_POLLER_BUSY; +} + +int +spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, + spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) +{ + assert(desc->thread == spdk_get_thread()); + + spdk_poller_unregister(&desc->io_timeout_poller); + + if (timeout_in_sec) { + assert(cb_fn != NULL); + desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, + desc, + SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / + 1000); + if (desc->io_timeout_poller == NULL) { + SPDK_ERRLOG("can not register the desc timeout IO poller\n"); + return -1; + } + } + + desc->cb_fn = cb_fn; + desc->cb_arg = cb_arg; + desc->timeout_in_sec = timeout_in_sec; + + return 0; +} + +static int +bdev_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); + struct spdk_bdev_channel *ch = ctx_buf; + struct spdk_io_channel *mgmt_io_ch; + struct spdk_bdev_mgmt_channel *mgmt_ch; + struct spdk_bdev_shared_resource *shared_resource; + struct lba_range *range; + + ch->bdev = bdev; + ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); + if (!ch->channel) { + return -1; + } + + assert(ch->histogram == NULL); + if (bdev->internal.histogram_enabled) { + ch->histogram = spdk_histogram_data_alloc(); + if (ch->histogram == NULL) { + SPDK_ERRLOG("Could not allocate histogram\n"); + } + } + + mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); + if (!mgmt_io_ch) { + spdk_put_io_channel(ch->channel); + return -1; + } + + mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); + TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { + if (shared_resource->shared_ch == ch->channel) { + spdk_put_io_channel(mgmt_io_ch); + shared_resource->ref++; + break; + } + } + + if (shared_resource == NULL) { + shared_resource = calloc(1, sizeof(*shared_resource)); + if (shared_resource == NULL) { + spdk_put_io_channel(ch->channel); + spdk_put_io_channel(mgmt_io_ch); + return -1; + } + + shared_resource->mgmt_ch = mgmt_ch; + shared_resource->io_outstanding = 0; + TAILQ_INIT(&shared_resource->nomem_io); + shared_resource->nomem_threshold = 0; + shared_resource->shared_ch = ch->channel; + shared_resource->ref = 1; + TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); + } + + memset(&ch->stat, 0, sizeof(ch->stat)); + ch->stat.ticks_rate = spdk_get_ticks_hz(); + ch->io_outstanding = 0; + TAILQ_INIT(&ch->queued_resets); + TAILQ_INIT(&ch->locked_ranges); + ch->flags = 0; + ch->shared_resource = shared_resource; + + TAILQ_INIT(&ch->io_submitted); 
+ TAILQ_INIT(&ch->io_locked); + +#ifdef SPDK_CONFIG_VTUNE + { + char *name; + __itt_init_ittlib(NULL, 0); + name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); + if (!name) { + bdev_channel_destroy_resource(ch); + return -1; + } + ch->handle = __itt_string_handle_create(name); + free(name); + ch->start_tsc = spdk_get_ticks(); + ch->interval_tsc = spdk_get_ticks_hz() / 100; + memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); + } +#endif + + pthread_mutex_lock(&bdev->internal.mutex); + bdev_enable_qos(bdev, ch); + + TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { + struct lba_range *new_range; + + new_range = calloc(1, sizeof(*new_range)); + if (new_range == NULL) { + pthread_mutex_unlock(&bdev->internal.mutex); + bdev_channel_destroy_resource(ch); + return -1; + } + new_range->length = range->length; + new_range->offset = range->offset; + new_range->locked_ctx = range->locked_ctx; + TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); + } + + pthread_mutex_unlock(&bdev->internal.mutex); + + return 0; +} + +/* + * Abort I/O that are waiting on a data buffer. These types of I/O are + * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. + */ +static void +bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) +{ + bdev_io_stailq_t tmp; + struct spdk_bdev_io *bdev_io; + + STAILQ_INIT(&tmp); + + while (!STAILQ_EMPTY(queue)) { + bdev_io = STAILQ_FIRST(queue); + STAILQ_REMOVE_HEAD(queue, internal.buf_link); + if (bdev_io->internal.ch == ch) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); + } else { + STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); + } + } + + STAILQ_SWAP(&tmp, queue, spdk_bdev_io); +} + +/* + * Abort I/O that are queued waiting for submission. These types of I/O are + * linked using the spdk_bdev_io link TAILQ_ENTRY. + */ +static void +bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) +{ + struct spdk_bdev_io *bdev_io, *tmp; + + TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { + if (bdev_io->internal.ch == ch) { + TAILQ_REMOVE(queue, bdev_io, internal.link); + /* + * spdk_bdev_io_complete() assumes that the completed I/O had + * been submitted to the bdev module. Since in this case it + * hadn't, bump io_outstanding to account for the decrement + * that spdk_bdev_io_complete() will do. 
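bdev_abort_all_queued_io() above filters a queue while walking it, which is why it uses the _SAFE iteration variant. The same idea written with only the plain TAILQ macros, saving the next pointer before a possible removal; the io struct and its channel_id field are stand-ins for spdk_bdev_io and internal.ch:

#include <stdio.h>
#include <sys/queue.h>

struct io {
	int channel_id;               /* stand-in for bdev_io->internal.ch */
	TAILQ_ENTRY(io) link;
};

TAILQ_HEAD(io_list, io);

/* Same shape as bdev_abort_all_queued_io(): walk the queue, drop entries
 * belonging to one channel, and keep iterating safely via a saved next. */
static void
abort_for_channel(struct io_list *queue, int channel_id)
{
	struct io *cur = TAILQ_FIRST(queue);

	while (cur != NULL) {
		struct io *next = TAILQ_NEXT(cur, link);

		if (cur->channel_id == channel_id) {
			TAILQ_REMOVE(queue, cur, link);
			printf("aborted I/O on channel %d\n", cur->channel_id);
		}
		cur = next;
	}
}

int
main(void)
{
	struct io_list queue;
	struct io ios[3] = { { .channel_id = 1 }, { .channel_id = 2 }, { .channel_id = 1 } };
	int i;

	TAILQ_INIT(&queue);
	for (i = 0; i < 3; i++) {
		TAILQ_INSERT_TAIL(&queue, &ios[i], link);
	}
	abort_for_channel(&queue, 1);   /* leaves only the channel-2 entry queued */
	return 0;
}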
+ */ + if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { + ch->io_outstanding++; + ch->shared_resource->io_outstanding++; + } + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); + } + } +} + +static bool +bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) +{ + struct spdk_bdev_io *bdev_io; + + TAILQ_FOREACH(bdev_io, queue, internal.link) { + if (bdev_io == bio_to_abort) { + TAILQ_REMOVE(queue, bio_to_abort, internal.link); + spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); + return true; + } + } + + return false; +} + +static bool +bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) +{ + struct spdk_bdev_io *bdev_io; + + STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { + if (bdev_io == bio_to_abort) { + STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); + spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); + return true; + } + } + + return false; +} + +static void +bdev_qos_channel_destroy(void *cb_arg) +{ + struct spdk_bdev_qos *qos = cb_arg; + + spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); + spdk_poller_unregister(&qos->poller); + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos); + + free(qos); +} + +static int +bdev_qos_destroy(struct spdk_bdev *bdev) +{ + int i; + + /* + * Cleanly shutting down the QoS poller is tricky, because + * during the asynchronous operation the user could open + * a new descriptor and create a new channel, spawning + * a new QoS poller. + * + * The strategy is to create a new QoS structure here and swap it + * in. The shutdown path then continues to refer to the old one + * until it completes and then releases it. + */ + struct spdk_bdev_qos *new_qos, *old_qos; + + old_qos = bdev->internal.qos; + + new_qos = calloc(1, sizeof(*new_qos)); + if (!new_qos) { + SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); + return -ENOMEM; + } + + /* Copy the old QoS data into the newly allocated structure */ + memcpy(new_qos, old_qos, sizeof(*new_qos)); + + /* Zero out the key parts of the QoS structure */ + new_qos->ch = NULL; + new_qos->thread = NULL; + new_qos->poller = NULL; + TAILQ_INIT(&new_qos->queued); + /* + * The limit member of spdk_bdev_qos_limit structure is not zeroed. + * It will be used later for the new QoS structure. + */ + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + new_qos->rate_limits[i].remaining_this_timeslice = 0; + new_qos->rate_limits[i].min_per_timeslice = 0; + new_qos->rate_limits[i].max_per_timeslice = 0; + } + + bdev->internal.qos = new_qos; + + if (old_qos->thread == NULL) { + free(old_qos); + } else { + spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); + } + + /* It is safe to continue with destroying the bdev even though the QoS channel hasn't + * been destroyed yet. The destruction path will end up waiting for the final + * channel to be put before it releases resources. 
*/ + + return 0; +} + +static void +bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) +{ + total->bytes_read += add->bytes_read; + total->num_read_ops += add->num_read_ops; + total->bytes_written += add->bytes_written; + total->num_write_ops += add->num_write_ops; + total->bytes_unmapped += add->bytes_unmapped; + total->num_unmap_ops += add->num_unmap_ops; + total->read_latency_ticks += add->read_latency_ticks; + total->write_latency_ticks += add->write_latency_ticks; + total->unmap_latency_ticks += add->unmap_latency_ticks; +} + +static void +bdev_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_channel *ch = ctx_buf; + struct spdk_bdev_mgmt_channel *mgmt_ch; + struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, + spdk_get_thread()); + + /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ + pthread_mutex_lock(&ch->bdev->internal.mutex); + bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); + pthread_mutex_unlock(&ch->bdev->internal.mutex); + + mgmt_ch = shared_resource->mgmt_ch; + + bdev_abort_all_queued_io(&ch->queued_resets, ch); + bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); + bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); + bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); + + if (ch->histogram) { + spdk_histogram_data_free(ch->histogram); + } + + bdev_channel_destroy_resource(ch); +} + +int +spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) +{ + struct spdk_bdev_alias *tmp; + + if (alias == NULL) { + SPDK_ERRLOG("Empty alias passed\n"); + return -EINVAL; + } + + if (spdk_bdev_get_by_name(alias)) { + SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); + return -EEXIST; + } + + tmp = calloc(1, sizeof(*tmp)); + if (tmp == NULL) { + SPDK_ERRLOG("Unable to allocate alias\n"); + return -ENOMEM; + } + + tmp->alias = strdup(alias); + if (tmp->alias == NULL) { + free(tmp); + SPDK_ERRLOG("Unable to allocate alias\n"); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); + + return 0; +} + +int +spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) +{ + struct spdk_bdev_alias *tmp; + + TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { + if (strcmp(alias, tmp->alias) == 0) { + TAILQ_REMOVE(&bdev->aliases, tmp, tailq); + free(tmp->alias); + free(tmp); + return 0; + } + } + + SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exists\n", alias); + + return -ENOENT; +} + +void +spdk_bdev_alias_del_all(struct spdk_bdev *bdev) +{ + struct spdk_bdev_alias *p, *tmp; + + TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { + TAILQ_REMOVE(&bdev->aliases, p, tailq); + free(p->alias); + free(p); + } +} + +struct spdk_io_channel * +spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) +{ + return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); +} + +const char * +spdk_bdev_get_name(const struct spdk_bdev *bdev) +{ + return bdev->name; +} + +const char * +spdk_bdev_get_product_name(const struct spdk_bdev *bdev) +{ + return bdev->product_name; +} + +const struct spdk_bdev_aliases_list * +spdk_bdev_get_aliases(const struct spdk_bdev *bdev) +{ + return &bdev->aliases; +} + +uint32_t +spdk_bdev_get_block_size(const struct spdk_bdev *bdev) +{ + return bdev->blocklen; +} + +uint32_t +spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) +{ + return bdev->write_unit_size; +} + +uint64_t 
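spdk_bdev_alias_add()/spdk_bdev_alias_del() above duplicate the string and keep it on the bdev's alias list, rejecting names that already resolve to a bdev. A small usage sketch; the alias string and helper name are illustrative assumptions:

    #include "spdk/bdev.h"

    /* Returns -EEXIST if another bdev or alias already uses the name and
     * -ENOENT when deleting an alias that was never registered. */
    static int
    add_then_remove_alias(struct spdk_bdev *bdev)
    {
            int rc;

            rc = spdk_bdev_alias_add(bdev, "my_secondary_name");
            if (rc != 0) {
                    return rc;
            }

            return spdk_bdev_alias_del(bdev, "my_secondary_name");
    }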
+spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) +{ + return bdev->blockcnt; +} + +const char * +spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) +{ + return qos_rpc_type[type]; +} + +void +spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) +{ + int i; + + memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.qos) { + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (bdev->internal.qos->rate_limits[i].limit != + SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + limits[i] = bdev->internal.qos->rate_limits[i].limit; + if (bdev_qos_is_iops_rate_limit(i) == false) { + /* Change from Byte to Megabyte which is user visible. */ + limits[i] = limits[i] / 1024 / 1024; + } + } + } + } + pthread_mutex_unlock(&bdev->internal.mutex); +} + +size_t +spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) +{ + return 1 << bdev->required_alignment; +} + +uint32_t +spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) +{ + return bdev->optimal_io_boundary; +} + +bool +spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) +{ + return bdev->write_cache; +} + +const struct spdk_uuid * +spdk_bdev_get_uuid(const struct spdk_bdev *bdev) +{ + return &bdev->uuid; +} + +uint16_t +spdk_bdev_get_acwu(const struct spdk_bdev *bdev) +{ + return bdev->acwu; +} + +uint32_t +spdk_bdev_get_md_size(const struct spdk_bdev *bdev) +{ + return bdev->md_len; +} + +bool +spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) +{ + return (bdev->md_len != 0) && bdev->md_interleave; +} + +bool +spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) +{ + return (bdev->md_len != 0) && !bdev->md_interleave; +} + +bool +spdk_bdev_is_zoned(const struct spdk_bdev *bdev) +{ + return bdev->zoned; +} + +uint32_t +spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) +{ + if (spdk_bdev_is_md_interleaved(bdev)) { + return bdev->blocklen - bdev->md_len; + } else { + return bdev->blocklen; + } +} + +static uint32_t +_bdev_get_block_size_with_md(const struct spdk_bdev *bdev) +{ + if (!spdk_bdev_is_md_interleaved(bdev)) { + return bdev->blocklen + bdev->md_len; + } else { + return bdev->blocklen; + } +} + +enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) +{ + if (bdev->md_len != 0) { + return bdev->dif_type; + } else { + return SPDK_DIF_DISABLE; + } +} + +bool +spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) +{ + if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { + return bdev->dif_is_head_of_md; + } else { + return false; + } +} + +bool +spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, + enum spdk_dif_check_type check_type) +{ + if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { + return false; + } + + switch (check_type) { + case SPDK_DIF_CHECK_TYPE_REFTAG: + return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; + case SPDK_DIF_CHECK_TYPE_APPTAG: + return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; + case SPDK_DIF_CHECK_TYPE_GUARD: + return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; + default: + return false; + } +} + +uint64_t +spdk_bdev_get_qd(const struct spdk_bdev *bdev) +{ + return bdev->internal.measured_queue_depth; +} + +uint64_t +spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) +{ + return bdev->internal.period; +} + +uint64_t +spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) +{ + return bdev->internal.weighted_io_time; +} + +uint64_t +spdk_bdev_get_io_time(const 
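The getters above expose the bdev geometry; for metadata-capable bdevs the data block size can differ from the raw block size, and separate-metadata bdevs expect the caller to provide a metadata buffer. A sketch that derives the usable capacity from these accessors (helper names are illustrative):

    #include "spdk/stdinc.h"
    #include "spdk/bdev.h"

    /* Usable data bytes, excluding interleaved metadata if present. */
    static uint64_t
    bdev_data_capacity_bytes(const struct spdk_bdev *bdev)
    {
            return spdk_bdev_get_num_blocks(bdev) *
                   (uint64_t)spdk_bdev_get_data_block_size(bdev);
    }

    /* True when the caller must supply a separate metadata buffer
     * alongside the data buffer for read/write_with_md calls. */
    static bool
    bdev_needs_md_buf(const struct spdk_bdev *bdev)
    {
            return spdk_bdev_is_md_separate(bdev);
    }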
struct spdk_bdev *bdev) +{ + return bdev->internal.io_time; +} + +static void +_calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); + + bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; + + if (bdev->internal.measured_queue_depth) { + bdev->internal.io_time += bdev->internal.period; + bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; + } +} + +static void +_calculate_measured_qd(struct spdk_io_channel_iter *i) +{ + struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); + + bdev->internal.temporary_queue_depth += ch->io_outstanding; + spdk_for_each_channel_continue(i, 0); +} + +static int +bdev_calculate_measured_queue_depth(void *ctx) +{ + struct spdk_bdev *bdev = ctx; + bdev->internal.temporary_queue_depth = 0; + spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, + _calculate_measured_qd_cpl); + return SPDK_POLLER_BUSY; +} + +void +spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) +{ + bdev->internal.period = period; + + if (bdev->internal.qd_poller != NULL) { + spdk_poller_unregister(&bdev->internal.qd_poller); + bdev->internal.measured_queue_depth = UINT64_MAX; + } + + if (period != 0) { + bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, + period); + } +} + +static void +_resize_notify(void *arg) +{ + struct spdk_bdev_desc *desc = arg; + + pthread_mutex_lock(&desc->mutex); + desc->refs--; + if (!desc->closed) { + pthread_mutex_unlock(&desc->mutex); + desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, + desc->bdev, + desc->callback.ctx); + return; + } else if (0 == desc->refs) { + /* This descriptor was closed after this resize_notify message was sent. + * spdk_bdev_close() could not free the descriptor since this message was + * in flight, so we free it now using bdev_desc_free(). + */ + pthread_mutex_unlock(&desc->mutex); + bdev_desc_free(desc); + return; + } + pthread_mutex_unlock(&desc->mutex); +} + +int +spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) +{ + struct spdk_bdev_desc *desc; + int ret; + + pthread_mutex_lock(&bdev->internal.mutex); + + /* bdev has open descriptors */ + if (!TAILQ_EMPTY(&bdev->internal.open_descs) && + bdev->blockcnt > size) { + ret = -EBUSY; + } else { + bdev->blockcnt = size; + TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { + pthread_mutex_lock(&desc->mutex); + if (desc->callback.open_with_ext && !desc->closed) { + desc->refs++; + spdk_thread_send_msg(desc->thread, _resize_notify, desc); + } + pthread_mutex_unlock(&desc->mutex); + } + ret = 0; + } + + pthread_mutex_unlock(&bdev->internal.mutex); + + return ret; +} + +/* + * Convert I/O offset and length from bytes to blocks. + * + * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. + */ +static uint64_t +bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, + uint64_t num_bytes, uint64_t *num_blocks) +{ + uint32_t block_size = bdev->blocklen; + uint8_t shift_cnt; + + /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
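spdk_bdev_set_qd_sampling_period() above hands the period straight to SPDK_POLLER_REGISTER, so it is expressed in microseconds; passing 0 stops the sampling poller. A sketch that samples queue depth once per second and reads the result back (values and helper names are illustrative):

    #include "spdk/bdev.h"

    static void
    enable_qd_sampling(struct spdk_bdev *bdev)
    {
            /* Period in microseconds; 0 would disable sampling again. */
            spdk_bdev_set_qd_sampling_period(bdev, 1000 * 1000);
    }

    static uint64_t
    read_last_queue_depth(const struct spdk_bdev *bdev)
    {
            /* May read as UINT64_MAX until a full sampling interval
             * has elapsed. */
            return spdk_bdev_get_qd(bdev);
    }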
*/ + if (spdk_likely(spdk_u32_is_pow2(block_size))) { + shift_cnt = spdk_u32log2(block_size); + *offset_blocks = offset_bytes >> shift_cnt; + *num_blocks = num_bytes >> shift_cnt; + return (offset_bytes - (*offset_blocks << shift_cnt)) | + (num_bytes - (*num_blocks << shift_cnt)); + } else { + *offset_blocks = offset_bytes / block_size; + *num_blocks = num_bytes / block_size; + return (offset_bytes % block_size) | (num_bytes % block_size); + } +} + +static bool +bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) +{ + /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there + * has been an overflow and hence the offset has been wrapped around */ + if (offset_blocks + num_blocks < offset_blocks) { + return false; + } + + /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ + if (offset_blocks + num_blocks > bdev->blockcnt) { + return false; + } + + return true; +} + +static bool +_bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) +{ + return _is_buf_allocated(iovs) == (md_buf != NULL); +} + +static int +bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, + void *md_buf, int64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_READ; + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, int64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct iovec iov = { + .iov_base = buf, + }; + + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(&iov, md_buf)) { + return -EINVAL; + } + + return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, + cb, cb_arg); +} + +int +spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + 
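spdk_bdev_read() converts the byte offset/length to blocks (returning -EINVAL for anything not block-aligned) and forwards to spdk_bdev_read_blocks(). A minimal read sketch; the buffer size, offset and callback names are illustrative, and the completion must free the bdev_io:

    #include "spdk/bdev.h"
    #include "spdk/env.h"

    static void
    read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            /* cb_arg is the buffer passed to spdk_bdev_read_blocks(). */
            spdk_dma_free(cb_arg);
            spdk_bdev_free_io(bdev_io);
    }

    static int
    read_first_block(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
    {
            struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
            void *buf = spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev),
                                         spdk_bdev_get_buf_align(bdev), NULL);

            if (buf == NULL) {
                    return -ENOMEM;
            }

            /* -ENOMEM here means no spdk_bdev_io was available; see the
             * spdk_bdev_queue_io_wait() retry pattern further down. */
            return spdk_bdev_read_blocks(desc, ch, buf, 0, 1, read_done, buf);
    }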
uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); +} + +static int +bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, + uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_READ; + bdev_io->u.bdev.iovs = iov; + bdev_io->u.bdev.iovcnt = iovcnt; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, + num_blocks, cb, cb_arg); +} + +int +spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(iov, md_buf)) { + return -EINVAL; + } + + return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, + num_blocks, cb, cb_arg); +} + +static int +bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + 
nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, + cb, cb_arg); +} + +int +spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct iovec iov = { + .iov_base = buf, + }; + + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(&iov, md_buf)) { + return -EINVAL; + } + + return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, + cb, cb_arg); +} + +static int +bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; + bdev_io->u.bdev.iovs = iov; + bdev_io->u.bdev.iovcnt = iovcnt; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset, uint64_t len, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + len, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, + num_blocks, cb, cb_arg); +} + +int +spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(iov, md_buf)) { + return -EINVAL; + } + + return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, + num_blocks, cb, cb_arg); +} + +static void +bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; + int i, rc = 0; + + if (!success) { + 
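The writev variants above take caller-owned iovecs and require a descriptor opened for writing (otherwise -EBADF). A vectored write sketch covering two one-block segments; buffer names and the offset are illustrative:

    #include "spdk/bdev.h"

    static int
    write_two_buffers(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                      void *buf_a, void *buf_b, uint32_t block_size,
                      spdk_bdev_io_completion_cb cb, void *cb_arg)
    {
            /* Two one-block segments written contiguously at block 0. */
            struct iovec iov[2] = {
                    { .iov_base = buf_a, .iov_len = block_size },
                    { .iov_base = buf_b, .iov_len = block_size },
            };

            return spdk_bdev_writev_blocks(desc, ch, iov, 2, 0, 2, cb, cb_arg);
    }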
parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); + spdk_bdev_free_io(bdev_io); + return; + } + + for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { + rc = memcmp(read_buf, + parent_io->u.bdev.iovs[i].iov_base, + parent_io->u.bdev.iovs[i].iov_len); + if (rc) { + break; + } + read_buf += parent_io->u.bdev.iovs[i].iov_len; + } + + spdk_bdev_free_io(bdev_io); + + if (rc == 0) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); + } else { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; + parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); + } +} + +static void +bdev_compare_do_read(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + int rc; + + rc = spdk_bdev_read_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, + bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, + bdev_compare_do_read_done, bdev_io); + + if (rc == -ENOMEM) { + bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); + } else if (rc != 0) { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } +} + +static int +bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; + bdev_io->u.bdev.iovs = iov; + bdev_io->u.bdev.iovcnt = iovcnt; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { + bdev_io_submit(bdev_io); + return 0; + } + + bdev_compare_do_read(bdev_io); + + return 0; +} + +int +spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, + num_blocks, cb, cb_arg); +} + +int +spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(iov, md_buf)) { + return -EINVAL; + } + + return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, + num_blocks, cb, cb_arg); +} + +static int +bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io 
*bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { + bdev_io_submit(bdev_io); + return 0; + } + + bdev_compare_do_read(bdev_io); + + return 0; +} + +int +spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, + cb, cb_arg); +} + +int +spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct iovec iov = { + .iov_base = buf, + }; + + if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { + return -EINVAL; + } + + if (!_bdev_io_check_md_buf(&iov, md_buf)) { + return -EINVAL; + } + + return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, + cb, cb_arg); +} + +static void +bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) +{ + struct spdk_bdev_io *bdev_io = ctx; + + if (unlock_status) { + SPDK_ERRLOG("LBA range unlock failed\n"); + } + + bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : + false, bdev_io->internal.caller_ctx); +} + +static void +bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) +{ + bdev_io->internal.status = status; + + bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), + bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, + bdev_comparev_and_writev_blocks_unlocked, bdev_io); +} + +static void +bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + if (!success) { + SPDK_ERRLOG("Compare and write operation failed\n"); + } + + spdk_bdev_free_io(bdev_io); + + bdev_comparev_and_writev_blocks_unlock(parent_io, + success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); +} + +static void +bdev_compare_and_write_do_write(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + int rc; + + rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, + bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, + bdev_compare_and_write_do_write_done, bdev_io); + + + if (rc == -ENOMEM) { + bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); + } else if (rc != 0) { + bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); + return; + } + + bdev_compare_and_write_do_write(parent_io); +} + +static void +bdev_compare_and_write_do_compare(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + int rc; + + rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, + bdev_compare_and_write_do_compare_done, bdev_io); + + if (rc == -ENOMEM) { + bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); + } else if (rc != 0) { + bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); + } +} + +static void +bdev_comparev_and_writev_blocks_locked(void *ctx, int status) +{ + struct spdk_bdev_io *bdev_io = ctx; + + if (status) { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } + + bdev_compare_and_write_do_compare(bdev_io); +} + +int +spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *compare_iov, int compare_iovcnt, + struct iovec *write_iov, int write_iovcnt, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + if (num_blocks > bdev->acwu) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; + bdev_io->u.bdev.iovs = compare_iov; + bdev_io->u.bdev.iovcnt = compare_iovcnt; + bdev_io->u.bdev.fused_iovs = write_iov; + bdev_io->u.bdev.fused_iovcnt = write_iovcnt; + bdev_io->u.bdev.md_buf = NULL; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { + bdev_io_submit(bdev_io); + return 0; + } + + return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, + bdev_comparev_and_writev_blocks_locked, bdev_io); +} + +static void +bdev_zcopy_get_buf(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + if (!success) { + /* Don't use 
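spdk_bdev_comparev_and_writev_blocks() either hands the fused operation to the module or, when COMPARE_AND_WRITE is not natively supported, emulates it by locking the LBA range, comparing, then writing. A caller-side sketch; num_blocks must not exceed the bdev's atomic compare-and-write unit (spdk_bdev_get_acwu()), and the helper name is illustrative:

    #include "spdk/bdev.h"

    static int
    cas_one_block(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                  struct iovec *expected, struct iovec *new_data,
                  uint64_t lba, spdk_bdev_io_completion_cb cb, void *cb_arg)
    {
            /* Completes with a miscompare status if the on-disk data does
             * not match 'expected'; the write is then not performed. */
            return spdk_bdev_comparev_and_writev_blocks(desc, ch,
                                                        expected, 1,
                                                        new_data, 1,
                                                        lba, 1, cb, cb_arg);
    }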
spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; + bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx); + return; + } + + if (bdev_io->u.bdev.zcopy.populate) { + /* Read the real data into the buffer */ + bdev_io->type = SPDK_BDEV_IO_TYPE_READ; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; + bdev_io_submit(bdev_io); + return; + } + + /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */ + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx); +} + +int +spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + bool populate, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { + return -ENOTSUP; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.iovs = NULL; + bdev_io->u.bdev.iovcnt = 0; + bdev_io->u.bdev.md_buf = NULL; + bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; + bdev_io->u.bdev.zcopy.commit = 0; + bdev_io->u.bdev.zcopy.start = 1; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { + bdev_io_submit(bdev_io); + } else { + /* Emulate zcopy by allocating a buffer */ + spdk_bdev_io_get_buf(bdev_io, bdev_zcopy_get_buf, + bdev_io->u.bdev.num_blocks * bdev->blocklen); + } + + return 0; +} + +int +spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + /* This can happen if the zcopy was emulated in start */ + if (bdev_io->u.bdev.zcopy.start != 1) { + return -EINVAL; + } + bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; + } + + if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { + return -EINVAL; + } + + bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0; + bdev_io->u.bdev.zcopy.start = 0; + bdev_io->internal.caller_ctx = cb_arg; + bdev_io->internal.cb = cb; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { + bdev_io_submit(bdev_io); + return 0; + } + + if (!bdev_io->u.bdev.zcopy.commit) { + /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. 
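spdk_bdev_zcopy_start() above either asks the module for a zero-copy buffer or emulates one by allocating a bounce buffer; spdk_bdev_zcopy_end() later commits or abandons it. A sketch of the populate-then-commit sequence for block 0 (callback names are illustrative):

    #include "spdk/bdev.h"

    static void
    zcopy_end_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            spdk_bdev_free_io(bdev_io);
    }

    static void
    zcopy_start_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            if (!success) {
                    spdk_bdev_free_io(bdev_io);
                    return;
            }

            /* bdev_io->u.bdev.iovs now describes the (possibly emulated)
             * buffer; after modifying it, commit the data back. */
            spdk_bdev_zcopy_end(bdev_io, true, zcopy_end_done, NULL);
    }

    static int
    zcopy_rewrite_block(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
    {
            /* populate=true reads block 0 into the zero-copy buffer first. */
            return spdk_bdev_zcopy_start(desc, ch, 0, 1, true,
                                         zcopy_start_done, NULL);
    }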
*/ + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); + return 0; + } + + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; + bdev_io_submit(bdev_io); + + return 0; +} + +int +spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset, uint64_t len, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + len, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && + !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { + return -ENOTSUP; + } + + bdev_io = bdev_channel_get_io(channel); + + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { + bdev_io_submit(bdev_io); + return 0; + } + + assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); + assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); + bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; + bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; + bdev_write_zero_buffer_next(bdev_io); + + return 0; +} + +int +spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + if (num_blocks == 0) { + SPDK_ERRLOG("Can't unmap 0 bytes\n"); + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; + + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = NULL; + bdev_io->u.bdev.iovs[0].iov_len = 0; + bdev_io->u.bdev.iovcnt = 1; + + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, 
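spdk_bdev_write_zeroes_blocks() falls back to plain writes of a zero buffer when WRITE_ZEROES is unsupported, and spdk_bdev_unmap_blocks() rejects zero-length requests. A short sketch for both; the 1024-block range and helper names are illustrative:

    #include "spdk/bdev.h"

    /* Deallocate the first 1024 blocks; the completion must free the
     * bdev_io with spdk_bdev_free_io(). */
    static int
    trim_head(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
              spdk_bdev_io_completion_cb cb, void *cb_arg)
    {
            return spdk_bdev_unmap_blocks(desc, ch, 0, 1024, cb, cb_arg);
    }

    /* Zero the same range explicitly; works even when the module only
     * implements WRITE, via the zero-buffer fallback above. */
    static int
    zero_head(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
              spdk_bdev_io_completion_cb cb, void *cb_arg)
    {
            return spdk_bdev_write_zeroes_blocks(desc, ch, 0, 1024, cb, cb_arg);
    }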
cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset, uint64_t length, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, + length, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; + bdev_io->u.bdev.iovs = NULL; + bdev_io->u.bdev.iovcnt = 0; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +static void +bdev_reset_dev(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); + struct spdk_bdev_io *bdev_io; + + bdev_io = TAILQ_FIRST(&ch->queued_resets); + TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); + bdev_io_submit_reset(bdev_io); +} + +static void +bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch; + struct spdk_bdev_channel *channel; + struct spdk_bdev_mgmt_channel *mgmt_channel; + struct spdk_bdev_shared_resource *shared_resource; + bdev_io_tailq_t tmp_queued; + + TAILQ_INIT(&tmp_queued); + + ch = spdk_io_channel_iter_get_channel(i); + channel = spdk_io_channel_get_ctx(ch); + shared_resource = channel->shared_resource; + mgmt_channel = shared_resource->mgmt_ch; + + channel->flags |= BDEV_CH_RESET_IN_PROGRESS; + + if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { + /* The QoS object is always valid and readable while + * the channel flag is set, so the lock here should not + * be necessary. We're not in the fast path though, so + * just take it anyway. 
*/ + pthread_mutex_lock(&channel->bdev->internal.mutex); + if (channel->bdev->internal.qos->ch == channel) { + TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); + } + pthread_mutex_unlock(&channel->bdev->internal.mutex); + } + + bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); + bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); + bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); + bdev_abort_all_queued_io(&tmp_queued, channel); + + spdk_for_each_channel_continue(i, 0); +} + +static void +bdev_start_reset(void *ctx) +{ + struct spdk_bdev_channel *ch = ctx; + + spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, + ch, bdev_reset_dev); +} + +static void +bdev_channel_start_reset(struct spdk_bdev_channel *ch) +{ + struct spdk_bdev *bdev = ch->bdev; + + assert(!TAILQ_EMPTY(&ch->queued_resets)); + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.reset_in_progress == NULL) { + bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); + /* + * Take a channel reference for the target bdev for the life of this + * reset. This guards against the channel getting destroyed while + * spdk_for_each_channel() calls related to this reset IO are in + * progress. We will release the reference when this reset is + * completed. + */ + bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); + bdev_start_reset(ch); + } + pthread_mutex_unlock(&bdev->internal.mutex); +} + +int +spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->internal.submit_tsc = spdk_get_ticks(); + bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; + bdev_io->u.reset.ch_ref = NULL; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + pthread_mutex_lock(&bdev->internal.mutex); + TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); + pthread_mutex_unlock(&bdev->internal.mutex); + + TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, + internal.ch_link); + + bdev_channel_start_reset(channel); + + return 0; +} + +void +spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, + struct spdk_bdev_io_stat *stat) +{ + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + *stat = channel->stat; +} + +static void +bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) +{ + void *io_device = spdk_io_channel_iter_get_io_device(i); + struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); + + bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, + bdev_iostat_ctx->cb_arg, 0); + free(bdev_iostat_ctx); +} + +static void +bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) +{ + struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); + spdk_for_each_channel_continue(i, 0); +} + +void +spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, + 
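spdk_bdev_reset() queues the reset on the channel and, once it becomes the active reset, freezes every channel and aborts queued I/O before handing the reset to the module. A caller-side sketch (callback and helper names are illustrative):

    #include "spdk/bdev.h"
    #include "spdk/log.h"

    static void
    reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            if (!success) {
                    SPDK_ERRLOG("bdev reset failed\n");
            }
            spdk_bdev_free_io(bdev_io);
    }

    static int
    reset_bdev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
    {
            /* -ENOMEM means no spdk_bdev_io was available on this channel. */
            return spdk_bdev_reset(desc, ch, reset_done, NULL);
    }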
spdk_bdev_get_device_stat_cb cb, void *cb_arg) +{ + struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; + + assert(bdev != NULL); + assert(stat != NULL); + assert(cb != NULL); + + bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); + if (bdev_iostat_ctx == NULL) { + SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); + cb(bdev, stat, cb_arg, -ENOMEM); + return; + } + + bdev_iostat_ctx->stat = stat; + bdev_iostat_ctx->cb = cb; + bdev_iostat_ctx->cb_arg = cb_arg; + + /* Start with the statistics from previously deleted channels. */ + pthread_mutex_lock(&bdev->internal.mutex); + bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); + pthread_mutex_unlock(&bdev->internal.mutex); + + /* Then iterate and add the statistics from each existing channel. */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_get_each_channel_stat, + bdev_iostat_ctx, + bdev_get_device_stat_done); +} + +int +spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; + bdev_io->u.nvme_passthru.cmd = *cmd; + bdev_io->u.nvme_passthru.buf = buf; + bdev_io->u.nvme_passthru.nbytes = nbytes; + bdev_io->u.nvme_passthru.md_buf = NULL; + bdev_io->u.nvme_passthru.md_len = 0; + + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + /* + * Do not try to parse the NVMe command - we could maybe use bits in the opcode + * to easily determine if the command is a read or write, but for now just + * do not allow io_passthru with a read-only descriptor. 
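spdk_bdev_get_device_stat() sums per-channel statistics asynchronously, starting from the totals of already-destroyed channels, so the caller-provided stat buffer must stay valid until the callback runs. A sketch; the heap-allocated stat and the fields logged are illustrative:

    #include "spdk/stdinc.h"
    #include "spdk/bdev.h"
    #include "spdk/log.h"

    static void
    device_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
                     void *cb_arg, int rc)
    {
            if (rc == 0) {
                    SPDK_NOTICELOG("%s: %" PRIu64 " reads, %" PRIu64 " writes\n",
                                   spdk_bdev_get_name(bdev),
                                   stat->num_read_ops, stat->num_write_ops);
            }
            free(stat);
    }

    static void
    dump_device_stat(struct spdk_bdev *bdev)
    {
            struct spdk_bdev_io_stat *stat = calloc(1, sizeof(*stat));

            if (stat != NULL) {
                    spdk_bdev_get_device_stat(bdev, stat, device_stat_done, NULL);
            }
    }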
+ */ + return -EBADF; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; + bdev_io->u.nvme_passthru.cmd = *cmd; + bdev_io->u.nvme_passthru.buf = buf; + bdev_io->u.nvme_passthru.nbytes = nbytes; + bdev_io->u.nvme_passthru.md_buf = NULL; + bdev_io->u.nvme_passthru.md_len = 0; + + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + /* + * Do not try to parse the NVMe command - we could maybe use bits in the opcode + * to easily determine if the command is a read or write, but for now just + * do not allow io_passthru with a read-only descriptor. + */ + return -EBADF; + } + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; + bdev_io->u.nvme_passthru.cmd = *cmd; + bdev_io->u.nvme_passthru.buf = buf; + bdev_io->u.nvme_passthru.nbytes = nbytes; + bdev_io->u.nvme_passthru.md_buf = md_buf; + bdev_io->u.nvme_passthru.md_len = md_len; + + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +static void bdev_abort_retry(void *ctx); +static void bdev_abort(struct spdk_bdev_io *parent_io); + +static void +bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_channel *channel = bdev_io->internal.ch; + struct spdk_bdev_io *parent_io = cb_arg; + struct spdk_bdev_io *bio_to_abort, *tmp_io; + + bio_to_abort = bdev_io->u.abort.bio_to_abort; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + /* Check if the target I/O completed in the meantime. */ + TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { + if (tmp_io == bio_to_abort) { + break; + } + } + + /* If the target I/O still exists, set the parent to failed. */ + if (tmp_io != NULL) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + } + } + + parent_io->u.bdev.split_outstanding--; + if (parent_io->u.bdev.split_outstanding == 0) { + if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { + bdev_abort_retry(parent_io); + } else { + bdev_io_complete(parent_io); + } + } +} + +static int +bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, + struct spdk_bdev_io *bio_to_abort, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + + if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || + bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { + /* TODO: Abort reset or abort request. 
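The NVMe passthru helpers forward a raw command to the module without inspecting the opcode, which is why a writable descriptor is required. A hedged admin-passthru sketch assuming spdk/nvme_spec.h for the IDENTIFY opcode and CNS constant; the 4 KiB buffer, the callback contract and the helper name are illustrative assumptions, not part of this patch:

    #include "spdk/bdev.h"
    #include "spdk/env.h"
    #include "spdk/nvme_spec.h"

    static int
    identify_controller(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
                        spdk_bdev_io_completion_cb cb)
    {
            struct spdk_nvme_cmd cmd = {0};
            void *buf = spdk_dma_zmalloc(4096, 0, NULL);

            if (buf == NULL) {
                    return -ENOMEM;
            }

            cmd.opc = SPDK_NVME_OPC_IDENTIFY;
            cmd.cdw10 = SPDK_NVME_IDENTIFY_CTRLR;

            /* The completion callback receives the bdev_io; it should free
             * 'buf' (passed here as cb_arg) and the bdev_io. */
            return spdk_bdev_nvme_admin_passthru(desc, ch, &cmd, buf, 4096,
                                                 cb, buf);
    }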
*/ + return -ENOTSUP; + } + + bdev_io = bdev_channel_get_io(channel); + if (bdev_io == NULL) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { + bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; + + /* Parent abort request is not submitted directly, but to manage its + * execution add it to the submitted list here. + */ + bdev_io->internal.submit_tsc = spdk_get_ticks(); + TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); + + bdev_abort(bdev_io); + + return 0; + } + + bdev_io->u.abort.bio_to_abort = bio_to_abort; + + /* Submit the abort request to the underlying bdev module. */ + bdev_io_submit(bdev_io); + + return 0; +} + +static uint32_t +_bdev_abort(struct spdk_bdev_io *parent_io) +{ + struct spdk_bdev_desc *desc = parent_io->internal.desc; + struct spdk_bdev_channel *channel = parent_io->internal.ch; + void *bio_cb_arg; + struct spdk_bdev_io *bio_to_abort; + uint32_t matched_ios; + int rc; + + bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; + + /* matched_ios is returned and will be kept by the caller. + * + * This funcion will be used for two cases, 1) the same cb_arg is used for + * multiple I/Os, 2) a single large I/O is split into smaller ones. + * Incrementing split_outstanding directly here may confuse readers especially + * for the 1st case. + * + * Completion of I/O abort is processed after stack unwinding. Hence this trick + * works as expected. + */ + matched_ios = 0; + parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + + TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { + if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { + continue; + } + + if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { + /* Any I/O which was submitted after this abort command should be excluded. */ + continue; + } + + rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); + if (rc != 0) { + if (rc == -ENOMEM) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; + } else { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + } + break; + } + matched_ios++; + } + + return matched_ios; +} + +static void +bdev_abort_retry(void *ctx) +{ + struct spdk_bdev_io *parent_io = ctx; + uint32_t matched_ios; + + matched_ios = _bdev_abort(parent_io); + + if (matched_ios == 0) { + if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { + bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); + } else { + /* For retry, the case that no target I/O was found is success + * because it means target I/Os completed in the meantime. + */ + bdev_io_complete(parent_io); + } + return; + } + + /* Use split_outstanding to manage the progress of aborting I/Os. */ + parent_io->u.bdev.split_outstanding = matched_ios; +} + +static void +bdev_abort(struct spdk_bdev_io *parent_io) +{ + uint32_t matched_ios; + + matched_ios = _bdev_abort(parent_io); + + if (matched_ios == 0) { + if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { + bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); + } else { + /* The case the no target I/O was found is failure. */ + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + bdev_io_complete(parent_io); + } + return; + } + + /* Use split_outstanding to manage the progress of aborting I/Os. 
*/ + parent_io->u.bdev.split_outstanding = matched_ios; +} + +int +spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *bio_cb_arg, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + struct spdk_bdev_io *bdev_io; + + if (bio_cb_arg == NULL) { + return -EINVAL; + } + + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { + return -ENOTSUP; + } + + bdev_io = bdev_channel_get_io(channel); + if (bdev_io == NULL) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->internal.submit_tsc = spdk_get_ticks(); + bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; + + /* Parent abort request is not submitted directly, but to manage its execution, + * add it to the submitted list here. + */ + TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); + + bdev_abort(bdev_io); + + return 0; +} + +int +spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, + struct spdk_bdev_io_wait_entry *entry) +{ + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; + + if (bdev != entry->bdev) { + SPDK_ERRLOG("bdevs do not match\n"); + return -EINVAL; + } + + if (mgmt_ch->per_thread_cache_count > 0) { + SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); + return -EINVAL; + } + + TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); + return 0; +} + +static void +bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) +{ + struct spdk_bdev *bdev = bdev_ch->bdev; + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + struct spdk_bdev_io *bdev_io; + + if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { + /* + * Allow some more I/O to complete before retrying the nomem_io queue. + * Some drivers (such as nvme) cannot immediately take a new I/O in + * the context of a completion, because the resources for the I/O are + * not released until control returns to the bdev poller. Also, we + * may require several small I/O to complete before a larger I/O + * (that requires splitting) can be submitted. + */ + return; + } + + while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { + bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); + TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); + bdev_io->internal.ch->io_outstanding++; + shared_resource->io_outstanding++; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; + bdev_io->internal.error.nvme.cdw0 = 0; + bdev_io->num_retries++; + bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { + break; + } + } +} + +static inline void +bdev_io_complete(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + uint64_t tsc, tsc_diff; + + if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { + /* + * Send the completion to the thread that originally submitted the I/O, + * which may not be the current thread in the case of QoS. 
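spdk_bdev_queue_io_wait() is the standard way to recover from -ENOMEM on submission: the caller parks a wait entry and resubmits once a bdev_io is returned to the pool. A retry sketch; the context struct, buffer handling and helper names are illustrative assumptions:

    #include "spdk/bdev.h"

    struct retry_ctx {
            struct spdk_bdev_desc *desc;
            struct spdk_io_channel *ch;
            void *buf;
            struct spdk_bdev_io_wait_entry wait_entry;
    };

    static void submit_read(struct retry_ctx *ctx);

    /* Called once a bdev_io has been freed back to the pool. */
    static void
    retry_read(void *cb_arg)
    {
            submit_read(cb_arg);
    }

    static void
    read_cpl(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
    {
            spdk_bdev_free_io(bdev_io);
    }

    static void
    submit_read(struct retry_ctx *ctx)
    {
            int rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf,
                                           0, 1, read_cpl, ctx);

            if (rc == -ENOMEM) {
                    ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
                    ctx->wait_entry.cb_fn = retry_read;
                    ctx->wait_entry.cb_arg = ctx;
                    spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch,
                                            &ctx->wait_entry);
            }
    }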
+ */ + if (bdev_io->internal.io_submit_ch) { + bdev_io->internal.ch = bdev_io->internal.io_submit_ch; + bdev_io->internal.io_submit_ch = NULL; + } + + /* + * Defer completion to avoid potential infinite recursion if the + * user's completion callback issues a new I/O. + */ + spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), + bdev_io_complete, bdev_io); + return; + } + + tsc = spdk_get_ticks(); + tsc_diff = tsc - bdev_io->internal.submit_tsc; + spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0); + + TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); + + if (bdev_io->internal.ch->histogram) { + spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); + } + + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_read_ops++; + bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; + break; + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_write_ops++; + bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_unmap_ops++; + bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; + break; + case SPDK_BDEV_IO_TYPE_ZCOPY: + /* Track the data in the start phase only */ + if (bdev_io->u.bdev.zcopy.start) { + if (bdev_io->u.bdev.zcopy.populate) { + bdev_io->internal.ch->stat.bytes_read += + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_read_ops++; + bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; + } else { + bdev_io->internal.ch->stat.bytes_written += + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_write_ops++; + bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; + } + } + break; + default: + break; + } + } + +#ifdef SPDK_CONFIG_VTUNE + uint64_t now_tsc = spdk_get_ticks(); + if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { + uint64_t data[5]; + + data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; + data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; + data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; + data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; + data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
+ bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; + + __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, + __itt_metadata_u64, 5, data); + + bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; + bdev_io->internal.ch->start_tsc = now_tsc; + } +#endif + + assert(bdev_io->internal.cb != NULL); + assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); + + bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, + bdev_io->internal.caller_ctx); +} + +static void +bdev_reset_complete(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); + + if (bdev_io->u.reset.ch_ref != NULL) { + spdk_put_io_channel(bdev_io->u.reset.ch_ref); + bdev_io->u.reset.ch_ref = NULL; + } + + bdev_io_complete(bdev_io); +} + +static void +bdev_unfreeze_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct spdk_bdev_io *queued_reset; + + ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; + while (!TAILQ_EMPTY(&ch->queued_resets)) { + queued_reset = TAILQ_FIRST(&ch->queued_resets); + TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); + spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); + } + + spdk_for_each_channel_continue(i, 0); +} + +void +spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + + bdev_io->internal.status = status; + + if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { + bool unlock_channels = false; + + if (status == SPDK_BDEV_IO_STATUS_NOMEM) { + SPDK_ERRLOG("NOMEM returned for reset\n"); + } + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev_io == bdev->internal.reset_in_progress) { + bdev->internal.reset_in_progress = NULL; + unlock_channels = true; + } + pthread_mutex_unlock(&bdev->internal.mutex); + + if (unlock_channels) { + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, + bdev_io, bdev_reset_complete); + return; + } + } else { + _bdev_io_unset_bounce_buf(bdev_io); + + assert(bdev_ch->io_outstanding > 0); + assert(shared_resource->io_outstanding > 0); + bdev_ch->io_outstanding--; + shared_resource->io_outstanding--; + + if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { + TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); + /* + * Wait for some of the outstanding I/O to complete before we + * retry any of the nomem_io. Normally we will wait for + * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue + * depth channels we will instead wait for half to complete. 
+ */ + shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, + (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); + return; + } + + if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { + bdev_ch_retry_io(bdev_ch); + } + } + + bdev_io_complete(bdev_io); +} + +void +spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, + enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) +{ + if (sc == SPDK_SCSI_STATUS_GOOD) { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; + bdev_io->internal.error.scsi.sc = sc; + bdev_io->internal.error.scsi.sk = sk; + bdev_io->internal.error.scsi.asc = asc; + bdev_io->internal.error.scsi.ascq = ascq; + } + + spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); +} + +void +spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, + int *sc, int *sk, int *asc, int *ascq) +{ + assert(sc != NULL); + assert(sk != NULL); + assert(asc != NULL); + assert(ascq != NULL); + + switch (bdev_io->internal.status) { + case SPDK_BDEV_IO_STATUS_SUCCESS: + *sc = SPDK_SCSI_STATUS_GOOD; + *sk = SPDK_SCSI_SENSE_NO_SENSE; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_BDEV_IO_STATUS_NVME_ERROR: + spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); + break; + case SPDK_BDEV_IO_STATUS_SCSI_ERROR: + *sc = bdev_io->internal.error.scsi.sc; + *sk = bdev_io->internal.error.scsi.sk; + *asc = bdev_io->internal.error.scsi.asc; + *ascq = bdev_io->internal.error.scsi.ascq; + break; + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } +} + +void +spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) +{ + if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; + } + + bdev_io->internal.error.nvme.cdw0 = cdw0; + bdev_io->internal.error.nvme.sct = sct; + bdev_io->internal.error.nvme.sc = sc; + + spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); +} + +void +spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) +{ + assert(sct != NULL); + assert(sc != NULL); + assert(cdw0 != NULL); + + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { + *sct = bdev_io->internal.error.nvme.sct; + *sc = bdev_io->internal.error.nvme.sc; + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + *sct = SPDK_NVME_SCT_GENERIC; + *sc = SPDK_NVME_SC_SUCCESS; + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { + *sct = SPDK_NVME_SCT_GENERIC; + *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + } else { + *sct = SPDK_NVME_SCT_GENERIC; + *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } + + *cdw0 = bdev_io->internal.error.nvme.cdw0; +} + +void +spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, + int *first_sct, int *first_sc, int *second_sct, int *second_sc) +{ + assert(first_sct != NULL); + assert(first_sc != NULL); + assert(second_sct != NULL); + assert(second_sc != NULL); + assert(cdw0 != NULL); + + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { + if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && + 
bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { + *first_sct = bdev_io->internal.error.nvme.sct; + *first_sc = bdev_io->internal.error.nvme.sc; + *second_sct = SPDK_NVME_SCT_GENERIC; + *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; + } else { + *first_sct = SPDK_NVME_SCT_GENERIC; + *first_sc = SPDK_NVME_SC_SUCCESS; + *second_sct = bdev_io->internal.error.nvme.sct; + *second_sc = bdev_io->internal.error.nvme.sc; + } + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + *first_sct = SPDK_NVME_SCT_GENERIC; + *first_sc = SPDK_NVME_SC_SUCCESS; + *second_sct = SPDK_NVME_SCT_GENERIC; + *second_sc = SPDK_NVME_SC_SUCCESS; + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { + *first_sct = SPDK_NVME_SCT_GENERIC; + *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + *second_sct = SPDK_NVME_SCT_GENERIC; + *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { + *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; + *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; + *second_sct = SPDK_NVME_SCT_GENERIC; + *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; + } else { + *first_sct = SPDK_NVME_SCT_GENERIC; + *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + *second_sct = SPDK_NVME_SCT_GENERIC; + *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } + + *cdw0 = bdev_io->internal.error.nvme.cdw0; +} + +struct spdk_thread * +spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) +{ + return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); +} + +struct spdk_io_channel * +spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) +{ + return bdev_io->internal.ch->channel; +} + +static void +bdev_qos_config_limit(struct spdk_bdev *bdev, uint64_t *limits) +{ + uint64_t min_qos_set; + int i; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + break; + } + } + + if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { + SPDK_ERRLOG("Invalid rate limits set.\n"); + return; + } + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + continue; + } + + if (bdev_qos_is_iops_rate_limit(i) == true) { + min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; + } else { + min_qos_set = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; + } + + if (limits[i] == 0 || limits[i] % min_qos_set) { + SPDK_ERRLOG("Assigned limit %" PRIu64 " on bdev %s is not multiple of %" PRIu64 "\n", + limits[i], bdev->name, min_qos_set); + SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name); + return; + } + } + + if (!bdev->internal.qos) { + bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); + if (!bdev->internal.qos) { + SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); + return; + } + } + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + bdev->internal.qos->rate_limits[i].limit = limits[i]; + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n", + bdev->name, i, limits[i]); + } + + return; +} + +static void +bdev_qos_config(struct spdk_bdev *bdev) +{ + struct spdk_conf_section *sp = NULL; + const char *val = NULL; + int i = 0, j = 0; + uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {}; + bool config_qos = false; + + sp = spdk_conf_find_section(NULL, "QoS"); + if (!sp) { + return; + } + + while (j < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { + limits[j] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; + + i = 0; + while (true) { + val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 0); + 
if (!val) { + break; + } + + if (strcmp(bdev->name, val) != 0) { + i++; + continue; + } + + val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 1); + if (val) { + if (bdev_qos_is_iops_rate_limit(j) == true) { + limits[j] = strtoull(val, NULL, 10); + } else { + limits[j] = strtoull(val, NULL, 10) * 1024 * 1024; + } + config_qos = true; + } + + break; + } + + j++; + } + + if (config_qos == true) { + bdev_qos_config_limit(bdev, limits); + } + + return; +} + +static int +bdev_init(struct spdk_bdev *bdev) +{ + char *bdev_name; + + assert(bdev->module != NULL); + + if (!bdev->name) { + SPDK_ERRLOG("Bdev name is NULL\n"); + return -EINVAL; + } + + if (!strlen(bdev->name)) { + SPDK_ERRLOG("Bdev name must not be an empty string\n"); + return -EINVAL; + } + + if (spdk_bdev_get_by_name(bdev->name)) { + SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); + return -EEXIST; + } + + /* Users often register their own I/O devices using the bdev name. In + * order to avoid conflicts, prepend bdev_. */ + bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); + if (!bdev_name) { + SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); + return -ENOMEM; + } + + bdev->internal.status = SPDK_BDEV_STATUS_READY; + bdev->internal.measured_queue_depth = UINT64_MAX; + bdev->internal.claim_module = NULL; + bdev->internal.qd_poller = NULL; + bdev->internal.qos = NULL; + + /* If the user didn't specify a uuid, generate one. */ + if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { + spdk_uuid_generate(&bdev->uuid); + } + + if (spdk_bdev_get_buf_align(bdev) > 1) { + if (bdev->split_on_optimal_io_boundary) { + bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, + SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); + } else { + bdev->split_on_optimal_io_boundary = true; + bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; + } + } + + /* If the user didn't specify a write unit size, set it to one. 
*/ + if (bdev->write_unit_size == 0) { + bdev->write_unit_size = 1; + } + + /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ + if (bdev->acwu == 0) { + bdev->acwu = 1; + } + + TAILQ_INIT(&bdev->internal.open_descs); + TAILQ_INIT(&bdev->internal.locked_ranges); + TAILQ_INIT(&bdev->internal.pending_locked_ranges); + + TAILQ_INIT(&bdev->aliases); + + bdev->internal.reset_in_progress = NULL; + + bdev_qos_config(bdev); + + spdk_io_device_register(__bdev_to_io_dev(bdev), + bdev_channel_create, bdev_channel_destroy, + sizeof(struct spdk_bdev_channel), + bdev_name); + + free(bdev_name); + + pthread_mutex_init(&bdev->internal.mutex, NULL); + return 0; +} + +static void +bdev_destroy_cb(void *io_device) +{ + int rc; + struct spdk_bdev *bdev; + spdk_bdev_unregister_cb cb_fn; + void *cb_arg; + + bdev = __bdev_from_io_dev(io_device); + cb_fn = bdev->internal.unregister_cb; + cb_arg = bdev->internal.unregister_ctx; + + rc = bdev->fn_table->destruct(bdev->ctxt); + if (rc < 0) { + SPDK_ERRLOG("destruct failed\n"); + } + if (rc <= 0 && cb_fn != NULL) { + cb_fn(cb_arg, rc); + } +} + + +static void +bdev_fini(struct spdk_bdev *bdev) +{ + pthread_mutex_destroy(&bdev->internal.mutex); + + free(bdev->internal.qos); + + spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); +} + +static void +bdev_start(struct spdk_bdev *bdev) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); + TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); + + /* Examine configuration before initializing I/O */ + bdev_examine(bdev); +} + +int +spdk_bdev_register(struct spdk_bdev *bdev) +{ + int rc = bdev_init(bdev); + + if (rc == 0) { + bdev_start(bdev); + } + + spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); + return rc; +} + +int +spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) +{ + SPDK_ERRLOG("This function is deprecated. Use spdk_bdev_register() instead.\n"); + return spdk_bdev_register(vbdev); +} + +void +spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) +{ + if (bdev->internal.unregister_cb != NULL) { + bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); + } +} + +static void +_remove_notify(void *arg) +{ + struct spdk_bdev_desc *desc = arg; + + pthread_mutex_lock(&desc->mutex); + desc->refs--; + + if (!desc->closed) { + pthread_mutex_unlock(&desc->mutex); + if (desc->callback.open_with_ext) { + desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); + } else { + desc->callback.remove_fn(desc->callback.ctx); + } + return; + } else if (0 == desc->refs) { + /* This descriptor was closed after this remove_notify message was sent. + * spdk_bdev_close() could not free the descriptor since this message was + * in flight, so we free it now using bdev_desc_free(). + */ + pthread_mutex_unlock(&desc->mutex); + bdev_desc_free(desc); + return; + } + pthread_mutex_unlock(&desc->mutex); +} + +/* Must be called while holding bdev->internal.mutex. + * returns: 0 - bdev removed and ready to be destructed. + * -EBUSY - bdev can't be destructed yet. */ +static int +bdev_unregister_unsafe(struct spdk_bdev *bdev) +{ + struct spdk_bdev_desc *desc, *tmp; + int rc = 0; + + /* Notify each descriptor about hotremoval */ + TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { + rc = -EBUSY; + pthread_mutex_lock(&desc->mutex); + /* + * Defer invocation of the event_cb to a separate message that will + * run later on its thread. 
This ensures this context unwinds and + * we don't recursively unregister this bdev again if the event_cb + * immediately closes its descriptor. + */ + desc->refs++; + spdk_thread_send_msg(desc->thread, _remove_notify, desc); + pthread_mutex_unlock(&desc->mutex); + } + + /* If there are no descriptors, proceed removing the bdev */ + if (rc == 0) { + TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list done\n", bdev->name); + spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); + } + + return rc; +} + +void +spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) +{ + struct spdk_thread *thread; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); + + thread = spdk_get_thread(); + if (!thread) { + /* The user called this from a non-SPDK thread. */ + if (cb_fn != NULL) { + cb_fn(cb_arg, -ENOTSUP); + } + return; + } + + pthread_mutex_lock(&g_bdev_mgr.mutex); + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { + pthread_mutex_unlock(&bdev->internal.mutex); + pthread_mutex_unlock(&g_bdev_mgr.mutex); + if (cb_fn) { + cb_fn(cb_arg, -EBUSY); + } + return; + } + + bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; + bdev->internal.unregister_cb = cb_fn; + bdev->internal.unregister_ctx = cb_arg; + + /* Call under lock. */ + rc = bdev_unregister_unsafe(bdev); + pthread_mutex_unlock(&bdev->internal.mutex); + pthread_mutex_unlock(&g_bdev_mgr.mutex); + + if (rc == 0) { + bdev_fini(bdev); + } +} + +static void +bdev_dummy_event_cb(void *remove_ctx) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev remove event received with no remove callback specified"); +} + +static int +bdev_start_qos(struct spdk_bdev *bdev) +{ + struct set_qos_limit_ctx *ctx; + + /* Enable QoS */ + if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); + return -ENOMEM; + } + ctx->bdev = bdev; + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_enable_qos_msg, ctx, + bdev_enable_qos_done); + } + + return 0; +} + +static int +bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) +{ + struct spdk_thread *thread; + int rc = 0; + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); + return -ENOTSUP; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, + spdk_get_thread()); + + desc->bdev = bdev; + desc->thread = thread; + desc->write = write; + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { + pthread_mutex_unlock(&bdev->internal.mutex); + return -ENODEV; + } + + if (write && bdev->internal.claim_module) { + SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", + bdev->name, bdev->internal.claim_module->name); + pthread_mutex_unlock(&bdev->internal.mutex); + return -EPERM; + } + + rc = bdev_start_qos(bdev); + if (rc != 0) { + SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); + pthread_mutex_unlock(&bdev->internal.mutex); + return rc; + } + + TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); + + pthread_mutex_unlock(&bdev->internal.mutex); + + return 0; +} + +int +spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, + void *remove_ctx, struct spdk_bdev_desc **_desc) +{ + struct 
spdk_bdev_desc *desc; + int rc; + + desc = calloc(1, sizeof(*desc)); + if (desc == NULL) { + SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); + return -ENOMEM; + } + + if (remove_cb == NULL) { + remove_cb = bdev_dummy_event_cb; + } + + TAILQ_INIT(&desc->pending_media_events); + TAILQ_INIT(&desc->free_media_events); + + desc->callback.open_with_ext = false; + desc->callback.remove_fn = remove_cb; + desc->callback.ctx = remove_ctx; + pthread_mutex_init(&desc->mutex, NULL); + + pthread_mutex_lock(&g_bdev_mgr.mutex); + + rc = bdev_open(bdev, write, desc); + if (rc != 0) { + bdev_desc_free(desc); + desc = NULL; + } + + *_desc = desc; + + pthread_mutex_unlock(&g_bdev_mgr.mutex); + + return rc; +} + +int +spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, + void *event_ctx, struct spdk_bdev_desc **_desc) +{ + struct spdk_bdev_desc *desc; + struct spdk_bdev *bdev; + unsigned int event_id; + int rc; + + if (event_cb == NULL) { + SPDK_ERRLOG("Missing event callback function\n"); + return -EINVAL; + } + + pthread_mutex_lock(&g_bdev_mgr.mutex); + + bdev = spdk_bdev_get_by_name(bdev_name); + + if (bdev == NULL) { + SPDK_ERRLOG("Failed to find bdev with name: %s\n", bdev_name); + pthread_mutex_unlock(&g_bdev_mgr.mutex); + return -EINVAL; + } + + desc = calloc(1, sizeof(*desc)); + if (desc == NULL) { + SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); + pthread_mutex_unlock(&g_bdev_mgr.mutex); + return -ENOMEM; + } + + TAILQ_INIT(&desc->pending_media_events); + TAILQ_INIT(&desc->free_media_events); + + desc->callback.open_with_ext = true; + desc->callback.event_fn = event_cb; + desc->callback.ctx = event_ctx; + pthread_mutex_init(&desc->mutex, NULL); + + if (bdev->media_events) { + desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, + sizeof(*desc->media_events_buffer)); + if (desc->media_events_buffer == NULL) { + SPDK_ERRLOG("Failed to initialize media event pool\n"); + bdev_desc_free(desc); + pthread_mutex_unlock(&g_bdev_mgr.mutex); + return -ENOMEM; + } + + for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { + TAILQ_INSERT_TAIL(&desc->free_media_events, + &desc->media_events_buffer[event_id], tailq); + } + } + + rc = bdev_open(bdev, write, desc); + if (rc != 0) { + bdev_desc_free(desc); + desc = NULL; + } + + *_desc = desc; + + pthread_mutex_unlock(&g_bdev_mgr.mutex); + + return rc; +} + +void +spdk_bdev_close(struct spdk_bdev_desc *desc) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, + spdk_get_thread()); + + assert(desc->thread == spdk_get_thread()); + + spdk_poller_unregister(&desc->io_timeout_poller); + + pthread_mutex_lock(&bdev->internal.mutex); + pthread_mutex_lock(&desc->mutex); + + TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); + + desc->closed = true; + + if (0 == desc->refs) { + pthread_mutex_unlock(&desc->mutex); + bdev_desc_free(desc); + } else { + pthread_mutex_unlock(&desc->mutex); + } + + /* If no more descriptors, kill QoS channel */ + if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", + bdev->name, spdk_get_thread()); + + if (bdev_qos_destroy(bdev)) { + /* There isn't anything we can do to recover here. Just let the + * old QoS poller keep running. The QoS handling won't change + * cores when the user allocates a new channel, but it won't break. 
*/ + SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); + } + } + + spdk_bdev_set_qd_sampling_period(bdev, 0); + + if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { + rc = bdev_unregister_unsafe(bdev); + pthread_mutex_unlock(&bdev->internal.mutex); + + if (rc == 0) { + bdev_fini(bdev); + } + } else { + pthread_mutex_unlock(&bdev->internal.mutex); + } +} + +int +spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_bdev_module *module) +{ + if (bdev->internal.claim_module != NULL) { + SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, + bdev->internal.claim_module->name); + return -EPERM; + } + + if (desc && !desc->write) { + desc->write = true; + } + + bdev->internal.claim_module = module; + return 0; +} + +void +spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) +{ + assert(bdev->internal.claim_module != NULL); + bdev->internal.claim_module = NULL; +} + +struct spdk_bdev * +spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) +{ + assert(desc != NULL); + return desc->bdev; +} + +void +spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) +{ + struct iovec *iovs; + int iovcnt; + + if (bdev_io == NULL) { + return; + } + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_ZCOPY: + iovs = bdev_io->u.bdev.iovs; + iovcnt = bdev_io->u.bdev.iovcnt; + break; + default: + iovs = NULL; + iovcnt = 0; + break; + } + + if (iovp) { + *iovp = iovs; + } + if (iovcntp) { + *iovcntp = iovcnt; + } +} + +void * +spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) +{ + if (bdev_io == NULL) { + return NULL; + } + + if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { + return NULL; + } + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || + bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + return bdev_io->u.bdev.md_buf; + } + + return NULL; +} + +void * +spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) +{ + if (bdev_io == NULL) { + assert(false); + return NULL; + } + + return bdev_io->internal.caller_ctx; +} + +void +spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) +{ + + if (spdk_bdev_module_list_find(bdev_module->name)) { + SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); + assert(false); + } + + /* + * Modules with examine callbacks must be initialized first, so they are + * ready to handle examine callbacks from later modules that will + * register physical bdevs. 
+ */ + if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { + TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); + } else { + TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); + } +} + +struct spdk_bdev_module * +spdk_bdev_module_list_find(const char *name) +{ + struct spdk_bdev_module *bdev_module; + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (strcmp(name, bdev_module->name) == 0) { + break; + } + } + + return bdev_module; +} + +static void +bdev_write_zero_buffer_next(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + uint64_t num_bytes, num_blocks; + void *md_buf = NULL; + int rc; + + num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * + bdev_io->u.bdev.split_remaining_num_blocks, + ZERO_BUFFER_SIZE); + num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); + + if (spdk_bdev_is_md_separate(bdev_io->bdev)) { + md_buf = (char *)g_bdev_mgr.zero_buffer + + spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; + } + + rc = bdev_write_blocks_with_md(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + g_bdev_mgr.zero_buffer, md_buf, + bdev_io->u.bdev.split_current_offset_blocks, num_blocks, + bdev_write_zero_buffer_done, bdev_io); + if (rc == 0) { + bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; + bdev_io->u.bdev.split_current_offset_blocks += num_blocks; + } else if (rc == -ENOMEM) { + bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } +} + +static void +bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); + return; + } + + if (parent_io->u.bdev.split_remaining_num_blocks == 0) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); + return; + } + + bdev_write_zero_buffer_next(parent_io); +} + +static void +bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) +{ + pthread_mutex_lock(&ctx->bdev->internal.mutex); + ctx->bdev->internal.qos_mod_in_progress = false; + pthread_mutex_unlock(&ctx->bdev->internal.mutex); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_arg, status); + } + free(ctx); +} + +static void +bdev_disable_qos_done(void *cb_arg) +{ + struct set_qos_limit_ctx *ctx = cb_arg; + struct spdk_bdev *bdev = ctx->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_qos *qos; + + pthread_mutex_lock(&bdev->internal.mutex); + qos = bdev->internal.qos; + bdev->internal.qos = NULL; + pthread_mutex_unlock(&bdev->internal.mutex); + + while (!TAILQ_EMPTY(&qos->queued)) { + /* Send queued I/O back to their original thread for resubmission. */ + bdev_io = TAILQ_FIRST(&qos->queued); + TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); + + if (bdev_io->internal.io_submit_ch) { + /* + * Channel was changed when sending it to the QoS thread - change it back + * before sending it back to the original thread. 
+ */ + bdev_io->internal.ch = bdev_io->internal.io_submit_ch; + bdev_io->internal.io_submit_ch = NULL; + } + + spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), + _bdev_io_submit, bdev_io); + } + + if (qos->thread != NULL) { + spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); + spdk_poller_unregister(&qos->poller); + } + + free(qos); + + bdev_set_qos_limit_done(ctx, 0); +} + +static void +bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) +{ + void *io_device = spdk_io_channel_iter_get_io_device(i); + struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); + struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_thread *thread; + + pthread_mutex_lock(&bdev->internal.mutex); + thread = bdev->internal.qos->thread; + pthread_mutex_unlock(&bdev->internal.mutex); + + if (thread != NULL) { + spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); + } else { + bdev_disable_qos_done(ctx); + } +} + +static void +bdev_disable_qos_msg(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); + + bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; + + spdk_for_each_channel_continue(i, 0); +} + +static void +bdev_update_qos_rate_limit_msg(void *cb_arg) +{ + struct set_qos_limit_ctx *ctx = cb_arg; + struct spdk_bdev *bdev = ctx->bdev; + + pthread_mutex_lock(&bdev->internal.mutex); + bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); + pthread_mutex_unlock(&bdev->internal.mutex); + + bdev_set_qos_limit_done(ctx, 0); +} + +static void +bdev_enable_qos_msg(struct spdk_io_channel_iter *i) +{ + void *io_device = spdk_io_channel_iter_get_io_device(i); + struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); + + pthread_mutex_lock(&bdev->internal.mutex); + bdev_enable_qos(bdev, bdev_ch); + pthread_mutex_unlock(&bdev->internal.mutex); + spdk_for_each_channel_continue(i, 0); +} + +static void +bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) +{ + struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + bdev_set_qos_limit_done(ctx, status); +} + +static void +bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) +{ + int i; + + assert(bdev->internal.qos != NULL); + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + bdev->internal.qos->rate_limits[i].limit = limits[i]; + + if (limits[i] == 0) { + bdev->internal.qos->rate_limits[i].limit = + SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; + } + } + } +} + +void +spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, + void (*cb_fn)(void *cb_arg, int status), void *cb_arg) +{ + struct set_qos_limit_ctx *ctx; + uint32_t limit_set_complement; + uint64_t min_limit_per_sec; + int i; + bool disable_rate_limit = true; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + continue; + } + + if (limits[i] > 0) { + disable_rate_limit = false; + } + + if (bdev_qos_is_iops_rate_limit(i) == true) { + min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; + } else { + /* Change from megabyte to byte rate limit */ + limits[i] = limits[i] * 1024 * 1024; + min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; + } + + limit_set_complement = limits[i] % min_limit_per_sec; + if (limit_set_complement) { + 
SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", + limits[i], min_limit_per_sec); + limits[i] += min_limit_per_sec - limit_set_complement; + SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); + } + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->bdev = bdev; + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.qos_mod_in_progress) { + pthread_mutex_unlock(&bdev->internal.mutex); + free(ctx); + cb_fn(cb_arg, -EAGAIN); + return; + } + bdev->internal.qos_mod_in_progress = true; + + if (disable_rate_limit == true && bdev->internal.qos) { + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && + (bdev->internal.qos->rate_limits[i].limit > 0 && + bdev->internal.qos->rate_limits[i].limit != + SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { + disable_rate_limit = false; + break; + } + } + } + + if (disable_rate_limit == false) { + if (bdev->internal.qos == NULL) { + bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); + if (!bdev->internal.qos) { + pthread_mutex_unlock(&bdev->internal.mutex); + SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); + bdev_set_qos_limit_done(ctx, -ENOMEM); + return; + } + } + + if (bdev->internal.qos->thread == NULL) { + /* Enabling */ + bdev_set_qos_rate_limits(bdev, limits); + + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_enable_qos_msg, ctx, + bdev_enable_qos_done); + } else { + /* Updating */ + bdev_set_qos_rate_limits(bdev, limits); + + spdk_thread_send_msg(bdev->internal.qos->thread, + bdev_update_qos_rate_limit_msg, ctx); + } + } else { + if (bdev->internal.qos != NULL) { + bdev_set_qos_rate_limits(bdev, limits); + + /* Disabling */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_disable_qos_msg, ctx, + bdev_disable_qos_msg_done); + } else { + pthread_mutex_unlock(&bdev->internal.mutex); + bdev_set_qos_limit_done(ctx, 0); + return; + } + } + + pthread_mutex_unlock(&bdev->internal.mutex); +} + +struct spdk_bdev_histogram_ctx { + spdk_bdev_histogram_status_cb cb_fn; + void *cb_arg; + struct spdk_bdev *bdev; + int status; +}; + +static void +bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + pthread_mutex_lock(&ctx->bdev->internal.mutex); + ctx->bdev->internal.histogram_in_progress = false; + pthread_mutex_unlock(&ctx->bdev->internal.mutex); + ctx->cb_fn(ctx->cb_arg, ctx->status); + free(ctx); +} + +static void +bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + + if (ch->histogram != NULL) { + spdk_histogram_data_free(ch->histogram); + ch->histogram = NULL; + } + spdk_for_each_channel_continue(i, 0); +} + +static void +bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (status != 0) { + ctx->status = status; + ctx->bdev->internal.histogram_enabled = false; + spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, + bdev_histogram_disable_channel_cb); + } else { + pthread_mutex_lock(&ctx->bdev->internal.mutex); + ctx->bdev->internal.histogram_in_progress = false; + pthread_mutex_unlock(&ctx->bdev->internal.mutex); + 
ctx->cb_fn(ctx->cb_arg, ctx->status); + free(ctx); + } +} + +static void +bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + int status = 0; + + if (ch->histogram == NULL) { + ch->histogram = spdk_histogram_data_alloc(); + if (ch->histogram == NULL) { + status = -ENOMEM; + } + } + + spdk_for_each_channel_continue(i, status); +} + +void +spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, + void *cb_arg, bool enable) +{ + struct spdk_bdev_histogram_ctx *ctx; + + ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bdev = bdev; + ctx->status = 0; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.histogram_in_progress) { + pthread_mutex_unlock(&bdev->internal.mutex); + free(ctx); + cb_fn(cb_arg, -EAGAIN); + return; + } + + bdev->internal.histogram_in_progress = true; + pthread_mutex_unlock(&bdev->internal.mutex); + + bdev->internal.histogram_enabled = enable; + + if (enable) { + /* Allocate histogram for each channel */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, + bdev_histogram_enable_channel_cb); + } else { + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, + bdev_histogram_disable_channel_cb); + } +} + +struct spdk_bdev_histogram_data_ctx { + spdk_bdev_histogram_data_cb cb_fn; + void *cb_arg; + struct spdk_bdev *bdev; + /** merged histogram data from all channels */ + struct spdk_histogram_data *histogram; +}; + +static void +bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); + free(ctx); +} + +static void +bdev_histogram_get_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + int status = 0; + + if (ch->histogram == NULL) { + status = -EFAULT; + } else { + spdk_histogram_data_merge(ctx->histogram, ch->histogram); + } + + spdk_for_each_channel_continue(i, status); +} + +void +spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, + spdk_bdev_histogram_data_cb cb_fn, + void *cb_arg) +{ + struct spdk_bdev_histogram_data_ctx *ctx; + + ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM, NULL); + return; + } + + ctx->bdev = bdev; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + ctx->histogram = histogram; + + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, + bdev_histogram_get_channel_cb); +} + +size_t +spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, + size_t max_events) +{ + struct media_event_entry *entry; + size_t num_events = 0; + + for (; num_events < max_events; ++num_events) { + entry = TAILQ_FIRST(&desc->pending_media_events); + if (entry == NULL) { + break; + } + + events[num_events] = entry->event; + TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); + TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); + } + + return num_events; +} + +int 
+spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, + size_t num_events) +{ + struct spdk_bdev_desc *desc; + struct media_event_entry *entry; + size_t event_id; + int rc = 0; + + assert(bdev->media_events); + + pthread_mutex_lock(&bdev->internal.mutex); + TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { + if (desc->write) { + break; + } + } + + if (desc == NULL || desc->media_events_buffer == NULL) { + rc = -ENODEV; + goto out; + } + + for (event_id = 0; event_id < num_events; ++event_id) { + entry = TAILQ_FIRST(&desc->free_media_events); + if (entry == NULL) { + break; + } + + TAILQ_REMOVE(&desc->free_media_events, entry, tailq); + TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); + entry->event = events[event_id]; + } + + rc = event_id; +out: + pthread_mutex_unlock(&bdev->internal.mutex); + return rc; +} + +void +spdk_bdev_notify_media_management(struct spdk_bdev *bdev) +{ + struct spdk_bdev_desc *desc; + + pthread_mutex_lock(&bdev->internal.mutex); + TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { + if (!TAILQ_EMPTY(&desc->pending_media_events)) { + desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, + desc->callback.ctx); + } + } + pthread_mutex_unlock(&bdev->internal.mutex); +} + +struct locked_lba_range_ctx { + struct lba_range range; + struct spdk_bdev *bdev; + struct lba_range *current_range; + struct lba_range *owner_range; + struct spdk_poller *poller; + lock_range_cb cb_fn; + void *cb_arg; +}; + +static void +bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) +{ + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cb_fn(ctx->cb_arg, -ENOMEM); + free(ctx); +} + +static void +bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); + +static void +bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) +{ + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_bdev *bdev = ctx->bdev; + + if (status == -ENOMEM) { + /* One of the channels could not allocate a range object. + * So we have to go back and clean up any ranges that were + * allocated successfully before we return error status to + * the caller. We can reuse the unlock function to do that + * clean up. + */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), + bdev_unlock_lba_range_get_channel, ctx, + bdev_lock_error_cleanup_cb); + return; + } + + /* All channels have locked this range and no I/O overlapping the range + * are outstanding! Set the owner_ch for the range object for the + * locking channel, so that this channel will know that it is allowed + * to write to this range. + */ + ctx->owner_range->owner_ch = ctx->range.owner_ch; + ctx->cb_fn(ctx->cb_arg, status); + + /* Don't free the ctx here. Its range is in the bdev's global list of + * locked ranges still, and will be removed and freed when this range + * is later unlocked. + */ +} + +static int +bdev_lock_lba_range_check_io(void *_i) +{ + struct spdk_io_channel_iter *i = _i; + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct lba_range *range = ctx->current_range; + struct spdk_bdev_io *bdev_io; + + spdk_poller_unregister(&ctx->poller); + + /* The range is now in the locked_ranges, so no new IO can be submitted to this + * range. But we need to wait until any outstanding IO overlapping with this range + * are completed. 
+ */ + TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { + if (bdev_io_range_is_locked(bdev_io, range)) { + ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); + return SPDK_POLLER_BUSY; + } + } + + spdk_for_each_channel_continue(i, 0); + return SPDK_POLLER_BUSY; +} + +static void +bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct lba_range *range; + + TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { + if (range->length == ctx->range.length && + range->offset == ctx->range.offset && + range->locked_ctx == ctx->range.locked_ctx) { + /* This range already exists on this channel, so don't add + * it again. This can happen when a new channel is created + * while the for_each_channel operation is in progress. + * Do not check for outstanding I/O in that case, since the + * range was locked before any I/O could be submitted to the + * new channel. + */ + spdk_for_each_channel_continue(i, 0); + return; + } + } + + range = calloc(1, sizeof(*range)); + if (range == NULL) { + spdk_for_each_channel_continue(i, -ENOMEM); + return; + } + + range->length = ctx->range.length; + range->offset = ctx->range.offset; + range->locked_ctx = ctx->range.locked_ctx; + ctx->current_range = range; + if (ctx->range.owner_ch == ch) { + /* This is the range object for the channel that will hold + * the lock. Store it in the ctx object so that we can easily + * set its owner_ch after the lock is finally acquired. + */ + ctx->owner_range = range; + } + TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); + bdev_lock_lba_range_check_io(i); +} + +static void +bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) +{ + assert(spdk_get_thread() == ctx->range.owner_ch->channel->thread); + + /* We will add a copy of this range to each channel now. */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, + bdev_lock_lba_range_cb); +} + +static bool +bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) +{ + struct lba_range *r; + + TAILQ_FOREACH(r, tailq, tailq) { + if (bdev_lba_range_overlapped(range, r)) { + return true; + } + } + return false; +} + +static int +bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, + uint64_t offset, uint64_t length, + lock_range_cb cb_fn, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct locked_lba_range_ctx *ctx; + + if (cb_arg == NULL) { + SPDK_ERRLOG("cb_arg must not be NULL\n"); + return -EINVAL; + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + return -ENOMEM; + } + + ctx->range.offset = offset; + ctx->range.length = length; + ctx->range.owner_ch = ch; + ctx->range.locked_ctx = cb_arg; + ctx->bdev = bdev; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { + /* There is an active lock overlapping with this range. + * Put it on the pending list until this range no + * longer overlaps with another. 
+ */ + TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); + } else { + TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); + bdev_lock_lba_range_ctx(bdev, ctx); + } + pthread_mutex_unlock(&bdev->internal.mutex); + return 0; +} + +static void +bdev_lock_lba_range_ctx_msg(void *_ctx) +{ + struct locked_lba_range_ctx *ctx = _ctx; + + bdev_lock_lba_range_ctx(ctx->bdev, ctx); +} + +static void +bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) +{ + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct locked_lba_range_ctx *pending_ctx; + struct spdk_bdev_channel *ch = ctx->range.owner_ch; + struct spdk_bdev *bdev = ch->bdev; + struct lba_range *range, *tmp; + + pthread_mutex_lock(&bdev->internal.mutex); + /* Check if there are any pending locked ranges that overlap with this range + * that was just unlocked. If there are, check that it doesn't overlap with any + * other locked ranges before calling bdev_lock_lba_range_ctx which will start + * the lock process. + */ + TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { + if (bdev_lba_range_overlapped(range, &ctx->range) && + !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { + TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); + pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); + TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); + spdk_thread_send_msg(pending_ctx->range.owner_ch->channel->thread, + bdev_lock_lba_range_ctx_msg, pending_ctx); + } + } + pthread_mutex_unlock(&bdev->internal.mutex); + + ctx->cb_fn(ctx->cb_arg, status); + free(ctx); +} + +static void +bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + TAILQ_HEAD(, spdk_bdev_io) io_locked; + struct spdk_bdev_io *bdev_io; + struct lba_range *range; + + TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { + if (ctx->range.offset == range->offset && + ctx->range.length == range->length && + ctx->range.locked_ctx == range->locked_ctx) { + TAILQ_REMOVE(&ch->locked_ranges, range, tailq); + free(range); + break; + } + } + + /* Note: we should almost always be able to assert that the range specified + * was found. But there are some very rare corner cases where a new channel + * gets created simultaneously with a range unlock, where this function + * would execute on that new channel and wouldn't have the range. + * We also use this to clean up range allocations when a later allocation + * fails in the locking path. + * So we can't actually assert() here. + */ + + /* Swap the locked IO into a temporary list, and then try to submit them again. + * We could hyper-optimize this to only resubmit locked I/O that overlap + * with the range that was just unlocked, but this isn't a performance path so + * we go for simplicity here. 
+ */ + TAILQ_INIT(&io_locked); + TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); + while (!TAILQ_EMPTY(&io_locked)) { + bdev_io = TAILQ_FIRST(&io_locked); + TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); + bdev_io_submit(bdev_io); + } + + spdk_for_each_channel_continue(i, 0); +} + +static int +bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, + uint64_t offset, uint64_t length, + lock_range_cb cb_fn, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + struct locked_lba_range_ctx *ctx; + struct lba_range *range; + bool range_found = false; + + /* Let's make sure the specified channel actually has a lock on + * the specified range. Note that the range must match exactly. + */ + TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { + if (range->offset == offset && range->length == length && + range->owner_ch == ch && range->locked_ctx == cb_arg) { + range_found = true; + break; + } + } + + if (!range_found) { + return -EINVAL; + } + + pthread_mutex_lock(&bdev->internal.mutex); + /* We confirmed that this channel has locked the specified range. To + * start the unlock the process, we find the range in the bdev's locked_ranges + * and remove it. This ensures new channels don't inherit the locked range. + * Then we will send a message to each channel (including the one specified + * here) to remove the range from its per-channel list. + */ + TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { + if (range->offset == offset && range->length == length && + range->locked_ctx == cb_arg) { + break; + } + } + if (range == NULL) { + assert(false); + pthread_mutex_unlock(&bdev->internal.mutex); + return -EINVAL; + } + TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); + ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); + pthread_mutex_unlock(&bdev->internal.mutex); + + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx, + bdev_unlock_lba_range_cb); + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV) + +SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) +{ + spdk_trace_register_owner(OWNER_BDEV, 'b'); + spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); + spdk_trace_register_description("BDEV_IO_START", TRACE_BDEV_IO_START, OWNER_BDEV, + OBJECT_BDEV_IO, 1, 0, "type: "); + spdk_trace_register_description("BDEV_IO_DONE", TRACE_BDEV_IO_DONE, OWNER_BDEV, + OBJECT_BDEV_IO, 0, 0, ""); +} diff --git a/src/spdk/lib/bdev/bdev_internal.h b/src/spdk/lib/bdev/bdev_internal.h new file mode 100644 index 000000000..d1fa6e65a --- /dev/null +++ b/src/spdk/lib/bdev/bdev_internal.h @@ -0,0 +1,50 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_INTERNAL_H +#define SPDK_BDEV_INTERNAL_H + +#include "spdk/bdev.h" + +struct spdk_bdev; +struct spdk_bdev_io; +struct spdk_bdev_channel; + +struct spdk_bdev_io *bdev_channel_get_io(struct spdk_bdev_channel *channel); + +void bdev_io_init(struct spdk_bdev_io *bdev_io, struct spdk_bdev *bdev, void *cb_arg, + spdk_bdev_io_completion_cb cb); + +void bdev_io_submit(struct spdk_bdev_io *bdev_io); + +#endif /* SPDK_BDEV_INTERNAL_H */ diff --git a/src/spdk/lib/bdev/bdev_rpc.c b/src/spdk/lib/bdev/bdev_rpc.c new file mode 100644 index 000000000..6ce7136c4 --- /dev/null +++ b/src/spdk/lib/bdev/bdev_rpc.c @@ -0,0 +1,98 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/bdev.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +struct spdk_rpc_set_bdev_opts { + uint32_t bdev_io_pool_size; + uint32_t bdev_io_cache_size; + bool bdev_auto_examine; +}; + +static const struct spdk_json_object_decoder rpc_set_bdev_opts_decoders[] = { + {"bdev_io_pool_size", offsetof(struct spdk_rpc_set_bdev_opts, bdev_io_pool_size), spdk_json_decode_uint32, true}, + {"bdev_io_cache_size", offsetof(struct spdk_rpc_set_bdev_opts, bdev_io_cache_size), spdk_json_decode_uint32, true}, + {"bdev_auto_examine", offsetof(struct spdk_rpc_set_bdev_opts, bdev_auto_examine), spdk_json_decode_bool, true}, +}; + +static void +rpc_bdev_set_options(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct spdk_rpc_set_bdev_opts rpc_opts; + struct spdk_bdev_opts bdev_opts; + struct spdk_json_write_ctx *w; + int rc; + + rpc_opts.bdev_io_pool_size = UINT32_MAX; + rpc_opts.bdev_io_cache_size = UINT32_MAX; + rpc_opts.bdev_auto_examine = true; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_set_bdev_opts_decoders, + SPDK_COUNTOF(rpc_set_bdev_opts_decoders), &rpc_opts)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + } + + spdk_bdev_get_opts(&bdev_opts); + if (rpc_opts.bdev_io_pool_size != UINT32_MAX) { + bdev_opts.bdev_io_pool_size = rpc_opts.bdev_io_pool_size; + } + if (rpc_opts.bdev_io_cache_size != UINT32_MAX) { + bdev_opts.bdev_io_cache_size = rpc_opts.bdev_io_cache_size; + } + bdev_opts.bdev_auto_examine = rpc_opts.bdev_auto_examine; + rc = spdk_bdev_set_opts(&bdev_opts); + + if (rc != 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Pool size %" PRIu32 " too small for cache size %" PRIu32, + bdev_opts.bdev_io_pool_size, bdev_opts.bdev_io_cache_size); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("bdev_set_options", rpc_bdev_set_options, SPDK_RPC_STARTUP) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_set_options, set_bdev_options) diff --git a/src/spdk/lib/bdev/bdev_zone.c b/src/spdk/lib/bdev/bdev_zone.c new file mode 100644 index 000000000..3cf2ecb67 --- /dev/null +++ b/src/spdk/lib/bdev/bdev_zone.c @@ -0,0 +1,201 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev_zone.h" +#include "spdk/bdev_module.h" + +#include "bdev_internal.h" + +uint64_t +spdk_bdev_get_zone_size(const struct spdk_bdev *bdev) +{ + return bdev->zone_size; +} + +uint32_t +spdk_bdev_get_max_open_zones(const struct spdk_bdev *bdev) +{ + return bdev->max_open_zones; +} + +uint32_t +spdk_bdev_get_optimal_open_zones(const struct spdk_bdev *bdev) +{ + return bdev->optimal_open_zones; +} + +int +spdk_bdev_get_zone_info(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t zone_id, size_t num_zones, struct spdk_bdev_zone_info *info, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_GET_ZONE_INFO; + bdev_io->u.zone_mgmt.zone_id = zone_id; + bdev_io->u.zone_mgmt.num_zones = num_zones; + bdev_io->u.zone_mgmt.buf = info; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_zone_management(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t zone_id, enum spdk_bdev_zone_action action, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT; + bdev_io->u.zone_mgmt.zone_action = action; + bdev_io->u.zone_mgmt.zone_id = zone_id; + bdev_io->u.zone_mgmt.num_zones = 1; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +static int +zone_bdev_append_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md_buf, uint64_t zone_id, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_ZONE_APPEND; + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = zone_id; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_zone_append(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, + void *buf, uint64_t start_lba, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return zone_bdev_append_with_md(desc, ch, buf, NULL, start_lba, num_blocks, + cb, cb_arg); +} + +int +spdk_bdev_zone_append_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, void *md, uint64_t start_lba, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return zone_bdev_append_with_md(desc, ch, buf, md, start_lba, num_blocks, + cb, cb_arg); +} + +int +spdk_bdev_zone_appendv_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, void *md_buf, uint64_t zone_id, + uint64_t num_blocks, spdk_bdev_io_completion_cb cb, + void *cb_arg) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = bdev_channel_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_ZONE_APPEND; + bdev_io->u.bdev.iovs = iov; + bdev_io->u.bdev.iovcnt = iovcnt; + bdev_io->u.bdev.md_buf = md_buf; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = zone_id; + bdev_io_init(bdev_io, bdev, cb_arg, cb); + + bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_zone_appendv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iovs, int iovcnt, uint64_t zone_id, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + return spdk_bdev_zone_appendv_with_md(desc, ch, iovs, iovcnt, NULL, zone_id, num_blocks, + cb, cb_arg); +} + +uint64_t +spdk_bdev_io_get_append_location(struct spdk_bdev_io *bdev_io) +{ + return bdev_io->u.bdev.offset_blocks; +} diff --git a/src/spdk/lib/bdev/part.c b/src/spdk/lib/bdev/part.c new file mode 100644 index 000000000..01a395591 --- /dev/null +++ b/src/spdk/lib/bdev/part.c @@ -0,0 +1,524 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Common code for partition-like virtual bdevs. + */ + +#include "spdk/bdev.h" +#include "spdk/likely.h" +#include "spdk/log.h" +#include "spdk/string.h" +#include "spdk/thread.h" + +#include "spdk/bdev_module.h" + +struct spdk_bdev_part_base { + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + uint32_t ref; + uint32_t channel_size; + spdk_bdev_part_base_free_fn base_free_fn; + void *ctx; + bool claimed; + struct spdk_bdev_module *module; + struct spdk_bdev_fn_table *fn_table; + struct bdev_part_tailq *tailq; + spdk_io_channel_create_cb ch_create_cb; + spdk_io_channel_destroy_cb ch_destroy_cb; + struct spdk_thread *thread; +}; + +struct spdk_bdev * +spdk_bdev_part_base_get_bdev(struct spdk_bdev_part_base *part_base) +{ + return part_base->bdev; +} + +struct spdk_bdev_desc * +spdk_bdev_part_base_get_desc(struct spdk_bdev_part_base *part_base) +{ + return part_base->desc; +} + +struct bdev_part_tailq * +spdk_bdev_part_base_get_tailq(struct spdk_bdev_part_base *part_base) +{ + return part_base->tailq; +} + +void * +spdk_bdev_part_base_get_ctx(struct spdk_bdev_part_base *part_base) +{ + return part_base->ctx; +} + +const char * +spdk_bdev_part_base_get_bdev_name(struct spdk_bdev_part_base *part_base) +{ + return part_base->bdev->name; +} + +static void +bdev_part_base_free(void *ctx) +{ + struct spdk_bdev_desc *desc = ctx; + + spdk_bdev_close(desc); +} + +void +spdk_bdev_part_base_free(struct spdk_bdev_part_base *base) +{ + if (base->desc) { + /* Close the underlying bdev on its same opened thread. 
*/ + if (base->thread && base->thread != spdk_get_thread()) { + spdk_thread_send_msg(base->thread, bdev_part_base_free, base->desc); + } else { + spdk_bdev_close(base->desc); + } + } + + if (base->base_free_fn != NULL) { + base->base_free_fn(base->ctx); + } + + free(base); +} + +static void +bdev_part_free_cb(void *io_device) +{ + struct spdk_bdev_part *part = io_device; + struct spdk_bdev_part_base *base; + + assert(part); + assert(part->internal.base); + + base = part->internal.base; + + TAILQ_REMOVE(base->tailq, part, tailq); + + if (--base->ref == 0) { + spdk_bdev_module_release_bdev(base->bdev); + spdk_bdev_part_base_free(base); + } + + spdk_bdev_destruct_done(&part->internal.bdev, 0); + free(part->internal.bdev.name); + free(part->internal.bdev.product_name); + free(part); +} + +int +spdk_bdev_part_free(struct spdk_bdev_part *part) +{ + spdk_io_device_unregister(part, bdev_part_free_cb); + + /* Return 1 to indicate that this is an asynchronous operation that isn't complete + * until spdk_bdev_destruct_done is called */ + return 1; +} + +void +spdk_bdev_part_base_hotremove(struct spdk_bdev_part_base *part_base, struct bdev_part_tailq *tailq) +{ + struct spdk_bdev_part *part, *tmp; + + TAILQ_FOREACH_SAFE(part, tailq, tailq, tmp) { + if (part->internal.base == part_base) { + spdk_bdev_unregister(&part->internal.bdev, NULL, NULL); + } + } +} + +static bool +bdev_part_io_type_supported(void *_part, enum spdk_bdev_io_type io_type) +{ + struct spdk_bdev_part *part = _part; + + /* We can't decode/modify passthrough NVMe commands, so don't report + * that a partition supports these io types, even if the underlying + * bdev does. + */ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_NVME_ADMIN: + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + return false; + default: + break; + } + + return part->internal.base->bdev->fn_table->io_type_supported(part->internal.base->bdev->ctxt, + io_type); +} + +static struct spdk_io_channel * +bdev_part_get_io_channel(void *_part) +{ + struct spdk_bdev_part *part = _part; + + return spdk_get_io_channel(part); +} + +struct spdk_bdev * +spdk_bdev_part_get_bdev(struct spdk_bdev_part *part) +{ + return &part->internal.bdev; +} + +struct spdk_bdev_part_base * +spdk_bdev_part_get_base(struct spdk_bdev_part *part) +{ + return part->internal.base; +} + +struct spdk_bdev * +spdk_bdev_part_get_base_bdev(struct spdk_bdev_part *part) +{ + return part->internal.base->bdev; +} + +uint64_t +spdk_bdev_part_get_offset_blocks(struct spdk_bdev_part *part) +{ + return part->internal.offset_blocks; +} + +static int +bdev_part_remap_dif(struct spdk_bdev_io *bdev_io, uint32_t offset, + uint32_t remapped_offset) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_dif_ctx dif_ctx; + struct spdk_dif_error err_blk = {}; + int rc; + + if (spdk_likely(!(bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK))) { + return 0; + } + + rc = spdk_dif_ctx_init(&dif_ctx, + bdev->blocklen, bdev->md_len, bdev->md_interleave, + bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, + offset, 0, 0, 0, 0); + if (rc != 0) { + SPDK_ERRLOG("Initialization of DIF context failed\n"); + return rc; + } + + spdk_dif_ctx_set_remapped_init_ref_tag(&dif_ctx, remapped_offset); + + if (bdev->md_interleave) { + rc = spdk_dif_remap_ref_tag(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); + } else { + struct iovec md_iov = { + .iov_base = bdev_io->u.bdev.md_buf, + .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, + }; + + rc = 
spdk_dix_remap_ref_tag(&md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); + } + + if (rc != 0) { + SPDK_ERRLOG("Remapping reference tag failed. type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + } + + return rc; +} + +static void +bdev_part_complete_read_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *part_io = cb_arg; + uint32_t offset, remapped_offset; + int rc, status; + + offset = bdev_io->u.bdev.offset_blocks; + remapped_offset = part_io->u.bdev.offset_blocks; + + if (success) { + rc = bdev_part_remap_dif(bdev_io, offset, remapped_offset); + if (rc != 0) { + success = false; + } + } + + status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + spdk_bdev_io_complete(part_io, status); + spdk_bdev_free_io(bdev_io); +} + +static void +bdev_part_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *part_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + spdk_bdev_io_complete(part_io, status); + spdk_bdev_free_io(bdev_io); +} + +static void +bdev_part_complete_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *part_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + spdk_bdev_io_set_buf(part_io, bdev_io->u.bdev.iovs[0].iov_base, bdev_io->u.bdev.iovs[0].iov_len); + spdk_bdev_io_complete(part_io, status); + spdk_bdev_free_io(bdev_io); +} + +int +spdk_bdev_part_submit_request(struct spdk_bdev_part_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev_part *part = ch->part; + struct spdk_io_channel *base_ch = ch->base_ch; + struct spdk_bdev_desc *base_desc = part->internal.base->desc; + uint64_t offset, remapped_offset; + int rc = 0; + + offset = bdev_io->u.bdev.offset_blocks; + remapped_offset = offset + part->internal.offset_blocks; + + /* Modify the I/O to adjust for the offset within the base bdev. 
*/ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + if (bdev_io->u.bdev.md_buf == NULL) { + rc = spdk_bdev_readv_blocks(base_desc, base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, remapped_offset, + bdev_io->u.bdev.num_blocks, + bdev_part_complete_read_io, bdev_io); + } else { + rc = spdk_bdev_readv_blocks_with_md(base_desc, base_ch, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.md_buf, remapped_offset, + bdev_io->u.bdev.num_blocks, + bdev_part_complete_read_io, bdev_io); + } + break; + case SPDK_BDEV_IO_TYPE_WRITE: + rc = bdev_part_remap_dif(bdev_io, offset, remapped_offset); + if (rc != 0) { + return SPDK_BDEV_IO_STATUS_FAILED; + } + + if (bdev_io->u.bdev.md_buf == NULL) { + rc = spdk_bdev_writev_blocks(base_desc, base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, remapped_offset, + bdev_io->u.bdev.num_blocks, + bdev_part_complete_io, bdev_io); + } else { + rc = spdk_bdev_writev_blocks_with_md(base_desc, base_ch, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.md_buf, remapped_offset, + bdev_io->u.bdev.num_blocks, + bdev_part_complete_io, bdev_io); + } + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + rc = spdk_bdev_write_zeroes_blocks(base_desc, base_ch, remapped_offset, + bdev_io->u.bdev.num_blocks, bdev_part_complete_io, + bdev_io); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + rc = spdk_bdev_unmap_blocks(base_desc, base_ch, remapped_offset, + bdev_io->u.bdev.num_blocks, bdev_part_complete_io, + bdev_io); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + rc = spdk_bdev_flush_blocks(base_desc, base_ch, remapped_offset, + bdev_io->u.bdev.num_blocks, bdev_part_complete_io, + bdev_io); + break; + case SPDK_BDEV_IO_TYPE_RESET: + rc = spdk_bdev_reset(base_desc, base_ch, + bdev_part_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_ZCOPY: + rc = spdk_bdev_zcopy_start(base_desc, base_ch, remapped_offset, + bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.zcopy.populate, + bdev_part_complete_zcopy_io, bdev_io); + break; + default: + SPDK_ERRLOG("unknown I/O type %d\n", bdev_io->type); + return SPDK_BDEV_IO_STATUS_FAILED; + } + + return rc; +} + +static int +bdev_part_channel_create_cb(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_part *part = (struct spdk_bdev_part *)io_device; + struct spdk_bdev_part_channel *ch = ctx_buf; + + ch->part = part; + ch->base_ch = spdk_bdev_get_io_channel(part->internal.base->desc); + if (ch->base_ch == NULL) { + return -1; + } + + if (part->internal.base->ch_create_cb) { + return part->internal.base->ch_create_cb(io_device, ctx_buf); + } else { + return 0; + } +} + +static void +bdev_part_channel_destroy_cb(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_part *part = (struct spdk_bdev_part *)io_device; + struct spdk_bdev_part_channel *ch = ctx_buf; + + if (part->internal.base->ch_destroy_cb) { + part->internal.base->ch_destroy_cb(io_device, ctx_buf); + } + spdk_put_io_channel(ch->base_ch); +} + +struct spdk_bdev_part_base * + spdk_bdev_part_base_construct(struct spdk_bdev *bdev, + spdk_bdev_remove_cb_t remove_cb, struct spdk_bdev_module *module, + struct spdk_bdev_fn_table *fn_table, struct bdev_part_tailq *tailq, + spdk_bdev_part_base_free_fn free_fn, void *ctx, + uint32_t channel_size, spdk_io_channel_create_cb ch_create_cb, + spdk_io_channel_destroy_cb ch_destroy_cb) +{ + int rc; + struct spdk_bdev_part_base *base; + + base = calloc(1, sizeof(*base)); + if (!base) { + SPDK_ERRLOG("Memory allocation failure\n"); + return NULL; + } + fn_table->get_io_channel = bdev_part_get_io_channel; + 
fn_table->io_type_supported = bdev_part_io_type_supported; + + base->bdev = bdev; + base->desc = NULL; + base->ref = 0; + base->module = module; + base->fn_table = fn_table; + base->tailq = tailq; + base->base_free_fn = free_fn; + base->ctx = ctx; + base->claimed = false; + base->channel_size = channel_size; + base->ch_create_cb = ch_create_cb; + base->ch_destroy_cb = ch_destroy_cb; + + rc = spdk_bdev_open(bdev, false, remove_cb, base, &base->desc); + if (rc) { + spdk_bdev_part_base_free(base); + SPDK_ERRLOG("could not open bdev %s: %s\n", spdk_bdev_get_name(bdev), + spdk_strerror(-rc)); + return NULL; + } + + /* Save the thread where the base device is opened */ + base->thread = spdk_get_thread(); + + return base; +} + +int +spdk_bdev_part_construct(struct spdk_bdev_part *part, struct spdk_bdev_part_base *base, + char *name, uint64_t offset_blocks, uint64_t num_blocks, + char *product_name) +{ + part->internal.bdev.blocklen = base->bdev->blocklen; + part->internal.bdev.blockcnt = num_blocks; + part->internal.offset_blocks = offset_blocks; + + part->internal.bdev.write_cache = base->bdev->write_cache; + part->internal.bdev.required_alignment = base->bdev->required_alignment; + part->internal.bdev.ctxt = part; + part->internal.bdev.module = base->module; + part->internal.bdev.fn_table = base->fn_table; + + part->internal.bdev.md_interleave = base->bdev->md_interleave; + part->internal.bdev.md_len = base->bdev->md_len; + part->internal.bdev.dif_type = base->bdev->dif_type; + part->internal.bdev.dif_is_head_of_md = base->bdev->dif_is_head_of_md; + part->internal.bdev.dif_check_flags = base->bdev->dif_check_flags; + + part->internal.bdev.name = strdup(name); + part->internal.bdev.product_name = strdup(product_name); + + if (part->internal.bdev.name == NULL) { + SPDK_ERRLOG("Failed to allocate name for new part of bdev %s\n", spdk_bdev_get_name(base->bdev)); + return -1; + } else if (part->internal.bdev.product_name == NULL) { + free(part->internal.bdev.name); + SPDK_ERRLOG("Failed to allocate product name for new part of bdev %s\n", + spdk_bdev_get_name(base->bdev)); + return -1; + } + + base->ref++; + part->internal.base = base; + + if (!base->claimed) { + int rc; + + rc = spdk_bdev_module_claim_bdev(base->bdev, base->desc, base->module); + if (rc) { + SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(base->bdev)); + free(part->internal.bdev.name); + free(part->internal.bdev.product_name); + return -1; + } + base->claimed = true; + } + + spdk_io_device_register(part, bdev_part_channel_create_cb, + bdev_part_channel_destroy_cb, + base->channel_size, + name); + + spdk_bdev_register(&part->internal.bdev); + TAILQ_INSERT_TAIL(base->tailq, part, tailq); + + return 0; +} diff --git a/src/spdk/lib/bdev/scsi_nvme.c b/src/spdk/lib/bdev/scsi_nvme.c new file mode 100644 index 000000000..f9fe319bd --- /dev/null +++ b/src/spdk/lib/bdev/scsi_nvme.c @@ -0,0 +1,261 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2016 FUJITSU LIMITED, All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/bdev_module.h" + +#include "spdk/nvme_spec.h" + +void +spdk_scsi_nvme_translate(const struct spdk_bdev_io *bdev_io, int *sc, int *sk, + int *asc, int *ascq) +{ + int nvme_sct = bdev_io->internal.error.nvme.sct; + int nvme_sc = bdev_io->internal.error.nvme.sc; + + switch (nvme_sct) { + case SPDK_NVME_SCT_GENERIC: + switch (nvme_sc) { + case SPDK_NVME_SC_SUCCESS: + *sc = SPDK_SCSI_STATUS_GOOD; + *sk = SPDK_SCSI_SENSE_NO_SENSE; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_OPCODE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_FIELD: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_DATA_TRANSFER_ERROR: + case SPDK_NVME_SC_CAPACITY_EXCEEDED: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ABORTED_POWER_LOSS: + *sc = SPDK_SCSI_STATUS_TASK_ABORTED; + *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; + *asc = SPDK_SCSI_ASC_WARNING; + *ascq = SPDK_SCSI_ASCQ_POWER_LOSS_EXPECTED; + break; + case SPDK_NVME_SC_INTERNAL_DEVICE_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_HARDWARE_ERROR; + *asc = SPDK_SCSI_ASC_INTERNAL_TARGET_FAILURE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ABORTED_BY_REQUEST: + case SPDK_NVME_SC_ABORTED_SQ_DELETION: + case SPDK_NVME_SC_ABORTED_FAILED_FUSED: + case SPDK_NVME_SC_ABORTED_MISSING_FUSED: + *sc = SPDK_SCSI_STATUS_TASK_ABORTED; + *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_ACCESS_DENIED; + *ascq = SPDK_SCSI_ASCQ_INVALID_LU_IDENTIFIER; + break; + case SPDK_NVME_SC_LBA_OUT_OF_RANGE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_ADDRESS_OUT_OF_RANGE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_NAMESPACE_NOT_READY: + 
*sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_NOT_READY; + *asc = SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_READY; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_RESERVATION_CONFLICT: + *sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT; + *sk = SPDK_SCSI_SENSE_NO_SENSE; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_COMMAND_ID_CONFLICT: + case SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR: + case SPDK_NVME_SC_INVALID_SGL_SEG_DESCRIPTOR: + case SPDK_NVME_SC_INVALID_NUM_SGL_DESCIRPTORS: + case SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID: + case SPDK_NVME_SC_METADATA_SGL_LENGTH_INVALID: + case SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID: + case SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF: + case SPDK_NVME_SC_INVALID_PRP_OFFSET: + case SPDK_NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED: + case SPDK_NVME_SC_INVALID_SGL_OFFSET: + case SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT: + case SPDK_NVME_SC_KEEP_ALIVE_EXPIRED: + case SPDK_NVME_SC_KEEP_ALIVE_INVALID: + case SPDK_NVME_SC_FORMAT_IN_PROGRESS: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + break; + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + switch (nvme_sc) { + case SPDK_NVME_SC_COMPLETION_QUEUE_INVALID: + case SPDK_NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_FORMAT: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_FORMAT_COMMAND_FAILED; + *ascq = SPDK_SCSI_ASCQ_FORMAT_COMMAND_FAILED; + break; + case SPDK_NVME_SC_CONFLICTING_ATTRIBUTES: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_RANGE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_DATA_PROTECT; + *asc = SPDK_SCSI_ASC_WRITE_PROTECTED; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER: + case SPDK_NVME_SC_INVALID_QUEUE_SIZE: + case SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED: + case SPDK_NVME_SC_INVALID_FIRMWARE_SLOT: + case SPDK_NVME_SC_INVALID_FIRMWARE_IMAGE: + case SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR: + case SPDK_NVME_SC_INVALID_LOG_PAGE: + case SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET: + case SPDK_NVME_SC_INVALID_QUEUE_DELETION: + case SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE: + case SPDK_NVME_SC_FEATURE_NOT_CHANGEABLE: + case SPDK_NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC: + case SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET: + case SPDK_NVME_SC_FIRMWARE_REQ_RESET: + case SPDK_NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION: + case SPDK_NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED: + case SPDK_NVME_SC_OVERLAPPING_RANGE: + case SPDK_NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY: + case SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE: + case SPDK_NVME_SC_NAMESPACE_ALREADY_ATTACHED: + case SPDK_NVME_SC_NAMESPACE_IS_PRIVATE: + case SPDK_NVME_SC_NAMESPACE_NOT_ATTACHED: + case SPDK_NVME_SC_THINPROVISIONING_NOT_SUPPORTED: + case SPDK_NVME_SC_CONTROLLER_LIST_INVALID: + case SPDK_NVME_SC_INVALID_PROTECTION_INFO: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = 
SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + break; + case SPDK_NVME_SCT_MEDIA_ERROR: + switch (nvme_sc) { + case SPDK_NVME_SC_WRITE_FAULTS: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_PERIPHERAL_DEVICE_WRITE_FAULT; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_UNRECOVERED_READ_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_UNRECOVERED_READ_ERROR; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_GUARD_CHECK_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + break; + case SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + break; + case SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + break; + case SPDK_NVME_SC_COMPARE_FAILURE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MISCOMPARE; + *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ACCESS_DENIED: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_DATA_PROTECT; + *asc = SPDK_SCSI_ASC_ACCESS_DENIED; + *ascq = SPDK_SCSI_ASCQ_NO_ACCESS_RIGHTS; + break; + case SPDK_NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + break; + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } +} diff --git a/src/spdk/lib/bdev/spdk_bdev.map b/src/spdk/lib/bdev/spdk_bdev.map new file mode 100644 index 000000000..9f9c3c7e5 --- /dev/null +++ b/src/spdk/lib/bdev/spdk_bdev.map @@ -0,0 +1,154 @@ +{ + global: + + # Public functions in bdev.h + spdk_bdev_get_opts; + spdk_bdev_set_opts; + spdk_bdev_initialize; + spdk_bdev_finish; + spdk_bdev_config_text; + spdk_bdev_subsystem_config_json; + spdk_bdev_get_by_name; + spdk_bdev_first; + spdk_bdev_next; + spdk_bdev_first_leaf; + spdk_bdev_next_leaf; + spdk_bdev_open; + spdk_bdev_open_ext; + spdk_bdev_close; + spdk_bdev_desc_get_bdev; + spdk_bdev_set_timeout; + spdk_bdev_io_type_supported; + spdk_bdev_dump_info_json; + spdk_bdev_get_name; + spdk_bdev_get_product_name; + spdk_bdev_get_block_size; + spdk_bdev_get_write_unit_size; + spdk_bdev_get_num_blocks; + spdk_bdev_get_qos_rpc_type; + spdk_bdev_get_qos_rate_limits; + spdk_bdev_set_qos_rate_limits; + spdk_bdev_get_buf_align; + spdk_bdev_get_optimal_io_boundary; + spdk_bdev_has_write_cache; + spdk_bdev_get_uuid; + spdk_bdev_get_acwu; + spdk_bdev_get_md_size; + spdk_bdev_is_md_interleaved; + spdk_bdev_is_md_separate; + spdk_bdev_is_zoned; + spdk_bdev_get_data_block_size; + spdk_bdev_get_dif_type; + spdk_bdev_is_dif_head_of_md; + 
spdk_bdev_is_dif_check_enabled; + spdk_bdev_get_qd; + spdk_bdev_get_qd_sampling_period; + spdk_bdev_set_qd_sampling_period; + spdk_bdev_get_io_time; + spdk_bdev_get_weighted_io_time; + spdk_bdev_get_io_channel; + spdk_bdev_read; + spdk_bdev_read_blocks; + spdk_bdev_read_blocks_with_md; + spdk_bdev_readv; + spdk_bdev_readv_blocks; + spdk_bdev_readv_blocks_with_md; + spdk_bdev_write; + spdk_bdev_write_blocks; + spdk_bdev_write_blocks_with_md; + spdk_bdev_writev; + spdk_bdev_writev_blocks; + spdk_bdev_writev_blocks_with_md; + spdk_bdev_compare_blocks; + spdk_bdev_compare_blocks_with_md; + spdk_bdev_comparev_blocks; + spdk_bdev_comparev_blocks_with_md; + spdk_bdev_comparev_and_writev_blocks; + spdk_bdev_zcopy_start; + spdk_bdev_zcopy_end; + spdk_bdev_write_zeroes; + spdk_bdev_write_zeroes_blocks; + spdk_bdev_unmap; + spdk_bdev_unmap_blocks; + spdk_bdev_flush; + spdk_bdev_flush_blocks; + spdk_bdev_reset; + spdk_bdev_abort; + spdk_bdev_nvme_admin_passthru; + spdk_bdev_nvme_io_passthru; + spdk_bdev_nvme_io_passthru_md; + spdk_bdev_free_io; + spdk_bdev_queue_io_wait; + spdk_bdev_get_io_stat; + spdk_bdev_get_device_stat; + spdk_bdev_io_get_nvme_status; + spdk_bdev_io_get_nvme_fused_status; + spdk_bdev_io_get_scsi_status; + spdk_bdev_io_get_iovec; + spdk_bdev_io_get_md_buf; + spdk_bdev_io_get_cb_arg; + spdk_bdev_histogram_enable; + spdk_bdev_histogram_get; + spdk_bdev_get_media_events; + + # Public functions in bdev_module.h + spdk_bdev_register; + spdk_bdev_unregister; + spdk_bdev_destruct_done; + spdk_vbdev_register; + spdk_bdev_module_examine_done; + spdk_bdev_module_init_done; + spdk_bdev_module_finish_done; + spdk_bdev_module_claim_bdev; + spdk_bdev_module_release_bdev; + spdk_bdev_alias_add; + spdk_bdev_alias_del; + spdk_bdev_alias_del_all; + spdk_bdev_get_aliases; + spdk_bdev_io_get_buf; + spdk_bdev_io_get_aux_buf; + spdk_bdev_io_put_aux_buf; + spdk_bdev_io_set_buf; + spdk_bdev_io_set_md_buf; + spdk_bdev_io_complete; + spdk_bdev_io_complete_nvme_status; + spdk_bdev_io_complete_scsi_status; + spdk_bdev_io_get_thread; + spdk_bdev_io_get_io_channel; + spdk_bdev_notify_blockcnt_change; + spdk_scsi_nvme_translate; + spdk_bdev_module_list_add; + spdk_bdev_module_list_find; + spdk_bdev_part_base_get_bdev; + spdk_bdev_part_base_get_bdev_name; + spdk_bdev_part_base_get_desc; + spdk_bdev_part_base_get_tailq; + spdk_bdev_part_base_get_ctx; + spdk_bdev_part_base_free; + spdk_bdev_part_free; + spdk_bdev_part_base_hotremove; + spdk_bdev_part_base_construct; + spdk_bdev_part_construct; + spdk_bdev_part_submit_request; + spdk_bdev_part_get_bdev; + spdk_bdev_part_get_base; + spdk_bdev_part_get_base_bdev; + spdk_bdev_part_get_offset_blocks; + spdk_bdev_push_media_events; + spdk_bdev_notify_media_management; + + # Public functions in bdev_zone.h + spdk_bdev_get_zone_size; + spdk_bdev_get_max_open_zones; + spdk_bdev_get_optimal_open_zones; + spdk_bdev_get_zone_info; + spdk_bdev_zone_management; + spdk_bdev_zone_append; + spdk_bdev_zone_appendv; + spdk_bdev_zone_append_with_md; + spdk_bdev_zone_appendv_with_md; + spdk_bdev_io_get_append_location; + + # Everything else + local: *; +}; diff --git a/src/spdk/lib/bdev/vtune.c b/src/spdk/lib/bdev/vtune.c new file mode 100644 index 000000000..2cb48826e --- /dev/null +++ b/src/spdk/lib/bdev/vtune.c @@ -0,0 +1,49 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/config.h" +#if SPDK_CONFIG_VTUNE + +/* Disable warnings triggered by the VTune code */ +#if defined(__GNUC__) && \ + __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic ignored "-Wsign-compare" +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#endif +#endif + +#include "ittnotify_static.c" + +#endif diff --git a/src/spdk/lib/blob/Makefile b/src/spdk/lib/blob/Makefile new file mode 100644 index 000000000..53ae6800b --- /dev/null +++ b/src/spdk/lib/blob/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = blobstore.c request.c zeroes.c blob_bs_dev.c +LIBNAME = blob + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blob.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/blob/blob_bs_dev.c b/src/spdk/lib/blob/blob_bs_dev.c new file mode 100644 index 000000000..8705a1c16 --- /dev/null +++ b/src/spdk/lib/blob/blob_bs_dev.c @@ -0,0 +1,150 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/blob.h" +#include "spdk/log.h" +#include "blobstore.h" + +static void +blob_bs_dev_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_read_cpl(void *cb_arg, int bserrno) +{ + struct spdk_bs_dev_cb_args *cb_args = (struct spdk_bs_dev_cb_args *)cb_arg; + + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, bserrno); +} + +static inline void +blob_bs_dev_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev; + + spdk_blob_io_read(b->blob, channel, payload, lba, lba_count, + blob_bs_dev_read_cpl, cb_args); +} + +static inline void +blob_bs_dev_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev; + + spdk_blob_io_readv(b->blob, channel, iov, iovcnt, lba, lba_count, + blob_bs_dev_read_cpl, cb_args); +} + +static void +blob_bs_dev_destroy_cpl(void *cb_arg, int bserrno) +{ + if (bserrno != 0) { + SPDK_ERRLOG("Error on blob_bs_dev destroy: %d", bserrno); + } + + /* Free blob_bs_dev */ + free(cb_arg); +} + +static void +blob_bs_dev_destroy(struct spdk_bs_dev *bs_dev) +{ + struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)bs_dev; + + spdk_blob_close(b->blob, blob_bs_dev_destroy_cpl, b); +} + + +struct spdk_bs_dev * +bs_create_blob_bs_dev(struct spdk_blob *blob) +{ + struct spdk_blob_bs_dev *b; + + b = calloc(1, sizeof(*b)); + if (b == NULL) { + return NULL; + } + /* snapshot blob */ + b->bs_dev.blockcnt = blob->active.num_clusters * + blob->bs->pages_per_cluster * bs_io_unit_per_page(blob->bs); + b->bs_dev.blocklen = spdk_bs_get_io_unit_size(blob->bs); + b->bs_dev.create_channel = NULL; + b->bs_dev.destroy_channel = NULL; + b->bs_dev.destroy = blob_bs_dev_destroy; + b->bs_dev.write = blob_bs_dev_write; + b->bs_dev.writev = blob_bs_dev_writev; + b->bs_dev.read = blob_bs_dev_read; + b->bs_dev.readv = blob_bs_dev_readv; + b->bs_dev.write_zeroes = blob_bs_dev_write_zeroes; + b->bs_dev.unmap = blob_bs_dev_unmap; + b->blob = blob; + + return &b->bs_dev; +} diff --git a/src/spdk/lib/blob/blobstore.c b/src/spdk/lib/blob/blobstore.c new file mode 100644 index 000000000..768fc5b45 --- /dev/null +++ b/src/spdk/lib/blob/blobstore.c @@ -0,0 +1,7461 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/blob.h" +#include "spdk/crc32.h" +#include "spdk/env.h" +#include "spdk/queue.h" +#include "spdk/thread.h" +#include "spdk/bit_array.h" +#include "spdk/likely.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" + +#include "blobstore.h" + +#define BLOB_CRC32C_INITIAL 0xffffffffUL + +static int bs_register_md_thread(struct spdk_blob_store *bs); +static int bs_unregister_md_thread(struct spdk_blob_store *bs); +static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno); +static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, + uint64_t cluster, uint32_t extent, spdk_blob_op_complete cb_fn, void *cb_arg); + +static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, + uint16_t value_len, bool internal); +static int blob_get_xattr_value(struct spdk_blob *blob, const char *name, + const void **value, size_t *value_len, bool internal); +static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal); + +static void blob_insert_extent(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, + spdk_blob_op_complete cb_fn, void *cb_arg); + +static void +blob_verify_md_op(struct spdk_blob *blob) +{ + assert(blob != NULL); + assert(spdk_get_thread() == blob->bs->md_thread); + assert(blob->state != SPDK_BLOB_STATE_LOADING); +} + +static struct spdk_blob_list * +bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid) +{ + struct spdk_blob_list *snapshot_entry = NULL; + + TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { + if (snapshot_entry->id == blobid) { + break; + } + } + + return snapshot_entry; +} + +static void +bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page) +{ + assert(page < spdk_bit_array_capacity(bs->used_md_pages)); + assert(spdk_bit_array_get(bs->used_md_pages, page) == false); + + 
spdk_bit_array_set(bs->used_md_pages, page); +} + +static void +bs_release_md_page(struct spdk_blob_store *bs, uint32_t page) +{ + assert(page < spdk_bit_array_capacity(bs->used_md_pages)); + assert(spdk_bit_array_get(bs->used_md_pages, page) == true); + + spdk_bit_array_clear(bs->used_md_pages, page); +} + +static void +bs_claim_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) +{ + assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters)); + assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == false); + assert(bs->num_free_clusters > 0); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %u\n", cluster_num); + + spdk_bit_array_set(bs->used_clusters, cluster_num); + bs->num_free_clusters--; +} + +static int +blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster) +{ + uint64_t *cluster_lba = &blob->active.clusters[cluster_num]; + + blob_verify_md_op(blob); + + if (*cluster_lba != 0) { + return -EEXIST; + } + + *cluster_lba = bs_cluster_to_lba(blob->bs, cluster); + return 0; +} + +static int +bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num, + uint64_t *lowest_free_cluster, uint32_t *lowest_free_md_page, bool update_map) +{ + uint32_t *extent_page = 0; + + pthread_mutex_lock(&blob->bs->used_clusters_mutex); + *lowest_free_cluster = spdk_bit_array_find_first_clear(blob->bs->used_clusters, + *lowest_free_cluster); + if (*lowest_free_cluster == UINT32_MAX) { + /* No more free clusters. Cannot satisfy the request */ + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); + return -ENOSPC; + } + + if (blob->use_extent_table) { + extent_page = bs_cluster_to_extent_page(blob, cluster_num); + if (*extent_page == 0) { + /* No extent_page is allocated for the cluster */ + *lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, + *lowest_free_md_page); + if (*lowest_free_md_page == UINT32_MAX) { + /* No more free md pages. 
Cannot satisfy the request */ + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); + return -ENOSPC; + } + bs_claim_md_page(blob->bs, *lowest_free_md_page); + } + } + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %lu for blob %lu\n", *lowest_free_cluster, blob->id); + bs_claim_cluster(blob->bs, *lowest_free_cluster); + + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); + + if (update_map) { + blob_insert_cluster(blob, cluster_num, *lowest_free_cluster); + if (blob->use_extent_table && *extent_page == 0) { + *extent_page = *lowest_free_md_page; + } + } + + return 0; +} + +static void +bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) +{ + assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters)); + assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == true); + assert(bs->num_free_clusters < bs->total_clusters); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Releasing cluster %u\n", cluster_num); + + pthread_mutex_lock(&bs->used_clusters_mutex); + spdk_bit_array_clear(bs->used_clusters, cluster_num); + bs->num_free_clusters++; + pthread_mutex_unlock(&bs->used_clusters_mutex); +} + +static void +blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs) +{ + xattrs->count = 0; + xattrs->names = NULL; + xattrs->ctx = NULL; + xattrs->get_value = NULL; +} + +void +spdk_blob_opts_init(struct spdk_blob_opts *opts) +{ + opts->num_clusters = 0; + opts->thin_provision = false; + opts->clear_method = BLOB_CLEAR_WITH_DEFAULT; + blob_xattrs_init(&opts->xattrs); + opts->use_extent_table = true; +} + +void +spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts) +{ + opts->clear_method = BLOB_CLEAR_WITH_DEFAULT; +} + +static struct spdk_blob * +blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id) +{ + struct spdk_blob *blob; + + blob = calloc(1, sizeof(*blob)); + if (!blob) { + return NULL; + } + + blob->id = id; + blob->bs = bs; + + blob->parent_id = SPDK_BLOBID_INVALID; + + blob->state = SPDK_BLOB_STATE_DIRTY; + blob->extent_rle_found = false; + blob->extent_table_found = false; + blob->active.num_pages = 1; + blob->active.pages = calloc(1, sizeof(*blob->active.pages)); + if (!blob->active.pages) { + free(blob); + return NULL; + } + + blob->active.pages[0] = bs_blobid_to_page(id); + + TAILQ_INIT(&blob->xattrs); + TAILQ_INIT(&blob->xattrs_internal); + TAILQ_INIT(&blob->pending_persists); + + return blob; +} + +static void +xattrs_free(struct spdk_xattr_tailq *xattrs) +{ + struct spdk_xattr *xattr, *xattr_tmp; + + TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) { + TAILQ_REMOVE(xattrs, xattr, link); + free(xattr->name); + free(xattr->value); + free(xattr); + } +} + +static void +blob_free(struct spdk_blob *blob) +{ + assert(blob != NULL); + assert(TAILQ_EMPTY(&blob->pending_persists)); + + free(blob->active.extent_pages); + free(blob->clean.extent_pages); + free(blob->active.clusters); + free(blob->clean.clusters); + free(blob->active.pages); + free(blob->clean.pages); + + xattrs_free(&blob->xattrs); + xattrs_free(&blob->xattrs_internal); + + if (blob->back_bs_dev) { + blob->back_bs_dev->destroy(blob->back_bs_dev); + } + + free(blob); +} + +struct freeze_io_ctx { + struct spdk_bs_cpl cpl; + struct spdk_blob *blob; +}; + +static void +blob_io_sync(struct spdk_io_channel_iter *i) +{ + spdk_for_each_channel_continue(i, 0); +} + +static void +blob_execute_queued_io(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch); + struct freeze_io_ctx *ctx = 
spdk_io_channel_iter_get_ctx(i); + struct spdk_bs_request_set *set; + struct spdk_bs_user_op_args *args; + spdk_bs_user_op_t *op, *tmp; + + TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) { + set = (struct spdk_bs_request_set *)op; + args = &set->u.user_op; + + if (args->blob == ctx->blob) { + TAILQ_REMOVE(&ch->queued_io, op, link); + bs_user_op_execute(op); + } + } + + spdk_for_each_channel_continue(i, 0); +} + +static void +blob_io_cpl(struct spdk_io_channel_iter *i, int status) +{ + struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0); + + free(ctx); +} + +static void +blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct freeze_io_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + ctx->cpl.u.blob_basic.cb_fn = cb_fn; + ctx->cpl.u.blob_basic.cb_arg = cb_arg; + ctx->blob = blob; + + /* Freeze I/O on blob */ + blob->frozen_refcnt++; + + if (blob->frozen_refcnt == 1) { + spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl); + } else { + cb_fn(cb_arg, 0); + free(ctx); + } +} + +static void +blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct freeze_io_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + ctx->cpl.u.blob_basic.cb_fn = cb_fn; + ctx->cpl.u.blob_basic.cb_arg = cb_arg; + ctx->blob = blob; + + assert(blob->frozen_refcnt > 0); + + blob->frozen_refcnt--; + + if (blob->frozen_refcnt == 0) { + spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl); + } else { + cb_fn(cb_arg, 0); + free(ctx); + } +} + +static int +blob_mark_clean(struct spdk_blob *blob) +{ + uint32_t *extent_pages = NULL; + uint64_t *clusters = NULL; + uint32_t *pages = NULL; + + assert(blob != NULL); + + if (blob->active.num_extent_pages) { + assert(blob->active.extent_pages); + extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages)); + if (!extent_pages) { + return -ENOMEM; + } + memcpy(extent_pages, blob->active.extent_pages, + blob->active.num_extent_pages * sizeof(*extent_pages)); + } + + if (blob->active.num_clusters) { + assert(blob->active.clusters); + clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters)); + if (!clusters) { + free(extent_pages); + return -ENOMEM; + } + memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); + } + + if (blob->active.num_pages) { + assert(blob->active.pages); + pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages)); + if (!pages) { + free(extent_pages); + free(clusters); + return -ENOMEM; + } + memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); + } + + free(blob->clean.extent_pages); + free(blob->clean.clusters); + free(blob->clean.pages); + + blob->clean.num_extent_pages = blob->active.num_extent_pages; + blob->clean.extent_pages = blob->active.extent_pages; + blob->clean.num_clusters = blob->active.num_clusters; + blob->clean.clusters = blob->active.clusters; + blob->clean.num_pages = blob->active.num_pages; + blob->clean.pages = blob->active.pages; + + blob->active.extent_pages = extent_pages; + blob->active.clusters = clusters; + blob->active.pages = pages; + + /* If the metadata was dirtied again while the metadata was being written to disk, + * we do 
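blob_freeze_io() and blob_unfreeze_io() rely on a reference count so that only the first freeze and the last unfreeze pay for the per-channel iteration; anything submitted while frozen is parked on queued_io and replayed by blob_execute_queued_io(). A minimal single-threaded sketch of that pattern follows (plain C, no SPDK channels; the fixed-size queue and the op_fn type are illustrative only).

/* Minimal sketch (not SPDK): refcount-based freeze with a queue that is
 * drained when the last unfreeze drops the count to zero. */
#include <stdio.h>

#define MAX_QUEUED 16

typedef void (*op_fn)(int arg);

static int frozen_refcnt;
static struct { op_fn fn; int arg; } queued[MAX_QUEUED];
static int num_queued;

static void submit(op_fn fn, int arg)
{
    if (frozen_refcnt > 0) {
        /* Park the operation, as blob I/O is parked on ch->queued_io. */
        queued[num_queued].fn = fn;
        queued[num_queued].arg = arg;
        num_queued++;
        return;
    }
    fn(arg);
}

static void freeze(void)
{
    frozen_refcnt++;
}

static void unfreeze(void)
{
    if (--frozen_refcnt == 0) {
        /* Replay everything that arrived while frozen. */
        for (int i = 0; i < num_queued; i++) {
            queued[i].fn(queued[i].arg);
        }
        num_queued = 0;
    }
}

static void print_op(int arg)
{
    printf("op %d executed\n", arg);
}

int main(void)
{
    freeze();
    submit(print_op, 1);     /* queued */
    freeze();
    unfreeze();              /* still frozen, nothing runs */
    submit(print_op, 2);     /* queued */
    unfreeze();              /* drains: prints op 1 then op 2 */
    submit(print_op, 3);     /* runs immediately */
    return 0;
}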
not want to revert the DIRTY state back to CLEAN here. + */ + if (blob->state == SPDK_BLOB_STATE_LOADING) { + blob->state = SPDK_BLOB_STATE_CLEAN; + } + + return 0; +} + +static int +blob_deserialize_xattr(struct spdk_blob *blob, + struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal) +{ + struct spdk_xattr *xattr; + + if (desc_xattr->length != sizeof(desc_xattr->name_length) + + sizeof(desc_xattr->value_length) + + desc_xattr->name_length + desc_xattr->value_length) { + return -EINVAL; + } + + xattr = calloc(1, sizeof(*xattr)); + if (xattr == NULL) { + return -ENOMEM; + } + + xattr->name = malloc(desc_xattr->name_length + 1); + if (xattr->name == NULL) { + free(xattr); + return -ENOMEM; + } + memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length); + xattr->name[desc_xattr->name_length] = '\0'; + + xattr->value = malloc(desc_xattr->value_length); + if (xattr->value == NULL) { + free(xattr->name); + free(xattr); + return -ENOMEM; + } + xattr->value_len = desc_xattr->value_length; + memcpy(xattr->value, + (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), + desc_xattr->value_length); + + TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link); + + return 0; +} + + +static int +blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob) +{ + struct spdk_blob_md_descriptor *desc; + size_t cur_desc = 0; + void *tmp; + + desc = (struct spdk_blob_md_descriptor *)page->descriptors; + while (cur_desc < sizeof(page->descriptors)) { + if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { + if (desc->length == 0) { + /* If padding and length are 0, this terminates the page */ + break; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { + struct spdk_blob_md_descriptor_flags *desc_flags; + + desc_flags = (struct spdk_blob_md_descriptor_flags *)desc; + + if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) { + return -EINVAL; + } + + if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) != + SPDK_BLOB_INVALID_FLAGS_MASK) { + return -EINVAL; + } + + if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) != + SPDK_BLOB_DATA_RO_FLAGS_MASK) { + blob->data_ro = true; + blob->md_ro = true; + } + + if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) != + SPDK_BLOB_MD_RO_FLAGS_MASK) { + blob->md_ro = true; + } + + if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) { + blob->data_ro = true; + blob->md_ro = true; + } + + blob->invalid_flags = desc_flags->invalid_flags; + blob->data_ro_flags = desc_flags->data_ro_flags; + blob->md_ro_flags = desc_flags->md_ro_flags; + + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { + struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; + unsigned int i, j; + unsigned int cluster_count = blob->active.num_clusters; + + if (blob->extent_table_found) { + /* Extent Table already present in the md, + * both descriptors should never be at the same time. 
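blob_deserialize_xattr() above reads a descriptor laid out as two length fields followed by the name bytes and then the value bytes, rejecting anything whose total does not add up. The standalone sketch below mirrors that validation and copy; the xattr_desc struct is an illustrative stand-in, not the real spdk_blob_md_descriptor_xattr layout.

/* Minimal sketch (not SPDK): an xattr descriptor laid out as
 * [name_length][value_length][name bytes][value bytes], with the same
 * length validation blob_deserialize_xattr() performs. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct xattr_desc {
    uint16_t name_length;
    uint16_t value_length;
    char     data[];         /* name bytes followed by value bytes */
};

static int parse_xattr(const struct xattr_desc *d, size_t desc_length,
                       char **name, void **value, size_t *value_len)
{
    /* The descriptor length must cover exactly both headers + payloads. */
    if (desc_length != sizeof(d->name_length) + sizeof(d->value_length) +
                       d->name_length + d->value_length) {
        return -1;           /* -EINVAL in the real code */
    }

    *name = malloc(d->name_length + 1);
    if (*name == NULL) {
        return -1;
    }
    memcpy(*name, d->data, d->name_length);
    (*name)[d->name_length] = '\0';

    *value = malloc(d->value_length);
    if (*value == NULL) {
        free(*name);
        return -1;
    }
    memcpy(*value, d->data + d->name_length, d->value_length);
    *value_len = d->value_length;
    return 0;
}

int main(void)
{
    const char *n = "snapshot", *v = "1234";
    size_t len = sizeof(struct xattr_desc) + strlen(n) + strlen(v);
    struct xattr_desc *d = malloc(len);

    d->name_length = strlen(n);
    d->value_length = strlen(v);
    memcpy(d->data, n, d->name_length);
    memcpy(d->data + d->name_length, v, d->value_length);

    char *name;
    void *value;
    size_t value_len;

    if (parse_xattr(d, len, &name, &value, &value_len) == 0) {
        printf("%s = %.*s\n", name, (int)value_len, (char *)value);
        free(name);
        free(value);
    }
    free(d);
    return 0;
}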
*/ + return -EINVAL; + } + blob->extent_rle_found = true; + + desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; + + if (desc_extent_rle->length == 0 || + (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { + for (j = 0; j < desc_extent_rle->extents[i].length; j++) { + if (desc_extent_rle->extents[i].cluster_idx != 0) { + if (!spdk_bit_array_get(blob->bs->used_clusters, + desc_extent_rle->extents[i].cluster_idx + j)) { + return -EINVAL; + } + } + cluster_count++; + } + } + + if (cluster_count == 0) { + return -EINVAL; + } + tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters)); + if (tmp == NULL) { + return -ENOMEM; + } + blob->active.clusters = tmp; + blob->active.cluster_array_size = cluster_count; + + for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { + for (j = 0; j < desc_extent_rle->extents[i].length; j++) { + if (desc_extent_rle->extents[i].cluster_idx != 0) { + blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, + desc_extent_rle->extents[i].cluster_idx + j); + } else if (spdk_blob_is_thin_provisioned(blob)) { + blob->active.clusters[blob->active.num_clusters++] = 0; + } else { + return -EINVAL; + } + } + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { + struct spdk_blob_md_descriptor_extent_table *desc_extent_table; + uint32_t num_extent_pages = blob->active.num_extent_pages; + uint32_t i, j; + size_t extent_pages_length; + + desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; + extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); + + if (blob->extent_rle_found) { + /* This means that Extent RLE is present in MD, + * both should never be at the same time. */ + return -EINVAL; + } else if (blob->extent_table_found && + desc_extent_table->num_clusters != blob->remaining_clusters_in_et) { + /* Number of clusters in this ET does not match number + * from previously read EXTENT_TABLE. */ + return -EINVAL; + } + + blob->extent_table_found = true; + + if (desc_extent_table->length == 0 || + (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { + num_extent_pages += desc_extent_table->extent_page[i].num_pages; + } + + tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t)); + if (tmp == NULL) { + return -ENOMEM; + } + blob->active.extent_pages = tmp; + blob->active.extent_pages_array_size = num_extent_pages; + + blob->remaining_clusters_in_et = desc_extent_table->num_clusters; + + /* Extent table entries contain md page numbers for extent pages. + * Zeroes represent unallocated extent pages, those are run-length-encoded. 
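Each EXTENT_RLE entry is a (cluster_idx, length) pair: length consecutive clusters starting at cluster_idx, or, when cluster_idx is 0, a run of length unallocated clusters (legal only for thin-provisioned blobs). A standalone decoder sketch over plain arrays, not the blobstore structures:

/* Minimal sketch (not SPDK): expand (cluster_idx, length) runs into a flat
 * per-cluster table. cluster_idx == 0 encodes a run of unallocated clusters. */
#include <stdint.h>
#include <stdio.h>

struct extent_rle {
    uint32_t cluster_idx;
    uint32_t length;
};

/* Returns the number of clusters written to 'out', or -1 on overflow. */
static int decode_extent_rle(const struct extent_rle *ext, int n_ext,
                             uint64_t *out, int out_cap)
{
    int count = 0;

    for (int i = 0; i < n_ext; i++) {
        for (uint32_t j = 0; j < ext[i].length; j++) {
            if (count == out_cap) {
                return -1;
            }
            out[count++] = ext[i].cluster_idx ? ext[i].cluster_idx + j : 0;
        }
    }
    return count;
}

int main(void)
{
    /* 3 clusters at 10..12, then 2 unallocated, then 1 cluster at 40 */
    struct extent_rle ext[] = { {10, 3}, {0, 2}, {40, 1} };
    uint64_t clusters[16];
    int n = decode_extent_rle(ext, 3, clusters, 16);

    for (int i = 0; i < n; i++) {
        printf("cluster %d -> %llu\n", i, (unsigned long long)clusters[i]);
    }
    return 0;
}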
+ */ + for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { + if (desc_extent_table->extent_page[i].page_idx != 0) { + assert(desc_extent_table->extent_page[i].num_pages == 1); + blob->active.extent_pages[blob->active.num_extent_pages++] = + desc_extent_table->extent_page[i].page_idx; + } else if (spdk_blob_is_thin_provisioned(blob)) { + for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) { + blob->active.extent_pages[blob->active.num_extent_pages++] = 0; + } + } else { + return -EINVAL; + } + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { + struct spdk_blob_md_descriptor_extent_page *desc_extent; + unsigned int i; + unsigned int cluster_count = 0; + size_t cluster_idx_length; + + if (blob->extent_rle_found) { + /* This means that Extent RLE is present in MD, + * both should never be at the same time. */ + return -EINVAL; + } + + desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; + cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); + + if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || + (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { + if (desc_extent->cluster_idx[i] != 0) { + if (!spdk_bit_array_get(blob->bs->used_clusters, desc_extent->cluster_idx[i])) { + return -EINVAL; + } + } + cluster_count++; + } + + if (cluster_count == 0) { + return -EINVAL; + } + + /* When reading extent pages sequentially starting cluster idx should match + * current size of a blob. + * If changed to batch reading, this check shall be removed. */ + if (desc_extent->start_cluster_idx != blob->active.num_clusters) { + return -EINVAL; + } + + tmp = realloc(blob->active.clusters, + (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters)); + if (tmp == NULL) { + return -ENOMEM; + } + blob->active.clusters = tmp; + blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters); + + for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { + if (desc_extent->cluster_idx[i] != 0) { + blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, + desc_extent->cluster_idx[i]); + } else if (spdk_blob_is_thin_provisioned(blob)) { + blob->active.clusters[blob->active.num_clusters++] = 0; + } else { + return -EINVAL; + } + } + assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters); + assert(blob->remaining_clusters_in_et >= cluster_count); + blob->remaining_clusters_in_et -= cluster_count; + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { + int rc; + + rc = blob_deserialize_xattr(blob, + (struct spdk_blob_md_descriptor_xattr *) desc, false); + if (rc != 0) { + return rc; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { + int rc; + + rc = blob_deserialize_xattr(blob, + (struct spdk_blob_md_descriptor_xattr *) desc, true); + if (rc != 0) { + return rc; + } + } else { + /* Unrecognized descriptor type. Do not fail - just continue to the + * next descriptor. If this descriptor is associated with some feature + * defined in a newer version of blobstore, that version of blobstore + * should create and set an associated feature flag to specify if this + * blob can be loaded or not. 
+ */ + } + + /* Advance to the next descriptor */ + cur_desc += sizeof(*desc) + desc->length; + if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { + break; + } + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); + } + + return 0; +} + +static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page); + +static int +blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob) +{ + assert(blob != NULL); + assert(blob->state == SPDK_BLOB_STATE_LOADING); + + if (bs_load_cur_extent_page_valid(extent_page) == false) { + return -ENOENT; + } + + return blob_parse_page(extent_page, blob); +} + +static int +blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count, + struct spdk_blob *blob) +{ + const struct spdk_blob_md_page *page; + uint32_t i; + int rc; + + assert(page_count > 0); + assert(pages[0].sequence_num == 0); + assert(blob != NULL); + assert(blob->state == SPDK_BLOB_STATE_LOADING); + assert(blob->active.clusters == NULL); + + /* The blobid provided doesn't match what's in the MD, this can + * happen for example if a bogus blobid is passed in through open. + */ + if (blob->id != pages[0].id) { + SPDK_ERRLOG("Blobid (%lu) doesn't match what's in metadata (%lu)\n", + blob->id, pages[0].id); + return -ENOENT; + } + + for (i = 0; i < page_count; i++) { + page = &pages[i]; + + assert(page->id == blob->id); + assert(page->sequence_num == i); + + rc = blob_parse_page(page, blob); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static int +blob_serialize_add_page(const struct spdk_blob *blob, + struct spdk_blob_md_page **pages, + uint32_t *page_count, + struct spdk_blob_md_page **last_page) +{ + struct spdk_blob_md_page *page; + + assert(pages != NULL); + assert(page_count != NULL); + + if (*page_count == 0) { + assert(*pages == NULL); + *page_count = 1; + *pages = spdk_malloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + } else { + assert(*pages != NULL); + (*page_count)++; + *pages = spdk_realloc(*pages, + SPDK_BS_PAGE_SIZE * (*page_count), + SPDK_BS_PAGE_SIZE); + } + + if (*pages == NULL) { + *page_count = 0; + *last_page = NULL; + return -ENOMEM; + } + + page = &(*pages)[*page_count - 1]; + memset(page, 0, sizeof(*page)); + page->id = blob->id; + page->sequence_num = *page_count - 1; + page->next = SPDK_INVALID_MD_PAGE; + *last_page = page; + + return 0; +} + +/* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor. + * Update required_sz on both success and failure. + * + */ +static int +blob_serialize_xattr(const struct spdk_xattr *xattr, + uint8_t *buf, size_t buf_sz, + size_t *required_sz, bool internal) +{ + struct spdk_blob_md_descriptor_xattr *desc; + + *required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) + + strlen(xattr->name) + + xattr->value_len; + + if (buf_sz < *required_sz) { + return -1; + } + + desc = (struct spdk_blob_md_descriptor_xattr *)buf; + + desc->type = internal ? 
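The parse loop that ends above walks the page's descriptor area as a type/length stream: a zero-length padding descriptor terminates the page, and each iteration advances by the header size plus the descriptor's own length. A standalone sketch of that walk; the 5-byte packed header and the type constants are illustrative, not the SPDK definitions.

/* Minimal sketch (not SPDK): walk a type/length-prefixed descriptor region
 * the way blob_parse_page() does: stop at a zero-length padding descriptor,
 * and otherwise advance by header + length. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TYPE_PADDING 0
#define TYPE_FLAGS   1
#define TYPE_XATTR   2

struct desc_hdr {
    uint8_t  type;
    uint32_t length;
} __attribute__((packed));

static void walk_descriptors(const uint8_t *region, size_t region_sz)
{
    size_t cur = 0;

    while (cur + sizeof(struct desc_hdr) <= region_sz) {
        struct desc_hdr hdr;

        memcpy(&hdr, region + cur, sizeof(hdr));
        if (hdr.type == TYPE_PADDING && hdr.length == 0) {
            break;               /* zero padding terminates the page */
        }
        printf("descriptor type %u, payload %u bytes\n",
               (unsigned)hdr.type, (unsigned)hdr.length);
        cur += sizeof(hdr) + hdr.length;   /* advance to the next header */
    }
}

int main(void)
{
    uint8_t region[64] = {0};
    struct desc_hdr h;

    /* One FLAGS descriptor with a 12-byte payload, then an XATTR one,
     * then implicit zero padding. */
    h.type = TYPE_FLAGS;
    h.length = 12;
    memcpy(region, &h, sizeof(h));

    h.type = TYPE_XATTR;
    h.length = 20;
    memcpy(region + sizeof(h) + 12, &h, sizeof(h));

    walk_descriptors(region, sizeof(region));
    return 0;
}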
SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR; + desc->length = sizeof(desc->name_length) + + sizeof(desc->value_length) + + strlen(xattr->name) + + xattr->value_len; + desc->name_length = strlen(xattr->name); + desc->value_length = xattr->value_len; + + memcpy(desc->name, xattr->name, desc->name_length); + memcpy((void *)((uintptr_t)desc->name + desc->name_length), + xattr->value, + desc->value_length); + + return 0; +} + +static void +blob_serialize_extent_table_entry(const struct spdk_blob *blob, + uint64_t start_ep, uint64_t *next_ep, + uint8_t **buf, size_t *remaining_sz) +{ + struct spdk_blob_md_descriptor_extent_table *desc; + size_t cur_sz; + uint64_t i, et_idx; + uint32_t extent_page, ep_len; + + /* The buffer must have room for at least num_clusters entry */ + cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters); + if (*remaining_sz < cur_sz) { + *next_ep = start_ep; + return; + } + + desc = (struct spdk_blob_md_descriptor_extent_table *)*buf; + desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE; + + desc->num_clusters = blob->active.num_clusters; + + ep_len = 1; + et_idx = 0; + for (i = start_ep; i < blob->active.num_extent_pages; i++) { + if (*remaining_sz < cur_sz + sizeof(desc->extent_page[0])) { + /* If we ran out of buffer space, return */ + break; + } + + extent_page = blob->active.extent_pages[i]; + /* Verify that next extent_page is unallocated */ + if (extent_page == 0 && + (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) { + ep_len++; + continue; + } + desc->extent_page[et_idx].page_idx = extent_page; + desc->extent_page[et_idx].num_pages = ep_len; + et_idx++; + + ep_len = 1; + cur_sz += sizeof(desc->extent_page[et_idx]); + } + *next_ep = i; + + desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx; + *remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length; + *buf += sizeof(struct spdk_blob_md_descriptor) + desc->length; +} + +static int +blob_serialize_extent_table(const struct spdk_blob *blob, + struct spdk_blob_md_page **pages, + struct spdk_blob_md_page *cur_page, + uint32_t *page_count, uint8_t **buf, + size_t *remaining_sz) +{ + uint64_t last_extent_page; + int rc; + + last_extent_page = 0; + /* At least single extent table entry has to be always persisted. + * Such case occurs with num_extent_pages == 0. 
*/ + while (last_extent_page <= blob->active.num_extent_pages) { + blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf, + remaining_sz); + + if (last_extent_page == blob->active.num_extent_pages) { + break; + } + + rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); + if (rc < 0) { + return rc; + } + + *buf = (uint8_t *)cur_page->descriptors; + *remaining_sz = sizeof(cur_page->descriptors); + } + + return 0; +} + +static void +blob_serialize_extent_rle(const struct spdk_blob *blob, + uint64_t start_cluster, uint64_t *next_cluster, + uint8_t **buf, size_t *buf_sz) +{ + struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; + size_t cur_sz; + uint64_t i, extent_idx; + uint64_t lba, lba_per_cluster, lba_count; + + /* The buffer must have room for at least one extent */ + cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]); + if (*buf_sz < cur_sz) { + *next_cluster = start_cluster; + return; + } + + desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf; + desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE; + + lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); + + lba = blob->active.clusters[start_cluster]; + lba_count = lba_per_cluster; + extent_idx = 0; + for (i = start_cluster + 1; i < blob->active.num_clusters; i++) { + if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) { + /* Run-length encode sequential non-zero LBA */ + lba_count += lba_per_cluster; + continue; + } else if (lba == 0 && blob->active.clusters[i] == 0) { + /* Run-length encode unallocated clusters */ + lba_count += lba_per_cluster; + continue; + } + desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; + desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; + extent_idx++; + + cur_sz += sizeof(desc_extent_rle->extents[extent_idx]); + + if (*buf_sz < cur_sz) { + /* If we ran out of buffer space, return */ + *next_cluster = i; + break; + } + + lba = blob->active.clusters[i]; + lba_count = lba_per_cluster; + } + + if (*buf_sz >= cur_sz) { + desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; + desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; + extent_idx++; + + *next_cluster = blob->active.num_clusters; + } + + desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx; + *buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; + *buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; +} + +static int +blob_serialize_extents_rle(const struct spdk_blob *blob, + struct spdk_blob_md_page **pages, + struct spdk_blob_md_page *cur_page, + uint32_t *page_count, uint8_t **buf, + size_t *remaining_sz) +{ + uint64_t last_cluster; + int rc; + + last_cluster = 0; + while (last_cluster < blob->active.num_clusters) { + blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz); + + if (last_cluster == blob->active.num_clusters) { + break; + } + + rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); + if (rc < 0) { + return rc; + } + + *buf = (uint8_t *)cur_page->descriptors; + *remaining_sz = sizeof(cur_page->descriptors); + } + + return 0; +} + +static void +blob_serialize_extent_page(const struct spdk_blob *blob, + uint64_t cluster, struct spdk_blob_md_page *page) +{ + struct spdk_blob_md_descriptor_extent_page *desc_extent; + uint64_t i, extent_idx; + uint64_t lba, lba_per_cluster; + uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * 
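blob_serialize_extent_rle() is the inverse of the EXTENT_RLE parse earlier: consecutive allocated clusters whose LBAs are contiguous collapse into a single (cluster_idx, length) pair, and runs of unallocated (zero) clusters collapse the same way. A standalone encoder sketch, working on cluster indices rather than LBAs:

/* Minimal sketch (not SPDK): run-length encode a flat cluster table into
 * (cluster_idx, length) pairs. Consecutive indices extend a run; zeroes
 * (unallocated clusters) form their own runs with cluster_idx == 0. */
#include <stdint.h>
#include <stdio.h>

struct extent_rle {
    uint32_t cluster_idx;
    uint32_t length;
};

static int encode_extent_rle(const uint64_t *clusters, int n,
                             struct extent_rle *out, int out_cap)
{
    int n_out = 0;

    if (n == 0) {
        return 0;
    }

    uint64_t start = clusters[0];
    uint32_t run = 1;

    for (int i = 1; i <= n; i++) {
        /* Extend the run while the next cluster continues it. */
        if (i < n &&
            ((start != 0 && clusters[i] == start + run) ||
             (start == 0 && clusters[i] == 0))) {
            run++;
            continue;
        }
        if (n_out == out_cap) {
            return -1;
        }
        out[n_out].cluster_idx = (uint32_t)start;
        out[n_out].length = run;
        n_out++;
        if (i < n) {
            start = clusters[i];
            run = 1;
        }
    }
    return n_out;
}

int main(void)
{
    uint64_t clusters[] = { 10, 11, 12, 0, 0, 40 };
    struct extent_rle out[8];
    int n = encode_extent_rle(clusters, 6, out, 8);

    for (int i = 0; i < n; i++) {
        printf("{ cluster_idx %u, length %u }\n",
               out[i].cluster_idx, out[i].length);
    }
    return 0;
}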
SPDK_EXTENTS_PER_EP; + + desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors; + desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE; + + lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); + + desc_extent->start_cluster_idx = start_cluster_idx; + extent_idx = 0; + for (i = start_cluster_idx; i < blob->active.num_clusters; i++) { + lba = blob->active.clusters[i]; + desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster; + if (extent_idx >= SPDK_EXTENTS_PER_EP) { + break; + } + } + desc_extent->length = sizeof(desc_extent->start_cluster_idx) + + sizeof(desc_extent->cluster_idx[0]) * extent_idx; +} + +static void +blob_serialize_flags(const struct spdk_blob *blob, + uint8_t *buf, size_t *buf_sz) +{ + struct spdk_blob_md_descriptor_flags *desc; + + /* + * Flags get serialized first, so we should always have room for the flags + * descriptor. + */ + assert(*buf_sz >= sizeof(*desc)); + + desc = (struct spdk_blob_md_descriptor_flags *)buf; + desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS; + desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor); + desc->invalid_flags = blob->invalid_flags; + desc->data_ro_flags = blob->data_ro_flags; + desc->md_ro_flags = blob->md_ro_flags; + + *buf_sz -= sizeof(*desc); +} + +static int +blob_serialize_xattrs(const struct spdk_blob *blob, + const struct spdk_xattr_tailq *xattrs, bool internal, + struct spdk_blob_md_page **pages, + struct spdk_blob_md_page *cur_page, + uint32_t *page_count, uint8_t **buf, + size_t *remaining_sz) +{ + const struct spdk_xattr *xattr; + int rc; + + TAILQ_FOREACH(xattr, xattrs, link) { + size_t required_sz = 0; + + rc = blob_serialize_xattr(xattr, + *buf, *remaining_sz, + &required_sz, internal); + if (rc < 0) { + /* Need to add a new page to the chain */ + rc = blob_serialize_add_page(blob, pages, page_count, + &cur_page); + if (rc < 0) { + spdk_free(*pages); + *pages = NULL; + *page_count = 0; + return rc; + } + + *buf = (uint8_t *)cur_page->descriptors; + *remaining_sz = sizeof(cur_page->descriptors); + + /* Try again */ + required_sz = 0; + rc = blob_serialize_xattr(xattr, + *buf, *remaining_sz, + &required_sz, internal); + + if (rc < 0) { + spdk_free(*pages); + *pages = NULL; + *page_count = 0; + return rc; + } + } + + *remaining_sz -= required_sz; + *buf += required_sz; + } + + return 0; +} + +static int +blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages, + uint32_t *page_count) +{ + struct spdk_blob_md_page *cur_page; + int rc; + uint8_t *buf; + size_t remaining_sz; + + assert(pages != NULL); + assert(page_count != NULL); + assert(blob != NULL); + assert(blob->state == SPDK_BLOB_STATE_DIRTY); + + *pages = NULL; + *page_count = 0; + + /* A blob always has at least 1 page, even if it has no descriptors */ + rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); + if (rc < 0) { + return rc; + } + + buf = (uint8_t *)cur_page->descriptors; + remaining_sz = sizeof(cur_page->descriptors); + + /* Serialize flags */ + blob_serialize_flags(blob, buf, &remaining_sz); + buf += sizeof(struct spdk_blob_md_descriptor_flags); + + /* Serialize xattrs */ + rc = blob_serialize_xattrs(blob, &blob->xattrs, false, + pages, cur_page, page_count, &buf, &remaining_sz); + if (rc < 0) { + return rc; + } + + /* Serialize internal xattrs */ + rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true, + pages, cur_page, page_count, &buf, &remaining_sz); + if (rc < 0) { + return rc; + } + + if (blob->use_extent_table) { + /* Serialize extent table */ + rc = 
blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz); + } else { + /* Serialize extents */ + rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz); + } + + return rc; +} + +struct spdk_blob_load_ctx { + struct spdk_blob *blob; + + struct spdk_blob_md_page *pages; + uint32_t num_pages; + uint32_t next_extent_page; + spdk_bs_sequence_t *seq; + + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; +}; + +static uint32_t +blob_md_page_calc_crc(void *page) +{ + uint32_t crc; + + crc = BLOB_CRC32C_INITIAL; + crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc); + crc ^= BLOB_CRC32C_INITIAL; + + return crc; + +} + +static void +blob_load_final(void *cb_arg, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + + if (bserrno == 0) { + blob_mark_clean(blob); + } + + ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno); + + /* Free the memory */ + spdk_free(ctx->pages); + free(ctx); +} + +static void +blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + + if (bserrno == 0) { + blob->back_bs_dev = bs_create_blob_bs_dev(snapshot); + if (blob->back_bs_dev == NULL) { + bserrno = -ENOMEM; + } + } + if (bserrno != 0) { + SPDK_ERRLOG("Snapshot fail\n"); + } + + blob_load_final(ctx, bserrno); +} + +static void blob_update_clear_method(struct spdk_blob *blob); + +static void +blob_load_backing_dev(void *cb_arg) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + const void *value; + size_t len; + int rc; + + if (spdk_blob_is_thin_provisioned(blob)) { + rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true); + if (rc == 0) { + if (len != sizeof(spdk_blob_id)) { + blob_load_final(ctx, -EINVAL); + return; + } + /* open snapshot blob and continue in the callback function */ + blob->parent_id = *(spdk_blob_id *)value; + spdk_bs_open_blob(blob->bs, blob->parent_id, + blob_load_snapshot_cpl, ctx); + return; + } else { + /* add zeroes_dev for thin provisioned blob */ + blob->back_bs_dev = bs_create_zeroes_dev(); + } + } else { + /* standard blob */ + blob->back_bs_dev = NULL; + } + blob_load_final(ctx, 0); +} + +static void +blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_md_page *page; + uint64_t i; + uint32_t crc; + uint64_t lba; + void *tmp; + uint64_t sz; + + if (bserrno) { + SPDK_ERRLOG("Extent page read failed: %d\n", bserrno); + blob_load_final(ctx, bserrno); + return; + } + + if (ctx->pages == NULL) { + /* First iteration of this function, allocate buffer for single EXTENT_PAGE */ + ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (!ctx->pages) { + blob_load_final(ctx, -ENOMEM); + return; + } + ctx->num_pages = 1; + ctx->next_extent_page = 0; + } else { + page = &ctx->pages[0]; + crc = blob_md_page_calc_crc(page); + if (crc != page->crc) { + blob_load_final(ctx, -EINVAL); + return; + } + + if (page->next != SPDK_INVALID_MD_PAGE) { + blob_load_final(ctx, -EINVAL); + return; + } + + bserrno = blob_parse_extent_page(page, blob); + if (bserrno) { + blob_load_final(ctx, bserrno); + return; + } + } + + for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) { + if (blob->active.extent_pages[i] != 0) { + /* Extent page was allocated, read and parse 
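blob_md_page_calc_crc() covers the entire metadata page except its last four bytes, which hold the CRC itself, and XORs the initial value back out at the end. The sketch below reproduces that shape with a bit-at-a-time CRC-32C in place of spdk_crc32c_update(); the 0xFFFFFFFF seed is an assumption standing in for BLOB_CRC32C_INITIAL.

/* Minimal sketch (not SPDK): compute a CRC over a "page" excluding its
 * trailing 4 bytes, store it there, then verify on read-back. Uses a
 * bit-at-a-time CRC-32C (Castagnoli, reflected polynomial). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ     4096
#define CRC_INITIAL 0xFFFFFFFFu   /* stand-in for BLOB_CRC32C_INITIAL */

static uint32_t crc32c_update(const void *buf, size_t len, uint32_t crc)
{
    const uint8_t *p = buf;

    while (len--) {
        crc ^= *p++;
        for (int k = 0; k < 8; k++) {
            crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
        }
    }
    return crc;
}

static uint32_t page_calc_crc(const uint8_t *page)
{
    uint32_t crc = CRC_INITIAL;

    crc = crc32c_update(page, PAGE_SZ - 4, crc);   /* skip the crc field */
    return crc ^ CRC_INITIAL;
}

int main(void)
{
    static uint8_t page[PAGE_SZ];

    memset(page, 0xAB, sizeof(page));

    uint32_t crc = page_calc_crc(page);
    memcpy(page + PAGE_SZ - 4, &crc, 4);           /* store in-page */

    printf("crc ok: %s\n", page_calc_crc(page) == crc ? "yes" : "no");
    return 0;
}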
it. */ + lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]); + ctx->next_extent_page = i + 1; + + bs_sequence_read_dev(seq, &ctx->pages[0], lba, + bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), + blob_load_cpl_extents_cpl, ctx); + return; + } else { + /* Thin provisioned blobs can point to unallocated extent pages. + * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */ + + sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP); + blob->active.num_clusters += sz; + blob->remaining_clusters_in_et -= sz; + + assert(spdk_blob_is_thin_provisioned(blob)); + assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0); + + tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); + if (tmp == NULL) { + blob_load_final(ctx, -ENOMEM); + return; + } + memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0, + sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size)); + blob->active.clusters = tmp; + blob->active.cluster_array_size = blob->active.num_clusters; + } + } + + blob_load_backing_dev(ctx); +} + +static void +blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_md_page *page; + int rc; + uint32_t crc; + uint32_t current_page; + + if (ctx->num_pages == 1) { + current_page = bs_blobid_to_page(blob->id); + } else { + assert(ctx->num_pages != 0); + page = &ctx->pages[ctx->num_pages - 2]; + current_page = page->next; + } + + if (bserrno) { + SPDK_ERRLOG("Metadata page %d read failed for blobid %lu: %d\n", + current_page, blob->id, bserrno); + blob_load_final(ctx, bserrno); + return; + } + + page = &ctx->pages[ctx->num_pages - 1]; + crc = blob_md_page_calc_crc(page); + if (crc != page->crc) { + SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %lu\n", + current_page, blob->id); + blob_load_final(ctx, -EINVAL); + return; + } + + if (page->next != SPDK_INVALID_MD_PAGE) { + uint32_t next_page = page->next; + uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page); + + /* Read the next page */ + ctx->num_pages++; + ctx->pages = spdk_realloc(ctx->pages, (sizeof(*page) * ctx->num_pages), + sizeof(*page)); + if (ctx->pages == NULL) { + blob_load_final(ctx, -ENOMEM); + return; + } + + bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1], + next_lba, + bs_byte_to_lba(blob->bs, sizeof(*page)), + blob_load_cpl, ctx); + return; + } + + /* Parse the pages */ + rc = blob_parse(ctx->pages, ctx->num_pages, blob); + if (rc) { + blob_load_final(ctx, rc); + return; + } + + if (blob->extent_table_found == true) { + /* If EXTENT_TABLE was found, that means support for it should be enabled. */ + assert(blob->extent_rle_found == false); + blob->use_extent_table = true; + } else { + /* If EXTENT_RLE or no extent_* descriptor was found disable support + * for extent table. No extent_* descriptors means that blob has length of 0 + * and no extent_rle descriptors were persisted for it. + * EXTENT_TABLE if used, is always present in metadata regardless of length. */ + blob->use_extent_table = false; + } + + /* Check the clear_method stored in metadata vs what may have been passed + * via spdk_bs_open_blob_ext() and update accordingly. 
+ */ + blob_update_clear_method(blob); + + spdk_free(ctx->pages); + ctx->pages = NULL; + + if (blob->extent_table_found) { + blob_load_cpl_extents_cpl(seq, ctx, 0); + } else { + blob_load_backing_dev(ctx); + } +} + +/* Load a blob from disk given a blobid */ +static void +blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_blob_load_ctx *ctx; + struct spdk_blob_store *bs; + uint32_t page_num; + uint64_t lba; + + blob_verify_md_op(blob); + + bs = blob->bs; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(seq, cb_arg, -ENOMEM); + return; + } + + ctx->blob = blob; + ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE); + if (!ctx->pages) { + free(ctx); + cb_fn(seq, cb_arg, -ENOMEM); + return; + } + ctx->num_pages = 1; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->seq = seq; + + page_num = bs_blobid_to_page(blob->id); + lba = bs_md_page_to_lba(blob->bs, page_num); + + blob->state = SPDK_BLOB_STATE_LOADING; + + bs_sequence_read_dev(seq, &ctx->pages[0], lba, + bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE), + blob_load_cpl, ctx); +} + +struct spdk_blob_persist_ctx { + struct spdk_blob *blob; + + struct spdk_bs_super_block *super; + + struct spdk_blob_md_page *pages; + uint32_t next_extent_page; + struct spdk_blob_md_page *extent_page; + + spdk_bs_sequence_t *seq; + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; + TAILQ_ENTRY(spdk_blob_persist_ctx) link; +}; + +static void +bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba, + uint32_t lba_count) +{ + switch (ctx->blob->clear_method) { + case BLOB_CLEAR_WITH_DEFAULT: + case BLOB_CLEAR_WITH_UNMAP: + bs_batch_unmap_dev(batch, lba, lba_count); + break; + case BLOB_CLEAR_WITH_WRITE_ZEROES: + bs_batch_write_zeroes_dev(batch, lba, lba_count); + break; + case BLOB_CLEAR_WITH_NONE: + default: + break; + } +} + +static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx); + +static void +blob_persist_complete(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob_persist_ctx *next_persist; + struct spdk_blob *blob = ctx->blob; + + if (bserrno == 0) { + blob_mark_clean(blob); + } + + assert(ctx == TAILQ_FIRST(&blob->pending_persists)); + TAILQ_REMOVE(&blob->pending_persists, ctx, link); + + next_persist = TAILQ_FIRST(&blob->pending_persists); + + /* Call user callback */ + ctx->cb_fn(seq, ctx->cb_arg, bserrno); + + /* Free the memory */ + spdk_free(ctx->pages); + free(ctx); + + if (next_persist != NULL) { + blob_persist_check_dirty(next_persist); + } +} + +static void +blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + size_t i; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* Release all clusters that were truncated */ + for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { + uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]); + + /* Nothing to release if it was not allocated */ + if (blob->active.clusters[i] != 0) { + bs_release_cluster(bs, cluster_num); + } + } + + if (blob->active.num_clusters == 0) { + free(blob->active.clusters); + blob->active.clusters = NULL; + blob->active.cluster_array_size = 0; + } else if (blob->active.num_clusters != blob->active.cluster_array_size) { +#ifndef 
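blob_persist_complete() above pops the finished request off pending_persists and kicks the next queued one, so persists of the same blob never run concurrently (the enqueue side appears further down in blob_persist()). A standalone sketch of that single-consumer queue, using sys/queue.h TAILQ as the original does:

/* Minimal sketch (not SPDK): serialize "persist" requests through a queue so
 * only one runs at a time; completion pops the head and starts the next. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct persist_req {
    int id;
    TAILQ_ENTRY(persist_req) link;
};

static TAILQ_HEAD(, persist_req) pending = TAILQ_HEAD_INITIALIZER(pending);

static void persist_start(struct persist_req *req)
{
    printf("persist %d started\n", req->id);
}

static void persist_submit(struct persist_req *req)
{
    if (!TAILQ_EMPTY(&pending)) {
        /* Another persist is in flight; just queue behind it. */
        TAILQ_INSERT_TAIL(&pending, req, link);
        return;
    }
    TAILQ_INSERT_HEAD(&pending, req, link);
    persist_start(req);
}

static void persist_complete(void)
{
    struct persist_req *done = TAILQ_FIRST(&pending);
    struct persist_req *next;

    TAILQ_REMOVE(&pending, done, link);
    printf("persist %d completed\n", done->id);
    free(done);

    next = TAILQ_FIRST(&pending);
    if (next != NULL) {
        persist_start(next);     /* kick the next queued request */
    }
}

int main(void)
{
    for (int i = 1; i <= 3; i++) {
        struct persist_req *r = calloc(1, sizeof(*r));
        r->id = i;
        persist_submit(r);       /* 1 starts; 2 and 3 queue */
    }
    persist_complete();          /* 1 done, 2 starts */
    persist_complete();          /* 2 done, 3 starts */
    persist_complete();          /* 3 done */
    return 0;
}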
__clang_analyzer__ + void *tmp; + + /* scan-build really can't figure reallocs, workaround it */ + tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters); + assert(tmp != NULL); + blob->active.clusters = tmp; + + tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages); + assert(tmp != NULL); + blob->active.extent_pages = tmp; +#endif + blob->active.extent_pages_array_size = blob->active.num_extent_pages; + blob->active.cluster_array_size = blob->active.num_clusters; + } + + /* TODO: Add path to persist clear extent pages. */ + blob_persist_complete(seq, ctx, bserrno); +} + +static void +blob_persist_clear_clusters(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + spdk_bs_batch_t *batch; + size_t i; + uint64_t lba; + uint32_t lba_count; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* Clusters don't move around in blobs. The list shrinks or grows + * at the end, but no changes ever occur in the middle of the list. + */ + + batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx); + + /* Clear all clusters that were truncated */ + lba = 0; + lba_count = 0; + for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { + uint64_t next_lba = blob->active.clusters[i]; + uint32_t next_lba_count = bs_cluster_to_lba(bs, 1); + + if (next_lba > 0 && (lba + lba_count) == next_lba) { + /* This cluster is contiguous with the previous one. */ + lba_count += next_lba_count; + continue; + } + + /* This cluster is not contiguous with the previous one. */ + + /* If a run of LBAs previously existing, clear them now */ + if (lba_count > 0) { + bs_batch_clear_dev(ctx, batch, lba, lba_count); + } + + /* Start building the next batch */ + lba = next_lba; + if (next_lba > 0) { + lba_count = next_lba_count; + } else { + lba_count = 0; + } + } + + /* If we ended with a contiguous set of LBAs, clear them now */ + if (lba_count > 0) { + bs_batch_clear_dev(ctx, batch, lba, lba_count); + } + + bs_batch_close(batch); +} + +static void +blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + size_t i; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* This loop starts at 1 because the first page is special and handled + * below. The pages (except the first) are never written in place, + * so any pages in the clean list must be zeroed. 
+ */ + for (i = 1; i < blob->clean.num_pages; i++) { + bs_release_md_page(bs, blob->clean.pages[i]); + } + + if (blob->active.num_pages == 0) { + uint32_t page_num; + + page_num = bs_blobid_to_page(blob->id); + bs_release_md_page(bs, page_num); + } + + /* Move on to clearing clusters */ + blob_persist_clear_clusters(seq, ctx, 0); +} + +static void +blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t lba; + uint32_t lba_count; + spdk_bs_batch_t *batch; + size_t i; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx); + + lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); + + /* This loop starts at 1 because the first page is special and handled + * below. The pages (except the first) are never written in place, + * so any pages in the clean list must be zeroed. + */ + for (i = 1; i < blob->clean.num_pages; i++) { + lba = bs_md_page_to_lba(bs, blob->clean.pages[i]); + + bs_batch_write_zeroes_dev(batch, lba, lba_count); + } + + /* The first page will only be zeroed if this is a delete. */ + if (blob->active.num_pages == 0) { + uint32_t page_num; + + /* The first page in the metadata goes where the blobid indicates */ + page_num = bs_blobid_to_page(blob->id); + lba = bs_md_page_to_lba(bs, page_num); + + bs_batch_write_zeroes_dev(batch, lba, lba_count); + } + + bs_batch_close(batch); +} + +static void +blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t lba; + uint32_t lba_count; + struct spdk_blob_md_page *page; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + if (blob->active.num_pages == 0) { + /* Move on to the next step */ + blob_persist_zero_pages(seq, ctx, 0); + return; + } + + lba_count = bs_byte_to_lba(bs, sizeof(*page)); + + page = &ctx->pages[0]; + /* The first page in the metadata goes where the blobid indicates */ + lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id)); + + bs_sequence_write_dev(seq, page, lba, lba_count, + blob_persist_zero_pages, ctx); +} + +static void +blob_persist_write_page_chain(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t lba; + uint32_t lba_count; + struct spdk_blob_md_page *page; + spdk_bs_batch_t *batch; + size_t i; + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* Clusters don't move around in blobs. The list shrinks or grows + * at the end, but no changes ever occur in the middle of the list. + */ + + lba_count = bs_byte_to_lba(bs, sizeof(*page)); + + batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx); + + /* This starts at 1. 
The root page is not written until + * all of the others are finished + */ + for (i = 1; i < blob->active.num_pages; i++) { + page = &ctx->pages[i]; + assert(page->sequence_num == i); + + lba = bs_md_page_to_lba(bs, blob->active.pages[i]); + + bs_batch_write_dev(batch, page, lba, lba_count); + } + + bs_batch_close(batch); +} + +static int +blob_resize(struct spdk_blob *blob, uint64_t sz) +{ + uint64_t i; + uint64_t *tmp; + uint64_t lfc; /* lowest free cluster */ + uint32_t lfmd; /* lowest free md page */ + uint64_t num_clusters; + uint32_t *ep_tmp; + uint64_t new_num_ep = 0, current_num_ep = 0; + struct spdk_blob_store *bs; + + bs = blob->bs; + + blob_verify_md_op(blob); + + if (blob->active.num_clusters == sz) { + return 0; + } + + if (blob->active.num_clusters < blob->active.cluster_array_size) { + /* If this blob was resized to be larger, then smaller, then + * larger without syncing, then the cluster array already + * contains spare assigned clusters we can use. + */ + num_clusters = spdk_min(blob->active.cluster_array_size, + sz); + } else { + num_clusters = blob->active.num_clusters; + } + + if (blob->use_extent_table) { + /* Round up since every cluster beyond current Extent Table size, + * requires new extent page. */ + new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP); + current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP); + } + + /* Do two passes - one to verify that we can obtain enough clusters + * and md pages, another to actually claim them. + */ + + if (spdk_blob_is_thin_provisioned(blob) == false) { + lfc = 0; + for (i = num_clusters; i < sz; i++) { + lfc = spdk_bit_array_find_first_clear(bs->used_clusters, lfc); + if (lfc == UINT32_MAX) { + /* No more free clusters. Cannot satisfy the request */ + return -ENOSPC; + } + lfc++; + } + lfmd = 0; + for (i = current_num_ep; i < new_num_ep ; i++) { + lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd); + if (lfmd == UINT32_MAX) { + /* No more free md pages. Cannot satisfy the request */ + return -ENOSPC; + } + } + } + + if (sz > num_clusters) { + /* Expand the cluster array if necessary. + * We only shrink the array when persisting. 
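blob_resize() deliberately makes two passes over the allocation maps: the first only verifies that enough free clusters and metadata pages exist, the second claims them (through bs_allocate_cluster(), under the mutex), so state is only dirtied once the request is known to be satisfiable. A standalone sketch of that check-then-claim shape, without the locking:

/* Minimal sketch (not SPDK): two-pass resize, mirroring blob_resize():
 * verify first without taking anything, then claim the same slots. */
#include <stdbool.h>
#include <stdio.h>

#define NUM_CLUSTERS 32

static bool used[NUM_CLUSTERS];

static int find_first_clear(int from)
{
    for (int i = from; i < NUM_CLUSTERS; i++) {
        if (!used[i]) {
            return i;
        }
    }
    return -1;
}

static int resize(int needed)
{
    int lfc;

    /* Pass 1: verify only. Walk the free slots we would use. */
    lfc = 0;
    for (int i = 0; i < needed; i++) {
        lfc = find_first_clear(lfc);
        if (lfc < 0) {
            return -1;           /* -ENOSPC; nothing was claimed */
        }
        lfc++;
    }

    /* Pass 2: actually claim the same slots. */
    lfc = 0;
    for (int i = 0; i < needed; i++) {
        lfc = find_first_clear(lfc);
        used[lfc] = true;
        lfc++;
    }
    return 0;
}

int main(void)
{
    printf("resize by 8: %s\n", resize(8) == 0 ? "ok" : "ENOSPC");
    printf("resize by 30: %s\n", resize(30) == 0 ? "ok" : "ENOSPC");
    return 0;
}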
+ */ + tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz); + if (sz > 0 && tmp == NULL) { + return -ENOMEM; + } + memset(tmp + blob->active.cluster_array_size, 0, + sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size)); + blob->active.clusters = tmp; + blob->active.cluster_array_size = sz; + + /* Expand the extents table, only if enough clusters were added */ + if (new_num_ep > current_num_ep && blob->use_extent_table) { + ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep); + if (new_num_ep > 0 && ep_tmp == NULL) { + return -ENOMEM; + } + memset(ep_tmp + blob->active.extent_pages_array_size, 0, + sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size)); + blob->active.extent_pages = ep_tmp; + blob->active.extent_pages_array_size = new_num_ep; + } + } + + blob->state = SPDK_BLOB_STATE_DIRTY; + + if (spdk_blob_is_thin_provisioned(blob) == false) { + lfc = 0; + lfmd = 0; + for (i = num_clusters; i < sz; i++) { + bs_allocate_cluster(blob, i, &lfc, &lfmd, true); + lfc++; + lfmd++; + } + } + + blob->active.num_clusters = sz; + blob->active.num_extent_pages = new_num_ep; + + return 0; +} + +static void +blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx) +{ + spdk_bs_sequence_t *seq = ctx->seq; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t i; + uint32_t page_num; + void *tmp; + int rc; + + /* Generate the new metadata */ + rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages); + if (rc < 0) { + blob_persist_complete(seq, ctx, rc); + return; + } + + assert(blob->active.num_pages >= 1); + + /* Resize the cache of page indices */ + tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); + if (!tmp) { + blob_persist_complete(seq, ctx, -ENOMEM); + return; + } + blob->active.pages = tmp; + + /* Assign this metadata to pages. This requires two passes - + * one to verify that there are enough pages and a second + * to actually claim them. */ + page_num = 0; + /* Note that this loop starts at one. The first page location is fixed by the blobid. */ + for (i = 1; i < blob->active.num_pages; i++) { + page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); + if (page_num == UINT32_MAX) { + blob_persist_complete(seq, ctx, -ENOMEM); + return; + } + page_num++; + } + + page_num = 0; + blob->active.pages[0] = bs_blobid_to_page(blob->id); + for (i = 1; i < blob->active.num_pages; i++) { + page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); + ctx->pages[i - 1].next = page_num; + /* Now that previous metadata page is complete, calculate the crc for it. 
*/ + ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); + blob->active.pages[i] = page_num; + bs_claim_md_page(bs, page_num); + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming page %u for blob %lu\n", page_num, blob->id); + page_num++; + } + ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); + /* Start writing the metadata from last page to first */ + blob->state = SPDK_BLOB_STATE_CLEAN; + blob_persist_write_page_chain(seq, ctx, 0); +} + +static void +blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + size_t i; + uint32_t extent_page_id; + uint32_t page_count = 0; + int rc; + + if (ctx->extent_page != NULL) { + spdk_free(ctx->extent_page); + ctx->extent_page = NULL; + } + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + /* Only write out changed extent pages */ + for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) { + extent_page_id = blob->active.extent_pages[i]; + if (extent_page_id == 0) { + /* No Extent Page to persist */ + assert(spdk_blob_is_thin_provisioned(blob)); + continue; + } + /* Writing out new extent page for the first time. Either active extent pages is larger + * than clean extent pages or there was no extent page assigned due to thin provisioning. */ + if (i >= blob->clean.extent_pages_array_size || blob->clean.extent_pages[i] == 0) { + blob->state = SPDK_BLOB_STATE_DIRTY; + assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id)); + ctx->next_extent_page = i + 1; + rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page); + if (rc < 0) { + blob_persist_complete(seq, ctx, rc); + return; + } + + blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page); + + ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page); + + bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id), + bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), + blob_persist_write_extent_pages, ctx); + return; + } + assert(blob->clean.extent_pages[i] != 0); + } + + blob_persist_generate_new_md(ctx); +} + +static void +blob_persist_start(struct spdk_blob_persist_ctx *ctx) +{ + spdk_bs_sequence_t *seq = ctx->seq; + struct spdk_blob *blob = ctx->blob; + + if (blob->active.num_pages == 0) { + /* This is the signal that the blob should be deleted. + * Immediately jump to the clean up routine. 
*/ + assert(blob->clean.num_pages > 0); + blob->state = SPDK_BLOB_STATE_CLEAN; + blob_persist_zero_pages(seq, ctx, 0); + return; + + } + + blob_persist_write_extent_pages(seq, ctx, 0); +} + +static void +blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + + spdk_free(ctx->super); + + if (bserrno != 0) { + blob_persist_complete(seq, ctx, bserrno); + return; + } + + ctx->blob->bs->clean = 0; + + blob_persist_start(ctx); +} + +static void +bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, + struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg); + + +static void +blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + + if (bserrno != 0) { + spdk_free(ctx->super); + blob_persist_complete(seq, ctx, bserrno); + return; + } + + ctx->super->clean = 0; + if (ctx->super->size == 0) { + ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen; + } + + bs_write_super(seq, ctx->blob->bs, ctx->super, blob_persist_dirty_cpl, ctx); +} + +static void +blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx) +{ + if (ctx->blob->bs->clean) { + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + blob_persist_complete(ctx->seq, ctx, -ENOMEM); + return; + } + + bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->blob->bs, 0), + bs_byte_to_lba(ctx->blob->bs, sizeof(*ctx->super)), + blob_persist_dirty, ctx); + } else { + blob_persist_start(ctx); + } +} + +/* Write a blob to disk */ +static void +blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_blob_persist_ctx *ctx; + + blob_verify_md_op(blob); + + if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->pending_persists)) { + cb_fn(seq, cb_arg, 0); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(seq, cb_arg, -ENOMEM); + return; + } + ctx->blob = blob; + ctx->seq = seq; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->next_extent_page = 0; + + /* Multiple blob persists can affect one another, via blob->state or + * blob mutable data changes. To prevent it, queue up the persists. 
*/ + if (!TAILQ_EMPTY(&blob->pending_persists)) { + TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link); + return; + } + TAILQ_INSERT_HEAD(&blob->pending_persists, ctx, link); + + blob_persist_check_dirty(ctx); +} + +struct spdk_blob_copy_cluster_ctx { + struct spdk_blob *blob; + uint8_t *buf; + uint64_t page; + uint64_t new_cluster; + uint32_t new_extent_page; + spdk_bs_sequence_t *seq; +}; + +static void +blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq; + TAILQ_HEAD(, spdk_bs_request_set) requests; + spdk_bs_user_op_t *op; + + TAILQ_INIT(&requests); + TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link); + + while (!TAILQ_EMPTY(&requests)) { + op = TAILQ_FIRST(&requests); + TAILQ_REMOVE(&requests, op, link); + if (bserrno == 0) { + bs_user_op_execute(op); + } else { + bs_user_op_abort(op); + } + } + + spdk_free(ctx->buf); + free(ctx); +} + +static void +blob_insert_cluster_cpl(void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + + if (bserrno) { + if (bserrno == -EEXIST) { + /* The metadata insert failed because another thread + * allocated the cluster first. Free our cluster + * but continue without error. */ + bserrno = 0; + } + bs_release_cluster(ctx->blob->bs, ctx->new_cluster); + if (ctx->new_extent_page != 0) { + bs_release_md_page(ctx->blob->bs, ctx->new_extent_page); + } + } + + bs_sequence_finish(ctx->seq, bserrno); +} + +static void +blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + uint32_t cluster_number; + + if (bserrno) { + /* The write failed, so jump to the final completion handler */ + bs_sequence_finish(seq, bserrno); + return; + } + + cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page); + + blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, + ctx->new_extent_page, blob_insert_cluster_cpl, ctx); +} + +static void +blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + + if (bserrno != 0) { + /* The read failed, so jump to the final completion handler */ + bs_sequence_finish(seq, bserrno); + return; + } + + /* Write whole cluster */ + bs_sequence_write_dev(seq, ctx->buf, + bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster), + bs_cluster_to_lba(ctx->blob->bs, 1), + blob_write_copy_cpl, ctx); +} + +static void +bs_allocate_and_copy_cluster(struct spdk_blob *blob, + struct spdk_io_channel *_ch, + uint64_t io_unit, spdk_bs_user_op_t *op) +{ + struct spdk_bs_cpl cpl; + struct spdk_bs_channel *ch; + struct spdk_blob_copy_cluster_ctx *ctx; + uint32_t cluster_start_page; + uint32_t cluster_number; + int rc; + + ch = spdk_io_channel_get_ctx(_ch); + + if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) { + /* There are already operations pending. Queue this user op + * and return because it will be re-executed when the outstanding + * cluster allocation completes. */ + TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); + return; + } + + /* Round the io_unit offset down to the first page in the cluster */ + cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit); + + /* Calculate which index in the metadata cluster array the corresponding + * cluster is supposed to be at. 
*/ + cluster_number = bs_io_unit_to_cluster_number(blob, io_unit); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + bs_user_op_abort(op); + return; + } + + assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0); + + ctx->blob = blob; + ctx->page = cluster_start_page; + + if (blob->parent_id != SPDK_BLOBID_INVALID) { + ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->buf) { + SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n", + blob->bs->cluster_sz); + free(ctx); + bs_user_op_abort(op); + return; + } + } + + rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page, + false); + if (rc != 0) { + spdk_free(ctx->buf); + free(ctx); + bs_user_op_abort(op); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl; + cpl.u.blob_basic.cb_arg = ctx; + + ctx->seq = bs_sequence_start(_ch, &cpl); + if (!ctx->seq) { + bs_release_cluster(blob->bs, ctx->new_cluster); + spdk_free(ctx->buf); + free(ctx); + bs_user_op_abort(op); + return; + } + + /* Queue the user op to block other incoming operations */ + TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); + + if (blob->parent_id != SPDK_BLOBID_INVALID) { + /* Read cluster from backing device */ + bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf, + bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), + bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), + blob_write_copy, ctx); + } else { + blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, + ctx->new_extent_page, blob_insert_cluster_cpl, ctx); + } +} + +static inline void +blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length, + uint64_t *lba, uint32_t *lba_count) +{ + *lba_count = length; + + if (!bs_io_unit_is_allocated(blob, io_unit)) { + assert(blob->back_bs_dev != NULL); + *lba = bs_io_unit_to_back_dev_lba(blob, io_unit); + *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count); + } else { + *lba = bs_blob_io_unit_to_lba(blob, io_unit); + } +} + +struct op_split_ctx { + struct spdk_blob *blob; + struct spdk_io_channel *channel; + uint64_t io_unit_offset; + uint64_t io_units_remaining; + void *curr_payload; + enum spdk_blob_op_type op_type; + spdk_bs_sequence_t *seq; +}; + +static void +blob_request_submit_op_split_next(void *cb_arg, int bserrno) +{ + struct op_split_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_io_channel *ch = ctx->channel; + enum spdk_blob_op_type op_type = ctx->op_type; + uint8_t *buf = ctx->curr_payload; + uint64_t offset = ctx->io_unit_offset; + uint64_t length = ctx->io_units_remaining; + uint64_t op_length; + + if (bserrno != 0 || ctx->io_units_remaining == 0) { + bs_sequence_finish(ctx->seq, bserrno); + free(ctx); + return; + } + + op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob, + offset)); + + /* Update length and payload for next operation */ + ctx->io_units_remaining -= op_length; + ctx->io_unit_offset += op_length; + if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) { + ctx->curr_payload += op_length * blob->bs->io_unit_size; + } + + switch (op_type) { + case SPDK_BLOB_READ: + spdk_blob_io_read(blob, ch, buf, offset, op_length, + blob_request_submit_op_split_next, ctx); + break; + case SPDK_BLOB_WRITE: + spdk_blob_io_write(blob, ch, buf, offset, op_length, + blob_request_submit_op_split_next, ctx); + break; + 
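bs_allocate_and_copy_cluster() handles a write into a cluster that is still backed by the snapshot: allocate a fresh cluster, read the whole backing cluster, write the copy out, and only then insert the new cluster into the blob's metadata (queued writers are replayed afterwards). The sketch below shrinks that copy-on-write idea to an in-memory toy; the buffers and the allocated[] map are purely illustrative.

/* Minimal sketch (not SPDK): copy-on-write at cluster granularity. A write
 * into a cluster still backed by the parent first copies the whole parent
 * cluster into newly allocated space, then applies the write. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CLUSTER_SZ   16
#define NUM_CLUSTERS 4

static uint8_t parent[NUM_CLUSTERS][CLUSTER_SZ];   /* read-only backing data */
static uint8_t child[NUM_CLUSTERS][CLUSTER_SZ];    /* clone's own clusters */
static int allocated[NUM_CLUSTERS];                /* 1 once copied/owned */

static void clone_write(int cluster, int offset, const void *buf, int len)
{
    if (!allocated[cluster]) {
        /* First touch: copy the whole backing cluster, then mark it owned
         * ("insert into the metadata" in blobstore terms). */
        memcpy(child[cluster], parent[cluster], CLUSTER_SZ);
        allocated[cluster] = 1;
    }
    memcpy(&child[cluster][offset], buf, len);
}

static const uint8_t *clone_read(int cluster)
{
    /* Unallocated clusters are still served from the parent. */
    return allocated[cluster] ? child[cluster] : parent[cluster];
}

int main(void)
{
    memset(parent, 'p', sizeof(parent));

    clone_write(1, 4, "XY", 2);
    printf("cluster 0: %.4s (from parent)\n", (const char *)clone_read(0));
    printf("cluster 1: %.8s (copied, then modified)\n",
           (const char *)clone_read(1));
    return 0;
}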
case SPDK_BLOB_UNMAP: + spdk_blob_io_unmap(blob, ch, offset, op_length, + blob_request_submit_op_split_next, ctx); + break; + case SPDK_BLOB_WRITE_ZEROES: + spdk_blob_io_write_zeroes(blob, ch, offset, op_length, + blob_request_submit_op_split_next, ctx); + break; + case SPDK_BLOB_READV: + case SPDK_BLOB_WRITEV: + SPDK_ERRLOG("readv/write not valid\n"); + bs_sequence_finish(ctx->seq, -EINVAL); + free(ctx); + break; + } +} + +static void +blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) +{ + struct op_split_ctx *ctx; + spdk_bs_sequence_t *seq; + struct spdk_bs_cpl cpl; + + assert(blob != NULL); + + ctx = calloc(1, sizeof(struct op_split_ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(ch, &cpl); + if (!seq) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->blob = blob; + ctx->channel = ch; + ctx->curr_payload = payload; + ctx->io_unit_offset = offset; + ctx->io_units_remaining = length; + ctx->op_type = op_type; + ctx->seq = seq; + + blob_request_submit_op_split_next(ctx, 0); +} + +static void +blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) +{ + struct spdk_bs_cpl cpl; + uint64_t lba; + uint32_t lba_count; + + assert(blob != NULL); + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); + + if (blob->frozen_refcnt) { + /* This blob I/O is frozen */ + spdk_bs_user_op_t *op; + struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch); + + op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); + + return; + } + + switch (op_type) { + case SPDK_BLOB_READ: { + spdk_bs_batch_t *batch; + + batch = bs_batch_open(_ch, &cpl); + if (!batch) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (bs_io_unit_is_allocated(blob, offset)) { + /* Read from the blob */ + bs_batch_read_dev(batch, payload, lba, lba_count); + } else { + /* Read from the backing block device */ + bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count); + } + + bs_batch_close(batch); + break; + } + case SPDK_BLOB_WRITE: + case SPDK_BLOB_WRITE_ZEROES: { + if (bs_io_unit_is_allocated(blob, offset)) { + /* Write to the blob */ + spdk_bs_batch_t *batch; + + if (lba_count == 0) { + cb_fn(cb_arg, 0); + return; + } + + batch = bs_batch_open(_ch, &cpl); + if (!batch) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (op_type == SPDK_BLOB_WRITE) { + bs_batch_write_dev(batch, payload, lba, lba_count); + } else { + bs_batch_write_zeroes_dev(batch, lba, lba_count); + } + + bs_batch_close(batch); + } else { + /* Queue this operation and allocate the cluster */ + spdk_bs_user_op_t *op; + + op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + bs_allocate_and_copy_cluster(blob, _ch, offset, op); + } + break; + } + case SPDK_BLOB_UNMAP: { + spdk_bs_batch_t *batch; + + batch = bs_batch_open(_ch, 
&cpl); + if (!batch) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (bs_io_unit_is_allocated(blob, offset)) { + bs_batch_unmap_dev(batch, lba, lba_count); + } + + bs_batch_close(batch); + break; + } + case SPDK_BLOB_READV: + case SPDK_BLOB_WRITEV: + SPDK_ERRLOG("readv/write not valid\n"); + cb_fn(cb_arg, -EINVAL); + break; + } +} + +static void +blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) +{ + assert(blob != NULL); + + if (blob->data_ro && op_type != SPDK_BLOB_READ) { + cb_fn(cb_arg, -EPERM); + return; + } + + if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { + cb_fn(cb_arg, -EINVAL); + return; + } + if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) { + blob_request_submit_op_single(_channel, blob, payload, offset, length, + cb_fn, cb_arg, op_type); + } else { + blob_request_submit_op_split(_channel, blob, payload, offset, length, + cb_fn, cb_arg, op_type); + } +} + +struct rw_iov_ctx { + struct spdk_blob *blob; + struct spdk_io_channel *channel; + spdk_blob_op_complete cb_fn; + void *cb_arg; + bool read; + int iovcnt; + struct iovec *orig_iov; + uint64_t io_unit_offset; + uint64_t io_units_remaining; + uint64_t io_units_done; + struct iovec iov[0]; +}; + +static void +rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + assert(cb_arg == NULL); + bs_sequence_finish(seq, bserrno); +} + +static void +rw_iov_split_next(void *cb_arg, int bserrno) +{ + struct rw_iov_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct iovec *iov, *orig_iov; + int iovcnt; + size_t orig_iovoff; + uint64_t io_units_count, io_units_to_boundary, io_unit_offset; + uint64_t byte_count; + + if (bserrno != 0 || ctx->io_units_remaining == 0) { + ctx->cb_fn(ctx->cb_arg, bserrno); + free(ctx); + return; + } + + io_unit_offset = ctx->io_unit_offset; + io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset); + io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary); + /* + * Get index and offset into the original iov array for our current position in the I/O sequence. + * byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will + * point to the current position in the I/O sequence. + */ + byte_count = ctx->io_units_done * blob->bs->io_unit_size; + orig_iov = &ctx->orig_iov[0]; + orig_iovoff = 0; + while (byte_count > 0) { + if (byte_count >= orig_iov->iov_len) { + byte_count -= orig_iov->iov_len; + orig_iov++; + } else { + orig_iovoff = byte_count; + byte_count = 0; + } + } + + /* + * Build an iov array for the next I/O in the sequence. byte_count will keep track of how many + * bytes of this next I/O remain to be accounted for in the new iov array. 
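+	 * Each sub-I/O covers a contiguous byte range of the original iovs and never
+	 * crosses a cluster boundary, so at most ctx->iovcnt entries are needed here.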
+ */ + byte_count = io_units_count * blob->bs->io_unit_size; + iov = &ctx->iov[0]; + iovcnt = 0; + while (byte_count > 0) { + assert(iovcnt < ctx->iovcnt); + iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff); + iov->iov_base = orig_iov->iov_base + orig_iovoff; + byte_count -= iov->iov_len; + orig_iovoff = 0; + orig_iov++; + iov++; + iovcnt++; + } + + ctx->io_unit_offset += io_units_count; + ctx->io_units_remaining -= io_units_count; + ctx->io_units_done += io_units_count; + iov = &ctx->iov[0]; + + if (ctx->read) { + spdk_blob_io_readv(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, + io_units_count, rw_iov_split_next, ctx); + } else { + spdk_blob_io_writev(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, + io_units_count, rw_iov_split_next, ctx); + } +} + +static void +blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, bool read) +{ + struct spdk_bs_cpl cpl; + + assert(blob != NULL); + + if (!read && blob->data_ro) { + cb_fn(cb_arg, -EPERM); + return; + } + + if (length == 0) { + cb_fn(cb_arg, 0); + return; + } + + if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + /* + * For now, we implement readv/writev using a sequence (instead of a batch) to account for having + * to split a request that spans a cluster boundary. For I/O that do not span a cluster boundary, + * there will be no noticeable difference compared to using a batch. For I/O that do span a cluster + * boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need + * to allocate a separate iov array and split the I/O such that none of the resulting + * smaller I/O cross a cluster boundary. These smaller I/O will be issued in sequence (not in parallel) + * but since this case happens very infrequently, any performance impact will be negligible. + * + * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs + * for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them + * in a batch. That would also require creating an intermediate spdk_bs_cpl that would get called + * when the batch was completed, to allow for freeing the memory for the iov arrays. + */ + if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) { + uint32_t lba_count; + uint64_t lba; + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + if (blob->frozen_refcnt) { + /* This blob I/O is frozen */ + enum spdk_blob_op_type op_type; + spdk_bs_user_op_t *op; + struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel); + + op_type = read ? 
SPDK_BLOB_READV : SPDK_BLOB_WRITEV; + op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); + + return; + } + + blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); + + if (read) { + spdk_bs_sequence_t *seq; + + seq = bs_sequence_start(_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (bs_io_unit_is_allocated(blob, offset)) { + bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); + } else { + bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count, + rw_iov_done, NULL); + } + } else { + if (bs_io_unit_is_allocated(blob, offset)) { + spdk_bs_sequence_t *seq; + + seq = bs_sequence_start(_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); + } else { + /* Queue this operation and allocate the cluster */ + spdk_bs_user_op_t *op; + + op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset, + length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + bs_allocate_and_copy_cluster(blob, _channel, offset, op); + } + } + } else { + struct rw_iov_ctx *ctx; + + ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->blob = blob; + ctx->channel = _channel; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->read = read; + ctx->orig_iov = iov; + ctx->iovcnt = iovcnt; + ctx->io_unit_offset = offset; + ctx->io_units_remaining = length; + ctx->io_units_done = 0; + + rw_iov_split_next(ctx, 0); + } +} + +static struct spdk_blob * +blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid) +{ + struct spdk_blob *blob; + + if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) { + return NULL; + } + + TAILQ_FOREACH(blob, &bs->blobs, link) { + if (blob->id == blobid) { + return blob; + } + } + + return NULL; +} + +static void +blob_get_snapshot_and_clone_entries(struct spdk_blob *blob, + struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry) +{ + assert(blob != NULL); + *snapshot_entry = NULL; + *clone_entry = NULL; + + if (blob->parent_id == SPDK_BLOBID_INVALID) { + return; + } + + TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) { + if ((*snapshot_entry)->id == blob->parent_id) { + break; + } + } + + if (*snapshot_entry != NULL) { + TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) { + if ((*clone_entry)->id == blob->id) { + break; + } + } + + assert(clone_entry != NULL); + } +} + +static int +bs_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_blob_store *bs = io_device; + struct spdk_bs_channel *channel = ctx_buf; + struct spdk_bs_dev *dev; + uint32_t max_ops = bs->max_channel_ops; + uint32_t i; + + dev = bs->dev; + + channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set)); + if (!channel->req_mem) { + return -1; + } + + TAILQ_INIT(&channel->reqs); + + for (i = 0; i < max_ops; i++) { + TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); + } + + channel->bs = bs; + channel->dev = dev; + channel->dev_channel = dev->create_channel(dev); + + if (!channel->dev_channel) { + SPDK_ERRLOG("Failed to create device channel.\n"); + free(channel->req_mem); + return -1; + } + + TAILQ_INIT(&channel->need_cluster_alloc); + TAILQ_INIT(&channel->queued_io); + + return 0; +} + +static void 
+bs_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_bs_channel *channel = ctx_buf; + spdk_bs_user_op_t *op; + + while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) { + op = TAILQ_FIRST(&channel->need_cluster_alloc); + TAILQ_REMOVE(&channel->need_cluster_alloc, op, link); + bs_user_op_abort(op); + } + + while (!TAILQ_EMPTY(&channel->queued_io)) { + op = TAILQ_FIRST(&channel->queued_io); + TAILQ_REMOVE(&channel->queued_io, op, link); + bs_user_op_abort(op); + } + + free(channel->req_mem); + channel->dev->destroy_channel(channel->dev, channel->dev_channel); +} + +static void +bs_dev_destroy(void *io_device) +{ + struct spdk_blob_store *bs = io_device; + struct spdk_blob *blob, *blob_tmp; + + bs->dev->destroy(bs->dev); + + TAILQ_FOREACH_SAFE(blob, &bs->blobs, link, blob_tmp) { + TAILQ_REMOVE(&bs->blobs, blob, link); + spdk_bit_array_clear(bs->open_blobids, blob->id); + blob_free(blob); + } + + pthread_mutex_destroy(&bs->used_clusters_mutex); + + spdk_bit_array_free(&bs->open_blobids); + spdk_bit_array_free(&bs->used_blobids); + spdk_bit_array_free(&bs->used_md_pages); + spdk_bit_array_free(&bs->used_clusters); + /* + * If this function is called for any reason except a successful unload, + * the unload_cpl type will be NONE and this will be a nop. + */ + bs_call_cpl(&bs->unload_cpl, bs->unload_err); + + free(bs); +} + +static int +bs_blob_list_add(struct spdk_blob *blob) +{ + spdk_blob_id snapshot_id; + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + + assert(blob != NULL); + + snapshot_id = blob->parent_id; + if (snapshot_id == SPDK_BLOBID_INVALID) { + return 0; + } + + snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id); + if (snapshot_entry == NULL) { + /* Snapshot not found */ + snapshot_entry = calloc(1, sizeof(struct spdk_blob_list)); + if (snapshot_entry == NULL) { + return -ENOMEM; + } + snapshot_entry->id = snapshot_id; + TAILQ_INIT(&snapshot_entry->clones); + TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link); + } else { + TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { + if (clone_entry->id == blob->id) { + break; + } + } + } + + if (clone_entry == NULL) { + /* Clone not found */ + clone_entry = calloc(1, sizeof(struct spdk_blob_list)); + if (clone_entry == NULL) { + return -ENOMEM; + } + clone_entry->id = blob->id; + TAILQ_INIT(&clone_entry->clones); + TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link); + snapshot_entry->clone_count++; + } + + return 0; +} + +static void +bs_blob_list_remove(struct spdk_blob *blob) +{ + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + + blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry); + + if (snapshot_entry == NULL) { + return; + } + + blob->parent_id = SPDK_BLOBID_INVALID; + TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); + free(clone_entry); + + snapshot_entry->clone_count--; +} + +static int +bs_blob_list_free(struct spdk_blob_store *bs) +{ + struct spdk_blob_list *snapshot_entry; + struct spdk_blob_list *snapshot_entry_tmp; + struct spdk_blob_list *clone_entry; + struct spdk_blob_list *clone_entry_tmp; + + TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) { + TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) { + TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); + free(clone_entry); + } + TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link); + free(snapshot_entry); + } + + return 0; +} + +static void 
+bs_free(struct spdk_blob_store *bs) +{ + bs_blob_list_free(bs); + + bs_unregister_md_thread(bs); + spdk_io_device_unregister(bs, bs_dev_destroy); +} + +void +spdk_bs_opts_init(struct spdk_bs_opts *opts) +{ + opts->cluster_sz = SPDK_BLOB_OPTS_CLUSTER_SZ; + opts->num_md_pages = SPDK_BLOB_OPTS_NUM_MD_PAGES; + opts->max_md_ops = SPDK_BLOB_OPTS_MAX_MD_OPS; + opts->max_channel_ops = SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS; + opts->clear_method = BS_CLEAR_WITH_UNMAP; + memset(&opts->bstype, 0, sizeof(opts->bstype)); + opts->iter_cb_fn = NULL; + opts->iter_cb_arg = NULL; +} + +static int +bs_opts_verify(struct spdk_bs_opts *opts) +{ + if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 || + opts->max_channel_ops == 0) { + SPDK_ERRLOG("Blobstore options cannot be set to 0\n"); + return -1; + } + + return 0; +} + +static int +bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs) +{ + struct spdk_blob_store *bs; + uint64_t dev_size; + int rc; + + dev_size = dev->blocklen * dev->blockcnt; + if (dev_size < opts->cluster_sz) { + /* Device size cannot be smaller than cluster size of blobstore */ + SPDK_INFOLOG(SPDK_LOG_BLOB, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n", + dev_size, opts->cluster_sz); + return -ENOSPC; + } + if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) { + /* Cluster size cannot be smaller than page size */ + SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n", + opts->cluster_sz, SPDK_BS_PAGE_SIZE); + return -EINVAL; + } + bs = calloc(1, sizeof(struct spdk_blob_store)); + if (!bs) { + return -ENOMEM; + } + + TAILQ_INIT(&bs->blobs); + TAILQ_INIT(&bs->snapshots); + bs->dev = dev; + bs->md_thread = spdk_get_thread(); + assert(bs->md_thread != NULL); + + /* + * Do not use bs_lba_to_cluster() here since blockcnt may not be an + * even multiple of the cluster size. + */ + bs->cluster_sz = opts->cluster_sz; + bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen); + bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE; + if (spdk_u32_is_pow2(bs->pages_per_cluster)) { + bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster); + } + bs->num_free_clusters = bs->total_clusters; + bs->used_clusters = spdk_bit_array_create(bs->total_clusters); + bs->io_unit_size = dev->blocklen; + if (bs->used_clusters == NULL) { + free(bs); + return -ENOMEM; + } + + bs->max_channel_ops = opts->max_channel_ops; + bs->super_blob = SPDK_BLOBID_INVALID; + memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype)); + + /* The metadata is assumed to be at least 1 page */ + bs->used_md_pages = spdk_bit_array_create(1); + bs->used_blobids = spdk_bit_array_create(0); + bs->open_blobids = spdk_bit_array_create(0); + + pthread_mutex_init(&bs->used_clusters_mutex, NULL); + + spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy, + sizeof(struct spdk_bs_channel), "blobstore"); + rc = bs_register_md_thread(bs); + if (rc == -1) { + spdk_io_device_unregister(bs, NULL); + pthread_mutex_destroy(&bs->used_clusters_mutex); + spdk_bit_array_free(&bs->open_blobids); + spdk_bit_array_free(&bs->used_blobids); + spdk_bit_array_free(&bs->used_md_pages); + spdk_bit_array_free(&bs->used_clusters); + free(bs); + /* FIXME: this is a lie but don't know how to get a proper error code here */ + return -ENOMEM; + } + + *_bs = bs; + return 0; +} + +/* START spdk_bs_load, spdk_bs_load_ctx will used for both load and unload. 
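+ * The load path reads the super block first; a cleanly unloaded blobstore with a
+ * v3+ super block then has its used_pages/used_clusters/used_blobids masks read
+ * back, while anything else is recovered by replaying every metadata page.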
*/ + +struct spdk_bs_load_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; + + struct spdk_bs_md_mask *mask; + bool in_page_chain; + uint32_t page_index; + uint32_t cur_page; + struct spdk_blob_md_page *page; + + uint64_t num_extent_pages; + uint32_t *extent_page_num; + struct spdk_blob_md_page *extent_pages; + + spdk_bs_sequence_t *seq; + spdk_blob_op_with_handle_complete iter_cb_fn; + void *iter_cb_arg; + struct spdk_blob *blob; + spdk_blob_id blobid; +}; + +static void +bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno) +{ + assert(bserrno != 0); + + spdk_free(ctx->super); + bs_sequence_finish(ctx->seq, bserrno); + bs_free(ctx->bs); + free(ctx); +} + +static void +bs_set_mask(struct spdk_bit_array *array, struct spdk_bs_md_mask *mask) +{ + uint32_t i = 0; + + while (true) { + i = spdk_bit_array_find_first_set(array, i); + if (i >= mask->length) { + break; + } + mask->mask[i / 8] |= 1U << (i % 8); + i++; + } +} + +static int +bs_load_mask(struct spdk_bit_array **array_ptr, struct spdk_bs_md_mask *mask) +{ + struct spdk_bit_array *array; + uint32_t i; + + if (spdk_bit_array_resize(array_ptr, mask->length) < 0) { + return -ENOMEM; + } + + array = *array_ptr; + for (i = 0; i < mask->length; i++) { + if (mask->mask[i / 8] & (1U << (i % 8))) { + spdk_bit_array_set(array, i); + } + } + + return 0; +} + +static void +bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, + struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + /* Update the values in the super block */ + super->super_blob = bs->super_blob; + memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype)); + super->crc = blob_md_page_calc_crc(super); + bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(*super)), + cb_fn, cb_arg); +} + +static void +bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) +{ + struct spdk_bs_load_ctx *ctx = arg; + uint64_t mask_size, lba, lba_count; + + /* Write out the used clusters mask */ + mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->mask) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS; + ctx->mask->length = ctx->bs->total_clusters; + assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_clusters)); + + bs_set_mask(ctx->bs->used_clusters, ctx->mask); + lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); + lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); + bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); +} + +static void +bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) +{ + struct spdk_bs_load_ctx *ctx = arg; + uint64_t mask_size, lba, lba_count; + + mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->mask) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES; + ctx->mask->length = ctx->super->md_len; + assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages)); + + bs_set_mask(ctx->bs->used_md_pages, ctx->mask); + lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); + lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); + bs_sequence_write_dev(seq, ctx->mask, lba, 
lba_count, cb_fn, arg);
+}
+
+static void
+bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
+{
+	struct spdk_bs_load_ctx *ctx = arg;
+	uint64_t mask_size, lba, lba_count;
+
+	if (ctx->super->used_blobid_mask_len == 0) {
+		/*
+		 * This is a pre-v3 on-disk format where the blobid mask does not get
+		 * written to disk.
+		 */
+		cb_fn(seq, arg, 0);
+		return;
+	}
+
+	mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE;
+	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
+				 SPDK_MALLOC_DMA);
+	if (!ctx->mask) {
+		bs_load_ctx_fail(ctx, -ENOMEM);
+		return;
+	}
+
+	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS;
+	ctx->mask->length = ctx->super->md_len;
+	assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids));
+
+	bs_set_mask(ctx->bs->used_blobids, ctx->mask);
+	lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
+	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
+	bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
+}
+
+static void
+blob_set_thin_provision(struct spdk_blob *blob)
+{
+	blob_verify_md_op(blob);
+	blob->invalid_flags |= SPDK_BLOB_THIN_PROV;
+	blob->state = SPDK_BLOB_STATE_DIRTY;
+}
+
+static void
+blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method)
+{
+	blob_verify_md_op(blob);
+	blob->clear_method = clear_method;
+	blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT);
+	blob->state = SPDK_BLOB_STATE_DIRTY;
+}
+
+static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno);
+
+static void
+bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = cb_arg;
+	spdk_blob_id id;
+	int64_t page_num;
+
+	/* Iterate to the next blob (we can't use the spdk_bs_iter_next function as our
+	 * last blob has been removed) */
+	page_num = bs_blobid_to_page(ctx->blobid);
+	page_num++;
+	page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num);
+	if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) {
+		bs_load_iter(ctx, NULL, -ENOENT);
+		return;
+	}
+
+	id = bs_page_to_blobid(page_num);
+
+	spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx);
+}
+
+static void
+bs_delete_corrupted_close_cb(void *cb_arg, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = cb_arg;
+
+	if (bserrno != 0) {
+		SPDK_ERRLOG("Failed to close corrupted blob\n");
+		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+		return;
+	}
+
+	spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx);
+}
+
+static void
+bs_delete_corrupted_blob(void *cb_arg, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = cb_arg;
+	uint64_t i;
+
+	if (bserrno != 0) {
+		SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
+		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+		return;
+	}
+
+	/* Snapshot and clone have the same copy of cluster map and extent pages
+	 * at this point. Let's clear both for the snapshot now,
+	 * so that they won't be cleared for the clone later when we remove the snapshot.
+	 * Also set thin provision to pass the data corruption check */
+	for (i = 0; i < ctx->blob->active.num_clusters; i++) {
+		ctx->blob->active.clusters[i] = 0;
+	}
+	for (i = 0; i < ctx->blob->active.num_extent_pages; i++) {
+		ctx->blob->active.extent_pages[i] = 0;
+	}
+
+	ctx->blob->md_ro = false;
+
+	blob_set_thin_provision(ctx->blob);
+
+	ctx->blobid = ctx->blob->id;
+
+	spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx);
+}
+
+static void
+bs_update_corrupted_blob(void *cb_arg, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = cb_arg;
+
+	if (bserrno != 0) {
+		SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
+		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+		return;
+	}
+
+	ctx->blob->md_ro = false;
+	blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true);
+	blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true);
+	spdk_blob_set_read_only(ctx->blob);
+
+	if (ctx->iter_cb_fn) {
+		ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0);
+	}
+	bs_blob_list_add(ctx->blob);
+
+	spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+}
+
+static void
+bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = cb_arg;
+
+	if (bserrno != 0) {
+		SPDK_ERRLOG("Failed to open clone of a corrupted blob\n");
+		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+		return;
+	}
+
+	if (blob->parent_id == ctx->blob->id) {
+		/* Power failure occurred before updating clone (snapshot delete case)
+		 * or after updating clone (creating snapshot case) - keep snapshot */
+		spdk_blob_close(blob, bs_update_corrupted_blob, ctx);
+	} else {
+		/* Power failure occurred after updating clone (snapshot delete case)
+		 * or before updating clone (creating snapshot case) - remove snapshot */
+		spdk_blob_close(blob, bs_delete_corrupted_blob, ctx);
+	}
+}
+
+static void
+bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno)
+{
+	struct spdk_bs_load_ctx *ctx = arg;
+	const void *value;
+	size_t len;
+	int rc = 0;
+
+	if (bserrno == 0) {
+		/* Examine the blob in case it was corrupted by a power failure. Fix
+		 * the ones that can be fixed and remove any other corrupted
+		 * ones. If it is not corrupted, just process it */
+		rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true);
+		if (rc != 0) {
+			rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true);
+			if (rc != 0) {
+				/* Not corrupted - process it and continue with iterating through blobs */
+				if (ctx->iter_cb_fn) {
+					ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0);
+				}
+				bs_blob_list_add(blob);
+				spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx);
+				return;
+			}
+
+		}
+
+		assert(len == sizeof(spdk_blob_id));
+
+		ctx->blob = blob;
+
+		/* Open the clone to check whether we are able to fix this blob or should remove it */
+		spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx);
+		return;
+	} else if (bserrno == -ENOENT) {
+		bserrno = 0;
+	} else {
+		/*
+		 * This case needs to be looked at further. The same problem
+		 * exists with applications that rely on explicit blob
+		 * iteration. We should just skip the blob that failed
+		 * to load and continue on to the next one.
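+		 * For now the whole load is simply finished with this error below.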
+ */ + SPDK_ERRLOG("Error in iterating blobs\n"); + } + + ctx->iter_cb_fn = NULL; + + spdk_free(ctx->super); + spdk_free(ctx->mask); + bs_sequence_finish(ctx->seq, bserrno); + free(ctx); +} + +static void +bs_load_complete(struct spdk_bs_load_ctx *ctx) +{ + spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx); +} + +static void +bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + int rc; + + /* The type must be correct */ + assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS); + + /* The length of the mask (in bits) must not be greater than + * the length of the buffer (converted to bits) */ + assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8)); + + /* The length of the mask must be exactly equal to the size + * (in pages) of the metadata region */ + assert(ctx->mask->length == ctx->super->md_len); + + rc = bs_load_mask(&ctx->bs->used_blobids, ctx->mask); + if (rc < 0) { + spdk_free(ctx->mask); + bs_load_ctx_fail(ctx, rc); + return; + } + + bs_load_complete(ctx); +} + +static void +bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint64_t lba, lba_count, mask_size; + int rc; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + /* The type must be correct */ + assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS); + /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ + assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof( + struct spdk_blob_md_page) * 8)); + /* The length of the mask must be exactly equal to the total number of clusters */ + assert(ctx->mask->length == ctx->bs->total_clusters); + + rc = bs_load_mask(&ctx->bs->used_clusters, ctx->mask); + if (rc < 0) { + spdk_free(ctx->mask); + bs_load_ctx_fail(ctx, rc); + return; + } + + ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->bs->used_clusters); + assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters); + + spdk_free(ctx->mask); + + /* Read the used blobids mask */ + mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (!ctx->mask) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); + lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); + bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, + bs_load_used_blobids_cpl, ctx); +} + +static void +bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint64_t lba, lba_count, mask_size; + int rc; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + /* The type must be correct */ + assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES); + /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ + assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE * + 8)); + /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */ + assert(ctx->mask->length == ctx->super->md_len); + + rc = bs_load_mask(&ctx->bs->used_md_pages, ctx->mask); + if (rc < 0) { + spdk_free(ctx->mask); + bs_load_ctx_fail(ctx, rc); + return; + } + + spdk_free(ctx->mask); + + /* Read the used clusters mask */ + mask_size = 
ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (!ctx->mask) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); + lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); + bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, + bs_load_used_clusters_cpl, ctx); +} + +static void +bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx) +{ + uint64_t lba, lba_count, mask_size; + + /* Read the used pages mask */ + mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->mask) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); + lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); + bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count, + bs_load_used_pages_cpl, ctx); +} + +static int +bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page) +{ + struct spdk_blob_store *bs = ctx->bs; + struct spdk_blob_md_descriptor *desc; + size_t cur_desc = 0; + + desc = (struct spdk_blob_md_descriptor *)page->descriptors; + while (cur_desc < sizeof(page->descriptors)) { + if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { + if (desc->length == 0) { + /* If padding and length are 0, this terminates the page */ + break; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { + struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; + unsigned int i, j; + unsigned int cluster_count = 0; + uint32_t cluster_idx; + + desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; + + for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { + for (j = 0; j < desc_extent_rle->extents[i].length; j++) { + cluster_idx = desc_extent_rle->extents[i].cluster_idx; + /* + * cluster_idx = 0 means an unallocated cluster - don't mark that + * in the used cluster map. + */ + if (cluster_idx != 0) { + spdk_bit_array_set(bs->used_clusters, cluster_idx + j); + if (bs->num_free_clusters == 0) { + return -ENOSPC; + } + bs->num_free_clusters--; + } + cluster_count++; + } + } + if (cluster_count == 0) { + return -EINVAL; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { + struct spdk_blob_md_descriptor_extent_page *desc_extent; + uint32_t i; + uint32_t cluster_count = 0; + uint32_t cluster_idx; + size_t cluster_idx_length; + + desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; + cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); + + if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || + (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { + cluster_idx = desc_extent->cluster_idx[i]; + /* + * cluster_idx = 0 means an unallocated cluster - don't mark that + * in the used cluster map. 
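+				 * Unlike the RLE descriptor above, an extent page stores one absolute
+				 * cluster index per entry rather than a run length.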
+ */ + if (cluster_idx != 0) { + if (cluster_idx < desc_extent->start_cluster_idx && + cluster_idx >= desc_extent->start_cluster_idx + cluster_count) { + return -EINVAL; + } + spdk_bit_array_set(bs->used_clusters, cluster_idx); + if (bs->num_free_clusters == 0) { + return -ENOSPC; + } + bs->num_free_clusters--; + } + cluster_count++; + } + + if (cluster_count == 0) { + return -EINVAL; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { + /* Skip this item */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { + /* Skip this item */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { + /* Skip this item */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { + struct spdk_blob_md_descriptor_extent_table *desc_extent_table; + uint32_t num_extent_pages = ctx->num_extent_pages; + uint32_t i; + size_t extent_pages_length; + void *tmp; + + desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; + extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); + + if (desc_extent_table->length == 0 || + (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { + if (desc_extent_table->extent_page[i].page_idx != 0) { + if (desc_extent_table->extent_page[i].num_pages != 1) { + return -EINVAL; + } + num_extent_pages += 1; + } + } + + if (num_extent_pages > 0) { + tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t)); + if (tmp == NULL) { + return -ENOMEM; + } + ctx->extent_page_num = tmp; + + /* Extent table entries contain md page numbers for extent pages. + * Zeroes represent unallocated extent pages, those are run-length-encoded. + */ + for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { + if (desc_extent_table->extent_page[i].page_idx != 0) { + ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx; + ctx->num_extent_pages += 1; + } + } + } + } else { + /* Error */ + return -EINVAL; + } + /* Advance to the next descriptor */ + cur_desc += sizeof(*desc) + desc->length; + if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { + break; + } + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); + } + return 0; +} + +static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page) +{ + uint32_t crc; + struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors; + size_t desc_len; + + crc = blob_md_page_calc_crc(page); + if (crc != page->crc) { + return false; + } + + /* Extent page should always be of sequence num 0. */ + if (page->sequence_num != 0) { + return false; + } + + /* Descriptor type must be EXTENT_PAGE. */ + if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { + return false; + } + + /* Descriptor length cannot exceed the page. */ + desc_len = sizeof(*desc) + desc->length; + if (desc_len > sizeof(page->descriptors)) { + return false; + } + + /* It has to be the only descriptor in the page. 
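+	 * Anything that follows it must be a zero-length padding terminator, which is
+	 * what the check below enforces.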
*/ + if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) { + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len); + if (desc->length != 0) { + return false; + } + } + + return true; +} + +static bool bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx) +{ + uint32_t crc; + struct spdk_blob_md_page *page = ctx->page; + + crc = blob_md_page_calc_crc(page); + if (crc != page->crc) { + return false; + } + + /* First page of a sequence should match the blobid. */ + if (page->sequence_num == 0 && + bs_page_to_blobid(ctx->cur_page) != page->id) { + return false; + } + assert(bs_load_cur_extent_page_valid(page) == false); + + return true; +} + +static void +bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx); + +static void +bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + bs_load_complete(ctx); +} + +static void +bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_free(ctx->mask); + ctx->mask = NULL; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl); +} + +static void +bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_free(ctx->mask); + ctx->mask = NULL; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl); +} + +static void +bs_load_write_used_md(struct spdk_bs_load_ctx *ctx) +{ + bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl); +} + +static void +bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx) +{ + uint64_t num_md_clusters; + uint64_t i; + + ctx->in_page_chain = false; + + do { + ctx->page_index++; + } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true); + + if (ctx->page_index < ctx->super->md_len) { + ctx->cur_page = ctx->page_index; + bs_load_replay_cur_md_page(ctx); + } else { + /* Claim all of the clusters used by the metadata */ + num_md_clusters = spdk_divide_round_up(ctx->super->md_len, ctx->bs->pages_per_cluster); + for (i = 0; i < num_md_clusters; i++) { + bs_claim_cluster(ctx->bs, i); + } + spdk_free(ctx->page); + bs_load_write_used_md(ctx); + } +} + +static void +bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint32_t page_num; + uint64_t i; + + if (bserrno != 0) { + spdk_free(ctx->extent_pages); + bs_load_ctx_fail(ctx, bserrno); + return; + } + + for (i = 0; i < ctx->num_extent_pages; i++) { + /* Extent pages are only read when present within in chain md. + * Integrity of md is not right if that page was not a valid extent page. 
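+		 * An invalid extent page therefore fails the whole load with -EILSEQ.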
*/ + if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) { + spdk_free(ctx->extent_pages); + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + + page_num = ctx->extent_page_num[i]; + spdk_bit_array_set(ctx->bs->used_md_pages, page_num); + if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) { + spdk_free(ctx->extent_pages); + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + } + + spdk_free(ctx->extent_pages); + free(ctx->extent_page_num); + ctx->extent_page_num = NULL; + ctx->num_extent_pages = 0; + + bs_load_replay_md_chain_cpl(ctx); +} + +static void +bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx) +{ + spdk_bs_batch_t *batch; + uint32_t page; + uint64_t lba; + uint64_t i; + + ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, SPDK_BS_PAGE_SIZE, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->extent_pages) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx); + + for (i = 0; i < ctx->num_extent_pages; i++) { + page = ctx->extent_page_num[i]; + assert(page < ctx->super->md_len); + lba = bs_md_page_to_lba(ctx->bs, page); + bs_batch_read_dev(batch, &ctx->extent_pages[i], lba, + bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE)); + } + + bs_batch_close(batch); +} + +static void +bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint32_t page_num; + struct spdk_blob_md_page *page; + + if (bserrno != 0) { + bs_load_ctx_fail(ctx, bserrno); + return; + } + + page_num = ctx->cur_page; + page = ctx->page; + if (bs_load_cur_md_page_valid(ctx) == true) { + if (page->sequence_num == 0 || ctx->in_page_chain == true) { + bs_claim_md_page(ctx->bs, page_num); + if (page->sequence_num == 0) { + spdk_bit_array_set(ctx->bs->used_blobids, page_num); + } + if (bs_load_replay_md_parse_page(ctx, page)) { + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + if (page->next != SPDK_INVALID_MD_PAGE) { + ctx->in_page_chain = true; + ctx->cur_page = page->next; + bs_load_replay_cur_md_page(ctx); + return; + } + if (ctx->num_extent_pages != 0) { + bs_load_replay_extent_pages(ctx); + return; + } + } + } + bs_load_replay_md_chain_cpl(ctx); +} + +static void +bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx) +{ + uint64_t lba; + + assert(ctx->cur_page < ctx->super->md_len); + lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page); + bs_sequence_read_dev(ctx->seq, ctx->page, lba, + bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), + bs_load_replay_md_cpl, ctx); +} + +static void +bs_load_replay_md(struct spdk_bs_load_ctx *ctx) +{ + ctx->page_index = 0; + ctx->cur_page = 0; + ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->page) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + bs_load_replay_cur_md_page(ctx); +} + +static void +bs_recover(struct spdk_bs_load_ctx *ctx) +{ + int rc; + + rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + + 
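+	/* None of the on-disk masks are trusted during recovery; md page, blobid and
+	 * cluster usage is rebuilt below by replaying the whole metadata region.
+	 */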
ctx->bs->num_free_clusters = ctx->bs->total_clusters; + bs_load_replay_md(ctx); +} + +static void +bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint32_t crc; + int rc; + static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH]; + + if (ctx->super->version > SPDK_BS_VERSION || + ctx->super->version < SPDK_BS_INITIAL_VERSION) { + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + + if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, + sizeof(ctx->super->signature)) != 0) { + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + + crc = blob_md_page_calc_crc(ctx->super); + if (crc != ctx->super->crc) { + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + + if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Bstype matched - loading blobstore\n"); + } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Bstype wildcard used - loading blobstore regardless bstype\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Unexpected bstype\n"); + SPDK_LOGDUMP(SPDK_LOG_BLOB, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); + SPDK_LOGDUMP(SPDK_LOG_BLOB, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); + bs_load_ctx_fail(ctx, -ENXIO); + return; + } + + if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) { + SPDK_NOTICELOG("Size mismatch, dev size: %lu, blobstore size: %lu\n", + ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size); + bs_load_ctx_fail(ctx, -EILSEQ); + return; + } + + if (ctx->super->size == 0) { + ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; + } + + if (ctx->super->io_unit_size == 0) { + ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; + } + + /* Parse the super block */ + ctx->bs->clean = 1; + ctx->bs->cluster_sz = ctx->super->cluster_size; + ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size; + ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; + if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) { + ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); + } + ctx->bs->io_unit_size = ctx->super->io_unit_size; + rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters); + if (rc < 0) { + bs_load_ctx_fail(ctx, -ENOMEM); + return; + } + ctx->bs->md_start = ctx->super->md_start; + ctx->bs->md_len = ctx->super->md_len; + ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up( + ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); + ctx->bs->super_blob = ctx->super->super_blob; + memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); + + if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) { + bs_recover(ctx); + } else { + bs_load_read_used_pages(ctx); + } +} + +void +spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, + spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_blob_store *bs; + struct spdk_bs_cpl cpl; + struct spdk_bs_load_ctx *ctx; + struct spdk_bs_opts opts = {}; + int err; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Loading blobstore from dev %p\n", dev); + + if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "unsupported dev block length of %d\n", dev->blocklen); + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + if (o) { + opts = *o; + } else { + spdk_bs_opts_init(&opts); + } + + if (opts.max_md_ops == 0 
|| opts.max_channel_ops == 0) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + err = bs_alloc(dev, &opts, &bs); + if (err) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, err); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->bs = bs; + ctx->iter_cb_fn = opts.iter_cb_fn; + ctx->iter_cb_arg = opts.iter_cb_arg; + + /* Allocate memory for the super block */ + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; + cpl.u.bs_handle.cb_fn = cb_fn; + cpl.u.bs_handle.cb_arg = cb_arg; + cpl.u.bs_handle.bs = bs; + + ctx->seq = bs_sequence_start(bs->md_channel, &cpl); + if (!ctx->seq) { + spdk_free(ctx->super); + free(ctx); + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + /* Read the super block */ + bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(*ctx->super)), + bs_load_super_cpl, ctx); +} + +/* END spdk_bs_load */ + +/* START spdk_bs_dump */ + +struct spdk_bs_dump_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; + uint32_t cur_page; + struct spdk_blob_md_page *page; + spdk_bs_sequence_t *seq; + FILE *fp; + spdk_bs_dump_print_xattr print_xattr_fn; + char xattr_name[4096]; +}; + +static void +bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_dump_ctx *ctx, int bserrno) +{ + spdk_free(ctx->super); + + /* + * We need to defer calling bs_call_cpl() until after + * dev destruction, so tuck these away for later use. + */ + ctx->bs->unload_err = bserrno; + memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); + seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; + + bs_sequence_finish(seq, 0); + bs_free(ctx->bs); + free(ctx); +} + +static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg); + +static void +bs_dump_print_md_page(struct spdk_bs_dump_ctx *ctx) +{ + uint32_t page_idx = ctx->cur_page; + struct spdk_blob_md_page *page = ctx->page; + struct spdk_blob_md_descriptor *desc; + size_t cur_desc = 0; + uint32_t crc; + + fprintf(ctx->fp, "=========\n"); + fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx); + fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id); + + crc = blob_md_page_calc_crc(page); + fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? 
"OK" : "Mismatch"); + + desc = (struct spdk_blob_md_descriptor *)page->descriptors; + while (cur_desc < sizeof(page->descriptors)) { + if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { + if (desc->length == 0) { + /* If padding and length are 0, this terminates the page */ + break; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { + struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; + unsigned int i; + + desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; + + for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { + if (desc_extent_rle->extents[i].cluster_idx != 0) { + fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, + desc_extent_rle->extents[i].cluster_idx); + } else { + fprintf(ctx->fp, "Unallocated Extent - "); + } + fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length); + fprintf(ctx->fp, "\n"); + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { + struct spdk_blob_md_descriptor_extent_page *desc_extent; + unsigned int i; + + desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; + + for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) { + if (desc_extent->cluster_idx[i] != 0) { + fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, + desc_extent->cluster_idx[i]); + } else { + fprintf(ctx->fp, "Unallocated Extent"); + } + fprintf(ctx->fp, "\n"); + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { + struct spdk_blob_md_descriptor_xattr *desc_xattr; + uint32_t i; + + desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc; + + if (desc_xattr->length != + sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) + + desc_xattr->name_length + desc_xattr->value_length) { + } + + memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length); + ctx->xattr_name[desc_xattr->name_length] = '\0'; + fprintf(ctx->fp, "XATTR: name = \"%s\"\n", ctx->xattr_name); + fprintf(ctx->fp, " value = \""); + ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name, + (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), + desc_xattr->value_length); + fprintf(ctx->fp, "\"\n"); + for (i = 0; i < desc_xattr->value_length; i++) { + if (i % 16 == 0) { + fprintf(ctx->fp, " "); + } + fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i)); + if ((i + 1) % 16 == 0) { + fprintf(ctx->fp, "\n"); + } + } + if (i % 16 != 0) { + fprintf(ctx->fp, "\n"); + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { + /* TODO */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { + /* TODO */ + } else { + /* Error */ + } + /* Advance to the next descriptor */ + cur_desc += sizeof(*desc) + desc->length; + if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { + break; + } + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); + } +} + +static void +bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_dump_ctx *ctx = cb_arg; + + if (bserrno != 0) { + bs_dump_finish(seq, ctx, bserrno); + return; + } + + if (ctx->page->id != 0) { + bs_dump_print_md_page(ctx); + } + + ctx->cur_page++; + + if (ctx->cur_page < ctx->super->md_len) { + bs_dump_read_md_page(seq, ctx); + } else { + spdk_free(ctx->page); + bs_dump_finish(seq, ctx, 0); + } +} + +static void +bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) +{ + struct spdk_bs_dump_ctx *ctx = cb_arg; + uint64_t lba; + + 
assert(ctx->cur_page < ctx->super->md_len); + lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page); + bs_sequence_read_dev(seq, ctx->page, lba, + bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), + bs_dump_read_md_page_cpl, ctx); +} + +static void +bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_dump_ctx *ctx = cb_arg; + + fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature); + if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, + sizeof(ctx->super->signature)) != 0) { + fprintf(ctx->fp, "(Mismatch)\n"); + bs_dump_finish(seq, ctx, bserrno); + return; + } else { + fprintf(ctx->fp, "(OK)\n"); + } + fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version); + fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc, + (ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch"); + fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype); + fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size); + fprintf(ctx->fp, "Super Blob ID: "); + if (ctx->super->super_blob == SPDK_BLOBID_INVALID) { + fprintf(ctx->fp, "(None)\n"); + } else { + fprintf(ctx->fp, "%" PRIu64 "\n", ctx->super->super_blob); + } + fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean); + fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start); + fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len); + fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start); + fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len); + fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start); + fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len); + fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start); + fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len); + + ctx->cur_page = 0; + ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, + NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->page) { + bs_dump_finish(seq, ctx, -ENOMEM); + return; + } + bs_dump_read_md_page(seq, ctx); +} + +void +spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn, + spdk_bs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_blob_store *bs; + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_dump_ctx *ctx; + struct spdk_bs_opts opts = {}; + int err; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Dumping blobstore from dev %p\n", dev); + + spdk_bs_opts_init(&opts); + + err = bs_alloc(dev, &opts, &bs); + if (err) { + dev->destroy(dev); + cb_fn(cb_arg, err); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + bs_free(bs); + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bs = bs; + ctx->fp = fp; + ctx->print_xattr_fn = print_xattr_fn; + + /* Allocate memory for the super block */ + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + bs_free(bs); + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_free(ctx->super); + free(ctx); + bs_free(bs); + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Read the super block */ + 
bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(*ctx->super)), + bs_dump_super_cpl, ctx); +} + +/* END spdk_bs_dump */ + +/* START spdk_bs_init */ + +struct spdk_bs_init_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; +}; + +static void +bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_init_ctx *ctx = cb_arg; + + spdk_free(ctx->super); + free(ctx); + + bs_sequence_finish(seq, bserrno); +} + +static void +bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_init_ctx *ctx = cb_arg; + + /* Write super block */ + bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0), + bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)), + bs_init_persist_super_cpl, ctx); +} + +void +spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, + spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_init_ctx *ctx; + struct spdk_blob_store *bs; + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + spdk_bs_batch_t *batch; + uint64_t num_md_lba; + uint64_t num_md_pages; + uint64_t num_md_clusters; + uint32_t i; + struct spdk_bs_opts opts = {}; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Initializing blobstore on dev %p\n", dev); + + if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { + SPDK_ERRLOG("unsupported dev block length of %d\n", + dev->blocklen); + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + if (o) { + opts = *o; + } else { + spdk_bs_opts_init(&opts); + } + + if (bs_opts_verify(&opts) != 0) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = bs_alloc(dev, &opts, &bs); + if (rc) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, rc); + return; + } + + if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) { + /* By default, allocate 1 page per cluster. + * Technically, this over-allocates metadata + * because more metadata will reduce the number + * of usable clusters. This can be addressed with + * more complex math in the future. + */ + bs->md_len = bs->total_clusters; + } else { + bs->md_len = opts.num_md_pages; + } + rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); + if (rc < 0) { + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len); + if (rc < 0) { + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len); + if (rc < 0) { + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->bs = bs; + + /* Allocate memory for the super block */ + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, + sizeof(ctx->super->signature)); + ctx->super->version = SPDK_BS_VERSION; + ctx->super->length = sizeof(*ctx->super); + ctx->super->super_blob = bs->super_blob; + ctx->super->clean = 0; + ctx->super->cluster_size = bs->cluster_sz; + ctx->super->io_unit_size = bs->io_unit_size; + memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype)); + + /* Calculate how many pages the metadata consumes at the front + * of the disk. 
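+	 * This covers the super block, the used page/cluster/blobid masks, and the metadata region itself.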
+ */ + + /* The super block uses 1 page */ + num_md_pages = 1; + + /* The used_md_pages mask requires 1 bit per metadata page, rounded + * up to the nearest page, plus a header. + */ + ctx->super->used_page_mask_start = num_md_pages; + ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + + spdk_divide_round_up(bs->md_len, 8), + SPDK_BS_PAGE_SIZE); + num_md_pages += ctx->super->used_page_mask_len; + + /* The used_clusters mask requires 1 bit per cluster, rounded + * up to the nearest page, plus a header. + */ + ctx->super->used_cluster_mask_start = num_md_pages; + ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + + spdk_divide_round_up(bs->total_clusters, 8), + SPDK_BS_PAGE_SIZE); + num_md_pages += ctx->super->used_cluster_mask_len; + + /* The used_blobids mask requires 1 bit per metadata page, rounded + * up to the nearest page, plus a header. + */ + ctx->super->used_blobid_mask_start = num_md_pages; + ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + + spdk_divide_round_up(bs->md_len, 8), + SPDK_BS_PAGE_SIZE); + num_md_pages += ctx->super->used_blobid_mask_len; + + /* The metadata region size was chosen above */ + ctx->super->md_start = bs->md_start = num_md_pages; + ctx->super->md_len = bs->md_len; + num_md_pages += bs->md_len; + + num_md_lba = bs_page_to_lba(bs, num_md_pages); + + ctx->super->size = dev->blockcnt * dev->blocklen; + + ctx->super->crc = blob_md_page_calc_crc(ctx->super); + + num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster); + if (num_md_clusters > bs->total_clusters) { + SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, " + "please decrease number of pages reserved for metadata " + "or increase cluster size.\n"); + spdk_free(ctx->super); + free(ctx); + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + /* Claim all of the clusters used by the metadata */ + for (i = 0; i < num_md_clusters; i++) { + bs_claim_cluster(bs, i); + } + + bs->total_data_clusters = bs->num_free_clusters; + + cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; + cpl.u.bs_handle.cb_fn = cb_fn; + cpl.u.bs_handle.cb_arg = cb_arg; + cpl.u.bs_handle.bs = bs; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_free(ctx->super); + free(ctx); + bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx); + + /* Clear metadata space */ + bs_batch_write_zeroes_dev(batch, 0, num_md_lba); + + switch (opts.clear_method) { + case BS_CLEAR_WITH_UNMAP: + /* Trim data clusters */ + bs_batch_unmap_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba); + break; + case BS_CLEAR_WITH_WRITE_ZEROES: + /* Write_zeroes to data clusters */ + bs_batch_write_zeroes_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba); + break; + case BS_CLEAR_WITH_NONE: + default: + break; + } + + bs_batch_close(batch); +} + +/* END spdk_bs_init */ + +/* START spdk_bs_destroy */ + +static void +bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_init_ctx *ctx = cb_arg; + struct spdk_blob_store *bs = ctx->bs; + + /* + * We need to defer calling bs_call_cpl() until after + * dev destruction, so tuck these away for later use. 
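+	 * bs_free() below tears down the dev before the saved completion runs.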
+ */ + bs->unload_err = bserrno; + memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); + seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; + + bs_sequence_finish(seq, bserrno); + + bs_free(bs); + free(ctx); +} + +void +spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, + void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_init_ctx *ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Destroying blobstore\n"); + + if (!TAILQ_EMPTY(&bs->blobs)) { + SPDK_ERRLOG("Blobstore still has open blobs\n"); + cb_fn(cb_arg, -EBUSY); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bs = bs; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Write zeroes to the super block */ + bs_sequence_write_zeroes_dev(seq, + bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)), + bs_destroy_trim_cpl, ctx); +} + +/* END spdk_bs_destroy */ + +/* START spdk_bs_unload */ + +static void +bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno) +{ + spdk_bs_sequence_t *seq = ctx->seq; + + spdk_free(ctx->super); + + /* + * We need to defer calling bs_call_cpl() until after + * dev destruction, so tuck these away for later use. + */ + ctx->bs->unload_err = bserrno; + memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); + seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; + + bs_sequence_finish(seq, bserrno); + + bs_free(ctx->bs); + free(ctx); +} + +static void +bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + bs_unload_finish(ctx, bserrno); +} + +static void +bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_free(ctx->mask); + + if (bserrno != 0) { + bs_unload_finish(ctx, bserrno); + return; + } + + ctx->super->clean = 1; + + bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx); +} + +static void +bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_free(ctx->mask); + ctx->mask = NULL; + + if (bserrno != 0) { + bs_unload_finish(ctx, bserrno); + return; + } + + bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl); +} + +static void +bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_free(ctx->mask); + ctx->mask = NULL; + + if (bserrno != 0) { + bs_unload_finish(ctx, bserrno); + return; + } + + bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl); +} + +static void +bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + if (bserrno != 0) { + bs_unload_finish(ctx, bserrno); + return; + } + + bs_write_used_md(seq, cb_arg, bs_unload_write_used_pages_cpl); +} + +void +spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + struct spdk_bs_load_ctx *ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Syncing blobstore\n"); + + if (!TAILQ_EMPTY(&bs->blobs)) { + SPDK_ERRLOG("Blobstore still has open blobs\n"); + cb_fn(cb_arg, -EBUSY); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + 
return; + } + + ctx->bs = bs; + + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + ctx->seq = bs_sequence_start(bs->md_channel, &cpl); + if (!ctx->seq) { + spdk_free(ctx->super); + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Read super block */ + bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(*ctx->super)), + bs_unload_read_super_cpl, ctx); +} + +/* END spdk_bs_unload */ + +/* START spdk_bs_set_super */ + +struct spdk_bs_set_super_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; +}; + +static void +bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_set_super_ctx *ctx = cb_arg; + + if (bserrno != 0) { + SPDK_ERRLOG("Unable to write to super block of blobstore\n"); + } + + spdk_free(ctx->super); + + bs_sequence_finish(seq, bserrno); + + free(ctx); +} + +static void +bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_set_super_ctx *ctx = cb_arg; + + if (bserrno != 0) { + SPDK_ERRLOG("Unable to read super block of blobstore\n"); + spdk_free(ctx->super); + bs_sequence_finish(seq, bserrno); + free(ctx); + return; + } + + bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx); +} + +void +spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid, + spdk_bs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_set_super_ctx *ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Setting super blob id on blobstore\n"); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bs = bs; + + ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->super) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_free(ctx->super); + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + bs->super_blob = blobid; + + /* Read super block */ + bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), + bs_byte_to_lba(bs, sizeof(*ctx->super)), + bs_set_super_read_cpl, ctx); +} + +/* END spdk_bs_set_super */ + +void +spdk_bs_get_super(struct spdk_blob_store *bs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + if (bs->super_blob == SPDK_BLOBID_INVALID) { + cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT); + } else { + cb_fn(cb_arg, bs->super_blob, 0); + } +} + +uint64_t +spdk_bs_get_cluster_size(struct spdk_blob_store *bs) +{ + return bs->cluster_sz; +} + +uint64_t +spdk_bs_get_page_size(struct spdk_blob_store *bs) +{ + return SPDK_BS_PAGE_SIZE; +} + +uint64_t +spdk_bs_get_io_unit_size(struct spdk_blob_store *bs) +{ + return bs->io_unit_size; +} + +uint64_t +spdk_bs_free_cluster_count(struct spdk_blob_store *bs) +{ + return bs->num_free_clusters; +} + +uint64_t +spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs) +{ + return bs->total_data_clusters; +} + +static int +bs_register_md_thread(struct spdk_blob_store *bs) +{ + bs->md_channel = spdk_get_io_channel(bs); + if (!bs->md_channel) { + SPDK_ERRLOG("Failed to get IO channel.\n"); + return -1; + } + + 
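+	/* The channel is released again in bs_unregister_md_thread() via spdk_put_io_channel(). */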
return 0; +} + +static int +bs_unregister_md_thread(struct spdk_blob_store *bs) +{ + spdk_put_io_channel(bs->md_channel); + + return 0; +} + +spdk_blob_id spdk_blob_get_id(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return blob->id; +} + +uint64_t spdk_blob_get_num_pages(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return bs_cluster_to_page(blob->bs, blob->active.num_clusters); +} + +uint64_t spdk_blob_get_num_io_units(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs); +} + +uint64_t spdk_blob_get_num_clusters(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return blob->active.num_clusters; +} + +/* START spdk_bs_create_blob */ + +static void +bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + uint32_t page_idx = bs_blobid_to_page(blob->id); + + if (bserrno != 0) { + spdk_bit_array_clear(blob->bs->used_blobids, page_idx); + bs_release_md_page(blob->bs, page_idx); + } + + blob_free(blob); + + bs_sequence_finish(seq, bserrno); +} + +static int +blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs, + bool internal) +{ + uint64_t i; + size_t value_len = 0; + int rc; + const void *value = NULL; + if (xattrs->count > 0 && xattrs->get_value == NULL) { + return -EINVAL; + } + for (i = 0; i < xattrs->count; i++) { + xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len); + if (value == NULL || value_len == 0) { + return -EINVAL; + } + rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal); + if (rc < 0) { + return rc; + } + } + return 0; +} + +static void +bs_create_blob(struct spdk_blob_store *bs, + const struct spdk_blob_opts *opts, + const struct spdk_blob_xattr_opts *internal_xattrs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + struct spdk_blob *blob; + uint32_t page_idx; + struct spdk_bs_cpl cpl; + struct spdk_blob_opts opts_default; + struct spdk_blob_xattr_opts internal_xattrs_default; + spdk_bs_sequence_t *seq; + spdk_blob_id id; + int rc; + + assert(spdk_get_thread() == bs->md_thread); + + page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0); + if (page_idx == UINT32_MAX) { + cb_fn(cb_arg, 0, -ENOMEM); + return; + } + spdk_bit_array_set(bs->used_blobids, page_idx); + bs_claim_md_page(bs, page_idx); + + id = bs_page_to_blobid(page_idx); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Creating blob with id %lu at page %u\n", id, page_idx); + + blob = blob_alloc(bs, id); + if (!blob) { + spdk_bit_array_clear(bs->used_blobids, page_idx); + bs_release_md_page(bs, page_idx); + cb_fn(cb_arg, 0, -ENOMEM); + return; + } + + if (!opts) { + spdk_blob_opts_init(&opts_default); + opts = &opts_default; + } + + blob->use_extent_table = opts->use_extent_table; + if (blob->use_extent_table) { + blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE; + } + + if (!internal_xattrs) { + blob_xattrs_init(&internal_xattrs_default); + internal_xattrs = &internal_xattrs_default; + } + + rc = blob_set_xattrs(blob, &opts->xattrs, false); + if (rc < 0) { + blob_free(blob); + spdk_bit_array_clear(bs->used_blobids, page_idx); + bs_release_md_page(bs, page_idx); + cb_fn(cb_arg, 0, rc); + return; + } + + rc = blob_set_xattrs(blob, internal_xattrs, true); + if (rc < 0) { + blob_free(blob); + spdk_bit_array_clear(bs->used_blobids, page_idx); + bs_release_md_page(bs, page_idx); + cb_fn(cb_arg, 0, rc); + return; + } + + if (opts->thin_provision) { + blob_set_thin_provision(blob); + } + + 
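+	/* Apply the requested clear method and initial size, then persist the new blob's metadata. */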
blob_set_clear_method(blob, opts->clear_method); + + rc = blob_resize(blob, opts->num_clusters); + if (rc < 0) { + blob_free(blob); + spdk_bit_array_clear(bs->used_blobids, page_idx); + bs_release_md_page(bs, page_idx); + cb_fn(cb_arg, 0, rc); + return; + } + cpl.type = SPDK_BS_CPL_TYPE_BLOBID; + cpl.u.blobid.cb_fn = cb_fn; + cpl.u.blobid.cb_arg = cb_arg; + cpl.u.blobid.blobid = blob->id; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + blob_free(blob); + spdk_bit_array_clear(bs->used_blobids, page_idx); + bs_release_md_page(bs, page_idx); + cb_fn(cb_arg, 0, -ENOMEM); + return; + } + + blob_persist(seq, blob, bs_create_blob_cpl, blob); +} + +void spdk_bs_create_blob(struct spdk_blob_store *bs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg); +} + +void spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + bs_create_blob(bs, opts, NULL, cb_fn, cb_arg); +} + +/* END spdk_bs_create_blob */ + +/* START blob_cleanup */ + +struct spdk_clone_snapshot_ctx { + struct spdk_bs_cpl cpl; + int bserrno; + bool frozen; + + struct spdk_io_channel *channel; + + /* Current cluster for inflate operation */ + uint64_t cluster; + + /* For inflation force allocation of all unallocated clusters and remove + * thin-provisioning. Otherwise only decouple parent and keep clone thin. */ + bool allocate_all; + + struct { + spdk_blob_id id; + struct spdk_blob *blob; + } original; + struct { + spdk_blob_id id; + struct spdk_blob *blob; + } new; + + /* xattrs specified for snapshot/clones only. They have no impact on + * the original blobs xattrs. */ + const struct spdk_blob_xattr_opts *xattrs; +}; + +static void +bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = cb_arg; + struct spdk_bs_cpl *cpl = &ctx->cpl; + + if (bserrno != 0) { + if (ctx->bserrno != 0) { + SPDK_ERRLOG("Cleanup error %d\n", bserrno); + } else { + ctx->bserrno = bserrno; + } + } + + switch (cpl->type) { + case SPDK_BS_CPL_TYPE_BLOBID: + cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOB_BASIC: + cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno); + break; + default: + SPDK_UNREACHABLE(); + break; + } + + free(ctx); +} + +static void +bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + + if (bserrno != 0) { + if (ctx->bserrno != 0) { + SPDK_ERRLOG("Unfreeze error %d\n", bserrno); + } else { + ctx->bserrno = bserrno; + } + } + + ctx->original.id = origblob->id; + origblob->locked_operation_in_progress = false; + + spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx); +} + +static void +bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + + if (bserrno != 0) { + if (ctx->bserrno != 0) { + SPDK_ERRLOG("Cleanup error %d\n", bserrno); + } else { + ctx->bserrno = bserrno; + } + } + + if (ctx->frozen) { + /* Unfreeze any outstanding I/O */ + blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx); + } else { + bs_snapshot_unfreeze_cpl(ctx, 0); + } + +} + +static void +bs_clone_snapshot_newblob_cleanup(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = 
(struct spdk_clone_snapshot_ctx *)cb_arg;
+	struct spdk_blob *newblob = ctx->new.blob;
+
+	if (bserrno != 0) {
+		if (ctx->bserrno != 0) {
+			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
+		} else {
+			ctx->bserrno = bserrno;
+		}
+	}
+
+	ctx->new.id = newblob->id;
+	spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
+}
+
+/* END blob_cleanup */
+
+/* START spdk_bs_create_snapshot */
+
+static void
+bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2)
+{
+	uint64_t *cluster_temp;
+	uint32_t *extent_page_temp;
+
+	cluster_temp = blob1->active.clusters;
+	blob1->active.clusters = blob2->active.clusters;
+	blob2->active.clusters = cluster_temp;
+
+	extent_page_temp = blob1->active.extent_pages;
+	blob1->active.extent_pages = blob2->active.extent_pages;
+	blob2->active.extent_pages = extent_page_temp;
+}
+
+static void
+bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno)
+{
+	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+	struct spdk_blob *origblob = ctx->original.blob;
+	struct spdk_blob *newblob = ctx->new.blob;
+
+	if (bserrno != 0) {
+		bs_snapshot_swap_cluster_maps(newblob, origblob);
+		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+		return;
+	}
+
+	/* Remove metadata descriptor SNAPSHOT_IN_PROGRESS */
+	bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true);
+	if (bserrno != 0) {
+		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+		return;
+	}
+
+	bs_blob_list_add(ctx->original.blob);
+
+	spdk_blob_set_read_only(newblob);
+
+	/* sync snapshot metadata */
+	spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
+}
+
+static void
+bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno)
+{
+	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+	struct spdk_blob *origblob = ctx->original.blob;
+	struct spdk_blob *newblob = ctx->new.blob;
+
+	if (bserrno != 0) {
+		/* return cluster map back to original */
+		bs_snapshot_swap_cluster_maps(newblob, origblob);
+
+		/* Newblob md sync failed. Valid clusters are only present in origblob.
+		 * Since I/O is frozen on origblob, no changes to the zeroed out cluster map should have occurred.
+		 * Newblob needs to be reverted to its thin_provisioned state at creation to properly close.
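+		 * The asserts below double-check that the reverted cluster and extent page maps are all zeroes.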
*/ + blob_set_thin_provision(newblob); + assert(spdk_mem_all_zero(newblob->active.clusters, + newblob->active.num_clusters * sizeof(*newblob->active.clusters))); + assert(spdk_mem_all_zero(newblob->active.extent_pages, + newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); + + bs_clone_snapshot_newblob_cleanup(ctx, bserrno); + return; + } + + /* Set internal xattr for snapshot id */ + bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true); + if (bserrno != 0) { + /* return cluster map back to original */ + bs_snapshot_swap_cluster_maps(newblob, origblob); + bs_clone_snapshot_newblob_cleanup(ctx, bserrno); + return; + } + + bs_blob_list_remove(origblob); + origblob->parent_id = newblob->id; + + /* Create new back_bs_dev for snapshot */ + origblob->back_bs_dev = bs_create_blob_bs_dev(newblob); + if (origblob->back_bs_dev == NULL) { + /* return cluster map back to original */ + bs_snapshot_swap_cluster_maps(newblob, origblob); + bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL); + return; + } + + /* set clone blob as thin provisioned */ + blob_set_thin_provision(origblob); + + bs_blob_list_add(newblob); + + /* sync clone metadata */ + spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx); +} + +static void +bs_snapshot_freeze_cpl(void *cb_arg, int rc) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + struct spdk_blob *newblob = ctx->new.blob; + int bserrno; + + if (rc != 0) { + bs_clone_snapshot_newblob_cleanup(ctx, rc); + return; + } + + ctx->frozen = true; + + /* set new back_bs_dev for snapshot */ + newblob->back_bs_dev = origblob->back_bs_dev; + /* Set invalid flags from origblob */ + newblob->invalid_flags = origblob->invalid_flags; + + /* inherit parent from original blob if set */ + newblob->parent_id = origblob->parent_id; + if (origblob->parent_id != SPDK_BLOBID_INVALID) { + /* Set internal xattr for snapshot id */ + bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT, + &origblob->parent_id, sizeof(spdk_blob_id), true); + if (bserrno != 0) { + bs_clone_snapshot_newblob_cleanup(ctx, bserrno); + return; + } + } + + /* swap cluster maps */ + bs_snapshot_swap_cluster_maps(newblob, origblob); + + /* Set the clear method on the new blob to match the original. 
*/ + blob_set_clear_method(newblob, origblob->clear_method); + + /* sync snapshot metadata */ + spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx); +} + +static void +bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + struct spdk_blob *newblob = _blob; + + if (bserrno != 0) { + bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + ctx->new.blob = newblob; + assert(spdk_blob_is_thin_provisioned(newblob)); + assert(spdk_mem_all_zero(newblob->active.clusters, + newblob->active.num_clusters * sizeof(*newblob->active.clusters))); + assert(spdk_mem_all_zero(newblob->active.extent_pages, + newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); + + blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx); +} + +static void +bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + + if (bserrno != 0) { + bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + ctx->new.id = blobid; + ctx->cpl.u.blobid.blobid = blobid; + + spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx); +} + + +static void +bs_xattr_snapshot(void *arg, const char *name, + const void **value, size_t *value_len) +{ + assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0); + + struct spdk_blob *blob = (struct spdk_blob *)arg; + *value = &blob->id; + *value_len = sizeof(blob->id); +} + +static void +bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob_opts opts; + struct spdk_blob_xattr_opts internal_xattrs; + char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS }; + + if (bserrno != 0) { + bs_clone_snapshot_cleanup_finish(ctx, bserrno); + return; + } + + ctx->original.blob = _blob; + + if (_blob->data_ro || _blob->md_ro) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create snapshot from read only blob with id %lu\n", + _blob->id); + ctx->bserrno = -EINVAL; + spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); + return; + } + + if (_blob->locked_operation_in_progress) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create snapshot - another operation in progress\n"); + ctx->bserrno = -EBUSY; + spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); + return; + } + + _blob->locked_operation_in_progress = true; + + spdk_blob_opts_init(&opts); + blob_xattrs_init(&internal_xattrs); + + /* Change the size of new blob to the same as in original blob, + * but do not allocate clusters */ + opts.thin_provision = true; + opts.num_clusters = spdk_blob_get_num_clusters(_blob); + opts.use_extent_table = _blob->use_extent_table; + + /* If there are any xattrs specified for snapshot, set them now */ + if (ctx->xattrs) { + memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); + } + /* Set internal xattr SNAPSHOT_IN_PROGRESS */ + internal_xattrs.count = 1; + internal_xattrs.ctx = _blob; + internal_xattrs.names = xattrs_names; + internal_xattrs.get_value = bs_xattr_snapshot; + + bs_create_blob(_blob->bs, &opts, &internal_xattrs, + bs_snapshot_newblob_create_cpl, ctx); +} + +void spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid, + const struct spdk_blob_xattr_opts 
*snapshot_xattrs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) { + cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); + return; + } + ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; + ctx->cpl.u.blobid.cb_fn = cb_fn; + ctx->cpl.u.blobid.cb_arg = cb_arg; + ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; + ctx->bserrno = 0; + ctx->frozen = false; + ctx->original.id = blobid; + ctx->xattrs = snapshot_xattrs; + + spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx); +} +/* END spdk_bs_create_snapshot */ + +/* START spdk_bs_create_clone */ + +static void +bs_xattr_clone(void *arg, const char *name, + const void **value, size_t *value_len) +{ + assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0); + + struct spdk_blob *blob = (struct spdk_blob *)arg; + *value = &blob->id; + *value_len = sizeof(blob->id); +} + +static void +bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *clone = _blob; + + ctx->new.blob = clone; + bs_blob_list_add(clone); + + spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx); +} + +static void +bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + + ctx->cpl.u.blobid.blobid = blobid; + spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx); +} + +static void +bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob_opts opts; + struct spdk_blob_xattr_opts internal_xattrs; + char *xattr_names[] = { BLOB_SNAPSHOT }; + + if (bserrno != 0) { + bs_clone_snapshot_cleanup_finish(ctx, bserrno); + return; + } + + ctx->original.blob = _blob; + + if (!_blob->data_ro || !_blob->md_ro) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Clone not from read-only blob\n"); + ctx->bserrno = -EINVAL; + spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); + return; + } + + if (_blob->locked_operation_in_progress) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create clone - another operation in progress\n"); + ctx->bserrno = -EBUSY; + spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); + return; + } + + _blob->locked_operation_in_progress = true; + + spdk_blob_opts_init(&opts); + blob_xattrs_init(&internal_xattrs); + + opts.thin_provision = true; + opts.num_clusters = spdk_blob_get_num_clusters(_blob); + opts.use_extent_table = _blob->use_extent_table; + if (ctx->xattrs) { + memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); + } + + /* Set internal xattr BLOB_SNAPSHOT */ + internal_xattrs.count = 1; + internal_xattrs.ctx = _blob; + internal_xattrs.names = xattr_names; + internal_xattrs.get_value = bs_xattr_clone; + + bs_create_blob(_blob->bs, &opts, &internal_xattrs, + bs_clone_newblob_create_cpl, ctx); +} + +void spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid, + const struct spdk_blob_xattr_opts *clone_xattrs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) { + cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); + return; + } + + ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; + ctx->cpl.u.blobid.cb_fn = cb_fn; + ctx->cpl.u.blobid.cb_arg = cb_arg; + ctx->cpl.u.blobid.blobid = 
SPDK_BLOBID_INVALID; + ctx->bserrno = 0; + ctx->xattrs = clone_xattrs; + ctx->original.id = blobid; + + spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx); +} + +/* END spdk_bs_create_clone */ + +/* START spdk_bs_inflate_blob */ + +static void +bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *_blob = ctx->original.blob; + + if (bserrno != 0) { + bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + assert(_parent != NULL); + + bs_blob_list_remove(_blob); + _blob->parent_id = _parent->id; + blob_set_xattr(_blob, BLOB_SNAPSHOT, &_blob->parent_id, + sizeof(spdk_blob_id), true); + + _blob->back_bs_dev->destroy(_blob->back_bs_dev); + _blob->back_bs_dev = bs_create_blob_bs_dev(_parent); + bs_blob_list_add(_blob); + + spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); +} + +static void +bs_inflate_blob_done(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *_blob = ctx->original.blob; + struct spdk_blob *_parent; + + if (bserrno != 0) { + bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + if (ctx->allocate_all) { + /* remove thin provisioning */ + bs_blob_list_remove(_blob); + blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); + _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV; + _blob->back_bs_dev->destroy(_blob->back_bs_dev); + _blob->back_bs_dev = NULL; + _blob->parent_id = SPDK_BLOBID_INVALID; + } else { + _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob; + if (_parent->parent_id != SPDK_BLOBID_INVALID) { + /* We must change the parent of the inflated blob */ + spdk_bs_open_blob(_blob->bs, _parent->parent_id, + bs_inflate_blob_set_parent_cpl, ctx); + return; + } + + bs_blob_list_remove(_blob); + blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); + _blob->parent_id = SPDK_BLOBID_INVALID; + _blob->back_bs_dev->destroy(_blob->back_bs_dev); + _blob->back_bs_dev = bs_create_zeroes_dev(); + } + + _blob->state = SPDK_BLOB_STATE_DIRTY; + spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); +} + +/* Check if cluster needs allocation */ +static inline bool +bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all) +{ + struct spdk_blob_bs_dev *b; + + assert(blob != NULL); + + if (blob->active.clusters[cluster] != 0) { + /* Cluster is already allocated */ + return false; + } + + if (blob->parent_id == SPDK_BLOBID_INVALID) { + /* Blob have no parent blob */ + return allocate_all; + } + + b = (struct spdk_blob_bs_dev *)blob->back_bs_dev; + return (allocate_all || b->blob->active.clusters[cluster] != 0); +} + +static void +bs_inflate_blob_touch_next(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *_blob = ctx->original.blob; + uint64_t offset; + + if (bserrno != 0) { + bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) { + if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) { + break; + } + } + + if (ctx->cluster < _blob->active.num_clusters) { + offset = bs_cluster_to_lba(_blob->bs, ctx->cluster); + + /* We may safely increment a cluster before write */ + ctx->cluster++; + + /* Use zero length write to touch a cluster */ + spdk_blob_io_write(_blob, ctx->channel, 
NULL, offset, 0, + bs_inflate_blob_touch_next, ctx); + } else { + bs_inflate_blob_done(cb_arg, bserrno); + } +} + +static void +bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + uint64_t lfc; /* lowest free cluster */ + uint64_t i; + + if (bserrno != 0) { + bs_clone_snapshot_cleanup_finish(ctx, bserrno); + return; + } + + ctx->original.blob = _blob; + + if (_blob->locked_operation_in_progress) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot inflate blob - another operation in progress\n"); + ctx->bserrno = -EBUSY; + spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); + return; + } + + _blob->locked_operation_in_progress = true; + + if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) { + /* This blob have no parent, so we cannot decouple it. */ + SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n"); + bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL); + return; + } + + if (spdk_blob_is_thin_provisioned(_blob) == false) { + /* This is not thin provisioned blob. No need to inflate. */ + bs_clone_snapshot_origblob_cleanup(ctx, 0); + return; + } + + /* Do two passes - one to verify that we can obtain enough clusters + * and another to actually claim them. + */ + lfc = 0; + for (i = 0; i < _blob->active.num_clusters; i++) { + if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) { + lfc = spdk_bit_array_find_first_clear(_blob->bs->used_clusters, lfc); + if (lfc == UINT32_MAX) { + /* No more free clusters. Cannot satisfy the request */ + bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); + return; + } + lfc++; + } + } + + ctx->cluster = 0; + bs_inflate_blob_touch_next(ctx, 0); +} + +static void +bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, + spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + ctx->cpl.u.bs_basic.cb_fn = cb_fn; + ctx->cpl.u.bs_basic.cb_arg = cb_arg; + ctx->bserrno = 0; + ctx->original.id = blobid; + ctx->channel = channel; + ctx->allocate_all = allocate_all; + + spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx); +} + +void +spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, + spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg); +} + +void +spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel, + spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg); +} +/* END spdk_bs_inflate_blob */ + +/* START spdk_blob_resize */ +struct spdk_bs_resize_ctx { + spdk_blob_op_complete cb_fn; + void *cb_arg; + struct spdk_blob *blob; + uint64_t sz; + int rc; +}; + +static void +bs_resize_unfreeze_cpl(void *cb_arg, int rc) +{ + struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; + + if (rc != 0) { + SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc); + } + + if (ctx->rc != 0) { + SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc); + rc = ctx->rc; + } + + ctx->blob->locked_operation_in_progress = false; + + ctx->cb_fn(ctx->cb_arg, rc); + free(ctx); +} + +static void +bs_resize_freeze_cpl(void *cb_arg, int rc) +{ + struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx 
*)cb_arg; + + if (rc != 0) { + ctx->blob->locked_operation_in_progress = false; + ctx->cb_fn(ctx->cb_arg, rc); + free(ctx); + return; + } + + ctx->rc = blob_resize(ctx->blob, ctx->sz); + + blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx); +} + +void +spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_resize_ctx *ctx; + + blob_verify_md_op(blob); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Resizing blob %lu to %lu clusters\n", blob->id, sz); + + if (blob->md_ro) { + cb_fn(cb_arg, -EPERM); + return; + } + + if (sz == blob->active.num_clusters) { + cb_fn(cb_arg, 0); + return; + } + + if (blob->locked_operation_in_progress) { + cb_fn(cb_arg, -EBUSY); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + blob->locked_operation_in_progress = true; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->blob = blob; + ctx->sz = sz; + blob_freeze_io(blob, bs_resize_freeze_cpl, ctx); +} + +/* END spdk_blob_resize */ + + +/* START spdk_bs_delete_blob */ + +static void +bs_delete_close_cpl(void *cb_arg, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + + bs_sequence_finish(seq, bserrno); +} + +static void +bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno != 0) { + /* + * We already removed this blob from the blobstore tailq, so + * we need to free it here since this is the last reference + * to it. + */ + blob_free(blob); + bs_delete_close_cpl(seq, bserrno); + return; + } + + /* + * This will immediately decrement the ref_count and call + * the completion routine since the metadata state is clean. + * By calling spdk_blob_close, we reduce the number of call + * points into code that touches the blob->open_ref count + * and the blobstore's blob list. 
+ */ + spdk_blob_close(blob, bs_delete_close_cpl, seq); +} + +struct delete_snapshot_ctx { + struct spdk_blob_list *parent_snapshot_entry; + struct spdk_blob *snapshot; + bool snapshot_md_ro; + struct spdk_blob *clone; + bool clone_md_ro; + spdk_blob_op_with_handle_complete cb_fn; + void *cb_arg; + int bserrno; +}; + +static void +delete_blob_cleanup_finish(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + if (bserrno != 0) { + SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno); + } + + assert(ctx != NULL); + + if (bserrno != 0 && ctx->bserrno == 0) { + ctx->bserrno = bserrno; + } + + ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno); + free(ctx); +} + +static void +delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + if (bserrno != 0) { + ctx->bserrno = bserrno; + SPDK_ERRLOG("Clone cleanup error %d\n", bserrno); + } + + if (ctx->bserrno != 0) { + assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL); + TAILQ_INSERT_HEAD(&ctx->snapshot->bs->blobs, ctx->snapshot, link); + spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id); + } + + ctx->snapshot->locked_operation_in_progress = false; + ctx->snapshot->md_ro = ctx->snapshot_md_ro; + + spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx); +} + +static void +delete_snapshot_cleanup_clone(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + ctx->clone->locked_operation_in_progress = false; + ctx->clone->md_ro = ctx->clone_md_ro; + + spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); +} + +static void +delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + if (bserrno) { + ctx->bserrno = bserrno; + delete_snapshot_cleanup_clone(ctx, 0); + return; + } + + ctx->clone->locked_operation_in_progress = false; + spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx); +} + +static void +delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + struct spdk_blob_list *parent_snapshot_entry = NULL; + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + struct spdk_blob_list *snapshot_clone_entry = NULL; + + if (bserrno) { + SPDK_ERRLOG("Failed to sync MD on blob\n"); + ctx->bserrno = bserrno; + delete_snapshot_cleanup_clone(ctx, 0); + return; + } + + /* Get snapshot entry for the snapshot we want to remove */ + snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id); + + assert(snapshot_entry != NULL); + + /* Remove clone entry in this snapshot (at this point there can be only one clone) */ + clone_entry = TAILQ_FIRST(&snapshot_entry->clones); + assert(clone_entry != NULL); + TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); + snapshot_entry->clone_count--; + assert(TAILQ_EMPTY(&snapshot_entry->clones)); + + if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) { + /* This snapshot is at the same time a clone of another snapshot - we need to + * update parent snapshot (remove current clone, add new one inherited from + * the snapshot that is being removed) */ + + /* Get snapshot entry for parent snapshot and clone entry within that snapshot for + * snapshot that we are removing */ + blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry, + &snapshot_clone_entry); + + /* Switch clone entry in parent snapshot */ + TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link); + 
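+		/* The clone entry now hangs off the parent snapshot; drop the deleted snapshot's own entry from that list. */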
TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link); + free(snapshot_clone_entry); + } else { + /* No parent snapshot - just remove clone entry */ + free(clone_entry); + } + + /* Restore md_ro flags */ + ctx->clone->md_ro = ctx->clone_md_ro; + ctx->snapshot->md_ro = ctx->snapshot_md_ro; + + blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx); +} + +static void +delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + uint64_t i; + + ctx->snapshot->md_ro = false; + + if (bserrno) { + SPDK_ERRLOG("Failed to sync MD on clone\n"); + ctx->bserrno = bserrno; + + /* Restore snapshot to previous state */ + bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true); + if (bserrno != 0) { + delete_snapshot_cleanup_clone(ctx, bserrno); + return; + } + + spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx); + return; + } + + /* Clear cluster map entries for snapshot */ + for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { + if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) { + ctx->snapshot->active.clusters[i] = 0; + } + } + for (i = 0; i < ctx->snapshot->active.num_extent_pages && + i < ctx->clone->active.num_extent_pages; i++) { + if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) { + ctx->snapshot->active.extent_pages[i] = 0; + } + } + + blob_set_thin_provision(ctx->snapshot); + ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY; + + if (ctx->parent_snapshot_entry != NULL) { + ctx->snapshot->back_bs_dev = NULL; + } + + spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx); +} + +static void +delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + uint64_t i; + + /* Temporarily override md_ro flag for clone for MD modification */ + ctx->clone_md_ro = ctx->clone->md_ro; + ctx->clone->md_ro = false; + + if (bserrno) { + SPDK_ERRLOG("Failed to sync MD with xattr on blob\n"); + ctx->bserrno = bserrno; + delete_snapshot_cleanup_clone(ctx, 0); + return; + } + + /* Copy snapshot map to clone map (only unallocated clusters in clone) */ + for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { + if (ctx->clone->active.clusters[i] == 0) { + ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i]; + } + } + for (i = 0; i < ctx->snapshot->active.num_extent_pages && + i < ctx->clone->active.num_extent_pages; i++) { + if (ctx->clone->active.extent_pages[i] == 0) { + ctx->clone->active.extent_pages[i] = ctx->snapshot->active.extent_pages[i]; + } + } + + /* Delete old backing bs_dev from clone (related to snapshot that will be removed) */ + ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev); + + /* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... 
*/ + if (ctx->parent_snapshot_entry != NULL) { + /* ...to parent snapshot */ + ctx->clone->parent_id = ctx->parent_snapshot_entry->id; + ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev; + blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id, + sizeof(spdk_blob_id), + true); + } else { + /* ...to blobid invalid and zeroes dev */ + ctx->clone->parent_id = SPDK_BLOBID_INVALID; + ctx->clone->back_bs_dev = bs_create_zeroes_dev(); + blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true); + } + + spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx); +} + +static void +delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + if (bserrno) { + SPDK_ERRLOG("Failed to freeze I/O on clone\n"); + ctx->bserrno = bserrno; + delete_snapshot_cleanup_clone(ctx, 0); + return; + } + + /* Temporarily override md_ro flag for snapshot for MD modification */ + ctx->snapshot_md_ro = ctx->snapshot->md_ro; + ctx->snapshot->md_ro = false; + + /* Mark blob as pending for removal for power failure safety, use clone id for recovery */ + ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id, + sizeof(spdk_blob_id), true); + if (ctx->bserrno != 0) { + delete_snapshot_cleanup_clone(ctx, 0); + return; + } + + spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx); +} + +static void +delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno) +{ + struct delete_snapshot_ctx *ctx = cb_arg; + + if (bserrno) { + SPDK_ERRLOG("Failed to open clone\n"); + ctx->bserrno = bserrno; + delete_snapshot_cleanup_snapshot(ctx, 0); + return; + } + + ctx->clone = clone; + + if (clone->locked_operation_in_progress) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot remove blob - another operation in progress on its clone\n"); + ctx->bserrno = -EBUSY; + spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); + return; + } + + clone->locked_operation_in_progress = true; + + blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx); +} + +static void +update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx) +{ + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + struct spdk_blob_list *snapshot_clone_entry = NULL; + + /* Get snapshot entry for the snapshot we want to remove */ + snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id); + + assert(snapshot_entry != NULL); + + /* Get clone of the snapshot (at this point there can be only one clone) */ + clone_entry = TAILQ_FIRST(&snapshot_entry->clones); + assert(snapshot_entry->clone_count == 1); + assert(clone_entry != NULL); + + /* Get snapshot entry for parent snapshot and clone entry within that snapshot for + * snapshot that we are removing */ + blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry, + &snapshot_clone_entry); + + spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx); +} + +static void +bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + struct spdk_blob_list *snapshot_entry = NULL; + uint32_t page_num; + + if (bserrno) { + SPDK_ERRLOG("Failed to remove blob\n"); + bs_sequence_finish(seq, bserrno); + return; + } + + /* Remove snapshot from the list */ + snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); + if (snapshot_entry != NULL) { + TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link); + 
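+		/* The snapshot entry is no longer reachable from the blobstore; release it before the blob metadata is wiped below. */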
free(snapshot_entry); + } + + page_num = bs_blobid_to_page(blob->id); + spdk_bit_array_clear(blob->bs->used_blobids, page_num); + blob->state = SPDK_BLOB_STATE_DIRTY; + blob->active.num_pages = 0; + blob_resize(blob, 0); + + blob_persist(seq, blob, bs_delete_persist_cpl, blob); +} + +static int +bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone) +{ + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + struct spdk_blob *clone = NULL; + bool has_one_clone = false; + + /* Check if this is a snapshot with clones */ + snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); + if (snapshot_entry != NULL) { + if (snapshot_entry->clone_count > 1) { + SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n"); + return -EBUSY; + } else if (snapshot_entry->clone_count == 1) { + has_one_clone = true; + } + } + + /* Check if someone has this blob open (besides this delete context): + * - open_ref = 1 - only this context opened blob, so it is ok to remove it + * - open_ref <= 2 && has_one_clone = true - clone is holding snapshot + * and that is ok, because we will update it accordingly */ + if (blob->open_ref <= 2 && has_one_clone) { + clone_entry = TAILQ_FIRST(&snapshot_entry->clones); + assert(clone_entry != NULL); + clone = blob_lookup(blob->bs, clone_entry->id); + + if (blob->open_ref == 2 && clone == NULL) { + /* Clone is closed and someone else opened this blob */ + SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); + return -EBUSY; + } + + *update_clone = true; + return 0; + } + + if (blob->open_ref > 1) { + SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); + return -EBUSY; + } + + assert(has_one_clone == false); + *update_clone = false; + return 0; +} + +static void +bs_delete_enomem_close_cpl(void *cb_arg, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + + bs_sequence_finish(seq, -ENOMEM); +} + +static void +bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + struct delete_snapshot_ctx *ctx; + bool update_clone = false; + + if (bserrno != 0) { + bs_sequence_finish(seq, bserrno); + return; + } + + blob_verify_md_op(blob); + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq); + return; + } + + ctx->snapshot = blob; + ctx->cb_fn = bs_delete_blob_finish; + ctx->cb_arg = seq; + + /* Check if blob can be removed and if it is a snapshot with clone on top of it */ + ctx->bserrno = bs_is_blob_deletable(blob, &update_clone); + if (ctx->bserrno) { + spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); + return; + } + + if (blob->locked_operation_in_progress) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot remove blob - another operation in progress\n"); + ctx->bserrno = -EBUSY; + spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); + return; + } + + blob->locked_operation_in_progress = true; + + /* + * Remove the blob from the blob_store list now, to ensure it does not + * get returned after this point by blob_lookup(). 
+ */ + spdk_bit_array_clear(blob->bs->open_blobids, blob->id); + TAILQ_REMOVE(&blob->bs->blobs, blob, link); + + if (update_clone) { + /* This blob is a snapshot with active clone - update clone first */ + update_clone_on_snapshot_deletion(blob, ctx); + } else { + /* This blob does not have any clones - just remove it */ + bs_blob_list_remove(blob); + bs_delete_blob_finish(seq, blob, 0); + free(ctx); + } +} + +void +spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Deleting blob %lu\n", blobid); + + assert(spdk_get_thread() == bs->md_thread); + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq); +} + +/* END spdk_bs_delete_blob */ + +/* START spdk_bs_open_blob */ + +static void +bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno != 0) { + blob_free(blob); + seq->cpl.u.blob_handle.blob = NULL; + bs_sequence_finish(seq, bserrno); + return; + } + + blob->open_ref++; + + spdk_bit_array_set(blob->bs->open_blobids, blob->id); + TAILQ_INSERT_HEAD(&blob->bs->blobs, blob, link); + + bs_sequence_finish(seq, bserrno); +} + +static void +bs_open_blob(struct spdk_blob_store *bs, + spdk_blob_id blobid, + struct spdk_blob_open_opts *opts, + spdk_blob_op_with_handle_complete cb_fn, + void *cb_arg) +{ + struct spdk_blob *blob; + struct spdk_bs_cpl cpl; + struct spdk_blob_open_opts opts_default; + spdk_bs_sequence_t *seq; + uint32_t page_num; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Opening blob %lu\n", blobid); + assert(spdk_get_thread() == bs->md_thread); + + page_num = bs_blobid_to_page(blobid); + if (spdk_bit_array_get(bs->used_blobids, page_num) == false) { + /* Invalid blobid */ + cb_fn(cb_arg, NULL, -ENOENT); + return; + } + + blob = blob_lookup(bs, blobid); + if (blob) { + blob->open_ref++; + cb_fn(cb_arg, blob, 0); + return; + } + + blob = blob_alloc(bs, blobid); + if (!blob) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + if (!opts) { + spdk_blob_open_opts_init(&opts_default); + opts = &opts_default; + } + + blob->clear_method = opts->clear_method; + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE; + cpl.u.blob_handle.cb_fn = cb_fn; + cpl.u.blob_handle.cb_arg = cb_arg; + cpl.u.blob_handle.blob = blob; + + seq = bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + blob_free(blob); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + blob_load(seq, blob, bs_open_blob_cpl, blob); +} + +void spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, + spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg); +} + +void spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid, + struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + bs_open_blob(bs, blobid, opts, cb_fn, cb_arg); +} + +/* END spdk_bs_open_blob */ + +/* START spdk_blob_set_read_only */ +int spdk_blob_set_read_only(struct spdk_blob *blob) +{ + blob_verify_md_op(blob); + + blob->data_ro_flags |= SPDK_BLOB_READ_ONLY; + + blob->state = SPDK_BLOB_STATE_DIRTY; + return 0; +} +/* END spdk_blob_set_read_only */ + +/* START spdk_blob_sync_md */ + +static void +blob_sync_md_cpl(spdk_bs_sequence_t *seq, 
void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) { + blob->data_ro = true; + blob->md_ro = true; + } + + bs_sequence_finish(seq, bserrno); +} + +static void +blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(blob->bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + blob_persist(seq, blob, blob_sync_md_cpl, blob); +} + +void +spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_verify_md_op(blob); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Syncing blob %lu\n", blob->id); + + if (blob->md_ro) { + assert(blob->state == SPDK_BLOB_STATE_CLEAN); + cb_fn(cb_arg, 0); + return; + } + + blob_sync_md(blob, cb_fn, cb_arg); +} + +/* END spdk_blob_sync_md */ + +struct spdk_blob_insert_cluster_ctx { + struct spdk_thread *thread; + struct spdk_blob *blob; + uint32_t cluster_num; /* cluster index in blob */ + uint32_t cluster; /* cluster on disk */ + uint32_t extent_page; /* extent page on disk */ + int rc; + spdk_blob_op_complete cb_fn; + void *cb_arg; +}; + +static void +blob_insert_cluster_msg_cpl(void *arg) +{ + struct spdk_blob_insert_cluster_ctx *ctx = arg; + + ctx->cb_fn(ctx->cb_arg, ctx->rc); + free(ctx); +} + +static void +blob_insert_cluster_msg_cb(void *arg, int bserrno) +{ + struct spdk_blob_insert_cluster_ctx *ctx = arg; + + ctx->rc = bserrno; + spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); +} + +static void +blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_md_page *page = cb_arg; + + bs_sequence_finish(seq, bserrno); + spdk_free(page); +} + +static void +blob_insert_extent(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + spdk_bs_sequence_t *seq; + struct spdk_bs_cpl cpl; + struct spdk_blob_md_page *page = NULL; + uint32_t page_count = 0; + int rc; + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(blob->bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + rc = blob_serialize_add_page(blob, &page, &page_count, &page); + if (rc < 0) { + bs_sequence_finish(seq, rc); + return; + } + + blob_serialize_extent_page(blob, cluster_num, page); + + page->crc = blob_md_page_calc_crc(page); + + assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true); + + bs_sequence_write_dev(seq, page, bs_md_page_to_lba(blob->bs, extent), + bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), + blob_persist_extent_page_cpl, page); +} + +static void +blob_insert_cluster_msg(void *arg) +{ + struct spdk_blob_insert_cluster_ctx *ctx = arg; + uint32_t *extent_page; + + ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster); + if (ctx->rc != 0) { + spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); + return; + } + + if (ctx->blob->use_extent_table == false) { + /* Extent table is not used, proceed with sync of md that will only use extents_rle. 
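[Editor's note] A minimal usage sketch of the sync path implemented above (names are illustrative): metadata changes such as spdk_blob_set_read_only() only mark the blob dirty in memory, and spdk_blob_sync_md() is what actually persists them.

#include "spdk/blob.h"

/* Illustrative sketch; assumes the caller runs on the blobstore md thread. */
static void
example_sync_done(void *cb_arg, int bserrno)
{
	/* On success, the read-only flag set below is on disk and the blob
	 * has transitioned to data_ro/md_ro. */
}

static void
example_make_read_only(struct spdk_blob *blob)
{
	spdk_blob_set_read_only(blob);                    /* marks the blob dirty */
	spdk_blob_sync_md(blob, example_sync_done, NULL); /* persists the metadata */
}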
*/ + ctx->blob->state = SPDK_BLOB_STATE_DIRTY; + blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); + return; + } + + extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); + if (*extent_page == 0) { + /* Extent page requires allocation. + * It was already claimed in the used_md_pages map and placed in ctx. + * Blob persist will take care of writing out new extent page on disk. */ + assert(ctx->extent_page != 0); + assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); + *extent_page = ctx->extent_page; + ctx->blob->state = SPDK_BLOB_STATE_DIRTY; + blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); + } else { + /* It is possible for original thread to allocate extent page for + * different cluster in the same extent page. In such case proceed with + * updating the existing extent page, but release the additional one. */ + if (ctx->extent_page != 0) { + assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); + bs_release_md_page(ctx->blob->bs, ctx->extent_page); + ctx->extent_page = 0; + } + /* Extent page already allocated. + * Every cluster allocation, requires just an update of single extent page. */ + blob_insert_extent(ctx->blob, *extent_page, ctx->cluster_num, + blob_insert_cluster_msg_cb, ctx); + } +} + +static void +blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, + uint64_t cluster, uint32_t extent_page, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_blob_insert_cluster_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->thread = spdk_get_thread(); + ctx->blob = blob; + ctx->cluster_num = cluster_num; + ctx->cluster = cluster; + ctx->extent_page = extent_page; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx); +} + +/* START spdk_blob_close */ + +static void +blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno == 0) { + blob->open_ref--; + if (blob->open_ref == 0) { + /* + * Blobs with active.num_pages == 0 are deleted blobs. + * these blobs are removed from the blob_store list + * when the deletion process starts - so don't try to + * remove them again. 
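[Editor's note] The insert-cluster path above is one instance of SPDK's usual thread-hop idiom. A generic, hedged sketch of that idiom (all names here are illustrative): capture the calling thread, hop to the metadata thread to do the update, then hop back to complete on the original thread.

#include "spdk/stdinc.h"
#include "spdk/blob.h"
#include "spdk/thread.h"

struct example_md_op_ctx {
	struct spdk_thread	*orig_thread;
	spdk_blob_op_complete	cb_fn;
	void			*cb_arg;
	int			rc;
};

static void
example_md_op_done(void *arg)
{
	struct example_md_op_ctx *ctx = arg;

	/* Back on the thread that started the operation. */
	ctx->cb_fn(ctx->cb_arg, ctx->rc);
	free(ctx);
}

static void
example_md_op(void *arg)
{
	struct example_md_op_ctx *ctx = arg;

	ctx->rc = 0; /* ... perform the metadata update on the md thread ... */
	spdk_thread_send_msg(ctx->orig_thread, example_md_op_done, ctx);
}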
+ */ + if (blob->active.num_pages > 0) { + spdk_bit_array_clear(blob->bs->open_blobids, blob->id); + TAILQ_REMOVE(&blob->bs->blobs, blob, link); + } + blob_free(blob); + } + } + + bs_sequence_finish(seq, bserrno); +} + +void spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + + blob_verify_md_op(blob); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Closing blob %lu\n", blob->id); + + if (blob->open_ref == 0) { + cb_fn(cb_arg, -EBADF); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = bs_sequence_start(blob->bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Sync metadata */ + blob_persist(seq, blob, blob_close_cpl, blob); +} + +/* END spdk_blob_close */ + +struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs) +{ + return spdk_get_io_channel(bs); +} + +void spdk_bs_free_io_channel(struct spdk_io_channel *channel) +{ + spdk_put_io_channel(channel); +} + +void spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel, + uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, + SPDK_BLOB_UNMAP); +} + +void spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel, + uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, + SPDK_BLOB_WRITE_ZEROES); +} + +void spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, + SPDK_BLOB_WRITE); +} + +void spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, + SPDK_BLOB_READ); +} + +void spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false); +} + +void spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true); +} + +struct spdk_bs_iter_ctx { + int64_t page_num; + struct spdk_blob_store *bs; + + spdk_blob_op_with_handle_complete cb_fn; + void *cb_arg; +}; + +static void +bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_bs_iter_ctx *ctx = cb_arg; + struct spdk_blob_store *bs = ctx->bs; + spdk_blob_id id; + + if (bserrno == 0) { + ctx->cb_fn(ctx->cb_arg, _blob, bserrno); + free(ctx); + return; + } + + ctx->page_num++; + ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num); + if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) { + ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT); + free(ctx); + return; + } + + id = bs_page_to_blobid(ctx->page_num); + + spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx); +} + 
+void +spdk_bs_iter_first(struct spdk_blob_store *bs, + spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_iter_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->page_num = -1; + ctx->bs = bs; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + bs_iter_cpl(ctx, NULL, -1); +} + +static void +bs_iter_close_cpl(void *cb_arg, int bserrno) +{ + struct spdk_bs_iter_ctx *ctx = cb_arg; + + bs_iter_cpl(ctx, NULL, -1); +} + +void +spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob, + spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_iter_ctx *ctx; + + assert(blob != NULL); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->page_num = bs_blobid_to_page(blob->id); + ctx->bs = bs; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + /* Close the existing blob */ + spdk_blob_close(blob, bs_iter_close_cpl, ctx); +} + +static int +blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, + uint16_t value_len, bool internal) +{ + struct spdk_xattr_tailq *xattrs; + struct spdk_xattr *xattr; + size_t desc_size; + void *tmp; + + blob_verify_md_op(blob); + + if (blob->md_ro) { + return -EPERM; + } + + desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len; + if (desc_size > SPDK_BS_MAX_DESC_SIZE) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Xattr '%s' of size %ld does not fix into single page %ld\n", name, + desc_size, SPDK_BS_MAX_DESC_SIZE); + return -ENOMEM; + } + + if (internal) { + xattrs = &blob->xattrs_internal; + blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR; + } else { + xattrs = &blob->xattrs; + } + + TAILQ_FOREACH(xattr, xattrs, link) { + if (!strcmp(name, xattr->name)) { + tmp = malloc(value_len); + if (!tmp) { + return -ENOMEM; + } + + free(xattr->value); + xattr->value_len = value_len; + xattr->value = tmp; + memcpy(xattr->value, value, value_len); + + blob->state = SPDK_BLOB_STATE_DIRTY; + + return 0; + } + } + + xattr = calloc(1, sizeof(*xattr)); + if (!xattr) { + return -ENOMEM; + } + + xattr->name = strdup(name); + if (!xattr->name) { + free(xattr); + return -ENOMEM; + } + + xattr->value_len = value_len; + xattr->value = malloc(value_len); + if (!xattr->value) { + free(xattr->name); + free(xattr); + return -ENOMEM; + } + memcpy(xattr->value, value, value_len); + TAILQ_INSERT_TAIL(xattrs, xattr, link); + + blob->state = SPDK_BLOB_STATE_DIRTY; + + return 0; +} + +int +spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, + uint16_t value_len) +{ + return blob_set_xattr(blob, name, value, value_len, false); +} + +static int +blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal) +{ + struct spdk_xattr_tailq *xattrs; + struct spdk_xattr *xattr; + + blob_verify_md_op(blob); + + if (blob->md_ro) { + return -EPERM; + } + xattrs = internal ? 
&blob->xattrs_internal : &blob->xattrs; + + TAILQ_FOREACH(xattr, xattrs, link) { + if (!strcmp(name, xattr->name)) { + TAILQ_REMOVE(xattrs, xattr, link); + free(xattr->value); + free(xattr->name); + free(xattr); + + if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) { + blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR; + } + blob->state = SPDK_BLOB_STATE_DIRTY; + + return 0; + } + } + + return -ENOENT; +} + +int +spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name) +{ + return blob_remove_xattr(blob, name, false); +} + +static int +blob_get_xattr_value(struct spdk_blob *blob, const char *name, + const void **value, size_t *value_len, bool internal) +{ + struct spdk_xattr *xattr; + struct spdk_xattr_tailq *xattrs; + + xattrs = internal ? &blob->xattrs_internal : &blob->xattrs; + + TAILQ_FOREACH(xattr, xattrs, link) { + if (!strcmp(name, xattr->name)) { + *value = xattr->value; + *value_len = xattr->value_len; + return 0; + } + } + return -ENOENT; +} + +int +spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name, + const void **value, size_t *value_len) +{ + blob_verify_md_op(blob); + + return blob_get_xattr_value(blob, name, value, value_len, false); +} + +struct spdk_xattr_names { + uint32_t count; + const char *names[0]; +}; + +static int +blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names) +{ + struct spdk_xattr *xattr; + int count = 0; + + TAILQ_FOREACH(xattr, xattrs, link) { + count++; + } + + *names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *)); + if (*names == NULL) { + return -ENOMEM; + } + + TAILQ_FOREACH(xattr, xattrs, link) { + (*names)->names[(*names)->count++] = xattr->name; + } + + return 0; +} + +int +spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names) +{ + blob_verify_md_op(blob); + + return blob_get_xattr_names(&blob->xattrs, names); +} + +uint32_t +spdk_xattr_names_get_count(struct spdk_xattr_names *names) +{ + assert(names != NULL); + + return names->count; +} + +const char * +spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index) +{ + if (index >= names->count) { + return NULL; + } + + return names->names[index]; +} + +void +spdk_xattr_names_free(struct spdk_xattr_names *names) +{ + free(names); +} + +struct spdk_bs_type +spdk_bs_get_bstype(struct spdk_blob_store *bs) +{ + return bs->bstype; +} + +void +spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype) +{ + memcpy(&bs->bstype, &bstype, sizeof(bstype)); +} + +bool +spdk_blob_is_read_only(struct spdk_blob *blob) +{ + assert(blob != NULL); + return (blob->data_ro || blob->md_ro); +} + +bool +spdk_blob_is_snapshot(struct spdk_blob *blob) +{ + struct spdk_blob_list *snapshot_entry; + + assert(blob != NULL); + + snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); + if (snapshot_entry == NULL) { + return false; + } + + return true; +} + +bool +spdk_blob_is_clone(struct spdk_blob *blob) +{ + assert(blob != NULL); + + if (blob->parent_id != SPDK_BLOBID_INVALID) { + assert(spdk_blob_is_thin_provisioned(blob)); + return true; + } + + return false; +} + +bool +spdk_blob_is_thin_provisioned(struct spdk_blob *blob) +{ + assert(blob != NULL); + return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV); +} + +static void +blob_update_clear_method(struct spdk_blob *blob) +{ + enum blob_clear_method stored_cm; + + assert(blob != NULL); + + /* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored + * in metadata previously. 
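[Editor's note] The xattr setters and getters above operate purely on in-memory state; a typical round trip (key, value, and names are illustrative) therefore pairs them with spdk_blob_sync_md() to make the change persistent.

#include "spdk/blob.h"

static void
example_xattr_synced(void *cb_arg, int bserrno)
{
	/* The xattr change has been persisted, or bserrno reports why not. */
}

static void
example_tag_blob(struct spdk_blob *blob)
{
	const void *val;
	size_t len;

	spdk_blob_set_xattr(blob, "name", "vol0", sizeof("vol0"));

	if (spdk_blob_get_xattr_value(blob, "name", &val, &len) == 0) {
		/* val points at the blob's stored copy, len == sizeof("vol0") */
	}

	spdk_blob_sync_md(blob, example_xattr_synced, NULL);
}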
If something other than the default was + * specified, ignore stored value and used what was passed in. + */ + stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT); + + if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) { + blob->clear_method = stored_cm; + } else if (blob->clear_method != stored_cm) { + SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n", + blob->clear_method, stored_cm); + } +} + +spdk_blob_id +spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id) +{ + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + + TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { + TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { + if (clone_entry->id == blob_id) { + return snapshot_entry->id; + } + } + } + + return SPDK_BLOBID_INVALID; +} + +int +spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids, + size_t *count) +{ + struct spdk_blob_list *snapshot_entry, *clone_entry; + size_t n; + + snapshot_entry = bs_get_snapshot_entry(bs, blobid); + if (snapshot_entry == NULL) { + *count = 0; + return 0; + } + + if (ids == NULL || *count < snapshot_entry->clone_count) { + *count = snapshot_entry->clone_count; + return -ENOMEM; + } + *count = snapshot_entry->clone_count; + + n = 0; + TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { + ids[n++] = clone_entry->id; + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("blob", SPDK_LOG_BLOB) diff --git a/src/spdk/lib/blob/blobstore.h b/src/spdk/lib/blob/blobstore.h new file mode 100644 index 000000000..5e93bd6ad --- /dev/null +++ b/src/spdk/lib/blob/blobstore.h @@ -0,0 +1,702 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
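[Editor's note] spdk_blob_get_clones() above uses a query-then-fill pattern. A hedged caller-side sketch (allocation strategy and names are illustrative): calling it with ids == NULL fails with -ENOMEM and reports the required count, after which a correctly sized array can be passed in.

#include "spdk/stdinc.h"
#include "spdk/blob.h"

static void
example_list_clones(struct spdk_blob_store *bs, spdk_blob_id snap_id)
{
	size_t count = 0;
	spdk_blob_id *ids;

	if (spdk_blob_get_clones(bs, snap_id, NULL, &count) == 0) {
		/* No snapshot entry for snap_id, hence no clones to list. */
		return;
	}

	ids = calloc(count, sizeof(*ids));
	if (ids == NULL) {
		return;
	}

	if (spdk_blob_get_clones(bs, snap_id, ids, &count) == 0) {
		/* ids[0..count-1] now hold the clone blob ids */
	}

	free(ids);
}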
+ */ + +#ifndef SPDK_BLOBSTORE_H +#define SPDK_BLOBSTORE_H + +#include "spdk/assert.h" +#include "spdk/blob.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +#include "request.h" + +/* In Memory Data Structures + * + * The following data structures exist only in memory. + */ + +#define SPDK_BLOB_OPTS_CLUSTER_SZ (1024 * 1024) +#define SPDK_BLOB_OPTS_NUM_MD_PAGES UINT32_MAX +#define SPDK_BLOB_OPTS_MAX_MD_OPS 32 +#define SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS 512 +#define SPDK_BLOB_BLOBID_HIGH_BIT (1ULL << 32) + +struct spdk_xattr { + uint32_t index; + uint16_t value_len; + char *name; + void *value; + TAILQ_ENTRY(spdk_xattr) link; +}; + +/* The mutable part of the blob data that is sync'd to + * disk. The data in here is both mutable and persistent. + */ +struct spdk_blob_mut_data { + /* Number of data clusters in the blob */ + uint64_t num_clusters; + + /* Array LBAs that are the beginning of a cluster, in + * the order they appear in the blob. + */ + uint64_t *clusters; + + /* The size of the clusters array. This is greater than or + * equal to 'num_clusters'. + */ + size_t cluster_array_size; + + /* Number of extent pages */ + uint64_t num_extent_pages; + + /* Array of page offsets into the metadata region, + * containing extents. Can contain entries for not yet + * allocated pages. */ + uint32_t *extent_pages; + + /* The size of the extent page array. This is greater than or + * equal to 'num_extent_pages'. */ + size_t extent_pages_array_size; + + /* Number of metadata pages */ + uint32_t num_pages; + + /* Array of page offsets into the metadata region, in + * the order of the metadata page sequence. + */ + uint32_t *pages; +}; + +enum spdk_blob_state { + /* The blob in-memory version does not match the on-disk + * version. + */ + SPDK_BLOB_STATE_DIRTY, + + /* The blob in memory version of the blob matches the on disk + * version. + */ + SPDK_BLOB_STATE_CLEAN, + + /* The in-memory state being synchronized with the on-disk + * blob state. */ + SPDK_BLOB_STATE_LOADING, +}; + +TAILQ_HEAD(spdk_xattr_tailq, spdk_xattr); + +struct spdk_blob_list { + spdk_blob_id id; + size_t clone_count; + TAILQ_HEAD(, spdk_blob_list) clones; + TAILQ_ENTRY(spdk_blob_list) link; +}; + +struct spdk_blob { + struct spdk_blob_store *bs; + + uint32_t open_ref; + + spdk_blob_id id; + spdk_blob_id parent_id; + + enum spdk_blob_state state; + + /* Two copies of the mutable data. One is a version + * that matches the last known data on disk (clean). + * The other (active) is the current data. Syncing + * a blob makes the clean match the active. + */ + struct spdk_blob_mut_data clean; + struct spdk_blob_mut_data active; + + bool invalid; + bool data_ro; + bool md_ro; + + uint64_t invalid_flags; + uint64_t data_ro_flags; + uint64_t md_ro_flags; + + struct spdk_bs_dev *back_bs_dev; + + /* TODO: The xattrs are mutable, but we don't want to be + * copying them unnecessarily. Figure this out. + */ + struct spdk_xattr_tailq xattrs; + struct spdk_xattr_tailq xattrs_internal; + + TAILQ_ENTRY(spdk_blob) link; + + uint32_t frozen_refcnt; + bool locked_operation_in_progress; + enum blob_clear_method clear_method; + bool extent_rle_found; + bool extent_table_found; + bool use_extent_table; + + /* A list of pending metadata pending_persists */ + TAILQ_HEAD(, spdk_blob_persist_ctx) pending_persists; + + /* Number of data clusters retrived from extent table, + * that many have to be read from extent pages. 
*/ + uint64_t remaining_clusters_in_et; +}; + +struct spdk_blob_store { + uint64_t md_start; /* Offset from beginning of disk, in pages */ + uint32_t md_len; /* Count, in pages */ + + struct spdk_io_channel *md_channel; + uint32_t max_channel_ops; + + struct spdk_thread *md_thread; + + struct spdk_bs_dev *dev; + + struct spdk_bit_array *used_md_pages; + struct spdk_bit_array *used_clusters; + struct spdk_bit_array *used_blobids; + struct spdk_bit_array *open_blobids; + + pthread_mutex_t used_clusters_mutex; + + uint32_t cluster_sz; + uint64_t total_clusters; + uint64_t total_data_clusters; + uint64_t num_free_clusters; + uint64_t pages_per_cluster; + uint8_t pages_per_cluster_shift; + uint32_t io_unit_size; + + spdk_blob_id super_blob; + struct spdk_bs_type bstype; + + struct spdk_bs_cpl unload_cpl; + int unload_err; + + TAILQ_HEAD(, spdk_blob) blobs; + TAILQ_HEAD(, spdk_blob_list) snapshots; + + bool clean; +}; + +struct spdk_bs_channel { + struct spdk_bs_request_set *req_mem; + TAILQ_HEAD(, spdk_bs_request_set) reqs; + + struct spdk_blob_store *bs; + + struct spdk_bs_dev *dev; + struct spdk_io_channel *dev_channel; + + TAILQ_HEAD(, spdk_bs_request_set) need_cluster_alloc; + TAILQ_HEAD(, spdk_bs_request_set) queued_io; +}; + +/** operation type */ +enum spdk_blob_op_type { + SPDK_BLOB_WRITE, + SPDK_BLOB_READ, + SPDK_BLOB_UNMAP, + SPDK_BLOB_WRITE_ZEROES, + SPDK_BLOB_WRITEV, + SPDK_BLOB_READV, +}; + +/* back bs_dev */ + +#define BLOB_SNAPSHOT "SNAP" +#define SNAPSHOT_IN_PROGRESS "SNAPTMP" +#define SNAPSHOT_PENDING_REMOVAL "SNAPRM" + +struct spdk_blob_bs_dev { + struct spdk_bs_dev bs_dev; + struct spdk_blob *blob; +}; + +/* On-Disk Data Structures + * + * The following data structures exist on disk. + */ +#define SPDK_BS_INITIAL_VERSION 1 +#define SPDK_BS_VERSION 3 /* current version */ + +#pragma pack(push, 1) + +#define SPDK_MD_MASK_TYPE_USED_PAGES 0 +#define SPDK_MD_MASK_TYPE_USED_CLUSTERS 1 +#define SPDK_MD_MASK_TYPE_USED_BLOBIDS 2 + +struct spdk_bs_md_mask { + uint8_t type; + uint32_t length; /* In bits */ + uint8_t mask[0]; +}; + +#define SPDK_MD_DESCRIPTOR_TYPE_PADDING 0 +#define SPDK_MD_DESCRIPTOR_TYPE_XATTR 2 +#define SPDK_MD_DESCRIPTOR_TYPE_FLAGS 3 +#define SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL 4 + +/* Following descriptors define cluster layout in a blob. + * EXTENT_RLE cannot be present in blobs metadata, + * at the same time as EXTENT_TABLE and EXTENT_PAGE descriptors. */ + +/* EXTENT_RLE descriptor holds an array of LBA that points to + * beginning of allocated clusters. The array is run-length encoded, + * with 0's being unallocated clusters. It is part of serialized + * metadata chain for a blob. */ +#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE 1 +/* EXTENT_TABLE descriptor holds array of md page offsets that + * point to pages with EXTENT_PAGE descriptor. The 0's in the array + * are run-length encoded, non-zero values are unallocated pages. + * It is part of serialized metadata chain for a blob. */ +#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE 5 +/* EXTENT_PAGE descriptor holds an array of LBAs that point to + * beginning of allocated clusters. The array is run-length encoded, + * with 0's being unallocated clusters. It is NOT part of + * serialized metadata chain for a blob. */ +#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE 6 + +struct spdk_blob_md_descriptor_xattr { + uint8_t type; + uint32_t length; + + uint16_t name_length; + uint16_t value_length; + + char name[0]; + /* String name immediately followed by string value. 
*/ +}; + +struct spdk_blob_md_descriptor_extent_rle { + uint8_t type; + uint32_t length; + + struct { + uint32_t cluster_idx; + uint32_t length; /* In units of clusters */ + } extents[0]; +}; + +struct spdk_blob_md_descriptor_extent_table { + uint8_t type; + uint32_t length; + + /* Number of data clusters in the blob */ + uint64_t num_clusters; + + struct { + uint32_t page_idx; + uint32_t num_pages; /* In units of pages */ + } extent_page[0]; +}; + +struct spdk_blob_md_descriptor_extent_page { + uint8_t type; + uint32_t length; + + /* First cluster index in this extent page */ + uint32_t start_cluster_idx; + + uint32_t cluster_idx[0]; +}; + +#define SPDK_BLOB_THIN_PROV (1ULL << 0) +#define SPDK_BLOB_INTERNAL_XATTR (1ULL << 1) +#define SPDK_BLOB_EXTENT_TABLE (1ULL << 2) +#define SPDK_BLOB_INVALID_FLAGS_MASK (SPDK_BLOB_THIN_PROV | SPDK_BLOB_INTERNAL_XATTR | SPDK_BLOB_EXTENT_TABLE) + +#define SPDK_BLOB_READ_ONLY (1ULL << 0) +#define SPDK_BLOB_DATA_RO_FLAGS_MASK SPDK_BLOB_READ_ONLY + +#define SPDK_BLOB_CLEAR_METHOD_SHIFT 0 +#define SPDK_BLOB_CLEAR_METHOD (3ULL << SPDK_BLOB_CLEAR_METHOD_SHIFT) +#define SPDK_BLOB_MD_RO_FLAGS_MASK SPDK_BLOB_CLEAR_METHOD + +struct spdk_blob_md_descriptor_flags { + uint8_t type; + uint32_t length; + + /* + * If a flag in invalid_flags is set that the application is not aware of, + * it will not allow the blob to be opened. + */ + uint64_t invalid_flags; + + /* + * If a flag in data_ro_flags is set that the application is not aware of, + * allow the blob to be opened in data_read_only and md_read_only mode. + */ + uint64_t data_ro_flags; + + /* + * If a flag in md_ro_flags is set the the application is not aware of, + * allow the blob to be opened in md_read_only mode. + */ + uint64_t md_ro_flags; +}; + +struct spdk_blob_md_descriptor { + uint8_t type; + uint32_t length; +}; + +#define SPDK_INVALID_MD_PAGE UINT32_MAX + +struct spdk_blob_md_page { + spdk_blob_id id; + + uint32_t sequence_num; + uint32_t reserved0; + + /* Descriptors here */ + uint8_t descriptors[4072]; + + uint32_t next; + uint32_t crc; +}; +#define SPDK_BS_PAGE_SIZE 0x1000 +SPDK_STATIC_ASSERT(SPDK_BS_PAGE_SIZE == sizeof(struct spdk_blob_md_page), "Invalid md page size"); + +#define SPDK_BS_MAX_DESC_SIZE sizeof(((struct spdk_blob_md_page*)0)->descriptors) + +/* Maximum number of extents a single Extent Page can fit. + * For an SPDK_BS_PAGE_SIZE of 4K SPDK_EXTENTS_PER_EP would be 512. */ +#define SPDK_EXTENTS_PER_EP_MAX ((SPDK_BS_MAX_DESC_SIZE - sizeof(struct spdk_blob_md_descriptor_extent_page)) / sizeof(uint32_t)) +#define SPDK_EXTENTS_PER_EP (spdk_align64pow2(SPDK_EXTENTS_PER_EP_MAX + 1) >> 1u) + +#define SPDK_BS_SUPER_BLOCK_SIG "SPDKBLOB" + +struct spdk_bs_super_block { + uint8_t signature[8]; + uint32_t version; + uint32_t length; + uint32_t clean; /* If there was a clean shutdown, this is 1. 
*/ + spdk_blob_id super_blob; + + uint32_t cluster_size; /* In bytes */ + + uint32_t used_page_mask_start; /* Offset from beginning of disk, in pages */ + uint32_t used_page_mask_len; /* Count, in pages */ + + uint32_t used_cluster_mask_start; /* Offset from beginning of disk, in pages */ + uint32_t used_cluster_mask_len; /* Count, in pages */ + + uint32_t md_start; /* Offset from beginning of disk, in pages */ + uint32_t md_len; /* Count, in pages */ + + struct spdk_bs_type bstype; /* blobstore type */ + + uint32_t used_blobid_mask_start; /* Offset from beginning of disk, in pages */ + uint32_t used_blobid_mask_len; /* Count, in pages */ + + uint64_t size; /* size of blobstore in bytes */ + uint32_t io_unit_size; /* Size of io unit in bytes */ + + uint8_t reserved[4000]; + uint32_t crc; +}; +SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_super_block) == 0x1000, "Invalid super block size"); + +#pragma pack(pop) + +struct spdk_bs_dev *bs_create_zeroes_dev(void); +struct spdk_bs_dev *bs_create_blob_bs_dev(struct spdk_blob *blob); + +/* Unit Conversions + * + * The blobstore works with several different units: + * - Byte: Self explanatory + * - LBA: The logical blocks on the backing storage device. + * - Page: The read/write units of blobs and metadata. This is + * an offset into a blob in units of 4KiB. + * - Cluster Index: The disk is broken into a sequential list of + * clusters. This is the offset from the beginning. + * + * NOTE: These conversions all act on simple magnitudes, not with any sort + * of knowledge about the blobs themselves. For instance, converting + * a page to an lba with the conversion function below simply converts + * a number of pages to an equivalent number of lbas, but that + * lba certainly isn't the right lba that corresponds to a page offset + * for a particular blob. 
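[Editor's note] To make the note above concrete, here is a worked example under one assumed geometry (1 MiB clusters, 4 KiB metadata/blob pages, a 512-byte block device); the numbers are illustrative only and simply apply the conversion helpers defined just below:

    pages_per_cluster         = 1048576 / 4096          = 256
    bs_byte_to_lba(bs, 8192)  = 8192 / 512               = 16 LBAs
    bs_page_to_lba(bs, 10)    = 10 * 4096 / 512          = 80
    bs_cluster_to_page(bs, 3) = 3 * 256                  = 768
    bs_cluster_to_lba(bs, 3)  = 3 * (1048576 / 512)      = 6144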
+ */ +static inline uint64_t +bs_byte_to_lba(struct spdk_blob_store *bs, uint64_t length) +{ + assert(length % bs->dev->blocklen == 0); + + return length / bs->dev->blocklen; +} + +static inline uint64_t +bs_dev_byte_to_lba(struct spdk_bs_dev *bs_dev, uint64_t length) +{ + assert(length % bs_dev->blocklen == 0); + + return length / bs_dev->blocklen; +} + +static inline uint64_t +bs_page_to_lba(struct spdk_blob_store *bs, uint64_t page) +{ + return page * SPDK_BS_PAGE_SIZE / bs->dev->blocklen; +} + +static inline uint64_t +bs_md_page_to_lba(struct spdk_blob_store *bs, uint32_t page) +{ + assert(page < bs->md_len); + return bs_page_to_lba(bs, page + bs->md_start); +} + +static inline uint64_t +bs_dev_page_to_lba(struct spdk_bs_dev *bs_dev, uint64_t page) +{ + return page * SPDK_BS_PAGE_SIZE / bs_dev->blocklen; +} + +static inline uint64_t +bs_io_unit_per_page(struct spdk_blob_store *bs) +{ + return SPDK_BS_PAGE_SIZE / bs->io_unit_size; +} + +static inline uint64_t +bs_io_unit_to_page(struct spdk_blob_store *bs, uint64_t io_unit) +{ + return io_unit / bs_io_unit_per_page(bs); +} + +static inline uint64_t +bs_cluster_to_page(struct spdk_blob_store *bs, uint32_t cluster) +{ + return (uint64_t)cluster * bs->pages_per_cluster; +} + +static inline uint32_t +bs_page_to_cluster(struct spdk_blob_store *bs, uint64_t page) +{ + assert(page % bs->pages_per_cluster == 0); + + return page / bs->pages_per_cluster; +} + +static inline uint64_t +bs_cluster_to_lba(struct spdk_blob_store *bs, uint32_t cluster) +{ + return (uint64_t)cluster * (bs->cluster_sz / bs->dev->blocklen); +} + +static inline uint32_t +bs_lba_to_cluster(struct spdk_blob_store *bs, uint64_t lba) +{ + assert(lba % (bs->cluster_sz / bs->dev->blocklen) == 0); + + return lba / (bs->cluster_sz / bs->dev->blocklen); +} + +static inline uint64_t +bs_io_unit_to_back_dev_lba(struct spdk_blob *blob, uint64_t io_unit) +{ + return io_unit * (blob->bs->io_unit_size / blob->back_bs_dev->blocklen); +} + +static inline uint64_t +bs_back_dev_lba_to_io_unit(struct spdk_blob *blob, uint64_t lba) +{ + return lba * (blob->back_bs_dev->blocklen / blob->bs->io_unit_size); +} + +static inline uint64_t +bs_cluster_to_extent_table_id(uint64_t cluster_num) +{ + return cluster_num / SPDK_EXTENTS_PER_EP; +} + +static inline uint32_t * +bs_cluster_to_extent_page(struct spdk_blob *blob, uint64_t cluster_num) +{ + uint64_t extent_table_id = bs_cluster_to_extent_table_id(cluster_num); + + assert(blob->use_extent_table); + assert(extent_table_id < blob->active.extent_pages_array_size); + + return &blob->active.extent_pages[extent_table_id]; +} + +/* End basic conversions */ + +static inline uint64_t +bs_blobid_to_page(spdk_blob_id id) +{ + return id & 0xFFFFFFFF; +} + +/* The blob id is a 64 bit number. The lower 32 bits are the page_idx. The upper + * 32 bits are not currently used. Stick a 1 there just to catch bugs where the + * code assumes blob id == page_idx. + */ +static inline spdk_blob_id +bs_page_to_blobid(uint64_t page_idx) +{ + if (page_idx > UINT32_MAX) { + return SPDK_BLOBID_INVALID; + } + return SPDK_BLOB_BLOBID_HIGH_BIT | page_idx; +} + +/* Given an io unit offset into a blob, look up the LBA for the + * start of that io unit. 
+ */ +static inline uint64_t +bs_blob_io_unit_to_lba(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t lba; + uint64_t pages_per_cluster; + uint8_t shift; + uint64_t io_units_per_cluster; + uint64_t io_units_per_page; + uint64_t page; + + page = bs_io_unit_to_page(blob->bs, io_unit); + + pages_per_cluster = blob->bs->pages_per_cluster; + shift = blob->bs->pages_per_cluster_shift; + io_units_per_page = bs_io_unit_per_page(blob->bs); + + assert(page < blob->active.num_clusters * pages_per_cluster); + + if (shift != 0) { + io_units_per_cluster = io_units_per_page << shift; + lba = blob->active.clusters[page >> shift]; + } else { + io_units_per_cluster = io_units_per_page * pages_per_cluster; + lba = blob->active.clusters[page / pages_per_cluster]; + } + lba += io_unit % io_units_per_cluster; + return lba; +} + +/* Given an io_unit offset into a blob, look up the number of io_units until the + * next cluster boundary. + */ +static inline uint32_t +bs_num_io_units_to_cluster_boundary(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t io_units_per_cluster; + uint8_t shift = blob->bs->pages_per_cluster_shift; + + if (shift != 0) { + io_units_per_cluster = bs_io_unit_per_page(blob->bs) << shift; + } else { + io_units_per_cluster = bs_io_unit_per_page(blob->bs) * blob->bs->pages_per_cluster; + } + + return io_units_per_cluster - (io_unit % io_units_per_cluster); +} + +/* Given a page offset into a blob, look up the number of pages until the + * next cluster boundary. + */ +static inline uint32_t +bs_num_pages_to_cluster_boundary(struct spdk_blob *blob, uint64_t page) +{ + uint64_t pages_per_cluster; + + pages_per_cluster = blob->bs->pages_per_cluster; + + return pages_per_cluster - (page % pages_per_cluster); +} + +/* Given an io_unit offset into a blob, look up the number of pages into blob to beginning of current cluster */ +static inline uint32_t +bs_io_unit_to_cluster_start(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t pages_per_cluster; + uint64_t page; + + pages_per_cluster = blob->bs->pages_per_cluster; + page = bs_io_unit_to_page(blob->bs, io_unit); + + return page - (page % pages_per_cluster); +} + +/* Given an io_unit offset into a blob, look up the number of pages into blob to beginning of current cluster */ +static inline uint32_t +bs_io_unit_to_cluster_number(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t pages_per_cluster = blob->bs->pages_per_cluster; + uint8_t shift = blob->bs->pages_per_cluster_shift; + uint32_t page_offset; + + page_offset = io_unit / bs_io_unit_per_page(blob->bs); + if (shift != 0) { + return page_offset >> shift; + } else { + return page_offset / pages_per_cluster; + } +} + +/* Given an io unit offset into a blob, look up if it is from allocated cluster. 
*/ +static inline bool +bs_io_unit_is_allocated(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t lba; + uint64_t page; + uint64_t pages_per_cluster; + uint8_t shift; + + shift = blob->bs->pages_per_cluster_shift; + pages_per_cluster = blob->bs->pages_per_cluster; + page = bs_io_unit_to_page(blob->bs, io_unit); + + assert(page < blob->active.num_clusters * pages_per_cluster); + + if (shift != 0) { + lba = blob->active.clusters[page >> shift]; + } else { + lba = blob->active.clusters[page / pages_per_cluster]; + } + + if (lba == 0) { + assert(spdk_blob_is_thin_provisioned(blob)); + return false; + } else { + return true; + } +} + +#endif diff --git a/src/spdk/lib/blob/request.c b/src/spdk/lib/blob/request.c new file mode 100644 index 000000000..0975bcf24 --- /dev/null +++ b/src/spdk/lib/blob/request.c @@ -0,0 +1,521 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "blobstore.h" +#include "request.h" + +#include "spdk/thread.h" +#include "spdk/queue.h" + +#include "spdk_internal/log.h" + +void +bs_call_cpl(struct spdk_bs_cpl *cpl, int bserrno) +{ + switch (cpl->type) { + case SPDK_BS_CPL_TYPE_BS_BASIC: + cpl->u.bs_basic.cb_fn(cpl->u.bs_basic.cb_arg, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BS_HANDLE: + cpl->u.bs_handle.cb_fn(cpl->u.bs_handle.cb_arg, + bserrno == 0 ? cpl->u.bs_handle.bs : NULL, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOB_BASIC: + cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOBID: + cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, + bserrno == 0 ? cpl->u.blobid.blobid : SPDK_BLOBID_INVALID, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOB_HANDLE: + cpl->u.blob_handle.cb_fn(cpl->u.blob_handle.cb_arg, + bserrno == 0 ? 
cpl->u.blob_handle.blob : NULL, + bserrno); + break; + case SPDK_BS_CPL_TYPE_NESTED_SEQUENCE: + cpl->u.nested_seq.cb_fn(cpl->u.nested_seq.cb_arg, + cpl->u.nested_seq.parent, + bserrno); + break; + case SPDK_BS_CPL_TYPE_NONE: + /* this completion's callback is handled elsewhere */ + break; + } +} + +static void +bs_request_set_complete(struct spdk_bs_request_set *set) +{ + struct spdk_bs_cpl cpl = set->cpl; + int bserrno = set->bserrno; + + TAILQ_INSERT_TAIL(&set->channel->reqs, set, link); + + bs_call_cpl(&cpl, bserrno); +} + +static void +bs_sequence_completion(struct spdk_io_channel *channel, void *cb_arg, int bserrno) +{ + struct spdk_bs_request_set *set = cb_arg; + + set->bserrno = bserrno; + set->u.sequence.cb_fn((spdk_bs_sequence_t *)set, set->u.sequence.cb_arg, bserrno); +} + +spdk_bs_sequence_t * +bs_sequence_start(struct spdk_io_channel *_channel, + struct spdk_bs_cpl *cpl) +{ + struct spdk_bs_channel *channel; + struct spdk_bs_request_set *set; + + channel = spdk_io_channel_get_ctx(_channel); + assert(channel != NULL); + set = TAILQ_FIRST(&channel->reqs); + if (!set) { + return NULL; + } + TAILQ_REMOVE(&channel->reqs, set, link); + + set->cpl = *cpl; + set->bserrno = 0; + set->channel = channel; + + set->cb_args.cb_fn = bs_sequence_completion; + set->cb_args.cb_arg = set; + set->cb_args.channel = channel->dev_channel; + + return (spdk_bs_sequence_t *)set; +} + +void +bs_sequence_read_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + bs_dev->read(bs_dev, spdk_io_channel_from_ctx(channel), payload, lba, lba_count, &set->cb_args); +} + +void +bs_sequence_read_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->read(channel->dev, channel->dev_channel, payload, lba, lba_count, &set->cb_args); +} + +void +bs_sequence_write_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->write(channel->dev, channel->dev_channel, payload, lba, lba_count, + &set->cb_args); +} + +void +bs_sequence_readv_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev, + struct iovec *iov, int iovcnt, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = 
cb_fn; + set->u.sequence.cb_arg = cb_arg; + + bs_dev->readv(bs_dev, spdk_io_channel_from_ctx(channel), iov, iovcnt, lba, lba_count, + &set->cb_args); +} + +void +bs_sequence_readv_dev(spdk_bs_sequence_t *seq, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + channel->dev->readv(channel->dev, channel->dev_channel, iov, iovcnt, lba, lba_count, + &set->cb_args); +} + +void +bs_sequence_writev_dev(spdk_bs_sequence_t *seq, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->writev(channel->dev, channel->dev_channel, iov, iovcnt, lba, lba_count, + &set->cb_args); +} + +void +bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "writing zeroes to %" PRIu32 " blocks at LBA %" PRIu64 "\n", + lba_count, lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->write_zeroes(channel->dev, channel->dev_channel, lba, lba_count, + &set->cb_args); +} + +void +bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno) +{ + if (bserrno != 0) { + seq->bserrno = bserrno; + } + bs_request_set_complete((struct spdk_bs_request_set *)seq); +} + +void +bs_user_op_sequence_finish(void *cb_arg, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + + bs_sequence_finish(seq, bserrno); +} + +static void +bs_batch_completion(struct spdk_io_channel *_channel, + void *cb_arg, int bserrno) +{ + struct spdk_bs_request_set *set = cb_arg; + + set->u.batch.outstanding_ops--; + if (bserrno != 0) { + set->bserrno = bserrno; + } + + if (set->u.batch.outstanding_ops == 0 && set->u.batch.batch_closed) { + if (set->u.batch.cb_fn) { + set->cb_args.cb_fn = bs_sequence_completion; + set->u.batch.cb_fn((spdk_bs_sequence_t *)set, set->u.batch.cb_arg, bserrno); + } else { + bs_request_set_complete(set); + } + } +} + +spdk_bs_batch_t * +bs_batch_open(struct spdk_io_channel *_channel, + struct spdk_bs_cpl *cpl) +{ + struct spdk_bs_channel *channel; + struct spdk_bs_request_set *set; + + channel = spdk_io_channel_get_ctx(_channel); + assert(channel != NULL); + set = TAILQ_FIRST(&channel->reqs); + if (!set) { + return NULL; + } + TAILQ_REMOVE(&channel->reqs, set, link); + + set->cpl = *cpl; + set->bserrno = 0; + set->channel = channel; + + set->u.batch.cb_fn = NULL; + set->u.batch.cb_arg = NULL; + set->u.batch.outstanding_ops = 0; + set->u.batch.batch_closed = 0; + + set->cb_args.cb_fn = bs_batch_completion; + set->cb_args.cb_arg = set; + set->cb_args.channel = channel->dev_channel; + + return (spdk_bs_batch_t *)set; +} + +void +bs_batch_read_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t 
lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.batch.outstanding_ops++; + bs_dev->read(bs_dev, spdk_io_channel_from_ctx(channel), payload, lba, lba_count, &set->cb_args); +} + +void +bs_batch_read_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.batch.outstanding_ops++; + channel->dev->read(channel->dev, channel->dev_channel, payload, lba, lba_count, &set->cb_args); +} + +void +bs_batch_write_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks to LBA %" PRIu64 "\n", lba_count, lba); + + set->u.batch.outstanding_ops++; + channel->dev->write(channel->dev, channel->dev_channel, payload, lba, lba_count, + &set->cb_args); +} + +void +bs_batch_unmap_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Unmapping %" PRIu32 " blocks at LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.batch.outstanding_ops++; + channel->dev->unmap(channel->dev, channel->dev_channel, lba, lba_count, + &set->cb_args); +} + +void +bs_batch_write_zeroes_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Zeroing %" PRIu32 " blocks at LBA %" PRIu64 "\n", lba_count, lba); + + set->u.batch.outstanding_ops++; + channel->dev->write_zeroes(channel->dev, channel->dev_channel, lba, lba_count, + &set->cb_args); +} + +void +bs_batch_close(spdk_bs_batch_t *batch) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + + set->u.batch.batch_closed = 1; + + if (set->u.batch.outstanding_ops == 0) { + if (set->u.batch.cb_fn) { + set->cb_args.cb_fn = bs_sequence_completion; + set->u.batch.cb_fn((spdk_bs_sequence_t *)set, set->u.batch.cb_arg, set->bserrno); + } else { + bs_request_set_complete(set); + } + } +} + +spdk_bs_batch_t * +bs_sequence_to_batch(spdk_bs_sequence_t *seq, spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + + set->u.batch.cb_fn = cb_fn; + set->u.batch.cb_arg = cb_arg; + set->u.batch.outstanding_ops = 0; + set->u.batch.batch_closed = 0; + + set->cb_args.cb_fn = bs_batch_completion; + + return set; +} + +spdk_bs_user_op_t * +bs_user_op_alloc(struct spdk_io_channel *_channel, struct spdk_bs_cpl *cpl, + enum spdk_blob_op_type op_type, struct spdk_blob *blob, + void *payload, int iovcnt, uint64_t offset, uint64_t length) +{ + struct spdk_bs_channel *channel; + struct spdk_bs_request_set *set; + struct spdk_bs_user_op_args *args; + + channel = spdk_io_channel_get_ctx(_channel); + assert(channel != NULL); + set = TAILQ_FIRST(&channel->reqs); + if (!set) { + return NULL; + } + TAILQ_REMOVE(&channel->reqs, 
set, link); + + set->cpl = *cpl; + set->channel = channel; + + args = &set->u.user_op; + + args->type = op_type; + args->iovcnt = iovcnt; + args->blob = blob; + args->offset = offset; + args->length = length; + args->payload = payload; + + return (spdk_bs_user_op_t *)set; +} + +void +bs_user_op_execute(spdk_bs_user_op_t *op) +{ + struct spdk_bs_request_set *set; + struct spdk_bs_user_op_args *args; + struct spdk_io_channel *ch; + + set = (struct spdk_bs_request_set *)op; + args = &set->u.user_op; + ch = spdk_io_channel_from_ctx(set->channel); + + switch (args->type) { + case SPDK_BLOB_READ: + spdk_blob_io_read(args->blob, ch, args->payload, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_WRITE: + spdk_blob_io_write(args->blob, ch, args->payload, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_UNMAP: + spdk_blob_io_unmap(args->blob, ch, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_WRITE_ZEROES: + spdk_blob_io_write_zeroes(args->blob, ch, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_READV: + spdk_blob_io_readv(args->blob, ch, args->payload, args->iovcnt, + args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_WRITEV: + spdk_blob_io_writev(args->blob, ch, args->payload, args->iovcnt, + args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + } + TAILQ_INSERT_TAIL(&set->channel->reqs, set, link); +} + +void +bs_user_op_abort(spdk_bs_user_op_t *op) +{ + struct spdk_bs_request_set *set; + + set = (struct spdk_bs_request_set *)op; + + set->cpl.u.blob_basic.cb_fn(set->cpl.u.blob_basic.cb_arg, -EIO); + TAILQ_INSERT_TAIL(&set->channel->reqs, set, link); +} + +void +bs_sequence_to_batch_completion(void *cb_arg, int bserrno) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)cb_arg; + + set->u.batch.outstanding_ops--; + + if (set->u.batch.outstanding_ops == 0 && set->u.batch.batch_closed) { + if (set->cb_args.cb_fn) { + set->cb_args.cb_fn(set->cb_args.channel, set->cb_args.cb_arg, bserrno); + } + } +} + +SPDK_LOG_REGISTER_COMPONENT("blob_rw", SPDK_LOG_BLOB_RW) diff --git a/src/spdk/lib/blob/request.h b/src/spdk/lib/blob/request.h new file mode 100644 index 000000000..81dc161db --- /dev/null +++ b/src/spdk/lib/blob/request.h @@ -0,0 +1,217 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BS_REQUEST_H +#define SPDK_BS_REQUEST_H + +#include "spdk/stdinc.h" + +#include "spdk/blob.h" + +enum spdk_bs_cpl_type { + SPDK_BS_CPL_TYPE_NONE, + SPDK_BS_CPL_TYPE_BS_BASIC, + SPDK_BS_CPL_TYPE_BS_HANDLE, + SPDK_BS_CPL_TYPE_BLOB_BASIC, + SPDK_BS_CPL_TYPE_BLOBID, + SPDK_BS_CPL_TYPE_BLOB_HANDLE, + SPDK_BS_CPL_TYPE_NESTED_SEQUENCE, +}; + +enum spdk_blob_op_type; + +struct spdk_bs_request_set; + +/* Use a sequence to submit a set of requests serially */ +typedef struct spdk_bs_request_set spdk_bs_sequence_t; + +/* Use a batch to submit a set of requests in parallel */ +typedef struct spdk_bs_request_set spdk_bs_batch_t; + +/* Use a user_op to queue a user operation for later execution */ +typedef struct spdk_bs_request_set spdk_bs_user_op_t; + +typedef void (*spdk_bs_nested_seq_complete)(void *cb_arg, spdk_bs_sequence_t *parent, int bserrno); + +struct spdk_bs_cpl { + enum spdk_bs_cpl_type type; + union { + struct { + spdk_bs_op_complete cb_fn; + void *cb_arg; + } bs_basic; + + struct { + spdk_bs_op_with_handle_complete cb_fn; + void *cb_arg; + struct spdk_blob_store *bs; + } bs_handle; + + struct { + spdk_blob_op_complete cb_fn; + void *cb_arg; + } blob_basic; + + struct { + spdk_blob_op_with_id_complete cb_fn; + void *cb_arg; + spdk_blob_id blobid; + } blobid; + + struct { + spdk_blob_op_with_handle_complete cb_fn; + void *cb_arg; + struct spdk_blob *blob; + } blob_handle; + + struct { + spdk_bs_nested_seq_complete cb_fn; + void *cb_arg; + spdk_bs_sequence_t *parent; + } nested_seq; + } u; +}; + +typedef void (*spdk_bs_sequence_cpl)(spdk_bs_sequence_t *sequence, + void *cb_arg, int bserrno); + +/* A generic request set. Can be a sequence, batch or a user_op. 
*/ +struct spdk_bs_request_set { + struct spdk_bs_cpl cpl; + + int bserrno; + + struct spdk_bs_channel *channel; + + struct spdk_bs_dev_cb_args cb_args; + + union { + struct { + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; + } sequence; + + struct { + uint32_t outstanding_ops; + uint32_t batch_closed; + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; + } batch; + + struct spdk_bs_user_op_args { + int type; + int iovcnt; + struct spdk_blob *blob; + uint64_t offset; + uint64_t length; + spdk_blob_op_complete cb_fn; + void *cb_arg; + void *payload; /* cast to iov for readv/writev */ + } user_op; + } u; + + TAILQ_ENTRY(spdk_bs_request_set) link; +}; + +void bs_call_cpl(struct spdk_bs_cpl *cpl, int bserrno); + +spdk_bs_sequence_t *bs_sequence_start(struct spdk_io_channel *channel, + struct spdk_bs_cpl *cpl); + +void bs_sequence_read_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_read_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_write_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_readv_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev, + struct iovec *iov, int iovcnt, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_readv_dev(spdk_bs_batch_t *batch, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_writev_dev(spdk_bs_batch_t *batch, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno); + +void bs_user_op_sequence_finish(void *cb_arg, int bserrno); + +spdk_bs_batch_t *bs_batch_open(struct spdk_io_channel *channel, + struct spdk_bs_cpl *cpl); + +void bs_batch_read_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t lba_count); + +void bs_batch_read_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count); + +void bs_batch_write_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count); + +void bs_batch_unmap_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count); + +void bs_batch_write_zeroes_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count); + +void bs_batch_close(spdk_bs_batch_t *batch); + +spdk_bs_batch_t *bs_sequence_to_batch(spdk_bs_sequence_t *seq, + spdk_bs_sequence_cpl cb_fn, + void *cb_arg); + +spdk_bs_user_op_t *bs_user_op_alloc(struct spdk_io_channel *channel, struct spdk_bs_cpl *cpl, + enum spdk_blob_op_type op_type, struct spdk_blob *blob, + void *payload, int iovcnt, uint64_t offset, uint64_t length); + +void bs_user_op_execute(spdk_bs_user_op_t *op); + +void bs_user_op_abort(spdk_bs_user_op_t *op); + +void bs_sequence_to_batch_completion(void *cb_arg, int bserrno); + +#endif diff --git a/src/spdk/lib/blob/spdk_blob.map b/src/spdk/lib/blob/spdk_blob.map new file mode 100644 index 000000000..7c1bc473f --- /dev/null +++ b/src/spdk/lib/blob/spdk_blob.map @@ -0,0 +1,64 @@ +{ + global: + + # Public functions + spdk_bs_opts_init; + spdk_bs_load; + spdk_bs_init; + 
spdk_bs_dump; + spdk_bs_destroy; + spdk_bs_unload; + spdk_bs_set_super; + spdk_bs_get_super; + spdk_bs_get_cluster_size; + spdk_bs_get_page_size; + spdk_bs_get_io_unit_size; + spdk_bs_free_cluster_count; + spdk_bs_total_data_cluster_count; + spdk_blob_get_id; + spdk_blob_get_num_pages; + spdk_blob_get_num_io_units; + spdk_blob_get_num_clusters; + spdk_blob_opts_init; + spdk_bs_create_blob_ext; + spdk_bs_create_blob; + spdk_bs_create_snapshot; + spdk_bs_create_clone; + spdk_blob_get_clones; + spdk_blob_get_parent_snapshot; + spdk_blob_is_read_only; + spdk_blob_is_snapshot; + spdk_blob_is_clone; + spdk_blob_is_thin_provisioned; + spdk_bs_delete_blob; + spdk_bs_inflate_blob; + spdk_bs_blob_decouple_parent; + spdk_blob_open_opts_init; + spdk_bs_open_blob; + spdk_bs_open_blob_ext; + spdk_blob_resize; + spdk_blob_set_read_only; + spdk_blob_sync_md; + spdk_blob_close; + spdk_bs_alloc_io_channel; + spdk_bs_free_io_channel; + spdk_blob_io_write; + spdk_blob_io_read; + spdk_blob_io_writev; + spdk_blob_io_readv; + spdk_blob_io_unmap; + spdk_blob_io_write_zeroes; + spdk_bs_iter_first; + spdk_bs_iter_next; + spdk_blob_set_xattr; + spdk_blob_remove_xattr; + spdk_blob_get_xattr_value; + spdk_blob_get_xattr_names; + spdk_xattr_names_get_count; + spdk_xattr_names_get_name; + spdk_xattr_names_free; + spdk_bs_get_bstype; + spdk_bs_set_bstype; + + local: *; +}; diff --git a/src/spdk/lib/blob/zeroes.c b/src/spdk/lib/blob/zeroes.c new file mode 100644 index 000000000..5e8d70545 --- /dev/null +++ b/src/spdk/lib/blob/zeroes.c @@ -0,0 +1,122 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/blob.h" + +#include "blobstore.h" + +static void +zeroes_destroy(struct spdk_bs_dev *bs_dev) +{ + return; +} + +static void +zeroes_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + memset(payload, 0, dev->blocklen * lba_count); + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); +} + +static void +zeroes_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +zeroes_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + int i; + + for (i = 0; i < iovcnt; i++) { + memset(iov[i].iov_base, 0, iov[i].iov_len); + } + + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); +} + +static void +zeroes_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +zeroes_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +zeroes_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static struct spdk_bs_dev g_zeroes_bs_dev = { + .blockcnt = UINT64_MAX, + .blocklen = 512, + .create_channel = NULL, + .destroy_channel = NULL, + .destroy = zeroes_destroy, + .read = zeroes_read, + .write = zeroes_write, + .readv = zeroes_readv, + .writev = zeroes_writev, + .write_zeroes = zeroes_write_zeroes, + .unmap = zeroes_unmap, +}; + +struct spdk_bs_dev * +bs_create_zeroes_dev(void) +{ + return &g_zeroes_bs_dev; +} diff --git a/src/spdk/lib/blobfs/Makefile b/src/spdk/lib/blobfs/Makefile new file mode 100644 index 000000000..d0c46de02 --- /dev/null +++ b/src/spdk/lib/blobfs/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = blobfs.c tree.c +LIBNAME = blobfs + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blobfs.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/blobfs/blobfs.c b/src/spdk/lib/blobfs/blobfs.c new file mode 100644 index 000000000..3af6b0639 --- /dev/null +++ b/src/spdk/lib/blobfs/blobfs.c @@ -0,0 +1,2980 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/blobfs.h" +#include "spdk/conf.h" +#include "tree.h" + +#include "spdk/queue.h" +#include "spdk/thread.h" +#include "spdk/assert.h" +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" +#include "spdk/trace.h" + +#define BLOBFS_TRACE(file, str, args...) \ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s " str, file->name, ##args) + +#define BLOBFS_TRACE_RW(file, str, args...) 
\ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS_RW, "file=%s " str, file->name, ##args) + +#define BLOBFS_DEFAULT_CACHE_SIZE (4ULL * 1024 * 1024 * 1024) +#define SPDK_BLOBFS_DEFAULT_OPTS_CLUSTER_SZ (1024 * 1024) + +#define SPDK_BLOBFS_SIGNATURE "BLOBFS" + +static uint64_t g_fs_cache_size = BLOBFS_DEFAULT_CACHE_SIZE; +static struct spdk_mempool *g_cache_pool; +static TAILQ_HEAD(, spdk_file) g_caches; +static struct spdk_poller *g_cache_pool_mgmt_poller; +static struct spdk_thread *g_cache_pool_thread; +#define BLOBFS_CACHE_POOL_POLL_PERIOD_IN_US 1000ULL +static int g_fs_count = 0; +static pthread_mutex_t g_cache_init_lock = PTHREAD_MUTEX_INITIALIZER; + +#define TRACE_GROUP_BLOBFS 0x7 +#define TRACE_BLOBFS_XATTR_START SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x0) +#define TRACE_BLOBFS_XATTR_END SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x1) +#define TRACE_BLOBFS_OPEN SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x2) +#define TRACE_BLOBFS_CLOSE SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x3) +#define TRACE_BLOBFS_DELETE_START SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x4) +#define TRACE_BLOBFS_DELETE_DONE SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x5) + +SPDK_TRACE_REGISTER_FN(blobfs_trace, "blobfs", TRACE_GROUP_BLOBFS) +{ + spdk_trace_register_description("BLOBFS_XATTR_START", + TRACE_BLOBFS_XATTR_START, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); + spdk_trace_register_description("BLOBFS_XATTR_END", + TRACE_BLOBFS_XATTR_END, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); + spdk_trace_register_description("BLOBFS_OPEN", + TRACE_BLOBFS_OPEN, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); + spdk_trace_register_description("BLOBFS_CLOSE", + TRACE_BLOBFS_CLOSE, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); + spdk_trace_register_description("BLOBFS_DELETE_START", + TRACE_BLOBFS_DELETE_START, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); + spdk_trace_register_description("BLOBFS_DELETE_DONE", + TRACE_BLOBFS_DELETE_DONE, + OWNER_NONE, OBJECT_NONE, 0, + SPDK_TRACE_ARG_TYPE_STR, + "file: "); +} + +void +cache_buffer_free(struct cache_buffer *cache_buffer) +{ + spdk_mempool_put(g_cache_pool, cache_buffer->buf); + free(cache_buffer); +} + +#define CACHE_READAHEAD_THRESHOLD (128 * 1024) + +struct spdk_file { + struct spdk_filesystem *fs; + struct spdk_blob *blob; + char *name; + uint64_t trace_arg_name; + uint64_t length; + bool is_deleted; + bool open_for_writing; + uint64_t length_flushed; + uint64_t length_xattr; + uint64_t append_pos; + uint64_t seq_byte_count; + uint64_t next_seq_offset; + uint32_t priority; + TAILQ_ENTRY(spdk_file) tailq; + spdk_blob_id blobid; + uint32_t ref_count; + pthread_spinlock_t lock; + struct cache_buffer *last; + struct cache_tree *tree; + TAILQ_HEAD(open_requests_head, spdk_fs_request) open_requests; + TAILQ_HEAD(sync_requests_head, spdk_fs_request) sync_requests; + TAILQ_ENTRY(spdk_file) cache_tailq; +}; + +struct spdk_deleted_file { + spdk_blob_id id; + TAILQ_ENTRY(spdk_deleted_file) tailq; +}; + +struct spdk_filesystem { + struct spdk_blob_store *bs; + TAILQ_HEAD(, spdk_file) files; + struct spdk_bs_opts bs_opts; + struct spdk_bs_dev *bdev; + fs_send_request_fn send_request; + + struct { + uint32_t max_ops; + struct spdk_io_channel *sync_io_channel; + struct spdk_fs_channel *sync_fs_channel; + } sync_target; + + struct { + uint32_t max_ops; + struct spdk_io_channel *md_io_channel; + struct spdk_fs_channel *md_fs_channel; + } md_target; + + struct { + uint32_t max_ops; + } io_target; +}; + +struct 
spdk_fs_cb_args { + union { + spdk_fs_op_with_handle_complete fs_op_with_handle; + spdk_fs_op_complete fs_op; + spdk_file_op_with_handle_complete file_op_with_handle; + spdk_file_op_complete file_op; + spdk_file_stat_op_complete stat_op; + } fn; + void *arg; + sem_t *sem; + struct spdk_filesystem *fs; + struct spdk_file *file; + int rc; + struct iovec *iovs; + uint32_t iovcnt; + struct iovec iov; + union { + struct { + TAILQ_HEAD(, spdk_deleted_file) deleted_files; + } fs_load; + struct { + uint64_t length; + } truncate; + struct { + struct spdk_io_channel *channel; + void *pin_buf; + int is_read; + off_t offset; + size_t length; + uint64_t start_lba; + uint64_t num_lba; + uint32_t blocklen; + } rw; + struct { + const char *old_name; + const char *new_name; + } rename; + struct { + struct cache_buffer *cache_buffer; + uint64_t length; + } flush; + struct { + struct cache_buffer *cache_buffer; + uint64_t length; + uint64_t offset; + } readahead; + struct { + /* offset of the file when the sync request was made */ + uint64_t offset; + TAILQ_ENTRY(spdk_fs_request) tailq; + bool xattr_in_progress; + /* length written to the xattr for this file - this should + * always be the same as the offset if only one thread is + * writing to the file, but could differ if multiple threads + * are appending + */ + uint64_t length; + } sync; + struct { + uint32_t num_clusters; + } resize; + struct { + const char *name; + uint32_t flags; + TAILQ_ENTRY(spdk_fs_request) tailq; + } open; + struct { + const char *name; + struct spdk_blob *blob; + } create; + struct { + const char *name; + } delete; + struct { + const char *name; + } stat; + } op; +}; + +static void file_free(struct spdk_file *file); +static void fs_io_device_unregister(struct spdk_filesystem *fs); +static void fs_free_io_channels(struct spdk_filesystem *fs); + +void +spdk_fs_opts_init(struct spdk_blobfs_opts *opts) +{ + opts->cluster_sz = SPDK_BLOBFS_DEFAULT_OPTS_CLUSTER_SZ; +} + +static int _blobfs_cache_pool_reclaim(void *arg); + +static bool +blobfs_cache_pool_need_reclaim(void) +{ + size_t count; + + count = spdk_mempool_count(g_cache_pool); + /* We define a aggressive policy here as the requirements from db_bench are batched, so start the poller + * when the number of available cache buffer is less than 1/5 of total buffers. 
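 * Reclaim itself (_blobfs_cache_pool_reclaim()) frees buffers from low-priority files that are not open for writing first, then from any file not open for writing, and finally from any file.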
+ */ + if (count > (size_t)g_fs_cache_size / CACHE_BUFFER_SIZE / 5) { + return false; + } + + return true; +} + +static void +__start_cache_pool_mgmt(void *ctx) +{ + assert(g_cache_pool == NULL); + + g_cache_pool = spdk_mempool_create("spdk_fs_cache", + g_fs_cache_size / CACHE_BUFFER_SIZE, + CACHE_BUFFER_SIZE, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!g_cache_pool) { + SPDK_ERRLOG("Create mempool failed, you may " + "increase the memory and try again\n"); + assert(false); + } + TAILQ_INIT(&g_caches); + + assert(g_cache_pool_mgmt_poller == NULL); + g_cache_pool_mgmt_poller = SPDK_POLLER_REGISTER(_blobfs_cache_pool_reclaim, NULL, + BLOBFS_CACHE_POOL_POLL_PERIOD_IN_US); +} + +static void +__stop_cache_pool_mgmt(void *ctx) +{ + spdk_poller_unregister(&g_cache_pool_mgmt_poller); + + assert(g_cache_pool != NULL); + assert(spdk_mempool_count(g_cache_pool) == g_fs_cache_size / CACHE_BUFFER_SIZE); + spdk_mempool_free(g_cache_pool); + g_cache_pool = NULL; + + spdk_thread_exit(g_cache_pool_thread); +} + +static void +initialize_global_cache(void) +{ + pthread_mutex_lock(&g_cache_init_lock); + if (g_fs_count == 0) { + g_cache_pool_thread = spdk_thread_create("cache_pool_mgmt", NULL); + assert(g_cache_pool_thread != NULL); + spdk_thread_send_msg(g_cache_pool_thread, __start_cache_pool_mgmt, NULL); + } + g_fs_count++; + pthread_mutex_unlock(&g_cache_init_lock); +} + +static void +free_global_cache(void) +{ + pthread_mutex_lock(&g_cache_init_lock); + g_fs_count--; + if (g_fs_count == 0) { + spdk_thread_send_msg(g_cache_pool_thread, __stop_cache_pool_mgmt, NULL); + } + pthread_mutex_unlock(&g_cache_init_lock); +} + +static uint64_t +__file_get_blob_size(struct spdk_file *file) +{ + uint64_t cluster_sz; + + cluster_sz = file->fs->bs_opts.cluster_sz; + return cluster_sz * spdk_blob_get_num_clusters(file->blob); +} + +struct spdk_fs_request { + struct spdk_fs_cb_args args; + TAILQ_ENTRY(spdk_fs_request) link; + struct spdk_fs_channel *channel; +}; + +struct spdk_fs_channel { + struct spdk_fs_request *req_mem; + TAILQ_HEAD(, spdk_fs_request) reqs; + sem_t sem; + struct spdk_filesystem *fs; + struct spdk_io_channel *bs_channel; + fs_send_request_fn send_request; + bool sync; + uint32_t outstanding_reqs; + pthread_spinlock_t lock; +}; + +/* For now, this is effectively an alias. But eventually we'll shift + * some data members over. 
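 * spdk_fs_alloc_thread_ctx() creates one of these with ch.sync set, so the blocking wrappers below can park the caller on ch.sem.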
*/ +struct spdk_fs_thread_ctx { + struct spdk_fs_channel ch; +}; + +static struct spdk_fs_request * +alloc_fs_request_with_iov(struct spdk_fs_channel *channel, uint32_t iovcnt) +{ + struct spdk_fs_request *req; + struct iovec *iovs = NULL; + + if (iovcnt > 1) { + iovs = calloc(iovcnt, sizeof(struct iovec)); + if (!iovs) { + return NULL; + } + } + + if (channel->sync) { + pthread_spin_lock(&channel->lock); + } + + req = TAILQ_FIRST(&channel->reqs); + if (req) { + channel->outstanding_reqs++; + TAILQ_REMOVE(&channel->reqs, req, link); + } + + if (channel->sync) { + pthread_spin_unlock(&channel->lock); + } + + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate req on spdk_fs_channel =%p\n", channel); + free(iovs); + return NULL; + } + memset(req, 0, sizeof(*req)); + req->channel = channel; + if (iovcnt > 1) { + req->args.iovs = iovs; + } else { + req->args.iovs = &req->args.iov; + } + req->args.iovcnt = iovcnt; + + return req; +} + +static struct spdk_fs_request * +alloc_fs_request(struct spdk_fs_channel *channel) +{ + return alloc_fs_request_with_iov(channel, 0); +} + +static void +free_fs_request(struct spdk_fs_request *req) +{ + struct spdk_fs_channel *channel = req->channel; + + if (req->args.iovcnt > 1) { + free(req->args.iovs); + } + + if (channel->sync) { + pthread_spin_lock(&channel->lock); + } + + TAILQ_INSERT_HEAD(&req->channel->reqs, req, link); + channel->outstanding_reqs--; + + if (channel->sync) { + pthread_spin_unlock(&channel->lock); + } +} + +static int +fs_channel_create(struct spdk_filesystem *fs, struct spdk_fs_channel *channel, + uint32_t max_ops) +{ + uint32_t i; + + channel->req_mem = calloc(max_ops, sizeof(struct spdk_fs_request)); + if (!channel->req_mem) { + return -1; + } + + channel->outstanding_reqs = 0; + TAILQ_INIT(&channel->reqs); + sem_init(&channel->sem, 0, 0); + + for (i = 0; i < max_ops; i++) { + TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); + } + + channel->fs = fs; + + return 0; +} + +static int +fs_md_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_filesystem *fs; + struct spdk_fs_channel *channel = ctx_buf; + + fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, md_target); + + return fs_channel_create(fs, channel, fs->md_target.max_ops); +} + +static int +fs_sync_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_filesystem *fs; + struct spdk_fs_channel *channel = ctx_buf; + + fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, sync_target); + + return fs_channel_create(fs, channel, fs->sync_target.max_ops); +} + +static int +fs_io_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_filesystem *fs; + struct spdk_fs_channel *channel = ctx_buf; + + fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, io_target); + + return fs_channel_create(fs, channel, fs->io_target.max_ops); +} + +static void +fs_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_fs_channel *channel = ctx_buf; + + if (channel->outstanding_reqs > 0) { + SPDK_ERRLOG("channel freed with %" PRIu32 " outstanding requests!\n", + channel->outstanding_reqs); + } + + free(channel->req_mem); + if (channel->bs_channel != NULL) { + spdk_bs_free_io_channel(channel->bs_channel); + } +} + +static void +__send_request_direct(fs_request_fn fn, void *arg) +{ + fn(arg); +} + +static void +common_fs_bs_init(struct spdk_filesystem *fs, struct spdk_blob_store *bs) +{ + fs->bs = bs; + fs->bs_opts.cluster_sz = spdk_bs_get_cluster_size(bs); + fs->md_target.md_fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs); + 
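	/* The metadata and sync channels are owned by the thread that initialized or loaded the blobstore, so their requests are dispatched inline via __send_request_direct(). */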
fs->md_target.md_fs_channel->send_request = __send_request_direct; + fs->sync_target.sync_fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs); + fs->sync_target.sync_fs_channel->send_request = __send_request_direct; + + initialize_global_cache(); +} + +static void +init_cb(void *ctx, struct spdk_blob_store *bs, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + + if (bserrno == 0) { + common_fs_bs_init(fs, bs); + } else { + free(fs); + fs = NULL; + } + + args->fn.fs_op_with_handle(args->arg, fs, bserrno); + free_fs_request(req); +} + +static void +fs_conf_parse(void) +{ + struct spdk_conf_section *sp; + int cache_buffer_shift; + + sp = spdk_conf_find_section(NULL, "Blobfs"); + if (sp == NULL) { + g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT; + return; + } + + cache_buffer_shift = spdk_conf_section_get_intval(sp, "CacheBufferShift"); + if (cache_buffer_shift <= 0) { + g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT; + } else { + g_fs_cache_buffer_shift = cache_buffer_shift; + } +} + +static struct spdk_filesystem * +fs_alloc(struct spdk_bs_dev *dev, fs_send_request_fn send_request_fn) +{ + struct spdk_filesystem *fs; + + fs = calloc(1, sizeof(*fs)); + if (fs == NULL) { + return NULL; + } + + fs->bdev = dev; + fs->send_request = send_request_fn; + TAILQ_INIT(&fs->files); + + fs->md_target.max_ops = 512; + spdk_io_device_register(&fs->md_target, fs_md_channel_create, fs_channel_destroy, + sizeof(struct spdk_fs_channel), "blobfs_md"); + fs->md_target.md_io_channel = spdk_get_io_channel(&fs->md_target); + fs->md_target.md_fs_channel = spdk_io_channel_get_ctx(fs->md_target.md_io_channel); + + fs->sync_target.max_ops = 512; + spdk_io_device_register(&fs->sync_target, fs_sync_channel_create, fs_channel_destroy, + sizeof(struct spdk_fs_channel), "blobfs_sync"); + fs->sync_target.sync_io_channel = spdk_get_io_channel(&fs->sync_target); + fs->sync_target.sync_fs_channel = spdk_io_channel_get_ctx(fs->sync_target.sync_io_channel); + + fs->io_target.max_ops = 512; + spdk_io_device_register(&fs->io_target, fs_io_channel_create, fs_channel_destroy, + sizeof(struct spdk_fs_channel), "blobfs_io"); + + return fs; +} + +static void +__wake_caller(void *arg, int fserrno) +{ + struct spdk_fs_cb_args *args = arg; + + args->rc = fserrno; + sem_post(args->sem); +} + +void +spdk_fs_init(struct spdk_bs_dev *dev, struct spdk_blobfs_opts *opt, + fs_send_request_fn send_request_fn, + spdk_fs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_filesystem *fs; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + struct spdk_bs_opts opts = {}; + + fs = fs_alloc(dev, send_request_fn); + if (fs == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + fs_conf_parse(); + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + fs_free_io_channels(fs); + fs_io_device_unregister(fs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op_with_handle = cb_fn; + args->arg = cb_arg; + args->fs = fs; + + spdk_bs_opts_init(&opts); + snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), SPDK_BLOBFS_SIGNATURE); + if (opt) { + opts.cluster_sz = opt->cluster_sz; + } + spdk_bs_init(dev, &opts, init_cb, req); +} + +static struct spdk_file * +file_alloc(struct spdk_filesystem *fs) +{ + struct spdk_file *file; + + file = calloc(1, sizeof(*file)); + if (file == NULL) { + return NULL; + } + + file->tree = calloc(1, sizeof(*file->tree)); + if (file->tree == 
NULL) { + free(file); + return NULL; + } + + if (pthread_spin_init(&file->lock, 0)) { + free(file->tree); + free(file); + return NULL; + } + + file->fs = fs; + TAILQ_INIT(&file->open_requests); + TAILQ_INIT(&file->sync_requests); + TAILQ_INSERT_TAIL(&fs->files, file, tailq); + file->priority = SPDK_FILE_PRIORITY_LOW; + return file; +} + +static void fs_load_done(void *ctx, int bserrno); + +static int +_handle_deleted_files(struct spdk_fs_request *req) +{ + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + + if (!TAILQ_EMPTY(&args->op.fs_load.deleted_files)) { + struct spdk_deleted_file *deleted_file; + + deleted_file = TAILQ_FIRST(&args->op.fs_load.deleted_files); + TAILQ_REMOVE(&args->op.fs_load.deleted_files, deleted_file, tailq); + spdk_bs_delete_blob(fs->bs, deleted_file->id, fs_load_done, req); + free(deleted_file); + return 0; + } + + return 1; +} + +static void +fs_load_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + + /* The filesystem has been loaded. Now check if there are any files that + * were marked for deletion before last unload. Do not complete the + * fs_load callback until all of them have been deleted on disk. + */ + if (_handle_deleted_files(req) == 0) { + /* We found a file that's been marked for deleting but not actually + * deleted yet. This function will get called again once the delete + * operation is completed. + */ + return; + } + + args->fn.fs_op_with_handle(args->arg, fs, 0); + free_fs_request(req); + +} + +static void +_file_build_trace_arg_name(struct spdk_file *f) +{ + f->trace_arg_name = 0; + memcpy(&f->trace_arg_name, f->name, + spdk_min(sizeof(f->trace_arg_name), strlen(f->name))); +} + +static void +iter_cb(void *ctx, struct spdk_blob *blob, int rc) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + uint64_t *length; + const char *name; + uint32_t *is_deleted; + size_t value_len; + + if (rc < 0) { + args->fn.fs_op_with_handle(args->arg, fs, rc); + free_fs_request(req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&name, &value_len); + if (rc < 0) { + args->fn.fs_op_with_handle(args->arg, fs, rc); + free_fs_request(req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "length", (const void **)&length, &value_len); + if (rc < 0) { + args->fn.fs_op_with_handle(args->arg, fs, rc); + free_fs_request(req); + return; + } + + assert(value_len == 8); + + /* This file could be deleted last time without close it, then app crashed, so we delete it now */ + rc = spdk_blob_get_xattr_value(blob, "is_deleted", (const void **)&is_deleted, &value_len); + if (rc < 0) { + struct spdk_file *f; + + f = file_alloc(fs); + if (f == NULL) { + SPDK_ERRLOG("Cannot allocate file to handle deleted file on disk\n"); + args->fn.fs_op_with_handle(args->arg, fs, -ENOMEM); + free_fs_request(req); + return; + } + + f->name = strdup(name); + _file_build_trace_arg_name(f); + f->blobid = spdk_blob_get_id(blob); + f->length = *length; + f->length_flushed = *length; + f->length_xattr = *length; + f->append_pos = *length; + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "added file %s length=%ju\n", f->name, f->length); + } else { + struct spdk_deleted_file *deleted_file; + + deleted_file = calloc(1, sizeof(*deleted_file)); + if (deleted_file == NULL) { + args->fn.fs_op_with_handle(args->arg, fs, -ENOMEM); + free_fs_request(req); + return; + } + 
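		/* Only the blob ID is recorded here; fs_load_done() walks this list and deletes the leftover blobs before completing the load callback. */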
deleted_file->id = spdk_blob_get_id(blob); + TAILQ_INSERT_TAIL(&args->op.fs_load.deleted_files, deleted_file, tailq); + } +} + +static void +load_cb(void *ctx, struct spdk_blob_store *bs, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + struct spdk_bs_type bstype; + static const struct spdk_bs_type blobfs_type = {SPDK_BLOBFS_SIGNATURE}; + static const struct spdk_bs_type zeros; + + if (bserrno != 0) { + args->fn.fs_op_with_handle(args->arg, NULL, bserrno); + free_fs_request(req); + fs_free_io_channels(fs); + fs_io_device_unregister(fs); + return; + } + + bstype = spdk_bs_get_bstype(bs); + + if (!memcmp(&bstype, &zeros, sizeof(bstype))) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "assigning bstype\n"); + spdk_bs_set_bstype(bs, blobfs_type); + } else if (memcmp(&bstype, &blobfs_type, sizeof(bstype))) { + SPDK_ERRLOG("not blobfs\n"); + SPDK_LOGDUMP(SPDK_LOG_BLOBFS, "bstype", &bstype, sizeof(bstype)); + args->fn.fs_op_with_handle(args->arg, NULL, -EINVAL); + free_fs_request(req); + fs_free_io_channels(fs); + fs_io_device_unregister(fs); + return; + } + + common_fs_bs_init(fs, bs); + fs_load_done(req, 0); +} + +static void +fs_io_device_unregister(struct spdk_filesystem *fs) +{ + assert(fs != NULL); + spdk_io_device_unregister(&fs->md_target, NULL); + spdk_io_device_unregister(&fs->sync_target, NULL); + spdk_io_device_unregister(&fs->io_target, NULL); + free(fs); +} + +static void +fs_free_io_channels(struct spdk_filesystem *fs) +{ + assert(fs != NULL); + spdk_fs_free_io_channel(fs->md_target.md_io_channel); + spdk_fs_free_io_channel(fs->sync_target.sync_io_channel); +} + +void +spdk_fs_load(struct spdk_bs_dev *dev, fs_send_request_fn send_request_fn, + spdk_fs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_filesystem *fs; + struct spdk_fs_cb_args *args; + struct spdk_fs_request *req; + struct spdk_bs_opts bs_opts; + + fs = fs_alloc(dev, send_request_fn); + if (fs == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + fs_conf_parse(); + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + fs_free_io_channels(fs); + fs_io_device_unregister(fs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op_with_handle = cb_fn; + args->arg = cb_arg; + args->fs = fs; + TAILQ_INIT(&args->op.fs_load.deleted_files); + spdk_bs_opts_init(&bs_opts); + bs_opts.iter_cb_fn = iter_cb; + bs_opts.iter_cb_arg = req; + spdk_bs_load(dev, &bs_opts, load_cb, req); +} + +static void +unload_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + struct spdk_file *file, *tmp; + + TAILQ_FOREACH_SAFE(file, &fs->files, tailq, tmp) { + TAILQ_REMOVE(&fs->files, file, tailq); + file_free(file); + } + + free_global_cache(); + + args->fn.fs_op(args->arg, bserrno); + free(req); + + fs_io_device_unregister(fs); +} + +void +spdk_fs_unload(struct spdk_filesystem *fs, spdk_fs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + /* + * We must free the md_channel before unloading the blobstore, so just + * allocate this request from the general heap. 
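 * A request from md_fs_channel could not be used here: fs_free_io_channels() tears that channel and its request pool down before unload_cb() runs, which is also why unload_cb() releases the request with free().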
+ */ + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op = cb_fn; + args->arg = cb_arg; + args->fs = fs; + + fs_free_io_channels(fs); + spdk_bs_unload(fs->bs, unload_cb, req); +} + +static struct spdk_file * +fs_find_file(struct spdk_filesystem *fs, const char *name) +{ + struct spdk_file *file; + + TAILQ_FOREACH(file, &fs->files, tailq) { + if (!strncmp(name, file->name, SPDK_FILE_NAME_MAX)) { + return file; + } + } + + return NULL; +} + +void +spdk_fs_file_stat_async(struct spdk_filesystem *fs, const char *name, + spdk_file_stat_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file_stat stat; + struct spdk_file *f = NULL; + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, NULL, -ENAMETOOLONG); + return; + } + + f = fs_find_file(fs, name); + if (f != NULL) { + stat.blobid = f->blobid; + stat.size = f->append_pos >= f->length ? f->append_pos : f->length; + cb_fn(cb_arg, &stat, 0); + return; + } + + cb_fn(cb_arg, NULL, -ENOENT); +} + +static void +__copy_stat(void *arg, struct spdk_file_stat *stat, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + args->rc = fserrno; + if (fserrno == 0) { + memcpy(args->arg, stat, sizeof(*stat)); + } + sem_post(args->sem); +} + +static void +__file_stat(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_fs_file_stat_async(args->fs, args->op.stat.name, + args->fn.stat_op, req); +} + +int +spdk_fs_file_stat(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, + const char *name, struct spdk_file_stat *stat) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate stat req on file=%s\n", name); + return -ENOMEM; + } + + req->args.fs = fs; + req->args.op.stat.name = name; + req->args.fn.stat_op = __copy_stat; + req->args.arg = stat; + req->args.sem = &channel->sem; + channel->send_request(__file_stat, req); + sem_wait(&channel->sem); + + rc = req->args.rc; + free_fs_request(req); + + return rc; +} + +static void +fs_create_blob_close_cb(void *ctx, int bserrno) +{ + int rc; + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + rc = args->rc ? 
args->rc : bserrno; + args->fn.file_op(args->arg, rc); + free_fs_request(req); +} + +static void +fs_create_blob_resize_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f = args->file; + struct spdk_blob *blob = args->op.create.blob; + uint64_t length = 0; + + args->rc = bserrno; + if (bserrno) { + spdk_blob_close(blob, fs_create_blob_close_cb, args); + return; + } + + spdk_blob_set_xattr(blob, "name", f->name, strlen(f->name) + 1); + spdk_blob_set_xattr(blob, "length", &length, sizeof(length)); + + spdk_blob_close(blob, fs_create_blob_close_cb, args); +} + +static void +fs_create_blob_open_cb(void *ctx, struct spdk_blob *blob, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + if (bserrno) { + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); + return; + } + + args->op.create.blob = blob; + spdk_blob_resize(blob, 1, fs_create_blob_resize_cb, req); +} + +static void +fs_create_blob_create_cb(void *ctx, spdk_blob_id blobid, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f = args->file; + + if (bserrno) { + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); + return; + } + + f->blobid = blobid; + spdk_bs_open_blob(f->fs->bs, blobid, fs_create_blob_open_cb, req); +} + +void +spdk_fs_create_file_async(struct spdk_filesystem *fs, const char *name, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file *file; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, -ENAMETOOLONG); + return; + } + + file = fs_find_file(fs, name); + if (file != NULL) { + cb_fn(cb_arg, -EEXIST); + return; + } + + file = file_alloc(fs); + if (file == NULL) { + SPDK_ERRLOG("Cannot allocate new file for creation\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate create async req for file=%s\n", name); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->file = file; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + + file->name = strdup(name); + _file_build_trace_arg_name(file); + spdk_bs_create_blob(fs->bs, fs_create_blob_create_cb, args); +} + +static void +__fs_create_file_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + __wake_caller(args, fserrno); + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.create.name); +} + +static void +__fs_create_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.create.name); + spdk_fs_create_file_async(args->fs, args->op.create.name, __fs_create_file_done, req); +} + +int +spdk_fs_create_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, const char *name) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name); + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate req to create file=%s\n", name); + return -ENOMEM; + } + + args = &req->args; + args->fs = fs; + args->op.create.name = name; + args->sem = &channel->sem; + fs->send_request(__fs_create_file, req); + 
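	/* __fs_create_file() is dispatched through fs->send_request (typically to the SPDK thread owning the filesystem); when the create completes, __fs_create_file_done() calls __wake_caller(), which stores rc and posts the semaphore waited on below. */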
sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + + return rc; +} + +static void +fs_open_blob_done(void *ctx, struct spdk_blob *blob, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f = args->file; + + f->blob = blob; + while (!TAILQ_EMPTY(&f->open_requests)) { + req = TAILQ_FIRST(&f->open_requests); + args = &req->args; + TAILQ_REMOVE(&f->open_requests, req, args.op.open.tailq); + spdk_trace_record(TRACE_BLOBFS_OPEN, 0, 0, 0, f->trace_arg_name); + args->fn.file_op_with_handle(args->arg, f, bserrno); + free_fs_request(req); + } +} + +static void +fs_open_blob_create_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + struct spdk_filesystem *fs = args->fs; + + if (file == NULL) { + /* + * This is from an open with CREATE flag - the file + * is now created so look it up in the file list for this + * filesystem. + */ + file = fs_find_file(fs, args->op.open.name); + assert(file != NULL); + args->file = file; + } + + file->ref_count++; + TAILQ_INSERT_TAIL(&file->open_requests, req, args.op.open.tailq); + if (file->ref_count == 1) { + assert(file->blob == NULL); + spdk_bs_open_blob(fs->bs, file->blobid, fs_open_blob_done, req); + } else if (file->blob != NULL) { + fs_open_blob_done(req, file->blob, 0); + } else { + /* + * The blob open for this file is in progress due to a previous + * open request. When that open completes, it will invoke the + * open callback for this request. + */ + } +} + +void +spdk_fs_open_file_async(struct spdk_filesystem *fs, const char *name, uint32_t flags, + spdk_file_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_file *f = NULL; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, NULL, -ENAMETOOLONG); + return; + } + + f = fs_find_file(fs, name); + if (f == NULL && !(flags & SPDK_BLOBFS_OPEN_CREATE)) { + cb_fn(cb_arg, NULL, -ENOENT); + return; + } + + if (f != NULL && f->is_deleted == true) { + cb_fn(cb_arg, NULL, -ENOENT); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate async open req for file=%s\n", name); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + args = &req->args; + args->fn.file_op_with_handle = cb_fn; + args->arg = cb_arg; + args->file = f; + args->fs = fs; + args->op.open.name = name; + + if (f == NULL) { + spdk_fs_create_file_async(fs, name, fs_open_blob_create_cb, req); + } else { + fs_open_blob_create_cb(req, 0); + } +} + +static void +__fs_open_file_done(void *arg, struct spdk_file *file, int bserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + args->file = file; + __wake_caller(args, bserrno); + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.open.name); +} + +static void +__fs_open_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.open.name); + spdk_fs_open_file_async(args->fs, args->op.open.name, args->op.open.flags, + __fs_open_file_done, req); +} + +int +spdk_fs_open_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, + const char *name, uint32_t flags, struct spdk_file **file) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args 
*args; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name); + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate req for opening file=%s\n", name); + return -ENOMEM; + } + + args = &req->args; + args->fs = fs; + args->op.open.name = name; + args->op.open.flags = flags; + args->sem = &channel->sem; + fs->send_request(__fs_open_file, req); + sem_wait(&channel->sem); + rc = args->rc; + if (rc == 0) { + *file = args->file; + } else { + *file = NULL; + } + free_fs_request(req); + + return rc; +} + +static void +fs_rename_blob_close_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + args->fn.fs_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +fs_rename_blob_open_cb(void *ctx, struct spdk_blob *blob, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + const char *new_name = args->op.rename.new_name; + + spdk_blob_set_xattr(blob, "name", new_name, strlen(new_name) + 1); + spdk_blob_close(blob, fs_rename_blob_close_cb, req); +} + +static void +_fs_md_rename_file(struct spdk_fs_request *req) +{ + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f; + + f = fs_find_file(args->fs, args->op.rename.old_name); + if (f == NULL) { + args->fn.fs_op(args->arg, -ENOENT); + free_fs_request(req); + return; + } + + free(f->name); + f->name = strdup(args->op.rename.new_name); + _file_build_trace_arg_name(f); + args->file = f; + spdk_bs_open_blob(args->fs->bs, f->blobid, fs_rename_blob_open_cb, req); +} + +static void +fs_rename_delete_done(void *arg, int fserrno) +{ + _fs_md_rename_file(arg); +} + +void +spdk_fs_rename_file_async(struct spdk_filesystem *fs, + const char *old_name, const char *new_name, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file *f; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "old=%s new=%s\n", old_name, new_name); + if (strnlen(new_name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, -ENAMETOOLONG); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate rename async req for renaming file from %s to %s\n", old_name, + new_name); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op = cb_fn; + args->fs = fs; + args->arg = cb_arg; + args->op.rename.old_name = old_name; + args->op.rename.new_name = new_name; + + f = fs_find_file(fs, new_name); + if (f == NULL) { + _fs_md_rename_file(req); + return; + } + + /* + * The rename overwrites an existing file. So delete the existing file, then + * do the actual rename. 
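 * fs_rename_delete_done() re-enters _fs_md_rename_file() once that delete completes.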
+ */ + spdk_fs_delete_file_async(fs, new_name, fs_rename_delete_done, req); +} + +static void +__fs_rename_file_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + __wake_caller(args, fserrno); +} + +static void +__fs_rename_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_fs_rename_file_async(args->fs, args->op.rename.old_name, args->op.rename.new_name, + __fs_rename_file_done, req); +} + +int +spdk_fs_rename_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, + const char *old_name, const char *new_name) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate rename req for file=%s\n", old_name); + return -ENOMEM; + } + + args = &req->args; + + args->fs = fs; + args->op.rename.old_name = old_name; + args->op.rename.new_name = new_name; + args->sem = &channel->sem; + fs->send_request(__fs_rename_file, req); + sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + return rc; +} + +static void +blob_delete_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +void +spdk_fs_delete_file_async(struct spdk_filesystem *fs, const char *name, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file *f; + spdk_blob_id blobid; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name); + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, -ENAMETOOLONG); + return; + } + + f = fs_find_file(fs, name); + if (f == NULL) { + SPDK_ERRLOG("Cannot find file=%s to delete\n", name); + cb_fn(cb_arg, -ENOENT); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate a request to delete file=%s\n", name); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + + if (f->ref_count > 0) { + /* If the ref count is > 0, mark the file as deleted and delete it when it is closed.
*/ + f->is_deleted = true; + spdk_blob_set_xattr(f->blob, "is_deleted", &f->is_deleted, sizeof(bool)); + spdk_blob_sync_md(f->blob, blob_delete_cb, req); + return; + } + + blobid = f->blobid; + TAILQ_REMOVE(&fs->files, f, tailq); + + file_free(f); + + spdk_bs_delete_blob(fs->bs, blobid, blob_delete_cb, req); +} + +static uint64_t +fs_name_to_uint64(const char *name) +{ + uint64_t result = 0; + memcpy(&result, name, spdk_min(sizeof(result), strlen(name))); + return result; +} + +static void +__fs_delete_file_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_trace_record(TRACE_BLOBFS_DELETE_DONE, 0, 0, 0, fs_name_to_uint64(args->op.delete.name)); + __wake_caller(args, fserrno); +} + +static void +__fs_delete_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_trace_record(TRACE_BLOBFS_DELETE_START, 0, 0, 0, fs_name_to_uint64(args->op.delete.name)); + spdk_fs_delete_file_async(args->fs, args->op.delete.name, __fs_delete_file_done, req); +} + +int +spdk_fs_delete_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, + const char *name) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "Cannot allocate req to delete file=%s\n", name); + return -ENOMEM; + } + + args = &req->args; + args->fs = fs; + args->op.delete.name = name; + args->sem = &channel->sem; + fs->send_request(__fs_delete_file, req); + sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + + return rc; +} + +spdk_fs_iter +spdk_fs_iter_first(struct spdk_filesystem *fs) +{ + struct spdk_file *f; + + f = TAILQ_FIRST(&fs->files); + return f; +} + +spdk_fs_iter +spdk_fs_iter_next(spdk_fs_iter iter) +{ + struct spdk_file *f = iter; + + if (f == NULL) { + return NULL; + } + + f = TAILQ_NEXT(f, tailq); + return f; +} + +const char * +spdk_file_get_name(struct spdk_file *file) +{ + return file->name; +} + +uint64_t +spdk_file_get_length(struct spdk_file *file) +{ + uint64_t length; + + assert(file != NULL); + + length = file->append_pos >= file->length ? 
file->append_pos : file->length; + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s length=0x%jx\n", file->name, length); + return length; +} + +static void +fs_truncate_complete_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +fs_truncate_resize_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + uint64_t *length = &args->op.truncate.length; + + if (bserrno) { + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); + return; + } + + spdk_blob_set_xattr(file->blob, "length", length, sizeof(*length)); + + file->length = *length; + if (file->append_pos > file->length) { + file->append_pos = file->length; + } + + spdk_blob_sync_md(file->blob, fs_truncate_complete_cb, req); +} + +static uint64_t +__bytes_to_clusters(uint64_t length, uint64_t cluster_sz) +{ + return (length + cluster_sz - 1) / cluster_sz; +} + +void +spdk_file_truncate_async(struct spdk_file *file, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_filesystem *fs; + size_t num_clusters; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s old=0x%jx new=0x%jx\n", file->name, file->length, length); + if (length == file->length) { + cb_fn(cb_arg, 0); + return; + } + + req = alloc_fs_request(file->fs->md_target.md_fs_channel); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + args->file = file; + args->op.truncate.length = length; + fs = file->fs; + + num_clusters = __bytes_to_clusters(length, fs->bs_opts.cluster_sz); + + spdk_blob_resize(file->blob, num_clusters, fs_truncate_resize_cb, req); +} + +static void +__truncate(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_file_truncate_async(args->file, args->op.truncate.length, + args->fn.file_op, args); +} + +int +spdk_file_truncate(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx, + uint64_t length) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + return -ENOMEM; + } + + args = &req->args; + + args->file = file; + args->op.truncate.length = length; + args->fn.file_op = __wake_caller; + args->sem = &channel->sem; + + channel->send_request(__truncate, req); + sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + + return rc; +} + +static void +__rw_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + spdk_free(args->op.rw.pin_buf); + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) +{ + int i; + size_t len; + + for (i = 0; i < iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_len); + memcpy(buf, iovs[i].iov_base, len); + buf += len; + assert(buf_len >= len); + buf_len -= len; + } +} + +static void +_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len) +{ + int i; + size_t len; + + for (i = 0; i < iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_len); + memcpy(iovs[i].iov_base, buf, len); + buf += len; + assert(buf_len >= len); + buf_len -= len; + } 
+} + +static void +__read_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + void *buf; + + assert(req != NULL); + buf = (void *)((uintptr_t)args->op.rw.pin_buf + (args->op.rw.offset & (args->op.rw.blocklen - 1))); + if (args->op.rw.is_read) { + _copy_buf_to_iovs(args->iovs, args->iovcnt, buf, args->op.rw.length); + __rw_done(req, 0); + } else { + _copy_iovs_to_buf(buf, args->op.rw.length, args->iovs, args->iovcnt); + spdk_blob_io_write(args->file->blob, args->op.rw.channel, + args->op.rw.pin_buf, + args->op.rw.start_lba, args->op.rw.num_lba, + __rw_done, req); + } +} + +static void +__do_blob_read(void *ctx, int fserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + if (fserrno) { + __rw_done(req, fserrno); + return; + } + spdk_blob_io_read(args->file->blob, args->op.rw.channel, + args->op.rw.pin_buf, + args->op.rw.start_lba, args->op.rw.num_lba, + __read_done, req); +} + +static void +__get_page_parameters(struct spdk_file *file, uint64_t offset, uint64_t length, + uint64_t *start_lba, uint32_t *lba_size, uint64_t *num_lba) +{ + uint64_t end_lba; + + *lba_size = spdk_bs_get_io_unit_size(file->fs->bs); + *start_lba = offset / *lba_size; + end_lba = (offset + length - 1) / *lba_size; + *num_lba = (end_lba - *start_lba + 1); +} + +static bool +__is_lba_aligned(struct spdk_file *file, uint64_t offset, uint64_t length) +{ + uint32_t lba_size = spdk_bs_get_io_unit_size(file->fs->bs); + + if ((offset % lba_size == 0) && (length % lba_size == 0)) { + return true; + } + + return false; +} + +static void +_fs_request_setup_iovs(struct spdk_fs_request *req, struct iovec *iovs, uint32_t iovcnt) +{ + uint32_t i; + + for (i = 0; i < iovcnt; i++) { + req->args.iovs[i].iov_base = iovs[i].iov_base; + req->args.iovs[i].iov_len = iovs[i].iov_len; + } +} + +static void +__readvwritev(struct spdk_file *file, struct spdk_io_channel *_channel, + struct iovec *iovs, uint32_t iovcnt, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg, int is_read) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + uint64_t start_lba, num_lba, pin_buf_length; + uint32_t lba_size; + + if (is_read && offset + length > file->length) { + cb_fn(cb_arg, -EINVAL); + return; + } + + req = alloc_fs_request_with_iov(channel, iovcnt); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba); + + args = &req->args; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + args->file = file; + args->op.rw.channel = channel->bs_channel; + _fs_request_setup_iovs(req, iovs, iovcnt); + args->op.rw.is_read = is_read; + args->op.rw.offset = offset; + args->op.rw.blocklen = lba_size; + + pin_buf_length = num_lba * lba_size; + args->op.rw.length = pin_buf_length; + args->op.rw.pin_buf = spdk_malloc(pin_buf_length, lba_size, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (args->op.rw.pin_buf == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "Failed to allocate buf for: file=%s offset=%jx length=%jx\n", + file->name, offset, length); + free_fs_request(req); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args->op.rw.start_lba = start_lba; + args->op.rw.num_lba = num_lba; + + if (!is_read && file->length < offset + length) { + spdk_file_truncate_async(file, offset + length, __do_blob_read, req); + } else if (!is_read && __is_lba_aligned(file, offset, length)) 
{ + _copy_iovs_to_buf(args->op.rw.pin_buf, args->op.rw.length, args->iovs, args->iovcnt); + spdk_blob_io_write(args->file->blob, args->op.rw.channel, + args->op.rw.pin_buf, + args->op.rw.start_lba, args->op.rw.num_lba, + __rw_done, req); + } else { + __do_blob_read(req, 0); + } +} + +static void +__readwrite(struct spdk_file *file, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg, int is_read) +{ + struct iovec iov; + + iov.iov_base = payload; + iov.iov_len = (size_t)length; + + __readvwritev(file, channel, &iov, 1, offset, length, cb_fn, cb_arg, is_read); +} + +void +spdk_file_write_async(struct spdk_file *file, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + __readwrite(file, channel, payload, offset, length, cb_fn, cb_arg, 0); +} + +void +spdk_file_writev_async(struct spdk_file *file, struct spdk_io_channel *channel, + struct iovec *iovs, uint32_t iovcnt, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s offset=%jx length=%jx\n", + file->name, offset, length); + + __readvwritev(file, channel, iovs, iovcnt, offset, length, cb_fn, cb_arg, 0); +} + +void +spdk_file_read_async(struct spdk_file *file, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s offset=%jx length=%jx\n", + file->name, offset, length); + __readwrite(file, channel, payload, offset, length, cb_fn, cb_arg, 1); +} + +void +spdk_file_readv_async(struct spdk_file *file, struct spdk_io_channel *channel, + struct iovec *iovs, uint32_t iovcnt, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s offset=%jx length=%jx\n", + file->name, offset, length); + + __readvwritev(file, channel, iovs, iovcnt, offset, length, cb_fn, cb_arg, 1); +} + +struct spdk_io_channel * +spdk_fs_alloc_io_channel(struct spdk_filesystem *fs) +{ + struct spdk_io_channel *io_channel; + struct spdk_fs_channel *fs_channel; + + io_channel = spdk_get_io_channel(&fs->io_target); + fs_channel = spdk_io_channel_get_ctx(io_channel); + fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs); + fs_channel->send_request = __send_request_direct; + + return io_channel; +} + +void +spdk_fs_free_io_channel(struct spdk_io_channel *channel) +{ + spdk_put_io_channel(channel); +} + +struct spdk_fs_thread_ctx * +spdk_fs_alloc_thread_ctx(struct spdk_filesystem *fs) +{ + struct spdk_fs_thread_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + return NULL; + } + + if (pthread_spin_init(&ctx->ch.lock, 0)) { + free(ctx); + return NULL; + } + + fs_channel_create(fs, &ctx->ch, 512); + + ctx->ch.send_request = fs->send_request; + ctx->ch.sync = 1; + + return ctx; +} + + +void +spdk_fs_free_thread_ctx(struct spdk_fs_thread_ctx *ctx) +{ + assert(ctx->ch.sync == 1); + + while (true) { + pthread_spin_lock(&ctx->ch.lock); + if (ctx->ch.outstanding_reqs == 0) { + pthread_spin_unlock(&ctx->ch.lock); + break; + } + pthread_spin_unlock(&ctx->ch.lock); + usleep(1000); + } + + fs_channel_destroy(NULL, &ctx->ch); + free(ctx); +} + +int +spdk_fs_set_cache_size(uint64_t size_in_mb) +{ + /* setting g_fs_cache_size is only permitted if cache pool + * is already freed or hasn't been initialized + */ + if (g_cache_pool != NULL) { + return -EPERM; + } + + 
g_fs_cache_size = size_in_mb * 1024 * 1024; + + return 0; +} + +uint64_t +spdk_fs_get_cache_size(void) +{ + return g_fs_cache_size / (1024 * 1024); +} + +static void __file_flush(void *ctx); + +/* Try to free some cache buffers from this file. + */ +static int +reclaim_cache_buffers(struct spdk_file *file) +{ + int rc; + + BLOBFS_TRACE(file, "free=%s\n", file->name); + + /* The function is safe to be called with any threads, while the file + * lock maybe locked by other thread for now, so try to get the file + * lock here. + */ + rc = pthread_spin_trylock(&file->lock); + if (rc != 0) { + return -1; + } + + if (file->tree->present_mask == 0) { + pthread_spin_unlock(&file->lock); + return -1; + } + tree_free_buffers(file->tree); + + TAILQ_REMOVE(&g_caches, file, cache_tailq); + /* If not freed, put it in the end of the queue */ + if (file->tree->present_mask != 0) { + TAILQ_INSERT_TAIL(&g_caches, file, cache_tailq); + } else { + file->last = NULL; + } + pthread_spin_unlock(&file->lock); + + return 0; +} + +static int +_blobfs_cache_pool_reclaim(void *arg) +{ + struct spdk_file *file, *tmp; + int rc; + + if (!blobfs_cache_pool_need_reclaim()) { + return SPDK_POLLER_IDLE; + } + + TAILQ_FOREACH_SAFE(file, &g_caches, cache_tailq, tmp) { + if (!file->open_for_writing && + file->priority == SPDK_FILE_PRIORITY_LOW) { + rc = reclaim_cache_buffers(file); + if (rc < 0) { + continue; + } + if (!blobfs_cache_pool_need_reclaim()) { + return SPDK_POLLER_BUSY; + } + break; + } + } + + TAILQ_FOREACH_SAFE(file, &g_caches, cache_tailq, tmp) { + if (!file->open_for_writing) { + rc = reclaim_cache_buffers(file); + if (rc < 0) { + continue; + } + if (!blobfs_cache_pool_need_reclaim()) { + return SPDK_POLLER_BUSY; + } + break; + } + } + + TAILQ_FOREACH_SAFE(file, &g_caches, cache_tailq, tmp) { + rc = reclaim_cache_buffers(file); + if (rc < 0) { + continue; + } + break; + } + + return SPDK_POLLER_BUSY; +} + +static void +_add_file_to_cache_pool(void *ctx) +{ + struct spdk_file *file = ctx; + + TAILQ_INSERT_TAIL(&g_caches, file, cache_tailq); +} + +static void +_remove_file_from_cache_pool(void *ctx) +{ + struct spdk_file *file = ctx; + + TAILQ_REMOVE(&g_caches, file, cache_tailq); +} + +static struct cache_buffer * +cache_insert_buffer(struct spdk_file *file, uint64_t offset) +{ + struct cache_buffer *buf; + int count = 0; + bool need_update = false; + + buf = calloc(1, sizeof(*buf)); + if (buf == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "calloc failed\n"); + return NULL; + } + + do { + buf->buf = spdk_mempool_get(g_cache_pool); + if (buf->buf) { + break; + } + if (count++ == 100) { + SPDK_ERRLOG("Could not allocate cache buffer for file=%p on offset=%jx\n", + file, offset); + free(buf); + return NULL; + } + usleep(BLOBFS_CACHE_POOL_POLL_PERIOD_IN_US); + } while (true); + + buf->buf_size = CACHE_BUFFER_SIZE; + buf->offset = offset; + + if (file->tree->present_mask == 0) { + need_update = true; + } + file->tree = tree_insert_buffer(file->tree, buf); + + if (need_update) { + spdk_thread_send_msg(g_cache_pool_thread, _add_file_to_cache_pool, file); + } + + return buf; +} + +static struct cache_buffer * +cache_append_buffer(struct spdk_file *file) +{ + struct cache_buffer *last; + + assert(file->last == NULL || file->last->bytes_filled == file->last->buf_size); + assert((file->append_pos % CACHE_BUFFER_SIZE) == 0); + + last = cache_insert_buffer(file, file->append_pos); + if (last == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "cache_insert_buffer failed\n"); + return NULL; + } + + file->last = last; + + return last; +} 
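/*
 * Usage sketch (a minimal illustration, not part of blobfs.c): how a caller
 * might apply the spdk_fs_set_cache_size()/spdk_fs_get_cache_size() pair
 * exported by this library. Only those two calls come from this diff; the
 * helper name configure_cache_or_default() is hypothetical. The sketch
 * assumes it runs before the blobfs cache pool (g_cache_pool) is created,
 * since spdk_fs_set_cache_size() returns -EPERM once the pool exists.
 */
#include "spdk/stdinc.h"
#include "spdk/blobfs.h"

void
configure_cache_or_default(uint64_t requested_mb)
{
	/* Permitted only while the blobfs cache pool has not been allocated. */
	if (spdk_fs_set_cache_size(requested_mb) != 0) {
		printf("cache pool already created, keeping %" PRIu64 " MiB\n",
		       spdk_fs_get_cache_size());
	}
}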
+ +static void __check_sync_reqs(struct spdk_file *file); + +static void +__file_cache_finish_sync(void *ctx, int bserrno) +{ + struct spdk_file *file; + struct spdk_fs_request *sync_req = ctx; + struct spdk_fs_cb_args *sync_args; + + sync_args = &sync_req->args; + file = sync_args->file; + pthread_spin_lock(&file->lock); + file->length_xattr = sync_args->op.sync.length; + assert(sync_args->op.sync.offset <= file->length_flushed); + spdk_trace_record(TRACE_BLOBFS_XATTR_END, 0, sync_args->op.sync.offset, + 0, file->trace_arg_name); + BLOBFS_TRACE(file, "sync done offset=%jx\n", sync_args->op.sync.offset); + TAILQ_REMOVE(&file->sync_requests, sync_req, args.op.sync.tailq); + pthread_spin_unlock(&file->lock); + + sync_args->fn.file_op(sync_args->arg, bserrno); + + free_fs_request(sync_req); + __check_sync_reqs(file); +} + +static void +__check_sync_reqs(struct spdk_file *file) +{ + struct spdk_fs_request *sync_req; + + pthread_spin_lock(&file->lock); + + TAILQ_FOREACH(sync_req, &file->sync_requests, args.op.sync.tailq) { + if (sync_req->args.op.sync.offset <= file->length_flushed) { + break; + } + } + + if (sync_req != NULL && !sync_req->args.op.sync.xattr_in_progress) { + BLOBFS_TRACE(file, "set xattr length 0x%jx\n", file->length_flushed); + sync_req->args.op.sync.xattr_in_progress = true; + sync_req->args.op.sync.length = file->length_flushed; + spdk_blob_set_xattr(file->blob, "length", &file->length_flushed, + sizeof(file->length_flushed)); + + pthread_spin_unlock(&file->lock); + spdk_trace_record(TRACE_BLOBFS_XATTR_START, 0, file->length_flushed, + 0, file->trace_arg_name); + spdk_blob_sync_md(file->blob, __file_cache_finish_sync, sync_req); + } else { + pthread_spin_unlock(&file->lock); + } +} + +static void +__file_flush_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + struct cache_buffer *next = args->op.flush.cache_buffer; + + BLOBFS_TRACE(file, "length=%jx\n", args->op.flush.length); + + pthread_spin_lock(&file->lock); + next->in_progress = false; + next->bytes_flushed += args->op.flush.length; + file->length_flushed += args->op.flush.length; + if (file->length_flushed > file->length) { + file->length = file->length_flushed; + } + if (next->bytes_flushed == next->buf_size) { + BLOBFS_TRACE(file, "write buffer fully flushed 0x%jx\n", file->length_flushed); + next = tree_find_buffer(file->tree, file->length_flushed); + } + + /* + * Assert that there is no cached data that extends past the end of the underlying + * blob. + */ + assert(next == NULL || next->offset < __file_get_blob_size(file) || + next->bytes_filled == 0); + + pthread_spin_unlock(&file->lock); + + __check_sync_reqs(file); + + __file_flush(req); +} + +static void +__file_flush(void *ctx) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + struct cache_buffer *next; + uint64_t offset, length, start_lba, num_lba; + uint32_t lba_size; + + pthread_spin_lock(&file->lock); + next = tree_find_buffer(file->tree, file->length_flushed); + if (next == NULL || next->in_progress || + ((next->bytes_filled < next->buf_size) && TAILQ_EMPTY(&file->sync_requests))) { + /* + * There is either no data to flush, a flush I/O is already in + * progress, or the next buffer is partially filled but there's no + * outstanding request to sync it. 
+ * So return immediately - if a flush I/O is in progress we will flush + * more data after that is completed, or a partial buffer will get flushed + * when it is either filled or the file is synced. + */ + free_fs_request(req); + if (next == NULL) { + /* + * For cases where a file's cache was evicted, and then the + * file was later appended, we will write the data directly + * to disk and bypass cache. So just update length_flushed + * here to reflect that all data was already written to disk. + */ + file->length_flushed = file->append_pos; + } + pthread_spin_unlock(&file->lock); + if (next == NULL) { + /* + * There is no data to flush, but we still need to check for any + * outstanding sync requests to make sure metadata gets updated. + */ + __check_sync_reqs(file); + } + return; + } + + offset = next->offset + next->bytes_flushed; + length = next->bytes_filled - next->bytes_flushed; + if (length == 0) { + free_fs_request(req); + pthread_spin_unlock(&file->lock); + /* + * There is no data to flush, but we still need to check for any + * outstanding sync requests to make sure metadata gets updated. + */ + __check_sync_reqs(file); + return; + } + args->op.flush.length = length; + args->op.flush.cache_buffer = next; + + __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba); + + next->in_progress = true; + BLOBFS_TRACE(file, "offset=0x%jx length=0x%jx page start=0x%jx num=0x%jx\n", + offset, length, start_lba, num_lba); + pthread_spin_unlock(&file->lock); + spdk_blob_io_write(file->blob, file->fs->sync_target.sync_fs_channel->bs_channel, + next->buf + (start_lba * lba_size) - next->offset, + start_lba, num_lba, __file_flush_done, req); +} + +static void +__file_extend_done(void *arg, int bserrno) +{ + struct spdk_fs_cb_args *args = arg; + + __wake_caller(args, bserrno); +} + +static void +__file_extend_resize_cb(void *_args, int bserrno) +{ + struct spdk_fs_cb_args *args = _args; + struct spdk_file *file = args->file; + + if (bserrno) { + __wake_caller(args, bserrno); + return; + } + + spdk_blob_sync_md(file->blob, __file_extend_done, args); +} + +static void +__file_extend_blob(void *_args) +{ + struct spdk_fs_cb_args *args = _args; + struct spdk_file *file = args->file; + + spdk_blob_resize(file->blob, args->op.resize.num_clusters, __file_extend_resize_cb, args); +} + +static void +__rw_from_file_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + + __wake_caller(&req->args, bserrno); + free_fs_request(req); +} + +static void +__rw_from_file(void *ctx) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + + if (args->op.rw.is_read) { + spdk_file_read_async(file, file->fs->sync_target.sync_io_channel, args->iovs[0].iov_base, + args->op.rw.offset, (uint64_t)args->iovs[0].iov_len, + __rw_from_file_done, req); + } else { + spdk_file_write_async(file, file->fs->sync_target.sync_io_channel, args->iovs[0].iov_base, + args->op.rw.offset, (uint64_t)args->iovs[0].iov_len, + __rw_from_file_done, req); + } +} + +static int +__send_rw_from_file(struct spdk_file *file, void *payload, + uint64_t offset, uint64_t length, bool is_read, + struct spdk_fs_channel *channel) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + req = alloc_fs_request_with_iov(channel, 1); + if (req == NULL) { + sem_post(&channel->sem); + return -ENOMEM; + } + + args = &req->args; + args->file = file; + args->sem = &channel->sem; + args->iovs[0].iov_base = payload; + args->iovs[0].iov_len = 
(size_t)length; + args->op.rw.offset = offset; + args->op.rw.is_read = is_read; + file->fs->send_request(__rw_from_file, req); + return 0; +} + +int +spdk_file_write(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx, + void *payload, uint64_t offset, uint64_t length) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *flush_req; + uint64_t rem_length, copy, blob_size, cluster_sz; + uint32_t cache_buffers_filled = 0; + uint8_t *cur_payload; + struct cache_buffer *last; + + BLOBFS_TRACE_RW(file, "offset=%jx length=%jx\n", offset, length); + + if (length == 0) { + return 0; + } + + if (offset != file->append_pos) { + BLOBFS_TRACE(file, " error offset=%jx append_pos=%jx\n", offset, file->append_pos); + return -EINVAL; + } + + pthread_spin_lock(&file->lock); + file->open_for_writing = true; + + if ((file->last == NULL) && (file->append_pos % CACHE_BUFFER_SIZE == 0)) { + cache_append_buffer(file); + } + + if (file->last == NULL) { + int rc; + + file->append_pos += length; + pthread_spin_unlock(&file->lock); + rc = __send_rw_from_file(file, payload, offset, length, false, channel); + sem_wait(&channel->sem); + return rc; + } + + blob_size = __file_get_blob_size(file); + + if ((offset + length) > blob_size) { + struct spdk_fs_cb_args extend_args = {}; + + cluster_sz = file->fs->bs_opts.cluster_sz; + extend_args.sem = &channel->sem; + extend_args.op.resize.num_clusters = __bytes_to_clusters((offset + length), cluster_sz); + extend_args.file = file; + BLOBFS_TRACE(file, "start resize to %u clusters\n", extend_args.op.resize.num_clusters); + pthread_spin_unlock(&file->lock); + file->fs->send_request(__file_extend_blob, &extend_args); + sem_wait(&channel->sem); + if (extend_args.rc) { + return extend_args.rc; + } + } + + flush_req = alloc_fs_request(channel); + if (flush_req == NULL) { + pthread_spin_unlock(&file->lock); + return -ENOMEM; + } + + last = file->last; + rem_length = length; + cur_payload = payload; + while (rem_length > 0) { + copy = last->buf_size - last->bytes_filled; + if (copy > rem_length) { + copy = rem_length; + } + BLOBFS_TRACE_RW(file, " fill offset=%jx length=%jx\n", file->append_pos, copy); + memcpy(&last->buf[last->bytes_filled], cur_payload, copy); + file->append_pos += copy; + if (file->length < file->append_pos) { + file->length = file->append_pos; + } + cur_payload += copy; + last->bytes_filled += copy; + rem_length -= copy; + if (last->bytes_filled == last->buf_size) { + cache_buffers_filled++; + last = cache_append_buffer(file); + if (last == NULL) { + BLOBFS_TRACE(file, "nomem\n"); + free_fs_request(flush_req); + pthread_spin_unlock(&file->lock); + return -ENOMEM; + } + } + } + + pthread_spin_unlock(&file->lock); + + if (cache_buffers_filled == 0) { + free_fs_request(flush_req); + return 0; + } + + flush_req->args.file = file; + file->fs->send_request(__file_flush, flush_req); + return 0; +} + +static void +__readahead_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct cache_buffer *cache_buffer = args->op.readahead.cache_buffer; + struct spdk_file *file = args->file; + + BLOBFS_TRACE(file, "offset=%jx\n", cache_buffer->offset); + + pthread_spin_lock(&file->lock); + cache_buffer->bytes_filled = args->op.readahead.length; + cache_buffer->bytes_flushed = args->op.readahead.length; + cache_buffer->in_progress = false; + pthread_spin_unlock(&file->lock); + + free_fs_request(req); +} + +static void +__readahead(void *ctx) +{ + struct spdk_fs_request *req 
= ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + uint64_t offset, length, start_lba, num_lba; + uint32_t lba_size; + + offset = args->op.readahead.offset; + length = args->op.readahead.length; + assert(length > 0); + + __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba); + + BLOBFS_TRACE(file, "offset=%jx length=%jx page start=%jx num=%jx\n", + offset, length, start_lba, num_lba); + spdk_blob_io_read(file->blob, file->fs->sync_target.sync_fs_channel->bs_channel, + args->op.readahead.cache_buffer->buf, + start_lba, num_lba, __readahead_done, req); +} + +static uint64_t +__next_cache_buffer_offset(uint64_t offset) +{ + return (offset + CACHE_BUFFER_SIZE) & ~(CACHE_TREE_LEVEL_MASK(0)); +} + +static void +check_readahead(struct spdk_file *file, uint64_t offset, + struct spdk_fs_channel *channel) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + offset = __next_cache_buffer_offset(offset); + if (tree_find_buffer(file->tree, offset) != NULL || file->length <= offset) { + return; + } + + req = alloc_fs_request(channel); + if (req == NULL) { + return; + } + args = &req->args; + + BLOBFS_TRACE(file, "offset=%jx\n", offset); + + args->file = file; + args->op.readahead.offset = offset; + args->op.readahead.cache_buffer = cache_insert_buffer(file, offset); + if (!args->op.readahead.cache_buffer) { + BLOBFS_TRACE(file, "Cannot allocate buf for offset=%jx\n", offset); + free_fs_request(req); + return; + } + + args->op.readahead.cache_buffer->in_progress = true; + if (file->length < (offset + CACHE_BUFFER_SIZE)) { + args->op.readahead.length = file->length & (CACHE_BUFFER_SIZE - 1); + } else { + args->op.readahead.length = CACHE_BUFFER_SIZE; + } + file->fs->send_request(__readahead, req); +} + +int64_t +spdk_file_read(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx, + void *payload, uint64_t offset, uint64_t length) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + uint64_t final_offset, final_length; + uint32_t sub_reads = 0; + struct cache_buffer *buf; + uint64_t read_len; + int rc = 0; + + pthread_spin_lock(&file->lock); + + BLOBFS_TRACE_RW(file, "offset=%ju length=%ju\n", offset, length); + + file->open_for_writing = false; + + if (length == 0 || offset >= file->append_pos) { + pthread_spin_unlock(&file->lock); + return 0; + } + + if (offset + length > file->append_pos) { + length = file->append_pos - offset; + } + + if (offset != file->next_seq_offset) { + file->seq_byte_count = 0; + } + file->seq_byte_count += length; + file->next_seq_offset = offset + length; + if (file->seq_byte_count >= CACHE_READAHEAD_THRESHOLD) { + check_readahead(file, offset, channel); + check_readahead(file, offset + CACHE_BUFFER_SIZE, channel); + } + + final_length = 0; + final_offset = offset + length; + while (offset < final_offset) { + length = NEXT_CACHE_BUFFER_OFFSET(offset) - offset; + if (length > (final_offset - offset)) { + length = final_offset - offset; + } + + buf = tree_find_filled_buffer(file->tree, offset); + if (buf == NULL) { + pthread_spin_unlock(&file->lock); + rc = __send_rw_from_file(file, payload, offset, length, true, channel); + pthread_spin_lock(&file->lock); + if (rc == 0) { + sub_reads++; + } + } else { + read_len = length; + if ((offset + length) > (buf->offset + buf->bytes_filled)) { + read_len = buf->offset + buf->bytes_filled - offset; + } + BLOBFS_TRACE(file, "read %p offset=%ju length=%ju\n", payload, offset, read_len); + memcpy(payload, &buf->buf[offset - buf->offset], 
read_len); + if ((offset + read_len) % CACHE_BUFFER_SIZE == 0) { + tree_remove_buffer(file->tree, buf); + if (file->tree->present_mask == 0) { + spdk_thread_send_msg(g_cache_pool_thread, _remove_file_from_cache_pool, file); + } + } + } + + if (rc == 0) { + final_length += length; + } else { + break; + } + payload += length; + offset += length; + } + pthread_spin_unlock(&file->lock); + while (sub_reads > 0) { + sem_wait(&channel->sem); + sub_reads--; + } + if (rc == 0) { + return final_length; + } else { + return rc; + } +} + +static void +_file_sync(struct spdk_file *file, struct spdk_fs_channel *channel, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_request *sync_req; + struct spdk_fs_request *flush_req; + struct spdk_fs_cb_args *sync_args; + struct spdk_fs_cb_args *flush_args; + + BLOBFS_TRACE(file, "offset=%jx\n", file->append_pos); + + pthread_spin_lock(&file->lock); + if (file->append_pos <= file->length_xattr) { + BLOBFS_TRACE(file, "done - file already synced\n"); + pthread_spin_unlock(&file->lock); + cb_fn(cb_arg, 0); + return; + } + + sync_req = alloc_fs_request(channel); + if (!sync_req) { + SPDK_ERRLOG("Cannot allocate sync req for file=%s\n", file->name); + pthread_spin_unlock(&file->lock); + cb_fn(cb_arg, -ENOMEM); + return; + } + sync_args = &sync_req->args; + + flush_req = alloc_fs_request(channel); + if (!flush_req) { + SPDK_ERRLOG("Cannot allocate flush req for file=%s\n", file->name); + free_fs_request(sync_req); + pthread_spin_unlock(&file->lock); + cb_fn(cb_arg, -ENOMEM); + return; + } + flush_args = &flush_req->args; + + sync_args->file = file; + sync_args->fn.file_op = cb_fn; + sync_args->arg = cb_arg; + sync_args->op.sync.offset = file->append_pos; + sync_args->op.sync.xattr_in_progress = false; + TAILQ_INSERT_TAIL(&file->sync_requests, sync_req, args.op.sync.tailq); + pthread_spin_unlock(&file->lock); + + flush_args->file = file; + channel->send_request(__file_flush, flush_req); +} + +int +spdk_file_sync(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_cb_args args = {}; + + args.sem = &channel->sem; + _file_sync(file, channel, __wake_caller, &args); + sem_wait(&channel->sem); + + return args.rc; +} + +void +spdk_file_sync_async(struct spdk_file *file, struct spdk_io_channel *_channel, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + + _file_sync(file, channel, cb_fn, cb_arg); +} + +void +spdk_file_set_priority(struct spdk_file *file, uint32_t priority) +{ + BLOBFS_TRACE(file, "priority=%u\n", priority); + file->priority = priority; + +} + +/* + * Close routines + */ + +static void +__file_close_async_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + + spdk_trace_record(TRACE_BLOBFS_CLOSE, 0, 0, 0, file->trace_arg_name); + + if (file->is_deleted) { + spdk_fs_delete_file_async(file->fs, file->name, blob_delete_cb, ctx); + return; + } + + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +__file_close_async(struct spdk_file *file, struct spdk_fs_request *req) +{ + struct spdk_blob *blob; + + pthread_spin_lock(&file->lock); + if (file->ref_count == 0) { + pthread_spin_unlock(&file->lock); + __file_close_async_done(req, -EBADF); + return; + } + + file->ref_count--; + if (file->ref_count > 0) { + pthread_spin_unlock(&file->lock); + 
req->args.fn.file_op(req->args.arg, 0); + free_fs_request(req); + return; + } + + pthread_spin_unlock(&file->lock); + + blob = file->blob; + file->blob = NULL; + spdk_blob_close(blob, __file_close_async_done, req); +} + +static void +__file_close_async__sync_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + __file_close_async(args->file, req); +} + +void +spdk_file_close_async(struct spdk_file *file, spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + req = alloc_fs_request(file->fs->md_target.md_fs_channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate close async req for file=%s\n", file->name); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->file = file; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + + spdk_file_sync_async(file, file->fs->md_target.md_io_channel, __file_close_async__sync_done, req); +} + +static void +__file_close(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + + __file_close_async(file, req); +} + +int +spdk_file_close(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx) +{ + struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + req = alloc_fs_request(channel); + if (req == NULL) { + SPDK_ERRLOG("Cannot allocate close req for file=%s\n", file->name); + return -ENOMEM; + } + + args = &req->args; + + spdk_file_sync(file, ctx); + BLOBFS_TRACE(file, "name=%s\n", file->name); + args->file = file; + args->sem = &channel->sem; + args->fn.file_op = __wake_caller; + args->arg = args; + channel->send_request(__file_close, req); + sem_wait(&channel->sem); + + return args->rc; +} + +int +spdk_file_get_id(struct spdk_file *file, void *id, size_t size) +{ + if (size < sizeof(spdk_blob_id)) { + return -EINVAL; + } + + memcpy(id, &file->blobid, sizeof(spdk_blob_id)); + + return sizeof(spdk_blob_id); +} + +static void +_file_free(void *ctx) +{ + struct spdk_file *file = ctx; + + TAILQ_REMOVE(&g_caches, file, cache_tailq); + + free(file->name); + free(file->tree); + free(file); +} + +static void +file_free(struct spdk_file *file) +{ + BLOBFS_TRACE(file, "free=%s\n", file->name); + pthread_spin_lock(&file->lock); + if (file->tree->present_mask == 0) { + pthread_spin_unlock(&file->lock); + free(file->name); + free(file->tree); + free(file); + return; + } + + tree_free_buffers(file->tree); + assert(file->tree->present_mask == 0); + spdk_thread_send_msg(g_cache_pool_thread, _file_free, file); + pthread_spin_unlock(&file->lock); +} + +SPDK_LOG_REGISTER_COMPONENT("blobfs", SPDK_LOG_BLOBFS) +SPDK_LOG_REGISTER_COMPONENT("blobfs_rw", SPDK_LOG_BLOBFS_RW) diff --git a/src/spdk/lib/blobfs/spdk_blobfs.map b/src/spdk/lib/blobfs/spdk_blobfs.map new file mode 100644 index 000000000..91c02f61e --- /dev/null +++ b/src/spdk/lib/blobfs/spdk_blobfs.map @@ -0,0 +1,45 @@ +{ + global: + + # public functions + spdk_fs_opts_init; + spdk_fs_init; + spdk_fs_load; + spdk_fs_unload; + spdk_fs_alloc_io_channel; + spdk_fs_free_io_channel; + spdk_fs_alloc_thread_ctx; + spdk_fs_free_thread_ctx; + spdk_fs_file_stat; + spdk_fs_create_file; + spdk_fs_open_file; + spdk_file_close; + spdk_fs_rename_file; + spdk_fs_delete_file; + spdk_fs_iter_first; + spdk_fs_iter_next; + spdk_file_truncate; + spdk_file_get_name; + spdk_file_get_length; + spdk_file_write; + spdk_file_read; + spdk_fs_set_cache_size; 
+ spdk_fs_get_cache_size; + spdk_file_set_priority; + spdk_file_sync; + spdk_file_get_id; + spdk_file_readv_async; + spdk_file_writev_async; + spdk_fs_file_stat_async; + spdk_fs_create_file_async; + spdk_fs_open_file_async; + spdk_file_close_async; + spdk_fs_rename_file_async; + spdk_fs_delete_file_async; + spdk_file_truncate_async; + spdk_file_write_async; + spdk_file_read_async; + spdk_file_sync_async; + + local: *; +}; diff --git a/src/spdk/lib/blobfs/tree.c b/src/spdk/lib/blobfs/tree.c new file mode 100644 index 000000000..32779766f --- /dev/null +++ b/src/spdk/lib/blobfs/tree.c @@ -0,0 +1,181 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/blobfs.h" +#include "tree.h" + +#include "spdk/queue.h" +#include "spdk/assert.h" +#include "spdk/env.h" +#include "spdk_internal/log.h" + +uint32_t g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT; + +struct cache_buffer * +tree_find_buffer(struct cache_tree *tree, uint64_t offset) +{ + uint64_t index; + + while (tree != NULL) { + index = offset / CACHE_TREE_LEVEL_SIZE(tree->level); + if (index >= CACHE_TREE_WIDTH) { + return NULL; + } + if (tree->level == 0) { + return tree->u.buffer[index]; + } else { + offset &= CACHE_TREE_LEVEL_MASK(tree->level); + tree = tree->u.tree[index]; + } + } + + return NULL; +} + +struct cache_buffer * +tree_find_filled_buffer(struct cache_tree *tree, uint64_t offset) +{ + struct cache_buffer *buf; + + buf = tree_find_buffer(tree, offset); + if (buf != NULL && buf->bytes_filled > 0) { + return buf; + } else { + return NULL; + } +} + +struct cache_tree * +tree_insert_buffer(struct cache_tree *root, struct cache_buffer *buffer) +{ + struct cache_tree *tree; + uint64_t index, offset; + + offset = buffer->offset; + while (offset >= CACHE_TREE_LEVEL_SIZE(root->level + 1)) { + if (root->present_mask != 0) { + tree = calloc(1, sizeof(*tree)); + tree->level = root->level + 1; + tree->u.tree[0] = root; + root = tree; + root->present_mask = 0x1ULL; + } else { + root->level++; + } + } + + tree = root; + while (tree->level > 0) { + index = offset / CACHE_TREE_LEVEL_SIZE(tree->level); + assert(index < CACHE_TREE_WIDTH); + offset &= CACHE_TREE_LEVEL_MASK(tree->level); + if (tree->u.tree[index] == NULL) { + tree->u.tree[index] = calloc(1, sizeof(*tree)); + tree->u.tree[index]->level = tree->level - 1; + tree->present_mask |= (1ULL << index); + } + tree = tree->u.tree[index]; + } + + index = offset / CACHE_BUFFER_SIZE; + assert(index < CACHE_TREE_WIDTH); + assert(tree->u.buffer[index] == NULL); + tree->u.buffer[index] = buffer; + tree->present_mask |= (1ULL << index); + return root; +} + +void +tree_remove_buffer(struct cache_tree *tree, struct cache_buffer *buffer) +{ + struct cache_tree *child; + uint64_t index; + + index = CACHE_TREE_INDEX(tree->level, buffer->offset); + + if (tree->level == 0) { + assert(tree->u.buffer[index] != NULL); + assert(buffer == tree->u.buffer[index]); + tree->present_mask &= ~(1ULL << index); + tree->u.buffer[index] = NULL; + cache_buffer_free(buffer); + return; + } + + child = tree->u.tree[index]; + assert(child != NULL); + tree_remove_buffer(child, buffer); + if (child->present_mask == 0) { + tree->present_mask &= ~(1ULL << index); + tree->u.tree[index] = NULL; + free(child); + } +} + +void +tree_free_buffers(struct cache_tree *tree) +{ + struct cache_buffer *buffer; + struct cache_tree *child; + uint32_t i; + + if (tree->present_mask == 0) { + return; + } + + if (tree->level == 0) { + for (i = 0; i < CACHE_TREE_WIDTH; i++) { + buffer = tree->u.buffer[i]; + if (buffer != NULL && buffer->in_progress == false && + buffer->bytes_filled == buffer->bytes_flushed) { + cache_buffer_free(buffer); + tree->u.buffer[i] = NULL; + tree->present_mask &= ~(1ULL << i); + } + } + } else { + for (i = 0; i < CACHE_TREE_WIDTH; i++) { + child = tree->u.tree[i]; + if (child != NULL) { + tree_free_buffers(child); + if (child->present_mask == 0) { + free(child); + tree->u.tree[i] = NULL; + tree->present_mask &= ~(1ULL << i); + } + } + } + } +} diff --git a/src/spdk/lib/blobfs/tree.h b/src/spdk/lib/blobfs/tree.h new file mode 100644 index 000000000..71df71090 --- /dev/null +++ b/src/spdk/lib/blobfs/tree.h @@ -0,0 
+1,77 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_TREE_H_ +#define SPDK_TREE_H_ + +struct cache_buffer { + uint8_t *buf; + uint64_t offset; + uint32_t buf_size; + uint32_t bytes_filled; + uint32_t bytes_flushed; + bool in_progress; +}; + +extern uint32_t g_fs_cache_buffer_shift; + +#define CACHE_BUFFER_SHIFT_DEFAULT 18 +#define CACHE_BUFFER_SIZE (1U << g_fs_cache_buffer_shift) +#define NEXT_CACHE_BUFFER_OFFSET(offset) \ + (((offset + CACHE_BUFFER_SIZE) >> g_fs_cache_buffer_shift) << g_fs_cache_buffer_shift) + +#define CACHE_TREE_SHIFT 6 +#define CACHE_TREE_WIDTH (1U << CACHE_TREE_SHIFT) +#define CACHE_TREE_LEVEL_SHIFT(level) (g_fs_cache_buffer_shift + (level) * CACHE_TREE_SHIFT) +#define CACHE_TREE_LEVEL_SIZE(level) (1ULL << CACHE_TREE_LEVEL_SHIFT(level)) +#define CACHE_TREE_LEVEL_MASK(level) (CACHE_TREE_LEVEL_SIZE(level) - 1) +#define CACHE_TREE_INDEX(level, offset) ((offset >> CACHE_TREE_LEVEL_SHIFT(level)) & (CACHE_TREE_WIDTH - 1)) + +struct cache_tree { + uint8_t level; + uint64_t present_mask; + union { + struct cache_buffer *buffer[CACHE_TREE_WIDTH]; + struct cache_tree *tree[CACHE_TREE_WIDTH]; + } u; +}; + +void cache_buffer_free(struct cache_buffer *cache_buffer); + +struct cache_tree *tree_insert_buffer(struct cache_tree *root, struct cache_buffer *buffer); +void tree_free_buffers(struct cache_tree *tree); +struct cache_buffer *tree_find_buffer(struct cache_tree *tree, uint64_t offset); +struct cache_buffer *tree_find_filled_buffer(struct cache_tree *tree, uint64_t offset); +void tree_remove_buffer(struct cache_tree *tree, struct cache_buffer *buffer); + +#endif /* SPDK_TREE_H_ */ diff --git a/src/spdk/lib/conf/Makefile b/src/spdk/lib/conf/Makefile new file mode 100644 index 000000000..09966ea12 --- /dev/null +++ b/src/spdk/lib/conf/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 1 + +C_SRCS = conf.c +LIBNAME = conf + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_conf.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/conf/conf.c b/src/spdk/lib/conf/conf.c new file mode 100644 index 000000000..287e157a5 --- /dev/null +++ b/src/spdk/lib/conf/conf.c @@ -0,0 +1,704 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/string.h" +#include "spdk/log.h" + +struct spdk_conf_value { + struct spdk_conf_value *next; + char *value; +}; + +struct spdk_conf_item { + struct spdk_conf_item *next; + char *key; + struct spdk_conf_value *val; +}; + +struct spdk_conf_section { + struct spdk_conf_section *next; + char *name; + int num; + struct spdk_conf_item *item; +}; + +struct spdk_conf { + char *file; + struct spdk_conf_section *current_section; + struct spdk_conf_section *section; + bool merge_sections; +}; + +#define CF_DELIM " \t" +#define CF_DELIM_KEY " \t=" + +#define LIB_MAX_TMPBUF 1024 + +static struct spdk_conf *default_config = NULL; + +struct spdk_conf * +spdk_conf_allocate(void) +{ + struct spdk_conf *ret = calloc(1, sizeof(struct spdk_conf)); + + if (ret) { + ret->merge_sections = true; + } + + return ret; +} + +static void +free_conf_value(struct spdk_conf_value *vp) +{ + if (vp == NULL) { + return; + } + + if (vp->value) { + free(vp->value); + } + + free(vp); +} + +static void +free_all_conf_value(struct spdk_conf_value *vp) +{ + struct spdk_conf_value *next; + + if (vp == NULL) { + return; + } + + while (vp != NULL) { + next = vp->next; + free_conf_value(vp); + vp = next; + } +} + +static void +free_conf_item(struct spdk_conf_item *ip) +{ + if (ip == NULL) { + return; + } + + if (ip->val != NULL) { + free_all_conf_value(ip->val); + } + + if (ip->key != NULL) { + free(ip->key); + } + + free(ip); +} + +static void +free_all_conf_item(struct spdk_conf_item *ip) +{ + struct spdk_conf_item *next; + + if (ip == NULL) { + return; + } + + while (ip != NULL) { + next = ip->next; + free_conf_item(ip); + ip = next; + } +} + +static void +free_conf_section(struct spdk_conf_section *sp) +{ + if (sp == NULL) { + return; + } + + if (sp->item) { + free_all_conf_item(sp->item); + } + + if (sp->name) { + free(sp->name); + } + + free(sp); +} + +static void +free_all_conf_section(struct spdk_conf_section *sp) +{ + struct spdk_conf_section *next; + + if (sp == NULL) { + return; + } + + while (sp != NULL) { + next = sp->next; + free_conf_section(sp); + sp = next; + } +} + +void +spdk_conf_free(struct spdk_conf *cp) +{ + if (cp == NULL) { + return; + } + + if (cp->section != NULL) { + free_all_conf_section(cp->section); + } + + if (cp->file != NULL) { + free(cp->file); + } + + free(cp); +} + +static struct spdk_conf_section * +allocate_cf_section(void) +{ + return calloc(1, sizeof(struct spdk_conf_section)); +} + +static struct spdk_conf_item * +allocate_cf_item(void) +{ + return calloc(1, sizeof(struct spdk_conf_item)); +} + +static struct spdk_conf_value * +allocate_cf_value(void) +{ + return calloc(1, sizeof(struct spdk_conf_value)); +} + + +#define CHECK_CP_OR_USE_DEFAULT(cp) (((cp) == NULL) && (default_config != NULL)) ? 
default_config : (cp) + +struct spdk_conf_section * +spdk_conf_find_section(struct spdk_conf *cp, const char *name) +{ + struct spdk_conf_section *sp; + + if (name == NULL || name[0] == '\0') { + return NULL; + } + + cp = CHECK_CP_OR_USE_DEFAULT(cp); + if (cp == NULL) { + return NULL; + } + + for (sp = cp->section; sp != NULL; sp = sp->next) { + if (sp->name != NULL && sp->name[0] == name[0] + && strcasecmp(sp->name, name) == 0) { + return sp; + } + } + + return NULL; +} + +struct spdk_conf_section * +spdk_conf_first_section(struct spdk_conf *cp) +{ + cp = CHECK_CP_OR_USE_DEFAULT(cp); + if (cp == NULL) { + return NULL; + } + + return cp->section; +} + +struct spdk_conf_section * +spdk_conf_next_section(struct spdk_conf_section *sp) +{ + if (sp == NULL) { + return NULL; + } + + return sp->next; +} + +static void +append_cf_section(struct spdk_conf *cp, struct spdk_conf_section *sp) +{ + struct spdk_conf_section *last; + + cp = CHECK_CP_OR_USE_DEFAULT(cp); + if (cp == NULL) { + SPDK_ERRLOG("cp == NULL\n"); + return; + } + + if (cp->section == NULL) { + cp->section = sp; + return; + } + + for (last = cp->section; last->next != NULL; last = last->next) + ; + last->next = sp; +} + +static struct spdk_conf_item * +find_cf_nitem(struct spdk_conf_section *sp, const char *key, int idx) +{ + struct spdk_conf_item *ip; + int i; + + if (key == NULL || key[0] == '\0') { + return NULL; + } + + i = 0; + for (ip = sp->item; ip != NULL; ip = ip->next) { + if (ip->key != NULL && ip->key[0] == key[0] + && strcasecmp(ip->key, key) == 0) { + if (i == idx) { + return ip; + } + i++; + } + } + + return NULL; +} + +static void +append_cf_item(struct spdk_conf_section *sp, struct spdk_conf_item *ip) +{ + struct spdk_conf_item *last; + + if (sp == NULL) { + return; + } + + if (sp->item == NULL) { + sp->item = ip; + return; + } + + for (last = sp->item; last->next != NULL; last = last->next) + ; + last->next = ip; +} + +static void +append_cf_value(struct spdk_conf_item *ip, struct spdk_conf_value *vp) +{ + struct spdk_conf_value *last; + + if (ip == NULL) { + return; + } + + if (ip->val == NULL) { + ip->val = vp; + return; + } + + for (last = ip->val; last->next != NULL; last = last->next) + ; + last->next = vp; +} + +bool +spdk_conf_section_match_prefix(const struct spdk_conf_section *sp, const char *name_prefix) +{ + return strncasecmp(sp->name, name_prefix, strlen(name_prefix)) == 0; +} + +const char * +spdk_conf_section_get_name(const struct spdk_conf_section *sp) +{ + return sp->name; +} + +int +spdk_conf_section_get_num(const struct spdk_conf_section *sp) +{ + return sp->num; +} + +char * +spdk_conf_section_get_nmval(struct spdk_conf_section *sp, const char *key, int idx1, int idx2) +{ + struct spdk_conf_item *ip; + struct spdk_conf_value *vp; + int i; + + ip = find_cf_nitem(sp, key, idx1); + if (ip == NULL) { + return NULL; + } + + vp = ip->val; + if (vp == NULL) { + return NULL; + } + + for (i = 0; vp != NULL; vp = vp->next, i++) { + if (i == idx2) { + return vp->value; + } + } + + return NULL; +} + +char * +spdk_conf_section_get_nval(struct spdk_conf_section *sp, const char *key, int idx) +{ + struct spdk_conf_item *ip; + struct spdk_conf_value *vp; + + ip = find_cf_nitem(sp, key, idx); + if (ip == NULL) { + return NULL; + } + + vp = ip->val; + if (vp == NULL) { + return NULL; + } + + return vp->value; +} + +char * +spdk_conf_section_get_val(struct spdk_conf_section *sp, const char *key) +{ + return spdk_conf_section_get_nval(sp, key, 0); +} + +int +spdk_conf_section_get_intval(struct spdk_conf_section *sp, 
const char *key) +{ + const char *v; + int value; + + v = spdk_conf_section_get_nval(sp, key, 0); + if (v == NULL) { + return -1; + } + + value = (int)spdk_strtol(v, 10); + return value; +} + +bool +spdk_conf_section_get_boolval(struct spdk_conf_section *sp, const char *key, bool default_val) +{ + const char *v; + + v = spdk_conf_section_get_nval(sp, key, 0); + if (v == NULL) { + return default_val; + } + + if (!strcasecmp(v, "Yes") || !strcasecmp(v, "Y") || !strcasecmp(v, "True")) { + return true; + } + + if (!strcasecmp(v, "No") || !strcasecmp(v, "N") || !strcasecmp(v, "False")) { + return false; + } + + return default_val; +} + +static int +parse_line(struct spdk_conf *cp, char *lp) +{ + struct spdk_conf_section *sp; + struct spdk_conf_item *ip; + struct spdk_conf_value *vp; + char *arg; + char *key; + char *val; + char *p; + int num; + + arg = spdk_str_trim(lp); + if (arg == NULL) { + SPDK_ERRLOG("no section\n"); + return -1; + } + + if (arg[0] == '[') { + /* section */ + arg++; + key = spdk_strsepq(&arg, "]"); + if (key == NULL || arg != NULL) { + SPDK_ERRLOG("broken section\n"); + return -1; + } + /* determine section number */ + for (p = key; *p != '\0' && !isdigit((int) *p); p++) + ; + if (*p != '\0') { + num = (int)spdk_strtol(p, 10); + } else { + num = 0; + } + + if (cp->merge_sections) { + sp = spdk_conf_find_section(cp, key); + } else { + sp = NULL; + } + + if (sp == NULL) { + sp = allocate_cf_section(); + append_cf_section(cp, sp); + + sp->name = strdup(key); + if (sp->name == NULL) { + SPDK_ERRLOG("cannot duplicate %s to sp->name\n", key); + return -1; + } + } + cp->current_section = sp; + + + sp->num = num; + } else { + /* parameters */ + sp = cp->current_section; + if (sp == NULL) { + SPDK_ERRLOG("unknown section\n"); + return -1; + } + key = spdk_strsepq(&arg, CF_DELIM_KEY); + if (key == NULL) { + SPDK_ERRLOG("broken key\n"); + return -1; + } + + ip = allocate_cf_item(); + if (ip == NULL) { + SPDK_ERRLOG("cannot allocate cf item\n"); + return -1; + } + append_cf_item(sp, ip); + ip->key = strdup(key); + if (ip->key == NULL) { + SPDK_ERRLOG("cannot make duplicate of %s\n", key); + return -1; + } + ip->val = NULL; + if (arg != NULL) { + /* key has value(s) */ + while (arg != NULL) { + val = spdk_strsepq(&arg, CF_DELIM); + vp = allocate_cf_value(); + if (vp == NULL) { + SPDK_ERRLOG("cannot allocate cf value\n"); + return -1; + } + append_cf_value(ip, vp); + vp->value = strdup(val); + if (vp->value == NULL) { + SPDK_ERRLOG("cannot duplicate %s to vp->value\n", val); + return -1; + } + } + } + } + + return 0; +} + +static char * +fgets_line(FILE *fp) +{ + char *dst, *dst2, *p; + size_t total, len; + + dst = p = malloc(LIB_MAX_TMPBUF); + if (!dst) { + return NULL; + } + + dst[0] = '\0'; + total = 0; + + while (fgets(p, LIB_MAX_TMPBUF, fp) != NULL) { + len = strlen(p); + total += len; + if (len + 1 < LIB_MAX_TMPBUF || dst[total - 1] == '\n') { + dst2 = realloc(dst, total + 1); + if (!dst2) { + free(dst); + return NULL; + } else { + return dst2; + } + } + + dst2 = realloc(dst, total + LIB_MAX_TMPBUF); + if (!dst2) { + free(dst); + return NULL; + } else { + dst = dst2; + } + + p = dst + total; + } + + if (feof(fp) && total != 0) { + dst2 = realloc(dst, total + 2); + if (!dst2) { + free(dst); + return NULL; + } else { + dst = dst2; + } + + dst[total] = '\n'; + dst[total + 1] = '\0'; + return dst; + } + + free(dst); + + return NULL; +} + +int +spdk_conf_read(struct spdk_conf *cp, const char *file) +{ + FILE *fp; + char *lp, *p; + char *lp2, *q; + int line; + int n, n2; + + if (file == 
NULL || file[0] == '\0') { + return -1; + } + SPDK_ERRLOG("INI configuration has been deprecated and will be removed in a future release. Please switch to JSON-RPC.\n"); + + fp = fopen(file, "r"); + if (fp == NULL) { + SPDK_ERRLOG("open error: %s\n", file); + return -1; + } + + cp->file = strdup(file); + if (cp->file == NULL) { + SPDK_ERRLOG("cannot duplicate %s to cp->file\n", file); + fclose(fp); + return -1; + } + + line = 1; + while ((lp = fgets_line(fp)) != NULL) { + /* skip spaces */ + for (p = lp; *p != '\0' && isspace((int) *p); p++) + ; + /* skip comment, empty line */ + if (p[0] == '#' || p[0] == '\0') { + goto next_line; + } + + /* concatenate line end with '\' */ + n = strlen(p); + while (n > 2 && p[n - 1] == '\n' && p[n - 2] == '\\') { + n -= 2; + lp2 = fgets_line(fp); + if (lp2 == NULL) { + break; + } + + line++; + n2 = strlen(lp2); + + q = malloc(n + n2 + 1); + if (!q) { + free(lp2); + free(lp); + SPDK_ERRLOG("malloc failed at line %d of %s\n", line, cp->file); + fclose(fp); + return -1; + } + + memcpy(q, p, n); + memcpy(q + n, lp2, n2); + q[n + n2] = '\0'; + free(lp2); + free(lp); + p = lp = q; + n += n2; + } + + /* parse one line */ + if (parse_line(cp, p) < 0) { + SPDK_ERRLOG("parse error at line %d of %s\n", line, cp->file); + } +next_line: + line++; + free(lp); + } + + fclose(fp); + return 0; +} + +void +spdk_conf_set_as_default(struct spdk_conf *cp) +{ + default_config = cp; +} + +void +spdk_conf_disable_sections_merge(struct spdk_conf *cp) +{ + cp->merge_sections = false; +} diff --git a/src/spdk/lib/conf/spdk_conf.map b/src/spdk/lib/conf/spdk_conf.map new file mode 100644 index 000000000..0fc01c8aa --- /dev/null +++ b/src/spdk/lib/conf/spdk_conf.map @@ -0,0 +1,23 @@ +{ + global: + + # Public functions + spdk_conf_allocate; + spdk_conf_free; + spdk_conf_read; + spdk_conf_find_section; + spdk_conf_first_section; + spdk_conf_next_section; + spdk_conf_section_match_prefix; + spdk_conf_section_get_name; + spdk_conf_section_get_num; + spdk_conf_section_get_nmval; + spdk_conf_section_get_nval; + spdk_conf_section_get_val; + spdk_conf_section_get_intval; + spdk_conf_section_get_boolval; + spdk_conf_set_as_default; + spdk_conf_disable_sections_merge; + + local: *; +}; diff --git a/src/spdk/lib/env_dpdk/Makefile b/src/spdk/lib/env_dpdk/Makefile new file mode 100644 index 000000000..11433fe86 --- /dev/null +++ b/src/spdk/lib/env_dpdk/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 5 +SO_MINOR := 0 + +CFLAGS += $(ENV_CFLAGS) +C_SRCS = env.c memory.c pci.c init.c threads.c +C_SRCS += pci_ioat.c pci_virtio.c pci_vmd.c pci_idxd.c +LIBNAME = env_dpdk + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_env_dpdk.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/env_dpdk/env.c b/src/spdk/lib/env_dpdk/env.c new file mode 100644 index 000000000..94b709de9 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env.c @@ -0,0 +1,451 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/util.h" +#include "spdk/env_dpdk.h" + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_cycles.h> +#include <rte_malloc.h> +#include <rte_mempool.h> +#include <rte_memzone.h> +#include <rte_version.h> + +static uint64_t +virt_to_phys(void *vaddr) +{ + uint64_t ret; + + ret = rte_malloc_virt2iova(vaddr); + if (ret != RTE_BAD_IOVA) { + return ret; + } + + return spdk_vtophys(vaddr, NULL); +} + +void * +spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags) +{ + void *buf; + + if (flags == 0) { + return NULL; + } + + align = spdk_max(align, RTE_CACHE_LINE_SIZE); + buf = rte_malloc_socket(NULL, size, align, socket_id); + if (buf && phys_addr) { +#ifdef DEBUG + fprintf(stderr, "phys_addr param in spdk_*malloc() is deprecated\n"); +#endif + *phys_addr = virt_to_phys(buf); + } + return buf; +} + +void * +spdk_zmalloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags) +{ + void *buf = spdk_malloc(size, align, phys_addr, socket_id, flags); + if (buf) { + memset(buf, 0, size); + } + return buf; +} + +void * +spdk_realloc(void *buf, size_t size, size_t align) +{ + align = spdk_max(align, RTE_CACHE_LINE_SIZE); + return rte_realloc(buf, size, align); +} + +void +spdk_free(void *buf) +{ + rte_free(buf); +} + +void * +spdk_dma_malloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id) +{ + return spdk_malloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE)); +} + +void * +spdk_dma_zmalloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id) +{ + return spdk_zmalloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE)); +} + +void * +spdk_dma_malloc(size_t size, size_t align, uint64_t *phys_addr) +{ + return spdk_dma_malloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY); +} + +void * +spdk_dma_zmalloc(size_t size, size_t align, uint64_t *phys_addr) +{ + return spdk_dma_zmalloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY); +} + +void * +spdk_dma_realloc(void *buf, size_t size, size_t align, uint64_t *phys_addr) +{ + void *new_buf; + + align = spdk_max(align, RTE_CACHE_LINE_SIZE); + new_buf = rte_realloc(buf, size, align); + if (new_buf && phys_addr) { + *phys_addr = virt_to_phys(new_buf); + } + return new_buf; +} + +void +spdk_dma_free(void *buf) +{ + spdk_free(buf); +} + +void * +spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align) +{ + const struct rte_memzone *mz; + unsigned dpdk_flags = 0; + + if ((flags & SPDK_MEMZONE_NO_IOVA_CONTIG) == 0) { + dpdk_flags |= RTE_MEMZONE_IOVA_CONTIG; + } + + if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { + socket_id = SOCKET_ID_ANY; + } + + mz = rte_memzone_reserve_aligned(name, len, socket_id, dpdk_flags, align); + + if (mz != NULL) { + memset(mz->addr, 0, len); + return mz->addr; + } else { + return NULL; + } +} + +void * +spdk_memzone_reserve(const char *name, size_t len, int socket_id, unsigned flags) +{ + return spdk_memzone_reserve_aligned(name, len, socket_id, flags, + RTE_CACHE_LINE_SIZE); +} + +void * +spdk_memzone_lookup(const char *name) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz != NULL) { + return mz->addr; + } else { + return NULL; + } +} + +int +spdk_memzone_free(const char *name) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz != NULL) { + return rte_memzone_free(mz); + } + + return -1; +} + +void +spdk_memzone_dump(FILE *f) 
+{ + rte_memzone_dump(f); +} + +struct spdk_mempool * +spdk_mempool_create_ctor(const char *name, size_t count, + size_t ele_size, size_t cache_size, int socket_id, + spdk_mempool_obj_cb_t *obj_init, void *obj_init_arg) +{ + struct rte_mempool *mp; + size_t tmp; + + if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { + socket_id = SOCKET_ID_ANY; + } + + /* No more than half of all elements can be in cache */ + tmp = (count / 2) / rte_lcore_count(); + if (cache_size > tmp) { + cache_size = tmp; + } + + if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) { + cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE; + } + + mp = rte_mempool_create(name, count, ele_size, cache_size, + 0, NULL, NULL, (rte_mempool_obj_cb_t *)obj_init, obj_init_arg, + socket_id, MEMPOOL_F_NO_PHYS_CONTIG); + + return (struct spdk_mempool *)mp; +} + + +struct spdk_mempool * +spdk_mempool_create(const char *name, size_t count, + size_t ele_size, size_t cache_size, int socket_id) +{ + return spdk_mempool_create_ctor(name, count, ele_size, cache_size, socket_id, + NULL, NULL); +} + +char * +spdk_mempool_get_name(struct spdk_mempool *mp) +{ + return ((struct rte_mempool *)mp)->name; +} + +void +spdk_mempool_free(struct spdk_mempool *mp) +{ + rte_mempool_free((struct rte_mempool *)mp); +} + +void * +spdk_mempool_get(struct spdk_mempool *mp) +{ + void *ele = NULL; + int rc; + + rc = rte_mempool_get((struct rte_mempool *)mp, &ele); + if (rc != 0) { + return NULL; + } + return ele; +} + +int +spdk_mempool_get_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count) +{ + return rte_mempool_get_bulk((struct rte_mempool *)mp, ele_arr, count); +} + +void +spdk_mempool_put(struct spdk_mempool *mp, void *ele) +{ + rte_mempool_put((struct rte_mempool *)mp, ele); +} + +void +spdk_mempool_put_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count) +{ + rte_mempool_put_bulk((struct rte_mempool *)mp, ele_arr, count); +} + +size_t +spdk_mempool_count(const struct spdk_mempool *pool) +{ + return rte_mempool_avail_count((struct rte_mempool *)pool); +} + +uint32_t +spdk_mempool_obj_iter(struct spdk_mempool *mp, spdk_mempool_obj_cb_t obj_cb, + void *obj_cb_arg) +{ + return rte_mempool_obj_iter((struct rte_mempool *)mp, (rte_mempool_obj_cb_t *)obj_cb, + obj_cb_arg); +} + +struct spdk_mempool * +spdk_mempool_lookup(const char *name) +{ + return (struct spdk_mempool *)rte_mempool_lookup(name); +} + +bool +spdk_process_is_primary(void) +{ + return (rte_eal_process_type() == RTE_PROC_PRIMARY); +} + +uint64_t spdk_get_ticks(void) +{ + return rte_get_timer_cycles(); +} + +uint64_t spdk_get_ticks_hz(void) +{ + return rte_get_timer_hz(); +} + +void spdk_delay_us(unsigned int us) +{ + rte_delay_us(us); +} + +void spdk_pause(void) +{ + rte_pause(); +} + +void +spdk_unaffinitize_thread(void) +{ + rte_cpuset_t new_cpuset, orig_cpuset; + long num_cores, i, orig_num_cores; + + CPU_ZERO(&new_cpuset); + + num_cores = sysconf(_SC_NPROCESSORS_CONF); + + /* Create a mask containing all CPUs */ + for (i = 0; i < num_cores; i++) { + CPU_SET(i, &new_cpuset); + } + + rte_thread_get_affinity(&orig_cpuset); + orig_num_cores = CPU_COUNT(&orig_cpuset); + if (orig_num_cores < num_cores) { + for (i = 0; i < orig_num_cores; i++) { + if (CPU_ISSET(i, &orig_cpuset)) { + CPU_CLR(i, &new_cpuset); + } + } + } + + rte_thread_set_affinity(&new_cpuset); +} + +void * +spdk_call_unaffinitized(void *cb(void *arg), void *arg) +{ + rte_cpuset_t orig_cpuset; + void *ret; + + if (cb == NULL) { + return NULL; + } + + rte_thread_get_affinity(&orig_cpuset); + + spdk_unaffinitize_thread(); + + ret = cb(arg); + + 
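The mempool wrappers above map directly onto rte_mempool. A hypothetical caller might use them as sketched below; the pool name, element count, element size, and cache size are arbitrary illustration values:

#include <errno.h>
#include "spdk/env.h"

/* Create a pool of 1024 elements of 4 KiB each with a per-core cache of 64,
 * take one element, return it, and destroy the pool. */
static int
example_mempool(void)
{
        struct spdk_mempool *mp;
        void *ele;

        mp = spdk_mempool_create("example_bufs", 1024, 4096, 64, SPDK_ENV_SOCKET_ID_ANY);
        if (mp == NULL) {
                return -ENOMEM;
        }

        ele = spdk_mempool_get(mp);
        if (ele == NULL) {
                spdk_mempool_free(mp);
                return -ENOMEM;
        }

        spdk_mempool_put(mp, ele);
        spdk_mempool_free(mp);
        return 0;
}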
rte_thread_set_affinity(&orig_cpuset); + + return ret; +} + +struct spdk_ring * +spdk_ring_create(enum spdk_ring_type type, size_t count, int socket_id) +{ + char ring_name[64]; + static uint32_t ring_num = 0; + unsigned flags = RING_F_EXACT_SZ; + + switch (type) { + case SPDK_RING_TYPE_SP_SC: + flags |= RING_F_SP_ENQ | RING_F_SC_DEQ; + break; + case SPDK_RING_TYPE_MP_SC: + flags |= RING_F_SC_DEQ; + break; + case SPDK_RING_TYPE_MP_MC: + flags |= 0; + break; + default: + return NULL; + } + + snprintf(ring_name, sizeof(ring_name), "ring_%u_%d", + __atomic_fetch_add(&ring_num, 1, __ATOMIC_RELAXED), getpid()); + + return (struct spdk_ring *)rte_ring_create(ring_name, count, socket_id, flags); +} + +void +spdk_ring_free(struct spdk_ring *ring) +{ + rte_ring_free((struct rte_ring *)ring); +} + +size_t +spdk_ring_count(struct spdk_ring *ring) +{ + return rte_ring_count((struct rte_ring *)ring); +} + +size_t +spdk_ring_enqueue(struct spdk_ring *ring, void **objs, size_t count, + size_t *free_space) +{ + return rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count, + (unsigned int *)free_space); +} + +size_t +spdk_ring_dequeue(struct spdk_ring *ring, void **objs, size_t count) +{ + return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count, NULL); +} + +void +spdk_env_dpdk_dump_mem_stats(FILE *file) +{ + fprintf(file, "DPDK memory size %lu\n", rte_eal_get_physmem_size()); + fprintf(file, "DPDK memory layout\n"); + rte_dump_physmem_layout(file); + fprintf(file, "DPDK memzones.\n"); + rte_memzone_dump(file); + fprintf(file, "DPDK mempools.\n"); + rte_mempool_list_dump(file); + fprintf(file, "DPDK malloc stats.\n"); + rte_malloc_dump_stats(file, NULL); + fprintf(file, "DPDK malloc heaps.\n"); + rte_malloc_dump_heaps(file); +} diff --git a/src/spdk/lib/env_dpdk/env.mk b/src/spdk/lib/env_dpdk/env.mk new file mode 100644 index 000000000..c2bfb0d19 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env.mk @@ -0,0 +1,176 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
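The spdk_ring wrappers defined in env.c above follow the same thin-shim pattern over rte_ring. A purely illustrative usage sketch follows; the function name, ring size, and pointer values are placeholders and not part of this diff:

#include "spdk/env.h"

/* Multi-producer/single-consumer ring with room for 1024 pointers. */
static void
example_ring(void)
{
        struct spdk_ring *ring;
        void *objs[4] = { (void *)0x1, (void *)0x2, (void *)0x3, (void *)0x4 };
        void *out[4];
        size_t n;

        ring = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 1024, SPDK_ENV_SOCKET_ID_ANY);
        if (ring == NULL) {
                return;
        }

        n = spdk_ring_enqueue(ring, objs, 4, NULL);     /* number of objects enqueued */
        n = spdk_ring_dequeue(ring, out, 4);            /* number of objects dequeued */
        (void)n;

        spdk_ring_free(ring);
}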
+# + +# This makefile snippet must define the following flags: +# ENV_CFLAGS +# ENV_CXXFLAGS +# ENV_LIBS +# ENV_LINKER_ARGS + +DPDK_DIR = $(CONFIG_DPDK_DIR) + +export DPDK_ABS_DIR = $(abspath $(DPDK_DIR)) + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/include/rte_config.h)) +DPDK_INC_DIR := $(DPDK_ABS_DIR)/include +else +DPDK_INC_DIR := $(DPDK_ABS_DIR)/include/dpdk +endif +DPDK_INC := -I$(DPDK_INC_DIR) + +ifeq ($(CONFIG_SHARED),y) +DPDK_LIB_EXT = .so +else +DPDK_LIB_EXT = .a +endif + +DPDK_LIB_LIST = rte_eal rte_mempool rte_ring rte_mbuf + +# librte_mempool_ring was new added from DPDK 17.05. Link this library used for +# ring based mempool management API. +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_mempool_ring.*)) +DPDK_LIB_LIST += rte_mempool_ring +endif + +# librte_malloc was removed after DPDK 2.1. Link this library conditionally based on its +# existence to maintain backward compatibility. +ifneq ($(wildcard $(DPDK_ABS_DIR)/lib/librte_malloc.*),) +DPDK_LIB_LIST += rte_malloc +endif + +# librte_pci and librte_bus_pci were added in DPDK 17.11. Link these libraries conditionally +# based on their existence to maintain backward compatibility. +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_pci.*)) +DPDK_LIB_LIST += rte_pci +endif + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_bus_pci.*)) +DPDK_LIB_LIST += rte_bus_pci +endif + +# DPDK 20.05 eal dependency +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_telemetry.*)) +DPDK_LIB_LIST += rte_telemetry +endif + +# There are some complex dependencies when using crypto, reduce or both so +# here we add the feature specific ones and set a flag to add the common +# ones after that. +DPDK_FRAMEWORK=n +ifeq ($(CONFIG_CRYPTO),y) +DPDK_FRAMEWORK=y +DPDK_LIB_LIST += rte_pmd_aesni_mb rte_reorder +endif + +ifeq ($(CONFIG_REDUCE),y) +DPDK_FRAMEWORK=y +DPDK_LIB_LIST += rte_pmd_isal +endif + +ifeq ($(DPDK_FRAMEWORK),y) +DPDK_LIB_LIST += rte_cryptodev rte_compressdev rte_bus_vdev rte_pmd_qat +endif + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_kvargs.*)) +DPDK_LIB_LIST += rte_kvargs +endif + +LINK_HASH=n + +ifeq ($(CONFIG_VHOST),y) +ifneq ($(CONFIG_VHOST_INTERNAL_LIB),y) +DPDK_LIB_LIST += rte_vhost rte_net +LINK_HASH=y +ifneq ($(DPDK_FRAMEWORK),y) +DPDK_LIB_LIST += rte_cryptodev +endif +endif +endif + +ifeq ($(CONFIG_RAID5),y) +LINK_HASH=y +endif + +ifeq ($(LINK_HASH),y) +DPDK_LIB_LIST += rte_hash +endif + +define dpdk_lib_list_to_libs +$(1:%=$(DPDK_ABS_DIR)/lib/lib%$(DPDK_LIB_EXT)) +endef + +define dpdk_env_linker_args +$(ENV_DPDK_FILE) -Wl,--whole-archive,--no-as-needed $(call dpdk_lib_list_to_libs,$1) -Wl,--no-whole-archive +endef + +DPDK_LIB = $(call dpdk_lib_list_to_libs,$(DPDK_LIB_LIST)) + +# SPDK memory registration requires experimental (deprecated) rte_memory API for DPDK 18.05 +ENV_CFLAGS = $(DPDK_INC) -Wno-deprecated-declarations +ENV_CXXFLAGS = $(ENV_CFLAGS) +ifeq ($(CONFIG_SHARED),y) +ENV_DPDK_FILE = $(call spdk_lib_list_to_shared_libs,env_dpdk) +else +ENV_DPDK_FILE = $(call spdk_lib_list_to_static_libs,env_dpdk) +endif +ENV_LIBS = $(ENV_DPDK_FILE) $(DPDK_LIB) +ENV_LINKER_ARGS = -Wl,-rpath-link $(DPDK_ABS_DIR)/lib +ENV_LINKER_ARGS += $(call dpdk_env_linker_args,$(DPDK_LIB_LIST)) + +ifeq ($(CONFIG_IPSEC_MB),y) +ENV_LINKER_ARGS += -lIPSec_MB -L$(IPSEC_MB_DIR) +endif + +ifeq ($(CONFIG_REDUCE),y) +ENV_LINKER_ARGS += -lisal -L$(ISAL_DIR)/.libs +endif + +ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_config.h)) +ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_config.h)) 
+ENV_LINKER_ARGS += -lnuma +endif +endif + +# DPDK built with meson puts those defines elsewhere +ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_build_config.h)) +ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_build_config.h)) +ENV_LINKER_ARGS += -lnuma +endif +endif + +ifeq ($(OS),Linux) +ENV_LINKER_ARGS += -ldl +endif +ifeq ($(OS),FreeBSD) +ENV_LINKER_ARGS += -lexecinfo +endif diff --git a/src/spdk/lib/env_dpdk/env_internal.h b/src/spdk/lib/env_dpdk/env_internal.h new file mode 100644 index 000000000..c7900d9d3 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env_internal.h @@ -0,0 +1,98 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ENV_INTERNAL_H +#define SPDK_ENV_INTERNAL_H + +#include "spdk/stdinc.h" + +#include "spdk/env.h" + +#include <rte_config.h> +#include <rte_version.h> +#include <rte_eal.h> +#include <rte_bus.h> +#include <rte_pci.h> +#include <rte_bus_pci.h> +#include <rte_dev.h> + +#if RTE_VERSION < RTE_VERSION_NUM(18, 11, 0, 0) +#error RTE_VERSION is too old! Minimum 18.11 is required. +#endif + +/* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47], + * which is enough to cover 256 TB. 
+ */ +#define SHIFT_256TB 48 /* (1 << 48) == 256 TB */ +#define MASK_256TB ((1ULL << SHIFT_256TB) - 1) + +#define SHIFT_1GB 30 /* (1 << 30) == 1 GB */ +#define MASK_1GB ((1ULL << SHIFT_1GB) - 1) + +#define SPDK_PCI_DRIVER_MAX_NAME_LEN 32 +struct spdk_pci_driver { + struct rte_pci_driver driver; + + const char *name; + const struct spdk_pci_id *id_table; + uint32_t drv_flags; + + spdk_pci_enum_cb cb_fn; + void *cb_arg; + TAILQ_ENTRY(spdk_pci_driver) tailq; +}; + +int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device); +int pci_device_fini(struct rte_pci_device *device); + +void pci_env_init(void); +void pci_env_reinit(void); +void pci_env_fini(void); +int mem_map_init(bool legacy_mem); +int vtophys_init(void); + +/** + * Report a DMA-capable PCI device to the vtophys translation code. + * Increases the refcount of active DMA-capable devices managed by SPDK. + * This must be called after a `rte_pci_device` is created. + */ +void vtophys_pci_device_added(struct rte_pci_device *pci_device); + +/** + * Report the removal of a DMA-capable PCI device to the vtophys translation code. + * Decreases the refcount of active DMA-capable devices managed by SPDK. + * This must be called before a `rte_pci_device` is destroyed. + */ +void vtophys_pci_device_removed(struct rte_pci_device *pci_device); + +#endif diff --git a/src/spdk/lib/env_dpdk/init.c b/src/spdk/lib/env_dpdk/init.c new file mode 100644 index 000000000..0376dbe7b --- /dev/null +++ b/src/spdk/lib/env_dpdk/init.c @@ -0,0 +1,604 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
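As a rough illustration of how the SHIFT/MASK constants from env_internal.h are used, the sketch below performs the same 48-bit user-address sanity check that memory.c applies before accepting a registration. The helper name is hypothetical and the snippet assumes the env_internal.h context above:

#include "env_internal.h"

/* Reject anything outside the canonical 48-bit user virtual address range. */
static bool
example_addr_in_range(const void *vaddr)
{
        return ((uintptr_t)vaddr & ~MASK_256TB) == 0;
}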
+ */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include "spdk/version.h" +#include "spdk/env_dpdk.h" + +#include <rte_config.h> +#include <rte_eal.h> +#include <rte_errno.h> +#include <rte_vfio.h> + +#define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" +#define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 +#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 +#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" +#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 + +static char **g_eal_cmdline; +static int g_eal_cmdline_argcount; +static bool g_external_init = true; + +static char * +_sprintf_alloc(const char *format, ...) +{ + va_list args; + va_list args_copy; + char *buf; + size_t bufsize; + int rc; + + va_start(args, format); + + /* Try with a small buffer first. */ + bufsize = 32; + + /* Limit maximum buffer size to something reasonable so we don't loop forever. */ + while (bufsize <= 1024 * 1024) { + buf = malloc(bufsize); + if (buf == NULL) { + va_end(args); + return NULL; + } + + va_copy(args_copy, args); + rc = vsnprintf(buf, bufsize, format, args_copy); + va_end(args_copy); + + /* + * If vsnprintf() returned a count within our current buffer size, we are done. + * The count does not include the \0 terminator, so rc == bufsize is not OK. + */ + if (rc >= 0 && (size_t)rc < bufsize) { + va_end(args); + return buf; + } + + /* + * vsnprintf() should return the required space, but some libc versions do not + * implement this correctly, so just double the buffer size and try again. + * + * We don't need the data in buf, so rather than realloc(), use free() and malloc() + * again to avoid a copy. + */ + free(buf); + bufsize *= 2; + } + + va_end(args); + return NULL; +} + +void +spdk_env_opts_init(struct spdk_env_opts *opts) +{ + if (!opts) { + return; + } + + memset(opts, 0, sizeof(*opts)); + + opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; + opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; + opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; + opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; + opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE; + opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; + opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR; +} + +static void +free_args(char **args, int argcount) +{ + int i; + + if (args == NULL) { + return; + } + + for (i = 0; i < argcount; i++) { + free(args[i]); + } + + if (argcount) { + free(args); + } +} + +static char ** +push_arg(char *args[], int *argcount, char *arg) +{ + char **tmp; + + if (arg == NULL) { + fprintf(stderr, "%s: NULL arg supplied\n", __func__); + free_args(args, *argcount); + return NULL; + } + + tmp = realloc(args, sizeof(char *) * (*argcount + 1)); + if (tmp == NULL) { + free(arg); + free_args(args, *argcount); + return NULL; + } + + tmp[*argcount] = arg; + (*argcount)++; + + return tmp; +} + +#if defined(__linux__) && defined(__x86_64__) + +/* TODO: Can likely get this value from rlimits in the future */ +#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48 +#define VTD_CAP_MGAW_SHIFT 16 +#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT) + +static int +get_iommu_width(void) +{ + DIR *dir; + FILE *file; + struct dirent *entry; + char mgaw_path[64]; + char buf[64]; + char *end; + long long int val; + int width, tmp; + + dir = opendir("/sys/devices/virtual/iommu/"); + if (dir == NULL) { + return -EINVAL; + } + + width = 0; + + while ((entry = readdir(dir)) != NULL) { + /* Find directories named "dmar0", "dmar1", etc */ + if (strncmp(entry->d_name, 
"dmar", sizeof("dmar") - 1) != 0) { + continue; + } + + tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap", + entry->d_name); + if ((unsigned)tmp >= sizeof(mgaw_path)) { + continue; + } + + file = fopen(mgaw_path, "r"); + if (file == NULL) { + continue; + } + + if (fgets(buf, sizeof(buf), file) == NULL) { + fclose(file); + continue; + } + + val = strtoll(buf, &end, 16); + if (val == LLONG_MIN || val == LLONG_MAX) { + fclose(file); + continue; + } + + tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1; + if (width == 0 || tmp < width) { + width = tmp; + } + + fclose(file); + } + + closedir(dir); + + return width; +} + +#endif + +static int +build_eal_cmdline(const struct spdk_env_opts *opts) +{ + int argcount = 0; + char **args; + + args = NULL; + + /* set the program name */ + args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); + if (args == NULL) { + return -1; + } + + /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */ + if (opts->shm_id < 0) { + args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); + if (args == NULL) { + return -1; + } + } + + /* set the coremask */ + /* NOTE: If coremask starts with '[' and ends with ']' it is a core list + */ + if (opts->core_mask[0] == '[') { + char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); + + if (l_arg != NULL) { + int len = strlen(l_arg); + + if (l_arg[len - 1] == ']') { + l_arg[len - 1] = '\0'; + } + } + args = push_arg(args, &argcount, l_arg); + } else { + args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); + } + + if (args == NULL) { + return -1; + } + + /* set the memory channel number */ + if (opts->mem_channel > 0) { + args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); + if (args == NULL) { + return -1; + } + } + + /* set the memory size */ + if (opts->mem_size >= 0) { + args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); + if (args == NULL) { + return -1; + } + } + + /* set the master core */ + if (opts->master_core > 0) { + args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d", + opts->master_core)); + if (args == NULL) { + return -1; + } + } + + /* set no pci if enabled */ + if (opts->no_pci) { + args = push_arg(args, &argcount, _sprintf_alloc("--no-pci")); + if (args == NULL) { + return -1; + } + } + + /* create just one hugetlbfs file */ + if (opts->hugepage_single_segments) { + args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); + if (args == NULL) { + return -1; + } + } + + /* unlink hugepages after initialization */ + if (opts->unlink_hugepage) { + args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); + if (args == NULL) { + return -1; + } + } + + /* use a specific hugetlbfs mount */ + if (opts->hugedir) { + args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir)); + if (args == NULL) { + return -1; + } + } + + if (opts->num_pci_addr) { + size_t i; + char bdf[32]; + struct spdk_pci_addr *pci_addr = + opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist; + + for (i = 0; i < opts->num_pci_addr; i++) { + spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); + args = push_arg(args, &argcount, _sprintf_alloc("%s=%s", + (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"), + bdf)); + if (args == NULL) { + return -1; + } + } + } + + /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages. 
+ * This can be overridden by specifying the same option in opts->env_context + */ + args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6")); + if (args == NULL) { + return -1; + } + + /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs. + * This can be overridden by specifying the same option in opts->env_context + */ + args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5")); + if (args == NULL) { + return -1; + } + + /* `user1` log type is used by rte_vhost, which prints an INFO log for each received + * vhost user message. We don't want that. The same log type is also used by a couple + * of other DPDK libs, but none of which we make use right now. If necessary, this can + * be overridden via opts->env_context. + */ + args = push_arg(args, &argcount, strdup("--log-level=user1:6")); + if (args == NULL) { + return -1; + } + + if (opts->env_context) { + args = push_arg(args, &argcount, strdup(opts->env_context)); + if (args == NULL) { + return -1; + } + } + +#ifdef __linux__ + + if (opts->iova_mode) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode)); + if (args == NULL) { + return -1; + } + } else { + /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa, + * but DPDK guesses it should be iova-mode=va. Add a check and force + * iova-mode=pa here. */ + if (rte_vfio_noiommu_is_enabled()) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } + } + +#if defined(__x86_64__) + /* DPDK by default guesses that it should be using iova-mode=va so that it can + * support running as an unprivileged user. However, some systems (especially + * virtual machines) don't have an IOMMU capable of handling the full virtual + * address space and DPDK doesn't currently catch that. Add a check in SPDK + * and force iova-mode=pa here. */ + if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) { + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } + } +#elif defined(__PPC64__) + /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly + * auto-detect at the moment, so we'll just force it here. */ + args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa")); + if (args == NULL) { + return -1; + } +#endif + } + + + /* Set the base virtual address - it must be an address that is not in the + * ASAN shadow region, otherwise ASAN-enabled builds will ignore the + * mmap hint. + * + * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm + */ + args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr)); + if (args == NULL) { + return -1; + } + + /* --match-allocation prevents DPDK from merging or splitting system memory allocations under the hood. + * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two + * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split + * the memory for a buffer over two allocations meaning the buffer will be split over a memory region. 
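With the defaults from spdk_env_opts_init() and no env_context, the argument list assembled here typically ends up looking roughly like the following on Linux with a recent DPDK; the PID suffix and any --iova-mode override depend on the host, so treat this only as an illustration:

        spdk --no-shconf -c 0x1 --log-level=lib.eal:6 --log-level=lib.cryptodev:5
             --log-level=user1:6 --base-virtaddr=0x200000000000 --match-allocations
             --file-prefix=spdk_pid<pid>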
+ */ +#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) + if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) { + args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations")); + if (args == NULL) { + return -1; + } + } +#endif + + if (opts->shm_id < 0) { + args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", + getpid())); + if (args == NULL) { + return -1; + } + } else { + args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", + opts->shm_id)); + if (args == NULL) { + return -1; + } + + /* set the process type */ + args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); + if (args == NULL) { + return -1; + } + } +#endif + + g_eal_cmdline = args; + g_eal_cmdline_argcount = argcount; + return argcount; +} + +int +spdk_env_dpdk_post_init(bool legacy_mem) +{ + int rc; + + pci_env_init(); + + rc = mem_map_init(legacy_mem); + if (rc < 0) { + fprintf(stderr, "Failed to allocate mem_map\n"); + return rc; + } + + rc = vtophys_init(); + if (rc < 0) { + fprintf(stderr, "Failed to initialize vtophys\n"); + return rc; + } + + return 0; +} + +void +spdk_env_dpdk_post_fini(void) +{ + pci_env_fini(); + + free_args(g_eal_cmdline, g_eal_cmdline_argcount); + g_eal_cmdline = NULL; + g_eal_cmdline_argcount = 0; +} + +int +spdk_env_init(const struct spdk_env_opts *opts) +{ + char **dpdk_args = NULL; + int i, rc; + int orig_optind; + bool legacy_mem; + + /* If SPDK env has been initialized before, then only pci env requires + * reinitialization. + */ + if (g_external_init == false) { + if (opts != NULL) { + fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n"); + return -EINVAL; + } + + printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version()); + pci_env_reinit(); + + return 0; + } + + if (opts == NULL) { + fprintf(stderr, "NULL arguments to initialize DPDK\n"); + return -EINVAL; + } + + rc = build_eal_cmdline(opts); + if (rc < 0) { + fprintf(stderr, "Invalid arguments to initialize DPDK\n"); + return -EINVAL; + } + + printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); + printf("[ DPDK EAL parameters: "); + for (i = 0; i < g_eal_cmdline_argcount; i++) { + printf("%s ", g_eal_cmdline[i]); + } + printf("]\n"); + + /* DPDK rearranges the array we pass to it, so make a copy + * before passing so we can still free the individual strings + * correctly. 
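A typical application brings the environment up through the functions above: populate the defaults, override a few fields, then hand control to DPDK via spdk_env_init(). The application name and core mask below are illustrative values, not part of this diff:

#include "spdk/env.h"

static int
example_env_init(void)
{
        struct spdk_env_opts opts;

        spdk_env_opts_init(&opts);
        opts.name = "example_app";
        opts.core_mask = "0x3";

        if (spdk_env_init(&opts) < 0) {
                return -1;
        }

        /* ... run the application ... */

        spdk_env_fini();
        return 0;
}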
+ */ + dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *)); + if (dpdk_args == NULL) { + fprintf(stderr, "Failed to allocate dpdk_args\n"); + return -ENOMEM; + } + memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount); + + fflush(stdout); + orig_optind = optind; + optind = 1; + rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args); + optind = orig_optind; + + free(dpdk_args); + + if (rc < 0) { + if (rte_errno == EALREADY) { + fprintf(stderr, "DPDK already initialized\n"); + } else { + fprintf(stderr, "Failed to initialize DPDK\n"); + } + return -rte_errno; + } + + legacy_mem = false; + if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) { + legacy_mem = true; + } + + rc = spdk_env_dpdk_post_init(legacy_mem); + if (rc == 0) { + g_external_init = false; + } + + return rc; +} + +void +spdk_env_fini(void) +{ + spdk_env_dpdk_post_fini(); +} + +bool +spdk_env_dpdk_external_init(void) +{ + return g_external_init; +} diff --git a/src/spdk/lib/env_dpdk/memory.c b/src/spdk/lib/env_dpdk/memory.c new file mode 100644 index 000000000..4c2205a46 --- /dev/null +++ b/src/spdk/lib/env_dpdk/memory.c @@ -0,0 +1,1442 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_memory.h> +#include <rte_eal_memconfig.h> + +#include "spdk_internal/assert.h" + +#include "spdk/assert.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/env_dpdk.h" + +#ifdef __FreeBSD__ +#define VFIO_ENABLED 0 +#else +#include <linux/version.h> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#define VFIO_ENABLED 1 +#include <linux/vfio.h> +#include <rte_vfio.h> + +struct spdk_vfio_dma_map { + struct vfio_iommu_type1_dma_map map; + struct vfio_iommu_type1_dma_unmap unmap; + TAILQ_ENTRY(spdk_vfio_dma_map) tailq; +}; + +struct vfio_cfg { + int fd; + bool enabled; + bool noiommu_enabled; + unsigned device_ref; + TAILQ_HEAD(, spdk_vfio_dma_map) maps; + pthread_mutex_t mutex; +}; + +static struct vfio_cfg g_vfio = { + .fd = -1, + .enabled = false, + .noiommu_enabled = false, + .device_ref = 0, + .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps), + .mutex = PTHREAD_MUTEX_INITIALIZER +}; + +#else +#define VFIO_ENABLED 0 +#endif +#endif + +#if DEBUG +#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +#define FN_2MB_TO_4KB(fn) (fn << (SHIFT_2MB - SHIFT_4KB)) +#define FN_4KB_TO_2MB(fn) (fn >> (SHIFT_2MB - SHIFT_4KB)) + +#define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB)) +#define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1)) + +/* Page is registered */ +#define REG_MAP_REGISTERED (1ULL << 62) + +/* A notification region barrier. The 2MB translation entry that's marked + * with this flag must be unregistered separately. This allows contiguous + * regions to be unregistered in the same chunks they were registered. + */ +#define REG_MAP_NOTIFY_START (1ULL << 63) + +/* Translation of a single 2MB page. */ +struct map_2mb { + uint64_t translation_2mb; +}; + +/* Second-level map table indexed by bits [21..29] of the virtual address. + * Each entry contains the address translation or error for entries that haven't + * been retrieved yet. + */ +struct map_1gb { + struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)]; +}; + +/* Top-level map table indexed by bits [30..47] of the virtual address. + * Each entry points to a second-level map table or NULL. + */ +struct map_256tb { + struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)]; +}; + +/* Page-granularity memory address translation */ +struct spdk_mem_map { + struct map_256tb map_256tb; + pthread_mutex_t mutex; + uint64_t default_translation; + struct spdk_mem_map_ops ops; + void *cb_ctx; + TAILQ_ENTRY(spdk_mem_map) tailq; +}; + +/* Registrations map. The 64 bit translations are bit fields with the + * following layout (starting with the low bits): + * 0 - 61 : reserved + * 62 - 63 : flags + */ +static struct spdk_mem_map *g_mem_reg_map; +static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps = + TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps); +static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER; + +static bool g_legacy_mem; + +/* + * Walk the currently registered memory via the main memory registration map + * and call the new map's notify callback for each virtually contiguous region. 
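A condensed sketch of the lookup these structures support, written against the file-local types above. The real spdk_mem_map_translate() further down also validates the address and can extend the returned size across contiguous translations; the helper name here is hypothetical:

/* Shift the address down to a 2 MB frame number, then split it into a
 * top-level (1 GB) index and a second-level (2 MB) index. */
static uint64_t
example_lookup(struct spdk_mem_map *map, uint64_t vaddr)
{
        uint64_t vfn_2mb = vaddr >> SHIFT_2MB;
        struct map_1gb *map_1gb = map->map_256tb.map[MAP_256TB_IDX(vfn_2mb)];

        if (map_1gb == NULL) {
                return map->default_translation;
        }
        return map_1gb->map[MAP_1GB_IDX(vfn_2mb)].translation_2mb;
}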
+ */ +static int +mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action) +{ + size_t idx_256tb; + uint64_t idx_1gb; + uint64_t contig_start = UINT64_MAX; + uint64_t contig_end = UINT64_MAX; + struct map_1gb *map_1gb; + int rc; + + if (!g_mem_reg_map) { + return -EINVAL; + } + + /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */ + pthread_mutex_lock(&g_mem_reg_map->mutex); + + for (idx_256tb = 0; + idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]); + idx_256tb++) { + map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + if (contig_start != UINT64_MAX) { + /* End of of a virtually contiguous range */ + rc = map->ops.notify_cb(map->cb_ctx, map, action, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + /* Don't bother handling unregister failures. It can't be any worse */ + if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) { + goto err_unregister; + } + } + contig_start = UINT64_MAX; + continue; + } + + for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) { + if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && + (contig_start == UINT64_MAX || + (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { + /* Rebuild the virtual address from the indexes */ + uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); + + if (contig_start == UINT64_MAX) { + contig_start = vaddr; + } + + contig_end = vaddr; + } else { + if (contig_start != UINT64_MAX) { + /* End of of a virtually contiguous range */ + rc = map->ops.notify_cb(map->cb_ctx, map, action, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + /* Don't bother handling unregister failures. It can't be any worse */ + if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) { + goto err_unregister; + } + + /* This page might be a part of a neighbour region, so process + * it again. The idx_1gb will be incremented immediately. + */ + idx_1gb--; + } + contig_start = UINT64_MAX; + } + } + } + + pthread_mutex_unlock(&g_mem_reg_map->mutex); + return 0; + +err_unregister: + /* Unwind to the first empty translation so we don't unregister + * a region that just failed to register. 
+ */ + idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1); + idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1); + contig_start = UINT64_MAX; + contig_end = UINT64_MAX; + + /* Unregister any memory we managed to register before the failure */ + for (; idx_256tb < SIZE_MAX; idx_256tb--) { + map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + if (contig_end != UINT64_MAX) { + /* End of of a virtually contiguous range */ + map->ops.notify_cb(map->cb_ctx, map, + SPDK_MEM_MAP_NOTIFY_UNREGISTER, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + } + contig_end = UINT64_MAX; + continue; + } + + for (; idx_1gb < UINT64_MAX; idx_1gb--) { + if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && + (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { + /* Rebuild the virtual address from the indexes */ + uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); + + if (contig_end == UINT64_MAX) { + contig_end = vaddr; + } + contig_start = vaddr; + } else { + if (contig_end != UINT64_MAX) { + /* End of of a virtually contiguous range */ + map->ops.notify_cb(map->cb_ctx, map, + SPDK_MEM_MAP_NOTIFY_UNREGISTER, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + idx_1gb++; + } + contig_end = UINT64_MAX; + } + } + idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1; + } + + pthread_mutex_unlock(&g_mem_reg_map->mutex); + return rc; +} + +struct spdk_mem_map * +spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx) +{ + struct spdk_mem_map *map; + int rc; + + map = calloc(1, sizeof(*map)); + if (map == NULL) { + return NULL; + } + + if (pthread_mutex_init(&map->mutex, NULL)) { + free(map); + return NULL; + } + + map->default_translation = default_translation; + map->cb_ctx = cb_ctx; + if (ops) { + map->ops = *ops; + } + + if (ops && ops->notify_cb) { + pthread_mutex_lock(&g_spdk_mem_map_mutex); + rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + DEBUG_PRINT("Initial mem_map notify failed\n"); + pthread_mutex_destroy(&map->mutex); + free(map); + return NULL; + } + TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq); + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + } + + return map; +} + +void +spdk_mem_map_free(struct spdk_mem_map **pmap) +{ + struct spdk_mem_map *map; + size_t i; + + if (!pmap) { + return; + } + + map = *pmap; + + if (!map) { + return; + } + + if (map->ops.notify_cb) { + pthread_mutex_lock(&g_spdk_mem_map_mutex); + mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER); + TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq); + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + } + + for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) { + free(map->map_256tb.map[i]); + } + + pthread_mutex_destroy(&map->mutex); + + free(map); + *pmap = NULL; +} + +int +spdk_mem_register(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + int rc; + void *seg_vaddr; + size_t seg_len; + uint64_t reg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + if (len == 0) { + return 0; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { 
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if (reg & REG_MAP_REGISTERED) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EBUSY; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + seg_vaddr = vaddr; + seg_len = 0; + while (len > 0) { + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, + seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED); + seg_len += VALUE_2MB; + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +int +spdk_mem_unregister(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + int rc; + void *seg_vaddr; + size_t seg_len; + uint64_t reg, newreg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + /* The first page must be a start of a region. Also check if it's + * registered to make sure we don't return -ERANGE for non-registered + * regions. + */ + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); + if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -ERANGE; + } + + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if ((reg & REG_MAP_REGISTERED) == 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EINVAL; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + /* If the next page is registered, it must be a start of a region as well, + * otherwise we'd be unregistering only a part of a region. 
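The registration API above only accepts 2 MB-aligned addresses and lengths. The sketch below shows that contract; whether the registration itself succeeds also depends on the platform and the vtophys backend, and real callers would normally use pinned (hugepage-backed) memory. All names here are illustrative:

#include <errno.h>
#include <stdlib.h>
#include "spdk/env.h"
#include "spdk/memory.h"

/* Register (and later unregister) one 2 MB-aligned, 2 MB-long region with the
 * env layer. Misaligned addresses or lengths fail with -EINVAL. */
static int
example_mem_register(void)
{
        void *buf = NULL;
        int rc;

        if (posix_memalign(&buf, VALUE_2MB, VALUE_2MB) != 0) {
                return -ENOMEM;
        }

        rc = spdk_mem_register(buf, VALUE_2MB);
        if (rc == 0) {
                spdk_mem_unregister(buf, VALUE_2MB);
        }

        free(buf);
        return rc;
}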
+ */ + if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -ERANGE; + } + seg_vaddr = vaddr; + seg_len = 0; + + while (len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0); + + if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) { + TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + + seg_vaddr = vaddr; + seg_len = VALUE_2MB; + } else { + seg_len += VALUE_2MB; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + if (seg_len > 0) { + TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +int +spdk_mem_reserve(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + void *seg_vaddr; + size_t seg_len; + uint64_t reg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + if (len == 0) { + return 0; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + /* Check if any part of this range is already registered */ + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if (reg & REG_MAP_REGISTERED) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EBUSY; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + /* Simply set the translation to the memory map's default. This allocates the space in the + * map but does not provide a valid translation. */ + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len, + g_mem_reg_map->default_translation); + + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation); + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +static struct map_1gb * +mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb) +{ + struct map_1gb *map_1gb; + uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb); + size_t i; + + if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) { + return NULL; + } + + map_1gb = map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + pthread_mutex_lock(&map->mutex); + + /* Recheck to make sure nobody else got the mutex first. 
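A minimal sketch of a consumer map in the style of the vtophys map defined later in this file: the notify callback is invoked for every registered 2 MB-granular region and records a translation of the caller's choosing (here simply the virtual address itself). The example_* names are hypothetical; spdk_mem_map_clear_translation() is defined further below:

#include <errno.h>
#include "spdk/env.h"

static int
example_notify(void *cb_ctx, struct spdk_mem_map *map,
               enum spdk_mem_map_notify_action action,
               void *vaddr, size_t size)
{
        switch (action) {
        case SPDK_MEM_MAP_NOTIFY_REGISTER:
                return spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)vaddr);
        case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
                return spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
        default:
                return -EINVAL;
        }
}

static const struct spdk_mem_map_ops example_ops = {
        .notify_cb = example_notify,
        .are_contiguous = NULL,
};

/* spdk_mem_map_alloc() replays all existing registrations through
 * example_notify() before adding the new map to the global list. */
static struct spdk_mem_map *
example_map_create(void)
{
        return spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &example_ops, NULL);
}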
*/ + map_1gb = map->map_256tb.map[idx_256tb]; + if (!map_1gb) { + map_1gb = malloc(sizeof(struct map_1gb)); + if (map_1gb) { + /* initialize all entries to default translation */ + for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) { + map_1gb->map[i].translation_2mb = map->default_translation; + } + map->map_256tb.map[idx_256tb] = map_1gb; + } + } + + pthread_mutex_unlock(&map->mutex); + + if (!map_1gb) { + DEBUG_PRINT("allocation failed\n"); + return NULL; + } + } + + return map_1gb; +} + +int +spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size, + uint64_t translation) +{ + uint64_t vfn_2mb; + struct map_1gb *map_1gb; + uint64_t idx_1gb; + struct map_2mb *map_2mb; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr); + return -EINVAL; + } + + /* For now, only 2 MB-aligned registrations are supported */ + if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n", + __func__, vaddr, size); + return -EINVAL; + } + + vfn_2mb = vaddr >> SHIFT_2MB; + + while (size) { + map_1gb = mem_map_get_map_1gb(map, vfn_2mb); + if (!map_1gb) { + DEBUG_PRINT("could not get %p map\n", (void *)vaddr); + return -ENOMEM; + } + + idx_1gb = MAP_1GB_IDX(vfn_2mb); + map_2mb = &map_1gb->map[idx_1gb]; + map_2mb->translation_2mb = translation; + + size -= VALUE_2MB; + vfn_2mb++; + } + + return 0; +} + +int +spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size) +{ + return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation); +} + +inline uint64_t +spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size) +{ + const struct map_1gb *map_1gb; + const struct map_2mb *map_2mb; + uint64_t idx_256tb; + uint64_t idx_1gb; + uint64_t vfn_2mb; + uint64_t cur_size; + uint64_t prev_translation; + uint64_t orig_translation; + + if (spdk_unlikely(vaddr & ~MASK_256TB)) { + DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr); + return map->default_translation; + } + + vfn_2mb = vaddr >> SHIFT_2MB; + idx_256tb = MAP_256TB_IDX(vfn_2mb); + idx_1gb = MAP_1GB_IDX(vfn_2mb); + + map_1gb = map->map_256tb.map[idx_256tb]; + if (spdk_unlikely(!map_1gb)) { + return map->default_translation; + } + + cur_size = VALUE_2MB - _2MB_OFFSET(vaddr); + map_2mb = &map_1gb->map[idx_1gb]; + if (size == NULL || map->ops.are_contiguous == NULL || + map_2mb->translation_2mb == map->default_translation) { + if (size != NULL) { + *size = spdk_min(*size, cur_size); + } + return map_2mb->translation_2mb; + } + + orig_translation = map_2mb->translation_2mb; + prev_translation = orig_translation; + while (cur_size < *size) { + vfn_2mb++; + idx_256tb = MAP_256TB_IDX(vfn_2mb); + idx_1gb = MAP_1GB_IDX(vfn_2mb); + + map_1gb = map->map_256tb.map[idx_256tb]; + if (spdk_unlikely(!map_1gb)) { + break; + } + + map_2mb = &map_1gb->map[idx_1gb]; + if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) { + break; + } + + cur_size += VALUE_2MB; + prev_translation = map_2mb->translation_2mb; + } + + *size = spdk_min(*size, cur_size); + return orig_translation; +} + +static void +memory_hotplug_cb(enum rte_mem_event event_type, + const void *addr, size_t len, void *arg) +{ + if (event_type == RTE_MEM_EVENT_ALLOC) { + spdk_mem_register((void *)addr, len); + +#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) + if (!spdk_env_dpdk_external_init()) { + return; + } +#endif + + /* Prior to DPDK 19.02, we have to worry about DPDK + * 
freeing memory in different units than it was allocated. + * That doesn't work with things like RDMA MRs. So for + * those versions of DPDK, mark each segment so that DPDK + * won't later free it. That ensures we don't have to deal + * with that scenario. + * + * DPDK 19.02 added the --match-allocations RTE flag to + * avoid this condition. + * + * Note: if the user initialized DPDK separately, we can't + * be sure that --match-allocations was specified, so need + * to still mark the segments so they aren't freed. + */ + while (len > 0) { + struct rte_memseg *seg; + + seg = rte_mem_virt2memseg(addr, NULL); + assert(seg != NULL); + seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE; + addr = (void *)((uintptr_t)addr + seg->hugepage_sz); + len -= seg->hugepage_sz; + } + } else if (event_type == RTE_MEM_EVENT_FREE) { + spdk_mem_unregister((void *)addr, len); + } +} + +static int +memory_iter_cb(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg) +{ + return spdk_mem_register(ms->addr, len); +} + +int +mem_map_init(bool legacy_mem) +{ + g_legacy_mem = legacy_mem; + + g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL); + if (g_mem_reg_map == NULL) { + DEBUG_PRINT("memory registration map allocation failed\n"); + return -ENOMEM; + } + + /* + * Walk all DPDK memory segments and register them + * with the master memory map + */ + rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL); + rte_memseg_contig_walk(memory_iter_cb, NULL); + return 0; +} + +bool +spdk_iommu_is_enabled(void) +{ +#if VFIO_ENABLED + return g_vfio.enabled && !g_vfio.noiommu_enabled; +#else + return false; +#endif +} + +struct spdk_vtophys_pci_device { + struct rte_pci_device *pci_device; + TAILQ_ENTRY(spdk_vtophys_pci_device) tailq; +}; + +static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER; +static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices = + TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices); + +static struct spdk_mem_map *g_vtophys_map; +static struct spdk_mem_map *g_phys_ref_map; + +#if VFIO_ENABLED +static int +vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size) +{ + struct spdk_vfio_dma_map *dma_map; + uint64_t refcount; + int ret; + + refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL); + assert(refcount < UINT64_MAX); + if (refcount > 0) { + spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1); + return 0; + } + + dma_map = calloc(1, sizeof(*dma_map)); + if (dma_map == NULL) { + return -ENOMEM; + } + + dma_map->map.argsz = sizeof(dma_map->map); + dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + dma_map->map.vaddr = vaddr; + dma_map->map.iova = iova; + dma_map->map.size = size; + + dma_map->unmap.argsz = sizeof(dma_map->unmap); + dma_map->unmap.flags = 0; + dma_map->unmap.iova = iova; + dma_map->unmap.size = size; + + pthread_mutex_lock(&g_vfio.mutex); + if (g_vfio.device_ref == 0) { + /* VFIO requires at least one device (IOMMU group) to be added to + * a VFIO container before it is possible to perform any IOMMU + * operations on that container. This memory will be mapped once + * the first device (IOMMU group) is hotplugged. + * + * Since the vfio container is managed internally by DPDK, it is + * also possible that some device is already in that container, but + * it's not managed by SPDK - e.g. an NIC attached internally + * inside DPDK. We could map the memory straight away in such + * scenario, but there's no need to do it. 
DPDK devices clearly + * don't need our mappings and hence we defer the mapping + * unconditionally until the first SPDK-managed device is + * hotplugged. + */ + goto out_insert; + } + + ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map); + if (ret) { + DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno); + pthread_mutex_unlock(&g_vfio.mutex); + free(dma_map); + return ret; + } + +out_insert: + TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq); + pthread_mutex_unlock(&g_vfio.mutex); + spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1); + return 0; +} + +static int +vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size) +{ + struct spdk_vfio_dma_map *dma_map; + uint64_t refcount; + int ret; + + pthread_mutex_lock(&g_vfio.mutex); + TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { + if (dma_map->map.iova == iova) { + break; + } + } + + if (dma_map == NULL) { + DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova); + pthread_mutex_unlock(&g_vfio.mutex); + return -ENXIO; + } + + refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL); + assert(refcount < UINT64_MAX); + if (refcount > 0) { + spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1); + } + + /* We still have outstanding references, don't clear it. */ + if (refcount > 1) { + pthread_mutex_unlock(&g_vfio.mutex); + return 0; + } + + /** don't support partial or multiple-page unmap for now */ + assert(dma_map->map.size == size); + + if (g_vfio.device_ref == 0) { + /* Memory is not mapped anymore, just remove it's references */ + goto out_remove; + } + + + ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap); + if (ret) { + DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno); + pthread_mutex_unlock(&g_vfio.mutex); + return ret; + } + +out_remove: + TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq); + pthread_mutex_unlock(&g_vfio.mutex); + free(dma_map); + return 0; +} +#endif + +static uint64_t +vtophys_get_paddr_memseg(uint64_t vaddr) +{ + uintptr_t paddr; + struct rte_memseg *seg; + + seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL); + if (seg != NULL) { + paddr = seg->phys_addr; + if (paddr == RTE_BAD_IOVA) { + return SPDK_VTOPHYS_ERROR; + } + paddr += (vaddr - (uintptr_t)seg->addr); + return paddr; + } + + return SPDK_VTOPHYS_ERROR; +} + +/* Try to get the paddr from /proc/self/pagemap */ +static uint64_t +vtophys_get_paddr_pagemap(uint64_t vaddr) +{ + uintptr_t paddr; + + /* Silence static analyzers */ + assert(vaddr != 0); + paddr = rte_mem_virt2iova((void *)vaddr); + if (paddr == RTE_BAD_IOVA) { + /* + * The vaddr may be valid but doesn't have a backing page + * assigned yet. Touch the page to ensure a backing page + * gets assigned, then try to translate again. + */ + rte_atomic64_read((rte_atomic64_t *)vaddr); + paddr = rte_mem_virt2iova((void *)vaddr); + } + if (paddr == RTE_BAD_IOVA) { + /* Unable to get to the physical address. 
*/ + return SPDK_VTOPHYS_ERROR; + } + + return paddr; +} + +/* Try to get the paddr from pci devices */ +static uint64_t +vtophys_get_paddr_pci(uint64_t vaddr) +{ + struct spdk_vtophys_pci_device *vtophys_dev; + uintptr_t paddr; + struct rte_pci_device *dev; + struct rte_mem_resource *res; + unsigned r; + + pthread_mutex_lock(&g_vtophys_pci_devices_mutex); + TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) { + dev = vtophys_dev->pci_device; + + for (r = 0; r < PCI_MAX_RESOURCE; r++) { + res = &dev->mem_resource[r]; + if (res->phys_addr && vaddr >= (uint64_t)res->addr && + vaddr < (uint64_t)res->addr + res->len) { + paddr = res->phys_addr + (vaddr - (uint64_t)res->addr); + DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr, + (void *)paddr); + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + return paddr; + } + } + } + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + + return SPDK_VTOPHYS_ERROR; +} + +static int +vtophys_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t len) +{ + int rc = 0, pci_phys = 0; + uint64_t paddr; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n", + vaddr, len); + return -EINVAL; + } + + /* Get the physical address from the DPDK memsegs */ + paddr = vtophys_get_paddr_memseg((uint64_t)vaddr); + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + if (paddr == SPDK_VTOPHYS_ERROR) { + /* This is not an address that DPDK is managing. */ +#if VFIO_ENABLED + enum rte_iova_mode iova_mode; + +#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0) + iova_mode = rte_eal_iova_mode(); +#else + iova_mode = rte_eal_get_configuration()->iova_mode; +#endif + + if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) { + /* We'll use the virtual address as the iova to match DPDK. */ + paddr = (uint64_t)vaddr; + rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len); + if (rc) { + return -EFAULT; + } + while (len > 0) { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + if (rc != 0) { + return rc; + } + vaddr += VALUE_2MB; + paddr += VALUE_2MB; + len -= VALUE_2MB; + } + } else +#endif + { + /* Get the physical address from /proc/self/pagemap. */ + paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + /* Get the physical address from PCI devices */ + paddr = vtophys_get_paddr_pci((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + /* The beginning of this address range points to a PCI resource, + * so the rest must point to a PCI resource as well. + */ + pci_phys = 1; + } + + /* Get paddr for each 2MB chunk in this address range */ + while (len > 0) { + /* Get the physical address from /proc/self/pagemap. */ + if (pci_phys) { + paddr = vtophys_get_paddr_pci((uint64_t)vaddr); + } else { + paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr); + } + + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + + /* Since PCI paddr can break the 2MiB physical alignment skip this check for that. 
*/ + if (!pci_phys && (paddr & MASK_2MB)) { + DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr); + return -EINVAL; + } +#if VFIO_ENABLED + /* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory + * with the IOMMU using the physical address to match. */ + if (spdk_iommu_is_enabled()) { + rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB); + if (rc) { + DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr); + return -EFAULT; + } + } +#endif + + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + } + } else { + /* This is an address managed by DPDK. Just setup the translations. */ + while (len > 0) { + paddr = vtophys_get_paddr_memseg((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + } + + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: +#if VFIO_ENABLED + if (paddr == SPDK_VTOPHYS_ERROR) { + /* + * This is not an address that DPDK is managing. If vfio is enabled, + * we need to unmap the range from the IOMMU + */ + if (spdk_iommu_is_enabled()) { + uint64_t buffer_len = len; + uint8_t *va = vaddr; + enum rte_iova_mode iova_mode; + +#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0) + iova_mode = rte_eal_iova_mode(); +#else + iova_mode = rte_eal_get_configuration()->iova_mode; +#endif + /* + * In virtual address mode, the region is contiguous and can be done in + * one unmap. + */ + if (iova_mode == RTE_IOVA_VA) { + paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len); + if (buffer_len != len || paddr != (uintptr_t)va) { + DEBUG_PRINT("Unmapping %p with length %lu failed because " + "translation had address 0x%" PRIx64 " and length %lu\n", + va, len, paddr, buffer_len); + return -EINVAL; + } + rc = vtophys_iommu_unmap_dma(paddr, len); + if (rc) { + DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr); + return -EFAULT; + } + } else if (iova_mode == RTE_IOVA_PA) { + /* Get paddr for each 2MB chunk in this address range */ + while (buffer_len > 0) { + paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL); + + if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) { + DEBUG_PRINT("could not get phys addr for %p\n", va); + return -EFAULT; + } + + rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB); + if (rc) { + DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr); + return -EFAULT; + } + + va += VALUE_2MB; + buffer_len -= VALUE_2MB; + } + } + } + } +#endif + while (len > 0) { + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB); + if (rc != 0) { + return rc; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + break; + default: + SPDK_UNREACHABLE(); + } + + return rc; +} + +static int +vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2) +{ + /* This function is always called with paddrs for two subsequent + * 2MB chunks in virtual address space, so those chunks will be only + * physically contiguous if the physical addresses are 2MB apart + * from each other as well. 
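+	 * For illustration (hypothetical values): paddr1 = 0x240000000 and
+	 * paddr2 = 0x240200000 differ by exactly VALUE_2MB (0x200000), so the
+	 * two 2 MB chunks form one physically contiguous region; any other
+	 * difference means they do not.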
+ */ + return (paddr2 - paddr1 == VALUE_2MB); +} + +#if VFIO_ENABLED + +static bool +vfio_enabled(void) +{ + return rte_vfio_is_enabled("vfio_pci"); +} + +/* Check if IOMMU is enabled on the system */ +static bool +has_iommu_groups(void) +{ + struct dirent *d; + int count = 0; + DIR *dir = opendir("/sys/kernel/iommu_groups"); + + if (dir == NULL) { + return false; + } + + while (count < 3 && (d = readdir(dir)) != NULL) { + count++; + } + + closedir(dir); + /* there will always be ./ and ../ entries */ + return count > 2; +} + +static bool +vfio_noiommu_enabled(void) +{ + return rte_vfio_noiommu_is_enabled(); +} + +static void +vtophys_iommu_init(void) +{ + char proc_fd_path[PATH_MAX + 1]; + char link_path[PATH_MAX + 1]; + const char vfio_path[] = "/dev/vfio/vfio"; + DIR *dir; + struct dirent *d; + + if (!vfio_enabled()) { + return; + } + + if (vfio_noiommu_enabled()) { + g_vfio.noiommu_enabled = true; + } else if (!has_iommu_groups()) { + return; + } + + dir = opendir("/proc/self/fd"); + if (!dir) { + DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno); + return; + } + + while ((d = readdir(dir)) != NULL) { + if (d->d_type != DT_LNK) { + continue; + } + + snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name); + if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) { + continue; + } + + if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) { + sscanf(d->d_name, "%d", &g_vfio.fd); + break; + } + } + + closedir(dir); + + if (g_vfio.fd < 0) { + DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n"); + return; + } + + g_vfio.enabled = true; + + return; +} +#endif + +void +vtophys_pci_device_added(struct rte_pci_device *pci_device) +{ + struct spdk_vtophys_pci_device *vtophys_dev; + + pthread_mutex_lock(&g_vtophys_pci_devices_mutex); + + vtophys_dev = calloc(1, sizeof(*vtophys_dev)); + if (vtophys_dev) { + vtophys_dev->pci_device = pci_device; + TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq); + } else { + DEBUG_PRINT("Memory allocation error\n"); + } + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + +#if VFIO_ENABLED + struct spdk_vfio_dma_map *dma_map; + int ret; + + if (!g_vfio.enabled) { + return; + } + + pthread_mutex_lock(&g_vfio.mutex); + g_vfio.device_ref++; + if (g_vfio.device_ref > 1) { + pthread_mutex_unlock(&g_vfio.mutex); + return; + } + + /* This is the first SPDK device using DPDK vfio. This means that the first + * IOMMU group might have been just been added to the DPDK vfio container. + * From this point it is certain that the memory can be mapped now. 
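+	 * The loop below replays each VFIO_IOMMU_MAP_DMA request that
+	 * vtophys_iommu_map_dma() deferred while device_ref was still 0.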
+ */ + TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { + ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map); + if (ret) { + DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno); + break; + } + } + pthread_mutex_unlock(&g_vfio.mutex); +#endif +} + +void +vtophys_pci_device_removed(struct rte_pci_device *pci_device) +{ + struct spdk_vtophys_pci_device *vtophys_dev; + + pthread_mutex_lock(&g_vtophys_pci_devices_mutex); + TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) { + if (vtophys_dev->pci_device == pci_device) { + TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq); + free(vtophys_dev); + break; + } + } + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + +#if VFIO_ENABLED + struct spdk_vfio_dma_map *dma_map; + int ret; + + if (!g_vfio.enabled) { + return; + } + + pthread_mutex_lock(&g_vfio.mutex); + assert(g_vfio.device_ref > 0); + g_vfio.device_ref--; + if (g_vfio.device_ref > 0) { + pthread_mutex_unlock(&g_vfio.mutex); + return; + } + + /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have + * any additional devices using it's vfio container, all the mappings + * will be automatically removed by the Linux vfio driver. We unmap + * the memory manually to be able to easily re-map it later regardless + * of other, external factors. + */ + TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { + ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap); + if (ret) { + DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno); + break; + } + } + pthread_mutex_unlock(&g_vfio.mutex); +#endif +} + +int +vtophys_init(void) +{ + const struct spdk_mem_map_ops vtophys_map_ops = { + .notify_cb = vtophys_notify, + .are_contiguous = vtophys_check_contiguous_entries, + }; + + const struct spdk_mem_map_ops phys_ref_map_ops = { + .notify_cb = NULL, + .are_contiguous = NULL, + }; + +#if VFIO_ENABLED + vtophys_iommu_init(); +#endif + + g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL); + if (g_phys_ref_map == NULL) { + DEBUG_PRINT("phys_ref map allocation failed.\n"); + return -ENOMEM; + } + + g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL); + if (g_vtophys_map == NULL) { + DEBUG_PRINT("vtophys map allocation failed\n"); + return -ENOMEM; + } + return 0; +} + +uint64_t +spdk_vtophys(void *buf, uint64_t *size) +{ + uint64_t vaddr, paddr_2mb; + + vaddr = (uint64_t)buf; + paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size); + + /* + * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR, + * we will still bitwise-or it with the buf offset below, but the result will still be + * SPDK_VTOPHYS_ERROR. However now that we do + rather than | (due to PCI vtophys being + * unaligned) we must now check the return value before addition. + */ + SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s"); + if (paddr_2mb == SPDK_VTOPHYS_ERROR) { + return SPDK_VTOPHYS_ERROR; + } else { + return paddr_2mb + (vaddr & MASK_2MB); + } +} diff --git a/src/spdk/lib/env_dpdk/pci.c b/src/spdk/lib/env_dpdk/pci.c new file mode 100644 index 000000000..5fd1b4abd --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci.c @@ -0,0 +1,1063 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include <rte_alarm.h> +#include <rte_devargs.h> +#include "spdk/env.h" + +#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers" + +#define PCI_CFG_SIZE 256 +#define PCI_EXT_CAP_ID_SN 0x03 + +/* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time + * might cause the internal IPC to misbehave. Just retry in such case. + */ +#define DPDK_HOTPLUG_RETRY_COUNT 4 + +/* DPDK alarm/interrupt thread */ +static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER; +static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices); +/* devices hotplugged on a dpdk thread */ +static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices = + TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices); +static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers); + +static int +map_bar_rte(struct spdk_pci_device *device, uint32_t bar, + void **mapped_addr, uint64_t *phys_addr, uint64_t *size) +{ + struct rte_pci_device *dev = device->dev_handle; + + *mapped_addr = dev->mem_resource[bar].addr; + *phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr; + *size = (uint64_t)dev->mem_resource[bar].len; + + return 0; +} + +static int +unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr) +{ + return 0; +} + +static int +cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + int rc; + + rc = rte_pci_read_config(dev->dev_handle, value, len, offset); + + return (rc > 0 && (uint32_t) rc == len) ? 0 : -1; +} + +static int +cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + int rc; + + rc = rte_pci_write_config(dev->dev_handle, value, len, offset); + +#ifdef __FreeBSD__ + /* DPDK returns 0 on success and -1 on failure */ + return rc; +#endif + return (rc > 0 && (uint32_t) rc == len) ? 
0 : -1; +} + +static void +remove_rte_dev(struct rte_pci_device *rte_dev) +{ + char bdf[32]; + int i = 0, rc; + + snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name); + do { + rc = rte_eal_hotplug_remove("pci", bdf); + } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT); +} + +static void +detach_rte_cb(void *_dev) +{ + remove_rte_dev(_dev); +} + +static void +detach_rte(struct spdk_pci_device *dev) +{ + struct rte_pci_device *rte_dev = dev->dev_handle; + int i; + bool removed; + + if (!spdk_process_is_primary()) { + remove_rte_dev(rte_dev); + return; + } + + pthread_mutex_lock(&g_pci_mutex); + dev->internal.attached = false; + /* prevent the hotremove notification from removing this device */ + dev->internal.pending_removal = true; + pthread_mutex_unlock(&g_pci_mutex); + + rte_eal_alarm_set(1, detach_rte_cb, rte_dev); + + /* wait up to 2s for the cb to execute */ + for (i = 2000; i > 0; i--) { + + spdk_delay_us(1000); + pthread_mutex_lock(&g_pci_mutex); + removed = dev->internal.removed; + pthread_mutex_unlock(&g_pci_mutex); + + if (removed) { + break; + } + } + + /* besides checking the removed flag, we also need to wait + * for the dpdk detach function to unwind, as it's doing some + * operations even after calling our detach callback. Simply + * cancel the alarm - if it started executing already, this + * call will block and wait for it to finish. + */ + rte_eal_alarm_cancel(detach_rte_cb, rte_dev); + + /* the device could have been finally removed, so just check + * it again. + */ + pthread_mutex_lock(&g_pci_mutex); + removed = dev->internal.removed; + pthread_mutex_unlock(&g_pci_mutex); + if (!removed) { + fprintf(stderr, "Timeout waiting for DPDK to remove PCI device %s.\n", + rte_dev->name); + /* If we reach this state, then the device couldn't be removed and most likely + a subsequent hot add of a device in the same BDF will fail */ + } +} + +void +spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags) +{ + struct spdk_pci_driver *driver; + + driver = calloc(1, sizeof(*driver)); + if (!driver) { + /* we can't do any better than bailing atm */ + return; + } + + driver->name = name; + driver->id_table = id_table; + driver->drv_flags = flags; + TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq); +} + +struct spdk_pci_driver * +spdk_pci_nvme_get_driver(void) +{ + return spdk_pci_get_driver("nvme"); +} + +struct spdk_pci_driver * +spdk_pci_get_driver(const char *name) +{ + struct spdk_pci_driver *driver; + + TAILQ_FOREACH(driver, &g_pci_drivers, tailq) { + if (strcmp(driver->name, name) == 0) { + return driver; + } + } + + return NULL; +} + +static void +pci_device_rte_hotremove(const char *device_name, + enum rte_dev_event_type event, + void *cb_arg) +{ + struct spdk_pci_device *dev; + bool can_detach = false; + + if (event != RTE_DEV_EVENT_REMOVE) { + return; + } + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + struct rte_pci_device *rte_dev = dev->dev_handle; + + if (strcmp(rte_dev->name, device_name) == 0 && + !dev->internal.pending_removal) { + can_detach = !dev->internal.attached; + /* prevent any further attaches */ + dev->internal.pending_removal = true; + break; + } + } + pthread_mutex_unlock(&g_pci_mutex); + + if (dev != NULL && can_detach) { + /* if device is not attached we can remove it right away. + * Otherwise it will be removed at detach. 
+ */ + remove_rte_dev(dev->dev_handle); + } +} + +static void +cleanup_pci_devices(void) +{ + struct spdk_pci_device *dev, *tmp; + + pthread_mutex_lock(&g_pci_mutex); + /* cleanup removed devices */ + TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) { + if (!dev->internal.removed) { + continue; + } + + vtophys_pci_device_removed(dev->dev_handle); + TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq); + free(dev); + } + + /* add newly-attached devices */ + TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) { + TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq); + TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq); + vtophys_pci_device_added(dev->dev_handle); + } + pthread_mutex_unlock(&g_pci_mutex); +} + +static int scan_pci_bus(bool delay_init); + +/* translate spdk_pci_driver to an rte_pci_driver and register it to dpdk */ +static int +register_rte_driver(struct spdk_pci_driver *driver) +{ + unsigned pci_id_count = 0; + struct rte_pci_id *rte_id_table; + char *rte_name; + size_t rte_name_len; + uint32_t rte_flags; + + assert(driver->id_table); + while (driver->id_table[pci_id_count].vendor_id) { + pci_id_count++; + } + assert(pci_id_count > 0); + + rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table)); + if (!rte_id_table) { + return -ENOMEM; + } + + while (pci_id_count > 0) { + struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1]; + const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1]; + + rte_id->class_id = spdk_id->class_id; + rte_id->vendor_id = spdk_id->vendor_id; + rte_id->device_id = spdk_id->device_id; + rte_id->subsystem_vendor_id = spdk_id->subvendor_id; + rte_id->subsystem_device_id = spdk_id->subdevice_id; + pci_id_count--; + } + + assert(driver->name); + rte_name_len = strlen(driver->name) + strlen("spdk_") + 1; + rte_name = calloc(rte_name_len, 1); + if (!rte_name) { + free(rte_id_table); + return -ENOMEM; + } + + snprintf(rte_name, rte_name_len, "spdk_%s", driver->name); + driver->driver.driver.name = rte_name; + driver->driver.id_table = rte_id_table; + + rte_flags = 0; + if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) { + rte_flags |= RTE_PCI_DRV_NEED_MAPPING; + } + if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) { + rte_flags |= RTE_PCI_DRV_WC_ACTIVATE; + } + driver->driver.drv_flags = rte_flags; + + driver->driver.probe = pci_device_init; + driver->driver.remove = pci_device_fini; + + rte_pci_register(&driver->driver); + return 0; +} + +static inline void +_pci_env_init(void) +{ + /* We assume devices were present on the bus for more than 2 seconds + * before initializing SPDK and there's no need to wait more. We scan + * the bus, but we don't blacklist any devices. + */ + scan_pci_bus(false); + + /* Register a single hotremove callback for all devices. */ + if (spdk_process_is_primary()) { + rte_dev_event_callback_register(NULL, pci_device_rte_hotremove, NULL); + } +} + +void +pci_env_init(void) +{ + struct spdk_pci_driver *driver; + + TAILQ_FOREACH(driver, &g_pci_drivers, tailq) { + register_rte_driver(driver); + } + + _pci_env_init(); +} + +void +pci_env_reinit(void) +{ + /* There is no need to register pci drivers again, since they were + * already pre-registered in pci_env_init. 
+ */ + + _pci_env_init(); +} + +void +pci_env_fini(void) +{ + struct spdk_pci_device *dev; + char bdf[32]; + + cleanup_pci_devices(); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (dev->internal.attached) { + spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr); + fprintf(stderr, "Device %s is still attached at shutdown!\n", bdf); + } + } + + if (spdk_process_is_primary()) { + rte_dev_event_callback_unregister(NULL, pci_device_rte_hotremove, NULL); + } +} + +int +pci_device_init(struct rte_pci_driver *_drv, + struct rte_pci_device *_dev) +{ + struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv; + struct spdk_pci_device *dev; + int rc; + + dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + return -1; + } + + dev->dev_handle = _dev; + + dev->addr.domain = _dev->addr.domain; + dev->addr.bus = _dev->addr.bus; + dev->addr.dev = _dev->addr.devid; + dev->addr.func = _dev->addr.function; + dev->id.class_id = _dev->id.class_id; + dev->id.vendor_id = _dev->id.vendor_id; + dev->id.device_id = _dev->id.device_id; + dev->id.subvendor_id = _dev->id.subsystem_vendor_id; + dev->id.subdevice_id = _dev->id.subsystem_device_id; + dev->socket_id = _dev->device.numa_node; + dev->type = "pci"; + + dev->map_bar = map_bar_rte; + dev->unmap_bar = unmap_bar_rte; + dev->cfg_read = cfg_read_rte; + dev->cfg_write = cfg_write_rte; + + dev->internal.driver = driver; + dev->internal.claim_fd = -1; + + if (driver->cb_fn != NULL) { + rc = driver->cb_fn(driver->cb_arg, dev); + if (rc != 0) { + free(dev); + return rc; + } + dev->internal.attached = true; + } + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq); + pthread_mutex_unlock(&g_pci_mutex); + return 0; +} + +int +pci_device_fini(struct rte_pci_device *_dev) +{ + struct spdk_pci_device *dev; + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (dev->dev_handle == _dev) { + break; + } + } + + if (dev == NULL || dev->internal.attached) { + /* The device might be still referenced somewhere in SPDK. */ + pthread_mutex_unlock(&g_pci_mutex); + return -1; + } + + /* remove our whitelist_at option */ + if (_dev->device.devargs) { + _dev->device.devargs->data = NULL; + } + + assert(!dev->internal.removed); + dev->internal.removed = true; + pthread_mutex_unlock(&g_pci_mutex); + return 0; + +} + +void +spdk_pci_device_detach(struct spdk_pci_device *dev) +{ + assert(dev->internal.attached); + + if (dev->internal.claim_fd >= 0) { + spdk_pci_device_unclaim(dev); + } + + if (strcmp(dev->type, "pci") == 0) { + /* if it's a physical device we need to deal with DPDK on + * a different process and we can't just unset one flag + * here. We also want to stop using any device resources + * so that the device isn't "in use" by the userspace driver + * once we detach it. This would allow attaching the device + * to a different process, or to a kernel driver like nvme. 
+ */ + detach_rte(dev); + } else { + dev->internal.attached = false; + } + + cleanup_pci_devices(); +} + +static int +scan_pci_bus(bool delay_init) +{ + struct spdk_pci_driver *driver; + struct rte_pci_device *rte_dev; + uint64_t now; + + rte_bus_scan(); + now = spdk_get_ticks(); + + driver = TAILQ_FIRST(&g_pci_drivers); + if (!driver) { + return 0; + } + + TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) { + struct rte_devargs *da; + + da = rte_dev->device.devargs; + if (!da) { + char devargs_str[128]; + + /* the device was never blacklisted or whitelisted */ + da = calloc(1, sizeof(*da)); + if (!da) { + return -1; + } + + snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name); + if (rte_devargs_parse(da, devargs_str) != 0) { + free(da); + return -1; + } + + rte_devargs_insert(&da); + rte_dev->device.devargs = da; + } + + if (da->data) { + uint64_t whitelist_at = (uint64_t)(uintptr_t)da->data; + + /* this device was seen by spdk before... */ + if (da->policy == RTE_DEV_BLACKLISTED && whitelist_at <= now) { + da->policy = RTE_DEV_WHITELISTED; + } + } else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_WHITELIST && + da->policy == RTE_DEV_WHITELISTED) || da->policy != RTE_DEV_BLACKLISTED) { + /* override the policy only if not permanently blacklisted */ + + if (delay_init) { + da->policy = RTE_DEV_BLACKLISTED; + da->data = (void *)(now + 2 * spdk_get_ticks_hz()); + } else { + da->policy = RTE_DEV_WHITELISTED; + da->data = (void *)(uintptr_t)now; + } + } + } + + return 0; +} + +int +spdk_pci_device_attach(struct spdk_pci_driver *driver, + spdk_pci_enum_cb enum_cb, + void *enum_ctx, struct spdk_pci_addr *pci_address) +{ + struct spdk_pci_device *dev; + struct rte_pci_device *rte_dev; + struct rte_devargs *da; + int rc; + char bdf[32]; + + spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address); + + cleanup_pci_devices(); + + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) { + break; + } + } + + if (dev != NULL && dev->internal.driver == driver) { + pthread_mutex_lock(&g_pci_mutex); + if (dev->internal.attached || dev->internal.pending_removal) { + pthread_mutex_unlock(&g_pci_mutex); + return -1; + } + + rc = enum_cb(enum_ctx, dev); + if (rc == 0) { + dev->internal.attached = true; + } + pthread_mutex_unlock(&g_pci_mutex); + return rc; + } + + driver->cb_fn = enum_cb; + driver->cb_arg = enum_ctx; + + int i = 0; + + do { + rc = rte_eal_hotplug_add("pci", bdf, ""); + } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT); + + if (i > 1 && rc == -EEXIST) { + /* Even though the previous request timed out, the device + * was attached successfully. + */ + rc = 0; + } + + driver->cb_arg = NULL; + driver->cb_fn = NULL; + + cleanup_pci_devices(); + + if (rc != 0) { + return -1; + } + + /* explicit attach ignores the whitelist, so if we blacklisted this + * device before let's enable it now - just for clarity. + */ + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) { + break; + } + } + assert(dev != NULL); + + rte_dev = dev->dev_handle; + da = rte_dev->device.devargs; + if (da && da->data) { + da->data = (void *)(uintptr_t)spdk_get_ticks(); + da->policy = RTE_DEV_WHITELISTED; + } + + return 0; +} + +/* Note: You can call spdk_pci_enumerate from more than one thread + * simultaneously safely, but you cannot call spdk_pci_enumerate + * and rte_eal_pci_probe simultaneously. 
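+ *
+ * A minimal usage sketch (illustrative only; attach_cb and its printf are
+ * hypothetical, the SPDK calls are the ones defined in this file):
+ *
+ *	static int
+ *	attach_cb(void *ctx, struct spdk_pci_device *dev)
+ *	{
+ *		struct spdk_pci_addr addr = spdk_pci_device_get_addr(dev);
+ *		char bdf[32];
+ *
+ *		spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr);
+ *		printf("probed %s\n", bdf);
+ *		return 0;	/* 0 = attach, > 0 = skip, < 0 = abort enumeration */
+ *	}
+ *
+ *	...
+ *	spdk_pci_enumerate(spdk_pci_nvme_get_driver(), attach_cb, NULL);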
+ */ +int +spdk_pci_enumerate(struct spdk_pci_driver *driver, + spdk_pci_enum_cb enum_cb, + void *enum_ctx) +{ + struct spdk_pci_device *dev; + int rc; + + cleanup_pci_devices(); + + pthread_mutex_lock(&g_pci_mutex); + TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) { + if (dev->internal.attached || + dev->internal.driver != driver || + dev->internal.pending_removal) { + continue; + } + + rc = enum_cb(enum_ctx, dev); + if (rc == 0) { + dev->internal.attached = true; + } else if (rc < 0) { + pthread_mutex_unlock(&g_pci_mutex); + return -1; + } + } + pthread_mutex_unlock(&g_pci_mutex); + + if (scan_pci_bus(true) != 0) { + return -1; + } + + driver->cb_fn = enum_cb; + driver->cb_arg = enum_ctx; + + if (rte_bus_probe() != 0) { + driver->cb_arg = NULL; + driver->cb_fn = NULL; + return -1; + } + + driver->cb_arg = NULL; + driver->cb_fn = NULL; + + cleanup_pci_devices(); + return 0; +} + +struct spdk_pci_device * +spdk_pci_get_first_device(void) +{ + return TAILQ_FIRST(&g_pci_devices); +} + +struct spdk_pci_device * +spdk_pci_get_next_device(struct spdk_pci_device *prev) +{ + return TAILQ_NEXT(prev, internal.tailq); +} + +int +spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar, + void **mapped_addr, uint64_t *phys_addr, uint64_t *size) +{ + return dev->map_bar(dev, bar, mapped_addr, phys_addr, size); +} + +int +spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr) +{ + return dev->unmap_bar(dev, bar, addr); +} + +uint32_t +spdk_pci_device_get_domain(struct spdk_pci_device *dev) +{ + return dev->addr.domain; +} + +uint8_t +spdk_pci_device_get_bus(struct spdk_pci_device *dev) +{ + return dev->addr.bus; +} + +uint8_t +spdk_pci_device_get_dev(struct spdk_pci_device *dev) +{ + return dev->addr.dev; +} + +uint8_t +spdk_pci_device_get_func(struct spdk_pci_device *dev) +{ + return dev->addr.func; +} + +uint16_t +spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev) +{ + return dev->id.vendor_id; +} + +uint16_t +spdk_pci_device_get_device_id(struct spdk_pci_device *dev) +{ + return dev->id.device_id; +} + +uint16_t +spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev) +{ + return dev->id.subvendor_id; +} + +uint16_t +spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev) +{ + return dev->id.subdevice_id; +} + +struct spdk_pci_id +spdk_pci_device_get_id(struct spdk_pci_device *dev) +{ + return dev->id; +} + +int +spdk_pci_device_get_socket_id(struct spdk_pci_device *dev) +{ + return dev->socket_id; +} + +int +spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + return dev->cfg_read(dev, value, len, offset); +} + +int +spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + return dev->cfg_write(dev, value, len, offset); +} + +int +spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 1, offset); +} + +int +spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 1, offset); +} + +int +spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 2, offset); +} + +int +spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 2, offset); +} + +int +spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, 
uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 4, offset); +} + +int +spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 4, offset); +} + +int +spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len) +{ + int err; + uint32_t pos, header = 0; + uint32_t i, buf[2]; + + if (len < 17) { + return -1; + } + + err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE); + if (err || !header) { + return -1; + } + + pos = PCI_CFG_SIZE; + while (1) { + if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) { + if (pos) { + /* skip the header */ + pos += 4; + for (i = 0; i < 2; i++) { + err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i); + if (err) { + return -1; + } + } + snprintf(sn, len, "%08x%08x", buf[1], buf[0]); + return 0; + } + } + pos = (header >> 20) & 0xffc; + /* 0 if no other items exist */ + if (pos < PCI_CFG_SIZE) { + return -1; + } + err = spdk_pci_device_cfg_read32(dev, &header, pos); + if (err) { + return -1; + } + } + return -1; +} + +struct spdk_pci_addr +spdk_pci_device_get_addr(struct spdk_pci_device *dev) +{ + return dev->addr; +} + +bool +spdk_pci_device_is_removed(struct spdk_pci_device *dev) +{ + return dev->internal.pending_removal; +} + +int +spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2) +{ + if (a1->domain > a2->domain) { + return 1; + } else if (a1->domain < a2->domain) { + return -1; + } else if (a1->bus > a2->bus) { + return 1; + } else if (a1->bus < a2->bus) { + return -1; + } else if (a1->dev > a2->dev) { + return 1; + } else if (a1->dev < a2->dev) { + return -1; + } else if (a1->func > a2->func) { + return 1; + } else if (a1->func < a2->func) { + return -1; + } + + return 0; +} + +#ifdef __linux__ +int +spdk_pci_device_claim(struct spdk_pci_device *dev) +{ + int dev_fd; + char dev_name[64]; + int pid; + void *dev_map; + struct flock pcidev_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + }; + + snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", + dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func); + + dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (dev_fd == -1) { + fprintf(stderr, "could not open %s\n", dev_name); + return -errno; + } + + if (ftruncate(dev_fd, sizeof(int)) != 0) { + fprintf(stderr, "could not truncate %s\n", dev_name); + close(dev_fd); + return -errno; + } + + dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED, dev_fd, 0); + if (dev_map == MAP_FAILED) { + fprintf(stderr, "could not mmap dev %s (%d)\n", dev_name, errno); + close(dev_fd); + return -errno; + } + + if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) { + pid = *(int *)dev_map; + fprintf(stderr, "Cannot create lock on device %s, probably" + " process %d has claimed it\n", dev_name, pid); + munmap(dev_map, sizeof(int)); + close(dev_fd); + /* F_SETLK returns unspecified errnos, normalize them */ + return -EACCES; + } + + *(int *)dev_map = (int)getpid(); + munmap(dev_map, sizeof(int)); + dev->internal.claim_fd = dev_fd; + /* Keep dev_fd open to maintain the lock. 
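+	 * A second process calling spdk_pci_device_claim() on the same BDF will
+	 * fail the F_SETLK above with -EACCES for as long as this lock is held;
+	 * spdk_pci_device_unclaim() releases it and unlinks the
+	 * /tmp/spdk_pci_lock_* file.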
*/ + return 0; +} + +void +spdk_pci_device_unclaim(struct spdk_pci_device *dev) +{ + char dev_name[64]; + + snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", + dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func); + + close(dev->internal.claim_fd); + dev->internal.claim_fd = -1; + unlink(dev_name); +} +#endif /* __linux__ */ + +#ifdef __FreeBSD__ +int +spdk_pci_device_claim(struct spdk_pci_device *dev) +{ + /* TODO */ + return 0; +} + +void +spdk_pci_device_unclaim(struct spdk_pci_device *dev) +{ + /* TODO */ +} +#endif /* __FreeBSD__ */ + +int +spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf) +{ + unsigned domain, bus, dev, func; + + if (addr == NULL || bdf == NULL) { + return -EINVAL; + } + + if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) || + (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) { + /* Matched a full address - all variables are initialized */ + } else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) { + func = 0; + } else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) || + (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) { + domain = 0; + } else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) || + (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) { + domain = 0; + func = 0; + } else { + return -EINVAL; + } + + if (bus > 0xFF || dev > 0x1F || func > 7) { + return -EINVAL; + } + + addr->domain = domain; + addr->bus = bus; + addr->dev = dev; + addr->func = func; + + return 0; +} + +int +spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr) +{ + int rc; + + rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x", + addr->domain, addr->bus, + addr->dev, addr->func); + + if (rc > 0 && (size_t)rc < sz) { + return 0; + } + + return -1; +} + +void +spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev) +{ + assert(dev->map_bar != NULL); + assert(dev->unmap_bar != NULL); + assert(dev->cfg_read != NULL); + assert(dev->cfg_write != NULL); + dev->internal.driver = drv; + TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq); +} + +void +spdk_pci_unhook_device(struct spdk_pci_device *dev) +{ + assert(!dev->internal.attached); + TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq); +} + +const char * +spdk_pci_device_get_type(const struct spdk_pci_device *dev) +{ + return dev->type; +} diff --git a/src/spdk/lib/env_dpdk/pci_idxd.c b/src/spdk/lib/env_dpdk/pci_idxd.c new file mode 100644 index 000000000..eddbfa4af --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_idxd.c @@ -0,0 +1,50 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +#define SPDK_IDXD_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) +static struct spdk_pci_id idxd_driver_id[] = { + {SPDK_IDXD_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IDXD)}, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_idxd_get_driver(void) +{ + return spdk_pci_get_driver("idxd"); +} + +SPDK_PCI_DRIVER_REGISTER("idxd", idxd_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING); diff --git a/src/spdk/lib/env_dpdk/pci_ioat.c b/src/spdk/lib/env_dpdk/pci_ioat.c new file mode 100644 index 000000000..28b7bdb44 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_ioat.c @@ -0,0 +1,98 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +#define SPDK_IOAT_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) +static struct spdk_pci_id ioat_driver_id[] = { + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SKX)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_ICX)}, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_ioat_get_driver(void) +{ + return spdk_pci_get_driver("ioat"); +} + +SPDK_PCI_DRIVER_REGISTER("ioat", ioat_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING); diff --git a/src/spdk/lib/env_dpdk/pci_virtio.c b/src/spdk/lib/env_dpdk/pci_virtio.c new file mode 100644 index 000000000..e525a4a8e --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_virtio.c @@ -0,0 +1,53 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +static struct spdk_pci_id virtio_pci_driver_id[] = { + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_MODERN) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_MODERN) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_LEGACY) }, + { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_LEGACY) }, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_virtio_get_driver(void) +{ + return spdk_pci_get_driver("virtio"); +} + +SPDK_PCI_DRIVER_REGISTER("virtio", virtio_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); diff --git a/src/spdk/lib/env_dpdk/pci_vmd.c b/src/spdk/lib/env_dpdk/pci_vmd.c new file mode 100644 index 000000000..fb6860873 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_vmd.c @@ -0,0 +1,50 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +static struct spdk_pci_id vmd_pci_driver_id[] = { + { SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, PCI_DEVICE_ID_INTEL_VMD) }, + { .vendor_id = 0, /* sentinel */ }, +}; + +struct spdk_pci_driver * +spdk_pci_vmd_get_driver(void) +{ + return spdk_pci_get_driver("vmd"); +} + +SPDK_PCI_DRIVER_REGISTER("vmd", vmd_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); diff --git a/src/spdk/lib/env_dpdk/spdk_env_dpdk.map b/src/spdk/lib/env_dpdk/spdk_env_dpdk.map new file mode 100644 index 000000000..a465f0938 --- /dev/null +++ b/src/spdk/lib/env_dpdk/spdk_env_dpdk.map @@ -0,0 +1,114 @@ +{ + global: + + # Public functions in env.h + spdk_malloc; + spdk_zmalloc; + spdk_realloc; + spdk_free; + spdk_env_opts_init; + spdk_env_init; + spdk_env_fini; + spdk_dma_malloc; + spdk_dma_malloc_socket; + spdk_dma_zmalloc; + spdk_dma_zmalloc_socket; + spdk_dma_realloc; + spdk_dma_free; + spdk_memzone_reserve; + spdk_memzone_reserve_aligned; + spdk_memzone_lookup; + spdk_memzone_free; + spdk_memzone_dump; + spdk_mempool_create; + spdk_mempool_create_ctor; + spdk_mempool_get_name; + spdk_mempool_free; + spdk_mempool_get; + spdk_mempool_get_bulk; + spdk_mempool_put; + spdk_mempool_put_bulk; + spdk_mempool_count; + spdk_mempool_obj_iter; + spdk_mempool_lookup; + spdk_env_get_core_count; + spdk_env_get_current_core; + spdk_env_get_first_core; + spdk_env_get_last_core; + spdk_env_get_next_core; + spdk_env_get_socket_id; + spdk_env_thread_launch_pinned; + spdk_env_thread_wait_all; + spdk_process_is_primary; + spdk_get_ticks; + spdk_get_ticks_hz; + spdk_delay_us; + spdk_pause; + spdk_ring_create; + spdk_ring_free; + spdk_ring_count; + spdk_ring_enqueue; + spdk_ring_dequeue; + spdk_iommu_is_enabled; + spdk_vtophys; + spdk_pci_get_driver; + spdk_pci_driver_register; + spdk_pci_nvme_get_driver; + spdk_pci_vmd_get_driver; + spdk_pci_idxd_get_driver; + spdk_pci_ioat_get_driver; + spdk_pci_virtio_get_driver; + spdk_pci_enumerate; + spdk_pci_get_first_device; + spdk_pci_get_next_device; + spdk_pci_device_map_bar; + spdk_pci_device_unmap_bar; + spdk_pci_device_get_domain; + spdk_pci_device_get_bus; + spdk_pci_device_get_dev; + spdk_pci_device_get_func; + spdk_pci_device_get_addr; + spdk_pci_device_get_vendor_id; + spdk_pci_device_get_device_id; + spdk_pci_device_get_subvendor_id; + spdk_pci_device_get_subdevice_id; + spdk_pci_device_get_id; + spdk_pci_device_get_socket_id; + spdk_pci_device_get_serial_number; + spdk_pci_device_claim; + spdk_pci_device_unclaim; + spdk_pci_device_detach; + spdk_pci_device_attach; + spdk_pci_device_cfg_read; + spdk_pci_device_cfg_write; + spdk_pci_device_cfg_read8; + spdk_pci_device_cfg_write8; + spdk_pci_device_cfg_read16; + spdk_pci_device_cfg_write16; + spdk_pci_device_cfg_read32; + spdk_pci_device_cfg_write32; + spdk_pci_device_is_removed; + spdk_pci_addr_compare; + spdk_pci_addr_parse; + spdk_pci_addr_fmt; + spdk_pci_hook_device; + spdk_pci_unhook_device; + spdk_pci_device_get_type; 
+ spdk_unaffinitize_thread; + spdk_call_unaffinitized; + spdk_mem_map_alloc; + spdk_mem_map_free; + spdk_mem_map_set_translation; + spdk_mem_map_clear_translation; + spdk_mem_map_translate; + spdk_mem_register; + spdk_mem_unregister; + + # Public functions in env_dpdk.h + spdk_env_dpdk_post_init; + spdk_env_dpdk_post_fini; + spdk_env_dpdk_external_init; + spdk_env_dpdk_dump_mem_stats; + + local: *; +}; diff --git a/src/spdk/lib/env_dpdk/threads.c b/src/spdk/lib/env_dpdk/threads.c new file mode 100644 index 000000000..01c7b8d9f --- /dev/null +++ b/src/spdk/lib/env_dpdk/threads.c @@ -0,0 +1,108 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "env_internal.h" + +#include <rte_config.h> +#include <rte_lcore.h> + +uint32_t +spdk_env_get_core_count(void) +{ + return rte_lcore_count(); +} + +uint32_t +spdk_env_get_current_core(void) +{ + return rte_lcore_id(); +} + +uint32_t +spdk_env_get_first_core(void) +{ + return rte_get_next_lcore(-1, 0, 0); +} + +uint32_t +spdk_env_get_last_core(void) +{ + uint32_t i; + uint32_t last_core = UINT32_MAX; + + SPDK_ENV_FOREACH_CORE(i) { + last_core = i; + } + + assert(last_core != UINT32_MAX); + + return last_core; +} + +uint32_t +spdk_env_get_next_core(uint32_t prev_core) +{ + unsigned lcore; + + lcore = rte_get_next_lcore(prev_core, 0, 0); + if (lcore == RTE_MAX_LCORE) { + return UINT32_MAX; + } + return lcore; +} + +uint32_t +spdk_env_get_socket_id(uint32_t core) +{ + if (core >= RTE_MAX_LCORE) { + return SPDK_ENV_SOCKET_ID_ANY; + } + + return rte_lcore_to_socket_id(core); +} + +int +spdk_env_thread_launch_pinned(uint32_t core, thread_start_fn fn, void *arg) +{ + int rc; + + rc = rte_eal_remote_launch(fn, arg, core); + + return rc; +} + +void +spdk_env_thread_wait_all(void) +{ + rte_eal_mp_wait_lcore(); +} diff --git a/src/spdk/lib/env_ocf/.gitignore b/src/spdk/lib/env_ocf/.gitignore new file mode 100644 index 000000000..f5452c248 --- /dev/null +++ b/src/spdk/lib/env_ocf/.gitignore @@ -0,0 +1,2 @@ +src/ +include/ diff --git a/src/spdk/lib/env_ocf/Makefile b/src/spdk/lib/env_ocf/Makefile new file mode 100644 index 000000000..0ac51eecd --- /dev/null +++ b/src/spdk/lib/env_ocf/Makefile @@ -0,0 +1,108 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# OCF requires users to build with their sources +# If SPDK is configured with OCF source directory, +# we export its files and then compile SPDK LIB with them +# Else if SPDK is configured with OCF precompiled library +# we just use it as SPDK lib by copying it to /build/lib/ + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+OCFDIR=$(CONFIG_OCF_DIR) + +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +LIBNAME := ocfenv + +CFLAGS += $(ENV_CFLAGS) -I$(CURDIR) -I$(CURDIR)/include -w +C_SRCS = $(shell find -name \*.c) + +LIB = $(call spdk_lib_list_to_static_libs,$(LIBNAME)) + + +ifeq ($(CONFIG_CUSTOMOCF),y) + +.PHONY: all clean install + +all: + $(Q)$(MAKE) $(LIB) + +clean: + $(Q)rm -f $(LIB) + +$(LIB): + cp $(CONFIG_OCF_PATH) $(LIB) + +install: + +uninstall: + $(UNINSTALL_LIB) + +else + +.PHONY: all clean install ocf_inc ocf_src ocf_distclean all exportlib + +all: ocf_inc ocf_src + $(Q)$(MAKE) $(LIB) + +ocf_inc: + $(Q)$(MAKE) -C "$(CONFIG_OCF_PATH)" inc O="$(SPDK_ROOT_DIR)/lib/env_ocf/" ENV= --quiet + +ocf_src: ocf_inc + $(Q)$(MAKE) -C "$(CONFIG_OCF_PATH)" src O="$(SPDK_ROOT_DIR)/lib/env_ocf/" CMD=cp ENV= --quiet + +ocf_distclean: + $(Q)$(MAKE) -C "$(CONFIG_OCF_PATH)" distclean O="$(SPDK_ROOT_DIR)/lib/env_ocf/" ENV= --quiet + +clean: ocf_distclean + $(Q)rm -rf "$(SPDK_ROOT_DIR)/lib/env_ocf/include" \ + "$(SPDK_ROOT_DIR)/lib/env_ocf/src" \ + $(LIB) $(OBJS); + +$(LIB): $(OBJS) + $(LIB_C) + +install: + +uninstall: + $(UNINSTALL_LIB) + +endif + +exportlib: all + @ if [ -z $(O) ]; then echo "No output specified"; exit 1; fi + cp $(LIB) $(O) + +help: + @ echo "all Default" + @ echo "exportlib O=<outpath> Default build to specified outpath" diff --git a/src/spdk/lib/env_ocf/ocf_env.c b/src/spdk/lib/env_ocf/ocf_env.c new file mode 100644 index 000000000..ab5445203 --- /dev/null +++ b/src/spdk/lib/env_ocf/ocf_env.c @@ -0,0 +1,176 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "ocf/ocf_def.h" +#include "ocf_env.h" + +#include "spdk/crc32.h" +#include "spdk/env.h" +#include "spdk_internal/log.h" + +/* Number of buffers for mempool + * Need to be power of two - 1 for better memory utilization + * It depends on memory usage of OCF which + * in itself depends on the workload + * It is a big number because OCF uses allocators + * for every request it sends and recieves + */ +#define ENV_ALLOCATOR_NBUFS 32767 + +/* Use unique index for env allocators */ +static env_atomic g_env_allocator_index = 0; + +void * +env_allocator_new(env_allocator *allocator) +{ + void *mem = spdk_mempool_get(allocator->mempool); + + if (spdk_likely(mem)) { + memset(mem, 0, allocator->element_size); + } + + return mem; +} + +env_allocator * +env_allocator_create(uint32_t size, const char *name) +{ + env_allocator *allocator; + char qualified_name[128] = {0}; + + snprintf(qualified_name, 128, "ocf_env_%d", env_atomic_inc_return(&g_env_allocator_index)); + + allocator = calloc(1, sizeof(*allocator)); + if (!allocator) { + return NULL; + } + + allocator->mempool = spdk_mempool_create(qualified_name, + ENV_ALLOCATOR_NBUFS, size, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + + if (!allocator->mempool) { + free(allocator); + return NULL; + } + + allocator->element_size = size; + + return allocator; +} + +void +env_allocator_del(env_allocator *allocator, void *item) +{ + spdk_mempool_put(allocator->mempool, item); +} + +void +env_allocator_destroy(env_allocator *allocator) +{ + if (allocator) { + if (ENV_ALLOCATOR_NBUFS - spdk_mempool_count(allocator->mempool)) { + SPDK_ERRLOG("Not all objects deallocated\n"); + assert(false); + } + + spdk_mempool_free(allocator->mempool); + free(allocator); + } +} +/* *** CRC *** */ + +uint32_t +env_crc32(uint32_t crc, uint8_t const *message, size_t len) +{ + return spdk_crc32_ieee_update(message, len, crc); +} + +/* EXECUTION CONTEXTS */ +pthread_mutex_t *exec_context_mutex; + +static void __attribute__((constructor)) init_execution_context(void) +{ + unsigned count = env_get_execution_context_count(); + unsigned i; + + ENV_BUG_ON(count == 0); + exec_context_mutex = malloc(count * sizeof(exec_context_mutex[0])); + ENV_BUG_ON(exec_context_mutex == NULL); + for (i = 0; i < count; i++) { + ENV_BUG_ON(pthread_mutex_init(&exec_context_mutex[i], NULL)); + } +} + +static void __attribute__((destructor)) deinit_execution_context(void) +{ + unsigned count = env_get_execution_context_count(); + unsigned i; + + ENV_BUG_ON(count == 0); + ENV_BUG_ON(exec_context_mutex == NULL); + + for (i = 0; i < count; i++) { + ENV_BUG_ON(pthread_mutex_destroy(&exec_context_mutex[i])); + } + free(exec_context_mutex); +} + +/* get_execuction_context must assure that after the call finishes, the caller + * will not get preempted from current execution context. For userspace env + * we simulate this behavior by acquiring per execution context mutex. As a + * result the caller might actually get preempted, but no other thread will + * execute in this context by the time the caller puts current execution ctx. */ +unsigned env_get_execution_context(void) +{ + unsigned cpu; + + cpu = sched_getcpu(); + cpu = (cpu == -1) ? 0 : cpu; + + ENV_BUG_ON(pthread_mutex_lock(&exec_context_mutex[cpu])); + + return cpu; +} + +void env_put_execution_context(unsigned ctx) +{ + pthread_mutex_unlock(&exec_context_mutex[ctx]); +} + +unsigned env_get_execution_context_count(void) +{ + int num = sysconf(_SC_NPROCESSORS_ONLN); + + return (num == -1) ? 
0 : num; +} diff --git a/src/spdk/lib/env_ocf/ocf_env.h b/src/spdk/lib/env_ocf/ocf_env.h new file mode 100644 index 000000000..81d2e814b --- /dev/null +++ b/src/spdk/lib/env_ocf/ocf_env.h @@ -0,0 +1,834 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef __LIBOCF_ENV_H__ +#define __LIBOCF_ENV_H__ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#ifndef __USE_GNU +#define __USE_GNU +#endif + +#include <linux/limits.h> +#include <linux/stddef.h> + +#include "spdk/stdinc.h" +#include "spdk/likely.h" +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" + +#include "ocf_env_list.h" +#include "ocf/ocf_err.h" + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef uint64_t sector_t; + +#define __packed __attribute__((packed)) +#define __aligned(x) __attribute__((aligned(x))) + +/* linux sector 512-bytes */ +#define ENV_SECTOR_SHIFT 9 +#define ENV_SECTOR_SIZE (1<<ENV_SECTOR_SHIFT) +#define BYTES_TO_SECTOR(x) ((x) >> ENV_SECTOR_SHIFT) + +/* *** MEMORY MANAGEMENT *** */ + +#define ENV_MEM_NORMAL 0 +#define ENV_MEM_NOIO 0 +#define ENV_MEM_ATOMIC 0 + +#define likely spdk_likely +#define unlikely spdk_unlikely + +#define min(x, y) MIN(x, y) +#ifndef MIN +#define MIN(x, y) spdk_min(x, y) +#endif + +#define ARRAY_SIZE(x) SPDK_COUNTOF(x) + +/* LOGGING */ +#define ENV_PRIu64 PRIu64 + +#define ENV_WARN(cond, fmt, args...) 
({ \ + if (spdk_unlikely((uintptr_t)(cond))) \ + SPDK_NOTICELOG("WARNING" fmt, ##args); \ + }) + +#define ENV_WARN_ON(cond) ({ \ + if (spdk_unlikely((uintptr_t)(cond))) \ + SPDK_NOTICELOG("WARNING\n"); \ + }) + +#define ENV_BUG() ({ \ + SPDK_ERRLOG("BUG\n"); \ + assert(0); \ + abort(); \ + }) + +#define ENV_BUG_ON(cond) ({ \ + if (spdk_unlikely((uintptr_t)(cond))) { \ + SPDK_ERRLOG("BUG\n"); \ + assert(0); \ + abort(); \ + } \ + }) + +#define ENV_BUILD_BUG_ON(cond) _Static_assert(!(cond), "static "\ + "assertion failure") + +#define container_of(ptr, type, member) SPDK_CONTAINEROF(ptr, type, member) + +static inline void *env_malloc(size_t size, int flags) +{ + return spdk_malloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); +} + +static inline void *env_zalloc(size_t size, int flags) +{ + return spdk_zmalloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); +} + +static inline void env_free(const void *ptr) +{ + return spdk_free((void *)ptr); +} + +static inline void *env_vmalloc(size_t size) +{ + return spdk_malloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); +} + +static inline void *env_vzalloc(size_t size) +{ + /* TODO: raw_ram init can request huge amount of memory to store + * hashtable in it. need to ensure that allocation succedds */ + return spdk_zmalloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); +} + +static inline void *env_vzalloc_flags(size_t size, int flags) +{ + return env_vzalloc(size); +} + +static inline void *env_secure_alloc(size_t size) +{ + return spdk_zmalloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); +} + +static inline void env_secure_free(const void *ptr, size_t size) +{ + return spdk_free((void *)ptr); +} + +static inline void env_vfree(const void *ptr) +{ + return spdk_free((void *)ptr); +} + +static inline uint64_t env_get_free_memory(void) +{ + return -1; +} + +/* *** ALLOCATOR *** */ + +#define OCF_ALLOCATOR_NAME_MAX 128 + +typedef struct { + struct spdk_mempool *mempool; + size_t element_size; +} env_allocator; + +env_allocator *env_allocator_create(uint32_t size, const char *name); + +void env_allocator_destroy(env_allocator *allocator); + +void *env_allocator_new(env_allocator *allocator); + +void env_allocator_del(env_allocator *allocator, void *item); + +uint32_t env_allocator_item_count(env_allocator *allocator); + +/* *** MUTEX *** */ + +typedef struct { + pthread_mutex_t m; +} env_mutex; + +static inline int env_mutex_init(env_mutex *mutex) +{ + return !!pthread_mutex_init(&mutex->m, NULL); +} + +static inline void env_mutex_lock(env_mutex *mutex) +{ + ENV_BUG_ON(pthread_mutex_lock(&mutex->m)); +} + +static inline int env_mutex_lock_interruptible(env_mutex *mutex) +{ + env_mutex_lock(mutex); + return 0; +} + +static inline int env_mutex_trylock(env_mutex *mutex) +{ + return pthread_mutex_trylock(&mutex->m) ? 
-OCF_ERR_NO_LOCK : 0; +} + +static inline void env_mutex_unlock(env_mutex *mutex) +{ + ENV_BUG_ON(pthread_mutex_unlock(&mutex->m)); +} + +static inline int env_mutex_is_locked(env_mutex *mutex) +{ + if (env_mutex_trylock(mutex) == 0) { + env_mutex_unlock(mutex); + return 0; + } + + return 1; +} + +static inline int env_mutex_destroy(env_mutex *mutex) +{ + if (pthread_mutex_destroy(&mutex->m)) { + return 1; + } + + return 0; +} + +/* *** RECURSIVE MUTEX *** */ + +typedef env_mutex env_rmutex; + +static inline int env_rmutex_init(env_rmutex *rmutex) +{ + pthread_mutexattr_t attr; + + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(&rmutex->m, &attr); + + return 0; +} + +static inline void env_rmutex_lock(env_rmutex *rmutex) +{ + env_mutex_lock(rmutex); +} + +static inline int env_rmutex_lock_interruptible(env_rmutex *rmutex) +{ + return env_mutex_lock_interruptible(rmutex); +} + +static inline int env_rmutex_trylock(env_rmutex *rmutex) +{ + return env_mutex_trylock(rmutex); +} + +static inline void env_rmutex_unlock(env_rmutex *rmutex) +{ + env_mutex_unlock(rmutex); +} + +static inline int env_rmutex_is_locked(env_rmutex *rmutex) +{ + return env_mutex_is_locked(rmutex); +} + +static inline int env_rmutex_destroy(env_rmutex *rmutex) +{ + return env_mutex_destroy(rmutex); +} + +/* *** RW SEMAPHORE *** */ +typedef struct { + pthread_rwlock_t lock; +} env_rwsem; + +static inline int env_rwsem_init(env_rwsem *s) +{ + return !!pthread_rwlock_init(&s->lock, NULL); +} + +static inline void env_rwsem_up_read(env_rwsem *s) +{ + ENV_BUG_ON(pthread_rwlock_unlock(&s->lock)); +} + +static inline void env_rwsem_down_read(env_rwsem *s) +{ + ENV_BUG_ON(pthread_rwlock_rdlock(&s->lock)); +} + +static inline int env_rwsem_down_read_trylock(env_rwsem *s) +{ + return pthread_rwlock_tryrdlock(&s->lock) ? -OCF_ERR_NO_LOCK : 0; +} + +static inline void env_rwsem_up_write(env_rwsem *s) +{ + ENV_BUG_ON(pthread_rwlock_unlock(&s->lock)); +} + +static inline void env_rwsem_down_write(env_rwsem *s) +{ + ENV_BUG_ON(pthread_rwlock_wrlock(&s->lock)); +} + +static inline int env_rwsem_down_write_trylock(env_rwsem *s) +{ + return pthread_rwlock_trywrlock(&s->lock) ? 
-OCF_ERR_NO_LOCK : 0; +} + +static inline int env_rwsem_is_locked(env_rwsem *s) +{ + if (env_rwsem_down_read_trylock(s) == 0) { + env_rwsem_up_read(s); + return 0; + } + + return 1; +} + +static inline int env_rwsem_down_read_interruptible(env_rwsem *s) +{ + return pthread_rwlock_rdlock(&s->lock); +} +static inline int env_rwsem_down_write_interruptible(env_rwsem *s) +{ + return pthread_rwlock_wrlock(&s->lock); +} + +static inline int env_rwsem_destroy(env_rwsem *s) +{ + return pthread_rwlock_destroy(&s->lock); +} + +/* *** ATOMIC VARIABLES *** */ + +typedef int env_atomic; + +typedef long env_atomic64; + +#ifndef atomic_read +#define atomic_read(ptr) (*(__typeof__(*ptr) *volatile) (ptr)) +#endif + +#ifndef atomic_set +#define atomic_set(ptr, i) ((*(__typeof__(*ptr) *volatile) (ptr)) = (i)) +#endif + +#define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1)) +#define atomic_dec(ptr) ((void) __sync_fetch_and_add(ptr, -1)) +#define atomic_add(ptr, n) ((void) __sync_fetch_and_add(ptr, n)) +#define atomic_sub(ptr, n) ((void) __sync_fetch_and_sub(ptr, n)) + +#define atomic_cmpxchg __sync_val_compare_and_swap + +static inline int env_atomic_read(const env_atomic *a) +{ + return atomic_read(a); +} + +static inline void env_atomic_set(env_atomic *a, int i) +{ + atomic_set(a, i); +} + +static inline void env_atomic_add(int i, env_atomic *a) +{ + atomic_add(a, i); +} + +static inline void env_atomic_sub(int i, env_atomic *a) +{ + atomic_sub(a, i); +} + +static inline bool env_atomic_sub_and_test(int i, env_atomic *a) +{ + return __sync_sub_and_fetch(a, i) == 0; +} + +static inline void env_atomic_inc(env_atomic *a) +{ + atomic_inc(a); +} + +static inline void env_atomic_dec(env_atomic *a) +{ + atomic_dec(a); +} + +static inline bool env_atomic_dec_and_test(env_atomic *a) +{ + return __sync_sub_and_fetch(a, 1) == 0; +} + +static inline bool env_atomic_inc_and_test(env_atomic *a) +{ + return __sync_add_and_fetch(a, 1) == 0; +} + +static inline int env_atomic_add_return(int i, env_atomic *a) +{ + return __sync_add_and_fetch(a, i); +} + +static inline int env_atomic_sub_return(int i, env_atomic *a) +{ + return __sync_sub_and_fetch(a, i); +} + +static inline int env_atomic_inc_return(env_atomic *a) +{ + return env_atomic_add_return(1, a); +} + +static inline int env_atomic_dec_return(env_atomic *a) +{ + return env_atomic_sub_return(1, a); +} + +static inline int env_atomic_cmpxchg(env_atomic *a, int old, int new_value) +{ + return atomic_cmpxchg(a, old, new_value); +} + +static inline int env_atomic_add_unless(env_atomic *a, int i, int u) +{ + int c, old; + c = env_atomic_read(a); + for (;;) { + if (spdk_unlikely(c == (u))) { + break; + } + old = env_atomic_cmpxchg((a), c, c + (i)); + if (spdk_likely(old == c)) { + break; + } + c = old; + } + return c != (u); +} + +static inline long env_atomic64_read(const env_atomic64 *a) +{ + return atomic_read(a); +} + +static inline void env_atomic64_set(env_atomic64 *a, long i) +{ + atomic_set(a, i); +} + +static inline void env_atomic64_add(long i, env_atomic64 *a) +{ + atomic_add(a, i); +} + +static inline void env_atomic64_sub(long i, env_atomic64 *a) +{ + atomic_sub(a, i); +} + +static inline void env_atomic64_inc(env_atomic64 *a) +{ + atomic_inc(a); +} + +static inline void env_atomic64_dec(env_atomic64 *a) +{ + atomic_dec(a); +} + +static inline int env_atomic64_add_return(int i, env_atomic *a) +{ + return __sync_add_and_fetch(a, i); +} + +static inline int env_atomic64_sub_return(int i, env_atomic *a) +{ + return __sync_sub_and_fetch(a, i); +} + +static 
inline int env_atomic64_inc_return(env_atomic *a) +{ + return env_atomic64_add_return(1, a); +} + +static inline int env_atomic64_dec_return(env_atomic *a) +{ + return env_atomic_sub_return(1, a); +} + +static inline long env_atomic64_cmpxchg(env_atomic64 *a, long old, long new) +{ + return atomic_cmpxchg(a, old, new); +} + +/* *** COMPLETION *** */ +typedef struct completion { + sem_t sem; +} env_completion; + +static inline void env_completion_init(env_completion *completion) +{ + sem_init(&completion->sem, 0, 0); +} + +static inline void env_completion_wait(env_completion *completion) +{ + sem_wait(&completion->sem); +} + +static inline void env_completion_complete(env_completion *completion) +{ + sem_post(&completion->sem); +} + +static inline void env_completion_destroy(env_completion *completion) +{ + sem_destroy(&completion->sem); +} + +/* *** SPIN LOCKS *** */ + +typedef struct { + pthread_spinlock_t lock; +} env_spinlock; + +static inline int env_spinlock_init(env_spinlock *l) +{ + return pthread_spin_init(&l->lock, 0); +} + +static inline int env_spinlock_trylock(env_spinlock *l) +{ + return pthread_spin_trylock(&l->lock) ? -OCF_ERR_NO_LOCK : 0; +} + +static inline void env_spinlock_lock(env_spinlock *l) +{ + ENV_BUG_ON(pthread_spin_lock(&l->lock)); +} + +static inline void env_spinlock_unlock(env_spinlock *l) +{ + ENV_BUG_ON(pthread_spin_unlock(&l->lock)); +} + +#define env_spinlock_lock_irqsave(l, flags) \ + (void)flags; \ + env_spinlock_lock(l) + +#define env_spinlock_unlock_irqrestore(l, flags) \ + (void)flags; \ + env_spinlock_unlock(l) + +static inline void env_spinlock_destroy(env_spinlock *l) +{ + ENV_BUG_ON(pthread_spin_destroy(&l->lock)); +} + +/* *** RW LOCKS *** */ + +typedef struct { + pthread_rwlock_t lock; +} env_rwlock; + +static inline void env_rwlock_init(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_init(&l->lock, NULL)); +} + +static inline void env_rwlock_read_lock(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_rdlock(&l->lock)); +} + +static inline void env_rwlock_read_unlock(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_unlock(&l->lock)); +} + +static inline void env_rwlock_write_lock(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_wrlock(&l->lock)); +} + +static inline void env_rwlock_write_unlock(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_unlock(&l->lock)); +} + +static inline void env_rwlock_destroy(env_rwlock *l) +{ + ENV_BUG_ON(pthread_rwlock_destroy(&l->lock)); +} + +static inline void env_bit_set(int nr, volatile void *addr) +{ + char *byte = (char *)addr + (nr >> 3); + char mask = 1 << (nr & 7); + + __sync_or_and_fetch(byte, mask); +} + +static inline void env_bit_clear(int nr, volatile void *addr) +{ + char *byte = (char *)addr + (nr >> 3); + char mask = 1 << (nr & 7); + + mask = ~mask; + __sync_and_and_fetch(byte, mask); +} + +static inline bool env_bit_test(int nr, const volatile unsigned long *addr) +{ + const char *byte = (char *)addr + (nr >> 3); + char mask = 1 << (nr & 7); + + return !!(*byte & mask); +} + +/* *** WAITQUEUE *** */ + +typedef struct { + sem_t sem; +} env_waitqueue; + +static inline void env_waitqueue_init(env_waitqueue *w) +{ + sem_init(&w->sem, 0, 0); +} + +static inline void env_waitqueue_wake_up(env_waitqueue *w) +{ + sem_post(&w->sem); +} + +#define env_waitqueue_wait(w, condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + sem_wait(&w.sem); \ + __ret = __ret; \ +}) + +/* *** SCHEDULING *** */ + +/* CAS does not need this while in user-space */ +static inline void env_schedule(void) +{ +} + +#define 
env_cond_resched env_schedule + +static inline int env_in_interrupt(void) +{ + return 0; +} + +static inline uint64_t env_get_tick_count(void) +{ + return spdk_get_ticks(); +} + +static inline uint64_t env_ticks_to_secs(uint64_t j) +{ + return j / spdk_get_ticks_hz(); +} + +static inline uint64_t env_ticks_to_msecs(uint64_t j) +{ + return env_ticks_to_secs(j) * 1000; +} + +static inline uint64_t env_ticks_to_nsecs(uint64_t j) +{ + return env_ticks_to_secs(j) * 1000 * 1000; +} + +static inline uint64_t env_ticks_to_usecs(uint64_t j) +{ + return env_ticks_to_secs(j) * 1000 * 1000 * 1000; +} + +static inline uint64_t env_secs_to_ticks(uint64_t j) +{ + return j * spdk_get_ticks_hz(); +} + +/* *** STRING OPERATIONS *** */ + +/* 512 KB is sufficient amount of memory for OCF operations */ +#define ENV_MAX_MEM (512 * 1024) + +static inline int env_memset(void *dest, size_t len, uint8_t value) +{ + if (dest == NULL || len == 0) { + return 1; + } + + memset(dest, value, len); + return 0; +} + +static inline int env_memcpy(void *dest, size_t dmax, const void *src, size_t len) +{ + if (dest == NULL || src == NULL) { + return 1; + } + if (dmax == 0 || dmax > ENV_MAX_MEM) { + return 1; + } + if (len == 0 || len > dmax) { + return 1; + } + + memcpy(dest, src, len); + return 0; +} + +static inline int env_memcmp(const void *aptr, size_t dmax, const void *bptr, size_t len, + int *diff) +{ + if (diff == NULL || aptr == NULL || bptr == NULL) { + return 1; + } + if (dmax == 0 || dmax > ENV_MAX_MEM) { + return 1; + } + if (len == 0 || len > dmax) { + return 1; + } + + *diff = memcmp(aptr, bptr, len); + return 0; +} + +/* 4096 is sufficient max length for any OCF operation on string */ +#define ENV_MAX_STR (4 * 1024) + +static inline size_t env_strnlen(const char *src, size_t dmax) +{ + return strnlen(src, dmax); +} + +static inline int env_strncpy(char *dest, size_t dmax, const char *src, size_t len) +{ + if (dest == NULL || src == NULL) { + return 1; + } + if (dmax == 0 || dmax > ENV_MAX_STR) { + return 1; + } + if (len == 0) { + return 1; + } + /* Just copy as many characters as we can instead of return failure */ + len = min(len, dmax); + + strncpy(dest, src, len); + return 0; +} + +#define env_strncmp(s1, slen1, s2, slen2) strncmp(s1, s2, min(slen1, slen2)) + +static inline char *env_strdup(const char *src, int flags) +{ + int len; + char *ret; + + if (src == NULL) { + return NULL; + } + + len = env_strnlen(src, ENV_MAX_STR) + 1; + ret = env_malloc(len, flags); + + if (env_strncpy(ret, ENV_MAX_STR, src, len)) { + return NULL; + } else { + return ret; + } +} + +/* *** SORTING *** */ + +static inline void env_sort(void *base, size_t num, size_t size, + int (*cmp_fn)(const void *, const void *), + void (*swap_fn)(void *, void *, int size)) +{ + qsort(base, num, size, cmp_fn); +} + +static inline void env_msleep(uint64_t n) +{ + usleep(n * 1000); +} + +static inline void env_touch_softlockup_wd(void) +{ +} + +/* *** CRC *** */ + +uint32_t env_crc32(uint32_t crc, uint8_t const *data, size_t len); + +/* EXECUTION CONTEXTS */ +unsigned env_get_execution_context(void); +void env_put_execution_context(unsigned ctx); +unsigned env_get_execution_context_count(void); + +#endif /* __OCF_ENV_H__ */ diff --git a/src/spdk/lib/env_ocf/ocf_env_headers.h b/src/spdk/lib/env_ocf/ocf_env_headers.h new file mode 100644 index 000000000..742479374 --- /dev/null +++ b/src/spdk/lib/env_ocf/ocf_env_headers.h @@ -0,0 +1,43 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
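The env_allocator implemented in ocf_env.c above is a fixed-size object cache built on spdk_mempool: env_allocator_create() sizes the pool at ENV_ALLOCATOR_NBUFS elements and derives its own pool name (the name argument is informational only), env_allocator_new() hands out zeroed objects, and env_allocator_destroy() asserts that every object has been returned. A hypothetical usage sketch, assuming the SPDK env is already initialized; the struct and function names are illustrative:

#include "ocf_env.h"

struct cache_line_meta {
	uint64_t core_line;
	uint32_t flags;
};

static int
meta_pool_demo(void)
{
	env_allocator *alloc;
	struct cache_line_meta *meta;

	alloc = env_allocator_create(sizeof(*meta), "cache_line_meta");
	if (alloc == NULL) {
		return -ENOMEM;
	}

	meta = env_allocator_new(alloc);         /* zeroed object, or NULL if the pool is exhausted */
	if (meta != NULL) {
		meta->flags = 1;
		env_allocator_del(alloc, meta);  /* every object must be returned before destroy */
	}

	env_allocator_destroy(alloc);
	return 0;
}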
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __OCF_ENV_HEADERS_H__ +#define __OCF_ENV_HEADERS_H__ + +#include "spdk/stdinc.h" + +#define OCF_VERSION_MAIN 20 +#define OCF_VERSION_MAJOR 3 +#define OCF_VERSION_MINOR 0 + +#endif /* __OCF_ENV_HEADERS_H__ */ diff --git a/src/spdk/lib/env_ocf/ocf_env_list.h b/src/spdk/lib/env_ocf/ocf_env_list.h new file mode 100644 index 000000000..e5f60d6c3 --- /dev/null +++ b/src/spdk/lib/env_ocf/ocf_env_list.h @@ -0,0 +1,185 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __OCF_LIST_H__ +#define __OCF_LIST_H__ + +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +/** + * List entry structure mimicking linux kernel based one. + */ +struct list_head { + struct list_head *next; + struct list_head *prev; +}; + +/** + * start an empty list + */ +#define INIT_LIST_HEAD(l) { (l)->prev = l; (l)->next = l; } + +/** + * Add item to list head. + * @param it list entry to be added + * @param l1 list main node (head) + */ +static inline void list_add(struct list_head *it, struct list_head *l1) +{ + it->prev = l1; + it->next = l1->next; + + l1->next->prev = it; + l1->next = it; +} + +/** + * Add item it to tail. + * @param it list entry to be added + * @param l1 list main node (head) + */ +static inline void list_add_tail(struct list_head *it, struct list_head *l1) +{ + it->prev = l1->prev; + it->next = l1; + + l1->prev->next = it; + l1->prev = it; +} + +/** + * check if a list is empty (return true) + */ +static inline int list_empty(struct list_head *it) +{ + return it->next == it; +} + +/** + * delete an entry from a list + */ +static inline void list_del(struct list_head *it) +{ + it->next->prev = it->prev; + it->prev->next = it->next; +} + +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + list_del(list); + list_add_tail(list, head); +} + +static inline void list_move(struct list_head *list, + struct list_head *head) +{ + list_del(list); + list_add(list, head); +} + +/** + * Extract an entry. + * @param list_head_i list head item, from which entry is extracted + * @param item_type type (struct) of list entry + * @param field_name name of list_head field within item_type + */ +#define list_entry(list_head_i, item_type, field_name) \ + (item_type *)(((void*)(list_head_i)) - offsetof(item_type, field_name)) + +#define list_first_entry(list_head_i, item_type, field_name) \ + list_entry((list_head_i)->next, item_type, field_name) + +/** + * @param iterator uninitialized list_head pointer, to be used as iterator + * @param plist list head (main node) + */ +#define list_for_each(iterator, plist) \ + for (iterator = (plist)->next; \ + (iterator)->next != (plist)->next; \ + iterator = (iterator)->next) + +/** + * Safe version of list_for_each which works even if entries are deleted during + * loop. + * @param iterator uninitialized list_head pointer, to be used as iterator + * @param q another uninitialized list_head, used as helper + * @param plist list head (main node) + */ +/* + * Algorithm handles situation, where q is deleted. + * consider in example 3 element list with header h: + * + * h -> 1 -> 2 -> 3 -> + *1. i q + * + *2. i q + * + *3. q i + */ +#define list_for_each_safe(iterator, q, plist) \ + for (iterator = (q = (plist)->next->next)->prev; \ + (q) != (plist)->next; \ + iterator = (q = (q)->next)->prev) + +#define _list_entry_helper(item, head, field_name) list_entry(head, typeof(*item), field_name) + +/** + * Iterate over list entries. 
+ * @param list pointer to list item (iterator) + * @param plist pointer to list_head item + * @param field_name name of list_head field in list entry + */ +#define list_for_each_entry(item, plist, field_name) \ + for (item = _list_entry_helper(item, (plist)->next, field_name); \ + _list_entry_helper(item, (item)->field_name.next, field_name) !=\ + _list_entry_helper(item, (plist)->next, field_name); \ + item = _list_entry_helper(item, (item)->field_name.next, field_name)) + +/** + * Safe version of list_for_each_entry which works even if entries are deleted + * during loop. + * @param list pointer to list item (iterator) + * @param q another pointer to list item, used as helper + * @param plist pointer to list_head item + * @param field_name name of list_head field in list entry + */ +#define list_for_each_entry_safe(item, q, plist, field_name) \ + for (item = _list_entry_helper(item, (plist)->next, field_name), \ + q = _list_entry_helper(item, (item)->field_name.next, field_name); \ + _list_entry_helper(item, (item)->field_name.next, field_name) != \ + _list_entry_helper(item, (plist)->next, field_name); \ + item = q, q = _list_entry_helper(q, (q)->field_name.next, field_name)) + +#endif diff --git a/src/spdk/lib/event/Makefile b/src/spdk/lib/event/Makefile new file mode 100644 index 000000000..87a6209c7 --- /dev/null +++ b/src/spdk/lib/event/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 5 +SO_MINOR := 0 + +LIBNAME = event +C_SRCS = app.c reactor.c rpc.c subsystem.c json_config.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_event.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/app.c b/src/spdk/lib/event/app.c new file mode 100644 index 000000000..b6cab05a3 --- /dev/null +++ b/src/spdk/lib/event/app.c @@ -0,0 +1,1177 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. 
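The ocf_env_list.h helpers earlier in this patch reimplement the kernel-style intrusive list: an entry embeds a struct list_head, and list_entry()/list_for_each_entry_safe() recover the containing object via offsetof, so the safe variant tolerates unlinking and freeing the current entry. A hypothetical sketch of the pattern (type and field names are illustrative):

#include "ocf_env.h"   /* pulls in ocf_env_list.h and spdk/stdinc.h */

struct pending_io {
	int id;
	struct list_head link;          /* embedded list node */
};

static void
drain_pending(struct list_head *queue)
{
	struct pending_io *io, *tmp;

	/* Safe iteration: the current entry may be unlinked and freed. */
	list_for_each_entry_safe(io, tmp, queue, link) {
		list_del(&io->link);
		free(io);
	}
}

static void
queue_demo(void)
{
	struct list_head queue;
	struct pending_io *io = calloc(1, sizeof(*io));

	INIT_LIST_HEAD(&queue);
	if (io != NULL) {
		io->id = 1;
		list_add_tail(&io->link, &queue);   /* keeps FIFO order */
	}
	drain_pending(&queue);
}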
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/version.h" + +#include "spdk_internal/event.h" + +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/trace.h" +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" + +#define SPDK_APP_DEFAULT_LOG_LEVEL SPDK_LOG_NOTICE +#define SPDK_APP_DEFAULT_LOG_PRINT_LEVEL SPDK_LOG_INFO +#define SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES SPDK_DEFAULT_NUM_TRACE_ENTRIES + +#define SPDK_APP_DPDK_DEFAULT_MEM_SIZE -1 +#define SPDK_APP_DPDK_DEFAULT_MASTER_CORE -1 +#define SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL -1 +#define SPDK_APP_DPDK_DEFAULT_CORE_MASK "0x1" +#define SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 +#define SPDK_APP_DEFAULT_CORE_LIMIT 0x140000000 /* 5 GiB */ + +struct spdk_app { + struct spdk_conf *config; + const char *json_config_file; + bool json_config_ignore_errors; + const char *rpc_addr; + int shm_id; + spdk_app_shutdown_cb shutdown_cb; + int rc; +}; + +static struct spdk_app g_spdk_app; +static spdk_msg_fn g_start_fn = NULL; +static void *g_start_arg = NULL; +static struct spdk_thread *g_app_thread = NULL; +static bool g_delay_subsystem_init = false; +static bool g_shutdown_sig_received = false; +static char *g_executable_name; +static struct spdk_app_opts g_default_opts; + +int +spdk_app_get_shm_id(void) +{ + return g_spdk_app.shm_id; +} + +/* append one empty option to indicate the end of the array */ +static const struct option g_cmdline_options[] = { +#define CONFIG_FILE_OPT_IDX 'c' + {"config", required_argument, NULL, CONFIG_FILE_OPT_IDX}, +#define LIMIT_COREDUMP_OPT_IDX 'd' + {"limit-coredump", no_argument, NULL, LIMIT_COREDUMP_OPT_IDX}, +#define TPOINT_GROUP_MASK_OPT_IDX 'e' + {"tpoint-group-mask", required_argument, NULL, TPOINT_GROUP_MASK_OPT_IDX}, +#define SINGLE_FILE_SEGMENTS_OPT_IDX 'g' + {"single-file-segments", no_argument, NULL, SINGLE_FILE_SEGMENTS_OPT_IDX}, +#define HELP_OPT_IDX 'h' + 
{"help", no_argument, NULL, HELP_OPT_IDX}, +#define SHM_ID_OPT_IDX 'i' + {"shm-id", required_argument, NULL, SHM_ID_OPT_IDX}, +#define CPUMASK_OPT_IDX 'm' + {"cpumask", required_argument, NULL, CPUMASK_OPT_IDX}, +#define MEM_CHANNELS_OPT_IDX 'n' + {"mem-channels", required_argument, NULL, MEM_CHANNELS_OPT_IDX}, +#define MASTER_CORE_OPT_IDX 'p' + {"master-core", required_argument, NULL, MASTER_CORE_OPT_IDX}, +#define RPC_SOCKET_OPT_IDX 'r' + {"rpc-socket", required_argument, NULL, RPC_SOCKET_OPT_IDX}, +#define MEM_SIZE_OPT_IDX 's' + {"mem-size", required_argument, NULL, MEM_SIZE_OPT_IDX}, +#define NO_PCI_OPT_IDX 'u' + {"no-pci", no_argument, NULL, NO_PCI_OPT_IDX}, +#define VERSION_OPT_IDX 'v' + {"version", no_argument, NULL, VERSION_OPT_IDX}, +#define PCI_BLACKLIST_OPT_IDX 'B' + {"pci-blacklist", required_argument, NULL, PCI_BLACKLIST_OPT_IDX}, +#define LOGFLAG_OPT_IDX 'L' + {"logflag", required_argument, NULL, LOGFLAG_OPT_IDX}, +#define HUGE_UNLINK_OPT_IDX 'R' + {"huge-unlink", no_argument, NULL, HUGE_UNLINK_OPT_IDX}, +#define PCI_WHITELIST_OPT_IDX 'W' + {"pci-whitelist", required_argument, NULL, PCI_WHITELIST_OPT_IDX}, +#define SILENCE_NOTICELOG_OPT_IDX 257 + {"silence-noticelog", no_argument, NULL, SILENCE_NOTICELOG_OPT_IDX}, +#define WAIT_FOR_RPC_OPT_IDX 258 + {"wait-for-rpc", no_argument, NULL, WAIT_FOR_RPC_OPT_IDX}, +#define HUGE_DIR_OPT_IDX 259 + {"huge-dir", required_argument, NULL, HUGE_DIR_OPT_IDX}, +#define NUM_TRACE_ENTRIES_OPT_IDX 260 + {"num-trace-entries", required_argument, NULL, NUM_TRACE_ENTRIES_OPT_IDX}, +#define MAX_REACTOR_DELAY_OPT_IDX 261 + {"max-delay", required_argument, NULL, MAX_REACTOR_DELAY_OPT_IDX}, +#define JSON_CONFIG_OPT_IDX 262 + {"json", required_argument, NULL, JSON_CONFIG_OPT_IDX}, +#define JSON_CONFIG_IGNORE_INIT_ERRORS_IDX 263 + {"json-ignore-init-errors", no_argument, NULL, JSON_CONFIG_IGNORE_INIT_ERRORS_IDX}, +#define IOVA_MODE_OPT_IDX 264 + {"iova-mode", required_argument, NULL, IOVA_MODE_OPT_IDX}, +#define BASE_VIRTADDR_OPT_IDX 265 + {"base-virtaddr", required_argument, NULL, BASE_VIRTADDR_OPT_IDX}, +}; + +/* Global section */ +#define GLOBAL_CONFIG_TMPL \ +"# Configuration file\n" \ +"#\n" \ +"# Please write all parameters using ASCII.\n" \ +"# The parameter must be quoted if it includes whitespace.\n" \ +"#\n" \ +"# Configuration syntax:\n" \ +"# Spaces at head of line are deleted, other spaces are as separator\n" \ +"# Lines starting with '#' are comments and not evaluated.\n" \ +"# Lines ending with '\\' are concatenated with the next line.\n" \ +"# Bracketed keys are section keys grouping the following value keys.\n" \ +"# Number of section key is used as a tag number.\n" \ +"# Ex. [TargetNode1] = TargetNode section key with tag number 1\n" \ +"[Global]\n" \ +" Comment \"Global section\"\n" \ +"\n" \ +" # Users can restrict work items to only run on certain cores by\n" \ +" # specifying a ReactorMask. Default is to allow work items to run\n" \ +" # on all cores. 
Core 0 must be set in the mask if one is specified.\n" \ +" # Default: 0xFFFF (cores 0-15)\n" \ +" ReactorMask \"0x%s\"\n" \ +"\n" \ +" # Tracepoint group mask for spdk trace buffers\n" \ +" # Default: 0x0 (all tracepoint groups disabled)\n" \ +" # Set to 0xFFFF to enable all tracepoint groups.\n" \ +" TpointGroupMask \"0x%" PRIX64 "\"\n" \ +"\n" \ + +static void +app_config_dump_global_section(FILE *fp) +{ + struct spdk_cpuset *coremask; + + if (NULL == fp) { + return; + } + + coremask = spdk_app_get_core_mask(); + + fprintf(fp, GLOBAL_CONFIG_TMPL, spdk_cpuset_fmt(coremask), + spdk_trace_get_tpoint_group_mask()); +} + +int +spdk_app_get_running_config(char **config_str, char *name) +{ + FILE *fp = NULL; + int fd = -1; + long length = 0, ret = 0; + char vbuf[BUFSIZ]; + char config_template[64]; + + snprintf(config_template, sizeof(config_template), "/tmp/%s.XXXXXX", name); + /* Create temporary file to hold config */ + fd = mkstemp(config_template); + if (fd == -1) { + SPDK_ERRLOG("mkstemp failed\n"); + return -1; + } + fp = fdopen(fd, "wb+"); + if (NULL == fp) { + SPDK_ERRLOG("error opening tmpfile fd = %d\n", fd); + return -1; + } + + /* Buffered IO */ + setvbuf(fp, vbuf, _IOFBF, BUFSIZ); + + app_config_dump_global_section(fp); + spdk_subsystem_config(fp); + + length = ftell(fp); + + *config_str = malloc(length + 1); + if (!*config_str) { + SPDK_ERRLOG("out-of-memory for config\n"); + fclose(fp); + return -1; + } + fseek(fp, 0, SEEK_SET); + ret = fread(*config_str, sizeof(char), length, fp); + if (ret < length) { + SPDK_ERRLOG("short read\n"); + } + fclose(fp); + (*config_str)[length] = '\0'; + + return 0; +} + +static void +app_start_shutdown(void *ctx) +{ + if (g_spdk_app.shutdown_cb) { + g_spdk_app.shutdown_cb(); + g_spdk_app.shutdown_cb = NULL; + } else { + spdk_app_stop(0); + } +} + +void +spdk_app_start_shutdown(void) +{ + spdk_thread_send_critical_msg(g_app_thread, app_start_shutdown); +} + +static void +__shutdown_signal(int signo) +{ + if (!g_shutdown_sig_received) { + g_shutdown_sig_received = true; + spdk_app_start_shutdown(); + } +} + +static int +app_opts_validate(const char *app_opts) +{ + int i = 0, j; + + for (i = 0; app_opts[i] != '\0'; i++) { + /* ignore getopt control characters */ + if (app_opts[i] == ':' || app_opts[i] == '+' || app_opts[i] == '-') { + continue; + } + + for (j = 0; SPDK_APP_GETOPT_STRING[j] != '\0'; j++) { + if (app_opts[i] == SPDK_APP_GETOPT_STRING[j]) { + return app_opts[i]; + } + } + } + return 0; +} + +void +spdk_app_opts_init(struct spdk_app_opts *opts) +{ + if (!opts) { + return; + } + + memset(opts, 0, sizeof(*opts)); + + opts->enable_coredump = true; + opts->shm_id = -1; + opts->mem_size = SPDK_APP_DPDK_DEFAULT_MEM_SIZE; + opts->master_core = SPDK_APP_DPDK_DEFAULT_MASTER_CORE; + opts->mem_channel = SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL; + opts->reactor_mask = NULL; + opts->base_virtaddr = SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR; + opts->print_level = SPDK_APP_DEFAULT_LOG_PRINT_LEVEL; + opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR; + opts->num_entries = SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES; + opts->delay_subsystem_init = false; +} + +static int +app_setup_signal_handlers(struct spdk_app_opts *opts) +{ + struct sigaction sigact; + sigset_t sigmask; + int rc; + + sigemptyset(&sigmask); + memset(&sigact, 0, sizeof(sigact)); + sigemptyset(&sigact.sa_mask); + + sigact.sa_handler = SIG_IGN; + rc = sigaction(SIGPIPE, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGPIPE) failed\n"); + return rc; + } + + /* Install the same handler for SIGINT and SIGTERM 
*/ + g_shutdown_sig_received = false; + sigact.sa_handler = __shutdown_signal; + rc = sigaction(SIGINT, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGINT) failed\n"); + return rc; + } + sigaddset(&sigmask, SIGINT); + + rc = sigaction(SIGTERM, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGTERM) failed\n"); + return rc; + } + sigaddset(&sigmask, SIGTERM); + + if (opts->usr1_handler != NULL) { + sigact.sa_handler = opts->usr1_handler; + rc = sigaction(SIGUSR1, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGUSR1) failed\n"); + return rc; + } + sigaddset(&sigmask, SIGUSR1); + } + + pthread_sigmask(SIG_UNBLOCK, &sigmask, NULL); + + return 0; +} + +static void +app_start_application(void) +{ + assert(spdk_get_thread() == g_app_thread); + + g_start_fn(g_start_arg); +} + +static void +app_start_rpc(int rc, void *arg1) +{ + if (rc) { + spdk_app_stop(rc); + return; + } + + spdk_rpc_initialize(g_spdk_app.rpc_addr); + if (!g_delay_subsystem_init) { + spdk_rpc_set_state(SPDK_RPC_RUNTIME); + app_start_application(); + } +} + +static struct spdk_conf * +app_setup_conf(const char *config_file) +{ + struct spdk_conf *config; + int rc; + + config = spdk_conf_allocate(); + assert(config != NULL); + if (config_file) { + rc = spdk_conf_read(config, config_file); + if (rc != 0) { + SPDK_ERRLOG("Could not read config file %s\n", config_file); + goto error; + } + if (spdk_conf_first_section(config) == NULL) { + SPDK_ERRLOG("Invalid config file %s\n", config_file); + goto error; + } + } + spdk_conf_set_as_default(config); + return config; + +error: + spdk_conf_free(config); + return NULL; +} + +static int +app_opts_add_pci_addr(struct spdk_app_opts *opts, struct spdk_pci_addr **list, char *bdf) +{ + struct spdk_pci_addr *tmp = *list; + size_t i = opts->num_pci_addr; + + tmp = realloc(tmp, sizeof(*tmp) * (i + 1)); + if (tmp == NULL) { + SPDK_ERRLOG("realloc error\n"); + return -ENOMEM; + } + + *list = tmp; + if (spdk_pci_addr_parse(*list + i, bdf) < 0) { + SPDK_ERRLOG("Invalid address %s\n", bdf); + return -EINVAL; + } + + opts->num_pci_addr++; + return 0; +} + +static int +app_read_config_file_global_params(struct spdk_app_opts *opts) +{ + struct spdk_conf_section *sp; + char *bdf; + int i, rc = 0; + + sp = spdk_conf_find_section(NULL, "Global"); + + if (opts->shm_id == -1) { + if (sp != NULL) { + opts->shm_id = spdk_conf_section_get_intval(sp, "SharedMemoryID"); + } + } + + if (opts->reactor_mask == NULL) { + if (sp && spdk_conf_section_get_val(sp, "ReactorMask")) { + SPDK_ERRLOG("ReactorMask config option is deprecated. 
Use -m/--cpumask\n" + "command line parameter instead.\n"); + opts->reactor_mask = spdk_conf_section_get_val(sp, "ReactorMask"); + } else { + opts->reactor_mask = SPDK_APP_DPDK_DEFAULT_CORE_MASK; + } + } + + if (!opts->no_pci && sp) { + opts->no_pci = spdk_conf_section_get_boolval(sp, "NoPci", false); + } + + if (opts->tpoint_group_mask == NULL) { + if (sp != NULL) { + opts->tpoint_group_mask = spdk_conf_section_get_val(sp, "TpointGroupMask"); + } + } + + if (sp == NULL) { + return 0; + } + + for (i = 0; ; i++) { + bdf = spdk_conf_section_get_nmval(sp, "PciBlacklist", i, 0); + if (!bdf) { + break; + } + + rc = app_opts_add_pci_addr(opts, &opts->pci_blacklist, bdf); + if (rc != 0) { + free(opts->pci_blacklist); + return rc; + } + } + + for (i = 0; ; i++) { + bdf = spdk_conf_section_get_nmval(sp, "PciWhitelist", i, 0); + if (!bdf) { + break; + } + + if (opts->pci_blacklist != NULL) { + SPDK_ERRLOG("PciBlacklist and PciWhitelist cannot be used at the same time\n"); + free(opts->pci_blacklist); + return -EINVAL; + } + + rc = app_opts_add_pci_addr(opts, &opts->pci_whitelist, bdf); + if (rc != 0) { + free(opts->pci_whitelist); + return rc; + } + } + return 0; +} + +static int +app_setup_env(struct spdk_app_opts *opts) +{ + struct spdk_env_opts env_opts = {}; + int rc; + + if (opts == NULL) { + rc = spdk_env_init(NULL); + if (rc != 0) { + SPDK_ERRLOG("Unable to reinitialize SPDK env\n"); + } + + return rc; + } + + + spdk_env_opts_init(&env_opts); + + env_opts.name = opts->name; + env_opts.core_mask = opts->reactor_mask; + env_opts.shm_id = opts->shm_id; + env_opts.mem_channel = opts->mem_channel; + env_opts.master_core = opts->master_core; + env_opts.mem_size = opts->mem_size; + env_opts.hugepage_single_segments = opts->hugepage_single_segments; + env_opts.unlink_hugepage = opts->unlink_hugepage; + env_opts.hugedir = opts->hugedir; + env_opts.no_pci = opts->no_pci; + env_opts.num_pci_addr = opts->num_pci_addr; + env_opts.pci_blacklist = opts->pci_blacklist; + env_opts.pci_whitelist = opts->pci_whitelist; + env_opts.env_context = opts->env_context; + env_opts.iova_mode = opts->iova_mode; + + rc = spdk_env_init(&env_opts); + free(env_opts.pci_blacklist); + free(env_opts.pci_whitelist); + + + if (rc < 0) { + SPDK_ERRLOG("Unable to initialize SPDK env\n"); + } + + return rc; +} + +static int +app_setup_trace(struct spdk_app_opts *opts) +{ + char shm_name[64]; + uint64_t tpoint_group_mask; + char *end; + + if (opts->shm_id >= 0) { + snprintf(shm_name, sizeof(shm_name), "/%s_trace.%d", opts->name, opts->shm_id); + } else { + snprintf(shm_name, sizeof(shm_name), "/%s_trace.pid%d", opts->name, (int)getpid()); + } + + if (spdk_trace_init(shm_name, opts->num_entries) != 0) { + return -1; + } + + if (opts->tpoint_group_mask != NULL) { + errno = 0; + tpoint_group_mask = strtoull(opts->tpoint_group_mask, &end, 16); + if (*end != '\0' || errno) { + SPDK_ERRLOG("invalid tpoint mask %s\n", opts->tpoint_group_mask); + } else { + SPDK_NOTICELOG("Tracepoint Group Mask %s specified.\n", opts->tpoint_group_mask); + SPDK_NOTICELOG("Use 'spdk_trace -s %s %s %d' to capture a snapshot of events at runtime.\n", + opts->name, + opts->shm_id >= 0 ? "-i" : "-p", + opts->shm_id >= 0 ? 
opts->shm_id : getpid()); +#if defined(__linux__) + SPDK_NOTICELOG("Or copy /dev/shm%s for offline analysis/debug.\n", shm_name); +#endif + spdk_trace_set_tpoint_group_mask(tpoint_group_mask); + } + } + + return 0; +} + +static void +bootstrap_fn(void *arg1) +{ + if (g_spdk_app.json_config_file) { + g_delay_subsystem_init = false; + spdk_app_json_config_load(g_spdk_app.json_config_file, g_spdk_app.rpc_addr, app_start_rpc, + NULL, !g_spdk_app.json_config_ignore_errors); + } else { + if (!g_delay_subsystem_init) { + spdk_subsystem_init(app_start_rpc, NULL); + } else { + spdk_rpc_initialize(g_spdk_app.rpc_addr); + } + } +} + +int +spdk_app_start(struct spdk_app_opts *opts, spdk_msg_fn start_fn, + void *arg1) +{ + struct spdk_conf *config = NULL; + int rc; + char *tty; + struct spdk_cpuset tmp_cpumask = {}; + static bool g_env_was_setup = false; + + if (!opts) { + SPDK_ERRLOG("opts should not be NULL\n"); + return 1; + } + + if (!start_fn) { + SPDK_ERRLOG("start_fn should not be NULL\n"); + return 1; + } + + tty = ttyname(STDERR_FILENO); + if (opts->print_level > SPDK_LOG_WARN && + isatty(STDERR_FILENO) && + tty && + !strncmp(tty, "/dev/tty", strlen("/dev/tty"))) { + printf("Warning: printing stderr to console terminal without -q option specified.\n"); + printf("Suggest using --silence-noticelog to disable logging to stderr and\n"); + printf("monitor syslog, or redirect stderr to a file.\n"); + printf("(Delaying for 10 seconds...)\n"); + sleep(10); + } + + spdk_log_set_print_level(opts->print_level); + +#ifndef SPDK_NO_RLIMIT + if (opts->enable_coredump) { + struct rlimit core_limits; + + core_limits.rlim_cur = core_limits.rlim_max = SPDK_APP_DEFAULT_CORE_LIMIT; + setrlimit(RLIMIT_CORE, &core_limits); + } +#endif + + config = app_setup_conf(opts->config_file); + if (config == NULL) { + return 1; + } + + if (app_read_config_file_global_params(opts) < 0) { + spdk_conf_free(config); + return 1; + } + + memset(&g_spdk_app, 0, sizeof(g_spdk_app)); + g_spdk_app.config = config; + g_spdk_app.json_config_file = opts->json_config_file; + g_spdk_app.json_config_ignore_errors = opts->json_config_ignore_errors; + g_spdk_app.rpc_addr = opts->rpc_addr; + g_spdk_app.shm_id = opts->shm_id; + g_spdk_app.shutdown_cb = opts->shutdown_cb; + g_spdk_app.rc = 0; + + spdk_log_set_level(SPDK_APP_DEFAULT_LOG_LEVEL); + + /* Pass NULL to app_setup_env if SPDK app has been set up, in order to + * indicate that this is a reinitialization. + */ + if (app_setup_env(g_env_was_setup ? NULL : opts) < 0) { + return 1; + } + + spdk_log_open(opts->log); + SPDK_NOTICELOG("Total cores available: %d\n", spdk_env_get_core_count()); + + /* + * If mask not specified on command line or in configuration file, + * reactor_mask will be 0x1 which will enable core 0 to run one + * reactor. + */ + if ((rc = spdk_reactors_init()) != 0) { + SPDK_ERRLOG("Reactor Initilization failed: rc = %d\n", rc); + return 1; + } + + spdk_cpuset_set_cpu(&tmp_cpumask, spdk_env_get_current_core(), true); + + /* Now that the reactors have been initialized, we can create an + * initialization thread. */ + g_app_thread = spdk_thread_create("app_thread", &tmp_cpumask); + if (!g_app_thread) { + SPDK_ERRLOG("Unable to create an spdk_thread for initialization\n"); + return 1; + } + + /* + * Note the call to app_setup_trace() is located here + * ahead of app_setup_signal_handlers(). + * That's because there is not an easy/direct clean + * way of unwinding alloc'd resources that can occur + * in app_setup_signal_handlers(). 
+ */ + if (app_setup_trace(opts) != 0) { + return 1; + } + + if ((rc = app_setup_signal_handlers(opts)) != 0) { + return 1; + } + + g_delay_subsystem_init = opts->delay_subsystem_init; + g_start_fn = start_fn; + g_start_arg = arg1; + + spdk_thread_send_msg(g_app_thread, bootstrap_fn, NULL); + + /* This blocks until spdk_app_stop is called */ + spdk_reactors_start(); + + g_env_was_setup = true; + + return g_spdk_app.rc; +} + +void +spdk_app_fini(void) +{ + spdk_trace_cleanup(); + spdk_reactors_fini(); + spdk_env_fini(); + spdk_conf_free(g_spdk_app.config); + spdk_log_close(); +} + +static void +app_stop(void *arg1) +{ + spdk_rpc_finish(); + spdk_subsystem_fini(spdk_reactors_stop, NULL); +} + +void +spdk_app_stop(int rc) +{ + if (rc) { + SPDK_WARNLOG("spdk_app_stop'd on non-zero\n"); + } + g_spdk_app.rc = rc; + /* + * We want to run spdk_subsystem_fini() from the same thread where spdk_subsystem_init() + * was called. + */ + spdk_thread_send_msg(g_app_thread, app_stop, NULL); +} + +static void +usage(void (*app_usage)(void)) +{ + printf("%s [options]\n", g_executable_name); + printf("options:\n"); + printf(" -c, --config <config> config file (default %s)\n", + g_default_opts.config_file != NULL ? g_default_opts.config_file : "none"); + printf(" --json <config> JSON config file (default %s)\n", + g_default_opts.json_config_file != NULL ? g_default_opts.json_config_file : "none"); + printf(" --json-ignore-init-errors\n"); + printf(" don't exit on invalid config entry\n"); + printf(" -d, --limit-coredump do not set max coredump size to RLIM_INFINITY\n"); + printf(" -g, --single-file-segments\n"); + printf(" force creating just one hugetlbfs file\n"); + printf(" -h, --help show this usage\n"); + printf(" -i, --shm-id <id> shared memory ID (optional)\n"); + printf(" -m, --cpumask <mask> core mask for DPDK\n"); + printf(" -n, --mem-channels <num> channel number of memory channels used for DPDK\n"); + printf(" -p, --master-core <id> master (primary) core for DPDK\n"); + printf(" -r, --rpc-socket <path> RPC listen address (default %s)\n", SPDK_DEFAULT_RPC_ADDR); + printf(" -s, --mem-size <size> memory size in MB for DPDK (default: "); +#ifndef __linux__ + if (g_default_opts.mem_size <= 0) { + printf("all hugepage memory)\n"); + } else +#endif + { + printf("%dMB)\n", g_default_opts.mem_size >= 0 ? g_default_opts.mem_size : 0); + } + printf(" --silence-noticelog disable notice level logging to stderr\n"); + printf(" -u, --no-pci disable PCI access\n"); + printf(" --wait-for-rpc wait for RPCs to initialize subsystems\n"); + printf(" --max-delay <num> maximum reactor delay (in microseconds)\n"); + printf(" -B, --pci-blacklist <bdf>\n"); + printf(" pci addr to blacklist (can be used more than once)\n"); + printf(" -R, --huge-unlink unlink huge files after initialization\n"); + printf(" -v, --version print SPDK version\n"); + printf(" -W, --pci-whitelist <bdf>\n"); + printf(" pci addr to whitelist (-B and -W cannot be used at the same time)\n"); + printf(" --huge-dir <path> use a specific hugetlbfs mount to reserve memory from\n"); + printf(" --iova-mode <pa/va> set IOVA mode ('pa' for IOVA_PA and 'va' for IOVA_VA)\n"); + printf(" --base-virtaddr <addr> the base virtual address for DPDK (default: 0x200000000000)\n"); + printf(" --num-trace-entries <num> number of trace entries for each core, must be power of 2. 
(default %d)\n", + SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES); + spdk_log_usage(stdout, "-L"); + spdk_trace_mask_usage(stdout, "-e"); + if (app_usage) { + app_usage(); + } +} + +spdk_app_parse_args_rvals_t +spdk_app_parse_args(int argc, char **argv, struct spdk_app_opts *opts, + const char *app_getopt_str, struct option *app_long_opts, + int (*app_parse)(int ch, char *arg), + void (*app_usage)(void)) +{ + int ch, rc, opt_idx, global_long_opts_len, app_long_opts_len; + struct option *cmdline_options; + char *cmdline_short_opts = NULL; + enum spdk_app_parse_args_rvals retval = SPDK_APP_PARSE_ARGS_FAIL; + long int tmp; + + memcpy(&g_default_opts, opts, sizeof(g_default_opts)); + + if (opts->config_file && access(opts->config_file, R_OK) != 0) { + SPDK_WARNLOG("Can't read legacy configuration file '%s'\n", opts->config_file); + opts->config_file = NULL; + } + + if (opts->json_config_file && access(opts->json_config_file, R_OK) != 0) { + SPDK_WARNLOG("Can't read JSON configuration file '%s'\n", opts->json_config_file); + opts->json_config_file = NULL; + } + + if (app_long_opts == NULL) { + app_long_opts_len = 0; + } else { + for (app_long_opts_len = 0; + app_long_opts[app_long_opts_len].name != NULL; + app_long_opts_len++); + } + + global_long_opts_len = SPDK_COUNTOF(g_cmdline_options); + + cmdline_options = calloc(global_long_opts_len + app_long_opts_len + 1, sizeof(*cmdline_options)); + if (!cmdline_options) { + SPDK_ERRLOG("Out of memory\n"); + return SPDK_APP_PARSE_ARGS_FAIL; + } + + memcpy(&cmdline_options[0], g_cmdline_options, sizeof(g_cmdline_options)); + if (app_long_opts) { + memcpy(&cmdline_options[global_long_opts_len], app_long_opts, + app_long_opts_len * sizeof(*app_long_opts)); + } + + if (app_getopt_str != NULL) { + ch = app_opts_validate(app_getopt_str); + if (ch) { + SPDK_ERRLOG("Duplicated option '%c' between the generic and application specific spdk opts.\n", + ch); + goto out; + } + } + + cmdline_short_opts = spdk_sprintf_alloc("%s%s", app_getopt_str, SPDK_APP_GETOPT_STRING); + if (!cmdline_short_opts) { + SPDK_ERRLOG("Out of memory\n"); + goto out; + } + + g_executable_name = argv[0]; + + while ((ch = getopt_long(argc, argv, cmdline_short_opts, cmdline_options, &opt_idx)) != -1) { + switch (ch) { + case CONFIG_FILE_OPT_IDX: + opts->config_file = optarg; + break; + case JSON_CONFIG_OPT_IDX: + opts->json_config_file = optarg; + break; + case JSON_CONFIG_IGNORE_INIT_ERRORS_IDX: + opts->json_config_ignore_errors = true; + break; + case LIMIT_COREDUMP_OPT_IDX: + opts->enable_coredump = false; + break; + case TPOINT_GROUP_MASK_OPT_IDX: + opts->tpoint_group_mask = optarg; + break; + case SINGLE_FILE_SEGMENTS_OPT_IDX: + opts->hugepage_single_segments = true; + break; + case HELP_OPT_IDX: + usage(app_usage); + retval = SPDK_APP_PARSE_ARGS_HELP; + goto out; + case SHM_ID_OPT_IDX: + opts->shm_id = spdk_strtol(optarg, 0); + if (opts->shm_id < 0) { + SPDK_ERRLOG("Invalid shared memory ID %s\n", optarg); + goto out; + } + break; + case CPUMASK_OPT_IDX: + opts->reactor_mask = optarg; + break; + case MEM_CHANNELS_OPT_IDX: + opts->mem_channel = spdk_strtol(optarg, 0); + if (opts->mem_channel < 0) { + SPDK_ERRLOG("Invalid memory channel %s\n", optarg); + goto out; + } + break; + case MASTER_CORE_OPT_IDX: + opts->master_core = spdk_strtol(optarg, 0); + if (opts->master_core < 0) { + SPDK_ERRLOG("Invalid master core %s\n", optarg); + goto out; + } + break; + case SILENCE_NOTICELOG_OPT_IDX: + opts->print_level = SPDK_LOG_WARN; + break; + case RPC_SOCKET_OPT_IDX: + opts->rpc_addr = optarg; + break; + 
case MEM_SIZE_OPT_IDX: { + uint64_t mem_size_mb; + bool mem_size_has_prefix; + + rc = spdk_parse_capacity(optarg, &mem_size_mb, &mem_size_has_prefix); + if (rc != 0) { + SPDK_ERRLOG("invalid memory pool size `-s %s`\n", optarg); + usage(app_usage); + goto out; + } + + if (mem_size_has_prefix) { + /* the mem size is in MB by default, so if a prefix was + * specified, we need to manually convert to MB. + */ + mem_size_mb /= 1024 * 1024; + } + + if (mem_size_mb > INT_MAX) { + SPDK_ERRLOG("invalid memory pool size `-s %s`\n", optarg); + usage(app_usage); + goto out; + } + + opts->mem_size = (int) mem_size_mb; + break; + } + case NO_PCI_OPT_IDX: + opts->no_pci = true; + break; + case WAIT_FOR_RPC_OPT_IDX: + opts->delay_subsystem_init = true; + break; + case PCI_BLACKLIST_OPT_IDX: + if (opts->pci_whitelist) { + free(opts->pci_whitelist); + opts->pci_whitelist = NULL; + SPDK_ERRLOG("-B and -W cannot be used at the same time\n"); + usage(app_usage); + goto out; + } + + rc = app_opts_add_pci_addr(opts, &opts->pci_blacklist, optarg); + if (rc != 0) { + free(opts->pci_blacklist); + opts->pci_blacklist = NULL; + goto out; + } + break; + case LOGFLAG_OPT_IDX: +#ifndef DEBUG + SPDK_ERRLOG("%s must be configured with --enable-debug for -L flag\n", + argv[0]); + usage(app_usage); + goto out; +#else + rc = spdk_log_set_flag(optarg); + if (rc < 0) { + SPDK_ERRLOG("unknown flag\n"); + usage(app_usage); + goto out; + } + opts->print_level = SPDK_LOG_DEBUG; + break; +#endif + case HUGE_UNLINK_OPT_IDX: + opts->unlink_hugepage = true; + break; + case PCI_WHITELIST_OPT_IDX: + if (opts->pci_blacklist) { + free(opts->pci_blacklist); + opts->pci_blacklist = NULL; + SPDK_ERRLOG("-B and -W cannot be used at the same time\n"); + usage(app_usage); + goto out; + } + + rc = app_opts_add_pci_addr(opts, &opts->pci_whitelist, optarg); + if (rc != 0) { + free(opts->pci_whitelist); + opts->pci_whitelist = NULL; + goto out; + } + break; + case BASE_VIRTADDR_OPT_IDX: + tmp = spdk_strtoll(optarg, 0); + if (tmp <= 0) { + SPDK_ERRLOG("Invalid base-virtaddr %s\n", optarg); + usage(app_usage); + goto out; + } + opts->base_virtaddr = (uint64_t)tmp; + break; + case HUGE_DIR_OPT_IDX: + opts->hugedir = optarg; + break; + case IOVA_MODE_OPT_IDX: + opts->iova_mode = optarg; + break; + case NUM_TRACE_ENTRIES_OPT_IDX: + tmp = spdk_strtoll(optarg, 0); + if (tmp <= 0) { + SPDK_ERRLOG("Invalid num-trace-entries %s\n", optarg); + usage(app_usage); + goto out; + } + opts->num_entries = (uint64_t)tmp; + if (opts->num_entries & (opts->num_entries - 1)) { + SPDK_ERRLOG("num-trace-entries must be power of 2\n"); + usage(app_usage); + goto out; + } + break; + case MAX_REACTOR_DELAY_OPT_IDX: + SPDK_ERRLOG("Deprecation warning: The maximum allowed latency parameter is no longer supported.\n"); + break; + case VERSION_OPT_IDX: + printf(SPDK_VERSION_STRING"\n"); + retval = SPDK_APP_PARSE_ARGS_HELP; + goto out; + case '?': + /* + * In the event getopt() above detects an option + * in argv that is NOT in the getopt_str, + * getopt() will return a '?' indicating failure. 
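+	 * For example, an unrecognized flag such as --no-such-option lands here:
+	 * the usage text is printed and parsing fails, since retval still holds
+	 * its initial value of SPDK_APP_PARSE_ARGS_FAIL when we jump to out.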
+ */ + usage(app_usage); + goto out; + default: + rc = app_parse(ch, optarg); + if (rc) { + SPDK_ERRLOG("Parsing application specific arguments failed: %d\n", rc); + goto out; + } + } + } + + if (opts->config_file && opts->json_config_file) { + SPDK_ERRLOG("ERROR: Legacy config and JSON config can't be used together.\n"); + goto out; + } + + if (opts->json_config_file && opts->delay_subsystem_init) { + SPDK_ERRLOG("ERROR: JSON configuration file can't be used together with --wait-for-rpc.\n"); + goto out; + } + + /* TBD: Replace warning by failure when RPCs for startup are prepared. */ + if (opts->config_file && opts->delay_subsystem_init) { + fprintf(stderr, + "WARNING: --wait-for-rpc and config file are used at the same time. " + "- Please be careful one options might overwrite others.\n"); + } + + retval = SPDK_APP_PARSE_ARGS_SUCCESS; +out: + if (retval != SPDK_APP_PARSE_ARGS_SUCCESS) { + free(opts->pci_blacklist); + opts->pci_blacklist = NULL; + free(opts->pci_whitelist); + opts->pci_whitelist = NULL; + } + free(cmdline_short_opts); + free(cmdline_options); + return retval; +} + +void +spdk_app_usage(void) +{ + if (g_executable_name == NULL) { + SPDK_ERRLOG("%s not valid before calling spdk_app_parse_args()\n", __func__); + return; + } + + usage(NULL); +} + +static void +rpc_framework_start_init_cpl(int rc, void *arg1) +{ + struct spdk_jsonrpc_request *request = arg1; + struct spdk_json_write_ctx *w; + + assert(spdk_get_thread() == g_app_thread); + + if (rc) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "framework_initialization failed"); + return; + } + + spdk_rpc_set_state(SPDK_RPC_RUNTIME); + app_start_application(); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_framework_start_init(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "framework_start_init requires no parameters"); + return; + } + + spdk_subsystem_init(rpc_framework_start_init_cpl, request); +} +SPDK_RPC_REGISTER("framework_start_init", rpc_framework_start_init, SPDK_RPC_STARTUP) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(framework_start_init, start_subsystem_init) + +struct subsystem_init_poller_ctx { + struct spdk_poller *init_poller; + struct spdk_jsonrpc_request *request; +}; + +static int +rpc_subsystem_init_poller_ctx(void *ctx) +{ + struct spdk_json_write_ctx *w; + struct subsystem_init_poller_ctx *poller_ctx = ctx; + + if (spdk_rpc_get_state() == SPDK_RPC_RUNTIME) { + w = spdk_jsonrpc_begin_result(poller_ctx->request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(poller_ctx->request, w); + spdk_poller_unregister(&poller_ctx->init_poller); + free(poller_ctx); + } + + return SPDK_POLLER_BUSY; +} + +static void +rpc_framework_wait_init(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct subsystem_init_poller_ctx *ctx; + + if (spdk_rpc_get_state() == SPDK_RPC_RUNTIME) { + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } else { + ctx = malloc(sizeof(struct subsystem_init_poller_ctx)); + if (ctx == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to allocate memory for the request context\n"); + return; + } + ctx->request = request; + ctx->init_poller = 
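+	/*
+	 * Illustrative JSON-RPC 2.0 exchange for the two methods registered in this
+	 * file (the payloads shown are examples, not tied to any particular client):
+	 *
+	 *     -> {"jsonrpc": "2.0", "id": 1, "method": "framework_start_init"}
+	 *     <- {"jsonrpc": "2.0", "id": 1, "result": true}
+	 *
+	 *     -> {"jsonrpc": "2.0", "id": 2, "method": "framework_wait_init"}
+	 *     <- {"jsonrpc": "2.0", "id": 2, "result": true}
+	 *
+	 * framework_start_init accepts no parameters and only answers once subsystem
+	 * initialization has finished; framework_wait_init may be called in either
+	 * RPC state and, if initialization is still in progress, defers its response
+	 * via the poller registered just below until the state reaches SPDK_RPC_RUNTIME.
+	 */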
SPDK_POLLER_REGISTER(rpc_subsystem_init_poller_ctx, ctx, 0); + } +} +SPDK_RPC_REGISTER("framework_wait_init", rpc_framework_wait_init, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(framework_wait_init, wait_subsystem_init) diff --git a/src/spdk/lib/event/json_config.c b/src/spdk/lib/event/json_config.c new file mode 100644 index 000000000..69a95097a --- /dev/null +++ b/src/spdk/lib/event/json_config.c @@ -0,0 +1,630 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/util.h" +#include "spdk/file.h" +#include "spdk/log.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/jsonrpc.h" +#include "spdk/rpc.h" + +#include "spdk_internal/event.h" +#include "spdk_internal/log.h" + +#define SPDK_DEBUG_APP_CFG(...) SPDK_DEBUGLOG(SPDK_LOG_APP_CONFIG, __VA_ARGS__) + +/* JSON configuration format is as follows + * + * { + * "subsystems" : [ <<== *subsystems JSON array + * { <<== *subsystems_it array entry pointer (iterator) + * "subsystem": "<< SUBSYSTEM NAME >>", + * "config": [ <<== *config JSON array + * { <<== *config_it array entry pointer (iterator) + * "method": "<< METHOD NAME >>", <<== *method + * "params": { << PARAMS >> } <<== *params + * }, + * << MORE "config" ARRY ENTRIES >> + * ] + * }, + * << MORE "subsystems" ARRAY ENTRIES >> + * ] + * + * << ANYTHING ELSE IS IGNORRED IN ROOT OBJECT>> + * } + * + */ + +struct load_json_config_ctx; +typedef void (*client_resp_handler)(struct load_json_config_ctx *, + struct spdk_jsonrpc_client_response *); + +#define RPC_SOCKET_PATH_MAX sizeof(((struct sockaddr_un *)0)->sun_path) + +/* 1s connections timeout */ +#define RPC_CLIENT_CONNECT_TIMEOUT_US (1U * 1000U * 1000U) + +/* + * Currently there is no timeout in SPDK for any RPC command. This result that + * we can't put a hard limit during configuration load as it most likely randomly fail. + * So just print WARNLOG every 10s. 
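+ * Each outstanding request's deadline is tracked in TSC ticks; see
+ * rpc_client_set_timeout() below, which computes roughly
+ *   deadline = spdk_get_ticks() + timeout_us * spdk_get_ticks_hz() / 1000000.
+ * While waiting for a response, rpc_client_check_timeout() only emits the
+ * warning and the poller re-arms the deadline instead of failing the load.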
*/ +#define RPC_CLIENT_REQUEST_TIMEOUT_US (10U * 1000 * 1000) + +struct load_json_config_ctx { + /* Thread used during configuration. */ + struct spdk_thread *thread; + spdk_subsystem_init_fn cb_fn; + void *cb_arg; + bool stop_on_error; + + /* Current subsystem */ + struct spdk_json_val *subsystems; /* "subsystems" array */ + struct spdk_json_val *subsystems_it; /* current subsystem array position in "subsystems" array */ + + struct spdk_json_val *subsystem_name; /* current subsystem name */ + + /* Current "config" entry we are processing */ + struct spdk_json_val *config; /* "config" array */ + struct spdk_json_val *config_it; /* current config position in "config" array */ + + /* Current request id we are sending. */ + uint32_t rpc_request_id; + + /* Whole configuration file read and parsed. */ + size_t json_data_size; + char *json_data; + + size_t values_cnt; + struct spdk_json_val *values; + + char rpc_socket_path_temp[RPC_SOCKET_PATH_MAX + 1]; + + struct spdk_jsonrpc_client *client_conn; + struct spdk_poller *client_conn_poller; + + client_resp_handler client_resp_cb; + + /* Timeout for current RPC client action. */ + uint64_t timeout; +}; + +static void app_json_config_load_subsystem(void *_ctx); + +static void +app_json_config_load_done(struct load_json_config_ctx *ctx, int rc) +{ + spdk_poller_unregister(&ctx->client_conn_poller); + if (ctx->client_conn != NULL) { + spdk_jsonrpc_client_close(ctx->client_conn); + } + + spdk_rpc_finish(); + + SPDK_DEBUG_APP_CFG("Config load finished with rc %d\n", rc); + ctx->cb_fn(rc, ctx->cb_arg); + + free(ctx->json_data); + free(ctx->values); + free(ctx); +} + +static void +rpc_client_set_timeout(struct load_json_config_ctx *ctx, uint64_t timeout_us) +{ + ctx->timeout = spdk_get_ticks() + timeout_us * spdk_get_ticks_hz() / (1000 * 1000); +} + +static int +rpc_client_check_timeout(struct load_json_config_ctx *ctx) +{ + if (ctx->timeout < spdk_get_ticks()) { + SPDK_WARNLOG("RPC client command timeout.\n"); + return -ETIMEDOUT; + } + + return 0; +} + +struct json_write_buf { + char data[1024]; + unsigned cur_off; +}; + +static int +json_write_stdout(void *cb_ctx, const void *data, size_t size) +{ + struct json_write_buf *buf = cb_ctx; + size_t rc; + + rc = snprintf(buf->data + buf->cur_off, sizeof(buf->data) - buf->cur_off, + "%s", (const char *)data); + if (rc > 0) { + buf->cur_off += rc; + } + return rc == size ? 
0 : -1; +} + +static int +rpc_client_poller(void *arg) +{ + struct load_json_config_ctx *ctx = arg; + struct spdk_jsonrpc_client_response *resp; + client_resp_handler cb; + int rc; + + assert(spdk_get_thread() == ctx->thread); + + rc = spdk_jsonrpc_client_poll(ctx->client_conn, 0); + if (rc == 0) { + rc = rpc_client_check_timeout(ctx); + if (rc == -ETIMEDOUT) { + rpc_client_set_timeout(ctx, RPC_CLIENT_REQUEST_TIMEOUT_US); + rc = 0; + } + } + + if (rc == 0) { + /* No response yet */ + return SPDK_POLLER_BUSY; + } else if (rc < 0) { + app_json_config_load_done(ctx, rc); + return SPDK_POLLER_BUSY; + } + + resp = spdk_jsonrpc_client_get_response(ctx->client_conn); + assert(resp); + + if (resp->error) { + struct json_write_buf buf = {}; + struct spdk_json_write_ctx *w = spdk_json_write_begin(json_write_stdout, + &buf, SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); + + if (w == NULL) { + SPDK_ERRLOG("error response: (?)\n"); + } else { + spdk_json_write_val(w, resp->error); + spdk_json_write_end(w); + SPDK_ERRLOG("error response: \n%s\n", buf.data); + } + } + + if (resp->error && ctx->stop_on_error) { + spdk_jsonrpc_client_free_response(resp); + app_json_config_load_done(ctx, -EINVAL); + } else { + /* We have response so we must have callback for it. */ + cb = ctx->client_resp_cb; + assert(cb != NULL); + + /* Mark we are done with this handler. */ + ctx->client_resp_cb = NULL; + cb(ctx, resp); + } + + + return SPDK_POLLER_BUSY; +} + +static int +rpc_client_connect_poller(void *_ctx) +{ + struct load_json_config_ctx *ctx = _ctx; + int rc; + + rc = spdk_jsonrpc_client_poll(ctx->client_conn, 0); + if (rc != -ENOTCONN) { + /* We are connected. Start regular poller and issue first request */ + spdk_poller_unregister(&ctx->client_conn_poller); + ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_poller, ctx, 100); + app_json_config_load_subsystem(ctx); + } else { + rc = rpc_client_check_timeout(ctx); + if (rc) { + app_json_config_load_done(ctx, rc); + } + + return SPDK_POLLER_IDLE; + } + + return SPDK_POLLER_BUSY; +} + +static int +client_send_request(struct load_json_config_ctx *ctx, struct spdk_jsonrpc_client_request *request, + client_resp_handler client_resp_cb) +{ + int rc; + + assert(spdk_get_thread() == ctx->thread); + + ctx->client_resp_cb = client_resp_cb; + rpc_client_set_timeout(ctx, RPC_CLIENT_REQUEST_TIMEOUT_US); + rc = spdk_jsonrpc_client_send_request(ctx->client_conn, request); + + if (rc) { + SPDK_DEBUG_APP_CFG("Sending request to client failed (%d)\n", rc); + } + + return rc; +} + +static int +cap_string(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (val->type != SPDK_JSON_VAL_STRING) { + return -EINVAL; + } + + *vptr = val; + return 0; +} + +static int +cap_object(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (val->type != SPDK_JSON_VAL_OBJECT_BEGIN) { + return -EINVAL; + } + + *vptr = val; + return 0; +} + + +static int +cap_array_or_null(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (val->type != SPDK_JSON_VAL_ARRAY_BEGIN && val->type != SPDK_JSON_VAL_NULL) { + return -EINVAL; + } + + *vptr = val; + return 0; +} + +struct config_entry { + char *method; + struct spdk_json_val *params; +}; + +static struct spdk_json_object_decoder jsonrpc_cmd_decoders[] = { + {"method", offsetof(struct config_entry, method), spdk_json_decode_string}, + {"params", offsetof(struct config_entry, params), cap_object, true} +}; + +static void 
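+/*
+ * For illustration, given a "config" array entry such as
+ *
+ *     { "method": "bdev_malloc_create", "params": { "name": "Malloc0", ... } }
+ *
+ * spdk_json_decode_object() with the table above leaves cfg.method pointing at
+ * a heap-allocated copy of the method name (freed after the entry is handled)
+ * and cfg.params pointing at the OBJECT_BEGIN token of "params" inside the
+ * parsed value array; "params" is marked optional in the decoder table, so it
+ * may be absent. The method name here is only an example.
+ */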
app_json_config_load_subsystem_config_entry(void *_ctx); + +static void +app_json_config_load_subsystem_config_entry_next(struct load_json_config_ctx *ctx, + struct spdk_jsonrpc_client_response *resp) +{ + /* Don't care about the response */ + spdk_jsonrpc_client_free_response(resp); + + ctx->config_it = spdk_json_next(ctx->config_it); + app_json_config_load_subsystem_config_entry(ctx); +} + +/* Load "config" entry */ +static void +app_json_config_load_subsystem_config_entry(void *_ctx) +{ + struct load_json_config_ctx *ctx = _ctx; + struct spdk_jsonrpc_client_request *rpc_request; + struct spdk_json_write_ctx *w; + struct config_entry cfg = {}; + struct spdk_json_val *params_end; + size_t params_len; + int rc; + + if (ctx->config_it == NULL) { + SPDK_DEBUG_APP_CFG("Subsystem '%.*s': configuration done.\n", ctx->subsystem_name->len, + (char *)ctx->subsystem_name->start); + ctx->subsystems_it = spdk_json_next(ctx->subsystems_it); + /* Invoke later to avoid recurrency */ + spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem, ctx); + return; + } + + if (spdk_json_decode_object(ctx->config_it, jsonrpc_cmd_decoders, + SPDK_COUNTOF(jsonrpc_cmd_decoders), &cfg)) { + params_end = spdk_json_next(ctx->config_it); + assert(params_end != NULL); + params_len = params_end->start - ctx->config->start + 1; + SPDK_ERRLOG("Failed to decode config entry: %.*s!\n", (int)params_len, (char *)ctx->config_it); + app_json_config_load_done(ctx, -EINVAL); + goto out; + } + + rc = spdk_rpc_is_method_allowed(cfg.method, spdk_rpc_get_state()); + if (rc == -EPERM) { + SPDK_DEBUG_APP_CFG("Method '%s' not allowed -> skipping\n", cfg.method); + /* Invoke later to avoid recurrency */ + ctx->config_it = spdk_json_next(ctx->config_it); + spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem_config_entry, ctx); + goto out; + } + + /* Get _END by skipping params and going back by one element. */ + params_end = cfg.params + spdk_json_val_len(cfg.params) - 1; + + /* Need to add one character to include '}' */ + params_len = params_end->start - cfg.params->start + 1; + + SPDK_DEBUG_APP_CFG("\tmethod: %s\n", cfg.method); + SPDK_DEBUG_APP_CFG("\tparams: %.*s\n", (int)params_len, (char *)cfg.params->start); + + rpc_request = spdk_jsonrpc_client_create_request(); + if (!rpc_request) { + app_json_config_load_done(ctx, -errno); + goto out; + } + + w = spdk_jsonrpc_begin_request(rpc_request, ctx->rpc_request_id, NULL); + if (!w) { + spdk_jsonrpc_client_free_request(rpc_request); + app_json_config_load_done(ctx, -ENOMEM); + goto out; + } + + spdk_json_write_named_string(w, "method", cfg.method); + + /* No need to parse "params". Just dump the whole content of "params" + * directly into the request and let the remote side verify it. */ + spdk_json_write_name(w, "params"); + spdk_json_write_val_raw(w, cfg.params->start, params_len); + spdk_jsonrpc_end_request(rpc_request, w); + + rc = client_send_request(ctx, rpc_request, app_json_config_load_subsystem_config_entry_next); + if (rc != 0) { + app_json_config_load_done(ctx, -rc); + goto out; + } +out: + free(cfg.method); +} + +static void +subsystem_init_done(int rc, void *arg1) +{ + struct load_json_config_ctx *ctx = arg1; + + if (rc) { + app_json_config_load_done(ctx, rc); + return; + } + + spdk_rpc_set_state(SPDK_RPC_RUNTIME); + /* Another round. 
This time for RUNTIME methods */ + SPDK_DEBUG_APP_CFG("'framework_start_init' done - continuing configuration\n"); + + assert(ctx != NULL); + if (ctx->subsystems) { + ctx->subsystems_it = spdk_json_array_first(ctx->subsystems); + } + + app_json_config_load_subsystem(ctx); +} + +static struct spdk_json_object_decoder subsystem_decoders[] = { + {"subsystem", offsetof(struct load_json_config_ctx, subsystem_name), cap_string}, + {"config", offsetof(struct load_json_config_ctx, config), cap_array_or_null} +}; + +/* + * Start loading subsystem pointed by ctx->subsystems_it. This must point to the + * beginning of the "subsystem" object in "subsystems" array or be NULL. If it is + * NULL then no more subsystems to load. + * + * There are two iterations: + * + * In first iteration only STARTUP RPC methods are used, other methods are ignored. When + * allsubsystems are walked the ctx->subsystems_it became NULL and "framework_start_init" + * is called to let the SPDK move to RUNTIME state (initialize all subsystems) and + * second iteration begins. + * + * In second iteration "subsystems" array is walked through again, this time only + * RUNTIME RPC methods are used. When ctx->subsystems_it became NULL second time it + * indicate that there is no more subsystems to load. The cb_fn is called to finish + * configuration. + */ +static void +app_json_config_load_subsystem(void *_ctx) +{ + struct load_json_config_ctx *ctx = _ctx; + + if (ctx->subsystems_it == NULL) { + if (spdk_rpc_get_state() == SPDK_RPC_STARTUP) { + SPDK_DEBUG_APP_CFG("No more entries for current state, calling 'framework_start_init'\n"); + spdk_subsystem_init(subsystem_init_done, ctx); + } else { + app_json_config_load_done(ctx, 0); + } + + return; + } + + /* Capture subsystem name and config array */ + if (spdk_json_decode_object(ctx->subsystems_it, subsystem_decoders, + SPDK_COUNTOF(subsystem_decoders), ctx)) { + SPDK_ERRLOG("Failed to parse subsystem configuration\n"); + app_json_config_load_done(ctx, -EINVAL); + return; + } + + SPDK_DEBUG_APP_CFG("Loading subsystem '%.*s' configuration\n", ctx->subsystem_name->len, + (char *)ctx->subsystem_name->start); + + /* Get 'config' array first configuration entry */ + ctx->config_it = spdk_json_array_first(ctx->config); + app_json_config_load_subsystem_config_entry(ctx); +} + +static void * +read_file(const char *filename, size_t *size) +{ + FILE *file = fopen(filename, "r"); + void *data; + + if (file == NULL) { + /* errno is set by fopen */ + return NULL; + } + + data = spdk_posix_file_load(file, size); + fclose(file); + return data; +} + +static int +app_json_config_read(const char *config_file, struct load_json_config_ctx *ctx) +{ + struct spdk_json_val *values = NULL; + void *json = NULL, *end; + ssize_t values_cnt, rc; + size_t json_size; + + json = read_file(config_file, &json_size); + if (!json) { + return -errno; + } + + rc = spdk_json_parse(json, json_size, NULL, 0, &end, + SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); + if (rc < 0) { + SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc); + goto err; + } + + values_cnt = rc; + values = calloc(values_cnt, sizeof(struct spdk_json_val)); + if (values == NULL) { + SPDK_ERRLOG("Out of memory\n"); + goto err; + } + + rc = spdk_json_parse(json, json_size, values, values_cnt, &end, + SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); + if (rc != values_cnt) { + SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc); + goto err; + } + + ctx->json_data = json; + ctx->json_data_size = json_size; + + ctx->values = values; + ctx->values_cnt = 
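+	/*
+	 * For reference, a minimal configuration file accepted by this loader might
+	 * look as follows (the method and its parameters are just an example):
+	 *
+	 *     {
+	 *       "subsystems": [
+	 *         {
+	 *           "subsystem": "bdev",
+	 *           "config": [
+	 *             {
+	 *               "method": "bdev_malloc_create",
+	 *               "params": { "name": "Malloc0", "num_blocks": 16384, "block_size": 512 }
+	 *             }
+	 *           ]
+	 *         }
+	 *       ]
+	 *     }
+	 *
+	 * Comments in the file are tolerated because it is parsed with
+	 * SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS.
+	 */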
values_cnt; + + return 0; +err: + free(json); + free(values); + return rc; +} + +void +spdk_app_json_config_load(const char *json_config_file, const char *rpc_addr, + spdk_subsystem_init_fn cb_fn, void *cb_arg, + bool stop_on_error) +{ + struct load_json_config_ctx *ctx = calloc(1, sizeof(*ctx)); + int rc; + + assert(cb_fn); + if (!ctx) { + cb_fn(-ENOMEM, cb_arg); + return; + } + + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->stop_on_error = stop_on_error; + ctx->thread = spdk_get_thread(); + + rc = app_json_config_read(json_config_file, ctx); + if (rc) { + goto fail; + } + + /* Capture subsystems array */ + rc = spdk_json_find_array(ctx->values, "subsystems", NULL, &ctx->subsystems); + if (rc) { + SPDK_WARNLOG("No 'subsystems' key JSON configuration file.\n"); + } else { + /* Get first subsystem */ + ctx->subsystems_it = spdk_json_array_first(ctx->subsystems); + if (ctx->subsystems_it == NULL) { + SPDK_NOTICELOG("'subsystems' configuration is empty\n"); + } + } + + /* If rpc_addr is not an Unix socket use default address as prefix. */ + if (rpc_addr == NULL || rpc_addr[0] != '/') { + rpc_addr = SPDK_DEFAULT_RPC_ADDR; + } + + /* FIXME: rpc client should use socketpair() instead of this temporary socket nonsense */ + rc = snprintf(ctx->rpc_socket_path_temp, sizeof(ctx->rpc_socket_path_temp), "%s.%d_config", + rpc_addr, getpid()); + if (rc >= (int)sizeof(ctx->rpc_socket_path_temp)) { + SPDK_ERRLOG("Socket name create failed\n"); + goto fail; + } + + /* FIXME: spdk_rpc_initialize() function should return error code. */ + spdk_rpc_initialize(ctx->rpc_socket_path_temp); + ctx->client_conn = spdk_jsonrpc_client_connect(ctx->rpc_socket_path_temp, AF_UNIX); + if (ctx->client_conn == NULL) { + SPDK_ERRLOG("Failed to connect to '%s'\n", ctx->rpc_socket_path_temp); + goto fail; + } + + rpc_client_set_timeout(ctx, RPC_CLIENT_CONNECT_TIMEOUT_US); + ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_connect_poller, ctx, 100); + return; + +fail: + app_json_config_load_done(ctx, -EINVAL); +} + +SPDK_LOG_REGISTER_COMPONENT("app_config", SPDK_LOG_APP_CONFIG) diff --git a/src/spdk/lib/event/reactor.c b/src/spdk/lib/event/reactor.c new file mode 100644 index 000000000..cda4a32b1 --- /dev/null +++ b/src/spdk/lib/event/reactor.c @@ -0,0 +1,664 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/likely.h" + +#include "spdk_internal/event.h" +#include "spdk_internal/log.h" +#include "spdk_internal/thread.h" + +#include "spdk/log.h" +#include "spdk/thread.h" +#include "spdk/env.h" +#include "spdk/util.h" + +#ifdef __linux__ +#include <sys/prctl.h> +#endif + +#ifdef __FreeBSD__ +#include <pthread_np.h> +#endif + +#define SPDK_EVENT_BATCH_SIZE 8 + +static struct spdk_reactor *g_reactors; +static struct spdk_cpuset g_reactor_core_mask; +static enum spdk_reactor_state g_reactor_state = SPDK_REACTOR_STATE_UNINITIALIZED; + +static bool g_framework_context_switch_monitor_enabled = true; + +static struct spdk_mempool *g_spdk_event_mempool = NULL; + +static void +reactor_construct(struct spdk_reactor *reactor, uint32_t lcore) +{ + reactor->lcore = lcore; + reactor->flags.is_valid = true; + + TAILQ_INIT(&reactor->threads); + reactor->thread_count = 0; + + reactor->events = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY); + assert(reactor->events != NULL); +} + +struct spdk_reactor * +spdk_reactor_get(uint32_t lcore) +{ + struct spdk_reactor *reactor; + + if (g_reactors == NULL) { + SPDK_WARNLOG("Called spdk_reactor_get() while the g_reactors array was NULL!\n"); + return NULL; + } + + reactor = &g_reactors[lcore]; + + if (reactor->flags.is_valid == false) { + return NULL; + } + + return reactor; +} + +static int reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op); +static bool reactor_thread_op_supported(enum spdk_thread_op op); + +int +spdk_reactors_init(void) +{ + int rc; + uint32_t i, last_core; + char mempool_name[32]; + + snprintf(mempool_name, sizeof(mempool_name), "evtpool_%d", getpid()); + g_spdk_event_mempool = spdk_mempool_create(mempool_name, + 262144 - 1, /* Power of 2 minus 1 is optimal for memory consumption */ + sizeof(struct spdk_event), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + + if (g_spdk_event_mempool == NULL) { + SPDK_ERRLOG("spdk_event_mempool creation failed\n"); + return -1; + } + + /* struct spdk_reactor must be aligned on 64 byte boundary */ + last_core = spdk_env_get_last_core(); + rc = posix_memalign((void **)&g_reactors, 64, + (last_core + 1) * sizeof(struct spdk_reactor)); + if (rc != 0) { + SPDK_ERRLOG("Could not allocate array size=%u for g_reactors\n", + last_core + 1); + spdk_mempool_free(g_spdk_event_mempool); + return -1; + } + + memset(g_reactors, 0, (last_core + 1) * sizeof(struct spdk_reactor)); + + spdk_thread_lib_init_ext(reactor_thread_op, reactor_thread_op_supported, + sizeof(struct spdk_lw_thread)); + + SPDK_ENV_FOREACH_CORE(i) { + reactor_construct(&g_reactors[i], i); + } + + g_reactor_state = SPDK_REACTOR_STATE_INITIALIZED; + + return 0; +} + +void +spdk_reactors_fini(void) +{ + uint32_t i; + struct spdk_reactor *reactor; + + if (g_reactor_state == SPDK_REACTOR_STATE_UNINITIALIZED) { + return; + } + + spdk_thread_lib_fini(); + + SPDK_ENV_FOREACH_CORE(i) { + reactor = spdk_reactor_get(i); + assert(reactor != 
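+	/*
+	 * Sizing note (worked example): if spdk_env_get_last_core() returned 15,
+	 * spdk_reactors_init() allocated a 64-byte aligned array of 16 reactors
+	 * indexed directly by lcore number, including cores outside the app's core
+	 * mask; those unused slots keep flags.is_valid == false, so
+	 * spdk_reactor_get() returns NULL for them.
+	 */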
NULL); + assert(reactor->thread_count == 0); + if (reactor->events != NULL) { + spdk_ring_free(reactor->events); + } + } + + spdk_mempool_free(g_spdk_event_mempool); + + free(g_reactors); + g_reactors = NULL; +} + +struct spdk_event * +spdk_event_allocate(uint32_t lcore, spdk_event_fn fn, void *arg1, void *arg2) +{ + struct spdk_event *event = NULL; + struct spdk_reactor *reactor = spdk_reactor_get(lcore); + + if (!reactor) { + assert(false); + return NULL; + } + + event = spdk_mempool_get(g_spdk_event_mempool); + if (event == NULL) { + assert(false); + return NULL; + } + + event->lcore = lcore; + event->fn = fn; + event->arg1 = arg1; + event->arg2 = arg2; + + return event; +} + +void +spdk_event_call(struct spdk_event *event) +{ + int rc; + struct spdk_reactor *reactor; + + reactor = spdk_reactor_get(event->lcore); + + assert(reactor != NULL); + assert(reactor->events != NULL); + + rc = spdk_ring_enqueue(reactor->events, (void **)&event, 1, NULL); + if (rc != 1) { + assert(false); + } +} + +static inline uint32_t +event_queue_run_batch(struct spdk_reactor *reactor) +{ + unsigned count, i; + void *events[SPDK_EVENT_BATCH_SIZE]; + struct spdk_thread *thread; + struct spdk_lw_thread *lw_thread; + +#ifdef DEBUG + /* + * spdk_ring_dequeue() fills events and returns how many entries it wrote, + * so we will never actually read uninitialized data from events, but just to be sure + * (and to silence a static analyzer false positive), initialize the array to NULL pointers. + */ + memset(events, 0, sizeof(events)); +#endif + + count = spdk_ring_dequeue(reactor->events, events, SPDK_EVENT_BATCH_SIZE); + if (count == 0) { + return 0; + } + + /* Execute the events. There are still some remaining events + * that must occur on an SPDK thread. To accomodate those, try to + * run them on the first thread in the list, if it exists. */ + lw_thread = TAILQ_FIRST(&reactor->threads); + if (lw_thread) { + thread = spdk_thread_get_from_ctx(lw_thread); + } else { + thread = NULL; + } + + spdk_set_thread(thread); + + for (i = 0; i < count; i++) { + struct spdk_event *event = events[i]; + + assert(event != NULL); + event->fn(event->arg1, event->arg2); + } + + spdk_set_thread(NULL); + + spdk_mempool_put_bulk(g_spdk_event_mempool, events, count); + + return count; +} + +/* 1s */ +#define CONTEXT_SWITCH_MONITOR_PERIOD 1000000 + +static int +get_rusage(struct spdk_reactor *reactor) +{ + struct rusage rusage; + + if (getrusage(RUSAGE_THREAD, &rusage) != 0) { + return -1; + } + + if (rusage.ru_nvcsw != reactor->rusage.ru_nvcsw || rusage.ru_nivcsw != reactor->rusage.ru_nivcsw) { + SPDK_INFOLOG(SPDK_LOG_REACTOR, + "Reactor %d: %ld voluntary context switches and %ld involuntary context switches in the last second.\n", + reactor->lcore, rusage.ru_nvcsw - reactor->rusage.ru_nvcsw, + rusage.ru_nivcsw - reactor->rusage.ru_nivcsw); + } + reactor->rusage = rusage; + + return -1; +} + +void +spdk_framework_enable_context_switch_monitor(bool enable) +{ + /* This global is being read by multiple threads, so this isn't + * strictly thread safe. However, we're toggling between true and + * false here, and if a thread sees the value update later than it + * should, it's no big deal. 
*/ + g_framework_context_switch_monitor_enabled = enable; +} + +bool +spdk_framework_context_switch_monitor_enabled(void) +{ + return g_framework_context_switch_monitor_enabled; +} + +static void +_set_thread_name(const char *thread_name) +{ +#if defined(__linux__) + prctl(PR_SET_NAME, thread_name, 0, 0, 0); +#elif defined(__FreeBSD__) + pthread_set_name_np(pthread_self(), thread_name); +#else +#error missing platform support for thread name +#endif +} + +static int _reactor_schedule_thread(struct spdk_thread *thread); +static uint64_t g_rusage_period; + +static void +_reactor_run(struct spdk_reactor *reactor) +{ + struct spdk_thread *thread; + struct spdk_lw_thread *lw_thread, *tmp; + uint64_t now; + int rc; + + event_queue_run_batch(reactor); + + TAILQ_FOREACH_SAFE(lw_thread, &reactor->threads, link, tmp) { + thread = spdk_thread_get_from_ctx(lw_thread); + rc = spdk_thread_poll(thread, 0, reactor->tsc_last); + + now = spdk_thread_get_last_tsc(thread); + if (rc == 0) { + reactor->idle_tsc += now - reactor->tsc_last; + } else if (rc > 0) { + reactor->busy_tsc += now - reactor->tsc_last; + } + reactor->tsc_last = now; + + if (spdk_unlikely(lw_thread->resched)) { + lw_thread->resched = false; + TAILQ_REMOVE(&reactor->threads, lw_thread, link); + assert(reactor->thread_count > 0); + reactor->thread_count--; + _reactor_schedule_thread(thread); + continue; + } + + if (spdk_unlikely(spdk_thread_is_exited(thread) && + spdk_thread_is_idle(thread))) { + TAILQ_REMOVE(&reactor->threads, lw_thread, link); + assert(reactor->thread_count > 0); + reactor->thread_count--; + spdk_thread_destroy(thread); + continue; + } + } + + if (g_framework_context_switch_monitor_enabled) { + if ((reactor->last_rusage + g_rusage_period) < reactor->tsc_last) { + get_rusage(reactor); + reactor->last_rusage = reactor->tsc_last; + } + } +} + +static int +reactor_run(void *arg) +{ + struct spdk_reactor *reactor = arg; + struct spdk_thread *thread; + struct spdk_lw_thread *lw_thread, *tmp; + char thread_name[32]; + + SPDK_NOTICELOG("Reactor started on core %u\n", reactor->lcore); + + /* Rename the POSIX thread because the reactor is tied to the POSIX + * thread in the SPDK event library. 
+ */ + snprintf(thread_name, sizeof(thread_name), "reactor_%u", reactor->lcore); + _set_thread_name(thread_name); + + reactor->tsc_last = spdk_get_ticks(); + + while (1) { + _reactor_run(reactor); + + if (g_reactor_state != SPDK_REACTOR_STATE_RUNNING) { + break; + } + } + + TAILQ_FOREACH(lw_thread, &reactor->threads, link) { + thread = spdk_thread_get_from_ctx(lw_thread); + spdk_set_thread(thread); + spdk_thread_exit(thread); + } + + while (!TAILQ_EMPTY(&reactor->threads)) { + TAILQ_FOREACH_SAFE(lw_thread, &reactor->threads, link, tmp) { + thread = spdk_thread_get_from_ctx(lw_thread); + spdk_set_thread(thread); + if (spdk_thread_is_exited(thread)) { + TAILQ_REMOVE(&reactor->threads, lw_thread, link); + assert(reactor->thread_count > 0); + reactor->thread_count--; + spdk_thread_destroy(thread); + } else { + spdk_thread_poll(thread, 0, 0); + } + } + } + + return 0; +} + +int +spdk_app_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask) +{ + int ret; + struct spdk_cpuset *validmask; + + ret = spdk_cpuset_parse(cpumask, mask); + if (ret < 0) { + return ret; + } + + validmask = spdk_app_get_core_mask(); + spdk_cpuset_and(cpumask, validmask); + + return 0; +} + +struct spdk_cpuset * +spdk_app_get_core_mask(void) +{ + return &g_reactor_core_mask; +} + +void +spdk_reactors_start(void) +{ + struct spdk_reactor *reactor; + struct spdk_cpuset tmp_cpumask = {}; + uint32_t i, current_core; + int rc; + char thread_name[32]; + + g_rusage_period = (CONTEXT_SWITCH_MONITOR_PERIOD * spdk_get_ticks_hz()) / SPDK_SEC_TO_USEC; + g_reactor_state = SPDK_REACTOR_STATE_RUNNING; + + current_core = spdk_env_get_current_core(); + SPDK_ENV_FOREACH_CORE(i) { + if (i != current_core) { + reactor = spdk_reactor_get(i); + if (reactor == NULL) { + continue; + } + + rc = spdk_env_thread_launch_pinned(reactor->lcore, reactor_run, reactor); + if (rc < 0) { + SPDK_ERRLOG("Unable to start reactor thread on core %u\n", reactor->lcore); + assert(false); + return; + } + + /* For now, for each reactor spawn one thread. 
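+	 * The thread is pinned to this core through its cpumask:
+	 * spdk_thread_create() triggers reactor_thread_op(SPDK_THREAD_OP_NEW), which
+	 * runs _reactor_schedule_thread() and places the new thread on a reactor
+	 * whose core is set in that cpumask, here exactly core i.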
*/ + snprintf(thread_name, sizeof(thread_name), "reactor_%u", reactor->lcore); + + spdk_cpuset_zero(&tmp_cpumask); + spdk_cpuset_set_cpu(&tmp_cpumask, i, true); + + spdk_thread_create(thread_name, &tmp_cpumask); + } + spdk_cpuset_set_cpu(&g_reactor_core_mask, i, true); + } + + /* Start the master reactor */ + reactor = spdk_reactor_get(current_core); + assert(reactor != NULL); + reactor_run(reactor); + + spdk_env_thread_wait_all(); + + g_reactor_state = SPDK_REACTOR_STATE_SHUTDOWN; +} + +void +spdk_reactors_stop(void *arg1) +{ + g_reactor_state = SPDK_REACTOR_STATE_EXITING; +} + +static pthread_mutex_t g_scheduler_mtx = PTHREAD_MUTEX_INITIALIZER; +static uint32_t g_next_core = UINT32_MAX; + +static void +_schedule_thread(void *arg1, void *arg2) +{ + struct spdk_lw_thread *lw_thread = arg1; + struct spdk_thread *thread; + struct spdk_cpuset *cpumask; + struct spdk_reactor *reactor; + uint32_t current_core; + + current_core = spdk_env_get_current_core(); + + thread = spdk_thread_get_from_ctx(lw_thread); + cpumask = spdk_thread_get_cpumask(thread); + if (!spdk_cpuset_get_cpu(cpumask, current_core)) { + SPDK_ERRLOG("Thread was scheduled to the wrong core %d\n", current_core); + assert(false); + } + + reactor = spdk_reactor_get(current_core); + assert(reactor != NULL); + + TAILQ_INSERT_TAIL(&reactor->threads, lw_thread, link); + reactor->thread_count++; +} + +static int +_reactor_schedule_thread(struct spdk_thread *thread) +{ + uint32_t core; + struct spdk_lw_thread *lw_thread; + struct spdk_event *evt = NULL; + struct spdk_cpuset *cpumask; + uint32_t i; + + cpumask = spdk_thread_get_cpumask(thread); + + lw_thread = spdk_thread_get_ctx(thread); + assert(lw_thread != NULL); + memset(lw_thread, 0, sizeof(*lw_thread)); + + pthread_mutex_lock(&g_scheduler_mtx); + for (i = 0; i < spdk_env_get_core_count(); i++) { + if (g_next_core > spdk_env_get_last_core()) { + g_next_core = spdk_env_get_first_core(); + } + core = g_next_core; + g_next_core = spdk_env_get_next_core(g_next_core); + + if (spdk_cpuset_get_cpu(cpumask, core)) { + evt = spdk_event_allocate(core, _schedule_thread, lw_thread, NULL); + break; + } + } + pthread_mutex_unlock(&g_scheduler_mtx); + + assert(evt != NULL); + if (evt == NULL) { + SPDK_ERRLOG("Unable to schedule thread on requested core mask.\n"); + return -1; + } + + lw_thread->tsc_start = spdk_get_ticks(); + + spdk_event_call(evt); + + return 0; +} + +static void +_reactor_request_thread_reschedule(struct spdk_thread *thread) +{ + struct spdk_lw_thread *lw_thread; + + assert(thread == spdk_get_thread()); + + lw_thread = spdk_thread_get_ctx(thread); + + assert(lw_thread != NULL); + + lw_thread->resched = true; +} + +static int +reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op) +{ + switch (op) { + case SPDK_THREAD_OP_NEW: + return _reactor_schedule_thread(thread); + case SPDK_THREAD_OP_RESCHED: + _reactor_request_thread_reschedule(thread); + return 0; + default: + return -ENOTSUP; + } +} + +static bool +reactor_thread_op_supported(enum spdk_thread_op op) +{ + switch (op) { + case SPDK_THREAD_OP_NEW: + case SPDK_THREAD_OP_RESCHED: + return true; + default: + return false; + } +} + +struct call_reactor { + uint32_t cur_core; + spdk_event_fn fn; + void *arg1; + void *arg2; + + uint32_t orig_core; + spdk_event_fn cpl; +}; + +static void +on_reactor(void *arg1, void *arg2) +{ + struct call_reactor *cr = arg1; + struct spdk_event *evt; + + cr->fn(cr->arg1, cr->arg2); + + cr->cur_core = spdk_env_get_next_core(cr->cur_core); + + if (cr->cur_core > 
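+	/*
+	 * Sketch of the intended use of spdk_for_each_reactor() (defined below),
+	 * with hypothetical callbacks; fn runs once on every reactor in turn and
+	 * cpl runs back on the originating core after the last reactor:
+	 *
+	 *     static void say_hi(void *arg1, void *arg2)
+	 *     {
+	 *         printf("on core %u\n", spdk_env_get_current_core());
+	 *     }
+	 *
+	 *     static void all_done(void *arg1, void *arg2)
+	 *     {
+	 *         printf("visited every reactor\n");
+	 *     }
+	 *
+	 *     spdk_for_each_reactor(say_hi, NULL, NULL, all_done);
+	 */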
spdk_env_get_last_core()) { + SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "Completed reactor iteration\n"); + + evt = spdk_event_allocate(cr->orig_core, cr->cpl, cr->arg1, cr->arg2); + free(cr); + } else { + SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "Continuing reactor iteration to %d\n", + cr->cur_core); + + evt = spdk_event_allocate(cr->cur_core, on_reactor, arg1, NULL); + } + assert(evt != NULL); + spdk_event_call(evt); +} + +void +spdk_for_each_reactor(spdk_event_fn fn, void *arg1, void *arg2, spdk_event_fn cpl) +{ + struct call_reactor *cr; + struct spdk_event *evt; + + cr = calloc(1, sizeof(*cr)); + if (!cr) { + SPDK_ERRLOG("Unable to perform reactor iteration\n"); + cpl(arg1, arg2); + return; + } + + cr->fn = fn; + cr->arg1 = arg1; + cr->arg2 = arg2; + cr->cpl = cpl; + cr->orig_core = spdk_env_get_current_core(); + cr->cur_core = spdk_env_get_first_core(); + + SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "Starting reactor iteration from %d\n", cr->orig_core); + + evt = spdk_event_allocate(cr->cur_core, on_reactor, cr, NULL); + assert(evt != NULL); + + spdk_event_call(evt); +} + +SPDK_LOG_REGISTER_COMPONENT("reactor", SPDK_LOG_REACTOR) diff --git a/src/spdk/lib/event/rpc.c b/src/spdk/lib/event/rpc.c new file mode 100644 index 000000000..a42d5ebeb --- /dev/null +++ b/src/spdk/lib/event/rpc.c @@ -0,0 +1,87 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/log.h" +#include "spdk/rpc.h" + +#include "spdk_internal/event.h" + +#define RPC_SELECT_INTERVAL 4000 /* 4ms */ + +static struct spdk_poller *g_rpc_poller = NULL; + +static int +rpc_subsystem_poll(void *arg) +{ + spdk_rpc_accept(); + return SPDK_POLLER_BUSY; +} + +void +spdk_rpc_initialize(const char *listen_addr) +{ + int rc; + + if (listen_addr == NULL) { + return; + } + + if (!spdk_rpc_verify_methods()) { + spdk_app_stop(-EINVAL); + return; + } + + /* Listen on the requested address */ + rc = spdk_rpc_listen(listen_addr); + if (rc != 0) { + SPDK_ERRLOG("Unable to start RPC service at %s\n", listen_addr); + return; + } + + spdk_rpc_set_state(SPDK_RPC_STARTUP); + + /* Register a poller to periodically check for RPCs */ + g_rpc_poller = SPDK_POLLER_REGISTER(rpc_subsystem_poll, NULL, RPC_SELECT_INTERVAL); +} + +void +spdk_rpc_finish(void) +{ + spdk_rpc_close(); + spdk_poller_unregister(&g_rpc_poller); +} diff --git a/src/spdk/lib/event/spdk_event.map b/src/spdk/lib/event/spdk_event.map new file mode 100644 index 000000000..8208c5e1f --- /dev/null +++ b/src/spdk/lib/event/spdk_event.map @@ -0,0 +1,46 @@ +{ + global: + + # Public functions + spdk_app_opts_init; + spdk_app_start; + spdk_app_fini; + spdk_app_start_shutdown; + spdk_app_stop; + spdk_app_get_running_config; + spdk_app_get_shm_id; + spdk_app_parse_core_mask; + spdk_app_get_core_mask; + spdk_app_parse_args; + spdk_app_usage; + spdk_event_allocate; + spdk_event_call; + spdk_framework_enable_context_switch_monitor; + spdk_framework_context_switch_monitor_enabled; + + # Functions used by other SPDK libraries + spdk_reactors_init; + spdk_reactors_fini; + spdk_reactors_start; + spdk_reactors_stop; + spdk_reactor_get; + spdk_for_each_reactor; + spdk_subsystem_find; + spdk_subsystem_get_first; + spdk_subsystem_get_next; + spdk_subsystem_get_first_depend; + spdk_subsystem_get_next_depend; + spdk_add_subsystem; + spdk_add_subsystem_depend; + spdk_subsystem_init; + spdk_subsystem_fini; + spdk_subsystem_init_next; + spdk_subsystem_fini_next; + spdk_subsystem_config; + spdk_app_json_config_load; + spdk_subsystem_config_json; + spdk_rpc_initialize; + spdk_rpc_finish; + + local: *; +}; diff --git a/src/spdk/lib/event/subsystem.c b/src/spdk/lib/event/subsystem.c new file mode 100644 index 000000000..2cff890b2 --- /dev/null +++ b/src/spdk/lib/event/subsystem.c @@ -0,0 +1,288 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/log.h" +#include "spdk/thread.h" + +#include "spdk_internal/event.h" +#include "spdk/env.h" + +TAILQ_HEAD(spdk_subsystem_list, spdk_subsystem); +struct spdk_subsystem_list g_subsystems = TAILQ_HEAD_INITIALIZER(g_subsystems); + +TAILQ_HEAD(spdk_subsystem_depend_list, spdk_subsystem_depend); +struct spdk_subsystem_depend_list g_subsystems_deps = TAILQ_HEAD_INITIALIZER(g_subsystems_deps); +static struct spdk_subsystem *g_next_subsystem; +static bool g_subsystems_initialized = false; +static bool g_subsystems_init_interrupted = false; +static spdk_subsystem_init_fn g_subsystem_start_fn = NULL; +static void *g_subsystem_start_arg = NULL; +static spdk_msg_fn g_subsystem_stop_fn = NULL; +static void *g_subsystem_stop_arg = NULL; +static struct spdk_thread *g_fini_thread = NULL; + +void +spdk_add_subsystem(struct spdk_subsystem *subsystem) +{ + TAILQ_INSERT_TAIL(&g_subsystems, subsystem, tailq); +} + +void +spdk_add_subsystem_depend(struct spdk_subsystem_depend *depend) +{ + TAILQ_INSERT_TAIL(&g_subsystems_deps, depend, tailq); +} + +static struct spdk_subsystem * +_subsystem_find(struct spdk_subsystem_list *list, const char *name) +{ + struct spdk_subsystem *iter; + + TAILQ_FOREACH(iter, list, tailq) { + if (strcmp(name, iter->name) == 0) { + return iter; + } + } + + return NULL; +} + +struct spdk_subsystem * +spdk_subsystem_find(const char *name) +{ + return _subsystem_find(&g_subsystems, name); +} + +struct spdk_subsystem * +spdk_subsystem_get_first(void) +{ + return TAILQ_FIRST(&g_subsystems); +} + +struct spdk_subsystem * +spdk_subsystem_get_next(struct spdk_subsystem *cur_subsystem) +{ + return TAILQ_NEXT(cur_subsystem, tailq); +} + + +struct spdk_subsystem_depend * +spdk_subsystem_get_first_depend(void) +{ + return TAILQ_FIRST(&g_subsystems_deps); +} + +struct spdk_subsystem_depend * +spdk_subsystem_get_next_depend(struct spdk_subsystem_depend *cur_depend) +{ + return TAILQ_NEXT(cur_depend, tailq); +} + +static void +subsystem_sort(void) +{ + bool depends_on, depends_on_sorted; + struct spdk_subsystem *subsystem, *subsystem_tmp; + struct spdk_subsystem_depend *subsystem_dep; + + struct spdk_subsystem_list subsystems_list = TAILQ_HEAD_INITIALIZER(subsystems_list); + + while (!TAILQ_EMPTY(&g_subsystems)) { + TAILQ_FOREACH_SAFE(subsystem, &g_subsystems, tailq, subsystem_tmp) { + depends_on = false; + TAILQ_FOREACH(subsystem_dep, &g_subsystems_deps, tailq) { + if (strcmp(subsystem->name, subsystem_dep->name) == 0) { + depends_on = true; + depends_on_sorted = !!_subsystem_find(&subsystems_list, subsystem_dep->depends_on); + if (depends_on_sorted) { + continue; + } + break; + } + } + + if (depends_on == false) { + TAILQ_REMOVE(&g_subsystems, subsystem, tailq); + TAILQ_INSERT_TAIL(&subsystems_list, subsystem, tailq); + } else { + if (depends_on_sorted == true) { + TAILQ_REMOVE(&g_subsystems, subsystem, tailq); + TAILQ_INSERT_TAIL(&subsystems_list, subsystem, tailq); + } + } + } + } + + TAILQ_FOREACH_SAFE(subsystem, 
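+/*
+ * A sketch of how a module typically ends up on these lists. The field names
+ * match how struct spdk_subsystem and struct spdk_subsystem_depend are used in
+ * this file; real subsystems normally go through the registration helpers in
+ * spdk_internal/event.h, which amount to a constructor like the one shown here:
+ *
+ *     static void my_subsystem_init(void)
+ *     {
+ *         spdk_subsystem_init_next(0);
+ *     }
+ *
+ *     static void my_subsystem_fini(void)
+ *     {
+ *         spdk_subsystem_fini_next();
+ *     }
+ *
+ *     static struct spdk_subsystem g_my_subsystem = {
+ *         .name = "my_subsystem",
+ *         .init = my_subsystem_init,
+ *         .fini = my_subsystem_fini,
+ *     };
+ *
+ *     static struct spdk_subsystem_depend g_my_dep = {
+ *         .name = "my_subsystem",
+ *         .depends_on = "bdev",
+ *     };
+ *
+ *     __attribute__((constructor)) static void register_my_subsystem(void)
+ *     {
+ *         spdk_add_subsystem(&g_my_subsystem);
+ *         spdk_add_subsystem_depend(&g_my_dep);
+ *     }
+ */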
&subsystems_list, tailq, subsystem_tmp) { + TAILQ_REMOVE(&subsystems_list, subsystem, tailq); + TAILQ_INSERT_TAIL(&g_subsystems, subsystem, tailq); + } +} + +void +spdk_subsystem_init_next(int rc) +{ + /* The initialization is interrupted by the spdk_subsystem_fini, so just return */ + if (g_subsystems_init_interrupted) { + return; + } + + if (rc) { + SPDK_ERRLOG("Init subsystem %s failed\n", g_next_subsystem->name); + g_subsystem_start_fn(rc, g_subsystem_start_arg); + return; + } + + if (!g_next_subsystem) { + g_next_subsystem = TAILQ_FIRST(&g_subsystems); + } else { + g_next_subsystem = TAILQ_NEXT(g_next_subsystem, tailq); + } + + if (!g_next_subsystem) { + g_subsystems_initialized = true; + g_subsystem_start_fn(0, g_subsystem_start_arg); + return; + } + + if (g_next_subsystem->init) { + g_next_subsystem->init(); + } else { + spdk_subsystem_init_next(0); + } +} + +void +spdk_subsystem_init(spdk_subsystem_init_fn cb_fn, void *cb_arg) +{ + struct spdk_subsystem_depend *dep; + + g_subsystem_start_fn = cb_fn; + g_subsystem_start_arg = cb_arg; + + /* Verify that all dependency name and depends_on subsystems are registered */ + TAILQ_FOREACH(dep, &g_subsystems_deps, tailq) { + if (!spdk_subsystem_find(dep->name)) { + SPDK_ERRLOG("subsystem %s is missing\n", dep->name); + g_subsystem_start_fn(-1, g_subsystem_start_arg); + return; + } + if (!spdk_subsystem_find(dep->depends_on)) { + SPDK_ERRLOG("subsystem %s dependency %s is missing\n", + dep->name, dep->depends_on); + g_subsystem_start_fn(-1, g_subsystem_start_arg); + return; + } + } + + subsystem_sort(); + + spdk_subsystem_init_next(0); +} + +static void +subsystem_fini_next(void *arg1) +{ + assert(g_fini_thread == spdk_get_thread()); + + if (!g_next_subsystem) { + /* If the initialized flag is false, then we've failed to initialize + * the very first subsystem and no de-init is needed + */ + if (g_subsystems_initialized) { + g_next_subsystem = TAILQ_LAST(&g_subsystems, spdk_subsystem_list); + } + } else { + if (g_subsystems_initialized || g_subsystems_init_interrupted) { + g_next_subsystem = TAILQ_PREV(g_next_subsystem, spdk_subsystem_list, tailq); + } else { + g_subsystems_init_interrupted = true; + } + } + + while (g_next_subsystem) { + if (g_next_subsystem->fini) { + g_next_subsystem->fini(); + return; + } + g_next_subsystem = TAILQ_PREV(g_next_subsystem, spdk_subsystem_list, tailq); + } + + g_subsystem_stop_fn(g_subsystem_stop_arg); + return; +} + +void +spdk_subsystem_fini_next(void) +{ + if (g_fini_thread != spdk_get_thread()) { + spdk_thread_send_msg(g_fini_thread, subsystem_fini_next, NULL); + } else { + subsystem_fini_next(NULL); + } +} + +void +spdk_subsystem_fini(spdk_msg_fn cb_fn, void *cb_arg) +{ + g_subsystem_stop_fn = cb_fn; + g_subsystem_stop_arg = cb_arg; + + g_fini_thread = spdk_get_thread(); + + spdk_subsystem_fini_next(); +} + +void +spdk_subsystem_config(FILE *fp) +{ + struct spdk_subsystem *subsystem; + + TAILQ_FOREACH(subsystem, &g_subsystems, tailq) { + if (subsystem->config) { + subsystem->config(fp); + } + } +} + +void +spdk_subsystem_config_json(struct spdk_json_write_ctx *w, struct spdk_subsystem *subsystem) +{ + if (subsystem && subsystem->write_config_json) { + subsystem->write_config_json(w); + } else { + spdk_json_write_null(w); + } +} diff --git a/src/spdk/lib/ftl/Makefile b/src/spdk/lib/ftl/Makefile new file mode 100644 index 000000000..c24274622 --- /dev/null +++ b/src/spdk/lib/ftl/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = ftl_band.c ftl_core.c ftl_debug.c ftl_io.c ftl_reloc.c \ + ftl_restore.c ftl_init.c ftl_trace.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_ftl.map) + +LIBNAME = ftl + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/ftl/ftl_addr.h b/src/spdk/lib/ftl/ftl_addr.h new file mode 100644 index 000000000..36d2ffb00 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_addr.h @@ -0,0 +1,76 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_ADDR_H +#define FTL_ADDR_H + +#include "spdk/stdinc.h" + +/* Marks address as invalid */ +#define FTL_ADDR_INVALID (-1) +/* Marks LBA as invalid */ +#define FTL_LBA_INVALID ((uint64_t)-1) +/* Smallest data unit size */ +#define FTL_BLOCK_SIZE 4096 + +/* This structure represents on-disk address. It can have one of the following */ +/* formats: */ +/* - offset inside the disk */ +/* - cache_offset inside the cache (indicated by the cached flag) */ +/* - packed version of the two formats above (can be only used when the */ +/* offset can be represented in less than 32 bits) */ +/* Packed format is used, when possible, to avoid wasting RAM on the L2P table. */ +struct ftl_addr { + union { + struct { + uint64_t cache_offset : 63; + uint64_t cached : 1; + }; + + struct { + union { + struct { + uint32_t cache_offset : 31; + uint32_t cached : 1; + }; + + uint32_t offset; + }; + uint32_t rsvd; + } pack; + + uint64_t offset; + }; +}; + +#endif /* FTL_ADDR_H */ diff --git a/src/spdk/lib/ftl/ftl_band.c b/src/spdk/lib/ftl/ftl_band.c new file mode 100644 index 000000000..62221dcf6 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_band.c @@ -0,0 +1,1097 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/crc32.h" +#include "spdk/likely.h" +#include "spdk/util.h" +#include "spdk/ftl.h" + +#include "ftl_band.h" +#include "ftl_io.h" +#include "ftl_core.h" +#include "ftl_reloc.h" +#include "ftl_debug.h" + +/* TODO: define some signature for meta version */ +#define FTL_MD_VER 1 + +struct __attribute__((packed)) ftl_md_hdr { + /* Device instance */ + struct spdk_uuid uuid; + + /* Meta version */ + uint8_t ver; + + /* Sequence number */ + uint64_t seq; + + /* CRC32 checksum */ + uint32_t checksum; +}; + +/* End metadata layout stored on media (with all three being aligned to block size): */ +/* - header */ +/* - valid bitmap */ +/* - LBA map */ +struct __attribute__((packed)) ftl_tail_md { + struct ftl_md_hdr hdr; + + /* Max number of blocks */ + uint64_t num_blocks; + + uint8_t reserved[4059]; +}; +SPDK_STATIC_ASSERT(sizeof(struct ftl_tail_md) == FTL_BLOCK_SIZE, "Incorrect metadata size"); + +struct __attribute__((packed)) ftl_head_md { + struct ftl_md_hdr hdr; + + /* Number of defrag cycles */ + uint64_t wr_cnt; + + /* Number of surfaced LBAs */ + uint64_t lba_cnt; + + /* Transfer size */ + uint32_t xfer_size; +}; + +size_t +ftl_tail_md_hdr_num_blocks(void) +{ + return spdk_divide_round_up(sizeof(struct ftl_tail_md), FTL_BLOCK_SIZE); +} + +size_t +ftl_vld_map_num_blocks(const struct spdk_ftl_dev *dev) +{ + return spdk_divide_round_up(ftl_vld_map_size(dev), FTL_BLOCK_SIZE); +} + +size_t +ftl_lba_map_num_blocks(const struct spdk_ftl_dev *dev) +{ + return spdk_divide_round_up(ftl_get_num_blocks_in_band(dev) * sizeof(uint64_t), FTL_BLOCK_SIZE); +} + +size_t +ftl_head_md_num_blocks(const struct spdk_ftl_dev *dev) +{ + return dev->xfer_size; +} + +size_t +ftl_tail_md_num_blocks(const struct spdk_ftl_dev *dev) +{ + return spdk_divide_round_up(ftl_tail_md_hdr_num_blocks() + + ftl_vld_map_num_blocks(dev) + + ftl_lba_map_num_blocks(dev), + dev->xfer_size) * dev->xfer_size; +} + +static uint64_t +ftl_band_tail_md_offset(const struct ftl_band *band) +{ + return ftl_band_num_usable_blocks(band) - + ftl_tail_md_num_blocks(band->dev); +} + +int +ftl_band_full(struct ftl_band *band, size_t offset) +{ + return offset == ftl_band_tail_md_offset(band); +} + +void +ftl_band_write_failed(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + band->high_prio = 1; + + ftl_reloc_add(dev->reloc, band, 0, ftl_get_num_blocks_in_band(dev), 1, true); + ftl_band_set_state(band, FTL_BAND_STATE_CLOSED); +} + +static void +ftl_band_free_lba_map(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + + assert(band->state == FTL_BAND_STATE_CLOSED || + band->state == FTL_BAND_STATE_FREE); + assert(lba_map->ref_cnt == 0); + assert(lba_map->map != NULL); + assert(!band->high_prio); + + /* Verify that band's metadata is consistent with l2p */ + if (band->num_zones) { + assert(ftl_band_validate_md(band) == true); + } + + spdk_mempool_put(dev->lba_pool, lba_map->dma_buf); + lba_map->map = NULL; + lba_map->dma_buf = NULL; +} + +static void +_ftl_band_set_free(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_band *lband, *prev; + + /* Remove the band from the closed band list */ + LIST_REMOVE(band, list_entry); + + /* Keep the list sorted by band's write count */ + LIST_FOREACH(lband, &dev->free_bands, list_entry) { + if (lband->wr_cnt > band->wr_cnt) { + LIST_INSERT_BEFORE(lband, band, list_entry); + break; + } + prev = lband; + } + + if (!lband) { + if (LIST_EMPTY(&dev->free_bands)) { + 
LIST_INSERT_HEAD(&dev->free_bands, band, list_entry); + } else { + LIST_INSERT_AFTER(prev, band, list_entry); + } + } + +#if defined(DEBUG) + prev = NULL; + LIST_FOREACH(lband, &dev->free_bands, list_entry) { + if (!prev) { + continue; + } + assert(prev->wr_cnt <= lband->wr_cnt); + } +#endif + dev->num_free++; + ftl_apply_limits(dev); +} + +static void +_ftl_band_set_preparing(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + /* Remove band from free list */ + LIST_REMOVE(band, list_entry); + + band->wr_cnt++; + + assert(dev->num_free > 0); + dev->num_free--; + + ftl_apply_limits(dev); +} + +static void +_ftl_band_set_closed(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + /* Set the state as free_md() checks for that */ + band->state = FTL_BAND_STATE_CLOSED; + + /* Free the lba map if there are no outstanding IOs */ + ftl_band_release_lba_map(band); + + if (spdk_likely(band->num_zones)) { + LIST_INSERT_HEAD(&dev->shut_bands, band, list_entry); + } else { + LIST_REMOVE(band, list_entry); + } +} + +static uint32_t +ftl_md_calc_crc(const struct ftl_md_hdr *hdr, size_t size) +{ + size_t checkoff = offsetof(struct ftl_md_hdr, checksum); + size_t mdoff = checkoff + sizeof(hdr->checksum); + uint32_t crc; + + crc = spdk_crc32c_update(hdr, checkoff, 0); + return spdk_crc32c_update((const char *)hdr + mdoff, size - mdoff, crc); +} + +static void +ftl_set_md_hdr(struct ftl_band *band, struct ftl_md_hdr *hdr, size_t size) +{ + hdr->seq = band->seq; + hdr->ver = FTL_MD_VER; + hdr->uuid = band->dev->uuid; + hdr->checksum = ftl_md_calc_crc(hdr, size); +} + +static int +ftl_pack_head_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_head_md *head = band->lba_map.dma_buf; + + head->wr_cnt = band->wr_cnt; + head->lba_cnt = dev->num_lbas; + head->xfer_size = dev->xfer_size; + ftl_set_md_hdr(band, &head->hdr, sizeof(struct ftl_head_md)); + + return FTL_MD_SUCCESS; +} + +static int +ftl_pack_tail_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_tail_md *tail = lba_map->dma_buf; + void *vld_offset; + + vld_offset = (char *)tail + ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE; + + /* Clear out the buffer */ + memset(tail, 0, ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE); + tail->num_blocks = ftl_get_num_blocks_in_band(dev); + + pthread_spin_lock(&lba_map->lock); + spdk_bit_array_store_mask(lba_map->vld, vld_offset); + pthread_spin_unlock(&lba_map->lock); + + ftl_set_md_hdr(band, &tail->hdr, ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE); + + return FTL_MD_SUCCESS; +} + +static int +ftl_md_hdr_vld(struct spdk_ftl_dev *dev, const struct ftl_md_hdr *hdr, size_t size) +{ + if (spdk_uuid_compare(&dev->uuid, &hdr->uuid) != 0) { + return FTL_MD_NO_MD; + } + + if (hdr->ver != FTL_MD_VER) { + return FTL_MD_INVALID_VER; + } + + if (ftl_md_calc_crc(hdr, size) != hdr->checksum) { + return FTL_MD_INVALID_CRC; + } + + return FTL_MD_SUCCESS; +} + +static int +ftl_unpack_tail_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + void *vld_offset; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_tail_md *tail = lba_map->dma_buf; + int rc; + + vld_offset = (char *)tail + ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE; + + rc = ftl_md_hdr_vld(dev, &tail->hdr, ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE); + if (rc) { + return rc; + } + + /* + * When restoring from a dirty shutdown it's possible old tail meta wasn't yet cleared - + * band had saved 
head meta, but didn't manage to send erase to all zones. + * The already found tail md header is valid, but inconsistent with the head meta. Treat + * such a band as open/without valid tail md. + */ + if (band->seq != tail->hdr.seq) { + return FTL_MD_NO_MD; + } + + if (tail->num_blocks != ftl_get_num_blocks_in_band(dev)) { + return FTL_MD_INVALID_SIZE; + } + + spdk_bit_array_load_mask(lba_map->vld, vld_offset); + + return FTL_MD_SUCCESS; +} + +static int +ftl_unpack_head_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_head_md *head = band->lba_map.dma_buf; + int rc; + + rc = ftl_md_hdr_vld(dev, &head->hdr, sizeof(struct ftl_head_md)); + if (rc) { + return rc; + } + + band->seq = head->hdr.seq; + band->wr_cnt = head->wr_cnt; + + if (dev->global_md.num_lbas == 0) { + dev->global_md.num_lbas = head->lba_cnt; + } + + if (dev->global_md.num_lbas != head->lba_cnt) { + return FTL_MD_INVALID_SIZE; + } + + if (dev->xfer_size != head->xfer_size) { + return FTL_MD_INVALID_SIZE; + } + + return FTL_MD_SUCCESS; +} + +struct ftl_addr +ftl_band_tail_md_addr(struct ftl_band *band) +{ + struct ftl_addr addr = {}; + struct ftl_zone *zone; + struct spdk_ftl_dev *dev = band->dev; + size_t xfer_size = dev->xfer_size; + size_t num_req = ftl_band_tail_md_offset(band) / xfer_size; + size_t i; + + if (spdk_unlikely(!band->num_zones)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + + /* Metadata should be aligned to xfer size */ + assert(ftl_band_tail_md_offset(band) % xfer_size == 0); + + zone = CIRCLEQ_FIRST(&band->zones); + for (i = 0; i < num_req % band->num_zones; ++i) { + zone = ftl_band_next_zone(band, zone); + } + + addr.offset = (num_req / band->num_zones) * xfer_size; + addr.offset += zone->info.zone_id; + + return addr; +} + +struct ftl_addr +ftl_band_head_md_addr(struct ftl_band *band) +{ + if (spdk_unlikely(!band->num_zones)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + + return ftl_to_addr(CIRCLEQ_FIRST(&band->zones)->info.zone_id); +} + +void +ftl_band_set_state(struct ftl_band *band, enum ftl_band_state state) +{ + switch (state) { + case FTL_BAND_STATE_FREE: + assert(band->state == FTL_BAND_STATE_CLOSED); + _ftl_band_set_free(band); + break; + + case FTL_BAND_STATE_PREP: + assert(band->state == FTL_BAND_STATE_FREE); + _ftl_band_set_preparing(band); + break; + + case FTL_BAND_STATE_CLOSED: + if (band->state != FTL_BAND_STATE_CLOSED) { + assert(band->state == FTL_BAND_STATE_CLOSING || band->high_prio); + _ftl_band_set_closed(band); + } + break; + + default: + break; + } + + band->state = state; +} + +void +ftl_band_set_addr(struct ftl_band *band, uint64_t lba, struct ftl_addr addr) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + uint64_t offset; + + assert(lba != FTL_LBA_INVALID); + + offset = ftl_band_block_offset_from_addr(band, addr); + pthread_spin_lock(&lba_map->lock); + + lba_map->num_vld++; + lba_map->map[offset] = lba; + spdk_bit_array_set(lba_map->vld, offset); + + pthread_spin_unlock(&lba_map->lock); +} + +size_t +ftl_band_age(const struct ftl_band *band) +{ + return (size_t)(band->dev->seq - band->seq); +} + +size_t +ftl_band_num_usable_blocks(const struct ftl_band *band) +{ + return band->num_zones * ftl_get_num_blocks_in_zone(band->dev); +} + +size_t +ftl_band_user_blocks_left(const struct ftl_band *band, size_t offset) +{ + size_t tail_md_offset = ftl_band_tail_md_offset(band); + + if (spdk_unlikely(offset <= ftl_head_md_num_blocks(band->dev))) { + return ftl_band_user_blocks(band); + } + + if (spdk_unlikely(offset > tail_md_offset)) { + return 
0; + } + + return tail_md_offset - offset; +} + +size_t +ftl_band_user_blocks(const struct ftl_band *band) +{ + return ftl_band_num_usable_blocks(band) - + ftl_head_md_num_blocks(band->dev) - + ftl_tail_md_num_blocks(band->dev); +} + +struct ftl_band * +ftl_band_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + size_t band_id = ftl_addr_get_band(dev, addr); + + assert(band_id < ftl_get_num_bands(dev)); + return &dev->bands[band_id]; +} + +struct ftl_zone * +ftl_band_zone_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + size_t pu_id = ftl_addr_get_punit(band->dev, addr); + + assert(pu_id < ftl_get_num_punits(band->dev)); + return &band->zone_buf[pu_id]; +} + +uint64_t +ftl_band_block_offset_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + assert(ftl_addr_get_band(band->dev, addr) == band->id); + assert(ftl_addr_get_punit(band->dev, addr) < ftl_get_num_punits(band->dev)); + return addr.offset % ftl_get_num_blocks_in_band(band->dev); +} + +struct ftl_addr +ftl_band_next_xfer_addr(struct ftl_band *band, struct ftl_addr addr, size_t num_blocks) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_zone *zone; + size_t num_xfers, num_stripes; + uint64_t offset; + + assert(ftl_addr_get_band(dev, addr) == band->id); + + offset = ftl_addr_get_zone_offset(dev, addr); + zone = ftl_band_zone_from_addr(band, addr); + + num_blocks += (offset % dev->xfer_size); + offset -= (offset % dev->xfer_size); + +#if defined(DEBUG) + /* Check that the number of zones has not been changed */ + struct ftl_zone *_zone; + size_t _num_zones = 0; + CIRCLEQ_FOREACH(_zone, &band->zones, circleq) { + if (spdk_likely(_zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE)) { + _num_zones++; + } + } + assert(band->num_zones == _num_zones); +#endif + assert(band->num_zones != 0); + num_stripes = (num_blocks / dev->xfer_size) / band->num_zones; + offset += num_stripes * dev->xfer_size; + num_blocks -= num_stripes * dev->xfer_size * band->num_zones; + + if (offset > ftl_get_num_blocks_in_zone(dev)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + + num_xfers = num_blocks / dev->xfer_size; + for (size_t i = 0; i < num_xfers; ++i) { + /* When the last zone is reached the block part of the address */ + /* needs to be increased by xfer_size */ + if (ftl_band_zone_is_last(band, zone)) { + offset += dev->xfer_size; + if (offset > ftl_get_num_blocks_in_zone(dev)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + } + + zone = ftl_band_next_operational_zone(band, zone); + assert(zone); + + num_blocks -= dev->xfer_size; + } + + if (num_blocks) { + offset += num_blocks; + if (offset > ftl_get_num_blocks_in_zone(dev)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + } + + addr.offset = zone->info.zone_id + offset; + return addr; +} + +static size_t +ftl_xfer_offset_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + struct ftl_zone *zone, *current_zone; + unsigned int punit_offset = 0; + size_t num_stripes, xfer_size = band->dev->xfer_size; + uint64_t offset; + + assert(ftl_addr_get_band(band->dev, addr) == band->id); + + offset = ftl_addr_get_zone_offset(band->dev, addr); + num_stripes = (offset / xfer_size) * band->num_zones; + + current_zone = ftl_band_zone_from_addr(band, addr); + CIRCLEQ_FOREACH(zone, &band->zones, circleq) { + if (current_zone == zone) { + break; + } + punit_offset++; + } + + return xfer_size * (num_stripes + punit_offset) + offset % xfer_size; +} + +struct ftl_addr +ftl_band_addr_from_block_offset(struct ftl_band *band, uint64_t block_off) +{ + struct ftl_addr addr = { .offset = 0 
}; + + addr.offset = block_off + band->id * ftl_get_num_blocks_in_band(band->dev); + return addr; +} + +struct ftl_addr +ftl_band_next_addr(struct ftl_band *band, struct ftl_addr addr, size_t offset) +{ + uint64_t block_off = ftl_band_block_offset_from_addr(band, addr); + return ftl_band_addr_from_block_offset(band, block_off + offset); +} + +void +ftl_band_acquire_lba_map(struct ftl_band *band) +{ + assert(band->lba_map.map != NULL); + band->lba_map.ref_cnt++; +} + +int +ftl_band_alloc_lba_map(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + + assert(lba_map->ref_cnt == 0); + assert(lba_map->map == NULL); + + lba_map->dma_buf = spdk_mempool_get(dev->lba_pool); + + if (!lba_map->dma_buf) { + return -1; + } + + memset(lba_map->dma_buf, 0, ftl_lba_map_pool_elem_size(band->dev)); + + lba_map->map = (uint64_t *)((char *)lba_map->dma_buf + FTL_BLOCK_SIZE * + (ftl_tail_md_hdr_num_blocks() + ftl_vld_map_num_blocks(dev))); + + lba_map->segments = (char *)lba_map->dma_buf + ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE; + + ftl_band_acquire_lba_map(band); + return 0; +} + +void +ftl_band_release_lba_map(struct ftl_band *band) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + + assert(lba_map->map != NULL); + assert(lba_map->ref_cnt > 0); + lba_map->ref_cnt--; + + if (lba_map->ref_cnt == 0) { + ftl_band_free_lba_map(band); + } +} + +static void +ftl_read_md_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_md_io *md_io = (struct ftl_md_io *)io; + + if (!status) { + status = md_io->pack_fn(md_io->io.band); + } else { + status = FTL_MD_IO_FAILURE; + } + + md_io->cb_fn(io, md_io->cb_ctx, status); +} + +static struct ftl_md_io * +ftl_io_init_md_read(struct spdk_ftl_dev *dev, struct ftl_addr addr, + struct ftl_band *band, size_t num_blocks, void *buf, + ftl_io_fn fn, ftl_md_pack_fn pack_fn, ftl_io_fn cb_fn, void *cb_ctx) +{ + struct ftl_md_io *io; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .band = band, + .size = sizeof(*io), + .flags = FTL_IO_MD | FTL_IO_PHYSICAL_MODE, + .type = FTL_IO_READ, + .num_blocks = num_blocks, + .cb_fn = fn, + .iovs = { + { + .iov_base = buf, + .iov_len = num_blocks * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + }; + + io = (struct ftl_md_io *)ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->io.addr = addr; + io->pack_fn = pack_fn; + io->cb_fn = cb_fn; + io->cb_ctx = cb_ctx; + + return io; +} + +static struct ftl_io * +ftl_io_init_md_write(struct spdk_ftl_dev *dev, struct ftl_band *band, + void *data, size_t num_blocks, ftl_io_fn cb) +{ + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .band = band, + .size = sizeof(struct ftl_io), + .flags = FTL_IO_MD | FTL_IO_PHYSICAL_MODE, + .type = FTL_IO_WRITE, + .num_blocks = num_blocks, + .cb_fn = cb, + .iovs = { + { + .iov_base = data, + .iov_len = num_blocks * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + .md = NULL, + }; + + return ftl_io_init_internal(&opts); +} + +static int +ftl_band_write_md(struct ftl_band *band, size_t num_blocks, + ftl_md_pack_fn md_fn, ftl_io_fn cb) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_io *io; + + io = ftl_io_init_md_write(dev, band, band->lba_map.dma_buf, num_blocks, cb); + if (!io) { + return -ENOMEM; + } + + md_fn(band); + + ftl_io_write(io); + return 0; +} + +void +ftl_band_md_clear(struct ftl_band *band) +{ + band->seq = 0; + band->wr_cnt = 0; + band->lba_map.num_vld = 0; + band->lba_map.map = NULL; +} + +int +ftl_band_write_head_md(struct ftl_band *band, 
ftl_io_fn cb) +{ + return ftl_band_write_md(band, ftl_head_md_num_blocks(band->dev), + ftl_pack_head_md, cb); +} + +int +ftl_band_write_tail_md(struct ftl_band *band, ftl_io_fn cb) +{ + return ftl_band_write_md(band, ftl_tail_md_num_blocks(band->dev), + ftl_pack_tail_md, cb); +} + +static struct ftl_addr +ftl_band_lba_map_addr(struct ftl_band *band, size_t offset) +{ + return ftl_band_next_xfer_addr(band, band->tail_md_addr, + ftl_tail_md_hdr_num_blocks() + + ftl_vld_map_num_blocks(band->dev) + + offset); +} + +static int +ftl_band_read_md(struct ftl_band *band, size_t num_blocks, struct ftl_addr start_addr, + void *buf, ftl_io_fn fn, ftl_md_pack_fn pack_fn, ftl_io_fn cb_fn, void *cb_ctx) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_md_io *io; + + if (spdk_unlikely(!band->num_zones)) { + return -ENOENT; + } + + io = ftl_io_init_md_read(dev, start_addr, band, num_blocks, buf, fn, pack_fn, cb_fn, cb_ctx); + if (!io) { + return -ENOMEM; + } + + ftl_io_read((struct ftl_io *)io); + return 0; +} + +int +ftl_band_read_tail_md(struct ftl_band *band, struct ftl_addr addr, ftl_io_fn cb_fn, void *cb_ctx) +{ + return ftl_band_read_md(band, ftl_tail_md_num_blocks(band->dev), addr, band->lba_map.dma_buf, + ftl_read_md_cb, ftl_unpack_tail_md, cb_fn, cb_ctx); +} + +static size_t +ftl_lba_map_request_segment_done(struct ftl_lba_map_request *request, size_t offset, + size_t num_segments) +{ + size_t i, num_done = 0; + + for (i = offset; i < offset + num_segments; ++i) { + if (spdk_bit_array_get(request->segments, i)) { + spdk_bit_array_clear(request->segments, offset); + num_done++; + } + } + + assert(request->num_pending >= num_done); + request->num_pending -= num_done; + + return num_done; +} + +static void +ftl_lba_map_set_segment_state(struct ftl_lba_map *lba_map, size_t offset, size_t num_segments, + enum ftl_lba_map_seg_state state) +{ + size_t i; + + for (i = offset; i < offset + num_segments; ++i) { + lba_map->segments[i] = state; + } +} + +static void +ftl_lba_map_request_free(struct spdk_ftl_dev *dev, struct ftl_lba_map_request *request) +{ + spdk_bit_array_clear_mask(request->segments); + spdk_mempool_put(dev->lba_request_pool, request); +} + +static void +ftl_process_lba_map_requests(struct spdk_ftl_dev *dev, struct ftl_lba_map *lba_map, size_t offset, + size_t num_segments, int status) +{ + struct ftl_lba_map_request *request, *trequest; + size_t num_done; + + LIST_FOREACH_SAFE(request, &lba_map->request_list, list_entry, trequest) { + num_done = ftl_lba_map_request_segment_done(request, offset, num_segments); + if (request->num_pending == 0 || (status && num_done)) { + request->cb(NULL, request->cb_ctx, status); + LIST_REMOVE(request, list_entry); + ftl_lba_map_request_free(dev, request); + } + } +} + +static size_t +ftl_lba_map_offset_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + size_t offset; + struct ftl_addr start_addr = ftl_band_lba_map_addr(band, 0); + + offset = ftl_xfer_offset_from_addr(band, addr) - ftl_xfer_offset_from_addr(band, start_addr); + assert(offset < ftl_lba_map_num_blocks(band->dev)); + + return offset; +} + +static void +ftl_read_lba_map_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_lba_map *lba_map = &io->band->lba_map; + uint64_t block_off; + + block_off = ftl_lba_map_offset_from_addr(io->band, io->addr); + assert(block_off + io->num_blocks <= ftl_lba_map_num_blocks(io->dev)); + + if (!status) { + ftl_lba_map_set_segment_state(lba_map, block_off, io->num_blocks, + FTL_LBA_MAP_SEG_CACHED); + } + + 
ftl_process_lba_map_requests(io->dev, lba_map, block_off, io->num_blocks, status); +} + +static struct ftl_lba_map_request * +ftl_lba_map_alloc_request(struct ftl_band *band, size_t offset, size_t num_segments, + ftl_io_fn cb, void *cb_ctx) +{ + struct ftl_lba_map_request *request; + struct spdk_ftl_dev *dev = band->dev; + size_t i; + + request = spdk_mempool_get(dev->lba_request_pool); + if (!request) { + return NULL; + } + + request->cb = cb; + request->cb_ctx = cb_ctx; + request->num_pending = num_segments; + + for (i = offset; i < offset + num_segments; ++i) { + spdk_bit_array_set(request->segments, i); + } + + return request; +} + +static size_t +ftl_lba_map_num_clear_segments(struct ftl_lba_map *lba_map, + size_t offset, size_t num_segments) +{ + size_t i, cnt = 0; + + for (i = offset; i < offset + num_segments; ++i) { + if (lba_map->segments[i] != FTL_LBA_MAP_SEG_CLEAR) { + break; + } + cnt++; + } + + return cnt; +} + +int +ftl_band_read_lba_map(struct ftl_band *band, size_t offset, size_t lba_cnt, + ftl_io_fn cb_fn, void *cb_ctx) +{ + size_t num_blocks, block_off, num_read, num_segments; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_lba_map_request *request; + int rc = 0; + + block_off = offset / FTL_NUM_LBA_IN_BLOCK; + num_segments = spdk_divide_round_up(offset + lba_cnt, FTL_NUM_LBA_IN_BLOCK); + num_blocks = num_segments - block_off; + assert(block_off + num_blocks <= ftl_lba_map_num_blocks(band->dev)); + + request = ftl_lba_map_alloc_request(band, block_off, num_blocks, cb_fn, cb_ctx); + if (!request) { + return -ENOMEM; + } + + while (num_blocks) { + if (lba_map->segments[block_off] != FTL_LBA_MAP_SEG_CLEAR) { + if (lba_map->segments[block_off] == FTL_LBA_MAP_SEG_CACHED) { + ftl_lba_map_request_segment_done(request, block_off, 1); + } + num_blocks--; + block_off++; + continue; + } + + num_read = ftl_lba_map_num_clear_segments(lba_map, block_off, num_blocks); + ftl_lba_map_set_segment_state(lba_map, block_off, num_read, + FTL_LBA_MAP_SEG_PENDING); + + rc = ftl_band_read_md(band, num_read, ftl_band_lba_map_addr(band, block_off), + (char *)band->lba_map.map + block_off * FTL_BLOCK_SIZE, + ftl_read_lba_map_cb, NULL, cb_fn, cb_ctx); + if (rc) { + ftl_lba_map_request_free(band->dev, request); + return rc; + } + + assert(num_blocks >= num_read); + num_blocks -= num_read; + block_off += num_read; + } + + if (request->num_pending) { + LIST_INSERT_HEAD(&lba_map->request_list, request, list_entry); + } else { + cb_fn(NULL, cb_ctx, 0); + ftl_lba_map_request_free(band->dev, request); + } + + return rc; +} + +int +ftl_band_read_head_md(struct ftl_band *band, ftl_io_fn cb_fn, void *cb_ctx) +{ + return ftl_band_read_md(band, + ftl_head_md_num_blocks(band->dev), + ftl_band_head_md_addr(band), + band->lba_map.dma_buf, + ftl_read_md_cb, + ftl_unpack_head_md, + cb_fn, + cb_ctx); +} + +void +ftl_band_remove_zone(struct ftl_band *band, struct ftl_zone *zone) +{ + CIRCLEQ_REMOVE(&band->zones, zone, circleq); + band->num_zones--; +} + +int +ftl_band_write_prep(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + if (ftl_band_alloc_lba_map(band)) { + return -1; + } + + band->seq = ++dev->seq; + return 0; +} + +struct ftl_zone * +ftl_band_next_operational_zone(struct ftl_band *band, struct ftl_zone *zone) +{ + struct ftl_zone *result = NULL; + struct ftl_zone *entry; + + if (spdk_unlikely(!band->num_zones)) { + return NULL; + } + + /* Erasing band may fail after it was assigned to wptr. */ + /* In such a case zone is no longer in band->zones queue. 
*/ + if (spdk_likely(zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE)) { + result = ftl_band_next_zone(band, zone); + } else { + CIRCLEQ_FOREACH_REVERSE(entry, &band->zones, circleq) { + if (entry->info.zone_id > zone->info.zone_id) { + result = entry; + } else { + if (!result) { + result = CIRCLEQ_FIRST(&band->zones); + } + break; + } + } + } + + return result; +} + +void +ftl_band_clear_lba_map(struct ftl_band *band) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + size_t num_segments; + + spdk_bit_array_clear_mask(lba_map->vld); + memset(lba_map->map, 0, ftl_lba_map_num_blocks(band->dev) * FTL_BLOCK_SIZE); + + /* For open band all lba map segments are already cached */ + assert(band->state == FTL_BAND_STATE_PREP); + num_segments = spdk_divide_round_up(ftl_get_num_blocks_in_band(band->dev), FTL_NUM_LBA_IN_BLOCK); + ftl_lba_map_set_segment_state(&band->lba_map, 0, num_segments, FTL_LBA_MAP_SEG_CACHED); + + lba_map->num_vld = 0; +} + +size_t +ftl_lba_map_pool_elem_size(struct spdk_ftl_dev *dev) +{ + /* Map pool element holds the whole tail md + segments map */ + return ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE + + spdk_divide_round_up(ftl_get_num_blocks_in_band(dev), FTL_NUM_LBA_IN_BLOCK); +} diff --git a/src/spdk/lib/ftl/ftl_band.h b/src/spdk/lib/ftl/ftl_band.h new file mode 100644 index 000000000..109b369a5 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_band.h @@ -0,0 +1,287 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FTL_BAND_H +#define FTL_BAND_H + +#include "spdk/stdinc.h" +#include "spdk/bit_array.h" +#include "spdk/queue.h" +#include "spdk/bdev_zone.h" + +#include "ftl_io.h" +#include "ftl_addr.h" +#include "ftl_core.h" + +/* Number of LBAs that could be stored in a single block */ +#define FTL_NUM_LBA_IN_BLOCK (FTL_BLOCK_SIZE / sizeof(uint64_t)) + +struct spdk_ftl_dev; +struct ftl_lba_map_request; + +struct ftl_zone { + struct spdk_bdev_zone_info info; + + /* Indicates that there is inflight write */ + bool busy; + + CIRCLEQ_ENTRY(ftl_zone) circleq; +}; + +enum ftl_md_status { + FTL_MD_SUCCESS, + /* Metadata read failure */ + FTL_MD_IO_FAILURE, + /* Invalid version */ + FTL_MD_INVALID_VER, + /* UUID doesn't match */ + FTL_MD_NO_MD, + /* UUID and version matches but CRC doesn't */ + FTL_MD_INVALID_CRC, + /* Vld or lba map size doesn't match */ + FTL_MD_INVALID_SIZE +}; + +enum ftl_lba_map_seg_state { + FTL_LBA_MAP_SEG_CLEAR, + FTL_LBA_MAP_SEG_PENDING, + FTL_LBA_MAP_SEG_CACHED +}; + +struct ftl_lba_map { + /* LBA/vld map lock */ + pthread_spinlock_t lock; + + /* Number of valid LBAs */ + size_t num_vld; + + /* LBA map's reference count */ + size_t ref_cnt; + + /* Bitmap of valid LBAs */ + struct spdk_bit_array *vld; + + /* LBA map (only valid for open/relocating bands) */ + uint64_t *map; + + /* LBA map segment state map (clear, pending, cached) */ + uint8_t *segments; + + LIST_HEAD(, ftl_lba_map_request) request_list; + + /* Metadata DMA buffer (only valid for open/relocating bands) */ + void *dma_buf; +}; + +enum ftl_band_state { + FTL_BAND_STATE_FREE, + FTL_BAND_STATE_PREP, + FTL_BAND_STATE_OPENING, + FTL_BAND_STATE_OPEN, + FTL_BAND_STATE_FULL, + FTL_BAND_STATE_CLOSING, + FTL_BAND_STATE_CLOSED, + FTL_BAND_STATE_MAX +}; + +struct ftl_lba_map_request { + /* Completion callback */ + ftl_io_fn cb; + + /* Completion callback context */ + void *cb_ctx; + + /* Bit array of requested segments */ + struct spdk_bit_array *segments; + + /* Number of pending segments to read */ + size_t num_pending; + + LIST_ENTRY(ftl_lba_map_request) list_entry; +}; + +struct ftl_band { + /* Device this band belongs to */ + struct spdk_ftl_dev *dev; + + /* Number of operational zones */ + size_t num_zones; + + /* Array of zones */ + struct ftl_zone *zone_buf; + + /* List of operational zones */ + CIRCLEQ_HEAD(, ftl_zone) zones; + + /* LBA map */ + struct ftl_lba_map lba_map; + + /* Band's state */ + enum ftl_band_state state; + + /* Band's index */ + unsigned int id; + + /* Latest merit calculation */ + double merit; + + /* High defrag priority - means that the metadata should be copied and */ + /* the band should be defragged immediately */ + int high_prio; + + /* Sequence number */ + uint64_t seq; + + /* Number of defrag cycles */ + uint64_t wr_cnt; + + /* End metadata start addr */ + struct ftl_addr tail_md_addr; + + /* Bitmap of all bands that have its data moved onto this band */ + struct spdk_bit_array *reloc_bitmap; + /* Number of open bands containing data moved from this band */ + size_t num_reloc_bands; + /* Number of blocks currently being moved from this band */ + size_t num_reloc_blocks; + + /* Free/shut bands' lists */ + LIST_ENTRY(ftl_band) list_entry; + + /* High priority queue link */ + STAILQ_ENTRY(ftl_band) prio_stailq; +}; + +uint64_t ftl_band_block_offset_from_addr(struct ftl_band *band, struct ftl_addr addr); +struct ftl_addr ftl_band_addr_from_block_offset(struct ftl_band *band, uint64_t block_off); +void ftl_band_set_state(struct ftl_band *band, enum ftl_band_state state); +size_t 
ftl_band_age(const struct ftl_band *band); +void ftl_band_acquire_lba_map(struct ftl_band *band); +int ftl_band_alloc_lba_map(struct ftl_band *band); +void ftl_band_clear_lba_map(struct ftl_band *band); +void ftl_band_release_lba_map(struct ftl_band *band); +int ftl_band_read_lba_map(struct ftl_band *band, + size_t offset, size_t lba_cnt, + ftl_io_fn cb_fn, void *cb_ctx); +struct ftl_addr ftl_band_next_xfer_addr(struct ftl_band *band, struct ftl_addr addr, + size_t num_blocks); +struct ftl_addr ftl_band_next_addr(struct ftl_band *band, struct ftl_addr addr, + size_t offset); +size_t ftl_band_num_usable_blocks(const struct ftl_band *band); +size_t ftl_band_user_blocks_left(const struct ftl_band *band, size_t offset); +size_t ftl_band_user_blocks(const struct ftl_band *band); +void ftl_band_set_addr(struct ftl_band *band, uint64_t lba, + struct ftl_addr addr); +struct ftl_band *ftl_band_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr); +struct ftl_zone *ftl_band_zone_from_addr(struct ftl_band *band, struct ftl_addr); +void ftl_band_md_clear(struct ftl_band *band); +int ftl_band_read_tail_md(struct ftl_band *band, struct ftl_addr, + ftl_io_fn cb_fn, void *cb_ctx); +int ftl_band_read_head_md(struct ftl_band *band, ftl_io_fn cb_fn, void *cb_ctx); +int ftl_band_write_tail_md(struct ftl_band *band, ftl_io_fn cb); +int ftl_band_write_head_md(struct ftl_band *band, ftl_io_fn cb); +struct ftl_addr ftl_band_tail_md_addr(struct ftl_band *band); +struct ftl_addr ftl_band_head_md_addr(struct ftl_band *band); +void ftl_band_write_failed(struct ftl_band *band); +int ftl_band_full(struct ftl_band *band, size_t offset); +int ftl_band_write_prep(struct ftl_band *band); +struct ftl_zone *ftl_band_next_operational_zone(struct ftl_band *band, + struct ftl_zone *zone); +size_t ftl_lba_map_pool_elem_size(struct spdk_ftl_dev *dev); +void ftl_band_remove_zone(struct ftl_band *band, struct ftl_zone *zone); + + +static inline int +ftl_band_empty(const struct ftl_band *band) +{ + return band->lba_map.num_vld == 0; +} + +static inline struct ftl_zone * +ftl_band_next_zone(struct ftl_band *band, struct ftl_zone *zone) +{ + assert(zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE); + return CIRCLEQ_LOOP_NEXT(&band->zones, zone, circleq); +} + +static inline void +ftl_band_set_next_state(struct ftl_band *band) +{ + ftl_band_set_state(band, (band->state + 1) % FTL_BAND_STATE_MAX); +} + +static inline int +ftl_band_state_changing(struct ftl_band *band) +{ + return band->state == FTL_BAND_STATE_OPENING || + band->state == FTL_BAND_STATE_CLOSING; +} + +static inline int +ftl_band_block_offset_valid(struct ftl_band *band, size_t block_off) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + + pthread_spin_lock(&lba_map->lock); + if (spdk_bit_array_get(lba_map->vld, block_off)) { + pthread_spin_unlock(&lba_map->lock); + return 1; + } + + pthread_spin_unlock(&lba_map->lock); + return 0; +} + +static inline int +ftl_band_zone_is_last(struct ftl_band *band, struct ftl_zone *zone) +{ + return zone == CIRCLEQ_LAST(&band->zones); +} + +static inline int +ftl_band_zone_is_first(struct ftl_band *band, struct ftl_zone *zone) +{ + return zone == CIRCLEQ_FIRST(&band->zones); +} + +static inline int +ftl_zone_is_writable(const struct spdk_ftl_dev *dev, const struct ftl_zone *zone) +{ + bool busy = ftl_is_append_supported(dev) ? 
false : zone->busy; + + return (zone->info.state == SPDK_BDEV_ZONE_STATE_OPEN || + zone->info.state == SPDK_BDEV_ZONE_STATE_EMPTY) && + !busy; +} + +#endif /* FTL_BAND_H */ diff --git a/src/spdk/lib/ftl/ftl_core.c b/src/spdk/lib/ftl/ftl_core.c new file mode 100644 index 000000000..b0b448806 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_core.c @@ -0,0 +1,2460 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/likely.h" +#include "spdk/stdinc.h" +#include "spdk/nvme.h" +#include "spdk/thread.h" +#include "spdk/bdev_module.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" +#include "spdk/ftl.h" +#include "spdk/crc32.h" + +#include "ftl_core.h" +#include "ftl_band.h" +#include "ftl_io.h" +#include "ftl_debug.h" +#include "ftl_reloc.h" + +struct ftl_band_flush { + struct spdk_ftl_dev *dev; + /* Number of bands left to be flushed */ + size_t num_bands; + /* User callback */ + spdk_ftl_fn cb_fn; + /* Callback's argument */ + void *cb_arg; + /* List link */ + LIST_ENTRY(ftl_band_flush) list_entry; +}; + +struct ftl_wptr { + /* Owner device */ + struct spdk_ftl_dev *dev; + + /* Current address */ + struct ftl_addr addr; + + /* Band currently being written to */ + struct ftl_band *band; + + /* Current logical block's offset */ + uint64_t offset; + + /* Current zone */ + struct ftl_zone *zone; + + /* Pending IO queue */ + TAILQ_HEAD(, ftl_io) pending_queue; + + /* List link */ + LIST_ENTRY(ftl_wptr) list_entry; + + /* + * If setup in direct mode, there will be no offset or band state update after IO. + * The zoned bdev address is not assigned by wptr, and is instead taken directly + * from the request. 
+ */ + bool direct_mode; + + /* Number of outstanding write requests */ + uint32_t num_outstanding; + + /* Marks that the band related to this wptr needs to be closed as soon as possible */ + bool flush; +}; + +struct ftl_flush { + /* Owner device */ + struct spdk_ftl_dev *dev; + + /* Number of batches to wait for */ + size_t num_req; + + /* Callback */ + struct { + spdk_ftl_fn fn; + void *ctx; + } cb; + + /* Batch bitmap */ + struct spdk_bit_array *bmap; + + /* List link */ + LIST_ENTRY(ftl_flush) list_entry; +}; + +static void +ftl_wptr_free(struct ftl_wptr *wptr) +{ + if (!wptr) { + return; + } + + free(wptr); +} + +static void +ftl_remove_wptr(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_band_flush *flush, *tmp; + + if (spdk_unlikely(wptr->flush)) { + LIST_FOREACH_SAFE(flush, &dev->band_flush_list, list_entry, tmp) { + assert(flush->num_bands > 0); + if (--flush->num_bands == 0) { + flush->cb_fn(flush->cb_arg, 0); + LIST_REMOVE(flush, list_entry); + free(flush); + } + } + } + + LIST_REMOVE(wptr, list_entry); + ftl_wptr_free(wptr); +} + +static struct ftl_wbuf_entry * +ftl_acquire_wbuf_entry(struct ftl_io_channel *io_channel, int io_flags) +{ + struct ftl_wbuf_entry *entry = NULL; + uint32_t qdepth; + + if (!(io_flags & FTL_IO_INTERNAL)) { + qdepth = __atomic_fetch_add(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + if (qdepth >= io_channel->qdepth_limit) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + return NULL; + } + } + + if (spdk_ring_dequeue(io_channel->free_queue, (void **)&entry, 1) != 1) { + if (!(io_flags & FTL_IO_INTERNAL)) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + } + + return NULL; + } + + assert(entry != NULL); + + ftl_evict_cache_entry(io_channel->dev, entry); + + entry->io_flags = io_flags; + entry->addr.offset = FTL_ADDR_INVALID; + entry->lba = FTL_LBA_INVALID; + entry->band = NULL; + entry->valid = false; + + return entry; +} + +static void +ftl_release_wbuf_entry(struct ftl_wbuf_entry *entry) +{ + struct ftl_io_channel *io_channel = entry->ioch; + + if (!(entry->io_flags & FTL_IO_INTERNAL)) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + } + + spdk_ring_enqueue(io_channel->free_queue, (void **)&entry, 1, NULL); +} + +static struct ftl_batch * +ftl_get_next_batch(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; +#define FTL_DEQUEUE_ENTRIES 128 + struct ftl_wbuf_entry *entries[FTL_DEQUEUE_ENTRIES]; + TAILQ_HEAD(, ftl_io_channel) ioch_queue; + size_t i, num_dequeued, num_remaining; + uint64_t *metadata; + + if (batch == NULL) { + batch = TAILQ_FIRST(&dev->pending_batches); + if (batch != NULL) { + TAILQ_REMOVE(&dev->pending_batches, batch, tailq); + return batch; + } + + batch = TAILQ_FIRST(&dev->free_batches); + if (spdk_unlikely(batch == NULL)) { + return NULL; + } + + assert(TAILQ_EMPTY(&batch->entries)); + assert(batch->num_entries == 0); + TAILQ_REMOVE(&dev->free_batches, batch, tailq); + } + + /* + * Keep shifting the queue to ensure fairness in IO channel selection. Each time + * ftl_get_next_batch() is called, we're starting to dequeue write buffer entries from a + * different IO channel. 
+ */ + TAILQ_INIT(&ioch_queue); + while (!TAILQ_EMPTY(&dev->ioch_queue)) { + ioch = TAILQ_FIRST(&dev->ioch_queue); + TAILQ_REMOVE(&dev->ioch_queue, ioch, tailq); + TAILQ_INSERT_TAIL(&ioch_queue, ioch, tailq); + + num_remaining = dev->xfer_size - batch->num_entries; + while (num_remaining > 0) { + num_dequeued = spdk_ring_dequeue(ioch->submit_queue, (void **)entries, + spdk_min(num_remaining, + FTL_DEQUEUE_ENTRIES)); + if (num_dequeued == 0) { + break; + } + + for (i = 0; i < num_dequeued; ++i) { + batch->iov[batch->num_entries + i].iov_base = entries[i]->payload; + batch->iov[batch->num_entries + i].iov_len = FTL_BLOCK_SIZE; + + if (batch->metadata != NULL) { + metadata = (uint64_t *)((char *)batch->metadata + + i * dev->md_size); + *metadata = entries[i]->lba; + } + + TAILQ_INSERT_TAIL(&batch->entries, entries[i], tailq); + } + + batch->num_entries += num_dequeued; + num_remaining -= num_dequeued; + } + + if (num_remaining == 0) { + break; + } + } + + TAILQ_CONCAT(&dev->ioch_queue, &ioch_queue, tailq); + + if (batch->num_entries == dev->xfer_size) { + dev->current_batch = NULL; + } else { + dev->current_batch = batch; + batch = NULL; + } + + return batch; +} + +static void +ftl_release_batch(struct spdk_ftl_dev *dev, struct ftl_batch *batch) +{ + struct ftl_wbuf_entry *entry; + + while (!TAILQ_EMPTY(&batch->entries)) { + entry = TAILQ_FIRST(&batch->entries); + TAILQ_REMOVE(&batch->entries, entry, tailq); + ftl_release_wbuf_entry(entry); + } + + batch->num_entries = 0; + TAILQ_INSERT_TAIL(&dev->free_batches, batch, tailq); +} + +static struct ftl_wbuf_entry * +ftl_get_entry_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_io_channel *ioch; + uint64_t ioch_offset, entry_offset; + + ioch_offset = addr.cache_offset & ((1 << dev->ioch_shift) - 1); + entry_offset = addr.cache_offset >> dev->ioch_shift; + ioch = dev->ioch_array[ioch_offset]; + + assert(ioch_offset < dev->conf.max_io_channels); + assert(entry_offset < ioch->num_entries); + assert(addr.cached == 1); + + return &ioch->wbuf_entries[entry_offset]; +} + +static struct ftl_addr +ftl_get_addr_from_entry(struct ftl_wbuf_entry *entry) +{ + struct ftl_io_channel *ioch = entry->ioch; + struct ftl_addr addr = {}; + + addr.cached = 1; + addr.cache_offset = (uint64_t)entry->index << ioch->dev->ioch_shift | ioch->index; + + return addr; +} + +static void +ftl_io_cmpl_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_io *io = cb_arg; + struct spdk_ftl_dev *dev = io->dev; + + if (spdk_unlikely(!success)) { + io->status = -EIO; + } + + ftl_trace_completion(dev, io, FTL_TRACE_COMPLETION_DISK); + + if (io->type == FTL_IO_WRITE && ftl_is_append_supported(dev)) { + assert(io->parent); + io->parent->addr.offset = spdk_bdev_io_get_append_location(bdev_io); + } + + ftl_io_dec_req(io); + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_halt_writes(struct spdk_ftl_dev *dev, struct ftl_band *band) +{ + struct ftl_wptr *wptr = NULL; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + if (wptr->band == band) { + break; + } + } + + /* If the band already has the high_prio flag set, other writes must */ + /* have failed earlier, so it's already taken care of. 
*/ + if (band->high_prio) { + assert(wptr == NULL); + return; + } + + ftl_band_write_failed(band); + ftl_remove_wptr(wptr); +} + +static struct ftl_wptr * +ftl_wptr_from_band(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr = NULL; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + if (wptr->band == band) { + return wptr; + } + } + + return NULL; +} + +static void +ftl_md_write_fail(struct ftl_io *io, int status) +{ + struct ftl_band *band = io->band; + struct ftl_wptr *wptr; + char buf[128]; + + wptr = ftl_wptr_from_band(band); + assert(wptr); + + SPDK_ERRLOG("Metadata write failed @addr: %s, status: %d\n", + ftl_addr2str(wptr->addr, buf, sizeof(buf)), status); + + ftl_halt_writes(io->dev, band); +} + +static void +ftl_md_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_band *band = io->band; + struct ftl_wptr *wptr; + size_t id; + + wptr = ftl_wptr_from_band(band); + assert(wptr); + + if (status) { + ftl_md_write_fail(io, status); + return; + } + + ftl_band_set_next_state(band); + if (band->state == FTL_BAND_STATE_CLOSED) { + if (ftl_dev_has_nv_cache(dev)) { + pthread_spin_lock(&nv_cache->lock); + nv_cache->num_available += ftl_band_user_blocks(band); + + if (spdk_unlikely(nv_cache->num_available > nv_cache->num_data_blocks)) { + nv_cache->num_available = nv_cache->num_data_blocks; + } + pthread_spin_unlock(&nv_cache->lock); + } + + /* + * Go through the reloc_bitmap, checking for all the bands that had its data moved + * onto current band and update their counters to allow them to be used for writing + * (once they're closed and empty). + */ + for (id = 0; id < ftl_get_num_bands(dev); ++id) { + if (spdk_bit_array_get(band->reloc_bitmap, id)) { + assert(dev->bands[id].num_reloc_bands > 0); + dev->bands[id].num_reloc_bands--; + + spdk_bit_array_clear(band->reloc_bitmap, id); + } + } + + ftl_remove_wptr(wptr); + } +} + +static int +ftl_read_next_physical_addr(struct ftl_io *io, struct ftl_addr *addr) +{ + struct spdk_ftl_dev *dev = io->dev; + size_t num_blocks, max_blocks; + + assert(ftl_io_mode_physical(io)); + assert(io->iov_pos < io->iov_cnt); + + if (io->pos == 0) { + *addr = io->addr; + } else { + *addr = ftl_band_next_xfer_addr(io->band, io->addr, io->pos); + } + + assert(!ftl_addr_invalid(*addr)); + + /* Metadata has to be read in the way it's written (jumping across */ + /* the zones in xfer_size increments) */ + if (io->flags & FTL_IO_MD) { + max_blocks = dev->xfer_size - (addr->offset % dev->xfer_size); + num_blocks = spdk_min(ftl_io_iovec_len_left(io), max_blocks); + assert(addr->offset / dev->xfer_size == + (addr->offset + num_blocks - 1) / dev->xfer_size); + } else { + num_blocks = ftl_io_iovec_len_left(io); + } + + return num_blocks; +} + +static int +ftl_wptr_close_band(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + ftl_band_set_state(band, FTL_BAND_STATE_CLOSING); + + return ftl_band_write_tail_md(band, ftl_md_write_cb); +} + +static int +ftl_wptr_open_band(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + assert(ftl_band_zone_is_first(band, wptr->zone)); + assert(band->lba_map.num_vld == 0); + + ftl_band_clear_lba_map(band); + + assert(band->state == FTL_BAND_STATE_PREP); + ftl_band_set_state(band, FTL_BAND_STATE_OPENING); + + return ftl_band_write_head_md(band, ftl_md_write_cb); +} + +static int +ftl_submit_erase(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct 
ftl_band *band = io->band; + struct ftl_addr addr = io->addr; + struct ftl_io_channel *ioch; + struct ftl_zone *zone; + int rc = 0; + size_t i; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + for (i = 0; i < io->num_blocks; ++i) { + if (i != 0) { + zone = ftl_band_next_zone(band, ftl_band_zone_from_addr(band, addr)); + assert(zone->info.state == SPDK_BDEV_ZONE_STATE_FULL); + addr.offset = zone->info.zone_id; + } + + assert(ftl_addr_get_zone_offset(dev, addr) == 0); + + ftl_trace_submission(dev, io, addr, 1); + rc = spdk_bdev_zone_management(dev->base_bdev_desc, ioch->base_ioch, addr.offset, + SPDK_BDEV_ZONE_RESET, ftl_io_cmpl_cb, io); + if (spdk_unlikely(rc)) { + ftl_io_fail(io, rc); + SPDK_ERRLOG("Vector reset failed with status: %d\n", rc); + break; + } + + ftl_io_inc_req(io); + ftl_io_advance(io, 1); + } + + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + return rc; +} + +static bool +ftl_check_core_thread(const struct spdk_ftl_dev *dev) +{ + return dev->core_thread == spdk_get_thread(); +} + +struct spdk_io_channel * +ftl_get_io_channel(const struct spdk_ftl_dev *dev) +{ + if (ftl_check_core_thread(dev)) { + return dev->ioch; + } + + return NULL; +} + +static void +ftl_erase_fail(struct ftl_io *io, int status) +{ + struct ftl_zone *zone; + struct ftl_band *band = io->band; + char buf[128]; + + SPDK_ERRLOG("Erase failed at address: %s, status: %d\n", + ftl_addr2str(io->addr, buf, sizeof(buf)), status); + + zone = ftl_band_zone_from_addr(band, io->addr); + zone->info.state = SPDK_BDEV_ZONE_STATE_OFFLINE; + ftl_band_remove_zone(band, zone); + band->tail_md_addr = ftl_band_tail_md_addr(band); +} + +static void +ftl_zone_erase_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_zone *zone; + + zone = ftl_band_zone_from_addr(io->band, io->addr); + zone->busy = false; + + if (spdk_unlikely(status)) { + ftl_erase_fail(io, status); + return; + } + + zone->info.state = SPDK_BDEV_ZONE_STATE_EMPTY; + zone->info.write_pointer = zone->info.zone_id; +} + +static int +ftl_band_erase(struct ftl_band *band) +{ + struct ftl_zone *zone; + struct ftl_io *io; + int rc = 0; + + assert(band->state == FTL_BAND_STATE_CLOSED || + band->state == FTL_BAND_STATE_FREE); + + ftl_band_set_state(band, FTL_BAND_STATE_PREP); + + CIRCLEQ_FOREACH(zone, &band->zones, circleq) { + if (zone->info.state == SPDK_BDEV_ZONE_STATE_EMPTY) { + continue; + } + + io = ftl_io_erase_init(band, 1, ftl_zone_erase_cb); + if (!io) { + rc = -ENOMEM; + break; + } + + zone->busy = true; + io->addr.offset = zone->info.zone_id; + rc = ftl_submit_erase(io); + if (rc) { + zone->busy = false; + assert(0); + /* TODO: change band's state back to close? 
*/ + break; + } + } + + return rc; +} + +static struct ftl_band * +ftl_next_write_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + /* Find a free band that has all of its data moved onto other closed bands */ + LIST_FOREACH(band, &dev->free_bands, list_entry) { + assert(band->state == FTL_BAND_STATE_FREE); + if (band->num_reloc_bands == 0 && band->num_reloc_blocks == 0) { + break; + } + } + + if (spdk_unlikely(!band)) { + return NULL; + } + + if (ftl_band_erase(band)) { + /* TODO: handle erase failure */ + return NULL; + } + + return band; +} + +static struct ftl_band * +ftl_next_wptr_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + if (!dev->next_band) { + band = ftl_next_write_band(dev); + } else { + assert(dev->next_band->state == FTL_BAND_STATE_PREP); + band = dev->next_band; + dev->next_band = NULL; + } + + return band; +} + +static struct ftl_wptr * +ftl_wptr_init(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr; + + wptr = calloc(1, sizeof(*wptr)); + if (!wptr) { + return NULL; + } + + wptr->dev = dev; + wptr->band = band; + wptr->zone = CIRCLEQ_FIRST(&band->zones); + wptr->addr.offset = wptr->zone->info.zone_id; + TAILQ_INIT(&wptr->pending_queue); + + return wptr; +} + +static int +ftl_add_direct_wptr(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr; + + assert(band->state == FTL_BAND_STATE_OPEN); + + wptr = ftl_wptr_init(band); + if (!wptr) { + return -1; + } + + wptr->direct_mode = true; + + if (ftl_band_alloc_lba_map(band)) { + ftl_wptr_free(wptr); + return -1; + } + + LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: direct band %u\n", band->id); + ftl_trace_write_band(dev, band); + return 0; +} + +static void +ftl_close_direct_wptr(struct ftl_band *band) +{ + struct ftl_wptr *wptr = ftl_wptr_from_band(band); + + assert(wptr); + assert(wptr->direct_mode); + assert(band->state == FTL_BAND_STATE_CLOSED); + + ftl_band_release_lba_map(band); + + ftl_remove_wptr(wptr); +} + +int +ftl_band_set_direct_access(struct ftl_band *band, bool access) +{ + if (access) { + return ftl_add_direct_wptr(band); + } else { + ftl_close_direct_wptr(band); + return 0; + } +} + +static int +ftl_add_wptr(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + struct ftl_wptr *wptr; + + band = ftl_next_wptr_band(dev); + if (!band) { + return -1; + } + + wptr = ftl_wptr_init(band); + if (!wptr) { + return -1; + } + + if (ftl_band_write_prep(band)) { + ftl_wptr_free(wptr); + return -1; + } + + LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: band %u\n", band->id); + ftl_trace_write_band(dev, band); + return 0; +} + +static void +ftl_wptr_advance(struct ftl_wptr *wptr, size_t xfer_size) +{ + struct ftl_band *band = wptr->band; + struct spdk_ftl_dev *dev = wptr->dev; + struct spdk_ftl_conf *conf = &dev->conf; + size_t next_thld; + + if (spdk_unlikely(wptr->direct_mode)) { + return; + } + + wptr->offset += xfer_size; + next_thld = (ftl_band_num_usable_blocks(band) * conf->band_thld) / 100; + + if (ftl_band_full(band, wptr->offset)) { + ftl_band_set_state(band, FTL_BAND_STATE_FULL); + } + + wptr->zone->busy = true; + wptr->addr = ftl_band_next_xfer_addr(band, wptr->addr, xfer_size); + wptr->zone = ftl_band_next_operational_zone(band, wptr->zone); + + assert(!ftl_addr_invalid(wptr->addr)); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: pu:%lu band:%lu, offset:%lu\n", + ftl_addr_get_punit(dev, wptr->addr), + 
ftl_addr_get_band(dev, wptr->addr), + wptr->addr.offset); + + if (wptr->offset >= next_thld && !dev->next_band) { + dev->next_band = ftl_next_write_band(dev); + } +} + +static size_t +ftl_wptr_user_blocks_left(const struct ftl_wptr *wptr) +{ + return ftl_band_user_blocks_left(wptr->band, wptr->offset); +} + +static bool +ftl_wptr_ready(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + /* TODO: add handling of empty bands */ + + if (spdk_unlikely(!ftl_zone_is_writable(wptr->dev, wptr->zone))) { + /* Erasing band may fail after it was assigned to wptr. */ + if (spdk_unlikely(wptr->zone->info.state == SPDK_BDEV_ZONE_STATE_OFFLINE)) { + ftl_wptr_advance(wptr, wptr->dev->xfer_size); + } + return false; + } + + /* If we're in the process of writing metadata, wait till it is */ + /* completed. */ + /* TODO: we should probably change bands once we're writing tail md */ + if (ftl_band_state_changing(band)) { + return false; + } + + if (band->state == FTL_BAND_STATE_FULL) { + if (wptr->num_outstanding == 0) { + if (ftl_wptr_close_band(wptr)) { + /* TODO: need recovery here */ + assert(false); + } + } + + return false; + } + + if (band->state != FTL_BAND_STATE_OPEN) { + if (ftl_wptr_open_band(wptr)) { + /* TODO: need recovery here */ + assert(false); + } + + return false; + } + + return true; +} + +int +ftl_flush_active_bands(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_wptr *wptr; + struct ftl_band_flush *flush; + + assert(ftl_get_core_thread(dev) == spdk_get_thread()); + + flush = calloc(1, sizeof(*flush)); + if (spdk_unlikely(!flush)) { + return -ENOMEM; + } + + LIST_INSERT_HEAD(&dev->band_flush_list, flush, list_entry); + + flush->cb_fn = cb_fn; + flush->cb_arg = cb_arg; + flush->dev = dev; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + wptr->flush = true; + flush->num_bands++; + } + + return 0; +} + +static const struct spdk_ftl_limit * +ftl_get_limit(const struct spdk_ftl_dev *dev, int type) +{ + assert(type < SPDK_FTL_LIMIT_MAX); + return &dev->conf.limits[type]; +} + +static bool +ftl_cache_lba_valid(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry) +{ + struct ftl_addr addr; + + /* If the LBA is invalid don't bother checking the md and l2p */ + if (spdk_unlikely(entry->lba == FTL_LBA_INVALID)) { + return false; + } + + addr = ftl_l2p_get(dev, entry->lba); + if (!(ftl_addr_cached(addr) && entry == ftl_get_entry_from_addr(dev, addr))) { + return false; + } + + return true; +} + +void +ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry) +{ + pthread_spin_lock(&entry->lock); + + if (!entry->valid) { + goto unlock; + } + + /* If the l2p wasn't updated and still points at the entry, fill it with the */ + /* on-disk address and clear the cache status bit. Otherwise, skip the l2p update */ + /* and just clear the cache status. 
*/ + if (!ftl_cache_lba_valid(dev, entry)) { + goto clear; + } + + ftl_l2p_set(dev, entry->lba, entry->addr); +clear: + entry->valid = false; +unlock: + pthread_spin_unlock(&entry->lock); +} + +static void +ftl_pad_wbuf(struct spdk_ftl_dev *dev, size_t size) +{ + struct ftl_wbuf_entry *entry; + struct ftl_io_channel *ioch; + int flags = FTL_IO_PAD | FTL_IO_INTERNAL; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + for (size_t i = 0; i < size; ++i) { + entry = ftl_acquire_wbuf_entry(ioch, flags); + if (!entry) { + break; + } + + entry->lba = FTL_LBA_INVALID; + entry->addr = ftl_to_addr(FTL_ADDR_INVALID); + memset(entry->payload, 0, FTL_BLOCK_SIZE); + + spdk_ring_enqueue(ioch->submit_queue, (void **)&entry, 1, NULL); + } +} + +static void +ftl_remove_free_bands(struct spdk_ftl_dev *dev) +{ + while (!LIST_EMPTY(&dev->free_bands)) { + LIST_REMOVE(LIST_FIRST(&dev->free_bands), list_entry); + } + + dev->next_band = NULL; +} + +static void +ftl_wptr_pad_band(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size, pad_size, blocks_left; + + size = batch != NULL ? batch->num_entries : 0; + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + blocks_left = ftl_wptr_user_blocks_left(wptr); + assert(size <= blocks_left); + assert(blocks_left % dev->xfer_size == 0); + pad_size = spdk_min(blocks_left - size, spdk_ring_count(ioch->free_queue)); + + ftl_pad_wbuf(dev, pad_size); +} + +static void +ftl_wptr_process_shutdown(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size; + + size = batch != NULL ? 
batch->num_entries : 0; + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + if (size >= dev->xfer_size) { + return; + } + + /* If we reach this point we need to remove free bands */ + /* and pad current wptr band to the end */ + ftl_remove_free_bands(dev); + ftl_wptr_pad_band(wptr); +} + +static int +ftl_shutdown_complete(struct spdk_ftl_dev *dev) +{ + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(dev->ioch); + + return !__atomic_load_n(&dev->num_inflight, __ATOMIC_SEQ_CST) && + dev->num_io_channels == 1 && LIST_EMPTY(&dev->wptr_list) && + TAILQ_EMPTY(&ioch->retry_queue); +} + +void +ftl_apply_limits(struct spdk_ftl_dev *dev) +{ + const struct spdk_ftl_limit *limit; + struct ftl_io_channel *ioch; + struct ftl_stats *stats = &dev->stats; + uint32_t qdepth_limit = 100; + int i; + + /* Clear existing limit */ + dev->limit = SPDK_FTL_LIMIT_MAX; + + for (i = SPDK_FTL_LIMIT_CRIT; i < SPDK_FTL_LIMIT_MAX; ++i) { + limit = ftl_get_limit(dev, i); + + if (dev->num_free <= limit->thld) { + qdepth_limit = limit->limit; + stats->limits[i]++; + dev->limit = i; + break; + } + } + + ftl_trace_limits(dev, dev->limit, dev->num_free); + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + __atomic_store_n(&ioch->qdepth_limit, (qdepth_limit * ioch->num_entries) / 100, + __ATOMIC_SEQ_CST); + } +} + +static int +ftl_invalidate_addr_unlocked(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_band *band = ftl_band_from_addr(dev, addr); + struct ftl_lba_map *lba_map = &band->lba_map; + uint64_t offset; + + offset = ftl_band_block_offset_from_addr(band, addr); + + /* The bit might be already cleared if two writes are scheduled to the */ + /* same LBA at the same time */ + if (spdk_bit_array_get(lba_map->vld, offset)) { + assert(lba_map->num_vld > 0); + spdk_bit_array_clear(lba_map->vld, offset); + lba_map->num_vld--; + return 1; + } + + return 0; +} + +int +ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_band *band; + int rc; + + assert(!ftl_addr_cached(addr)); + band = ftl_band_from_addr(dev, addr); + + pthread_spin_lock(&band->lba_map.lock); + rc = ftl_invalidate_addr_unlocked(dev, addr); + pthread_spin_unlock(&band->lba_map.lock); + + return rc; +} + +static int +ftl_read_retry(int rc) +{ + return rc == -EAGAIN; +} + +static int +ftl_read_canceled(int rc) +{ + return rc == -EFAULT || rc == 0; +} + +static int +ftl_cache_read(struct ftl_io *io, uint64_t lba, + struct ftl_addr addr, void *buf) +{ + struct ftl_wbuf_entry *entry; + struct ftl_addr naddr; + int rc = 0; + + entry = ftl_get_entry_from_addr(io->dev, addr); + pthread_spin_lock(&entry->lock); + + naddr = ftl_l2p_get(io->dev, lba); + if (addr.offset != naddr.offset) { + rc = -1; + goto out; + } + + memcpy(buf, entry->payload, FTL_BLOCK_SIZE); +out: + pthread_spin_unlock(&entry->lock); + return rc; +} + +static int +ftl_read_next_logical_addr(struct ftl_io *io, struct ftl_addr *addr) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_addr next_addr; + size_t i; + + *addr = ftl_l2p_get(dev, ftl_io_current_lba(io)); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Read addr:%lx, lba:%lu\n", + addr->offset, ftl_io_current_lba(io)); + + /* If the address is invalid, skip it (the buffer should already be zero'ed) */ + if (ftl_addr_invalid(*addr)) { + return -EFAULT; + } + + if (ftl_addr_cached(*addr)) { + if (!ftl_cache_read(io, ftl_io_current_lba(io), *addr, ftl_io_iovec_addr(io))) { + return 0; + } + + /* If the state changed, we have to re-read the l2p */ + return 
-EAGAIN; + } + + for (i = 1; i < ftl_io_iovec_len_left(io); ++i) { + next_addr = ftl_l2p_get(dev, ftl_io_get_lba(io, io->pos + i)); + + if (ftl_addr_invalid(next_addr) || ftl_addr_cached(next_addr)) { + break; + } + + if (addr->offset + i != next_addr.offset) { + break; + } + } + + return i; +} + +static int +ftl_submit_read(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_addr addr; + int rc = 0, num_blocks; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + assert(LIST_EMPTY(&io->children)); + + while (io->pos < io->num_blocks) { + if (ftl_io_mode_physical(io)) { + num_blocks = rc = ftl_read_next_physical_addr(io, &addr); + } else { + num_blocks = rc = ftl_read_next_logical_addr(io, &addr); + } + + /* We might need to retry the read from scratch (e.g. */ + /* because write was under way and completed before */ + /* we could read it from the write buffer */ + if (ftl_read_retry(rc)) { + continue; + } + + /* We don't have to schedule the read, as it was read from cache */ + if (ftl_read_canceled(rc)) { + ftl_io_advance(io, 1); + ftl_trace_completion(io->dev, io, rc ? FTL_TRACE_COMPLETION_INVALID : + FTL_TRACE_COMPLETION_CACHE); + rc = 0; + continue; + } + + assert(num_blocks > 0); + + ftl_trace_submission(dev, io, addr, num_blocks); + rc = spdk_bdev_read_blocks(dev->base_bdev_desc, ioch->base_ioch, + ftl_io_iovec_addr(io), + addr.offset, + num_blocks, ftl_io_cmpl_cb, io); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + rc = 0; + } else { + ftl_io_fail(io, rc); + } + break; + } + + ftl_io_inc_req(io); + ftl_io_advance(io, num_blocks); + } + + /* If we didn't have to read anything from the device, */ + /* complete the request right away */ + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + return rc; +} + +static void +ftl_complete_flush(struct ftl_flush *flush) +{ + assert(flush->num_req == 0); + LIST_REMOVE(flush, list_entry); + + flush->cb.fn(flush->cb.ctx, 0); + + spdk_bit_array_free(&flush->bmap); + free(flush); +} + +static void +ftl_process_flush(struct spdk_ftl_dev *dev, struct ftl_batch *batch) +{ + struct ftl_flush *flush, *tflush; + size_t offset; + + LIST_FOREACH_SAFE(flush, &dev->flush_list, list_entry, tflush) { + offset = batch->index; + + if (spdk_bit_array_get(flush->bmap, offset)) { + spdk_bit_array_clear(flush->bmap, offset); + if (!(--flush->num_req)) { + ftl_complete_flush(flush); + } + } + } +} + +static void +ftl_nv_cache_wrap_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache *nv_cache = cb_arg; + + if (!success) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n"); + /* TODO: go into read-only mode */ + assert(0); + } + + pthread_spin_lock(&nv_cache->lock); + nv_cache->ready = true; + pthread_spin_unlock(&nv_cache->lock); + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_nv_cache_wrap(void *ctx) +{ + struct ftl_nv_cache *nv_cache = ctx; + int rc; + + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_wrap_cb, nv_cache); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + /* TODO: go into read-only mode */ + assert(0); + } +} + +static uint64_t +ftl_reserve_nv_cache(struct ftl_nv_cache *nv_cache, size_t *num_blocks, unsigned int *phase) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + 
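/*
 * Illustrative sketch, not part of the patch above: when the L2P lookup does not
 * resolve to the write-buffer cache, ftl_read_next_logical_addr() returns how many
 * logical blocks starting at the current LBA map to physically contiguous
 * addresses, so ftl_submit_read() can cover the whole run with a single
 * spdk_bdev_read_blocks() call. A self-contained model of that run detection
 * (the real helper additionally stops at invalid or cache-resident addresses;
 * the function name and array-based interface are hypothetical, and the caller
 * is assumed to pass cnt >= 1, as the original does):
 *
 *	static size_t
 *	contiguous_run_len(const uint64_t *phys_offset, size_t cnt)
 *	{
 *		size_t i;
 *
 *		for (i = 1; i < cnt; ++i) {
 *			if (phys_offset[i] != phys_offset[0] + i) {
 *				break;
 *			}
 *		}
 *
 *		return i;
 *	}
 */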
uint64_t num_available, cache_size, cache_addr = FTL_LBA_INVALID; + + cache_size = spdk_bdev_get_num_blocks(bdev); + + pthread_spin_lock(&nv_cache->lock); + if (spdk_unlikely(nv_cache->num_available == 0 || !nv_cache->ready)) { + goto out; + } + + num_available = spdk_min(nv_cache->num_available, *num_blocks); + num_available = spdk_min(num_available, dev->conf.nv_cache.max_request_cnt); + + if (spdk_unlikely(nv_cache->current_addr + num_available > cache_size)) { + *num_blocks = cache_size - nv_cache->current_addr; + } else { + *num_blocks = num_available; + } + + cache_addr = nv_cache->current_addr; + nv_cache->current_addr += *num_blocks; + nv_cache->num_available -= *num_blocks; + *phase = nv_cache->phase; + + if (nv_cache->current_addr == spdk_bdev_get_num_blocks(bdev)) { + nv_cache->current_addr = FTL_NV_CACHE_DATA_OFFSET; + nv_cache->phase = ftl_nv_cache_next_phase(nv_cache->phase); + nv_cache->ready = false; + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_nv_cache_wrap, nv_cache); + } +out: + pthread_spin_unlock(&nv_cache->lock); + return cache_addr; +} + +static struct ftl_io * +ftl_alloc_io_nv_cache(struct ftl_io *parent, size_t num_blocks) +{ + struct ftl_io_init_opts opts = { + .dev = parent->dev, + .parent = parent, + .iovcnt = 0, + .num_blocks = num_blocks, + .flags = parent->flags | FTL_IO_CACHE, + }; + + return ftl_io_init_internal(&opts); +} + +static void +ftl_nv_cache_submit_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_io *io = cb_arg; + struct ftl_nv_cache *nv_cache = &io->dev->nv_cache; + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Non-volatile cache write failed at %"PRIx64"\n", io->addr.offset); + io->status = -EIO; + } + + ftl_io_dec_req(io); + if (ftl_io_done(io)) { + spdk_mempool_put(nv_cache->md_pool, io->md); + ftl_io_complete(io); + } + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_submit_nv_cache(void *ctx) +{ + struct ftl_io *io = ctx; + struct spdk_ftl_dev *dev = io->dev; + struct spdk_thread *thread; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_io_channel *ioch; + int rc; + + ioch = ftl_io_channel_get_ctx(io->ioch); + thread = spdk_io_channel_get_thread(io->ioch); + + rc = spdk_bdev_write_blocks_with_md(nv_cache->bdev_desc, ioch->cache_ioch, + ftl_io_iovec_addr(io), io->md, io->addr.offset, + io->num_blocks, ftl_nv_cache_submit_cb, io); + if (rc == -ENOMEM) { + spdk_thread_send_msg(thread, ftl_submit_nv_cache, io); + return; + } else if (rc) { + SPDK_ERRLOG("Write to persistent cache failed: %s (%"PRIu64", %"PRIu64")\n", + spdk_strerror(-rc), io->addr.offset, io->num_blocks); + spdk_mempool_put(nv_cache->md_pool, io->md); + io->status = -EIO; + ftl_io_complete(io); + return; + } + + ftl_io_advance(io, io->num_blocks); + ftl_io_inc_req(io); +} + +static void +ftl_nv_cache_fill_md(struct ftl_io *io, unsigned int phase) +{ + struct spdk_bdev *bdev; + struct ftl_nv_cache *nv_cache = &io->dev->nv_cache; + uint64_t block_off, lba; + void *md_buf = io->md; + + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + for (block_off = 0; block_off < io->num_blocks; ++block_off) { + lba = ftl_nv_cache_pack_lba(ftl_io_get_lba(io, block_off), phase); + memcpy(md_buf, &lba, sizeof(lba)); + md_buf += spdk_bdev_get_md_size(bdev); + } +} + +static void +_ftl_write_nv_cache(void *ctx) +{ + struct ftl_io *child, *io = ctx; + struct spdk_ftl_dev *dev = io->dev; + struct spdk_thread *thread; + unsigned int phase; + uint64_t num_blocks; + + thread = spdk_io_channel_get_thread(io->ioch); + + while (io->pos < 
io->num_blocks) { + num_blocks = ftl_io_iovec_len_left(io); + + child = ftl_alloc_io_nv_cache(io, num_blocks); + if (spdk_unlikely(!child)) { + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + return; + } + + child->md = spdk_mempool_get(dev->nv_cache.md_pool); + if (spdk_unlikely(!child->md)) { + ftl_io_free(child); + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + break; + } + + /* Reserve area on the write buffer cache */ + child->addr.offset = ftl_reserve_nv_cache(&dev->nv_cache, &num_blocks, &phase); + if (child->addr.offset == FTL_LBA_INVALID) { + spdk_mempool_put(dev->nv_cache.md_pool, child->md); + ftl_io_free(child); + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + break; + } + + /* Shrink the IO if there isn't enough room in the cache to fill the whole iovec */ + if (spdk_unlikely(num_blocks != ftl_io_iovec_len_left(io))) { + ftl_io_shrink_iovec(child, num_blocks); + } + + ftl_nv_cache_fill_md(child, phase); + ftl_submit_nv_cache(child); + } + + if (ftl_io_done(io)) { + ftl_io_complete(io); + } +} + +static void +ftl_write_nv_cache(struct ftl_io *parent) +{ + ftl_io_reset(parent); + parent->flags |= FTL_IO_CACHE; + _ftl_write_nv_cache(parent); +} + +int +ftl_nv_cache_write_header(struct ftl_nv_cache *nv_cache, bool shutdown, + spdk_bdev_io_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct ftl_nv_cache_header *hdr = nv_cache->dma_buf; + struct spdk_bdev *bdev; + struct ftl_io_channel *ioch; + + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + memset(hdr, 0, spdk_bdev_get_block_size(bdev)); + + hdr->phase = (uint8_t)nv_cache->phase; + hdr->size = spdk_bdev_get_num_blocks(bdev); + hdr->uuid = dev->uuid; + hdr->version = FTL_NV_CACHE_HEADER_VERSION; + hdr->current_addr = shutdown ? 
nv_cache->current_addr : FTL_LBA_INVALID; + hdr->checksum = spdk_crc32c_update(hdr, offsetof(struct ftl_nv_cache_header, checksum), 0); + + return spdk_bdev_write_blocks(nv_cache->bdev_desc, ioch->cache_ioch, hdr, 0, 1, + cb_fn, cb_arg); +} + +int +ftl_nv_cache_scrub(struct ftl_nv_cache *nv_cache, spdk_bdev_io_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct ftl_io_channel *ioch; + struct spdk_bdev *bdev; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + return spdk_bdev_write_zeroes_blocks(nv_cache->bdev_desc, ioch->cache_ioch, 1, + spdk_bdev_get_num_blocks(bdev) - 1, + cb_fn, cb_arg); +} + +static void +ftl_write_fail(struct ftl_io *io, int status) +{ + struct ftl_batch *batch = io->batch; + struct spdk_ftl_dev *dev = io->dev; + struct ftl_wbuf_entry *entry; + struct ftl_band *band; + char buf[128]; + + entry = TAILQ_FIRST(&batch->entries); + + band = ftl_band_from_addr(io->dev, entry->addr); + SPDK_ERRLOG("Write failed @addr: %s, status: %d\n", + ftl_addr2str(entry->addr, buf, sizeof(buf)), status); + + /* Close the band and, halt wptr and defrag */ + ftl_halt_writes(dev, band); + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + /* Invalidate meta set by process_writes() */ + ftl_invalidate_addr(dev, entry->addr); + } + + /* Reset the batch back to the write buffer to resend it later */ + TAILQ_INSERT_TAIL(&dev->pending_batches, batch, tailq); +} + +static void +ftl_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_batch *batch = io->batch; + struct ftl_wbuf_entry *entry; + struct ftl_band *band; + struct ftl_addr prev_addr, addr = io->addr; + + if (status) { + ftl_write_fail(io, status); + return; + } + + assert(io->num_blocks == dev->xfer_size); + assert(!(io->flags & FTL_IO_MD)); + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + band = entry->band; + if (!(entry->io_flags & FTL_IO_PAD)) { + /* Verify that the LBA is set for user blocks */ + assert(entry->lba != FTL_LBA_INVALID); + } + + if (band != NULL) { + assert(band->num_reloc_blocks > 0); + band->num_reloc_blocks--; + } + + entry->addr = addr; + if (entry->lba != FTL_LBA_INVALID) { + pthread_spin_lock(&entry->lock); + prev_addr = ftl_l2p_get(dev, entry->lba); + + /* If the l2p was updated in the meantime, don't update band's metadata */ + if (ftl_addr_cached(prev_addr) && + entry == ftl_get_entry_from_addr(dev, prev_addr)) { + /* Setting entry's cache bit needs to be done after metadata */ + /* within the band is updated to make sure that writes */ + /* invalidating the entry clear the metadata as well */ + ftl_band_set_addr(io->band, entry->lba, entry->addr); + entry->valid = true; + } + pthread_spin_unlock(&entry->lock); + } + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write addr:%lu, lba:%lu\n", + entry->addr.offset, entry->lba); + + addr = ftl_band_next_addr(io->band, addr, 1); + } + + ftl_process_flush(dev, batch); + ftl_release_batch(dev, batch); +} + +static void +ftl_update_stats(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry) +{ + if (!(entry->io_flags & FTL_IO_INTERNAL)) { + dev->stats.write_user++; + } + dev->stats.write_total++; +} + +static void +ftl_update_l2p(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry, + struct ftl_addr addr) +{ + struct ftl_addr prev_addr; + struct ftl_wbuf_entry *prev; + struct ftl_band *band; + int valid; + bool io_weak = entry->io_flags & FTL_IO_WEAK; + + 
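/*
 * Illustrative sketch, not part of the patch: the checksum written by
 * ftl_nv_cache_write_header() above covers every field up to, but not including,
 * the checksum member itself, which is why that member has to stay last in
 * struct ftl_nv_cache_header. A recovered header can be validated the same way;
 * the helper name below is hypothetical:
 *
 *	static bool
 *	nv_cache_header_valid(const struct ftl_nv_cache_header *hdr)
 *	{
 *		uint32_t crc;
 *
 *		crc = spdk_crc32c_update(hdr,
 *					 offsetof(struct ftl_nv_cache_header, checksum), 0);
 *
 *		return crc == hdr->checksum &&
 *		       hdr->version == FTL_NV_CACHE_HEADER_VERSION;
 *	}
 */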
prev_addr = ftl_l2p_get(dev, entry->lba); + if (ftl_addr_invalid(prev_addr)) { + ftl_l2p_set(dev, entry->lba, addr); + return; + } + + if (ftl_addr_cached(prev_addr)) { + prev = ftl_get_entry_from_addr(dev, prev_addr); + pthread_spin_lock(&prev->lock); + + /* Re-read the L2P under the lock to protect against updates */ + /* to this LBA from other threads */ + prev_addr = ftl_l2p_get(dev, entry->lba); + + /* If the entry is no longer in cache, another write has been */ + /* scheduled in the meantime, so we can return to evicted path */ + if (!ftl_addr_cached(prev_addr)) { + pthread_spin_unlock(&prev->lock); + goto evicted; + } + + /* + * Relocating block could still reside in cache due to fact that write + * buffers are independent for each IO channel and enough amount of data + * (write unit size) must be collected before it will be submitted to lower + * layer. + * When previous entry wasn't overwritten invalidate old address and entry. + * Otherwise skip relocating block. + */ + if (io_weak && + /* Check if prev_addr was updated in meantime */ + !(ftl_addr_cmp(prev_addr, ftl_get_addr_from_entry(prev)) && + /* Check if relocating address it the same as in previous entry */ + ftl_addr_cmp(prev->addr, entry->addr))) { + pthread_spin_unlock(&prev->lock); + return; + } + + /* + * If previous entry is part of cache and was written into disk remove + * and invalidate it + */ + if (prev->valid) { + ftl_invalidate_addr(dev, prev->addr); + prev->valid = false; + } + + ftl_l2p_set(dev, entry->lba, addr); + pthread_spin_unlock(&prev->lock); + return; + } + +evicted: + /* + * If the L2P's physical address is different than what we expected we don't need to + * do anything (someone's already overwritten our data). + */ + if (io_weak && !ftl_addr_cmp(prev_addr, entry->addr)) { + return; + } + + /* Lock the band containing previous physical address. This assures atomic changes to */ + /* the L2P as wall as metadata. The valid bits in metadata are used to */ + /* check weak writes validity. */ + band = ftl_band_from_addr(dev, prev_addr); + pthread_spin_lock(&band->lba_map.lock); + + valid = ftl_invalidate_addr_unlocked(dev, prev_addr); + + /* If the address has been invalidated already, we don't want to update */ + /* the L2P for weak writes, as it means the write is no longer valid. 
*/ + if (!io_weak || valid) { + ftl_l2p_set(dev, entry->lba, addr); + } + + pthread_spin_unlock(&band->lba_map.lock); +} + +static struct ftl_io * +ftl_io_init_child_write(struct ftl_io *parent, struct ftl_addr addr, ftl_io_fn cb) +{ + struct ftl_io *io; + struct spdk_ftl_dev *dev = parent->dev; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .parent = parent, + .band = parent->band, + .size = sizeof(struct ftl_io), + .flags = 0, + .type = parent->type, + .num_blocks = dev->xfer_size, + .cb_fn = cb, + .iovcnt = 0, + }; + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->addr = addr; + + return io; +} + +static void +ftl_io_child_write_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_zone *zone; + struct ftl_wptr *wptr; + + zone = ftl_band_zone_from_addr(io->band, io->addr); + wptr = ftl_wptr_from_band(io->band); + + zone->busy = false; + zone->info.write_pointer += io->num_blocks; + + if (zone->info.write_pointer == zone->info.zone_id + zone->info.capacity) { + zone->info.state = SPDK_BDEV_ZONE_STATE_FULL; + } + + /* If some other write on the same band failed the write pointer would already be freed */ + if (spdk_likely(wptr)) { + wptr->num_outstanding--; + } +} + +static int +ftl_submit_child_write(struct ftl_wptr *wptr, struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_io *child; + struct ftl_addr addr; + int rc; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + if (spdk_likely(!wptr->direct_mode)) { + addr = wptr->addr; + } else { + assert(io->flags & FTL_IO_DIRECT_ACCESS); + assert(ftl_addr_get_band(dev, io->addr) == wptr->band->id); + addr = io->addr; + } + + /* Split IO to child requests and release zone immediately after child is completed */ + child = ftl_io_init_child_write(io, addr, ftl_io_child_write_cb); + if (!child) { + return -EAGAIN; + } + + wptr->num_outstanding++; + + if (ftl_is_append_supported(dev)) { + rc = spdk_bdev_zone_appendv(dev->base_bdev_desc, ioch->base_ioch, + child->iov, child->iov_cnt, + ftl_addr_get_zone_slba(dev, addr), + dev->xfer_size, ftl_io_cmpl_cb, child); + } else { + rc = spdk_bdev_writev_blocks(dev->base_bdev_desc, ioch->base_ioch, + child->iov, child->iov_cnt, addr.offset, + dev->xfer_size, ftl_io_cmpl_cb, child); + } + + if (rc) { + wptr->num_outstanding--; + ftl_io_fail(child, rc); + ftl_io_complete(child); + SPDK_ERRLOG("spdk_bdev_write_blocks_with_md failed with status:%d, addr:%lu\n", + rc, addr.offset); + return -EIO; + } + + ftl_io_inc_req(child); + ftl_io_advance(child, dev->xfer_size); + + return 0; +} + +static int +ftl_submit_write(struct ftl_wptr *wptr, struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + int rc = 0; + + assert(io->num_blocks % dev->xfer_size == 0); + + while (io->iov_pos < io->iov_cnt) { + /* There are no guarantees of the order of completion of NVMe IO submission queue */ + /* so wait until zone is not busy before submitting another write */ + if (!ftl_is_append_supported(dev) && wptr->zone->busy) { + TAILQ_INSERT_TAIL(&wptr->pending_queue, io, ioch_entry); + rc = -EAGAIN; + break; + } + + rc = ftl_submit_child_write(wptr, io); + if (spdk_unlikely(rc)) { + if (rc == -EAGAIN) { + TAILQ_INSERT_TAIL(&wptr->pending_queue, io, ioch_entry); + } else { + ftl_io_fail(io, rc); + } + break; + } + + ftl_trace_submission(dev, io, wptr->addr, dev->xfer_size); + ftl_wptr_advance(wptr, dev->xfer_size); + } + + if (ftl_io_done(io)) { + /* Parent IO will complete after all children are completed */ + 
ftl_io_complete(io); + } + + return rc; +} + +static void +ftl_flush_pad_batch(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size = 0, num_entries = 0; + + assert(batch != NULL); + assert(batch->num_entries < dev->xfer_size); + + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + num_entries = dev->xfer_size - batch->num_entries; + if (size < num_entries) { + ftl_pad_wbuf(dev, num_entries - size); + } +} + +static bool +ftl_check_io_channel_flush(struct spdk_ftl_dev *dev) +{ + struct ftl_io_channel *ioch; + + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + if (ioch->flush && spdk_ring_count(ioch->free_queue) != ioch->num_entries) { + return true; + } + } + + return false; +} + +static int +ftl_wptr_process_writes(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch; + struct ftl_wbuf_entry *entry; + struct ftl_io *io; + + if (spdk_unlikely(!TAILQ_EMPTY(&wptr->pending_queue))) { + io = TAILQ_FIRST(&wptr->pending_queue); + TAILQ_REMOVE(&wptr->pending_queue, io, ioch_entry); + + if (ftl_submit_write(wptr, io) == -EAGAIN) { + return 0; + } + } + + /* Make sure the band is prepared for writing */ + if (!ftl_wptr_ready(wptr)) { + return 0; + } + + if (dev->halt) { + ftl_wptr_process_shutdown(wptr); + } + + if (spdk_unlikely(wptr->flush)) { + ftl_wptr_pad_band(wptr); + } + + batch = ftl_get_next_batch(dev); + if (!batch) { + /* If there are queued flush requests we need to pad the write buffer to */ + /* force out remaining entries */ + if (!LIST_EMPTY(&dev->flush_list) || ftl_check_io_channel_flush(dev)) { + ftl_flush_pad_batch(dev); + } + + return 0; + } + + io = ftl_io_wbuf_init(dev, wptr->addr, wptr->band, batch, ftl_write_cb); + if (!io) { + goto error; + } + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + /* Update band's relocation stats if the IO comes from reloc */ + if (entry->io_flags & FTL_IO_WEAK) { + if (!spdk_bit_array_get(wptr->band->reloc_bitmap, entry->band->id)) { + spdk_bit_array_set(wptr->band->reloc_bitmap, entry->band->id); + entry->band->num_reloc_bands++; + } + } + + ftl_trace_wbuf_pop(dev, entry); + ftl_update_stats(dev, entry); + } + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write addr:%lx\n", wptr->addr.offset); + + if (ftl_submit_write(wptr, io)) { + /* TODO: we need some recovery here */ + assert(0 && "Write submit failed"); + if (ftl_io_done(io)) { + ftl_io_free(io); + } + } + + return dev->xfer_size; +error: + TAILQ_INSERT_TAIL(&dev->pending_batches, batch, tailq); + return 0; +} + +static int +ftl_process_writes(struct spdk_ftl_dev *dev) +{ + struct ftl_wptr *wptr, *twptr; + size_t num_active = 0; + enum ftl_band_state state; + + LIST_FOREACH_SAFE(wptr, &dev->wptr_list, list_entry, twptr) { + ftl_wptr_process_writes(wptr); + state = wptr->band->state; + + if (state != FTL_BAND_STATE_FULL && + state != FTL_BAND_STATE_CLOSING && + state != FTL_BAND_STATE_CLOSED) { + num_active++; + } + } + + if (num_active < 1) { + ftl_add_wptr(dev); + } + + return 0; +} + +static void +ftl_fill_wbuf_entry(struct ftl_wbuf_entry *entry, struct ftl_io *io) +{ + memcpy(entry->payload, ftl_io_iovec_addr(io), FTL_BLOCK_SIZE); + + if (entry->io_flags & FTL_IO_WEAK) { + entry->band = ftl_band_from_addr(io->dev, io->addr); + entry->addr = ftl_band_next_addr(entry->band, io->addr, io->pos); + entry->band->num_reloc_blocks++; + } + + entry->trace = io->trace; + entry->lba = ftl_io_current_lba(io); +} + +static int +ftl_wbuf_fill(struct 
ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_wbuf_entry *entry; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + while (io->pos < io->num_blocks) { + if (ftl_io_current_lba(io) == FTL_LBA_INVALID) { + ftl_io_advance(io, 1); + continue; + } + + entry = ftl_acquire_wbuf_entry(ioch, io->flags); + if (!entry) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + return 0; + } + + ftl_fill_wbuf_entry(entry, io); + + ftl_trace_wbuf_fill(dev, io); + ftl_update_l2p(dev, entry, ftl_get_addr_from_entry(entry)); + ftl_io_advance(io, 1); + + /* Needs to be done after L2P is updated to avoid race with */ + /* write completion callback when it's processed faster than */ + /* L2P is set in update_l2p(). */ + spdk_ring_enqueue(ioch->submit_queue, (void **)&entry, 1, NULL); + } + + if (ftl_io_done(io)) { + if (ftl_dev_has_nv_cache(dev) && !(io->flags & FTL_IO_BYPASS_CACHE)) { + ftl_write_nv_cache(io); + } else { + TAILQ_INSERT_TAIL(&ioch->write_cmpl_queue, io, ioch_entry); + } + } + + return 0; +} + +static bool +ftl_dev_needs_defrag(struct spdk_ftl_dev *dev) +{ + const struct spdk_ftl_limit *limit = ftl_get_limit(dev, SPDK_FTL_LIMIT_START); + + if (ftl_reloc_is_halted(dev->reloc)) { + return false; + } + + if (ftl_reloc_is_defrag_active(dev->reloc)) { + return false; + } + + if (dev->num_free <= limit->thld) { + return true; + } + + return false; +} + +static double +ftl_band_calc_merit(struct ftl_band *band, size_t *threshold_valid) +{ + size_t usable, valid, invalid; + double vld_ratio; + + /* If the band doesn't have any usable blocks it's of no use */ + usable = ftl_band_num_usable_blocks(band); + if (usable == 0) { + return 0.0; + } + + valid = threshold_valid ? (usable - *threshold_valid) : band->lba_map.num_vld; + invalid = usable - valid; + + /* Add one to avoid division by 0 */ + vld_ratio = (double)invalid / (double)(valid + 1); + return vld_ratio * ftl_band_age(band); +} + +static bool +ftl_band_needs_defrag(struct ftl_band *band, struct spdk_ftl_dev *dev) +{ + struct spdk_ftl_conf *conf = &dev->conf; + size_t thld_vld; + + /* If we're in dire need of free bands, every band is worth defragging */ + if (ftl_current_limit(dev) == SPDK_FTL_LIMIT_CRIT) { + return true; + } + + thld_vld = (ftl_band_num_usable_blocks(band) * conf->invalid_thld) / 100; + + return band->merit > ftl_band_calc_merit(band, &thld_vld); +} + +static struct ftl_band * +ftl_select_defrag_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band, *mband = NULL; + double merit = 0; + + LIST_FOREACH(band, &dev->shut_bands, list_entry) { + assert(band->state == FTL_BAND_STATE_CLOSED); + band->merit = ftl_band_calc_merit(band, NULL); + if (band->merit > merit) { + merit = band->merit; + mband = band; + } + } + + if (mband && !ftl_band_needs_defrag(mband, dev)) { + mband = NULL; + } + + return mband; +} + +static void +ftl_process_relocs(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + if (ftl_dev_needs_defrag(dev)) { + band = ftl_select_defrag_band(dev); + if (band) { + ftl_reloc_add(dev->reloc, band, 0, ftl_get_num_blocks_in_band(dev), 0, true); + ftl_trace_defrag_band(dev, band); + } + } + + ftl_reloc(dev->reloc); +} + +int +ftl_current_limit(const struct spdk_ftl_dev *dev) +{ + return dev->limit; +} + +void +spdk_ftl_dev_get_attrs(const struct spdk_ftl_dev *dev, struct spdk_ftl_attrs *attrs) +{ + attrs->uuid = dev->uuid; + attrs->num_blocks = dev->num_lbas; + attrs->block_size = FTL_BLOCK_SIZE; + attrs->num_zones = ftl_get_num_zones(dev); + attrs->zone_size = 
ftl_get_num_blocks_in_zone(dev); + attrs->conf = dev->conf; + attrs->base_bdev = spdk_bdev_get_name(spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); + + attrs->cache_bdev = NULL; + if (dev->nv_cache.bdev_desc) { + attrs->cache_bdev = spdk_bdev_get_name( + spdk_bdev_desc_get_bdev(dev->nv_cache.bdev_desc)); + } +} + +static void +_ftl_io_write(void *ctx) +{ + ftl_io_write((struct ftl_io *)ctx); +} + +static int +ftl_submit_write_leaf(struct ftl_io *io) +{ + int rc; + + rc = ftl_submit_write(ftl_wptr_from_band(io->band), io); + if (rc == -EAGAIN) { + /* EAGAIN means that the request was put on the pending queue */ + return 0; + } + + return rc; +} + +void +ftl_io_write(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(io->ioch); + + /* Put the IO on retry queue in case IO channel is not initialized */ + if (spdk_unlikely(ioch->index == FTL_IO_CHANNEL_INDEX_INVALID)) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + return; + } + + /* For normal IOs we just need to copy the data onto the write buffer */ + if (!(io->flags & FTL_IO_MD)) { + ftl_io_call_foreach_child(io, ftl_wbuf_fill); + } else { + /* Metadata has its own buffer, so it doesn't have to be copied, so just */ + /* send it the the core thread and schedule the write immediately */ + if (ftl_check_core_thread(dev)) { + ftl_io_call_foreach_child(io, ftl_submit_write_leaf); + } else { + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_write, io); + } + } +} + +int +spdk_ftl_write(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt, + struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_io *io; + + if (iov_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt != ftl_iovec_num_blocks(iov, iov_cnt)) { + return -EINVAL; + } + + if (!dev->initialized) { + return -EBUSY; + } + + io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_WRITE); + if (!io) { + return -ENOMEM; + } + + ftl_io_write(io); + + return 0; +} + +void +ftl_io_read(struct ftl_io *io) +{ + ftl_io_call_foreach_child(io, ftl_submit_read); +} + +int +spdk_ftl_read(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt, + struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_io *io; + + if (iov_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt != ftl_iovec_num_blocks(iov, iov_cnt)) { + return -EINVAL; + } + + if (!dev->initialized) { + return -EBUSY; + } + + io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_READ); + if (!io) { + return -ENOMEM; + } + + ftl_io_read(io); + return 0; +} + +static struct ftl_flush * +ftl_flush_init(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_flush *flush; + + flush = calloc(1, sizeof(*flush)); + if (!flush) { + return NULL; + } + + flush->bmap = spdk_bit_array_create(FTL_BATCH_COUNT); + if (!flush->bmap) { + goto error; + } + + flush->dev = dev; + flush->cb.fn = cb_fn; + flush->cb.ctx = cb_arg; + + return flush; +error: + free(flush); + return NULL; +} + +static void +_ftl_flush(void *ctx) +{ + struct ftl_flush *flush = ctx; + struct spdk_ftl_dev *dev = flush->dev; + uint32_t i; + + /* Attach flush object to all non-empty batches */ + for (i = 0; i < FTL_BATCH_COUNT; ++i) { + if (dev->batch_array[i].num_entries > 0) { + spdk_bit_array_set(flush->bmap, i); + flush->num_req++; + } + } + + 
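/*
 * Illustrative sketch, not part of the patch: a minimal caller of the user-facing
 * write path defined above. spdk_ftl_write() rejects the request with -EINVAL
 * unless lba_cnt matches the iovec payload exactly
 * (lba_cnt == ftl_iovec_num_blocks(iov, iov_cnt)), so the buffer must be a whole
 * number of FTL_BLOCK_SIZE blocks. The caller-side names are hypothetical and the
 * device and IO channel are assumed to be already set up:
 *
 *	static void
 *	write_done(void *ctx, int status)
 *	{
 *		if (status != 0) {
 *			SPDK_ERRLOG("FTL write failed: %d\n", status);
 *		}
 *	}
 *
 *	static int
 *	write_one_block(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch,
 *			void *buf, uint64_t lba)
 *	{
 *		struct iovec iov = {
 *			.iov_base = buf,
 *			.iov_len = FTL_BLOCK_SIZE,
 *		};
 *
 *		return spdk_ftl_write(dev, ch, lba, 1, &iov, 1, write_done, NULL);
 *	}
 */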
LIST_INSERT_HEAD(&dev->flush_list, flush, list_entry); + + /* If the write buffer was already empty, the flush can be completed right away */ + if (!flush->num_req) { + ftl_complete_flush(flush); + } +} + +int +ftl_flush_wbuf(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_flush *flush; + + flush = ftl_flush_init(dev, cb_fn, cb_arg); + if (!flush) { + return -ENOMEM; + } + + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_flush, flush); + return 0; +} + +int +spdk_ftl_flush(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + if (!dev->initialized) { + return -EBUSY; + } + + return ftl_flush_wbuf(dev, cb_fn, cb_arg); +} + +bool +ftl_addr_is_written(struct ftl_band *band, struct ftl_addr addr) +{ + struct ftl_zone *zone = ftl_band_zone_from_addr(band, addr); + + return addr.offset < zone->info.write_pointer; +} + +static void ftl_process_media_event(struct spdk_ftl_dev *dev, struct spdk_bdev_media_event event); + +static void +_ftl_process_media_event(void *ctx) +{ + struct ftl_media_event *event = ctx; + struct spdk_ftl_dev *dev = event->dev; + + ftl_process_media_event(dev, event->event); + spdk_mempool_put(dev->media_events_pool, event); +} + +static void +ftl_process_media_event(struct spdk_ftl_dev *dev, struct spdk_bdev_media_event event) +{ + struct ftl_band *band; + struct ftl_addr addr = { .offset = event.offset }; + size_t block_off; + + if (!ftl_check_core_thread(dev)) { + struct ftl_media_event *media_event; + + media_event = spdk_mempool_get(dev->media_events_pool); + if (!media_event) { + SPDK_ERRLOG("Media event lost due to lack of memory"); + return; + } + + media_event->dev = dev; + media_event->event = event; + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_process_media_event, + media_event); + return; + } + + band = ftl_band_from_addr(dev, addr); + block_off = ftl_band_block_offset_from_addr(band, addr); + + ftl_reloc_add(dev->reloc, band, block_off, event.num_blocks, 0, false); +} + +void +ftl_get_media_events(struct spdk_ftl_dev *dev) +{ +#define FTL_MAX_MEDIA_EVENTS 128 + struct spdk_bdev_media_event events[FTL_MAX_MEDIA_EVENTS]; + size_t num_events, i; + + if (!dev->initialized) { + return; + } + + do { + num_events = spdk_bdev_get_media_events(dev->base_bdev_desc, + events, FTL_MAX_MEDIA_EVENTS); + + for (i = 0; i < num_events; ++i) { + ftl_process_media_event(dev, events[i]); + } + + } while (num_events); +} + +int +ftl_io_channel_poll(void *arg) +{ + struct ftl_io_channel *ch = arg; + struct ftl_io *io; + TAILQ_HEAD(, ftl_io) retry_queue; + + if (TAILQ_EMPTY(&ch->write_cmpl_queue) && TAILQ_EMPTY(&ch->retry_queue)) { + return SPDK_POLLER_IDLE; + } + + while (!TAILQ_EMPTY(&ch->write_cmpl_queue)) { + io = TAILQ_FIRST(&ch->write_cmpl_queue); + TAILQ_REMOVE(&ch->write_cmpl_queue, io, ioch_entry); + ftl_io_complete(io); + } + + /* + * Create local copy of the retry queue to prevent from infinite retrying if IO will be + * inserted to the retry queue again + */ + TAILQ_INIT(&retry_queue); + TAILQ_SWAP(&ch->retry_queue, &retry_queue, ftl_io, ioch_entry); + + while (!TAILQ_EMPTY(&retry_queue)) { + io = TAILQ_FIRST(&retry_queue); + TAILQ_REMOVE(&retry_queue, io, ioch_entry); + if (io->type == FTL_IO_WRITE) { + ftl_io_write(io); + } else { + ftl_io_read(io); + } + } + + return SPDK_POLLER_BUSY; +} + +int +ftl_task_core(void *ctx) +{ + struct spdk_ftl_dev *dev = ctx; + + if (dev->halt) { + if (ftl_shutdown_complete(dev)) { + spdk_poller_unregister(&dev->core_poller); + return SPDK_POLLER_IDLE; + } + } + + ftl_process_writes(dev); 
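/*
 * Illustrative sketch, not part of the patch: pairing buffered writes with
 * spdk_ftl_flush(), which is defined above. The flush attaches itself to every
 * batch that currently holds buffered entries and its callback fires only after
 * all of those batches have been written out, so its completion means the data
 * buffered at submission time has reached the underlying bdev. Caller-side names
 * are hypothetical:
 *
 *	static void
 *	flush_done(void *ctx, int status)
 *	{
 *		// data buffered before the spdk_ftl_flush() call has been written
 *	}
 *
 *	static int
 *	flush_after_writes(struct spdk_ftl_dev *dev)
 *	{
 *		// returns -EBUSY if the device is not initialized,
 *		// -ENOMEM if the flush object cannot be allocated
 *		return spdk_ftl_flush(dev, flush_done, NULL);
 *	}
 */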
+ ftl_process_relocs(dev); + + return SPDK_POLLER_BUSY; +} + +SPDK_LOG_REGISTER_COMPONENT("ftl_core", SPDK_LOG_FTL_CORE) diff --git a/src/spdk/lib/ftl/ftl_core.h b/src/spdk/lib/ftl/ftl_core.h new file mode 100644 index 000000000..b782ba731 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_core.h @@ -0,0 +1,552 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_CORE_H +#define FTL_CORE_H + +#include "spdk/stdinc.h" +#include "spdk/uuid.h" +#include "spdk/thread.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/ftl.h" +#include "spdk/bdev.h" +#include "spdk/bdev_zone.h" + +#include "ftl_addr.h" +#include "ftl_io.h" +#include "ftl_trace.h" + +#ifdef SPDK_CONFIG_PMDK +#include "libpmem.h" +#endif /* SPDK_CONFIG_PMDK */ + +struct spdk_ftl_dev; +struct ftl_band; +struct ftl_zone; +struct ftl_io; +struct ftl_restore; +struct ftl_wptr; +struct ftl_flush; +struct ftl_reloc; +struct ftl_anm_event; +struct ftl_band_flush; + +struct ftl_stats { + /* Number of writes scheduled directly by the user */ + uint64_t write_user; + + /* Total number of writes */ + uint64_t write_total; + + /* Traces */ + struct ftl_trace trace; + + /* Number of limits applied */ + uint64_t limits[SPDK_FTL_LIMIT_MAX]; +}; + +struct ftl_global_md { + /* Device instance */ + struct spdk_uuid uuid; + /* Size of the l2p table */ + uint64_t num_lbas; +}; + +struct ftl_nv_cache { + /* Write buffer cache bdev */ + struct spdk_bdev_desc *bdev_desc; + /* Write pointer */ + uint64_t current_addr; + /* Number of available blocks left */ + uint64_t num_available; + /* Maximum number of blocks */ + uint64_t num_data_blocks; + /* + * Phase of the current cycle of writes. Each time whole cache area is filled, the phase is + * advanced. Current phase is saved in every IO's metadata, as well as in the header saved + * in the first sector. 
By looking at the phase of each block, it's possible to find the + * oldest block and replay the order of the writes when recovering the data from the cache. + */ + unsigned int phase; + /* Indicates that the data can be written to the cache */ + bool ready; + /* Metadata pool */ + struct spdk_mempool *md_pool; + /* DMA buffer for writing the header */ + void *dma_buf; + /* Cache lock */ + pthread_spinlock_t lock; +}; + +struct ftl_batch { + /* Queue of write buffer entries, can reach up to xfer_size entries */ + TAILQ_HEAD(, ftl_wbuf_entry) entries; + /* Number of entries in the queue above */ + uint32_t num_entries; + /* Index within spdk_ftl_dev.batch_array */ + uint32_t index; + struct iovec *iov; + void *metadata; + TAILQ_ENTRY(ftl_batch) tailq; +}; + +struct spdk_ftl_dev { + /* Device instance */ + struct spdk_uuid uuid; + /* Device name */ + char *name; + /* Configuration */ + struct spdk_ftl_conf conf; + + /* Indicates the device is fully initialized */ + int initialized; + /* Indicates the device is about to be stopped */ + int halt; + /* Indicates the device is about to start stopping - use to handle multiple stop request */ + bool halt_started; + + /* Underlying device */ + struct spdk_bdev_desc *base_bdev_desc; + + /* Non-volatile write buffer cache */ + struct ftl_nv_cache nv_cache; + + /* LBA map memory pool */ + struct spdk_mempool *lba_pool; + + /* LBA map requests pool */ + struct spdk_mempool *lba_request_pool; + + /* Media management events pool */ + struct spdk_mempool *media_events_pool; + + /* Statistics */ + struct ftl_stats stats; + + /* Current sequence number */ + uint64_t seq; + + /* Array of bands */ + struct ftl_band *bands; + /* Number of operational bands */ + size_t num_bands; + /* Next write band */ + struct ftl_band *next_band; + /* Free band list */ + LIST_HEAD(, ftl_band) free_bands; + /* Closed bands list */ + LIST_HEAD(, ftl_band) shut_bands; + /* Number of free bands */ + size_t num_free; + + /* List of write pointers */ + LIST_HEAD(, ftl_wptr) wptr_list; + + /* Logical -> physical table */ + void *l2p; + /* Size of the l2p table */ + uint64_t num_lbas; + /* Size of pages mmapped for l2p, valid only for mapping on persistent memory */ + size_t l2p_pmem_len; + + /* Address size */ + size_t addr_len; + + /* Flush list */ + LIST_HEAD(, ftl_flush) flush_list; + /* List of band flush requests */ + LIST_HEAD(, ftl_band_flush) band_flush_list; + + /* Device specific md buffer */ + struct ftl_global_md global_md; + + /* Metadata size */ + size_t md_size; + void *md_buf; + + /* Transfer unit size */ + size_t xfer_size; + + /* Current user write limit */ + int limit; + + /* Inflight IO operations */ + uint32_t num_inflight; + + /* Manages data relocation */ + struct ftl_reloc *reloc; + + /* Thread on which the poller is running */ + struct spdk_thread *core_thread; + /* IO channel */ + struct spdk_io_channel *ioch; + /* Poller */ + struct spdk_poller *core_poller; + + /* IO channel array provides means for retrieving write buffer entries + * from their address stored in L2P. The address is divided into two + * parts - IO channel offset poining at specific IO channel (within this + * array) and entry offset pointing at specific entry within that IO + * channel. + */ + struct ftl_io_channel **ioch_array; + TAILQ_HEAD(, ftl_io_channel) ioch_queue; + uint64_t num_io_channels; + /* Value required to shift address of a write buffer entry to retrieve + * the IO channel it's part of. 
The other part of the address describes + * the offset of an entry within the IO channel's entry array. + */ + uint64_t ioch_shift; + + /* Write buffer batches */ +#define FTL_BATCH_COUNT 4096 + struct ftl_batch batch_array[FTL_BATCH_COUNT]; + /* Iovec buffer used by batches */ + struct iovec *iov_buf; + /* Batch currently being filled */ + struct ftl_batch *current_batch; + /* Full and ready to be sent batches. A batch is put on this queue in + * case it's already filled, but cannot be sent. + */ + TAILQ_HEAD(, ftl_batch) pending_batches; + TAILQ_HEAD(, ftl_batch) free_batches; + + /* Devices' list */ + STAILQ_ENTRY(spdk_ftl_dev) stailq; +}; + +struct ftl_nv_cache_header { + /* Version of the header */ + uint32_t version; + /* UUID of the FTL device */ + struct spdk_uuid uuid; + /* Size of the non-volatile cache (in blocks) */ + uint64_t size; + /* Contains the next address to be written after clean shutdown, invalid LBA otherwise */ + uint64_t current_addr; + /* Current phase */ + uint8_t phase; + /* Checksum of the header, needs to be last element */ + uint32_t checksum; +} __attribute__((packed)); + +struct ftl_media_event { + /* Owner */ + struct spdk_ftl_dev *dev; + /* Media event */ + struct spdk_bdev_media_event event; +}; + +typedef void (*ftl_restore_fn)(struct ftl_restore *, int, void *cb_arg); + +void ftl_apply_limits(struct spdk_ftl_dev *dev); +void ftl_io_read(struct ftl_io *io); +void ftl_io_write(struct ftl_io *io); +int ftl_flush_wbuf(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg); +int ftl_current_limit(const struct spdk_ftl_dev *dev); +int ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr); +int ftl_task_core(void *ctx); +int ftl_task_read(void *ctx); +void ftl_process_anm_event(struct ftl_anm_event *event); +size_t ftl_tail_md_num_blocks(const struct spdk_ftl_dev *dev); +size_t ftl_tail_md_hdr_num_blocks(void); +size_t ftl_vld_map_num_blocks(const struct spdk_ftl_dev *dev); +size_t ftl_lba_map_num_blocks(const struct spdk_ftl_dev *dev); +size_t ftl_head_md_num_blocks(const struct spdk_ftl_dev *dev); +int ftl_restore_md(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg); +int ftl_restore_device(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg); +void ftl_restore_nv_cache(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg); +int ftl_band_set_direct_access(struct ftl_band *band, bool access); +bool ftl_addr_is_written(struct ftl_band *band, struct ftl_addr addr); +int ftl_flush_active_bands(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg); +int ftl_nv_cache_write_header(struct ftl_nv_cache *nv_cache, bool shutdown, + spdk_bdev_io_completion_cb cb_fn, void *cb_arg); +int ftl_nv_cache_scrub(struct ftl_nv_cache *nv_cache, spdk_bdev_io_completion_cb cb_fn, + void *cb_arg); +void ftl_get_media_events(struct spdk_ftl_dev *dev); +int ftl_io_channel_poll(void *arg); +void ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry); +struct spdk_io_channel *ftl_get_io_channel(const struct spdk_ftl_dev *dev); +struct ftl_io_channel *ftl_io_channel_get_ctx(struct spdk_io_channel *ioch); + + +#define ftl_to_addr(address) \ + (struct ftl_addr) { .offset = (uint64_t)(address) } + +#define ftl_to_addr_packed(address) \ + (struct ftl_addr) { .pack.offset = (uint32_t)(address) } + +static inline struct spdk_thread * +ftl_get_core_thread(const struct spdk_ftl_dev *dev) +{ + return dev->core_thread; +} + +static inline size_t +ftl_get_num_bands(const struct spdk_ftl_dev *dev) +{ + return dev->num_bands; +} 
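/*
 * Illustrative worked example, not part of the patch: the geometry helpers
 * defined below treat a physical block offset as a flat index over bands, where
 * each band is striped across all parallel units (punits) and each punit
 * contributes one zone per band:
 *
 *	band        = offset / (num_punits * blocks_in_zone)
 *	punit       = (offset / blocks_in_zone) % num_punits
 *	zone_offset = offset % blocks_in_zone
 *
 * With a hypothetical geometry of 0x1000 blocks per zone and 8 punits
 * (0x8000 blocks per band), offset 0x9234 maps to band 1, punit 1 and
 * zone offset 0x234.
 */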
+ +static inline size_t +ftl_get_num_punits(const struct spdk_ftl_dev *dev) +{ + return spdk_bdev_get_optimal_open_zones(spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); +} + +static inline size_t +ftl_get_num_zones(const struct spdk_ftl_dev *dev) +{ + return ftl_get_num_bands(dev) * ftl_get_num_punits(dev); +} + +static inline size_t +ftl_get_num_blocks_in_zone(const struct spdk_ftl_dev *dev) +{ + return spdk_bdev_get_zone_size(spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); +} + +static inline uint64_t +ftl_get_num_blocks_in_band(const struct spdk_ftl_dev *dev) +{ + return ftl_get_num_punits(dev) * ftl_get_num_blocks_in_zone(dev); +} + +static inline uint64_t +ftl_addr_get_zone_slba(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return addr.offset -= (addr.offset % ftl_get_num_blocks_in_zone(dev)); +} + +static inline uint64_t +ftl_addr_get_band(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return addr.offset / ftl_get_num_blocks_in_band(dev); +} + +static inline uint64_t +ftl_addr_get_punit(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return (addr.offset / ftl_get_num_blocks_in_zone(dev)) % ftl_get_num_punits(dev); +} + +static inline uint64_t +ftl_addr_get_zone_offset(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return addr.offset % ftl_get_num_blocks_in_zone(dev); +} + +static inline size_t +ftl_vld_map_size(const struct spdk_ftl_dev *dev) +{ + return (size_t)spdk_divide_round_up(ftl_get_num_blocks_in_band(dev), CHAR_BIT); +} + +static inline int +ftl_addr_packed(const struct spdk_ftl_dev *dev) +{ + return dev->addr_len < 32; +} + +static inline void +ftl_l2p_lba_persist(const struct spdk_ftl_dev *dev, uint64_t lba) +{ +#ifdef SPDK_CONFIG_PMDK + size_t ftl_addr_size = ftl_addr_packed(dev) ? 4 : 8; + pmem_persist((char *)dev->l2p + (lba * ftl_addr_size), ftl_addr_size); +#else /* SPDK_CONFIG_PMDK */ + SPDK_ERRLOG("Libpmem not available, cannot flush l2p to pmem\n"); + assert(0); +#endif /* SPDK_CONFIG_PMDK */ +} + +static inline int +ftl_addr_invalid(struct ftl_addr addr) +{ + return addr.offset == ftl_to_addr(FTL_ADDR_INVALID).offset; +} + +static inline int +ftl_addr_cached(struct ftl_addr addr) +{ + return !ftl_addr_invalid(addr) && addr.cached; +} + +static inline struct ftl_addr +ftl_addr_to_packed(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_addr p = {}; + + if (ftl_addr_invalid(addr)) { + p = ftl_to_addr_packed(FTL_ADDR_INVALID); + } else if (ftl_addr_cached(addr)) { + p.pack.cached = 1; + p.pack.cache_offset = (uint32_t) addr.cache_offset; + } else { + p.pack.offset = (uint32_t) addr.offset; + } + + return p; +} + +static inline struct ftl_addr +ftl_addr_from_packed(const struct spdk_ftl_dev *dev, struct ftl_addr p) +{ + struct ftl_addr addr = {}; + + if (p.pack.offset == (uint32_t)FTL_ADDR_INVALID) { + addr = ftl_to_addr(FTL_ADDR_INVALID); + } else if (p.pack.cached) { + addr.cached = 1; + addr.cache_offset = p.pack.cache_offset; + } else { + addr = p; + } + + return addr; +} + +#define _ftl_l2p_set(l2p, off, val, bits) \ + __atomic_store_n(((uint##bits##_t *)(l2p)) + (off), val, __ATOMIC_SEQ_CST) + +#define _ftl_l2p_set32(l2p, off, val) \ + _ftl_l2p_set(l2p, off, val, 32) + +#define _ftl_l2p_set64(l2p, off, val) \ + _ftl_l2p_set(l2p, off, val, 64) + +#define _ftl_l2p_get(l2p, off, bits) \ + __atomic_load_n(((uint##bits##_t *)(l2p)) + (off), __ATOMIC_SEQ_CST) + +#define _ftl_l2p_get32(l2p, off) \ + _ftl_l2p_get(l2p, off, 32) + +#define _ftl_l2p_get64(l2p, off) \ + _ftl_l2p_get(l2p, off, 64) + +#define 
ftl_addr_cmp(p1, p2) \ + ((p1).offset == (p2).offset) + +static inline void +ftl_l2p_set(struct spdk_ftl_dev *dev, uint64_t lba, struct ftl_addr addr) +{ + assert(dev->num_lbas > lba); + + if (ftl_addr_packed(dev)) { + _ftl_l2p_set32(dev->l2p, lba, ftl_addr_to_packed(dev, addr).offset); + } else { + _ftl_l2p_set64(dev->l2p, lba, addr.offset); + } + + if (dev->l2p_pmem_len != 0) { + ftl_l2p_lba_persist(dev, lba); + } +} + +static inline struct ftl_addr +ftl_l2p_get(struct spdk_ftl_dev *dev, uint64_t lba) +{ + assert(dev->num_lbas > lba); + + if (ftl_addr_packed(dev)) { + return ftl_addr_from_packed(dev, ftl_to_addr_packed( + _ftl_l2p_get32(dev->l2p, lba))); + } else { + return ftl_to_addr(_ftl_l2p_get64(dev->l2p, lba)); + } +} + +static inline bool +ftl_dev_has_nv_cache(const struct spdk_ftl_dev *dev) +{ + return dev->nv_cache.bdev_desc != NULL; +} + +#define FTL_NV_CACHE_HEADER_VERSION (1) +#define FTL_NV_CACHE_DATA_OFFSET (1) +#define FTL_NV_CACHE_PHASE_OFFSET (62) +#define FTL_NV_CACHE_PHASE_COUNT (4) +#define FTL_NV_CACHE_PHASE_MASK (3ULL << FTL_NV_CACHE_PHASE_OFFSET) +#define FTL_NV_CACHE_LBA_INVALID (FTL_LBA_INVALID & ~FTL_NV_CACHE_PHASE_MASK) + +static inline bool +ftl_nv_cache_phase_is_valid(unsigned int phase) +{ + return phase > 0 && phase <= 3; +} + +static inline unsigned int +ftl_nv_cache_next_phase(unsigned int current) +{ + static const unsigned int phases[] = { 0, 2, 3, 1 }; + assert(ftl_nv_cache_phase_is_valid(current)); + return phases[current]; +} + +static inline unsigned int +ftl_nv_cache_prev_phase(unsigned int current) +{ + static const unsigned int phases[] = { 0, 3, 1, 2 }; + assert(ftl_nv_cache_phase_is_valid(current)); + return phases[current]; +} + +static inline uint64_t +ftl_nv_cache_pack_lba(uint64_t lba, unsigned int phase) +{ + assert(ftl_nv_cache_phase_is_valid(phase)); + return (lba & ~FTL_NV_CACHE_PHASE_MASK) | ((uint64_t)phase << FTL_NV_CACHE_PHASE_OFFSET); +} + +static inline void +ftl_nv_cache_unpack_lba(uint64_t in_lba, uint64_t *out_lba, unsigned int *phase) +{ + *out_lba = in_lba & ~FTL_NV_CACHE_PHASE_MASK; + *phase = (in_lba & FTL_NV_CACHE_PHASE_MASK) >> FTL_NV_CACHE_PHASE_OFFSET; + + /* If the phase is invalid the block wasn't written yet, so treat the LBA as invalid too */ + if (!ftl_nv_cache_phase_is_valid(*phase) || *out_lba == FTL_NV_CACHE_LBA_INVALID) { + *out_lba = FTL_LBA_INVALID; + } +} + +static inline bool +ftl_is_append_supported(const struct spdk_ftl_dev *dev) +{ + return dev->conf.use_append; +} + +#endif /* FTL_CORE_H */ diff --git a/src/spdk/lib/ftl/ftl_debug.c b/src/spdk/lib/ftl/ftl_debug.c new file mode 100644 index 000000000..9fbb43810 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_debug.c @@ -0,0 +1,169 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk_internal/log.h" +#include "spdk/ftl.h" +#include "ftl_debug.h" +#include "ftl_band.h" + +#if defined(DEBUG) +#if defined(FTL_META_DEBUG) + +static const char *ftl_band_state_str[] = { + "free", + "prep", + "opening", + "open", + "full", + "closing", + "closed", + "max" +}; + +bool +ftl_band_validate_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_addr addr_md, addr_l2p; + size_t i, size, seg_off; + bool valid = true; + + size = ftl_get_num_blocks_in_band(dev); + + pthread_spin_lock(&lba_map->lock); + for (i = 0; i < size; ++i) { + if (!spdk_bit_array_get(lba_map->vld, i)) { + continue; + } + + seg_off = i / FTL_NUM_LBA_IN_BLOCK; + if (lba_map->segments[seg_off] != FTL_LBA_MAP_SEG_CACHED) { + continue; + } + + addr_md = ftl_band_addr_from_block_offset(band, i); + addr_l2p = ftl_l2p_get(dev, lba_map->map[i]); + + if (addr_l2p.cached) { + continue; + } + + if (addr_l2p.offset != addr_md.offset) { + valid = false; + break; + } + + } + + pthread_spin_unlock(&lba_map->lock); + + return valid; +} + +void +ftl_dev_dump_bands(struct spdk_ftl_dev *dev) +{ + size_t i, total = 0; + + if (!dev->bands) { + return; + } + + ftl_debug("Bands validity:\n"); + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + if (dev->bands[i].state == FTL_BAND_STATE_FREE && + dev->bands[i].wr_cnt == 0) { + continue; + } + + if (!dev->bands[i].num_zones) { + ftl_debug(" Band %3zu: all zones are offline\n", i + 1); + continue; + } + + total += dev->bands[i].lba_map.num_vld; + ftl_debug(" Band %3zu: %8zu / %zu \tnum_zones: %zu \twr_cnt: %"PRIu64"\tmerit:" + "%10.3f\tstate: %s\n", + i + 1, dev->bands[i].lba_map.num_vld, + ftl_band_user_blocks(&dev->bands[i]), + dev->bands[i].num_zones, + dev->bands[i].wr_cnt, + dev->bands[i].merit, + ftl_band_state_str[dev->bands[i].state]); + } +} + +#endif /* defined(FTL_META_DEBUG) */ + +#if defined(FTL_DUMP_STATS) + +void +ftl_dev_dump_stats(const struct spdk_ftl_dev *dev) +{ + size_t i, total = 0; + char uuid[SPDK_UUID_STRING_LEN]; + double waf; + const char *limits[] = { + [SPDK_FTL_LIMIT_CRIT] = "crit", + [SPDK_FTL_LIMIT_HIGH] = "high", + [SPDK_FTL_LIMIT_LOW] = "low", + [SPDK_FTL_LIMIT_START] = "start" + }; + + if (!dev->bands) { + return; + } + + /* Count the number of valid LBAs */ + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + total += dev->bands[i].lba_map.num_vld; + } + + waf = (double)dev->stats.write_total / (double)dev->stats.write_user; + + spdk_uuid_fmt_lower(uuid, sizeof(uuid), &dev->uuid); + ftl_debug("\n"); + ftl_debug("device UUID: %s\n", uuid); + ftl_debug("total valid LBAs: %zu\n", total); + ftl_debug("total writes: %"PRIu64"\n", dev->stats.write_total); + ftl_debug("user 
writes: %"PRIu64"\n", dev->stats.write_user); + ftl_debug("WAF: %.4lf\n", waf); + ftl_debug("limits:\n"); + for (i = 0; i < SPDK_FTL_LIMIT_MAX; ++i) { + ftl_debug(" %5s: %"PRIu64"\n", limits[i], dev->stats.limits[i]); + } +} + +#endif /* defined(FTL_DUMP_STATS) */ +#endif /* defined(DEBUG) */ diff --git a/src/spdk/lib/ftl/ftl_debug.h b/src/spdk/lib/ftl/ftl_debug.h new file mode 100644 index 000000000..c90c92ef2 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_debug.h @@ -0,0 +1,73 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_DEBUG_H +#define FTL_DEBUG_H + +#include "ftl_addr.h" +#include "ftl_band.h" +#include "ftl_core.h" + +#if defined(DEBUG) +/* Debug flags - enabled when defined */ +#define FTL_META_DEBUG 1 +#define FTL_DUMP_STATS 1 + +#define ftl_debug(msg, ...) \ + SPDK_ERRLOG(msg, ## __VA_ARGS__) +#else +#define ftl_debug(msg, ...) +#endif + +static inline const char * +ftl_addr2str(struct ftl_addr addr, char *buf, size_t size) +{ + snprintf(buf, size, "(%"PRIu64")", addr.offset); + return buf; +} + +#if defined(FTL_META_DEBUG) +bool ftl_band_validate_md(struct ftl_band *band); +void ftl_dev_dump_bands(struct spdk_ftl_dev *dev); +#else +#define ftl_band_validate_md(band) +#define ftl_dev_dump_bands(dev) +#endif + +#if defined(FTL_DUMP_STATS) +void ftl_dev_dump_stats(const struct spdk_ftl_dev *dev); +#else +#define ftl_dev_dump_stats(dev) +#endif + +#endif /* FTL_DEBUG_H */ diff --git a/src/spdk/lib/ftl/ftl_init.c b/src/spdk/lib/ftl/ftl_init.c new file mode 100644 index 000000000..15a8c21c9 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_init.c @@ -0,0 +1,1688 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/nvme.h" +#include "spdk/thread.h" +#include "spdk/string.h" +#include "spdk/likely.h" +#include "spdk_internal/log.h" +#include "spdk/ftl.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/bdev_zone.h" +#include "spdk/bdev_module.h" +#include "spdk/config.h" + +#include "ftl_core.h" +#include "ftl_io.h" +#include "ftl_reloc.h" +#include "ftl_band.h" +#include "ftl_debug.h" + +#ifdef SPDK_CONFIG_PMDK +#include "libpmem.h" +#endif /* SPDK_CONFIG_PMDK */ + +#define FTL_CORE_RING_SIZE 4096 +#define FTL_INIT_TIMEOUT 30 +#define FTL_NSID 1 +#define FTL_ZONE_INFO_COUNT 64 + +/* Dummy bdev module used to to claim bdevs. 
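+ * Claiming prevents other bdev modules from opening these bdevs while FTL owns them.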
*/ +static struct spdk_bdev_module g_ftl_bdev_module = { + .name = "ftl_lib", +}; + +struct ftl_dev_init_ctx { + /* Owner */ + struct spdk_ftl_dev *dev; + /* Initial arguments */ + struct spdk_ftl_dev_init_opts opts; + /* IO channel for zone info retrieving */ + struct spdk_io_channel *ioch; + /* Buffer for reading zone info */ + struct spdk_bdev_zone_info info[FTL_ZONE_INFO_COUNT]; + /* Currently read zone */ + size_t zone_id; + /* User's callback */ + spdk_ftl_init_fn cb_fn; + /* Callback's argument */ + void *cb_arg; + /* Thread to call the callback on */ + struct spdk_thread *thread; + /* Poller to check if the device has been destroyed/initialized */ + struct spdk_poller *poller; + /* Status to return for halt completion callback */ + int halt_complete_status; +}; + +static STAILQ_HEAD(, spdk_ftl_dev) g_ftl_queue = STAILQ_HEAD_INITIALIZER(g_ftl_queue); +static pthread_mutex_t g_ftl_queue_lock = PTHREAD_MUTEX_INITIALIZER; +static const struct spdk_ftl_conf g_default_conf = { + .limits = { + /* 5 free bands / 0 % host writes */ + [SPDK_FTL_LIMIT_CRIT] = { .thld = 5, .limit = 0 }, + /* 10 free bands / 5 % host writes */ + [SPDK_FTL_LIMIT_HIGH] = { .thld = 10, .limit = 5 }, + /* 20 free bands / 40 % host writes */ + [SPDK_FTL_LIMIT_LOW] = { .thld = 20, .limit = 40 }, + /* 40 free bands / 100 % host writes - defrag starts running */ + [SPDK_FTL_LIMIT_START] = { .thld = 40, .limit = 100 }, + }, + /* 10 percent valid blocks */ + .invalid_thld = 10, + /* 20% spare blocks */ + .lba_rsvd = 20, + /* 6M write buffer per each IO channel */ + .write_buffer_size = 6 * 1024 * 1024, + /* 90% band fill threshold */ + .band_thld = 90, + /* Max 32 IO depth per band relocate */ + .max_reloc_qdepth = 32, + /* Max 3 active band relocates */ + .max_active_relocs = 3, + /* IO pool size per user thread (this should be adjusted to thread IO qdepth) */ + .user_io_pool_size = 2048, + /* + * If clear ftl will return error when restoring after a dirty shutdown + * If set, last band will be padded, ftl will restore based only on closed bands - this + * will result in lost data after recovery. 
+ */ + .allow_open_bands = false, + .max_io_channels = 128, + .nv_cache = { + /* Maximum number of concurrent requests */ + .max_request_cnt = 2048, + /* Maximum number of blocks per request */ + .max_request_size = 16, + } +}; + +static int +ftl_band_init_md(struct ftl_band *band) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + int rc; + + lba_map->vld = spdk_bit_array_create(ftl_get_num_blocks_in_band(band->dev)); + if (!lba_map->vld) { + return -ENOMEM; + } + + rc = pthread_spin_init(&lba_map->lock, PTHREAD_PROCESS_PRIVATE); + if (rc) { + spdk_bit_array_free(&lba_map->vld); + return rc; + } + ftl_band_md_clear(band); + return 0; +} + +static int +ftl_check_conf(const struct spdk_ftl_dev *dev, const struct spdk_ftl_conf *conf) +{ + size_t i; + + if (conf->invalid_thld >= 100) { + return -1; + } + if (conf->lba_rsvd >= 100) { + return -1; + } + if (conf->lba_rsvd == 0) { + return -1; + } + if (conf->write_buffer_size == 0) { + return -1; + } + if (conf->write_buffer_size % FTL_BLOCK_SIZE != 0) { + return -1; + } + + for (i = 0; i < SPDK_FTL_LIMIT_MAX; ++i) { + if (conf->limits[i].limit > 100) { + return -1; + } + } + + return 0; +} + +static int +ftl_dev_init_bands(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band, *pband; + unsigned int i; + int rc = 0; + + LIST_INIT(&dev->free_bands); + LIST_INIT(&dev->shut_bands); + + dev->num_free = 0; + dev->bands = calloc(ftl_get_num_bands(dev), sizeof(*dev->bands)); + if (!dev->bands) { + return -1; + } + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + band = &dev->bands[i]; + band->id = i; + band->dev = dev; + band->state = FTL_BAND_STATE_CLOSED; + + if (LIST_EMPTY(&dev->shut_bands)) { + LIST_INSERT_HEAD(&dev->shut_bands, band, list_entry); + } else { + LIST_INSERT_AFTER(pband, band, list_entry); + } + pband = band; + + CIRCLEQ_INIT(&band->zones); + band->zone_buf = calloc(ftl_get_num_punits(dev), sizeof(*band->zone_buf)); + if (!band->zone_buf) { + SPDK_ERRLOG("Failed to allocate block state table for band: [%u]\n", i); + rc = -1; + break; + } + + rc = ftl_band_init_md(band); + if (rc) { + SPDK_ERRLOG("Failed to initialize metadata structures for band [%u]\n", i); + break; + } + + band->reloc_bitmap = spdk_bit_array_create(ftl_get_num_bands(dev)); + if (!band->reloc_bitmap) { + SPDK_ERRLOG("Failed to allocate band relocation bitmap\n"); + break; + } + } + + return rc; +} + +static void +ftl_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx) +{ + struct spdk_ftl_dev *dev = event_ctx; + + switch (type) { + case SPDK_BDEV_EVENT_REMOVE: + assert(0); + break; + case SPDK_BDEV_EVENT_MEDIA_MANAGEMENT: + assert(bdev == spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); + ftl_get_media_events(dev); + default: + break; + } +} + +static int +ftl_dev_init_nv_cache(struct spdk_ftl_dev *dev, const char *bdev_name) +{ + struct spdk_bdev *bdev; + struct spdk_ftl_conf *conf = &dev->conf; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + char pool_name[128]; + int rc; + + if (!bdev_name) { + return 0; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("Unable to find bdev: %s\n", bdev_name); + return -1; + } + + if (spdk_bdev_open_ext(bdev_name, true, ftl_bdev_event_cb, + dev, &nv_cache->bdev_desc)) { + SPDK_ERRLOG("Unable to open bdev: %s\n", bdev_name); + return -1; + } + + if (spdk_bdev_module_claim_bdev(bdev, nv_cache->bdev_desc, &g_ftl_bdev_module)) { + spdk_bdev_close(nv_cache->bdev_desc); + nv_cache->bdev_desc = NULL; + SPDK_ERRLOG("Unable to claim bdev %s\n", bdev_name); + return -1; + 
} + + SPDK_INFOLOG(SPDK_LOG_FTL_INIT, "Using %s as write buffer cache\n", + spdk_bdev_get_name(bdev)); + + if (spdk_bdev_get_block_size(bdev) != FTL_BLOCK_SIZE) { + SPDK_ERRLOG("Unsupported block size (%d)\n", spdk_bdev_get_block_size(bdev)); + return -1; + } + + if (!spdk_bdev_is_md_separate(bdev)) { + SPDK_ERRLOG("Bdev %s doesn't support separate metadata buffer IO\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + if (spdk_bdev_get_md_size(bdev) < sizeof(uint64_t)) { + SPDK_ERRLOG("Bdev's %s metadata is too small (%"PRIu32")\n", + spdk_bdev_get_name(bdev), spdk_bdev_get_md_size(bdev)); + return -1; + } + + if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { + SPDK_ERRLOG("Unsupported DIF type used by bdev %s\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + /* The cache needs to be capable of storing at least two full bands. This requirement comes + * from the fact that cache works as a protection against power loss, so before the data + * inside the cache can be overwritten, the band it's stored on has to be closed. Plus one + * extra block is needed to store the header. + */ + if (spdk_bdev_get_num_blocks(bdev) < ftl_get_num_blocks_in_band(dev) * 2 + 1) { + SPDK_ERRLOG("Insufficient number of blocks for write buffer cache (available: %" + PRIu64", required: %"PRIu64")\n", spdk_bdev_get_num_blocks(bdev), + ftl_get_num_blocks_in_band(dev) * 2 + 1); + return -1; + } + + rc = snprintf(pool_name, sizeof(pool_name), "ftl-nvpool-%p", dev); + if (rc < 0 || rc >= 128) { + return -1; + } + + nv_cache->md_pool = spdk_mempool_create(pool_name, conf->nv_cache.max_request_cnt, + spdk_bdev_get_md_size(bdev) * + conf->nv_cache.max_request_size, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!nv_cache->md_pool) { + SPDK_ERRLOG("Failed to initialize non-volatile cache metadata pool\n"); + return -1; + } + + nv_cache->dma_buf = spdk_dma_zmalloc(FTL_BLOCK_SIZE, spdk_bdev_get_buf_align(bdev), NULL); + if (!nv_cache->dma_buf) { + SPDK_ERRLOG("Memory allocation failure\n"); + return -1; + } + + if (pthread_spin_init(&nv_cache->lock, PTHREAD_PROCESS_PRIVATE)) { + SPDK_ERRLOG("Failed to initialize cache lock\n"); + return -1; + } + + nv_cache->current_addr = FTL_NV_CACHE_DATA_OFFSET; + nv_cache->num_data_blocks = spdk_bdev_get_num_blocks(bdev) - 1; + nv_cache->num_available = nv_cache->num_data_blocks; + nv_cache->ready = false; + + return 0; +} + +void +spdk_ftl_conf_init_defaults(struct spdk_ftl_conf *conf) +{ + *conf = g_default_conf; +} + +static void +ftl_lba_map_request_ctor(struct spdk_mempool *mp, void *opaque, void *obj, unsigned obj_idx) +{ + struct ftl_lba_map_request *request = obj; + struct spdk_ftl_dev *dev = opaque; + + request->segments = spdk_bit_array_create(spdk_divide_round_up( + ftl_get_num_blocks_in_band(dev), FTL_NUM_LBA_IN_BLOCK)); +} + +static int +ftl_init_media_events_pool(struct spdk_ftl_dev *dev) +{ + char pool_name[128]; + int rc; + + rc = snprintf(pool_name, sizeof(pool_name), "ftl-media-%p", dev); + if (rc < 0 || rc >= (int)sizeof(pool_name)) { + SPDK_ERRLOG("Failed to create media pool name\n"); + return -1; + } + + dev->media_events_pool = spdk_mempool_create(pool_name, 1024, + sizeof(struct ftl_media_event), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!dev->media_events_pool) { + SPDK_ERRLOG("Failed to create media events pool\n"); + return -1; + } + + return 0; +} + +static int +ftl_init_lba_map_pools(struct spdk_ftl_dev *dev) +{ +#define POOL_NAME_LEN 128 + char pool_name[POOL_NAME_LEN]; + int rc; + + rc = 
snprintf(pool_name, sizeof(pool_name), "%s-%s", dev->name, "ftl-lba-pool"); + if (rc < 0 || rc >= POOL_NAME_LEN) { + return -ENAMETOOLONG; + } + + /* We need to reserve at least 2 buffers for band close / open sequence + * alone, plus additional (8) buffers for handling write errors. + * TODO: This memory pool is utilized only by core thread - it introduce + * unnecessary overhead and should be replaced by different data structure. + */ + dev->lba_pool = spdk_mempool_create(pool_name, 2 + 8, + ftl_lba_map_pool_elem_size(dev), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!dev->lba_pool) { + return -ENOMEM; + } + + rc = snprintf(pool_name, sizeof(pool_name), "%s-%s", dev->name, "ftl-lbareq-pool"); + if (rc < 0 || rc >= POOL_NAME_LEN) { + return -ENAMETOOLONG; + } + + dev->lba_request_pool = spdk_mempool_create_ctor(pool_name, + dev->conf.max_reloc_qdepth * dev->conf.max_active_relocs, + sizeof(struct ftl_lba_map_request), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY, + ftl_lba_map_request_ctor, + dev); + if (!dev->lba_request_pool) { + return -ENOMEM; + } + + return 0; +} + +static void +ftl_init_wptr_list(struct spdk_ftl_dev *dev) +{ + LIST_INIT(&dev->wptr_list); + LIST_INIT(&dev->flush_list); + LIST_INIT(&dev->band_flush_list); +} + +static size_t +ftl_dev_band_max_seq(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + size_t seq = 0; + + LIST_FOREACH(band, &dev->shut_bands, list_entry) { + if (band->seq > seq) { + seq = band->seq; + } + } + + return seq; +} + +static void +_ftl_init_bands_state(void *ctx) +{ + struct ftl_band *band, *temp_band; + struct spdk_ftl_dev *dev = ctx; + + dev->seq = ftl_dev_band_max_seq(dev); + + LIST_FOREACH_SAFE(band, &dev->shut_bands, list_entry, temp_band) { + if (!band->lba_map.num_vld) { + ftl_band_set_state(band, FTL_BAND_STATE_FREE); + } + } + + ftl_reloc_resume(dev->reloc); + /* Clear the limit applications as they're incremented incorrectly by */ + /* the initialization code */ + memset(dev->stats.limits, 0, sizeof(dev->stats.limits)); +} + +static int +ftl_init_num_free_bands(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + int cnt = 0; + + LIST_FOREACH(band, &dev->shut_bands, list_entry) { + if (band->num_zones && !band->lba_map.num_vld) { + cnt++; + } + } + return cnt; +} + +static int +ftl_init_bands_state(struct spdk_ftl_dev *dev) +{ + /* TODO: Should we abort initialization or expose read only device */ + /* if there is no free bands? */ + /* If we abort initialization should we depend on condition that */ + /* we have no free bands or should we have some minimal number of */ + /* free bands? 
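For now, initialization simply fails when no usable free band is found.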
*/ + if (!ftl_init_num_free_bands(dev)) { + return -1; + } + + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_init_bands_state, dev); + return 0; +} + +static void +_ftl_dev_init_core_thread(void *ctx) +{ + struct spdk_ftl_dev *dev = ctx; + + dev->core_poller = SPDK_POLLER_REGISTER(ftl_task_core, dev, 0); + if (!dev->core_poller) { + SPDK_ERRLOG("Unable to register core poller\n"); + assert(0); + } + + dev->ioch = spdk_get_io_channel(dev); +} + +static int +ftl_dev_init_core_thread(struct spdk_ftl_dev *dev, const struct spdk_ftl_dev_init_opts *opts) +{ + if (!opts->core_thread) { + return -1; + } + + dev->core_thread = opts->core_thread; + + spdk_thread_send_msg(opts->core_thread, _ftl_dev_init_core_thread, dev); + return 0; +} + +static int +ftl_dev_l2p_alloc_pmem(struct spdk_ftl_dev *dev, size_t l2p_size, const char *l2p_path) +{ +#ifdef SPDK_CONFIG_PMDK + int is_pmem; + + if ((dev->l2p = pmem_map_file(l2p_path, 0, + 0, 0, &dev->l2p_pmem_len, &is_pmem)) == NULL) { + SPDK_ERRLOG("Failed to mmap l2p_path\n"); + return -1; + } + + if (!is_pmem) { + SPDK_NOTICELOG("l2p_path mapped on non-pmem device\n"); + } + + if (dev->l2p_pmem_len < l2p_size) { + SPDK_ERRLOG("l2p_path file is too small\n"); + return -1; + } + + pmem_memset_persist(dev->l2p, FTL_ADDR_INVALID, l2p_size); + + return 0; +#else /* SPDK_CONFIG_PMDK */ + SPDK_ERRLOG("Libpmem not available, cannot use pmem l2p_path\n"); + return -1; +#endif /* SPDK_CONFIG_PMDK */ +} + +static int +ftl_dev_l2p_alloc_dram(struct spdk_ftl_dev *dev, size_t l2p_size) +{ + dev->l2p = malloc(l2p_size); + if (!dev->l2p) { + SPDK_ERRLOG("Failed to allocate l2p table\n"); + return -1; + } + + memset(dev->l2p, FTL_ADDR_INVALID, l2p_size); + + return 0; +} + +static int +ftl_dev_l2p_alloc(struct spdk_ftl_dev *dev) +{ + size_t addr_size = dev->addr_len >= 32 ? 
8 : 4; + size_t l2p_size = dev->num_lbas * addr_size; + const char *l2p_path = dev->conf.l2p_path; + + if (dev->num_lbas == 0) { + SPDK_ERRLOG("Invalid l2p table size\n"); + return -1; + } + + if (dev->l2p) { + SPDK_ERRLOG("L2p table already allocated\n"); + return -1; + } + + dev->l2p_pmem_len = 0; + if (l2p_path) { + return ftl_dev_l2p_alloc_pmem(dev, l2p_size, l2p_path); + } else { + return ftl_dev_l2p_alloc_dram(dev, l2p_size); + } +} + +static void +ftl_dev_free_init_ctx(struct ftl_dev_init_ctx *init_ctx) +{ + if (!init_ctx) { + return; + } + + if (init_ctx->ioch) { + spdk_put_io_channel(init_ctx->ioch); + } + + free(init_ctx); +} + +static void +ftl_call_init_complete_cb(void *ctx) +{ + struct ftl_dev_init_ctx *init_ctx = ctx; + struct spdk_ftl_dev *dev = init_ctx->dev; + + if (init_ctx->cb_fn != NULL) { + init_ctx->cb_fn(dev, init_ctx->cb_arg, 0); + } + + ftl_dev_free_init_ctx(init_ctx); +} + +static void +ftl_init_complete(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + pthread_mutex_lock(&g_ftl_queue_lock); + STAILQ_INSERT_HEAD(&g_ftl_queue, dev, stailq); + pthread_mutex_unlock(&g_ftl_queue_lock); + + dev->initialized = 1; + + spdk_thread_send_msg(init_ctx->thread, ftl_call_init_complete_cb, init_ctx); +} + +static void +ftl_init_fail_cb(struct spdk_ftl_dev *dev, void *ctx, int status) +{ + struct ftl_dev_init_ctx *init_ctx = ctx; + + if (init_ctx->cb_fn != NULL) { + init_ctx->cb_fn(NULL, init_ctx->cb_arg, -ENODEV); + } + + ftl_dev_free_init_ctx(init_ctx); +} + +static int ftl_dev_free(struct spdk_ftl_dev *dev, spdk_ftl_init_fn cb_fn, void *cb_arg, + struct spdk_thread *thread); + +static void +ftl_init_fail(struct ftl_dev_init_ctx *init_ctx) +{ + if (ftl_dev_free(init_ctx->dev, ftl_init_fail_cb, init_ctx, init_ctx->thread)) { + SPDK_ERRLOG("Unable to free the device\n"); + assert(0); + } +} + +static void +ftl_write_nv_cache_md_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Writing non-volatile cache's metadata header failed\n"); + ftl_init_fail(init_ctx); + return; + } + + dev->nv_cache.ready = true; + ftl_init_complete(init_ctx); +} + +static void +ftl_clear_nv_cache_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to clear the non-volatile cache bdev\n"); + ftl_init_fail(init_ctx); + return; + } + + nv_cache->phase = 1; + if (ftl_nv_cache_write_header(nv_cache, false, ftl_write_nv_cache_md_cb, init_ctx)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n"); + ftl_init_fail(init_ctx); + } +} + +static void +_ftl_nv_cache_scrub(void *ctx) +{ + struct ftl_dev_init_ctx *init_ctx = ctx; + struct spdk_ftl_dev *dev = init_ctx->dev; + int rc; + + rc = ftl_nv_cache_scrub(&dev->nv_cache, ftl_clear_nv_cache_cb, init_ctx); + + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to clear the non-volatile cache bdev: %s\n", + spdk_strerror(-rc)); + ftl_init_fail(init_ctx); + } +} + +static int +ftl_setup_initial_state(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + struct spdk_ftl_conf *conf = &dev->conf; + size_t i; + + spdk_uuid_generate(&dev->uuid); + + dev->num_lbas = 0; + 
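+	/* Sum the usable blocks of every band, then expose only (100 - lba_rsvd)
+	 * percent of them as user LBAs; the remainder stays reserved as spare blocks.
+	 */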
for (i = 0; i < ftl_get_num_bands(dev); ++i) { + dev->num_lbas += ftl_band_num_usable_blocks(&dev->bands[i]); + } + + dev->num_lbas = (dev->num_lbas * (100 - conf->lba_rsvd)) / 100; + + if (ftl_dev_l2p_alloc(dev)) { + SPDK_ERRLOG("Unable to init l2p table\n"); + return -1; + } + + if (ftl_init_bands_state(dev)) { + SPDK_ERRLOG("Unable to finish the initialization\n"); + return -1; + } + + if (!ftl_dev_has_nv_cache(dev)) { + ftl_init_complete(init_ctx); + } else { + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_nv_cache_scrub, init_ctx); + } + + return 0; +} + +static void +ftl_restore_nv_cache_cb(struct ftl_restore *restore, int status, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + + if (spdk_unlikely(status != 0)) { + SPDK_ERRLOG("Failed to restore the non-volatile cache state\n"); + ftl_init_fail(init_ctx); + return; + } + + ftl_init_complete(init_ctx); +} + +static void +ftl_restore_device_cb(struct ftl_restore *restore, int status, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + + if (status) { + SPDK_ERRLOG("Failed to restore the device from the SSD\n"); + ftl_init_fail(init_ctx); + return; + } + + if (ftl_init_bands_state(dev)) { + SPDK_ERRLOG("Unable to finish the initialization\n"); + ftl_init_fail(init_ctx); + return; + } + + if (!ftl_dev_has_nv_cache(dev)) { + ftl_init_complete(init_ctx); + return; + } + + ftl_restore_nv_cache(restore, ftl_restore_nv_cache_cb, init_ctx); +} + +static void +ftl_restore_md_cb(struct ftl_restore *restore, int status, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + + if (status) { + SPDK_ERRLOG("Failed to restore the metadata from the SSD\n"); + goto error; + } + + /* After the metadata is read it should be possible to allocate the L2P */ + if (ftl_dev_l2p_alloc(init_ctx->dev)) { + SPDK_ERRLOG("Failed to allocate the L2P\n"); + goto error; + } + + if (ftl_restore_device(restore, ftl_restore_device_cb, init_ctx)) { + SPDK_ERRLOG("Failed to start device restoration from the SSD\n"); + goto error; + } + + return; +error: + ftl_init_fail(init_ctx); +} + +static int +ftl_restore_state(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + dev->uuid = init_ctx->opts.uuid; + + if (ftl_restore_md(dev, ftl_restore_md_cb, init_ctx)) { + SPDK_ERRLOG("Failed to start metadata restoration from the SSD\n"); + return -1; + } + + return 0; +} + +static void +ftl_dev_update_bands(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band, *temp_band; + size_t i; + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + band = &dev->bands[i]; + band->tail_md_addr = ftl_band_tail_md_addr(band); + } + + /* Remove band from shut_bands list to prevent further processing */ + /* if all blocks on this band are bad */ + LIST_FOREACH_SAFE(band, &dev->shut_bands, list_entry, temp_band) { + if (!band->num_zones) { + dev->num_bands--; + LIST_REMOVE(band, list_entry); + } + } +} + +static void +ftl_dev_init_state(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + ftl_dev_update_bands(dev); + + if (ftl_dev_init_core_thread(dev, &init_ctx->opts)) { + SPDK_ERRLOG("Unable to initialize device thread\n"); + ftl_init_fail(init_ctx); + return; + } + + if (init_ctx->opts.mode & SPDK_FTL_MODE_CREATE) { + if (ftl_setup_initial_state(init_ctx)) { + SPDK_ERRLOG("Failed to setup initial state of the device\n"); + ftl_init_fail(init_ctx); + return; + } + } else { + if (ftl_restore_state(init_ctx)) { + SPDK_ERRLOG("Unable to restore 
device's state from the SSD\n"); + ftl_init_fail(init_ctx); + return; + } + } +} + +static void ftl_dev_get_zone_info(struct ftl_dev_init_ctx *init_ctx); + +static void +ftl_dev_get_zone_info_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + struct ftl_band *band; + struct ftl_zone *zone; + struct ftl_addr addr; + size_t i, zones_left, num_zones; + + spdk_bdev_free_io(bdev_io); + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to read zone info for zone id: %"PRIu64"\n", init_ctx->zone_id); + ftl_init_fail(init_ctx); + return; + } + + zones_left = ftl_get_num_zones(dev) - (init_ctx->zone_id / ftl_get_num_blocks_in_zone(dev)); + num_zones = spdk_min(zones_left, FTL_ZONE_INFO_COUNT); + + for (i = 0; i < num_zones; ++i) { + addr.offset = init_ctx->info[i].zone_id; + band = &dev->bands[ftl_addr_get_band(dev, addr)]; + zone = &band->zone_buf[ftl_addr_get_punit(dev, addr)]; + zone->info = init_ctx->info[i]; + + /* TODO: add support for zone capacity less than zone size */ + if (zone->info.capacity != ftl_get_num_blocks_in_zone(dev)) { + zone->info.state = SPDK_BDEV_ZONE_STATE_OFFLINE; + SPDK_ERRLOG("Zone capacity is not equal zone size for " + "zone id: %"PRIu64"\n", init_ctx->zone_id); + } + + /* Set write pointer to the last block plus one for zone in full state */ + if (zone->info.state == SPDK_BDEV_ZONE_STATE_FULL) { + zone->info.write_pointer = zone->info.zone_id + zone->info.capacity; + } + + if (zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE) { + band->num_zones++; + CIRCLEQ_INSERT_TAIL(&band->zones, zone, circleq); + } + } + + init_ctx->zone_id = init_ctx->zone_id + num_zones * ftl_get_num_blocks_in_zone(dev); + + ftl_dev_get_zone_info(init_ctx); +} + +static void +ftl_dev_get_zone_info(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + size_t zones_left, num_zones; + int rc; + + zones_left = ftl_get_num_zones(dev) - (init_ctx->zone_id / ftl_get_num_blocks_in_zone(dev)); + if (zones_left == 0) { + ftl_dev_init_state(init_ctx); + return; + } + + num_zones = spdk_min(zones_left, FTL_ZONE_INFO_COUNT); + + rc = spdk_bdev_get_zone_info(dev->base_bdev_desc, init_ctx->ioch, + init_ctx->zone_id, num_zones, init_ctx->info, + ftl_dev_get_zone_info_cb, init_ctx); + + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to read zone info for zone id: %"PRIu64"\n", init_ctx->zone_id); + ftl_init_fail(init_ctx); + } +} + +static int +ftl_dev_init_zones(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + init_ctx->zone_id = 0; + init_ctx->ioch = spdk_bdev_get_io_channel(dev->base_bdev_desc); + if (!init_ctx->ioch) { + SPDK_ERRLOG("Failed to get base bdev IO channel\n"); + return -1; + } + + ftl_dev_get_zone_info(init_ctx); + + return 0; +} + +struct _ftl_io_channel { + struct ftl_io_channel *ioch; +}; + +struct ftl_io_channel * +ftl_io_channel_get_ctx(struct spdk_io_channel *ioch) +{ + struct _ftl_io_channel *_ioch = spdk_io_channel_get_ctx(ioch); + + return _ioch->ioch; +} + +static void +ftl_io_channel_register(void *ctx) +{ + struct ftl_io_channel *ioch = ctx; + struct spdk_ftl_dev *dev = ioch->dev; + uint32_t ioch_index; + + for (ioch_index = 0; ioch_index < dev->conf.max_io_channels; ++ioch_index) { + if (dev->ioch_array[ioch_index] == NULL) { + dev->ioch_array[ioch_index] = ioch; + ioch->index = ioch_index; + break; + } + } + + assert(ioch_index < dev->conf.max_io_channels); + TAILQ_INSERT_TAIL(&dev->ioch_queue, ioch, 
tailq); +} + +static int +ftl_io_channel_init_wbuf(struct ftl_io_channel *ioch) +{ + struct spdk_ftl_dev *dev = ioch->dev; + struct ftl_wbuf_entry *entry; + uint32_t i; + int rc; + + ioch->num_entries = dev->conf.write_buffer_size / FTL_BLOCK_SIZE; + ioch->wbuf_entries = calloc(ioch->num_entries, sizeof(*ioch->wbuf_entries)); + if (ioch->wbuf_entries == NULL) { + SPDK_ERRLOG("Failed to allocate write buffer entry array\n"); + return -1; + } + + ioch->qdepth_limit = ioch->num_entries; + ioch->wbuf_payload = spdk_zmalloc(dev->conf.write_buffer_size, FTL_BLOCK_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (ioch->wbuf_payload == NULL) { + SPDK_ERRLOG("Failed to allocate write buffer payload\n"); + goto error_entries; + } + + ioch->free_queue = spdk_ring_create(SPDK_RING_TYPE_SP_SC, + spdk_align32pow2(ioch->num_entries + 1), + SPDK_ENV_SOCKET_ID_ANY); + if (ioch->free_queue == NULL) { + SPDK_ERRLOG("Failed to allocate free queue\n"); + goto error_payload; + } + + ioch->submit_queue = spdk_ring_create(SPDK_RING_TYPE_SP_SC, + spdk_align32pow2(ioch->num_entries + 1), + SPDK_ENV_SOCKET_ID_ANY); + if (ioch->submit_queue == NULL) { + SPDK_ERRLOG("Failed to allocate submit queue\n"); + goto error_free_queue; + } + + for (i = 0; i < ioch->num_entries; ++i) { + entry = &ioch->wbuf_entries[i]; + entry->payload = (char *)ioch->wbuf_payload + i * FTL_BLOCK_SIZE; + entry->ioch = ioch; + entry->index = i; + entry->addr.offset = FTL_ADDR_INVALID; + + rc = pthread_spin_init(&entry->lock, PTHREAD_PROCESS_PRIVATE); + if (rc != 0) { + SPDK_ERRLOG("Failed to initialize spinlock\n"); + goto error_spinlock; + } + + spdk_ring_enqueue(ioch->free_queue, (void **)&entry, 1, NULL); + } + + return 0; +error_spinlock: + for (; i > 0; --i) { + pthread_spin_destroy(&ioch->wbuf_entries[i - 1].lock); + } + + spdk_ring_free(ioch->submit_queue); +error_free_queue: + spdk_ring_free(ioch->free_queue); +error_payload: + spdk_free(ioch->wbuf_payload); +error_entries: + free(ioch->wbuf_entries); + + return -1; +} + +static int +ftl_io_channel_create_cb(void *io_device, void *ctx) +{ + struct spdk_ftl_dev *dev = io_device; + struct _ftl_io_channel *_ioch = ctx; + struct ftl_io_channel *ioch; + uint32_t num_io_channels; + char mempool_name[32]; + int rc; + + num_io_channels = __atomic_fetch_add(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST); + if (num_io_channels >= dev->conf.max_io_channels) { + SPDK_ERRLOG("Reached maximum number of IO channels\n"); + __atomic_fetch_sub(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST); + return -1; + } + + ioch = calloc(1, sizeof(*ioch)); + if (ioch == NULL) { + SPDK_ERRLOG("Failed to allocate IO channel\n"); + return -1; + } + + rc = snprintf(mempool_name, sizeof(mempool_name), "ftl_io_%p", ioch); + if (rc < 0 || rc >= (int)sizeof(mempool_name)) { + SPDK_ERRLOG("Failed to create IO channel pool name\n"); + free(ioch); + return -1; + } + + ioch->cache_ioch = NULL; + ioch->index = FTL_IO_CHANNEL_INDEX_INVALID; + ioch->dev = dev; + ioch->elem_size = sizeof(struct ftl_md_io); + ioch->io_pool = spdk_mempool_create(mempool_name, + dev->conf.user_io_pool_size, + ioch->elem_size, + 0, + SPDK_ENV_SOCKET_ID_ANY); + if (!ioch->io_pool) { + SPDK_ERRLOG("Failed to create IO channel's IO pool\n"); + free(ioch); + return -1; + } + + ioch->base_ioch = spdk_bdev_get_io_channel(dev->base_bdev_desc); + if (!ioch->base_ioch) { + SPDK_ERRLOG("Failed to create base bdev IO channel\n"); + goto fail_ioch; + } + + if (ftl_dev_has_nv_cache(dev)) { + ioch->cache_ioch = spdk_bdev_get_io_channel(dev->nv_cache.bdev_desc); 
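+		/* The non-volatile cache sits on its own bdev, so the channel needs a
+		 * second bdev IO channel for it in addition to the base bdev channel
+		 * obtained above.
+		 */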
+ if (!ioch->cache_ioch) { + SPDK_ERRLOG("Failed to create cache IO channel\n"); + goto fail_cache; + } + } + + TAILQ_INIT(&ioch->write_cmpl_queue); + TAILQ_INIT(&ioch->retry_queue); + ioch->poller = SPDK_POLLER_REGISTER(ftl_io_channel_poll, ioch, 0); + if (!ioch->poller) { + SPDK_ERRLOG("Failed to register IO channel poller\n"); + goto fail_poller; + } + + if (ftl_io_channel_init_wbuf(ioch)) { + SPDK_ERRLOG("Failed to initialize IO channel's write buffer\n"); + goto fail_wbuf; + } + + _ioch->ioch = ioch; + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_io_channel_register, ioch); + + return 0; +fail_wbuf: + spdk_poller_unregister(&ioch->poller); +fail_poller: + if (ioch->cache_ioch) { + spdk_put_io_channel(ioch->cache_ioch); + } +fail_cache: + spdk_put_io_channel(ioch->base_ioch); +fail_ioch: + spdk_mempool_free(ioch->io_pool); + free(ioch); + + return -1; +} + +static void +ftl_io_channel_unregister(void *ctx) +{ + struct ftl_io_channel *ioch = ctx; + struct spdk_ftl_dev *dev = ioch->dev; + uint32_t i, num_io_channels __attribute__((unused)); + + assert(ioch->index < dev->conf.max_io_channels); + assert(dev->ioch_array[ioch->index] == ioch); + + dev->ioch_array[ioch->index] = NULL; + TAILQ_REMOVE(&dev->ioch_queue, ioch, tailq); + + num_io_channels = __atomic_fetch_sub(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST); + assert(num_io_channels > 0); + + for (i = 0; i < ioch->num_entries; ++i) { + pthread_spin_destroy(&ioch->wbuf_entries[i].lock); + } + + spdk_mempool_free(ioch->io_pool); + spdk_ring_free(ioch->free_queue); + spdk_ring_free(ioch->submit_queue); + spdk_free(ioch->wbuf_payload); + free(ioch->wbuf_entries); + free(ioch); +} + +static void +_ftl_io_channel_destroy_cb(void *ctx) +{ + struct ftl_io_channel *ioch = ctx; + struct spdk_ftl_dev *dev = ioch->dev; + uint32_t i; + + /* Do not destroy the channel if some of its entries are still in use */ + if (spdk_ring_count(ioch->free_queue) != ioch->num_entries) { + spdk_thread_send_msg(spdk_get_thread(), _ftl_io_channel_destroy_cb, ctx); + return; + } + + /* Evict all valid entries from cache */ + for (i = 0; i < ioch->num_entries; ++i) { + ftl_evict_cache_entry(dev, &ioch->wbuf_entries[i]); + } + + spdk_poller_unregister(&ioch->poller); + + spdk_put_io_channel(ioch->base_ioch); + if (ioch->cache_ioch) { + spdk_put_io_channel(ioch->cache_ioch); + } + + ioch->base_ioch = NULL; + ioch->cache_ioch = NULL; + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_io_channel_unregister, ioch); +} + +static void +ftl_io_channel_destroy_cb(void *io_device, void *ctx) +{ + struct _ftl_io_channel *_ioch = ctx; + struct ftl_io_channel *ioch = _ioch->ioch; + + /* Mark the IO channel as being flush to force out any unwritten entries */ + ioch->flush = true; + + _ftl_io_channel_destroy_cb(ioch); +} + +static int +ftl_dev_init_io_channel(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch; + uint32_t i; + + /* Align the IO channels to nearest power of 2 to allow for easy addr bit shift */ + dev->conf.max_io_channels = spdk_align32pow2(dev->conf.max_io_channels); + dev->ioch_shift = spdk_u32log2(dev->conf.max_io_channels); + + dev->ioch_array = calloc(dev->conf.max_io_channels, sizeof(*dev->ioch_array)); + if (!dev->ioch_array) { + SPDK_ERRLOG("Failed to allocate IO channel array\n"); + return -1; + } + + if (dev->md_size > 0) { + dev->md_buf = spdk_zmalloc(dev->md_size * dev->xfer_size * FTL_BATCH_COUNT, + dev->md_size, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); + if (dev->md_buf == NULL) { + SPDK_ERRLOG("Failed to allocate metadata 
buffer\n"); + return -1; + } + } + + dev->iov_buf = calloc(FTL_BATCH_COUNT, dev->xfer_size * sizeof(struct iovec)); + if (!dev->iov_buf) { + SPDK_ERRLOG("Failed to allocate iovec buffer\n"); + return -1; + } + + TAILQ_INIT(&dev->free_batches); + TAILQ_INIT(&dev->pending_batches); + TAILQ_INIT(&dev->ioch_queue); + + for (i = 0; i < FTL_BATCH_COUNT; ++i) { + batch = &dev->batch_array[i]; + batch->iov = &dev->iov_buf[i * dev->xfer_size]; + batch->num_entries = 0; + batch->index = i; + TAILQ_INIT(&batch->entries); + if (dev->md_buf != NULL) { + batch->metadata = (char *)dev->md_buf + i * dev->xfer_size * dev->md_size; + } + + TAILQ_INSERT_TAIL(&dev->free_batches, batch, tailq); + } + + dev->num_io_channels = 0; + + spdk_io_device_register(dev, ftl_io_channel_create_cb, ftl_io_channel_destroy_cb, + sizeof(struct _ftl_io_channel), + NULL); + + return 0; +} + +static int +ftl_dev_init_base_bdev(struct spdk_ftl_dev *dev, const char *bdev_name) +{ + uint32_t block_size; + uint64_t num_blocks; + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("Unable to find bdev: %s\n", bdev_name); + return -1; + } + + if (!spdk_bdev_is_zoned(bdev)) { + SPDK_ERRLOG("Bdev dosen't support zone capabilities: %s\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + if (spdk_bdev_open_ext(bdev_name, true, ftl_bdev_event_cb, + dev, &dev->base_bdev_desc)) { + SPDK_ERRLOG("Unable to open bdev: %s\n", bdev_name); + return -1; + } + + if (spdk_bdev_module_claim_bdev(bdev, dev->base_bdev_desc, &g_ftl_bdev_module)) { + spdk_bdev_close(dev->base_bdev_desc); + dev->base_bdev_desc = NULL; + SPDK_ERRLOG("Unable to claim bdev %s\n", bdev_name); + return -1; + } + + dev->xfer_size = spdk_bdev_get_write_unit_size(bdev); + dev->md_size = spdk_bdev_get_md_size(bdev); + + block_size = spdk_bdev_get_block_size(bdev); + if (block_size != FTL_BLOCK_SIZE) { + SPDK_ERRLOG("Unsupported block size (%"PRIu32")\n", block_size); + return -1; + } + + num_blocks = spdk_bdev_get_num_blocks(bdev); + if (num_blocks % ftl_get_num_punits(dev)) { + SPDK_ERRLOG("Unsupported geometry. 
Base bdev block count must be multiple " + "of optimal number of zones.\n"); + return -1; + } + + if (ftl_is_append_supported(dev) && + !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZONE_APPEND)) { + SPDK_ERRLOG("Bdev dosen't support append: %s\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + dev->num_bands = num_blocks / (ftl_get_num_punits(dev) * ftl_get_num_blocks_in_zone(dev)); + dev->addr_len = spdk_u64log2(num_blocks) + 1; + + return 0; +} + +static void +ftl_lba_map_request_dtor(struct spdk_mempool *mp, void *opaque, void *obj, unsigned obj_idx) +{ + struct ftl_lba_map_request *request = obj; + + spdk_bit_array_free(&request->segments); +} + +static void +ftl_release_bdev(struct spdk_bdev_desc *bdev_desc) +{ + if (!bdev_desc) { + return; + } + + spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_desc)); + spdk_bdev_close(bdev_desc); +} + +static void +ftl_dev_free_sync(struct spdk_ftl_dev *dev) +{ + struct spdk_ftl_dev *iter; + size_t i; + + if (!dev) { + return; + } + + pthread_mutex_lock(&g_ftl_queue_lock); + STAILQ_FOREACH(iter, &g_ftl_queue, stailq) { + if (iter == dev) { + STAILQ_REMOVE(&g_ftl_queue, dev, spdk_ftl_dev, stailq); + break; + } + } + pthread_mutex_unlock(&g_ftl_queue_lock); + + assert(LIST_EMPTY(&dev->wptr_list)); + assert(dev->current_batch == NULL); + + ftl_dev_dump_bands(dev); + ftl_dev_dump_stats(dev); + + if (dev->bands) { + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + free(dev->bands[i].zone_buf); + spdk_bit_array_free(&dev->bands[i].lba_map.vld); + spdk_bit_array_free(&dev->bands[i].reloc_bitmap); + } + } + + spdk_dma_free(dev->nv_cache.dma_buf); + + spdk_mempool_free(dev->lba_pool); + spdk_mempool_free(dev->nv_cache.md_pool); + spdk_mempool_free(dev->media_events_pool); + if (dev->lba_request_pool) { + spdk_mempool_obj_iter(dev->lba_request_pool, ftl_lba_map_request_dtor, NULL); + } + spdk_mempool_free(dev->lba_request_pool); + + ftl_reloc_free(dev->reloc); + + ftl_release_bdev(dev->nv_cache.bdev_desc); + ftl_release_bdev(dev->base_bdev_desc); + + spdk_free(dev->md_buf); + + assert(dev->num_io_channels == 0); + free(dev->ioch_array); + free(dev->iov_buf); + free(dev->name); + free(dev->bands); + if (dev->l2p_pmem_len != 0) { +#ifdef SPDK_CONFIG_PMDK + pmem_unmap(dev->l2p, dev->l2p_pmem_len); +#endif /* SPDK_CONFIG_PMDK */ + } else { + free(dev->l2p); + } + free((char *)dev->conf.l2p_path); + free(dev); +} + +int +spdk_ftl_dev_init(const struct spdk_ftl_dev_init_opts *_opts, spdk_ftl_init_fn cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev; + struct spdk_ftl_dev_init_opts opts = *_opts; + struct ftl_dev_init_ctx *init_ctx = NULL; + int rc = -ENOMEM; + + dev = calloc(1, sizeof(*dev)); + if (!dev) { + return -ENOMEM; + } + + init_ctx = calloc(1, sizeof(*init_ctx)); + if (!init_ctx) { + goto fail_sync; + } + + init_ctx->dev = dev; + init_ctx->opts = *_opts; + init_ctx->cb_fn = cb_fn; + init_ctx->cb_arg = cb_arg; + init_ctx->thread = spdk_get_thread(); + + if (!opts.conf) { + opts.conf = &g_default_conf; + } + + if (!opts.base_bdev) { + SPDK_ERRLOG("Lack of underlying device in configuration\n"); + rc = -EINVAL; + goto fail_sync; + } + + dev->conf = *opts.conf; + dev->limit = SPDK_FTL_LIMIT_MAX; + + dev->name = strdup(opts.name); + if (!dev->name) { + SPDK_ERRLOG("Unable to set device name\n"); + goto fail_sync; + } + + if (ftl_dev_init_base_bdev(dev, opts.base_bdev)) { + SPDK_ERRLOG("Unsupported underlying device\n"); + goto fail_sync; + } + + if (opts.conf->l2p_path) { + dev->conf.l2p_path = strdup(opts.conf->l2p_path); + if 
(!dev->conf.l2p_path) { + rc = -ENOMEM; + goto fail_sync; + } + } + + /* In case of errors, we free all of the memory in ftl_dev_free_sync(), */ + /* so we don't have to clean up in each of the init functions. */ + if (ftl_check_conf(dev, opts.conf)) { + SPDK_ERRLOG("Invalid device configuration\n"); + goto fail_sync; + } + + if (ftl_init_lba_map_pools(dev)) { + SPDK_ERRLOG("Unable to init LBA map pools\n"); + goto fail_sync; + } + + if (ftl_init_media_events_pool(dev)) { + SPDK_ERRLOG("Unable to init media events pools\n"); + goto fail_sync; + } + + ftl_init_wptr_list(dev); + + if (ftl_dev_init_bands(dev)) { + SPDK_ERRLOG("Unable to initialize band array\n"); + goto fail_sync; + } + + if (ftl_dev_init_nv_cache(dev, opts.cache_bdev)) { + SPDK_ERRLOG("Unable to initialize persistent cache\n"); + goto fail_sync; + } + + dev->reloc = ftl_reloc_init(dev); + if (!dev->reloc) { + SPDK_ERRLOG("Unable to initialize reloc structures\n"); + goto fail_sync; + } + + if (ftl_dev_init_io_channel(dev)) { + SPDK_ERRLOG("Unable to initialize IO channels\n"); + goto fail_sync; + } + + if (ftl_dev_init_zones(init_ctx)) { + SPDK_ERRLOG("Failed to initialize zones\n"); + goto fail_async; + } + + return 0; +fail_sync: + ftl_dev_free_sync(dev); + ftl_dev_free_init_ctx(init_ctx); + return rc; +fail_async: + ftl_init_fail(init_ctx); + return 0; +} + +static void +_ftl_halt_defrag(void *arg) +{ + ftl_reloc_halt(((struct spdk_ftl_dev *)arg)->reloc); +} + +static void +ftl_halt_complete_cb(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + /* Make sure core IO channel has already been released */ + if (dev->num_io_channels > 0) { + spdk_thread_send_msg(spdk_get_thread(), ftl_halt_complete_cb, ctx); + return; + } + + spdk_io_device_unregister(fini_ctx->dev, NULL); + + ftl_dev_free_sync(fini_ctx->dev); + if (fini_ctx->cb_fn != NULL) { + fini_ctx->cb_fn(NULL, fini_ctx->cb_arg, fini_ctx->halt_complete_status); + } + + ftl_dev_free_init_ctx(fini_ctx); +} + +static void +ftl_put_io_channel_cb(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + spdk_put_io_channel(dev->ioch); + spdk_thread_send_msg(spdk_get_thread(), ftl_halt_complete_cb, ctx); +} + +static void +ftl_nv_cache_header_fini_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *fini_ctx = cb_arg; + int rc = 0; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Failed to write non-volatile cache metadata header\n"); + rc = -EIO; + } + + fini_ctx->halt_complete_status = rc; + spdk_thread_send_msg(fini_ctx->thread, ftl_put_io_channel_cb, fini_ctx); +} + +static int +ftl_halt_poller(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + if (!dev->core_poller) { + spdk_poller_unregister(&fini_ctx->poller); + + if (ftl_dev_has_nv_cache(dev)) { + ftl_nv_cache_write_header(&dev->nv_cache, true, + ftl_nv_cache_header_fini_cb, fini_ctx); + } else { + fini_ctx->halt_complete_status = 0; + spdk_thread_send_msg(fini_ctx->thread, ftl_put_io_channel_cb, fini_ctx); + } + } + + return SPDK_POLLER_BUSY; +} + +static void +ftl_add_halt_poller(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + dev->halt = 1; + + _ftl_halt_defrag(dev); + + assert(!fini_ctx->poller); + fini_ctx->poller = SPDK_POLLER_REGISTER(ftl_halt_poller, fini_ctx, 100); +} + +static int +ftl_dev_free(struct spdk_ftl_dev *dev, 
spdk_ftl_init_fn cb_fn, void *cb_arg, + struct spdk_thread *thread) +{ + struct ftl_dev_init_ctx *fini_ctx; + + if (dev->halt_started) { + dev->halt_started = true; + return -EBUSY; + } + + fini_ctx = calloc(1, sizeof(*fini_ctx)); + if (!fini_ctx) { + return -ENOMEM; + } + + fini_ctx->dev = dev; + fini_ctx->cb_fn = cb_fn; + fini_ctx->cb_arg = cb_arg; + fini_ctx->thread = thread; + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_add_halt_poller, fini_ctx); + return 0; +} + +int +spdk_ftl_dev_free(struct spdk_ftl_dev *dev, spdk_ftl_init_fn cb_fn, void *cb_arg) +{ + return ftl_dev_free(dev, cb_fn, cb_arg, spdk_get_thread()); +} + +SPDK_LOG_REGISTER_COMPONENT("ftl_init", SPDK_LOG_FTL_INIT) diff --git a/src/spdk/lib/ftl/ftl_io.c b/src/spdk/lib/ftl/ftl_io.c new file mode 100644 index 000000000..39a845bae --- /dev/null +++ b/src/spdk/lib/ftl/ftl_io.c @@ -0,0 +1,563 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/ftl.h" +#include "spdk/likely.h" +#include "spdk/util.h" + +#include "ftl_io.h" +#include "ftl_core.h" +#include "ftl_band.h" +#include "ftl_debug.h" + +void +ftl_io_inc_req(struct ftl_io *io) +{ + struct ftl_band *band = io->band; + + if (!(io->flags & FTL_IO_CACHE) && io->type != FTL_IO_READ && io->type != FTL_IO_ERASE) { + ftl_band_acquire_lba_map(band); + } + + __atomic_fetch_add(&io->dev->num_inflight, 1, __ATOMIC_SEQ_CST); + + ++io->req_cnt; +} + +void +ftl_io_dec_req(struct ftl_io *io) +{ + struct ftl_band *band = io->band; + unsigned long num_inflight __attribute__((unused)); + + if (!(io->flags & FTL_IO_CACHE) && io->type != FTL_IO_READ && io->type != FTL_IO_ERASE) { + ftl_band_release_lba_map(band); + } + + num_inflight = __atomic_fetch_sub(&io->dev->num_inflight, 1, __ATOMIC_SEQ_CST); + + assert(num_inflight > 0); + assert(io->req_cnt > 0); + + --io->req_cnt; +} + +struct iovec * +ftl_io_iovec(struct ftl_io *io) +{ + return &io->iov[0]; +} + +uint64_t +ftl_io_get_lba(const struct ftl_io *io, size_t offset) +{ + assert(offset < io->num_blocks); + + if (io->flags & FTL_IO_VECTOR_LBA) { + return io->lba.vector[offset]; + } else { + return io->lba.single + offset; + } +} + +uint64_t +ftl_io_current_lba(const struct ftl_io *io) +{ + return ftl_io_get_lba(io, io->pos); +} + +void +ftl_io_advance(struct ftl_io *io, size_t num_blocks) +{ + struct iovec *iov = ftl_io_iovec(io); + size_t iov_blocks, block_left = num_blocks; + + io->pos += num_blocks; + + if (io->iov_cnt != 0) { + while (block_left > 0) { + assert(io->iov_pos < io->iov_cnt); + iov_blocks = iov[io->iov_pos].iov_len / FTL_BLOCK_SIZE; + + if (io->iov_off + block_left < iov_blocks) { + io->iov_off += block_left; + break; + } + + assert(iov_blocks > io->iov_off); + block_left -= (iov_blocks - io->iov_off); + io->iov_off = 0; + io->iov_pos++; + } + } + + if (io->parent) { + ftl_io_advance(io->parent, num_blocks); + } +} + +size_t +ftl_iovec_num_blocks(struct iovec *iov, size_t iov_cnt) +{ + size_t num_blocks = 0, i = 0; + + for (; i < iov_cnt; ++i) { + num_blocks += iov[i].iov_len / FTL_BLOCK_SIZE; + } + + return num_blocks; +} + +void * +ftl_io_iovec_addr(struct ftl_io *io) +{ + assert(io->iov_pos < io->iov_cnt); + assert(io->iov_off * FTL_BLOCK_SIZE < ftl_io_iovec(io)[io->iov_pos].iov_len); + + return (char *)ftl_io_iovec(io)[io->iov_pos].iov_base + + io->iov_off * FTL_BLOCK_SIZE; +} + +size_t +ftl_io_iovec_len_left(struct ftl_io *io) +{ + struct iovec *iov = ftl_io_iovec(io); + return iov[io->iov_pos].iov_len / FTL_BLOCK_SIZE - io->iov_off; +} + +static void +ftl_io_init_iovec(struct ftl_io *io, const struct iovec *iov, size_t iov_cnt, size_t iov_off, + size_t num_blocks) +{ + size_t offset = 0, num_left; + + io->iov_pos = 0; + io->iov_cnt = 0; + io->num_blocks = num_blocks; + + while (offset < num_blocks) { + assert(io->iov_cnt < FTL_IO_MAX_IOVEC && io->iov_cnt < iov_cnt); + + num_left = spdk_min(iov[io->iov_cnt].iov_len / FTL_BLOCK_SIZE - iov_off, + num_blocks); + io->iov[io->iov_cnt].iov_base = (char *)iov[io->iov_cnt].iov_base + + iov_off * FTL_BLOCK_SIZE; + io->iov[io->iov_cnt].iov_len = num_left * FTL_BLOCK_SIZE; + + offset += num_left; + io->iov_cnt++; + iov_off = 0; + } +} + +void +ftl_io_shrink_iovec(struct ftl_io *io, size_t num_blocks) +{ + size_t iov_off = 0, block_off = 0; + + assert(io->num_blocks >= num_blocks); + assert(io->pos == 0 && io->iov_pos == 0 && io->iov_off == 0); + + for (; iov_off < io->iov_cnt; ++iov_off) { + size_t num_iov = 
io->iov[iov_off].iov_len / FTL_BLOCK_SIZE; + size_t num_left = num_blocks - block_off; + + if (num_iov >= num_left) { + io->iov[iov_off].iov_len = num_left * FTL_BLOCK_SIZE; + io->iov_cnt = iov_off + 1; + io->num_blocks = num_blocks; + break; + } + + block_off += num_iov; + } +} + +static void +ftl_io_init(struct ftl_io *io, struct spdk_ftl_dev *dev, + ftl_io_fn fn, void *ctx, int flags, int type) +{ + io->flags |= flags | FTL_IO_INITIALIZED; + io->type = type; + io->dev = dev; + io->lba.single = FTL_LBA_INVALID; + io->addr.offset = FTL_ADDR_INVALID; + io->cb_fn = fn; + io->cb_ctx = ctx; + io->trace = ftl_trace_alloc_id(dev); +} + +struct ftl_io * +ftl_io_init_internal(const struct ftl_io_init_opts *opts) +{ + struct ftl_io *io = opts->io; + struct ftl_io *parent = opts->parent; + struct spdk_ftl_dev *dev = opts->dev; + const struct iovec *iov; + size_t iov_cnt, iov_off; + + if (!io) { + if (parent) { + io = ftl_io_alloc_child(parent); + } else { + io = ftl_io_alloc(ftl_get_io_channel(dev)); + } + + if (!io) { + return NULL; + } + } + + ftl_io_clear(io); + ftl_io_init(io, dev, opts->cb_fn, opts->cb_ctx, opts->flags | FTL_IO_INTERNAL, opts->type); + + io->batch = opts->batch; + io->band = opts->band; + io->md = opts->md; + io->iov = &io->iov_buf[0]; + + if (parent) { + if (parent->flags & FTL_IO_VECTOR_LBA) { + io->lba.vector = parent->lba.vector + parent->pos; + } else { + io->lba.single = parent->lba.single + parent->pos; + } + + iov = &parent->iov[parent->iov_pos]; + iov_cnt = parent->iov_cnt - parent->iov_pos; + iov_off = parent->iov_off; + } else { + iov = &opts->iovs[0]; + iov_cnt = opts->iovcnt; + iov_off = 0; + } + + /* Some requests (zone resets) do not use iovecs */ + if (iov_cnt > 0) { + ftl_io_init_iovec(io, iov, iov_cnt, iov_off, opts->num_blocks); + } + + if (opts->flags & FTL_IO_VECTOR_LBA) { + io->lba.vector = calloc(io->num_blocks, sizeof(uint64_t)); + if (!io->lba.vector) { + ftl_io_free(io); + return NULL; + } + } + + return io; +} + +struct ftl_io * +ftl_io_wbuf_init(struct spdk_ftl_dev *dev, struct ftl_addr addr, struct ftl_band *band, + struct ftl_batch *batch, ftl_io_fn cb) +{ + struct ftl_io *io; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .batch = batch, + .band = band, + .size = sizeof(struct ftl_io), + .flags = 0, + .type = FTL_IO_WRITE, + .num_blocks = dev->xfer_size, + .cb_fn = cb, + .iovcnt = dev->xfer_size, + .md = batch->metadata, + }; + + memcpy(opts.iovs, batch->iov, sizeof(struct iovec) * dev->xfer_size); + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->addr = addr; + + return io; +} + +struct ftl_io * +ftl_io_erase_init(struct ftl_band *band, size_t num_blocks, ftl_io_fn cb) +{ + struct ftl_io *io; + struct ftl_io_init_opts opts = { + .dev = band->dev, + .io = NULL, + .band = band, + .size = sizeof(struct ftl_io), + .flags = FTL_IO_PHYSICAL_MODE, + .type = FTL_IO_ERASE, + .num_blocks = 1, + .cb_fn = cb, + .iovcnt = 0, + .md = NULL, + }; + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->num_blocks = num_blocks; + + return io; +} + +static void +_ftl_user_cb(struct ftl_io *io, void *arg, int status) +{ + io->user_fn(arg, status); +} + +struct ftl_io * +ftl_io_user_init(struct spdk_io_channel *_ioch, uint64_t lba, size_t num_blocks, struct iovec *iov, + size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_ctx, int type) +{ + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(_ioch); + struct spdk_ftl_dev *dev = ioch->dev; + struct ftl_io *io; + + io = ftl_io_alloc(_ioch); + if 
(spdk_unlikely(!io)) { + return NULL; + } + + ftl_io_init(io, dev, _ftl_user_cb, cb_ctx, 0, type); + io->lba.single = lba; + io->user_fn = cb_fn; + io->iov = iov; + io->iov_cnt = iov_cnt; + io->num_blocks = num_blocks; + + ftl_trace_lba_io_init(io->dev, io); + return io; +} + +static void +_ftl_io_free(struct ftl_io *io) +{ + struct ftl_io_channel *ioch; + + assert(LIST_EMPTY(&io->children)); + + if (io->flags & FTL_IO_VECTOR_LBA) { + free(io->lba.vector); + } + + if (pthread_spin_destroy(&io->lock)) { + SPDK_ERRLOG("pthread_spin_destroy failed\n"); + } + + ioch = ftl_io_channel_get_ctx(io->ioch); + spdk_mempool_put(ioch->io_pool, io); +} + +static bool +ftl_io_remove_child(struct ftl_io *io) +{ + struct ftl_io *parent = io->parent; + bool parent_done; + + pthread_spin_lock(&parent->lock); + LIST_REMOVE(io, child_entry); + parent_done = parent->done && LIST_EMPTY(&parent->children); + parent->status = parent->status ? : io->status; + pthread_spin_unlock(&parent->lock); + + return parent_done; +} + +void +ftl_io_complete(struct ftl_io *io) +{ + struct ftl_io *parent = io->parent; + bool complete; + + io->flags &= ~FTL_IO_INITIALIZED; + + pthread_spin_lock(&io->lock); + complete = LIST_EMPTY(&io->children); + io->done = true; + pthread_spin_unlock(&io->lock); + + if (complete) { + if (io->cb_fn) { + io->cb_fn(io, io->cb_ctx, io->status); + } + + if (parent && ftl_io_remove_child(io)) { + ftl_io_complete(parent); + } + + _ftl_io_free(io); + } +} + +struct ftl_io * +ftl_io_alloc_child(struct ftl_io *parent) +{ + struct ftl_io *io; + + io = ftl_io_alloc(parent->ioch); + if (spdk_unlikely(!io)) { + return NULL; + } + + ftl_io_init(io, parent->dev, NULL, NULL, parent->flags, parent->type); + io->parent = parent; + + pthread_spin_lock(&parent->lock); + LIST_INSERT_HEAD(&parent->children, io, child_entry); + pthread_spin_unlock(&parent->lock); + + return io; +} + +void ftl_io_fail(struct ftl_io *io, int status) +{ + io->status = status; + ftl_io_advance(io, io->num_blocks - io->pos); +} + +void * +ftl_io_get_md(const struct ftl_io *io) +{ + if (!io->md) { + return NULL; + } + + return (char *)io->md + io->pos * io->dev->md_size; +} + +struct ftl_io * +ftl_io_alloc(struct spdk_io_channel *ch) +{ + struct ftl_io *io; + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(ch); + + io = spdk_mempool_get(ioch->io_pool); + if (!io) { + return NULL; + } + + memset(io, 0, ioch->elem_size); + io->ioch = ch; + + if (pthread_spin_init(&io->lock, PTHREAD_PROCESS_PRIVATE)) { + SPDK_ERRLOG("pthread_spin_init failed\n"); + spdk_mempool_put(ioch->io_pool, io); + return NULL; + } + + return io; +} + +void +ftl_io_reinit(struct ftl_io *io, ftl_io_fn cb, void *ctx, int flags, int type) +{ + ftl_io_clear(io); + ftl_io_init(io, io->dev, cb, ctx, flags, type); +} + +void +ftl_io_clear(struct ftl_io *io) +{ + ftl_io_reset(io); + + io->flags = 0; + io->batch = NULL; + io->band = NULL; +} + +void +ftl_io_reset(struct ftl_io *io) +{ + io->req_cnt = io->pos = io->iov_pos = io->iov_off = 0; + io->done = false; +} + +void +ftl_io_free(struct ftl_io *io) +{ + struct ftl_io *parent; + + if (!io) { + return; + } + + parent = io->parent; + if (parent && ftl_io_remove_child(io)) { + ftl_io_complete(parent); + } + + _ftl_io_free(io); +} + +void +ftl_io_call_foreach_child(struct ftl_io *io, int (*callback)(struct ftl_io *)) +{ + struct ftl_io *child, *tmp; + + assert(!io->done); + + /* + * If the IO doesn't have any children, it means that it directly describes a request (i.e. + * all of the buffers, LBAs, etc. are filled). 
Otherwise the IO only groups together several + * requests and may be partially filled, so the callback needs to be called on all of its + * children instead. + */ + if (LIST_EMPTY(&io->children)) { + callback(io); + return; + } + + LIST_FOREACH_SAFE(child, &io->children, child_entry, tmp) { + int rc = callback(child); + if (rc) { + assert(rc != -EAGAIN); + ftl_io_fail(io, rc); + break; + } + } + + /* + * If all the callbacks were processed or an error occurred, treat this IO as completed. + * Multiple calls to ftl_io_call_foreach_child are not supported, resubmissions are supposed + * to be handled in the callback. + */ + ftl_io_complete(io); +} diff --git a/src/spdk/lib/ftl/ftl_io.h b/src/spdk/lib/ftl/ftl_io.h new file mode 100644 index 000000000..d49dc3de7 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_io.h @@ -0,0 +1,351 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_IO_H +#define FTL_IO_H + +#include "spdk/stdinc.h" +#include "spdk/nvme.h" +#include "spdk/ftl.h" + +#include "ftl_addr.h" +#include "ftl_trace.h" + +struct spdk_ftl_dev; +struct ftl_band; +struct ftl_batch; +struct ftl_io; + +typedef int (*ftl_md_pack_fn)(struct ftl_band *); +typedef void (*ftl_io_fn)(struct ftl_io *, void *, int); + +/* IO flags */ +enum ftl_io_flags { + /* Indicates whether IO is already initialized */ + FTL_IO_INITIALIZED = (1 << 0), + /* Internal based IO (defrag, metadata etc.) 
*/ + FTL_IO_INTERNAL = (1 << 1), + /* Indicates that the IO should not go through if there's */ + /* already another one scheduled to the same LBA */ + FTL_IO_WEAK = (1 << 2), + /* Indicates that the IO is used for padding */ + FTL_IO_PAD = (1 << 3), + /* The IO operates on metadata */ + FTL_IO_MD = (1 << 4), + /* Using physical instead of logical address */ + FTL_IO_PHYSICAL_MODE = (1 << 5), + /* Indicates that IO contains noncontiguous LBAs */ + FTL_IO_VECTOR_LBA = (1 << 6), + /* The IO is directed to non-volatile cache */ + FTL_IO_CACHE = (1 << 7), + /* Indicates that physical address should be taken from IO struct, */ + /* not assigned by wptr, only works if wptr is also in direct mode */ + FTL_IO_DIRECT_ACCESS = (1 << 8), + /* Bypass the non-volatile cache */ + FTL_IO_BYPASS_CACHE = (1 << 9), +}; + +enum ftl_io_type { + FTL_IO_READ, + FTL_IO_WRITE, + FTL_IO_ERASE, +}; + +#define FTL_IO_MAX_IOVEC 64 + +struct ftl_io_init_opts { + struct spdk_ftl_dev *dev; + + /* IO descriptor */ + struct ftl_io *io; + + /* Parent request */ + struct ftl_io *parent; + + /* Size of IO descriptor */ + size_t size; + + /* IO flags */ + int flags; + + /* IO type */ + enum ftl_io_type type; + + /* Transfer batch, set for IO going through the write buffer */ + struct ftl_batch *batch; + + /* Band to which the IO is directed */ + struct ftl_band *band; + + /* Number of logical blocks */ + size_t num_blocks; + + /* Data */ + struct iovec iovs[FTL_IO_MAX_IOVEC]; + int iovcnt; + + /* Metadata */ + void *md; + + /* Callback's function */ + ftl_io_fn cb_fn; + + /* Callback's context */ + void *cb_ctx; +}; + +struct ftl_io_channel; + +struct ftl_wbuf_entry { + /* IO channel that owns the write buffer entry */ + struct ftl_io_channel *ioch; + /* Data payload (single block) */ + void *payload; + /* Index within the IO channel's wbuf_entries array */ + uint32_t index; + uint32_t io_flags; + /* Points at the band the data is copied from. Only valid for internal + * requests coming from reloc. + */ + struct ftl_band *band; + /* Physical address of that particular block. Valid once the data has + * been written out. + */ + struct ftl_addr addr; + /* Logical block address */ + uint64_t lba; + + /* Trace ID of the requests the entry is part of */ + uint64_t trace; + + /* Indicates that the entry was written out and is still present in the + * L2P table. 
+ */ + bool valid; + /* Lock that protects the entry from being evicted from the L2P */ + pthread_spinlock_t lock; + TAILQ_ENTRY(ftl_wbuf_entry) tailq; +}; + +#define FTL_IO_CHANNEL_INDEX_INVALID ((uint64_t)-1) + +struct ftl_io_channel { + /* Device */ + struct spdk_ftl_dev *dev; + /* IO pool element size */ + size_t elem_size; + /* Index within the IO channel array */ + uint64_t index; + /* IO pool */ + struct spdk_mempool *io_pool; + /* Underlying device IO channel */ + struct spdk_io_channel *base_ioch; + /* Persistent cache IO channel */ + struct spdk_io_channel *cache_ioch; + /* Poller used for completing write requests and retrying IO */ + struct spdk_poller *poller; + /* Write completion queue */ + TAILQ_HEAD(, ftl_io) write_cmpl_queue; + TAILQ_HEAD(, ftl_io) retry_queue; + TAILQ_ENTRY(ftl_io_channel) tailq; + + /* Array of write buffer entries */ + struct ftl_wbuf_entry *wbuf_entries; + /* Write buffer data payload */ + void *wbuf_payload; + /* Number of write buffer entries */ + uint32_t num_entries; + /* Write buffer queues */ + struct spdk_ring *free_queue; + struct spdk_ring *submit_queue; + /* Maximum number of concurrent user writes */ + uint32_t qdepth_limit; + /* Current number of concurrent user writes */ + uint32_t qdepth_current; + /* Means that the IO channel is being flushed */ + bool flush; +}; + +/* General IO descriptor */ +struct ftl_io { + /* Device */ + struct spdk_ftl_dev *dev; + + /* IO channel */ + struct spdk_io_channel *ioch; + + union { + /* LBA table */ + uint64_t *vector; + + /* First LBA */ + uint64_t single; + } lba; + + /* First block address */ + struct ftl_addr addr; + + /* Number of processed blocks */ + size_t pos; + + /* Number of blocks */ + size_t num_blocks; + + /* IO vector pointer */ + struct iovec *iov; + + /* IO vector buffer for internal requests */ + struct iovec iov_buf[FTL_IO_MAX_IOVEC]; + + /* Metadata */ + void *md; + + /* Number of IO vectors */ + size_t iov_cnt; + + /* Position within the iovec */ + size_t iov_pos; + + /* Offset within the iovec (in blocks) */ + size_t iov_off; + + /* Transfer batch (valid only for writes going through the write buffer) */ + struct ftl_batch *batch; + + /* Band this IO is being written to */ + struct ftl_band *band; + + /* Request status */ + int status; + + /* Number of split requests */ + size_t req_cnt; + + /* Callback's function */ + ftl_io_fn cb_fn; + + /* Callback's context */ + void *cb_ctx; + + /* User callback function */ + spdk_ftl_fn user_fn; + + /* Flags */ + int flags; + + /* IO type */ + enum ftl_io_type type; + + /* Done flag */ + bool done; + + /* Parent request */ + struct ftl_io *parent; + /* Child requests list */ + LIST_HEAD(, ftl_io) children; + /* Child list link */ + LIST_ENTRY(ftl_io) child_entry; + /* Children lock */ + pthread_spinlock_t lock; + + /* Trace group id */ + uint64_t trace; + + /* Used by retry and write completion queues */ + TAILQ_ENTRY(ftl_io) ioch_entry; +}; + +/* Metadata IO */ +struct ftl_md_io { + /* Parent IO structure */ + struct ftl_io io; + + /* Serialization/deserialization callback */ + ftl_md_pack_fn pack_fn; + + /* Callback's function */ + ftl_io_fn cb_fn; + + /* Callback's context */ + void *cb_ctx; +}; + +static inline bool +ftl_io_mode_physical(const struct ftl_io *io) +{ + return io->flags & FTL_IO_PHYSICAL_MODE; +} + +static inline bool +ftl_io_mode_logical(const struct ftl_io *io) +{ + return !ftl_io_mode_physical(io); +} + +static inline bool +ftl_io_done(const struct ftl_io *io) +{ + return io->req_cnt == 0 && io->pos == io->num_blocks; +} 
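/*
 * Illustrative sketch (not part of the patched sources): how the declarations
 * below fit together on the user read path. A descriptor is built with
 * ftl_io_user_init() and handed to the core submission routine ftl_io_read()
 * (declared in ftl_core.h and used, e.g., by ftl_reloc.c); once every split
 * request has finished and ftl_io_done() holds, ftl_io_complete() invokes the
 * user callback and returns the descriptor to its channel's mempool. The
 * wrapper name example_submit_read() is hypothetical.
 */
static int
example_submit_read(struct spdk_io_channel *ioch, uint64_t lba, size_t num_blocks,
		    struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
{
	struct ftl_io *io;

	io = ftl_io_user_init(ioch, lba, num_blocks, iov, iov_cnt, cb_fn, cb_arg,
			      FTL_IO_READ);
	if (spdk_unlikely(!io)) {
		return -ENOMEM;
	}

	/* The read path walks the buffers via ftl_io_iovec_addr() and
	 * ftl_io_iovec_len_left(), advancing with ftl_io_advance() as blocks
	 * complete. */
	ftl_io_read(io);

	return 0;
}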
+ +struct ftl_io *ftl_io_alloc(struct spdk_io_channel *ch); +struct ftl_io *ftl_io_alloc_child(struct ftl_io *parent); +void ftl_io_fail(struct ftl_io *io, int status); +void ftl_io_free(struct ftl_io *io); +struct ftl_io *ftl_io_init_internal(const struct ftl_io_init_opts *opts); +void ftl_io_reinit(struct ftl_io *io, ftl_io_fn cb, + void *ctx, int flags, int type); +void ftl_io_clear(struct ftl_io *io); +void ftl_io_inc_req(struct ftl_io *io); +void ftl_io_dec_req(struct ftl_io *io); +struct iovec *ftl_io_iovec(struct ftl_io *io); +uint64_t ftl_io_current_lba(const struct ftl_io *io); +uint64_t ftl_io_get_lba(const struct ftl_io *io, size_t offset); +void ftl_io_advance(struct ftl_io *io, size_t num_blocks); +size_t ftl_iovec_num_blocks(struct iovec *iov, size_t iov_cnt); +void *ftl_io_iovec_addr(struct ftl_io *io); +size_t ftl_io_iovec_len_left(struct ftl_io *io); +struct ftl_io *ftl_io_wbuf_init(struct spdk_ftl_dev *dev, struct ftl_addr addr, + struct ftl_band *band, struct ftl_batch *batch, ftl_io_fn cb); +struct ftl_io *ftl_io_erase_init(struct ftl_band *band, size_t num_blocks, ftl_io_fn cb); +struct ftl_io *ftl_io_user_init(struct spdk_io_channel *ioch, uint64_t lba, size_t num_blocks, + struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, + void *cb_arg, int type); +void *ftl_io_get_md(const struct ftl_io *io); +void ftl_io_complete(struct ftl_io *io); +void ftl_io_shrink_iovec(struct ftl_io *io, size_t num_blocks); +void ftl_io_process_error(struct ftl_io *io, const struct spdk_nvme_cpl *status); +void ftl_io_reset(struct ftl_io *io); +void ftl_io_call_foreach_child(struct ftl_io *io, int (*callback)(struct ftl_io *)); + +#endif /* FTL_IO_H */ diff --git a/src/spdk/lib/ftl/ftl_reloc.c b/src/spdk/lib/ftl/ftl_reloc.c new file mode 100644 index 000000000..e59bf4d81 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_reloc.c @@ -0,0 +1,860 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/likely.h" +#include "spdk_internal/log.h" +#include "spdk/ftl.h" + +#include "ftl_reloc.h" +#include "ftl_core.h" +#include "ftl_io.h" +#include "ftl_band.h" +#include "ftl_debug.h" + +/* Maximum active reloc moves */ +#define FTL_RELOC_MAX_MOVES 256 + +struct ftl_reloc; +struct ftl_band_reloc; + +enum ftl_reloc_move_state { + FTL_RELOC_STATE_READ_LBA_MAP, + FTL_RELOC_STATE_READ, + FTL_RELOC_STATE_WRITE, +}; + +enum ftl_band_reloc_state { + FTL_BAND_RELOC_STATE_INACTIVE, + FTL_BAND_RELOC_STATE_PENDING, + FTL_BAND_RELOC_STATE_ACTIVE, + FTL_BAND_RELOC_STATE_HIGH_PRIO +}; + +struct ftl_reloc_move { + struct ftl_band_reloc *breloc; + + /* Start addr */ + struct ftl_addr addr; + + /* Number of logical blocks */ + size_t num_blocks; + + /* Data buffer */ + void *data; + + /* Move state (read lba_map, read, write) */ + enum ftl_reloc_move_state state; + + /* IO associated with move */ + struct ftl_io *io; + + STAILQ_ENTRY(ftl_reloc_move) entry; +}; + +struct ftl_band_reloc { + struct ftl_reloc *parent; + + /* Band being relocated */ + struct ftl_band *band; + + /* Number of logical blocks to be relocated */ + size_t num_blocks; + + /* Bitmap of logical blocks to be relocated */ + struct spdk_bit_array *reloc_map; + + /* State of the band reloc */ + enum ftl_band_reloc_state state; + + /* The band is being defragged */ + bool defrag; + + /* Reloc map iterator */ + struct { + /* Array of zone offsets */ + size_t *zone_offset; + + /* Current zone */ + size_t zone_current; + } iter; + + /* Number of outstanding moves */ + size_t num_outstanding; + + /* Pool of move objects */ + struct ftl_reloc_move *moves; + + /* Move queue */ + STAILQ_HEAD(, ftl_reloc_move) move_queue; + + TAILQ_ENTRY(ftl_band_reloc) entry; +}; + +struct ftl_reloc { + /* Device associated with relocate */ + struct spdk_ftl_dev *dev; + + /* Indicates relocate is about to halt */ + bool halt; + + /* Maximum number of IOs per band */ + size_t max_qdepth; + + /* Maximum number of active band relocates */ + size_t max_active; + + /* Maximum transfer size (in logical blocks) per single IO */ + size_t xfer_size; + /* Number of bands being defragged */ + size_t num_defrag_bands; + + /* Array of band relocates */ + struct ftl_band_reloc *brelocs; + + /* Number of active/priority band relocates */ + size_t num_active; + + /* Priority band relocates queue */ + TAILQ_HEAD(, ftl_band_reloc) prio_queue; + + /* Active band relocates queue */ + TAILQ_HEAD(, ftl_band_reloc) active_queue; + + /* Pending band relocates queue */ + TAILQ_HEAD(, ftl_band_reloc) pending_queue; +}; + +bool +ftl_reloc_is_defrag_active(const struct ftl_reloc *reloc) +{ + return reloc->num_defrag_bands > 0; +} + +static size_t +ftl_reloc_iter_zone_offset(struct ftl_band_reloc *breloc) +{ + size_t zone = breloc->iter.zone_current; + + return breloc->iter.zone_offset[zone]; +} + +static size_t +ftl_reloc_iter_zone_done(struct ftl_band_reloc *breloc) +{ + size_t num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev); + + return ftl_reloc_iter_zone_offset(breloc) == num_blocks; +} + +static void +ftl_reloc_clr_block(struct ftl_band_reloc *breloc, size_t block_off) +{ + if (!spdk_bit_array_get(breloc->reloc_map, block_off)) { + return; + } + + spdk_bit_array_clear(breloc->reloc_map, block_off); + assert(breloc->num_blocks); + breloc->num_blocks--; +} + +static void +ftl_reloc_read_lba_map_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_reloc_move *move = arg; + struct ftl_band_reloc *breloc = move->breloc; + + breloc->num_outstanding--; + 
assert(status == 0); + move->state = FTL_RELOC_STATE_WRITE; + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); +} + +static int +ftl_reloc_read_lba_map(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move) +{ + struct ftl_band *band = breloc->band; + + breloc->num_outstanding++; + return ftl_band_read_lba_map(band, ftl_band_block_offset_from_addr(band, move->addr), + move->num_blocks, ftl_reloc_read_lba_map_cb, move); +} + +static void +ftl_reloc_prep(struct ftl_band_reloc *breloc) +{ + struct ftl_band *band = breloc->band; + struct ftl_reloc *reloc = breloc->parent; + struct ftl_reloc_move *move; + size_t i; + + reloc->num_active++; + + if (!band->high_prio) { + if (ftl_band_alloc_lba_map(band)) { + SPDK_ERRLOG("Failed to allocate lba map\n"); + assert(false); + } + } else { + ftl_band_acquire_lba_map(band); + } + + for (i = 0; i < reloc->max_qdepth; ++i) { + move = &breloc->moves[i]; + move->state = FTL_RELOC_STATE_READ; + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); + } +} + +static void +ftl_reloc_free_move(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move) +{ + assert(move); + spdk_dma_free(move->data); + memset(move, 0, sizeof(*move)); + move->state = FTL_RELOC_STATE_READ; +} + +static void +ftl_reloc_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_reloc_move *move = arg; + struct ftl_addr addr = move->addr; + struct ftl_band_reloc *breloc = move->breloc; + size_t i; + + breloc->num_outstanding--; + + if (status) { + SPDK_ERRLOG("Reloc write failed with status: %d\n", status); + assert(false); + return; + } + + for (i = 0; i < move->num_blocks; ++i) { + addr.offset = move->addr.offset + i; + size_t block_off = ftl_band_block_offset_from_addr(breloc->band, addr); + ftl_reloc_clr_block(breloc, block_off); + } + + ftl_reloc_free_move(breloc, move); + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); +} + +static void +ftl_reloc_read_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_reloc_move *move = arg; + struct ftl_band_reloc *breloc = move->breloc; + + breloc->num_outstanding--; + + /* TODO: We should handle failures of the relocation read. We need to inform the */ + /* user that this group of blocks is bad (update the l2p with the bad block address and */ + /* put it into lba_map/sector_lba). Maybe we could also retry the read with smaller granularity? 
*/ + if (status) { + SPDK_ERRLOG("Reloc read failed with status: %d\n", status); + assert(false); + return; + } + + move->state = FTL_RELOC_STATE_READ_LBA_MAP; + move->io = NULL; + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); +} + +static void +ftl_reloc_iter_reset(struct ftl_band_reloc *breloc) +{ + memset(breloc->iter.zone_offset, 0, ftl_get_num_punits(breloc->band->dev) * + sizeof(*breloc->iter.zone_offset)); + breloc->iter.zone_current = 0; +} + +static size_t +ftl_reloc_iter_block_offset(struct ftl_band_reloc *breloc) +{ + size_t zone_offset = breloc->iter.zone_current * ftl_get_num_blocks_in_zone(breloc->parent->dev); + + return breloc->iter.zone_offset[breloc->iter.zone_current] + zone_offset; +} + +static void +ftl_reloc_iter_next_zone(struct ftl_band_reloc *breloc) +{ + size_t num_zones = ftl_get_num_punits(breloc->band->dev); + + breloc->iter.zone_current = (breloc->iter.zone_current + 1) % num_zones; +} + +static int +ftl_reloc_block_valid(struct ftl_band_reloc *breloc, size_t block_off) +{ + struct ftl_addr addr = ftl_band_addr_from_block_offset(breloc->band, block_off); + + return ftl_addr_is_written(breloc->band, addr) && + spdk_bit_array_get(breloc->reloc_map, block_off) && + ftl_band_block_offset_valid(breloc->band, block_off); +} + +static int +ftl_reloc_iter_next(struct ftl_band_reloc *breloc, size_t *block_off) +{ + size_t zone = breloc->iter.zone_current; + + *block_off = ftl_reloc_iter_block_offset(breloc); + + if (ftl_reloc_iter_zone_done(breloc)) { + return 0; + } + + breloc->iter.zone_offset[zone]++; + + if (!ftl_reloc_block_valid(breloc, *block_off)) { + ftl_reloc_clr_block(breloc, *block_off); + return 0; + } + + return 1; +} + +static int +ftl_reloc_first_valid_block(struct ftl_band_reloc *breloc, size_t *block_off) +{ + size_t i, num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev); + + for (i = ftl_reloc_iter_zone_offset(breloc); i < num_blocks; ++i) { + if (ftl_reloc_iter_next(breloc, block_off)) { + return 1; + } + } + + return 0; +} + +static int +ftl_reloc_iter_done(struct ftl_band_reloc *breloc) +{ + size_t i; + size_t num_zones = ftl_get_num_punits(breloc->band->dev); + size_t num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev); + + for (i = 0; i < num_zones; ++i) { + if (breloc->iter.zone_offset[i] != num_blocks) { + return 0; + } + } + + return 1; +} + +static size_t +ftl_reloc_find_valid_blocks(struct ftl_band_reloc *breloc, + size_t _num_blocks, struct ftl_addr *addr) +{ + size_t block_off, num_blocks = 0; + + if (!ftl_reloc_first_valid_block(breloc, &block_off)) { + return 0; + } + + *addr = ftl_band_addr_from_block_offset(breloc->band, block_off); + + for (num_blocks = 1; num_blocks < _num_blocks; num_blocks++) { + if (!ftl_reloc_iter_next(breloc, &block_off)) { + break; + } + } + + return num_blocks; +} + +static size_t +ftl_reloc_next_blocks(struct ftl_band_reloc *breloc, struct ftl_addr *addr) +{ + size_t i, num_blocks = 0; + struct spdk_ftl_dev *dev = breloc->parent->dev; + + for (i = 0; i < ftl_get_num_punits(dev); ++i) { + num_blocks = ftl_reloc_find_valid_blocks(breloc, breloc->parent->xfer_size, addr); + ftl_reloc_iter_next_zone(breloc); + + if (num_blocks || ftl_reloc_iter_done(breloc)) { + break; + } + } + + return num_blocks; +} + +static struct ftl_io * +ftl_reloc_io_init(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move, + ftl_io_fn fn, enum ftl_io_type io_type, int flags) +{ + size_t block_off, i; + struct ftl_addr addr = move->addr; + struct ftl_io *io = NULL; + struct ftl_io_init_opts opts = { + 
.dev = breloc->parent->dev, + .band = breloc->band, + .size = sizeof(*io), + .flags = flags | FTL_IO_INTERNAL | FTL_IO_PHYSICAL_MODE, + .type = io_type, + .num_blocks = move->num_blocks, + .iovs = { + { + .iov_base = move->data, + .iov_len = move->num_blocks * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + .cb_fn = fn, + }; + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->cb_ctx = move; + io->addr = move->addr; + + if (flags & FTL_IO_VECTOR_LBA) { + for (i = 0; i < io->num_blocks; ++i, ++addr.offset) { + block_off = ftl_band_block_offset_from_addr(breloc->band, addr); + + if (!ftl_band_block_offset_valid(breloc->band, block_off)) { + io->lba.vector[i] = FTL_LBA_INVALID; + continue; + } + + io->lba.vector[i] = breloc->band->lba_map.map[block_off]; + } + } + + ftl_trace_lba_io_init(io->dev, io); + + return io; +} + +static int +ftl_reloc_write(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move) +{ + int io_flags = FTL_IO_WEAK | FTL_IO_VECTOR_LBA | FTL_IO_BYPASS_CACHE; + + if (spdk_likely(!move->io)) { + move->io = ftl_reloc_io_init(breloc, move, ftl_reloc_write_cb, + FTL_IO_WRITE, io_flags); + if (!move->io) { + ftl_reloc_free_move(breloc, move); + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); + return -ENOMEM; + } + } + + breloc->num_outstanding++; + ftl_io_write(move->io); + return 0; +} + +static int +ftl_reloc_read(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move) +{ + struct ftl_addr addr = {}; + + move->num_blocks = ftl_reloc_next_blocks(breloc, &addr); + move->breloc = breloc; + move->addr = addr; + + if (!move->num_blocks) { + return 0; + } + + move->data = spdk_dma_malloc(FTL_BLOCK_SIZE * move->num_blocks, 4096, NULL); + if (!move->data) { + return -1; + } + + move->io = ftl_reloc_io_init(breloc, move, ftl_reloc_read_cb, FTL_IO_READ, 0); + if (!move->io) { + ftl_reloc_free_move(breloc, move); + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); + SPDK_ERRLOG("Failed to initialize io for relocation."); + return -1; + } + + breloc->num_outstanding++; + ftl_io_read(move->io); + return 0; +} + +static void +ftl_reloc_process_moves(struct ftl_band_reloc *breloc) +{ + struct ftl_reloc_move *move; + STAILQ_HEAD(, ftl_reloc_move) move_queue; + int rc = 0; + + /* + * When IO allocation fails, we do not want to retry immediately so keep moves on + * temporary queue + */ + STAILQ_INIT(&move_queue); + STAILQ_SWAP(&breloc->move_queue, &move_queue, ftl_reloc_move); + + while (!STAILQ_EMPTY(&move_queue)) { + move = STAILQ_FIRST(&move_queue); + STAILQ_REMOVE_HEAD(&move_queue, entry); + + switch (move->state) { + case FTL_RELOC_STATE_READ_LBA_MAP: + rc = ftl_reloc_read_lba_map(breloc, move); + break; + case FTL_RELOC_STATE_READ: + rc = ftl_reloc_read(breloc, move); + break; + case FTL_RELOC_STATE_WRITE: + rc = ftl_reloc_write(breloc, move); + break; + default: + assert(false); + break; + } + + if (rc) { + SPDK_ERRLOG("Move queue processing failed\n"); + assert(false); + } + } +} + +static bool +ftl_reloc_done(struct ftl_band_reloc *breloc) +{ + return !breloc->num_outstanding && STAILQ_EMPTY(&breloc->move_queue); +} + +static void +ftl_reloc_release(struct ftl_band_reloc *breloc) +{ + struct ftl_reloc *reloc = breloc->parent; + struct ftl_band *band = breloc->band; + + ftl_reloc_iter_reset(breloc); + ftl_band_release_lba_map(band); + reloc->num_active--; + + if (breloc->state == FTL_BAND_RELOC_STATE_HIGH_PRIO) { + /* High prio band must be relocated as a whole and ANM events will be ignored */ + assert(breloc->num_blocks == 0 && 
ftl_band_empty(band)); + TAILQ_REMOVE(&reloc->prio_queue, breloc, entry); + band->high_prio = 0; + breloc->state = FTL_BAND_RELOC_STATE_INACTIVE; + } else { + assert(breloc->state == FTL_BAND_RELOC_STATE_ACTIVE); + TAILQ_REMOVE(&reloc->active_queue, breloc, entry); + breloc->state = FTL_BAND_RELOC_STATE_INACTIVE; + + /* If we got ANM event during relocation put such band back to pending queue */ + if (breloc->num_blocks != 0) { + breloc->state = FTL_BAND_RELOC_STATE_PENDING; + TAILQ_INSERT_TAIL(&reloc->pending_queue, breloc, entry); + return; + } + } + + if (ftl_band_empty(band) && band->state == FTL_BAND_STATE_CLOSED) { + ftl_band_set_state(breloc->band, FTL_BAND_STATE_FREE); + + if (breloc->defrag) { + breloc->defrag = false; + assert(reloc->num_defrag_bands > 0); + reloc->num_defrag_bands--; + } + } +} + +static void +ftl_process_reloc(struct ftl_band_reloc *breloc) +{ + ftl_reloc_process_moves(breloc); + + if (ftl_reloc_done(breloc)) { + ftl_reloc_release(breloc); + } +} + +static int +ftl_band_reloc_init(struct ftl_reloc *reloc, struct ftl_band_reloc *breloc, + struct ftl_band *band) +{ + breloc->band = band; + breloc->parent = reloc; + + breloc->reloc_map = spdk_bit_array_create(ftl_get_num_blocks_in_band(reloc->dev)); + if (!breloc->reloc_map) { + SPDK_ERRLOG("Failed to initialize reloc map"); + return -1; + } + + breloc->iter.zone_offset = calloc(ftl_get_num_punits(band->dev), + sizeof(*breloc->iter.zone_offset)); + if (!breloc->iter.zone_offset) { + SPDK_ERRLOG("Failed to initialize reloc iterator"); + return -1; + } + + STAILQ_INIT(&breloc->move_queue); + + breloc->moves = calloc(reloc->max_qdepth, sizeof(*breloc->moves)); + if (!breloc->moves) { + return -1; + } + + return 0; +} + +static void +ftl_band_reloc_free(struct ftl_band_reloc *breloc) +{ + struct ftl_reloc_move *move; + + if (!breloc) { + return; + } + + assert(breloc->num_outstanding == 0); + + /* Drain write queue if there is active band relocation during shutdown */ + if (breloc->state == FTL_BAND_RELOC_STATE_ACTIVE || + breloc->state == FTL_BAND_RELOC_STATE_HIGH_PRIO) { + assert(breloc->parent->halt); + STAILQ_FOREACH(move, &breloc->move_queue, entry) { + ftl_reloc_free_move(breloc, move); + } + } + + spdk_bit_array_free(&breloc->reloc_map); + free(breloc->iter.zone_offset); + free(breloc->moves); +} + +struct ftl_reloc * +ftl_reloc_init(struct spdk_ftl_dev *dev) +{ + struct ftl_reloc *reloc; + size_t i; + + reloc = calloc(1, sizeof(*reloc)); + if (!reloc) { + return NULL; + } + + reloc->dev = dev; + reloc->halt = true; + reloc->max_qdepth = dev->conf.max_reloc_qdepth; + reloc->max_active = dev->conf.max_active_relocs; + reloc->xfer_size = dev->xfer_size; + reloc->num_defrag_bands = 0; + + if (reloc->max_qdepth > FTL_RELOC_MAX_MOVES) { + goto error; + } + + reloc->brelocs = calloc(ftl_get_num_bands(dev), sizeof(*reloc->brelocs)); + if (!reloc->brelocs) { + goto error; + } + + for (i = 0; i < ftl_get_num_bands(reloc->dev); ++i) { + if (ftl_band_reloc_init(reloc, &reloc->brelocs[i], &dev->bands[i])) { + goto error; + } + } + + TAILQ_INIT(&reloc->pending_queue); + TAILQ_INIT(&reloc->active_queue); + TAILQ_INIT(&reloc->prio_queue); + + return reloc; +error: + ftl_reloc_free(reloc); + return NULL; +} + +void +ftl_reloc_free(struct ftl_reloc *reloc) +{ + size_t i; + + if (!reloc) { + return; + } + + for (i = 0; i < ftl_get_num_bands(reloc->dev); ++i) { + ftl_band_reloc_free(&reloc->brelocs[i]); + } + + free(reloc->brelocs); + free(reloc); +} + +bool +ftl_reloc_is_halted(const struct ftl_reloc *reloc) +{ + return 
reloc->halt; +} + +void +ftl_reloc_halt(struct ftl_reloc *reloc) +{ + reloc->halt = true; +} + +void +ftl_reloc_resume(struct ftl_reloc *reloc) +{ + reloc->halt = false; +} + +void +ftl_reloc(struct ftl_reloc *reloc) +{ + struct ftl_band_reloc *breloc, *tbreloc; + + if (ftl_reloc_is_halted(reloc)) { + return; + } + + /* Process first band from priority queue and return */ + breloc = TAILQ_FIRST(&reloc->prio_queue); + if (breloc) { + ftl_process_reloc(breloc); + return; + } + + TAILQ_FOREACH_SAFE(breloc, &reloc->pending_queue, entry, tbreloc) { + if (reloc->num_active == reloc->max_active) { + break; + } + + /* Wait for band to close before relocating */ + if (breloc->band->state != FTL_BAND_STATE_CLOSED) { + continue; + } + + ftl_reloc_prep(breloc); + assert(breloc->state == FTL_BAND_RELOC_STATE_PENDING); + TAILQ_REMOVE(&reloc->pending_queue, breloc, entry); + breloc->state = FTL_BAND_RELOC_STATE_ACTIVE; + TAILQ_INSERT_HEAD(&reloc->active_queue, breloc, entry); + } + + TAILQ_FOREACH_SAFE(breloc, &reloc->active_queue, entry, tbreloc) { + assert(breloc->state == FTL_BAND_RELOC_STATE_ACTIVE); + ftl_process_reloc(breloc); + } +} + +void +ftl_reloc_add(struct ftl_reloc *reloc, struct ftl_band *band, size_t offset, + size_t num_blocks, int prio, bool is_defrag) +{ + struct ftl_band_reloc *breloc = &reloc->brelocs[band->id]; + size_t i; + + /* No need to add anything if already at high prio - whole band should be relocated */ + if (!prio && band->high_prio) { + return; + } + + pthread_spin_lock(&band->lba_map.lock); + if (band->lba_map.num_vld == 0) { + pthread_spin_unlock(&band->lba_map.lock); + + /* If the band is closed and has no valid blocks, free it */ + if (band->state == FTL_BAND_STATE_CLOSED) { + ftl_band_set_state(band, FTL_BAND_STATE_FREE); + } + + return; + } + pthread_spin_unlock(&band->lba_map.lock); + + for (i = offset; i < offset + num_blocks; ++i) { + if (spdk_bit_array_get(breloc->reloc_map, i)) { + continue; + } + spdk_bit_array_set(breloc->reloc_map, i); + breloc->num_blocks++; + } + + /* If the band is coming from the defrag process, mark it appropriately */ + if (is_defrag) { + assert(offset == 0 && num_blocks == ftl_get_num_blocks_in_band(band->dev)); + reloc->num_defrag_bands++; + breloc->defrag = true; + } + + if (!prio) { + if (breloc->state == FTL_BAND_RELOC_STATE_INACTIVE) { + breloc->state = FTL_BAND_RELOC_STATE_PENDING; + TAILQ_INSERT_HEAD(&reloc->pending_queue, breloc, entry); + } + } else { + bool active = false; + /* If priority band is already on pending or active queue, remove it from it */ + switch (breloc->state) { + case FTL_BAND_RELOC_STATE_PENDING: + TAILQ_REMOVE(&reloc->pending_queue, breloc, entry); + break; + case FTL_BAND_RELOC_STATE_ACTIVE: + active = true; + TAILQ_REMOVE(&reloc->active_queue, breloc, entry); + break; + default: + break; + } + + breloc->state = FTL_BAND_RELOC_STATE_HIGH_PRIO; + TAILQ_INSERT_TAIL(&reloc->prio_queue, breloc, entry); + + /* + * If band has been already on active queue it doesn't need any additional + * resources + */ + if (!active) { + ftl_reloc_prep(breloc); + } + } +} diff --git a/src/spdk/lib/ftl/ftl_reloc.h b/src/spdk/lib/ftl/ftl_reloc.h new file mode 100644 index 000000000..21f49a47d --- /dev/null +++ b/src/spdk/lib/ftl/ftl_reloc.h @@ -0,0 +1,53 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_RELOC_H +#define FTL_RELOC_H + +#include "spdk/stdinc.h" +#include "spdk/ftl.h" + +struct ftl_reloc; +struct ftl_band; + +struct ftl_reloc *ftl_reloc_init(struct spdk_ftl_dev *dev); +void ftl_reloc_free(struct ftl_reloc *reloc); +void ftl_reloc_add(struct ftl_reloc *reloc, struct ftl_band *band, + size_t offset, size_t num_blocks, int prio, bool is_defrag); +void ftl_reloc(struct ftl_reloc *reloc); +void ftl_reloc_halt(struct ftl_reloc *reloc); +void ftl_reloc_resume(struct ftl_reloc *reloc); +bool ftl_reloc_is_halted(const struct ftl_reloc *reloc); +bool ftl_reloc_is_defrag_active(const struct ftl_reloc *reloc); + +#endif /* FTL_RELOC_H */ diff --git a/src/spdk/lib/ftl/ftl_restore.c b/src/spdk/lib/ftl/ftl_restore.c new file mode 100644 index 000000000..6f626645d --- /dev/null +++ b/src/spdk/lib/ftl/ftl_restore.c @@ -0,0 +1,1350 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/ftl.h" +#include "spdk/util.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/crc32.h" + +#include "ftl_core.h" +#include "ftl_band.h" +#include "ftl_io.h" + +struct ftl_restore_band { + struct ftl_restore *parent; + /* Associated band */ + struct ftl_band *band; + /* Status of retrieving this band's metadata */ + enum ftl_md_status md_status; + /* Padded queue link */ + STAILQ_ENTRY(ftl_restore_band) stailq; +}; + +struct ftl_nv_cache_restore; + +/* Describes single phase to be restored from non-volatile cache */ +struct ftl_nv_cache_range { + struct ftl_nv_cache_restore *parent; + /* Start offset */ + uint64_t start_addr; + /* Last block's address */ + uint64_t last_addr; + /* + * Number of blocks (can be smaller than the difference between the last + * and the starting block due to range overlap) + */ + uint64_t num_blocks; + /* Number of blocks already recovered */ + uint64_t num_recovered; + /* Current address during recovery */ + uint64_t current_addr; + /* Phase of the range */ + unsigned int phase; + /* Indicates whether the data from this range needs to be recovered */ + bool recovery; +}; + +struct ftl_nv_cache_block { + struct ftl_nv_cache_restore *parent; + /* Data buffer */ + void *buf; + /* Metadata buffer */ + void *md_buf; + /* Block offset within the cache */ + uint64_t offset; +}; + +struct ftl_nv_cache_restore { + struct ftl_nv_cache *nv_cache; + /* IO channel to use */ + struct spdk_io_channel *ioch; + /* + * Non-volatile cache ranges. The ranges can overlap, as we have no + * control over the order of completions. The phase of the range is the + * index within the table. The range with index 0 marks blocks that were + * never written. 
+ */ + struct ftl_nv_cache_range range[FTL_NV_CACHE_PHASE_COUNT]; +#define FTL_NV_CACHE_RESTORE_DEPTH 128 + /* Non-volatile cache buffers */ + struct ftl_nv_cache_block block[FTL_NV_CACHE_RESTORE_DEPTH]; + /* Current address */ + uint64_t current_addr; + /* Number of outstanding requests */ + size_t num_outstanding; + /* Recovery/scan status */ + int status; + /* Current phase of the recovery */ + unsigned int phase; +}; + +struct ftl_restore { + struct spdk_ftl_dev *dev; + /* Completion callback (called for each phase of the restoration) */ + ftl_restore_fn cb; + /* Completion callback context */ + void *cb_arg; + /* Number of inflight IOs */ + unsigned int num_ios; + /* Current band number (index in the below bands array) */ + unsigned int current; + /* Array of bands */ + struct ftl_restore_band *bands; + /* Queue of bands to be padded (due to unsafe shutdown) */ + STAILQ_HEAD(, ftl_restore_band) pad_bands; + /* Status of the padding */ + int pad_status; + /* Metadata buffer */ + void *md_buf; + /* LBA map buffer */ + void *lba_map; + /* Indicates we're in the final phase of the restoration */ + bool final_phase; + /* Non-volatile cache recovery */ + struct ftl_nv_cache_restore nv_cache; +}; + +static int +ftl_restore_tail_md(struct ftl_restore_band *rband); +static void +ftl_pad_zone_cb(struct ftl_io *io, void *arg, int status); +static void +ftl_restore_pad_band(struct ftl_restore_band *rband); + +static void +ftl_restore_free(struct ftl_restore *restore) +{ + unsigned int i; + + if (!restore) { + return; + } + + for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) { + spdk_dma_free(restore->nv_cache.block[i].buf); + } + + spdk_dma_free(restore->md_buf); + free(restore->bands); + free(restore); +} + +static struct ftl_restore * +ftl_restore_init(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg) +{ + struct ftl_restore *restore; + struct ftl_restore_band *rband; + size_t i; + + restore = calloc(1, sizeof(*restore)); + if (!restore) { + goto error; + } + + restore->dev = dev; + restore->cb = cb; + restore->cb_arg = cb_arg; + restore->final_phase = false; + + restore->bands = calloc(ftl_get_num_bands(dev), sizeof(*restore->bands)); + if (!restore->bands) { + goto error; + } + + STAILQ_INIT(&restore->pad_bands); + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + rband = &restore->bands[i]; + rband->band = &dev->bands[i]; + rband->parent = restore; + rband->md_status = FTL_MD_NO_MD; + } + + /* Allocate buffer capable of holding head mds of all bands */ + restore->md_buf = spdk_dma_zmalloc(ftl_get_num_bands(dev) * ftl_head_md_num_blocks(dev) * + FTL_BLOCK_SIZE, 0, NULL); + if (!restore->md_buf) { + goto error; + } + + return restore; +error: + ftl_restore_free(restore); + return NULL; +} + +static void +ftl_restore_complete(struct ftl_restore *restore, int status) +{ + struct ftl_restore *ctx = status ? 
NULL : restore; + bool final_phase = restore->final_phase; + + restore->cb(ctx, status, restore->cb_arg); + if (status || final_phase) { + ftl_restore_free(restore); + } +} + +static int +ftl_band_cmp(const void *lband, const void *rband) +{ + uint64_t lseq = ((struct ftl_restore_band *)lband)->band->seq; + uint64_t rseq = ((struct ftl_restore_band *)rband)->band->seq; + + if (lseq < rseq) { + return -1; + } else { + return 1; + } +} + +static int +ftl_restore_check_seq(const struct ftl_restore *restore) +{ + const struct spdk_ftl_dev *dev = restore->dev; + const struct ftl_restore_band *rband; + const struct ftl_band *next_band; + size_t i; + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + rband = &restore->bands[i]; + if (rband->md_status != FTL_MD_SUCCESS) { + continue; + } + + next_band = LIST_NEXT(rband->band, list_entry); + if (next_band && rband->band->seq == next_band->seq) { + return -1; + } + } + + return 0; +} + +static bool +ftl_restore_head_valid(struct spdk_ftl_dev *dev, struct ftl_restore *restore, size_t *num_valid) +{ + struct ftl_restore_band *rband; + size_t i; + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + rband = &restore->bands[i]; + + if (rband->md_status != FTL_MD_SUCCESS && + rband->md_status != FTL_MD_NO_MD && + rband->md_status != FTL_MD_IO_FAILURE) { + SPDK_ERRLOG("Inconsistent head metadata found on band %u\n", + rband->band->id); + return false; + } + + if (rband->md_status == FTL_MD_SUCCESS) { + (*num_valid)++; + } + } + + return true; +} + +static void +ftl_restore_head_complete(struct ftl_restore *restore) +{ + struct spdk_ftl_dev *dev = restore->dev; + size_t num_valid = 0; + int status = -EIO; + + if (!ftl_restore_head_valid(dev, restore, &num_valid)) { + goto out; + } + + if (num_valid == 0) { + SPDK_ERRLOG("Couldn't find any valid bands\n"); + goto out; + } + + /* Sort bands in sequence number ascending order */ + qsort(restore->bands, ftl_get_num_bands(dev), sizeof(struct ftl_restore_band), + ftl_band_cmp); + + if (ftl_restore_check_seq(restore)) { + SPDK_ERRLOG("Band sequence consistency failed\n"); + goto out; + } + + dev->num_lbas = dev->global_md.num_lbas; + status = 0; +out: + ftl_restore_complete(restore, status); +} + +static void +ftl_restore_head_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_restore_band *rband = ctx; + struct ftl_restore *restore = rband->parent; + unsigned int num_ios; + + rband->md_status = status; + num_ios = __atomic_fetch_sub(&restore->num_ios, 1, __ATOMIC_SEQ_CST); + assert(num_ios > 0); + + if (num_ios == 1) { + ftl_restore_head_complete(restore); + } +} + +static void +ftl_restore_head_md(void *ctx) +{ + struct ftl_restore *restore = ctx; + struct spdk_ftl_dev *dev = restore->dev; + struct ftl_restore_band *rband; + struct ftl_lba_map *lba_map; + unsigned int num_failed = 0, num_ios; + size_t i; + + restore->num_ios = ftl_get_num_bands(dev); + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + rband = &restore->bands[i]; + lba_map = &rband->band->lba_map; + + lba_map->dma_buf = restore->md_buf + i * ftl_head_md_num_blocks(dev) * FTL_BLOCK_SIZE; + + if (ftl_band_read_head_md(rband->band, ftl_restore_head_cb, rband)) { + if (spdk_likely(rband->band->num_zones)) { + SPDK_ERRLOG("Failed to read metadata on band %zu\n", i); + + rband->md_status = FTL_MD_INVALID_CRC; + + /* If the first IO fails, don't bother sending anything else */ + if (i == 0) { + ftl_restore_complete(restore, -EIO); + } + } + + num_failed++; + } + } + + if (spdk_unlikely(num_failed > 0)) { + num_ios = 
__atomic_fetch_sub(&restore->num_ios, num_failed, __ATOMIC_SEQ_CST); + if (num_ios == num_failed) { + ftl_restore_complete(restore, -EIO); + } + } +} + +int +ftl_restore_md(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg) +{ + struct ftl_restore *restore; + + restore = ftl_restore_init(dev, cb, cb_arg); + if (!restore) { + return -ENOMEM; + } + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_restore_head_md, restore); + + return 0; +} + +static int +ftl_restore_l2p(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_addr addr; + uint64_t lba; + size_t i; + + for (i = 0; i < ftl_get_num_blocks_in_band(band->dev); ++i) { + if (!spdk_bit_array_get(band->lba_map.vld, i)) { + continue; + } + + lba = band->lba_map.map[i]; + if (lba >= dev->num_lbas) { + return -1; + } + + addr = ftl_l2p_get(dev, lba); + if (!ftl_addr_invalid(addr)) { + ftl_invalidate_addr(dev, addr); + } + + addr = ftl_band_addr_from_block_offset(band, i); + + ftl_band_set_addr(band, lba, addr); + ftl_l2p_set(dev, lba, addr); + } + + return 0; +} + +static struct ftl_restore_band * +ftl_restore_next_band(struct ftl_restore *restore) +{ + struct ftl_restore_band *rband; + + for (; restore->current < ftl_get_num_bands(restore->dev); ++restore->current) { + rband = &restore->bands[restore->current]; + + if (spdk_likely(rband->band->num_zones) && + rband->md_status == FTL_MD_SUCCESS) { + restore->current++; + return rband; + } + } + + return NULL; +} + +static void +ftl_nv_cache_restore_complete(struct ftl_nv_cache_restore *restore, int status) +{ + struct ftl_restore *ftl_restore = SPDK_CONTAINEROF(restore, struct ftl_restore, nv_cache); + + restore->status = restore->status ? : status; + if (restore->num_outstanding == 0) { + ftl_restore_complete(ftl_restore, restore->status); + } +} + +static void ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); + +static void +ftl_nv_cache_restore_done(struct ftl_nv_cache_restore *restore, uint64_t current_addr) +{ + struct ftl_nv_cache *nv_cache = restore->nv_cache; + + pthread_spin_lock(&nv_cache->lock); + nv_cache->current_addr = current_addr; + nv_cache->ready = true; + pthread_spin_unlock(&nv_cache->lock); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Enabling non-volatile cache (phase: %u, addr: %" + PRIu64")\n", nv_cache->phase, current_addr); + + ftl_nv_cache_restore_complete(restore, 0); +} + +static void +ftl_nv_cache_write_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to write the non-volatile cache metadata header\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + ftl_nv_cache_restore_done(restore, FTL_NV_CACHE_DATA_OFFSET); +} + +static void +ftl_nv_cache_scrub_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Scrubbing non-volatile cache failed\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + nv_cache->phase = 1; + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_write_header_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to write the non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, -EIO); + } +} + +static void 
+ftl_nv_cache_scrub_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + rc = ftl_nv_cache_scrub(nv_cache, ftl_nv_cache_scrub_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to scrub the non-volatile cache: %s\n", spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_band_flush_cb(void *ctx, int status) +{ + struct ftl_nv_cache_restore *restore = ctx; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + if (spdk_unlikely(status != 0)) { + SPDK_ERRLOG("Flushing active bands failed: %s\n", spdk_strerror(-status)); + ftl_nv_cache_restore_complete(restore, status); + return; + } + + /* + * Use phase 0 to indicate that the cache is being scrubbed. If the power is lost during + * this process, we'll know it needs to be resumed. + */ + nv_cache->phase = 0; + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_scrub_header_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_wbuf_flush_cb(void *ctx, int status) +{ + struct ftl_nv_cache_restore *restore = ctx; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + int rc; + + if (spdk_unlikely(status != 0)) { + SPDK_ERRLOG("Flushing the write buffer failed: %s\n", spdk_strerror(-status)); + ftl_nv_cache_restore_complete(restore, status); + return; + } + + rc = ftl_flush_active_bands(dev, ftl_nv_cache_band_flush_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to flush active bands: %s\n", spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_recovery_done(struct ftl_nv_cache_restore *restore) +{ + struct ftl_nv_cache *nv_cache = restore->nv_cache; + struct ftl_nv_cache_range *range_prev, *range_current; + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct spdk_bdev *bdev; + uint64_t current_addr; + int rc; + + range_prev = &restore->range[ftl_nv_cache_prev_phase(nv_cache->phase)]; + range_current = &restore->range[nv_cache->phase]; + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + /* + * If there are more than two ranges or the ranges overlap, scrub the non-volatile cache to + * make sure that any subsequent power loss will find the cache in usable state + */ + if ((range_prev->num_blocks + range_current->num_blocks < nv_cache->num_data_blocks) || + (range_prev->start_addr < range_current->last_addr && + range_current->start_addr < range_prev->last_addr)) { + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache inconsistency detected\n"); + + rc = ftl_flush_wbuf(dev, ftl_nv_cache_wbuf_flush_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to flush the write buffer: %s\n", spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } + + return; + } + + /* The latest phase is the one written in the header (set in nvc_cache->phase) */ + current_addr = range_current->last_addr + 1; + + /* + * The first range might be empty (only the 
header was written) or the range might + * end at the last available address, in which case set current address to the + * beginning of the device. + */ + if (range_current->num_blocks == 0 || current_addr >= spdk_bdev_get_num_blocks(bdev)) { + current_addr = FTL_NV_CACHE_DATA_OFFSET; + } + + ftl_nv_cache_restore_done(restore, current_addr); +} + +static void +ftl_nv_cache_recover_block(struct ftl_nv_cache_block *block) +{ + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + struct ftl_nv_cache_range *range = &restore->range[restore->phase]; + int rc; + + assert(range->current_addr <= range->last_addr); + + restore->num_outstanding++; + block->offset = range->current_addr++; + rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch, + block->buf, block->md_buf, + block->offset, 1, ftl_nv_cache_block_read_cb, + block); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n", + block->offset, spdk_strerror(-rc)); + restore->num_outstanding--; + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_recover_range(struct ftl_nv_cache_restore *restore) +{ + struct ftl_nv_cache_range *range; + unsigned int phase = restore->phase; + + do { + /* Find first range with non-zero number of blocks that is marked for recovery */ + range = &restore->range[phase]; + if (range->recovery && range->num_recovered < range->num_blocks) { + break; + } + + phase = ftl_nv_cache_next_phase(phase); + } while (phase != restore->phase); + + /* There are no ranges to be recovered, we're done */ + if (range->num_recovered == range->num_blocks || !range->recovery) { + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache recovery done\n"); + ftl_nv_cache_recovery_done(restore); + return; + } + + range->current_addr = range->start_addr; + restore->phase = phase; + + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Recovering range %u %"PRIu64"-%"PRIu64" (%"PRIu64")\n", + phase, range->start_addr, range->last_addr, range->num_blocks); + + ftl_nv_cache_recover_block(&restore->block[0]); +} + +static void +ftl_nv_cache_write_cb(struct ftl_io *io, void *cb_arg, int status) +{ + struct ftl_nv_cache_block *block = cb_arg; + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache_range *range = &restore->range[restore->phase]; + + restore->num_outstanding--; + if (status != 0) { + SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n", + block->offset, spdk_strerror(-status)); + ftl_nv_cache_restore_complete(restore, -ENOMEM); + return; + } + + range->num_recovered++; + if (range->current_addr <= range->last_addr) { + ftl_nv_cache_recover_block(block); + } else if (restore->num_outstanding == 0) { + assert(range->num_recovered == range->num_blocks); + ftl_nv_cache_recover_range(restore); + } +} + +static struct ftl_io * +ftl_nv_cache_alloc_io(struct ftl_nv_cache_block *block, uint64_t lba) +{ + struct ftl_restore *restore = SPDK_CONTAINEROF(block->parent, struct ftl_restore, nv_cache); + struct ftl_io_init_opts opts = { + .dev = restore->dev, + .io = NULL, + .flags = FTL_IO_BYPASS_CACHE, + .type = FTL_IO_WRITE, + .num_blocks = 1, + .cb_fn = ftl_nv_cache_write_cb, + .cb_ctx = block, + .iovs = { + { + .iov_base = block->buf, + .iov_len = FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + }; + struct ftl_io *io; + + io = ftl_io_init_internal(&opts); + if (spdk_unlikely(!io)) { + return NULL; + } + + io->lba.single = lba; + return io; +} + +static void 
+ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_block *block = cb_arg; + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache_range *range = &restore->range[restore->phase]; + struct ftl_io *io; + unsigned int phase; + uint64_t lba; + + spdk_bdev_free_io(bdev_io); + restore->num_outstanding--; + + if (!success) { + SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64"\n", + block->offset); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + ftl_nv_cache_unpack_lba(*(uint64_t *)block->md_buf, &lba, &phase); + if (spdk_unlikely(phase != restore->phase)) { + if (range->current_addr < range->last_addr) { + ftl_nv_cache_recover_block(block); + } else if (restore->num_outstanding == 0) { + ftl_nv_cache_recover_range(restore); + } + + return; + } + + io = ftl_nv_cache_alloc_io(block, lba); + if (spdk_unlikely(!io)) { + SPDK_ERRLOG("Failed to allocate ftl_io during non-volatile cache recovery\n"); + ftl_nv_cache_restore_complete(restore, -ENOMEM); + return; + } + + restore->num_outstanding++; + ftl_io_write(io); +} + +/* + * Since we have no control over the order in which the requests complete in regards to their + * submission, the cache can be in either of the following states: + * - [1 1 1 1 1 1 1 1 1 1]: simplest case, whole cache contains single phase (although it should be + * very rare), + * - [1 1 1 1 3 3 3 3 3 3]: two phases, changing somewhere in the middle with no overlap. This is + * the state left by clean shutdown, + * - [1 1 1 1 3 1 3 3 3 3]: similar to the above, but this time the two ranges overlap. This + * happens when completions are reordered during unsafe shutdown, + * - [2 1 2 1 1 1 1 3 1 3]: three different phases, each one of which can overlap with + * previous/next one. The data from the oldest phase doesn't need to be + * recovered, as it was already being written to, which means it's + * already on the main storage. + */ +static void +ftl_nv_cache_scan_done(struct ftl_nv_cache_restore *restore) +{ + struct ftl_nv_cache *nv_cache = restore->nv_cache; +#if defined(DEBUG) + struct ftl_nv_cache_range *range; + uint64_t i, num_blocks = 0; + + for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) { + range = &restore->range[i]; + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Range %"PRIu64": %"PRIu64"-%"PRIu64" (%" PRIu64 + ")\n", i, range->start_addr, range->last_addr, range->num_blocks); + num_blocks += range->num_blocks; + } + assert(num_blocks == nv_cache->num_data_blocks); +#endif + restore->phase = ftl_nv_cache_prev_phase(nv_cache->phase); + + /* + * Only the latest two phases need to be recovered. The third one, even if present, + * already has to be stored on the main storage, as it's already started to be + * overwritten (only present here because of reordering of requests' completions). 
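 *
 * As an illustration of the selection below (added as a hedged aside, not
 * part of the original comment): assuming the data phases cycle
 * 1 -> 2 -> 3 -> 1 with phase 0 reserved for scrubbing, a header phase of 3
 * gives restore->phase == 2, so range[3] and range[2] are marked for
 * recovery while range[1] -- the oldest phase -- is left alone. One
 * plausible shape for the helper used above (ftl_nv_cache_prev_phase() is
 * defined elsewhere in the FTL code, so this is only a sketch):
 *
 *   static inline unsigned int example_prev_phase(unsigned int phase)
 *   {
 *           static const unsigned int prev[] = { 0, 3, 1, 2 };
 *
 *           return prev[phase];
 *   }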
+ */ + restore->range[nv_cache->phase].recovery = true; + restore->range[restore->phase].recovery = true; + + ftl_nv_cache_recover_range(restore); +} + +static int ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block); + +static void +ftl_nv_cache_scan_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_block *block = cb_arg; + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache_range *range; + struct spdk_bdev *bdev; + unsigned int phase; + uint64_t lba; + + restore->num_outstanding--; + bdev = spdk_bdev_desc_get_bdev(restore->nv_cache->bdev_desc); + spdk_bdev_free_io(bdev_io); + + if (!success) { + SPDK_ERRLOG("Non-volatile cache scan failed on block %"PRIu64"\n", + block->offset); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + /* If we've already hit an error, don't bother with scanning anything else */ + if (spdk_unlikely(restore->status != 0)) { + ftl_nv_cache_restore_complete(restore, restore->status); + return; + } + + ftl_nv_cache_unpack_lba(*(uint64_t *)block->md_buf, &lba, &phase); + range = &restore->range[phase]; + range->num_blocks++; + + if (range->start_addr == FTL_LBA_INVALID || range->start_addr > block->offset) { + range->start_addr = block->offset; + } + + if (range->last_addr == FTL_LBA_INVALID || range->last_addr < block->offset) { + range->last_addr = block->offset; + } + + /* All the blocks were read, once they're all completed and we're finished */ + if (restore->current_addr == spdk_bdev_get_num_blocks(bdev)) { + if (restore->num_outstanding == 0) { + ftl_nv_cache_scan_done(restore); + } + + return; + } + + ftl_nv_cache_scan_block(block); +} + +static int +ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block) +{ + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + restore->num_outstanding++; + block->offset = restore->current_addr++; + rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch, + block->buf, block->md_buf, + block->offset, 1, ftl_nv_cache_scan_cb, + block); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Non-volatile cache scan failed on block %"PRIu64" (%s)\n", + block->offset, spdk_strerror(-rc)); + restore->num_outstanding--; + ftl_nv_cache_restore_complete(restore, rc); + return rc; + } + + return 0; +} + +static void +ftl_nv_cache_clean_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to write the non-volatile cache metadata header\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + ftl_nv_cache_restore_done(restore, restore->current_addr); +} + +static bool +ftl_nv_cache_header_valid(struct spdk_ftl_dev *dev, const struct ftl_nv_cache_header *hdr) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(dev->nv_cache.bdev_desc); + uint32_t checksum; + + checksum = spdk_crc32c_update(hdr, offsetof(struct ftl_nv_cache_header, checksum), 0); + if (checksum != hdr->checksum) { + SPDK_ERRLOG("Invalid header checksum (found: %"PRIu32", expected: %"PRIu32")\n", + checksum, hdr->checksum); + return false; + } + + if (hdr->version != FTL_NV_CACHE_HEADER_VERSION) { + SPDK_ERRLOG("Invalid header version (found: %"PRIu32", expected: %"PRIu32")\n", + hdr->version, FTL_NV_CACHE_HEADER_VERSION); + return false; + } + + if (hdr->size != spdk_bdev_get_num_blocks(bdev)) { + SPDK_ERRLOG("Unexpected size of the non-volatile cache 
bdev (%"PRIu64", expected: %" + PRIu64")\n", hdr->size, spdk_bdev_get_num_blocks(bdev)); + return false; + } + + if (spdk_uuid_compare(&hdr->uuid, &dev->uuid)) { + SPDK_ERRLOG("Invalid device UUID\n"); + return false; + } + + if (!ftl_nv_cache_phase_is_valid(hdr->phase) && hdr->phase != 0) { + return false; + } + + if ((hdr->current_addr >= spdk_bdev_get_num_blocks(bdev) || + hdr->current_addr < FTL_NV_CACHE_DATA_OFFSET) && + (hdr->current_addr != FTL_LBA_INVALID)) { + SPDK_ERRLOG("Unexpected value of non-volatile cache's current address: %"PRIu64"\n", + hdr->current_addr); + return false; + } + + return true; +} + +static void +ftl_nv_cache_read_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_restore *restore = cb_arg; + struct spdk_ftl_dev *dev = restore->dev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_nv_cache_header *hdr; + struct iovec *iov = NULL; + int iov_cnt = 0, i, rc; + + if (!success) { + SPDK_ERRLOG("Unable to read non-volatile cache metadata header\n"); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + goto out; + } + + spdk_bdev_io_get_iovec(bdev_io, &iov, &iov_cnt); + assert(iov != NULL); + hdr = iov[0].iov_base; + + if (!ftl_nv_cache_header_valid(dev, hdr)) { + ftl_restore_complete(restore, -ENOTRECOVERABLE); + goto out; + } + + /* Remember the latest phase */ + nv_cache->phase = hdr->phase; + + /* If the phase equals zero, we lost power during recovery. We need to finish it up + * by scrubbing the device once again. + */ + if (hdr->phase == 0) { + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Detected phase 0, restarting scrub\n"); + rc = ftl_nv_cache_scrub(nv_cache, ftl_nv_cache_scrub_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to scrub the non-volatile cache: %s\n", + spdk_strerror(-rc)); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + } + + goto out; + } + + /* Valid current_addr means that the shutdown was clean, so we just need to overwrite the + * header to make sure that any power loss occurring before the cache is wrapped won't be + * mistaken for a clean shutdown. 
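 *
 * Condensed view of the three header states handled in this callback
 * (added summary, not part of the original comment):
 *
 *   hdr->phase == 0                       -> power lost mid-scrub: scrub again
 *   hdr->current_addr != FTL_LBA_INVALID  -> clean shutdown: rewrite header only
 *   hdr->current_addr == FTL_LBA_INVALID  -> dirty shutdown: scan and recover
 *
 * Related aside on ftl_nv_cache_header_valid() above: the checksum covers
 * every field that precedes the checksum member, so the writer side
 * presumably seals the header the same way the validator recomputes it,
 * e.g. (hedged sketch):
 *
 *   hdr->checksum = spdk_crc32c_update(hdr,
 *                   offsetof(struct ftl_nv_cache_header, checksum), 0);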
+ */ + if (hdr->current_addr != FTL_LBA_INVALID) { + restore->nv_cache.current_addr = hdr->current_addr; + + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_clean_header_cb, + &restore->nv_cache); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Failed to overwrite the non-volatile cache header: %s\n", + spdk_strerror(-rc)); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + } + + goto out; + } + + /* Otherwise the shutdown was unexpected, so we need to recover the data from the cache */ + restore->nv_cache.current_addr = FTL_NV_CACHE_DATA_OFFSET; + + for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) { + if (ftl_nv_cache_scan_block(&restore->nv_cache.block[i])) { + break; + } + } +out: + spdk_bdev_free_io(bdev_io); +} + +void +ftl_restore_nv_cache(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg) +{ + struct spdk_ftl_dev *dev = restore->dev; + struct spdk_bdev *bdev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_io_channel *ioch; + struct ftl_nv_cache_restore *nvc_restore = &restore->nv_cache; + struct ftl_nv_cache_block *block; + size_t alignment; + int rc, i; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + alignment = spdk_max(spdk_bdev_get_buf_align(bdev), sizeof(uint64_t)); + + nvc_restore->nv_cache = nv_cache; + nvc_restore->ioch = ioch->cache_ioch; + + restore->final_phase = true; + restore->cb = cb; + restore->cb_arg = cb_arg; + + for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) { + block = &nvc_restore->block[i]; + block->parent = nvc_restore; + block->buf = spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev) + + spdk_bdev_get_md_size(bdev), + alignment, NULL); + if (!block->buf) { + /* The memory will be freed in ftl_restore_free */ + SPDK_ERRLOG("Unable to allocate memory\n"); + ftl_restore_complete(restore, -ENOMEM); + return; + } + + block->md_buf = (char *)block->buf + spdk_bdev_get_block_size(bdev); + } + + for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) { + nvc_restore->range[i].parent = nvc_restore; + nvc_restore->range[i].start_addr = FTL_LBA_INVALID; + nvc_restore->range[i].last_addr = FTL_LBA_INVALID; + nvc_restore->range[i].num_blocks = 0; + nvc_restore->range[i].recovery = false; + nvc_restore->range[i].phase = i; + } + + rc = spdk_bdev_read_blocks(nv_cache->bdev_desc, ioch->cache_ioch, nv_cache->dma_buf, + 0, FTL_NV_CACHE_DATA_OFFSET, ftl_nv_cache_read_header_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Failed to read non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + ftl_restore_complete(restore, rc); + } +} + +static bool +ftl_pad_zone_pad_finish(struct ftl_restore_band *rband, bool direct_access) +{ + struct ftl_restore *restore = rband->parent; + struct ftl_restore_band *next_band; + size_t i, num_pad_zones = 0; + + if (spdk_unlikely(restore->pad_status && !restore->num_ios)) { + if (direct_access) { + /* In case of any errors found we want to clear direct access. */ + /* Direct access bands have their own allocated md, which would be lost */ + /* on restore complete otherwise. 
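 *
 * Added note on ftl_restore_nv_cache() above: each restore block is one DMA
 * allocation holding the data block followed by its per-block metadata, so
 * md_buf is simply buf advanced by spdk_bdev_get_block_size(bdev):
 *
 *   block->buf                          block->md_buf
 *   |<-- bdev block size -------------->|<-- bdev metadata size -->|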
*/ + rband->band->state = FTL_BAND_STATE_CLOSED; + ftl_band_set_direct_access(rband->band, false); + } + ftl_restore_complete(restore, restore->pad_status); + return true; + } + + for (i = 0; i < rband->band->num_zones; ++i) { + if (rband->band->zone_buf[i].info.state != SPDK_BDEV_ZONE_STATE_FULL) { + num_pad_zones++; + } + } + + /* Finished all zones in a band, check if all bands are done */ + if (num_pad_zones == 0) { + if (direct_access) { + rband->band->state = FTL_BAND_STATE_CLOSED; + ftl_band_set_direct_access(rband->band, false); + } + + next_band = STAILQ_NEXT(rband, stailq); + if (!next_band) { + ftl_restore_complete(restore, restore->pad_status); + return true; + } else { + /* Start off padding in the next band */ + ftl_restore_pad_band(next_band); + return true; + } + } + + return false; +} + +static struct ftl_io * +ftl_restore_init_pad_io(struct ftl_restore_band *rband, void *buffer, + struct ftl_addr addr) +{ + struct ftl_band *band = rband->band; + struct spdk_ftl_dev *dev = band->dev; + int flags = FTL_IO_PAD | FTL_IO_INTERNAL | FTL_IO_PHYSICAL_MODE | FTL_IO_MD | + FTL_IO_DIRECT_ACCESS; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .band = band, + .size = sizeof(struct ftl_io), + .flags = flags, + .type = FTL_IO_WRITE, + .num_blocks = dev->xfer_size, + .cb_fn = ftl_pad_zone_cb, + .cb_ctx = rband, + .iovs = { + { + .iov_base = buffer, + .iov_len = dev->xfer_size * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + .parent = NULL, + }; + struct ftl_io *io; + + io = ftl_io_init_internal(&opts); + if (spdk_unlikely(!io)) { + return NULL; + } + + io->addr = addr; + rband->parent->num_ios++; + + return io; +} + +static void +ftl_pad_zone_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_restore_band *rband = arg; + struct ftl_restore *restore = rband->parent; + struct ftl_band *band = io->band; + struct ftl_zone *zone; + struct ftl_io *new_io; + uint64_t offset; + + restore->num_ios--; + /* TODO check for next unit error vs early close error */ + if (status) { + restore->pad_status = status; + goto end; + } + + offset = io->addr.offset % ftl_get_num_blocks_in_zone(restore->dev); + if (offset + io->num_blocks == ftl_get_num_blocks_in_zone(restore->dev)) { + zone = ftl_band_zone_from_addr(band, io->addr); + zone->info.state = SPDK_BDEV_ZONE_STATE_FULL; + } else { + struct ftl_addr addr = io->addr; + addr.offset += io->num_blocks; + new_io = ftl_restore_init_pad_io(rband, io->iov[0].iov_base, addr); + if (spdk_unlikely(!new_io)) { + restore->pad_status = -ENOMEM; + goto end; + } + + ftl_io_write(new_io); + return; + } + +end: + spdk_dma_free(io->iov[0].iov_base); + ftl_pad_zone_pad_finish(rband, true); +} + +static void +ftl_restore_pad_band(struct ftl_restore_band *rband) +{ + struct ftl_restore *restore = rband->parent; + struct ftl_band *band = rband->band; + struct spdk_ftl_dev *dev = band->dev; + void *buffer = NULL; + struct ftl_io *io; + struct ftl_addr addr; + size_t i; + int rc = 0; + + /* Check if some zones are not closed */ + if (ftl_pad_zone_pad_finish(rband, false)) { + /* + * If we're here, end meta wasn't recognized, but the whole band is written + * Assume the band was padded and ignore it + */ + return; + } + + band->state = FTL_BAND_STATE_OPEN; + rc = ftl_band_set_direct_access(band, true); + if (rc) { + ftl_restore_complete(restore, rc); + return; + } + + for (i = 0; i < band->num_zones; ++i) { + if (band->zone_buf[i].info.state == SPDK_BDEV_ZONE_STATE_FULL) { + continue; + } + + addr.offset = band->zone_buf[i].info.write_pointer; + + buffer 
= spdk_dma_zmalloc(FTL_BLOCK_SIZE * dev->xfer_size, 0, NULL); + if (spdk_unlikely(!buffer)) { + rc = -ENOMEM; + goto error; + } + + io = ftl_restore_init_pad_io(rband, buffer, addr); + if (spdk_unlikely(!io)) { + rc = -ENOMEM; + spdk_dma_free(buffer); + goto error; + } + + ftl_io_write(io); + } + + return; + +error: + restore->pad_status = rc; + ftl_pad_zone_pad_finish(rband, true); +} + +static void +ftl_restore_pad_open_bands(void *ctx) +{ + struct ftl_restore *restore = ctx; + + ftl_restore_pad_band(STAILQ_FIRST(&restore->pad_bands)); +} + +static void +ftl_restore_tail_md_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_restore_band *rband = ctx; + struct ftl_restore *restore = rband->parent; + struct spdk_ftl_dev *dev = restore->dev; + + if (status) { + if (!dev->conf.allow_open_bands) { + SPDK_ERRLOG("%s while restoring tail md in band %u.\n", + spdk_strerror(-status), rband->band->id); + ftl_band_release_lba_map(rband->band); + ftl_restore_complete(restore, status); + return; + } else { + SPDK_ERRLOG("%s while restoring tail md. Will attempt to pad band %u.\n", + spdk_strerror(-status), rband->band->id); + STAILQ_INSERT_TAIL(&restore->pad_bands, rband, stailq); + } + } + + if (!status && ftl_restore_l2p(rband->band)) { + ftl_band_release_lba_map(rband->band); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + return; + } + ftl_band_release_lba_map(rband->band); + + rband = ftl_restore_next_band(restore); + if (!rband) { + if (!STAILQ_EMPTY(&restore->pad_bands)) { + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_restore_pad_open_bands, + restore); + } else { + ftl_restore_complete(restore, 0); + } + + return; + } + + ftl_restore_tail_md(rband); +} + +static int +ftl_restore_tail_md(struct ftl_restore_band *rband) +{ + struct ftl_restore *restore = rband->parent; + struct ftl_band *band = rband->band; + + if (ftl_band_alloc_lba_map(band)) { + SPDK_ERRLOG("Failed to allocate lba map\n"); + ftl_restore_complete(restore, -ENOMEM); + return -ENOMEM; + } + + if (ftl_band_read_tail_md(band, band->tail_md_addr, ftl_restore_tail_md_cb, rband)) { + SPDK_ERRLOG("Failed to send tail metadata read\n"); + ftl_restore_complete(restore, -EIO); + return -EIO; + } + + return 0; +} + +int +ftl_restore_device(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg) +{ + struct spdk_ftl_dev *dev = restore->dev; + struct ftl_restore_band *rband; + + restore->current = 0; + restore->cb = cb; + restore->cb_arg = cb_arg; + restore->final_phase = dev->nv_cache.bdev_desc == NULL; + + /* If restore_device is called, there must be at least one valid band */ + rband = ftl_restore_next_band(restore); + assert(rband); + return ftl_restore_tail_md(rband); +} diff --git a/src/spdk/lib/ftl/ftl_trace.c b/src/spdk/lib/ftl/ftl_trace.c new file mode 100644 index 000000000..ba66323ad --- /dev/null +++ b/src/spdk/lib/ftl/ftl_trace.c @@ -0,0 +1,361 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/trace.h" + +#include "ftl_core.h" +#include "ftl_trace.h" +#include "ftl_io.h" +#include "ftl_band.h" + +#if defined(DEBUG) + +#define OWNER_FTL 0x20 +#define TRACE_GROUP_FTL 0x6 + +enum ftl_trace_source { + FTL_TRACE_SOURCE_INTERNAL, + FTL_TRACE_SOURCE_USER, + FTL_TRACE_SOURCE_MAX, +}; + +#define FTL_TPOINT_ID(id, src) SPDK_TPOINT_ID(TRACE_GROUP_FTL, (((id) << 1) | (!!(src)))) + +#define FTL_TRACE_BAND_DEFRAG(src) FTL_TPOINT_ID(0, src) +#define FTL_TRACE_BAND_WRITE(src) FTL_TPOINT_ID(1, src) +#define FTL_TRACE_LIMITS(src) FTL_TPOINT_ID(2, src) +#define FTL_TRACE_WBUF_POP(src) FTL_TPOINT_ID(3, src) + +#define FTL_TRACE_READ_SCHEDULE(src) FTL_TPOINT_ID(4, src) +#define FTL_TRACE_READ_SUBMISSION(src) FTL_TPOINT_ID(5, src) +#define FTL_TRACE_READ_COMPLETION_INVALID(src) FTL_TPOINT_ID(6, src) +#define FTL_TRACE_READ_COMPLETION_CACHE(src) FTL_TPOINT_ID(7, src) +#define FTL_TRACE_READ_COMPLETION_DISK(src) FTL_TPOINT_ID(8, src) + +#define FTL_TRACE_MD_READ_SCHEDULE(src) FTL_TPOINT_ID(9, src) +#define FTL_TRACE_MD_READ_SUBMISSION(src) FTL_TPOINT_ID(10, src) +#define FTL_TRACE_MD_READ_COMPLETION(src) FTL_TPOINT_ID(11, src) + +#define FTL_TRACE_WRITE_SCHEDULE(src) FTL_TPOINT_ID(12, src) +#define FTL_TRACE_WRITE_WBUF_FILL(src) FTL_TPOINT_ID(13, src) +#define FTL_TRACE_WRITE_SUBMISSION(src) FTL_TPOINT_ID(14, src) +#define FTL_TRACE_WRITE_COMPLETION(src) FTL_TPOINT_ID(15, src) + +#define FTL_TRACE_MD_WRITE_SCHEDULE(src) FTL_TPOINT_ID(16, src) +#define FTL_TRACE_MD_WRITE_SUBMISSION(src) FTL_TPOINT_ID(17, src) +#define FTL_TRACE_MD_WRITE_COMPLETION(src) FTL_TPOINT_ID(18, src) + +#define FTL_TRACE_ERASE_SUBMISSION(src) FTL_TPOINT_ID(19, src) +#define FTL_TRACE_ERASE_COMPLETION(src) FTL_TPOINT_ID(20, src) + +SPDK_TRACE_REGISTER_FN(ftl_trace_func, "ftl", TRACE_GROUP_FTL) +{ + const char source[] = { 'i', 'u' }; + char descbuf[128]; + int i; + + spdk_trace_register_owner(OWNER_FTL, 'f'); + + for (i = 0; i < FTL_TRACE_SOURCE_MAX; ++i) { + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "band_defrag"); + spdk_trace_register_description(descbuf, FTL_TRACE_BAND_DEFRAG(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "band: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "band_write"); + spdk_trace_register_description(descbuf, FTL_TRACE_BAND_WRITE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "band: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "limits"); + spdk_trace_register_description(descbuf, FTL_TRACE_LIMITS(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "limits: "); + snprintf(descbuf, 
sizeof(descbuf), "%c %s", source[i], "rwb_pop"); + spdk_trace_register_description(descbuf, FTL_TRACE_WBUF_POP(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_sched"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_SCHEDULE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_submit"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_SUBMISSION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_cmpl"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_COMPLETION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_sched"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_SCHEDULE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_submit"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_SUBMISSION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_cmpl"); + spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_COMPLETION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_sched"); + spdk_trace_register_description(descbuf, FTL_TRACE_READ_SCHEDULE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_submit"); + spdk_trace_register_description(descbuf, FTL_TRACE_READ_SUBMISSION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_invld"); + spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_INVALID(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_cache"); + spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_CACHE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_ssd"); + spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_DISK(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_sched"); + spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_SCHEDULE(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "rwb_fill"); + spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_WBUF_FILL(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_submit"); + spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_SUBMISSION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_cmpl"); + spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_COMPLETION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "lba: "); + + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "erase_submit"); + spdk_trace_register_description(descbuf, FTL_TRACE_ERASE_SUBMISSION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "erase_cmpl"); + spdk_trace_register_description(descbuf, FTL_TRACE_ERASE_COMPLETION(i), + OWNER_FTL, OBJECT_NONE, 0, 0, "addr: "); + } +} + +static uint16_t +ftl_trace_io_source(const struct ftl_io *io) +{ + if (io->flags & FTL_IO_INTERNAL) { + return 
FTL_TRACE_SOURCE_INTERNAL; + } else { + return FTL_TRACE_SOURCE_USER; + } +} + +static uint64_t +ftl_trace_next_id(struct ftl_trace *trace) +{ + assert(trace->id != FTL_TRACE_INVALID_ID); + return __atomic_fetch_add(&trace->id, 1, __ATOMIC_SEQ_CST); +} + +void +ftl_trace_defrag_band(struct spdk_ftl_dev *dev, const struct ftl_band *band) +{ + struct ftl_trace *trace = &dev->stats.trace; + + spdk_trace_record(FTL_TRACE_BAND_DEFRAG(FTL_TRACE_SOURCE_INTERNAL), + ftl_trace_next_id(trace), 0, band->lba_map.num_vld, band->id); +} + +void +ftl_trace_write_band(struct spdk_ftl_dev *dev, const struct ftl_band *band) +{ + struct ftl_trace *trace = &dev->stats.trace; + + spdk_trace_record(FTL_TRACE_BAND_WRITE(FTL_TRACE_SOURCE_INTERNAL), + ftl_trace_next_id(trace), 0, 0, band->id); +} + +void +ftl_trace_lba_io_init(struct spdk_ftl_dev *dev, const struct ftl_io *io) +{ + uint16_t tpoint_id = 0, source; + + assert(io->trace != FTL_TRACE_INVALID_ID); + source = ftl_trace_io_source(io); + + if (io->flags & FTL_IO_MD) { + switch (io->type) { + case FTL_IO_READ: + tpoint_id = FTL_TRACE_MD_READ_SCHEDULE(source); + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_MD_WRITE_SCHEDULE(source); + break; + default: + assert(0); + } + } else { + switch (io->type) { + case FTL_IO_READ: + tpoint_id = FTL_TRACE_READ_SCHEDULE(source); + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_WRITE_SCHEDULE(source); + break; + default: + assert(0); + } + } + + spdk_trace_record(tpoint_id, io->trace, io->num_blocks, 0, ftl_io_get_lba(io, 0)); +} + +void +ftl_trace_wbuf_fill(struct spdk_ftl_dev *dev, const struct ftl_io *io) +{ + assert(io->trace != FTL_TRACE_INVALID_ID); + + spdk_trace_record(FTL_TRACE_WRITE_WBUF_FILL(ftl_trace_io_source(io)), io->trace, + 0, 0, ftl_io_current_lba(io)); +} + +void +ftl_trace_wbuf_pop(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry) +{ + uint16_t tpoint_id; + + assert(entry->trace != FTL_TRACE_INVALID_ID); + + if (entry->io_flags & FTL_IO_INTERNAL) { + tpoint_id = FTL_TRACE_WBUF_POP(FTL_TRACE_SOURCE_INTERNAL); + } else { + tpoint_id = FTL_TRACE_WBUF_POP(FTL_TRACE_SOURCE_USER); + } + + spdk_trace_record(tpoint_id, entry->trace, 0, entry->addr.offset, entry->lba); +} + +void +ftl_trace_completion(struct spdk_ftl_dev *dev, const struct ftl_io *io, + enum ftl_trace_completion completion) +{ + uint16_t tpoint_id = 0, source; + + assert(io->trace != FTL_TRACE_INVALID_ID); + source = ftl_trace_io_source(io); + + if (io->flags & FTL_IO_MD) { + switch (io->type) { + case FTL_IO_READ: + tpoint_id = FTL_TRACE_MD_READ_COMPLETION(source); + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_MD_WRITE_COMPLETION(source); + break; + default: + assert(0); + } + } else { + switch (io->type) { + case FTL_IO_READ: + switch (completion) { + case FTL_TRACE_COMPLETION_INVALID: + tpoint_id = FTL_TRACE_READ_COMPLETION_INVALID(source); + break; + case FTL_TRACE_COMPLETION_CACHE: + tpoint_id = FTL_TRACE_READ_COMPLETION_CACHE(source); + break; + case FTL_TRACE_COMPLETION_DISK: + tpoint_id = FTL_TRACE_READ_COMPLETION_DISK(source); + break; + } + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_WRITE_COMPLETION(source); + break; + case FTL_IO_ERASE: + tpoint_id = FTL_TRACE_ERASE_COMPLETION(source); + break; + default: + assert(0); + } + } + + spdk_trace_record(tpoint_id, io->trace, 0, 0, ftl_io_get_lba(io, io->pos - 1)); +} + +void +ftl_trace_submission(struct spdk_ftl_dev *dev, const struct ftl_io *io, struct ftl_addr addr, + size_t addr_cnt) +{ + uint16_t tpoint_id = 0, source; + + assert(io->trace != 
FTL_TRACE_INVALID_ID); + source = ftl_trace_io_source(io); + + if (io->flags & FTL_IO_MD) { + switch (io->type) { + case FTL_IO_READ: + tpoint_id = FTL_TRACE_MD_READ_SUBMISSION(source); + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_MD_WRITE_SUBMISSION(source); + break; + default: + assert(0); + } + } else { + switch (io->type) { + case FTL_IO_READ: + tpoint_id = FTL_TRACE_READ_SUBMISSION(source); + break; + case FTL_IO_WRITE: + tpoint_id = FTL_TRACE_WRITE_SUBMISSION(source); + break; + case FTL_IO_ERASE: + tpoint_id = FTL_TRACE_ERASE_SUBMISSION(source); + break; + default: + assert(0); + } + } + + spdk_trace_record(tpoint_id, io->trace, addr_cnt, 0, addr.offset); +} + +void +ftl_trace_limits(struct spdk_ftl_dev *dev, int limit, size_t num_free) +{ + struct ftl_trace *trace = &dev->stats.trace; + + spdk_trace_record(FTL_TRACE_LIMITS(FTL_TRACE_SOURCE_INTERNAL), ftl_trace_next_id(trace), + num_free, limit, 0); +} + +uint64_t +ftl_trace_alloc_id(struct spdk_ftl_dev *dev) +{ + struct ftl_trace *trace = &dev->stats.trace; + + return ftl_trace_next_id(trace); +} + +#endif /* defined(DEBUG) */ diff --git a/src/spdk/lib/ftl/ftl_trace.h b/src/spdk/lib/ftl/ftl_trace.h new file mode 100644 index 000000000..52988cff6 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_trace.h @@ -0,0 +1,84 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FTL_TRACE_H +#define FTL_TRACE_H + +#include "ftl_addr.h" + +#define FTL_TRACE_INVALID_ID ((uint64_t) -1) + +enum ftl_trace_completion { + FTL_TRACE_COMPLETION_INVALID, + FTL_TRACE_COMPLETION_CACHE, + FTL_TRACE_COMPLETION_DISK, +}; + +struct ftl_trace { + /* Monotonically incrementing event id */ + uint64_t id; +}; + +struct spdk_ftl_dev; +struct ftl_trace; +struct ftl_io; +struct ftl_wbuf_entry; +struct ftl_band; + +#if defined(DEBUG) +uint64_t ftl_trace_alloc_id(struct spdk_ftl_dev *dev); +void ftl_trace_defrag_band(struct spdk_ftl_dev *dev, const struct ftl_band *band); +void ftl_trace_write_band(struct spdk_ftl_dev *dev, const struct ftl_band *band); +void ftl_trace_lba_io_init(struct spdk_ftl_dev *dev, const struct ftl_io *io); +void ftl_trace_wbuf_fill(struct spdk_ftl_dev *dev, const struct ftl_io *io); +void ftl_trace_wbuf_pop(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry); +void ftl_trace_submission(struct spdk_ftl_dev *dev, + const struct ftl_io *io, + struct ftl_addr addr, size_t addr_cnt); +void ftl_trace_completion(struct spdk_ftl_dev *dev, + const struct ftl_io *io, + enum ftl_trace_completion type); +void ftl_trace_limits(struct spdk_ftl_dev *dev, int limit, size_t num_free); +#else /* defined(DEBUG) */ +#define ftl_trace_alloc_id(dev) FTL_TRACE_INVALID_ID +#define ftl_trace_defrag_band(dev, band) +#define ftl_trace_write_band(dev, band) +#define ftl_trace_lba_io_init(dev, io) +#define ftl_trace_wbuf_fill(dev, io) +#define ftl_trace_wbuf_pop(dev, entry) +#define ftl_trace_submission(dev, io, addr, addr_cnt) +#define ftl_trace_completion(dev, io, type) +#define ftl_trace_limits(dev, limits, num_free) +#endif + +#endif /* FTL_TRACE_H */ diff --git a/src/spdk/lib/ftl/spdk_ftl.map b/src/spdk/lib/ftl/spdk_ftl.map new file mode 100644 index 000000000..141fd01e0 --- /dev/null +++ b/src/spdk/lib/ftl/spdk_ftl.map @@ -0,0 +1,14 @@ +{ + global: + + # public functions + spdk_ftl_dev_init; + spdk_ftl_dev_free; + spdk_ftl_conf_init_defaults; + spdk_ftl_dev_get_attrs; + spdk_ftl_read; + spdk_ftl_write; + spdk_ftl_flush; + + local: *; +}; diff --git a/src/spdk/lib/idxd/Makefile b/src/spdk/lib/idxd/Makefile new file mode 100644 index 000000000..ed66aeb15 --- /dev/null +++ b/src/spdk/lib/idxd/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = idxd.c +LIBNAME = idxd + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_idxd.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/idxd/idxd.c b/src/spdk/lib/idxd/idxd.c new file mode 100644 index 000000000..992d96211 --- /dev/null +++ b/src/spdk/lib/idxd/idxd.c @@ -0,0 +1,1292 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk/memory.h" + +#include "spdk_internal/log.h" +#include "spdk_internal/idxd.h" + +#include "idxd.h" + +#define ALIGN_4K 0x1000 + +pthread_mutex_t g_driver_lock = PTHREAD_MUTEX_INITIALIZER; + +/* + * g_dev_cfg gives us 2 pre-set configurations of DSA to choose from + * via RPC. + */ +struct device_config *g_dev_cfg = NULL; + +/* + * Pre-built configurations. Variations depend on various factors + * including how many different types of target latency profiles there + * are, how many different QOS requirements there might be, etc. 
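 *
 * For orientation, the two instances defined just below work out to the
 * following (added reading of the fields, see the structs that follow):
 *
 *   g_dev_cfg0: 4 groups, each with 1 work queue and 1 engine   (4 WQs, 4 engines total)
 *   g_dev_cfg1: 2 groups, each with 2 work queues and 2 engines (4 WQs, 4 engines total)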
+ */ +struct device_config g_dev_cfg0 = { + .config_num = 0, + .num_groups = 4, + .num_wqs_per_group = 1, + .num_engines_per_group = 1, + .total_wqs = 4, + .total_engines = 4, +}; + +struct device_config g_dev_cfg1 = { + .config_num = 1, + .num_groups = 2, + .num_wqs_per_group = 2, + .num_engines_per_group = 2, + .total_wqs = 4, + .total_engines = 4, +}; + +static uint32_t +_idxd_read_4(struct spdk_idxd_device *idxd, uint32_t offset) +{ + return spdk_mmio_read_4((uint32_t *)(idxd->reg_base + offset)); +} + +static void +_idxd_write_4(struct spdk_idxd_device *idxd, uint32_t offset, uint32_t value) +{ + spdk_mmio_write_4((uint32_t *)(idxd->reg_base + offset), value); +} + +static uint64_t +_idxd_read_8(struct spdk_idxd_device *idxd, uint32_t offset) +{ + return spdk_mmio_read_8((uint64_t *)(idxd->reg_base + offset)); +} + +static void +_idxd_write_8(struct spdk_idxd_device *idxd, uint32_t offset, uint64_t value) +{ + spdk_mmio_write_8((uint64_t *)(idxd->reg_base + offset), value); +} + +struct spdk_idxd_io_channel * +spdk_idxd_get_channel(struct spdk_idxd_device *idxd) +{ + struct spdk_idxd_io_channel *chan; + struct idxd_batch *batch; + int i; + + chan = calloc(1, sizeof(struct spdk_idxd_io_channel)); + if (chan == NULL) { + SPDK_ERRLOG("Failed to allocate idxd chan\n"); + return NULL; + } + chan->idxd = idxd; + + TAILQ_INIT(&chan->batches); + + TAILQ_INIT(&chan->batch_pool); + for (i = 0 ; i < NUM_BATCHES ; i++) { + batch = calloc(1, sizeof(struct idxd_batch)); + if (batch == NULL) { + SPDK_ERRLOG("Failed to allocate batch\n"); + while ((batch = TAILQ_FIRST(&chan->batch_pool))) { + TAILQ_REMOVE(&chan->batch_pool, batch, link); + free(batch); + } + return NULL; + } + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + } + + return chan; +} + +void +spdk_idxd_put_channel(struct spdk_idxd_io_channel *chan) +{ + free(chan); +} + +int +spdk_idxd_configure_chan(struct spdk_idxd_io_channel *chan) +{ + uint32_t num_ring_slots; + int rc; + + /* Round robin the WQ selection for the chan on this IDXD device. */ + chan->idxd->wq_id++; + if (chan->idxd->wq_id == g_dev_cfg->total_wqs) { + chan->idxd->wq_id = 0; + } + + num_ring_slots = chan->idxd->queues[chan->idxd->wq_id].wqcfg.wq_size; + + chan->ring_ctrl.ring_slots = spdk_bit_array_create(num_ring_slots); + if (chan->ring_ctrl.ring_slots == NULL) { + SPDK_ERRLOG("Failed to allocate bit array for ring\n"); + return -ENOMEM; + } + + /* + * max ring slots can change as channels come and go but we + * start off getting all of the slots for this work queue. + */ + chan->ring_ctrl.max_ring_slots = num_ring_slots; + + /* Store the original size of the ring. 
*/ + chan->ring_ctrl.ring_size = num_ring_slots; + + chan->ring_ctrl.desc = spdk_zmalloc(num_ring_slots * sizeof(struct idxd_hw_desc), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.desc == NULL) { + SPDK_ERRLOG("Failed to allocate descriptor memory\n"); + rc = -ENOMEM; + goto err_desc; + } + + chan->ring_ctrl.completions = spdk_zmalloc(num_ring_slots * sizeof(struct idxd_comp), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.completions == NULL) { + SPDK_ERRLOG("Failed to allocate completion memory\n"); + rc = -ENOMEM; + goto err_comp; + } + + chan->ring_ctrl.user_desc = spdk_zmalloc(TOTAL_USER_DESC * sizeof(struct idxd_hw_desc), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.user_desc == NULL) { + SPDK_ERRLOG("Failed to allocate batch descriptor memory\n"); + rc = -ENOMEM; + goto err_user_desc; + } + + /* Each slot on the ring reserves DESC_PER_BATCH elemnts in user_desc. */ + chan->ring_ctrl.user_ring_slots = spdk_bit_array_create(NUM_BATCHES); + if (chan->ring_ctrl.user_ring_slots == NULL) { + SPDK_ERRLOG("Failed to allocate bit array for user ring\n"); + rc = -ENOMEM; + goto err_user_ring; + } + + chan->ring_ctrl.user_completions = spdk_zmalloc(TOTAL_USER_DESC * sizeof(struct idxd_comp), + 0x40, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (chan->ring_ctrl.user_completions == NULL) { + SPDK_ERRLOG("Failed to allocate user completion memory\n"); + rc = -ENOMEM; + goto err_user_comp; + } + + chan->ring_ctrl.portal = (char *)chan->idxd->portals + chan->idxd->wq_id * PORTAL_SIZE; + + return 0; + +err_user_comp: + spdk_bit_array_free(&chan->ring_ctrl.user_ring_slots); +err_user_ring: + spdk_free(chan->ring_ctrl.user_desc); +err_user_desc: + spdk_free(chan->ring_ctrl.completions); +err_comp: + spdk_free(chan->ring_ctrl.desc); +err_desc: + spdk_bit_array_free(&chan->ring_ctrl.ring_slots); + + return rc; +} + +/* Used for control commands, not for descriptor submission. 
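 * (Added recap of spdk_idxd_configure_chan() above, before moving on: each
 *  channel ends up with a descriptor ring and completion array sized to its
 *  work queue for regular operations, a separate TOTAL_USER_DESC-sized
 *  descriptor/completion area plus bit array for batches, and a portal
 *  pointer offset by the work queue it was assigned round-robin.)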
*/ +static int +idxd_wait_cmd(struct spdk_idxd_device *idxd, int _timeout) +{ + uint32_t timeout = _timeout; + union idxd_cmdsts_reg cmd_status = {}; + + cmd_status.raw = _idxd_read_4(idxd, IDXD_CMDSTS_OFFSET); + while (cmd_status.active && --timeout) { + usleep(1); + cmd_status.raw = _idxd_read_4(idxd, IDXD_CMDSTS_OFFSET); + } + + /* Check for timeout */ + if (timeout == 0 && cmd_status.active) { + SPDK_ERRLOG("Command timeout, waited %u\n", _timeout); + return -EBUSY; + } + + /* Check for error */ + if (cmd_status.err) { + SPDK_ERRLOG("Command status reg reports error 0x%x\n", cmd_status.err); + return -EINVAL; + } + + return 0; +} + +static void +_idxd_drain(struct spdk_idxd_io_channel *chan) +{ + uint32_t index; + int set = 0; + + do { + spdk_idxd_process_events(chan); + set = 0; + for (index = 0; index < chan->ring_ctrl.max_ring_slots; index++) { + set |= spdk_bit_array_get(chan->ring_ctrl.ring_slots, index); + } + } while (set); +} + +int +spdk_idxd_reconfigure_chan(struct spdk_idxd_io_channel *chan, uint32_t num_channels) +{ + uint32_t num_ring_slots; + int rc; + struct idxd_batch *batch; + + _idxd_drain(chan); + + assert(spdk_bit_array_count_set(chan->ring_ctrl.ring_slots) == 0); + + if (num_channels == 0) { + spdk_free(chan->ring_ctrl.completions); + spdk_free(chan->ring_ctrl.desc); + spdk_bit_array_free(&chan->ring_ctrl.ring_slots); + spdk_free(chan->ring_ctrl.user_completions); + spdk_free(chan->ring_ctrl.user_desc); + spdk_bit_array_free(&chan->ring_ctrl.user_ring_slots); + while ((batch = TAILQ_FIRST(&chan->batch_pool))) { + TAILQ_REMOVE(&chan->batch_pool, batch, link); + free(batch); + } + return 0; + } + + num_ring_slots = chan->ring_ctrl.ring_size / num_channels; + + /* re-allocate our descriptor ring for hw flow control. */ + rc = spdk_bit_array_resize(&chan->ring_ctrl.ring_slots, num_ring_slots); + if (rc < 0) { + SPDK_ERRLOG("Unable to resize channel bit array\n"); + return -ENOMEM; + } + + chan->ring_ctrl.max_ring_slots = num_ring_slots; + + /* + * Note: The batch descriptor ring does not change with the + * number of channels as descriptors on this ring do not + * "count" for flow control. + */ + + return rc; +} + +/* Called via RPC to select a pre-defined configuration. 
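
A minimal usage sketch for the public entry points in this file (illustration only: the example_* names are invented, and the callback signatures are inferred from how idxd_enum_cb() invokes them further down, with the probe callback assumed to return true to request an attach):

static bool
example_probe_cb(void *cb_ctx, struct spdk_pci_device *pci_dev)
{
	/* Claim every IDXD device the enumeration offers us. */
	return true;
}

static void
example_attach_cb(void *cb_ctx, struct spdk_pci_device *pci_dev,
		  struct spdk_idxd_device *idxd)
{
	/* Hand the attached device back to the caller through cb_ctx. */
	*(struct spdk_idxd_device **)cb_ctx = idxd;
}

static int
example_bring_up(struct spdk_idxd_device **idxd)
{
	/* Select pre-built configuration 0, then enumerate and attach. */
	spdk_idxd_set_config(0);
	return spdk_idxd_probe(idxd, example_probe_cb, example_attach_cb);
}
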
*/ +void +spdk_idxd_set_config(uint32_t config_num) +{ + switch (config_num) { + case 0: + g_dev_cfg = &g_dev_cfg0; + break; + case 1: + g_dev_cfg = &g_dev_cfg1; + break; + default: + g_dev_cfg = &g_dev_cfg0; + SPDK_ERRLOG("Invalid config, using default\n"); + break; + } +} + +static int +idxd_unmap_pci_bar(struct spdk_idxd_device *idxd, int bar) +{ + int rc = 0; + void *addr = NULL; + + if (bar == IDXD_MMIO_BAR) { + addr = (void *)idxd->reg_base; + } else if (bar == IDXD_WQ_BAR) { + addr = (void *)idxd->portals; + } + + if (addr) { + rc = spdk_pci_device_unmap_bar(idxd->device, 0, addr); + } + return rc; +} + +static int +idxd_map_pci_bars(struct spdk_idxd_device *idxd) +{ + int rc; + void *addr; + uint64_t phys_addr, size; + + rc = spdk_pci_device_map_bar(idxd->device, IDXD_MMIO_BAR, &addr, &phys_addr, &size); + if (rc != 0 || addr == NULL) { + SPDK_ERRLOG("pci_device_map_range failed with error code %d\n", rc); + return -1; + } + idxd->reg_base = addr; + + rc = spdk_pci_device_map_bar(idxd->device, IDXD_WQ_BAR, &addr, &phys_addr, &size); + if (rc != 0 || addr == NULL) { + SPDK_ERRLOG("pci_device_map_range failed with error code %d\n", rc); + rc = idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR); + if (rc) { + SPDK_ERRLOG("unable to unmap MMIO bar\n"); + } + return -EINVAL; + } + idxd->portals = addr; + + return 0; +} + +static int +idxd_reset_dev(struct spdk_idxd_device *idxd) +{ + int rc; + + _idxd_write_4(idxd, IDXD_CMD_OFFSET, IDXD_RESET_DEVICE << IDXD_CMD_SHIFT); + rc = idxd_wait_cmd(idxd, IDXD_REGISTER_TIMEOUT_US); + if (rc < 0) { + SPDK_ERRLOG("Error resetting device %u\n", rc); + } + + return rc; +} + +/* + * Build group config based on getting info from the device combined + * with the defined configuration. Once built, it is written to the + * device. + */ +static int +idxd_group_config(struct spdk_idxd_device *idxd) +{ + int i; + uint64_t base_offset; + + assert(g_dev_cfg->num_groups <= idxd->registers.groupcap.num_groups); + idxd->groups = calloc(idxd->registers.groupcap.num_groups, sizeof(struct idxd_group)); + if (idxd->groups == NULL) { + SPDK_ERRLOG("Failed to allocate group memory\n"); + return -ENOMEM; + } + + assert(g_dev_cfg->total_engines <= idxd->registers.enginecap.num_engines); + for (i = 0; i < g_dev_cfg->total_engines; i++) { + idxd->groups[i % g_dev_cfg->num_groups].grpcfg.engines |= (1 << i); + } + + assert(g_dev_cfg->total_wqs <= idxd->registers.wqcap.num_wqs); + for (i = 0; i < g_dev_cfg->total_wqs; i++) { + idxd->groups[i % g_dev_cfg->num_groups].grpcfg.wqs[0] |= (1 << i); + } + + for (i = 0; i < g_dev_cfg->num_groups; i++) { + idxd->groups[i].idxd = idxd; + idxd->groups[i].id = i; + + /* Divide BW tokens evenly */ + idxd->groups[i].grpcfg.flags.tokens_allowed = + idxd->registers.groupcap.total_tokens / g_dev_cfg->num_groups; + } + + /* + * Now write the group config to the device for all groups. We write + * to the max number of groups in order to 0 out the ones we didn't + * configure. 
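 *
 * As a concrete illustration of the i % num_groups assignment above (added
 * aside, using the two pre-built configurations from the top of this file):
 *
 *   config 0 (4 groups): engine i and WQ i land in group i % 4, i.e. one
 *                        engine and one WQ per group.
 *   config 1 (2 groups): engines/WQs 0 and 2 land in group 0, engines/WQs
 *                        1 and 3 land in group 1.
 *
 * In both cases groupcap.total_tokens is split evenly across num_groups.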
+ */ + for (i = 0 ; i < idxd->registers.groupcap.num_groups; i++) { + + base_offset = idxd->grpcfg_offset + i * 64; + + /* GRPWQCFG, work queues config */ + _idxd_write_8(idxd, base_offset, idxd->groups[i].grpcfg.wqs[0]); + + /* GRPENGCFG, engine config */ + _idxd_write_8(idxd, base_offset + CFG_ENGINE_OFFSET, idxd->groups[i].grpcfg.engines); + + /* GRPFLAGS, flags config */ + _idxd_write_8(idxd, base_offset + CFG_FLAG_OFFSET, idxd->groups[i].grpcfg.flags.raw); + } + + return 0; +} + +/* + * Build work queue (WQ) config based on getting info from the device combined + * with the defined configuration. Once built, it is written to the device. + */ +static int +idxd_wq_config(struct spdk_idxd_device *idxd) +{ + int i, j; + struct idxd_wq *queue; + u_int32_t wq_size = idxd->registers.wqcap.total_wq_size / g_dev_cfg->total_wqs; + + SPDK_NOTICELOG("Total ring slots available space 0x%x, so per work queue is 0x%x\n", + idxd->registers.wqcap.total_wq_size, wq_size); + assert(g_dev_cfg->total_wqs <= IDXD_MAX_QUEUES); + assert(g_dev_cfg->total_wqs <= idxd->registers.wqcap.num_wqs); + assert(LOG2_WQ_MAX_BATCH <= idxd->registers.gencap.max_batch_shift); + assert(LOG2_WQ_MAX_XFER <= idxd->registers.gencap.max_xfer_shift); + + idxd->queues = calloc(1, idxd->registers.wqcap.num_wqs * sizeof(struct idxd_wq)); + if (idxd->queues == NULL) { + SPDK_ERRLOG("Failed to allocate queue memory\n"); + return -ENOMEM; + } + + for (i = 0; i < g_dev_cfg->total_wqs; i++) { + queue = &idxd->queues[i]; + queue->wqcfg.wq_size = wq_size; + queue->wqcfg.mode = WQ_MODE_DEDICATED; + queue->wqcfg.max_batch_shift = LOG2_WQ_MAX_BATCH; + queue->wqcfg.max_xfer_shift = LOG2_WQ_MAX_XFER; + queue->wqcfg.wq_state = WQ_ENABLED; + queue->wqcfg.priority = WQ_PRIORITY_1; + + /* Not part of the config struct */ + queue->idxd = idxd; + queue->group = &idxd->groups[i % g_dev_cfg->num_groups]; + } + + /* + * Now write the work queue config to the device for all wq space + */ + for (i = 0 ; i < idxd->registers.wqcap.num_wqs; i++) { + queue = &idxd->queues[i]; + for (j = 0 ; j < WQCFG_NUM_DWORDS; j++) { + _idxd_write_4(idxd, idxd->wqcfg_offset + i * 32 + j * 4, + queue->wqcfg.raw[j]); + } + } + + return 0; +} + +static int +idxd_device_configure(struct spdk_idxd_device *idxd) +{ + int i, rc = 0; + union idxd_offsets_register offsets_reg; + union idxd_genstatus_register genstatus_reg; + + /* + * Map BAR0 and BAR2 + */ + rc = idxd_map_pci_bars(idxd); + if (rc) { + return rc; + } + + /* + * Reset the device + */ + rc = idxd_reset_dev(idxd); + if (rc) { + goto err_reset; + } + + /* + * Read in config registers + */ + idxd->registers.version = _idxd_read_4(idxd, IDXD_VERSION_OFFSET); + idxd->registers.gencap.raw = _idxd_read_8(idxd, IDXD_GENCAP_OFFSET); + idxd->registers.wqcap.raw = _idxd_read_8(idxd, IDXD_WQCAP_OFFSET); + idxd->registers.groupcap.raw = _idxd_read_8(idxd, IDXD_GRPCAP_OFFSET); + idxd->registers.enginecap.raw = _idxd_read_8(idxd, IDXD_ENGCAP_OFFSET); + for (i = 0; i < IDXD_OPCAP_WORDS; i++) { + idxd->registers.opcap.raw[i] = + _idxd_read_8(idxd, i * sizeof(uint64_t) + IDXD_OPCAP_OFFSET); + } + offsets_reg.raw[0] = _idxd_read_8(idxd, IDXD_TABLE_OFFSET); + offsets_reg.raw[1] = _idxd_read_8(idxd, IDXD_TABLE_OFFSET + sizeof(uint64_t)); + idxd->grpcfg_offset = offsets_reg.grpcfg * IDXD_TABLE_OFFSET_MULT; + idxd->wqcfg_offset = offsets_reg.wqcfg * IDXD_TABLE_OFFSET_MULT; + idxd->ims_offset = offsets_reg.ims * IDXD_TABLE_OFFSET_MULT; + idxd->msix_perm_offset = offsets_reg.msix_perm * IDXD_TABLE_OFFSET_MULT; + idxd->perfmon_offset = 
offsets_reg.perfmon * IDXD_TABLE_OFFSET_MULT; + + /* + * Configure groups and work queues. + */ + rc = idxd_group_config(idxd); + if (rc) { + goto err_group_cfg; + } + + rc = idxd_wq_config(idxd); + if (rc) { + goto err_wq_cfg; + } + + /* + * Enable the device + */ + genstatus_reg.raw = _idxd_read_4(idxd, IDXD_GENSTATUS_OFFSET); + assert(genstatus_reg.state == IDXD_DEVICE_STATE_DISABLED); + + _idxd_write_4(idxd, IDXD_CMD_OFFSET, IDXD_ENABLE_DEV << IDXD_CMD_SHIFT); + rc = idxd_wait_cmd(idxd, IDXD_REGISTER_TIMEOUT_US); + genstatus_reg.raw = _idxd_read_4(idxd, IDXD_GENSTATUS_OFFSET); + if ((rc < 0) || (genstatus_reg.state != IDXD_DEVICE_STATE_ENABLED)) { + rc = -EINVAL; + SPDK_ERRLOG("Error enabling device %u\n", rc); + goto err_device_enable; + } + + genstatus_reg.raw = spdk_mmio_read_4((uint32_t *)(idxd->reg_base + IDXD_GENSTATUS_OFFSET)); + assert(genstatus_reg.state == IDXD_DEVICE_STATE_ENABLED); + + /* + * Enable the work queues that we've configured + */ + for (i = 0; i < g_dev_cfg->total_wqs; i++) { + _idxd_write_4(idxd, IDXD_CMD_OFFSET, + (IDXD_ENABLE_WQ << IDXD_CMD_SHIFT) | i); + rc = idxd_wait_cmd(idxd, IDXD_REGISTER_TIMEOUT_US); + if (rc < 0) { + SPDK_ERRLOG("Error enabling work queues 0x%x\n", rc); + goto err_wq_enable; + } + } + + if ((rc == 0) && (genstatus_reg.state == IDXD_DEVICE_STATE_ENABLED)) { + SPDK_NOTICELOG("Device enabled, version 0x%x gencap: 0x%lx\n", + idxd->registers.version, + idxd->registers.gencap.raw); + + } + + return rc; +err_wq_enable: +err_device_enable: + free(idxd->queues); +err_wq_cfg: + free(idxd->groups); +err_group_cfg: +err_reset: + idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR); + idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR); + + return rc; +} + +static void +idxd_device_destruct(struct spdk_idxd_device *idxd) +{ + idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR); + idxd_unmap_pci_bar(idxd, IDXD_WQ_BAR); + free(idxd->groups); + free(idxd->queues); + free(idxd); +} + +/* Caller must hold g_driver_lock */ +static struct spdk_idxd_device * +idxd_attach(struct spdk_pci_device *device) +{ + struct spdk_idxd_device *idxd; + uint32_t cmd_reg; + int rc; + + idxd = calloc(1, sizeof(struct spdk_idxd_device)); + if (idxd == NULL) { + SPDK_ERRLOG("Failed to allocate memory for idxd device.\n"); + return NULL; + } + + idxd->device = device; + + /* Enable PCI busmaster. 
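 * (Added note: 0x4 is the Bus Master Enable bit of the PCI command register,
 *  which sits at configuration offset 4 -- hence the read-modify-write of
 *  offset 4 below. Without it the device cannot issue DMA.)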
*/ + spdk_pci_device_cfg_read32(device, &cmd_reg, 4); + cmd_reg |= 0x4; + spdk_pci_device_cfg_write32(device, cmd_reg, 4); + + rc = idxd_device_configure(idxd); + if (rc) { + goto err; + } + + return idxd; +err: + idxd_device_destruct(idxd); + return NULL; +} + +struct idxd_enum_ctx { + spdk_idxd_probe_cb probe_cb; + spdk_idxd_attach_cb attach_cb; + void *cb_ctx; +}; + +/* This function must only be called while holding g_driver_lock */ +static int +idxd_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) +{ + struct idxd_enum_ctx *enum_ctx = ctx; + struct spdk_idxd_device *idxd; + + if (enum_ctx->probe_cb(enum_ctx->cb_ctx, pci_dev)) { + idxd = idxd_attach(pci_dev); + if (idxd == NULL) { + SPDK_ERRLOG("idxd_attach() failed\n"); + return -EINVAL; + } + + enum_ctx->attach_cb(enum_ctx->cb_ctx, pci_dev, idxd); + } + + return 0; +} + +int +spdk_idxd_probe(void *cb_ctx, spdk_idxd_probe_cb probe_cb, spdk_idxd_attach_cb attach_cb) +{ + int rc; + struct idxd_enum_ctx enum_ctx; + + enum_ctx.probe_cb = probe_cb; + enum_ctx.attach_cb = attach_cb; + enum_ctx.cb_ctx = cb_ctx; + + pthread_mutex_lock(&g_driver_lock); + rc = spdk_pci_enumerate(spdk_pci_idxd_get_driver(), idxd_enum_cb, &enum_ctx); + pthread_mutex_unlock(&g_driver_lock); + + return rc; +} + +void +spdk_idxd_detach(struct spdk_idxd_device *idxd) +{ + idxd_device_destruct(idxd); +} + +static struct idxd_hw_desc * +_idxd_prep_command(struct spdk_idxd_io_channel *chan, spdk_idxd_req_cb cb_fn, + void *cb_arg, struct idxd_batch *batch) +{ + uint32_t index; + struct idxd_hw_desc *desc; + struct idxd_comp *comp; + + index = spdk_bit_array_find_first_clear(chan->ring_ctrl.ring_slots, 0); + if (index == UINT32_MAX) { + /* ran out of ring slots */ + return NULL; + } + + spdk_bit_array_set(chan->ring_ctrl.ring_slots, index); + + desc = &chan->ring_ctrl.desc[index]; + comp = &chan->ring_ctrl.completions[index]; + + desc->flags = IDXD_FLAG_COMPLETION_ADDR_VALID | IDXD_FLAG_REQUEST_COMPLETION; + desc->completion_addr = (uintptr_t)&comp->hw; + comp->cb_arg = cb_arg; + comp->cb_fn = cb_fn; + if (batch) { + comp->batch = batch; + batch->batch_desc_index = index; + } + + return desc; +} + +int +spdk_idxd_submit_copy(struct spdk_idxd_io_channel *chan, void *dst, const void *src, + uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); + if (desc == NULL) { + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_MEMMOVE; + desc->src_addr = (uintptr_t)src; + desc->dst_addr = (uintptr_t)dst; + desc->xfer_size = nbytes; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +/* Dual-cast copies the same source to two separate destination buffers. */ +int +spdk_idxd_submit_dualcast(struct spdk_idxd_io_channel *chan, void *dst1, void *dst2, + const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); + if (desc == NULL) { + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_DUALCAST; + desc->src_addr = (uintptr_t)src; + desc->dst_addr = (uintptr_t)dst1; + desc->dest2 = (uintptr_t)dst2; + desc->xfer_size = nbytes; + + /* Submit operation. 
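
For context, movdir64b() just below (and in the other submit paths) issues a single 64-byte store of the descriptor to the work-queue portal, which is how work reaches a dedicated work queue. A hedged usage sketch for the single-shot API around here, assuming spdk_idxd_req_cb is a completion callback of the form void (*)(void *cb_arg, int status), which matches how the completion fields are consumed in this file, and that spdk_idxd_reconfigure_chan(chan, 0) is the intended way to release a channel's rings, as its num_channels == 0 branch above suggests:

struct example_copy_ctx {
	bool done;
	int status;
};

static void
example_copy_done(void *cb_arg, int status)
{
	struct example_copy_ctx *ctx = cb_arg;

	ctx->status = status;
	ctx->done = true;
}

static int
example_copy(struct spdk_idxd_device *idxd, void *dst, const void *src, uint64_t nbytes)
{
	struct spdk_idxd_io_channel *chan;
	struct example_copy_ctx ctx = {};
	int rc;

	chan = spdk_idxd_get_channel(idxd);
	if (chan == NULL) {
		return -ENOMEM;
	}

	rc = spdk_idxd_configure_chan(chan);
	if (rc != 0) {
		spdk_idxd_put_channel(chan);
		return rc;
	}

	rc = spdk_idxd_submit_copy(chan, dst, src, nbytes, example_copy_done, &ctx);
	if (rc == 0) {
		/* Poll-mode completion, the same routine _idxd_drain() uses. */
		while (!ctx.done) {
			spdk_idxd_process_events(chan);
		}
		rc = ctx.status;
	}

	spdk_idxd_reconfigure_chan(chan, 0);	/* release the channel's rings */
	spdk_idxd_put_channel(chan);
	return rc;
}
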
*/ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +int +spdk_idxd_submit_compare(struct spdk_idxd_io_channel *chan, void *src1, const void *src2, + uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); + if (desc == NULL) { + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_COMPARE; + desc->src_addr = (uintptr_t)src1; + desc->src2_addr = (uintptr_t)src2; + desc->xfer_size = nbytes; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +int +spdk_idxd_submit_fill(struct spdk_idxd_io_channel *chan, void *dst, uint64_t fill_pattern, + uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); + if (desc == NULL) { + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_MEMFILL; + desc->pattern = fill_pattern; + desc->dst_addr = (uintptr_t)dst; + desc->xfer_size = nbytes; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +int +spdk_idxd_submit_crc32c(struct spdk_idxd_io_channel *chan, uint32_t *dst, void *src, + uint32_t seed, uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL); + if (desc == NULL) { + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_CRC32C_GEN; + desc->dst_addr = (uintptr_t)dst; + desc->src_addr = (uintptr_t)src; + desc->flags &= IDXD_CLEAR_CRC_FLAGS; + desc->crc32c.seed = seed; + desc->xfer_size = nbytes; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +uint32_t +spdk_idxd_batch_get_max(void) +{ + return DESC_PER_BATCH; /* TODO maybe add startup RPC to set this */ +} + +struct idxd_batch * +spdk_idxd_batch_create(struct spdk_idxd_io_channel *chan) +{ + struct idxd_batch *batch = NULL; + + if (!TAILQ_EMPTY(&chan->batch_pool)) { + batch = TAILQ_FIRST(&chan->batch_pool); + TAILQ_REMOVE(&chan->batch_pool, batch, link); + } else { + /* The application needs to handle this. */ + return NULL; + } + + batch->batch_num = spdk_bit_array_find_first_clear(chan->ring_ctrl.user_ring_slots, 0); + if (batch->batch_num == UINT32_MAX) { + /* ran out of ring slots, the application needs to handle this. */ + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + return NULL; + } + + spdk_bit_array_set(chan->ring_ctrl.user_ring_slots, batch->batch_num); + + /* + * Find the first descriptor address for the given batch. The + * descriptor ring used for user desctipors is allocated in + * units of DESC_PER_BATCH. The actual index is in units of + * one descriptor. 
+ */ + batch->start_index = batch->cur_index = batch->batch_num * DESC_PER_BATCH; + + TAILQ_INSERT_TAIL(&chan->batches, batch, link); + SPDK_DEBUGLOG(SPDK_LOG_IDXD, "New batch %p num %u\n", batch, batch->batch_num); + + return batch; +} + +static bool +_does_batch_exist(struct idxd_batch *batch, struct spdk_idxd_io_channel *chan) +{ + bool found = false; + struct idxd_batch *cur_batch; + + TAILQ_FOREACH(cur_batch, &chan->batches, link) { + if (cur_batch == batch) { + found = true; + break; + } + } + + return found; +} + +int +spdk_idxd_batch_cancel(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch) +{ + if (_does_batch_exist(batch, chan) == false) { + SPDK_ERRLOG("Attempt to cancel a batch that doesn't exist\n."); + return -EINVAL; + } + + if (batch->remaining > 0) { + SPDK_ERRLOG("Cannot cancel batch, already submitted to HW\n."); + return -EINVAL; + } + + TAILQ_REMOVE(&chan->batches, batch, link); + spdk_bit_array_clear(chan->ring_ctrl.user_ring_slots, batch->batch_num); + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + + return 0; +} + +int +spdk_idxd_batch_submit(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + if (_does_batch_exist(batch, chan) == false) { + SPDK_ERRLOG("Attempt to submit a batch that doesn't exist\n."); + return -EINVAL; + } + + /* Common prep. */ + desc = _idxd_prep_command(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_IDXD, "Can't submit batch %p busy batch num %u\n", batch, batch->batch_num); + return -EBUSY; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_BATCH; + desc->desc_list_addr = (uintptr_t)&chan->ring_ctrl.user_desc[batch->start_index]; + desc->desc_count = batch->cur_index - batch->start_index; + assert(desc->desc_count <= DESC_PER_BATCH); + + if (desc->desc_count < MIN_USER_DESC_COUNT) { + SPDK_ERRLOG("Attempt to submit a batch without at least %u operations.\n", + MIN_USER_DESC_COUNT); + return -EINVAL; + } + + /* Total completions for the batch = num desc plus 1 for the batch desc itself. */ + batch->remaining = desc->desc_count + 1; + + /* Submit operation. */ + movdir64b(chan->ring_ctrl.portal, desc); + + return 0; +} + +static struct idxd_hw_desc * +_idxd_prep_batch_cmd(struct spdk_idxd_io_channel *chan, spdk_idxd_req_cb cb_fn, + void *cb_arg, struct idxd_batch *batch) +{ + struct idxd_hw_desc *desc; + struct idxd_comp *comp; + + if (_does_batch_exist(batch, chan) == false) { + SPDK_ERRLOG("Attempt to add to a batch that doesn't exist\n."); + return NULL; + } + + if ((batch->cur_index - batch->start_index) == DESC_PER_BATCH) { + SPDK_ERRLOG("Attempt to add to a batch that is already full\n."); + return NULL; + } + + desc = &chan->ring_ctrl.user_desc[batch->cur_index]; + comp = &chan->ring_ctrl.user_completions[batch->cur_index]; + SPDK_DEBUGLOG(SPDK_LOG_IDXD, "Prep batch %p index %u\n", batch, batch->cur_index); + + batch->cur_index++; + assert(batch->cur_index > batch->start_index); + + desc->flags = IDXD_FLAG_COMPLETION_ADDR_VALID | IDXD_FLAG_REQUEST_COMPLETION; + desc->completion_addr = (uintptr_t)&comp->hw; + comp->cb_arg = cb_arg; + comp->cb_fn = cb_fn; + comp->batch = batch; + + return desc; +} + +int +spdk_idxd_batch_prep_copy(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. 
*/ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_MEMMOVE; + desc->src_addr = (uintptr_t)src; + desc->dst_addr = (uintptr_t)dst; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_fill(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst, uint64_t fill_pattern, uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_MEMFILL; + desc->pattern = fill_pattern; + desc->dst_addr = (uintptr_t)dst; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_dualcast(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *dst1, void *dst2, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) { + SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n"); + return -EINVAL; + } + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + desc->opcode = IDXD_OPCODE_DUALCAST; + desc->src_addr = (uintptr_t)src; + desc->dst_addr = (uintptr_t)dst1; + desc->dest2 = (uintptr_t)dst2; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_crc32c(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes, + spdk_idxd_req_cb cb_fn, void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. */ + desc->opcode = IDXD_OPCODE_CRC32C_GEN; + desc->dst_addr = (uintptr_t)dst; + desc->src_addr = (uintptr_t)src; + desc->flags &= IDXD_CLEAR_CRC_FLAGS; + desc->crc32c.seed = seed; + desc->xfer_size = nbytes; + + return 0; +} + +int +spdk_idxd_batch_prep_compare(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch, + void *src1, void *src2, uint64_t nbytes, spdk_idxd_req_cb cb_fn, + void *cb_arg) +{ + struct idxd_hw_desc *desc; + + /* Common prep. */ + desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch); + if (desc == NULL) { + return -EINVAL; + } + + /* Command specific. 
*/ + desc->opcode = IDXD_OPCODE_COMPARE; + desc->src_addr = (uintptr_t)src1; + desc->src2_addr = (uintptr_t)src2; + desc->xfer_size = nbytes; + + return 0; +} + +static void +_dump_error_reg(struct spdk_idxd_io_channel *chan) +{ + uint64_t sw_error_0; + uint16_t i; + + sw_error_0 = _idxd_read_8(chan->idxd, IDXD_SWERR_OFFSET); + + SPDK_NOTICELOG("SW Error bits set:"); + for (i = 0; i < CHAR_BIT; i++) { + if ((1ULL << i) & sw_error_0) { + SPDK_NOTICELOG(" %d\n", i); + } + } + SPDK_NOTICELOG("SW Error error code: %#x\n", (uint8_t)(sw_error_0 >> 8)); + SPDK_NOTICELOG("SW Error WQ index: %u\n", (uint8_t)(sw_error_0 >> 16)); + SPDK_NOTICELOG("SW Error Operation: %u\n", (uint8_t)(sw_error_0 >> 32)); +} + +static void +_free_batch(struct idxd_batch *batch, struct spdk_idxd_io_channel *chan, + struct idxd_comp *comp) +{ + TAILQ_REMOVE(&chan->batches, batch, link); + TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link); + comp->batch = NULL; + spdk_bit_array_clear(chan->ring_ctrl.user_ring_slots, batch->batch_num); + spdk_bit_array_clear(chan->ring_ctrl.ring_slots, batch->batch_desc_index); +} + +static void +_spdk_idxd_process_batch_events(struct spdk_idxd_io_channel *chan) +{ + uint16_t index; + struct idxd_comp *comp; + uint64_t sw_error_0; + int status = 0; + struct idxd_batch *batch; + + /* + * We don't check the bit array for user completions as there's only + * one bit per per batch. + */ + for (index = 0; index < TOTAL_USER_DESC; index++) { + comp = &chan->ring_ctrl.user_completions[index]; + if (comp->hw.status == 1) { + struct idxd_hw_desc *desc; + + sw_error_0 = _idxd_read_8(chan->idxd, IDXD_SWERR_OFFSET); + if (sw_error_0 & 0x1) { + _dump_error_reg(chan); + status = -EINVAL; + } + + desc = &chan->ring_ctrl.user_desc[index]; + switch (desc->opcode) { + case IDXD_OPCODE_CRC32C_GEN: + *(uint32_t *)desc->dst_addr = comp->hw.crc32c_val; + *(uint32_t *)desc->dst_addr ^= ~0; + break; + case IDXD_OPCODE_COMPARE: + if (status == 0) { + status = comp->hw.result; + } + break; + case IDXD_OPCODE_MEMFILL: + case IDXD_OPCODE_DUALCAST: + case IDXD_OPCODE_MEMMOVE: + break; + default: + assert(false); + break; + } + + /* The hw will complete all user desc first before the batch + * desc (see spec for configuration exceptions) however + * because of the order that we check for comps in the poller + * we may "see" them in a different order than they actually + * completed in. + */ + batch = comp->batch; + assert(batch->remaining > 0); + if (--batch->remaining == 0) { + _free_batch(batch, chan, comp); + } + + comp->cb_fn((void *)comp->cb_arg, status); + comp->hw.status = status = 0; + } + } +} + +/* + * TODO: Experiment with different methods of reaping completions for performance + * once we have real silicon. 
+ */ +void +spdk_idxd_process_events(struct spdk_idxd_io_channel *chan) +{ + uint16_t index; + struct idxd_comp *comp; + uint64_t sw_error_0; + int status = 0; + struct idxd_batch *batch; + + if (!TAILQ_EMPTY(&chan->batches)) { + _spdk_idxd_process_batch_events(chan); + } + + for (index = 0; index < chan->ring_ctrl.max_ring_slots; index++) { + if (spdk_bit_array_get(chan->ring_ctrl.ring_slots, index)) { + comp = &chan->ring_ctrl.completions[index]; + if (comp->hw.status == 1) { + struct idxd_hw_desc *desc; + + sw_error_0 = _idxd_read_8(chan->idxd, IDXD_SWERR_OFFSET); + if (sw_error_0 & 0x1) { + _dump_error_reg(chan); + status = -EINVAL; + } + + desc = &chan->ring_ctrl.desc[index]; + switch (desc->opcode) { + case IDXD_OPCODE_BATCH: + /* The hw will complete all user desc first before the batch + * desc (see spec for configuration exceptions) however + * because of the order that we check for comps in the poller + * we may "see" them in a different order than they actually + * completed in. + */ + batch = comp->batch; + assert(batch->remaining > 0); + if (--batch->remaining == 0) { + _free_batch(batch, chan, comp); + } + break; + case IDXD_OPCODE_CRC32C_GEN: + *(uint32_t *)desc->dst_addr = comp->hw.crc32c_val; + *(uint32_t *)desc->dst_addr ^= ~0; + break; + case IDXD_OPCODE_COMPARE: + if (status == 0) { + status = comp->hw.result; + } + break; + } + + comp->cb_fn(comp->cb_arg, status); + comp->hw.status = status = 0; + if (desc->opcode != IDXD_OPCODE_BATCH) { + spdk_bit_array_clear(chan->ring_ctrl.ring_slots, index); + } + } + } + } +} + +SPDK_LOG_REGISTER_COMPONENT("idxd", SPDK_LOG_IDXD) diff --git a/src/spdk/lib/idxd/idxd.h b/src/spdk/lib/idxd/idxd.h new file mode 100644 index 000000000..09d021152 --- /dev/null +++ b/src/spdk/lib/idxd/idxd.h @@ -0,0 +1,188 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __IDXD_H__ +#define __IDXD_H__ + +#include "spdk/stdinc.h" + +#include "spdk/idxd.h" +#include "spdk/queue.h" +#include "spdk/mmio.h" +#include "spdk/bit_array.h" + +#include "idxd_spec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* TODO: get the gcc intrinsic to work. */ +#define nop() asm volatile ("nop") +static inline void movdir64b(void *dst, const void *src) +{ + asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" + : "=m"(*(char *)dst) + : "d"(src), "a"(dst)); +} + +#define IDXD_REGISTER_TIMEOUT_US 50 +#define IDXD_DRAIN_TIMEOUT_US 500000 + +/* TODO: make some of these RPC selectable */ +#define WQ_MODE_DEDICATED 1 +#define LOG2_WQ_MAX_BATCH 8 /* 2^8 = 256 */ +#define LOG2_WQ_MAX_XFER 30 /* 2^30 = 1073741824 */ +#define WQCFG_NUM_DWORDS 8 +#define WQ_PRIORITY_1 1 +#define IDXD_MAX_QUEUES 64 + +#define TOTAL_USER_DESC (1 << LOG2_WQ_MAX_BATCH) +#define DESC_PER_BATCH 16 /* TODO maybe make this a startup RPC */ +#define NUM_BATCHES (TOTAL_USER_DESC / DESC_PER_BATCH) +#define MIN_USER_DESC_COUNT 2 + +struct idxd_batch { + uint32_t batch_desc_index; + uint32_t batch_num; + uint32_t cur_index; + uint32_t start_index; + uint32_t remaining; + TAILQ_ENTRY(idxd_batch) link; +}; + +struct device_config { + uint8_t config_num; + uint8_t num_wqs_per_group; + uint8_t num_engines_per_group; + uint8_t num_groups; + uint16_t total_wqs; + uint16_t total_engines; +}; + +struct idxd_ring_control { + void *portal; + + uint16_t ring_size; + + /* + * Rings for this channel, one for descriptors and one + * for completions, share the same index. Batch descriptors + * are managed independently from data descriptors. + */ + struct idxd_hw_desc *desc; + struct idxd_comp *completions; + struct idxd_hw_desc *user_desc; + struct idxd_comp *user_completions; + + /* + * We use one bit array to track ring slots for both + * desc and completions. + */ + struct spdk_bit_array *ring_slots; + uint32_t max_ring_slots; + + /* + * We use a separate bit array to track ring slots for + * descriptors submitted via the user in a batch. + */ + struct spdk_bit_array *user_ring_slots; +}; + +struct spdk_idxd_io_channel { + struct spdk_idxd_device *idxd; + struct idxd_ring_control ring_ctrl; + TAILQ_HEAD(, idxd_batch) batch_pool; /* free batches */ + TAILQ_HEAD(, idxd_batch) batches; /* in use batches */ +}; + +struct pci_dev_id { + int vendor_id; + int device_id; +}; + +struct idxd_group { + struct spdk_idxd_device *idxd; + struct idxd_grpcfg grpcfg; + struct pci_dev_id pcidev; + int num_engines; + int num_wqs; + int id; + uint8_t tokens_allowed; + bool use_token_limit; + uint8_t tokens_reserved; + int tc_a; + int tc_b; +}; + +/* + * This struct wraps the hardware completion record which is 32 bytes in + * size and must be 32 byte aligned. 
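+ * The wrapper below pads that record out to a full 64-byte cache line,
+ * which is what its SPDK_STATIC_ASSERT checks: 32 (hw record) + 8 (cb_arg)
+ * + 8 (cb_fn) + 8 (batch) + 8 (pad2) = 64 bytes.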
+ */ +struct idxd_comp { + struct idxd_hw_comp_record hw; + void *cb_arg; + spdk_idxd_req_cb cb_fn; + struct idxd_batch *batch; + uint64_t pad2; +} __attribute__((packed)); +SPDK_STATIC_ASSERT(sizeof(struct idxd_comp) == 64, "size mismatch"); + +struct idxd_wq { + struct spdk_idxd_device *idxd; + struct idxd_group *group; + union idxd_wqcfg wqcfg; +}; + +struct spdk_idxd_device { + struct spdk_pci_device *device; + void *reg_base; + void *portals; + int socket_id; + int wq_id; + + struct idxd_registers registers; + uint32_t ims_offset; + uint32_t msix_perm_offset; + uint32_t wqcfg_offset; + uint32_t grpcfg_offset; + uint32_t perfmon_offset; + struct idxd_group *groups; + struct idxd_wq *queues; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* __IDXD_H__ */ diff --git a/src/spdk/lib/idxd/idxd_spec.h b/src/spdk/lib/idxd/idxd_spec.h new file mode 100644 index 000000000..51d52cdcc --- /dev/null +++ b/src/spdk/lib/idxd/idxd_spec.h @@ -0,0 +1,503 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * \file + * IDXD specification definitions + */ + +#ifndef SPDK_IDXD_SPEC_H +#define SPDK_IDXD_SPEC_H + +#include "spdk/stdinc.h" +#include "spdk/assert.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define IDXD_MMIO_BAR 0 +#define IDXD_WQ_BAR 2 +#define PORTAL_SIZE (4096 * 4) + +#define CFG_ENGINE_OFFSET 0x20 +#define CFG_FLAG_OFFSET 0x28 + +#define IDXD_CMD_SHIFT 20 + +#define IDXD_VERSION_OFFSET 0x00 +#define IDXD_GENCAP_OFFSET 0x10 +#define IDXD_WQCAP_OFFSET 0x20 +#define IDXD_GRPCAP_OFFSET 0x30 +#define IDXD_OPCAP_OFFSET 0x40 +#define IDXD_ENGCAP_OFFSET 0x38 +#define IDXD_OPCAP_OFFSET 0x40 +#define IDXD_TABLE_OFFSET 0x60 +#define IDXD_GENCFG_OFFSET 0x80 +#define IDXD_GENCTRL_OFFSET 0x88 +#define IDXD_GENSTATUS_OFFSET 0x90 +#define IDXD_INTCAUSE_OFFSET 0x98 +#define IDXD_CMD_OFFSET 0xa0 +#define IDXD_CMDSTS_OFFSET 0xa8 +#define IDXD_SWERR_OFFSET 0xc0 +#define IDXD_TABLE_OFFSET_MULT 0x100 + +#define IDXD_OPCAP_WORDS 0x4 + +#define IDXD_CLEAR_CRC_FLAGS 0xFFFFu + +#define IDXD_FLAG_FENCE (1 << 0) +#define IDXD_FLAG_COMPLETION_ADDR_VALID (1 << 2) +#define IDXD_FLAG_REQUEST_COMPLETION (1 << 3) +#define IDXD_FLAG_CACHE_CONTROL (1 << 8) + +/* + * IDXD is a family of devices, DSA is the only currently + * supported one. + */ +enum dsa_completion_status { + IDXD_COMP_NONE = 0, + IDXD_COMP_SUCCESS = 1, + IDXD_COMP_SUCCESS_PRED = 2, + IDXD_COMP_PAGE_FAULT_NOBOF = 3, + IDXD_COMP_PAGE_FAULT_IR = 4, + IDXD_COMP_BATCH_FAIL = 5, + IDXD_COMP_BATCH_PAGE_FAULT = 6, + IDXD_COMP_DR_OFFSET_NOINC = 7, + IDXD_COMP_DR_OFFSET_ERANGE = 8, + IDXD_COMP_DIF_ERR = 9, + IDXD_COMP_BAD_OPCODE = 16, + IDXD_COMP_INVALID_FLAGS = 17, + IDXD_COMP_NOZERO_RESERVE = 18, + IDXD_COMP_XFER_ERANGE = 19, + IDXD_COMP_DESC_CNT_ERANGE = 20, + IDXD_COMP_DR_ERANGE = 21, + IDXD_COMP_OVERLAP_BUFFERS = 22, + IDXD_COMP_DCAST_ERR = 23, + IDXD_COMP_DESCLIST_ALIGN = 24, + IDXD_COMP_INT_HANDLE_INVAL = 25, + IDXD_COMP_CRA_XLAT = 26, + IDXD_COMP_CRA_ALIGN = 27, + IDXD_COMP_ADDR_ALIGN = 28, + IDXD_COMP_PRIV_BAD = 29, + IDXD_COMP_TRAFFIC_CLASS_CONF = 30, + IDXD_COMP_PFAULT_RDBA = 31, + IDXD_COMP_HW_ERR1 = 32, + IDXD_COMP_HW_ERR_DRB = 33, + IDXD_COMP_TRANSLATION_FAIL = 34, +}; + +enum idxd_wq_state { + WQ_DISABLED = 0, + WQ_ENABLED = 1, +}; + +enum idxd_wq_flag { + WQ_FLAG_DEDICATED = 0, + WQ_FLAG_BOF = 1, +}; + +enum idxd_wq_type { + WQT_NONE = 0, + WQT_KERNEL = 1, + WQT_USER = 2, + WQT_MDEV = 3, +}; + +enum idxd_dev_state { + IDXD_DEVICE_STATE_DISABLED = 0, + IDXD_DEVICE_STATE_ENABLED = 1, + IDXD_DEVICE_STATE_DRAIN = 2, + IDXD_DEVICE_STATE_HALT = 3, +}; + +enum idxd_device_reset_type { + IDXD_DEVICE_RESET_SOFTWARE = 0, + IDXD_DEVICE_RESET_FLR = 1, + IDXD_DEVICE_RESET_WARM = 2, + IDXD_DEVICE_RESET_COLD = 3, +}; + +enum idxd_cmds { + IDXD_ENABLE_DEV = 1, + IDXD_DISABLE_DEV = 2, + IDXD_DRAIN_ALL = 3, + IDXD_ABORT_ALL = 4, + IDXD_RESET_DEVICE = 5, + IDXD_ENABLE_WQ = 6, + IDXD_DISABLE_WQ = 7, + IDXD_DRAIN_WQ = 8, + IDXD_ABORT_WQ = 9, + IDXD_RESET_WQ = 10, +}; + +enum idxd_cmdsts_err { + IDXD_CMDSTS_SUCCESS = 0, + IDXD_CMDSTS_INVAL_CMD = 1, + IDXD_CMDSTS_INVAL_WQIDX = 2, + IDXD_CMDSTS_HW_ERR = 3, + IDXD_CMDSTS_ERR_DEV_ENABLED = 16, + IDXD_CMDSTS_ERR_CONFIG = 17, + IDXD_CMDSTS_ERR_BUSMASTER_EN = 18, + IDXD_CMDSTS_ERR_PASID_INVAL = 19, + IDXD_CMDSTS_ERR_WQ_SIZE_ERANGE = 20, + IDXD_CMDSTS_ERR_GRP_CONFIG = 21, + IDXD_CMDSTS_ERR_GRP_CONFIG2 = 22, + IDXD_CMDSTS_ERR_GRP_CONFIG3 = 23, + IDXD_CMDSTS_ERR_GRP_CONFIG4 = 24, + IDXD_CMDSTS_ERR_DEV_NOTEN = 32, + IDXD_CMDSTS_ERR_WQ_ENABLED = 33, + IDXD_CMDSTS_ERR_WQ_SIZE = 34, + 
IDXD_CMDSTS_ERR_WQ_PRIOR = 35, + IDXD_CMDSTS_ERR_WQ_MODE = 36, + IDXD_CMDSTS_ERR_BOF_EN = 37, + IDXD_CMDSTS_ERR_PASID_EN = 38, + IDXD_CMDSTS_ERR_MAX_BATCH_SIZE = 39, + IDXD_CMDSTS_ERR_MAX_XFER_SIZE = 40, + IDXD_CMDSTS_ERR_DIS_DEV_EN = 49, + IDXD_CMDSTS_ERR_DEV_NOT_EN = 50, + IDXD_CMDSTS_ERR_INVAL_INT_IDX = 65, + IDXD_CMDSTS_ERR_NO_HANDLE = 66, +}; + +enum idxd_wq_hw_state { + IDXD_WQ_DEV_DISABLED = 0, + IDXD_WQ_DEV_ENABLED = 1, + IDXD_WQ_DEV_BUSY = 2, +}; + +struct idxd_hw_desc { + uint32_t pasid: 20; + uint32_t rsvd: 11; + uint32_t priv: 1; + uint32_t flags: 24; + uint32_t opcode: 8; + uint64_t completion_addr; + union { + uint64_t src_addr; + uint64_t readback_addr; + uint64_t pattern; + uint64_t desc_list_addr; + }; + union { + uint64_t dst_addr; + uint64_t readback_addr2; + uint64_t src2_addr; + uint64_t comp_pattern; + }; + union { + uint32_t xfer_size; + uint32_t desc_count; + }; + uint16_t int_handle; + uint16_t rsvd1; + union { + uint8_t expected_res; + struct delta { + uint64_t addr; + uint32_t max_size; + } delta; + uint32_t delta_rec_size; + uint64_t dest2; + struct crc32c { + uint32_t seed; + uint32_t rsvd; + uint64_t addr; + } crc32c; + struct dif_chk { + uint8_t src_flags; + uint8_t rsvd1; + uint8_t flags; + uint8_t rsvd2[5]; + uint32_t ref_tag_seed; + uint16_t app_tag_mask; + uint16_t app_tag_seed; + } dif_chk; + struct dif_ins { + uint8_t rsvd1; + uint8_t dest_flag; + uint8_t flags; + uint8_t rsvd2[13]; + uint32_t ref_tag_seed; + uint16_t app_tag_mask; + uint16_t app_tag_seed; + } dif_ins; + struct dif_upd { + uint8_t src_flags; + uint8_t dest_flags; + uint8_t flags; + uint8_t rsvd[5]; + uint32_t src_ref_tag_seed; + uint16_t src_app_tag_mask; + uint16_t src_app_tag_seed; + uint32_t dest_ref_tag_seed; + uint16_t dest_app_tag_mask; + uint16_t dest_app_tag_seed; + } dif_upd; + uint8_t op_specific[24]; + }; +} __attribute__((packed)); +SPDK_STATIC_ASSERT(sizeof(struct idxd_hw_desc) == 64, "size mismatch"); + +struct idxd_hw_comp_record { + volatile uint8_t status; + union { + uint8_t result; + uint8_t dif_status; + }; + uint16_t rsvd; + uint32_t bytes_completed; + uint64_t fault_addr; + union { + uint32_t delta_rec_size; + uint32_t crc32c_val; + struct { + uint32_t dif_chk_ref_tag; + uint16_t dif_chk_app_tag_mask; + uint16_t dif_chk_app_tag; + }; + struct dif_ins_comp { + uint64_t rsvd; + uint32_t ref_tag; + uint16_t app_tag_mask; + uint16_t app_tag; + } dif_ins_comp; + struct dif_upd_comp { + uint32_t src_ref_tag; + uint16_t src_app_tag_mask; + uint16_t src_app_tag; + uint32_t dest_ref_tag; + uint16_t dest_app_tag_mask; + uint16_t dest_app_tag; + } dif_upd_comp; + uint8_t op_specific[16]; + }; +} __attribute__((packed)); +SPDK_STATIC_ASSERT(sizeof(struct idxd_hw_comp_record) == 32, "size mismatch"); + +union idxd_gencap_register { + struct { + uint64_t block_on_fault: 1; + uint64_t overlap_copy: 1; + uint64_t cache_control_mem: 1; + uint64_t cache_control_cache: 1; + uint64_t rsvd: 3; + uint64_t int_handle_req: 1; + uint64_t dest_readback: 1; + uint64_t drain_readback: 1; + uint64_t rsvd2: 6; + uint64_t max_xfer_shift: 5; + uint64_t max_batch_shift: 4; + uint64_t max_ims_mult: 6; + uint64_t config_en: 1; + uint64_t max_descs_per_engine: 8; + uint64_t rsvd3: 24; + } __attribute__((packed)); + uint64_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_gencap_register) == 8, "size mismatch"); + +union idxd_wqcap_register { + struct { + uint64_t total_wq_size: 16; + uint64_t num_wqs: 8; + uint64_t rsvd: 24; + uint64_t shared_mode: 1; + uint64_t dedicated_mode: 1; + uint64_t rsvd2: 1; 
+ uint64_t priority: 1; + uint64_t occupancy: 1; + uint64_t occupancy_int: 1; + uint64_t rsvd3: 10; + } __attribute__((packed)); + uint64_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_wqcap_register) == 8, "size mismatch"); + +union idxd_groupcap_register { + struct { + uint64_t num_groups: 8; + uint64_t total_tokens: 8; + uint64_t token_en: 1; + uint64_t token_limit: 1; + uint64_t rsvd: 46; + } __attribute__((packed)); + uint64_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_groupcap_register) == 8, "size mismatch"); + +union idxd_enginecap_register { + struct { + uint64_t num_engines: 8; + uint64_t rsvd: 56; + } __attribute__((packed)); + uint64_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_enginecap_register) == 8, "size mismatch"); + +struct idxd_opcap_register { + uint64_t raw[4]; +}; +SPDK_STATIC_ASSERT(sizeof(struct idxd_opcap_register) == 32, "size mismatch"); + +struct idxd_registers { + uint32_t version; + union idxd_gencap_register gencap; + union idxd_wqcap_register wqcap; + union idxd_groupcap_register groupcap; + union idxd_enginecap_register enginecap; + struct idxd_opcap_register opcap; +}; +SPDK_STATIC_ASSERT(sizeof(struct idxd_registers) == 72, "size mismatch"); + +union idxd_offsets_register { + struct { + uint64_t grpcfg: 16; + uint64_t wqcfg: 16; + uint64_t msix_perm: 16; + uint64_t ims: 16; + uint64_t perfmon: 16; + uint64_t rsvd: 48; + } __attribute__((packed)); + uint64_t raw[2]; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_offsets_register) == 16, "size mismatch"); + +union idxd_genstatus_register { + struct { + uint32_t state: 2; + uint32_t reset_type: 2; + uint32_t rsvd: 28; + } __attribute__((packed)); + uint32_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_genstatus_register) == 4, "size mismatch"); + +union idxd_cmdsts_reg { + struct { + uint8_t err; + uint16_t result; + uint8_t rsvd: 7; + uint8_t active: 1; + } __attribute__((packed)); + uint32_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_cmdsts_reg) == 4, "size mismatch"); + +union idxd_swerr_register { + struct { + uint64_t valid: 1; + uint64_t overflow: 1; + uint64_t desc_valid: 1; + uint64_t wq_idx_valid: 1; + uint64_t batch: 1; + uint64_t fault_rw: 1; + uint64_t priv: 1; + uint64_t rsvd: 1; + uint64_t error: 8; + uint64_t wq_idx: 8; + uint64_t rsvd2: 8; + uint64_t operation: 8; + uint64_t pasid: 20; + uint64_t rsvd3: 4; + uint64_t batch_idx: 16; + uint64_t rsvd4: 16; + uint64_t invalid_flags: 32; + uint64_t fault_addr; + uint64_t rsvd5; + } __attribute__((packed)); + uint64_t raw[4]; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_swerr_register) == 32, "size mismatch"); + +union idxd_group_flags { + struct { + uint32_t tc_a: 3; + uint32_t tc_b: 3; + uint32_t rsvd: 1; + uint32_t use_token_limit: 1; + uint32_t tokens_reserved: 8; + uint32_t rsvd2: 4; + uint32_t tokens_allowed: 8; + uint32_t rsvd3: 4; + } __attribute__((packed)); + uint32_t raw; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_group_flags) == 4, "size mismatch"); + +struct idxd_grpcfg { + uint64_t wqs[4]; + uint64_t engines; + union idxd_group_flags flags; +}; +SPDK_STATIC_ASSERT(sizeof(struct idxd_grpcfg) == 48, "size mismatch"); + +union idxd_wqcfg { + struct { + uint16_t wq_size; + uint16_t rsvd; + uint16_t wq_thresh; + uint16_t rsvd1; + uint32_t mode: 1; + uint32_t bof: 1; + uint32_t rsvd2: 2; + uint32_t priority: 4; + uint32_t pasid: 20; + uint32_t pasid_en: 1; + uint32_t priv: 1; + uint32_t rsvd3: 2; + uint32_t max_xfer_shift: 5; + uint32_t max_batch_shift: 4; + uint32_t rsvd4: 23; + uint16_t occupancy_inth; + uint16_t 
occupancy_table_sel: 1; + uint16_t rsvd5: 15; + uint16_t occupancy_limit; + uint16_t occupancy_int_en: 1; + uint16_t rsvd6: 15; + uint16_t occupancy; + uint16_t occupancy_int: 1; + uint16_t rsvd7: 12; + uint16_t mode_support: 1; + uint16_t wq_state: 2; + uint32_t rsvd8; + } __attribute__((packed)); + uint32_t raw[8]; +}; +SPDK_STATIC_ASSERT(sizeof(union idxd_wqcfg) == 32, "size mismatch"); + +#ifdef __cplusplus +} +#endif + +#endif /* SPDK_IDXD_SPEC_H */ diff --git a/src/spdk/lib/idxd/spdk_idxd.map b/src/spdk/lib/idxd/spdk_idxd.map new file mode 100644 index 000000000..4bffdf209 --- /dev/null +++ b/src/spdk/lib/idxd/spdk_idxd.map @@ -0,0 +1,29 @@ +{ + global: + + # public functions + spdk_idxd_configure_chan; + spdk_idxd_reconfigure_chan; + spdk_idxd_probe; + spdk_idxd_detach; + spdk_idxd_batch_prep_copy; + spdk_idxd_batch_prep_dualcast; + spdk_idxd_batch_prep_fill; + spdk_idxd_batch_prep_crc32c; + spdk_idxd_batch_prep_compare; + spdk_idxd_batch_submit; + spdk_idxd_batch_create; + spdk_idxd_batch_cancel; + spdk_idxd_batch_get_max; + spdk_idxd_set_config; + spdk_idxd_submit_compare; + spdk_idxd_submit_crc32c; + spdk_idxd_submit_copy; + spdk_idxd_submit_dualcast; + spdk_idxd_submit_fill; + spdk_idxd_process_events; + spdk_idxd_get_channel; + spdk_idxd_put_channel; + + local: *; +}; diff --git a/src/spdk/lib/ioat/Makefile b/src/spdk/lib/ioat/Makefile new file mode 100644 index 000000000..4cada5685 --- /dev/null +++ b/src/spdk/lib/ioat/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = ioat.c +LIBNAME = ioat + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_ioat.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/ioat/ioat.c b/src/spdk/lib/ioat/ioat.c new file mode 100644 index 000000000..516fa545c --- /dev/null +++ b/src/spdk/lib/ioat/ioat.c @@ -0,0 +1,775 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "ioat_internal.h" + +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk/memory.h" + +#include "spdk_internal/log.h" + +struct ioat_driver { + pthread_mutex_t lock; + TAILQ_HEAD(, spdk_ioat_chan) attached_chans; +}; + +static struct ioat_driver g_ioat_driver = { + .lock = PTHREAD_MUTEX_INITIALIZER, + .attached_chans = TAILQ_HEAD_INITIALIZER(g_ioat_driver.attached_chans), +}; + +static uint64_t +ioat_get_chansts(struct spdk_ioat_chan *ioat) +{ + return spdk_mmio_read_8(&ioat->regs->chansts); +} + +static void +ioat_write_chancmp(struct spdk_ioat_chan *ioat, uint64_t addr) +{ + spdk_mmio_write_8(&ioat->regs->chancmp, addr); +} + +static void +ioat_write_chainaddr(struct spdk_ioat_chan *ioat, uint64_t addr) +{ + spdk_mmio_write_8(&ioat->regs->chainaddr, addr); +} + +static inline void +ioat_suspend(struct spdk_ioat_chan *ioat) +{ + ioat->regs->chancmd = SPDK_IOAT_CHANCMD_SUSPEND; +} + +static inline void +ioat_reset(struct spdk_ioat_chan *ioat) +{ + ioat->regs->chancmd = SPDK_IOAT_CHANCMD_RESET; +} + +static inline uint32_t +ioat_reset_pending(struct spdk_ioat_chan *ioat) +{ + uint8_t cmd; + + cmd = ioat->regs->chancmd; + return (cmd & SPDK_IOAT_CHANCMD_RESET) == SPDK_IOAT_CHANCMD_RESET; +} + +static int +ioat_map_pci_bar(struct spdk_ioat_chan *ioat) +{ + int regs_bar, rc; + void *addr; + uint64_t phys_addr, size; + + regs_bar = 0; + rc = spdk_pci_device_map_bar(ioat->device, regs_bar, &addr, &phys_addr, &size); + if (rc != 0 || addr == NULL) { + SPDK_ERRLOG("pci_device_map_range failed with error code %d\n", + rc); + return -1; + } + + ioat->regs = (volatile struct spdk_ioat_registers *)addr; + + return 0; +} + +static int +ioat_unmap_pci_bar(struct spdk_ioat_chan *ioat) +{ + int rc = 0; + void *addr = (void *)ioat->regs; + + if (addr) { + rc = spdk_pci_device_unmap_bar(ioat->device, 0, addr); + } + return rc; +} + + +static inline uint32_t +ioat_get_active(struct spdk_ioat_chan *ioat) +{ + return (ioat->head - 
ioat->tail) & ((1 << ioat->ring_size_order) - 1); +} + +static inline uint32_t +ioat_get_ring_space(struct spdk_ioat_chan *ioat) +{ + return (1 << ioat->ring_size_order) - ioat_get_active(ioat) - 1; +} + +static uint32_t +ioat_get_ring_index(struct spdk_ioat_chan *ioat, uint32_t index) +{ + return index & ((1 << ioat->ring_size_order) - 1); +} + +static void +ioat_get_ring_entry(struct spdk_ioat_chan *ioat, uint32_t index, + struct ioat_descriptor **desc, + union spdk_ioat_hw_desc **hw_desc) +{ + uint32_t i = ioat_get_ring_index(ioat, index); + + *desc = &ioat->ring[i]; + *hw_desc = &ioat->hw_ring[i]; +} + +static void +ioat_submit_single(struct spdk_ioat_chan *ioat) +{ + ioat->head++; +} + +void +spdk_ioat_flush(struct spdk_ioat_chan *ioat) +{ + uint32_t index = ioat_get_ring_index(ioat, ioat->head - 1); + union spdk_ioat_hw_desc *hw_desc; + + hw_desc = &ioat->hw_ring[index]; + hw_desc->dma.u.control.completion_update = 1; + ioat->regs->dmacount = (uint16_t)ioat->head; +} + +static struct ioat_descriptor * +ioat_prep_null(struct spdk_ioat_chan *ioat) +{ + struct ioat_descriptor *desc; + union spdk_ioat_hw_desc *hw_desc; + + if (ioat_get_ring_space(ioat) < 1) { + return NULL; + } + + ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc); + + hw_desc->dma.u.control_raw = 0; + hw_desc->dma.u.control.op = SPDK_IOAT_OP_COPY; + hw_desc->dma.u.control.null = 1; + + hw_desc->dma.size = 8; + hw_desc->dma.src_addr = 0; + hw_desc->dma.dest_addr = 0; + + desc->callback_fn = NULL; + desc->callback_arg = NULL; + + ioat_submit_single(ioat); + + return desc; +} + +static struct ioat_descriptor * +ioat_prep_copy(struct spdk_ioat_chan *ioat, uint64_t dst, + uint64_t src, uint32_t len) +{ + struct ioat_descriptor *desc; + union spdk_ioat_hw_desc *hw_desc; + + assert(len <= ioat->max_xfer_size); + + if (ioat_get_ring_space(ioat) < 1) { + return NULL; + } + + ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc); + + hw_desc->dma.u.control_raw = 0; + hw_desc->dma.u.control.op = SPDK_IOAT_OP_COPY; + + hw_desc->dma.size = len; + hw_desc->dma.src_addr = src; + hw_desc->dma.dest_addr = dst; + + desc->callback_fn = NULL; + desc->callback_arg = NULL; + + ioat_submit_single(ioat); + + return desc; +} + +static struct ioat_descriptor * +ioat_prep_fill(struct spdk_ioat_chan *ioat, uint64_t dst, + uint64_t fill_pattern, uint32_t len) +{ + struct ioat_descriptor *desc; + union spdk_ioat_hw_desc *hw_desc; + + assert(len <= ioat->max_xfer_size); + + if (ioat_get_ring_space(ioat) < 1) { + return NULL; + } + + ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc); + + hw_desc->fill.u.control_raw = 0; + hw_desc->fill.u.control.op = SPDK_IOAT_OP_FILL; + + hw_desc->fill.size = len; + hw_desc->fill.src_data = fill_pattern; + hw_desc->fill.dest_addr = dst; + + desc->callback_fn = NULL; + desc->callback_arg = NULL; + + ioat_submit_single(ioat); + + return desc; +} + +static int ioat_reset_hw(struct spdk_ioat_chan *ioat) +{ + int timeout; + uint64_t status; + uint32_t chanerr; + int rc; + + status = ioat_get_chansts(ioat); + if (is_ioat_active(status) || is_ioat_idle(status)) { + ioat_suspend(ioat); + } + + timeout = 20; /* in milliseconds */ + while (is_ioat_active(status) || is_ioat_idle(status)) { + spdk_delay_us(1000); + timeout--; + if (timeout == 0) { + SPDK_ERRLOG("timed out waiting for suspend\n"); + return -1; + } + status = ioat_get_chansts(ioat); + } + + /* + * Clear any outstanding errors. + * CHANERR is write-1-to-clear, so write the current CHANERR bits back to reset everything. 
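+ * Write-1-to-clear means writing back exactly the bits that were read
+ * clears them: e.g. if CHANERR reads 0x11 (two error bits set), writing
+ * 0x11 clears both, while writing 0 would clear nothing.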
+ */ + chanerr = ioat->regs->chanerr; + ioat->regs->chanerr = chanerr; + + if (ioat->regs->cbver < SPDK_IOAT_VER_3_3) { + rc = spdk_pci_device_cfg_read32(ioat->device, &chanerr, + SPDK_IOAT_PCI_CHANERR_INT_OFFSET); + if (rc) { + SPDK_ERRLOG("failed to read the internal channel error register\n"); + return -1; + } + + spdk_pci_device_cfg_write32(ioat->device, chanerr, + SPDK_IOAT_PCI_CHANERR_INT_OFFSET); + } + + ioat_reset(ioat); + + timeout = 20; + while (ioat_reset_pending(ioat)) { + spdk_delay_us(1000); + timeout--; + if (timeout == 0) { + SPDK_ERRLOG("timed out waiting for reset\n"); + return -1; + } + } + + return 0; +} + +static int +ioat_process_channel_events(struct spdk_ioat_chan *ioat) +{ + struct ioat_descriptor *desc; + uint64_t status, completed_descriptor, hw_desc_phys_addr, events_count = 0; + uint32_t tail; + + if (ioat->head == ioat->tail) { + return 0; + } + + status = *ioat->comp_update; + completed_descriptor = status & SPDK_IOAT_CHANSTS_COMPLETED_DESCRIPTOR_MASK; + + if (is_ioat_halted(status)) { + SPDK_ERRLOG("Channel halted (%x)\n", ioat->regs->chanerr); + return -1; + } + + if (completed_descriptor == ioat->last_seen) { + return 0; + } + + do { + tail = ioat_get_ring_index(ioat, ioat->tail); + desc = &ioat->ring[tail]; + + if (desc->callback_fn) { + desc->callback_fn(desc->callback_arg); + } + + hw_desc_phys_addr = desc->phys_addr; + ioat->tail++; + events_count++; + } while (hw_desc_phys_addr != completed_descriptor); + + ioat->last_seen = hw_desc_phys_addr; + + return events_count; +} + +static void +ioat_channel_destruct(struct spdk_ioat_chan *ioat) +{ + ioat_unmap_pci_bar(ioat); + + if (ioat->ring) { + free(ioat->ring); + } + + if (ioat->hw_ring) { + spdk_free(ioat->hw_ring); + } + + if (ioat->comp_update) { + spdk_free((void *)ioat->comp_update); + ioat->comp_update = NULL; + } +} + +uint32_t +spdk_ioat_get_max_descriptors(struct spdk_ioat_chan *ioat) +{ + return 1 << ioat->ring_size_order; +} + +static int +ioat_channel_start(struct spdk_ioat_chan *ioat) +{ + uint8_t xfercap, version; + uint64_t status; + int i, num_descriptors; + uint64_t comp_update_bus_addr = 0; + uint64_t phys_addr; + + if (ioat_map_pci_bar(ioat) != 0) { + SPDK_ERRLOG("ioat_map_pci_bar() failed\n"); + return -1; + } + + version = ioat->regs->cbver; + if (version < SPDK_IOAT_VER_3_0) { + SPDK_ERRLOG(" unsupported IOAT version %u.%u\n", + version >> 4, version & 0xF); + return -1; + } + + /* Always support DMA copy */ + ioat->dma_capabilities = SPDK_IOAT_ENGINE_COPY_SUPPORTED; + if (ioat->regs->dmacapability & SPDK_IOAT_DMACAP_BFILL) { + ioat->dma_capabilities |= SPDK_IOAT_ENGINE_FILL_SUPPORTED; + } + xfercap = ioat->regs->xfercap; + + /* Only bits [4:0] are valid. */ + xfercap &= 0x1f; + if (xfercap == 0) { + /* 0 means 4 GB max transfer size. */ + ioat->max_xfer_size = 1ULL << 32; + } else if (xfercap < 12) { + /* XFERCAP must be at least 12 (4 KB) according to the spec. 
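+ * XFERCAP encodes the maximum transfer size as a power of two: a value
+ * of 12 means 1 << 12 = 4 KiB, 20 means 1 MiB, and the special value 0
+ * (handled above) means a 4 GiB maximum.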
*/ + SPDK_ERRLOG("invalid XFERCAP value %u\n", xfercap); + return -1; + } else { + ioat->max_xfer_size = 1U << xfercap; + } + + ioat->comp_update = spdk_zmalloc(sizeof(*ioat->comp_update), SPDK_IOAT_CHANCMP_ALIGN, + NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (ioat->comp_update == NULL) { + return -1; + } + + comp_update_bus_addr = spdk_vtophys((void *)ioat->comp_update, NULL); + if (comp_update_bus_addr == SPDK_VTOPHYS_ERROR) { + spdk_free((void *)ioat->comp_update); + return -1; + } + + ioat->ring_size_order = IOAT_DEFAULT_ORDER; + + num_descriptors = 1 << ioat->ring_size_order; + + ioat->ring = calloc(num_descriptors, sizeof(struct ioat_descriptor)); + if (!ioat->ring) { + return -1; + } + + ioat->hw_ring = spdk_zmalloc(num_descriptors * sizeof(union spdk_ioat_hw_desc), 64, + NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!ioat->hw_ring) { + return -1; + } + + for (i = 0; i < num_descriptors; i++) { + phys_addr = spdk_vtophys(&ioat->hw_ring[i], NULL); + if (phys_addr == SPDK_VTOPHYS_ERROR) { + SPDK_ERRLOG("Failed to translate descriptor %u to physical address\n", i); + return -1; + } + + ioat->ring[i].phys_addr = phys_addr; + ioat->hw_ring[ioat_get_ring_index(ioat, i - 1)].generic.next = phys_addr; + } + + ioat->head = 0; + ioat->tail = 0; + ioat->last_seen = 0; + + ioat_reset_hw(ioat); + + ioat->regs->chanctrl = SPDK_IOAT_CHANCTRL_ANY_ERR_ABORT_EN; + ioat_write_chancmp(ioat, comp_update_bus_addr); + ioat_write_chainaddr(ioat, ioat->ring[0].phys_addr); + + ioat_prep_null(ioat); + spdk_ioat_flush(ioat); + + i = 100; + while (i-- > 0) { + spdk_delay_us(100); + status = ioat_get_chansts(ioat); + if (is_ioat_idle(status)) { + break; + } + } + + if (is_ioat_idle(status)) { + ioat_process_channel_events(ioat); + } else { + SPDK_ERRLOG("could not start channel: status = %p\n error = %#x\n", + (void *)status, ioat->regs->chanerr); + return -1; + } + + return 0; +} + +/* Caller must hold g_ioat_driver.lock */ +static struct spdk_ioat_chan * +ioat_attach(struct spdk_pci_device *device) +{ + struct spdk_ioat_chan *ioat; + uint32_t cmd_reg; + + ioat = calloc(1, sizeof(struct spdk_ioat_chan)); + if (ioat == NULL) { + return NULL; + } + + /* Enable PCI busmaster. */ + spdk_pci_device_cfg_read32(device, &cmd_reg, 4); + cmd_reg |= 0x4; + spdk_pci_device_cfg_write32(device, cmd_reg, 4); + + ioat->device = device; + + if (ioat_channel_start(ioat) != 0) { + ioat_channel_destruct(ioat); + free(ioat); + return NULL; + } + + return ioat; +} + +struct ioat_enum_ctx { + spdk_ioat_probe_cb probe_cb; + spdk_ioat_attach_cb attach_cb; + void *cb_ctx; +}; + +/* This function must only be called while holding g_ioat_driver.lock */ +static int +ioat_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) +{ + struct ioat_enum_ctx *enum_ctx = ctx; + struct spdk_ioat_chan *ioat; + + /* Verify that this device is not already attached */ + TAILQ_FOREACH(ioat, &g_ioat_driver.attached_chans, tailq) { + /* + * NOTE: This assumes that the PCI abstraction layer will use the same device handle + * across enumerations; we could compare by BDF instead if this is not true. + */ + if (pci_dev == ioat->device) { + return 0; + } + } + + if (enum_ctx->probe_cb(enum_ctx->cb_ctx, pci_dev)) { + /* + * Since I/OAT init is relatively quick, just perform the full init during probing. + * If this turns out to be a bottleneck later, this can be changed to work like + * NVMe with a list of devices to initialize in parallel. 
+ */ + ioat = ioat_attach(pci_dev); + if (ioat == NULL) { + SPDK_ERRLOG("ioat_attach() failed\n"); + return -1; + } + + TAILQ_INSERT_TAIL(&g_ioat_driver.attached_chans, ioat, tailq); + + enum_ctx->attach_cb(enum_ctx->cb_ctx, pci_dev, ioat); + } + + return 0; +} + +int +spdk_ioat_probe(void *cb_ctx, spdk_ioat_probe_cb probe_cb, spdk_ioat_attach_cb attach_cb) +{ + int rc; + struct ioat_enum_ctx enum_ctx; + + pthread_mutex_lock(&g_ioat_driver.lock); + + enum_ctx.probe_cb = probe_cb; + enum_ctx.attach_cb = attach_cb; + enum_ctx.cb_ctx = cb_ctx; + + rc = spdk_pci_enumerate(spdk_pci_ioat_get_driver(), ioat_enum_cb, &enum_ctx); + + pthread_mutex_unlock(&g_ioat_driver.lock); + + return rc; +} + +void +spdk_ioat_detach(struct spdk_ioat_chan *ioat) +{ + struct ioat_driver *driver = &g_ioat_driver; + + /* ioat should be in the free list (not registered to a thread) + * when calling ioat_detach(). + */ + pthread_mutex_lock(&driver->lock); + TAILQ_REMOVE(&driver->attached_chans, ioat, tailq); + pthread_mutex_unlock(&driver->lock); + + ioat_channel_destruct(ioat); + free(ioat); +} + +int +spdk_ioat_build_copy(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn, + void *dst, const void *src, uint64_t nbytes) +{ + struct ioat_descriptor *last_desc; + uint64_t remaining, op_size; + uint64_t vdst, vsrc; + uint64_t vdst_page, vsrc_page; + uint64_t pdst_page, psrc_page; + uint32_t orig_head; + + if (!ioat) { + return -EINVAL; + } + + orig_head = ioat->head; + + vdst = (uint64_t)dst; + vsrc = (uint64_t)src; + vdst_page = vsrc_page = 0; + pdst_page = psrc_page = SPDK_VTOPHYS_ERROR; + + remaining = nbytes; + while (remaining) { + if (_2MB_PAGE(vsrc) != vsrc_page) { + vsrc_page = _2MB_PAGE(vsrc); + psrc_page = spdk_vtophys((void *)vsrc_page, NULL); + } + + if (_2MB_PAGE(vdst) != vdst_page) { + vdst_page = _2MB_PAGE(vdst); + pdst_page = spdk_vtophys((void *)vdst_page, NULL); + } + op_size = remaining; + op_size = spdk_min(op_size, (VALUE_2MB - _2MB_OFFSET(vsrc))); + op_size = spdk_min(op_size, (VALUE_2MB - _2MB_OFFSET(vdst))); + op_size = spdk_min(op_size, ioat->max_xfer_size); + remaining -= op_size; + + last_desc = ioat_prep_copy(ioat, + pdst_page + _2MB_OFFSET(vdst), + psrc_page + _2MB_OFFSET(vsrc), + op_size); + + if (remaining == 0 || last_desc == NULL) { + break; + } + + vsrc += op_size; + vdst += op_size; + + } + /* Issue null descriptor for null transfer */ + if (nbytes == 0) { + last_desc = ioat_prep_null(ioat); + } + + if (last_desc) { + last_desc->callback_fn = cb_fn; + last_desc->callback_arg = cb_arg; + } else { + /* + * Ran out of descriptors in the ring - reset head to leave things as they were + * in case we managed to fill out any descriptors. 
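+ * Callers that want to queue several copies can call
+ * spdk_ioat_build_copy() repeatedly and ring the doorbell once with
+ * spdk_ioat_flush(); spdk_ioat_submit_copy() below is the build-then-flush
+ * convenience for a single operation. Hypothetical batching sketch:
+ *
+ *   for (i = 0; i < n; i++) {
+ *           spdk_ioat_build_copy(chan, ctx[i], done_cb, dst[i], src[i], len[i]);
+ *   }
+ *   spdk_ioat_flush(chan);
+ *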
+ */ + ioat->head = orig_head; + return -ENOMEM; + } + + return 0; +} + +int +spdk_ioat_submit_copy(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn, + void *dst, const void *src, uint64_t nbytes) +{ + int rc; + + rc = spdk_ioat_build_copy(ioat, cb_arg, cb_fn, dst, src, nbytes); + if (rc != 0) { + return rc; + } + + spdk_ioat_flush(ioat); + return 0; +} + +int +spdk_ioat_build_fill(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn, + void *dst, uint64_t fill_pattern, uint64_t nbytes) +{ + struct ioat_descriptor *last_desc = NULL; + uint64_t remaining, op_size; + uint64_t vdst; + uint32_t orig_head; + + if (!ioat) { + return -EINVAL; + } + + if (!(ioat->dma_capabilities & SPDK_IOAT_ENGINE_FILL_SUPPORTED)) { + SPDK_ERRLOG("Channel does not support memory fill\n"); + return -1; + } + + orig_head = ioat->head; + + vdst = (uint64_t)dst; + remaining = nbytes; + + while (remaining) { + op_size = remaining; + op_size = spdk_min(op_size, (VALUE_2MB - _2MB_OFFSET(vdst))); + op_size = spdk_min(op_size, ioat->max_xfer_size); + remaining -= op_size; + + last_desc = ioat_prep_fill(ioat, + spdk_vtophys((void *)vdst, NULL), + fill_pattern, + op_size); + + if (remaining == 0 || last_desc == NULL) { + break; + } + + vdst += op_size; + } + + if (last_desc) { + last_desc->callback_fn = cb_fn; + last_desc->callback_arg = cb_arg; + } else { + /* + * Ran out of descriptors in the ring - reset head to leave things as they were + * in case we managed to fill out any descriptors. + */ + ioat->head = orig_head; + return -ENOMEM; + } + + return 0; +} + +int +spdk_ioat_submit_fill(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn, + void *dst, uint64_t fill_pattern, uint64_t nbytes) +{ + int rc; + + rc = spdk_ioat_build_fill(ioat, cb_arg, cb_fn, dst, fill_pattern, nbytes); + if (rc != 0) { + return rc; + } + + spdk_ioat_flush(ioat); + return 0; +} + +uint32_t +spdk_ioat_get_dma_capabilities(struct spdk_ioat_chan *ioat) +{ + if (!ioat) { + return 0; + } + return ioat->dma_capabilities; +} + +int +spdk_ioat_process_events(struct spdk_ioat_chan *ioat) +{ + return ioat_process_channel_events(ioat); +} + +SPDK_LOG_REGISTER_COMPONENT("ioat", SPDK_LOG_IOAT) diff --git a/src/spdk/lib/ioat/ioat_internal.h b/src/spdk/lib/ioat/ioat_internal.h new file mode 100644 index 000000000..19593bb00 --- /dev/null +++ b/src/spdk/lib/ioat/ioat_internal.h @@ -0,0 +1,100 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __IOAT_INTERNAL_H__ +#define __IOAT_INTERNAL_H__ + +#include "spdk/stdinc.h" + +#include "spdk/ioat.h" +#include "spdk/ioat_spec.h" +#include "spdk/queue.h" +#include "spdk/mmio.h" + +/* Allocate 1 << 15 (32K) descriptors per channel by default. */ +#define IOAT_DEFAULT_ORDER 15 + +struct ioat_descriptor { + uint64_t phys_addr; + spdk_ioat_req_cb callback_fn; + void *callback_arg; +}; + +/* One of these per allocated PCI device. */ +struct spdk_ioat_chan { + /* Opaque handle to upper layer */ + struct spdk_pci_device *device; + uint64_t max_xfer_size; + volatile struct spdk_ioat_registers *regs; + + volatile uint64_t *comp_update; + + uint32_t head; + uint32_t tail; + + uint32_t ring_size_order; + uint64_t last_seen; + + struct ioat_descriptor *ring; + union spdk_ioat_hw_desc *hw_ring; + uint32_t dma_capabilities; + + /* tailq entry for attached_chans */ + TAILQ_ENTRY(spdk_ioat_chan) tailq; +}; + +static inline uint32_t +is_ioat_active(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_ACTIVE; +} + +static inline uint32_t +is_ioat_idle(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_IDLE; +} + +static inline uint32_t +is_ioat_halted(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_HALTED; +} + +static inline uint32_t +is_ioat_suspended(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_SUSPENDED; +} + +#endif /* __IOAT_INTERNAL_H__ */ diff --git a/src/spdk/lib/ioat/spdk_ioat.map b/src/spdk/lib/ioat/spdk_ioat.map new file mode 100644 index 000000000..f467da817 --- /dev/null +++ b/src/spdk/lib/ioat/spdk_ioat.map @@ -0,0 +1,17 @@ +{ + global: + + # public functions + spdk_ioat_probe; + spdk_ioat_detach; + spdk_ioat_build_copy; + spdk_ioat_submit_copy; + spdk_ioat_build_fill; + spdk_ioat_submit_fill; + spdk_ioat_flush; + spdk_ioat_process_events; + spdk_ioat_get_dma_capabilities; + spdk_ioat_get_max_descriptors; + + local: *; +}; diff --git a/src/spdk/lib/iscsi/Makefile b/src/spdk/lib/iscsi/Makefile new file mode 100644 index 000000000..2c663d880 --- /dev/null +++ b/src/spdk/lib/iscsi/Makefile @@ -0,0 +1,50 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib +C_SRCS = conn.c \ + init_grp.c iscsi.c md5.c param.c portal_grp.c \ + tgt_node.c iscsi_subsystem.c \ + iscsi_rpc.c task.c +LIBNAME = iscsi +LOCAL_SYS_LIBS = -lcrypto + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_iscsi.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/iscsi/conn.c b/src/spdk/lib/iscsi/conn.c new file mode 100644 index 000000000..4c7a54fcf --- /dev/null +++ b/src/spdk/lib/iscsi/conn.c @@ -0,0 +1,1714 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/event.h" +#include "spdk/likely.h" +#include "spdk/thread.h" +#include "spdk/queue.h" +#include "spdk/trace.h" +#include "spdk/net.h" +#include "spdk/sock.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "iscsi/task.h" +#include "iscsi/conn.h" +#include "iscsi/tgt_node.h" +#include "iscsi/portal_grp.h" + +#define MAKE_DIGEST_WORD(BUF, CRC32C) \ + ( ((*((uint8_t *)(BUF)+0)) = (uint8_t)((uint32_t)(CRC32C) >> 0)), \ + ((*((uint8_t *)(BUF)+1)) = (uint8_t)((uint32_t)(CRC32C) >> 8)), \ + ((*((uint8_t *)(BUF)+2)) = (uint8_t)((uint32_t)(CRC32C) >> 16)), \ + ((*((uint8_t *)(BUF)+3)) = (uint8_t)((uint32_t)(CRC32C) >> 24))) + +#define SPDK_ISCSI_CONNECTION_MEMSET(conn) \ + memset(&(conn)->portal, 0, sizeof(*(conn)) - \ + offsetof(struct spdk_iscsi_conn, portal)); + +struct spdk_iscsi_conn *g_conns_array = MAP_FAILED; +static int g_conns_array_fd = -1; +static char g_shm_name[64]; + +static TAILQ_HEAD(, spdk_iscsi_conn) g_free_conns = TAILQ_HEAD_INITIALIZER(g_free_conns); +static TAILQ_HEAD(, spdk_iscsi_conn) g_active_conns = TAILQ_HEAD_INITIALIZER(g_active_conns); + +static pthread_mutex_t g_conns_mutex = PTHREAD_MUTEX_INITIALIZER; + +static struct spdk_poller *g_shutdown_timer = NULL; + +static void iscsi_conn_sock_cb(void *arg, struct spdk_sock_group *group, + struct spdk_sock *sock); + +static struct spdk_iscsi_conn * +allocate_conn(void) +{ + struct spdk_iscsi_conn *conn; + + pthread_mutex_lock(&g_conns_mutex); + conn = TAILQ_FIRST(&g_free_conns); + if (conn != NULL) { + assert(!conn->is_valid); + TAILQ_REMOVE(&g_free_conns, conn, conn_link); + SPDK_ISCSI_CONNECTION_MEMSET(conn); + conn->is_valid = 1; + + TAILQ_INSERT_TAIL(&g_active_conns, conn, conn_link); + } + pthread_mutex_unlock(&g_conns_mutex); + + return conn; +} + +static void +_free_conn(struct spdk_iscsi_conn *conn) +{ + TAILQ_REMOVE(&g_active_conns, conn, conn_link); + + memset(conn->portal_host, 0, sizeof(conn->portal_host)); + memset(conn->portal_port, 0, sizeof(conn->portal_port)); + conn->is_valid = 0; + + TAILQ_INSERT_TAIL(&g_free_conns, conn, conn_link); +} + +static void +free_conn(struct spdk_iscsi_conn *conn) +{ + pthread_mutex_lock(&g_conns_mutex); + _free_conn(conn); + pthread_mutex_unlock(&g_conns_mutex); +} + +static void +_iscsi_conns_cleanup(void) +{ + if (g_conns_array != MAP_FAILED) { + munmap(g_conns_array, sizeof(struct spdk_iscsi_conn) * + MAX_ISCSI_CONNECTIONS); + g_conns_array = MAP_FAILED; + } + + if (g_conns_array_fd >= 0) { + close(g_conns_array_fd); + g_conns_array_fd = -1; + shm_unlink(g_shm_name); + } +} + +int initialize_iscsi_conns(void) +{ + size_t conns_size = sizeof(struct spdk_iscsi_conn) * MAX_ISCSI_CONNECTIONS; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_init\n"); + + snprintf(g_shm_name, sizeof(g_shm_name), "/spdk_iscsi_conns.%d", spdk_app_get_shm_id()); + g_conns_array_fd = shm_open(g_shm_name, O_RDWR | O_CREAT, 0600); + if (g_conns_array_fd < 0) { + SPDK_ERRLOG("could not shm_open %s\n", g_shm_name); + goto err; + } + + if (ftruncate(g_conns_array_fd, conns_size) != 0) { + SPDK_ERRLOG("could not ftruncate\n"); + goto err; + } + g_conns_array = mmap(0, conns_size, PROT_READ | PROT_WRITE, MAP_SHARED, + g_conns_array_fd, 0); + + if (g_conns_array == MAP_FAILED) { + SPDK_ERRLOG("could not mmap cons array file %s (%d)\n", g_shm_name, errno); + goto err; + } + + memset(g_conns_array, 0, conns_size); + + for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { + 
g_conns_array[i].id = i; + TAILQ_INSERT_TAIL(&g_free_conns, &g_conns_array[i], conn_link); + } + + return 0; + +err: + _iscsi_conns_cleanup(); + + return -1; +} + +static void +iscsi_poll_group_add_conn(struct spdk_iscsi_poll_group *pg, struct spdk_iscsi_conn *conn) +{ + int rc; + + rc = spdk_sock_group_add_sock(pg->sock_group, conn->sock, iscsi_conn_sock_cb, conn); + if (rc < 0) { + SPDK_ERRLOG("Failed to add sock=%p of conn=%p\n", conn->sock, conn); + return; + } + + conn->is_stopped = false; + STAILQ_INSERT_TAIL(&pg->connections, conn, pg_link); +} + +static void +iscsi_poll_group_remove_conn(struct spdk_iscsi_poll_group *pg, struct spdk_iscsi_conn *conn) +{ + int rc; + + assert(conn->sock != NULL); + rc = spdk_sock_group_remove_sock(pg->sock_group, conn->sock); + if (rc < 0) { + SPDK_ERRLOG("Failed to remove sock=%p of conn=%p\n", conn->sock, conn); + } + + conn->is_stopped = true; + STAILQ_REMOVE(&pg->connections, conn, spdk_iscsi_conn, pg_link); +} + +static void +iscsi_conn_start(void *ctx) +{ + struct spdk_iscsi_conn *conn = ctx; + + iscsi_poll_group_add_conn(conn->pg, conn); +} + +int +iscsi_conn_construct(struct spdk_iscsi_portal *portal, + struct spdk_sock *sock) +{ + struct spdk_iscsi_poll_group *pg; + struct spdk_iscsi_conn *conn; + int i, rc; + + conn = allocate_conn(); + if (conn == NULL) { + SPDK_ERRLOG("Could not allocate connection.\n"); + return -1; + } + + pthread_mutex_lock(&g_iscsi.mutex); + conn->timeout = g_iscsi.timeout * spdk_get_ticks_hz(); /* seconds to TSC */ + conn->nopininterval = g_iscsi.nopininterval; + conn->nopininterval *= spdk_get_ticks_hz(); /* seconds to TSC */ + conn->nop_outstanding = false; + conn->data_out_cnt = 0; + conn->data_in_cnt = 0; + conn->disable_chap = portal->group->disable_chap; + conn->require_chap = portal->group->require_chap; + conn->mutual_chap = portal->group->mutual_chap; + conn->chap_group = portal->group->chap_group; + pthread_mutex_unlock(&g_iscsi.mutex); + conn->MaxRecvDataSegmentLength = 8192; /* RFC3720(12.12) */ + + conn->portal = portal; + conn->pg_tag = portal->group->tag; + memcpy(conn->portal_host, portal->host, strlen(portal->host)); + memcpy(conn->portal_port, portal->port, strlen(portal->port)); + conn->sock = sock; + + conn->state = ISCSI_CONN_STATE_INVALID; + conn->login_phase = ISCSI_SECURITY_NEGOTIATION_PHASE; + conn->ttt = 0; + + conn->partial_text_parameter = NULL; + + for (i = 0; i < MAX_CONNECTION_PARAMS; i++) { + conn->conn_param_state_negotiated[i] = false; + } + + for (i = 0; i < MAX_SESSION_PARAMS; i++) { + conn->sess_param_state_negotiated[i] = false; + } + + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY; + + TAILQ_INIT(&conn->write_pdu_list); + TAILQ_INIT(&conn->snack_pdu_list); + TAILQ_INIT(&conn->queued_r2t_tasks); + TAILQ_INIT(&conn->active_r2t_tasks); + TAILQ_INIT(&conn->queued_datain_tasks); + memset(&conn->luns, 0, sizeof(conn->luns)); + + rc = spdk_sock_getaddr(sock, conn->target_addr, sizeof conn->target_addr, NULL, + conn->initiator_addr, sizeof conn->initiator_addr, NULL); + if (rc < 0) { + SPDK_ERRLOG("spdk_sock_getaddr() failed\n"); + goto error_return; + } + + /* set low water mark */ + rc = spdk_sock_set_recvlowat(conn->sock, 1); + if (rc != 0) { + SPDK_ERRLOG("spdk_sock_set_recvlowat() failed\n"); + goto error_return; + } + + /* set default params */ + rc = iscsi_conn_params_init(&conn->params); + if (rc < 0) { + SPDK_ERRLOG("iscsi_conn_params_init() failed\n"); + goto error_return; + } + conn->logout_request_timer = NULL; + conn->logout_timer = NULL; + 
conn->shutdown_timer = NULL; + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Launching connection on acceptor thread\n"); + conn->pending_task_cnt = 0; + + /* Get the first poll group. */ + pg = TAILQ_FIRST(&g_iscsi.poll_group_head); + if (pg == NULL) { + SPDK_ERRLOG("There is no poll group.\n"); + assert(false); + goto error_return; + } + + conn->pg = pg; + spdk_thread_send_msg(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(pg)), + iscsi_conn_start, conn); + return 0; + +error_return: + iscsi_param_free(conn->params); + free_conn(conn); + return -1; +} + +void +iscsi_conn_free_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + iscsi_conn_xfer_complete_cb cb_fn; + void *cb_arg; + + cb_fn = pdu->cb_fn; + cb_arg = pdu->cb_arg; + + assert(cb_fn != NULL); + pdu->cb_fn = NULL; + + if (pdu->task) { + iscsi_task_put(pdu->task); + } + iscsi_put_pdu(pdu); + + cb_fn(cb_arg); +} + +static int +iscsi_conn_free_tasks(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *pdu, *tmp_pdu; + struct spdk_iscsi_task *iscsi_task, *tmp_iscsi_task; + + TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, tmp_pdu) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + iscsi_conn_free_pdu(conn, pdu); + } + + TAILQ_FOREACH_SAFE(iscsi_task, &conn->queued_datain_tasks, link, tmp_iscsi_task) { + if (!iscsi_task->is_queued) { + TAILQ_REMOVE(&conn->queued_datain_tasks, iscsi_task, link); + iscsi_task_put(iscsi_task); + } + } + + /* We have to parse conn->write_pdu_list in the end. In iscsi_conn_free_pdu(), + * iscsi_conn_handle_queued_datain_tasks() may be called, and + * iscsi_conn_handle_queued_datain_tasks() will parse conn->queued_datain_tasks + * and may stack some PDUs to conn->write_pdu_list. Hence when we come here, we + * have to ensure there is no associated task in conn->queued_datain_tasks. 
+ */ + TAILQ_FOREACH_SAFE(pdu, &conn->write_pdu_list, tailq, tmp_pdu) { + TAILQ_REMOVE(&conn->write_pdu_list, pdu, tailq); + iscsi_conn_free_pdu(conn, pdu); + } + + if (conn->pending_task_cnt) { + return -1; + } + + return 0; +} + +static void +iscsi_conn_cleanup_backend(struct spdk_iscsi_conn *conn) +{ + int rc; + struct spdk_iscsi_tgt_node *target; + + if (conn->sess->connections > 1) { + /* connection specific cleanup */ + } else if (!g_iscsi.AllowDuplicateIsid) { + /* clean up all tasks to all LUNs for session */ + target = conn->sess->target; + if (target != NULL) { + rc = iscsi_tgt_node_cleanup_luns(conn, target); + if (rc < 0) { + SPDK_ERRLOG("target abort failed\n"); + } + } + } +} + +static void +iscsi_conn_free(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_sess *sess; + int idx; + uint32_t i; + + pthread_mutex_lock(&g_conns_mutex); + + if (conn->sess == NULL) { + goto end; + } + + idx = -1; + sess = conn->sess; + conn->sess = NULL; + + for (i = 0; i < sess->connections; i++) { + if (sess->conns[i] == conn) { + idx = i; + break; + } + } + + if (idx < 0) { + SPDK_ERRLOG("remove conn not found\n"); + } else { + for (i = idx; i < sess->connections - 1; i++) { + sess->conns[i] = sess->conns[i + 1]; + } + sess->conns[sess->connections - 1] = NULL; + sess->connections--; + + if (sess->connections == 0) { + /* cleanup last connection */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "cleanup last conn free sess\n"); + iscsi_free_sess(sess); + } + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Terminating connections(tsih %d): %d\n", + sess->tsih, sess->connections); + +end: + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "cleanup free conn\n"); + iscsi_param_free(conn->params); + _free_conn(conn); + + pthread_mutex_unlock(&g_conns_mutex); +} + +static void +iscsi_conn_close_lun(struct spdk_iscsi_conn *conn, int lun_id) +{ + struct spdk_iscsi_lun *iscsi_lun; + + iscsi_lun = conn->luns[lun_id]; + if (iscsi_lun == NULL) { + return; + } + + spdk_scsi_lun_free_io_channel(iscsi_lun->desc); + spdk_scsi_lun_close(iscsi_lun->desc); + spdk_poller_unregister(&iscsi_lun->remove_poller); + free(iscsi_lun); + + conn->luns[lun_id] = NULL; +} + +static void +iscsi_conn_close_luns(struct spdk_iscsi_conn *conn) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + iscsi_conn_close_lun(conn, i); + } +} + +static bool +iscsi_conn_check_tasks_for_lun(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun) +{ + struct spdk_iscsi_pdu *pdu, *tmp_pdu; + struct spdk_iscsi_task *task; + + assert(lun != NULL); + + /* We can remove deferred PDUs safely because they are already flushed. */ + TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, tmp_pdu) { + if (lun == pdu->task->scsi.lun) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + iscsi_conn_free_pdu(conn, pdu); + } + } + + TAILQ_FOREACH(task, &conn->queued_datain_tasks, link) { + if (lun == task->scsi.lun) { + return false; + } + } + + /* This check loop works even when connection exits in the middle of LUN hotplug + * because all PDUs in write_pdu_list are removed in iscsi_conn_free_tasks(). 
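+	 * A PDU that still references this LUN means its I/O has not completed
+	 * yet, so the LUN cannot be closed on this poll; the remove poller will
+	 * check again.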
+ */ + TAILQ_FOREACH(pdu, &conn->write_pdu_list, tailq) { + if (pdu->task && lun == pdu->task->scsi.lun) { + return false; + } + } + + return true; +} + +static int +iscsi_conn_remove_lun(void *ctx) +{ + struct spdk_iscsi_lun *iscsi_lun = ctx; + struct spdk_iscsi_conn *conn = iscsi_lun->conn; + struct spdk_scsi_lun *lun = iscsi_lun->lun; + int lun_id = spdk_scsi_lun_get_id(lun); + + if (!iscsi_conn_check_tasks_for_lun(conn, lun)) { + return SPDK_POLLER_BUSY; + } + iscsi_conn_close_lun(conn, lun_id); + return SPDK_POLLER_BUSY; +} + +static void +_iscsi_conn_hotremove_lun(void *ctx) +{ + struct spdk_iscsi_lun *iscsi_lun = ctx; + struct spdk_iscsi_conn *conn = iscsi_lun->conn; + struct spdk_scsi_lun *lun = iscsi_lun->lun; + + assert(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)) == + spdk_get_thread()); + + /* If a connection is already in stating status, just return */ + if (conn->state >= ISCSI_CONN_STATE_EXITING) { + return; + } + + iscsi_clear_all_transfer_task(conn, lun, NULL); + + iscsi_lun->remove_poller = SPDK_POLLER_REGISTER(iscsi_conn_remove_lun, iscsi_lun, + 1000); +} + +static void +iscsi_conn_hotremove_lun(struct spdk_scsi_lun *lun, void *remove_ctx) +{ + struct spdk_iscsi_conn *conn = remove_ctx; + int lun_id = spdk_scsi_lun_get_id(lun); + struct spdk_iscsi_lun *iscsi_lun; + + iscsi_lun = conn->luns[lun_id]; + if (iscsi_lun == NULL) { + SPDK_ERRLOG("LUN hotplug was notified to the unallocated LUN %d.\n", lun_id); + return; + } + + spdk_thread_send_msg(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)), + _iscsi_conn_hotremove_lun, iscsi_lun); +} + +static int +iscsi_conn_open_lun(struct spdk_iscsi_conn *conn, int lun_id, + struct spdk_scsi_lun *lun) +{ + int rc; + struct spdk_iscsi_lun *iscsi_lun; + + iscsi_lun = calloc(1, sizeof(*iscsi_lun)); + if (iscsi_lun == NULL) { + return -ENOMEM; + } + + iscsi_lun->conn = conn; + iscsi_lun->lun = lun; + + rc = spdk_scsi_lun_open(lun, iscsi_conn_hotremove_lun, conn, &iscsi_lun->desc); + if (rc != 0) { + free(iscsi_lun); + return rc; + } + + rc = spdk_scsi_lun_allocate_io_channel(iscsi_lun->desc); + if (rc != 0) { + spdk_scsi_lun_close(iscsi_lun->desc); + free(iscsi_lun); + return rc; + } + + conn->luns[lun_id] = iscsi_lun; + + return 0; +} + +static void +iscsi_conn_open_luns(struct spdk_iscsi_conn *conn) +{ + int i, rc; + struct spdk_scsi_lun *lun; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + lun = spdk_scsi_dev_get_lun(conn->dev, i); + if (lun == NULL) { + continue; + } + + rc = iscsi_conn_open_lun(conn, i, lun); + if (rc != 0) { + goto error; + } + } + + return; + +error: + iscsi_conn_close_luns(conn); +} + +/** + * This function will stop executing the specified connection. 
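+ * It must be called on the connection's poll group thread, and only after
+ * the connection has reached the EXITED state with no data-in or data-out
+ * operations outstanding (see the asserts below).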
+ */ +static void +iscsi_conn_stop(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_tgt_node *target; + + assert(conn->state == ISCSI_CONN_STATE_EXITED); + assert(conn->data_in_cnt == 0); + assert(conn->data_out_cnt == 0); + + if (conn->sess != NULL && + conn->sess->session_type == SESSION_TYPE_NORMAL && + conn->full_feature) { + target = conn->sess->target; + pthread_mutex_lock(&target->mutex); + target->num_active_conns--; + pthread_mutex_unlock(&target->mutex); + + iscsi_conn_close_luns(conn); + } + + assert(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)) == + spdk_get_thread()); +} + +static int +_iscsi_conn_check_shutdown(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + int rc; + + rc = iscsi_conn_free_tasks(conn); + if (rc < 0) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&conn->shutdown_timer); + + iscsi_conn_stop(conn); + iscsi_conn_free(conn); + + return SPDK_POLLER_BUSY; +} + +static void +_iscsi_conn_destruct(struct spdk_iscsi_conn *conn) +{ + int rc; + + iscsi_poll_group_remove_conn(conn->pg, conn); + spdk_sock_close(&conn->sock); + iscsi_clear_all_transfer_task(conn, NULL, NULL); + spdk_poller_unregister(&conn->logout_request_timer); + spdk_poller_unregister(&conn->logout_timer); + + rc = iscsi_conn_free_tasks(conn); + if (rc < 0) { + /* The connection cannot be freed yet. Check back later. */ + conn->shutdown_timer = SPDK_POLLER_REGISTER(_iscsi_conn_check_shutdown, conn, 1000); + } else { + iscsi_conn_stop(conn); + iscsi_conn_free(conn); + } +} + +static int +_iscsi_conn_check_pending_tasks(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->dev != NULL && + spdk_scsi_dev_has_pending_tasks(conn->dev, conn->initiator_port)) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&conn->shutdown_timer); + + _iscsi_conn_destruct(conn); + + return SPDK_POLLER_BUSY; +} + +void +iscsi_conn_destruct(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *pdu; + struct spdk_iscsi_task *task; + int opcode; + + /* If a connection is already in exited status, just return */ + if (conn->state >= ISCSI_CONN_STATE_EXITED) { + return; + } + + conn->state = ISCSI_CONN_STATE_EXITED; + + /* + * Each connection pre-allocates its next PDU - make sure these get + * freed here. + */ + pdu = conn->pdu_in_progress; + if (pdu) { + /* remove the task left in the PDU too. 
*/ + task = pdu->task; + if (task) { + opcode = pdu->bhs.opcode; + switch (opcode) { + case ISCSI_OP_SCSI: + case ISCSI_OP_SCSI_DATAOUT: + spdk_scsi_task_process_abort(&task->scsi); + iscsi_task_cpl(&task->scsi); + break; + default: + SPDK_ERRLOG("unexpected opcode %x\n", opcode); + iscsi_task_put(task); + break; + } + } + iscsi_put_pdu(pdu); + conn->pdu_in_progress = NULL; + } + + if (conn->sess != NULL && conn->pending_task_cnt > 0) { + iscsi_conn_cleanup_backend(conn); + } + + if (conn->dev != NULL && + spdk_scsi_dev_has_pending_tasks(conn->dev, conn->initiator_port)) { + conn->shutdown_timer = SPDK_POLLER_REGISTER(_iscsi_conn_check_pending_tasks, conn, 1000); + } else { + _iscsi_conn_destruct(conn); + } +} + +int +iscsi_get_active_conns(struct spdk_iscsi_tgt_node *target) +{ + struct spdk_iscsi_conn *conn; + int num = 0; + + if (g_conns_array == MAP_FAILED) { + return 0; + } + + pthread_mutex_lock(&g_conns_mutex); + TAILQ_FOREACH(conn, &g_active_conns, conn_link) { + if (target == NULL || conn->target == target) { + num++; + } + } + pthread_mutex_unlock(&g_conns_mutex); + return num; +} + +static void +iscsi_conn_check_shutdown_cb(void *arg1) +{ + _iscsi_conns_cleanup(); + shutdown_iscsi_conns_done(); +} + +static int +iscsi_conn_check_shutdown(void *arg) +{ + if (iscsi_get_active_conns(NULL) != 0) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&g_shutdown_timer); + + spdk_thread_send_msg(spdk_get_thread(), iscsi_conn_check_shutdown_cb, NULL); + + return SPDK_POLLER_BUSY; +} + +static void +iscsi_send_logout_request(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_async *rsph; + + rsp_pdu = iscsi_get_pdu(conn); + assert(rsp_pdu != NULL); + + rsph = (struct iscsi_bhs_async *)&rsp_pdu->bhs; + rsp_pdu->data = NULL; + + rsph->opcode = ISCSI_OP_ASYNC; + to_be32(&rsph->ffffffff, 0xFFFFFFFF); + rsph->async_event = 1; + to_be16(&rsph->param3, ISCSI_LOGOUT_REQUEST_TIMEOUT); + + to_be32(&rsph->stat_sn, conn->StatSN); + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); +} + +static int +logout_request_timeout(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } + + return SPDK_POLLER_BUSY; +} + +/* If the connection is running and logout is not requested yet, request logout + * to initiator and wait for the logout process to start. + */ +static void +_iscsi_conn_request_logout(void *ctx) +{ + struct spdk_iscsi_conn *conn = ctx; + + if (conn->state > ISCSI_CONN_STATE_RUNNING || + conn->logout_request_timer != NULL) { + return; + } + + iscsi_send_logout_request(conn); + + conn->logout_request_timer = SPDK_POLLER_REGISTER(logout_request_timeout, + conn, ISCSI_LOGOUT_REQUEST_TIMEOUT * 1000000); +} + +static void +iscsi_conn_request_logout(struct spdk_iscsi_conn *conn) +{ + struct spdk_thread *thread; + + if (conn->state == ISCSI_CONN_STATE_INVALID) { + /* Move it to EXITING state if the connection is in login. 
*/ + conn->state = ISCSI_CONN_STATE_EXITING; + } else if (conn->state == ISCSI_CONN_STATE_RUNNING && + conn->logout_request_timer == NULL) { + thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)); + spdk_thread_send_msg(thread, _iscsi_conn_request_logout, conn); + } +} + +void +iscsi_conns_request_logout(struct spdk_iscsi_tgt_node *target) +{ + struct spdk_iscsi_conn *conn; + + if (g_conns_array == MAP_FAILED) { + return; + } + + pthread_mutex_lock(&g_conns_mutex); + TAILQ_FOREACH(conn, &g_active_conns, conn_link) { + if (target == NULL || conn->target == target) { + iscsi_conn_request_logout(conn); + } + } + pthread_mutex_unlock(&g_conns_mutex); +} + +void +shutdown_iscsi_conns(void) +{ + iscsi_conns_request_logout(NULL); + + g_shutdown_timer = SPDK_POLLER_REGISTER(iscsi_conn_check_shutdown, NULL, 1000); +} + +/* Do not set conn->state if the connection has already started exiting. + * This ensures we do not move a connection from EXITED state back to EXITING. + */ +static void +_iscsi_conn_drop(void *ctx) +{ + struct spdk_iscsi_conn *conn = ctx; + + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } +} + +int +iscsi_drop_conns(struct spdk_iscsi_conn *conn, const char *conn_match, + int drop_all) +{ + struct spdk_iscsi_conn *xconn; + const char *xconn_match; + struct spdk_thread *thread; + int num; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_drop_conns\n"); + + num = 0; + pthread_mutex_lock(&g_conns_mutex); + if (g_conns_array == MAP_FAILED) { + goto exit; + } + + TAILQ_FOREACH(xconn, &g_active_conns, conn_link) { + if (xconn == conn) { + continue; + } + + if (!drop_all && xconn->initiator_port == NULL) { + continue; + } + + xconn_match = + drop_all ? xconn->initiator_name : spdk_scsi_port_get_name(xconn->initiator_port); + + if (!strcasecmp(conn_match, xconn_match) && + conn->target == xconn->target) { + + if (num == 0) { + /* + * Only print this message before we report the + * first dropped connection. + */ + SPDK_ERRLOG("drop old connections %s by %s\n", + conn->target->name, conn_match); + } + + SPDK_ERRLOG("exiting conn by %s (%s)\n", + xconn_match, xconn->initiator_addr); + if (xconn->sess != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "TSIH=%u\n", xconn->sess->tsih); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "TSIH=xx\n"); + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CID=%u\n", xconn->cid); + + thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(xconn->pg)); + spdk_thread_send_msg(thread, _iscsi_conn_drop, xconn); + + num++; + } + } + +exit: + pthread_mutex_unlock(&g_conns_mutex); + + if (num != 0) { + SPDK_ERRLOG("exiting %d conns\n", num); + } + + return 0; +} + +static int +_iscsi_conn_abort_queued_datain_task(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_task *subtask; + uint32_t remaining_size; + + if (conn->data_in_cnt >= MAX_LARGE_DATAIN_PER_CONNECTION) { + return -1; + } + + assert(task->current_datain_offset <= task->scsi.transfer_len); + /* Stop split and abort read I/O for remaining data. 
*/ + if (task->current_datain_offset < task->scsi.transfer_len) { + remaining_size = task->scsi.transfer_len - task->current_datain_offset; + subtask = iscsi_task_get(conn, task, iscsi_task_cpl); + assert(subtask != NULL); + subtask->scsi.offset = task->current_datain_offset; + subtask->scsi.length = remaining_size; + spdk_scsi_task_set_data(&subtask->scsi, NULL, 0); + task->current_datain_offset += subtask->scsi.length; + + subtask->scsi.transfer_len = subtask->scsi.length; + spdk_scsi_task_process_abort(&subtask->scsi); + iscsi_task_cpl(&subtask->scsi); + } + + /* Remove the primary task from the list because all subtasks are submitted + * or aborted. + */ + assert(task->current_datain_offset == task->scsi.transfer_len); + TAILQ_REMOVE(&conn->queued_datain_tasks, task, link); + return 0; +} + +int +iscsi_conn_abort_queued_datain_task(struct spdk_iscsi_conn *conn, + uint32_t ref_task_tag) +{ + struct spdk_iscsi_task *task; + + TAILQ_FOREACH(task, &conn->queued_datain_tasks, link) { + if (task->tag == ref_task_tag) { + return _iscsi_conn_abort_queued_datain_task(conn, task); + } + } + + return 0; +} + +int +iscsi_conn_abort_queued_datain_tasks(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun, + struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task, *task_tmp; + struct spdk_iscsi_pdu *pdu_tmp; + int rc; + + TAILQ_FOREACH_SAFE(task, &conn->queued_datain_tasks, link, task_tmp) { + pdu_tmp = iscsi_task_get_pdu(task); + if ((lun == NULL || lun == task->scsi.lun) && + (pdu == NULL || (spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn)))) { + rc = _iscsi_conn_abort_queued_datain_task(conn, task); + if (rc != 0) { + return rc; + } + } + } + + return 0; +} + +int +iscsi_conn_handle_queued_datain_tasks(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_task *task; + + while (!TAILQ_EMPTY(&conn->queued_datain_tasks) && + conn->data_in_cnt < MAX_LARGE_DATAIN_PER_CONNECTION) { + task = TAILQ_FIRST(&conn->queued_datain_tasks); + assert(task->current_datain_offset <= task->scsi.transfer_len); + if (task->current_datain_offset < task->scsi.transfer_len) { + struct spdk_iscsi_task *subtask; + uint32_t remaining_size = 0; + + remaining_size = task->scsi.transfer_len - task->current_datain_offset; + subtask = iscsi_task_get(conn, task, iscsi_task_cpl); + assert(subtask != NULL); + subtask->scsi.offset = task->current_datain_offset; + spdk_scsi_task_set_data(&subtask->scsi, NULL, 0); + + if (spdk_scsi_dev_get_lun(conn->dev, task->lun_id) == NULL) { + /* Stop submitting split read I/Os for remaining data. 
*/ + TAILQ_REMOVE(&conn->queued_datain_tasks, task, link); + task->current_datain_offset += remaining_size; + assert(task->current_datain_offset == task->scsi.transfer_len); + subtask->scsi.transfer_len = remaining_size; + spdk_scsi_task_process_null_lun(&subtask->scsi); + iscsi_task_cpl(&subtask->scsi); + return 0; + } + + subtask->scsi.length = spdk_min(SPDK_BDEV_LARGE_BUF_MAX_SIZE, remaining_size); + task->current_datain_offset += subtask->scsi.length; + iscsi_queue_task(conn, subtask); + } + if (task->current_datain_offset == task->scsi.transfer_len) { + TAILQ_REMOVE(&conn->queued_datain_tasks, task, link); + } + } + return 0; +} + +void +iscsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_iscsi_task *task = iscsi_task_from_scsi_task(scsi_task); + + iscsi_task_mgmt_response(task->conn, task); + iscsi_task_put(task); +} + +static void +iscsi_task_copy_to_rsp_scsi_status(struct spdk_iscsi_task *primary, + struct spdk_scsi_task *task) +{ + memcpy(primary->rsp_sense_data, task->sense_data, task->sense_data_len); + primary->rsp_sense_data_len = task->sense_data_len; + primary->rsp_scsi_status = task->status; +} + +static void +iscsi_task_copy_from_rsp_scsi_status(struct spdk_scsi_task *task, + struct spdk_iscsi_task *primary) +{ + memcpy(task->sense_data, primary->rsp_sense_data, + primary->rsp_sense_data_len); + task->sense_data_len = primary->rsp_sense_data_len; + task->status = primary->rsp_scsi_status; +} + +static void +process_completed_read_subtask_list(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *primary) +{ + struct spdk_iscsi_task *subtask, *tmp; + + TAILQ_FOREACH_SAFE(subtask, &primary->subtask_list, subtask_link, tmp) { + if (subtask->scsi.offset == primary->bytes_completed) { + TAILQ_REMOVE(&primary->subtask_list, subtask, subtask_link); + primary->bytes_completed += subtask->scsi.length; + iscsi_task_response(conn, subtask); + iscsi_task_put(subtask); + } else { + break; + } + } + + if (primary->bytes_completed == primary->scsi.transfer_len) { + iscsi_task_put(primary); + } +} + +static void +process_read_task_completion(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_task *primary) +{ + struct spdk_iscsi_task *tmp; + + /* If the status of the completed subtask is the first failure, + * copy it to out-of-order subtasks and remember it as the status + * of the command, + * + * Even if the status of the completed task is success, + * there are any failed subtask ever, copy the first failed status + * to it. 
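+	 *
+	 * In other words, the first non-GOOD status seen for the command is
+	 * recorded in the primary task and propagated to every subtask, whether
+	 * that subtask completed before or after the failure.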
+ */ + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + if (primary->rsp_scsi_status == SPDK_SCSI_STATUS_GOOD) { + TAILQ_FOREACH(tmp, &primary->subtask_list, subtask_link) { + spdk_scsi_task_copy_status(&tmp->scsi, &task->scsi); + } + iscsi_task_copy_to_rsp_scsi_status(primary, &task->scsi); + } + } else if (primary->rsp_scsi_status != SPDK_SCSI_STATUS_GOOD) { + iscsi_task_copy_from_rsp_scsi_status(&task->scsi, primary); + } + + if (task == primary) { + primary->bytes_completed = task->scsi.length; + /* For non split read I/O */ + assert(primary->bytes_completed == task->scsi.transfer_len); + iscsi_task_response(conn, task); + iscsi_task_put(task); + } else { + if (task->scsi.offset != primary->bytes_completed) { + TAILQ_FOREACH(tmp, &primary->subtask_list, subtask_link) { + if (task->scsi.offset < tmp->scsi.offset) { + TAILQ_INSERT_BEFORE(tmp, task, subtask_link); + return; + } + } + + TAILQ_INSERT_TAIL(&primary->subtask_list, task, subtask_link); + } else { + TAILQ_INSERT_HEAD(&primary->subtask_list, task, subtask_link); + process_completed_read_subtask_list(conn, primary); + } + } +} + +static void +process_non_read_task_completion(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_task *primary) +{ + primary->bytes_completed += task->scsi.length; + + /* If the status of the subtask is the first failure, remember it as + * the status of the command and set it to the status of the primary + * task later. + * + * If the first failed task is the primary, two copies can be avoided + * but code simplicity is prioritized. + */ + if (task->scsi.status == SPDK_SCSI_STATUS_GOOD) { + if (task != primary) { + primary->scsi.data_transferred += task->scsi.data_transferred; + } + } else if (primary->rsp_scsi_status == SPDK_SCSI_STATUS_GOOD) { + iscsi_task_copy_to_rsp_scsi_status(primary, &task->scsi); + } + + if (primary->bytes_completed == primary->scsi.transfer_len) { + /* + * Check if this is the last task completed for an iSCSI write + * that required child subtasks. If task != primary, we know + * for sure that it was part of an iSCSI write with child subtasks. + * The trickier case is when the last task completed was the initial + * task - in this case the task will have a smaller length than + * the overall transfer length. + */ + if (task != primary || task->scsi.length != task->scsi.transfer_len) { + /* If LUN is removed in the middle of the iSCSI write sequence, + * primary might complete the write to the initiator because it is not + * ensured that the initiator will send all data requested by R2Ts. + * + * We check it and skip the following if primary is completed. (see + * iscsi_clear_all_transfer_task() in iscsi.c.) 
+ */ + if (primary->is_r2t_active) { + if (primary->rsp_scsi_status != SPDK_SCSI_STATUS_GOOD) { + iscsi_task_copy_from_rsp_scsi_status(&primary->scsi, primary); + } + iscsi_task_response(conn, primary); + iscsi_del_transfer_task(conn, primary->tag); + } + } else { + iscsi_task_response(conn, task); + } + } + iscsi_task_put(task); +} + +void +iscsi_task_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_iscsi_task *primary; + struct spdk_iscsi_task *task = iscsi_task_from_scsi_task(scsi_task); + struct spdk_iscsi_conn *conn = task->conn; + struct spdk_iscsi_pdu *pdu = task->pdu; + + spdk_trace_record(TRACE_ISCSI_TASK_DONE, conn->id, 0, (uintptr_t)task, 0); + + task->is_queued = false; + primary = iscsi_task_get_primary(task); + + if (iscsi_task_is_read(primary)) { + process_read_task_completion(conn, task, primary); + } else { + process_non_read_task_completion(conn, task, primary); + } + if (!task->parent) { + spdk_trace_record(TRACE_ISCSI_PDU_COMPLETED, 0, 0, (uintptr_t)pdu, 0); + } +} + +static void +iscsi_conn_send_nopin(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_nop_in *rsp; + /* Only send nopin if we have logged in and are in a normal session. */ + if (conn->sess == NULL || + !conn->full_feature || + !iscsi_param_eq_val(conn->sess->params, "SessionType", "Normal")) { + return; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "send NOPIN isid=%"PRIx64", tsih=%u, cid=%u\n", + conn->sess->isid, conn->sess->tsih, conn->cid); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + rsp_pdu = iscsi_get_pdu(conn); + rsp = (struct iscsi_bhs_nop_in *) &rsp_pdu->bhs; + rsp_pdu->data = NULL; + /* + * iscsi_get_pdu() memset's the PDU for us, so only fill out the needed + * fields. + */ + rsp->opcode = ISCSI_OP_NOPIN; + rsp->flags = 0x80; + /* + * Technically the to_be32() is not needed here, since + * to_be32(0xFFFFFFFU) returns 0xFFFFFFFFU. + */ + to_be32(&rsp->itt, 0xFFFFFFFFU); + to_be32(&rsp->ttt, conn->id); + to_be32(&rsp->stat_sn, conn->StatSN); + to_be32(&rsp->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsp->max_cmd_sn, conn->sess->MaxCmdSN); + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); + conn->last_nopin = spdk_get_ticks(); + conn->nop_outstanding = true; +} + +void +iscsi_conn_handle_nop(struct spdk_iscsi_conn *conn) +{ + uint64_t tsc; + + /** + * This function will be executed by nop_poller of iSCSI polling group, so + * we need to check the connection state first, then do the nop interval + * expiration check work. + */ + if ((conn->state == ISCSI_CONN_STATE_EXITED) || + (conn->state == ISCSI_CONN_STATE_EXITING)) { + return; + } + + /* Check for nop interval expiration */ + tsc = spdk_get_ticks(); + if (conn->nop_outstanding) { + if ((tsc - conn->last_nopin) > conn->timeout) { + SPDK_ERRLOG("Timed out waiting for NOP-Out response from initiator\n"); + SPDK_ERRLOG(" tsc=0x%lx, last_nopin=0x%lx\n", tsc, conn->last_nopin); + SPDK_ERRLOG(" initiator=%s, target=%s\n", conn->initiator_name, + conn->target_short_name); + conn->state = ISCSI_CONN_STATE_EXITING; + } + } else if (tsc - conn->last_nopin > conn->nopininterval) { + iscsi_conn_send_nopin(conn); + } +} + +/** + * \brief Reads data for the specified iSCSI connection from its TCP socket. + * + * The TCP socket is marked as non-blocking, so this function may not read + * all data requested. 
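+ * A return value of 0 means no data was available (for example EAGAIN);
+ * the caller should simply retry once the socket becomes readable again.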
+ * + * Returns SPDK_ISCSI_CONNECTION_FATAL if the recv() operation indicates a fatal + * error with the TCP connection (including if the TCP connection was closed + * unexpectedly. + * + * Otherwise returns the number of bytes successfully read. + */ +int +iscsi_conn_read_data(struct spdk_iscsi_conn *conn, int bytes, + void *buf) +{ + int ret; + + if (bytes == 0) { + return 0; + } + + ret = spdk_sock_recv(conn->sock, buf, bytes); + + if (ret > 0) { + spdk_trace_record(TRACE_ISCSI_READ_FROM_SOCKET_DONE, conn->id, ret, 0, 0); + return ret; + } + + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } + + /* For connect reset issue, do not output error log */ + if (errno == ECONNRESET) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_sock_recv() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + } else { + SPDK_ERRLOG("spdk_sock_recv() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + } + } + + /* connection closed */ + return SPDK_ISCSI_CONNECTION_FATAL; +} + +int +iscsi_conn_readv_data(struct spdk_iscsi_conn *conn, + struct iovec *iov, int iovcnt) +{ + int ret; + + if (iov == NULL || iovcnt == 0) { + return 0; + } + + if (iovcnt == 1) { + return iscsi_conn_read_data(conn, iov[0].iov_len, + iov[0].iov_base); + } + + ret = spdk_sock_readv(conn->sock, iov, iovcnt); + + if (ret > 0) { + spdk_trace_record(TRACE_ISCSI_READ_FROM_SOCKET_DONE, conn->id, ret, 0, 0); + return ret; + } + + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } + + /* For connect reset issue, do not output error log */ + if (errno == ECONNRESET) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_sock_readv() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + } else { + SPDK_ERRLOG("spdk_sock_readv() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + } + } + + /* connection closed */ + return SPDK_ISCSI_CONNECTION_FATAL; +} + +static bool +iscsi_is_free_pdu_deferred(struct spdk_iscsi_pdu *pdu) +{ + if (pdu == NULL) { + return false; + } + + if (pdu->bhs.opcode == ISCSI_OP_R2T || + pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + return true; + } + + return false; +} + +static int +iscsi_dif_verify(struct spdk_iscsi_pdu *pdu, struct spdk_dif_ctx *dif_ctx) +{ + struct iovec iov; + struct spdk_dif_error err_blk = {}; + uint32_t num_blocks; + int rc; + + iov.iov_base = pdu->data; + iov.iov_len = pdu->data_buf_len; + num_blocks = pdu->data_buf_len / dif_ctx->block_size; + + rc = spdk_dif_verify(&iov, 1, num_blocks, dif_ctx, &err_blk); + if (rc != 0) { + SPDK_ERRLOG("DIF error detected. 
type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + } + + return rc; +} + +static void +_iscsi_conn_pdu_write_done(void *cb_arg, int err) +{ + struct spdk_iscsi_pdu *pdu = cb_arg; + struct spdk_iscsi_conn *conn = pdu->conn; + + assert(conn != NULL); + + if (spdk_unlikely(conn->state >= ISCSI_CONN_STATE_EXITING)) { + /* The other policy will recycle the resource */ + return; + } + + TAILQ_REMOVE(&conn->write_pdu_list, pdu, tailq); + + if (err != 0) { + conn->state = ISCSI_CONN_STATE_EXITING; + } else { + spdk_trace_record(TRACE_ISCSI_FLUSH_WRITEBUF_DONE, conn->id, pdu->mapped_length, (uintptr_t)pdu, 0); + } + + if ((conn->full_feature) && + (conn->sess->ErrorRecoveryLevel >= 1) && + iscsi_is_free_pdu_deferred(pdu)) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "stat_sn=%d\n", + from_be32(&pdu->bhs.stat_sn)); + TAILQ_INSERT_TAIL(&conn->snack_pdu_list, pdu, + tailq); + } else { + iscsi_conn_free_pdu(conn, pdu); + } +} + +void +iscsi_conn_pdu_generic_complete(void *cb_arg) +{ +} + +void +iscsi_conn_write_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu, + iscsi_conn_xfer_complete_cb cb_fn, + void *cb_arg) +{ + uint32_t crc32c; + ssize_t rc; + + if (spdk_unlikely(pdu->dif_insert_or_strip)) { + rc = iscsi_dif_verify(pdu, &pdu->dif_ctx); + if (rc != 0) { + iscsi_conn_free_pdu(conn, pdu); + conn->state = ISCSI_CONN_STATE_EXITING; + return; + } + } + + if (pdu->bhs.opcode != ISCSI_OP_LOGIN_RSP) { + /* Header Digest */ + if (conn->header_digest) { + crc32c = iscsi_pdu_calc_header_digest(pdu); + MAKE_DIGEST_WORD(pdu->header_digest, crc32c); + } + + /* Data Digest */ + if (conn->data_digest && DGET24(pdu->bhs.data_segment_len) != 0) { + crc32c = iscsi_pdu_calc_data_digest(pdu); + MAKE_DIGEST_WORD(pdu->data_digest, crc32c); + } + } + + pdu->cb_fn = cb_fn; + pdu->cb_arg = cb_arg; + TAILQ_INSERT_TAIL(&conn->write_pdu_list, pdu, tailq); + + if (spdk_unlikely(conn->state >= ISCSI_CONN_STATE_EXITING)) { + return; + } + pdu->sock_req.iovcnt = iscsi_build_iovs(conn, pdu->iov, SPDK_COUNTOF(pdu->iov), pdu, + &pdu->mapped_length); + pdu->sock_req.cb_fn = _iscsi_conn_pdu_write_done; + pdu->sock_req.cb_arg = pdu; + + spdk_trace_record(TRACE_ISCSI_FLUSH_WRITEBUF_START, conn->id, pdu->mapped_length, (uintptr_t)pdu, + pdu->sock_req.iovcnt); + spdk_sock_writev_async(conn->sock, &pdu->sock_req); +} + +static void +iscsi_conn_sock_cb(void *arg, struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_iscsi_conn *conn = arg; + int rc; + + assert(conn != NULL); + + if ((conn->state == ISCSI_CONN_STATE_EXITED) || + (conn->state == ISCSI_CONN_STATE_EXITING)) { + return; + } + + /* Handle incoming PDUs */ + rc = iscsi_handle_incoming_pdus(conn); + if (rc < 0) { + conn->state = ISCSI_CONN_STATE_EXITING; + } +} + +static void +iscsi_conn_full_feature_migrate(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->state >= ISCSI_CONN_STATE_EXITING) { + /* Connection is being exited before this callback is executed. */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connection is already exited.\n"); + return; + } + + if (conn->sess->session_type == SESSION_TYPE_NORMAL) { + iscsi_conn_open_luns(conn); + } + + /* Add this connection to the assigned poll group. 
*/ + iscsi_poll_group_add_conn(conn->pg, conn); +} + +static struct spdk_iscsi_poll_group *g_next_pg = NULL; + +void +iscsi_conn_schedule(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_poll_group *pg; + struct spdk_iscsi_tgt_node *target; + + if (conn->sess->session_type != SESSION_TYPE_NORMAL) { + /* Leave all non-normal sessions on the acceptor + * thread. */ + return; + } + pthread_mutex_lock(&g_iscsi.mutex); + + target = conn->sess->target; + pthread_mutex_lock(&target->mutex); + target->num_active_conns++; + if (target->num_active_conns == 1) { + /** + * This is the only active connection for this target node. + * Pick a poll group using round-robin. + */ + if (g_next_pg == NULL) { + g_next_pg = TAILQ_FIRST(&g_iscsi.poll_group_head); + assert(g_next_pg != NULL); + } + + pg = g_next_pg; + g_next_pg = TAILQ_NEXT(g_next_pg, link); + + /* Save the pg in the target node so it can be used for any other connections to this target node. */ + target->pg = pg; + } else { + /** + * There are other active connections for this target node. + */ + pg = target->pg; + } + + pthread_mutex_unlock(&target->mutex); + pthread_mutex_unlock(&g_iscsi.mutex); + + assert(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)) == + spdk_get_thread()); + + /* Remove this connection from the previous poll group */ + iscsi_poll_group_remove_conn(conn->pg, conn); + + conn->last_nopin = spdk_get_ticks(); + conn->pg = pg; + + spdk_thread_send_msg(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(pg)), + iscsi_conn_full_feature_migrate, conn); +} + +static int +logout_timeout(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } + + return SPDK_POLLER_BUSY; +} + +void +iscsi_conn_logout(struct spdk_iscsi_conn *conn) +{ + conn->is_logged_out = true; + conn->logout_timer = SPDK_POLLER_REGISTER(logout_timeout, conn, ISCSI_LOGOUT_TIMEOUT * 1000000); +} + +SPDK_TRACE_REGISTER_FN(iscsi_conn_trace, "iscsi_conn", TRACE_GROUP_ISCSI) +{ + spdk_trace_register_owner(OWNER_ISCSI_CONN, 'c'); + spdk_trace_register_object(OBJECT_ISCSI_PDU, 'p'); + spdk_trace_register_description("ISCSI_READ_DONE", TRACE_ISCSI_READ_FROM_SOCKET_DONE, + OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("ISCSI_WRITE_START", TRACE_ISCSI_FLUSH_WRITEBUF_START, + OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, "iovec: "); + spdk_trace_register_description("ISCSI_WRITE_DONE", TRACE_ISCSI_FLUSH_WRITEBUF_DONE, + OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("ISCSI_READ_PDU", TRACE_ISCSI_READ_PDU, + OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 1, 0, "opc: "); + spdk_trace_register_description("ISCSI_TASK_DONE", TRACE_ISCSI_TASK_DONE, + OWNER_ISCSI_CONN, OBJECT_SCSI_TASK, 0, 0, ""); + spdk_trace_register_description("ISCSI_TASK_QUEUE", TRACE_ISCSI_TASK_QUEUE, + OWNER_ISCSI_CONN, OBJECT_SCSI_TASK, 1, 1, "pdu: "); + spdk_trace_register_description("ISCSI_TASK_EXECUTED", TRACE_ISCSI_TASK_EXECUTED, + OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 0, 0, ""); + spdk_trace_register_description("ISCSI_PDU_COMPLETED", TRACE_ISCSI_PDU_COMPLETED, + OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 0, 0, ""); +} + +void +iscsi_conn_info_json(struct spdk_json_write_ctx *w, struct spdk_iscsi_conn *conn) +{ + uint16_t tsih; + + if (!conn->is_valid) { + return; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "id", conn->id); + + spdk_json_write_named_int32(w, "cid", conn->cid); + + /* + * If we try to return data for a connection 
that has not + * logged in yet, the session will not be set. So in this + * case, return -1 for the tsih rather than segfaulting + * on the null conn->sess. + */ + if (conn->sess == NULL) { + tsih = -1; + } else { + tsih = conn->sess->tsih; + } + spdk_json_write_named_int32(w, "tsih", tsih); + + spdk_json_write_named_string(w, "initiator_addr", conn->initiator_addr); + + spdk_json_write_named_string(w, "target_addr", conn->target_addr); + + spdk_json_write_named_string(w, "target_node_name", conn->target_short_name); + + spdk_json_write_named_string(w, "thread_name", + spdk_thread_get_name(spdk_get_thread())); + + spdk_json_write_object_end(w); +} diff --git a/src/spdk/lib/iscsi/conn.h b/src/spdk/lib/iscsi/conn.h new file mode 100644 index 000000000..a85d2ddeb --- /dev/null +++ b/src/spdk/lib/iscsi/conn.h @@ -0,0 +1,237 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_ISCSI_CONN_H +#define SPDK_ISCSI_CONN_H + +#include "spdk/stdinc.h" + +#include "iscsi/iscsi.h" +#include "spdk/queue.h" +#include "spdk/cpuset.h" +#include "spdk/scsi.h" + +/* + * MAX_CONNECTION_PARAMS: The numbers of the params in conn_param_table + * MAX_SESSION_PARAMS: The numbers of the params in sess_param_table + */ +#define MAX_CONNECTION_PARAMS 14 +#define MAX_SESSION_PARAMS 19 + +#define MAX_ADDRBUF 64 +#define MAX_INITIATOR_ADDR (MAX_ADDRBUF) +#define MAX_TARGET_ADDR (MAX_ADDRBUF) + +#define OWNER_ISCSI_CONN 0x1 + +#define OBJECT_ISCSI_PDU 0x1 + +#define TRACE_GROUP_ISCSI 0x1 +#define TRACE_ISCSI_READ_FROM_SOCKET_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x0) +#define TRACE_ISCSI_FLUSH_WRITEBUF_START SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x1) +#define TRACE_ISCSI_FLUSH_WRITEBUF_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x2) +#define TRACE_ISCSI_READ_PDU SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x3) +#define TRACE_ISCSI_TASK_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x4) +#define TRACE_ISCSI_TASK_QUEUE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x5) +#define TRACE_ISCSI_TASK_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x6) +#define TRACE_ISCSI_PDU_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x7) + +enum iscsi_pdu_recv_state { + /* Ready to wait for PDU */ + ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY, + + /* Active connection waiting for any PDU header */ + ISCSI_PDU_RECV_STATE_AWAIT_PDU_HDR, + + /* Active connection waiting for payload */ + ISCSI_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD, + + /* Active connection does not wait for payload */ + ISCSI_PDU_RECV_STATE_ERROR, +}; + +struct spdk_poller; +struct spdk_iscsi_conn; + +struct spdk_iscsi_lun { + struct spdk_iscsi_conn *conn; + struct spdk_scsi_lun *lun; + struct spdk_scsi_lun_desc *desc; + struct spdk_poller *remove_poller; +}; + +struct spdk_iscsi_conn { + int id; + int is_valid; + /* + * All fields below this point are reinitialized each time the + * connection object is allocated. Make sure to update the + * SPDK_ISCSI_CONNECTION_MEMSET() macro if changing which fields + * are initialized when allocated. + */ + struct spdk_iscsi_portal *portal; + int pg_tag; + char portal_host[MAX_PORTAL_ADDR + 1]; + char portal_port[MAX_PORTAL_ADDR + 1]; + struct spdk_iscsi_poll_group *pg; + struct spdk_sock *sock; + struct spdk_iscsi_sess *sess; + + enum iscsi_connection_state state; + int login_phase; + bool is_logged_out; + struct spdk_iscsi_pdu *login_rsp_pdu; + + uint64_t last_flush; + uint64_t last_fill; + uint64_t last_nopin; + + /* Timer used to destroy connection after requesting logout if + * initiator does not send logout request. + */ + struct spdk_poller *logout_request_timer; + + /* Timer used to destroy connection after logout if initiator does + * not close the connection. 
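+	 * Registered in iscsi_conn_logout(); logout_timeout() forces the
+	 * connection into the EXITING state after ISCSI_LOGOUT_TIMEOUT seconds.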
+ */ + struct spdk_poller *logout_timer; + + /* Timer used to wait for connection to close + */ + struct spdk_poller *shutdown_timer; + + struct spdk_iscsi_pdu *pdu_in_progress; + enum iscsi_pdu_recv_state pdu_recv_state; + + TAILQ_HEAD(, spdk_iscsi_pdu) write_pdu_list; + TAILQ_HEAD(, spdk_iscsi_pdu) snack_pdu_list; + + int pending_r2t; + + uint16_t cid; + + /* IP address */ + char initiator_addr[MAX_INITIATOR_ADDR]; + char target_addr[MAX_TARGET_ADDR]; + + /* Initiator/Target port binds */ + char initiator_name[MAX_INITIATOR_NAME]; + struct spdk_scsi_port *initiator_port; + char target_short_name[MAX_TARGET_NAME]; + struct spdk_scsi_port *target_port; + struct spdk_iscsi_tgt_node *target; + struct spdk_scsi_dev *dev; + + /* for fast access */ + int header_digest; + int data_digest; + int full_feature; + + struct iscsi_param *params; + bool sess_param_state_negotiated[MAX_SESSION_PARAMS]; + bool conn_param_state_negotiated[MAX_CONNECTION_PARAMS]; + struct iscsi_chap_auth auth; + bool authenticated; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + uint32_t pending_task_cnt; + uint32_t data_out_cnt; + uint32_t data_in_cnt; + + uint64_t timeout; + uint64_t nopininterval; + bool nop_outstanding; + + /* + * This is the maximum data segment length that iscsi target can send + * to the initiator on this connection. Not to be confused with the + * maximum data segment length that initiators can send to iscsi target, which + * is statically defined as SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH. + */ + int MaxRecvDataSegmentLength; + + uint32_t StatSN; + uint32_t exp_statsn; + uint32_t ttt; /* target transfer tag */ + char *partial_text_parameter; + + STAILQ_ENTRY(spdk_iscsi_conn) pg_link; + bool is_stopped; /* Set true when connection is stopped for migration */ + TAILQ_HEAD(queued_r2t_tasks, spdk_iscsi_task) queued_r2t_tasks; + TAILQ_HEAD(active_r2t_tasks, spdk_iscsi_task) active_r2t_tasks; + TAILQ_HEAD(queued_datain_tasks, spdk_iscsi_task) queued_datain_tasks; + + struct spdk_iscsi_lun *luns[SPDK_SCSI_DEV_MAX_LUN]; + + TAILQ_ENTRY(spdk_iscsi_conn) conn_link; +}; + +extern struct spdk_iscsi_conn *g_conns_array; + +void iscsi_task_cpl(struct spdk_scsi_task *scsi_task); +void iscsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task); + +int initialize_iscsi_conns(void); +void shutdown_iscsi_conns(void); +void iscsi_conns_request_logout(struct spdk_iscsi_tgt_node *target); +int iscsi_get_active_conns(struct spdk_iscsi_tgt_node *target); + +int iscsi_conn_construct(struct spdk_iscsi_portal *portal, struct spdk_sock *sock); +void iscsi_conn_destruct(struct spdk_iscsi_conn *conn); +void iscsi_conn_handle_nop(struct spdk_iscsi_conn *conn); +void iscsi_conn_schedule(struct spdk_iscsi_conn *conn); +void iscsi_conn_logout(struct spdk_iscsi_conn *conn); +int iscsi_drop_conns(struct spdk_iscsi_conn *conn, + const char *conn_match, int drop_all); +int iscsi_conn_handle_queued_datain_tasks(struct spdk_iscsi_conn *conn); +int iscsi_conn_abort_queued_datain_task(struct spdk_iscsi_conn *conn, + uint32_t ref_task_tag); +int iscsi_conn_abort_queued_datain_tasks(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun, + struct spdk_iscsi_pdu *pdu); + +int iscsi_conn_read_data(struct spdk_iscsi_conn *conn, int len, void *buf); +int iscsi_conn_readv_data(struct spdk_iscsi_conn *conn, + struct iovec *iov, int iovcnt); +void iscsi_conn_write_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu, + iscsi_conn_xfer_complete_cb cb_fn, + void *cb_arg); + +void iscsi_conn_free_pdu(struct 
spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu); + +void iscsi_conn_info_json(struct spdk_json_write_ctx *w, struct spdk_iscsi_conn *conn); +void iscsi_conn_pdu_generic_complete(void *cb_arg); +#endif /* SPDK_ISCSI_CONN_H */ diff --git a/src/spdk/lib/iscsi/init_grp.c b/src/spdk/lib/iscsi/init_grp.c new file mode 100644 index 000000000..49e78d89d --- /dev/null +++ b/src/spdk/lib/iscsi/init_grp.c @@ -0,0 +1,787 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "iscsi/iscsi.h" +#include "iscsi/init_grp.h" + +static struct spdk_iscsi_init_grp * +iscsi_init_grp_create(int tag) +{ + struct spdk_iscsi_init_grp *ig; + + ig = calloc(1, sizeof(*ig)); + if (ig == NULL) { + SPDK_ERRLOG("calloc() failed for initiator group\n"); + return NULL; + } + + ig->tag = tag; + TAILQ_INIT(&ig->initiator_head); + TAILQ_INIT(&ig->netmask_head); + return ig; +} + +static struct spdk_iscsi_initiator_name * +iscsi_init_grp_find_initiator(struct spdk_iscsi_init_grp *ig, char *name) +{ + struct spdk_iscsi_initiator_name *iname; + + TAILQ_FOREACH(iname, &ig->initiator_head, tailq) { + if (!strcmp(iname->name, name)) { + return iname; + } + } + return NULL; +} + +static int +iscsi_init_grp_add_initiator(struct spdk_iscsi_init_grp *ig, char *name) +{ + struct spdk_iscsi_initiator_name *iname; + char *p; + size_t len; + + if (ig->ninitiators >= MAX_INITIATOR) { + SPDK_ERRLOG("> MAX_INITIATOR(=%d) is not allowed\n", MAX_INITIATOR); + return -EPERM; + } + + len = strlen(name); + if (len > MAX_INITIATOR_NAME) { + SPDK_ERRLOG("Initiator Name is larger than 223 bytes\n"); + return -EINVAL; + } + + iname = iscsi_init_grp_find_initiator(ig, name); + if (iname != NULL) { + return -EEXIST; + } + + iname = calloc(1, sizeof(*iname)); + if (iname == NULL) { + SPDK_ERRLOG("malloc() failed for initiator name str\n"); + return -ENOMEM; + } + + memcpy(iname->name, name, len); + + /* Replace "ALL" by "ANY" if set */ + p = strstr(iname->name, "ALL"); + if (p != NULL) { + SPDK_WARNLOG("Please use \"%s\" instead of \"%s\"\n", "ANY", "ALL"); + SPDK_WARNLOG("Converting \"%s\" to \"%s\" automatically\n", "ALL", "ANY"); + memcpy(p, "ANY", 3); + } + + TAILQ_INSERT_TAIL(&ig->initiator_head, iname, tailq); + ig->ninitiators++; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "InitiatorName %s\n", name); + return 0; +} + +static int +iscsi_init_grp_delete_initiator(struct spdk_iscsi_init_grp *ig, char *name) +{ + struct spdk_iscsi_initiator_name *iname; + + iname = iscsi_init_grp_find_initiator(ig, name); + if (iname == NULL) { + return -ENOENT; + } + + TAILQ_REMOVE(&ig->initiator_head, iname, tailq); + ig->ninitiators--; + free(iname); + return 0; +} + +static int +iscsi_init_grp_add_initiators(struct spdk_iscsi_init_grp *ig, int num_inames, + char **inames) +{ + int i; + int rc; + + for (i = 0; i < num_inames; i++) { + rc = iscsi_init_grp_add_initiator(ig, inames[i]); + if (rc < 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + iscsi_init_grp_delete_initiator(ig, inames[i - 1]); + } + return rc; +} + +static void +iscsi_init_grp_delete_all_initiators(struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_initiator_name *iname, *tmp; + + TAILQ_FOREACH_SAFE(iname, &ig->initiator_head, tailq, tmp) { + TAILQ_REMOVE(&ig->initiator_head, iname, tailq); + ig->ninitiators--; + free(iname); + } +} + +static int +iscsi_init_grp_delete_initiators(struct spdk_iscsi_init_grp *ig, int num_inames, char **inames) +{ + int i; + int rc; + + for (i = 0; i < num_inames; i++) { + rc = iscsi_init_grp_delete_initiator(ig, inames[i]); + if (rc < 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + rc = iscsi_init_grp_add_initiator(ig, inames[i - 1]); + if (rc != 0) { + iscsi_init_grp_delete_all_initiators(ig); + break; + } + } + return -1; +} + +static struct spdk_iscsi_initiator_netmask * +iscsi_init_grp_find_netmask(struct spdk_iscsi_init_grp *ig, const char 
*mask) +{ + struct spdk_iscsi_initiator_netmask *netmask; + + TAILQ_FOREACH(netmask, &ig->netmask_head, tailq) { + if (!strcmp(netmask->mask, mask)) { + return netmask; + } + } + return NULL; +} + +static int +iscsi_init_grp_add_netmask(struct spdk_iscsi_init_grp *ig, char *mask) +{ + struct spdk_iscsi_initiator_netmask *imask; + char *p; + size_t len; + + if (ig->nnetmasks >= MAX_NETMASK) { + SPDK_ERRLOG("> MAX_NETMASK(=%d) is not allowed\n", MAX_NETMASK); + return -EPERM; + } + + len = strlen(mask); + if (len > MAX_INITIATOR_ADDR) { + SPDK_ERRLOG("Initiator Name is larger than %d bytes\n", MAX_INITIATOR_ADDR); + return -EINVAL; + } + + imask = iscsi_init_grp_find_netmask(ig, mask); + if (imask != NULL) { + return -EEXIST; + } + + imask = calloc(1, sizeof(*imask)); + if (imask == NULL) { + SPDK_ERRLOG("malloc() failed for inititator mask str\n"); + return -ENOMEM; + } + + memcpy(imask->mask, mask, len); + + /* Replace "ALL" by "ANY" if set */ + p = strstr(imask->mask, "ALL"); + if (p != NULL) { + SPDK_WARNLOG("Please use \"%s\" instead of \"%s\"\n", "ANY", "ALL"); + SPDK_WARNLOG("Converting \"%s\" to \"%s\" automatically\n", "ALL", "ANY"); + memcpy(p, "ANY", 3); + } + + TAILQ_INSERT_TAIL(&ig->netmask_head, imask, tailq); + ig->nnetmasks++; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Netmask %s\n", mask); + return 0; +} + +static int +iscsi_init_grp_delete_netmask(struct spdk_iscsi_init_grp *ig, char *mask) +{ + struct spdk_iscsi_initiator_netmask *imask; + + imask = iscsi_init_grp_find_netmask(ig, mask); + if (imask == NULL) { + return -ENOENT; + } + + TAILQ_REMOVE(&ig->netmask_head, imask, tailq); + ig->nnetmasks--; + free(imask); + return 0; +} + +static int +iscsi_init_grp_add_netmasks(struct spdk_iscsi_init_grp *ig, int num_imasks, char **imasks) +{ + int i; + int rc; + + for (i = 0; i < num_imasks; i++) { + rc = iscsi_init_grp_add_netmask(ig, imasks[i]); + if (rc != 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + iscsi_init_grp_delete_netmask(ig, imasks[i - 1]); + } + return rc; +} + +static void +iscsi_init_grp_delete_all_netmasks(struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_initiator_netmask *imask, *tmp; + + TAILQ_FOREACH_SAFE(imask, &ig->netmask_head, tailq, tmp) { + TAILQ_REMOVE(&ig->netmask_head, imask, tailq); + ig->nnetmasks--; + free(imask); + } +} + +static int +iscsi_init_grp_delete_netmasks(struct spdk_iscsi_init_grp *ig, int num_imasks, char **imasks) +{ + int i; + int rc; + + for (i = 0; i < num_imasks; i++) { + rc = iscsi_init_grp_delete_netmask(ig, imasks[i]); + if (rc != 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + rc = iscsi_init_grp_add_netmask(ig, imasks[i - 1]); + if (rc != 0) { + iscsi_init_grp_delete_all_netmasks(ig); + break; + } + } + return -1; +} + +/* Read spdk iscsi target's config file and create initiator group */ +static int +iscsi_parse_init_grp(struct spdk_conf_section *sp) +{ + int i, rc = 0; + const char *val = NULL; + int num_initiator_names; + int num_initiator_masks; + char **initiators = NULL, **netmasks = NULL; + int tag = spdk_conf_section_get_num(sp); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add initiator group %d\n", tag); + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + /* counts number of definitions */ + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "InitiatorName", i); + if (val == NULL) { + break; + } + } + if (i == 0) { + SPDK_ERRLOG("num_initiator_names = 0\n"); + return 
-EINVAL; + } + num_initiator_names = i; + if (num_initiator_names > MAX_INITIATOR) { + SPDK_ERRLOG("%d > MAX_INITIATOR\n", num_initiator_names); + return -E2BIG; + } + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "Netmask", i); + if (val == NULL) { + break; + } + } + if (i == 0) { + SPDK_ERRLOG("num_initiator_mask = 0\n"); + return -EINVAL; + } + num_initiator_masks = i; + if (num_initiator_masks > MAX_NETMASK) { + SPDK_ERRLOG("%d > MAX_NETMASK\n", num_initiator_masks); + return -E2BIG; + } + + initiators = calloc(num_initiator_names, sizeof(char *)); + if (!initiators) { + SPDK_ERRLOG("calloc() failed for temp initiator name array\n"); + return -ENOMEM; + } + for (i = 0; i < num_initiator_names; i++) { + val = spdk_conf_section_get_nval(sp, "InitiatorName", i); + if (!val) { + SPDK_ERRLOG("InitiatorName %d not found\n", i); + rc = -EINVAL; + goto cleanup; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "InitiatorName %s\n", val); + initiators[i] = strdup(val); + if (!initiators[i]) { + SPDK_ERRLOG("strdup() failed for temp initiator name\n"); + rc = -ENOMEM; + goto cleanup; + } + } + netmasks = calloc(num_initiator_masks, sizeof(char *)); + if (!netmasks) { + SPDK_ERRLOG("malloc() failed for portal group\n"); + rc = -ENOMEM; + goto cleanup; + } + for (i = 0; i < num_initiator_masks; i++) { + val = spdk_conf_section_get_nval(sp, "Netmask", i); + if (!val) { + SPDK_ERRLOG("Netmask %d not found\n", i); + rc = -EINVAL; + goto cleanup; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Netmask %s\n", val); + netmasks[i] = strdup(val); + if (!netmasks[i]) { + SPDK_ERRLOG("strdup() failed for temp initiator mask\n"); + rc = -ENOMEM; + goto cleanup; + } + } + + rc = iscsi_init_grp_create_from_initiator_list(tag, + num_initiator_names, initiators, num_initiator_masks, netmasks); + +cleanup: + if (initiators) { + for (i = 0; i < num_initiator_names; i++) { + if (initiators[i]) { + free(initiators[i]); + } + } + free(initiators); + } + if (netmasks) { + for (i = 0; i < num_initiator_masks; i++) { + if (netmasks[i]) { + free(netmasks[i]); + } + } + free(netmasks); + } + return rc; +} + +int +iscsi_init_grp_register(struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_init_grp *tmp; + int rc = -1; + + assert(ig != NULL); + + pthread_mutex_lock(&g_iscsi.mutex); + tmp = iscsi_init_grp_find_by_tag(ig->tag); + if (tmp == NULL) { + TAILQ_INSERT_TAIL(&g_iscsi.ig_head, ig, tailq); + rc = 0; + } + pthread_mutex_unlock(&g_iscsi.mutex); + + return rc; +} + +/* + * Create initiator group from list of initiator ip/hostnames and netmasks + * The initiator hostname/netmask lists are allocated by the caller on the + * heap. 
Freed later by common initiator_group_destroy() code + */ +int +iscsi_init_grp_create_from_initiator_list(int tag, + int num_initiator_names, + char **initiator_names, + int num_initiator_masks, + char **initiator_masks) +{ + int rc = -1; + struct spdk_iscsi_init_grp *ig = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "add initiator group (from initiator list) tag=%d, #initiators=%d, #masks=%d\n", + tag, num_initiator_names, num_initiator_masks); + + ig = iscsi_init_grp_create(tag); + if (!ig) { + SPDK_ERRLOG("initiator group create error (%d)\n", tag); + return rc; + } + + rc = iscsi_init_grp_add_initiators(ig, num_initiator_names, + initiator_names); + if (rc < 0) { + SPDK_ERRLOG("add initiator name error\n"); + goto cleanup; + } + + rc = iscsi_init_grp_add_netmasks(ig, num_initiator_masks, + initiator_masks); + if (rc < 0) { + SPDK_ERRLOG("add initiator netmask error\n"); + goto cleanup; + } + + rc = iscsi_init_grp_register(ig); + if (rc < 0) { + SPDK_ERRLOG("initiator group register error (%d)\n", tag); + goto cleanup; + } + return 0; + +cleanup: + iscsi_init_grp_destroy(ig); + return rc; +} + +int +iscsi_init_grp_add_initiators_from_initiator_list(int tag, + int num_initiator_names, + char **initiator_names, + int num_initiator_masks, + char **initiator_masks) +{ + int rc = -1; + struct spdk_iscsi_init_grp *ig; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "add initiator to initiator group: tag=%d, #initiators=%d, #masks=%d\n", + tag, num_initiator_names, num_initiator_masks); + + pthread_mutex_lock(&g_iscsi.mutex); + ig = iscsi_init_grp_find_by_tag(tag); + if (!ig) { + pthread_mutex_unlock(&g_iscsi.mutex); + SPDK_ERRLOG("initiator group (%d) is not found\n", tag); + return rc; + } + + rc = iscsi_init_grp_add_initiators(ig, num_initiator_names, + initiator_names); + if (rc < 0) { + SPDK_ERRLOG("add initiator name error\n"); + goto error; + } + + rc = iscsi_init_grp_add_netmasks(ig, num_initiator_masks, + initiator_masks); + if (rc < 0) { + SPDK_ERRLOG("add initiator netmask error\n"); + iscsi_init_grp_delete_initiators(ig, num_initiator_names, + initiator_names); + } + +error: + pthread_mutex_unlock(&g_iscsi.mutex); + return rc; +} + +int +iscsi_init_grp_delete_initiators_from_initiator_list(int tag, + int num_initiator_names, + char **initiator_names, + int num_initiator_masks, + char **initiator_masks) +{ + int rc = -1; + struct spdk_iscsi_init_grp *ig; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "delete initiator from initiator group: tag=%d, #initiators=%d, #masks=%d\n", + tag, num_initiator_names, num_initiator_masks); + + pthread_mutex_lock(&g_iscsi.mutex); + ig = iscsi_init_grp_find_by_tag(tag); + if (!ig) { + pthread_mutex_unlock(&g_iscsi.mutex); + SPDK_ERRLOG("initiator group (%d) is not found\n", tag); + return rc; + } + + rc = iscsi_init_grp_delete_initiators(ig, num_initiator_names, + initiator_names); + if (rc < 0) { + SPDK_ERRLOG("delete initiator name error\n"); + goto error; + } + + rc = iscsi_init_grp_delete_netmasks(ig, num_initiator_masks, + initiator_masks); + if (rc < 0) { + SPDK_ERRLOG("delete initiator netmask error\n"); + iscsi_init_grp_add_initiators(ig, num_initiator_names, + initiator_names); + goto error; + } + +error: + pthread_mutex_unlock(&g_iscsi.mutex); + return rc; +} + +void +iscsi_init_grp_destroy(struct spdk_iscsi_init_grp *ig) +{ + if (!ig) { + return; + } + + iscsi_init_grp_delete_all_initiators(ig); + iscsi_init_grp_delete_all_netmasks(ig); + free(ig); +}; + +struct spdk_iscsi_init_grp * +iscsi_init_grp_find_by_tag(int tag) +{ + struct spdk_iscsi_init_grp *ig; + + 
TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) { + if (ig->tag == tag) { + return ig; + } + } + + return NULL; +} + +int +iscsi_parse_init_grps(void) +{ + struct spdk_conf_section *sp; + int rc; + + sp = spdk_conf_first_section(NULL); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "InitiatorGroup")) { + if (spdk_conf_section_get_num(sp) == 0) { + SPDK_ERRLOG("Group 0 is invalid\n"); + return -1; + } + rc = iscsi_parse_init_grp(sp); + if (rc < 0) { + SPDK_ERRLOG("parse_init_group() failed\n"); + return -1; + } + } + sp = spdk_conf_next_section(sp); + } + return 0; +} + +void +iscsi_init_grps_destroy(void) +{ + struct spdk_iscsi_init_grp *ig, *tmp; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_init_grp_array_destroy\n"); + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH_SAFE(ig, &g_iscsi.ig_head, tailq, tmp) { + TAILQ_REMOVE(&g_iscsi.ig_head, ig, tailq); + iscsi_init_grp_destroy(ig); + } + pthread_mutex_unlock(&g_iscsi.mutex); +} + +struct spdk_iscsi_init_grp * +iscsi_init_grp_unregister(int tag) +{ + struct spdk_iscsi_init_grp *ig; + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) { + if (ig->tag == tag) { + TAILQ_REMOVE(&g_iscsi.ig_head, ig, tailq); + pthread_mutex_unlock(&g_iscsi.mutex); + return ig; + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + return NULL; +} + +static const char *initiator_group_section = \ + "\n" + "# Users must change the InitiatorGroup section(s) to match the IP\n" + "# addresses and initiator configuration in their environment.\n" + "# Netmask can be used to specify a single IP address or a range of IP addresses\n" + "# Netmask 192.168.1.20 <== single IP address\n" + "# Netmask 192.168.1.0/24 <== IP range 192.168.1.*\n"; + +#define INITIATOR_GROUP_TMPL \ +"[InitiatorGroup%d]\n" \ +" Comment \"Initiator Group%d\"\n" + +#define INITIATOR_TMPL \ +" InitiatorName " + +#define NETMASK_TMPL \ +" Netmask " + +void +iscsi_init_grps_config_text(FILE *fp) +{ + struct spdk_iscsi_init_grp *ig; + struct spdk_iscsi_initiator_name *iname; + struct spdk_iscsi_initiator_netmask *imask; + + /* Create initiator group section */ + fprintf(fp, "%s", initiator_group_section); + + /* Dump initiator groups */ + TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) { + if (NULL == ig) { continue; } + fprintf(fp, INITIATOR_GROUP_TMPL, ig->tag, ig->tag); + + /* Dump initiators */ + fprintf(fp, INITIATOR_TMPL); + TAILQ_FOREACH(iname, &ig->initiator_head, tailq) { + fprintf(fp, "%s ", iname->name); + } + fprintf(fp, "\n"); + + /* Dump netmasks */ + fprintf(fp, NETMASK_TMPL); + TAILQ_FOREACH(imask, &ig->netmask_head, tailq) { + fprintf(fp, "%s ", imask->mask); + } + fprintf(fp, "\n"); + } +} + +static void +iscsi_init_grp_info_json(struct spdk_iscsi_init_grp *ig, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_initiator_name *iname; + struct spdk_iscsi_initiator_netmask *imask; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "tag", ig->tag); + + spdk_json_write_named_array_begin(w, "initiators"); + TAILQ_FOREACH(iname, &ig->initiator_head, tailq) { + spdk_json_write_string(w, iname->name); + } + spdk_json_write_array_end(w); + + spdk_json_write_named_array_begin(w, "netmasks"); + TAILQ_FOREACH(imask, &ig->netmask_head, tailq) { + spdk_json_write_string(w, imask->mask); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); +} + +static void +iscsi_init_grp_config_json(struct spdk_iscsi_init_grp *ig, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, 
"method", "iscsi_create_initiator_group"); + + spdk_json_write_name(w, "params"); + iscsi_init_grp_info_json(ig, w); + + spdk_json_write_object_end(w); +} + +void +iscsi_init_grps_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_init_grp *ig; + + TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) { + iscsi_init_grp_info_json(ig, w); + } +} + +void +iscsi_init_grps_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_init_grp *ig; + + TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) { + iscsi_init_grp_config_json(ig, w); + } +} diff --git a/src/spdk/lib/iscsi/init_grp.h b/src/spdk/lib/iscsi/init_grp.h new file mode 100644 index 000000000..8913c98cd --- /dev/null +++ b/src/spdk/lib/iscsi/init_grp.h @@ -0,0 +1,81 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_INIT_GRP_H +#define SPDK_INIT_GRP_H + +#include "spdk/conf.h" +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" + +struct spdk_iscsi_initiator_name { + char name[MAX_INITIATOR_NAME + 1]; + TAILQ_ENTRY(spdk_iscsi_initiator_name) tailq; +}; + +struct spdk_iscsi_initiator_netmask { + char mask[MAX_INITIATOR_ADDR + 1]; + TAILQ_ENTRY(spdk_iscsi_initiator_netmask) tailq; +}; + +struct spdk_iscsi_init_grp { + int ninitiators; + TAILQ_HEAD(, spdk_iscsi_initiator_name) initiator_head; + int nnetmasks; + TAILQ_HEAD(, spdk_iscsi_initiator_netmask) netmask_head; + int ref; + int tag; + TAILQ_ENTRY(spdk_iscsi_init_grp) tailq; +}; + +/* SPDK iSCSI Initiator Group management API */ +int iscsi_init_grp_create_from_initiator_list(int tag, + int num_initiator_names, char **initiator_names, + int num_initiator_masks, char **initiator_masks); +int iscsi_init_grp_add_initiators_from_initiator_list(int tag, + int num_initiator_names, char **initiator_names, + int num_initiator_masks, char **initiator_masks); +int iscsi_init_grp_delete_initiators_from_initiator_list(int tag, + int num_initiator_names, char **initiator_names, + int num_initiator_masks, char **initiator_masks); +int iscsi_init_grp_register(struct spdk_iscsi_init_grp *ig); +struct spdk_iscsi_init_grp *iscsi_init_grp_unregister(int tag); +struct spdk_iscsi_init_grp *iscsi_init_grp_find_by_tag(int tag); +void iscsi_init_grp_destroy(struct spdk_iscsi_init_grp *ig); +int iscsi_parse_init_grps(void); +void iscsi_init_grps_destroy(void); +void iscsi_init_grps_config_text(FILE *fp); +void iscsi_init_grps_info_json(struct spdk_json_write_ctx *w); +void iscsi_init_grps_config_json(struct spdk_json_write_ctx *w); +#endif /* SPDK_INIT_GRP_H */ diff --git a/src/spdk/lib/iscsi/iscsi.c b/src/spdk/lib/iscsi/iscsi.c new file mode 100644 index 000000000..febf4cac4 --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi.c @@ -0,0 +1,4797 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/base64.h" +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/trace.h" +#include "spdk/sock.h" +#include "spdk/string.h" +#include "spdk/queue.h" +#include "spdk/net.h" + +#include "iscsi/md5.h" +#include "iscsi/iscsi.h" +#include "iscsi/param.h" +#include "iscsi/tgt_node.h" +#include "iscsi/task.h" +#include "iscsi/conn.h" +#include "spdk/scsi.h" +#include "spdk/bdev.h" +#include "iscsi/portal_grp.h" + +#include "spdk_internal/log.h" + +#define MAX_TMPBUF 1024 + +#define SPDK_CRC32C_INITIAL 0xffffffffUL +#define SPDK_CRC32C_XOR 0xffffffffUL + +#ifdef __FreeBSD__ +#define HAVE_SRANDOMDEV 1 +#define HAVE_ARC4RANDOM 1 +#endif + +struct spdk_iscsi_globals g_iscsi = { + .mutex = PTHREAD_MUTEX_INITIALIZER, + .portal_head = TAILQ_HEAD_INITIALIZER(g_iscsi.portal_head), + .pg_head = TAILQ_HEAD_INITIALIZER(g_iscsi.pg_head), + .ig_head = TAILQ_HEAD_INITIALIZER(g_iscsi.ig_head), + .target_head = TAILQ_HEAD_INITIALIZER(g_iscsi.target_head), + .auth_group_head = TAILQ_HEAD_INITIALIZER(g_iscsi.auth_group_head), + .poll_group_head = TAILQ_HEAD_INITIALIZER(g_iscsi.poll_group_head), +}; + +#define MATCH_DIGEST_WORD(BUF, CRC32C) \ + ( ((((uint32_t) *((uint8_t *)(BUF)+0)) << 0) \ + | (((uint32_t) *((uint8_t *)(BUF)+1)) << 8) \ + | (((uint32_t) *((uint8_t *)(BUF)+2)) << 16) \ + | (((uint32_t) *((uint8_t *)(BUF)+3)) << 24)) \ + == (CRC32C)) + +#ifndef HAVE_SRANDOMDEV +static void +srandomdev(void) +{ + unsigned long seed; + time_t now; + pid_t pid; + + pid = getpid(); + now = time(NULL); + seed = pid ^ now; + srandom(seed); +} +#endif /* HAVE_SRANDOMDEV */ + +#ifndef HAVE_ARC4RANDOM +static int g_arc4random_initialized = 0; + +static uint32_t +arc4random(void) +{ + uint32_t r; + uint32_t r1, r2; + + if (!g_arc4random_initialized) { + srandomdev(); + g_arc4random_initialized = 1; + } + r1 = (uint32_t)(random() & 0xffff); + r2 = (uint32_t)(random() & 0xffff); + r = (r1 << 16) | r2; + return r; +} +#endif /* HAVE_ARC4RANDOM */ + +static void +gen_random(uint8_t *buf, size_t len) +{ + uint32_t r; + size_t idx; + + for (idx = 0; idx < len; idx++) { + r = arc4random(); + buf[idx] = (uint8_t) r; + } +} + +static uint64_t +iscsi_get_isid(const uint8_t isid[6]) +{ + return (uint64_t)isid[0] << 40 | + (uint64_t)isid[1] << 32 | + (uint64_t)isid[2] << 24 | + (uint64_t)isid[3] << 16 | + (uint64_t)isid[4] << 8 | + (uint64_t)isid[5]; +} + +static int +bin2hex(char *buf, size_t len, const uint8_t *data, size_t data_len) +{ + const char *digits = "0123456789ABCDEF"; + size_t total = 0; + size_t idx; + + if (len < 3) { + return -1; + } + buf[total] = '0'; + total++; + buf[total] = 'x'; + total++; + buf[total] = '\0'; + + for (idx = 0; idx < data_len; idx++) { + if (total + 3 > len) { + buf[total] = '\0'; + return - 1; + } + buf[total] = digits[(data[idx] >> 4) & 0x0fU]; + total++; + buf[total] = digits[data[idx] & 0x0fU]; + total++; + } + buf[total] = '\0'; + return total; +} + 
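
For reference, iscsi_get_isid() above packs the 6-byte ISID carried in the login PDU into a single 64-bit value, which the login path further down formats into the initiator port name with "0x%12.12" PRIx64. The following minimal, self-contained sketch (separate from the diff itself) illustrates that packing and formatting; the helper name pack_isid(), the sample ISID bytes, and the example IQN are assumptions of the sketch, not SPDK identifiers.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for iscsi_get_isid(): shift the six ISID bytes into a
 * uint64_t in big-endian order, so isid[0] lands in bits 40-47. */
static uint64_t
pack_isid(const uint8_t isid[6])
{
	return (uint64_t)isid[0] << 40 |
	       (uint64_t)isid[1] << 32 |
	       (uint64_t)isid[2] << 24 |
	       (uint64_t)isid[3] << 16 |
	       (uint64_t)isid[4] << 8 |
	       (uint64_t)isid[5];
}

int
main(void)
{
	/* Arbitrary sample ISID bytes, purely for illustration. */
	const uint8_t isid[6] = {0x80, 0x12, 0x34, 0x56, 0x78, 0x9a};

	/* Same "name,i,0x<isid>" shape the login code builds for the initiator port. */
	printf("iqn.1991-05.com.example:host1,i,0x%12.12" PRIx64 "\n", pack_isid(isid));
	return 0;
}
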
+static int +hex2bin(uint8_t *data, size_t data_len, const char *str) +{ + const char *digits = "0123456789ABCDEF"; + const char *dp; + const char *p; + size_t total = 0; + int n0, n1; + + p = str; + if (p[0] != '0' && (p[1] != 'x' && p[1] != 'X')) { + return -1; + } + p += 2; + + while (p[0] != '\0' && p[1] != '\0') { + if (total >= data_len) { + return -1; + } + dp = strchr(digits, toupper((int) p[0])); + if (dp == NULL) { + return -1; + } + n0 = (int)(dp - digits); + dp = strchr(digits, toupper((int) p[1])); + if (dp == NULL) { + return -1; + } + n1 = (int)(dp - digits); + + data[total] = (uint8_t)(((n0 & 0x0fU) << 4) | (n1 & 0x0fU)); + total++; + p += 2; + } + return total; +} + +static int +iscsi_reject(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu, + int reason) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_reject *rsph; + uint8_t *data; + int total_ahs_len; + int data_len; + int alloc_len; + + pdu->is_rejected = true; + + total_ahs_len = pdu->bhs.total_ahs_len; + data_len = 0; + alloc_len = ISCSI_BHS_LEN + (4 * total_ahs_len); + + if (conn->header_digest) { + alloc_len += ISCSI_DIGEST_LEN; + } + + data = calloc(1, alloc_len); + if (!data) { + SPDK_ERRLOG("calloc() failed for data segment\n"); + return -ENOMEM; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Reject PDU reason=%d\n", reason); + + if (conn->sess != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u\n", conn->StatSN); + } + + memcpy(data, &pdu->bhs, ISCSI_BHS_LEN); + data_len += ISCSI_BHS_LEN; + + if (total_ahs_len != 0) { + total_ahs_len = spdk_min((4 * total_ahs_len), ISCSI_AHS_LEN); + memcpy(data + data_len, pdu->ahs, total_ahs_len); + data_len += total_ahs_len; + } + + if (conn->header_digest) { + memcpy(data + data_len, pdu->header_digest, ISCSI_DIGEST_LEN); + data_len += ISCSI_DIGEST_LEN; + } + + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + free(data); + return -ENOMEM; + } + + rsph = (struct iscsi_bhs_reject *)&rsp_pdu->bhs; + rsp_pdu->data = data; + rsph->opcode = ISCSI_OP_REJECT; + rsph->flags |= 0x80; /* bit 0 is default to 1 */ + rsph->reason = reason; + DSET24(rsph->data_segment_len, data_len); + + rsph->ffffffff = 0xffffffffU; + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (conn->sess != NULL) { + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + } else { + to_be32(&rsph->exp_cmd_sn, 1); + to_be32(&rsph->max_cmd_sn, 1); + } + + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "PDU", (void *)&rsp_pdu->bhs, ISCSI_BHS_LEN); + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); + + return 0; +} + +uint32_t +iscsi_pdu_calc_header_digest(struct spdk_iscsi_pdu *pdu) +{ + uint32_t crc32c; + uint32_t ahs_len_bytes = pdu->bhs.total_ahs_len * 4; + + crc32c = SPDK_CRC32C_INITIAL; + crc32c = spdk_crc32c_update(&pdu->bhs, ISCSI_BHS_LEN, crc32c); + + if (ahs_len_bytes) { + crc32c = spdk_crc32c_update(pdu->ahs, ahs_len_bytes, crc32c); + } + + /* BHS and AHS are always 4-byte multiples in length, so no padding is necessary. 
*/ + crc32c = crc32c ^ SPDK_CRC32C_XOR; + return crc32c; +} + +uint32_t +iscsi_pdu_calc_data_digest(struct spdk_iscsi_pdu *pdu) +{ + uint32_t data_len = DGET24(pdu->bhs.data_segment_len); + uint32_t crc32c; + uint32_t mod; + struct iovec iov; + uint32_t num_blocks; + + crc32c = SPDK_CRC32C_INITIAL; + if (spdk_likely(!pdu->dif_insert_or_strip)) { + crc32c = spdk_crc32c_update(pdu->data, data_len, crc32c); + } else { + iov.iov_base = pdu->data_buf; + iov.iov_len = pdu->data_buf_len; + num_blocks = pdu->data_buf_len / pdu->dif_ctx.block_size; + + spdk_dif_update_crc32c(&iov, 1, num_blocks, &crc32c, &pdu->dif_ctx); + } + + mod = data_len % ISCSI_ALIGNMENT; + if (mod != 0) { + uint32_t pad_length = ISCSI_ALIGNMENT - mod; + uint8_t pad[3] = {0, 0, 0}; + + assert(pad_length > 0); + assert(pad_length <= sizeof(pad)); + crc32c = spdk_crc32c_update(pad, pad_length, crc32c); + } + + crc32c = crc32c ^ SPDK_CRC32C_XOR; + return crc32c; +} + +static int +iscsi_conn_read_data_segment(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu, + uint32_t segment_len) +{ + struct iovec buf_iov, iovs[32]; + int rc, _rc; + + if (spdk_likely(!pdu->dif_insert_or_strip)) { + return iscsi_conn_read_data(conn, + segment_len - pdu->data_valid_bytes, + pdu->data_buf + pdu->data_valid_bytes); + } else { + buf_iov.iov_base = pdu->data_buf; + buf_iov.iov_len = pdu->data_buf_len; + rc = spdk_dif_set_md_interleave_iovs(iovs, 32, &buf_iov, 1, + pdu->data_valid_bytes, + segment_len - pdu->data_valid_bytes, NULL, + &pdu->dif_ctx); + if (rc > 0) { + rc = iscsi_conn_readv_data(conn, iovs, rc); + if (rc > 0) { + _rc = spdk_dif_generate_stream(&buf_iov, 1, + pdu->data_valid_bytes, rc, + &pdu->dif_ctx); + if (_rc != 0) { + SPDK_ERRLOG("DIF generate failed\n"); + rc = _rc; + } + } + } else { + SPDK_ERRLOG("Setup iovs for interleaved metadata failed\n"); + } + return rc; + } +} + +struct _iscsi_sgl { + struct iovec *iov; + int iovcnt; + uint32_t iov_offset; + uint32_t total_size; +}; + +static inline void +_iscsi_sgl_init(struct _iscsi_sgl *s, struct iovec *iovs, int iovcnt, + uint32_t iov_offset) +{ + s->iov = iovs; + s->iovcnt = iovcnt; + s->iov_offset = iov_offset; + s->total_size = 0; +} + +static inline bool +_iscsi_sgl_append(struct _iscsi_sgl *s, uint8_t *data, uint32_t data_len) +{ + if (s->iov_offset >= data_len) { + s->iov_offset -= data_len; + } else { + assert(s->iovcnt > 0); + s->iov->iov_base = data + s->iov_offset; + s->iov->iov_len = data_len - s->iov_offset; + s->total_size += data_len - s->iov_offset; + s->iov_offset = 0; + s->iov++; + s->iovcnt--; + if (s->iovcnt == 0) { + return false; + } + } + + return true; +} + +/* Build iovec array to leave metadata space for every data block + * when reading data segment from socket. 
+ */ +static inline bool +_iscsi_sgl_append_with_md(struct _iscsi_sgl *s, + void *buf, uint32_t buf_len, uint32_t data_len, + struct spdk_dif_ctx *dif_ctx) +{ + int rc; + uint32_t total_size = 0; + struct iovec buf_iov; + + if (s->iov_offset >= data_len) { + s->iov_offset -= data_len; + } else { + buf_iov.iov_base = buf; + buf_iov.iov_len = buf_len; + rc = spdk_dif_set_md_interleave_iovs(s->iov, s->iovcnt, &buf_iov, 1, + s->iov_offset, data_len - s->iov_offset, + &total_size, dif_ctx); + if (rc < 0) { + SPDK_ERRLOG("Failed to setup iovs for DIF strip\n"); + return false; + } + + s->total_size += total_size; + s->iov_offset = 0; + assert(s->iovcnt >= rc); + s->iovcnt -= rc; + s->iov += rc; + + if (s->iovcnt == 0) { + return false; + } + } + + return true; +} + +int +iscsi_build_iovs(struct spdk_iscsi_conn *conn, struct iovec *iovs, int iovcnt, + struct spdk_iscsi_pdu *pdu, uint32_t *_mapped_length) +{ + struct _iscsi_sgl sgl; + int enable_digest; + uint32_t total_ahs_len; + uint32_t data_len; + + if (iovcnt == 0) { + return 0; + } + + total_ahs_len = pdu->bhs.total_ahs_len; + data_len = DGET24(pdu->bhs.data_segment_len); + data_len = ISCSI_ALIGN(data_len); + + enable_digest = 1; + if (pdu->bhs.opcode == ISCSI_OP_LOGIN_RSP) { + /* this PDU should be sent without digest */ + enable_digest = 0; + } + + _iscsi_sgl_init(&sgl, iovs, iovcnt, pdu->writev_offset); + + /* BHS */ + if (!_iscsi_sgl_append(&sgl, (uint8_t *)&pdu->bhs, ISCSI_BHS_LEN)) { + goto end; + } + /* AHS */ + if (total_ahs_len > 0) { + if (!_iscsi_sgl_append(&sgl, pdu->ahs, 4 * total_ahs_len)) { + goto end; + } + } + + /* Header Digest */ + if (enable_digest && conn->header_digest) { + if (!_iscsi_sgl_append(&sgl, pdu->header_digest, ISCSI_DIGEST_LEN)) { + goto end; + } + } + + /* Data Segment */ + if (data_len > 0) { + if (!pdu->dif_insert_or_strip) { + if (!_iscsi_sgl_append(&sgl, pdu->data, data_len)) { + goto end; + } + } else { + if (!_iscsi_sgl_append_with_md(&sgl, pdu->data, pdu->data_buf_len, + data_len, &pdu->dif_ctx)) { + goto end; + } + } + } + + /* Data Digest */ + if (enable_digest && conn->data_digest && data_len != 0) { + _iscsi_sgl_append(&sgl, pdu->data_digest, ISCSI_DIGEST_LEN); + } + +end: + if (_mapped_length != NULL) { + *_mapped_length = sgl.total_size; + } + + return iovcnt - sgl.iovcnt; +} + +void iscsi_free_sess(struct spdk_iscsi_sess *sess) +{ + if (sess == NULL) { + return; + } + + sess->tag = 0; + sess->target = NULL; + sess->session_type = SESSION_TYPE_INVALID; + iscsi_param_free(sess->params); + free(sess->conns); + spdk_scsi_port_free(&sess->initiator_port); + spdk_mempool_put(g_iscsi.session_pool, (void *)sess); +} + +static int +create_iscsi_sess(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, + enum session_type session_type) +{ + struct spdk_iscsi_sess *sess; + int rc; + + sess = spdk_mempool_get(g_iscsi.session_pool); + if (!sess) { + SPDK_ERRLOG("Unable to get session object\n"); + SPDK_ERRLOG("MaxSessions set to %d\n", g_iscsi.MaxSessions); + return -ENOMEM; + } + + /* configuration values */ + pthread_mutex_lock(&g_iscsi.mutex); + + sess->MaxConnections = g_iscsi.MaxConnectionsPerSession; + sess->MaxOutstandingR2T = DEFAULT_MAXOUTSTANDINGR2T; + + sess->DefaultTime2Wait = g_iscsi.DefaultTime2Wait; + sess->DefaultTime2Retain = g_iscsi.DefaultTime2Retain; + sess->FirstBurstLength = g_iscsi.FirstBurstLength; + sess->MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH; + sess->InitialR2T = DEFAULT_INITIALR2T; + sess->ImmediateData = g_iscsi.ImmediateData; + sess->DataPDUInOrder = 
DEFAULT_DATAPDUINORDER; + sess->DataSequenceInOrder = DEFAULT_DATASEQUENCEINORDER; + sess->ErrorRecoveryLevel = g_iscsi.ErrorRecoveryLevel; + + pthread_mutex_unlock(&g_iscsi.mutex); + + sess->tag = conn->pg_tag; + + sess->conns = calloc(sess->MaxConnections, sizeof(*sess->conns)); + if (!sess->conns) { + SPDK_ERRLOG("calloc() failed for connection array\n"); + return -ENOMEM; + } + + sess->connections = 0; + + sess->conns[sess->connections] = conn; + sess->connections++; + + sess->params = NULL; + sess->target = target; + sess->isid = 0; + sess->session_type = session_type; + sess->current_text_itt = 0xffffffffU; + + /* set default params */ + rc = iscsi_sess_params_init(&sess->params); + if (rc < 0) { + SPDK_ERRLOG("iscsi_sess_params_init() failed\n"); + goto error_return; + } + /* replace with config value */ + rc = iscsi_param_set_int(sess->params, "MaxConnections", + sess->MaxConnections); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "MaxOutstandingR2T", + sess->MaxOutstandingR2T); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "DefaultTime2Wait", + sess->DefaultTime2Wait); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "DefaultTime2Retain", + sess->DefaultTime2Retain); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "FirstBurstLength", + sess->FirstBurstLength); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "MaxBurstLength", + sess->MaxBurstLength); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = iscsi_param_set(sess->params, "InitialR2T", + sess->InitialR2T ? "Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = iscsi_param_set(sess->params, "ImmediateData", + sess->ImmediateData ? "Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = iscsi_param_set(sess->params, "DataPDUInOrder", + sess->DataPDUInOrder ? "Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = iscsi_param_set(sess->params, "DataSequenceInOrder", + sess->DataSequenceInOrder ? 
"Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = iscsi_param_set_int(sess->params, "ErrorRecoveryLevel", + sess->ErrorRecoveryLevel); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + /* realloc buffer */ + rc = iscsi_param_set_int(conn->params, "MaxRecvDataSegmentLength", + conn->MaxRecvDataSegmentLength); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + /* sess for first connection of session */ + conn->sess = sess; + return 0; + +error_return: + iscsi_free_sess(sess); + conn->sess = NULL; + return -1; +} + +static struct spdk_iscsi_sess * +get_iscsi_sess_by_tsih(uint16_t tsih) +{ + struct spdk_iscsi_sess *session; + + if (tsih == 0 || tsih > g_iscsi.MaxSessions) { + return NULL; + } + + session = g_iscsi.session[tsih - 1]; + assert(tsih == session->tsih); + + return session; +} + +static uint8_t +append_iscsi_sess(struct spdk_iscsi_conn *conn, + const char *initiator_port_name, uint16_t tsih, uint16_t cid) +{ + struct spdk_iscsi_sess *sess; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "append session: init port name=%s, tsih=%u, cid=%u\n", + initiator_port_name, tsih, cid); + + sess = get_iscsi_sess_by_tsih(tsih); + if (sess == NULL) { + SPDK_ERRLOG("spdk_get_iscsi_sess_by_tsih failed\n"); + return ISCSI_LOGIN_CONN_ADD_FAIL; + } + if ((conn->pg_tag != sess->tag) || + (strcasecmp(initiator_port_name, spdk_scsi_port_get_name(sess->initiator_port)) != 0) || + (conn->target != sess->target)) { + /* no match */ + SPDK_ERRLOG("no MCS session for init port name=%s, tsih=%d, cid=%d\n", + initiator_port_name, tsih, cid); + return ISCSI_LOGIN_CONN_ADD_FAIL; + } + + if (sess->connections >= sess->MaxConnections) { + /* no slot for connection */ + SPDK_ERRLOG("too many connections for init port name=%s, tsih=%d, cid=%d\n", + initiator_port_name, tsih, cid); + return ISCSI_LOGIN_TOO_MANY_CONNECTIONS; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connections (tsih %d): %d\n", sess->tsih, sess->connections); + conn->sess = sess; + + /* + * TODO: need a mutex or other sync mechanism to protect the session's + * connection list. 
+ */ + sess->conns[sess->connections] = conn; + sess->connections++; + + return 0; +} + +static int +iscsi_append_text(struct spdk_iscsi_conn *conn __attribute__((__unused__)), + const char *key, const char *val, uint8_t *data, + int alloc_len, int data_len) +{ + int total; + int len; + + total = data_len; + if (alloc_len < 1) { + return 0; + } + if (total > alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + return total; + } + len = snprintf((char *) data + total, alloc_len - total, "%s=%s", key, val); + total += len + 1; + + return total; +} + +static int +iscsi_append_param(struct spdk_iscsi_conn *conn, const char *key, + uint8_t *data, int alloc_len, int data_len) +{ + struct iscsi_param *param; + int rc; + + param = iscsi_param_find(conn->params, key); + if (param == NULL) { + param = iscsi_param_find(conn->sess->params, key); + if (param == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "no key %.64s\n", key); + return data_len; + } + } + rc = iscsi_append_text(conn, param->key, param->val, data, + alloc_len, data_len); + return rc; +} + +static int +iscsi_auth_params(struct spdk_iscsi_conn *conn, + struct iscsi_param *params, const char *method, uint8_t *data, + int alloc_len, int data_len) +{ + char *in_val; + char *in_next; + char *new_val; + const char *algorithm; + const char *name; + const char *response; + const char *identifier; + const char *challenge; + int total; + int rc; + + if (conn == NULL || params == NULL || method == NULL) { + return -1; + } + if (strcasecmp(method, "CHAP") == 0) { + /* method OK */ + } else { + SPDK_ERRLOG("unsupported AuthMethod %.64s\n", method); + return -1; + } + + total = data_len; + if (alloc_len < 1) { + return 0; + } + if (total > alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + /* for temporary store */ + in_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!in_val) { + SPDK_ERRLOG("malloc() failed for temporary store\n"); + return -ENOMEM; + } + + /* CHAP method (RFC1994) */ + if ((algorithm = iscsi_param_get_val(params, "CHAP_A")) != NULL) { + if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_WAIT_A) { + SPDK_ERRLOG("CHAP sequence error\n"); + goto error_return; + } + + /* CHAP_A is LIST type */ + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", algorithm); + in_next = in_val; + while ((new_val = spdk_strsepq(&in_next, ",")) != NULL) { + if (strcasecmp(new_val, "5") == 0) { + /* CHAP with MD5 */ + break; + } + } + if (new_val == NULL) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Reject"); + new_val = in_val; + iscsi_append_text(conn, "CHAP_A", new_val, + data, alloc_len, total); + goto error_return; + } + /* selected algorithm is 5 (MD5) */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_A=%s\n", new_val); + total = iscsi_append_text(conn, "CHAP_A", new_val, + data, alloc_len, total); + + /* Identifier is one octet */ + gen_random(conn->auth.chap_id, 1); + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", + (int) conn->auth.chap_id[0]); + total = iscsi_append_text(conn, "CHAP_I", in_val, + data, alloc_len, total); + + /* Challenge Value is a variable stream of octets */ + /* (binary length MUST not exceed 1024 bytes) */ + conn->auth.chap_challenge_len = ISCSI_CHAP_CHALLENGE_LEN; + gen_random(conn->auth.chap_challenge, conn->auth.chap_challenge_len); + bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, + conn->auth.chap_challenge, conn->auth.chap_challenge_len); + total = iscsi_append_text(conn, "CHAP_C", 
in_val, + data, alloc_len, total); + + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_NR; + } else if ((name = iscsi_param_get_val(params, "CHAP_N")) != NULL) { + uint8_t resmd5[SPDK_MD5DIGEST_LEN]; + uint8_t tgtmd5[SPDK_MD5DIGEST_LEN]; + struct spdk_md5ctx md5ctx; + size_t decoded_len = 0; + + if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_WAIT_NR) { + SPDK_ERRLOG("CHAP sequence error\n"); + goto error_return; + } + + response = iscsi_param_get_val(params, "CHAP_R"); + if (response == NULL) { + SPDK_ERRLOG("no response\n"); + goto error_return; + } + if (response[0] == '0' && + (response[1] == 'x' || response[1] == 'X')) { + rc = hex2bin(resmd5, SPDK_MD5DIGEST_LEN, response); + if (rc < 0 || rc != SPDK_MD5DIGEST_LEN) { + SPDK_ERRLOG("response format error\n"); + goto error_return; + } + } else if (response[0] == '0' && + (response[1] == 'b' || response[1] == 'B')) { + response += 2; + rc = spdk_base64_decode(resmd5, &decoded_len, response); + if (rc < 0 || decoded_len != SPDK_MD5DIGEST_LEN) { + SPDK_ERRLOG("response format error\n"); + goto error_return; + } + } else { + SPDK_ERRLOG("response format error\n"); + goto error_return; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_N/CHAP_R\n"); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ag_tag=%d\n", conn->chap_group); + + rc = iscsi_chap_get_authinfo(&conn->auth, name, conn->chap_group); + if (rc < 0) { + /* SPDK_ERRLOG("auth user or secret is missing\n"); */ + SPDK_ERRLOG("iscsi_chap_get_authinfo() failed\n"); + goto error_return; + } + if (conn->auth.user[0] == '\0' || conn->auth.secret[0] == '\0') { + /* SPDK_ERRLOG("auth user or secret is missing\n"); */ + SPDK_ERRLOG("auth failed (name %.64s)\n", name); + goto error_return; + } + + md5init(&md5ctx); + /* Identifier */ + md5update(&md5ctx, conn->auth.chap_id, 1); + /* followed by secret */ + md5update(&md5ctx, conn->auth.secret, + strlen(conn->auth.secret)); + /* followed by Challenge Value */ + md5update(&md5ctx, conn->auth.chap_challenge, + conn->auth.chap_challenge_len); + /* tgtmd5 is expecting Response Value */ + md5final(tgtmd5, &md5ctx); + + bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, tgtmd5, SPDK_MD5DIGEST_LEN); + +#if 0 + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "tgtmd5=%s, resmd5=%s\n", in_val, response); + spdk_dump("tgtmd5", tgtmd5, SPDK_MD5DIGEST_LEN); + spdk_dump("resmd5", resmd5, SPDK_MD5DIGEST_LEN); +#endif + + /* compare MD5 digest */ + if (memcmp(tgtmd5, resmd5, SPDK_MD5DIGEST_LEN) != 0) { + /* not match */ + /* SPDK_ERRLOG("auth user or secret is missing\n"); */ + SPDK_ERRLOG("auth failed (name %.64s)\n", name); + goto error_return; + } + /* OK initiator's secret */ + conn->authenticated = true; + + /* mutual CHAP? 
*/ + identifier = iscsi_param_get_val(params, "CHAP_I"); + if (identifier != NULL) { + conn->auth.chap_mid[0] = (uint8_t) strtol(identifier, NULL, 10); + challenge = iscsi_param_get_val(params, "CHAP_C"); + if (challenge == NULL) { + SPDK_ERRLOG("CHAP sequence error\n"); + goto error_return; + } + if (challenge[0] == '0' && + (challenge[1] == 'x' || challenge[1] == 'X')) { + rc = hex2bin(conn->auth.chap_mchallenge, + ISCSI_CHAP_CHALLENGE_LEN, challenge); + if (rc < 0) { + SPDK_ERRLOG("challenge format error\n"); + goto error_return; + } + conn->auth.chap_mchallenge_len = rc; + } else if (challenge[0] == '0' && + (challenge[1] == 'b' || challenge[1] == 'B')) { + challenge += 2; + rc = spdk_base64_decode(conn->auth.chap_mchallenge, + &decoded_len, challenge); + if (rc < 0) { + SPDK_ERRLOG("challenge format error\n"); + goto error_return; + } + conn->auth.chap_mchallenge_len = decoded_len; + } else { + SPDK_ERRLOG("challenge format error\n"); + goto error_return; + } +#if 0 + spdk_dump("MChallenge", conn->auth.chap_mchallenge, + conn->auth.chap_mchallenge_len); +#endif + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_I/CHAP_C\n"); + + if (conn->auth.muser[0] == '\0' || conn->auth.msecret[0] == '\0') { + /* SPDK_ERRLOG("mutual auth user or secret is missing\n"); */ + SPDK_ERRLOG("auth failed (name %.64s)\n", name); + goto error_return; + } + + md5init(&md5ctx); + /* Identifier */ + md5update(&md5ctx, conn->auth.chap_mid, 1); + /* followed by secret */ + md5update(&md5ctx, conn->auth.msecret, + strlen(conn->auth.msecret)); + /* followed by Challenge Value */ + md5update(&md5ctx, conn->auth.chap_mchallenge, + conn->auth.chap_mchallenge_len); + /* tgtmd5 is Response Value */ + md5final(tgtmd5, &md5ctx); + + bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, tgtmd5, SPDK_MD5DIGEST_LEN); + + total = iscsi_append_text(conn, "CHAP_N", + conn->auth.muser, data, alloc_len, total); + total = iscsi_append_text(conn, "CHAP_R", + in_val, data, alloc_len, total); + } else { + /* not mutual */ + if (conn->mutual_chap) { + SPDK_ERRLOG("required mutual CHAP\n"); + goto error_return; + } + } + + conn->auth.chap_phase = ISCSI_CHAP_PHASE_END; + } else { + /* not found CHAP keys */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "start CHAP\n"); + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A; + } + + free(in_val); + return total; + +error_return: + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A; + free(in_val); + return -1; +} + +static int +iscsi_check_values(struct spdk_iscsi_conn *conn) +{ + if (conn->sess->FirstBurstLength > conn->sess->MaxBurstLength) { + SPDK_ERRLOG("FirstBurstLength(%d) > MaxBurstLength(%d)\n", + conn->sess->FirstBurstLength, + conn->sess->MaxBurstLength); + return -1; + } + if (conn->sess->FirstBurstLength > g_iscsi.FirstBurstLength) { + SPDK_ERRLOG("FirstBurstLength(%d) > iSCSI target restriction(%d)\n", + conn->sess->FirstBurstLength, g_iscsi.FirstBurstLength); + return -1; + } + if (conn->sess->MaxBurstLength > 0x00ffffff) { + SPDK_ERRLOG("MaxBurstLength(%d) > 0x00ffffff\n", + conn->sess->MaxBurstLength); + return -1; + } + + if (conn->MaxRecvDataSegmentLength < 512) { + SPDK_ERRLOG("MaxRecvDataSegmentLength(%d) < 512\n", + conn->MaxRecvDataSegmentLength); + return -1; + } + if (conn->MaxRecvDataSegmentLength > 0x00ffffff) { + SPDK_ERRLOG("MaxRecvDataSegmentLength(%d) > 0x00ffffff\n", + conn->MaxRecvDataSegmentLength); + return -1; + } + return 0; +} + +static int +iscsi_conn_params_update(struct spdk_iscsi_conn *conn) +{ + int rc; + uint32_t recv_buf_size; + + /* update internal variables */ + rc = 
iscsi_copy_param2var(conn); + if (rc < 0) { + SPDK_ERRLOG("iscsi_copy_param2var() failed\n"); + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } + return rc; + } + + /* check value */ + rc = iscsi_check_values(conn); + if (rc < 0) { + SPDK_ERRLOG("iscsi_check_values() failed\n"); + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } + } + + /* The socket receive buffer may need to be adjusted based on the new parameters */ + + /* Don't allow the recv buffer to be 0 or very large. */ + recv_buf_size = spdk_max(0x1000, spdk_min(0x2000, conn->sess->FirstBurstLength)); + + /* Add in extra space for the PDU */ + recv_buf_size += ISCSI_BHS_LEN + ISCSI_AHS_LEN; + + if (conn->header_digest) { + recv_buf_size += ISCSI_DIGEST_LEN; + } + + if (conn->data_digest) { + recv_buf_size += ISCSI_DIGEST_LEN; + } + + /* Set up to buffer up to 4 commands with immediate data at once */ + if (spdk_sock_set_recvbuf(conn->sock, recv_buf_size * 4) < 0) { + /* Not fatal. */ + } + + return rc; +} + +static void +iscsi_conn_login_pdu_err_complete(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->full_feature) { + iscsi_conn_params_update(conn); + } +} + +static void +iscsi_conn_login_pdu_success_complete(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->state >= ISCSI_CONN_STATE_EXITING) { + /* Connection is being exited before this callback is executed. */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connection is already exited.\n"); + return; + } + if (conn->full_feature) { + if (iscsi_conn_params_update(conn) != 0) { + return; + } + } + conn->state = ISCSI_CONN_STATE_RUNNING; + if (conn->full_feature != 0) { + iscsi_conn_schedule(conn); + } +} + +/* + * The response function of spdk_iscsi_op_login + */ +static void +iscsi_op_login_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, struct iscsi_param *params, + iscsi_conn_xfer_complete_cb cb_fn) +{ + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + rsph->version_max = ISCSI_VERSION; + rsph->version_act = ISCSI_VERSION; + DSET24(rsph->data_segment_len, rsp_pdu->data_segment_len); + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (conn->sess != NULL) { + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + } else { + to_be32(&rsph->exp_cmd_sn, rsp_pdu->cmd_sn); + to_be32(&rsph->max_cmd_sn, rsp_pdu->cmd_sn); + } + + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "PDU", (uint8_t *)rsph, ISCSI_BHS_LEN); + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "DATA", rsp_pdu->data, rsp_pdu->data_segment_len); + + /* Set T/CSG/NSG to reserved if login error. */ + if (rsph->status_class != 0) { + rsph->flags &= ~ISCSI_LOGIN_TRANSIT; + rsph->flags &= ~ISCSI_LOGIN_CURRENT_STAGE_MASK; + rsph->flags &= ~ISCSI_LOGIN_NEXT_STAGE_MASK; + } + iscsi_param_free(params); + iscsi_conn_write_pdu(conn, rsp_pdu, cb_fn, conn); +} + +/* + * The function which is used to initialize the internal response data + * structure of iscsi login function. 
+ * return: + * 0, success; + * otherwise, error; + */ +static int +iscsi_op_login_rsp_init(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu) +{ + struct iscsi_bhs_login_req *reqh; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + rsph->opcode = ISCSI_OP_LOGIN_RSP; + rsph->status_class = ISCSI_CLASS_SUCCESS; + rsph->status_detail = ISCSI_LOGIN_ACCEPT; + rsp_pdu->data_segment_len = 0; + + /* The default MaxRecvDataSegmentLength 8192 is used during login. - RFC3720 */ + rsp_pdu->data = calloc(1, 8192); + if (!rsp_pdu->data) { + SPDK_ERRLOG("calloc() failed for data segment\n"); + rsph->status_class = ISCSI_CLASS_TARGET_ERROR; + rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + rsp_pdu->data_buf_len = 8192; + + reqh = (struct iscsi_bhs_login_req *)&pdu->bhs; + rsph->flags |= (reqh->flags & ISCSI_LOGIN_TRANSIT); + rsph->flags |= (reqh->flags & ISCSI_LOGIN_CONTINUE); + rsph->flags |= (reqh->flags & ISCSI_LOGIN_CURRENT_STAGE_MASK); + if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { + rsph->flags |= (reqh->flags & ISCSI_LOGIN_NEXT_STAGE_MASK); + } + + /* We don't need to convert from network byte order. Just store it */ + memcpy(&rsph->isid, reqh->isid, 6); + rsph->tsih = reqh->tsih; + rsph->itt = reqh->itt; + rsp_pdu->cmd_sn = from_be32(&reqh->cmd_sn); + + if (rsph->tsih) { + rsph->stat_sn = reqh->exp_stat_sn; + } + + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "PDU", (uint8_t *)&pdu->bhs, ISCSI_BHS_LEN); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "T=%d, C=%d, CSG=%d, NSG=%d, Min=%d, Max=%d, ITT=%x\n", + ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags), + ISCSI_BHS_LOGIN_GET_CBIT(rsph->flags), + ISCSI_BHS_LOGIN_GET_CSG(rsph->flags), + ISCSI_BHS_LOGIN_GET_NSG(rsph->flags), + reqh->version_min, reqh->version_max, from_be32(&rsph->itt)); + + if (conn->sess != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u," + "MaxCmdSN=%u\n", rsp_pdu->cmd_sn, + from_be32(&rsph->stat_sn), conn->StatSN, + conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u\n", + rsp_pdu->cmd_sn, from_be32(&rsph->stat_sn), + conn->StatSN); + } + + if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags) && + ISCSI_BHS_LOGIN_GET_CBIT(rsph->flags)) { + SPDK_ERRLOG("transit error\n"); + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + /* make sure reqh->version_max < ISCSI_VERSION */ + if (reqh->version_min > ISCSI_VERSION) { + SPDK_ERRLOG("unsupported version min %d/max %d, expecting %d\n", reqh->version_min, + reqh->version_max, ISCSI_VERSION); + /* Unsupported version */ + /* set all reserved flag to zero */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_UNSUPPORTED_VERSION; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + if ((ISCSI_BHS_LOGIN_GET_NSG(rsph->flags) == ISCSI_NSG_RESERVED_CODE) && + ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { + /* set NSG to zero */ + rsph->flags &= ~ISCSI_LOGIN_NEXT_STAGE_MASK; + /* also set other bits to zero */ + rsph->flags &= ~ISCSI_LOGIN_TRANSIT; + rsph->flags &= ~ISCSI_LOGIN_CURRENT_STAGE_MASK; + SPDK_ERRLOG("Received reserved NSG code: %d\n", ISCSI_NSG_RESERVED_CODE); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} 
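
iscsi_op_login_rsp_init() above mirrors the T/C/CSG/NSG bits from the login request's flags byte into the response and rejects a few illegal combinations (T and C both set, a reserved NSG code, an unsupported version). A small standalone sketch of where those fields sit in the flags byte, following the RFC 3720 Login PDU layout, is given below; the mask names and the sample value are local to this sketch and are not SPDK's ISCSI_BHS_LOGIN_GET_* macros.

#include <stdint.h>
#include <stdio.h>

/* Login request/response flags byte, per RFC 3720 (sketch-local names). */
#define LOGIN_FLAG_TRANSIT   0x80u              /* T bit: ask to move to the next stage */
#define LOGIN_FLAG_CONTINUE  0x40u              /* C bit: text keys continue in the next PDU */
#define LOGIN_CSG(flags)     (((flags) >> 2) & 0x3u)  /* current stage */
#define LOGIN_NSG(flags)     ((flags) & 0x3u)         /* next stage, meaningful only when T=1 */

int
main(void)
{
	/* Example: T=1, C=0, CSG=1 (operational negotiation), NSG=3 (full feature). */
	uint8_t flags = 0x80 | (1 << 2) | 3;

	printf("T=%d C=%d CSG=%u NSG=%u\n",
	       !!(flags & LOGIN_FLAG_TRANSIT),
	       !!(flags & LOGIN_FLAG_CONTINUE),
	       LOGIN_CSG(flags),
	       LOGIN_NSG(flags));
	return 0;
}
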
+ +static int +iscsi_op_login_store_incoming_params(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu, + struct iscsi_param **params) +{ + struct iscsi_bhs_login_req *reqh; + struct iscsi_bhs_login_rsp *rsph; + int rc; + + reqh = (struct iscsi_bhs_login_req *)&pdu->bhs; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + rc = iscsi_parse_params(params, pdu->data, + pdu->data_segment_len, ISCSI_BHS_LOGIN_GET_CBIT(reqh->flags), + &conn->partial_text_parameter); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_params() failed\n"); + iscsi_param_free(*params); + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + return 0; +} + +/* + * This function is used to initialize the port info + * return + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_initialize_port(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, + uint32_t name_length, + struct iscsi_param *params) +{ + const char *val; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + /* Initiator Name and Port */ + val = iscsi_param_get_val(params, "InitiatorName"); + if (val == NULL) { + SPDK_ERRLOG("InitiatorName is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + snprintf(conn->initiator_name, sizeof(conn->initiator_name), "%s", val); + snprintf(initiator_port_name, name_length, + "%s,i,0x%12.12" PRIx64, val, iscsi_get_isid(rsph->isid)); + spdk_strlwr(conn->initiator_name); + spdk_strlwr(initiator_port_name); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Initiator name: %s\n", conn->initiator_name); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Initiator port: %s\n", initiator_port_name); + + return 0; +} + +/* + * This function is used to judge the session type + * return + * 0: success + * Other value: error + */ +static int +iscsi_op_login_session_type(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + enum session_type *session_type, + struct iscsi_param *params) +{ + const char *session_type_str; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + session_type_str = iscsi_param_get_val(params, "SessionType"); + if (session_type_str == NULL) { + if (rsph->tsih != 0) { + *session_type = SESSION_TYPE_NORMAL; + } else { + SPDK_ERRLOG("SessionType is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + } else { + if (strcasecmp(session_type_str, "Discovery") == 0) { + *session_type = SESSION_TYPE_DISCOVERY; + } else if (strcasecmp(session_type_str, "Normal") == 0) { + *session_type = SESSION_TYPE_NORMAL; + } else { + *session_type = SESSION_TYPE_INVALID; + SPDK_ERRLOG("SessionType is invalid\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Session Type: %s\n", session_type_str); + + return 0; +} + +/* + * This function is used to check the target info + * return: + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_check_target(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + const char *target_name, 
+ struct spdk_iscsi_tgt_node **target) +{ + bool result; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + *target = iscsi_find_tgt_node(target_name); + if (*target == NULL) { + SPDK_WARNLOG("target %s not found\n", target_name); + /* Not found */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_TARGET_NOT_FOUND; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + if (iscsi_tgt_node_is_destructed(*target)) { + SPDK_ERRLOG("target %s is removed\n", target_name); + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_TARGET_REMOVED; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + result = iscsi_tgt_node_access(conn, *target, + conn->initiator_name, + conn->initiator_addr); + if (!result) { + SPDK_ERRLOG("access denied\n"); + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHORIZATION_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* + * This function use to check the session + * return: + * 0, success + * otherwise: error + */ +static int +iscsi_op_login_check_session(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, int cid) + +{ + int rc = 0; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + /* check existing session */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "isid=%"PRIx64", tsih=%u, cid=%u\n", + iscsi_get_isid(rsph->isid), from_be16(&rsph->tsih), cid); + if (rsph->tsih != 0) { + /* multiple connections */ + rc = append_iscsi_sess(conn, initiator_port_name, + from_be16(&rsph->tsih), cid); + if (rc != 0) { + SPDK_ERRLOG("isid=%"PRIx64", tsih=%u, cid=%u:" + "spdk_append_iscsi_sess() failed\n", + iscsi_get_isid(rsph->isid), from_be16(&rsph->tsih), + cid); + /* Can't include in session */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = rc; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + } else if (!g_iscsi.AllowDuplicateIsid) { + /* new session, drop old sess by the initiator */ + iscsi_drop_conns(conn, initiator_port_name, 0 /* drop old */); + } + + return rc; +} + +/* + * This function is used to del the original param and update it with new + * value + * return: + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_update_param(struct spdk_iscsi_conn *conn, + const char *key, const char *value, + const char *list) +{ + int rc = 0; + struct iscsi_param *new_param, *orig_param; + int index; + + orig_param = iscsi_param_find(conn->params, key); + if (orig_param == NULL) { + SPDK_ERRLOG("orig_param %s not found\n", key); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + index = orig_param->state_index; + rc = iscsi_param_del(&conn->params, key); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_del(%s) failed\n", key); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + rc = iscsi_param_add(&conn->params, key, value, list, ISPT_LIST); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_add() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + new_param = iscsi_param_find(conn->params, key); + if (new_param == NULL) { + SPDK_ERRLOG("iscsi_param_find() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + new_param->state_index = index; + return rc; +} + +static int +iscsi_negotiate_chap_param(struct spdk_iscsi_conn *conn) +{ + int rc = 0; + + if (conn->disable_chap) { + rc = iscsi_op_login_update_param(conn, "AuthMethod", "None", "None"); + } else if (conn->require_chap) { + rc = 
iscsi_op_login_update_param(conn, "AuthMethod", "CHAP", "CHAP"); + } + + return rc; +} + +/* + * The function which is used to handle the part of session discovery + * return: + * 0, success; + * otherwise: error; + */ +static int +iscsi_op_login_session_discovery_chap(struct spdk_iscsi_conn *conn) +{ + return iscsi_negotiate_chap_param(conn); +} + +/* + * This function is used to update the param related with chap + * return: + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_negotiate_chap_param(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target) +{ + conn->disable_chap = target->disable_chap; + conn->require_chap = target->require_chap; + conn->mutual_chap = target->mutual_chap; + conn->chap_group = target->chap_group; + + return iscsi_negotiate_chap_param(conn); +} + +static int +iscsi_op_login_negotiate_digest_param(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target) +{ + int rc; + + if (target->header_digest) { + /* + * User specified header digests, so update the list of + * HeaderDigest values to remove "None" so that only + * initiators who support CRC32C can connect. + */ + rc = iscsi_op_login_update_param(conn, "HeaderDigest", "CRC32C", "CRC32C"); + if (rc < 0) { + return rc; + } + } + + if (target->data_digest) { + /* + * User specified data digests, so update the list of + * DataDigest values to remove "None" so that only + * initiators who support CRC32C can connect. + */ + rc = iscsi_op_login_update_param(conn, "DataDigest", "CRC32C", "CRC32C"); + if (rc < 0) { + return rc; + } + } + + return 0; +} + +/* + * The function which is used to handle the part of normal login session + * return: + * 0, success; + * SPDK_ISCSI_LOGIN_ERROR_PARAMETER, parameter error; + */ +static int +iscsi_op_login_session_normal(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, + struct iscsi_param *params, + int cid) +{ + struct spdk_iscsi_tgt_node *target = NULL; + const char *target_name; + const char *target_short_name; + struct iscsi_bhs_login_rsp *rsph; + int rc = 0; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + target_name = iscsi_param_get_val(params, "TargetName"); + + if (target_name == NULL) { + SPDK_ERRLOG("TargetName is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + memset(conn->target_short_name, 0, MAX_TARGET_NAME); + target_short_name = strstr(target_name, ":"); + if (target_short_name != NULL) { + target_short_name++; /* Advance past the ':' */ + if (strlen(target_short_name) >= MAX_TARGET_NAME) { + SPDK_ERRLOG("Target Short Name (%s) is more than %u characters\n", + target_short_name, MAX_TARGET_NAME); + /* Invalid request */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + snprintf(conn->target_short_name, MAX_TARGET_NAME, "%s", + target_short_name); + } + + pthread_mutex_lock(&g_iscsi.mutex); + rc = iscsi_op_login_check_target(conn, rsp_pdu, target_name, &target); + pthread_mutex_unlock(&g_iscsi.mutex); + + if (rc < 0) { + return rc; + } + + conn->target = target; + conn->dev = target->dev; + conn->target_port = spdk_scsi_dev_find_port_by_id(target->dev, + conn->pg_tag); + + rc = iscsi_op_login_check_session(conn, rsp_pdu, + initiator_port_name, cid); + if (rc < 0) { + return rc; + } + + /* force target flags */ + 
pthread_mutex_lock(&target->mutex); + rc = iscsi_op_login_negotiate_chap_param(conn, target); + pthread_mutex_unlock(&target->mutex); + + if (rc == 0) { + rc = iscsi_op_login_negotiate_digest_param(conn, target); + } + + if (rc != 0) { + /* Invalid request */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST; + } + + return rc; +} + +/* + * This function is used to set the info in the connection data structure + * return + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_set_conn_info(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, + enum session_type session_type, int cid) +{ + int rc = 0; + struct spdk_iscsi_tgt_node *target; + struct iscsi_bhs_login_rsp *rsph; + struct spdk_scsi_port *initiator_port; + + target = conn->target; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + conn->authenticated = false; + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A; + conn->cid = cid; + + if (conn->sess == NULL) { + /* create initiator port */ + initiator_port = spdk_scsi_port_create(iscsi_get_isid(rsph->isid), 0, initiator_port_name); + if (initiator_port == NULL) { + SPDK_ERRLOG("create_port() failed\n"); + rsph->status_class = ISCSI_CLASS_TARGET_ERROR; + rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + /* new session */ + rc = create_iscsi_sess(conn, target, session_type); + if (rc < 0) { + spdk_scsi_port_free(&initiator_port); + SPDK_ERRLOG("create_sess() failed\n"); + rsph->status_class = ISCSI_CLASS_TARGET_ERROR; + rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + /* initialize parameters */ + conn->sess->initiator_port = initiator_port; + conn->StatSN = from_be32(&rsph->stat_sn); + conn->sess->isid = iscsi_get_isid(rsph->isid); + + /* Initiator port TransportID */ + spdk_scsi_port_set_iscsi_transport_id(conn->sess->initiator_port, + conn->initiator_name, + conn->sess->isid); + + /* Discovery sessions will not have a target. */ + if (target != NULL) { + conn->sess->queue_depth = target->queue_depth; + } else { + /* + * Assume discovery sessions have an effective command + * windows size of 1. 
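+			 * With queue_depth == 1, the MaxCmdSN computed below is
+			 * equal to ExpCmdSN, so the initiator can have at most
+			 * one command outstanding on the discovery session.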
+ */ + conn->sess->queue_depth = 1; + } + conn->sess->ExpCmdSN = rsp_pdu->cmd_sn; + conn->sess->MaxCmdSN = rsp_pdu->cmd_sn + conn->sess->queue_depth - 1; + } + + conn->initiator_port = conn->sess->initiator_port; + + return 0; +} + +/* + * This function is used to set the target info + * return + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_set_target_info(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + enum session_type session_type) +{ + char buf[MAX_TMPBUF]; + const char *val; + int rc = 0; + struct spdk_iscsi_tgt_node *target = conn->target; + + /* declarative parameters */ + if (target != NULL) { + pthread_mutex_lock(&target->mutex); + if (target->alias[0] != '\0') { + snprintf(buf, sizeof buf, "%s", target->alias); + } else { + snprintf(buf, sizeof buf, "%s", ""); + } + pthread_mutex_unlock(&target->mutex); + rc = iscsi_param_set(conn->sess->params, "TargetAlias", buf); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + } + snprintf(buf, sizeof buf, "%s:%s,%d", conn->portal_host, conn->portal_port, + conn->pg_tag); + rc = iscsi_param_set(conn->sess->params, "TargetAddress", buf); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + snprintf(buf, sizeof buf, "%d", conn->pg_tag); + rc = iscsi_param_set(conn->sess->params, "TargetPortalGroupTag", buf); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + /* write in response */ + if (target != NULL) { + val = iscsi_param_get_val(conn->sess->params, "TargetAlias"); + if (val != NULL && strlen(val) != 0) { + rsp_pdu->data_segment_len = iscsi_append_param(conn, + "TargetAlias", + rsp_pdu->data, + rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + } + if (session_type == SESSION_TYPE_DISCOVERY) { + rsp_pdu->data_segment_len = iscsi_append_param(conn, + "TargetAddress", + rsp_pdu->data, + rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + } + rsp_pdu->data_segment_len = iscsi_append_param(conn, + "TargetPortalGroupTag", + rsp_pdu->data, + rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + } + + return rc; +} + +/* + * This function is used to handle the login of iscsi initiator when there is + * no session + * return: + * 0, success; + * SPDK_ISCSI_LOGIN_ERROR_PARAMETER, parameter error; + * SPDK_ISCSI_LOGIN_ERROR_RESPONSE, used to notify the login fail. 
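+ * On SPDK_ISCSI_LOGIN_ERROR_RESPONSE, the status_class and status_detail
+ * fields of rsp_pdu's BHS have already been filled in by whichever helper
+ * detected the failure, so the caller only needs to send rsp_pdu back.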
+ */ +static int +iscsi_op_login_phase_none(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + struct iscsi_param *params, int cid) +{ + enum session_type session_type; + char initiator_port_name[MAX_INITIATOR_PORT_NAME]; + struct iscsi_bhs_login_rsp *rsph; + int rc = 0; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + conn->target = NULL; + conn->dev = NULL; + + rc = iscsi_op_login_initialize_port(conn, rsp_pdu, initiator_port_name, + MAX_INITIATOR_PORT_NAME, params); + if (rc < 0) { + return rc; + } + + rc = iscsi_op_login_session_type(conn, rsp_pdu, &session_type, params); + if (rc < 0) { + return rc; + } + + /* Target Name and Port */ + if (session_type == SESSION_TYPE_NORMAL) { + rc = iscsi_op_login_session_normal(conn, rsp_pdu, + initiator_port_name, + params, cid); + if (rc < 0) { + return rc; + } + + } else if (session_type == SESSION_TYPE_DISCOVERY) { + rsph->tsih = 0; + + /* force target flags */ + pthread_mutex_lock(&g_iscsi.mutex); + rc = iscsi_op_login_session_discovery_chap(conn); + pthread_mutex_unlock(&g_iscsi.mutex); + if (rc < 0) { + return rc; + } + } else { + SPDK_ERRLOG("unknown session type\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + rc = iscsi_op_login_set_conn_info(conn, rsp_pdu, initiator_port_name, + session_type, cid); + if (rc < 0) { + return rc; + } + + /* limit conns on discovery session */ + if (session_type == SESSION_TYPE_DISCOVERY) { + conn->sess->MaxConnections = 1; + rc = iscsi_param_set_int(conn->sess->params, + "MaxConnections", + conn->sess->MaxConnections); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + } + + return iscsi_op_login_set_target_info(conn, rsp_pdu, session_type); +} + +/* + * This function is used to set the csg bit case in rsp + * return: + * 0, success + * otherwise: error + */ +static int +iscsi_op_login_rsp_handle_csg_bit(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + struct iscsi_param *params) +{ + const char *auth_method; + int rc; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + switch (ISCSI_BHS_LOGIN_GET_CSG(rsph->flags)) { + case ISCSI_SECURITY_NEGOTIATION_PHASE: + /* SecurityNegotiation */ + auth_method = iscsi_param_get_val(conn->params, "AuthMethod"); + if (auth_method == NULL) { + SPDK_ERRLOG("AuthMethod is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + if (strcasecmp(auth_method, "None") == 0) { + conn->authenticated = true; + } else { + rc = iscsi_auth_params(conn, params, auth_method, + rsp_pdu->data, rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + if (rc < 0) { + SPDK_ERRLOG("iscsi_auth_params() failed\n"); + /* Authentication failure */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + rsp_pdu->data_segment_len = rc; + if (!conn->authenticated) { + /* not complete */ + rsph->flags &= ~ISCSI_LOGIN_TRANSIT; + } else { + if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_END) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CHAP phase not complete"); + } + } + + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "Negotiated Auth Params", + rsp_pdu->data, rsp_pdu->data_segment_len); + } + break; + + case 
ISCSI_OPERATIONAL_NEGOTIATION_PHASE: + /* LoginOperationalNegotiation */ + if (conn->state == ISCSI_CONN_STATE_INVALID) { + if (conn->require_chap) { + /* Authentication failure */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } else { + /* AuthMethod=None */ + conn->authenticated = true; + } + } + if (!conn->authenticated) { + SPDK_ERRLOG("authentication error\n"); + /* Authentication failure */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + break; + + case ISCSI_FULL_FEATURE_PHASE: + /* FullFeaturePhase */ + SPDK_ERRLOG("XXX Login in FullFeaturePhase\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + + default: + SPDK_ERRLOG("unknown stage\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* This function is used to notify the session info + * return + * 0: success + * otherwise: error + */ +static int +iscsi_op_login_notify_session_info(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu) +{ + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + if (conn->sess->session_type == SESSION_TYPE_NORMAL) { + /* normal session */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Login from %s (%s) on %s tgt_node%d" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->target->name, conn->target->num, + conn->portal_host, conn->portal_port, conn->pg_tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? "on" : "off")); + } else if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + /* discovery session */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Login(discovery) from %s (%s) on" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->portal_host, conn->portal_port, conn->pg_tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? 
"on" : "off")); + } else { + SPDK_ERRLOG("unknown session type\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* + * This function is to handle the tbit cases + * return + * 0: success + * otherwise error + */ +static int +iscsi_op_login_rsp_handle_t_bit(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu) +{ + int rc; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + switch (ISCSI_BHS_LOGIN_GET_NSG(rsph->flags)) { + case ISCSI_SECURITY_NEGOTIATION_PHASE: + /* SecurityNegotiation */ + conn->login_phase = ISCSI_SECURITY_NEGOTIATION_PHASE; + break; + + case ISCSI_OPERATIONAL_NEGOTIATION_PHASE: + /* LoginOperationalNegotiation */ + conn->login_phase = ISCSI_OPERATIONAL_NEGOTIATION_PHASE; + break; + + case ISCSI_FULL_FEATURE_PHASE: + /* FullFeaturePhase */ + conn->login_phase = ISCSI_FULL_FEATURE_PHASE; + to_be16(&rsph->tsih, conn->sess->tsih); + + rc = iscsi_op_login_notify_session_info(conn, rsp_pdu); + if (rc < 0) { + return rc; + } + + conn->full_feature = 1; + break; + + default: + SPDK_ERRLOG("unknown stage\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* + * This function is used to set the values of the internal data structure used + * by spdk_iscsi_op_login function + * return: + * 0, used to notify the a successful login + * SPDK_ISCSI_LOGIN_ERROR_RESPONSE, used to notify a failure login. + */ +static int +iscsi_op_login_rsp_handle(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, struct iscsi_param **params) +{ + int rc; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + /* negotiate parameters */ + rc = iscsi_negotiate_params(conn, params, rsp_pdu->data, + rsp_pdu->data_buf_len, + rsp_pdu->data_segment_len); + if (rc < 0) { + /* + * iscsi_negotiate_params just returns -1 on failure, + * so translate this into meaningful response codes and + * return values. + */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + rsp_pdu->data_segment_len = rc; + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "Negotiated Params", rsp_pdu->data, rc); + + /* handle the CSG bit case */ + rc = iscsi_op_login_rsp_handle_csg_bit(conn, rsp_pdu, *params); + if (rc < 0) { + return rc; + } + + /* handle the T bit case */ + if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { + rc = iscsi_op_login_rsp_handle_t_bit(conn, rsp_pdu); + } + + return rc; +} + +static int +iscsi_pdu_hdr_op_login(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int rc; + struct iscsi_bhs_login_req *reqh; + struct spdk_iscsi_pdu *rsp_pdu; + + if (conn->full_feature && conn->sess != NULL && + conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_login_req *)&pdu->bhs; + pdu->cmd_sn = from_be32(&reqh->cmd_sn); + + /* During login processing, use the 8KB default FirstBurstLength as + * our maximum data segment length value. 
+ */ + if (pdu->data_segment_len > SPDK_ISCSI_FIRST_BURST_LENGTH) { + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + rc = iscsi_op_login_rsp_init(conn, pdu, rsp_pdu); + if (rc < 0) { + iscsi_op_login_response(conn, rsp_pdu, NULL, iscsi_conn_login_pdu_err_complete); + return 0; + } + + conn->login_rsp_pdu = rsp_pdu; + return 0; +} + +static int +iscsi_pdu_payload_op_login(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int rc; + struct iscsi_bhs_login_req *reqh; + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_param *params = NULL; + int cid; + + if (conn->login_rsp_pdu == NULL) { + return 0; + } + + rsp_pdu = conn->login_rsp_pdu; + + reqh = (struct iscsi_bhs_login_req *)&pdu->bhs; + cid = from_be16(&reqh->cid); + + rc = iscsi_op_login_store_incoming_params(conn, pdu, rsp_pdu, ¶ms); + if (rc < 0) { + iscsi_op_login_response(conn, rsp_pdu, NULL, iscsi_conn_login_pdu_err_complete); + return 0; + } + + if (conn->state == ISCSI_CONN_STATE_INVALID) { + rc = iscsi_op_login_phase_none(conn, rsp_pdu, params, cid); + if (rc == SPDK_ISCSI_LOGIN_ERROR_RESPONSE || rc == SPDK_ISCSI_LOGIN_ERROR_PARAMETER) { + iscsi_op_login_response(conn, rsp_pdu, params, iscsi_conn_login_pdu_err_complete); + return 0; + } + } + + rc = iscsi_op_login_rsp_handle(conn, rsp_pdu, ¶ms); + if (rc == SPDK_ISCSI_LOGIN_ERROR_RESPONSE) { + iscsi_op_login_response(conn, rsp_pdu, params, iscsi_conn_login_pdu_err_complete); + return 0; + } + + iscsi_op_login_response(conn, rsp_pdu, params, iscsi_conn_login_pdu_success_complete); + return 0; +} + +static int +iscsi_pdu_hdr_op_text(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + uint32_t task_tag; + uint32_t ExpStatSN; + int F_bit, C_bit; + struct iscsi_bhs_text_req *reqh; + + if (pdu->data_segment_len > iscsi_get_max_immediate_data_size()) { + SPDK_ERRLOG("data segment len(=%zu) > immediate data len(=%"PRIu32")\n", + pdu->data_segment_len, iscsi_get_max_immediate_data_size()); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + reqh = (struct iscsi_bhs_text_req *)&pdu->bhs; + + F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL); + C_bit = !!(reqh->flags & ISCSI_TEXT_CONTINUE); + task_tag = from_be32(&reqh->itt); + ExpStatSN = from_be32(&reqh->exp_stat_sn); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, F=%d, C=%d, ITT=%x, TTT=%x\n", + reqh->immediate, F_bit, C_bit, task_tag, from_be32(&reqh->ttt)); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + pdu->cmd_sn, ExpStatSN, conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + + if (ExpStatSN != conn->StatSN) { +#if 0 + SPDK_ERRLOG("StatSN(%u) error\n", ExpStatSN); + return -1; +#else + /* StarPort have a bug */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u) rewound\n", ExpStatSN); + conn->StatSN = ExpStatSN; +#endif + } + + if (F_bit && C_bit) { + SPDK_ERRLOG("final and continue\n"); + return -1; + } + + /* + * If this is the first text op in a sequence, save the ITT so we can + * compare it against the ITT for subsequent ops in the same sequence. + * If a subsequent text op in same sequence has a different ITT, reject + * that PDU. 
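+	 * The reserved value 0xffffffffU means that no text sequence is in
+	 * progress; it is restored once the final (F bit) text response has
+	 * been built in iscsi_pdu_payload_op_text().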
+ */ + if (conn->sess->current_text_itt == 0xffffffffU) { + conn->sess->current_text_itt = task_tag; + } else if (conn->sess->current_text_itt != task_tag) { + SPDK_ERRLOG("The correct itt is %u, and the current itt is %u...\n", + conn->sess->current_text_itt, task_tag); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + return 0; +} + +static void +iscsi_conn_text_pdu_complete(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + iscsi_conn_params_update(conn); +} + +static int +iscsi_pdu_payload_op_text(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_param *params = NULL; + struct spdk_iscsi_pdu *rsp_pdu; + uint8_t *data; + uint64_t lun; + uint32_t task_tag; + const char *val; + int F_bit, C_bit; + int data_len; + int alloc_len; + int rc; + struct iscsi_bhs_text_req *reqh; + struct iscsi_bhs_text_resp *rsph; + + data_len = 0; + alloc_len = conn->MaxRecvDataSegmentLength; + + reqh = (struct iscsi_bhs_text_req *)&pdu->bhs; + + F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL); + C_bit = !!(reqh->flags & ISCSI_TEXT_CONTINUE); + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + + /* store incoming parameters */ + rc = iscsi_parse_params(¶ms, pdu->data, pdu->data_segment_len, + C_bit, &conn->partial_text_parameter); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_params() failed\n"); + iscsi_param_free(params); + return -1; + } + + data = calloc(1, alloc_len); + if (!data) { + SPDK_ERRLOG("calloc() failed for data segment\n"); + iscsi_param_free(params); + return -ENOMEM; + } + + /* negotiate parameters */ + data_len = iscsi_negotiate_params(conn, ¶ms, + data, alloc_len, data_len); + if (data_len < 0) { + SPDK_ERRLOG("iscsi_negotiate_params() failed\n"); + iscsi_param_free(params); + free(data); + return -1; + } + + /* sendtargets is special case */ + val = iscsi_param_get_val(params, "SendTargets"); + if (val != NULL) { + if (iscsi_param_eq_val(conn->sess->params, + "SessionType", "Discovery")) { + if (strcasecmp(val, "") == 0) { + val = "ALL"; + } + + data_len = iscsi_send_tgts(conn, + conn->initiator_name, + conn->initiator_addr, + val, data, alloc_len, + data_len); + } else { + if (strcasecmp(val, "") == 0) { + val = conn->target->name; + } + + if (strcasecmp(val, "ALL") == 0) { + /* not in discovery session */ + data_len = iscsi_append_text(conn, + "SendTargets", + "Reject", data, + alloc_len, data_len); + } else { + data_len = iscsi_send_tgts(conn, + conn->initiator_name, + conn->initiator_addr, + val, data, alloc_len, + data_len); + } + } + } else { + if (iscsi_param_eq_val(conn->sess->params, "SessionType", "Discovery")) { + iscsi_param_free(params); + free(data); + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + iscsi_param_free(params); + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "Negotiated Params", data, data_len); + + /* response PDU */ + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + free(data); + return SPDK_ISCSI_CONNECTION_FATAL; + } + rsph = (struct iscsi_bhs_text_resp *)&rsp_pdu->bhs; + + rsp_pdu->data = data; + rsph->opcode = ISCSI_OP_TEXT_RSP; + + if (F_bit) { + rsph->flags |= ISCSI_FLAG_FINAL; + } + + if (C_bit) { + rsph->flags |= ISCSI_TEXT_CONTINUE; + } + + DSET24(rsph->data_segment_len, data_len); + to_be64(&rsph->lun, lun); + to_be32(&rsph->itt, task_tag); + + if (F_bit) { + rsph->ttt = 0xffffffffU; + conn->sess->current_text_itt = 0xffffffffU; + } else { + to_be32(&rsph->ttt, 1 + conn->id); + } + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (reqh->immediate == 0) { + conn->sess->MaxCmdSN++; + 
} + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_text_pdu_complete, conn); + return 0; +} + +static void iscsi_conn_logout_pdu_complete(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->sess == NULL) { + /* + * login failed but initiator still sent a logout rather than + * just closing the TCP connection. + */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Logout(login failed) from %s (%s) on" + " (%s:%s,%d)\n", + conn->initiator_name, conn->initiator_addr, + conn->portal_host, conn->portal_port, conn->pg_tag); + } else if (iscsi_param_eq_val(conn->sess->params, "SessionType", "Normal")) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Logout from %s (%s) on %s tgt_node%d" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->target->name, conn->target->num, + conn->portal_host, conn->portal_port, conn->pg_tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? "on" : "off")); + } else { + /* discovery session */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Logout(discovery) from %s (%s) on" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->portal_host, conn->portal_port, conn->pg_tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? "on" : "off")); + } +} + +static int +iscsi_pdu_hdr_op_logout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_pdu *rsp_pdu; + uint32_t task_tag; + uint32_t ExpStatSN; + int response; + struct iscsi_bhs_logout_req *reqh; + struct iscsi_bhs_logout_resp *rsph; + uint16_t cid; + + reqh = (struct iscsi_bhs_logout_req *)&pdu->bhs; + + cid = from_be16(&reqh->cid); + task_tag = from_be32(&reqh->itt); + ExpStatSN = from_be32(&reqh->exp_stat_sn); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "reason=%d, ITT=%x, cid=%d\n", + reqh->reason, task_tag, cid); + + if (conn->sess != NULL) { + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY && + reqh->reason != ISCSI_LOGOUT_REASON_CLOSE_SESSION) { + SPDK_ERRLOG("Target can accept logout only with reason \"close the session\" " + "on discovery session. 
%d is not acceptable reason.\n", + reqh->reason); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + pdu->cmd_sn, ExpStatSN, conn->StatSN, + conn->sess->ExpCmdSN, conn->sess->MaxCmdSN); + + if (pdu->cmd_sn != conn->sess->ExpCmdSN) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN(%u) might have dropped\n", pdu->cmd_sn); + /* ignore error */ + } + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN=%u, ExpStatSN=%u, StatSN=%u\n", + pdu->cmd_sn, ExpStatSN, conn->StatSN); + } + + if (ExpStatSN != conn->StatSN) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u/%u) might have dropped\n", + ExpStatSN, conn->StatSN); + /* ignore error */ + } + + if (conn->id == cid) { + /* connection or session closed successfully */ + response = 0; + iscsi_conn_logout(conn); + } else { + response = 1; + } + + /* response PDU */ + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + rsph = (struct iscsi_bhs_logout_resp *)&rsp_pdu->bhs; + rsp_pdu->data = NULL; + rsph->opcode = ISCSI_OP_LOGOUT_RSP; + rsph->flags |= 0x80; /* bit 0 must be 1 */ + rsph->response = response; + DSET24(rsph->data_segment_len, 0); + to_be32(&rsph->itt, task_tag); + + if (conn->sess != NULL) { + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (conn->sess->connections == 1) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + } else { + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + to_be32(&rsph->exp_cmd_sn, pdu->cmd_sn); + to_be32(&rsph->max_cmd_sn, pdu->cmd_sn); + } + + rsph->time_2_wait = 0; + rsph->time_2_retain = 0; + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_logout_pdu_complete, conn); + + return 0; +} + +static int +iscsi_send_r2t(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, int offset, + int len, uint32_t transfer_tag, uint32_t *R2TSN) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_r2t *rsph; + uint64_t fmt_lun; + + /* R2T PDU */ + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + rsph = (struct iscsi_bhs_r2t *)&rsp_pdu->bhs; + rsp_pdu->data = NULL; + rsph->opcode = ISCSI_OP_R2T; + rsph->flags |= 0x80; /* bit 0 is default to 1 */ + fmt_lun = spdk_scsi_lun_id_int_to_fmt(task->lun_id); + to_be64(&rsph->lun, fmt_lun); + to_be32(&rsph->itt, task->tag); + to_be32(&rsph->ttt, transfer_tag); + + to_be32(&rsph->stat_sn, conn->StatSN); + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + to_be32(&rsph->r2t_sn, *R2TSN); + *R2TSN += 1; + + task->r2t_datasn = 0; /* next expected datasn to ack */ + + to_be32(&rsph->buffer_offset, (uint32_t)offset); + to_be32(&rsph->desired_xfer_len, (uint32_t)len); + task->desired_data_transfer_length = (size_t)len; + + /* we need to hold onto this task/cmd because until the PDU has been + * written out */ + rsp_pdu->task = task; + task->scsi.ref++; + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); + + return 0; +} + +/* This function is used to remove the r2t pdu from snack_pdu_list by < task, r2t_sn> info */ +static struct spdk_iscsi_pdu * +iscsi_remove_r2t_pdu_from_snack_list(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + uint32_t r2t_sn) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_r2t *r2t_header; + + TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) { + if (pdu->bhs.opcode == 
ISCSI_OP_R2T) { + r2t_header = (struct iscsi_bhs_r2t *)&pdu->bhs; + if (pdu->task == task && + from_be32(&r2t_header->r2t_sn) == r2t_sn) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + return pdu; + } + } + } + + return NULL; +} + +/* This function is used re-send the r2t packet */ +static int +iscsi_send_r2t_recovery(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, uint32_t r2t_sn, + bool send_new_r2tsn) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_r2t *rsph; + uint32_t transfer_len; + uint32_t len; + int rc; + + /* remove the r2t pdu from the snack_list */ + pdu = iscsi_remove_r2t_pdu_from_snack_list(conn, task, r2t_sn); + if (!pdu) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "No pdu is found\n"); + return -1; + } + + /* flag + * false: only need to re-send the old r2t with changing statsn + * true: we send a r2t with new r2tsn + */ + if (!send_new_r2tsn) { + to_be32(&pdu->bhs.stat_sn, conn->StatSN); + iscsi_conn_write_pdu(conn, pdu, iscsi_conn_pdu_generic_complete, NULL); + } else { + rsph = (struct iscsi_bhs_r2t *)&pdu->bhs; + transfer_len = from_be32(&rsph->desired_xfer_len); + + /* still need to increase the acked r2tsn */ + task->acked_r2tsn++; + len = spdk_min(conn->sess->MaxBurstLength, + (transfer_len - task->next_expected_r2t_offset)); + + /* remove the old_r2t_pdu */ + iscsi_conn_free_pdu(conn, pdu); + + /* re-send a new r2t pdu */ + rc = iscsi_send_r2t(conn, task, task->next_expected_r2t_offset, + len, task->ttt, &task->R2TSN); + if (rc < 0) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + return 0; +} + +static int +add_transfer_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + uint32_t transfer_len; + size_t max_burst_len; + size_t segment_len; + size_t data_len; + int len; + int rc; + int data_out_req; + + transfer_len = task->scsi.transfer_len; + data_len = iscsi_task_get_pdu(task)->data_segment_len; + max_burst_len = conn->sess->MaxBurstLength; + segment_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; + data_out_req = 1 + (transfer_len - data_len - 1) / segment_len; + task->data_out_cnt = data_out_req; + + /* + * If we already have too many tasks using R2T, then queue this task + * and start sending R2T for it after some of the tasks using R2T/data + * out buffers complete. + */ + if (conn->pending_r2t >= DEFAULT_MAXR2T) { + TAILQ_INSERT_TAIL(&conn->queued_r2t_tasks, task, link); + return 0; + } + + conn->data_out_cnt += data_out_req; + conn->pending_r2t++; + + task->next_expected_r2t_offset = data_len; + task->current_r2t_length = 0; + task->R2TSN = 0; + /* According to RFC3720 10.8.5, 0xffffffff is + * reserved for TTT in R2T. + */ + if (++conn->ttt == 0xffffffffu) { + conn->ttt = 0; + } + task->ttt = conn->ttt; + + while (data_len != transfer_len) { + len = spdk_min(max_burst_len, (transfer_len - data_len)); + rc = iscsi_send_r2t(conn, task, data_len, len, + task->ttt, &task->R2TSN); + if (rc < 0) { + SPDK_ERRLOG("iscsi_send_r2t() failed\n"); + return rc; + } + data_len += len; + task->next_r2t_offset = data_len; + task->outstanding_r2t++; + if (conn->sess->MaxOutstandingR2T == task->outstanding_r2t) { + break; + } + } + + TAILQ_INSERT_TAIL(&conn->active_r2t_tasks, task, link); + task->is_r2t_active = true; + return 0; +} + +/* If there are additional large writes queued for R2Ts, start them now. + * This is called when a large write is just completed or when multiple LUNs + * are attached and large write tasks for the specific LUN are cleared. 
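+ * Queued tasks are moved to the active list via add_transfer_task()
+ * until the connection reaches the DEFAULT_MAXR2T limit again.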
+ */ +static void +start_queued_transfer_tasks(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_task *task, *tmp; + + TAILQ_FOREACH_SAFE(task, &conn->queued_r2t_tasks, link, tmp) { + if (conn->pending_r2t < DEFAULT_MAXR2T) { + TAILQ_REMOVE(&conn->queued_r2t_tasks, task, link); + add_transfer_task(conn, task); + } else { + break; + } + } +} + +bool +iscsi_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t task_tag) +{ + struct spdk_iscsi_task *task, *tmp; + + TAILQ_FOREACH_SAFE(task, &conn->active_r2t_tasks, link, tmp) { + if (task->tag == task_tag) { + assert(conn->data_out_cnt >= task->data_out_cnt); + conn->data_out_cnt -= task->data_out_cnt; + + conn->pending_r2t--; + + assert(task->is_r2t_active == true); + TAILQ_REMOVE(&conn->active_r2t_tasks, task, link); + task->is_r2t_active = false; + iscsi_task_put(task); + + start_queued_transfer_tasks(conn); + return true; + } + } + return false; +} + +void iscsi_clear_all_transfer_task(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun, + struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task, *task_tmp; + struct spdk_iscsi_pdu *pdu_tmp; + + TAILQ_FOREACH_SAFE(task, &conn->active_r2t_tasks, link, task_tmp) { + pdu_tmp = iscsi_task_get_pdu(task); + if ((lun == NULL || lun == task->scsi.lun) && + (pdu == NULL || spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn))) { + task->outstanding_r2t = 0; + task->next_r2t_offset = 0; + task->next_expected_r2t_offset = 0; + assert(conn->data_out_cnt >= task->data_out_cnt); + conn->data_out_cnt -= task->data_out_cnt; + conn->pending_r2t--; + + TAILQ_REMOVE(&conn->active_r2t_tasks, task, link); + task->is_r2t_active = false; + if (lun != NULL && spdk_scsi_lun_is_removing(lun)) { + spdk_scsi_task_process_null_lun(&task->scsi); + iscsi_task_response(conn, task); + } + iscsi_task_put(task); + } + } + + TAILQ_FOREACH_SAFE(task, &conn->queued_r2t_tasks, link, task_tmp) { + pdu_tmp = iscsi_task_get_pdu(task); + if ((lun == NULL || lun == task->scsi.lun) && + (pdu == NULL || spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn))) { + TAILQ_REMOVE(&conn->queued_r2t_tasks, task, link); + task->is_r2t_active = false; + if (lun != NULL && spdk_scsi_lun_is_removing(lun)) { + spdk_scsi_task_process_null_lun(&task->scsi); + iscsi_task_response(conn, task); + } + iscsi_task_put(task); + } + } + + start_queued_transfer_tasks(conn); +} + +static struct spdk_iscsi_task * +get_transfer_task(struct spdk_iscsi_conn *conn, uint32_t transfer_tag) +{ + struct spdk_iscsi_task *task; + + TAILQ_FOREACH(task, &conn->active_r2t_tasks, link) { + if (task->ttt == transfer_tag) { + return task; + } + } + + return NULL; +} + +static void +iscsi_conn_datain_pdu_complete(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + iscsi_conn_handle_queued_datain_tasks(conn); +} + +static int +iscsi_send_datain(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, int datain_flag, + int residual_len, int offset, int DataSN, int len) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_data_in *rsph; + uint32_t task_tag; + uint32_t transfer_tag; + int F_bit, U_bit, O_bit, S_bit; + struct spdk_iscsi_task *primary; + struct spdk_scsi_lun *lun_dev; + + primary = iscsi_task_get_primary(task); + + /* DATA PDU */ + rsp_pdu = iscsi_get_pdu(conn); + rsph = (struct iscsi_bhs_data_in *)&rsp_pdu->bhs; + rsp_pdu->data = task->scsi.iovs[0].iov_base + offset; + rsp_pdu->data_buf_len = task->scsi.iovs[0].iov_len - offset; + rsp_pdu->data_from_mempool = true; + + task_tag = task->tag; + transfer_tag = 0xffffffffU; + + F_bit = datain_flag & 
ISCSI_FLAG_FINAL; + O_bit = datain_flag & ISCSI_DATAIN_OVERFLOW; + U_bit = datain_flag & ISCSI_DATAIN_UNDERFLOW; + S_bit = datain_flag & ISCSI_DATAIN_STATUS; + + /* + * we need to hold onto this task/cmd because until the + * PDU has been written out + */ + rsp_pdu->task = task; + task->scsi.ref++; + + rsph->opcode = ISCSI_OP_SCSI_DATAIN; + + if (F_bit) { + rsph->flags |= ISCSI_FLAG_FINAL; + } + + /* we leave the A_bit clear */ + + if (F_bit && S_bit) { + if (O_bit) { + rsph->flags |= ISCSI_DATAIN_OVERFLOW; + } + + if (U_bit) { + rsph->flags |= ISCSI_DATAIN_UNDERFLOW; + } + } + + if (S_bit) { + rsph->flags |= ISCSI_DATAIN_STATUS; + rsph->status = task->scsi.status; + } + + DSET24(rsph->data_segment_len, len); + + to_be32(&rsph->itt, task_tag); + to_be32(&rsph->ttt, transfer_tag); + + if (S_bit) { + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + } + + if (F_bit && S_bit && !iscsi_task_is_immediate(primary)) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + to_be32(&rsph->data_sn, DataSN); + + if (conn->sess->ErrorRecoveryLevel >= 1) { + primary->datain_datasn = DataSN; + } + DataSN++; + + if (task->parent) { + offset += primary->scsi.data_transferred; + } + to_be32(&rsph->buffer_offset, (uint32_t)offset); + task->scsi.offset = offset; + + if (F_bit && S_bit) { + to_be32(&rsph->res_cnt, residual_len); + } + + lun_dev = spdk_scsi_dev_get_lun(conn->dev, task->lun_id); + if (spdk_likely(lun_dev != NULL)) { + if (spdk_unlikely(spdk_scsi_lun_get_dif_ctx(lun_dev, &task->scsi, + &rsp_pdu->dif_ctx))) { + rsp_pdu->dif_insert_or_strip = true; + } + } + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_datain_pdu_complete, conn); + + return DataSN; +} + +static int +iscsi_transfer_in(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + uint32_t DataSN; + uint32_t transfer_len; + uint32_t data_len; + uint32_t segment_len; + uint32_t offset; + uint32_t residual_len = 0; + int sent_status; + uint32_t len; + int datain_flag = 0; + int datain_seq_cnt; + int i; + uint32_t sequence_end; + struct spdk_iscsi_task *primary; + + primary = iscsi_task_get_primary(task); + segment_len = conn->MaxRecvDataSegmentLength; + data_len = task->scsi.data_transferred; + transfer_len = task->scsi.length; + + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + return 0; + } + + if (data_len < transfer_len) { + /* underflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Underflow %u/%u\n", data_len, transfer_len); + residual_len = transfer_len - data_len; + transfer_len = data_len; + datain_flag |= ISCSI_DATAIN_UNDERFLOW; + } else if (data_len > transfer_len) { + /* overflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Overflow %u/%u\n", data_len, transfer_len); + residual_len = data_len - transfer_len; + datain_flag |= ISCSI_DATAIN_OVERFLOW; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer %u\n", transfer_len); + residual_len = 0; + } + + DataSN = primary->datain_datasn; + sent_status = 0; + + /* calculate the number of sequences for all data-in pdus */ + datain_seq_cnt = 1 + ((transfer_len - 1) / (int)conn->sess->MaxBurstLength); + for (i = 0; i < datain_seq_cnt; i++) { + offset = i * conn->sess->MaxBurstLength; + sequence_end = spdk_min(((i + 1) * conn->sess->MaxBurstLength), + transfer_len); + + /* send data splitted by segment_len */ + for (; offset < sequence_end; offset += segment_len) { + len = spdk_min(segment_len, (sequence_end - offset)); + + datain_flag &= ~ISCSI_FLAG_FINAL; + datain_flag &= ~ISCSI_DATAIN_STATUS; + + 
if (offset + len == sequence_end) { + /* last PDU in a sequence */ + datain_flag |= ISCSI_FLAG_FINAL; + if (task->scsi.sense_data_len == 0) { + /* The last pdu in all data-in pdus */ + if ((offset + len) == transfer_len && + (primary->bytes_completed == primary->scsi.transfer_len)) { + datain_flag |= ISCSI_DATAIN_STATUS; + sent_status = 1; + } + } + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer=%d, Offset=%d, Len=%d\n", + sequence_end, offset, len); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, DataSN=%u, Offset=%u, Len=%d\n", + conn->StatSN, DataSN, offset, len); + + DataSN = iscsi_send_datain(conn, task, datain_flag, residual_len, + offset, DataSN, len); + } + } + + if (task != primary) { + primary->scsi.data_transferred += task->scsi.data_transferred; + } + primary->datain_datasn = DataSN; + + return sent_status; +} + +void iscsi_task_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_scsi_resp *rsph; + uint32_t task_tag; + uint32_t transfer_len; + size_t residual_len; + size_t data_len; + int O_bit, U_bit; + int rc; + struct spdk_iscsi_task *primary; + + primary = iscsi_task_get_primary(task); + + transfer_len = primary->scsi.transfer_len; + task_tag = task->tag; + + /* transfer data from logical unit */ + /* (direction is view of initiator side) */ + if (iscsi_task_is_read(primary)) { + rc = iscsi_transfer_in(conn, task); + if (rc > 0) { + /* sent status by last DATAIN PDU */ + return; + } + + if (primary->bytes_completed != primary->scsi.transfer_len) { + return; + } + } + + O_bit = U_bit = 0; + residual_len = 0; + data_len = primary->scsi.data_transferred; + + if ((transfer_len != 0) && + (task->scsi.status == SPDK_SCSI_STATUS_GOOD)) { + if (data_len < transfer_len) { + /* underflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Underflow %zu/%u\n", data_len, transfer_len); + residual_len = transfer_len - data_len; + U_bit = 1; + } else if (data_len > transfer_len) { + /* overflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Overflow %zu/%u\n", data_len, transfer_len); + residual_len = data_len - transfer_len; + O_bit = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer %u\n", transfer_len); + } + } + + /* response PDU */ + rsp_pdu = iscsi_get_pdu(conn); + assert(rsp_pdu != NULL); + rsph = (struct iscsi_bhs_scsi_resp *)&rsp_pdu->bhs; + assert(task->scsi.sense_data_len <= sizeof(rsp_pdu->sense.data)); + memcpy(rsp_pdu->sense.data, task->scsi.sense_data, task->scsi.sense_data_len); + to_be16(&rsp_pdu->sense.length, task->scsi.sense_data_len); + rsp_pdu->data = (uint8_t *)&rsp_pdu->sense; + rsp_pdu->data_from_mempool = true; + + /* + * we need to hold onto this task/cmd because until the + * PDU has been written out + */ + rsp_pdu->task = task; + task->scsi.ref++; + + rsph->opcode = ISCSI_OP_SCSI_RSP; + rsph->flags |= 0x80; /* bit 0 is default to 1 */ + + if (O_bit) { + rsph->flags |= ISCSI_SCSI_OVERFLOW; + } + + if (U_bit) { + rsph->flags |= ISCSI_SCSI_UNDERFLOW; + } + + rsph->status = task->scsi.status; + if (task->scsi.sense_data_len) { + /* SenseLength (2 bytes) + SenseData */ + DSET24(rsph->data_segment_len, 2 + task->scsi.sense_data_len); + } + to_be32(&rsph->itt, task_tag); + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (!iscsi_task_is_immediate(primary)) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + to_be32(&rsph->bi_read_res_cnt, 0); + to_be32(&rsph->res_cnt, residual_len); + + iscsi_conn_write_pdu(conn, 
rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); +} + +/* + * This function compare the input pdu's bhs with the pdu's bhs associated by + * active_r2t_tasks and queued_r2t_tasks in a connection + */ +static bool +iscsi_compare_pdu_bhs_within_existed_r2t_tasks(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task; + + TAILQ_FOREACH(task, &conn->active_r2t_tasks, link) { + if (!memcmp(&pdu->bhs, iscsi_task_get_bhs(task), ISCSI_BHS_LEN)) { + return true; + } + } + + TAILQ_FOREACH(task, &conn->queued_r2t_tasks, link) { + if (!memcmp(&pdu->bhs, iscsi_task_get_bhs(task), ISCSI_BHS_LEN)) { + return true; + } + } + + return false; +} + +void +iscsi_queue_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + spdk_trace_record(TRACE_ISCSI_TASK_QUEUE, conn->id, task->scsi.length, + (uintptr_t)task, (uintptr_t)task->pdu); + task->is_queued = true; + spdk_scsi_dev_queue_task(conn->dev, &task->scsi); +} + +static int +iscsi_pdu_payload_op_scsi_read(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + if (task->scsi.transfer_len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE) { + task->parent = NULL; + task->scsi.offset = 0; + task->scsi.length = task->scsi.transfer_len; + spdk_scsi_task_set_data(&task->scsi, NULL, 0); + + iscsi_queue_task(conn, task); + return 0; + } else { + TAILQ_INIT(&task->subtask_list); + task->current_datain_offset = 0; + TAILQ_INSERT_TAIL(&conn->queued_datain_tasks, task, link); + + return iscsi_conn_handle_queued_datain_tasks(conn); + } +} + +static int +iscsi_pdu_payload_op_scsi_write(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_scsi_req *reqh; + uint32_t transfer_len; + uint32_t scsi_data_len; + int rc; + + pdu = iscsi_task_get_pdu(task); + reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs; + + transfer_len = task->scsi.transfer_len; + + if (spdk_likely(!pdu->dif_insert_or_strip)) { + scsi_data_len = pdu->data_segment_len; + } else { + scsi_data_len = pdu->data_buf_len; + } + + if (reqh->final_bit && + pdu->data_segment_len < transfer_len) { + /* needs R2T */ + rc = add_transfer_task(conn, task); + if (rc < 0) { + SPDK_ERRLOG("add_transfer_task() failed\n"); + iscsi_task_put(task); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + /* Non-immediate writes */ + if (pdu->data_segment_len == 0) { + return 0; + } else { + /* we are doing the first partial write task */ + task->scsi.ref++; + spdk_scsi_task_set_data(&task->scsi, pdu->data, scsi_data_len); + task->scsi.length = pdu->data_segment_len; + } + } + + if (pdu->data_segment_len == transfer_len) { + /* we are doing small writes with no R2T */ + spdk_scsi_task_set_data(&task->scsi, pdu->data, scsi_data_len); + task->scsi.length = transfer_len; + } + + iscsi_queue_task(conn, task); + return 0; +} + +static int +iscsi_pdu_hdr_op_scsi(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task; + struct spdk_scsi_dev *dev; + uint8_t *cdb; + uint64_t lun; + uint32_t task_tag; + uint32_t transfer_len; + int R_bit, W_bit; + int lun_i; + struct iscsi_bhs_scsi_req *reqh; + + if (conn->sess->session_type != SESSION_TYPE_NORMAL) { + SPDK_ERRLOG("ISCSI_OP_SCSI not allowed in discovery and invalid session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs; + + R_bit = reqh->read_bit; + W_bit = reqh->write_bit; + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + transfer_len = from_be32(&reqh->expected_data_xfer_len); + cdb = 
reqh->cdb; + + SPDK_LOGDUMP(SPDK_LOG_ISCSI, "CDB", cdb, 16); + + task = iscsi_task_get(conn, NULL, iscsi_task_cpl); + if (!task) { + SPDK_ERRLOG("Unable to acquire task\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + iscsi_task_associate_pdu(task, pdu); + lun_i = spdk_scsi_lun_id_fmt_to_int(lun); + task->lun_id = lun_i; + dev = conn->dev; + task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_i); + + if ((R_bit != 0) && (W_bit != 0)) { + SPDK_ERRLOG("Bidirectional CDB is not supported\n"); + iscsi_task_put(task); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + task->scsi.cdb = cdb; + task->tag = task_tag; + task->scsi.transfer_len = transfer_len; + task->scsi.target_port = conn->target_port; + task->scsi.initiator_port = conn->initiator_port; + task->parent = NULL; + task->rsp_scsi_status = SPDK_SCSI_STATUS_GOOD; + + if (task->scsi.lun == NULL) { + spdk_scsi_task_process_null_lun(&task->scsi); + iscsi_task_cpl(&task->scsi); + return 0; + } + + /* no bi-directional support */ + if (R_bit) { + task->scsi.dxfer_dir = SPDK_SCSI_DIR_FROM_DEV; + } else if (W_bit) { + task->scsi.dxfer_dir = SPDK_SCSI_DIR_TO_DEV; + + if ((conn->sess->ErrorRecoveryLevel >= 1) && + (iscsi_compare_pdu_bhs_within_existed_r2t_tasks(conn, pdu))) { + iscsi_task_response(conn, task); + iscsi_task_put(task); + return 0; + } + + if (pdu->data_segment_len > iscsi_get_max_immediate_data_size()) { + SPDK_ERRLOG("data segment len(=%zu) > immediate data len(=%"PRIu32")\n", + pdu->data_segment_len, iscsi_get_max_immediate_data_size()); + iscsi_task_put(task); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (pdu->data_segment_len > transfer_len) { + SPDK_ERRLOG("data segment len(=%zu) > task transfer len(=%d)\n", + pdu->data_segment_len, transfer_len); + iscsi_task_put(task); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + /* check the ImmediateData and also pdu->data_segment_len */ + if ((!conn->sess->ImmediateData && (pdu->data_segment_len > 0)) || + (pdu->data_segment_len > conn->sess->FirstBurstLength)) { + iscsi_task_put(task); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (spdk_unlikely(spdk_scsi_lun_get_dif_ctx(task->scsi.lun, &task->scsi, &pdu->dif_ctx))) { + pdu->dif_insert_or_strip = true; + } + } else { + /* neither R nor W bit set */ + task->scsi.dxfer_dir = SPDK_SCSI_DIR_NONE; + if (transfer_len > 0) { + iscsi_task_put(task); + SPDK_ERRLOG("Reject scsi cmd with EDTL > 0 but (R | W) == 0\n"); + return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD); + } + } + + pdu->task = task; + return 0; +} + +static int +iscsi_pdu_payload_op_scsi(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task; + + if (pdu->task == NULL) { + return 0; + } + + task = pdu->task; + + if (spdk_scsi_dev_get_lun(conn->dev, task->lun_id) == NULL) { + spdk_scsi_task_process_null_lun(&task->scsi); + iscsi_task_cpl(&task->scsi); + return 0; + } + + switch (task->scsi.dxfer_dir) { + case SPDK_SCSI_DIR_FROM_DEV: + return iscsi_pdu_payload_op_scsi_read(conn, task); + case SPDK_SCSI_DIR_TO_DEV: + return iscsi_pdu_payload_op_scsi_write(conn, task); + case SPDK_SCSI_DIR_NONE: + iscsi_queue_task(conn, task); + return 0; + default: + assert(false); + iscsi_task_put(task); + break; + } + + return SPDK_ISCSI_CONNECTION_FATAL; +} + +static void +abort_transfer_task_in_task_mgmt_resp(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_pdu *pdu; + + pdu = iscsi_task_get_pdu(task); + + switch (task->scsi.function) { 
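+	/* Drop any outstanding R2T/data-out state for the affected task(s)
+	 * before the task management response is sent.
+	 */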
+ /* abort task identified by Reference Task Tag field */ + case ISCSI_TASK_FUNC_ABORT_TASK: + iscsi_del_transfer_task(conn, task->scsi.abort_id); + break; + + /* abort all tasks issued via this session on the LUN */ + case ISCSI_TASK_FUNC_ABORT_TASK_SET: + iscsi_clear_all_transfer_task(conn, task->scsi.lun, pdu); + break; + + case ISCSI_TASK_FUNC_LOGICAL_UNIT_RESET: + iscsi_clear_all_transfer_task(conn, task->scsi.lun, pdu); + break; + } +} + +void +iscsi_task_mgmt_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_task_req *reqh; + struct iscsi_bhs_task_resp *rsph; + + if (task->pdu == NULL) { + /* + * This was an internally generated task management command, + * usually from LUN cleanup when a connection closes. + */ + return; + } + + reqh = (struct iscsi_bhs_task_req *)&task->pdu->bhs; + /* response PDU */ + rsp_pdu = iscsi_get_pdu(conn); + rsph = (struct iscsi_bhs_task_resp *)&rsp_pdu->bhs; + rsph->opcode = ISCSI_OP_TASK_RSP; + rsph->flags |= 0x80; /* bit 0 default to 1 */ + switch (task->scsi.response) { + case SPDK_SCSI_TASK_MGMT_RESP_COMPLETE: + abort_transfer_task_in_task_mgmt_resp(conn, task); + rsph->response = ISCSI_TASK_FUNC_RESP_COMPLETE; + break; + case SPDK_SCSI_TASK_MGMT_RESP_SUCCESS: + abort_transfer_task_in_task_mgmt_resp(conn, task); + rsph->response = ISCSI_TASK_FUNC_RESP_COMPLETE; + break; + case SPDK_SCSI_TASK_MGMT_RESP_REJECT: + rsph->response = ISCSI_TASK_FUNC_REJECTED; + break; + case SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN: + rsph->response = ISCSI_TASK_FUNC_RESP_LUN_NOT_EXIST; + break; + case SPDK_SCSI_TASK_MGMT_RESP_TARGET_FAILURE: + rsph->response = ISCSI_TASK_FUNC_REJECTED; + break; + case SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED: + rsph->response = ISCSI_TASK_FUNC_RESP_FUNC_NOT_SUPPORTED; + break; + } + rsph->itt = reqh->itt; + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (reqh->immediate == 0) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); +} + +static void +iscsi_queue_mgmt_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task) +{ + struct spdk_scsi_lun *lun; + + lun = spdk_scsi_dev_get_lun(conn->dev, task->lun_id); + if (lun == NULL) { + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN; + iscsi_task_mgmt_response(conn, task); + iscsi_task_put(task); + return; + } + + spdk_scsi_dev_queue_mgmt_task(conn->dev, &task->scsi); +} + +static int +_iscsi_op_abort_task(void *arg) +{ + struct spdk_iscsi_task *task = arg; + int rc; + + rc = iscsi_conn_abort_queued_datain_task(task->conn, task->scsi.abort_id); + if (rc != 0) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&task->mgmt_poller); + iscsi_queue_mgmt_task(task->conn, task); + return SPDK_POLLER_BUSY; +} + +static void +iscsi_op_abort_task(struct spdk_iscsi_task *task, uint32_t ref_task_tag) +{ + task->scsi.abort_id = ref_task_tag; + task->scsi.function = SPDK_SCSI_TASK_FUNC_ABORT_TASK; + task->mgmt_poller = SPDK_POLLER_REGISTER(_iscsi_op_abort_task, task, 10); +} + +static int +_iscsi_op_abort_task_set(void *arg) +{ + struct spdk_iscsi_task *task = arg; + int rc; + + rc = iscsi_conn_abort_queued_datain_tasks(task->conn, task->scsi.lun, + task->pdu); + if (rc != 0) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&task->mgmt_poller); + iscsi_queue_mgmt_task(task->conn, task); + return 
SPDK_POLLER_BUSY; +} + +void +iscsi_op_abort_task_set(struct spdk_iscsi_task *task, uint8_t function) +{ + task->scsi.function = function; + task->mgmt_poller = SPDK_POLLER_REGISTER(_iscsi_op_abort_task_set, task, 10); +} + +static int +iscsi_pdu_hdr_op_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_bhs_task_req *reqh; + uint64_t lun; + uint32_t task_tag; + uint32_t ref_task_tag; + uint8_t function; + int lun_i; + struct spdk_iscsi_task *task; + struct spdk_scsi_dev *dev; + + if (conn->sess->session_type != SESSION_TYPE_NORMAL) { + SPDK_ERRLOG("ISCSI_OP_TASK not allowed in discovery and invalid session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_task_req *)&pdu->bhs; + function = reqh->flags & ISCSI_TASK_FUNCTION_MASK; + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + ref_task_tag = from_be32(&reqh->ref_task_tag); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, func=%d, ITT=%x, ref TT=%x, LUN=0x%16.16"PRIx64"\n", + reqh->immediate, function, task_tag, ref_task_tag, lun); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + conn->StatSN, conn->sess->ExpCmdSN, conn->sess->MaxCmdSN); + + lun_i = spdk_scsi_lun_id_fmt_to_int(lun); + dev = conn->dev; + + task = iscsi_task_get(conn, NULL, iscsi_task_mgmt_cpl); + if (!task) { + SPDK_ERRLOG("Unable to acquire task\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + iscsi_task_associate_pdu(task, pdu); + task->scsi.target_port = conn->target_port; + task->scsi.initiator_port = conn->initiator_port; + task->tag = task_tag; + task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_i); + task->lun_id = lun_i; + + if (task->scsi.lun == NULL) { + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN; + iscsi_task_mgmt_response(conn, task); + iscsi_task_put(task); + return 0; + } + + switch (function) { + /* abort task identified by Referenced Task Tag field */ + case ISCSI_TASK_FUNC_ABORT_TASK: + SPDK_NOTICELOG("ABORT_TASK\n"); + + iscsi_op_abort_task(task, ref_task_tag); + return 0; + + /* abort all tasks issued via this session on the LUN */ + case ISCSI_TASK_FUNC_ABORT_TASK_SET: + SPDK_NOTICELOG("ABORT_TASK_SET\n"); + + iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET); + return 0; + + case ISCSI_TASK_FUNC_CLEAR_TASK_SET: + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_NOTICELOG("CLEAR_TASK_SET (Unsupported)\n"); + break; + + case ISCSI_TASK_FUNC_CLEAR_ACA: + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_NOTICELOG("CLEAR_ACA (Unsupported)\n"); + break; + + case ISCSI_TASK_FUNC_LOGICAL_UNIT_RESET: + SPDK_NOTICELOG("LOGICAL_UNIT_RESET\n"); + + iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); + return 0; + + case ISCSI_TASK_FUNC_TARGET_WARM_RESET: + SPDK_NOTICELOG("TARGET_WARM_RESET (Unsupported)\n"); + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + break; + + case ISCSI_TASK_FUNC_TARGET_COLD_RESET: + SPDK_NOTICELOG("TARGET_COLD_RESET (Unsupported)\n"); + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + break; + + case ISCSI_TASK_FUNC_TASK_REASSIGN: + SPDK_NOTICELOG("TASK_REASSIGN (Unsupported)\n"); + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + break; + + default: + SPDK_ERRLOG("unsupported function %d\n", function); + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT; + break; + } + + iscsi_task_mgmt_response(conn, task); + iscsi_task_put(task); + return 0; +} + +static 
int +iscsi_pdu_hdr_op_nopout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_bhs_nop_out *reqh; + uint32_t task_tag; + uint32_t transfer_tag; + int I_bit; + + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + SPDK_ERRLOG("ISCSI_OP_NOPOUT not allowed in discovery session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_nop_out *)&pdu->bhs; + I_bit = reqh->immediate; + + if (pdu->data_segment_len > SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) { + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + task_tag = from_be32(&reqh->itt); + transfer_tag = from_be32(&reqh->ttt); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, ITT=%x, TTT=%x\n", + I_bit, task_tag, transfer_tag); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + pdu->cmd_sn, conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + + if (transfer_tag != 0xFFFFFFFF && transfer_tag != (uint32_t)conn->id) { + SPDK_ERRLOG("invalid transfer tag 0x%x\n", transfer_tag); + /* + * Technically we should probably fail the connection here, but for now + * just print the error message and continue. + */ + } + + if (task_tag == 0xffffffffU && I_bit == 0) { + SPDK_ERRLOG("got NOPOUT ITT=0xffffffff, I=0\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + return 0; +} + +static int +iscsi_pdu_payload_op_nopout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_nop_out *reqh; + struct iscsi_bhs_nop_in *rsph; + uint8_t *data; + uint64_t lun; + uint32_t task_tag; + int I_bit; + int data_len; + + reqh = (struct iscsi_bhs_nop_out *)&pdu->bhs; + I_bit = reqh->immediate; + + data_len = pdu->data_segment_len; + if (data_len > conn->MaxRecvDataSegmentLength) { + data_len = conn->MaxRecvDataSegmentLength; + } + + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + + /* + * We don't actually check to see if this is a response to the NOP-In + * that we sent. 
Our goal is to just verify that the initiator is + * alive and responding to commands, not to verify that it tags + * NOP-Outs correctly + */ + conn->nop_outstanding = false; + + if (task_tag == 0xffffffffU) { + assert(I_bit == 1); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got NOPOUT ITT=0xffffffff\n"); + return 0; + } + + data = calloc(1, data_len); + if (!data) { + SPDK_ERRLOG("calloc() failed for ping data\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + /* response of NOPOUT */ + if (data_len > 0) { + /* copy ping data */ + memcpy(data, pdu->data, data_len); + } + + /* response PDU */ + rsp_pdu = iscsi_get_pdu(conn); + assert(rsp_pdu != NULL); + + rsph = (struct iscsi_bhs_nop_in *)&rsp_pdu->bhs; + rsp_pdu->data = data; + rsph->opcode = ISCSI_OP_NOPIN; + rsph->flags |= 0x80; /* bit 0 default to 1 */ + DSET24(rsph->data_segment_len, data_len); + to_be64(&rsph->lun, lun); + to_be32(&rsph->itt, task_tag); + to_be32(&rsph->ttt, 0xffffffffU); + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (I_bit == 0) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); + conn->last_nopin = spdk_get_ticks(); + + return 0; +} + +/* This function returns the spdk_scsi_task by searching the snack list via + * task transfertag and the pdu's opcode + */ +static struct spdk_iscsi_task * +get_scsi_task_from_ttt(struct spdk_iscsi_conn *conn, uint32_t transfer_tag) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_data_in *datain_bhs; + + TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) { + if (pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + datain_bhs = (struct iscsi_bhs_data_in *)&pdu->bhs; + if (from_be32(&datain_bhs->ttt) == transfer_tag) { + return pdu->task; + } + } + } + + return NULL; +} + +/* This function returns the spdk_scsi_task by searching the snack list via + * initiator task tag and the pdu's opcode + */ +static struct spdk_iscsi_task * +get_scsi_task_from_itt(struct spdk_iscsi_conn *conn, + uint32_t task_tag, enum iscsi_op opcode) +{ + struct spdk_iscsi_pdu *pdu; + + TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) { + if (pdu->bhs.opcode == opcode && + pdu->task != NULL && + pdu->task->tag == task_tag) { + return pdu->task; + } + } + + return NULL; +} + +/* This function is used to handle the r2t snack */ +static int +iscsi_handle_r2t_snack(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_pdu *pdu, uint32_t beg_run, + uint32_t run_length, int32_t task_tag) +{ + int32_t last_r2tsn; + int i; + + if (beg_run < task->acked_r2tsn) { + SPDK_ERRLOG("ITT: 0x%08x, R2T SNACK requests retransmission of" + "R2TSN: from 0x%08x to 0x%08x. 
But it has already" + "ack to R2TSN:0x%08x, protocol error.\n", + task_tag, beg_run, (beg_run + run_length), + (task->acked_r2tsn - 1)); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (run_length) { + if ((beg_run + run_length) > task->R2TSN) { + SPDK_ERRLOG("ITT: 0x%08x, received R2T SNACK with" + "BegRun: 0x%08x, RunLength: 0x%08x, exceeds" + "current R2TSN: 0x%08x, protocol error.\n", + task_tag, beg_run, run_length, + task->R2TSN); + + return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD); + } + last_r2tsn = (beg_run + run_length); + } else { + last_r2tsn = task->R2TSN; + } + + for (i = beg_run; i < last_r2tsn; i++) { + if (iscsi_send_r2t_recovery(conn, task, i, false) < 0) { + SPDK_ERRLOG("The r2t_sn=%d of r2t_task=%p is not sent\n", i, task); + } + } + return 0; +} + +/* This function is used to recover the data in packet */ +static int +iscsi_handle_recovery_datain(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_pdu *pdu, uint32_t beg_run, + uint32_t run_length, uint32_t task_tag) +{ + struct spdk_iscsi_pdu *old_pdu, *pdu_temp; + uint32_t i; + struct iscsi_bhs_data_in *datain_header; + uint32_t last_statsn; + + task = iscsi_task_get_primary(task); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_handle_recovery_datain\n"); + + if (beg_run < task->acked_data_sn) { + SPDK_ERRLOG("ITT: 0x%08x, DATA IN SNACK requests retransmission of" + "DATASN: from 0x%08x to 0x%08x but already acked to " + "DATASN: 0x%08x protocol error\n", + task_tag, beg_run, + (beg_run + run_length), (task->acked_data_sn - 1)); + + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (run_length == 0) { + /* as the DataSN begins at 0 */ + run_length = task->datain_datasn + 1; + } + + if ((beg_run + run_length - 1) > task->datain_datasn) { + SPDK_ERRLOG("Initiator requests BegRun: 0x%08x, RunLength:" + "0x%08x greater than maximum DataSN: 0x%08x.\n", + beg_run, run_length, task->datain_datasn); + + return -1; + } else { + last_statsn = beg_run + run_length - 1; + } + + for (i = beg_run; i <= last_statsn; i++) { + TAILQ_FOREACH_SAFE(old_pdu, &conn->snack_pdu_list, tailq, pdu_temp) { + if (old_pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + datain_header = (struct iscsi_bhs_data_in *)&old_pdu->bhs; + if (from_be32(&datain_header->itt) == task_tag && + from_be32(&datain_header->data_sn) == i) { + TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq); + iscsi_conn_write_pdu(conn, old_pdu, old_pdu->cb_fn, old_pdu->cb_arg); + break; + } + } + } + } + return 0; +} + +/* This function is used to handle the status snack */ +static int +iscsi_handle_status_snack(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + uint32_t beg_run; + uint32_t run_length; + struct iscsi_bhs_snack_req *reqh; + uint32_t i; + uint32_t last_statsn; + bool found_pdu; + struct spdk_iscsi_pdu *old_pdu; + + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + beg_run = from_be32(&reqh->beg_run); + run_length = from_be32(&reqh->run_len); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d, run_length=%d, conn->StatSN=" + "%d, conn->exp_statsn=%d\n", beg_run, run_length, + conn->StatSN, conn->exp_statsn); + + if (!beg_run) { + beg_run = conn->exp_statsn; + } else if (beg_run < conn->exp_statsn) { + SPDK_ERRLOG("Got Status SNACK Begrun: 0x%08x, RunLength: 0x%08x " + "but already got ExpStatSN: 0x%08x on CID:%hu.\n", + beg_run, run_length, conn->StatSN, conn->cid); + + return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD); + } + + last_statsn = (!run_length) ? 
conn->StatSN : (beg_run + run_length); + + for (i = beg_run; i < last_statsn; i++) { + found_pdu = false; + TAILQ_FOREACH(old_pdu, &conn->snack_pdu_list, tailq) { + if (from_be32(&old_pdu->bhs.stat_sn) == i) { + found_pdu = true; + break; + } + } + + if (!found_pdu) { + SPDK_ERRLOG("Unable to find StatSN: 0x%08x. For a Status" + "SNACK, assuming this is a proactive SNACK " + "for an untransmitted StatSN, ignoring.\n", + beg_run); + } else { + TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq); + iscsi_conn_write_pdu(conn, old_pdu, old_pdu->cb_fn, old_pdu->cb_arg); + } + } + + return 0; +} + +/* This function is used to handle the data ack snack */ +static int +iscsi_handle_data_ack(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + uint32_t transfer_tag; + uint32_t beg_run; + uint32_t run_length; + struct spdk_iscsi_pdu *old_pdu; + uint32_t old_datasn; + struct iscsi_bhs_snack_req *reqh; + struct spdk_iscsi_task *task; + struct iscsi_bhs_data_in *datain_header; + struct spdk_iscsi_task *primary; + + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + transfer_tag = from_be32(&reqh->ttt); + beg_run = from_be32(&reqh->beg_run); + run_length = from_be32(&reqh->run_len); + task = NULL; + datain_header = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d,transfer_tag=%d,run_len=%d\n", + beg_run, transfer_tag, run_length); + + task = get_scsi_task_from_ttt(conn, transfer_tag); + if (!task) { + SPDK_ERRLOG("Data ACK SNACK for TTT: 0x%08x is invalid.\n", + transfer_tag); + goto reject_return; + } + + primary = iscsi_task_get_primary(task); + if ((run_length != 0) || (beg_run < primary->acked_data_sn)) { + SPDK_ERRLOG("TTT: 0x%08x Data ACK SNACK BegRUN: %d is less than " + "the next expected acked DataSN: %d\n", + transfer_tag, beg_run, primary->acked_data_sn); + goto reject_return; + } + + primary->acked_data_sn = beg_run; + + /* To free the pdu */ + TAILQ_FOREACH(old_pdu, &conn->snack_pdu_list, tailq) { + if (old_pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + datain_header = (struct iscsi_bhs_data_in *) &old_pdu->bhs; + old_datasn = from_be32(&datain_header->data_sn); + if ((from_be32(&datain_header->ttt) == transfer_tag) && + (old_datasn == beg_run - 1)) { + TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq); + iscsi_conn_free_pdu(conn, old_pdu); + break; + } + } + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Received Data ACK SNACK for TTT: 0x%08x," + " updated acked DataSN to 0x%08x.\n", transfer_tag, + (task->acked_data_sn - 1)); + + return 0; + +reject_return: + return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_SNACK); +} + +/* This function is used to handle the snack request from the initiator */ +static int +iscsi_pdu_hdr_op_snack(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_bhs_snack_req *reqh; + struct spdk_iscsi_task *task; + int type; + uint32_t task_tag; + uint32_t beg_run; + uint32_t run_length; + int rc; + + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + SPDK_ERRLOG("ISCSI_OP_SNACK not allowed in discovery session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + if (!conn->sess->ErrorRecoveryLevel) { + SPDK_ERRLOG("Got a SNACK request in ErrorRecoveryLevel=0\n"); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + type = reqh->flags & ISCSI_FLAG_SNACK_TYPE_MASK; + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "The value of type is %d\n", type); + + switch (type) { + case 0: + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + task_tag = from_be32(&reqh->itt); + beg_run = 
from_be32(&reqh->beg_run); + run_length = from_be32(&reqh->run_len); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d, run_length=%d, " + "task_tag=%x, transfer_tag=%u\n", beg_run, + run_length, task_tag, from_be32(&reqh->ttt)); + + task = get_scsi_task_from_itt(conn, task_tag, + ISCSI_OP_SCSI_DATAIN); + if (task) { + return iscsi_handle_recovery_datain(conn, task, pdu, + beg_run, run_length, task_tag); + } + task = get_scsi_task_from_itt(conn, task_tag, ISCSI_OP_R2T); + if (task) { + return iscsi_handle_r2t_snack(conn, task, pdu, beg_run, + run_length, task_tag); + } + SPDK_ERRLOG("It is Neither datain nor r2t recovery request\n"); + rc = -1; + break; + case ISCSI_FLAG_SNACK_TYPE_STATUS: + rc = iscsi_handle_status_snack(conn, pdu); + break; + case ISCSI_FLAG_SNACK_TYPE_DATA_ACK: + rc = iscsi_handle_data_ack(conn, pdu); + break; + case ISCSI_FLAG_SNACK_TYPE_RDATA: + SPDK_ERRLOG("R-Data SNACK is Not Supported int spdk\n"); + rc = iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + break; + default: + SPDK_ERRLOG("Unknown SNACK type %d, protocol error\n", type); + rc = iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + break; + } + + return rc; +} + +static int +iscsi_pdu_hdr_op_data(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task, *subtask; + struct iscsi_bhs_data_out *reqh; + struct spdk_scsi_lun *lun_dev; + uint32_t transfer_tag; + uint32_t task_tag; + uint32_t transfer_len; + uint32_t DataSN; + uint32_t buffer_offset; + uint32_t len; + int F_bit; + int rc; + int reject_reason = ISCSI_REASON_INVALID_PDU_FIELD; + + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + SPDK_ERRLOG("ISCSI_OP_SCSI_DATAOUT not allowed in discovery session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_data_out *)&pdu->bhs; + F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL); + transfer_tag = from_be32(&reqh->ttt); + task_tag = from_be32(&reqh->itt); + DataSN = from_be32(&reqh->data_sn); + buffer_offset = from_be32(&reqh->buffer_offset); + + if (pdu->data_segment_len > SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) { + reject_reason = ISCSI_REASON_PROTOCOL_ERROR; + goto reject_return; + } + + task = get_transfer_task(conn, transfer_tag); + if (task == NULL) { + SPDK_ERRLOG("Not found task for transfer_tag=%x\n", transfer_tag); + goto reject_return; + } + + lun_dev = spdk_scsi_dev_get_lun(conn->dev, task->lun_id); + + if (pdu->data_segment_len > task->desired_data_transfer_length) { + SPDK_ERRLOG("the dataout pdu data length is larger than the value sent by R2T PDU\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + if (task->tag != task_tag) { + SPDK_ERRLOG("The r2t task tag is %u, and the dataout task tag is %u\n", + task->tag, task_tag); + goto reject_return; + } + + if (DataSN != task->r2t_datasn) { + SPDK_ERRLOG("DataSN(%u) exp=%d error\n", DataSN, task->r2t_datasn); + if (conn->sess->ErrorRecoveryLevel >= 1) { + goto send_r2t_recovery_return; + } else { + reject_reason = ISCSI_REASON_PROTOCOL_ERROR; + goto reject_return; + } + } + + if (buffer_offset != task->next_expected_r2t_offset) { + SPDK_ERRLOG("offset(%u) error\n", buffer_offset); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + transfer_len = task->scsi.transfer_len; + task->current_r2t_length += pdu->data_segment_len; + task->next_expected_r2t_offset += pdu->data_segment_len; + task->r2t_datasn++; + + if (task->current_r2t_length > conn->sess->MaxBurstLength) { + SPDK_ERRLOG("R2T burst(%u) > MaxBurstLength(%u)\n", + task->current_r2t_length, + conn->sess->MaxBurstLength); + return 
SPDK_ISCSI_CONNECTION_FATAL; + } + + if (F_bit) { + /* + * This R2T burst is done. Clear the length before we + * receive a PDU for the next R2t burst. + */ + task->current_r2t_length = 0; + } + + subtask = iscsi_task_get(conn, task, iscsi_task_cpl); + if (subtask == NULL) { + SPDK_ERRLOG("Unable to acquire subtask\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + subtask->scsi.offset = buffer_offset; + subtask->scsi.length = pdu->data_segment_len; + iscsi_task_associate_pdu(subtask, pdu); + + if (task->next_expected_r2t_offset == transfer_len) { + task->acked_r2tsn++; + } else if (F_bit && (task->next_r2t_offset < transfer_len)) { + task->acked_r2tsn++; + len = spdk_min(conn->sess->MaxBurstLength, + (transfer_len - task->next_r2t_offset)); + rc = iscsi_send_r2t(conn, task, task->next_r2t_offset, len, + task->ttt, &task->R2TSN); + if (rc < 0) { + SPDK_ERRLOG("iscsi_send_r2t() failed\n"); + } + task->next_r2t_offset += len; + } + + if (lun_dev == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "LUN %d is removed, complete the task immediately\n", + task->lun_id); + subtask->scsi.transfer_len = subtask->scsi.length; + spdk_scsi_task_process_null_lun(&subtask->scsi); + iscsi_task_cpl(&subtask->scsi); + return 0; + } + + if (spdk_unlikely(spdk_scsi_lun_get_dif_ctx(lun_dev, &subtask->scsi, &pdu->dif_ctx))) { + pdu->dif_insert_or_strip = true; + } + + pdu->task = subtask; + return 0; + +send_r2t_recovery_return: + rc = iscsi_send_r2t_recovery(conn, task, task->acked_r2tsn, true); + if (rc == 0) { + return 0; + } + +reject_return: + return iscsi_reject(conn, pdu, reject_reason); +} + +static int +iscsi_pdu_payload_op_data(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *subtask; + struct iscsi_bhs_data_out *reqh; + uint32_t transfer_tag; + + if (pdu->task == NULL) { + return 0; + } + + subtask = pdu->task; + + reqh = (struct iscsi_bhs_data_out *)&pdu->bhs; + transfer_tag = from_be32(&reqh->ttt); + + if (get_transfer_task(conn, transfer_tag) == NULL) { + SPDK_ERRLOG("Not found for transfer_tag=%x\n", transfer_tag); + subtask->scsi.transfer_len = subtask->scsi.length; + spdk_scsi_task_process_abort(&subtask->scsi); + iscsi_task_cpl(&subtask->scsi); + return 0; + } + + if (spdk_likely(!pdu->dif_insert_or_strip)) { + spdk_scsi_task_set_data(&subtask->scsi, pdu->data, pdu->data_segment_len); + } else { + spdk_scsi_task_set_data(&subtask->scsi, pdu->data, pdu->data_buf_len); + } + + if (spdk_scsi_dev_get_lun(conn->dev, subtask->lun_id) == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "LUN %d is removed, complete the task immediately\n", + subtask->lun_id); + subtask->scsi.transfer_len = subtask->scsi.length; + spdk_scsi_task_process_null_lun(&subtask->scsi); + iscsi_task_cpl(&subtask->scsi); + return 0; + } + + iscsi_queue_task(conn, subtask); + return 0; +} + +static void +init_login_reject_response(struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu) +{ + struct iscsi_bhs_login_rsp *rsph; + + memset(rsp_pdu, 0, sizeof(struct spdk_iscsi_pdu)); + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + rsph->version_max = ISCSI_VERSION; + rsph->version_act = ISCSI_VERSION; + rsph->opcode = ISCSI_OP_LOGIN_RSP; + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST; + rsph->itt = pdu->bhs.itt; +} + +static void +iscsi_pdu_dump(struct spdk_iscsi_pdu *pdu) +{ + SPDK_ERRLOGDUMP("PDU", (uint8_t *)&pdu->bhs, ISCSI_BHS_LEN); +} + +/* This function is used to refree the pdu when it is acknowledged */ +static void 
+remove_acked_pdu(struct spdk_iscsi_conn *conn, uint32_t ExpStatSN) +{ + struct spdk_iscsi_pdu *pdu, *pdu_temp; + uint32_t stat_sn; + + conn->exp_statsn = spdk_min(ExpStatSN, conn->StatSN); + TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, pdu_temp) { + stat_sn = from_be32(&pdu->bhs.stat_sn); + if (spdk_sn32_lt(stat_sn, conn->exp_statsn)) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + iscsi_conn_free_pdu(conn, pdu); + } + } +} + +static int +iscsi_update_cmdsn(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int opcode; + uint32_t ExpStatSN; + int I_bit; + struct spdk_iscsi_sess *sess; + struct iscsi_bhs_scsi_req *reqh; + + sess = conn->sess; + if (!sess) { + SPDK_ERRLOG("Connection has no associated session!\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + opcode = pdu->bhs.opcode; + reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs; + + pdu->cmd_sn = from_be32(&reqh->cmd_sn); + + I_bit = reqh->immediate; + if (I_bit == 0) { + if (spdk_sn32_lt(pdu->cmd_sn, sess->ExpCmdSN) || + spdk_sn32_gt(pdu->cmd_sn, sess->MaxCmdSN)) { + if (sess->session_type == SESSION_TYPE_NORMAL && + opcode != ISCSI_OP_SCSI_DATAOUT) { + SPDK_ERRLOG("CmdSN(%u) ignore (ExpCmdSN=%u, MaxCmdSN=%u)\n", + pdu->cmd_sn, sess->ExpCmdSN, sess->MaxCmdSN); + + if (sess->ErrorRecoveryLevel >= 1) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Skip the error in ERL 1 and 2\n"); + } else { + return SPDK_PDU_FATAL; + } + } + } + } else if (pdu->cmd_sn != sess->ExpCmdSN) { + SPDK_ERRLOG("CmdSN(%u) error ExpCmdSN=%u\n", pdu->cmd_sn, sess->ExpCmdSN); + + if (sess->ErrorRecoveryLevel >= 1) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Skip the error in ERL 1 and 2\n"); + } else if (opcode != ISCSI_OP_NOPOUT) { + /* + * The Linux initiator does not send valid CmdSNs for + * nopout under heavy load, so do not close the + * connection in that case. + */ + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + ExpStatSN = from_be32(&reqh->exp_stat_sn); + if (spdk_sn32_gt(ExpStatSN, conn->StatSN)) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u) advanced\n", ExpStatSN); + ExpStatSN = conn->StatSN; + } + + if (sess->ErrorRecoveryLevel >= 1) { + remove_acked_pdu(conn, ExpStatSN); + } + + if (!I_bit && opcode != ISCSI_OP_SCSI_DATAOUT) { + sess->ExpCmdSN++; + } + + return 0; +} + +static int +iscsi_pdu_hdr_handle(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int opcode; + int rc; + struct spdk_iscsi_pdu *rsp_pdu = NULL; + + if (pdu == NULL) { + return -1; + } + + opcode = pdu->bhs.opcode; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "opcode %x\n", opcode); + + if (opcode == ISCSI_OP_LOGIN) { + return iscsi_pdu_hdr_op_login(conn, pdu); + } + + /* connection in login phase but receive non-login opcode + * return response code 0x020b to initiator. 
+ * */ + if (!conn->full_feature && conn->state == ISCSI_CONN_STATE_RUNNING) { + rsp_pdu = iscsi_get_pdu(conn); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + init_login_reject_response(pdu, rsp_pdu); + iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL); + SPDK_ERRLOG("Received opcode %d in login phase\n", opcode); + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } else if (conn->state == ISCSI_CONN_STATE_INVALID) { + SPDK_ERRLOG("before Full Feature\n"); + iscsi_pdu_dump(pdu); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + rc = iscsi_update_cmdsn(conn, pdu); + if (rc != 0) { + return rc; + } + + switch (opcode) { + case ISCSI_OP_NOPOUT: + rc = iscsi_pdu_hdr_op_nopout(conn, pdu); + break; + + case ISCSI_OP_SCSI: + rc = iscsi_pdu_hdr_op_scsi(conn, pdu); + break; + case ISCSI_OP_TASK: + rc = iscsi_pdu_hdr_op_task(conn, pdu); + break; + + case ISCSI_OP_TEXT: + rc = iscsi_pdu_hdr_op_text(conn, pdu); + break; + + case ISCSI_OP_LOGOUT: + rc = iscsi_pdu_hdr_op_logout(conn, pdu); + break; + + case ISCSI_OP_SCSI_DATAOUT: + rc = iscsi_pdu_hdr_op_data(conn, pdu); + break; + + case ISCSI_OP_SNACK: + rc = iscsi_pdu_hdr_op_snack(conn, pdu); + break; + + default: + SPDK_ERRLOG("unsupported opcode %x\n", opcode); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (rc < 0) { + SPDK_ERRLOG("processing PDU header (opcode=%x) failed on %s(%s)\n", + opcode, + conn->target_port != NULL ? spdk_scsi_port_get_name(conn->target_port) : "NULL", + conn->initiator_port != NULL ? spdk_scsi_port_get_name(conn->initiator_port) : "NULL"); + } + + return rc; +} + +static int +iscsi_pdu_payload_handle(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int opcode; + int rc = 0; + + opcode = pdu->bhs.opcode; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "opcode %x\n", opcode); + + switch (opcode) { + case ISCSI_OP_LOGIN: + rc = iscsi_pdu_payload_op_login(conn, pdu); + break; + case ISCSI_OP_NOPOUT: + rc = iscsi_pdu_payload_op_nopout(conn, pdu); + break; + case ISCSI_OP_SCSI: + rc = iscsi_pdu_payload_op_scsi(conn, pdu); + break; + case ISCSI_OP_TASK: + break; + case ISCSI_OP_TEXT: + rc = iscsi_pdu_payload_op_text(conn, pdu); + break; + case ISCSI_OP_LOGOUT: + break; + case ISCSI_OP_SCSI_DATAOUT: + rc = iscsi_pdu_payload_op_data(conn, pdu); + break; + case ISCSI_OP_SNACK: + break; + default: + SPDK_ERRLOG("unsupported opcode %x\n", opcode); + return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (rc < 0) { + SPDK_ERRLOG("processing PDU payload (opcode=%x) failed on %s(%s)\n", + opcode, + conn->target_port != NULL ? spdk_scsi_port_get_name(conn->target_port) : "NULL", + conn->initiator_port != NULL ? 
spdk_scsi_port_get_name(conn->initiator_port) : "NULL"); + } + + return rc; +} + +static int +iscsi_read_pdu(struct spdk_iscsi_conn *conn) +{ + enum iscsi_pdu_recv_state prev_state; + struct spdk_iscsi_pdu *pdu; + struct spdk_mempool *pool; + uint32_t crc32c; + int ahs_len; + uint32_t data_len; + int rc; + + do { + prev_state = conn->pdu_recv_state; + pdu = conn->pdu_in_progress; + + switch (conn->pdu_recv_state) { + case ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY: + assert(conn->pdu_in_progress == NULL); + + conn->pdu_in_progress = iscsi_get_pdu(conn); + if (conn->pdu_in_progress == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_HDR; + break; + case ISCSI_PDU_RECV_STATE_AWAIT_PDU_HDR: + if (pdu->bhs_valid_bytes < ISCSI_BHS_LEN) { + rc = iscsi_conn_read_data(conn, + ISCSI_BHS_LEN - pdu->bhs_valid_bytes, + (uint8_t *)&pdu->bhs + pdu->bhs_valid_bytes); + if (rc < 0) { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + pdu->bhs_valid_bytes += rc; + if (pdu->bhs_valid_bytes < ISCSI_BHS_LEN) { + return 0; + } + } + + pdu->data_segment_len = ISCSI_ALIGN(DGET24(pdu->bhs.data_segment_len)); + + /* AHS */ + ahs_len = pdu->bhs.total_ahs_len * 4; + assert(ahs_len <= ISCSI_AHS_LEN); + if (pdu->ahs_valid_bytes < ahs_len) { + rc = iscsi_conn_read_data(conn, + ahs_len - pdu->ahs_valid_bytes, + pdu->ahs + pdu->ahs_valid_bytes); + if (rc < 0) { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + pdu->ahs_valid_bytes += rc; + if (pdu->ahs_valid_bytes < ahs_len) { + return 0; + } + } + + /* Header Digest */ + if (conn->header_digest && + pdu->hdigest_valid_bytes < ISCSI_DIGEST_LEN) { + rc = iscsi_conn_read_data(conn, + ISCSI_DIGEST_LEN - pdu->hdigest_valid_bytes, + pdu->header_digest + pdu->hdigest_valid_bytes); + if (rc < 0) { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + pdu->hdigest_valid_bytes += rc; + if (pdu->hdigest_valid_bytes < ISCSI_DIGEST_LEN) { + return 0; + } + } + + if (conn->header_digest) { + crc32c = iscsi_pdu_calc_header_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->header_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("header digest error (%s)\n", conn->initiator_name); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + } + + rc = iscsi_pdu_hdr_handle(conn, pdu); + if (rc < 0) { + SPDK_ERRLOG("Critical error is detected. 
Close the connection\n"); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD; + break; + case ISCSI_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + data_len = pdu->data_segment_len; + + if (data_len != 0 && pdu->data_buf == NULL) { + if (data_len <= iscsi_get_max_immediate_data_size()) { + pool = g_iscsi.pdu_immediate_data_pool; + pdu->data_buf_len = SPDK_BDEV_BUF_SIZE_WITH_MD(iscsi_get_max_immediate_data_size()); + } else if (data_len <= SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) { + pool = g_iscsi.pdu_data_out_pool; + pdu->data_buf_len = SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH); + } else { + SPDK_ERRLOG("Data(%d) > MaxSegment(%d)\n", + data_len, SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + pdu->mobj = spdk_mempool_get(pool); + if (pdu->mobj == NULL) { + return 0; + } + pdu->data_buf = pdu->mobj->buf; + pdu->data = pdu->mobj->buf; + pdu->data_from_mempool = true; + } + + /* copy the actual data into local buffer */ + if (pdu->data_valid_bytes < data_len) { + rc = iscsi_conn_read_data_segment(conn, pdu, data_len); + if (rc < 0) { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + pdu->data_valid_bytes += rc; + if (pdu->data_valid_bytes < data_len) { + return 0; + } + } + + /* copy out the data digest */ + if (conn->data_digest && data_len != 0 && + pdu->ddigest_valid_bytes < ISCSI_DIGEST_LEN) { + rc = iscsi_conn_read_data(conn, + ISCSI_DIGEST_LEN - pdu->ddigest_valid_bytes, + pdu->data_digest + pdu->ddigest_valid_bytes); + if (rc < 0) { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + pdu->ddigest_valid_bytes += rc; + if (pdu->ddigest_valid_bytes < ISCSI_DIGEST_LEN) { + return 0; + } + } + + /* All data for this PDU has now been read from the socket. 
*/ + spdk_trace_record(TRACE_ISCSI_READ_PDU, conn->id, pdu->data_valid_bytes, + (uintptr_t)pdu, pdu->bhs.opcode); + + /* check data digest */ + if (conn->data_digest && data_len != 0) { + crc32c = iscsi_pdu_calc_data_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("data digest error (%s)\n", conn->initiator_name); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + } + + if (conn->is_logged_out) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "pdu received after logout\n"); + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + break; + } + + if (!pdu->is_rejected) { + rc = iscsi_pdu_payload_handle(conn, pdu); + } else { + rc = 0; + } + if (rc == 0) { + spdk_trace_record(TRACE_ISCSI_TASK_EXECUTED, 0, 0, (uintptr_t)pdu, 0); + iscsi_put_pdu(pdu); + conn->pdu_in_progress = NULL; + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY; + return 1; + } else { + conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR; + } + break; + case ISCSI_PDU_RECV_STATE_ERROR: + return SPDK_ISCSI_CONNECTION_FATAL; + default: + assert(false); + SPDK_ERRLOG("code should not come here\n"); + break; + } + } while (prev_state != conn->pdu_recv_state); + + return 0; +} + +#define GET_PDU_LOOP_COUNT 16 + +int +iscsi_handle_incoming_pdus(struct spdk_iscsi_conn *conn) +{ + int i, rc; + + /* Read new PDUs from network */ + for (i = 0; i < GET_PDU_LOOP_COUNT; i++) { + rc = iscsi_read_pdu(conn); + if (rc == 0) { + break; + } else if (rc < 0) { + return rc; + } + + if (conn->is_stopped) { + break; + } + } + + return i; +} diff --git a/src/spdk/lib/iscsi/iscsi.h b/src/spdk/lib/iscsi/iscsi.h new file mode 100644 index 000000000..b1747e4ab --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi.h @@ -0,0 +1,465 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_ISCSI_H +#define SPDK_ISCSI_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/iscsi_spec.h" +#include "spdk/thread.h" +#include "spdk/sock.h" + +#include "spdk/scsi.h" +#include "iscsi/param.h" + +#include "spdk/assert.h" +#include "spdk/dif.h" +#include "spdk/util.h" + +#define SPDK_ISCSI_DEFAULT_NODEBASE "iqn.2016-06.io.spdk" + +#define DEFAULT_MAXR2T 4 +#define MAX_INITIATOR_PORT_NAME 256 +#define MAX_INITIATOR_NAME 223 +#define MAX_TARGET_NAME 223 + +#define MAX_PORTAL 1024 +#define MAX_INITIATOR 256 +#define MAX_NETMASK 256 +#define MAX_ISCSI_CONNECTIONS 1024 +#define MAX_PORTAL_ADDR 256 +#define MAX_PORTAL_PORT 32 + +#define DEFAULT_PORT 3260 +#define DEFAULT_MAX_SESSIONS 128 +#define DEFAULT_MAX_CONNECTIONS_PER_SESSION 2 +#define DEFAULT_MAXOUTSTANDINGR2T 1 +#define DEFAULT_DEFAULTTIME2WAIT 2 +#define DEFAULT_DEFAULTTIME2RETAIN 20 +#define DEFAULT_INITIALR2T true +#define DEFAULT_IMMEDIATEDATA true +#define DEFAULT_DATAPDUINORDER true +#define DEFAULT_DATASEQUENCEINORDER true +#define DEFAULT_ERRORRECOVERYLEVEL 0 +#define DEFAULT_TIMEOUT 60 +#define MAX_NOPININTERVAL 60 +#define DEFAULT_NOPININTERVAL 30 + +/* + * SPDK iSCSI target currently only supports 64KB as the maximum data segment length + * it can receive from initiators. Other values may work, but no guarantees. + */ +#define SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH 65536 + +/* + * Defines maximum number of data out buffers each connection can have in + * use at any given time. + */ +#define MAX_DATA_OUT_PER_CONNECTION 16 + +/* + * Defines maximum number of data in buffers each connection can have in + * use at any given time. So this limit does not affect I/O smaller than + * SPDK_BDEV_SMALL_BUF_MAX_SIZE. + */ +#define MAX_LARGE_DATAIN_PER_CONNECTION 64 + +#define SPDK_ISCSI_MAX_BURST_LENGTH \ + (SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH * MAX_DATA_OUT_PER_CONNECTION) + +/* + * Defines default maximum amount in bytes of unsolicited data the iSCSI + * initiator may send to the SPDK iSCSI target during the execution of + * a single SCSI command. And it is smaller than the MaxBurstLength. + */ +#define SPDK_ISCSI_FIRST_BURST_LENGTH 8192 + +/* + * Defines minimum amount in bytes of unsolicited data the iSCSI initiator + * may send to the SPDK iSCSI target during the execution of a single + * SCSI command. + */ +#define SPDK_ISCSI_MIN_FIRST_BURST_LENGTH 512 + +#define SPDK_ISCSI_MAX_FIRST_BURST_LENGTH 16777215 + +/* + * Defines default maximum queue depth per connection and this can be + * changed by configuration file. + */ +#define DEFAULT_MAX_QUEUE_DEPTH 64 + +/** Defines how long we should wait for a logout request when the target + * requests logout to the initiator asynchronously. + */ +#define ISCSI_LOGOUT_REQUEST_TIMEOUT 30 /* in seconds */ + +/** Defines how long we should wait for a TCP close after responding to a + * logout request, before terminating the connection ourselves. + */ +#define ISCSI_LOGOUT_TIMEOUT 5 /* in seconds */ + +/* For spdk_iscsi_login_in related function use, we need to avoid the conflict + * with other errors + * */ +#define SPDK_ISCSI_LOGIN_ERROR_RESPONSE -1000 +#define SPDK_ISCSI_LOGIN_ERROR_PARAMETER -1001 +#define SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE -1002 + +#define ISCSI_AHS_LEN 60 + +struct spdk_mobj { + struct spdk_mempool *mp; + void *buf; +}; + +/* + * Maximum number of SGL elements, i.e., + * BHS, AHS, Header Digest, Data Segment and Data Digest. 
+ */ +#define SPDK_ISCSI_MAX_SGL_DESCRIPTORS (5) + +typedef void (*iscsi_conn_xfer_complete_cb)(void *cb_arg); + +struct spdk_iscsi_pdu { + struct iscsi_bhs bhs; + struct spdk_mobj *mobj; + bool is_rejected; + uint8_t *data_buf; + uint8_t *data; + uint8_t header_digest[ISCSI_DIGEST_LEN]; + uint8_t data_digest[ISCSI_DIGEST_LEN]; + size_t data_segment_len; + int bhs_valid_bytes; + int ahs_valid_bytes; + uint32_t data_valid_bytes; + int hdigest_valid_bytes; + int ddigest_valid_bytes; + int ref; + bool data_from_mempool; /* indicate whether the data buffer is allocated from mempool */ + struct spdk_iscsi_task *task; /* data tied to a task buffer */ + uint32_t cmd_sn; + uint32_t writev_offset; + uint32_t data_buf_len; + bool dif_insert_or_strip; + struct spdk_dif_ctx dif_ctx; + struct spdk_iscsi_conn *conn; + + iscsi_conn_xfer_complete_cb cb_fn; + void *cb_arg; + + /* The sock request ends with a 0 length iovec. Place the actual iovec immediately + * after it. There is a static assert below to check if the compiler inserted + * any unwanted padding */ + int32_t mapped_length; + struct spdk_sock_request sock_req; + struct iovec iov[SPDK_ISCSI_MAX_SGL_DESCRIPTORS]; + TAILQ_ENTRY(spdk_iscsi_pdu) tailq; + + + /* + * 60 bytes of AHS should suffice for now. + * This should always be at the end of PDU data structure. + * we need to not zero this out when doing memory clear. + */ + uint8_t ahs[ISCSI_AHS_LEN]; + + struct { + uint16_t length; /* iSCSI SenseLength (big-endian) */ + uint8_t data[32]; + } sense; +}; +SPDK_STATIC_ASSERT(offsetof(struct spdk_iscsi_pdu, + sock_req) + sizeof(struct spdk_sock_request) == offsetof(struct spdk_iscsi_pdu, iov), + "Compiler inserted padding between iov and sock_req"); + +enum iscsi_connection_state { + ISCSI_CONN_STATE_INVALID = 0, + ISCSI_CONN_STATE_RUNNING = 1, + ISCSI_CONN_STATE_EXITING = 2, + ISCSI_CONN_STATE_EXITED = 3, +}; + +enum iscsi_chap_phase { + ISCSI_CHAP_PHASE_NONE = 0, + ISCSI_CHAP_PHASE_WAIT_A = 1, + ISCSI_CHAP_PHASE_WAIT_NR = 2, + ISCSI_CHAP_PHASE_END = 3, +}; + +enum session_type { + SESSION_TYPE_INVALID = 0, + SESSION_TYPE_NORMAL = 1, + SESSION_TYPE_DISCOVERY = 2, +}; + +#define ISCSI_CHAP_CHALLENGE_LEN 1024 +#define ISCSI_CHAP_MAX_USER_LEN 255 +#define ISCSI_CHAP_MAX_SECRET_LEN 255 + +struct iscsi_chap_auth { + enum iscsi_chap_phase chap_phase; + + char user[ISCSI_CHAP_MAX_USER_LEN + 1]; + char secret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + char muser[ISCSI_CHAP_MAX_USER_LEN + 1]; + char msecret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + + uint8_t chap_id[1]; + uint8_t chap_mid[1]; + int chap_challenge_len; + uint8_t chap_challenge[ISCSI_CHAP_CHALLENGE_LEN]; + int chap_mchallenge_len; + uint8_t chap_mchallenge[ISCSI_CHAP_CHALLENGE_LEN]; +}; + +struct spdk_iscsi_auth_secret { + char user[ISCSI_CHAP_MAX_USER_LEN + 1]; + char secret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + char muser[ISCSI_CHAP_MAX_USER_LEN + 1]; + char msecret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + TAILQ_ENTRY(spdk_iscsi_auth_secret) tailq; +}; + +struct spdk_iscsi_auth_group { + int32_t tag; + TAILQ_HEAD(, spdk_iscsi_auth_secret) secret_head; + TAILQ_ENTRY(spdk_iscsi_auth_group) tailq; +}; + +struct spdk_iscsi_sess { + uint32_t connections; + struct spdk_iscsi_conn **conns; + + struct spdk_scsi_port *initiator_port; + int tag; + + uint64_t isid; + uint16_t tsih; + struct spdk_iscsi_tgt_node *target; + int queue_depth; + + struct iscsi_param *params; + + enum session_type session_type; + uint32_t MaxConnections; + uint32_t MaxOutstandingR2T; + uint32_t DefaultTime2Wait; + uint32_t DefaultTime2Retain; + 
uint32_t FirstBurstLength; + uint32_t MaxBurstLength; + bool InitialR2T; + bool ImmediateData; + bool DataPDUInOrder; + bool DataSequenceInOrder; + uint32_t ErrorRecoveryLevel; + + uint32_t ExpCmdSN; + uint32_t MaxCmdSN; + + uint32_t current_text_itt; +}; + +struct spdk_iscsi_poll_group { + struct spdk_poller *poller; + struct spdk_poller *nop_poller; + STAILQ_HEAD(connections, spdk_iscsi_conn) connections; + struct spdk_sock_group *sock_group; + TAILQ_ENTRY(spdk_iscsi_poll_group) link; +}; + +struct spdk_iscsi_opts { + char *authfile; + char *nodebase; + int32_t timeout; + int32_t nopininterval; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + uint32_t MaxSessions; + uint32_t MaxConnectionsPerSession; + uint32_t MaxConnections; + uint32_t MaxQueueDepth; + uint32_t DefaultTime2Wait; + uint32_t DefaultTime2Retain; + uint32_t FirstBurstLength; + bool ImmediateData; + uint32_t ErrorRecoveryLevel; + bool AllowDuplicateIsid; +}; + +struct spdk_iscsi_globals { + char *authfile; + char *nodebase; + pthread_mutex_t mutex; + uint32_t refcnt; + TAILQ_HEAD(, spdk_iscsi_portal) portal_head; + TAILQ_HEAD(, spdk_iscsi_portal_grp) pg_head; + TAILQ_HEAD(, spdk_iscsi_init_grp) ig_head; + TAILQ_HEAD(, spdk_iscsi_tgt_node) target_head; + TAILQ_HEAD(, spdk_iscsi_auth_group) auth_group_head; + TAILQ_HEAD(, spdk_iscsi_poll_group) poll_group_head; + + int32_t timeout; + int32_t nopininterval; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + + uint32_t MaxSessions; + uint32_t MaxConnectionsPerSession; + uint32_t MaxConnections; + uint32_t MaxQueueDepth; + uint32_t DefaultTime2Wait; + uint32_t DefaultTime2Retain; + uint32_t FirstBurstLength; + bool ImmediateData; + uint32_t ErrorRecoveryLevel; + bool AllowDuplicateIsid; + + struct spdk_mempool *pdu_pool; + struct spdk_mempool *pdu_immediate_data_pool; + struct spdk_mempool *pdu_data_out_pool; + struct spdk_mempool *session_pool; + struct spdk_mempool *task_pool; + + struct spdk_iscsi_sess **session; +}; + +#define ISCSI_SECURITY_NEGOTIATION_PHASE 0 +#define ISCSI_OPERATIONAL_NEGOTIATION_PHASE 1 +#define ISCSI_NSG_RESERVED_CODE 2 +#define ISCSI_FULL_FEATURE_PHASE 3 + +/* logout reason */ +#define ISCSI_LOGOUT_REASON_CLOSE_SESSION 0 +#define ISCSI_LOGOUT_REASON_CLOSE_CONNECTION 1 +#define ISCSI_LOGOUT_REASON_REMOVE_CONN_FOR_RECOVERY 2 + +enum spdk_error_codes { + SPDK_ISCSI_CONNECTION_FATAL = -1, + SPDK_PDU_FATAL = -2, +}; + +#define DGET24(B) \ + ((( (uint32_t) *((uint8_t *)(B)+0)) << 16) \ + | (((uint32_t) *((uint8_t *)(B)+1)) << 8) \ + | (((uint32_t) *((uint8_t *)(B)+2)) << 0)) + +#define DSET24(B,D) \ + (((*((uint8_t *)(B)+0)) = (uint8_t)((uint32_t)(D) >> 16)), \ + ((*((uint8_t *)(B)+1)) = (uint8_t)((uint32_t)(D) >> 8)), \ + ((*((uint8_t *)(B)+2)) = (uint8_t)((uint32_t)(D) >> 0))) + +#define xstrdup(s) (s ? 
strdup(s) : (char *)NULL) + +extern struct spdk_iscsi_globals g_iscsi; +extern struct spdk_iscsi_opts *g_spdk_iscsi_opts; + +struct spdk_iscsi_task; +struct spdk_json_write_ctx; + +typedef void (*spdk_iscsi_init_cb)(void *cb_arg, int rc); + +void spdk_iscsi_init(spdk_iscsi_init_cb cb_fn, void *cb_arg); +typedef void (*spdk_iscsi_fini_cb)(void *arg); +void spdk_iscsi_fini(spdk_iscsi_fini_cb cb_fn, void *cb_arg); +void shutdown_iscsi_conns_done(void); +void spdk_iscsi_config_text(FILE *fp); +void spdk_iscsi_config_json(struct spdk_json_write_ctx *w); + +struct spdk_iscsi_opts *iscsi_opts_alloc(void); +void iscsi_opts_free(struct spdk_iscsi_opts *opts); +struct spdk_iscsi_opts *iscsi_opts_copy(struct spdk_iscsi_opts *src); +void iscsi_opts_info_json(struct spdk_json_write_ctx *w); +int iscsi_set_discovery_auth(bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group); +int iscsi_chap_get_authinfo(struct iscsi_chap_auth *auth, const char *authuser, + int ag_tag); +int iscsi_add_auth_group(int32_t tag, struct spdk_iscsi_auth_group **_group); +struct spdk_iscsi_auth_group *iscsi_find_auth_group_by_tag(int32_t tag); +void iscsi_delete_auth_group(struct spdk_iscsi_auth_group *group); +int iscsi_auth_group_add_secret(struct spdk_iscsi_auth_group *group, + const char *user, const char *secret, + const char *muser, const char *msecret); +int iscsi_auth_group_delete_secret(struct spdk_iscsi_auth_group *group, + const char *user); +void iscsi_auth_groups_info_json(struct spdk_json_write_ctx *w); + +void iscsi_task_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task); +int iscsi_build_iovs(struct spdk_iscsi_conn *conn, struct iovec *iovs, int iovcnt, + struct spdk_iscsi_pdu *pdu, uint32_t *mapped_length); +int iscsi_handle_incoming_pdus(struct spdk_iscsi_conn *conn); +void iscsi_task_mgmt_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task); + +void iscsi_free_sess(struct spdk_iscsi_sess *sess); +void iscsi_clear_all_transfer_task(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun, + struct spdk_iscsi_pdu *pdu); +bool iscsi_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t CmdSN); + +uint32_t iscsi_pdu_calc_header_digest(struct spdk_iscsi_pdu *pdu); +uint32_t iscsi_pdu_calc_data_digest(struct spdk_iscsi_pdu *pdu); + +/* Memory management */ +void iscsi_put_pdu(struct spdk_iscsi_pdu *pdu); +struct spdk_iscsi_pdu *iscsi_get_pdu(struct spdk_iscsi_conn *conn); +void iscsi_op_abort_task_set(struct spdk_iscsi_task *task, + uint8_t function); +void iscsi_queue_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task); + +static inline uint32_t +iscsi_get_max_immediate_data_size(void) +{ + /* + * Specify enough extra space in addition to FirstBurstLength to + * account for a header digest, data digest and additional header + * segments (AHS). These are not normally used but they do not + * take up much space and we need to make sure the worst-case scenario + * can be satisified by the size returned here. + */ + return g_iscsi.FirstBurstLength + + ISCSI_DIGEST_LEN + /* data digest */ + ISCSI_DIGEST_LEN + /* header digest */ + 8 + /* bidirectional AHS */ + 52; /* extended CDB AHS (for a 64-byte CDB) */ +} + +#endif /* SPDK_ISCSI_H */ diff --git a/src/spdk/lib/iscsi/iscsi_rpc.c b/src/spdk/lib/iscsi/iscsi_rpc.c new file mode 100644 index 000000000..8ab43d31d --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi_rpc.c @@ -0,0 +1,1639 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. 
+ * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" +#include "iscsi/tgt_node.h" +#include "iscsi/portal_grp.h" +#include "iscsi/init_grp.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +static void +rpc_iscsi_get_initiator_groups(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_initiator_groups requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + iscsi_init_grps_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_get_initiator_groups", rpc_iscsi_get_initiator_groups, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_initiator_groups, get_initiator_groups) + +struct rpc_initiator_list { + size_t num_initiators; + char *initiators[MAX_INITIATOR]; +}; + +static int +decode_rpc_initiator_list(const struct spdk_json_val *val, void *out) +{ + struct rpc_initiator_list *list = out; + + return spdk_json_decode_array(val, spdk_json_decode_string, list->initiators, MAX_INITIATOR, + &list->num_initiators, sizeof(char *)); +} + +static void +free_rpc_initiator_list(struct rpc_initiator_list *list) +{ + size_t i; + + for (i = 0; i < list->num_initiators; i++) { + free(list->initiators[i]); + } +} + +struct rpc_netmask_list { + size_t num_netmasks; + char *netmasks[MAX_NETMASK]; +}; + +static int +decode_rpc_netmask_list(const struct spdk_json_val *val, void *out) +{ + struct rpc_netmask_list *list = out; + + return spdk_json_decode_array(val, spdk_json_decode_string, list->netmasks, MAX_NETMASK, + &list->num_netmasks, sizeof(char *)); +} + +static void +free_rpc_netmask_list(struct rpc_netmask_list *list) +{ + size_t i; + + for (i = 0; i < list->num_netmasks; 
i++) { + free(list->netmasks[i]); + } +} + +struct rpc_initiator_group { + int32_t tag; + struct rpc_initiator_list initiator_list; + struct rpc_netmask_list netmask_list; +}; + +static void +free_rpc_initiator_group(struct rpc_initiator_group *ig) +{ + free_rpc_initiator_list(&ig->initiator_list); + free_rpc_netmask_list(&ig->netmask_list); +} + +static const struct spdk_json_object_decoder rpc_initiator_group_decoders[] = { + {"tag", offsetof(struct rpc_initiator_group, tag), spdk_json_decode_int32}, + {"initiators", offsetof(struct rpc_initiator_group, initiator_list), decode_rpc_initiator_list}, + {"netmasks", offsetof(struct rpc_initiator_group, netmask_list), decode_rpc_netmask_list}, +}; + +static void +rpc_iscsi_create_initiator_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_initiator_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_initiator_group_decoders, + SPDK_COUNTOF(rpc_initiator_group_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.initiator_list.num_initiators == 0 || + req.netmask_list.num_netmasks == 0) { + goto invalid; + } + + if (iscsi_init_grp_create_from_initiator_list(req.tag, + req.initiator_list.num_initiators, + req.initiator_list.initiators, + req.netmask_list.num_netmasks, + req.netmask_list.netmasks)) { + SPDK_ERRLOG("create_from_initiator_list failed\n"); + goto invalid; + } + + free_rpc_initiator_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_initiator_group(&req); +} +SPDK_RPC_REGISTER("iscsi_create_initiator_group", rpc_iscsi_create_initiator_group, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_initiator_group, add_initiator_group) + +static const struct spdk_json_object_decoder rpc_add_or_delete_initiators_decoders[] = { + {"tag", offsetof(struct rpc_initiator_group, tag), spdk_json_decode_int32}, + {"initiators", offsetof(struct rpc_initiator_group, initiator_list), decode_rpc_initiator_list, true}, + {"netmasks", offsetof(struct rpc_initiator_group, netmask_list), decode_rpc_netmask_list, true}, +}; + +static void +rpc_iscsi_initiator_group_add_initiators(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_initiator_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_add_or_delete_initiators_decoders, + SPDK_COUNTOF(rpc_add_or_delete_initiators_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (iscsi_init_grp_add_initiators_from_initiator_list(req.tag, + req.initiator_list.num_initiators, + req.initiator_list.initiators, + req.netmask_list.num_netmasks, + req.netmask_list.netmasks)) { + SPDK_ERRLOG("add_initiators_from_initiator_list failed\n"); + goto invalid; + } + + free_rpc_initiator_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_initiator_group(&req); +} +SPDK_RPC_REGISTER("iscsi_initiator_group_add_initiators", + rpc_iscsi_initiator_group_add_initiators, SPDK_RPC_RUNTIME) 
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_initiator_group_add_initiators, + add_initiators_to_initiator_group) + +static void +rpc_iscsi_initiator_group_remove_initiators(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_initiator_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_add_or_delete_initiators_decoders, + SPDK_COUNTOF(rpc_add_or_delete_initiators_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (iscsi_init_grp_delete_initiators_from_initiator_list(req.tag, + req.initiator_list.num_initiators, + req.initiator_list.initiators, + req.netmask_list.num_netmasks, + req.netmask_list.netmasks)) { + SPDK_ERRLOG("delete_initiators_from_initiator_list failed\n"); + goto invalid; + } + + free_rpc_initiator_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_initiator_group(&req); +} +SPDK_RPC_REGISTER("iscsi_initiator_group_remove_initiators", + rpc_iscsi_initiator_group_remove_initiators, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_initiator_group_remove_initiators, + delete_initiators_from_initiator_group) + +struct rpc_iscsi_delete_initiator_group { + int32_t tag; +}; + +static const struct spdk_json_object_decoder rpc_iscsi_delete_initiator_group_decoders[] = { + {"tag", offsetof(struct rpc_iscsi_delete_initiator_group, tag), spdk_json_decode_int32}, +}; + +static void +rpc_iscsi_delete_initiator_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_iscsi_delete_initiator_group req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_init_grp *ig; + + if (spdk_json_decode_object(params, rpc_iscsi_delete_initiator_group_decoders, + SPDK_COUNTOF(rpc_iscsi_delete_initiator_group_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + ig = iscsi_init_grp_unregister(req.tag); + if (!ig) { + goto invalid; + } + iscsi_tgt_node_delete_map(NULL, ig); + iscsi_init_grp_destroy(ig); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +} +SPDK_RPC_REGISTER("iscsi_delete_initiator_group", rpc_iscsi_delete_initiator_group, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_initiator_group, delete_initiator_group) + +static void +rpc_iscsi_get_target_nodes(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_target_nodes requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + iscsi_tgt_nodes_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_get_target_nodes", rpc_iscsi_get_target_nodes, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_target_nodes, get_target_nodes) + +struct rpc_pg_ig_map { + int32_t pg_tag; + int32_t ig_tag; +}; + +static const struct spdk_json_object_decoder rpc_pg_ig_map_decoders[] = { + {"pg_tag", offsetof(struct 
rpc_pg_ig_map, pg_tag), spdk_json_decode_int32}, + {"ig_tag", offsetof(struct rpc_pg_ig_map, ig_tag), spdk_json_decode_int32}, +}; + +static int +decode_rpc_pg_ig_map(const struct spdk_json_val *val, void *out) +{ + struct rpc_pg_ig_map *pg_ig_map = out; + + return spdk_json_decode_object(val, rpc_pg_ig_map_decoders, + SPDK_COUNTOF(rpc_pg_ig_map_decoders), + pg_ig_map); +} + +struct rpc_pg_ig_maps { + size_t num_maps; + struct rpc_pg_ig_map maps[MAX_TARGET_MAP]; +}; + +static int +decode_rpc_pg_ig_maps(const struct spdk_json_val *val, void *out) +{ + struct rpc_pg_ig_maps *pg_ig_maps = out; + + return spdk_json_decode_array(val, decode_rpc_pg_ig_map, pg_ig_maps->maps, + MAX_TARGET_MAP, &pg_ig_maps->num_maps, + sizeof(struct rpc_pg_ig_map)); +} + +#define RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN 64 + +struct rpc_lun { + char *bdev_name; + int32_t lun_id; +}; + +static const struct spdk_json_object_decoder rpc_lun_decoders[] = { + {"bdev_name", offsetof(struct rpc_lun, bdev_name), spdk_json_decode_string}, + {"lun_id", offsetof(struct rpc_lun, lun_id), spdk_json_decode_int32}, +}; + +static int +decode_rpc_lun(const struct spdk_json_val *val, void *out) +{ + struct rpc_lun *lun = out; + + return spdk_json_decode_object(val, rpc_lun_decoders, + SPDK_COUNTOF(rpc_lun_decoders), lun); +} + +struct rpc_luns { + size_t num_luns; + struct rpc_lun luns[RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN]; +}; + +static int +decode_rpc_luns(const struct spdk_json_val *val, void *out) +{ + struct rpc_luns *luns = out; + + return spdk_json_decode_array(val, decode_rpc_lun, luns->luns, + RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN, + &luns->num_luns, sizeof(struct rpc_lun)); +} + +static void +free_rpc_luns(struct rpc_luns *p) +{ + size_t i; + + for (i = 0; i < p->num_luns; i++) { + free(p->luns[i].bdev_name); + } +} + +struct rpc_target_node { + char *name; + char *alias_name; + + struct rpc_pg_ig_maps pg_ig_maps; + struct rpc_luns luns; + + int32_t queue_depth; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + + bool header_digest; + bool data_digest; +}; + +static void +free_rpc_target_node(struct rpc_target_node *req) +{ + free(req->name); + free(req->alias_name); + free_rpc_luns(&req->luns); +} + +static const struct spdk_json_object_decoder rpc_target_node_decoders[] = { + {"name", offsetof(struct rpc_target_node, name), spdk_json_decode_string}, + {"alias_name", offsetof(struct rpc_target_node, alias_name), spdk_json_decode_string}, + {"pg_ig_maps", offsetof(struct rpc_target_node, pg_ig_maps), decode_rpc_pg_ig_maps}, + {"luns", offsetof(struct rpc_target_node, luns), decode_rpc_luns}, + {"queue_depth", offsetof(struct rpc_target_node, queue_depth), spdk_json_decode_int32}, + {"disable_chap", offsetof(struct rpc_target_node, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_target_node, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_target_node, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_target_node, chap_group), spdk_json_decode_int32, true}, + {"header_digest", offsetof(struct rpc_target_node, header_digest), spdk_json_decode_bool, true}, + {"data_digest", offsetof(struct rpc_target_node, data_digest), spdk_json_decode_bool, true}, +}; + +static void +rpc_iscsi_create_target_node(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_node req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int32_t 
pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0}; + char *bdev_names[RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN] = {0}; + int32_t lun_ids[RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN] = {0}; + size_t i; + + if (spdk_json_decode_object(params, rpc_target_node_decoders, + SPDK_COUNTOF(rpc_target_node_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + for (i = 0; i < req.pg_ig_maps.num_maps; i++) { + pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag; + ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag; + } + + for (i = 0; i < req.luns.num_luns; i++) { + bdev_names[i] = req.luns.luns[i].bdev_name; + lun_ids[i] = req.luns.luns[i].lun_id; + } + + /* + * Use default parameters in a few places: + * index = -1 : automatically pick an index for the new target node + * alias = NULL + */ + target = iscsi_tgt_node_construct(-1, req.name, req.alias_name, + pg_tags, + ig_tags, + req.pg_ig_maps.num_maps, + (const char **)bdev_names, + lun_ids, + req.luns.num_luns, + req.queue_depth, + req.disable_chap, + req.require_chap, + req.mutual_chap, + req.chap_group, + req.header_digest, + req.data_digest); + + if (target == NULL) { + goto invalid; + } + + free_rpc_target_node(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_target_node(&req); +} +SPDK_RPC_REGISTER("iscsi_create_target_node", rpc_iscsi_create_target_node, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_target_node, construct_target_node) + +struct rpc_tgt_node_pg_ig_maps { + char *name; + struct rpc_pg_ig_maps pg_ig_maps; +}; + +static const struct spdk_json_object_decoder rpc_tgt_node_pg_ig_maps_decoders[] = { + {"name", offsetof(struct rpc_tgt_node_pg_ig_maps, name), spdk_json_decode_string}, + {"pg_ig_maps", offsetof(struct rpc_tgt_node_pg_ig_maps, pg_ig_maps), decode_rpc_pg_ig_maps}, +}; + +static void +rpc_iscsi_target_node_add_pg_ig_maps(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_tgt_node_pg_ig_maps req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int32_t pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0}; + size_t i; + int rc; + + if (spdk_json_decode_object(params, rpc_tgt_node_pg_ig_maps_decoders, + SPDK_COUNTOF(rpc_tgt_node_pg_ig_maps_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + target = iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target is not found\n"); + goto invalid; + } + + for (i = 0; i < req.pg_ig_maps.num_maps; i++) { + pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag; + ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag; + } + + rc = iscsi_target_node_add_pg_ig_maps(target, pg_tags, ig_tags, + req.pg_ig_maps.num_maps); + if (rc < 0) { + SPDK_ERRLOG("add pg-ig maps failed\n"); + goto invalid; + } + + free(req.name); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(req.name); +} +SPDK_RPC_REGISTER("iscsi_target_node_add_pg_ig_maps", + rpc_iscsi_target_node_add_pg_ig_maps, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_add_pg_ig_maps, add_pg_ig_maps) + +static void 
+rpc_iscsi_target_node_remove_pg_ig_maps(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_tgt_node_pg_ig_maps req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int32_t pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0}; + size_t i; + int rc; + + if (spdk_json_decode_object(params, rpc_tgt_node_pg_ig_maps_decoders, + SPDK_COUNTOF(rpc_tgt_node_pg_ig_maps_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + target = iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target is not found\n"); + goto invalid; + } + + for (i = 0; i < req.pg_ig_maps.num_maps; i++) { + pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag; + ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag; + } + + rc = iscsi_target_node_remove_pg_ig_maps(target, pg_tags, ig_tags, + req.pg_ig_maps.num_maps); + if (rc < 0) { + SPDK_ERRLOG("remove pg-ig maps failed\n"); + goto invalid; + } + + free(req.name); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(req.name); +} +SPDK_RPC_REGISTER("iscsi_target_node_remove_pg_ig_maps", + rpc_iscsi_target_node_remove_pg_ig_maps, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_remove_pg_ig_maps, + delete_pg_ig_maps) + +struct rpc_iscsi_delete_target_node { + char *name; +}; + +static void +free_rpc_iscsi_delete_target_node(struct rpc_iscsi_delete_target_node *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_iscsi_delete_target_node_decoders[] = { + {"name", offsetof(struct rpc_iscsi_delete_target_node, name), spdk_json_decode_string}, +}; + +struct rpc_iscsi_delete_target_node_ctx { + struct rpc_iscsi_delete_target_node req; + struct spdk_jsonrpc_request *request; +}; + +static void +rpc_iscsi_delete_target_node_done(void *cb_arg, int rc) +{ + struct rpc_iscsi_delete_target_node_ctx *ctx = cb_arg; + struct spdk_json_write_ctx *w; + + free_rpc_iscsi_delete_target_node(&ctx->req); + + w = spdk_jsonrpc_begin_result(ctx->request); + spdk_json_write_bool(w, rc == 0); + spdk_jsonrpc_end_result(ctx->request, w); + + free(ctx); +} + +static void +rpc_iscsi_delete_target_node(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_iscsi_delete_target_node_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(ENOMEM)); + return; + } + + if (spdk_json_decode_object(params, rpc_iscsi_delete_target_node_decoders, + SPDK_COUNTOF(rpc_iscsi_delete_target_node_decoders), + &ctx->req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (ctx->req.name == NULL) { + SPDK_ERRLOG("missing name param\n"); + goto invalid; + } + + ctx->request = request; + + iscsi_shutdown_tgt_node_by_name(ctx->req.name, + rpc_iscsi_delete_target_node_done, ctx); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_iscsi_delete_target_node(&ctx->req); + free(ctx); +} +SPDK_RPC_REGISTER("iscsi_delete_target_node", rpc_iscsi_delete_target_node, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_target_node, delete_target_node) + +static void +rpc_iscsi_get_portal_groups(struct spdk_jsonrpc_request 
*request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_portal_groups requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + iscsi_portal_grps_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_get_portal_groups", rpc_iscsi_get_portal_groups, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_portal_groups, get_portal_groups) + +struct rpc_portal { + char *host; + char *port; +}; + +struct rpc_portal_list { + size_t num_portals; + struct rpc_portal portals[MAX_PORTAL]; +}; + +struct rpc_portal_group { + int32_t tag; + struct rpc_portal_list portal_list; +}; + +static void +free_rpc_portal(struct rpc_portal *portal) +{ + free(portal->host); + free(portal->port); +} + +static void +free_rpc_portal_list(struct rpc_portal_list *pl) +{ + size_t i; + + for (i = 0; i < pl->num_portals; i++) { + free_rpc_portal(&pl->portals[i]); + } + pl->num_portals = 0; +} + +static void +free_rpc_portal_group(struct rpc_portal_group *pg) +{ + free_rpc_portal_list(&pg->portal_list); +} + +static const struct spdk_json_object_decoder rpc_portal_decoders[] = { + {"host", offsetof(struct rpc_portal, host), spdk_json_decode_string}, + {"port", offsetof(struct rpc_portal, port), spdk_json_decode_string}, +}; + +static int +decode_rpc_portal(const struct spdk_json_val *val, void *out) +{ + struct rpc_portal *portal = out; + + return spdk_json_decode_object(val, rpc_portal_decoders, + SPDK_COUNTOF(rpc_portal_decoders), + portal); +} + +static int +decode_rpc_portal_list(const struct spdk_json_val *val, void *out) +{ + struct rpc_portal_list *list = out; + + return spdk_json_decode_array(val, decode_rpc_portal, list->portals, MAX_PORTAL, &list->num_portals, + sizeof(struct rpc_portal)); +} + +static const struct spdk_json_object_decoder rpc_portal_group_decoders[] = { + {"tag", offsetof(struct rpc_portal_group, tag), spdk_json_decode_int32}, + {"portals", offsetof(struct rpc_portal_group, portal_list), decode_rpc_portal_list}, +}; + +static void +rpc_iscsi_create_portal_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_portal_group req = {}; + struct spdk_iscsi_portal_grp *pg = NULL; + struct spdk_iscsi_portal *portal; + struct spdk_json_write_ctx *w; + size_t i = 0; + int rc = -1; + + if (spdk_json_decode_object(params, rpc_portal_group_decoders, + SPDK_COUNTOF(rpc_portal_group_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto out; + } + + pg = iscsi_portal_grp_create(req.tag); + if (pg == NULL) { + SPDK_ERRLOG("portal_grp_create failed\n"); + goto out; + } + for (i = 0; i < req.portal_list.num_portals; i++) { + portal = iscsi_portal_create(req.portal_list.portals[i].host, + req.portal_list.portals[i].port); + if (portal == NULL) { + SPDK_ERRLOG("portal_create failed\n"); + goto out; + } + iscsi_portal_grp_add_portal(pg, portal); + } + + rc = iscsi_portal_grp_open(pg); + if (rc != 0) { + SPDK_ERRLOG("portal_grp_open failed\n"); + goto out; + } + + rc = iscsi_portal_grp_register(pg); + if (rc != 0) { + SPDK_ERRLOG("portal_grp_register failed\n"); + } + +out: + if (rc == 0) { + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } else { + spdk_jsonrpc_send_error_response(request, 
SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + + if (pg != NULL) { + iscsi_portal_grp_release(pg); + } + } + free_rpc_portal_group(&req); +} +SPDK_RPC_REGISTER("iscsi_create_portal_group", rpc_iscsi_create_portal_group, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_portal_group, add_portal_group) + +struct rpc_iscsi_delete_portal_group { + int32_t tag; +}; + +static const struct spdk_json_object_decoder rpc_iscsi_delete_portal_group_decoders[] = { + {"tag", offsetof(struct rpc_iscsi_delete_portal_group, tag), spdk_json_decode_int32}, +}; + +static void +rpc_iscsi_delete_portal_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_iscsi_delete_portal_group req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_portal_grp *pg; + + if (spdk_json_decode_object(params, rpc_iscsi_delete_portal_group_decoders, + SPDK_COUNTOF(rpc_iscsi_delete_portal_group_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + pg = iscsi_portal_grp_unregister(req.tag); + if (!pg) { + goto invalid; + } + + iscsi_tgt_node_delete_map(pg, NULL); + iscsi_portal_grp_release(pg); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +} +SPDK_RPC_REGISTER("iscsi_delete_portal_group", rpc_iscsi_delete_portal_group, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_portal_group, delete_portal_group) + +struct rpc_portal_group_auth { + int32_t tag; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; +}; + +static const struct spdk_json_object_decoder rpc_portal_group_auth_decoders[] = { + {"tag", offsetof(struct rpc_portal_group_auth, tag), spdk_json_decode_int32}, + {"disable_chap", offsetof(struct rpc_portal_group_auth, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_portal_group_auth, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_portal_group_auth, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_portal_group_auth, chap_group), spdk_json_decode_int32, true}, +}; + +static void +rpc_iscsi_portal_group_set_auth(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_portal_group_auth req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_portal_grp *pg; + int rc; + + if (spdk_json_decode_object(params, rpc_portal_group_auth_decoders, + SPDK_COUNTOF(rpc_portal_group_auth_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + + pg = iscsi_portal_grp_find_by_tag(req.tag); + if (pg == NULL) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find portal group %d", req.tag); + goto exit; + } + + rc = iscsi_portal_grp_set_chap_params(pg, req.disable_chap, req.require_chap, + req.mutual_chap, req.chap_group); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid combination of auth params"); + goto exit; + } + + pthread_mutex_unlock(&g_iscsi.mutex); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, 
w); + + return; + +exit: + pthread_mutex_unlock(&g_iscsi.mutex); +} +SPDK_RPC_REGISTER("iscsi_portal_group_set_auth", rpc_iscsi_portal_group_set_auth, + SPDK_RPC_RUNTIME) + +struct rpc_iscsi_get_connections_ctx { + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; +}; + +static void +_rpc_iscsi_get_connections_done(struct spdk_io_channel_iter *i, int status) +{ + struct rpc_iscsi_get_connections_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + spdk_json_write_array_end(ctx->w); + spdk_jsonrpc_end_result(ctx->request, ctx->w); + + free(ctx); +} + +static void +_rpc_iscsi_get_connections(struct spdk_io_channel_iter *i) +{ + struct rpc_iscsi_get_connections_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_iscsi_poll_group *pg = spdk_io_channel_get_ctx(ch); + struct spdk_iscsi_conn *conn; + + STAILQ_FOREACH(conn, &pg->connections, pg_link) { + iscsi_conn_info_json(ctx->w, conn); + } + + spdk_for_each_channel_continue(i, 0); +} + +static void +rpc_iscsi_get_connections(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_iscsi_get_connections_ctx *ctx; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_connections requires no parameters"); + return; + } + + ctx = calloc(1, sizeof(struct rpc_iscsi_get_connections_ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("Failed to allocate rpc_get_iscsi_conns_ctx struct\n"); + spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); + return; + } + + ctx->request = request; + ctx->w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_array_begin(ctx->w); + + spdk_for_each_channel(&g_iscsi, + _rpc_iscsi_get_connections, + ctx, + _rpc_iscsi_get_connections_done); +} +SPDK_RPC_REGISTER("iscsi_get_connections", rpc_iscsi_get_connections, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_connections, get_iscsi_connections) + +struct rpc_target_lun { + char *name; + char *bdev_name; + int32_t lun_id; +}; + +static void +free_rpc_target_lun(struct rpc_target_lun *req) +{ + free(req->name); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder rpc_target_lun_decoders[] = { + {"name", offsetof(struct rpc_target_lun, name), spdk_json_decode_string}, + {"bdev_name", offsetof(struct rpc_target_lun, bdev_name), spdk_json_decode_string}, + {"lun_id", offsetof(struct rpc_target_lun, lun_id), spdk_json_decode_int32, true}, +}; + +static void +rpc_iscsi_target_node_add_lun(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_lun req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int rc; + + req.lun_id = -1; + + if (spdk_json_decode_object(params, rpc_target_lun_decoders, + SPDK_COUNTOF(rpc_target_lun_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + target = iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target is not found\n"); + goto invalid; + } + + rc = iscsi_tgt_node_add_lun(target, req.bdev_name, req.lun_id); + if (rc < 0) { + SPDK_ERRLOG("add lun failed\n"); + goto invalid; + } + + free_rpc_target_lun(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + 
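+	/*
+	 * Illustrative params for iscsi_target_node_add_lun, based on
+	 * rpc_target_lun_decoders above (the target and bdev names are
+	 * examples only):
+	 *
+	 *   {"name": "iqn.2016-06.io.spdk:target1", "bdev_name": "Malloc0", "lun_id": 0}
+	 *
+	 * "lun_id" is optional; req.lun_id is preset to -1 above, so when the
+	 * key is omitted the choice of LUN ID is left to iscsi_tgt_node_add_lun.
+	 */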
free_rpc_target_lun(&req); +} +SPDK_RPC_REGISTER("iscsi_target_node_add_lun", rpc_iscsi_target_node_add_lun, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_add_lun, target_node_add_lun) + +struct rpc_target_auth { + char *name; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; +}; + +static void +free_rpc_target_auth(struct rpc_target_auth *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_target_auth_decoders[] = { + {"name", offsetof(struct rpc_target_auth, name), spdk_json_decode_string}, + {"disable_chap", offsetof(struct rpc_target_auth, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_target_auth, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_target_auth, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_target_auth, chap_group), spdk_json_decode_int32, true}, +}; + +static void +rpc_iscsi_target_node_set_auth(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_auth req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int rc; + + if (spdk_json_decode_object(params, rpc_target_auth_decoders, + SPDK_COUNTOF(rpc_target_auth_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto exit; + } + + target = iscsi_find_tgt_node(req.name); + if (target == NULL) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find target %s", req.name); + goto exit; + } + + rc = iscsi_tgt_node_set_chap_params(target, req.disable_chap, req.require_chap, + req.mutual_chap, req.chap_group); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid combination of auth params"); + goto exit; + } + + free_rpc_target_auth(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +exit: + free_rpc_target_auth(&req); +} +SPDK_RPC_REGISTER("iscsi_target_node_set_auth", rpc_iscsi_target_node_set_auth, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_set_auth, set_iscsi_target_node_auth) + +static void +rpc_iscsi_get_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_options requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + iscsi_opts_info_json(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_get_options", rpc_iscsi_get_options, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_options, get_iscsi_global_params) + +struct rpc_discovery_auth { + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; +}; + +static const struct spdk_json_object_decoder rpc_discovery_auth_decoders[] = { + {"disable_chap", offsetof(struct rpc_discovery_auth, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_discovery_auth, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_discovery_auth, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_discovery_auth, chap_group), 
spdk_json_decode_int32, true}, +}; + +static void +rpc_iscsi_set_discovery_auth(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_discovery_auth req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_discovery_auth_decoders, + SPDK_COUNTOF(rpc_discovery_auth_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + rc = iscsi_set_discovery_auth(req.disable_chap, req.require_chap, + req.mutual_chap, req.chap_group); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid combination of CHAP params"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_set_discovery_auth", rpc_iscsi_set_discovery_auth, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_set_discovery_auth, set_iscsi_discovery_auth) + +#define MAX_AUTH_SECRETS 64 + +struct rpc_auth_secret { + char *user; + char *secret; + char *muser; + char *msecret; +}; + +static void +free_rpc_auth_secret(struct rpc_auth_secret *_secret) +{ + free(_secret->user); + free(_secret->secret); + free(_secret->muser); + free(_secret->msecret); +} + +static const struct spdk_json_object_decoder rpc_auth_secret_decoders[] = { + {"user", offsetof(struct rpc_auth_secret, user), spdk_json_decode_string}, + {"secret", offsetof(struct rpc_auth_secret, secret), spdk_json_decode_string}, + {"muser", offsetof(struct rpc_auth_secret, muser), spdk_json_decode_string, true}, + {"msecret", offsetof(struct rpc_auth_secret, msecret), spdk_json_decode_string, true}, +}; + +static int +decode_rpc_auth_secret(const struct spdk_json_val *val, void *out) +{ + struct rpc_auth_secret *_secret = out; + + return spdk_json_decode_object(val, rpc_auth_secret_decoders, + SPDK_COUNTOF(rpc_auth_secret_decoders), _secret); +} + +struct rpc_auth_secrets { + size_t num_secret; + struct rpc_auth_secret secrets[MAX_AUTH_SECRETS]; +}; + +static void +free_rpc_auth_secrets(struct rpc_auth_secrets *secrets) +{ + size_t i; + + for (i = 0; i < secrets->num_secret; i++) { + free_rpc_auth_secret(&secrets->secrets[i]); + } +} + +static int +decode_rpc_auth_secrets(const struct spdk_json_val *val, void *out) +{ + struct rpc_auth_secrets *secrets = out; + + return spdk_json_decode_array(val, decode_rpc_auth_secret, secrets->secrets, + MAX_AUTH_SECRETS, &secrets->num_secret, + sizeof(struct rpc_auth_secret)); +} + +struct rpc_auth_group { + int32_t tag; + struct rpc_auth_secrets secrets; +}; + +static void +free_rpc_auth_group(struct rpc_auth_group *group) +{ + free_rpc_auth_secrets(&group->secrets); +} + +static const struct spdk_json_object_decoder rpc_auth_group_decoders[] = { + {"tag", offsetof(struct rpc_auth_group, tag), spdk_json_decode_int32}, + {"secrets", offsetof(struct rpc_auth_group, secrets), decode_rpc_auth_secrets, true}, +}; + +static void +rpc_iscsi_create_auth_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_auth_group req = {}; + struct rpc_auth_secret *_secret; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group = NULL; + int rc; + size_t i; + + if (spdk_json_decode_object(params, rpc_auth_group_decoders, + SPDK_COUNTOF(rpc_auth_group_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + 
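+		/*
+		 * Illustrative params for iscsi_create_auth_group, based on
+		 * rpc_auth_group_decoders and rpc_auth_secret_decoders above
+		 * (user and secret values are examples only):
+		 *
+		 *   {"tag": 2,
+		 *    "secrets": [{"user": "user1", "secret": "secret1",
+		 *                 "muser": "muser1", "msecret": "msecret1"}]}
+		 *
+		 * "secrets" is optional, as are "muser"/"msecret" in each entry.
+		 */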
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_auth_group(&req); + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + + rc = iscsi_add_auth_group(req.tag, &group); + if (rc != 0) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not add auth group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_auth_group(&req); + return; + } + + for (i = 0; i < req.secrets.num_secret; i++) { + _secret = &req.secrets.secrets[i]; + rc = iscsi_auth_group_add_secret(group, _secret->user, _secret->secret, + _secret->muser, _secret->msecret); + if (rc != 0) { + iscsi_delete_auth_group(group); + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not add secret to auth group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_auth_group(&req); + return; + } + } + + pthread_mutex_unlock(&g_iscsi.mutex); + + free_rpc_auth_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_create_auth_group", rpc_iscsi_create_auth_group, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_auth_group, add_iscsi_auth_group) + +struct rpc_delete_auth_group { + int32_t tag; +}; + +static const struct spdk_json_object_decoder rpc_delete_auth_group_decoders[] = { + {"tag", offsetof(struct rpc_delete_auth_group, tag), spdk_json_decode_int32}, +}; + +static void +rpc_iscsi_delete_auth_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_auth_group req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group; + + if (spdk_json_decode_object(params, rpc_delete_auth_group_decoders, + SPDK_COUNTOF(rpc_delete_auth_group_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + + group = iscsi_find_auth_group_by_tag(req.tag); + if (group == NULL) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find auth group (%d)", req.tag); + return; + } + + iscsi_delete_auth_group(group); + + pthread_mutex_unlock(&g_iscsi.mutex); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_delete_auth_group", rpc_iscsi_delete_auth_group, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_auth_group, delete_iscsi_auth_group) + +struct rpc_add_auth_secret { + int32_t tag; + char *user; + char *secret; + char *muser; + char *msecret; +}; + +static void +free_rpc_add_auth_secret(struct rpc_add_auth_secret *_secret) +{ + free(_secret->user); + free(_secret->secret); + free(_secret->muser); + free(_secret->msecret); +} + +static const struct spdk_json_object_decoder rpc_add_auth_secret_decoders[] = { + {"tag", offsetof(struct rpc_add_auth_secret, tag), spdk_json_decode_int32}, + {"user", offsetof(struct rpc_add_auth_secret, user), spdk_json_decode_string}, + {"secret", offsetof(struct rpc_add_auth_secret, secret), spdk_json_decode_string}, + {"muser", offsetof(struct rpc_add_auth_secret, muser), spdk_json_decode_string, true}, + {"msecret", offsetof(struct rpc_add_auth_secret, 
msecret), spdk_json_decode_string, true}, +}; + +static void +rpc_iscsi_auth_group_add_secret(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_add_auth_secret req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group; + int rc; + + if (spdk_json_decode_object(params, rpc_add_auth_secret_decoders, + SPDK_COUNTOF(rpc_add_auth_secret_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_add_auth_secret(&req); + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + + group = iscsi_find_auth_group_by_tag(req.tag); + if (group == NULL) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find auth group (%d)", req.tag); + free_rpc_add_auth_secret(&req); + return; + } + + rc = iscsi_auth_group_add_secret(group, req.user, req.secret, req.muser, req.msecret); + if (rc != 0) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not add secret to auth group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_add_auth_secret(&req); + return; + } + + pthread_mutex_unlock(&g_iscsi.mutex); + + free_rpc_add_auth_secret(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_auth_group_add_secret", rpc_iscsi_auth_group_add_secret, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_auth_group_add_secret, add_secret_to_iscsi_auth_group) + + +struct rpc_remove_auth_secret { + int32_t tag; + char *user; +}; + +static void +free_rpc_remove_auth_secret(struct rpc_remove_auth_secret *_secret) +{ + free(_secret->user); +} + +static const struct spdk_json_object_decoder rpc_remove_auth_secret_decoders[] = { + {"tag", offsetof(struct rpc_remove_auth_secret, tag), spdk_json_decode_int32}, + {"user", offsetof(struct rpc_remove_auth_secret, user), spdk_json_decode_string}, +}; + +static void +rpc_iscsi_auth_group_remove_secret(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_auth_secret req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group; + int rc; + + if (spdk_json_decode_object(params, rpc_remove_auth_secret_decoders, + SPDK_COUNTOF(rpc_remove_auth_secret_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_remove_auth_secret(&req); + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + + group = iscsi_find_auth_group_by_tag(req.tag); + if (group == NULL) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find auth group (%d)", req.tag); + free_rpc_remove_auth_secret(&req); + return; + } + + rc = iscsi_auth_group_delete_secret(group, req.user); + if (rc != 0) { + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not delete secret from CHAP group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_remove_auth_secret(&req); + return; + } + + pthread_mutex_unlock(&g_iscsi.mutex); + + free_rpc_remove_auth_secret(&req); + + w = spdk_jsonrpc_begin_result(request); + 
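+	/* Like the other iSCSI RPCs in this file, success is reported as a
+	 * bare JSON "true" result, i.e. the client sees something like:
+	 *
+	 *   {"jsonrpc": "2.0", "id": 1, "result": true}
+	 */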
spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_auth_group_remove_secret", + rpc_iscsi_auth_group_remove_secret, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_auth_group_remove_secret, + delete_secret_from_iscsi_auth_group) + +static void +rpc_iscsi_get_auth_groups(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iscsi_get_auth_groups requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + iscsi_auth_groups_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_get_auth_groups", rpc_iscsi_get_auth_groups, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_auth_groups, get_iscsi_auth_groups) + +static const struct spdk_json_object_decoder rpc_set_iscsi_opts_decoders[] = { + {"auth_file", offsetof(struct spdk_iscsi_opts, authfile), spdk_json_decode_string, true}, + {"node_base", offsetof(struct spdk_iscsi_opts, nodebase), spdk_json_decode_string, true}, + {"nop_timeout", offsetof(struct spdk_iscsi_opts, timeout), spdk_json_decode_int32, true}, + {"nop_in_interval", offsetof(struct spdk_iscsi_opts, nopininterval), spdk_json_decode_int32, true}, + {"no_discovery_auth", offsetof(struct spdk_iscsi_opts, disable_chap), spdk_json_decode_bool, true}, + {"req_discovery_auth", offsetof(struct spdk_iscsi_opts, require_chap), spdk_json_decode_bool, true}, + {"req_discovery_auth_mutual", offsetof(struct spdk_iscsi_opts, mutual_chap), spdk_json_decode_bool, true}, + {"discovery_auth_group", offsetof(struct spdk_iscsi_opts, chap_group), spdk_json_decode_int32, true}, + {"disable_chap", offsetof(struct spdk_iscsi_opts, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct spdk_iscsi_opts, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct spdk_iscsi_opts, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct spdk_iscsi_opts, chap_group), spdk_json_decode_int32, true}, + {"max_sessions", offsetof(struct spdk_iscsi_opts, MaxSessions), spdk_json_decode_uint32, true}, + {"max_queue_depth", offsetof(struct spdk_iscsi_opts, MaxQueueDepth), spdk_json_decode_uint32, true}, + {"max_connections_per_session", offsetof(struct spdk_iscsi_opts, MaxConnectionsPerSession), spdk_json_decode_uint32, true}, + {"default_time2wait", offsetof(struct spdk_iscsi_opts, DefaultTime2Wait), spdk_json_decode_uint32, true}, + {"default_time2retain", offsetof(struct spdk_iscsi_opts, DefaultTime2Retain), spdk_json_decode_uint32, true}, + {"first_burst_length", offsetof(struct spdk_iscsi_opts, FirstBurstLength), spdk_json_decode_uint32, true}, + {"immediate_data", offsetof(struct spdk_iscsi_opts, ImmediateData), spdk_json_decode_bool, true}, + {"error_recovery_level", offsetof(struct spdk_iscsi_opts, ErrorRecoveryLevel), spdk_json_decode_uint32, true}, + {"allow_duplicated_isid", offsetof(struct spdk_iscsi_opts, AllowDuplicateIsid), spdk_json_decode_bool, true}, +}; + +static void +rpc_iscsi_set_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_iscsi_opts *opts; + struct spdk_json_write_ctx *w; + + if (g_spdk_iscsi_opts != NULL) { + SPDK_ERRLOG("this RPC must not be called more than once.\n"); + spdk_jsonrpc_send_error_response(request, 
SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Must not call more than once"); + return; + } + + opts = iscsi_opts_alloc(); + if (opts == NULL) { + SPDK_ERRLOG("iscsi_opts_alloc() failed.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Out of memory"); + return; + } + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_set_iscsi_opts_decoders, + SPDK_COUNTOF(rpc_set_iscsi_opts_decoders), opts)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + iscsi_opts_free(opts); + return; + } + } + + g_spdk_iscsi_opts = iscsi_opts_copy(opts); + iscsi_opts_free(opts); + + if (g_spdk_iscsi_opts == NULL) { + SPDK_ERRLOG("iscsi_opts_copy() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Out of memory"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("iscsi_set_options", rpc_iscsi_set_options, SPDK_RPC_STARTUP) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_set_options, set_iscsi_options) diff --git a/src/spdk/lib/iscsi/iscsi_subsystem.c b/src/spdk/lib/iscsi/iscsi_subsystem.c new file mode 100644 index 000000000..1eb766233 --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi_subsystem.c @@ -0,0 +1,1577 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/sock.h" +#include "spdk/likely.h" + +#include "iscsi/iscsi.h" +#include "iscsi/init_grp.h" +#include "iscsi/portal_grp.h" +#include "iscsi/conn.h" +#include "iscsi/task.h" +#include "iscsi/tgt_node.h" + +#include "spdk_internal/event.h" +#include "spdk_internal/log.h" + +struct spdk_iscsi_opts *g_spdk_iscsi_opts = NULL; + +static struct spdk_thread *g_init_thread = NULL; +static spdk_iscsi_init_cb g_init_cb_fn = NULL; +static void *g_init_cb_arg = NULL; + +static spdk_iscsi_fini_cb g_fini_cb_fn; +static void *g_fini_cb_arg; + +#define ISCSI_CONFIG_TMPL \ +"[iSCSI]\n" \ +" # node name (not include optional part)\n" \ +" # Users can optionally change this to fit their environment.\n" \ +" NodeBase \"%s\"\n" \ +"\n" \ +" # files\n" \ +" %s %s\n" \ +"\n" \ +" # socket I/O timeout sec. (polling is infinity)\n" \ +" Timeout %d\n" \ +"\n" \ +" # authentication information for discovery session\n" \ +" DiscoveryAuthMethod %s\n" \ +" DiscoveryAuthGroup %s\n" \ +"\n" \ +" MaxSessions %d\n" \ +" MaxConnectionsPerSession %d\n" \ +" MaxConnections %d\n" \ +" MaxQueueDepth %d\n" \ +"\n" \ +" # iSCSI initial parameters negotiate with initiators\n" \ +" # NOTE: incorrect values might crash\n" \ +" DefaultTime2Wait %d\n" \ +" DefaultTime2Retain %d\n" \ +"\n" \ +" FirstBurstLength %d\n" \ +" ImmediateData %s\n" \ +" ErrorRecoveryLevel %d\n" \ +"\n" + +static void +iscsi_globals_config_text(FILE *fp) +{ + const char *authmethod = "None"; + char authgroup[32] = "None"; + + if (NULL == fp) { + return; + } + + if (g_iscsi.require_chap) { + authmethod = "CHAP"; + } else if (g_iscsi.mutual_chap) { + authmethod = "CHAP Mutual"; + } else if (!g_iscsi.disable_chap) { + authmethod = "Auto"; + } + + if (g_iscsi.chap_group) { + snprintf(authgroup, sizeof(authgroup), "AuthGroup%d", g_iscsi.chap_group); + } + + fprintf(fp, ISCSI_CONFIG_TMPL, + g_iscsi.nodebase, + g_iscsi.authfile ? "AuthFile" : "", + g_iscsi.authfile ? g_iscsi.authfile : "", + g_iscsi.timeout, authmethod, authgroup, + g_iscsi.MaxSessions, g_iscsi.MaxConnectionsPerSession, + g_iscsi.MaxConnections, + g_iscsi.MaxQueueDepth, + g_iscsi.DefaultTime2Wait, g_iscsi.DefaultTime2Retain, + g_iscsi.FirstBurstLength, + (g_iscsi.ImmediateData) ? 
"Yes" : "No", + g_iscsi.ErrorRecoveryLevel); +} + +#define ISCSI_DATA_BUFFER_ALIGNMENT (0x1000) +#define ISCSI_DATA_BUFFER_MASK (ISCSI_DATA_BUFFER_ALIGNMENT - 1) + +static void +mobj_ctor(struct spdk_mempool *mp, __attribute__((unused)) void *arg, + void *_m, __attribute__((unused)) unsigned i) +{ + struct spdk_mobj *m = _m; + + m->mp = mp; + m->buf = (uint8_t *)m + sizeof(struct spdk_mobj); + m->buf = (void *)((unsigned long)((uint8_t *)m->buf + ISCSI_DATA_BUFFER_ALIGNMENT) & + ~ISCSI_DATA_BUFFER_MASK); +} + +#define NUM_PDU_PER_CONNECTION(iscsi) (2 * (iscsi->MaxQueueDepth + MAX_LARGE_DATAIN_PER_CONNECTION + 8)) +#define PDU_POOL_SIZE(iscsi) (iscsi->MaxConnections * NUM_PDU_PER_CONNECTION(iscsi)) +#define IMMEDIATE_DATA_POOL_SIZE(iscsi) (iscsi->MaxConnections * 128) +#define DATA_OUT_POOL_SIZE(iscsi) (iscsi->MaxConnections * MAX_DATA_OUT_PER_CONNECTION) + +static int +iscsi_initialize_pdu_pool(void) +{ + struct spdk_iscsi_globals *iscsi = &g_iscsi; + int imm_mobj_size = SPDK_BDEV_BUF_SIZE_WITH_MD(iscsi_get_max_immediate_data_size()) + + sizeof(struct spdk_mobj) + ISCSI_DATA_BUFFER_ALIGNMENT; + int dout_mobj_size = SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) + + sizeof(struct spdk_mobj) + ISCSI_DATA_BUFFER_ALIGNMENT; + + /* create PDU pool */ + iscsi->pdu_pool = spdk_mempool_create("PDU_Pool", + PDU_POOL_SIZE(iscsi), + sizeof(struct spdk_iscsi_pdu), + 256, SPDK_ENV_SOCKET_ID_ANY); + if (!iscsi->pdu_pool) { + SPDK_ERRLOG("create PDU pool failed\n"); + return -1; + } + + iscsi->pdu_immediate_data_pool = spdk_mempool_create_ctor("PDU_immediate_data_Pool", + IMMEDIATE_DATA_POOL_SIZE(iscsi), + imm_mobj_size, 256, + SPDK_ENV_SOCKET_ID_ANY, + mobj_ctor, NULL); + if (!iscsi->pdu_immediate_data_pool) { + SPDK_ERRLOG("create PDU immediate data pool failed\n"); + return -1; + } + + iscsi->pdu_data_out_pool = spdk_mempool_create_ctor("PDU_data_out_Pool", + DATA_OUT_POOL_SIZE(iscsi), + dout_mobj_size, 256, + SPDK_ENV_SOCKET_ID_ANY, + mobj_ctor, NULL); + if (!iscsi->pdu_data_out_pool) { + SPDK_ERRLOG("create PDU data out pool failed\n"); + return -1; + } + + return 0; +} + +static void +iscsi_sess_ctor(struct spdk_mempool *pool, void *arg, void *session_buf, + unsigned index) +{ + struct spdk_iscsi_globals *iscsi = arg; + struct spdk_iscsi_sess *sess = session_buf; + + iscsi->session[index] = sess; + + /* tsih 0 is reserved, so start tsih values at 1. 
*/ + sess->tsih = index + 1; +} + +#define DEFAULT_TASK_POOL_SIZE 32768 + +static int +iscsi_initialize_task_pool(void) +{ + struct spdk_iscsi_globals *iscsi = &g_iscsi; + + /* create scsi_task pool */ + iscsi->task_pool = spdk_mempool_create("SCSI_TASK_Pool", + DEFAULT_TASK_POOL_SIZE, + sizeof(struct spdk_iscsi_task), + 128, SPDK_ENV_SOCKET_ID_ANY); + if (!iscsi->task_pool) { + SPDK_ERRLOG("create task pool failed\n"); + return -1; + } + + return 0; +} + +#define SESSION_POOL_SIZE(iscsi) (iscsi->MaxSessions) +static int +iscsi_initialize_session_pool(void) +{ + struct spdk_iscsi_globals *iscsi = &g_iscsi; + + iscsi->session_pool = spdk_mempool_create_ctor("Session_Pool", + SESSION_POOL_SIZE(iscsi), + sizeof(struct spdk_iscsi_sess), 0, + SPDK_ENV_SOCKET_ID_ANY, + iscsi_sess_ctor, iscsi); + if (!iscsi->session_pool) { + SPDK_ERRLOG("create session pool failed\n"); + return -1; + } + + return 0; +} + +static int +iscsi_initialize_all_pools(void) +{ + if (iscsi_initialize_pdu_pool() != 0) { + return -1; + } + + if (iscsi_initialize_session_pool() != 0) { + return -1; + } + + if (iscsi_initialize_task_pool() != 0) { + return -1; + } + + return 0; +} + +static void +iscsi_check_pool(struct spdk_mempool *pool, size_t count) +{ + if (pool && spdk_mempool_count(pool) != count) { + SPDK_ERRLOG("spdk_mempool_count(%s) == %zu, should be %zu\n", + spdk_mempool_get_name(pool), spdk_mempool_count(pool), count); + } +} + +static void +iscsi_check_pools(void) +{ + struct spdk_iscsi_globals *iscsi = &g_iscsi; + + iscsi_check_pool(iscsi->pdu_pool, PDU_POOL_SIZE(iscsi)); + iscsi_check_pool(iscsi->session_pool, SESSION_POOL_SIZE(iscsi)); + iscsi_check_pool(iscsi->pdu_immediate_data_pool, IMMEDIATE_DATA_POOL_SIZE(iscsi)); + iscsi_check_pool(iscsi->pdu_data_out_pool, DATA_OUT_POOL_SIZE(iscsi)); + iscsi_check_pool(iscsi->task_pool, DEFAULT_TASK_POOL_SIZE); +} + +static void +iscsi_free_pools(void) +{ + struct spdk_iscsi_globals *iscsi = &g_iscsi; + + spdk_mempool_free(iscsi->pdu_pool); + spdk_mempool_free(iscsi->session_pool); + spdk_mempool_free(iscsi->pdu_immediate_data_pool); + spdk_mempool_free(iscsi->pdu_data_out_pool); + spdk_mempool_free(iscsi->task_pool); +} + +void iscsi_put_pdu(struct spdk_iscsi_pdu *pdu) +{ + if (!pdu) { + return; + } + + assert(pdu->ref > 0); + pdu->ref--; + + if (pdu->ref == 0) { + if (pdu->mobj) { + spdk_mempool_put(pdu->mobj->mp, (void *)pdu->mobj); + } + + if (pdu->data && !pdu->data_from_mempool) { + free(pdu->data); + } + + spdk_mempool_put(g_iscsi.pdu_pool, (void *)pdu); + } +} + +struct spdk_iscsi_pdu *iscsi_get_pdu(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *pdu; + + assert(conn != NULL); + pdu = spdk_mempool_get(g_iscsi.pdu_pool); + if (!pdu) { + SPDK_ERRLOG("Unable to get PDU\n"); + abort(); + } + + /* we do not want to zero out the last part of the structure reserved for AHS and sense data */ + memset(pdu, 0, offsetof(struct spdk_iscsi_pdu, ahs)); + pdu->ref = 1; + pdu->conn = conn; + + return pdu; +} + +static void +iscsi_log_globals(void) +{ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthFile %s\n", + g_iscsi.authfile ? 
g_iscsi.authfile : "(none)"); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "NodeBase %s\n", g_iscsi.nodebase); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxSessions %d\n", g_iscsi.MaxSessions); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxConnectionsPerSession %d\n", + g_iscsi.MaxConnectionsPerSession); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxQueueDepth %d\n", g_iscsi.MaxQueueDepth); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "DefaultTime2Wait %d\n", + g_iscsi.DefaultTime2Wait); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "DefaultTime2Retain %d\n", + g_iscsi.DefaultTime2Retain); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "FirstBurstLength %d\n", + g_iscsi.FirstBurstLength); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ImmediateData %s\n", + g_iscsi.ImmediateData ? "Yes" : "No"); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AllowDuplicateIsid %s\n", + g_iscsi.AllowDuplicateIsid ? "Yes" : "No"); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ErrorRecoveryLevel %d\n", + g_iscsi.ErrorRecoveryLevel); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Timeout %d\n", g_iscsi.timeout); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "NopInInterval %d\n", + g_iscsi.nopininterval); + if (g_iscsi.disable_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthMethod None\n"); + } else if (!g_iscsi.require_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthMethod Auto\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthMethod %s %s\n", + g_iscsi.require_chap ? "CHAP" : "", + g_iscsi.mutual_chap ? "Mutual" : ""); + } + + if (g_iscsi.chap_group == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthGroup None\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthGroup AuthGroup%d\n", + g_iscsi.chap_group); + } +} + +static void +iscsi_opts_init(struct spdk_iscsi_opts *opts) +{ + opts->MaxSessions = DEFAULT_MAX_SESSIONS; + opts->MaxConnectionsPerSession = DEFAULT_MAX_CONNECTIONS_PER_SESSION; + opts->MaxQueueDepth = DEFAULT_MAX_QUEUE_DEPTH; + opts->DefaultTime2Wait = DEFAULT_DEFAULTTIME2WAIT; + opts->DefaultTime2Retain = DEFAULT_DEFAULTTIME2RETAIN; + opts->FirstBurstLength = SPDK_ISCSI_FIRST_BURST_LENGTH; + opts->ImmediateData = DEFAULT_IMMEDIATEDATA; + opts->AllowDuplicateIsid = false; + opts->ErrorRecoveryLevel = DEFAULT_ERRORRECOVERYLEVEL; + opts->timeout = DEFAULT_TIMEOUT; + opts->nopininterval = DEFAULT_NOPININTERVAL; + opts->disable_chap = false; + opts->require_chap = false; + opts->mutual_chap = false; + opts->chap_group = 0; + opts->authfile = NULL; + opts->nodebase = NULL; +} + +struct spdk_iscsi_opts * +iscsi_opts_alloc(void) +{ + struct spdk_iscsi_opts *opts; + + opts = calloc(1, sizeof(*opts)); + if (!opts) { + SPDK_ERRLOG("calloc() failed for iscsi options\n"); + return NULL; + } + + iscsi_opts_init(opts); + + return opts; +} + +void +iscsi_opts_free(struct spdk_iscsi_opts *opts) +{ + free(opts->authfile); + free(opts->nodebase); + free(opts); +} + +/* Deep copy of spdk_iscsi_opts */ +struct spdk_iscsi_opts * +iscsi_opts_copy(struct spdk_iscsi_opts *src) +{ + struct spdk_iscsi_opts *dst; + + dst = calloc(1, sizeof(*dst)); + if (!dst) { + SPDK_ERRLOG("calloc() failed for iscsi options\n"); + return NULL; + } + + if (src->authfile) { + dst->authfile = strdup(src->authfile); + if (!dst->authfile) { + free(dst); + SPDK_ERRLOG("failed to strdup for auth file %s\n", src->authfile); + return NULL; + } + } + + if (src->nodebase) { + dst->nodebase = strdup(src->nodebase); + if (!dst->nodebase) { + free(dst->authfile); + free(dst); + SPDK_ERRLOG("failed to strdup for nodebase %s\n", src->nodebase); + return NULL; + } + } + + dst->MaxSessions = src->MaxSessions; + dst->MaxConnectionsPerSession = 
src->MaxConnectionsPerSession; + dst->MaxQueueDepth = src->MaxQueueDepth; + dst->DefaultTime2Wait = src->DefaultTime2Wait; + dst->DefaultTime2Retain = src->DefaultTime2Retain; + dst->FirstBurstLength = src->FirstBurstLength; + dst->ImmediateData = src->ImmediateData; + dst->AllowDuplicateIsid = src->AllowDuplicateIsid; + dst->ErrorRecoveryLevel = src->ErrorRecoveryLevel; + dst->timeout = src->timeout; + dst->nopininterval = src->nopininterval; + dst->disable_chap = src->disable_chap; + dst->require_chap = src->require_chap; + dst->mutual_chap = src->mutual_chap; + dst->chap_group = src->chap_group; + + return dst; +} + +static int +iscsi_read_config_file_params(struct spdk_conf_section *sp, + struct spdk_iscsi_opts *opts) +{ + const char *val; + int MaxSessions; + int MaxConnectionsPerSession; + int MaxQueueDepth; + int DefaultTime2Wait; + int DefaultTime2Retain; + int FirstBurstLength; + int ErrorRecoveryLevel; + int timeout; + int nopininterval; + const char *ag_tag; + int ag_tag_i; + int i; + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + val = spdk_conf_section_get_val(sp, "AuthFile"); + if (val != NULL) { + opts->authfile = strdup(val); + if (!opts->authfile) { + SPDK_ERRLOG("strdup() failed for AuthFile\n"); + return -ENOMEM; + } + } + + val = spdk_conf_section_get_val(sp, "NodeBase"); + if (val != NULL) { + opts->nodebase = strdup(val); + if (!opts->nodebase) { + free(opts->authfile); + SPDK_ERRLOG("strdup() failed for NodeBase\n"); + return -ENOMEM; + } + } + + MaxSessions = spdk_conf_section_get_intval(sp, "MaxSessions"); + if (MaxSessions >= 0) { + opts->MaxSessions = MaxSessions; + } + + MaxConnectionsPerSession = spdk_conf_section_get_intval(sp, "MaxConnectionsPerSession"); + if (MaxConnectionsPerSession >= 0) { + opts->MaxConnectionsPerSession = MaxConnectionsPerSession; + } + + MaxQueueDepth = spdk_conf_section_get_intval(sp, "MaxQueueDepth"); + if (MaxQueueDepth >= 0) { + opts->MaxQueueDepth = MaxQueueDepth; + } + + DefaultTime2Wait = spdk_conf_section_get_intval(sp, "DefaultTime2Wait"); + if (DefaultTime2Wait >= 0) { + opts->DefaultTime2Wait = DefaultTime2Wait; + } + + DefaultTime2Retain = spdk_conf_section_get_intval(sp, "DefaultTime2Retain"); + if (DefaultTime2Retain >= 0) { + opts->DefaultTime2Retain = DefaultTime2Retain; + } + + FirstBurstLength = spdk_conf_section_get_intval(sp, "FirstBurstLength"); + if (FirstBurstLength >= 0) { + opts->FirstBurstLength = FirstBurstLength; + } + + opts->ImmediateData = spdk_conf_section_get_boolval(sp, "ImmediateData", + opts->ImmediateData); + + /* This option is only for test. + * If AllowDuplicateIsid is enabled, it allows different connections carrying + * TSIH=0 login the target within the same session. 
+ */ + opts->AllowDuplicateIsid = spdk_conf_section_get_boolval(sp, "AllowDuplicateIsid", + opts->AllowDuplicateIsid); + + ErrorRecoveryLevel = spdk_conf_section_get_intval(sp, "ErrorRecoveryLevel"); + if (ErrorRecoveryLevel >= 0) { + opts->ErrorRecoveryLevel = ErrorRecoveryLevel; + } + timeout = spdk_conf_section_get_intval(sp, "Timeout"); + if (timeout >= 0) { + opts->timeout = timeout; + } + nopininterval = spdk_conf_section_get_intval(sp, "NopInInterval"); + if (nopininterval >= 0) { + opts->nopininterval = nopininterval; + } + val = spdk_conf_section_get_val(sp, "DiscoveryAuthMethod"); + if (val != NULL) { + for (i = 0; ; i++) { + val = spdk_conf_section_get_nmval(sp, "DiscoveryAuthMethod", 0, i); + if (val == NULL) { + break; + } + if (strcasecmp(val, "CHAP") == 0) { + opts->require_chap = true; + } else if (strcasecmp(val, "Mutual") == 0) { + opts->require_chap = true; + opts->mutual_chap = true; + } else if (strcasecmp(val, "Auto") == 0) { + opts->disable_chap = false; + opts->require_chap = false; + opts->mutual_chap = false; + } else if (strcasecmp(val, "None") == 0) { + opts->disable_chap = true; + opts->require_chap = false; + opts->mutual_chap = false; + } else { + SPDK_ERRLOG("unknown CHAP mode %s\n", val); + } + } + if (opts->mutual_chap && !opts->require_chap) { + free(opts->authfile); + free(opts->nodebase); + SPDK_ERRLOG("CHAP must set to be required when using mutual CHAP.\n"); + return -EINVAL; + } + } + val = spdk_conf_section_get_val(sp, "DiscoveryAuthGroup"); + if (val != NULL) { + ag_tag = val; + if (strcasecmp(ag_tag, "None") == 0) { + opts->chap_group = 0; + } else { + if (strncasecmp(ag_tag, "AuthGroup", + strlen("AuthGroup")) != 0 + || sscanf(ag_tag, "%*[^0-9]%d", &ag_tag_i) != 1 + || ag_tag_i == 0) { + SPDK_ERRLOG("invalid auth group %s, ignoring\n", ag_tag); + } else { + opts->chap_group = ag_tag_i; + } + } + } + + return 0; +} + +static int +iscsi_opts_verify(struct spdk_iscsi_opts *opts) +{ + if (!opts->nodebase) { + opts->nodebase = strdup(SPDK_ISCSI_DEFAULT_NODEBASE); + if (opts->nodebase == NULL) { + SPDK_ERRLOG("strdup() failed for default nodebase\n"); + return -ENOMEM; + } + } + + if (opts->MaxSessions == 0 || opts->MaxSessions > 65535) { + SPDK_ERRLOG("%d is invalid. MaxSessions must be more than 0 and no more than 65535\n", + opts->MaxSessions); + return -EINVAL; + } + + if (opts->MaxConnectionsPerSession == 0 || opts->MaxConnectionsPerSession > 65535) { + SPDK_ERRLOG("%d is invalid. MaxConnectionsPerSession must be more than 0 and no more than 65535\n", + opts->MaxConnectionsPerSession); + return -EINVAL; + } + + if (opts->MaxQueueDepth == 0 || opts->MaxQueueDepth > 256) { + SPDK_ERRLOG("%d is invalid. MaxQueueDepth must be more than 0 and no more than 256\n", + opts->MaxQueueDepth); + return -EINVAL; + } + + if (opts->DefaultTime2Wait > 3600) { + SPDK_ERRLOG("%d is invalid. DefaultTime2Wait must be no more than 3600\n", + opts->DefaultTime2Wait); + return -EINVAL; + } + + if (opts->DefaultTime2Retain > 3600) { + SPDK_ERRLOG("%d is invalid. 
DefaultTime2Retain must be no more than 3600\n", + opts->DefaultTime2Retain); + return -EINVAL; + } + + if (opts->FirstBurstLength >= SPDK_ISCSI_MIN_FIRST_BURST_LENGTH) { + if (opts->FirstBurstLength > SPDK_ISCSI_MAX_BURST_LENGTH) { + SPDK_ERRLOG("FirstBurstLength %d shall not exceed MaxBurstLength %d\n", + opts->FirstBurstLength, SPDK_ISCSI_MAX_BURST_LENGTH); + return -EINVAL; + } + } else { + SPDK_ERRLOG("FirstBurstLength %d shall be no less than %d\n", + opts->FirstBurstLength, SPDK_ISCSI_MIN_FIRST_BURST_LENGTH); + return -EINVAL; + } + + if (opts->ErrorRecoveryLevel > 2) { + SPDK_ERRLOG("ErrorRecoveryLevel %d is not supported.\n", opts->ErrorRecoveryLevel); + return -EINVAL; + } + + if (opts->timeout < 0) { + SPDK_ERRLOG("%d is invalid. timeout must not be less than 0\n", opts->timeout); + return -EINVAL; + } + + if (opts->nopininterval < 0 || opts->nopininterval > MAX_NOPININTERVAL) { + SPDK_ERRLOG("%d is invalid. nopinterval must be between 0 and %d\n", + opts->nopininterval, MAX_NOPININTERVAL); + return -EINVAL; + } + + if (!iscsi_check_chap_params(opts->disable_chap, opts->require_chap, + opts->mutual_chap, opts->chap_group)) { + SPDK_ERRLOG("CHAP params in opts are illegal combination\n"); + return -EINVAL; + } + + return 0; +} + +static int +iscsi_parse_options(struct spdk_iscsi_opts **popts) +{ + struct spdk_iscsi_opts *opts; + struct spdk_conf_section *sp; + int rc; + + opts = iscsi_opts_alloc(); + if (!opts) { + SPDK_ERRLOG("iscsi_opts_alloc_failed() failed\n"); + return -ENOMEM; + } + + /* Process parameters */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_read_config_file_parmas\n"); + sp = spdk_conf_find_section(NULL, "iSCSI"); + if (sp != NULL) { + rc = iscsi_read_config_file_params(sp, opts); + if (rc != 0) { + free(opts); + SPDK_ERRLOG("iscsi_read_config_file_params() failed\n"); + return rc; + } + } + + *popts = opts; + + return 0; +} + +static int +iscsi_set_global_params(struct spdk_iscsi_opts *opts) +{ + int rc; + + rc = iscsi_opts_verify(opts); + if (rc != 0) { + SPDK_ERRLOG("spdk_iscsi_opts_verify() failed\n"); + return rc; + } + + if (opts->authfile != NULL) { + g_iscsi.authfile = strdup(opts->authfile); + if (!g_iscsi.authfile) { + SPDK_ERRLOG("failed to strdup for auth file %s\n", opts->authfile); + return -ENOMEM; + } + } + + g_iscsi.nodebase = strdup(opts->nodebase); + if (!g_iscsi.nodebase) { + SPDK_ERRLOG("failed to strdup for nodebase %s\n", opts->nodebase); + return -ENOMEM; + } + + g_iscsi.MaxSessions = opts->MaxSessions; + g_iscsi.MaxConnectionsPerSession = opts->MaxConnectionsPerSession; + g_iscsi.MaxQueueDepth = opts->MaxQueueDepth; + g_iscsi.DefaultTime2Wait = opts->DefaultTime2Wait; + g_iscsi.DefaultTime2Retain = opts->DefaultTime2Retain; + g_iscsi.FirstBurstLength = opts->FirstBurstLength; + g_iscsi.ImmediateData = opts->ImmediateData; + g_iscsi.AllowDuplicateIsid = opts->AllowDuplicateIsid; + g_iscsi.ErrorRecoveryLevel = opts->ErrorRecoveryLevel; + g_iscsi.timeout = opts->timeout; + g_iscsi.nopininterval = opts->nopininterval; + g_iscsi.disable_chap = opts->disable_chap; + g_iscsi.require_chap = opts->require_chap; + g_iscsi.mutual_chap = opts->mutual_chap; + g_iscsi.chap_group = opts->chap_group; + + iscsi_log_globals(); + + return 0; +} + +int +iscsi_set_discovery_auth(bool disable_chap, bool require_chap, bool mutual_chap, + int32_t chap_group) +{ + if (!iscsi_check_chap_params(disable_chap, require_chap, mutual_chap, + chap_group)) { + SPDK_ERRLOG("CHAP params are illegal combination\n"); + return -EINVAL; + } + + pthread_mutex_lock(&g_iscsi.mutex); + 
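/* Apply the already-validated discovery CHAP settings to the global state while holding g_iscsi.mutex. */ +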
g_iscsi.disable_chap = disable_chap; + g_iscsi.require_chap = require_chap; + g_iscsi.mutual_chap = mutual_chap; + g_iscsi.chap_group = chap_group; + pthread_mutex_unlock(&g_iscsi.mutex); + + return 0; +} + +int +iscsi_auth_group_add_secret(struct spdk_iscsi_auth_group *group, + const char *user, const char *secret, + const char *muser, const char *msecret) +{ + struct spdk_iscsi_auth_secret *_secret; + size_t len; + + if (user == NULL || secret == NULL) { + SPDK_ERRLOG("user and secret must be specified\n"); + return -EINVAL; + } + + if (muser != NULL && msecret == NULL) { + SPDK_ERRLOG("msecret must be specified with muser\n"); + return -EINVAL; + } + + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + if (strcmp(_secret->user, user) == 0) { + SPDK_ERRLOG("user for secret is duplicated\n"); + return -EEXIST; + } + } + + _secret = calloc(1, sizeof(*_secret)); + if (_secret == NULL) { + SPDK_ERRLOG("calloc() failed for CHAP secret\n"); + return -ENOMEM; + } + + len = strnlen(user, sizeof(_secret->user)); + if (len > sizeof(_secret->user) - 1) { + SPDK_ERRLOG("CHAP user longer than %zu characters: %s\n", + sizeof(_secret->user) - 1, user); + free(_secret); + return -EINVAL; + } + memcpy(_secret->user, user, len); + + len = strnlen(secret, sizeof(_secret->secret)); + if (len > sizeof(_secret->secret) - 1) { + SPDK_ERRLOG("CHAP secret longer than %zu characters: %s\n", + sizeof(_secret->secret) - 1, secret); + free(_secret); + return -EINVAL; + } + memcpy(_secret->secret, secret, len); + + if (muser != NULL) { + len = strnlen(muser, sizeof(_secret->muser)); + if (len > sizeof(_secret->muser) - 1) { + SPDK_ERRLOG("Mutual CHAP user longer than %zu characters: %s\n", + sizeof(_secret->muser) - 1, muser); + free(_secret); + return -EINVAL; + } + memcpy(_secret->muser, muser, len); + + len = strnlen(msecret, sizeof(_secret->msecret)); + if (len > sizeof(_secret->msecret) - 1) { + SPDK_ERRLOG("Mutual CHAP secret longer than %zu characters: %s\n", + sizeof(_secret->msecret) - 1, msecret); + free(_secret); + return -EINVAL; + } + memcpy(_secret->msecret, msecret, len); + } + + TAILQ_INSERT_TAIL(&group->secret_head, _secret, tailq); + return 0; +} + +int +iscsi_auth_group_delete_secret(struct spdk_iscsi_auth_group *group, + const char *user) +{ + struct spdk_iscsi_auth_secret *_secret; + + if (user == NULL) { + SPDK_ERRLOG("user must be specified\n"); + return -EINVAL; + } + + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + if (strcmp(_secret->user, user) == 0) { + break; + } + } + + if (_secret == NULL) { + SPDK_ERRLOG("secret is not found\n"); + return -ENODEV; + } + + TAILQ_REMOVE(&group->secret_head, _secret, tailq); + free(_secret); + + return 0; +} + +int +iscsi_add_auth_group(int32_t tag, struct spdk_iscsi_auth_group **_group) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) { + if (group->tag == tag) { + SPDK_ERRLOG("Auth group (%d) already exists\n", tag); + return -EEXIST; + } + } + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + SPDK_ERRLOG("calloc() failed for auth group\n"); + return -ENOMEM; + } + + TAILQ_INIT(&group->secret_head); + group->tag = tag; + + TAILQ_INSERT_TAIL(&g_iscsi.auth_group_head, group, tailq); + + *_group = group; + return 0; +} + +void +iscsi_delete_auth_group(struct spdk_iscsi_auth_group *group) +{ + struct spdk_iscsi_auth_secret *_secret, *tmp; + + TAILQ_REMOVE(&g_iscsi.auth_group_head, group, tailq); + + TAILQ_FOREACH_SAFE(_secret, &group->secret_head, tailq, tmp) { + 
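/* Detach and free every CHAP secret registered in this group before freeing the group itself. */ +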
TAILQ_REMOVE(&group->secret_head, _secret, tailq); + free(_secret); + } + free(group); +} + +struct spdk_iscsi_auth_group * +iscsi_find_auth_group_by_tag(int32_t tag) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) { + if (group->tag == tag) { + return group; + } + } + + return NULL; +} + +static void +iscsi_auth_groups_destroy(void) +{ + struct spdk_iscsi_auth_group *group, *tmp; + + TAILQ_FOREACH_SAFE(group, &g_iscsi.auth_group_head, tailq, tmp) { + iscsi_delete_auth_group(group); + } +} + +static int +iscsi_parse_auth_group(struct spdk_conf_section *sp) +{ + int rc; + int i; + int tag; + const char *val, *user, *secret, *muser, *msecret; + struct spdk_iscsi_auth_group *group = NULL; + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + tag = spdk_conf_section_get_num(sp); + + rc = iscsi_add_auth_group(tag, &group); + if (rc != 0) { + SPDK_ERRLOG("Failed to add auth group\n"); + return rc; + } + + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "Auth", i); + if (val == NULL) { + break; + } + + user = spdk_conf_section_get_nmval(sp, "Auth", i, 0); + secret = spdk_conf_section_get_nmval(sp, "Auth", i, 1); + muser = spdk_conf_section_get_nmval(sp, "Auth", i, 2); + msecret = spdk_conf_section_get_nmval(sp, "Auth", i, 3); + + rc = iscsi_auth_group_add_secret(group, user, secret, muser, msecret); + if (rc != 0) { + SPDK_ERRLOG("Failed to add secret to auth group\n"); + iscsi_delete_auth_group(group); + return rc; + } + } + + return 0; +} + +static int +iscsi_parse_auth_info(void) +{ + struct spdk_conf *config; + struct spdk_conf_section *sp; + int rc; + + config = spdk_conf_allocate(); + if (!config) { + SPDK_ERRLOG("Failed to allocate config file\n"); + return -ENOMEM; + } + + rc = spdk_conf_read(config, g_iscsi.authfile); + if (rc != 0) { + SPDK_INFOLOG(SPDK_LOG_ISCSI, "Failed to load auth file\n"); + spdk_conf_free(config); + return rc; + } + + sp = spdk_conf_first_section(config); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "AuthGroup")) { + if (spdk_conf_section_get_num(sp) == 0) { + SPDK_ERRLOG("Group 0 is invalid\n"); + iscsi_auth_groups_destroy(); + spdk_conf_free(config); + return -EINVAL; + } + + rc = iscsi_parse_auth_group(sp); + if (rc != 0) { + SPDK_ERRLOG("parse_auth_group() failed\n"); + iscsi_auth_groups_destroy(); + spdk_conf_free(config); + return rc; + } + } + sp = spdk_conf_next_section(sp); + } + + spdk_conf_free(config); + return 0; +} + +static struct spdk_iscsi_auth_secret * +iscsi_find_auth_secret(const char *authuser, int ag_tag) +{ + struct spdk_iscsi_auth_group *group; + struct spdk_iscsi_auth_secret *_secret; + + TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) { + if (group->tag == ag_tag) { + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + if (strcmp(_secret->user, authuser) == 0) { + return _secret; + } + } + } + } + + return NULL; +} + +int +iscsi_chap_get_authinfo(struct iscsi_chap_auth *auth, const char *authuser, + int ag_tag) +{ + struct spdk_iscsi_auth_secret *_secret; + + if (authuser == NULL) { + return -EINVAL; + } + + if (auth->user[0] != '\0') { + memset(auth->user, 0, sizeof(auth->user)); + memset(auth->secret, 0, sizeof(auth->secret)); + memset(auth->muser, 0, sizeof(auth->muser)); + memset(auth->msecret, 0, sizeof(auth->msecret)); + } + + pthread_mutex_lock(&g_iscsi.mutex); + + _secret = iscsi_find_auth_secret(authuser, ag_tag); + if (_secret == NULL) { + 
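/* No secret matches this user/tag pair; drop the lock before reporting the lookup failure. */ +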
pthread_mutex_unlock(&g_iscsi.mutex); + + SPDK_ERRLOG("CHAP secret is not found: user:%s, tag:%d\n", + authuser, ag_tag); + return -ENOENT; + } + + memcpy(auth->user, _secret->user, sizeof(auth->user)); + memcpy(auth->secret, _secret->secret, sizeof(auth->secret)); + + if (_secret->muser[0] != '\0') { + memcpy(auth->muser, _secret->muser, sizeof(auth->muser)); + memcpy(auth->msecret, _secret->msecret, sizeof(auth->msecret)); + } + + pthread_mutex_unlock(&g_iscsi.mutex); + return 0; +} + +static int +iscsi_initialize_global_params(void) +{ + int rc; + + if (!g_spdk_iscsi_opts) { + rc = iscsi_parse_options(&g_spdk_iscsi_opts); + if (rc != 0) { + SPDK_ERRLOG("iscsi_parse_options() failed\n"); + return rc; + } + } + + rc = iscsi_set_global_params(g_spdk_iscsi_opts); + if (rc != 0) { + SPDK_ERRLOG("iscsi_set_global_params() failed\n"); + } + + iscsi_opts_free(g_spdk_iscsi_opts); + g_spdk_iscsi_opts = NULL; + + return rc; +} + +static void +iscsi_init_complete(int rc) +{ + spdk_iscsi_init_cb cb_fn = g_init_cb_fn; + void *cb_arg = g_init_cb_arg; + + g_init_cb_fn = NULL; + g_init_cb_arg = NULL; + + cb_fn(cb_arg, rc); +} + +static void +iscsi_parse_configuration(void) +{ + int rc; + + rc = iscsi_parse_portal_grps(); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_portal_grps() failed\n"); + goto end; + } + + rc = iscsi_parse_init_grps(); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_init_grps() failed\n"); + goto end; + } + + rc = iscsi_parse_tgt_nodes(); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_tgt_nodes() failed\n"); + } + + if (g_iscsi.authfile != NULL) { + if (access(g_iscsi.authfile, R_OK) == 0) { + rc = iscsi_parse_auth_info(); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_auth_info() failed\n"); + } + } else { + SPDK_INFOLOG(SPDK_LOG_ISCSI, "CHAP secret file is not found in the path %s\n", + g_iscsi.authfile); + } + } + +end: + iscsi_init_complete(rc); +} + +static int +iscsi_poll_group_poll(void *ctx) +{ + struct spdk_iscsi_poll_group *group = ctx; + struct spdk_iscsi_conn *conn, *tmp; + int rc; + + if (spdk_unlikely(STAILQ_EMPTY(&group->connections))) { + return SPDK_POLLER_IDLE; + } + + rc = spdk_sock_group_poll(group->sock_group); + if (rc < 0) { + SPDK_ERRLOG("Failed to poll sock_group=%p\n", group->sock_group); + } + + STAILQ_FOREACH_SAFE(conn, &group->connections, pg_link, tmp) { + if (conn->state == ISCSI_CONN_STATE_EXITING) { + iscsi_conn_destruct(conn); + } + } + + return rc != 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +} + +static int +iscsi_poll_group_handle_nop(void *ctx) +{ + struct spdk_iscsi_poll_group *group = ctx; + struct spdk_iscsi_conn *conn, *tmp; + + STAILQ_FOREACH_SAFE(conn, &group->connections, pg_link, tmp) { + iscsi_conn_handle_nop(conn); + } + + return SPDK_POLLER_BUSY; +} + +static int +iscsi_poll_group_create(void *io_device, void *ctx_buf) +{ + struct spdk_iscsi_poll_group *pg = ctx_buf; + + STAILQ_INIT(&pg->connections); + pg->sock_group = spdk_sock_group_create(NULL); + assert(pg->sock_group != NULL); + + pg->poller = SPDK_POLLER_REGISTER(iscsi_poll_group_poll, pg, 0); + /* set the period to 1 sec */ + pg->nop_poller = SPDK_POLLER_REGISTER(iscsi_poll_group_handle_nop, pg, 1000000); + + return 0; +} + +static void +iscsi_poll_group_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_iscsi_poll_group *pg = ctx_buf; + struct spdk_io_channel *ch; + struct spdk_thread *thread; + + assert(pg->poller != NULL); + assert(pg->sock_group != NULL); + + spdk_sock_group_close(&pg->sock_group); + spdk_poller_unregister(&pg->poller); + spdk_poller_unregister(&pg->nop_poller); + + ch = spdk_io_channel_from_ctx(pg); + thread = spdk_io_channel_get_thread(ch); + + assert(thread == spdk_get_thread()); + + spdk_thread_exit(thread); +} + +static void +_iscsi_init_thread_done(void *ctx) +{ + struct spdk_iscsi_poll_group *pg = ctx; + + TAILQ_INSERT_TAIL(&g_iscsi.poll_group_head, pg, link); + if (--g_iscsi.refcnt == 0) { + iscsi_parse_configuration(); + } +} + +static void +_iscsi_init_thread(void *ctx) +{ + struct spdk_io_channel *ch; + struct spdk_iscsi_poll_group *pg; + + ch = spdk_get_io_channel(&g_iscsi); + pg = spdk_io_channel_get_ctx(ch); + + spdk_thread_send_msg(g_init_thread, _iscsi_init_thread_done, pg); +} + +static void +initialize_iscsi_poll_group(void) +{ + struct spdk_cpuset tmp_cpumask = {}; + uint32_t i; + char thread_name[32]; + struct spdk_thread *thread; + + spdk_io_device_register(&g_iscsi, iscsi_poll_group_create, iscsi_poll_group_destroy, + sizeof(struct spdk_iscsi_poll_group), "iscsi_tgt"); + + /* Create threads for CPU cores active for this application, and send a + * message to each thread to create a poll group on it. + */ + g_init_thread = spdk_get_thread(); + assert(g_init_thread != NULL); + assert(g_iscsi.refcnt == 0); + + SPDK_ENV_FOREACH_CORE(i) { + spdk_cpuset_zero(&tmp_cpumask); + spdk_cpuset_set_cpu(&tmp_cpumask, i, true); + snprintf(thread_name, sizeof(thread_name), "iscsi_poll_group_%u", i); + + thread = spdk_thread_create(thread_name, &tmp_cpumask); + assert(thread != NULL); + + g_iscsi.refcnt++; + spdk_thread_send_msg(thread, _iscsi_init_thread, NULL); + } +} + +static int +iscsi_parse_globals(void) +{ + int rc; + + rc = iscsi_initialize_global_params(); + if (rc != 0) { + SPDK_ERRLOG("iscsi_initialize_iscsi_global_params() failed\n"); + return rc; + } + + g_iscsi.session = calloc(1, sizeof(struct spdk_iscsi_sess *) * g_iscsi.MaxSessions); + if (!g_iscsi.session) { + SPDK_ERRLOG("calloc() failed for session array\n"); + return -1; + } + + /* + * For now, just support same number of total connections, rather + * than MaxSessions * MaxConnectionsPerSession. After we add better + * handling for low resource conditions from our various buffer + * pools, we can bump this up to support more connections. 
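+ * For example, with MaxSessions set to 128 the target accepts at most 128 connections in total.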
+ */ + g_iscsi.MaxConnections = g_iscsi.MaxSessions; + + rc = iscsi_initialize_all_pools(); + if (rc != 0) { + SPDK_ERRLOG("initialize_all_pools() failed\n"); + free(g_iscsi.session); + g_iscsi.session = NULL; + return -1; + } + + rc = initialize_iscsi_conns(); + if (rc < 0) { + SPDK_ERRLOG("initialize_iscsi_conns() failed\n"); + free(g_iscsi.session); + g_iscsi.session = NULL; + return rc; + } + + initialize_iscsi_poll_group(); + return 0; +} + +void +spdk_iscsi_init(spdk_iscsi_init_cb cb_fn, void *cb_arg) +{ + int rc; + + assert(cb_fn != NULL); + g_init_cb_fn = cb_fn; + g_init_cb_arg = cb_arg; + + rc = iscsi_parse_globals(); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_globals() failed\n"); + iscsi_init_complete(-1); + } + + /* + * iscsi_parse_configuration() will be called as the callback to + * spdk_initialize_iscsi_poll_group() and will complete iSCSI + * subsystem initialization. + */ +} + +void +spdk_iscsi_fini(spdk_iscsi_fini_cb cb_fn, void *cb_arg) +{ + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + iscsi_portal_grp_close_all(); + shutdown_iscsi_conns(); +} + +static void +iscsi_fini_done(void *io_device) +{ + free(g_iscsi.authfile); + free(g_iscsi.nodebase); + + pthread_mutex_destroy(&g_iscsi.mutex); + g_fini_cb_fn(g_fini_cb_arg); +} + +static void +_iscsi_fini_dev_unreg(struct spdk_io_channel_iter *i, int status) +{ + iscsi_check_pools(); + iscsi_free_pools(); + free(g_iscsi.session); + + assert(TAILQ_EMPTY(&g_iscsi.poll_group_head)); + + iscsi_shutdown_tgt_nodes(); + iscsi_init_grps_destroy(); + iscsi_portal_grps_destroy(); + iscsi_auth_groups_destroy(); + + spdk_io_device_unregister(&g_iscsi, iscsi_fini_done); +} + +static void +_iscsi_fini_thread(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch; + struct spdk_iscsi_poll_group *pg; + + ch = spdk_io_channel_iter_get_channel(i); + pg = spdk_io_channel_get_ctx(ch); + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_REMOVE(&g_iscsi.poll_group_head, pg, link); + pthread_mutex_unlock(&g_iscsi.mutex); + + spdk_put_io_channel(ch); + + spdk_for_each_channel_continue(i, 0); +} + +void +shutdown_iscsi_conns_done(void) +{ + spdk_for_each_channel(&g_iscsi, _iscsi_fini_thread, NULL, _iscsi_fini_dev_unreg); +} + +void +spdk_iscsi_config_text(FILE *fp) +{ + iscsi_globals_config_text(fp); + iscsi_portal_grps_config_text(fp); + iscsi_init_grps_config_text(fp); + iscsi_tgt_nodes_config_text(fp); +} + +void +iscsi_opts_info_json(struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + if (g_iscsi.authfile != NULL) { + spdk_json_write_named_string(w, "auth_file", g_iscsi.authfile); + } + spdk_json_write_named_string(w, "node_base", g_iscsi.nodebase); + + spdk_json_write_named_uint32(w, "max_sessions", g_iscsi.MaxSessions); + spdk_json_write_named_uint32(w, "max_connections_per_session", + g_iscsi.MaxConnectionsPerSession); + + spdk_json_write_named_uint32(w, "max_queue_depth", g_iscsi.MaxQueueDepth); + + spdk_json_write_named_uint32(w, "default_time2wait", g_iscsi.DefaultTime2Wait); + spdk_json_write_named_uint32(w, "default_time2retain", g_iscsi.DefaultTime2Retain); + + spdk_json_write_named_uint32(w, "first_burst_length", g_iscsi.FirstBurstLength); + + spdk_json_write_named_bool(w, "immediate_data", g_iscsi.ImmediateData); + + spdk_json_write_named_bool(w, "allow_duplicated_isid", g_iscsi.AllowDuplicateIsid); + + spdk_json_write_named_uint32(w, "error_recovery_level", g_iscsi.ErrorRecoveryLevel); + + spdk_json_write_named_int32(w, "nop_timeout", g_iscsi.timeout); + spdk_json_write_named_int32(w, "nop_in_interval", 
g_iscsi.nopininterval); + + spdk_json_write_named_bool(w, "disable_chap", g_iscsi.disable_chap); + spdk_json_write_named_bool(w, "require_chap", g_iscsi.require_chap); + spdk_json_write_named_bool(w, "mutual_chap", g_iscsi.mutual_chap); + spdk_json_write_named_int32(w, "chap_group", g_iscsi.chap_group); + + spdk_json_write_object_end(w); +} + +static void +iscsi_auth_group_info_json(struct spdk_iscsi_auth_group *group, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_auth_secret *_secret; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "tag", group->tag); + + spdk_json_write_named_array_begin(w, "secrets"); + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "user", _secret->user); + spdk_json_write_named_string(w, "secret", _secret->secret); + + if (_secret->muser[0] != '\0') { + spdk_json_write_named_string(w, "muser", _secret->muser); + spdk_json_write_named_string(w, "msecret", _secret->msecret); + } + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); +} + +static void +iscsi_auth_group_config_json(struct spdk_iscsi_auth_group *group, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "iscsi_create_auth_group"); + + spdk_json_write_name(w, "params"); + iscsi_auth_group_info_json(group, w); + + spdk_json_write_object_end(w); +} + +void +iscsi_auth_groups_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) { + iscsi_auth_group_info_json(group, w); + } +} + +static void +iscsi_auth_groups_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) { + iscsi_auth_group_config_json(group, w); + } +} + +static void +iscsi_opts_config_json(struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "iscsi_set_options"); + + spdk_json_write_name(w, "params"); + iscsi_opts_info_json(w); + + spdk_json_write_object_end(w); +} + +void +spdk_iscsi_config_json(struct spdk_json_write_ctx *w) +{ + spdk_json_write_array_begin(w); + iscsi_opts_config_json(w); + iscsi_portal_grps_config_json(w); + iscsi_init_grps_config_json(w); + iscsi_tgt_nodes_config_json(w); + iscsi_auth_groups_config_json(w); + spdk_json_write_array_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("iscsi", SPDK_LOG_ISCSI) diff --git a/src/spdk/lib/iscsi/md5.c b/src/spdk/lib/iscsi/md5.c new file mode 100644 index 000000000..c316ac354 --- /dev/null +++ b/src/spdk/lib/iscsi/md5.c @@ -0,0 +1,75 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <openssl/md5.h> + +#include "iscsi/md5.h" + +int md5init(struct spdk_md5ctx *md5ctx) +{ + int rc; + + if (md5ctx == NULL) { + return -1; + } + rc = MD5_Init(&md5ctx->md5ctx); + return rc; +} + +int md5final(void *md5, struct spdk_md5ctx *md5ctx) +{ + int rc; + + if (md5ctx == NULL || md5 == NULL) { + return -1; + } + rc = MD5_Final(md5, &md5ctx->md5ctx); + return rc; +} + +int md5update(struct spdk_md5ctx *md5ctx, const void *data, size_t len) +{ + int rc; + + if (md5ctx == NULL) { + return -1; + } + if (data == NULL || len == 0) { + return 0; + } + rc = MD5_Update(&md5ctx->md5ctx, data, len); + return rc; +} diff --git a/src/spdk/lib/iscsi/md5.h b/src/spdk/lib/iscsi/md5.h new file mode 100644 index 000000000..d6fc4c1ff --- /dev/null +++ b/src/spdk/lib/iscsi/md5.h @@ -0,0 +1,52 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_MD5_H +#define SPDK_MD5_H + +#include "spdk/stdinc.h" + +#include <openssl/md5.h> + +#define SPDK_MD5DIGEST_LEN MD5_DIGEST_LENGTH + +struct spdk_md5ctx { + MD5_CTX md5ctx; +}; + +int md5init(struct spdk_md5ctx *md5ctx); +int md5final(void *md5, struct spdk_md5ctx *md5ctx); +int md5update(struct spdk_md5ctx *md5ctx, const void *data, size_t len); + +#endif /* SPDK_MD5_H */ diff --git a/src/spdk/lib/iscsi/param.c b/src/spdk/lib/iscsi/param.c new file mode 100644 index 000000000..18f579359 --- /dev/null +++ b/src/spdk/lib/iscsi/param.c @@ -0,0 +1,1216 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/string.h" +#include "iscsi/iscsi.h" +#include "iscsi/param.h" +#include "iscsi/conn.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#define MAX_TMPBUF 1024 + +/* whose value may be bigger than 255 */ +static const char *non_simple_value_params[] = { + "CHAP_C", + "CHAP_R", + NULL, +}; + +void +iscsi_param_free(struct iscsi_param *params) +{ + struct iscsi_param *param, *next_param; + + if (params == NULL) { + return; + } + for (param = params; param != NULL; param = next_param) { + next_param = param->next; + if (param->list) { + free(param->list); + } + free(param->val); + free(param->key); + free(param); + } +} + +static int +iscsi_find_key_in_array(const char *key, const char *array[]) +{ + int i; + + for (i = 0; array[i] != NULL; i++) { + if (strcasecmp(key, array[i]) == 0) { + return 1; + } + } + return 0; +} + +struct iscsi_param * +iscsi_param_find(struct iscsi_param *params, const char *key) +{ + struct iscsi_param *param; + + if (params == NULL || key == NULL) { + return NULL; + } + for (param = params; param != NULL; param = param->next) { + if (param->key != NULL && param->key[0] == key[0] + && strcasecmp(param->key, key) == 0) { + return param; + } + } + return NULL; +} + +int +iscsi_param_del(struct iscsi_param **params, const char *key) +{ + struct iscsi_param *param, *prev_param = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "del %s\n", key); + if (params == NULL || key == NULL) { + return 0; + } + for (param = *params; param != NULL; param = param->next) { + if (param->key != NULL && param->key[0] == key[0] + && strcasecmp(param->key, key) == 0) { + if (prev_param != NULL) { + prev_param->next = param->next; + } else { + *params = param->next; + } + param->next = NULL; + iscsi_param_free(param); + return 0; + } + prev_param = param; + } + return -1; +} + +int +iscsi_param_add(struct iscsi_param **params, const char *key, + const char *val, const char *list, int type) +{ + struct iscsi_param *param, *last_param; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add %s=%s, list=[%s], type=%d\n", + key, val, list, type); + if (key == NULL) { + return -1; + } + + param = iscsi_param_find(*params, key); + if (param != NULL) { + iscsi_param_del(params, key); + } + + param = calloc(1, sizeof(*param)); + if (!param) { + SPDK_ERRLOG("calloc() failed for parameter\n"); + return -ENOMEM; + } + + param->next = NULL; + param->key = xstrdup(key); + param->val = xstrdup(val); + param->list = xstrdup(list); + param->type = type; + + last_param = *params; + if (last_param != NULL) { + while (last_param->next != NULL) { + last_param = last_param->next; + } + last_param->next = param; + } else { + *params = param; + } + + return 0; +} + +int +iscsi_param_set(struct iscsi_param *params, const char *key, + const char *val) +{ + struct iscsi_param *param; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set %s=%s\n", key, val); + param = iscsi_param_find(params, key); + if (param == NULL) { + SPDK_ERRLOG("no key %s\n", key); + return -1; + } + + free(param->val); + + param->val = xstrdup(val); + + return 0; +} + +int +iscsi_param_set_int(struct iscsi_param *params, const char *key, uint32_t val) +{ + char buf[MAX_TMPBUF]; + struct iscsi_param *param; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set %s=%d\n", key, val); + param = iscsi_param_find(params, key); + if (param == NULL) { + SPDK_ERRLOG("no key %s\n", key); + return -1; + } + + free(param->val); + snprintf(buf, sizeof buf, "%d", val); + + param->val = strdup(buf); + + return 0; +} + +/** + * Parse a single 
KEY=VAL pair + * + * data = "KEY=VAL<NUL>" + */ +static int +iscsi_parse_param(struct iscsi_param **params, const uint8_t *data, uint32_t data_len) +{ + int rc; + uint8_t *key_copy, *val_copy; + const uint8_t *key_end; + int key_len, val_len; + int max_len; + + data_len = strnlen(data, data_len); + /* No such thing as strnchr so use memchr instead. */ + key_end = memchr(data, '=', data_len); + if (!key_end) { + SPDK_ERRLOG("'=' not found\n"); + return -1; + } + + key_len = key_end - data; + if (key_len == 0) { + SPDK_ERRLOG("Empty key\n"); + return -1; + } + /* + * RFC 7143 6.1 + */ + if (key_len > ISCSI_TEXT_MAX_KEY_LEN) { + SPDK_ERRLOG("Key name length is bigger than 63\n"); + return -1; + } + + key_copy = malloc(key_len + 1); + if (!key_copy) { + SPDK_ERRLOG("malloc() failed for key_copy\n"); + return -ENOMEM; + } + + memcpy(key_copy, data, key_len); + key_copy[key_len] = '\0'; + /* check whether this key is duplicated */ + if (NULL != iscsi_param_find(*params, key_copy)) { + SPDK_ERRLOG("Duplicated Key %s\n", key_copy); + free(key_copy); + return -1; + } + + val_len = strnlen(key_end + 1, data_len - key_len - 1); + /* + * RFC 3720 5.1 + * If not otherwise specified, the maximum length of a simple-value + * (not its encoded representation) is 255 bytes, not including the delimiter + * (comma or zero byte). + */ + /* + * comma or zero is counted in, otherwise we need to iterate each parameter + * value + */ + max_len = iscsi_find_key_in_array(key_copy, non_simple_value_params) ? + ISCSI_TEXT_MAX_VAL_LEN : ISCSI_TEXT_MAX_SIMPLE_VAL_LEN; + if (val_len > max_len) { + SPDK_ERRLOG("Overflow Val %d\n", val_len); + free(key_copy); + return -1; + } + + val_copy = calloc(1, val_len + 1); + if (val_copy == NULL) { + SPDK_ERRLOG("Could not allocate value string\n"); + free(key_copy); + return -1; + } + + memcpy(val_copy, key_end + 1, val_len); + + rc = iscsi_param_add(params, key_copy, val_copy, NULL, 0); + free(val_copy); + free(key_copy); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_add() failed\n"); + return -1; + } + + /* return number of bytes consumed + * +1 for '=' and +1 for NUL + */ + return key_len + 1 + val_len + 1; +} + +/** + * Parse a sequence of KEY=VAL pairs. + * + * \param data "KEY=VAL<NUL>KEY=VAL<NUL>..." + * \param len length of data in bytes + */ +int +iscsi_parse_params(struct iscsi_param **params, const uint8_t *data, + int len, bool cbit_enabled, char **partial_parameter) +{ + int rc, offset = 0; + char *p; + int i; + + /* strip the partial text parameters if previous PDU have C enabled */ + if (partial_parameter && *partial_parameter) { + for (i = 0; i < len && data[i] != '\0'; i++) { + ; + } + p = spdk_sprintf_alloc("%s%s", *partial_parameter, (const char *)data); + if (!p) { + return -1; + } + rc = iscsi_parse_param(params, p, i + strlen(*partial_parameter)); + free(p); + if (rc < 0) { + return -1; + } + free(*partial_parameter); + *partial_parameter = NULL; + + data = data + i + 1; + len = len - (i + 1); + } + + /* strip the partial text parameters if C bit is enabled */ + if (cbit_enabled) { + if (partial_parameter == NULL) { + SPDK_ERRLOG("C bit set but no partial parameters provided\n"); + return -1; + } + + /* + * reverse iterate the string from the tail not including '\0' + */ + for (i = len - 1; data[i] != '\0' && i > 0; i--) { + ; + } + if (i != 0) { + /* We found a NULL character - don't copy it into the + * partial parameter. 
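+ * Advance past it so the partial copy starts at the beginning of the trailing key.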
+ */ + i++; + } + + *partial_parameter = calloc(1, len - i + 1); + if (*partial_parameter == NULL) { + SPDK_ERRLOG("could not allocate partial parameter\n"); + return -1; + } + memcpy(*partial_parameter, &data[i], len - i); + if (i == 0) { + /* No full parameters to parse - so return now. */ + return 0; + } else { + len = i - 1; + } + } + + while (offset < len && data[offset] != '\0') { + rc = iscsi_parse_param(params, data + offset, len - offset); + if (rc < 0) { + return -1; + } + offset += rc; + } + return 0; +} + +char * +iscsi_param_get_val(struct iscsi_param *params, const char *key) +{ + struct iscsi_param *param; + + param = iscsi_param_find(params, key); + if (param == NULL) { + return NULL; + } + return param->val; +} + +int +iscsi_param_eq_val(struct iscsi_param *params, const char *key, + const char *val) +{ + struct iscsi_param *param; + + param = iscsi_param_find(params, key); + if (param == NULL) { + return 0; + } + if (strcasecmp(param->val, val) == 0) { + return 1; + } + return 0; +} + +struct iscsi_param_table { + const char *key; + const char *val; + const char *list; + int type; +}; + +static const struct iscsi_param_table conn_param_table[] = { + { "HeaderDigest", "None", "CRC32C,None", ISPT_LIST }, + { "DataDigest", "None", "CRC32C,None", ISPT_LIST }, + { "MaxRecvDataSegmentLength", "8192", "512,16777215", ISPT_NUMERICAL_DECLARATIVE }, + { "OFMarker", "No", "Yes,No", ISPT_BOOLEAN_AND }, + { "IFMarker", "No", "Yes,No", ISPT_BOOLEAN_AND }, + { "OFMarkInt", "1", "1,65535", ISPT_NUMERICAL_MIN }, + { "IFMarkInt", "1", "1,65535", ISPT_NUMERICAL_MIN }, + { "AuthMethod", "None", "CHAP,None", ISPT_LIST }, + { "CHAP_A", "5", "5", ISPT_LIST }, + { "CHAP_N", "", "", ISPT_DECLARATIVE }, + { "CHAP_R", "", "", ISPT_DECLARATIVE }, + { "CHAP_I", "", "", ISPT_DECLARATIVE }, + { "CHAP_C", "", "", ISPT_DECLARATIVE }, + { NULL, NULL, NULL, ISPT_INVALID }, +}; + +static const struct iscsi_param_table sess_param_table[] = { + { "MaxConnections", "1", "1,65535", ISPT_NUMERICAL_MIN }, +#if 0 + /* need special handling */ + { "SendTargets", "", "", ISPT_DECLARATIVE }, +#endif + { "TargetName", "", "", ISPT_DECLARATIVE }, + { "InitiatorName", "", "", ISPT_DECLARATIVE }, + { "TargetAlias", "", "", ISPT_DECLARATIVE }, + { "InitiatorAlias", "", "", ISPT_DECLARATIVE }, + { "TargetAddress", "", "", ISPT_DECLARATIVE }, + { "TargetPortalGroupTag", "1", "1,65535", ISPT_NUMERICAL_DECLARATIVE }, + { "InitialR2T", "Yes", "Yes,No", ISPT_BOOLEAN_OR }, + { "ImmediateData", "Yes", "Yes,No", ISPT_BOOLEAN_AND }, + { "MaxBurstLength", "262144", "512,16777215", ISPT_NUMERICAL_MIN }, + { "FirstBurstLength", "65536", "512,16777215", ISPT_NUMERICAL_MIN }, + { "DefaultTime2Wait", "2", "0,3600", ISPT_NUMERICAL_MAX }, + { "DefaultTime2Retain", "20", "0,3600", ISPT_NUMERICAL_MIN }, + { "MaxOutstandingR2T", "1", "1,65536", ISPT_NUMERICAL_MIN }, + { "DataPDUInOrder", "Yes", "Yes,No", ISPT_BOOLEAN_OR }, + { "DataSequenceInOrder", "Yes", "Yes,No", ISPT_BOOLEAN_OR }, + { "ErrorRecoveryLevel", "0", "0,2", ISPT_NUMERICAL_MIN }, + { "SessionType", "Normal", "Normal,Discovery", ISPT_DECLARATIVE }, + { NULL, NULL, NULL, ISPT_INVALID }, +}; + +static int +iscsi_params_init_internal(struct iscsi_param **params, + const struct iscsi_param_table *table) +{ + int rc; + int i; + struct iscsi_param *param; + + for (i = 0; table[i].key != NULL; i++) { + rc = iscsi_param_add(params, table[i].key, table[i].val, + table[i].list, table[i].type); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_add() failed\n"); + return -1; + } + param = 
iscsi_param_find(*params, table[i].key); + if (param != NULL) { + param->state_index = i; + } else { + SPDK_ERRLOG("iscsi_param_find() failed\n"); + return -1; + } + } + + return 0; +} + +int +iscsi_conn_params_init(struct iscsi_param **params) +{ + return iscsi_params_init_internal(params, &conn_param_table[0]); +} + +int +iscsi_sess_params_init(struct iscsi_param **params) +{ + return iscsi_params_init_internal(params, &sess_param_table[0]); +} + +static const char *chap_type[] = { + "CHAP_A", + "CHAP_N", + "CHAP_R", + "CHAP_I", + "CHAP_C", + NULL, +}; + +static const char *discovery_ignored_param[] = { + "MaxConnections", + "InitialR2T", + "ImmediateData", + "MaxBurstLength", + "FirstBurstLength" + "MaxOutstandingR2T", + "DataPDUInOrder", + "DataSequenceInOrder", + NULL, +}; + +static const char *multi_negot_conn_params[] = { + "MaxRecvDataSegmentLength", + NULL, +}; + +/* The following params should be declared by target */ +static const char *target_declarative_params[] = { + "TargetAlias", + "TargetAddress", + "TargetPortalGroupTag", + NULL, +}; + +/* This function is used to construct the data from the special param (e.g., + * MaxRecvDataSegmentLength) + * return: + * normal: the total len of the data + * error: -1 + */ +static int +iscsi_special_param_construction(struct spdk_iscsi_conn *conn, + struct iscsi_param *param, + bool FirstBurstLength_flag, char *data, + int alloc_len, int total) +{ + int len; + struct iscsi_param *param_first; + struct iscsi_param *param_max; + uint32_t FirstBurstLength; + uint32_t MaxBurstLength; + char *val; + + val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!val) { + SPDK_ERRLOG("malloc() failed for temporary buffer\n"); + return -ENOMEM; + } + + if (strcasecmp(param->key, "MaxRecvDataSegmentLength") == 0) { + /* + * MaxRecvDataSegmentLength is sent by both + * initiator and target, but is declarative - meaning + * each direction can have different values. + * So when MaxRecvDataSegmentLength is found in the + * the parameter set sent from the initiator, add SPDK + * iscsi target's MaxRecvDataSegmentLength value to + * the returned parameter list. 
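+ * The value declared back to the initiator is SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH.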
+ */ + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + free(val); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "returning MaxRecvDataSegmentLength=%d\n", + SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH); + len = snprintf((char *)data + total, alloc_len - total, + "MaxRecvDataSegmentLength=%d", + SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH); + total += len + 1; + } + + if (strcasecmp(param->key, "MaxBurstLength") == 0 && + !FirstBurstLength_flag) { + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + free(val); + return -1; + } + + param_first = iscsi_param_find(conn->sess->params, + "FirstBurstLength"); + if (param_first != NULL) { + FirstBurstLength = (uint32_t)strtol(param_first->val, NULL, 10); + } else { + FirstBurstLength = SPDK_ISCSI_FIRST_BURST_LENGTH; + } + param_max = iscsi_param_find(conn->sess->params, + "MaxBurstLength"); + if (param_max != NULL) { + MaxBurstLength = (uint32_t)strtol(param_max->val, NULL, 10); + } else { + MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH; + } + + if (FirstBurstLength > MaxBurstLength) { + FirstBurstLength = MaxBurstLength; + if (param_first != NULL) { + free(param_first->val); + snprintf(val, ISCSI_TEXT_MAX_VAL_LEN, "%d", + FirstBurstLength); + param_first->val = xstrdup(val); + } + } + len = snprintf((char *)data + total, alloc_len - total, + "FirstBurstLength=%d", FirstBurstLength); + total += len + 1; + } + + free(val); + return total; + +} + +/** + * iscsi_construct_data_from_param: + * To construct the data which will be returned to the initiator + * return: length of the negotiated data, -1 indicates error; + */ +static int +iscsi_construct_data_from_param(struct iscsi_param *param, char *new_val, + char *data, int alloc_len, int total) +{ + int len; + + if (param->type != ISPT_DECLARATIVE && + param->type != ISPT_NUMERICAL_DECLARATIVE) { + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "negotiated %s=%s\n", + param->key, new_val); + len = snprintf((char *)data + total, alloc_len - total, "%s=%s", + param->key, new_val); + total += len + 1; + } + return total; +} + +/** + * To negotiate param with + * type = ISPT_LIST + * return: the negotiated value of the key + */ +static char * +iscsi_negotiate_param_list(int *add_param_value, + struct iscsi_param *param, + char *valid_list, char *in_val, + char *cur_val) +{ + char *val_start, *val_end; + char *in_start, *in_end; + int flag = 0; + + if (add_param_value == NULL) { + return NULL; + } + + in_start = in_val; + do { + if ((in_end = strchr(in_start, (int)',')) != NULL) { + *in_end = '\0'; + } + val_start = valid_list; + do { + if ((val_end = strchr(val_start, (int)',')) != NULL) { + *val_end = '\0'; + } + if (strcasecmp(in_start, val_start) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "match %s\n", + val_start); + flag = 1; + break; + } + if (val_end) { + *val_end = ','; + val_start = val_end + 1; + } + } while (val_end); + if (flag) { + break; + } + if (in_end) { + *in_end = ','; + in_start = in_end + 1; + } + } while (in_end); + + return flag ? 
val_start : NULL; +} + +/** + * To negotiate param with + * type = ISPT_NUMERICAL_MIN/MAX, ISPT_NUMERICAL_DECLARATIVE + * return: the negotiated value of the key + */ +static char * +iscsi_negotiate_param_numerical(int *add_param_value, + struct iscsi_param *param, + char *valid_list, char *in_val, + char *cur_val) +{ + char *valid_next; + char *new_val = NULL; + char *min_val, *max_val; + int val_i, cur_val_i; + int min_i, max_i; + + if (add_param_value == NULL) { + return NULL; + } + + val_i = (int)strtol(param->val, NULL, 10); + /* check whether the key is FirstBurstLength, if that we use in_val */ + if (strcasecmp(param->key, "FirstBurstLength") == 0) { + val_i = (int)strtol(in_val, NULL, 10); + } + + cur_val_i = (int)strtol(cur_val, NULL, 10); + valid_next = valid_list; + min_val = spdk_strsepq(&valid_next, ","); + max_val = spdk_strsepq(&valid_next, ","); + min_i = (min_val != NULL) ? (int)strtol(min_val, NULL, 10) : 0; + max_i = (max_val != NULL) ? (int)strtol(max_val, NULL, 10) : 0; + if (val_i < min_i || val_i > max_i) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "key %.64s reject\n", param->key); + new_val = NULL; + } else { + switch (param->type) { + case ISPT_NUMERICAL_MIN: + if (val_i > cur_val_i) { + val_i = cur_val_i; + } + break; + case ISPT_NUMERICAL_MAX: + if (val_i < cur_val_i) { + val_i = cur_val_i; + } + break; + default: + break; + } + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", val_i); + new_val = in_val; + } + + return new_val; +} + +/** + * To negotiate param with + * type = ISPT_BOOLEAN_OR, ISPT_BOOLEAN_AND + * return: the negotiated value of the key + */ +static char * +iscsi_negotiate_param_boolean(int *add_param_value, + struct iscsi_param *param, + char *in_val, char *cur_val, + const char *value) +{ + char *new_val = NULL; + + if (add_param_value == NULL) { + return NULL; + } + + /* Make sure the val is Yes or No */ + if (!((strcasecmp(in_val, "Yes") == 0) || + (strcasecmp(in_val, "No") == 0))) { + /* unknown value */ + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Reject"); + new_val = in_val; + *add_param_value = 1; + return new_val; + } + + if (strcasecmp(cur_val, value) == 0) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", value); + new_val = in_val; + } else { + new_val = param->val; + } + + return new_val; +} + +/** + * The entry function to handle each type of the param + * return value: the new negotiated value + */ +static char * +iscsi_negotiate_param_all(int *add_param_value, struct iscsi_param *param, + char *valid_list, char *in_val, char *cur_val) +{ + char *new_val; + switch (param->type) { + case ISPT_LIST: + new_val = iscsi_negotiate_param_list(add_param_value, + param, + valid_list, + in_val, + cur_val); + break; + + case ISPT_NUMERICAL_MIN: + case ISPT_NUMERICAL_MAX: + case ISPT_NUMERICAL_DECLARATIVE: + new_val = iscsi_negotiate_param_numerical(add_param_value, + param, + valid_list, + in_val, + cur_val); + break; + + case ISPT_BOOLEAN_OR: + new_val = iscsi_negotiate_param_boolean(add_param_value, + param, + in_val, + cur_val, + "Yes"); + break; + case ISPT_BOOLEAN_AND: + new_val = iscsi_negotiate_param_boolean(add_param_value, + param, + in_val, + cur_val, + "No"); + break; + + default: + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val); + new_val = in_val; + break; + } + + return new_val; +} + +/** + * This function is used to judge whether the param is in session's params or + * connection's params + */ +static int +iscsi_negotiate_param_init(struct spdk_iscsi_conn *conn, + struct iscsi_param **cur_param_p, + struct 
iscsi_param **params_dst_p, + struct iscsi_param *param) +{ + int index; + + *cur_param_p = iscsi_param_find(*params_dst_p, param->key); + if (*cur_param_p == NULL) { + *params_dst_p = conn->sess->params; + *cur_param_p = iscsi_param_find(*params_dst_p, param->key); + if (*cur_param_p == NULL) { + if ((strncasecmp(param->key, "X-", 2) == 0) || + (strncasecmp(param->key, "X#", 2) == 0)) { + /* Extension Key */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "extension key %.64s\n", + param->key); + } else { + SPDK_ERRLOG("unknown key %.64s\n", param->key); + } + return 1; + } else { + index = (*cur_param_p)->state_index; + if (conn->sess_param_state_negotiated[index] && + !iscsi_find_key_in_array(param->key, + target_declarative_params)) { + return SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE; + } + conn->sess_param_state_negotiated[index] = true; + } + } else { + index = (*cur_param_p)->state_index; + if (conn->conn_param_state_negotiated[index] && + !iscsi_find_key_in_array(param->key, + multi_negot_conn_params)) { + return SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE; + } + conn->conn_param_state_negotiated[index] = true; + } + + return 0; +} + +int +iscsi_negotiate_params(struct spdk_iscsi_conn *conn, + struct iscsi_param **params, uint8_t *data, int alloc_len, + int data_len) +{ + struct iscsi_param *param; + struct iscsi_param *cur_param; + char *valid_list, *in_val; + char *cur_val; + char *new_val; + int discovery; + int total; + int rc; + uint32_t FirstBurstLength; + uint32_t MaxBurstLength; + bool FirstBurstLength_flag = false; + int type; + + total = data_len; + if (data_len < 0) { + assert(false); + return -EINVAL; + } + if (alloc_len < 1) { + return 0; + } + if (total > alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + if (*params == NULL) { + /* no input */ + return total; + } + + /* discovery? 
*/ + discovery = 0; + cur_param = iscsi_param_find(*params, "SessionType"); + if (cur_param == NULL) { + cur_param = iscsi_param_find(conn->sess->params, "SessionType"); + if (cur_param == NULL) { + /* no session type */ + } else { + if (strcasecmp(cur_param->val, "Discovery") == 0) { + discovery = 1; + } + } + } else { + if (strcasecmp(cur_param->val, "Discovery") == 0) { + discovery = 1; + } + } + + /* for temporary store */ + valid_list = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!valid_list) { + SPDK_ERRLOG("malloc() failed for valid_list\n"); + return -ENOMEM; + } + + in_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!in_val) { + SPDK_ERRLOG("malloc() failed for in_val\n"); + free(valid_list); + return -ENOMEM; + } + + cur_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!cur_val) { + SPDK_ERRLOG("malloc() failed for cur_val\n"); + free(valid_list); + free(in_val); + return -ENOMEM; + } + + /* To adjust the location of FirstBurstLength location and put it to + * the end, then we can always firstly determine the MaxBurstLength + */ + param = iscsi_param_find(*params, "MaxBurstLength"); + if (param != NULL) { + param = iscsi_param_find(*params, "FirstBurstLength"); + + /* check the existence of FirstBurstLength */ + if (param != NULL) { + FirstBurstLength_flag = true; + if (param->next != NULL) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val); + type = param->type; + iscsi_param_add(params, "FirstBurstLength", + in_val, NULL, type); + } + } + } + + for (param = *params; param != NULL; param = param->next) { + struct iscsi_param *params_dst = conn->params; + int add_param_value = 0; + new_val = NULL; + param->type = ISPT_INVALID; + + /* sendtargets is special */ + if (strcasecmp(param->key, "SendTargets") == 0) { + continue; + } + /* CHAP keys */ + if (iscsi_find_key_in_array(param->key, chap_type)) { + continue; + } + + /* 12.2, 12.10, 12.11, 12.13, 12.14, 12.17, 12.18, 12.19 */ + if (discovery && + iscsi_find_key_in_array(param->key, discovery_ignored_param)) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Irrelevant"); + new_val = in_val; + add_param_value = 1; + } else { + rc = iscsi_negotiate_param_init(conn, + &cur_param, + ¶ms_dst, + param); + if (rc < 0) { + free(valid_list); + free(in_val); + free(cur_val); + return rc; + } else if (rc > 0) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "NotUnderstood"); + new_val = in_val; + add_param_value = 1; + } else { + snprintf(valid_list, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", cur_param->list); + snprintf(cur_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", cur_param->val); + param->type = cur_param->type; + } + } + + if (param->type > 0) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val); + + /* "NotUnderstood" value shouldn't be assigned to "Understood" key */ + if (strcasecmp(in_val, "NotUnderstood") == 0) { + free(in_val); + free(valid_list); + free(cur_val); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + if (strcasecmp(param->key, "FirstBurstLength") == 0) { + FirstBurstLength = (uint32_t)strtol(param->val, NULL, + 10); + new_val = iscsi_param_get_val(conn->sess->params, + "MaxBurstLength"); + if (new_val != NULL) { + MaxBurstLength = (uint32_t) strtol(new_val, NULL, + 10); + } else { + MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH; + } + if (FirstBurstLength < SPDK_ISCSI_MAX_FIRST_BURST_LENGTH && + FirstBurstLength > MaxBurstLength) { + FirstBurstLength = MaxBurstLength; + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", + FirstBurstLength); + } + } + + /* prevent target's declarative params from 
being changed by initiator */ + if (iscsi_find_key_in_array(param->key, target_declarative_params)) { + add_param_value = 1; + } + + new_val = iscsi_negotiate_param_all(&add_param_value, + param, + valid_list, + in_val, + cur_val); + } + + /* check the negotiated value of the key */ + if (new_val != NULL) { + /* add_param_value = 0 means updating the value of + * existed key in the connection's parameters + */ + if (add_param_value == 0) { + iscsi_param_set(params_dst, param->key, new_val); + } + total = iscsi_construct_data_from_param(param, + new_val, + data, + alloc_len, + total); + if (total < 0) { + goto final_return; + } + + total = iscsi_special_param_construction(conn, + param, + FirstBurstLength_flag, + data, + alloc_len, + total); + if (total < 0) { + goto final_return; + } + } else { + total = -1; + break; + } + } + +final_return: + free(valid_list); + free(in_val); + free(cur_val); + + return total; +} + +int +iscsi_copy_param2var(struct spdk_iscsi_conn *conn) +{ + const char *val; + + val = iscsi_param_get_val(conn->params, "MaxRecvDataSegmentLength"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxRecvDataSegmentLength failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "copy MaxRecvDataSegmentLength=%s\n", val); + conn->MaxRecvDataSegmentLength = (int)strtol(val, NULL, 10); + if (conn->MaxRecvDataSegmentLength > SPDK_BDEV_LARGE_BUF_MAX_SIZE) { + conn->MaxRecvDataSegmentLength = SPDK_BDEV_LARGE_BUF_MAX_SIZE; + } + + val = iscsi_param_get_val(conn->params, "HeaderDigest"); + if (val == NULL) { + SPDK_ERRLOG("Getval HeaderDigest failed\n"); + return -1; + } + if (strcasecmp(val, "CRC32C") == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set HeaderDigest=1\n"); + conn->header_digest = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set HeaderDigest=0\n"); + conn->header_digest = 0; + } + val = iscsi_param_get_val(conn->params, "DataDigest"); + if (val == NULL) { + SPDK_ERRLOG("Getval DataDigest failed\n"); + return -1; + } + if (strcasecmp(val, "CRC32C") == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set DataDigest=1\n"); + conn->data_digest = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set DataDigest=0\n"); + conn->data_digest = 0; + } + + val = iscsi_param_get_val(conn->sess->params, "MaxConnections"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxConnections failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxConnections=%s\n", val); + conn->sess->MaxConnections = (uint32_t) strtol(val, NULL, 10); + val = iscsi_param_get_val(conn->sess->params, "MaxOutstandingR2T"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxOutstandingR2T failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxOutstandingR2T=%s\n", val); + conn->sess->MaxOutstandingR2T = (uint32_t) strtol(val, NULL, 10); + val = iscsi_param_get_val(conn->sess->params, "FirstBurstLength"); + if (val == NULL) { + SPDK_ERRLOG("Getval FirstBurstLength failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy FirstBurstLength=%s\n", val); + conn->sess->FirstBurstLength = (uint32_t) strtol(val, NULL, 10); + val = iscsi_param_get_val(conn->sess->params, "MaxBurstLength"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxBurstLength failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxBurstLength=%s\n", val); + conn->sess->MaxBurstLength = (uint32_t) strtol(val, NULL, 10); + val = iscsi_param_get_val(conn->sess->params, "InitialR2T"); + if (val == NULL) { + SPDK_ERRLOG("Getval InitialR2T failed\n"); + return -1; + } + if (strcasecmp(val, "Yes") == 0) { + 
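/* InitialR2T negotiated to Yes: unsolicited data (other than immediate data) is not allowed. */ +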
SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set InitialR2T=1\n"); + conn->sess->InitialR2T = true; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set InitialR2T=0\n"); + conn->sess->InitialR2T = false; + } + val = iscsi_param_get_val(conn->sess->params, "ImmediateData"); + if (val == NULL) { + SPDK_ERRLOG("Getval ImmediateData failed\n"); + return -1; + } + if (strcasecmp(val, "Yes") == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set ImmediateData=1\n"); + conn->sess->ImmediateData = true; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set ImmediateData=0\n"); + conn->sess->ImmediateData = false; + } + return 0; +} diff --git a/src/spdk/lib/iscsi/param.h b/src/spdk/lib/iscsi/param.h new file mode 100644 index 000000000..ce194c514 --- /dev/null +++ b/src/spdk/lib/iscsi/param.h @@ -0,0 +1,94 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_ISCSI_PARAM_H +#define SPDK_ISCSI_PARAM_H + +#include "spdk/stdinc.h" + +struct spdk_iscsi_conn; + +enum iscsi_param_type { + ISPT_INVALID = -1, + ISPT_NOTSPECIFIED = 0, + ISPT_LIST, + ISPT_NUMERICAL_MIN, + ISPT_NUMERICAL_MAX, + ISPT_NUMERICAL_DECLARATIVE, + ISPT_DECLARATIVE, + ISPT_BOOLEAN_OR, + ISPT_BOOLEAN_AND, +}; + +struct iscsi_param { + struct iscsi_param *next; + char *key; + char *val; + char *list; + int type; + int state_index; +}; + +void +iscsi_param_free(struct iscsi_param *params); +struct iscsi_param * +iscsi_param_find(struct iscsi_param *params, const char *key); +int +iscsi_param_del(struct iscsi_param **params, const char *key); +int +iscsi_param_add(struct iscsi_param **params, const char *key, + const char *val, const char *list, int type); +int +iscsi_param_set(struct iscsi_param *params, const char *key, + const char *val); +int +iscsi_param_set_int(struct iscsi_param *params, const char *key, uint32_t val); +int +iscsi_parse_params(struct iscsi_param **params, const uint8_t *data, + int len, bool cbit_enabled, char **partial_parameter); +char * +iscsi_param_get_val(struct iscsi_param *params, const char *key); +int +iscsi_param_eq_val(struct iscsi_param *params, const char *key, + const char *val); + +int iscsi_negotiate_params(struct spdk_iscsi_conn *conn, + struct iscsi_param **params_p, uint8_t *data, + int alloc_len, int data_len); +int iscsi_copy_param2var(struct spdk_iscsi_conn *conn); + +int iscsi_conn_params_init(struct iscsi_param **params); +int iscsi_sess_params_init(struct iscsi_param **params); + +#endif /* SPDK_ISCSI_PARAM_H */ diff --git a/src/spdk/lib/iscsi/portal_grp.c b/src/spdk/lib/iscsi/portal_grp.c new file mode 100644 index 000000000..986562ad7 --- /dev/null +++ b/src/spdk/lib/iscsi/portal_grp.c @@ -0,0 +1,655 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/sock.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" +#include "iscsi/portal_grp.h" +#include "iscsi/tgt_node.h" + +#define PORTNUMSTRLEN 32 +#define ACCEPT_TIMEOUT_US 1000 /* 1ms */ + +static int +iscsi_portal_accept(void *arg) +{ + struct spdk_iscsi_portal *portal = arg; + struct spdk_sock *sock; + int rc; + int count = 0; + + if (portal->sock == NULL) { + return -1; + } + + while (1) { + sock = spdk_sock_accept(portal->sock); + if (sock != NULL) { + rc = iscsi_conn_construct(portal, sock); + if (rc < 0) { + spdk_sock_close(&sock); + SPDK_ERRLOG("spdk_iscsi_connection_construct() failed\n"); + break; + } + count++; + } else { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + SPDK_ERRLOG("accept error(%d): %s\n", errno, spdk_strerror(errno)); + } + break; + } + } + + return count; +} + +static struct spdk_iscsi_portal * +iscsi_portal_find_by_addr(const char *host, const char *port) +{ + struct spdk_iscsi_portal *p; + + TAILQ_FOREACH(p, &g_iscsi.portal_head, g_tailq) { + if (!strcmp(p->host, host) && !strcmp(p->port, port)) { + return p; + } + } + + return NULL; +} + +/* Assumes caller allocated host and port strings on the heap */ +struct spdk_iscsi_portal * +iscsi_portal_create(const char *host, const char *port) +{ + struct spdk_iscsi_portal *p = NULL, *tmp; + + assert(host != NULL); + assert(port != NULL); + + if (strlen(host) > MAX_PORTAL_ADDR || strlen(port) > MAX_PORTAL_PORT) { + return NULL; + } + + p = calloc(1, sizeof(*p)); + if (!p) { + SPDK_ERRLOG("calloc() failed for portal\n"); + return NULL; + } + + /* check and overwrite abbreviation of wildcard */ + if (strcasecmp(host, "[*]") == 0) { + SPDK_WARNLOG("Please use \"[::]\" as IPv6 wildcard\n"); + SPDK_WARNLOG("Convert \"[*]\" to \"[::]\" automatically\n"); + SPDK_WARNLOG("(Use of \"[*]\" will be deprecated in a future release)"); + snprintf(p->host, sizeof(p->host), "[::]"); + } else if (strcasecmp(host, "*") == 0) { + SPDK_WARNLOG("Please use \"0.0.0.0\" as IPv4 wildcard\n"); + SPDK_WARNLOG("Convert \"*\" to \"0.0.0.0\" automatically\n"); + SPDK_WARNLOG("(Use of \"[*]\" will be deprecated in a future release)"); + snprintf(p->host, sizeof(p->host), "0.0.0.0"); + } else { + memcpy(p->host, host, strlen(host)); + } + + memcpy(p->port, port, strlen(port)); + + p->sock = NULL; + p->group = NULL; /* set at a later time by caller */ + p->acceptor_poller = NULL; + + pthread_mutex_lock(&g_iscsi.mutex); + tmp = iscsi_portal_find_by_addr(host, port); + if (tmp != NULL) { + pthread_mutex_unlock(&g_iscsi.mutex); + SPDK_ERRLOG("portal (%s, %s) already exists\n", host, port); + goto error_out; + } + + TAILQ_INSERT_TAIL(&g_iscsi.portal_head, p, g_tailq); + pthread_mutex_unlock(&g_iscsi.mutex); + + return p; + +error_out: + free(p); + + return NULL; +} + +void +iscsi_portal_destroy(struct spdk_iscsi_portal *p) +{ + assert(p != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_destroy\n"); + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_REMOVE(&g_iscsi.portal_head, p, g_tailq); + pthread_mutex_unlock(&g_iscsi.mutex); + + free(p); + +} + +static int +iscsi_portal_open(struct spdk_iscsi_portal *p) +{ + struct spdk_sock *sock; + int port; + + if (p->sock != NULL) { + SPDK_ERRLOG("portal (%s, %s) is already opened\n", + p->host, p->port); + return -1; + } + + port = (int)strtol(p->port, NULL, 0); + sock = spdk_sock_listen(p->host, port, NULL); + if (sock == NULL) { + SPDK_ERRLOG("listen error 
%.64s.%d\n", p->host, port); + return -1; + } + + p->sock = sock; + + /* + * When the portal is created by config file, incoming connection + * requests for the socket are pended to accept until reactors start. + * However the gap between listen() and accept() will be slight and + * the requests will be queued by the nonzero backlog of the socket + * or resend by TCP. + */ + p->acceptor_poller = SPDK_POLLER_REGISTER(iscsi_portal_accept, p, ACCEPT_TIMEOUT_US); + + return 0; +} + +static void +iscsi_portal_close(struct spdk_iscsi_portal *p) +{ + if (p->sock) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "close portal (%s, %s)\n", + p->host, p->port); + spdk_poller_unregister(&p->acceptor_poller); + spdk_sock_close(&p->sock); + } +} + +static int +iscsi_parse_portal(const char *portalstring, struct spdk_iscsi_portal **ip) +{ + char *host = NULL, *port = NULL; + int len, rc = -1; + const char *p; + + if (portalstring == NULL) { + SPDK_ERRLOG("portal error\n"); + goto error_out; + } + + /* IP address */ + if (portalstring[0] == '[') { + /* IPv6 */ + p = strchr(portalstring + 1, ']'); + if (p == NULL) { + SPDK_ERRLOG("portal error\n"); + goto error_out; + } + p++; + } else { + /* IPv4 */ + p = strchr(portalstring, ':'); + if (p == NULL) { + p = portalstring + strlen(portalstring); + } + } + + len = p - portalstring; + host = malloc(len + 1); + if (host == NULL) { + SPDK_ERRLOG("malloc() failed for host\n"); + goto error_out; + } + memcpy(host, portalstring, len); + host[len] = '\0'; + + /* Port number (IPv4 and IPv6 are the same) */ + if (p[0] == '\0') { + port = malloc(PORTNUMSTRLEN); + if (!port) { + SPDK_ERRLOG("malloc() failed for port\n"); + goto error_out; + } + snprintf(port, PORTNUMSTRLEN, "%d", DEFAULT_PORT); + } else { + p++; + len = strlen(p); + port = malloc(len + 1); + if (port == NULL) { + SPDK_ERRLOG("malloc() failed for port\n"); + goto error_out; + } + memcpy(port, p, len); + port[len] = '\0'; + } + + *ip = iscsi_portal_create(host, port); + if (!*ip) { + goto error_out; + } + + rc = 0; +error_out: + free(host); + free(port); + + return rc; +} + +struct spdk_iscsi_portal_grp * +iscsi_portal_grp_create(int tag) +{ + struct spdk_iscsi_portal_grp *pg = malloc(sizeof(*pg)); + + if (!pg) { + SPDK_ERRLOG("malloc() failed for portal group\n"); + return NULL; + } + + pg->ref = 0; + pg->tag = tag; + + pthread_mutex_lock(&g_iscsi.mutex); + pg->disable_chap = g_iscsi.disable_chap; + pg->require_chap = g_iscsi.require_chap; + pg->mutual_chap = g_iscsi.mutual_chap; + pg->chap_group = g_iscsi.chap_group; + pthread_mutex_unlock(&g_iscsi.mutex); + + TAILQ_INIT(&pg->head); + + return pg; +} + +void +iscsi_portal_grp_destroy(struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_portal *p; + + assert(pg != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_grp_destroy\n"); + while (!TAILQ_EMPTY(&pg->head)) { + p = TAILQ_FIRST(&pg->head); + TAILQ_REMOVE(&pg->head, p, per_pg_tailq); + iscsi_portal_destroy(p); + } + free(pg); +} + +int +iscsi_portal_grp_register(struct spdk_iscsi_portal_grp *pg) +{ + int rc = -1; + struct spdk_iscsi_portal_grp *tmp; + + assert(pg != NULL); + + pthread_mutex_lock(&g_iscsi.mutex); + tmp = iscsi_portal_grp_find_by_tag(pg->tag); + if (tmp == NULL) { + TAILQ_INSERT_TAIL(&g_iscsi.pg_head, pg, tailq); + rc = 0; + } + pthread_mutex_unlock(&g_iscsi.mutex); + return rc; +} + +void +iscsi_portal_grp_add_portal(struct spdk_iscsi_portal_grp *pg, + struct spdk_iscsi_portal *p) +{ + assert(pg != NULL); + assert(p != NULL); + + p->group = pg; + TAILQ_INSERT_TAIL(&pg->head, p, 
per_pg_tailq); +} + +int +iscsi_portal_grp_set_chap_params(struct spdk_iscsi_portal_grp *pg, + bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group) +{ + if (!iscsi_check_chap_params(disable_chap, require_chap, + mutual_chap, chap_group)) { + return -EINVAL; + } + + pg->disable_chap = disable_chap; + pg->require_chap = require_chap; + pg->mutual_chap = mutual_chap; + pg->chap_group = chap_group; + + return 0; +} + +static int +iscsi_parse_portal_grp(struct spdk_conf_section *sp) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_portal *p; + const char *val; + char *label, *portal; + int i = 0, rc = 0; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add portal group (from config file) %d\n", + spdk_conf_section_get_num(sp)); + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + pg = iscsi_portal_grp_create(spdk_conf_section_get_num(sp)); + if (!pg) { + SPDK_ERRLOG("portal group malloc error (%s)\n", spdk_conf_section_get_name(sp)); + return -1; + } + + for (i = 0; ; i++) { + label = spdk_conf_section_get_nmval(sp, "Portal", i, 0); + portal = spdk_conf_section_get_nmval(sp, "Portal", i, 1); + if (label == NULL || portal == NULL) { + break; + } + + rc = iscsi_parse_portal(portal, &p); + if (rc < 0) { + SPDK_ERRLOG("parse portal error (%s)\n", portal); + goto error; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "RIndex=%d, Host=%s, Port=%s, Tag=%d\n", + i, p->host, p->port, spdk_conf_section_get_num(sp)); + + iscsi_portal_grp_add_portal(pg, p); + } + + rc = iscsi_portal_grp_open(pg); + if (rc != 0) { + SPDK_ERRLOG("portal_grp_open failed\n"); + goto error; + } + + /* Add portal group to the end of the pg list */ + rc = iscsi_portal_grp_register(pg); + if (rc != 0) { + SPDK_ERRLOG("register portal failed\n"); + goto error; + } + + return 0; + +error: + iscsi_portal_grp_release(pg); + return -1; +} + +struct spdk_iscsi_portal_grp * +iscsi_portal_grp_find_by_tag(int tag) +{ + struct spdk_iscsi_portal_grp *pg; + + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + if (pg->tag == tag) { + return pg; + } + } + + return NULL; +} + +int +iscsi_parse_portal_grps(void) +{ + int rc = 0; + struct spdk_conf_section *sp; + + sp = spdk_conf_first_section(NULL); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "PortalGroup")) { + if (spdk_conf_section_get_num(sp) == 0) { + SPDK_ERRLOG("Group 0 is invalid\n"); + return -1; + } + + /* Build portal group from cfg section PortalGroup */ + rc = iscsi_parse_portal_grp(sp); + if (rc < 0) { + SPDK_ERRLOG("parse_portal_group() failed\n"); + return -1; + } + } + sp = spdk_conf_next_section(sp); + } + return 0; +} + +void +iscsi_portal_grps_destroy(void) +{ + struct spdk_iscsi_portal_grp *pg; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_grps_destroy\n"); + pthread_mutex_lock(&g_iscsi.mutex); + while (!TAILQ_EMPTY(&g_iscsi.pg_head)) { + pg = TAILQ_FIRST(&g_iscsi.pg_head); + TAILQ_REMOVE(&g_iscsi.pg_head, pg, tailq); + pthread_mutex_unlock(&g_iscsi.mutex); + iscsi_portal_grp_destroy(pg); + pthread_mutex_lock(&g_iscsi.mutex); + } + pthread_mutex_unlock(&g_iscsi.mutex); +} + +int +iscsi_portal_grp_open(struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_portal *p; + int rc; + + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + rc = iscsi_portal_open(p); + if (rc < 0) { + return rc; + } + } + return 0; +} + +static void +iscsi_portal_grp_close(struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_portal *p; + + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + 
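/*
 * Editor's note (illustrative sketch, not part of the original source):
 * a hypothetical legacy config section consumed by
 * iscsi_parse_portal_grp() above could look like:
 *
 *   [PortalGroup1]
 *     Portal DA1 192.168.0.10:3260
 *     Portal DA2 [2001:db8::10]:3260
 *
 * iscsi_parse_portal() accepts "host", "host:port", or "[v6-host]:port";
 * when the port is omitted, DEFAULT_PORT is used (3260, per the template
 * comment further below).
 */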
iscsi_portal_close(p); + } +} + +void +iscsi_portal_grp_close_all(void) +{ + struct spdk_iscsi_portal_grp *pg; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_grp_close_all\n"); + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + iscsi_portal_grp_close(pg); + } + pthread_mutex_unlock(&g_iscsi.mutex); +} + +struct spdk_iscsi_portal_grp * +iscsi_portal_grp_unregister(int tag) +{ + struct spdk_iscsi_portal_grp *pg; + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + if (pg->tag == tag) { + TAILQ_REMOVE(&g_iscsi.pg_head, pg, tailq); + pthread_mutex_unlock(&g_iscsi.mutex); + return pg; + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + return NULL; +} + +void +iscsi_portal_grp_release(struct spdk_iscsi_portal_grp *pg) +{ + iscsi_portal_grp_close(pg); + iscsi_portal_grp_destroy(pg); +} + +static const char *portal_group_section = \ + "\n" + "# Users must change the PortalGroup section(s) to match the IP addresses\n" + "# for their environment.\n" + "# PortalGroup sections define which network portals the iSCSI target\n" + "# will use to listen for incoming connections. These are also used to\n" + "# determine which targets are accessible over each portal group.\n" + "# Up to 1024 Portal directives are allowed. These define the network\n" + "# portals of the portal group. The user must specify a IP address\n" + "# for each network portal, and may optionally specify a port.\n" + "# If the port is omitted, 3260 will be used\n" + "# Syntax:\n" + "# Portal <Name> <IP address>[:<port>]\n"; + +#define PORTAL_GROUP_TMPL \ +"[PortalGroup%d]\n" \ +" Comment \"Portal%d\"\n" + +#define PORTAL_TMPL \ +" Portal DA1 %s:%s\n" + +void +iscsi_portal_grps_config_text(FILE *fp) +{ + struct spdk_iscsi_portal *p = NULL; + struct spdk_iscsi_portal_grp *pg = NULL; + + /* Create portal group section */ + fprintf(fp, "%s", portal_group_section); + + /* Dump portal groups */ + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + if (NULL == pg) { continue; } + fprintf(fp, PORTAL_GROUP_TMPL, pg->tag, pg->tag); + /* Dump portals */ + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + if (NULL == p) { continue; } + fprintf(fp, PORTAL_TMPL, p->host, p->port); + } + } +} + +static void +iscsi_portal_grp_info_json(struct spdk_iscsi_portal_grp *pg, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_portal *portal; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "tag", pg->tag); + + spdk_json_write_named_array_begin(w, "portals"); + TAILQ_FOREACH(portal, &pg->head, per_pg_tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "host", portal->host); + spdk_json_write_named_string(w, "port", portal->port); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); +} + +static void +iscsi_portal_grp_config_json(struct spdk_iscsi_portal_grp *pg, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "iscsi_create_portal_group"); + + spdk_json_write_name(w, "params"); + iscsi_portal_grp_info_json(pg, w); + + spdk_json_write_object_end(w); +} + +void +iscsi_portal_grps_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_portal_grp *pg; + + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + iscsi_portal_grp_info_json(pg, w); + } +} + +void +iscsi_portal_grps_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_portal_grp *pg; + + TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) { + 
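/*
 * Editor's note (illustrative sketch, not part of the original source):
 * for a hypothetical group with tag 1 and a single portal,
 * iscsi_portal_grp_config_json() above emits roughly:
 *
 *   {
 *     "method": "iscsi_create_portal_group",
 *     "params": {
 *       "tag": 1,
 *       "portals": [ { "host": "192.168.0.10", "port": "3260" } ]
 *     }
 *   }
 */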
iscsi_portal_grp_config_json(pg, w); + } +} diff --git a/src/spdk/lib/iscsi/portal_grp.h b/src/spdk/lib/iscsi/portal_grp.h new file mode 100644 index 000000000..7ac72e36c --- /dev/null +++ b/src/spdk/lib/iscsi/portal_grp.h @@ -0,0 +1,90 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_PORTAL_GRP_H +#define SPDK_PORTAL_GRP_H + +#include "spdk/conf.h" +#include "spdk/cpuset.h" +#include "iscsi/iscsi.h" + +struct spdk_json_write_ctx; + +struct spdk_iscsi_portal { + struct spdk_iscsi_portal_grp *group; + char host[MAX_PORTAL_ADDR + 1]; + char port[MAX_PORTAL_PORT + 1]; + struct spdk_sock *sock; + struct spdk_poller *acceptor_poller; + TAILQ_ENTRY(spdk_iscsi_portal) per_pg_tailq; + TAILQ_ENTRY(spdk_iscsi_portal) g_tailq; +}; + +struct spdk_iscsi_portal_grp { + int ref; + int tag; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + TAILQ_ENTRY(spdk_iscsi_portal_grp) tailq; + TAILQ_HEAD(, spdk_iscsi_portal) head; +}; + +/* SPDK iSCSI Portal Group management API */ + +struct spdk_iscsi_portal *iscsi_portal_create(const char *host, const char *port); +void iscsi_portal_destroy(struct spdk_iscsi_portal *p); + +struct spdk_iscsi_portal_grp *iscsi_portal_grp_create(int tag); +void iscsi_portal_grp_add_portal(struct spdk_iscsi_portal_grp *pg, + struct spdk_iscsi_portal *p); +void iscsi_portal_grp_destroy(struct spdk_iscsi_portal_grp *pg); +void iscsi_portal_grp_release(struct spdk_iscsi_portal_grp *pg); +int iscsi_parse_portal_grps(void); +void iscsi_portal_grps_destroy(void); +int iscsi_portal_grp_register(struct spdk_iscsi_portal_grp *pg); +struct spdk_iscsi_portal_grp *iscsi_portal_grp_unregister(int tag); +struct spdk_iscsi_portal_grp *iscsi_portal_grp_find_by_tag(int tag); +int iscsi_portal_grp_open(struct spdk_iscsi_portal_grp *pg); +int iscsi_portal_grp_set_chap_params(struct spdk_iscsi_portal_grp *pg, + bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group); + +void iscsi_portal_grp_close_all(void); +void iscsi_portal_grps_config_text(FILE *fp); +void iscsi_portal_grps_info_json(struct spdk_json_write_ctx *w); +void iscsi_portal_grps_config_json(struct spdk_json_write_ctx *w); + +#endif /* SPDK_PORTAL_GRP_H */ diff --git a/src/spdk/lib/iscsi/spdk_iscsi.map b/src/spdk/lib/iscsi/spdk_iscsi.map new file mode 100644 index 000000000..0475a800d --- /dev/null +++ b/src/spdk/lib/iscsi/spdk_iscsi.map @@ -0,0 +1,11 @@ +{ + global: + + # Functions used by other SPDK libraries + spdk_iscsi_init; + spdk_iscsi_fini; + spdk_iscsi_config_text; + spdk_iscsi_config_json; + + local: *; +}; diff --git a/src/spdk/lib/iscsi/task.c b/src/spdk/lib/iscsi/task.c new file mode 100644 index 000000000..964621178 --- /dev/null +++ b/src/spdk/lib/iscsi/task.c @@ -0,0 +1,98 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/env.h" +#include "spdk/log.h" +#include "iscsi/conn.h" +#include "iscsi/task.h" + +static void +iscsi_task_free(struct spdk_scsi_task *scsi_task) +{ + struct spdk_iscsi_task *task = iscsi_task_from_scsi_task(scsi_task); + + if (task->parent) { + if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { + assert(task->conn->data_in_cnt > 0); + task->conn->data_in_cnt--; + } + + spdk_scsi_task_put(&task->parent->scsi); + task->parent = NULL; + } + + iscsi_task_disassociate_pdu(task); + assert(task->conn->pending_task_cnt > 0); + task->conn->pending_task_cnt--; + spdk_mempool_put(g_iscsi.task_pool, (void *)task); +} + +struct spdk_iscsi_task * +iscsi_task_get(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *parent, + spdk_scsi_task_cpl cpl_fn) +{ + struct spdk_iscsi_task *task; + + task = spdk_mempool_get(g_iscsi.task_pool); + if (!task) { + SPDK_ERRLOG("Unable to get task\n"); + abort(); + } + + assert(conn != NULL); + memset(task, 0, sizeof(*task)); + task->conn = conn; + assert(conn->pending_task_cnt < UINT32_MAX); + conn->pending_task_cnt++; + spdk_scsi_task_construct(&task->scsi, + cpl_fn, + iscsi_task_free); + if (parent) { + parent->scsi.ref++; + task->parent = parent; + task->tag = parent->tag; + task->lun_id = parent->lun_id; + task->scsi.dxfer_dir = parent->scsi.dxfer_dir; + task->scsi.transfer_len = parent->scsi.transfer_len; + task->scsi.lun = parent->scsi.lun; + task->scsi.cdb = parent->scsi.cdb; + task->scsi.target_port = parent->scsi.target_port; + task->scsi.initiator_port = parent->scsi.initiator_port; + if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { + conn->data_in_cnt++; + } + } + + return task; +} diff --git a/src/spdk/lib/iscsi/task.h b/src/spdk/lib/iscsi/task.h new file mode 100644 index 000000000..0ef48599a --- /dev/null +++ b/src/spdk/lib/iscsi/task.h @@ -0,0 +1,188 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ISCSI_TASK_H +#define SPDK_ISCSI_TASK_H + +#include "iscsi/iscsi.h" +#include "spdk/scsi.h" +#include "spdk/util.h" + +struct spdk_iscsi_task { + struct spdk_scsi_task scsi; + + struct spdk_iscsi_task *parent; + + uint8_t rsp_scsi_status; + uint8_t rsp_sense_data[32]; + size_t rsp_sense_data_len; + + struct spdk_iscsi_conn *conn; + struct spdk_iscsi_pdu *pdu; + uint32_t outstanding_r2t; + + uint32_t desired_data_transfer_length; + + /* Only valid for Read/Write */ + uint32_t bytes_completed; + + uint32_t data_out_cnt; + + /* + * Tracks the current offset of large read io. + */ + uint32_t current_datain_offset; + + /* + * next_expected_r2t_offset is used when we receive + * the DataOUT PDU. + */ + uint32_t next_expected_r2t_offset; + + /* + * Tracks the length of the R2T that is in progress. + * Used to check that an R2T burst does not exceed + * MaxBurstLength. + */ + uint32_t current_r2t_length; + + /* + * next_r2t_offset is used when we are sending the + * R2T packet to keep track of next offset of r2t. + */ + uint32_t next_r2t_offset; + uint32_t R2TSN; + uint32_t r2t_datasn; /* record next datasn for a r2tsn */ + uint32_t acked_r2tsn; /* next r2tsn to be acked */ + uint32_t datain_datasn; + uint32_t acked_data_sn; /* next expected datain datasn */ + uint32_t ttt; + bool is_r2t_active; + + uint32_t tag; + + /** + * Record the lun id just in case the lun is invalid, + * which will happen when hot removing the lun. 
+ */ + int lun_id; + + struct spdk_poller *mgmt_poller; + + TAILQ_ENTRY(spdk_iscsi_task) link; + + TAILQ_HEAD(subtask_list, spdk_iscsi_task) subtask_list; + TAILQ_ENTRY(spdk_iscsi_task) subtask_link; + bool is_queued; /* is queued in scsi layer for handling */ +}; + +static inline void +iscsi_task_put(struct spdk_iscsi_task *task) +{ + spdk_scsi_task_put(&task->scsi); +} + +static inline struct spdk_iscsi_pdu * +iscsi_task_get_pdu(struct spdk_iscsi_task *task) +{ + return task->pdu; +} + +static inline void +iscsi_task_set_pdu(struct spdk_iscsi_task *task, struct spdk_iscsi_pdu *pdu) +{ + task->pdu = pdu; +} + +static inline struct iscsi_bhs * +iscsi_task_get_bhs(struct spdk_iscsi_task *task) +{ + return &iscsi_task_get_pdu(task)->bhs; +} + +static inline void +iscsi_task_associate_pdu(struct spdk_iscsi_task *task, struct spdk_iscsi_pdu *pdu) +{ + iscsi_task_set_pdu(task, pdu); + pdu->ref++; +} + +static inline void +iscsi_task_disassociate_pdu(struct spdk_iscsi_task *task) +{ + if (iscsi_task_get_pdu(task)) { + iscsi_put_pdu(iscsi_task_get_pdu(task)); + iscsi_task_set_pdu(task, NULL); + } +} + +static inline int +iscsi_task_is_immediate(struct spdk_iscsi_task *task) +{ + struct iscsi_bhs_scsi_req *scsi_req; + + scsi_req = (struct iscsi_bhs_scsi_req *)iscsi_task_get_bhs(task); + return (scsi_req->immediate == 1); +} + +static inline int +iscsi_task_is_read(struct spdk_iscsi_task *task) +{ + struct iscsi_bhs_scsi_req *scsi_req; + + scsi_req = (struct iscsi_bhs_scsi_req *)iscsi_task_get_bhs(task); + return (scsi_req->read_bit == 1); +} + +struct spdk_iscsi_task *iscsi_task_get(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *parent, + spdk_scsi_task_cpl cpl_fn); + +static inline struct spdk_iscsi_task * +iscsi_task_from_scsi_task(struct spdk_scsi_task *task) +{ + return SPDK_CONTAINEROF(task, struct spdk_iscsi_task, scsi); +} + +static inline struct spdk_iscsi_task * +iscsi_task_get_primary(struct spdk_iscsi_task *task) +{ + if (task->parent) { + return task->parent; + } else { + return task; + } +} + +#endif /* SPDK_ISCSI_TASK_H */ diff --git a/src/spdk/lib/iscsi/tgt_node.c b/src/spdk/lib/iscsi/tgt_node.c new file mode 100644 index 000000000..0807a3384 --- /dev/null +++ b/src/spdk/lib/iscsi/tgt_node.c @@ -0,0 +1,1607 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/sock.h" +#include "spdk/scsi.h" + +#include "spdk_internal/log.h" + +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" +#include "iscsi/tgt_node.h" +#include "iscsi/portal_grp.h" +#include "iscsi/init_grp.h" +#include "iscsi/task.h" + +#define MAX_TMPBUF 4096 +#define MAX_MASKBUF 128 + +static bool +iscsi_ipv6_netmask_allow_addr(const char *netmask, const char *addr) +{ + struct in6_addr in6_mask; + struct in6_addr in6_addr; + char mask[MAX_MASKBUF]; + const char *p; + size_t n; + int bits, bmask; + int i; + + if (netmask[0] != '[') { + return false; + } + p = strchr(netmask, ']'); + if (p == NULL) { + return false; + } + n = p - (netmask + 1); + if (n + 1 > sizeof mask) { + return false; + } + + memcpy(mask, netmask + 1, n); + mask[n] = '\0'; + p++; + + if (p[0] == '/') { + bits = (int) strtol(p + 1, NULL, 10); + if (bits <= 0 || bits > 128) { + return false; + } + } else { + bits = 128; + } + +#if 0 + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "input %s\n", addr); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "mask %s / %d\n", mask, bits); +#endif + + /* presentation to network order binary */ + if (inet_pton(AF_INET6, mask, &in6_mask) <= 0 + || inet_pton(AF_INET6, addr, &in6_addr) <= 0) { + return false; + } + + /* check 128bits */ + for (i = 0; i < (bits / 8); i++) { + if (in6_mask.s6_addr[i] != in6_addr.s6_addr[i]) { + return false; + } + } + if (bits % 8) { + bmask = (0xffU << (8 - (bits % 8))) & 0xffU; + if ((in6_mask.s6_addr[i] & bmask) != (in6_addr.s6_addr[i] & bmask)) { + return false; + } + } + + /* match */ + return true; +} + +static bool +iscsi_ipv4_netmask_allow_addr(const char *netmask, const char *addr) +{ + struct in_addr in4_mask; + struct in_addr in4_addr; + char mask[MAX_MASKBUF]; + const char *p; + uint32_t bmask; + size_t n; + int bits; + + p = strchr(netmask, '/'); + if (p == NULL) { + p = netmask + strlen(netmask); + } + n = p - netmask; + if (n + 1 > sizeof mask) { + return false; + } + + memcpy(mask, netmask, n); + mask[n] = '\0'; + + if (p[0] == '/') { + bits = (int) strtol(p + 1, NULL, 10); + if (bits <= 0 || bits > 32) { + return false; + } + } else { + bits = 32; + } + + /* presentation to network order binary */ + if (inet_pton(AF_INET, mask, &in4_mask) <= 0 + || inet_pton(AF_INET, addr, &in4_addr) <= 0) { + return false; + } + + /* check 32bits */ + bmask = (0xffffffffU << (32 - bits)) & 0xffffffffU; + if ((ntohl(in4_mask.s_addr) & bmask) != (ntohl(in4_addr.s_addr) & bmask)) { + return false; + } + + /* match */ + return true; +} + +static bool +iscsi_netmask_allow_addr(const char *netmask, const char *addr) +{ + if (netmask == NULL || addr == NULL) { + return false; + } + if (strcasecmp(netmask, "ANY") == 0) { + return true; + } + if (netmask[0] == '[') { + /* IPv6 */ + if (iscsi_ipv6_netmask_allow_addr(netmask, addr)) { + return true; + } + } else { + /* IPv4 */ + if (iscsi_ipv4_netmask_allow_addr(netmask, addr)) { + return true; + } + } + return false; +} + +static bool 
+iscsi_init_grp_allow_addr(struct spdk_iscsi_init_grp *igp, + const char *addr) +{ + struct spdk_iscsi_initiator_netmask *imask; + + TAILQ_FOREACH(imask, &igp->netmask_head, tailq) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "netmask=%s, addr=%s\n", + imask->mask, addr); + if (iscsi_netmask_allow_addr(imask->mask, addr)) { + return true; + } + } + return false; +} + +static int +iscsi_init_grp_allow_iscsi_name(struct spdk_iscsi_init_grp *igp, + const char *iqn, bool *result) +{ + struct spdk_iscsi_initiator_name *iname; + + TAILQ_FOREACH(iname, &igp->initiator_head, tailq) { + /* denied if iqn is matched */ + if ((iname->name[0] == '!') + && (strcasecmp(&iname->name[1], "ANY") == 0 + || strcasecmp(&iname->name[1], iqn) == 0)) { + *result = false; + return 0; + } + /* allowed if iqn is matched */ + if (strcasecmp(iname->name, "ANY") == 0 + || strcasecmp(iname->name, iqn) == 0) { + *result = true; + return 0; + } + } + return -1; +} + +static struct spdk_iscsi_pg_map * +iscsi_tgt_node_find_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg); + +bool +iscsi_tgt_node_access(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, const char *iqn, const char *addr) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + int rc; + bool allowed = false; + + if (conn == NULL || target == NULL || iqn == NULL || addr == NULL) { + return false; + } + pg = conn->portal->group; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "pg=%d, iqn=%s, addr=%s\n", + pg->tag, iqn, addr); + pg_map = iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + return false; + } + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + rc = iscsi_init_grp_allow_iscsi_name(ig_map->ig, iqn, &allowed); + if (rc == 0) { + if (allowed == false) { + goto denied; + } else { + if (iscsi_init_grp_allow_addr(ig_map->ig, addr)) { + return true; + } + } + } else { + /* netmask is denied in this initiator group */ + } + } + +denied: + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "access denied from %s (%s) to %s (%s:%s,%d)\n", + iqn, addr, target->name, conn->portal_host, + conn->portal_port, conn->pg_tag); + return false; +} + +static bool +iscsi_tgt_node_allow_iscsi_name(struct spdk_iscsi_tgt_node *target, const char *iqn) +{ + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + int rc; + bool result = false; + + if (target == NULL || iqn == NULL) { + return false; + } + + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + rc = iscsi_init_grp_allow_iscsi_name(ig_map->ig, iqn, &result); + if (rc == 0) { + return result; + } + } + } + + return false; +} + +int +iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, + const char *iaddr, const char *tiqn, uint8_t *data, int alloc_len, + int data_len) +{ + char buf[MAX_TMPBUF]; + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_portal *p; + struct spdk_iscsi_tgt_node *target; + char *host; + int total; + int len; + int rc; + + if (conn == NULL) { + return 0; + } + + total = data_len; + if (alloc_len < 1) { + return 0; + } + if (total >= alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + if (strcasecmp(tiqn, "ALL") != 0 + && strcasecmp(tiqn, target->name) != 0) { + continue; + } + rc = iscsi_tgt_node_allow_iscsi_name(target, iiqn); + if (rc == 0) { + continue; + } + + /* DO 
SENDTARGETS */ + len = snprintf((char *) data + total, alloc_len - total, + "TargetName=%s", target->name); + total += len + 1; + + /* write to data */ + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + pg = pg_map->pg; + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + if (alloc_len - total < 1) { + pthread_mutex_unlock(&g_iscsi.mutex); + /* TODO: long text responses support */ + SPDK_ERRLOG("SPDK doesn't support long text responses now, " + "you can use larger MaxRecvDataSegmentLength" + "value in initiator\n"); + return alloc_len; + } + host = p->host; + /* wildcard? */ + if (strcasecmp(host, "[::]") == 0 + || strcasecmp(host, "0.0.0.0") == 0) { + if (spdk_sock_is_ipv6(conn->sock)) { + snprintf(buf, sizeof buf, "[%s]", + conn->target_addr); + host = buf; + } else if (spdk_sock_is_ipv4(conn->sock)) { + snprintf(buf, sizeof buf, "%s", + conn->target_addr); + host = buf; + } else { + /* skip portal for the family */ + continue; + } + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "TargetAddress=%s:%s,%d\n", + host, p->port, pg->tag); + len = snprintf((char *) data + total, + alloc_len - total, + "TargetAddress=%s:%s,%d", + host, p->port, pg->tag); + total += len + 1; + } + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + + return total; +} + +struct spdk_iscsi_tgt_node * +iscsi_find_tgt_node(const char *target_name) +{ + struct spdk_iscsi_tgt_node *target; + + if (target_name == NULL) { + return NULL; + } + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + if (strcasecmp(target_name, target->name) == 0) { + return target; + } + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "can't find target %s\n", target_name); + return NULL; +} + +static int +iscsi_tgt_node_register(struct spdk_iscsi_tgt_node *target) +{ + pthread_mutex_lock(&g_iscsi.mutex); + + if (iscsi_find_tgt_node(target->name) != NULL) { + pthread_mutex_unlock(&g_iscsi.mutex); + return -EEXIST; + } + + TAILQ_INSERT_TAIL(&g_iscsi.target_head, target, tailq); + + pthread_mutex_unlock(&g_iscsi.mutex); + return 0; +} + +static int +iscsi_tgt_node_unregister(struct spdk_iscsi_tgt_node *target) +{ + struct spdk_iscsi_tgt_node *t; + + TAILQ_FOREACH(t, &g_iscsi.target_head, tailq) { + if (t == target) { + TAILQ_REMOVE(&g_iscsi.target_head, t, tailq); + return 0; + } + } + + return -1; +} + +static struct spdk_iscsi_ig_map * +iscsi_pg_map_find_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_ig_map *ig_map; + + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + if (ig_map->ig == ig) { + return ig_map; + } + } + + return NULL; +} + +static struct spdk_iscsi_ig_map * +iscsi_pg_map_add_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_ig_map *ig_map; + + if (iscsi_pg_map_find_ig_map(pg_map, ig) != NULL) { + return NULL; + } + + ig_map = malloc(sizeof(*ig_map)); + if (ig_map == NULL) { + return NULL; + } + + ig_map->ig = ig; + ig->ref++; + pg_map->num_ig_maps++; + TAILQ_INSERT_TAIL(&pg_map->ig_map_head, ig_map, tailq); + + return ig_map; +} + +static void +_iscsi_pg_map_delete_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_ig_map *ig_map) +{ + TAILQ_REMOVE(&pg_map->ig_map_head, ig_map, tailq); + pg_map->num_ig_maps--; + ig_map->ig->ref--; + free(ig_map); +} + +static int +iscsi_pg_map_delete_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_ig_map *ig_map; + + ig_map = iscsi_pg_map_find_ig_map(pg_map, ig); + if (ig_map == NULL) { + return -ENOENT; + } + + _iscsi_pg_map_delete_ig_map(pg_map, ig_map); 
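/*
 * Editor's note (illustrative sketch, not part of the original source):
 * iscsi_send_tgts() above builds the SendTargets text response as a
 * sequence of NUL-terminated key=value pairs ("total += len + 1" keeps
 * each pair's terminating NUL), e.g. for a hypothetical target:
 *
 *   TargetName=iqn.2016-06.io.spdk:disk1\0
 *   TargetAddress=192.168.0.10:3260,1\0
 *
 * Wildcard portals ("0.0.0.0" / "[::]") are rewritten to the address the
 * initiator actually connected to (conn->target_addr).
 */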
+ return 0; +} + +static void +iscsi_pg_map_delete_all_ig_maps(struct spdk_iscsi_pg_map *pg_map) +{ + struct spdk_iscsi_ig_map *ig_map, *tmp; + + TAILQ_FOREACH_SAFE(ig_map, &pg_map->ig_map_head, tailq, tmp) { + _iscsi_pg_map_delete_ig_map(pg_map, ig_map); + } +} + +static struct spdk_iscsi_pg_map * +iscsi_tgt_node_find_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_pg_map *pg_map; + + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + if (pg_map->pg == pg) { + return pg_map; + } + } + + return NULL; +} + +static struct spdk_iscsi_pg_map * +iscsi_tgt_node_add_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_pg_map *pg_map; + char port_name[MAX_TMPBUF]; + int rc; + + if (iscsi_tgt_node_find_pg_map(target, pg) != NULL) { + return NULL; + } + + if (target->num_pg_maps >= SPDK_SCSI_DEV_MAX_PORTS) { + SPDK_ERRLOG("Number of PG maps is more than allowed (max=%d)\n", + SPDK_SCSI_DEV_MAX_PORTS); + return NULL; + } + + pg_map = malloc(sizeof(*pg_map)); + if (pg_map == NULL) { + return NULL; + } + + snprintf(port_name, sizeof(port_name), "%s,t,0x%4.4x", + spdk_scsi_dev_get_name(target->dev), pg->tag); + rc = spdk_scsi_dev_add_port(target->dev, pg->tag, port_name); + if (rc != 0) { + free(pg_map); + return NULL; + } + + TAILQ_INIT(&pg_map->ig_map_head); + pg_map->num_ig_maps = 0; + pg->ref++; + pg_map->pg = pg; + target->num_pg_maps++; + TAILQ_INSERT_TAIL(&target->pg_map_head, pg_map, tailq); + + return pg_map; +} + +static void +_iscsi_tgt_node_delete_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_pg_map *pg_map) +{ + TAILQ_REMOVE(&target->pg_map_head, pg_map, tailq); + target->num_pg_maps--; + pg_map->pg->ref--; + + spdk_scsi_dev_delete_port(target->dev, pg_map->pg->tag); + + free(pg_map); +} + +static int +iscsi_tgt_node_delete_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_pg_map *pg_map; + + pg_map = iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + return -ENOENT; + } + + if (pg_map->num_ig_maps > 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "delete %d ig_maps forcefully\n", + pg_map->num_ig_maps); + } + + iscsi_pg_map_delete_all_ig_maps(pg_map); + _iscsi_tgt_node_delete_pg_map(target, pg_map); + return 0; +} + +static void +iscsi_tgt_node_delete_ig_maps(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_pg_map *pg_map, *tmp; + + TAILQ_FOREACH_SAFE(pg_map, &target->pg_map_head, tailq, tmp) { + iscsi_pg_map_delete_ig_map(pg_map, ig); + if (pg_map->num_ig_maps == 0) { + _iscsi_tgt_node_delete_pg_map(target, pg_map); + } + } +} + +static void +iscsi_tgt_node_delete_all_pg_maps(struct spdk_iscsi_tgt_node *target) +{ + struct spdk_iscsi_pg_map *pg_map, *tmp; + + TAILQ_FOREACH_SAFE(pg_map, &target->pg_map_head, tailq, tmp) { + iscsi_pg_map_delete_all_ig_maps(pg_map); + _iscsi_tgt_node_delete_pg_map(target, pg_map); + } +} + +static void +_iscsi_tgt_node_destruct(void *cb_arg, int rc) +{ + struct spdk_iscsi_tgt_node *target = cb_arg; + iscsi_tgt_node_destruct_cb destruct_cb_fn = target->destruct_cb_fn; + void *destruct_cb_arg = target->destruct_cb_arg; + + if (rc != 0) { + if (destruct_cb_fn) { + destruct_cb_fn(destruct_cb_arg, rc); + } + return; + } + + pthread_mutex_lock(&g_iscsi.mutex); + iscsi_tgt_node_delete_all_pg_maps(target); + pthread_mutex_unlock(&g_iscsi.mutex); + + pthread_mutex_destroy(&target->mutex); + free(target); + + if (destruct_cb_fn) { + 
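/*
 * Editor's note (illustrative sketch, not part of the original source):
 * when a portal group is mapped in iscsi_tgt_node_add_pg_map() above,
 * the SCSI port name is "<dev-name>,t,0x<tag>"; e.g. a hypothetical
 * device "iqn.2016-06.io.spdk:disk1" mapped to portal group tag 1 gets
 * the port name "iqn.2016-06.io.spdk:disk1,t,0x0001". The pg->ref and
 * ig->ref counters track how many target-node mappings use each group.
 */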
destruct_cb_fn(destruct_cb_arg, 0); + } +} + +static int +iscsi_tgt_node_check_active_conns(void *arg) +{ + struct spdk_iscsi_tgt_node *target = arg; + + if (iscsi_get_active_conns(target) != 0) { + return SPDK_POLLER_BUSY; + } + + spdk_poller_unregister(&target->destruct_poller); + + spdk_scsi_dev_destruct(target->dev, _iscsi_tgt_node_destruct, target); + + return SPDK_POLLER_BUSY; +} + +static void +iscsi_tgt_node_destruct(struct spdk_iscsi_tgt_node *target, + iscsi_tgt_node_destruct_cb cb_fn, void *cb_arg) +{ + if (target == NULL) { + if (cb_fn) { + cb_fn(cb_arg, -ENOENT); + } + return; + } + + if (target->destructed) { + SPDK_ERRLOG("Destructing %s is already started\n", target->name); + if (cb_fn) { + cb_fn(cb_arg, -EBUSY); + } + return; + } + + target->destructed = true; + target->destruct_cb_fn = cb_fn; + target->destruct_cb_arg = cb_arg; + + iscsi_conns_request_logout(target); + + if (iscsi_get_active_conns(target) != 0) { + target->destruct_poller = SPDK_POLLER_REGISTER(iscsi_tgt_node_check_active_conns, + target, 10); + } else { + spdk_scsi_dev_destruct(target->dev, _iscsi_tgt_node_destruct, target); + } + +} + +static int +iscsi_tgt_node_delete_pg_ig_map(struct spdk_iscsi_tgt_node *target, + int pg_tag, int ig_tag) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_init_grp *ig; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + + pg = iscsi_portal_grp_find_by_tag(pg_tag); + if (pg == NULL) { + SPDK_ERRLOG("%s: PortalGroup%d not found\n", target->name, pg_tag); + return -ENOENT; + } + ig = iscsi_init_grp_find_by_tag(ig_tag); + if (ig == NULL) { + SPDK_ERRLOG("%s: InitiatorGroup%d not found\n", target->name, ig_tag); + return -ENOENT; + } + + pg_map = iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + SPDK_ERRLOG("%s: PortalGroup%d is not mapped\n", target->name, pg_tag); + return -ENOENT; + } + ig_map = iscsi_pg_map_find_ig_map(pg_map, ig); + if (ig_map == NULL) { + SPDK_ERRLOG("%s: InitiatorGroup%d is not mapped\n", target->name, pg_tag); + return -ENOENT; + } + + _iscsi_pg_map_delete_ig_map(pg_map, ig_map); + if (pg_map->num_ig_maps == 0) { + _iscsi_tgt_node_delete_pg_map(target, pg_map); + } + + return 0; +} + +static int +iscsi_tgt_node_add_pg_ig_map(struct spdk_iscsi_tgt_node *target, + int pg_tag, int ig_tag) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_init_grp *ig; + struct spdk_iscsi_ig_map *ig_map; + bool new_pg_map = false; + + pg = iscsi_portal_grp_find_by_tag(pg_tag); + if (pg == NULL) { + SPDK_ERRLOG("%s: PortalGroup%d not found\n", target->name, pg_tag); + return -ENOENT; + } + ig = iscsi_init_grp_find_by_tag(ig_tag); + if (ig == NULL) { + SPDK_ERRLOG("%s: InitiatorGroup%d not found\n", target->name, ig_tag); + return -ENOENT; + } + + /* get existing pg_map or create new pg_map and add it to target */ + pg_map = iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + pg_map = iscsi_tgt_node_add_pg_map(target, pg); + if (pg_map == NULL) { + goto failed; + } + new_pg_map = true; + } + + /* create new ig_map and add it to pg_map */ + ig_map = iscsi_pg_map_add_ig_map(pg_map, ig); + if (ig_map == NULL) { + goto failed; + } + + return 0; + +failed: + if (new_pg_map) { + _iscsi_tgt_node_delete_pg_map(target, pg_map); + } + + return -1; +} + +int +iscsi_target_node_add_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps) +{ + uint16_t i; + int rc; + + pthread_mutex_lock(&g_iscsi.mutex); + for (i = 0; i < num_maps; 
i++) { + rc = iscsi_tgt_node_add_pg_ig_map(target, pg_tag_list[i], + ig_tag_list[i]); + if (rc != 0) { + SPDK_ERRLOG("could not add map to target\n"); + goto invalid; + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + return 0; + +invalid: + for (; i > 0; --i) { + iscsi_tgt_node_delete_pg_ig_map(target, pg_tag_list[i - 1], + ig_tag_list[i - 1]); + } + pthread_mutex_unlock(&g_iscsi.mutex); + return -1; +} + +int +iscsi_target_node_remove_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps) +{ + uint16_t i; + int rc; + + pthread_mutex_lock(&g_iscsi.mutex); + for (i = 0; i < num_maps; i++) { + rc = iscsi_tgt_node_delete_pg_ig_map(target, pg_tag_list[i], + ig_tag_list[i]); + if (rc != 0) { + SPDK_ERRLOG("could not delete map from target\n"); + goto invalid; + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + return 0; + +invalid: + for (; i > 0; --i) { + rc = iscsi_tgt_node_add_pg_ig_map(target, pg_tag_list[i - 1], + ig_tag_list[i - 1]); + if (rc != 0) { + iscsi_tgt_node_delete_all_pg_maps(target); + break; + } + } + pthread_mutex_unlock(&g_iscsi.mutex); + return -1; +} + +static int +check_iscsi_name(const char *name) +{ + const unsigned char *up = (const unsigned char *) name; + size_t n; + + /* valid iSCSI name no larger than 223 bytes */ + if (strlen(name) > MAX_TARGET_NAME) { + return -1; + } + + /* valid iSCSI name? */ + for (n = 0; up[n] != 0; n++) { + if (up[n] > 0x00U && up[n] <= 0x2cU) { + return -1; + } + if (up[n] == 0x2fU) { + return -1; + } + if (up[n] >= 0x3bU && up[n] <= 0x40U) { + return -1; + } + if (up[n] >= 0x5bU && up[n] <= 0x60U) { + return -1; + } + if (up[n] >= 0x7bU && up[n] <= 0x7fU) { + return -1; + } + if (isspace(up[n])) { + return -1; + } + } + /* valid format? */ + if (strncasecmp(name, "iqn.", 4) == 0) { + /* iqn.YYYY-MM.reversed.domain.name */ + if (!isdigit(up[4]) || !isdigit(up[5]) || !isdigit(up[6]) + || !isdigit(up[7]) || up[8] != '-' || !isdigit(up[9]) + || !isdigit(up[10]) || up[11] != '.') { + SPDK_ERRLOG("invalid iqn format. 
" + "expect \"iqn.YYYY-MM.reversed.domain.name\"\n"); + return -1; + } + } else if (strncasecmp(name, "eui.", 4) == 0) { + /* EUI-64 -> 16bytes */ + /* XXX */ + } else if (strncasecmp(name, "naa.", 4) == 0) { + /* 64bit -> 16bytes, 128bit -> 32bytes */ + /* XXX */ + } + /* OK */ + return 0; +} + +bool +iscsi_check_chap_params(bool disable, bool require, bool mutual, int group) +{ + if (group < 0) { + SPDK_ERRLOG("Invalid auth group ID (%d)\n", group); + return false; + } + if ((!disable && !require && !mutual) || /* Auto */ + (disable && !require && !mutual) || /* None */ + (!disable && require && !mutual) || /* CHAP */ + (!disable && require && mutual)) { /* CHAP Mutual */ + return true; + } + SPDK_ERRLOG("Invalid combination of CHAP params (d=%d,r=%d,m=%d)\n", + disable, require, mutual); + return false; +} + +struct spdk_iscsi_tgt_node *iscsi_tgt_node_construct(int target_index, + const char *name, const char *alias, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps, + const char *bdev_name_list[], int *lun_id_list, int num_luns, + int queue_depth, + bool disable_chap, bool require_chap, bool mutual_chap, int chap_group, + bool header_digest, bool data_digest) +{ + char fullname[MAX_TMPBUF]; + struct spdk_iscsi_tgt_node *target; + int rc; + + if (!iscsi_check_chap_params(disable_chap, require_chap, + mutual_chap, chap_group)) { + return NULL; + } + + if (num_maps == 0) { + SPDK_ERRLOG("num_maps = 0\n"); + return NULL; + } + + if (name == NULL) { + SPDK_ERRLOG("TargetName not found\n"); + return NULL; + } + + if (strncasecmp(name, "iqn.", 4) != 0 + && strncasecmp(name, "eui.", 4) != 0 + && strncasecmp(name, "naa.", 4) != 0) { + snprintf(fullname, sizeof(fullname), "%s:%s", g_iscsi.nodebase, name); + } else { + snprintf(fullname, sizeof(fullname), "%s", name); + } + + if (check_iscsi_name(fullname) != 0) { + SPDK_ERRLOG("TargetName %s contains an invalid character or format.\n", + name); + return NULL; + } + + target = calloc(1, sizeof(*target)); + if (!target) { + SPDK_ERRLOG("could not allocate target\n"); + return NULL; + } + + rc = pthread_mutex_init(&target->mutex, NULL); + if (rc != 0) { + SPDK_ERRLOG("tgt_node%d: mutex_init() failed\n", target->num); + iscsi_tgt_node_destruct(target, NULL, NULL); + return NULL; + } + + target->num = target_index; + + memcpy(target->name, fullname, strlen(fullname)); + + if (alias != NULL) { + if (strlen(alias) > MAX_TARGET_NAME) { + iscsi_tgt_node_destruct(target, NULL, NULL); + return NULL; + } + memcpy(target->alias, alias, strlen(alias)); + } + + target->dev = spdk_scsi_dev_construct(fullname, bdev_name_list, lun_id_list, num_luns, + SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI, NULL, NULL); + if (!target->dev) { + SPDK_ERRLOG("Could not construct SCSI device\n"); + iscsi_tgt_node_destruct(target, NULL, NULL); + return NULL; + } + + TAILQ_INIT(&target->pg_map_head); + rc = iscsi_target_node_add_pg_ig_maps(target, pg_tag_list, + ig_tag_list, num_maps); + if (rc != 0) { + SPDK_ERRLOG("could not add map to target\n"); + iscsi_tgt_node_destruct(target, NULL, NULL); + return NULL; + } + + target->disable_chap = disable_chap; + target->require_chap = require_chap; + target->mutual_chap = mutual_chap; + target->chap_group = chap_group; + target->header_digest = header_digest; + target->data_digest = data_digest; + + if (queue_depth > 0 && ((uint32_t)queue_depth <= g_iscsi.MaxQueueDepth)) { + target->queue_depth = queue_depth; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "QueueDepth %d is invalid and %d is used instead.\n", + queue_depth, 
g_iscsi.MaxQueueDepth); + target->queue_depth = g_iscsi.MaxQueueDepth; + } + + rc = iscsi_tgt_node_register(target); + if (rc != 0) { + SPDK_ERRLOG("register target is failed\n"); + iscsi_tgt_node_destruct(target, NULL, NULL); + return NULL; + } + + return target; +} + +static int +iscsi_parse_tgt_node(struct spdk_conf_section *sp) +{ + char buf[MAX_TMPBUF]; + struct spdk_iscsi_tgt_node *target; + int pg_tag_list[MAX_TARGET_MAP], ig_tag_list[MAX_TARGET_MAP]; + int num_target_maps; + const char *alias, *pg_tag, *ig_tag; + const char *ag_tag; + const char *val, *name; + int target_num, chap_group, pg_tag_i, ig_tag_i; + bool header_digest, data_digest; + bool disable_chap, require_chap, mutual_chap; + int i; + int lun_id_list[SPDK_SCSI_DEV_MAX_LUN]; + const char *bdev_name_list[SPDK_SCSI_DEV_MAX_LUN]; + int num_luns, queue_depth; + + target_num = spdk_conf_section_get_num(sp); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add unit %d\n", target_num); + + data_digest = false; + header_digest = false; + + name = spdk_conf_section_get_val(sp, "TargetName"); + + if (name == NULL) { + SPDK_ERRLOG("tgt_node%d: TargetName not found\n", target_num); + return -1; + } + + alias = spdk_conf_section_get_val(sp, "TargetAlias"); + + /* Setup initiator and portal group mapping */ + val = spdk_conf_section_get_val(sp, "Mapping"); + if (val == NULL) { + /* no map */ + SPDK_ERRLOG("tgt_node%d: no Mapping\n", target_num); + return -1; + } + + for (i = 0; i < MAX_TARGET_MAP; i++) { + val = spdk_conf_section_get_nmval(sp, "Mapping", i, 0); + if (val == NULL) { + break; + } + pg_tag = spdk_conf_section_get_nmval(sp, "Mapping", i, 0); + ig_tag = spdk_conf_section_get_nmval(sp, "Mapping", i, 1); + if (pg_tag == NULL || ig_tag == NULL) { + SPDK_ERRLOG("tgt_node%d: mapping error\n", target_num); + return -1; + } + if (strncasecmp(pg_tag, "PortalGroup", + strlen("PortalGroup")) != 0 + || sscanf(pg_tag, "%*[^0-9]%d", &pg_tag_i) != 1) { + SPDK_ERRLOG("tgt_node%d: mapping portal error\n", target_num); + return -1; + } + if (strncasecmp(ig_tag, "InitiatorGroup", + strlen("InitiatorGroup")) != 0 + || sscanf(ig_tag, "%*[^0-9]%d", &ig_tag_i) != 1) { + SPDK_ERRLOG("tgt_node%d: mapping initiator error\n", target_num); + return -1; + } + if (pg_tag_i < 1 || ig_tag_i < 1) { + SPDK_ERRLOG("tgt_node%d: invalid group tag\n", target_num); + return -1; + } + pg_tag_list[i] = pg_tag_i; + ig_tag_list[i] = ig_tag_i; + } + + num_target_maps = i; + + /* Setup AuthMethod */ + val = spdk_conf_section_get_val(sp, "AuthMethod"); + disable_chap = false; + require_chap = false; + mutual_chap = false; + if (val != NULL) { + for (i = 0; ; i++) { + val = spdk_conf_section_get_nmval(sp, "AuthMethod", 0, i); + if (val == NULL) { + break; + } + if (strcasecmp(val, "CHAP") == 0) { + require_chap = true; + } else if (strcasecmp(val, "Mutual") == 0) { + mutual_chap = true; + } else if (strcasecmp(val, "Auto") == 0) { + disable_chap = false; + require_chap = false; + mutual_chap = false; + } else if (strcasecmp(val, "None") == 0) { + disable_chap = true; + require_chap = false; + mutual_chap = false; + } else { + SPDK_ERRLOG("tgt_node%d: unknown auth\n", target_num); + return -1; + } + } + if (mutual_chap && !require_chap) { + SPDK_ERRLOG("tgt_node%d: Mutual but not CHAP\n", target_num); + return -1; + } + } + if (disable_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod None\n"); + } else if (!require_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod Auto\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod CHAP %s\n", + mutual_chap ? 
"Mutual" : ""); + } + + val = spdk_conf_section_get_val(sp, "AuthGroup"); + if (val == NULL) { + chap_group = 0; + } else { + ag_tag = val; + if (strcasecmp(ag_tag, "None") == 0) { + chap_group = 0; + } else { + if (strncasecmp(ag_tag, "AuthGroup", + strlen("AuthGroup")) != 0 + || sscanf(ag_tag, "%*[^0-9]%d", &chap_group) != 1) { + SPDK_ERRLOG("tgt_node%d: auth group error\n", target_num); + return -1; + } + if (chap_group == 0) { + SPDK_ERRLOG("tgt_node%d: invalid auth group 0\n", target_num); + return -1; + } + } + } + if (chap_group == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthGroup None\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthGroup AuthGroup%d\n", chap_group); + } + + val = spdk_conf_section_get_val(sp, "UseDigest"); + if (val != NULL) { + for (i = 0; ; i++) { + val = spdk_conf_section_get_nmval(sp, "UseDigest", 0, i); + if (val == NULL) { + break; + } + if (strcasecmp(val, "Header") == 0) { + header_digest = true; + } else if (strcasecmp(val, "Data") == 0) { + data_digest = true; + } else if (strcasecmp(val, "Auto") == 0) { + header_digest = false; + data_digest = false; + } else { + SPDK_ERRLOG("tgt_node%d: unknown digest\n", target_num); + return -1; + } + } + } + if (!header_digest && !data_digest) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "UseDigest Auto\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "UseDigest %s %s\n", + header_digest ? "Header" : "", + data_digest ? "Data" : ""); + } + + val = spdk_conf_section_get_val(sp, "QueueDepth"); + if (val == NULL) { + queue_depth = g_iscsi.MaxQueueDepth; + } else { + queue_depth = (int) strtol(val, NULL, 10); + } + + num_luns = 0; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + snprintf(buf, sizeof(buf), "LUN%d", i); + val = spdk_conf_section_get_val(sp, buf); + if (val == NULL) { + continue; + } + + bdev_name_list[num_luns] = val; + lun_id_list[num_luns] = i; + num_luns++; + } + + if (num_luns == 0) { + SPDK_ERRLOG("tgt_node%d: No LUN specified for target %s.\n", target_num, name); + return -1; + } + + target = iscsi_tgt_node_construct(target_num, name, alias, + pg_tag_list, ig_tag_list, num_target_maps, + bdev_name_list, lun_id_list, num_luns, queue_depth, + disable_chap, require_chap, mutual_chap, chap_group, + header_digest, data_digest); + + if (target == NULL) { + SPDK_ERRLOG("tgt_node%d: add_iscsi_target_node error\n", target_num); + return -1; + } + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i); + + if (lun) { + SPDK_INFOLOG(SPDK_LOG_ISCSI, "device %d: LUN%d %s\n", + spdk_scsi_dev_get_id(target->dev), + spdk_scsi_lun_get_id(lun), + spdk_scsi_lun_get_bdev_name(lun)); + } + } + + return 0; +} + +int iscsi_parse_tgt_nodes(void) +{ + struct spdk_conf_section *sp; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_parse_tgt_nodes\n"); + + sp = spdk_conf_first_section(NULL); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "TargetNode")) { + int tag = spdk_conf_section_get_num(sp); + + if (tag > SPDK_TN_TAG_MAX) { + SPDK_ERRLOG("tag %d is invalid\n", tag); + return -1; + } + rc = iscsi_parse_tgt_node(sp); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_parse_tgt_node() failed\n"); + return -1; + } + } + sp = spdk_conf_next_section(sp); + } + return 0; +} + +void +iscsi_shutdown_tgt_nodes(void) +{ + struct spdk_iscsi_tgt_node *target; + + pthread_mutex_lock(&g_iscsi.mutex); + while (!TAILQ_EMPTY(&g_iscsi.target_head)) { + target = TAILQ_FIRST(&g_iscsi.target_head); + TAILQ_REMOVE(&g_iscsi.target_head, target, tailq); + + 
pthread_mutex_unlock(&g_iscsi.mutex); + + iscsi_tgt_node_destruct(target, NULL, NULL); + + pthread_mutex_lock(&g_iscsi.mutex); + } + pthread_mutex_unlock(&g_iscsi.mutex); +} + +void +iscsi_shutdown_tgt_node_by_name(const char *target_name, + iscsi_tgt_node_destruct_cb cb_fn, void *cb_arg) +{ + struct spdk_iscsi_tgt_node *target; + + pthread_mutex_lock(&g_iscsi.mutex); + target = iscsi_find_tgt_node(target_name); + if (target != NULL) { + iscsi_tgt_node_unregister(target); + pthread_mutex_unlock(&g_iscsi.mutex); + + iscsi_tgt_node_destruct(target, cb_fn, cb_arg); + + return; + } + pthread_mutex_unlock(&g_iscsi.mutex); + + if (cb_fn) { + cb_fn(cb_arg, -ENOENT); + } +} + +bool +iscsi_tgt_node_is_destructed(struct spdk_iscsi_tgt_node *target) +{ + return target->destructed; +} + +int +iscsi_tgt_node_cleanup_luns(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target) +{ + int i; + struct spdk_iscsi_task *task; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i); + + if (!lun) { + continue; + } + + /* we create a fake management task per LUN to cleanup */ + task = iscsi_task_get(conn, NULL, iscsi_task_mgmt_cpl); + if (!task) { + SPDK_ERRLOG("Unable to acquire task\n"); + return -1; + } + + task->scsi.target_port = conn->target_port; + task->scsi.initiator_port = conn->initiator_port; + task->scsi.lun = lun; + + iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); + } + + return 0; +} + +void iscsi_tgt_node_delete_map(struct spdk_iscsi_portal_grp *portal_group, + struct spdk_iscsi_init_grp *initiator_group) +{ + struct spdk_iscsi_tgt_node *target; + + pthread_mutex_lock(&g_iscsi.mutex); + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + if (portal_group) { + iscsi_tgt_node_delete_pg_map(target, portal_group); + } + if (initiator_group) { + iscsi_tgt_node_delete_ig_maps(target, initiator_group); + } + } + pthread_mutex_unlock(&g_iscsi.mutex); +} + +int +iscsi_tgt_node_add_lun(struct spdk_iscsi_tgt_node *target, + const char *bdev_name, int lun_id) +{ + struct spdk_scsi_dev *dev; + int rc; + + if (target->num_active_conns > 0) { + SPDK_ERRLOG("Target has active connections (count=%d)\n", + target->num_active_conns); + return -1; + } + + if (lun_id < -1 || lun_id >= SPDK_SCSI_DEV_MAX_LUN) { + SPDK_ERRLOG("Specified LUN ID (%d) is invalid\n", lun_id); + return -1; + } + + dev = target->dev; + if (dev == NULL) { + SPDK_ERRLOG("SCSI device is not found\n"); + return -1; + } + + rc = spdk_scsi_dev_add_lun(dev, bdev_name, lun_id, NULL, NULL); + if (rc != 0) { + SPDK_ERRLOG("spdk_scsi_dev_add_lun failed\n"); + return -1; + } + + return 0; +} + +int +iscsi_tgt_node_set_chap_params(struct spdk_iscsi_tgt_node *target, + bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group) +{ + if (!iscsi_check_chap_params(disable_chap, require_chap, + mutual_chap, chap_group)) { + return -EINVAL; + } + + pthread_mutex_lock(&target->mutex); + target->disable_chap = disable_chap; + target->require_chap = require_chap; + target->mutual_chap = mutual_chap; + target->chap_group = chap_group; + pthread_mutex_unlock(&target->mutex); + + return 0; +} + +static const char *target_nodes_section = \ + "\n" + "# Users should change the TargetNode section(s) below to match the\n" + "# desired iSCSI target node configuration.\n" + "# TargetName, Mapping, LUN0 are minimum required\n"; + +#define TARGET_NODE_TMPL \ +"[TargetNode%d]\n" \ +" Comment \"Target%d\"\n" \ +" TargetName %s\n" \ +" TargetAlias \"%s\"\n" + +#define 
TARGET_NODE_PGIG_MAPPING_TMPL \ +" Mapping PortalGroup%d InitiatorGroup%d\n" + +#define TARGET_NODE_AUTH_TMPL \ +" AuthMethod %s\n" \ +" AuthGroup %s\n" \ +" UseDigest %s\n" + +#define TARGET_NODE_QD_TMPL \ +" QueueDepth %d\n\n" + +#define TARGET_NODE_LUN_TMPL \ +" LUN%d %s\n" + +void +iscsi_tgt_nodes_config_text(FILE *fp) +{ + int l = 0; + struct spdk_scsi_dev *dev = NULL; + struct spdk_iscsi_tgt_node *target = NULL; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + + /* Create target nodes section */ + fprintf(fp, "%s", target_nodes_section); + + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + int idx; + const char *authmethod = "None"; + char authgroup[32] = "None"; + const char *usedigest = "Auto"; + + dev = target->dev; + if (NULL == dev) { continue; } + + idx = target->num; + fprintf(fp, TARGET_NODE_TMPL, idx, idx, target->name, spdk_scsi_dev_get_name(dev)); + + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + fprintf(fp, TARGET_NODE_PGIG_MAPPING_TMPL, + pg_map->pg->tag, + ig_map->ig->tag); + } + } + + if (target->disable_chap) { + authmethod = "None"; + } else if (!target->require_chap) { + authmethod = "Auto"; + } else if (target->mutual_chap) { + authmethod = "CHAP Mutual"; + } else { + authmethod = "CHAP"; + } + + if (target->chap_group > 0) { + snprintf(authgroup, sizeof(authgroup), "AuthGroup%d", target->chap_group); + } + + if (target->header_digest) { + usedigest = "Header"; + } else if (target->data_digest) { + usedigest = "Data"; + } + + fprintf(fp, TARGET_NODE_AUTH_TMPL, + authmethod, authgroup, usedigest); + + for (l = 0; l < SPDK_SCSI_DEV_MAX_LUN; l++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(dev, l); + + if (!lun) { + continue; + } + + fprintf(fp, TARGET_NODE_LUN_TMPL, + spdk_scsi_lun_get_id(lun), + spdk_scsi_lun_get_bdev_name(lun)); + } + + fprintf(fp, TARGET_NODE_QD_TMPL, + target->queue_depth); + } +} + +static void +iscsi_tgt_node_info_json(struct spdk_iscsi_tgt_node *target, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + int i; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "name", target->name); + + if (target->alias[0] != '\0') { + spdk_json_write_named_string(w, "alias_name", target->alias); + } + + spdk_json_write_named_array_begin(w, "pg_ig_maps"); + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + spdk_json_write_object_begin(w); + spdk_json_write_named_int32(w, "pg_tag", pg_map->pg->tag); + spdk_json_write_named_int32(w, "ig_tag", ig_map->ig->tag); + spdk_json_write_object_end(w); + } + } + spdk_json_write_array_end(w); + + spdk_json_write_named_array_begin(w, "luns"); + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i); + + if (lun) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); + spdk_json_write_named_int32(w, "lun_id", spdk_scsi_lun_get_id(lun)); + spdk_json_write_object_end(w); + } + } + spdk_json_write_array_end(w); + + spdk_json_write_named_int32(w, "queue_depth", target->queue_depth); + + spdk_json_write_named_bool(w, "disable_chap", target->disable_chap); + spdk_json_write_named_bool(w, "require_chap", target->require_chap); + spdk_json_write_named_bool(w, "mutual_chap", target->mutual_chap); + spdk_json_write_named_int32(w, "chap_group", target->chap_group); + + 
spdk_json_write_named_bool(w, "header_digest", target->header_digest); + spdk_json_write_named_bool(w, "data_digest", target->data_digest); + + spdk_json_write_object_end(w); +} + +static void +iscsi_tgt_node_config_json(struct spdk_iscsi_tgt_node *target, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "iscsi_create_target_node"); + + spdk_json_write_name(w, "params"); + iscsi_tgt_node_info_json(target, w); + + spdk_json_write_object_end(w); +} + +void +iscsi_tgt_nodes_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_tgt_node *target; + + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + iscsi_tgt_node_info_json(target, w); + } +} + +void +iscsi_tgt_nodes_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_tgt_node *target; + + TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) { + iscsi_tgt_node_config_json(target, w); + } +} diff --git a/src/spdk/lib/iscsi/tgt_node.h b/src/spdk/lib/iscsi/tgt_node.h new file mode 100644 index 000000000..2787fac91 --- /dev/null +++ b/src/spdk/lib/iscsi/tgt_node.h @@ -0,0 +1,147 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_ISCSI_TGT_NODE_H_ +#define SPDK_ISCSI_TGT_NODE_H_ + +#include "spdk/stdinc.h" + +#include "iscsi/iscsi.h" + +struct spdk_iscsi_conn; +struct spdk_iscsi_init_grp; +struct spdk_iscsi_portal_grp; +struct spdk_iscsi_portal; +struct spdk_json_write_ctx; + +#define MAX_TARGET_MAP 256 +#define SPDK_TN_TAG_MAX 0x0000ffff + +typedef void (*iscsi_tgt_node_destruct_cb)(void *cb_arg, int rc); + +struct spdk_iscsi_ig_map { + struct spdk_iscsi_init_grp *ig; + TAILQ_ENTRY(spdk_iscsi_ig_map) tailq; +}; + +struct spdk_iscsi_pg_map { + struct spdk_iscsi_portal_grp *pg; + int num_ig_maps; + TAILQ_HEAD(, spdk_iscsi_ig_map) ig_map_head; + TAILQ_ENTRY(spdk_iscsi_pg_map) tailq ; +}; + +struct spdk_iscsi_tgt_node { + int num; + char name[MAX_TARGET_NAME + 1]; + char alias[MAX_TARGET_NAME + 1]; + + pthread_mutex_t mutex; + + bool disable_chap; + bool require_chap; + bool mutual_chap; + int chap_group; + bool header_digest; + bool data_digest; + int queue_depth; + + struct spdk_scsi_dev *dev; + /** + * Counts number of active iSCSI connections associated with this + * target node. + */ + uint32_t num_active_conns; + struct spdk_iscsi_poll_group *pg; + + int num_pg_maps; + TAILQ_HEAD(, spdk_iscsi_pg_map) pg_map_head; + TAILQ_ENTRY(spdk_iscsi_tgt_node) tailq; + + bool destructed; + struct spdk_poller *destruct_poller; + iscsi_tgt_node_destruct_cb destruct_cb_fn; + void *destruct_cb_arg; +}; + +int iscsi_parse_tgt_nodes(void); + +void iscsi_shutdown_tgt_nodes(void); +void iscsi_shutdown_tgt_node_by_name(const char *target_name, + iscsi_tgt_node_destruct_cb cb_fn, void *cb_arg); +bool iscsi_tgt_node_is_destructed(struct spdk_iscsi_tgt_node *target); +int iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, + const char *iaddr, const char *tiqn, uint8_t *data, int alloc_len, + int data_len); + +/* + * bdev_name_list and lun_id_list are equal sized arrays of size num_luns. + * bdev_name_list refers to the names of the bdevs that will be used for the LUNs on the + * new target node. + * lun_id_list refers to the LUN IDs that will be used for the LUNs on the target node. 
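+ *
+ * Illustrative call sketch (editorial addition; the IQN, alias, tag values,
+ * bdev name, LUN ID and queue depth below are hypothetical, not taken from
+ * this header):
+ *
+ *   int pg_tags[] = { 1 }, ig_tags[] = { 1 }, lun_ids[] = { 0 };
+ *   const char *bdevs[] = { "Malloc0" };
+ *   struct spdk_iscsi_tgt_node *tgt;
+ *
+ *   tgt = iscsi_tgt_node_construct(0, "iqn.2016-06.io.spdk:disk1", "Data Disk1",
+ *                                  pg_tags, ig_tags, 1, bdevs, lun_ids, 1,
+ *                                  64, false, false, false, 0, false, false);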
+ */ +struct spdk_iscsi_tgt_node *iscsi_tgt_node_construct(int target_index, + const char *name, const char *alias, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps, + const char *bdev_name_list[], int *lun_id_list, int num_luns, + int queue_depth, + bool disable_chap, bool require_chap, bool mutual_chap, int chap_group, + bool header_digest, bool data_digest); + +bool iscsi_check_chap_params(bool disable, bool require, bool mutual, int group); + +int iscsi_target_node_add_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, + uint16_t num_maps); +int iscsi_target_node_remove_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, + uint16_t num_maps); + +bool iscsi_tgt_node_access(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, const char *iqn, + const char *addr); +struct spdk_iscsi_tgt_node *iscsi_find_tgt_node(const char *target_name); +int iscsi_tgt_node_cleanup_luns(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target); +void iscsi_tgt_node_delete_map(struct spdk_iscsi_portal_grp *portal_group, + struct spdk_iscsi_init_grp *initiator_group); +int iscsi_tgt_node_add_lun(struct spdk_iscsi_tgt_node *target, + const char *bdev_name, int lun_id); +int iscsi_tgt_node_set_chap_params(struct spdk_iscsi_tgt_node *target, + bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group); +void iscsi_tgt_nodes_config_text(FILE *fp); +void iscsi_tgt_nodes_info_json(struct spdk_json_write_ctx *w); +void iscsi_tgt_nodes_config_json(struct spdk_json_write_ctx *w); +#endif /* SPDK_ISCSI_TGT_NODE_H_ */ diff --git a/src/spdk/lib/json/Makefile b/src/spdk/lib/json/Makefile new file mode 100644 index 000000000..91cb8868f --- /dev/null +++ b/src/spdk/lib/json/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = json_parse.c json_util.c json_write.c +LIBNAME = json + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_json.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/json/json_parse.c b/src/spdk/lib/json/json_parse.c new file mode 100644 index 000000000..8639d5ff8 --- /dev/null +++ b/src/spdk/lib/json/json_parse.c @@ -0,0 +1,668 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/json.h" + +#include "spdk_internal/utf.h" + +#define SPDK_JSON_MAX_NESTING_DEPTH 64 + +static int +hex_value(uint8_t c) +{ +#define V(x, y) [x] = y + 1 + static const int8_t val[256] = { + V('0', 0), V('1', 1), V('2', 2), V('3', 3), V('4', 4), + V('5', 5), V('6', 6), V('7', 7), V('8', 8), V('9', 9), + V('A', 0xA), V('B', 0xB), V('C', 0xC), V('D', 0xD), V('E', 0xE), V('F', 0xF), + V('a', 0xA), V('b', 0xB), V('c', 0xC), V('d', 0xD), V('e', 0xE), V('f', 0xF), + }; +#undef V + + return val[c] - 1; +} + +static int +json_decode_string_escape_unicode(uint8_t **strp, uint8_t *buf_end, uint8_t *out) +{ + uint8_t *str = *strp; + int v0, v1, v2, v3; + uint32_t val; + uint32_t surrogate_high = 0; + int rc; +decode: + /* \uXXXX */ + assert(buf_end > str); + + if (*str++ != '\\') { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if (*str++ != 'u') { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v3 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v2 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v1 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v0 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + val = v0 | (v1 << 4) | (v2 << 8) | (v3 << 12); + + if (surrogate_high) { + /* We already parsed the high surrogate, so this should be the low part. */ + if (!utf16_valid_surrogate_low(val)) { + return SPDK_JSON_PARSE_INVALID; + } + + /* Convert UTF-16 surrogate pair into codepoint and fall through to utf8_encode. */ + val = utf16_decode_surrogate_pair(surrogate_high, val); + } else if (utf16_valid_surrogate_high(val)) { + surrogate_high = val; + + /* + * We parsed a \uXXXX sequence that decoded to the first half of a + * UTF-16 surrogate pair, so it must be immediately followed by another + * \uXXXX escape. + * + * Loop around to get the low half of the surrogate pair. + */ + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + goto decode; + } else if (utf16_valid_surrogate_low(val)) { + /* + * We found the second half of surrogate pair without the first half; + * this is an invalid encoding. + */ + return SPDK_JSON_PARSE_INVALID; + } + + /* + * Convert Unicode escape (or surrogate pair) to UTF-8 in place. + * + * This is safe (will not write beyond the buffer) because the \uXXXX sequence is 6 bytes + * (or 12 bytes for surrogate pairs), and the longest possible UTF-8 encoding of a + * single codepoint is 4 bytes. 
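+ *
+ * Worked example (editorial note): "\u00e9" consumes 6 input bytes and emits
+ * the 2-byte UTF-8 sequence 0xC3 0xA9; the surrogate pair "\ud83d\ude00"
+ * (U+1F600) consumes 12 input bytes and emits 4 bytes (0xF0 0x9F 0x98 0x80),
+ * so the decoded output never outruns the input that produced it.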
+ */ + if (out) { + rc = utf8_encode_unsafe(out, val); + } else { + rc = utf8_codepoint_len(val); + } + if (rc < 0) { + return SPDK_JSON_PARSE_INVALID; + } + + *strp = str; /* update input pointer */ + return rc; /* return number of bytes decoded */ +} + +static int +json_decode_string_escape_twochar(uint8_t **strp, uint8_t *buf_end, uint8_t *out) +{ + static const uint8_t escapes[256] = { + ['b'] = '\b', + ['f'] = '\f', + ['n'] = '\n', + ['r'] = '\r', + ['t'] = '\t', + ['/'] = '/', + ['"'] = '"', + ['\\'] = '\\', + }; + uint8_t *str = *strp; + uint8_t c; + + assert(buf_end > str); + if (buf_end - str < 2) { + return SPDK_JSON_PARSE_INCOMPLETE; + } + + assert(str[0] == '\\'); + + c = escapes[str[1]]; + if (c) { + if (out) { + *out = c; + } + *strp += 2; /* consumed two bytes */ + return 1; /* produced one byte */ + } + + return SPDK_JSON_PARSE_INVALID; +} + +/* + * Decode JSON string backslash escape. + * \param strp pointer to pointer to first character of escape (the backslash). + * *strp is also advanced to indicate how much input was consumed. + * + * \return Number of bytes appended to out + */ +static int +json_decode_string_escape(uint8_t **strp, uint8_t *buf_end, uint8_t *out) +{ + int rc; + + rc = json_decode_string_escape_twochar(strp, buf_end, out); + if (rc > 0) { + return rc; + } + + return json_decode_string_escape_unicode(strp, buf_end, out); +} + +/* + * Decode JSON string in place. + * + * \param str_start Pointer to the beginning of the string (the opening " character). + * + * \return Number of bytes in decoded string (beginning from start). + */ +static int +json_decode_string(uint8_t *str_start, uint8_t *buf_end, uint8_t **str_end, uint32_t flags) +{ + uint8_t *str = str_start; + uint8_t *out = str_start + 1; /* Decode string in place (skip the initial quote) */ + int rc; + + if (buf_end - str_start < 2) { + /* + * Shortest valid string (the empty string) is two bytes (""), + * so this can't possibly be valid + */ + *str_end = str; + return SPDK_JSON_PARSE_INCOMPLETE; + } + + if (*str++ != '"') { + *str_end = str; + return SPDK_JSON_PARSE_INVALID; + } + + while (str < buf_end) { + if (str[0] == '"') { + /* + * End of string. + * Update str_end to point at next input byte and return output length. + */ + *str_end = str + 1; + return out - str_start - 1; + } else if (str[0] == '\\') { + rc = json_decode_string_escape(&str, buf_end, + flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE ? out : NULL); + assert(rc != 0); + if (rc < 0) { + *str_end = str; + return rc; + } + out += rc; + } else if (str[0] <= 0x1f) { + /* control characters must be escaped */ + *str_end = str; + return SPDK_JSON_PARSE_INVALID; + } else { + rc = utf8_valid(str, buf_end); + if (rc == 0) { + *str_end = str; + return SPDK_JSON_PARSE_INCOMPLETE; + } else if (rc < 0) { + *str_end = str; + return SPDK_JSON_PARSE_INVALID; + } + + if (out && out != str && (flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE)) { + memmove(out, str, rc); + } + out += rc; + str += rc; + } + } + + /* If execution gets here, we ran out of buffer. 
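An SPDK_JSON_PARSE_INCOMPLETE result means the bytes scanned so far form a valid prefix, so the caller may retry once more data arrives, in contrast to SPDK_JSON_PARSE_INVALID, which is a hard error (editorial note).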
*/ + *str_end = str; + return SPDK_JSON_PARSE_INCOMPLETE; +} + +static int +json_valid_number(uint8_t *start, uint8_t *buf_end) +{ + uint8_t *p = start; + uint8_t c; + + if (p >= buf_end) { return -1; } + + c = *p++; + if (c >= '1' && c <= '9') { goto num_int_digits; } + if (c == '0') { goto num_frac_or_exp; } + if (c == '-') { goto num_int_first_digit; } + p--; + goto done_invalid; + +num_int_first_digit: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c == '0') { goto num_frac_or_exp; } + if (c >= '1' && c <= '9') { goto num_int_digits; } + p--; + } + goto done_invalid; + +num_int_digits: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_int_digits; } + if (c == '.') { goto num_frac_first_digit; } + if (c == 'e' || c == 'E') { goto num_exp_sign; } + p--; + } + goto done_valid; + +num_frac_or_exp: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c == '.') { goto num_frac_first_digit; } + if (c == 'e' || c == 'E') { goto num_exp_sign; } + p--; + } + goto done_valid; + +num_frac_first_digit: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_frac_digits; } + p--; + } + goto done_invalid; + +num_frac_digits: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_frac_digits; } + if (c == 'e' || c == 'E') { goto num_exp_sign; } + p--; + } + goto done_valid; + +num_exp_sign: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_exp_digits; } + if (c == '-' || c == '+') { goto num_exp_first_digit; } + p--; + } + goto done_invalid; + +num_exp_first_digit: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_exp_digits; } + p--; + } + goto done_invalid; + +num_exp_digits: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_exp_digits; } + p--; + } + goto done_valid; + +done_valid: + /* Valid end state */ + return p - start; + +done_invalid: + /* Invalid end state */ + if (p == buf_end) { + /* Hit the end of the buffer - the stream is incomplete. */ + return SPDK_JSON_PARSE_INCOMPLETE; + } + + /* Found an invalid character in an invalid end state */ + return SPDK_JSON_PARSE_INVALID; +} + +static int +json_valid_comment(const uint8_t *start, const uint8_t *buf_end) +{ + const uint8_t *p = start; + bool multiline; + + assert(buf_end > p); + if (buf_end - p < 2) { + return SPDK_JSON_PARSE_INCOMPLETE; + } + + if (p[0] != '/') { + return SPDK_JSON_PARSE_INVALID; + } + if (p[1] == '*') { + multiline = true; + } else if (p[1] == '/') { + multiline = false; + } else { + return SPDK_JSON_PARSE_INVALID; + } + p += 2; + + if (multiline) { + while (p != buf_end - 1) { + if (p[0] == '*' && p[1] == '/') { + /* Include the terminating star and slash in the comment */ + return p - start + 2; + } + p++; + } + } else { + while (p != buf_end) { + if (*p == '\r' || *p == '\n') { + /* Do not include the line terminator in the comment */ + return p - start; + } + p++; + } + } + + return SPDK_JSON_PARSE_INCOMPLETE; +} + +struct json_literal { + enum spdk_json_val_type type; + uint32_t len; + uint8_t str[8]; +}; + +/* + * JSON only defines 3 possible literals; they can be uniquely identified by bits + * 3 and 4 of the first character: + * 'f' = 0b11[00]110 + * 'n' = 0b11[01]110 + * 't' = 0b11[10]100 + * These two bits can be used as an index into the g_json_literals array. 
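+ *
+ * Worked out (editorial note): ('f' >> 3) & 3 == 0, ('n' >> 3) & 3 == 1 and
+ * ('t' >> 3) & 3 == 2, matching the order of the entries below, which is why
+ * the parser can index the table with
+ *   lit = &g_json_literals[(c >> 3) & 3];
+ * and then confirm the match with a single memcmp() of the full literal.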
+ */ +static const struct json_literal g_json_literals[] = { + {SPDK_JSON_VAL_FALSE, 5, "false"}, + {SPDK_JSON_VAL_NULL, 4, "null"}, + {SPDK_JSON_VAL_TRUE, 4, "true"}, + {} +}; + +static int +match_literal(const uint8_t *start, const uint8_t *end, const uint8_t *literal, size_t len) +{ + assert(end >= start); + if ((size_t)(end - start) < len) { + return SPDK_JSON_PARSE_INCOMPLETE; + } + + if (memcmp(start, literal, len) != 0) { + return SPDK_JSON_PARSE_INVALID; + } + + return len; +} + +ssize_t +spdk_json_parse(void *json, size_t size, struct spdk_json_val *values, size_t num_values, + void **end, uint32_t flags) +{ + uint8_t *json_end = json + size; + enum spdk_json_val_type containers[SPDK_JSON_MAX_NESTING_DEPTH]; + size_t con_value[SPDK_JSON_MAX_NESTING_DEPTH]; + enum spdk_json_val_type con_type = SPDK_JSON_VAL_INVALID; + bool trailing_comma = false; + size_t depth = 0; /* index into containers */ + size_t cur_value = 0; /* index into values */ + size_t con_start_value; + uint8_t *data = json; + uint8_t *new_data; + int rc = 0; + const struct json_literal *lit; + enum { + STATE_VALUE, /* initial state */ + STATE_VALUE_SEPARATOR, /* value separator (comma) */ + STATE_NAME, /* "name": value */ + STATE_NAME_SEPARATOR, /* colon */ + STATE_END, /* parsed the complete value, so only whitespace is valid */ + } state = STATE_VALUE; + +#define ADD_VALUE(t, val_start_ptr, val_end_ptr) \ + if (values && cur_value < num_values) { \ + values[cur_value].type = t; \ + values[cur_value].start = val_start_ptr; \ + values[cur_value].len = val_end_ptr - val_start_ptr; \ + } \ + cur_value++ + + while (data < json_end) { + uint8_t c = *data; + + switch (c) { + case ' ': + case '\t': + case '\r': + case '\n': + /* Whitespace is allowed between any tokens. */ + data++; + break; + + case 't': + case 'f': + case 'n': + /* true, false, or null */ + if (state != STATE_VALUE) { goto done_invalid; } + lit = &g_json_literals[(c >> 3) & 3]; /* See comment above g_json_literals[] */ + assert(lit->str[0] == c); + rc = match_literal(data, json_end, lit->str, lit->len); + if (rc < 0) { goto done_rc; } + ADD_VALUE(lit->type, data, data + rc); + data += rc; + state = depth ? STATE_VALUE_SEPARATOR : STATE_END; + trailing_comma = false; + break; + + case '"': + if (state != STATE_VALUE && state != STATE_NAME) { goto done_invalid; } + rc = json_decode_string(data, json_end, &new_data, flags); + if (rc < 0) { + data = new_data; + goto done_rc; + } + /* + * Start is data + 1 to skip initial quote. + * Length is data + rc - 1 to skip both quotes. + */ + ADD_VALUE(state == STATE_VALUE ? SPDK_JSON_VAL_STRING : SPDK_JSON_VAL_NAME, + data + 1, data + rc - 1); + data = new_data; + if (state == STATE_NAME) { + state = STATE_NAME_SEPARATOR; + } else { + state = depth ? STATE_VALUE_SEPARATOR : STATE_END; + } + trailing_comma = false; + break; + + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (state != STATE_VALUE) { goto done_invalid; } + rc = json_valid_number(data, json_end); + if (rc < 0) { goto done_rc; } + ADD_VALUE(SPDK_JSON_VAL_NUMBER, data, data + rc); + data += rc; + state = depth ? 
STATE_VALUE_SEPARATOR : STATE_END; + trailing_comma = false; + break; + + case '{': + case '[': + if (state != STATE_VALUE) { goto done_invalid; } + if (depth == SPDK_JSON_MAX_NESTING_DEPTH) { + rc = SPDK_JSON_PARSE_MAX_DEPTH_EXCEEDED; + goto done_rc; + } + if (c == '{') { + con_type = SPDK_JSON_VAL_OBJECT_BEGIN; + state = STATE_NAME; + } else { + con_type = SPDK_JSON_VAL_ARRAY_BEGIN; + state = STATE_VALUE; + } + con_value[depth] = cur_value; + containers[depth++] = con_type; + ADD_VALUE(con_type, data, data + 1); + data++; + trailing_comma = false; + break; + + case '}': + case ']': + if (trailing_comma) { goto done_invalid; } + if (depth == 0) { goto done_invalid; } + con_type = containers[--depth]; + con_start_value = con_value[depth]; + if (values && con_start_value < num_values) { + values[con_start_value].len = cur_value - con_start_value - 1; + } + if (c == '}') { + if (state != STATE_NAME && state != STATE_VALUE_SEPARATOR) { + goto done_invalid; + } + if (con_type != SPDK_JSON_VAL_OBJECT_BEGIN) { + goto done_invalid; + } + ADD_VALUE(SPDK_JSON_VAL_OBJECT_END, data, data + 1); + } else { + if (state != STATE_VALUE && state != STATE_VALUE_SEPARATOR) { + goto done_invalid; + } + if (con_type != SPDK_JSON_VAL_ARRAY_BEGIN) { + goto done_invalid; + } + ADD_VALUE(SPDK_JSON_VAL_ARRAY_END, data, data + 1); + } + con_type = depth == 0 ? SPDK_JSON_VAL_INVALID : containers[depth - 1]; + data++; + state = depth ? STATE_VALUE_SEPARATOR : STATE_END; + trailing_comma = false; + break; + + case ',': + if (state != STATE_VALUE_SEPARATOR) { goto done_invalid; } + data++; + assert(con_type == SPDK_JSON_VAL_ARRAY_BEGIN || + con_type == SPDK_JSON_VAL_OBJECT_BEGIN); + state = con_type == SPDK_JSON_VAL_ARRAY_BEGIN ? STATE_VALUE : STATE_NAME; + trailing_comma = true; + break; + + case ':': + if (state != STATE_NAME_SEPARATOR) { goto done_invalid; } + data++; + state = STATE_VALUE; + break; + + case '/': + if (!(flags & SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS)) { + goto done_invalid; + } + rc = json_valid_comment(data, json_end); + if (rc < 0) { goto done_rc; } + /* Skip over comment */ + data += rc; + break; + + default: + goto done_invalid; + } + + if (state == STATE_END) { + break; + } + } + + if (state == STATE_END) { + /* Skip trailing whitespace */ + while (data < json_end) { + uint8_t c = *data; + + if (c == ' ' || c == '\t' || c == '\r' || c == '\n') { + data++; + } else { + break; + } + } + + /* + * These asserts are just for sanity checking - they are guaranteed by the allowed + * state transitions. + */ + assert(depth == 0); + assert(trailing_comma == false); + assert(data <= json_end); + if (end) { + *end = data; + } + return cur_value; + } + + /* Invalid end state - ran out of data */ + rc = SPDK_JSON_PARSE_INCOMPLETE; + +done_rc: + assert(rc < 0); + if (end) { + *end = data; + } + return rc; + +done_invalid: + rc = SPDK_JSON_PARSE_INVALID; + goto done_rc; +} diff --git a/src/spdk/lib/json/json_util.c b/src/spdk/lib/json/json_util.c new file mode 100644 index 000000000..18d751047 --- /dev/null +++ b/src/spdk/lib/json/json_util.c @@ -0,0 +1,653 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/json.h" + +#include "spdk_internal/utf.h" +#include "spdk_internal/log.h" + +#define SPDK_JSON_DEBUG(...) SPDK_DEBUGLOG(SPDK_LOG_JSON, __VA_ARGS__) + +size_t +spdk_json_val_len(const struct spdk_json_val *val) +{ + if (val == NULL) { + return 0; + } + + if (val->type == SPDK_JSON_VAL_ARRAY_BEGIN || val->type == SPDK_JSON_VAL_OBJECT_BEGIN) { + return val->len + 2; + } + + return 1; +} + +bool +spdk_json_strequal(const struct spdk_json_val *val, const char *str) +{ + size_t len; + + if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NAME) { + return false; + } + + len = strlen(str); + if (val->len != len) { + return false; + } + + return memcmp(val->start, str, len) == 0; +} + +char * +spdk_json_strdup(const struct spdk_json_val *val) +{ + size_t len; + char *s; + + if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NAME) { + return NULL; + } + + len = val->len; + + if (memchr(val->start, '\0', len)) { + /* String contains embedded NUL, so it is not a valid C string. 
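Such a value can legitimately appear in parsed JSON (e.g. the escape \u0000 decodes to a 0x00 byte), but it cannot be represented as a NUL-terminated string, hence NULL (editorial note).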
*/ + return NULL; + } + + s = malloc(len + 1); + if (s == NULL) { + return s; + } + + memcpy(s, val->start, len); + s[len] = '\0'; + + return s; +} + +struct spdk_json_num { + bool negative; + uint64_t significand; + int64_t exponent; +}; + +static int +json_number_split(const struct spdk_json_val *val, struct spdk_json_num *num) +{ + const char *iter; + size_t remaining; + uint64_t *pval; + uint64_t frac_digits = 0; + uint64_t exponent_u64 = 0; + bool exponent_negative = false; + enum { + NUM_STATE_INT, + NUM_STATE_FRAC, + NUM_STATE_EXP, + } state; + + memset(num, 0, sizeof(*num)); + + if (val->type != SPDK_JSON_VAL_NUMBER) { + return -EINVAL; + } + + remaining = val->len; + if (remaining == 0) { + return -EINVAL; + } + + iter = val->start; + if (*iter == '-') { + num->negative = true; + iter++; + remaining--; + } + + state = NUM_STATE_INT; + pval = &num->significand; + while (remaining--) { + char c = *iter++; + + if (c == '.') { + state = NUM_STATE_FRAC; + } else if (c == 'e' || c == 'E') { + state = NUM_STATE_EXP; + pval = &exponent_u64; + } else if (c == '-') { + assert(state == NUM_STATE_EXP); + exponent_negative = true; + } else if (c == '+') { + assert(state == NUM_STATE_EXP); + /* exp_negative = false; */ /* already false by default */ + } else { + uint64_t new_val; + + assert(c >= '0' && c <= '9'); + new_val = *pval * 10 + c - '0'; + if (new_val < *pval) { + return -ERANGE; + } + + if (state == NUM_STATE_FRAC) { + frac_digits++; + } + + *pval = new_val; + } + } + + if (exponent_negative) { + if (exponent_u64 > 9223372036854775808ULL) { /* abs(INT64_MIN) */ + return -ERANGE; + } + num->exponent = (int64_t) - exponent_u64; + } else { + if (exponent_u64 > INT64_MAX) { + return -ERANGE; + } + num->exponent = exponent_u64; + } + num->exponent -= frac_digits; + + /* Apply as much of the exponent as possible without overflow or truncation */ + if (num->exponent < 0) { + while (num->exponent && num->significand >= 10 && num->significand % 10 == 0) { + num->significand /= 10; + num->exponent++; + } + } else { /* positive exponent */ + while (num->exponent) { + uint64_t new_val = num->significand * 10; + + if (new_val < num->significand) { + break; + } + + num->significand = new_val; + num->exponent--; + } + } + + return 0; +} + +int +spdk_json_number_to_uint16(const struct spdk_json_val *val, uint16_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent || split_num.negative) { + return -ERANGE; + } + + if (split_num.significand > UINT16_MAX) { + return -ERANGE; + } + *num = (uint16_t)split_num.significand; + return 0; +} + +int +spdk_json_number_to_int32(const struct spdk_json_val *val, int32_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent) { + return -ERANGE; + } + + if (split_num.negative) { + if (split_num.significand > 2147483648) { /* abs(INT32_MIN) */ + return -ERANGE; + } + *num = (int32_t) - (int64_t)split_num.significand; + return 0; + } + + /* positive */ + if (split_num.significand > INT32_MAX) { + return -ERANGE; + } + *num = (int32_t)split_num.significand; + return 0; +} + +int +spdk_json_number_to_uint32(const struct spdk_json_val *val, uint32_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent || split_num.negative) { + return -ERANGE; + } + + if 
(split_num.significand > UINT32_MAX) { + return -ERANGE; + } + *num = (uint32_t)split_num.significand; + return 0; +} + +int +spdk_json_number_to_uint64(const struct spdk_json_val *val, uint64_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent || split_num.negative) { + return -ERANGE; + } + + *num = split_num.significand; + return 0; +} + +int +spdk_json_decode_object(const struct spdk_json_val *values, + const struct spdk_json_object_decoder *decoders, size_t num_decoders, void *out) +{ + uint32_t i; + bool invalid = false; + size_t decidx; + bool *seen; + + if (values == NULL || values->type != SPDK_JSON_VAL_OBJECT_BEGIN) { + return -1; + } + + seen = calloc(sizeof(bool), num_decoders); + if (seen == NULL) { + return -1; + } + + for (i = 0; i < values->len;) { + const struct spdk_json_val *name = &values[i + 1]; + const struct spdk_json_val *v = &values[i + 2]; + bool found = false; + + for (decidx = 0; decidx < num_decoders; decidx++) { + const struct spdk_json_object_decoder *dec = &decoders[decidx]; + if (spdk_json_strequal(name, dec->name)) { + void *field = (void *)((uintptr_t)out + dec->offset); + + found = true; + + if (seen[decidx]) { + /* duplicate field name */ + invalid = true; + SPDK_JSON_DEBUG("Duplicate key '%s'\n", dec->name); + } else { + seen[decidx] = true; + if (dec->decode_func(v, field)) { + invalid = true; + SPDK_JSON_DEBUG("Decoder failed to decode key '%s'\n", dec->name); + /* keep going to fill out any other valid keys */ + } + } + break; + } + } + + if (!found) { + invalid = true; + SPDK_JSON_DEBUG("Decoder not found for key '%.*s'\n", name->len, (char *)name->start); + } + + i += 1 + spdk_json_val_len(v); + } + + for (decidx = 0; decidx < num_decoders; decidx++) { + if (!decoders[decidx].optional && !seen[decidx]) { + /* required field is missing */ + invalid = true; + break; + } + } + + free(seen); + return invalid ? 
-1 : 0; +} + +int +spdk_json_decode_array(const struct spdk_json_val *values, spdk_json_decode_fn decode_func, + void *out, size_t max_size, size_t *out_size, size_t stride) +{ + uint32_t i; + char *field; + char *out_end; + + if (values == NULL || values->type != SPDK_JSON_VAL_ARRAY_BEGIN) { + return -1; + } + + *out_size = 0; + field = out; + out_end = field + max_size * stride; + for (i = 0; i < values->len;) { + const struct spdk_json_val *v = &values[i + 1]; + + if (field == out_end) { + return -1; + } + + if (decode_func(v, field)) { + return -1; + } + + i += spdk_json_val_len(v); + field += stride; + (*out_size)++; + } + + return 0; +} + +int +spdk_json_decode_bool(const struct spdk_json_val *val, void *out) +{ + bool *f = out; + + if (val->type != SPDK_JSON_VAL_TRUE && val->type != SPDK_JSON_VAL_FALSE) { + return -1; + } + + *f = val->type == SPDK_JSON_VAL_TRUE; + return 0; +} + +int +spdk_json_decode_uint16(const struct spdk_json_val *val, void *out) +{ + uint16_t *i = out; + + return spdk_json_number_to_uint16(val, i); +} + +int +spdk_json_decode_int32(const struct spdk_json_val *val, void *out) +{ + int32_t *i = out; + + return spdk_json_number_to_int32(val, i); +} + +int +spdk_json_decode_uint32(const struct spdk_json_val *val, void *out) +{ + uint32_t *i = out; + + return spdk_json_number_to_uint32(val, i); +} + +int +spdk_json_decode_uint64(const struct spdk_json_val *val, void *out) +{ + uint64_t *i = out; + + return spdk_json_number_to_uint64(val, i); +} + +int +spdk_json_decode_string(const struct spdk_json_val *val, void *out) +{ + char **s = out; + + free(*s); + + *s = spdk_json_strdup(val); + + if (*s) { + return 0; + } else { + return -1; + } +} + +static struct spdk_json_val * +json_first(struct spdk_json_val *object, enum spdk_json_val_type type) +{ + /* 'object' must be JSON object or array. 'type' might be combination of these two. */ + assert((type & (SPDK_JSON_VAL_ARRAY_BEGIN | SPDK_JSON_VAL_OBJECT_BEGIN)) != 0); + + assert(object != NULL); + + if ((object->type & type) == 0) { + return NULL; + } + + object++; + if (object->len == 0) { + return NULL; + } + + return object; +} + +static struct spdk_json_val * +json_value(struct spdk_json_val *key) +{ + return key->type == SPDK_JSON_VAL_NAME ? key + 1 : NULL; +} + +int +spdk_json_find(struct spdk_json_val *object, const char *key_name, struct spdk_json_val **key, + struct spdk_json_val **val, enum spdk_json_val_type type) +{ + struct spdk_json_val *_key = NULL; + struct spdk_json_val *_val = NULL; + struct spdk_json_val *it; + + assert(object != NULL); + + for (it = json_first(object, SPDK_JSON_VAL_ARRAY_BEGIN | SPDK_JSON_VAL_OBJECT_BEGIN); + it != NULL; + it = spdk_json_next(it)) { + if (it->type != SPDK_JSON_VAL_NAME) { + continue; + } + + if (spdk_json_strequal(it, key_name) != true) { + continue; + } + + if (_key) { + SPDK_JSON_DEBUG("Duplicate key '%s'", key_name); + return -EINVAL; + } + + _key = it; + _val = json_value(_key); + + if (type != SPDK_JSON_VAL_INVALID && (_val->type & type) == 0) { + SPDK_JSON_DEBUG("key '%s' type is %#x but expected one of %#x\n", key_name, _val->type, type); + return -EDOM; + } + } + + if (key) { + *key = _key; + } + + if (val) { + *val = _val; + } + + return _val ? 
0 : -ENOENT; +} + +int +spdk_json_find_string(struct spdk_json_val *object, const char *key_name, + struct spdk_json_val **key, struct spdk_json_val **val) +{ + return spdk_json_find(object, key_name, key, val, SPDK_JSON_VAL_STRING); +} + +int +spdk_json_find_array(struct spdk_json_val *object, const char *key_name, + struct spdk_json_val **key, struct spdk_json_val **val) +{ + return spdk_json_find(object, key_name, key, val, SPDK_JSON_VAL_ARRAY_BEGIN); +} + +struct spdk_json_val * +spdk_json_object_first(struct spdk_json_val *object) +{ + struct spdk_json_val *first = json_first(object, SPDK_JSON_VAL_OBJECT_BEGIN); + + /* Empty object? */ + return first && first->type != SPDK_JSON_VAL_OBJECT_END ? first : NULL; +} + +struct spdk_json_val * +spdk_json_array_first(struct spdk_json_val *array_begin) +{ + struct spdk_json_val *first = json_first(array_begin, SPDK_JSON_VAL_ARRAY_BEGIN); + + /* Empty array? */ + return first && first->type != SPDK_JSON_VAL_ARRAY_END ? first : NULL; +} + +static struct spdk_json_val * +json_skip_object_or_array(struct spdk_json_val *val) +{ + unsigned lvl; + enum spdk_json_val_type end_type; + struct spdk_json_val *it; + + if (val->type == SPDK_JSON_VAL_OBJECT_BEGIN) { + end_type = SPDK_JSON_VAL_OBJECT_END; + } else if (val->type == SPDK_JSON_VAL_ARRAY_BEGIN) { + end_type = SPDK_JSON_VAL_ARRAY_END; + } else { + SPDK_JSON_DEBUG("Expected JSON object (%#x) or array (%#x) but got %#x\n", + SPDK_JSON_VAL_OBJECT_BEGIN, SPDK_JSON_VAL_ARRAY_BEGIN, val->type); + return NULL; + } + + lvl = 1; + for (it = val + 1; it->type != SPDK_JSON_VAL_INVALID && lvl != 0; it++) { + if (it->type == val->type) { + lvl++; + } else if (it->type == end_type) { + lvl--; + } + } + + /* if lvl != 0 we have invalid JSON object */ + if (lvl != 0) { + SPDK_JSON_DEBUG("Can't find end of object (type: %#x): lvl (%u) != 0)\n", val->type, lvl); + it = NULL; + } + + return it; +} + +struct spdk_json_val * +spdk_json_next(struct spdk_json_val *it) +{ + struct spdk_json_val *val, *next; + + switch (it->type) { + case SPDK_JSON_VAL_NAME: + val = json_value(it); + next = spdk_json_next(val); + break; + + /* We are in the middle of an array - get to next entry */ + case SPDK_JSON_VAL_NULL: + case SPDK_JSON_VAL_TRUE: + case SPDK_JSON_VAL_FALSE: + case SPDK_JSON_VAL_NUMBER: + case SPDK_JSON_VAL_STRING: + val = it + 1; + return val; + + case SPDK_JSON_VAL_ARRAY_BEGIN: + case SPDK_JSON_VAL_OBJECT_BEGIN: + next = json_skip_object_or_array(it); + break; + + /* Can't go to the next object if started from the end of array or object */ + case SPDK_JSON_VAL_ARRAY_END: + case SPDK_JSON_VAL_OBJECT_END: + case SPDK_JSON_VAL_INVALID: + return NULL; + default: + assert(false); + return NULL; + + } + + /* EOF ? */ + if (next == NULL) { + return NULL; + } + + switch (next->type) { + case SPDK_JSON_VAL_ARRAY_END: + case SPDK_JSON_VAL_OBJECT_END: + case SPDK_JSON_VAL_INVALID: + return NULL; + default: + /* Next value */ + return next; + } +} + +SPDK_LOG_REGISTER_COMPONENT("json_util", SPDK_LOG_JSON) diff --git a/src/spdk/lib/json/json_write.c b/src/spdk/lib/json/json_write.c new file mode 100644 index 000000000..7e9fbb5c3 --- /dev/null +++ b/src/spdk/lib/json/json_write.c @@ -0,0 +1,687 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/json.h" + +#include "spdk_internal/utf.h" + +struct spdk_json_write_ctx { + spdk_json_write_cb write_cb; + void *cb_ctx; + uint32_t flags; + uint32_t indent; + bool new_indent; + bool first_value; + bool failed; + size_t buf_filled; + uint8_t buf[4096]; +}; + +static int emit_buf_full(struct spdk_json_write_ctx *w, const void *data, size_t size); + +static int +fail(struct spdk_json_write_ctx *w) +{ + w->failed = true; + return -1; +} + +static int +flush_buf(struct spdk_json_write_ctx *w) +{ + int rc; + + rc = w->write_cb(w->cb_ctx, w->buf, w->buf_filled); + if (rc != 0) { + return fail(w); + } + + w->buf_filled = 0; + + return 0; +} + +struct spdk_json_write_ctx * +spdk_json_write_begin(spdk_json_write_cb write_cb, void *cb_ctx, uint32_t flags) +{ + struct spdk_json_write_ctx *w; + + w = calloc(1, sizeof(*w)); + if (w == NULL) { + return w; + } + + w->write_cb = write_cb; + w->cb_ctx = cb_ctx; + w->flags = flags; + w->indent = 0; + w->new_indent = false; + w->first_value = true; + w->failed = false; + w->buf_filled = 0; + + return w; +} + +int +spdk_json_write_end(struct spdk_json_write_ctx *w) +{ + bool failed; + int rc; + + if (w == NULL) { + return 0; + } + + failed = w->failed; + + rc = flush_buf(w); + if (rc != 0) { + failed = true; + } + + free(w); + + return failed ? -1 : 0; +} + +static inline int +emit(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + size_t buf_remain = sizeof(w->buf) - w->buf_filled; + + if (spdk_unlikely(size > buf_remain)) { + /* Not enough space in buffer for the new data. */ + return emit_buf_full(w, data, size); + } + + /* Copy the new data into buf. */ + memcpy(w->buf + w->buf_filled, data, size); + w->buf_filled += size; + return 0; +} + +static int +emit_buf_full(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + size_t buf_remain = sizeof(w->buf) - w->buf_filled; + int rc; + + assert(size > buf_remain); + + /* Copy as much of the new data as possible into the buffer and flush it. 
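Whatever does not fit is handed back to emit(), which either buffers it or lands here again; e.g. a 10000-byte write into the empty 4096-byte buffer flushes twice and leaves 1808 bytes buffered (editorial note).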
*/ + memcpy(w->buf + w->buf_filled, data, buf_remain); + w->buf_filled += buf_remain; + + rc = flush_buf(w); + if (rc != 0) { + return fail(w); + } + + /* Recurse to emit the rest of the data. */ + return emit(w, data + buf_remain, size - buf_remain); +} + +static int +emit_fmt(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + if (w->flags & SPDK_JSON_WRITE_FLAG_FORMATTED) { + return emit(w, data, size); + } + return 0; +} + +static int +emit_indent(struct spdk_json_write_ctx *w) +{ + uint32_t i; + + if (w->flags & SPDK_JSON_WRITE_FLAG_FORMATTED) { + for (i = 0; i < w->indent; i++) { + if (emit(w, " ", 2)) { return fail(w); } + } + } + return 0; +} + +static int +begin_value(struct spdk_json_write_ctx *w) +{ + /* TODO: check for value state */ + if (w->new_indent) { + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + if (!w->first_value) { + if (emit(w, ",", 1)) { return fail(w); } + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + w->first_value = false; + w->new_indent = false; + return 0; +} + +int +spdk_json_write_val_raw(struct spdk_json_write_ctx *w, const void *data, size_t len) +{ + if (begin_value(w)) { return fail(w); } + return emit(w, data, len); +} + +int +spdk_json_write_null(struct spdk_json_write_ctx *w) +{ + if (begin_value(w)) { return fail(w); } + return emit(w, "null", 4); +} + +int +spdk_json_write_bool(struct spdk_json_write_ctx *w, bool val) +{ + if (begin_value(w)) { return fail(w); } + if (val) { + return emit(w, "true", 4); + } else { + return emit(w, "false", 5); + } +} + +int +spdk_json_write_int32(struct spdk_json_write_ctx *w, int32_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRId32, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +int +spdk_json_write_uint32(struct spdk_json_write_ctx *w, uint32_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRIu32, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +int +spdk_json_write_int64(struct spdk_json_write_ctx *w, int64_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRId64, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +int +spdk_json_write_uint64(struct spdk_json_write_ctx *w, uint64_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRIu64, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +static void +write_hex_4(void *dest, uint16_t val) +{ + uint8_t *p = dest; + char hex[] = "0123456789ABCDEF"; + + p[0] = hex[(val >> 12)]; + p[1] = hex[(val >> 8) & 0xF]; + p[2] = hex[(val >> 4) & 0xF]; + p[3] = hex[val & 0xF]; +} + +static inline int +write_codepoint(struct spdk_json_write_ctx *w, uint32_t codepoint) +{ + static const uint8_t escapes[] = { + ['\b'] = 'b', + ['\f'] = 'f', + ['\n'] = 'n', + ['\r'] = 'r', + ['\t'] = 't', + ['"'] = '"', + ['\\'] = '\\', + /* + * Forward slash (/) is intentionally not converted to an escape + * (it is valid unescaped). 
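+ * Control characters that are not listed here are emitted as \uXXXX escapes below.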
+ */ + }; + uint16_t high, low; + char out[13]; + size_t out_len; + + if (codepoint < sizeof(escapes) && escapes[codepoint]) { + out[0] = '\\'; + out[1] = escapes[codepoint]; + out_len = 2; + } else if (codepoint >= 0x20 && codepoint < 0x7F) { + /* + * Encode plain ASCII directly (except 0x7F, since it is really + * a control character, despite the JSON spec not considering it one). + */ + out[0] = (uint8_t)codepoint; + out_len = 1; + } else if (codepoint < 0x10000) { + out[0] = '\\'; + out[1] = 'u'; + write_hex_4(&out[2], (uint16_t)codepoint); + out_len = 6; + } else { + utf16_encode_surrogate_pair(codepoint, &high, &low); + out[0] = '\\'; + out[1] = 'u'; + write_hex_4(&out[2], high); + out[6] = '\\'; + out[7] = 'u'; + write_hex_4(&out[8], low); + out_len = 12; + } + + return emit(w, out, out_len); +} + +static int +write_string_or_name(struct spdk_json_write_ctx *w, const char *val, size_t len) +{ + const uint8_t *p = val; + const uint8_t *end = val + len; + + if (emit(w, "\"", 1)) { return fail(w); } + + while (p != end) { + int codepoint_len; + uint32_t codepoint; + + codepoint_len = utf8_valid(p, end); + switch (codepoint_len) { + case 1: + codepoint = utf8_decode_unsafe_1(p); + break; + case 2: + codepoint = utf8_decode_unsafe_2(p); + break; + case 3: + codepoint = utf8_decode_unsafe_3(p); + break; + case 4: + codepoint = utf8_decode_unsafe_4(p); + break; + default: + return fail(w); + } + + if (write_codepoint(w, codepoint)) { return fail(w); } + p += codepoint_len; + } + + return emit(w, "\"", 1); +} + +static int +write_string_or_name_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val, size_t len) +{ + const uint16_t *p = val; + const uint16_t *end = val + len; + + if (emit(w, "\"", 1)) { return fail(w); } + + while (p != end) { + int codepoint_len; + uint32_t codepoint; + + codepoint_len = utf16le_valid(p, end); + switch (codepoint_len) { + case 1: + codepoint = from_le16(&p[0]); + break; + case 2: + codepoint = utf16_decode_surrogate_pair(from_le16(&p[0]), from_le16(&p[1])); + break; + default: + return fail(w); + } + + if (write_codepoint(w, codepoint)) { return fail(w); } + p += codepoint_len; + } + + return emit(w, "\"", 1); +} + +int +spdk_json_write_string_raw(struct spdk_json_write_ctx *w, const char *val, size_t len) +{ + if (begin_value(w)) { return fail(w); } + return write_string_or_name(w, val, len); +} + +int +spdk_json_write_string(struct spdk_json_write_ctx *w, const char *val) +{ + return spdk_json_write_string_raw(w, val, strlen(val)); +} + +int +spdk_json_write_string_utf16le_raw(struct spdk_json_write_ctx *w, const uint16_t *val, size_t len) +{ + if (begin_value(w)) { return fail(w); } + return write_string_or_name_utf16le(w, val, len); +} + +int +spdk_json_write_string_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val) +{ + const uint16_t *p; + size_t len; + + for (len = 0, p = val; *p; p++) { + len++; + } + + return spdk_json_write_string_utf16le_raw(w, val, len); +} + +int +spdk_json_write_string_fmt(struct spdk_json_write_ctx *w, const char *fmt, ...) 
+{ + va_list args; + int rc; + + va_start(args, fmt); + rc = spdk_json_write_string_fmt_v(w, fmt, args); + va_end(args); + + return rc; +} + +int +spdk_json_write_string_fmt_v(struct spdk_json_write_ctx *w, const char *fmt, va_list args) +{ + char *s; + int rc; + + s = spdk_vsprintf_alloc(fmt, args); + if (s == NULL) { + return -1; + } + + rc = spdk_json_write_string(w, s); + free(s); + return rc; +} + +int +spdk_json_write_array_begin(struct spdk_json_write_ctx *w) +{ + if (begin_value(w)) { return fail(w); } + w->first_value = true; + w->new_indent = true; + w->indent++; + if (emit(w, "[", 1)) { return fail(w); } + return 0; +} + +int +spdk_json_write_array_end(struct spdk_json_write_ctx *w) +{ + w->first_value = false; + if (w->indent == 0) { return fail(w); } + w->indent--; + if (!w->new_indent) { + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + w->new_indent = false; + return emit(w, "]", 1); +} + +int +spdk_json_write_object_begin(struct spdk_json_write_ctx *w) +{ + if (begin_value(w)) { return fail(w); } + w->first_value = true; + w->new_indent = true; + w->indent++; + if (emit(w, "{", 1)) { return fail(w); } + return 0; +} + +int +spdk_json_write_object_end(struct spdk_json_write_ctx *w) +{ + w->first_value = false; + w->indent--; + if (!w->new_indent) { + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + w->new_indent = false; + return emit(w, "}", 1); +} + +int +spdk_json_write_name_raw(struct spdk_json_write_ctx *w, const char *name, size_t len) +{ + /* TODO: check that container is an object */ + if (begin_value(w)) { return fail(w); } + if (write_string_or_name(w, name, len)) { return fail(w); } + w->first_value = true; + if (emit(w, ":", 1)) { return fail(w); } + return emit_fmt(w, " ", 1); +} + +int +spdk_json_write_name(struct spdk_json_write_ctx *w, const char *name) +{ + return spdk_json_write_name_raw(w, name, strlen(name)); +} + +int +spdk_json_write_val(struct spdk_json_write_ctx *w, const struct spdk_json_val *val) +{ + size_t num_values, i; + + switch (val->type) { + case SPDK_JSON_VAL_NUMBER: + return spdk_json_write_val_raw(w, val->start, val->len); + + case SPDK_JSON_VAL_STRING: + return spdk_json_write_string_raw(w, val->start, val->len); + + case SPDK_JSON_VAL_NAME: + return spdk_json_write_name_raw(w, val->start, val->len); + + case SPDK_JSON_VAL_TRUE: + return spdk_json_write_bool(w, true); + + case SPDK_JSON_VAL_FALSE: + return spdk_json_write_bool(w, false); + + case SPDK_JSON_VAL_NULL: + return spdk_json_write_null(w); + + case SPDK_JSON_VAL_ARRAY_BEGIN: + case SPDK_JSON_VAL_OBJECT_BEGIN: + num_values = val[0].len; + + if (val[0].type == SPDK_JSON_VAL_OBJECT_BEGIN) { + if (spdk_json_write_object_begin(w)) { + return fail(w); + } + } else { + if (spdk_json_write_array_begin(w)) { + return fail(w); + } + } + + /* Loop up to and including the _END value */ + for (i = 0; i < num_values + 1;) { + if (spdk_json_write_val(w, &val[i + 1])) { + return fail(w); + } + if (val[i + 1].type == SPDK_JSON_VAL_ARRAY_BEGIN || + val[i + 1].type == SPDK_JSON_VAL_OBJECT_BEGIN) { + i += val[i + 1].len + 2; + } else { + i++; + } + } + return 0; + + case SPDK_JSON_VAL_ARRAY_END: + return spdk_json_write_array_end(w); + + case SPDK_JSON_VAL_OBJECT_END: + return spdk_json_write_object_end(w); + + case SPDK_JSON_VAL_INVALID: + /* Handle INVALID to make the compiler happy (and catch other unhandled types) */ + return fail(w); + } + + return fail(w); +} + +int spdk_json_write_named_null(struct 
spdk_json_write_ctx *w, const char *name) +{ + int rc = spdk_json_write_name(w, name); + return rc ? rc : spdk_json_write_null(w); +} + +int spdk_json_write_named_bool(struct spdk_json_write_ctx *w, const char *name, bool val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_bool(w, val); +} + +int spdk_json_write_named_int32(struct spdk_json_write_ctx *w, const char *name, int32_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_int32(w, val); +} + +int spdk_json_write_named_uint32(struct spdk_json_write_ctx *w, const char *name, uint32_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_uint32(w, val); +} + +int spdk_json_write_named_uint64(struct spdk_json_write_ctx *w, const char *name, uint64_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_uint64(w, val); +} + +int spdk_json_write_named_int64(struct spdk_json_write_ctx *w, const char *name, int64_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_int64(w, val); +} + +int spdk_json_write_named_string(struct spdk_json_write_ctx *w, const char *name, const char *val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_string(w, val); +} + +int spdk_json_write_named_string_fmt(struct spdk_json_write_ctx *w, const char *name, + const char *fmt, ...) +{ + va_list args; + int rc; + + va_start(args, fmt); + rc = spdk_json_write_named_string_fmt_v(w, name, fmt, args); + va_end(args); + + return rc; +} + +int spdk_json_write_named_string_fmt_v(struct spdk_json_write_ctx *w, const char *name, + const char *fmt, va_list args) +{ + char *s; + int rc; + + rc = spdk_json_write_name(w, name); + if (rc) { + return rc; + } + + s = spdk_vsprintf_alloc(fmt, args); + + if (s == NULL) { + return -1; + } + + rc = spdk_json_write_string(w, s); + free(s); + return rc; +} + +int spdk_json_write_named_array_begin(struct spdk_json_write_ctx *w, const char *name) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_array_begin(w); +} + +int spdk_json_write_named_object_begin(struct spdk_json_write_ctx *w, const char *name) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? 
rc : spdk_json_write_object_begin(w); +} diff --git a/src/spdk/lib/json/spdk_json.map b/src/spdk/lib/json/spdk_json.map new file mode 100644 index 000000000..0699feaad --- /dev/null +++ b/src/spdk/lib/json/spdk_json.map @@ -0,0 +1,67 @@ +{ + global: + + # public functions + spdk_json_parse; + spdk_json_decode_object; + spdk_json_decode_array; + spdk_json_decode_bool; + spdk_json_decode_uint16; + spdk_json_decode_int32; + spdk_json_decode_uint32; + spdk_json_decode_uint64; + spdk_json_decode_string; + + spdk_json_val_len; + spdk_json_strequal; + spdk_json_strdup; + + spdk_json_number_to_uint16; + spdk_json_number_to_int32; + spdk_json_number_to_uint32; + spdk_json_number_to_uint64; + + spdk_json_write_begin; + spdk_json_write_end; + spdk_json_write_null; + spdk_json_write_bool; + spdk_json_write_int32; + spdk_json_write_uint32; + spdk_json_write_int64; + spdk_json_write_uint64; + spdk_json_write_string; + spdk_json_write_string_raw; + spdk_json_write_string_utf16le; + spdk_json_write_string_utf16le_raw; + spdk_json_write_string_fmt; + spdk_json_write_string_fmt_v; + spdk_json_write_array_begin; + spdk_json_write_array_end; + spdk_json_write_object_begin; + spdk_json_write_object_end; + spdk_json_write_name; + spdk_json_write_name_raw; + spdk_json_write_val; + spdk_json_write_val_raw; + + spdk_json_write_named_null; + spdk_json_write_named_bool; + spdk_json_write_named_int32; + spdk_json_write_named_uint32; + spdk_json_write_named_uint64; + spdk_json_write_named_int64; + spdk_json_write_named_string; + spdk_json_write_named_string_fmt; + spdk_json_write_named_string_fmt_v; + spdk_json_write_named_array_begin; + spdk_json_write_named_object_begin; + + spdk_json_find; + spdk_json_find_string; + spdk_json_find_array; + spdk_json_object_first; + spdk_json_array_first; + spdk_json_next; + + local: *; +}; diff --git a/src/spdk/lib/jsonrpc/Makefile b/src/spdk/lib/jsonrpc/Makefile new file mode 100644 index 000000000..7eb8dd683 --- /dev/null +++ b/src/spdk/lib/jsonrpc/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +LIBNAME = jsonrpc +C_SRCS = jsonrpc_server.c jsonrpc_server_tcp.c +C_SRCS += jsonrpc_client.c jsonrpc_client_tcp.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_jsonrpc.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/jsonrpc/jsonrpc_client.c b/src/spdk/lib/jsonrpc/jsonrpc_client.c new file mode 100644 index 000000000..e3940a4d4 --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_client.c @@ -0,0 +1,227 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/util.h" +#include "jsonrpc_internal.h" + +static int +capture_version(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (spdk_json_strequal(val, "2.0") != true) { + return SPDK_JSON_PARSE_INVALID; + } + + *vptr = val; + return 0; +} + +static int +capture_id(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NUMBER) { + return -EINVAL; + } + + *vptr = val; + return 0; +} + +static int +capture_any(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + *vptr = val; + return 0; +} + +static const struct spdk_json_object_decoder jsonrpc_response_decoders[] = { + {"jsonrpc", offsetof(struct spdk_jsonrpc_client_response, version), capture_version}, + {"id", offsetof(struct spdk_jsonrpc_client_response, id), capture_id, true}, + {"result", offsetof(struct spdk_jsonrpc_client_response, result), capture_any, true}, + {"error", offsetof(struct spdk_jsonrpc_client_response, error), capture_any, true}, +}; + +int +jsonrpc_parse_response(struct spdk_jsonrpc_client *client) +{ + struct spdk_jsonrpc_client_response_internal *r; + ssize_t rc; + size_t buf_len; + size_t values_cnt; + void *end = NULL; + + + /* Check to see if we have received a full JSON value. */ + rc = spdk_json_parse(client->recv_buf, client->recv_offset, NULL, 0, &end, 0); + if (rc == SPDK_JSON_PARSE_INCOMPLETE) { + return 0; + } + + SPDK_DEBUGLOG(SPDK_LOG_RPC_CLIENT, "JSON string is :\n%s\n", client->recv_buf); + if (rc < 0 || rc > SPDK_JSONRPC_CLIENT_MAX_VALUES) { + SPDK_ERRLOG("JSON parse error (rc: %zd)\n", rc); + /* + * Can't recover from parse error (no guaranteed resync point in streaming JSON). + * Return an error to indicate that the connection should be closed. + */ + return -EINVAL; + } + + values_cnt = rc; + + r = calloc(1, sizeof(*r) + sizeof(struct spdk_json_val) * (values_cnt + 1)); + if (!r) { + return -errno; + } + + if (client->resp) { + free(r); + return -ENOSPC; + } + + client->resp = r; + + r->buf = client->recv_buf; + buf_len = client->recv_offset; + r->values_cnt = values_cnt; + + client->recv_buf_size = 0; + client->recv_offset = 0; + client->recv_buf = NULL; + + /* Decode a second time now that there is a full JSON value available. 
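+ * The first pass above only checked completeness and counted the values; this
+ * pass fills r->values and decodes strings in place inside r->buf, so the
+ * resulting value pointers stay valid until the response is freed.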
*/ + rc = spdk_json_parse(r->buf, buf_len, r->values, values_cnt, &end, + SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); + if (rc != (ssize_t)values_cnt) { + SPDK_ERRLOG("JSON parse error on second pass (rc: %zd, expected: %zu)\n", rc, values_cnt); + goto err; + } + + assert(end != NULL); + + if (r->values[0].type != SPDK_JSON_VAL_OBJECT_BEGIN) { + SPDK_ERRLOG("top-level JSON value was not object\n"); + goto err; + } + + if (spdk_json_decode_object(r->values, jsonrpc_response_decoders, + SPDK_COUNTOF(jsonrpc_response_decoders), &r->jsonrpc)) { + goto err; + } + + r->ready = 1; + return 1; + +err: + client->resp = NULL; + spdk_jsonrpc_client_free_response(&r->jsonrpc); + return -EINVAL; +} + +static int +jsonrpc_client_write_cb(void *cb_ctx, const void *data, size_t size) +{ + struct spdk_jsonrpc_client_request *request = cb_ctx; + size_t new_size = request->send_buf_size; + + while (new_size - request->send_len < size) { + if (new_size >= SPDK_JSONRPC_SEND_BUF_SIZE_MAX) { + SPDK_ERRLOG("Send buf exceeded maximum size (%zu)\n", + (size_t)SPDK_JSONRPC_SEND_BUF_SIZE_MAX); + return -ENOSPC; + } + + new_size *= 2; + } + + if (new_size != request->send_buf_size) { + uint8_t *new_buf; + + new_buf = realloc(request->send_buf, new_size); + if (new_buf == NULL) { + SPDK_ERRLOG("Resizing send_buf failed (current size %zu, new size %zu)\n", + request->send_buf_size, new_size); + return -ENOMEM; + } + + request->send_buf = new_buf; + request->send_buf_size = new_size; + } + + memcpy(request->send_buf + request->send_len, data, size); + request->send_len += size; + + return 0; +} + +struct spdk_json_write_ctx * +spdk_jsonrpc_begin_request(struct spdk_jsonrpc_client_request *request, int32_t id, + const char *method) +{ + struct spdk_json_write_ctx *w; + + w = spdk_json_write_begin(jsonrpc_client_write_cb, request, 0); + if (w == NULL) { + return NULL; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "jsonrpc", "2.0"); + + if (id >= 0) { + spdk_json_write_named_int32(w, "id", id); + } + + if (method) { + spdk_json_write_named_string(w, "method", method); + } + + return w; +} + +void +spdk_jsonrpc_end_request(struct spdk_jsonrpc_client_request *request, struct spdk_json_write_ctx *w) +{ + assert(w != NULL); + + spdk_json_write_object_end(w); + spdk_json_write_end(w); + jsonrpc_client_write_cb(request, "\n", 1); +} + +SPDK_LOG_REGISTER_COMPONENT("rpc_client", SPDK_LOG_RPC_CLIENT) diff --git a/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c b/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c new file mode 100644 index 000000000..512f6261c --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c @@ -0,0 +1,431 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "spdk/string.h" +#include "jsonrpc_internal.h" +#include "spdk/util.h" + +#define RPC_DEFAULT_PORT "5260" + +static int +jsonrpc_client_send_request(struct spdk_jsonrpc_client *client) +{ + ssize_t rc; + struct spdk_jsonrpc_client_request *request = client->request; + + if (!request) { + return 0; + } + + if (request->send_len > 0) { + rc = send(client->sockfd, request->send_buf + request->send_offset, + request->send_len, 0); + if (rc < 0) { + /* For EINTR we pretend that nothing was send. */ + if (errno == EINTR) { + rc = 0; + } else { + rc = -errno; + SPDK_ERRLOG("poll() failed (%d): %s\n", errno, spdk_strerror(errno)); + } + + return rc; + } + + request->send_offset += rc; + request->send_len -= rc; + } + + if (request->send_len == 0) { + client->request = NULL; + spdk_jsonrpc_client_free_request(request); + } + + return 0; +} + +static int +recv_buf_expand(struct spdk_jsonrpc_client *client) +{ + uint8_t *new_buf; + + if (client->recv_buf_size * 2 > SPDK_JSONRPC_SEND_BUF_SIZE_MAX) { + return -ENOSPC; + } + + new_buf = realloc(client->recv_buf, client->recv_buf_size * 2); + if (new_buf == NULL) { + SPDK_ERRLOG("Resizing recv_buf failed (current size %zu, new size %zu)\n", + client->recv_buf_size, client->recv_buf_size * 2); + return -ENOMEM; + } + + client->recv_buf = new_buf; + client->recv_buf_size *= 2; + + return 0; +} + +static int +jsonrpc_client_resp_ready_count(struct spdk_jsonrpc_client *client) +{ + return client->resp != NULL && client->resp->ready ? 1 : 0; +} + +static int +jsonrpc_client_recv(struct spdk_jsonrpc_client *client) +{ + ssize_t rc; + + if (client->recv_buf == NULL) { + client->recv_buf = malloc(SPDK_JSONRPC_SEND_BUF_SIZE_INIT); + if (!client->recv_buf) { + rc = errno; + SPDK_ERRLOG("malloc() failed (%d): %s\n", (int)rc, spdk_strerror(rc)); + return -rc; + } + client->recv_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT; + client->recv_offset = 0; + } else if (client->recv_offset == client->recv_buf_size - 1) { + rc = recv_buf_expand(client); + if (rc) { + return rc; + } + } + + rc = recv(client->sockfd, client->recv_buf + client->recv_offset, + client->recv_buf_size - client->recv_offset - 1, 0); + if (rc < 0) { + /* For EINTR we pretend that nothing was reveived. */ + if (errno == EINTR) { + return 0; + } else { + rc = -errno; + SPDK_ERRLOG("recv() failed (%d): %s\n", errno, spdk_strerror(errno)); + return rc; + } + } else if (rc == 0) { + return -EIO; + } + + client->recv_offset += rc; + client->recv_buf[client->recv_offset] = '\0'; + + /* Check to see if we have received a full JSON value. 
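+ * jsonrpc_parse_response() returns 0 while the buffered data is still an
+ * incomplete JSON value; the caller keeps polling and later recv() calls
+ * append more data to recv_buf.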
*/ + return jsonrpc_parse_response(client); +} + +static int +jsonrpc_client_poll(struct spdk_jsonrpc_client *client, int timeout) +{ + int rc; + struct pollfd pfd = { .fd = client->sockfd, .events = POLLIN | POLLOUT }; + + rc = poll(&pfd, 1, timeout); + if (rc == -1) { + if (errno == EINTR) { + /* For EINTR we pretend that nothing was received nor send. */ + rc = 0; + } else { + rc = -errno; + SPDK_ERRLOG("poll() failed (%d): %s\n", errno, spdk_strerror(errno)); + } + } else if (rc > 0) { + rc = 0; + + if (pfd.revents & POLLOUT) { + rc = jsonrpc_client_send_request(client); + } + + if (rc == 0 && (pfd.revents & POLLIN)) { + rc = jsonrpc_client_recv(client); + /* Incomplete message in buffer isn't an error. */ + if (rc == -EAGAIN) { + rc = 0; + } + } + } + + return rc ? rc : jsonrpc_client_resp_ready_count(client); +} + +static int +jsonrpc_client_poll_connecting(struct spdk_jsonrpc_client *client, int timeout) +{ + socklen_t rc_len; + int rc; + + struct pollfd pfd = { + .fd = client->sockfd, + .events = POLLOUT + }; + + rc = poll(&pfd, 1, timeout); + if (rc == 0) { + return -ENOTCONN; + } else if (rc == -1) { + if (errno != EINTR) { + SPDK_ERRLOG("poll() failed (%d): %s\n", errno, spdk_strerror(errno)); + goto err; + } + + /* We are still not connected. Caller will have to call us again. */ + return -ENOTCONN; + } else if (pfd.revents & ~POLLOUT) { + /* We only poll for POLLOUT */ + goto err; + } else if ((pfd.revents & POLLOUT) == 0) { + /* Is this even possible to get here? */ + return -ENOTCONN; + } + + rc_len = sizeof(int); + /* connection might fail so need to check SO_ERROR. */ + if (getsockopt(client->sockfd, SOL_SOCKET, SO_ERROR, &rc, &rc_len) == -1) { + goto err; + } + + if (rc == 0) { + client->connected = true; + return 0; + } + +err: + return -EIO; +} + +static int +jsonrpc_client_connect(struct spdk_jsonrpc_client *client, int domain, int protocol, + struct sockaddr *server_addr, socklen_t addrlen) +{ + int rc, flags; + + client->sockfd = socket(domain, SOCK_STREAM, protocol); + if (client->sockfd < 0) { + rc = errno; + SPDK_ERRLOG("socket() failed\n"); + return -rc; + } + + flags = fcntl(client->sockfd, F_GETFL); + if (flags < 0 || fcntl(client->sockfd, F_SETFL, flags | O_NONBLOCK) < 0) { + rc = errno; + SPDK_ERRLOG("fcntl(): can't set nonblocking mode for socket (%d): %s\n", + errno, spdk_strerror(errno)); + goto err; + } + + rc = connect(client->sockfd, server_addr, addrlen); + if (rc != 0) { + rc = errno; + if (rc != EINPROGRESS) { + SPDK_ERRLOG("could not connect to JSON-RPC server: %s\n", spdk_strerror(errno)); + goto err; + } + } else { + client->connected = true; + } + + return -rc; +err: + close(client->sockfd); + client->sockfd = -1; + return -rc; +} + +struct spdk_jsonrpc_client * +spdk_jsonrpc_client_connect(const char *addr, int addr_family) +{ + struct spdk_jsonrpc_client *client = calloc(1, sizeof(struct spdk_jsonrpc_client)); + /* Unix Domain Socket */ + struct sockaddr_un addr_un = {}; + char *add_in = NULL; + int rc; + + if (client == NULL) { + SPDK_ERRLOG("%s\n", spdk_strerror(errno)); + return NULL; + } + + if (addr_family == AF_UNIX) { + addr_un.sun_family = AF_UNIX; + rc = snprintf(addr_un.sun_path, sizeof(addr_un.sun_path), "%s", addr); + if (rc < 0 || (size_t)rc >= sizeof(addr_un.sun_path)) { + rc = -EINVAL; + SPDK_ERRLOG("RPC Listen address Unix socket path too long\n"); + goto err; + } + + rc = jsonrpc_client_connect(client, AF_UNIX, 0, (struct sockaddr *)&addr_un, sizeof(addr_un)); + } else { + /* TCP/IP socket */ + struct addrinfo hints; + struct 
addrinfo *res; + char *host, *port; + + add_in = strdup(addr); + if (!add_in) { + rc = -errno; + SPDK_ERRLOG("%s\n", spdk_strerror(errno)); + goto err; + } + + rc = spdk_parse_ip_addr(add_in, &host, &port); + if (rc) { + SPDK_ERRLOG("Invalid listen address '%s'\n", addr); + goto err; + } + + if (port == NULL) { + port = RPC_DEFAULT_PORT; + } + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = IPPROTO_TCP; + + rc = getaddrinfo(host, port, &hints, &res); + if (rc != 0) { + SPDK_ERRLOG("Unable to look up RPC connnect address '%s' (%d): %s\n", addr, rc, gai_strerror(rc)); + rc = -EINVAL; + goto err; + } + + rc = jsonrpc_client_connect(client, res->ai_family, res->ai_protocol, res->ai_addr, + res->ai_addrlen); + freeaddrinfo(res); + } + +err: + if (rc != 0 && rc != -EINPROGRESS) { + free(client); + client = NULL; + errno = -rc; + } + + free(add_in); + return client; +} + +void +spdk_jsonrpc_client_close(struct spdk_jsonrpc_client *client) +{ + if (client->sockfd >= 0) { + close(client->sockfd); + } + + free(client->recv_buf); + if (client->resp) { + spdk_jsonrpc_client_free_response(&client->resp->jsonrpc); + } + + free(client); +} + +struct spdk_jsonrpc_client_request * +spdk_jsonrpc_client_create_request(void) +{ + struct spdk_jsonrpc_client_request *request; + + request = calloc(1, sizeof(*request)); + if (request == NULL) { + return NULL; + } + + /* memory malloc for send-buf */ + request->send_buf = malloc(SPDK_JSONRPC_SEND_BUF_SIZE_INIT); + if (!request->send_buf) { + SPDK_ERRLOG("memory malloc for send-buf failed\n"); + free(request); + return NULL; + } + request->send_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT; + + return request; +} + +void +spdk_jsonrpc_client_free_request(struct spdk_jsonrpc_client_request *req) +{ + free(req->send_buf); + free(req); +} + +int +spdk_jsonrpc_client_poll(struct spdk_jsonrpc_client *client, int timeout) +{ + if (client->connected) { + return jsonrpc_client_poll(client, timeout); + } else { + return jsonrpc_client_poll_connecting(client, timeout); + } +} + +int spdk_jsonrpc_client_send_request(struct spdk_jsonrpc_client *client, + struct spdk_jsonrpc_client_request *req) +{ + if (client->request != NULL) { + return -ENOSPC; + } + + client->request = req; + return 0; +} + +struct spdk_jsonrpc_client_response * +spdk_jsonrpc_client_get_response(struct spdk_jsonrpc_client *client) +{ + struct spdk_jsonrpc_client_response_internal *r; + + r = client->resp; + if (r == NULL || r->ready == false) { + return NULL; + } + + client->resp = NULL; + return &r->jsonrpc; +} + +void +spdk_jsonrpc_client_free_response(struct spdk_jsonrpc_client_response *resp) +{ + struct spdk_jsonrpc_client_response_internal *r; + + if (!resp) { + return; + } + + r = SPDK_CONTAINEROF(resp, struct spdk_jsonrpc_client_response_internal, jsonrpc); + free(r->buf); + free(r); +} diff --git a/src/spdk/lib/jsonrpc/jsonrpc_internal.h b/src/spdk/lib/jsonrpc/jsonrpc_internal.h new file mode 100644 index 000000000..f51bedf62 --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_internal.h @@ -0,0 +1,166 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_JSONRPC_INTERNAL_H_ +#define SPDK_JSONRPC_INTERNAL_H_ + +#include "spdk/stdinc.h" + +#include "spdk/jsonrpc.h" + +#include "spdk_internal/log.h" + +#define SPDK_JSONRPC_RECV_BUF_SIZE (32 * 1024) +#define SPDK_JSONRPC_SEND_BUF_SIZE_INIT (32 * 1024) +#define SPDK_JSONRPC_SEND_BUF_SIZE_MAX (32 * 1024 * 1024) +#define SPDK_JSONRPC_ID_MAX_LEN 128 +#define SPDK_JSONRPC_MAX_CONNS 64 +#define SPDK_JSONRPC_MAX_VALUES 1024 +#define SPDK_JSONRPC_CLIENT_MAX_VALUES 8192 + +struct spdk_jsonrpc_request { + struct spdk_jsonrpc_server_conn *conn; + + /* Copy of request id value */ + const struct spdk_json_val *id; + + /* Total space allocated for send_buf */ + size_t send_buf_size; + + /* Number of bytes used in send_buf (<= send_buf_size) */ + size_t send_len; + + size_t send_offset; + + uint8_t *recv_buffer; + struct spdk_json_val *values; + size_t values_cnt; + + uint8_t *send_buf; + + struct spdk_json_write_ctx *response; + + STAILQ_ENTRY(spdk_jsonrpc_request) link; +}; + +struct spdk_jsonrpc_server_conn { + struct spdk_jsonrpc_server *server; + int sockfd; + bool closed; + size_t recv_len; + uint8_t recv_buf[SPDK_JSONRPC_RECV_BUF_SIZE]; + uint32_t outstanding_requests; + + pthread_spinlock_t queue_lock; + STAILQ_HEAD(, spdk_jsonrpc_request) send_queue; + + struct spdk_jsonrpc_request *send_request; + + spdk_jsonrpc_conn_closed_fn close_cb; + void *close_cb_ctx; + + TAILQ_ENTRY(spdk_jsonrpc_server_conn) link; +}; + +struct spdk_jsonrpc_server { + int sockfd; + spdk_jsonrpc_handle_request_fn handle_request; + + TAILQ_HEAD(, spdk_jsonrpc_server_conn) free_conns; + TAILQ_HEAD(, spdk_jsonrpc_server_conn) conns; + + struct spdk_jsonrpc_server_conn conns_array[SPDK_JSONRPC_MAX_CONNS]; +}; + +struct spdk_jsonrpc_client_request { + /* Total space allocated for send_buf */ + size_t send_buf_size; + + /* Number of bytes used in send_buf (<= send_buf_size) */ + size_t send_len; + + size_t send_offset; + + uint8_t *send_buf; +}; + +struct spdk_jsonrpc_client_response_internal { + struct spdk_jsonrpc_client_response jsonrpc; + bool ready; + uint8_t *buf; + size_t values_cnt; + struct spdk_json_val values[]; +}; + +struct spdk_jsonrpc_client { + int sockfd; + bool connected; + + size_t recv_buf_size; + size_t recv_offset; + char *recv_buf; + + /* Parsed response */ + struct 
spdk_jsonrpc_client_response_internal *resp; + struct spdk_jsonrpc_client_request *request; +}; + +/* jsonrpc_server_tcp */ +void jsonrpc_server_handle_request(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *method, + const struct spdk_json_val *params); +void jsonrpc_server_handle_error(struct spdk_jsonrpc_request *request, int error); + +/* Might be called from any thread */ +void jsonrpc_server_send_response(struct spdk_jsonrpc_request *request); + +/* jsonrpc_server */ +int jsonrpc_parse_request(struct spdk_jsonrpc_server_conn *conn, const void *json, + size_t size); + +/* Must be called only from server poll thread */ +void jsonrpc_free_request(struct spdk_jsonrpc_request *request); + +/* + * Parse JSON data as RPC command response. + * + * \param client structure pointer of jsonrpc client + * + * \return 0 On success. Negative error code in error + * -EAGAIN - If the provided data is not a complete JSON value (SPDK_JSON_PARSE_INCOMPLETE) + * -EINVAL - If the provided data has invalid JSON syntax and can't be parsed (SPDK_JSON_PARSE_INVALID). + * -ENOSPC - No space left to store parsed response. + */ +int jsonrpc_parse_response(struct spdk_jsonrpc_client *client); + +#endif diff --git a/src/spdk/lib/jsonrpc/jsonrpc_server.c b/src/spdk/lib/jsonrpc/jsonrpc_server.c new file mode 100644 index 000000000..774612b25 --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_server.c @@ -0,0 +1,361 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "jsonrpc_internal.h" + +#include "spdk/util.h" + +struct jsonrpc_request { + const struct spdk_json_val *version; + const struct spdk_json_val *method; + const struct spdk_json_val *params; + const struct spdk_json_val *id; +}; + +static int +capture_val(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + *vptr = val; + return 0; +} + +static const struct spdk_json_object_decoder jsonrpc_request_decoders[] = { + {"jsonrpc", offsetof(struct jsonrpc_request, version), capture_val, true}, + {"method", offsetof(struct jsonrpc_request, method), capture_val}, + {"params", offsetof(struct jsonrpc_request, params), capture_val, true}, + {"id", offsetof(struct jsonrpc_request, id), capture_val, true}, +}; + +static void +parse_single_request(struct spdk_jsonrpc_request *request, struct spdk_json_val *values) +{ + struct jsonrpc_request req = {}; + const struct spdk_json_val *params = NULL; + + if (spdk_json_decode_object(values, jsonrpc_request_decoders, + SPDK_COUNTOF(jsonrpc_request_decoders), + &req)) { + goto invalid; + } + + if (req.version && (req.version->type != SPDK_JSON_VAL_STRING || + !spdk_json_strequal(req.version, "2.0"))) { + goto invalid; + } + + if (!req.method || req.method->type != SPDK_JSON_VAL_STRING) { + goto invalid; + } + + if (req.id) { + if (req.id->type == SPDK_JSON_VAL_STRING || + req.id->type == SPDK_JSON_VAL_NUMBER || + req.id->type == SPDK_JSON_VAL_NULL) { + request->id = req.id; + } else { + goto invalid; + } + } + + if (req.params) { + /* null json value is as if there were no parameters */ + if (req.params->type != SPDK_JSON_VAL_NULL) { + if (req.params->type != SPDK_JSON_VAL_ARRAY_BEGIN && + req.params->type != SPDK_JSON_VAL_OBJECT_BEGIN) { + goto invalid; + } + params = req.params; + } + } + + jsonrpc_server_handle_request(request, req.method, params); + return; + +invalid: + jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST); +} + +static int +jsonrpc_server_write_cb(void *cb_ctx, const void *data, size_t size) +{ + struct spdk_jsonrpc_request *request = cb_ctx; + size_t new_size = request->send_buf_size; + + while (new_size - request->send_len < size) { + if (new_size >= SPDK_JSONRPC_SEND_BUF_SIZE_MAX) { + SPDK_ERRLOG("Send buf exceeded maximum size (%zu)\n", + (size_t)SPDK_JSONRPC_SEND_BUF_SIZE_MAX); + return -1; + } + + new_size *= 2; + } + + if (new_size != request->send_buf_size) { + uint8_t *new_buf; + + new_buf = realloc(request->send_buf, new_size); + if (new_buf == NULL) { + SPDK_ERRLOG("Resizing send_buf failed (current size %zu, new size %zu)\n", + request->send_buf_size, new_size); + return -1; + } + + request->send_buf = new_buf; + request->send_buf_size = new_size; + } + + memcpy(request->send_buf + request->send_len, data, size); + request->send_len += size; + + return 0; +} + +int +jsonrpc_parse_request(struct spdk_jsonrpc_server_conn *conn, const void *json, size_t size) +{ + struct spdk_jsonrpc_request *request; + ssize_t rc; + size_t len; + void *end = NULL; + + /* Check to see if we have received a full JSON value. It is safe to cast away const + * as we don't decode in place. 
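+ * Passing a NULL values array means this first pass only validates the data
+ * and counts the values; the in-place decode below runs on a private copy.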
*/ + rc = spdk_json_parse((void *)json, size, NULL, 0, &end, 0); + if (rc == SPDK_JSON_PARSE_INCOMPLETE) { + return 0; + } + + request = calloc(1, sizeof(*request)); + if (request == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "Out of memory allocating request\n"); + return -1; + } + + conn->outstanding_requests++; + + request->conn = conn; + + len = end - json; + request->recv_buffer = malloc(len + 1); + if (request->recv_buffer == NULL) { + SPDK_ERRLOG("Failed to allocate buffer to copy request (%zu bytes)\n", len + 1); + jsonrpc_free_request(request); + return -1; + } + + memcpy(request->recv_buffer, json, len); + request->recv_buffer[len] = '\0'; + + if (rc > 0 && rc <= SPDK_JSONRPC_MAX_VALUES) { + request->values_cnt = rc; + request->values = malloc(request->values_cnt * sizeof(request->values[0])); + if (request->values == NULL) { + SPDK_ERRLOG("Failed to allocate buffer for JSON values (%zu bytes)\n", + request->values_cnt * sizeof(request->values[0])); + jsonrpc_free_request(request); + return -1; + } + } + + request->send_offset = 0; + request->send_len = 0; + request->send_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT; + request->send_buf = malloc(request->send_buf_size); + if (request->send_buf == NULL) { + SPDK_ERRLOG("Failed to allocate send_buf (%zu bytes)\n", request->send_buf_size); + jsonrpc_free_request(request); + return -1; + } + + request->response = spdk_json_write_begin(jsonrpc_server_write_cb, request, 0); + if (request->response == NULL) { + SPDK_ERRLOG("Failed to allocate response JSON write context.\n"); + jsonrpc_free_request(request); + return -1; + } + + if (rc <= 0 || rc > SPDK_JSONRPC_MAX_VALUES) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "JSON parse error\n"); + jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_PARSE_ERROR); + + /* + * Can't recover from parse error (no guaranteed resync point in streaming JSON). + * Return an error to indicate that the connection should be closed. + */ + return -1; + } + + /* Decode a second time now that there is a full JSON value available. 
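+ * This pass targets request->recv_buffer, a private copy of the request, so
+ * strings can be decoded in place without disturbing the connection's receive buffer.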
*/ + rc = spdk_json_parse(request->recv_buffer, size, request->values, request->values_cnt, &end, + SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); + if (rc < 0 || rc > SPDK_JSONRPC_MAX_VALUES) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "JSON parse error on second pass\n"); + jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_PARSE_ERROR); + return -1; + } + + assert(end != NULL); + + if (request->values[0].type == SPDK_JSON_VAL_OBJECT_BEGIN) { + parse_single_request(request, request->values); + } else if (request->values[0].type == SPDK_JSON_VAL_ARRAY_BEGIN) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "Got batch array (not currently supported)\n"); + jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST); + } else { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "top-level JSON value was not array or object\n"); + jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST); + } + + return len; +} + +struct spdk_jsonrpc_server_conn * +spdk_jsonrpc_get_conn(struct spdk_jsonrpc_request *request) +{ + return request->conn; +} + +/* Never return NULL */ +static struct spdk_json_write_ctx * +begin_response(struct spdk_jsonrpc_request *request) +{ + struct spdk_json_write_ctx *w = request->response; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "jsonrpc", "2.0"); + + spdk_json_write_name(w, "id"); + if (request->id) { + spdk_json_write_val(w, request->id); + } else { + spdk_json_write_null(w); + } + + return w; +} + +static void +skip_response(struct spdk_jsonrpc_request *request) +{ + request->send_len = 0; + spdk_json_write_end(request->response); + request->response = NULL; + jsonrpc_server_send_response(request); +} + +static void +end_response(struct spdk_jsonrpc_request *request) +{ + spdk_json_write_object_end(request->response); + spdk_json_write_end(request->response); + request->response = NULL; + + jsonrpc_server_write_cb(request, "\n", 1); + jsonrpc_server_send_response(request); +} + +void +jsonrpc_free_request(struct spdk_jsonrpc_request *request) +{ + if (!request) { + return; + } + + /* We must send or skip response explicitly */ + assert(request->response == NULL); + + request->conn->outstanding_requests--; + free(request->recv_buffer); + free(request->values); + free(request->send_buf); + free(request); +} + +struct spdk_json_write_ctx * +spdk_jsonrpc_begin_result(struct spdk_jsonrpc_request *request) +{ + struct spdk_json_write_ctx *w = begin_response(request); + + spdk_json_write_name(w, "result"); + return w; +} + +void +spdk_jsonrpc_end_result(struct spdk_jsonrpc_request *request, struct spdk_json_write_ctx *w) +{ + assert(w != NULL); + assert(w == request->response); + + /* If there was no ID in request we skip response. */ + if (request->id && request->id->type != SPDK_JSON_VAL_NULL) { + end_response(request); + } else { + skip_response(request); + } +} + +void +spdk_jsonrpc_send_error_response(struct spdk_jsonrpc_request *request, + int error_code, const char *msg) +{ + struct spdk_json_write_ctx *w = begin_response(request); + + spdk_json_write_named_object_begin(w, "error"); + spdk_json_write_named_int32(w, "code", error_code); + spdk_json_write_named_string(w, "message", msg); + spdk_json_write_object_end(w); + + end_response(request); +} + +void +spdk_jsonrpc_send_error_response_fmt(struct spdk_jsonrpc_request *request, + int error_code, const char *fmt, ...) 
+{ + struct spdk_json_write_ctx *w = begin_response(request); + va_list args; + + spdk_json_write_named_object_begin(w, "error"); + spdk_json_write_named_int32(w, "code", error_code); + va_start(args, fmt); + spdk_json_write_named_string_fmt_v(w, "message", fmt, args); + va_end(args); + spdk_json_write_object_end(w); + + end_response(request); +} + +SPDK_LOG_REGISTER_COMPONENT("rpc", SPDK_LOG_RPC) diff --git a/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c b/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c new file mode 100644 index 000000000..1e38f713f --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c @@ -0,0 +1,441 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "jsonrpc_internal.h" +#include "spdk/string.h" +#include "spdk/util.h" + +struct spdk_jsonrpc_server * +spdk_jsonrpc_server_listen(int domain, int protocol, + struct sockaddr *listen_addr, socklen_t addrlen, + spdk_jsonrpc_handle_request_fn handle_request) +{ + struct spdk_jsonrpc_server *server; + int rc, val, flag, i; + + server = calloc(1, sizeof(struct spdk_jsonrpc_server)); + if (server == NULL) { + return NULL; + } + + TAILQ_INIT(&server->free_conns); + TAILQ_INIT(&server->conns); + + for (i = 0; i < SPDK_JSONRPC_MAX_CONNS; i++) { + TAILQ_INSERT_TAIL(&server->free_conns, &server->conns_array[i], link); + } + + server->handle_request = handle_request; + + server->sockfd = socket(domain, SOCK_STREAM, protocol); + if (server->sockfd < 0) { + SPDK_ERRLOG("socket() failed\n"); + free(server); + return NULL; + } + + val = 1; + setsockopt(server->sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)); + + flag = fcntl(server->sockfd, F_GETFL); + if (fcntl(server->sockfd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + server->sockfd, spdk_strerror(errno)); + close(server->sockfd); + free(server); + return NULL; + } + + rc = bind(server->sockfd, listen_addr, addrlen); + if (rc != 0) { + SPDK_ERRLOG("could not bind JSON-RPC server: %s\n", spdk_strerror(errno)); + close(server->sockfd); + free(server); + return NULL; + } + + rc = listen(server->sockfd, 512); + if (rc != 0) { + SPDK_ERRLOG("listen() failed, errno = %d\n", errno); + close(server->sockfd); + free(server); + return NULL; + } + + return server; +} + +static struct spdk_jsonrpc_request * +jsonrpc_server_dequeue_request(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_request *request = NULL; + + pthread_spin_lock(&conn->queue_lock); + request = STAILQ_FIRST(&conn->send_queue); + if (request) { + STAILQ_REMOVE_HEAD(&conn->send_queue, link); + } + pthread_spin_unlock(&conn->queue_lock); + return request; +} + +static void +jsonrpc_server_free_conn_request(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_request *request; + + jsonrpc_free_request(conn->send_request); + conn->send_request = NULL ; + while ((request = jsonrpc_server_dequeue_request(conn)) != NULL) { + jsonrpc_free_request(request); + } +} + +static void +jsonrpc_server_conn_close(struct spdk_jsonrpc_server_conn *conn) +{ + conn->closed = true; + + if (conn->sockfd >= 0) { + jsonrpc_server_free_conn_request(conn); + close(conn->sockfd); + conn->sockfd = -1; + + if (conn->close_cb) { + conn->close_cb(conn, conn->close_cb_ctx); + } + } +} + +void +spdk_jsonrpc_server_shutdown(struct spdk_jsonrpc_server *server) +{ + struct spdk_jsonrpc_server_conn *conn; + + close(server->sockfd); + + TAILQ_FOREACH(conn, &server->conns, link) { + jsonrpc_server_conn_close(conn); + } + + free(server); +} + +static void +jsonrpc_server_conn_remove(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_server *server = conn->server; + + jsonrpc_server_conn_close(conn); + + pthread_spin_destroy(&conn->queue_lock); + assert(STAILQ_EMPTY(&conn->send_queue)); + + TAILQ_REMOVE(&server->conns, conn, link); + TAILQ_INSERT_HEAD(&server->free_conns, conn, link); +} + +int +spdk_jsonrpc_conn_add_close_cb(struct spdk_jsonrpc_server_conn *conn, + spdk_jsonrpc_conn_closed_fn cb, void *ctx) +{ + int rc = 0; + + pthread_spin_lock(&conn->queue_lock); + if (conn->close_cb == NULL) { + conn->close_cb = cb; + conn->close_cb_ctx = ctx; + } else { + rc = conn->close_cb == cb && conn->close_cb_ctx 
== ctx ? -EEXIST : -ENOSPC; + } + pthread_spin_unlock(&conn->queue_lock); + + return rc; +} + +int +spdk_jsonrpc_conn_del_close_cb(struct spdk_jsonrpc_server_conn *conn, + spdk_jsonrpc_conn_closed_fn cb, void *ctx) +{ + int rc = 0; + + pthread_spin_lock(&conn->queue_lock); + if (conn->close_cb == NULL || conn->close_cb != cb || conn->close_cb_ctx != ctx) { + rc = -ENOENT; + } else { + conn->close_cb = NULL; + } + pthread_spin_unlock(&conn->queue_lock); + + return rc; +} + +static int +jsonrpc_server_accept(struct spdk_jsonrpc_server *server) +{ + struct spdk_jsonrpc_server_conn *conn; + int rc, flag; + + rc = accept(server->sockfd, NULL, NULL); + if (rc >= 0) { + conn = TAILQ_FIRST(&server->free_conns); + assert(conn != NULL); + + conn->server = server; + conn->sockfd = rc; + conn->closed = false; + conn->recv_len = 0; + conn->outstanding_requests = 0; + STAILQ_INIT(&conn->send_queue); + conn->send_request = NULL; + + if (pthread_spin_init(&conn->queue_lock, PTHREAD_PROCESS_PRIVATE)) { + SPDK_ERRLOG("Unable to create queue lock for socket: %d", conn->sockfd); + close(conn->sockfd); + return -1; + } + + flag = fcntl(conn->sockfd, F_GETFL); + if (fcntl(conn->sockfd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + conn->sockfd, spdk_strerror(errno)); + close(conn->sockfd); + pthread_spin_destroy(&conn->queue_lock); + return -1; + } + + TAILQ_REMOVE(&server->free_conns, conn, link); + TAILQ_INSERT_TAIL(&server->conns, conn, link); + return 0; + } + + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + return 0; + } + + return -1; +} + +void +jsonrpc_server_handle_request(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *method, const struct spdk_json_val *params) +{ + request->conn->server->handle_request(request, method, params); +} + +void +jsonrpc_server_handle_error(struct spdk_jsonrpc_request *request, int error) +{ + const char *msg; + + switch (error) { + case SPDK_JSONRPC_ERROR_PARSE_ERROR: + msg = "Parse error"; + break; + + case SPDK_JSONRPC_ERROR_INVALID_REQUEST: + msg = "Invalid request"; + break; + + case SPDK_JSONRPC_ERROR_METHOD_NOT_FOUND: + msg = "Method not found"; + break; + + case SPDK_JSONRPC_ERROR_INVALID_PARAMS: + msg = "Invalid parameters"; + break; + + case SPDK_JSONRPC_ERROR_INTERNAL_ERROR: + msg = "Internal error"; + break; + + default: + msg = "Error"; + break; + } + + spdk_jsonrpc_send_error_response(request, error, msg); +} + +static int +jsonrpc_server_conn_recv(struct spdk_jsonrpc_server_conn *conn) +{ + ssize_t rc, offset; + size_t recv_avail = SPDK_JSONRPC_RECV_BUF_SIZE - conn->recv_len; + + rc = recv(conn->sockfd, conn->recv_buf + conn->recv_len, recv_avail, 0); + if (rc == -1) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + return 0; + } + SPDK_DEBUGLOG(SPDK_LOG_RPC, "recv() failed: %s\n", spdk_strerror(errno)); + return -1; + } + + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "remote closed connection\n"); + conn->closed = true; + return 0; + } + + conn->recv_len += rc; + + offset = 0; + do { + rc = jsonrpc_parse_request(conn, conn->recv_buf + offset, conn->recv_len - offset); + if (rc < 0) { + SPDK_ERRLOG("jsonrpc parse request failed\n"); + return -1; + } + + offset += rc; + } while (rc > 0); + + if (offset > 0) { + /* + * Successfully parsed a requests - move any data past the end of the + * parsed requests down to the beginning. 
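+ * Any partial request left in recv_buf will be completed by data from a later recv().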
+ */ + assert((size_t)offset <= conn->recv_len); + memmove(conn->recv_buf, conn->recv_buf + offset, conn->recv_len - offset); + conn->recv_len -= offset; + } + + return 0; +} + +void +jsonrpc_server_send_response(struct spdk_jsonrpc_request *request) +{ + struct spdk_jsonrpc_server_conn *conn = request->conn; + + /* Queue the response to be sent */ + pthread_spin_lock(&conn->queue_lock); + STAILQ_INSERT_TAIL(&conn->send_queue, request, link); + pthread_spin_unlock(&conn->queue_lock); +} + + +static int +jsonrpc_server_conn_send(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_request *request; + ssize_t rc; + +more: + if (conn->outstanding_requests == 0) { + return 0; + } + + if (conn->send_request == NULL) { + conn->send_request = jsonrpc_server_dequeue_request(conn); + } + + request = conn->send_request; + if (request == NULL) { + /* Nothing to send right now */ + return 0; + } + + if (request->send_len > 0) { + rc = send(conn->sockfd, request->send_buf + request->send_offset, + request->send_len, 0); + if (rc < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + return 0; + } + + SPDK_DEBUGLOG(SPDK_LOG_RPC, "send() failed: %s\n", spdk_strerror(errno)); + return -1; + } + + request->send_offset += rc; + request->send_len -= rc; + } + + if (request->send_len == 0) { + /* + * Full response has been sent. + * Free it and set send_request to NULL to move on to the next queued response. + */ + conn->send_request = NULL; + jsonrpc_free_request(request); + goto more; + } + + return 0; +} + +int +spdk_jsonrpc_server_poll(struct spdk_jsonrpc_server *server) +{ + int rc; + struct spdk_jsonrpc_server_conn *conn, *conn_tmp; + + TAILQ_FOREACH_SAFE(conn, &server->conns, link, conn_tmp) { + /* If we can't receive and there are no outstanding requests close the connection. 
*/ + if (conn->closed == true && conn->outstanding_requests == 0) { + jsonrpc_server_conn_close(conn); + } + + if (conn->sockfd == -1 && conn->outstanding_requests == 0) { + jsonrpc_server_conn_remove(conn); + } + } + + /* Check listen socket */ + if (!TAILQ_EMPTY(&server->free_conns)) { + jsonrpc_server_accept(server); + } + + TAILQ_FOREACH(conn, &server->conns, link) { + if (conn->sockfd == -1) { + continue; + } + + rc = jsonrpc_server_conn_send(conn); + if (rc != 0) { + jsonrpc_server_conn_close(conn); + continue; + } + + if (!conn->closed) { + rc = jsonrpc_server_conn_recv(conn); + if (rc != 0) { + jsonrpc_server_conn_close(conn); + } + } + } + + return 0; +} diff --git a/src/spdk/lib/jsonrpc/spdk_jsonrpc.map b/src/spdk/lib/jsonrpc/spdk_jsonrpc.map new file mode 100644 index 000000000..461fd0766 --- /dev/null +++ b/src/spdk/lib/jsonrpc/spdk_jsonrpc.map @@ -0,0 +1,28 @@ +{ + global: + + # public functions + spdk_jsonrpc_server_listen; + spdk_jsonrpc_server_poll; + spdk_jsonrpc_server_shutdown; + spdk_jsonrpc_get_conn; + spdk_jsonrpc_conn_add_close_cb; + spdk_jsonrpc_conn_del_close_cb; + spdk_jsonrpc_begin_result; + spdk_jsonrpc_end_result; + spdk_jsonrpc_send_error_response; + spdk_jsonrpc_send_error_response_fmt; + spdk_jsonrpc_begin_request; + spdk_jsonrpc_end_request; + spdk_jsonrpc_client_connect; + spdk_jsonrpc_client_close; + spdk_jsonrpc_client_create_request; + spdk_jsonrpc_client_free_request; + spdk_jsonrpc_client_send_request; + spdk_jsonrpc_client_poll; + spdk_jsonrpc_client_get_response; + spdk_jsonrpc_client_free_response; + + + local: *; +}; diff --git a/src/spdk/lib/log/Makefile b/src/spdk/lib/log/Makefile new file mode 100644 index 000000000..4e7c25758 --- /dev/null +++ b/src/spdk/lib/log/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 +SO_SUFFIX := $(SO_VER).$(SO_MINOR) + +C_SRCS = log.c log_flags.c +LIBNAME = log + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_log.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/log/log.c b/src/spdk/lib/log/log.c new file mode 100644 index 000000000..0ab50d69c --- /dev/null +++ b/src/spdk/lib/log/log.c @@ -0,0 +1,203 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" + +static const char *const spdk_level_names[] = { + [SPDK_LOG_ERROR] = "ERROR", + [SPDK_LOG_WARN] = "WARNING", + [SPDK_LOG_NOTICE] = "NOTICE", + [SPDK_LOG_INFO] = "INFO", + [SPDK_LOG_DEBUG] = "DEBUG", +}; + +#define MAX_TMPBUF 1024 + +static logfunc *g_log = NULL; + +void +spdk_log_open(logfunc *logf) +{ + if (logf) { + g_log = logf; + } else { + openlog("spdk", LOG_PID, LOG_LOCAL7); + } +} + +void +spdk_log_close(void) +{ + if (!g_log) { + closelog(); + } +} + +static void +get_timestamp_prefix(char *buf, int buf_size) +{ + struct tm *info; + char date[24]; + struct timespec ts; + long usec; + + clock_gettime(CLOCK_REALTIME, &ts); + info = localtime(&ts.tv_sec); + usec = ts.tv_nsec / 1000; + if (info == NULL) { + snprintf(buf, buf_size, "[%s.%06ld] ", "unknown date", usec); + return; + } + + strftime(date, sizeof(date), "%Y-%m-%d %H:%M:%S", info); + snprintf(buf, buf_size, "[%s.%06ld] ", date, usec); +} + +void +spdk_log(enum spdk_log_level level, const char *file, const int line, const char *func, + const char *format, ...) 
+{ + va_list ap; + + va_start(ap, format); + spdk_vlog(level, file, line, func, format, ap); + va_end(ap); +} + +void +spdk_vlog(enum spdk_log_level level, const char *file, const int line, const char *func, + const char *format, va_list ap) +{ + int severity = LOG_INFO; + char buf[MAX_TMPBUF]; + char timestamp[64]; + + if (g_log) { + g_log(level, file, line, func, format, ap); + return; + } + + if (level > g_spdk_log_print_level && level > g_spdk_log_level) { + return; + } + + switch (level) { + case SPDK_LOG_ERROR: + severity = LOG_ERR; + break; + case SPDK_LOG_WARN: + severity = LOG_WARNING; + break; + case SPDK_LOG_NOTICE: + severity = LOG_NOTICE; + break; + case SPDK_LOG_INFO: + case SPDK_LOG_DEBUG: + severity = LOG_INFO; + break; + case SPDK_LOG_DISABLED: + return; + } + + vsnprintf(buf, sizeof(buf), format, ap); + + if (level <= g_spdk_log_print_level) { + get_timestamp_prefix(timestamp, sizeof(timestamp)); + if (file) { + fprintf(stderr, "%s%s:%4d:%s: *%s*: %s", timestamp, file, line, func, spdk_level_names[level], buf); + } else { + fprintf(stderr, "%s%s", timestamp, buf); + } + } + + if (level <= g_spdk_log_level) { + if (file) { + syslog(severity, "%s:%4d:%s: *%s*: %s", file, line, func, spdk_level_names[level], buf); + } else { + syslog(severity, "%s", buf); + } + } +} + +static void +fdump(FILE *fp, const char *label, const uint8_t *buf, size_t len) +{ + char tmpbuf[MAX_TMPBUF]; + char buf16[16 + 1]; + size_t total; + unsigned int idx; + + fprintf(fp, "%s\n", label); + + memset(buf16, 0, sizeof buf16); + total = 0; + for (idx = 0; idx < len; idx++) { + if (idx != 0 && idx % 16 == 0) { + snprintf(tmpbuf + total, sizeof tmpbuf - total, + " %s", buf16); + memset(buf16, 0, sizeof buf16); + fprintf(fp, "%s\n", tmpbuf); + total = 0; + } + if (idx % 16 == 0) { + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + "%08x ", idx); + } + if (idx % 8 == 0) { + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + "%s", " "); + } + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + "%2.2x ", buf[idx] & 0xff); + buf16[idx % 16] = isprint(buf[idx]) ? buf[idx] : '.'; + } + for (; idx % 16 != 0; idx++) { + if (idx == 8) { + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + " "); + } + + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, " "); + } + snprintf(tmpbuf + total, sizeof tmpbuf - total, " %s", buf16); + fprintf(fp, "%s\n", tmpbuf); + fflush(fp); +} + +void +spdk_log_dump(FILE *fp, const char *label, const void *buf, size_t len) +{ + fdump(fp, label, buf, len); +} diff --git a/src/spdk/lib/log/log_flags.c b/src/spdk/lib/log/log_flags.c new file mode 100644 index 000000000..c767a3786 --- /dev/null +++ b/src/spdk/lib/log/log_flags.c @@ -0,0 +1,188 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" + +static TAILQ_HEAD(, spdk_log_flag) g_log_flags = TAILQ_HEAD_INITIALIZER(g_log_flags); + +enum spdk_log_level g_spdk_log_level = SPDK_LOG_NOTICE; +enum spdk_log_level g_spdk_log_print_level = SPDK_LOG_NOTICE; + +SPDK_LOG_REGISTER_COMPONENT("log", SPDK_LOG_LOG) + +#define MAX_TMPBUF 1024 + +void +spdk_log_set_level(enum spdk_log_level level) +{ + assert(level >= SPDK_LOG_DISABLED); + assert(level <= SPDK_LOG_DEBUG); + g_spdk_log_level = level; +} + +enum spdk_log_level +spdk_log_get_level(void) { + return g_spdk_log_level; +} + +void +spdk_log_set_print_level(enum spdk_log_level level) +{ + assert(level >= SPDK_LOG_DISABLED); + assert(level <= SPDK_LOG_DEBUG); + g_spdk_log_print_level = level; +} + +enum spdk_log_level +spdk_log_get_print_level(void) { + return g_spdk_log_print_level; +} + +static struct spdk_log_flag * +get_log_flag(const char *name) +{ + struct spdk_log_flag *flag; + + TAILQ_FOREACH(flag, &g_log_flags, tailq) { + if (strcasecmp(name, flag->name) == 0) { + return flag; + } + } + + return NULL; +} + +void +spdk_log_register_flag(const char *name, struct spdk_log_flag *flag) +{ + struct spdk_log_flag *iter; + + if (name == NULL || flag == NULL) { + SPDK_ERRLOG("missing spdk_log_flag parameters\n"); + assert(false); + return; + } + + if (get_log_flag(name)) { + SPDK_ERRLOG("duplicate spdk_log_flag '%s'\n", name); + assert(false); + return; + } + + TAILQ_FOREACH(iter, &g_log_flags, tailq) { + if (strcasecmp(iter->name, flag->name) > 0) { + TAILQ_INSERT_BEFORE(iter, flag, tailq); + return; + } + } + + TAILQ_INSERT_TAIL(&g_log_flags, flag, tailq); +} + +bool +spdk_log_get_flag(const char *name) +{ + struct spdk_log_flag *flag = get_log_flag(name); + + if (flag && flag->enabled) { + return true; + } + + return false; +} + +static int +log_set_flag(const char *name, bool value) +{ + struct spdk_log_flag *flag; + + if (strcasecmp(name, "all") == 0) { + TAILQ_FOREACH(flag, &g_log_flags, tailq) { + flag->enabled = value; + } + return 0; + } + + flag = get_log_flag(name); + if (flag == NULL) { + return -1; + } + + flag->enabled = value; + + return 0; +} + +int +spdk_log_set_flag(const char *name) +{ + return log_set_flag(name, true); +} + +int +spdk_log_clear_flag(const char *name) +{ + return log_set_flag(name, false); +} + +struct spdk_log_flag * +spdk_log_get_first_flag(void) +{ + return TAILQ_FIRST(&g_log_flags); +} + +struct spdk_log_flag * +spdk_log_get_next_flag(struct spdk_log_flag *flag) +{ + return TAILQ_NEXT(flag, tailq); +} + +void +spdk_log_usage(FILE *f, const char *log_arg) +{ +#ifdef DEBUG + struct spdk_log_flag *flag; + fprintf(f, " %s, --logflag <flag> enable debug log flag (all", log_arg); + + 
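+	/* List every registered debug log flag name in the help output. */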
TAILQ_FOREACH(flag, &g_log_flags, tailq) { + fprintf(f, ", %s", flag->name); + } + + fprintf(f, ")\n"); +#else + fprintf(f, " %s, --logflag <flag> enable debug log flag (not supported" + " - must reconfigure with --enable-debug)\n", log_arg); +#endif +} diff --git a/src/spdk/lib/log/spdk_log.map b/src/spdk/lib/log/spdk_log.map new file mode 100644 index 000000000..84629d555 --- /dev/null +++ b/src/spdk/lib/log/spdk_log.map @@ -0,0 +1,25 @@ +{ + global: + + # public functions + spdk_log_open; + spdk_log_close; + spdk_log_set_level; + spdk_log_get_level; + spdk_log_set_print_level; + spdk_log_get_print_level; + spdk_log; + spdk_vlog; + spdk_log_dump; + spdk_log_get_flag; + spdk_log_set_flag; + spdk_log_clear_flag; + spdk_log_usage; + + # functions used by other SPDK libraries + spdk_log_register_flag; + spdk_log_get_first_flag; + spdk_log_get_next_flag; + + local: *; +}; diff --git a/src/spdk/lib/log_rpc/Makefile b/src/spdk/lib/log_rpc/Makefile new file mode 100644 index 000000000..2c7a78deb --- /dev/null +++ b/src/spdk/lib/log_rpc/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = log_rpc.c +LIBNAME = log_rpc + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_log_rpc.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/log_rpc/log_rpc.c b/src/spdk/lib/log_rpc/log_rpc.c new file mode 100644 index 000000000..78b74c1f5 --- /dev/null +++ b/src/spdk/lib/log_rpc/log_rpc.c @@ -0,0 +1,340 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/rpc.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +struct rpc_log_flag { + char *flag; +}; + +struct rpc_log_level { + char *level; +}; + +static void +free_rpc_log_flag(struct rpc_log_flag *p) +{ + free(p->flag); +} + +static void +free_rpc_log_level(struct rpc_log_level *p) +{ + free(p->level); +} + +static const struct spdk_json_object_decoder rpc_log_flag_decoders[] = { + {"flag", offsetof(struct rpc_log_flag, flag), spdk_json_decode_string}, +}; + +static const struct spdk_json_object_decoder rpc_log_level_decoders[] = { + {"level", offsetof(struct rpc_log_level, level), spdk_json_decode_string}, +}; + +static int +_parse_log_level(char *level) +{ + if (!strcasecmp(level, "ERROR")) { + return SPDK_LOG_ERROR; + } else if (!strcasecmp(level, "WARNING")) { + return SPDK_LOG_WARN; + } else if (!strcasecmp(level, "NOTICE")) { + return SPDK_LOG_NOTICE; + } else if (!strcasecmp(level, "INFO")) { + return SPDK_LOG_INFO; + } else if (!strcasecmp(level, "DEBUG")) { + return SPDK_LOG_DEBUG; + } + return -1; +} + +static const char * +_log_get_level_name(int level) +{ + if (level == SPDK_LOG_ERROR) { + return "ERROR"; + } else if (level == SPDK_LOG_WARN) { + return "WARNING"; + } else if (level == SPDK_LOG_NOTICE) { + return "NOTICE"; + } else if (level == SPDK_LOG_INFO) { + return "INFO"; + } else if (level == SPDK_LOG_DEBUG) { + return "DEBUG"; + } + return NULL; +} + +static void +rpc_log_set_print_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_level req = {}; + int level; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_level_decoders, + SPDK_COUNTOF(rpc_log_level_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + level = _parse_log_level(req.level); + if (level == -1) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "tried to set invalid log level\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "invalid log level"); + goto invalid; + } + + spdk_log_set_print_level(level); + free_rpc_log_level(&req); + + w = spdk_jsonrpc_begin_result(request); + 
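+	/* Report success to the RPC client as a plain JSON boolean. */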
spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_log_level(&req); +} +SPDK_RPC_REGISTER("log_set_print_level", rpc_log_set_print_level, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_set_print_level, set_log_print_level) + +static void +rpc_log_get_print_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + int level; + const char *name; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "log_get_print_level requires no parameters"); + return; + } + + level = spdk_log_get_print_level(); + name = _log_get_level_name(level); + if (name == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "invalid log level"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, name); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("log_get_print_level", rpc_log_get_print_level, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_get_print_level, get_log_print_level) + +static void +rpc_log_set_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_level req = {}; + int level; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_level_decoders, + SPDK_COUNTOF(rpc_log_level_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + level = _parse_log_level(req.level); + if (level == -1) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "tried to set invalid log level\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "invalid log level"); + goto invalid; + } + + + spdk_log_set_level(level); + free_rpc_log_level(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_log_level(&req); +} +SPDK_RPC_REGISTER("log_set_level", rpc_log_set_level, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_set_level, set_log_level) + +static void +rpc_log_get_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + int level; + const char *name; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "log_get_level requires no parameters"); + return; + } + + level = spdk_log_get_level(); + name = _log_get_level_name(level); + if (name == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "invalid log level"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, name); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("log_get_level", rpc_log_get_level, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_get_level, get_log_level) + +static void +rpc_log_set_flag(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_flag req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_flag_decoders, + SPDK_COUNTOF(rpc_log_flag_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n"); + 
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + if (req.flag == 0) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "invalid flag 0\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "invalid flag 0"); + goto invalid; + } + + spdk_log_set_flag(req.flag); + free_rpc_log_flag(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_log_flag(&req); +} +SPDK_RPC_REGISTER("log_set_flag", rpc_log_set_flag, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_set_flag, set_log_flag) + +static void +rpc_log_clear_flag(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_flag req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_flag_decoders, + SPDK_COUNTOF(rpc_log_flag_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + if (req.flag == 0) { + SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "Invalid flag 0\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "invalid flag 0"); + goto invalid; + } + + spdk_log_clear_flag(req.flag); + free_rpc_log_flag(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_log_flag(&req); +} +SPDK_RPC_REGISTER("log_clear_flag", rpc_log_clear_flag, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_clear_flag, clear_log_flag) + +static void +rpc_log_get_flags(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_log_flag *flag; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "log_get_flags requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_object_begin(w); + flag = spdk_log_get_first_flag(); + while (flag) { + spdk_json_write_name(w, flag->name); + spdk_json_write_bool(w, flag->enabled); + flag = spdk_log_get_next_flag(flag); + } + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("log_get_flags", rpc_log_get_flags, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_get_flags, get_log_flags) + +SPDK_LOG_REGISTER_COMPONENT("log_rpc", SPDK_LOG_LOG_RPC) diff --git a/src/spdk/lib/log_rpc/spdk_log_rpc.map b/src/spdk/lib/log_rpc/spdk_log_rpc.map new file mode 100644 index 000000000..8bee6cdd3 --- /dev/null +++ b/src/spdk/lib/log_rpc/spdk_log_rpc.map @@ -0,0 +1,3 @@ +{ + local: *; +}; diff --git a/src/spdk/lib/lvol/Makefile b/src/spdk/lib/lvol/Makefile new file mode 100644 index 000000000..c370a19a5 --- /dev/null +++ b/src/spdk/lib/lvol/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = lvol.c +LIBNAME = lvol + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_lvol.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/lvol/lvol.c b/src/spdk/lib/lvol/lvol.c new file mode 100644 index 000000000..50b42d7b0 --- /dev/null +++ b/src/spdk/lib/lvol/lvol.c @@ -0,0 +1,1509 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk_internal/lvolstore.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/blob_bdev.h" +#include "spdk/util.h" + +/* Default blob channel opts for lvol */ +#define SPDK_LVOL_BLOB_OPTS_CHANNEL_OPS 512 + +#define LVOL_NAME "name" + +SPDK_LOG_REGISTER_COMPONENT("lvol", SPDK_LOG_LVOL) + +static TAILQ_HEAD(, spdk_lvol_store) g_lvol_stores = TAILQ_HEAD_INITIALIZER(g_lvol_stores); +static pthread_mutex_t g_lvol_stores_mutex = PTHREAD_MUTEX_INITIALIZER; + +static int +add_lvs_to_list(struct spdk_lvol_store *lvs) +{ + struct spdk_lvol_store *tmp; + bool name_conflict = false; + + pthread_mutex_lock(&g_lvol_stores_mutex); + TAILQ_FOREACH(tmp, &g_lvol_stores, link) { + if (!strncmp(lvs->name, tmp->name, SPDK_LVS_NAME_MAX)) { + name_conflict = true; + break; + } + } + if (!name_conflict) { + lvs->on_list = true; + TAILQ_INSERT_TAIL(&g_lvol_stores, lvs, link); + } + pthread_mutex_unlock(&g_lvol_stores_mutex); + + return name_conflict ? -1 : 0; +} + +static void +lvs_free(struct spdk_lvol_store *lvs) +{ + pthread_mutex_lock(&g_lvol_stores_mutex); + if (lvs->on_list) { + TAILQ_REMOVE(&g_lvol_stores, lvs, link); + } + pthread_mutex_unlock(&g_lvol_stores_mutex); + + free(lvs); +} + +static void +lvol_free(struct spdk_lvol *lvol) +{ + free(lvol); +} + +static void +lvol_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Failed to open lvol %s\n", lvol->unique_id); + goto end; + } + + lvol->ref_count++; + lvol->blob = blob; +end: + req->cb_fn(req->cb_arg, lvol, lvolerrno); + free(req); +} + +void +spdk_lvol_open(struct spdk_lvol *lvol, spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + struct spdk_blob_open_opts opts; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, NULL, -ENODEV); + return; + } + + if (lvol->action_in_progress == true) { + SPDK_ERRLOG("Cannot open lvol - operations on lvol pending\n"); + cb_fn(cb_arg, lvol, -EBUSY); + return; + } + + if (lvol->ref_count > 0) { + lvol->ref_count++; + cb_fn(cb_arg, lvol, 0); + return; + } + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + SPDK_ERRLOG("Cannot alloc memory for request structure\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_blob_open_opts_init(&opts); + opts.clear_method = lvol->clear_method; + + spdk_bs_open_blob_ext(lvol->lvol_store->blobstore, lvol->blob_id, &opts, lvol_open_cb, req); +} + +static void +bs_unload_with_error_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + + req->cb_fn(req->cb_arg, NULL, req->lvserrno); + free(req); +} + +static void +load_next_lvol(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + struct spdk_lvol *lvol, *tmp; + spdk_blob_id blob_id; + const char *attr; + size_t value_len; + int rc; + + if (lvolerrno == -ENOENT) { + /* Finished iterating */ + req->cb_fn(req->cb_arg, lvs, 0); + free(req); + return; + } else if (lvolerrno < 0) { + SPDK_ERRLOG("Failed to fetch blobs list\n"); + req->lvserrno = lvolerrno; + goto invalid; + } + + blob_id = spdk_blob_get_id(blob); + + if 
(blob_id == lvs->super_blob_id) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "found superblob %"PRIu64"\n", (uint64_t)blob_id); + spdk_bs_iter_next(bs, blob, load_next_lvol, req); + return; + } + + lvol = calloc(1, sizeof(*lvol)); + if (!lvol) { + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + req->lvserrno = -ENOMEM; + goto invalid; + } + + lvol->blob = blob; + lvol->blob_id = blob_id; + lvol->lvol_store = lvs; + lvol->thin_provision = spdk_blob_is_thin_provisioned(blob); + + rc = spdk_blob_get_xattr_value(blob, "uuid", (const void **)&attr, &value_len); + if (rc != 0 || value_len != SPDK_UUID_STRING_LEN || attr[SPDK_UUID_STRING_LEN - 1] != '\0' || + spdk_uuid_parse(&lvol->uuid, attr) != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Missing or corrupt lvol uuid\n"); + memset(&lvol->uuid, 0, sizeof(lvol->uuid)); + } + spdk_uuid_fmt_lower(lvol->uuid_str, sizeof(lvol->uuid_str), &lvol->uuid); + + if (!spdk_mem_all_zero(&lvol->uuid, sizeof(lvol->uuid))) { + snprintf(lvol->unique_id, sizeof(lvol->unique_id), "%s", lvol->uuid_str); + } else { + spdk_uuid_fmt_lower(lvol->unique_id, sizeof(lvol->unique_id), &lvol->lvol_store->uuid); + value_len = strlen(lvol->unique_id); + snprintf(lvol->unique_id + value_len, sizeof(lvol->unique_id) - value_len, "_%"PRIu64, + (uint64_t)blob_id); + } + + rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&attr, &value_len); + if (rc != 0 || value_len > SPDK_LVOL_NAME_MAX) { + SPDK_ERRLOG("Cannot assign lvol name\n"); + lvol_free(lvol); + req->lvserrno = -EINVAL; + goto invalid; + } + + snprintf(lvol->name, sizeof(lvol->name), "%s", attr); + + TAILQ_INSERT_TAIL(&lvs->lvols, lvol, link); + + lvs->lvol_count++; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "added lvol %s (%s)\n", lvol->unique_id, lvol->uuid_str); + + spdk_bs_iter_next(bs, blob, load_next_lvol, req); + + return; + +invalid: + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + TAILQ_REMOVE(&lvs->lvols, lvol, link); + free(lvol); + } + + lvs_free(lvs); + spdk_bs_unload(bs, bs_unload_with_error_cb, req); +} + +static void +close_super_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Could not close super blob\n"); + lvs_free(lvs); + req->lvserrno = -ENODEV; + spdk_bs_unload(bs, bs_unload_with_error_cb, req); + return; + } + + /* Start loading lvols */ + spdk_bs_iter_first(lvs->blobstore, load_next_lvol, req); +} + +static void +close_super_blob_with_error_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + + lvs_free(lvs); + + spdk_bs_unload(bs, bs_unload_with_error_cb, req); +} + +static void +lvs_read_uuid(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + const char *attr; + size_t value_len; + int rc; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Could not open super blob\n"); + lvs_free(lvs); + req->lvserrno = -ENODEV; + spdk_bs_unload(bs, bs_unload_with_error_cb, req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "uuid", (const void **)&attr, &value_len); + if (rc != 0 || value_len != SPDK_UUID_STRING_LEN || 
attr[SPDK_UUID_STRING_LEN - 1] != '\0') { + SPDK_INFOLOG(SPDK_LOG_LVOL, "missing or incorrect UUID\n"); + req->lvserrno = -EINVAL; + spdk_blob_close(blob, close_super_blob_with_error_cb, req); + return; + } + + if (spdk_uuid_parse(&lvs->uuid, attr)) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "incorrect UUID '%s'\n", attr); + req->lvserrno = -EINVAL; + spdk_blob_close(blob, close_super_blob_with_error_cb, req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&attr, &value_len); + if (rc != 0 || value_len > SPDK_LVS_NAME_MAX) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "missing or invalid name\n"); + req->lvserrno = -EINVAL; + spdk_blob_close(blob, close_super_blob_with_error_cb, req); + return; + } + + snprintf(lvs->name, sizeof(lvs->name), "%s", attr); + + rc = add_lvs_to_list(lvs); + if (rc) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "lvolstore with name %s already exists\n", lvs->name); + req->lvserrno = -EEXIST; + spdk_blob_close(blob, close_super_blob_with_error_cb, req); + return; + } + + lvs->super_blob_id = spdk_blob_get_id(blob); + + spdk_blob_close(blob, close_super_cb, req); +} + +static void +lvs_open_super(void *cb_arg, spdk_blob_id blobid, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Super blob not found\n"); + lvs_free(lvs); + req->lvserrno = -ENODEV; + spdk_bs_unload(bs, bs_unload_with_error_cb, req); + return; + } + + spdk_bs_open_blob(bs, blobid, lvs_read_uuid, req); +} + +static void +lvs_load_cb(void *cb_arg, struct spdk_blob_store *bs, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs; + + if (lvolerrno != 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + free(req); + return; + } + + lvs = calloc(1, sizeof(*lvs)); + if (lvs == NULL) { + SPDK_ERRLOG("Cannot alloc memory for lvol store\n"); + spdk_bs_unload(bs, bs_unload_with_error_cb, req); + return; + } + + lvs->blobstore = bs; + lvs->bs_dev = req->bs_dev; + TAILQ_INIT(&lvs->lvols); + TAILQ_INIT(&lvs->pending_lvols); + + req->lvol_store = lvs; + + spdk_bs_get_super(bs, lvs_open_super, req); +} + +static void +lvs_bs_opts_init(struct spdk_bs_opts *opts) +{ + spdk_bs_opts_init(opts); + opts->max_channel_ops = SPDK_LVOL_BLOB_OPTS_CHANNEL_OPS; +} + +void +spdk_lvs_load(struct spdk_bs_dev *bs_dev, spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvs_with_handle_req *req; + struct spdk_bs_opts opts = {}; + + assert(cb_fn != NULL); + + if (bs_dev == NULL) { + SPDK_ERRLOG("Blobstore device does not exist\n"); + cb_fn(cb_arg, NULL, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + SPDK_ERRLOG("Cannot alloc memory for request structure\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->bs_dev = bs_dev; + + lvs_bs_opts_init(&opts); + snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), "LVOLSTORE"); + + spdk_bs_load(bs_dev, &opts, lvs_load_cb, req); +} + +static void +remove_bs_on_error_cb(void *cb_arg, int bserrno) +{ +} + +static void +super_create_close_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + + if (lvolerrno < 0) { + SPDK_ERRLOG("Lvol store init failed: could not close super blob\n"); + req->cb_fn(req->cb_arg, NULL, lvolerrno); + 
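+		/* Tear down the partially initialized blobstore; remove_bs_on_error_cb ignores the result. */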
spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL); + lvs_free(lvs); + free(req); + return; + } + + req->cb_fn(req->cb_arg, lvs, lvolerrno); + free(req); +} + +static void +super_blob_set_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob *blob = lvs->super_blob; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not set uuid for super blob\n"); + spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL); + lvs_free(lvs); + free(req); + return; + } + + spdk_blob_close(blob, super_create_close_cb, req); +} + +static void +super_blob_init_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob *blob = lvs->super_blob; + char uuid[SPDK_UUID_STRING_LEN]; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not set super blob\n"); + spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL); + lvs_free(lvs); + free(req); + return; + } + + spdk_uuid_fmt_lower(uuid, sizeof(uuid), &lvs->uuid); + + spdk_blob_set_xattr(blob, "uuid", uuid, sizeof(uuid)); + spdk_blob_set_xattr(blob, "name", lvs->name, strnlen(lvs->name, SPDK_LVS_NAME_MAX) + 1); + spdk_blob_sync_md(blob, super_blob_set_cb, req); +} + +static void +super_blob_create_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not open super blob\n"); + spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL); + lvs_free(lvs); + free(req); + return; + } + + lvs->super_blob = blob; + lvs->super_blob_id = spdk_blob_get_id(blob); + + spdk_bs_set_super(lvs->blobstore, lvs->super_blob_id, super_blob_init_cb, req); +} + +static void +super_blob_create_cb(void *cb_arg, spdk_blob_id blobid, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not create super blob\n"); + spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL); + lvs_free(lvs); + free(req); + return; + } + + bs = req->lvol_store->blobstore; + + spdk_bs_open_blob(bs, blobid, super_blob_create_open_cb, req); +} + +static void +lvs_init_cb(void *cb_arg, struct spdk_blob_store *bs, int lvserrno) +{ + struct spdk_lvs_with_handle_req *lvs_req = cb_arg; + struct spdk_lvol_store *lvs = lvs_req->lvol_store; + + if (lvserrno != 0) { + assert(bs == NULL); + lvs_req->cb_fn(lvs_req->cb_arg, NULL, lvserrno); + SPDK_ERRLOG("Lvol store init failed: could not initialize blobstore\n"); + lvs_free(lvs); + free(lvs_req); + return; + } + + assert(bs != NULL); + lvs->blobstore = bs; + TAILQ_INIT(&lvs->lvols); + TAILQ_INIT(&lvs->pending_lvols); + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store initialized\n"); + + /* create super blob */ + spdk_bs_create_blob(lvs->blobstore, super_blob_create_cb, lvs_req); +} + +void +spdk_lvs_opts_init(struct spdk_lvs_opts *o) +{ + o->cluster_sz = SPDK_LVS_OPTS_CLUSTER_SZ; + o->clear_method = LVS_CLEAR_WITH_UNMAP; + memset(o->name, 0, sizeof(o->name)); +} + +static void +setup_lvs_opts(struct spdk_bs_opts *bs_opts, struct 
spdk_lvs_opts *o) +{ + assert(o != NULL); + lvs_bs_opts_init(bs_opts); + bs_opts->cluster_sz = o->cluster_sz; + bs_opts->clear_method = (enum bs_clear_method)o->clear_method; +} + +int +spdk_lvs_init(struct spdk_bs_dev *bs_dev, struct spdk_lvs_opts *o, + spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_store *lvs; + struct spdk_lvs_with_handle_req *lvs_req; + struct spdk_bs_opts opts = {}; + int rc; + + if (bs_dev == NULL) { + SPDK_ERRLOG("Blobstore device does not exist\n"); + return -ENODEV; + } + + if (o == NULL) { + SPDK_ERRLOG("spdk_lvs_opts not specified\n"); + return -EINVAL; + } + + setup_lvs_opts(&opts, o); + + if (strnlen(o->name, SPDK_LVS_NAME_MAX) == SPDK_LVS_NAME_MAX) { + SPDK_ERRLOG("Name has no null terminator.\n"); + return -EINVAL; + } + + if (strnlen(o->name, SPDK_LVS_NAME_MAX) == 0) { + SPDK_ERRLOG("No name specified.\n"); + return -EINVAL; + } + + lvs = calloc(1, sizeof(*lvs)); + if (!lvs) { + SPDK_ERRLOG("Cannot alloc memory for lvol store base pointer\n"); + return -ENOMEM; + } + + spdk_uuid_generate(&lvs->uuid); + snprintf(lvs->name, sizeof(lvs->name), "%s", o->name); + + rc = add_lvs_to_list(lvs); + if (rc) { + SPDK_ERRLOG("lvolstore with name %s already exists\n", lvs->name); + lvs_free(lvs); + return -EEXIST; + } + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + lvs_free(lvs); + SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n"); + return -ENOMEM; + } + + assert(cb_fn != NULL); + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + lvs_req->lvol_store = lvs; + lvs->bs_dev = bs_dev; + lvs->destruct = false; + + snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), "LVOLSTORE"); + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Initializing lvol store\n"); + spdk_bs_init(bs_dev, &opts, lvs_init_cb, lvs_req); + + return 0; +} + +static void +lvs_rename_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_req *req = cb_arg; + + if (lvolerrno != 0) { + req->lvserrno = lvolerrno; + } + if (req->lvserrno != 0) { + SPDK_ERRLOG("Lvol store rename operation failed\n"); + /* Lvs renaming failed, so we should 'clear' new_name. + * Otherwise it could cause a failure on the next attepmt to change the name to 'new_name' */ + snprintf(req->lvol_store->new_name, + sizeof(req->lvol_store->new_name), + "%s", req->lvol_store->name); + } else { + /* Update lvs name with new_name */ + snprintf(req->lvol_store->name, + sizeof(req->lvol_store->name), + "%s", req->lvol_store->new_name); + } + + req->cb_fn(req->cb_arg, req->lvserrno); + free(req); +} + +static void +lvs_rename_sync_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_req *req = cb_arg; + struct spdk_blob *blob = req->lvol_store->super_blob; + + if (lvolerrno < 0) { + req->lvserrno = lvolerrno; + } + + spdk_blob_close(blob, lvs_rename_cb, req); +} + +static void +lvs_rename_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_req *req = cb_arg; + int rc; + + if (lvolerrno < 0) { + lvs_rename_cb(cb_arg, lvolerrno); + return; + } + + rc = spdk_blob_set_xattr(blob, "name", req->lvol_store->new_name, + strlen(req->lvol_store->new_name) + 1); + if (rc < 0) { + req->lvserrno = rc; + lvs_rename_sync_cb(req, rc); + return; + } + + req->lvol_store->super_blob = blob; + + spdk_blob_sync_md(blob, lvs_rename_sync_cb, req); +} + +void +spdk_lvs_rename(struct spdk_lvol_store *lvs, const char *new_name, + spdk_lvs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvs_req *req; + struct spdk_lvol_store *tmp; + + /* Check if new name is current lvs name. 
+ * If so, return success immediately */ + if (strncmp(lvs->name, new_name, SPDK_LVS_NAME_MAX) == 0) { + cb_fn(cb_arg, 0); + return; + } + + /* Check if new or new_name is already used in other lvs */ + pthread_mutex_lock(&g_lvol_stores_mutex); + TAILQ_FOREACH(tmp, &g_lvol_stores, link) { + if (!strncmp(new_name, tmp->name, SPDK_LVS_NAME_MAX) || + !strncmp(new_name, tmp->new_name, SPDK_LVS_NAME_MAX)) { + pthread_mutex_unlock(&g_lvol_stores_mutex); + cb_fn(cb_arg, -EEXIST); + return; + } + } + pthread_mutex_unlock(&g_lvol_stores_mutex); + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + snprintf(lvs->new_name, sizeof(lvs->new_name), "%s", new_name); + req->lvol_store = lvs; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_bs_open_blob(lvs->blobstore, lvs->super_blob_id, lvs_rename_open_cb, req); +} + +static void +_lvs_unload_cb(void *cb_arg, int lvserrno) +{ + struct spdk_lvs_req *lvs_req = cb_arg; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store unloaded\n"); + assert(lvs_req->cb_fn != NULL); + lvs_req->cb_fn(lvs_req->cb_arg, lvserrno); + free(lvs_req); +} + +int +spdk_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, + void *cb_arg) +{ + struct spdk_lvs_req *lvs_req; + struct spdk_lvol *lvol, *tmp; + + if (lvs == NULL) { + SPDK_ERRLOG("Lvol store is NULL\n"); + return -ENODEV; + } + + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + if (lvol->action_in_progress == true) { + SPDK_ERRLOG("Cannot unload lvol store - operations on lvols pending\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } else if (lvol->ref_count != 0) { + SPDK_ERRLOG("Lvols still open on lvol store\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } + } + + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + TAILQ_REMOVE(&lvs->lvols, lvol, link); + lvol_free(lvol); + } + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n"); + return -ENOMEM; + } + + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Unloading lvol store\n"); + spdk_bs_unload(lvs->blobstore, _lvs_unload_cb, lvs_req); + lvs_free(lvs); + + return 0; +} + +static void +_lvs_destroy_cb(void *cb_arg, int lvserrno) +{ + struct spdk_lvs_destroy_req *lvs_req = cb_arg; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store destroyed\n"); + assert(lvs_req->cb_fn != NULL); + lvs_req->cb_fn(lvs_req->cb_arg, lvserrno); + free(lvs_req); +} + +static void +_lvs_destroy_super_cb(void *cb_arg, int bserrno) +{ + struct spdk_lvs_destroy_req *lvs_req = cb_arg; + struct spdk_lvol_store *lvs = lvs_req->lvs; + + assert(lvs != NULL); + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Destroying lvol store\n"); + spdk_bs_destroy(lvs->blobstore, _lvs_destroy_cb, lvs_req); + lvs_free(lvs); +} + +int +spdk_lvs_destroy(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, + void *cb_arg) +{ + struct spdk_lvs_destroy_req *lvs_req; + struct spdk_lvol *iter_lvol, *tmp; + + if (lvs == NULL) { + SPDK_ERRLOG("Lvol store is NULL\n"); + return -ENODEV; + } + + TAILQ_FOREACH_SAFE(iter_lvol, &lvs->lvols, link, tmp) { + if (iter_lvol->action_in_progress == true) { + SPDK_ERRLOG("Cannot destroy lvol store - operations on lvols pending\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } else if (iter_lvol->ref_count != 0) { + SPDK_ERRLOG("Lvols still open on lvol store\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } + } + + TAILQ_FOREACH_SAFE(iter_lvol, &lvs->lvols, link, tmp) { + 
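+		/* The checks above guarantee no lvol is open or busy, so just release the memory. */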
free(iter_lvol); + } + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n"); + return -ENOMEM; + } + + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + lvs_req->lvs = lvs; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Deleting super blob\n"); + spdk_bs_delete_blob(lvs->blobstore, lvs->super_blob_id, _lvs_destroy_super_cb, lvs_req); + + return 0; +} + +static void +lvol_close_blob_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (lvolerrno < 0) { + SPDK_ERRLOG("Could not close blob on lvol\n"); + lvol_free(lvol); + goto end; + } + + lvol->ref_count--; + lvol->action_in_progress = false; + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol %s closed\n", lvol->unique_id); + +end: + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +bool +spdk_lvol_deletable(struct spdk_lvol *lvol) +{ + size_t count = 0; + + spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count); + return (count == 0); +} + +static void +lvol_delete_blob_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (lvolerrno < 0) { + SPDK_ERRLOG("Could not remove blob on lvol gracefully - forced removal\n"); + } else { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol %s deleted\n", lvol->unique_id); + } + + TAILQ_REMOVE(&lvol->lvol_store->lvols, lvol, link); + lvol_free(lvol); + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +static void +lvol_create_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + TAILQ_REMOVE(&req->lvol->lvol_store->pending_lvols, req->lvol, link); + + if (lvolerrno < 0) { + free(lvol); + req->cb_fn(req->cb_arg, NULL, lvolerrno); + free(req); + return; + } + + lvol->blob = blob; + lvol->blob_id = spdk_blob_get_id(blob); + + TAILQ_INSERT_TAIL(&lvol->lvol_store->lvols, lvol, link); + + snprintf(lvol->unique_id, sizeof(lvol->unique_id), "%s", lvol->uuid_str); + lvol->ref_count++; + + assert(req->cb_fn != NULL); + req->cb_fn(req->cb_arg, req->lvol, lvolerrno); + free(req); +} + +static void +lvol_create_cb(void *cb_arg, spdk_blob_id blobid, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + struct spdk_blob_store *bs; + struct spdk_blob_open_opts opts; + + if (lvolerrno < 0) { + TAILQ_REMOVE(&req->lvol->lvol_store->pending_lvols, req->lvol, link); + free(req->lvol); + assert(req->cb_fn != NULL); + req->cb_fn(req->cb_arg, NULL, lvolerrno); + free(req); + return; + } + + spdk_blob_open_opts_init(&opts); + opts.clear_method = req->lvol->clear_method; + bs = req->lvol->lvol_store->blobstore; + + spdk_bs_open_blob_ext(bs, blobid, &opts, lvol_create_open_cb, req); +} + +static void +lvol_get_xattr_value(void *xattr_ctx, const char *name, + const void **value, size_t *value_len) +{ + struct spdk_lvol *lvol = xattr_ctx; + + if (!strcmp(LVOL_NAME, name)) { + *value = lvol->name; + *value_len = SPDK_LVOL_NAME_MAX; + } else if (!strcmp("uuid", name)) { + *value = lvol->uuid_str; + *value_len = sizeof(lvol->uuid_str); + } +} + +static int +lvs_verify_lvol_name(struct spdk_lvol_store *lvs, const char *name) +{ + struct spdk_lvol *tmp; + + if (name == NULL || strnlen(name, SPDK_LVOL_NAME_MAX) == 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "lvol name not provided.\n"); + return -EINVAL; + } + + if (strnlen(name, SPDK_LVOL_NAME_MAX) == SPDK_LVOL_NAME_MAX) { + SPDK_ERRLOG("Name has no null terminator.\n"); + return 
-EINVAL; + } + + TAILQ_FOREACH(tmp, &lvs->lvols, link) { + if (!strncmp(name, tmp->name, SPDK_LVOL_NAME_MAX)) { + SPDK_ERRLOG("lvol with name %s already exists\n", name); + return -EEXIST; + } + } + + TAILQ_FOREACH(tmp, &lvs->pending_lvols, link) { + if (!strncmp(name, tmp->name, SPDK_LVOL_NAME_MAX)) { + SPDK_ERRLOG("lvol with name %s is being already created\n", name); + return -EEXIST; + } + } + + return 0; +} + +int +spdk_lvol_create(struct spdk_lvol_store *lvs, const char *name, uint64_t sz, + bool thin_provision, enum lvol_clear_method clear_method, spdk_lvol_op_with_handle_complete cb_fn, + void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + struct spdk_blob_store *bs; + struct spdk_lvol *lvol; + struct spdk_blob_opts opts; + uint64_t num_clusters; + char *xattr_names[] = {LVOL_NAME, "uuid"}; + int rc; + + if (lvs == NULL) { + SPDK_ERRLOG("lvol store does not exist\n"); + return -EINVAL; + } + + rc = lvs_verify_lvol_name(lvs, name); + if (rc < 0) { + return rc; + } + + bs = lvs->blobstore; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + return -ENOMEM; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + lvol = calloc(1, sizeof(*lvol)); + if (!lvol) { + free(req); + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + return -ENOMEM; + } + lvol->lvol_store = lvs; + num_clusters = spdk_divide_round_up(sz, spdk_bs_get_cluster_size(bs)); + lvol->thin_provision = thin_provision; + lvol->clear_method = (enum blob_clear_method)clear_method; + snprintf(lvol->name, sizeof(lvol->name), "%s", name); + TAILQ_INSERT_TAIL(&lvol->lvol_store->pending_lvols, lvol, link); + spdk_uuid_generate(&lvol->uuid); + spdk_uuid_fmt_lower(lvol->uuid_str, sizeof(lvol->uuid_str), &lvol->uuid); + req->lvol = lvol; + + spdk_blob_opts_init(&opts); + opts.thin_provision = thin_provision; + opts.num_clusters = num_clusters; + opts.clear_method = lvol->clear_method; + opts.xattrs.count = SPDK_COUNTOF(xattr_names); + opts.xattrs.names = xattr_names; + opts.xattrs.ctx = lvol; + opts.xattrs.get_value = lvol_get_xattr_value; + + spdk_bs_create_blob_ext(lvs->blobstore, &opts, lvol_create_cb, req); + + return 0; +} + +void +spdk_lvol_create_snapshot(struct spdk_lvol *origlvol, const char *snapshot_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_store *lvs; + struct spdk_lvol *newlvol; + struct spdk_blob *origblob; + struct spdk_lvol_with_handle_req *req; + struct spdk_blob_xattr_opts snapshot_xattrs; + char *xattr_names[] = {LVOL_NAME, "uuid"}; + int rc; + + if (origlvol == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol not provided.\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + origblob = origlvol->blob; + lvs = origlvol->lvol_store; + if (lvs == NULL) { + SPDK_ERRLOG("lvol store does not exist\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = lvs_verify_lvol_name(lvs, snapshot_name); + if (rc < 0) { + cb_fn(cb_arg, NULL, rc); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol = calloc(1, sizeof(*newlvol)); + if (!newlvol) { + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + free(req); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol->lvol_store = origlvol->lvol_store; + snprintf(newlvol->name, sizeof(newlvol->name), "%s", snapshot_name); + TAILQ_INSERT_TAIL(&newlvol->lvol_store->pending_lvols, newlvol, link); + 
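/* The new lvol stays on the pending_lvols list until the blob is opened and
 * lvol_create_open_cb() moves it onto lvs->lvols (or frees it on error), which
 * is what lets lvs_verify_lvol_name() reject names that are still being created.
 *
 * Illustrative call into the creation path above (callback name and size are
 * hypothetical; the clear method value is assumed from spdk/lvol.h):
 *
 *   static void app_lvol_done(void *ctx, struct spdk_lvol *lvol, int rc);
 *
 *   spdk_lvol_create(lvs, "lvol0", 10 * 1024 * 1024, true,
 *                    LVOL_CLEAR_WITH_DEFAULT, app_lvol_done, NULL);
 */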
spdk_uuid_generate(&newlvol->uuid); + spdk_uuid_fmt_lower(newlvol->uuid_str, sizeof(newlvol->uuid_str), &newlvol->uuid); + snapshot_xattrs.count = SPDK_COUNTOF(xattr_names); + snapshot_xattrs.ctx = newlvol; + snapshot_xattrs.names = xattr_names; + snapshot_xattrs.get_value = lvol_get_xattr_value; + req->lvol = newlvol; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_bs_create_snapshot(lvs->blobstore, spdk_blob_get_id(origblob), &snapshot_xattrs, + lvol_create_cb, req); +} + +void +spdk_lvol_create_clone(struct spdk_lvol *origlvol, const char *clone_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol *newlvol; + struct spdk_lvol_with_handle_req *req; + struct spdk_lvol_store *lvs; + struct spdk_blob *origblob; + struct spdk_blob_xattr_opts clone_xattrs; + char *xattr_names[] = {LVOL_NAME, "uuid"}; + int rc; + + if (origlvol == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol not provided.\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + origblob = origlvol->blob; + lvs = origlvol->lvol_store; + if (lvs == NULL) { + SPDK_ERRLOG("lvol store does not exist\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = lvs_verify_lvol_name(lvs, clone_name); + if (rc < 0) { + cb_fn(cb_arg, NULL, rc); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol = calloc(1, sizeof(*newlvol)); + if (!newlvol) { + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + free(req); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol->lvol_store = lvs; + snprintf(newlvol->name, sizeof(newlvol->name), "%s", clone_name); + TAILQ_INSERT_TAIL(&newlvol->lvol_store->pending_lvols, newlvol, link); + spdk_uuid_generate(&newlvol->uuid); + spdk_uuid_fmt_lower(newlvol->uuid_str, sizeof(newlvol->uuid_str), &newlvol->uuid); + clone_xattrs.count = SPDK_COUNTOF(xattr_names); + clone_xattrs.ctx = newlvol; + clone_xattrs.names = xattr_names; + clone_xattrs.get_value = lvol_get_xattr_value; + req->lvol = newlvol; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_bs_create_clone(lvs->blobstore, spdk_blob_get_id(origblob), &clone_xattrs, + lvol_create_cb, + req); +} + +static void +lvol_resize_done(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +static void +lvol_blob_resize_cb(void *cb_arg, int bserrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (bserrno != 0) { + req->cb_fn(req->cb_arg, bserrno); + free(req); + return; + } + + spdk_blob_sync_md(lvol->blob, lvol_resize_done, req); +} + +void +spdk_lvol_resize(struct spdk_lvol *lvol, uint64_t sz, + spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_blob *blob = lvol->blob; + struct spdk_lvol_store *lvs = lvol->lvol_store; + struct spdk_lvol_req *req; + uint64_t new_clusters = spdk_divide_round_up(sz, spdk_bs_get_cluster_size(lvs->blobstore)); + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_blob_resize(blob, new_clusters, lvol_blob_resize_cb, req); +} + +static void +lvol_set_read_only_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +spdk_lvol_set_read_only(struct spdk_lvol *lvol, 
spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_blob_set_read_only(lvol->blob); + spdk_blob_sync_md(lvol->blob, lvol_set_read_only_cb, req); +} + +static void +lvol_rename_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + if (lvolerrno != 0) { + SPDK_ERRLOG("Lvol rename operation failed\n"); + } else { + snprintf(req->lvol->name, sizeof(req->lvol->name), "%s", req->name); + } + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +spdk_lvol_rename(struct spdk_lvol *lvol, const char *new_name, + spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol *tmp; + struct spdk_blob *blob = lvol->blob; + struct spdk_lvol_req *req; + int rc; + + /* Check if new name is current lvol name. + * If so, return success immediately */ + if (strncmp(lvol->name, new_name, SPDK_LVOL_NAME_MAX) == 0) { + cb_fn(cb_arg, 0); + return; + } + + /* Check if lvol with 'new_name' already exists in lvolstore */ + TAILQ_FOREACH(tmp, &lvol->lvol_store->lvols, link) { + if (strncmp(tmp->name, new_name, SPDK_LVOL_NAME_MAX) == 0) { + SPDK_ERRLOG("Lvol %s already exists in lvol store %s\n", new_name, lvol->lvol_store->name); + cb_fn(cb_arg, -EEXIST); + return; + } + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + snprintf(req->name, sizeof(req->name), "%s", new_name); + + rc = spdk_blob_set_xattr(blob, "name", new_name, strlen(new_name) + 1); + if (rc < 0) { + free(req); + cb_fn(cb_arg, rc); + return; + } + + spdk_blob_sync_md(blob, lvol_rename_cb, req); +} + +void +spdk_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + struct spdk_blob_store *bs; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + if (lvol->ref_count != 0) { + SPDK_ERRLOG("Cannot destroy lvol %s because it is still open\n", lvol->unique_id); + cb_fn(cb_arg, -EBUSY); + return; + } + + lvol->action_in_progress = true; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + bs = lvol->lvol_store->blobstore; + + spdk_bs_delete_blob(bs, lvol->blob_id, lvol_delete_blob_cb, req); +} + +void +spdk_lvol_close(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + if (lvol->ref_count > 1) { + lvol->ref_count--; + cb_fn(cb_arg, 0); + return; + } else if (lvol->ref_count == 0) { + cb_fn(cb_arg, -EINVAL); + return; + } + + lvol->action_in_progress = true; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_blob_close(lvol->blob, lvol_close_blob_cb, req); +} + +struct spdk_io_channel * +spdk_lvol_get_io_channel(struct spdk_lvol *lvol) +{ + return 
spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore); +} + +static void +lvol_inflate_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + spdk_bs_free_io_channel(req->channel); + + if (lvolerrno < 0) { + SPDK_ERRLOG("Could not inflate lvol\n"); + } + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +spdk_lvol_inflate(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + spdk_blob_id blob_id; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("Lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->channel = spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore); + if (req->channel == NULL) { + SPDK_ERRLOG("Cannot alloc io channel for lvol inflate request\n"); + free(req); + cb_fn(cb_arg, -ENOMEM); + return; + } + + blob_id = spdk_blob_get_id(lvol->blob); + spdk_bs_inflate_blob(lvol->lvol_store->blobstore, req->channel, blob_id, lvol_inflate_cb, + req); +} + +void +spdk_lvol_decouple_parent(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + spdk_blob_id blob_id; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("Lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->channel = spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore); + if (req->channel == NULL) { + SPDK_ERRLOG("Cannot alloc io channel for lvol inflate request\n"); + free(req); + cb_fn(cb_arg, -ENOMEM); + return; + } + + blob_id = spdk_blob_get_id(lvol->blob); + spdk_bs_blob_decouple_parent(lvol->lvol_store->blobstore, req->channel, blob_id, + lvol_inflate_cb, req); +} diff --git a/src/spdk/lib/lvol/spdk_lvol.map b/src/spdk/lib/lvol/spdk_lvol.map new file mode 100644 index 000000000..6ddeb3be6 --- /dev/null +++ b/src/spdk/lib/lvol/spdk_lvol.map @@ -0,0 +1,28 @@ +{ + global: + + # public functions + spdk_lvs_opts_init; + spdk_lvs_init; + spdk_lvs_rename; + spdk_lvs_unload; + spdk_lvs_destroy; + spdk_lvol_create; + spdk_lvol_create_snapshot; + spdk_lvol_create_clone; + spdk_lvol_rename; + spdk_lvol_deletable; + spdk_lvol_destroy; + spdk_lvol_close; + spdk_lvol_get_io_channel; + spdk_lvs_load; + spdk_lvol_open; + spdk_lvol_inflate; + spdk_lvol_decouple_parent; + + # internal functions + spdk_lvol_resize; + spdk_lvol_set_read_only; + + local: *; +}; diff --git a/src/spdk/lib/nbd/Makefile b/src/spdk/lib/nbd/Makefile new file mode 100644 index 000000000..69b13d133 --- /dev/null +++ b/src/spdk/lib/nbd/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +LIBNAME = nbd +C_SRCS = nbd.c nbd_rpc.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_nbd.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/nbd/nbd.c b/src/spdk/lib/nbd/nbd.c new file mode 100644 index 000000000..7d96b9315 --- /dev/null +++ b/src/spdk/lib/nbd/nbd.c @@ -0,0 +1,1093 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/string.h" + +#include <linux/nbd.h> + +#include "spdk/nbd.h" +#include "nbd_internal.h" +#include "spdk/bdev.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/util.h" +#include "spdk/thread.h" + +#include "spdk_internal/log.h" +#include "spdk/queue.h" + +#define GET_IO_LOOP_COUNT 16 +#define NBD_BUSY_WAITING_MS 1000 +#define NBD_BUSY_POLLING_INTERVAL_US 20000 + +enum nbd_io_state_t { + /* Receiving or ready to receive nbd request header */ + NBD_IO_RECV_REQ = 0, + /* Receiving write payload */ + NBD_IO_RECV_PAYLOAD, + /* Transmitting or ready to transmit nbd response header */ + NBD_IO_XMIT_RESP, + /* Transmitting read payload */ + NBD_IO_XMIT_PAYLOAD, +}; + +struct nbd_io { + struct spdk_nbd_disk *nbd; + enum nbd_io_state_t state; + + void *payload; + uint32_t payload_size; + + struct nbd_request req; + struct nbd_reply resp; + + /* + * Tracks current progress on reading/writing a request, + * response, or payload from the nbd socket. + */ + uint32_t offset; + + /* for bdev io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + + TAILQ_ENTRY(nbd_io) tailq; +}; + +enum nbd_disk_state_t { + NBD_DISK_STATE_RUNNING = 0, + /* soft disconnection caused by receiving nbd_cmd_disc */ + NBD_DISK_STATE_SOFTDISC, + /* hard disconnection caused by mandatory conditions */ + NBD_DISK_STATE_HARDDISC, +}; + +struct spdk_nbd_disk { + struct spdk_bdev *bdev; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *ch; + int dev_fd; + char *nbd_path; + int kernel_sp_fd; + int spdk_sp_fd; + struct spdk_poller *nbd_poller; + uint32_t buf_align; + + struct nbd_io *io_in_recv; + TAILQ_HEAD(, nbd_io) received_io_list; + TAILQ_HEAD(, nbd_io) executed_io_list; + + enum nbd_disk_state_t state; + /* count of nbd_io in spdk_nbd_disk */ + int io_count; + + TAILQ_ENTRY(spdk_nbd_disk) tailq; +}; + +struct spdk_nbd_disk_globals { + TAILQ_HEAD(, spdk_nbd_disk) disk_head; +}; + +static struct spdk_nbd_disk_globals g_spdk_nbd; + +static int +nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io); + +int +spdk_nbd_init(void) +{ + TAILQ_INIT(&g_spdk_nbd.disk_head); + + return 0; +} + +void +spdk_nbd_fini(void) +{ + struct spdk_nbd_disk *nbd_idx, *nbd_tmp; + + /* + * Stop running spdk_nbd_disk. + * Here, nbd removing are unnecessary, but _SAFE variant + * is needed, since internal nbd_disk_unregister will + * remove nbd from TAILQ. + */ + TAILQ_FOREACH_SAFE(nbd_idx, &g_spdk_nbd.disk_head, tailq, nbd_tmp) { + spdk_nbd_stop(nbd_idx); + } +} + +static int +nbd_disk_register(struct spdk_nbd_disk *nbd) +{ + if (nbd_disk_find_by_nbd_path(nbd->nbd_path)) { + SPDK_NOTICELOG("%s is already exported\n", nbd->nbd_path); + return -EBUSY; + } + + TAILQ_INSERT_TAIL(&g_spdk_nbd.disk_head, nbd, tailq); + + return 0; +} + +static void +nbd_disk_unregister(struct spdk_nbd_disk *nbd) +{ + struct spdk_nbd_disk *nbd_idx, *nbd_tmp; + + /* + * nbd disk may be stopped before registered. + * check whether it was registered. + */ + TAILQ_FOREACH_SAFE(nbd_idx, &g_spdk_nbd.disk_head, tailq, nbd_tmp) { + if (nbd == nbd_idx) { + TAILQ_REMOVE(&g_spdk_nbd.disk_head, nbd_idx, tailq); + break; + } + } +} + +struct spdk_nbd_disk * +nbd_disk_find_by_nbd_path(const char *nbd_path) +{ + struct spdk_nbd_disk *nbd; + + /* + * check whether nbd has already been registered by nbd path. 
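 * Each exported bdev registers exactly one entry, so a linear scan of this
 * short list is sufficient.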
+ */ + TAILQ_FOREACH(nbd, &g_spdk_nbd.disk_head, tailq) { + if (!strcmp(nbd->nbd_path, nbd_path)) { + return nbd; + } + } + + return NULL; +} + +struct spdk_nbd_disk *nbd_disk_first(void) +{ + return TAILQ_FIRST(&g_spdk_nbd.disk_head); +} + +struct spdk_nbd_disk *nbd_disk_next(struct spdk_nbd_disk *prev) +{ + return TAILQ_NEXT(prev, tailq); +} + +const char * +nbd_disk_get_nbd_path(struct spdk_nbd_disk *nbd) +{ + return nbd->nbd_path; +} + +const char * +nbd_disk_get_bdev_name(struct spdk_nbd_disk *nbd) +{ + return spdk_bdev_get_name(nbd->bdev); +} + +void +spdk_nbd_write_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_nbd_disk *nbd; + + spdk_json_write_array_begin(w); + + TAILQ_FOREACH(nbd, &g_spdk_nbd.disk_head, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "nbd_start_disk"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "nbd_device", nbd_disk_get_nbd_path(nbd)); + spdk_json_write_named_string(w, "bdev_name", nbd_disk_get_bdev_name(nbd)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +void +nbd_disconnect(struct spdk_nbd_disk *nbd) +{ + /* + * nbd soft-disconnection to terminate transmission phase. + * After receiving this ioctl command, nbd kernel module will send + * a NBD_CMD_DISC type io to nbd server in order to inform server. + */ + ioctl(nbd->dev_fd, NBD_DISCONNECT); +} + +static struct nbd_io * +nbd_get_io(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; + + io = calloc(1, sizeof(*io)); + if (!io) { + return NULL; + } + + io->nbd = nbd; + to_be32(&io->resp.magic, NBD_REPLY_MAGIC); + + nbd->io_count++; + + return io; +} + +static void +nbd_put_io(struct spdk_nbd_disk *nbd, struct nbd_io *io) +{ + if (io->payload) { + spdk_free(io->payload); + } + free(io); + + nbd->io_count--; +} + +/* + * Check whether received nbd_io are all transmitted. + * + * \return 1 there is still some nbd_io not transmitted. + * 0 all nbd_io received are transmitted. + */ +static int +nbd_io_xmit_check(struct spdk_nbd_disk *nbd) +{ + if (nbd->io_count == 0) { + return 0; + } else if (nbd->io_count == 1 && nbd->io_in_recv != NULL) { + return 0; + } + + return 1; +} + +/* + * Check whether received nbd_io are all executed, + * and put back executed nbd_io instead of transmitting them + * + * \return 1 there is still some nbd_io under executing + * 0 all nbd_io gotten are freed. + */ +static int +nbd_cleanup_io(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io, *io_tmp; + + /* free io_in_recv */ + if (nbd->io_in_recv != NULL) { + nbd_put_io(nbd, nbd->io_in_recv); + nbd->io_in_recv = NULL; + } + + /* free io in received_io_list */ + if (!TAILQ_EMPTY(&nbd->received_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->received_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->received_io_list, io, tailq); + nbd_put_io(nbd, io); + } + } + + /* free io in executed_io_list */ + if (!TAILQ_EMPTY(&nbd->executed_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->executed_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->executed_io_list, io, tailq); + nbd_put_io(nbd, io); + } + } + + /* + * Some nbd_io may be under executing in bdev. + * Wait for their done operation. 
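 * nbd_io_done() repeats this cleanup check as each outstanding bdev I/O
 * completes and, in the hard-disconnect case, calls _nbd_stop() once the
 * last one finishes.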
+ */ + if (nbd->io_count != 0) { + return 1; + } + + return 0; +} + +static void +_nbd_stop(struct spdk_nbd_disk *nbd) +{ + if (nbd->ch) { + spdk_put_io_channel(nbd->ch); + } + + if (nbd->bdev_desc) { + spdk_bdev_close(nbd->bdev_desc); + } + + if (nbd->spdk_sp_fd >= 0) { + close(nbd->spdk_sp_fd); + } + + if (nbd->kernel_sp_fd >= 0) { + close(nbd->kernel_sp_fd); + } + + if (nbd->dev_fd >= 0) { + /* Clear nbd device only if it is occupied by SPDK app */ + if (nbd->nbd_path && nbd_disk_find_by_nbd_path(nbd->nbd_path)) { + ioctl(nbd->dev_fd, NBD_CLEAR_QUE); + ioctl(nbd->dev_fd, NBD_CLEAR_SOCK); + } + close(nbd->dev_fd); + } + + if (nbd->nbd_path) { + free(nbd->nbd_path); + } + + if (nbd->nbd_poller) { + spdk_poller_unregister(&nbd->nbd_poller); + } + + nbd_disk_unregister(nbd); + + free(nbd); +} + +void +spdk_nbd_stop(struct spdk_nbd_disk *nbd) +{ + if (nbd == NULL) { + return; + } + + nbd->state = NBD_DISK_STATE_HARDDISC; + + /* + * Stop action should be called only after all nbd_io are executed. + */ + if (!nbd_cleanup_io(nbd)) { + _nbd_stop(nbd); + } +} + +static int64_t +read_from_socket(int fd, void *buf, size_t length) +{ + ssize_t bytes_read; + + bytes_read = read(fd, buf, length); + if (bytes_read == 0) { + return -EIO; + } else if (bytes_read == -1) { + if (errno != EAGAIN) { + return -errno; + } + return 0; + } else { + return bytes_read; + } +} + +static int64_t +write_to_socket(int fd, void *buf, size_t length) +{ + ssize_t bytes_written; + + bytes_written = write(fd, buf, length); + if (bytes_written == 0) { + return -EIO; + } else if (bytes_written == -1) { + if (errno != EAGAIN) { + return -errno; + } + return 0; + } else { + return bytes_written; + } +} + +static void +nbd_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct nbd_io *io = cb_arg; + struct spdk_nbd_disk *nbd = io->nbd; + + if (success) { + io->resp.error = 0; + } else { + to_be32(&io->resp.error, EIO); + } + + memcpy(&io->resp.handle, &io->req.handle, sizeof(io->resp.handle)); + TAILQ_INSERT_TAIL(&nbd->executed_io_list, io, tailq); + + if (bdev_io != NULL) { + spdk_bdev_free_io(bdev_io); + } + + if (nbd->state == NBD_DISK_STATE_HARDDISC && !nbd_cleanup_io(nbd)) { + _nbd_stop(nbd); + } +} + +static void +nbd_resubmit_io(void *arg) +{ + struct nbd_io *io = (struct nbd_io *)arg; + struct spdk_nbd_disk *nbd = io->nbd; + int rc = 0; + + rc = nbd_submit_bdev_io(nbd, io); + if (rc) { + SPDK_INFOLOG(SPDK_LOG_NBD, "nbd: io resubmit for dev %s , io_type %d, returned %d.\n", + nbd_disk_get_bdev_name(nbd), from_be32(&io->req.type), rc); + } +} + +static void +nbd_queue_io(struct nbd_io *io) +{ + int rc; + struct spdk_bdev *bdev = io->nbd->bdev; + + io->bdev_io_wait.bdev = bdev; + io->bdev_io_wait.cb_fn = nbd_resubmit_io; + io->bdev_io_wait.cb_arg = io; + + rc = spdk_bdev_queue_io_wait(bdev, io->nbd->ch, &io->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in nbd_queue_io, rc=%d.\n", rc); + nbd_io_done(NULL, false, io); + } +} + +static int +nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io) +{ + struct spdk_bdev_desc *desc = nbd->bdev_desc; + struct spdk_io_channel *ch = nbd->ch; + int rc = 0; + + switch (from_be32(&io->req.type)) { + case NBD_CMD_READ: + rc = spdk_bdev_read(desc, ch, io->payload, from_be64(&io->req.from), + io->payload_size, nbd_io_done, io); + break; + case NBD_CMD_WRITE: + rc = spdk_bdev_write(desc, ch, io->payload, from_be64(&io->req.from), + io->payload_size, nbd_io_done, io); + break; +#ifdef NBD_FLAG_SEND_FLUSH + case NBD_CMD_FLUSH: + rc = 
spdk_bdev_flush(desc, ch, 0, + spdk_bdev_get_num_blocks(nbd->bdev) * spdk_bdev_get_block_size(nbd->bdev), + nbd_io_done, io); + break; +#endif +#ifdef NBD_FLAG_SEND_TRIM + case NBD_CMD_TRIM: + rc = spdk_bdev_unmap(desc, ch, from_be64(&io->req.from), + from_be32(&io->req.len), nbd_io_done, io); + break; +#endif + case NBD_CMD_DISC: + nbd_put_io(nbd, io); + nbd->state = NBD_DISK_STATE_SOFTDISC; + break; + default: + rc = -1; + } + + if (rc < 0) { + if (rc == -ENOMEM) { + SPDK_INFOLOG(SPDK_LOG_NBD, "No memory, start to queue io.\n"); + nbd_queue_io(io); + } else { + SPDK_ERRLOG("nbd io failed in nbd_queue_io, rc=%d.\n", rc); + nbd_io_done(NULL, false, io); + } + } + + return 0; +} + +static int +nbd_io_exec(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io, *io_tmp; + int io_count = 0; + int ret = 0; + + /* + * For soft disconnection, nbd server must handle all outstanding + * request before closing connection. + */ + if (nbd->state == NBD_DISK_STATE_HARDDISC) { + return 0; + } + + if (!TAILQ_EMPTY(&nbd->received_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->received_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->received_io_list, io, tailq); + ret = nbd_submit_bdev_io(nbd, io); + if (ret < 0) { + return ret; + } + + io_count++; + } + } + + return io_count; +} + +static int +nbd_io_recv_internal(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; + int ret = 0; + int received = 0; + + if (nbd->io_in_recv == NULL) { + nbd->io_in_recv = nbd_get_io(nbd); + if (!nbd->io_in_recv) { + return -ENOMEM; + } + } + + io = nbd->io_in_recv; + + if (io->state == NBD_IO_RECV_REQ) { + ret = read_from_socket(nbd->spdk_sp_fd, (char *)&io->req + io->offset, + sizeof(io->req) - io->offset); + if (ret < 0) { + nbd_put_io(nbd, io); + nbd->io_in_recv = NULL; + return ret; + } + + io->offset += ret; + received = ret; + + /* request is fully received */ + if (io->offset == sizeof(io->req)) { + io->offset = 0; + + /* req magic check */ + if (from_be32(&io->req.magic) != NBD_REQUEST_MAGIC) { + SPDK_ERRLOG("invalid request magic\n"); + nbd_put_io(nbd, io); + nbd->io_in_recv = NULL; + return -EINVAL; + } + + /* io except read/write should ignore payload */ + if (from_be32(&io->req.type) == NBD_CMD_WRITE || + from_be32(&io->req.type) == NBD_CMD_READ) { + io->payload_size = from_be32(&io->req.len); + } else { + io->payload_size = 0; + } + + /* io payload allocate */ + if (io->payload_size) { + io->payload = spdk_malloc(io->payload_size, nbd->buf_align, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (io->payload == NULL) { + SPDK_ERRLOG("could not allocate io->payload of size %d\n", io->payload_size); + nbd_put_io(nbd, io); + nbd->io_in_recv = NULL; + return -ENOMEM; + } + } else { + io->payload = NULL; + } + + /* next io step */ + if (from_be32(&io->req.type) == NBD_CMD_WRITE) { + io->state = NBD_IO_RECV_PAYLOAD; + } else { + io->state = NBD_IO_XMIT_RESP; + nbd->io_in_recv = NULL; + TAILQ_INSERT_TAIL(&nbd->received_io_list, io, tailq); + } + } + } + + if (io->state == NBD_IO_RECV_PAYLOAD) { + ret = read_from_socket(nbd->spdk_sp_fd, io->payload + io->offset, io->payload_size - io->offset); + if (ret < 0) { + nbd_put_io(nbd, io); + nbd->io_in_recv = NULL; + return ret; + } + + io->offset += ret; + received += ret; + + /* request payload is fully received */ + if (io->offset == io->payload_size) { + io->offset = 0; + io->state = NBD_IO_XMIT_RESP; + nbd->io_in_recv = NULL; + TAILQ_INSERT_TAIL(&nbd->received_io_list, io, tailq); + } + + } + + return received; +} + +static int +nbd_io_recv(struct spdk_nbd_disk *nbd) +{ + int 
i, rc, ret = 0; + + /* + * nbd server should not accept request in both soft and hard + * disconnect states. + */ + if (nbd->state != NBD_DISK_STATE_RUNNING) { + return 0; + } + + for (i = 0; i < GET_IO_LOOP_COUNT; i++) { + rc = nbd_io_recv_internal(nbd); + if (rc < 0) { + return rc; + } + ret += rc; + } + + return ret; +} + +static int +nbd_io_xmit_internal(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; + int ret = 0; + int sent = 0; + + io = TAILQ_FIRST(&nbd->executed_io_list); + if (io == NULL) { + return 0; + } + + /* Remove IO from list now assuming it will be completed. It will be inserted + * back to the head if it cannot be completed. This approach is specifically + * taken to work around a scan-build use-after-free mischaracterization. + */ + TAILQ_REMOVE(&nbd->executed_io_list, io, tailq); + + /* resp error and handler are already set in io_done */ + + if (io->state == NBD_IO_XMIT_RESP) { + ret = write_to_socket(nbd->spdk_sp_fd, (char *)&io->resp + io->offset, + sizeof(io->resp) - io->offset); + if (ret <= 0) { + goto reinsert; + } + + io->offset += ret; + sent = ret; + + /* response is fully transmitted */ + if (io->offset == sizeof(io->resp)) { + io->offset = 0; + + /* transmit payload only when NBD_CMD_READ with no resp error */ + if (from_be32(&io->req.type) != NBD_CMD_READ || io->resp.error != 0) { + nbd_put_io(nbd, io); + return 0; + } else { + io->state = NBD_IO_XMIT_PAYLOAD; + } + } + } + + if (io->state == NBD_IO_XMIT_PAYLOAD) { + ret = write_to_socket(nbd->spdk_sp_fd, io->payload + io->offset, io->payload_size - io->offset); + if (ret <= 0) { + goto reinsert; + } + + io->offset += ret; + sent += ret; + + /* read payload is fully transmitted */ + if (io->offset == io->payload_size) { + nbd_put_io(nbd, io); + return sent; + } + } + +reinsert: + TAILQ_INSERT_HEAD(&nbd->executed_io_list, io, tailq); + return ret < 0 ? ret : sent; +} + +static int +nbd_io_xmit(struct spdk_nbd_disk *nbd) +{ + int ret = 0; + int rc; + + /* + * For soft disconnection, nbd server must handle all outstanding + * request before closing connection. + */ + if (nbd->state == NBD_DISK_STATE_HARDDISC) { + return 0; + } + + while (!TAILQ_EMPTY(&nbd->executed_io_list)) { + rc = nbd_io_xmit_internal(nbd); + if (rc < 0) { + return rc; + } + + ret += rc; + } + + /* + * For soft disconnection, nbd server can close connection after all + * outstanding request are transmitted. + */ + if (nbd->state == NBD_DISK_STATE_SOFTDISC && !nbd_io_xmit_check(nbd)) { + return -1; + } + + return ret; +} + +/** + * Poll an NBD instance. + * + * \return 0 on success or negated errno values on error (e.g. connection closed). + */ +static int +_nbd_poll(struct spdk_nbd_disk *nbd) +{ + int received, sent, executed; + + /* transmit executed io first */ + sent = nbd_io_xmit(nbd); + if (sent < 0) { + return sent; + } + + received = nbd_io_recv(nbd); + if (received < 0) { + return received; + } + + executed = nbd_io_exec(nbd); + if (executed < 0) { + return executed; + } + + return sent + received + executed; +} + +static int +nbd_poll(void *arg) +{ + struct spdk_nbd_disk *nbd = arg; + int rc; + + rc = _nbd_poll(nbd); + if (rc < 0) { + SPDK_INFOLOG(SPDK_LOG_NBD, "nbd_poll() returned %s (%d); closing connection\n", + spdk_strerror(-rc), rc); + spdk_nbd_stop(nbd); + } + + return rc > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +} + +static void * +nbd_start_kernel(void *arg) +{ + int dev_fd = (int)(intptr_t)arg; + + spdk_unaffinitize_thread(); + + /* This will block in the kernel until we close the spdk_sp_fd. 
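 * NBD_DO_IT runs the kernel side of the nbd transmission loop on this
 * detached helper thread; it returns only once the socket pair is torn down
 * (spdk_sp_fd closed) or NBD_DISCONNECT/NBD_CLEAR_SOCK is issued.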
*/ + ioctl(dev_fd, NBD_DO_IT); + + pthread_exit(NULL); +} + +static void +nbd_bdev_hot_remove(void *remove_ctx) +{ + struct spdk_nbd_disk *nbd = remove_ctx; + + spdk_nbd_stop(nbd); +} + +struct spdk_nbd_start_ctx { + struct spdk_nbd_disk *nbd; + spdk_nbd_start_cb cb_fn; + void *cb_arg; + struct spdk_poller *poller; + int polling_count; +}; + +static void +nbd_start_complete(struct spdk_nbd_start_ctx *ctx) +{ + int rc; + pthread_t tid; + int flag; + + /* Add nbd_disk to the end of disk list */ + rc = nbd_disk_register(ctx->nbd); + if (rc != 0) { + SPDK_ERRLOG("Failed to register %s, it should not happen.\n", ctx->nbd->nbd_path); + assert(false); + goto err; + } + + rc = ioctl(ctx->nbd->dev_fd, NBD_SET_BLKSIZE, spdk_bdev_get_block_size(ctx->nbd->bdev)); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_SET_BLKSIZE) failed: %s\n", spdk_strerror(errno)); + rc = -errno; + goto err; + } + + rc = ioctl(ctx->nbd->dev_fd, NBD_SET_SIZE_BLOCKS, spdk_bdev_get_num_blocks(ctx->nbd->bdev)); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_SET_SIZE_BLOCKS) failed: %s\n", spdk_strerror(errno)); + rc = -errno; + goto err; + } + +#ifdef NBD_FLAG_SEND_TRIM + rc = ioctl(ctx->nbd->dev_fd, NBD_SET_FLAGS, NBD_FLAG_SEND_TRIM); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_SET_FLAGS) failed: %s\n", spdk_strerror(errno)); + rc = -errno; + goto err; + } +#endif + + rc = pthread_create(&tid, NULL, nbd_start_kernel, (void *)(intptr_t)ctx->nbd->dev_fd); + if (rc != 0) { + SPDK_ERRLOG("could not create thread: %s\n", spdk_strerror(rc)); + rc = -rc; + goto err; + } + + rc = pthread_detach(tid); + if (rc != 0) { + SPDK_ERRLOG("could not detach thread for nbd kernel: %s\n", spdk_strerror(rc)); + rc = -rc; + goto err; + } + + flag = fcntl(ctx->nbd->spdk_sp_fd, F_GETFL); + if (fcntl(ctx->nbd->spdk_sp_fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + ctx->nbd->spdk_sp_fd, spdk_strerror(errno)); + rc = -errno; + goto err; + } + + ctx->nbd->nbd_poller = SPDK_POLLER_REGISTER(nbd_poll, ctx->nbd, 0); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_arg, ctx->nbd, 0); + } + + free(ctx); + return; + +err: + spdk_nbd_stop(ctx->nbd); + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_arg, NULL, rc); + } + free(ctx); +} + +static int +nbd_enable_kernel(void *arg) +{ + struct spdk_nbd_start_ctx *ctx = arg; + int rc; + + /* Declare device setup by this process */ + rc = ioctl(ctx->nbd->dev_fd, NBD_SET_SOCK, ctx->nbd->kernel_sp_fd); + if (rc == -1) { + if (errno == EBUSY && ctx->polling_count-- > 0) { + if (ctx->poller == NULL) { + ctx->poller = SPDK_POLLER_REGISTER(nbd_enable_kernel, ctx, + NBD_BUSY_POLLING_INTERVAL_US); + } + /* If the kernel is busy, check back later */ + return SPDK_POLLER_BUSY; + } + + SPDK_ERRLOG("ioctl(NBD_SET_SOCK) failed: %s\n", spdk_strerror(errno)); + if (ctx->poller) { + spdk_poller_unregister(&ctx->poller); + } + + spdk_nbd_stop(ctx->nbd); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_arg, NULL, -errno); + } + + free(ctx); + return SPDK_POLLER_BUSY; + } + + if (ctx->poller) { + spdk_poller_unregister(&ctx->poller); + } + + nbd_start_complete(ctx); + + return SPDK_POLLER_BUSY; +} + +void +spdk_nbd_start(const char *bdev_name, const char *nbd_path, + spdk_nbd_start_cb cb_fn, void *cb_arg) +{ + struct spdk_nbd_start_ctx *ctx = NULL; + struct spdk_nbd_disk *nbd = NULL; + struct spdk_bdev *bdev; + int rc; + int sp[2]; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (bdev == NULL) { + SPDK_ERRLOG("no bdev %s exists\n", bdev_name); + rc = -EINVAL; + goto err; + } + + nbd = calloc(1, 
sizeof(*nbd)); + if (nbd == NULL) { + rc = -ENOMEM; + goto err; + } + + nbd->dev_fd = -1; + nbd->spdk_sp_fd = -1; + nbd->kernel_sp_fd = -1; + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + rc = -ENOMEM; + goto err; + } + + ctx->nbd = nbd; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->polling_count = NBD_BUSY_WAITING_MS * 1000ULL / NBD_BUSY_POLLING_INTERVAL_US; + + rc = spdk_bdev_open(bdev, true, nbd_bdev_hot_remove, nbd, &nbd->bdev_desc); + if (rc != 0) { + SPDK_ERRLOG("could not open bdev %s, error=%d\n", spdk_bdev_get_name(bdev), rc); + goto err; + } + + nbd->bdev = bdev; + + nbd->ch = spdk_bdev_get_io_channel(nbd->bdev_desc); + nbd->buf_align = spdk_max(spdk_bdev_get_buf_align(bdev), 64); + + rc = socketpair(AF_UNIX, SOCK_STREAM, 0, sp); + if (rc != 0) { + SPDK_ERRLOG("socketpair failed\n"); + rc = -errno; + goto err; + } + + nbd->spdk_sp_fd = sp[0]; + nbd->kernel_sp_fd = sp[1]; + nbd->nbd_path = strdup(nbd_path); + if (!nbd->nbd_path) { + SPDK_ERRLOG("strdup allocation failure\n"); + rc = -ENOMEM; + goto err; + } + + TAILQ_INIT(&nbd->received_io_list); + TAILQ_INIT(&nbd->executed_io_list); + + /* Make sure nbd_path is not used in this SPDK app */ + if (nbd_disk_find_by_nbd_path(nbd->nbd_path)) { + SPDK_NOTICELOG("%s is already exported\n", nbd->nbd_path); + rc = -EBUSY; + goto err; + } + + nbd->dev_fd = open(nbd_path, O_RDWR); + if (nbd->dev_fd == -1) { + SPDK_ERRLOG("open(\"%s\") failed: %s\n", nbd_path, spdk_strerror(errno)); + rc = -errno; + goto err; + } + + SPDK_INFOLOG(SPDK_LOG_NBD, "Enabling kernel access to bdev %s via %s\n", + spdk_bdev_get_name(bdev), nbd_path); + + nbd_enable_kernel(ctx); + return; + +err: + free(ctx); + if (nbd) { + spdk_nbd_stop(nbd); + } + + if (cb_fn) { + cb_fn(cb_arg, NULL, rc); + } +} + +const char * +spdk_nbd_get_path(struct spdk_nbd_disk *nbd) +{ + return nbd->nbd_path; +} + +SPDK_LOG_REGISTER_COMPONENT("nbd", SPDK_LOG_NBD) diff --git a/src/spdk/lib/nbd/nbd_internal.h b/src/spdk/lib/nbd/nbd_internal.h new file mode 100644 index 000000000..c0d7ee220 --- /dev/null +++ b/src/spdk/lib/nbd/nbd_internal.h @@ -0,0 +1,52 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_NBD_INTERNAL_H +#define SPDK_NBD_INTERNAL_H + +#include "spdk/stdinc.h" +#include "spdk/nbd.h" + +struct spdk_nbd_disk *nbd_disk_find_by_nbd_path(const char *nbd_path); + +struct spdk_nbd_disk *nbd_disk_first(void); + +struct spdk_nbd_disk *nbd_disk_next(struct spdk_nbd_disk *prev); + +const char *nbd_disk_get_nbd_path(struct spdk_nbd_disk *nbd); + +const char *nbd_disk_get_bdev_name(struct spdk_nbd_disk *nbd); + +void nbd_disconnect(struct spdk_nbd_disk *nbd); + +#endif /* SPDK_NBD_INTERNAL_H */ diff --git a/src/spdk/lib/nbd/nbd_rpc.c b/src/spdk/lib/nbd/nbd_rpc.c new file mode 100644 index 000000000..a00c0a7e6 --- /dev/null +++ b/src/spdk/lib/nbd/nbd_rpc.c @@ -0,0 +1,422 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/string.h" +#include "spdk/env.h" +#include "spdk/rpc.h" +#include "spdk/util.h" + +#include <linux/nbd.h> + +#include "nbd_internal.h" +#include "spdk_internal/log.h" + +struct rpc_nbd_start_disk { + char *bdev_name; + char *nbd_device; + /* Used to search one available nbd device */ + int nbd_idx; + bool nbd_idx_specified; + struct spdk_jsonrpc_request *request; +}; + +static void +free_rpc_nbd_start_disk(struct rpc_nbd_start_disk *req) +{ + free(req->bdev_name); + free(req->nbd_device); + free(req); +} + +static const struct spdk_json_object_decoder rpc_nbd_start_disk_decoders[] = { + {"bdev_name", offsetof(struct rpc_nbd_start_disk, bdev_name), spdk_json_decode_string}, + {"nbd_device", offsetof(struct rpc_nbd_start_disk, nbd_device), spdk_json_decode_string, true}, +}; + +/* Return 0 to indicate the nbd_device might be available, + * or non-zero to indicate the nbd_device is invalid or in using. + */ +static int +check_available_nbd_disk(char *nbd_device) +{ + char nbd_block_path[256]; + char tail[2]; + int rc; + unsigned int nbd_idx; + struct spdk_nbd_disk *nbd; + + /* nbd device path must be in format of /dev/nbd<num>, with no tail. */ + rc = sscanf(nbd_device, "/dev/nbd%u%1s", &nbd_idx, tail); + if (rc != 1) { + return -errno; + } + + /* make sure nbd_device is not registered inside SPDK */ + nbd = nbd_disk_find_by_nbd_path(nbd_device); + if (nbd) { + /* nbd_device is in using */ + return -EBUSY; + } + + /* A valid pid file in /sys/block indicates the device is in using */ + snprintf(nbd_block_path, 256, "/sys/block/nbd%u/pid", nbd_idx); + + rc = open(nbd_block_path, O_RDONLY); + if (rc < 0) { + if (errno == ENOENT) { + /* nbd_device might be available */ + return 0; + } else { + SPDK_ERRLOG("Failed to check PID file %s: %s\n", nbd_block_path, spdk_strerror(errno)); + return -errno; + } + } + + close(rc); + + /* nbd_device is in using */ + return -EBUSY; +} + +static char * +find_available_nbd_disk(int nbd_idx, int *next_nbd_idx) +{ + int i, rc; + char nbd_device[20]; + + for (i = nbd_idx; ; i++) { + snprintf(nbd_device, 20, "/dev/nbd%d", i); + /* Check whether an nbd device exists in order to reach the last one nbd device */ + rc = access(nbd_device, F_OK); + if (rc != 0) { + break; + } + + rc = check_available_nbd_disk(nbd_device); + if (rc == 0) { + if (next_nbd_idx != NULL) { + *next_nbd_idx = i + 1; + } + + return strdup(nbd_device); + } + } + + return NULL; +} + +static void +rpc_start_nbd_done(void *cb_arg, struct spdk_nbd_disk *nbd, int rc) +{ + struct rpc_nbd_start_disk *req = cb_arg; + struct spdk_jsonrpc_request *request = req->request; + struct spdk_json_write_ctx *w; + + /* Check whether it's automatic nbd-device assignment */ + if (rc == -EBUSY && req->nbd_idx_specified == false) { + free(req->nbd_device); + + req->nbd_device = find_available_nbd_disk(req->nbd_idx, &req->nbd_idx); + if (req->nbd_device != NULL) { + spdk_nbd_start(req->bdev_name, req->nbd_device, + rpc_start_nbd_done, req); + return; + } + + SPDK_INFOLOG(SPDK_LOG_NBD, "There is no available nbd device.\n"); + } + + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, spdk_nbd_get_path(nbd)); + spdk_jsonrpc_end_result(request, w); + + free_rpc_nbd_start_disk(req); +} + +static void +rpc_nbd_start_disk(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_nbd_start_disk *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == 
NULL) { + SPDK_ERRLOG("could not allocate nbd_start_disk request.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, rpc_nbd_start_disk_decoders, + SPDK_COUNTOF(rpc_nbd_start_disk_decoders), + req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + if (req->bdev_name == NULL) { + goto invalid; + } + + if (req->nbd_device != NULL) { + req->nbd_idx_specified = true; + rc = check_available_nbd_disk(req->nbd_device); + if (rc == -EBUSY) { + SPDK_DEBUGLOG(SPDK_LOG_NBD, "NBD device %s is in using.\n", req->nbd_device); + spdk_jsonrpc_send_error_response(request, -EBUSY, spdk_strerror(-rc)); + goto invalid; + } + + if (rc != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NBD, "Illegal nbd_device %s.\n", req->nbd_device); + spdk_jsonrpc_send_error_response_fmt(request, -ENODEV, + "illegal nbd device %s", req->nbd_device); + goto invalid; + } + } else { + req->nbd_idx = 0; + req->nbd_device = find_available_nbd_disk(req->nbd_idx, &req->nbd_idx); + if (req->nbd_device == NULL) { + SPDK_INFOLOG(SPDK_LOG_NBD, "There is no available nbd device.\n"); + spdk_jsonrpc_send_error_response(request, -ENODEV, + "nbd device not found"); + goto invalid; + } + } + + req->request = request; + spdk_nbd_start(req->bdev_name, req->nbd_device, + rpc_start_nbd_done, req); + + return; + +invalid: + free_rpc_nbd_start_disk(req); +} + +SPDK_RPC_REGISTER("nbd_start_disk", rpc_nbd_start_disk, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nbd_start_disk, start_nbd_disk) + +struct rpc_nbd_stop_disk { + char *nbd_device; +}; + +static void +free_rpc_nbd_stop_disk(struct rpc_nbd_stop_disk *req) +{ + free(req->nbd_device); +} + +static const struct spdk_json_object_decoder rpc_nbd_stop_disk_decoders[] = { + {"nbd_device", offsetof(struct rpc_nbd_stop_disk, nbd_device), spdk_json_decode_string}, +}; + +struct nbd_disconnect_arg { + struct spdk_jsonrpc_request *request; + struct spdk_nbd_disk *nbd; +}; + +static void * +nbd_disconnect_thread(void *arg) +{ + struct nbd_disconnect_arg *thd_arg = arg; + struct spdk_json_write_ctx *w; + + spdk_unaffinitize_thread(); + + nbd_disconnect(thd_arg->nbd); + + w = spdk_jsonrpc_begin_result(thd_arg->request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(thd_arg->request, w); + + free(thd_arg); + pthread_exit(NULL); +} + +static void +rpc_nbd_stop_disk(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_nbd_stop_disk req = {}; + struct spdk_nbd_disk *nbd; + pthread_t tid; + struct nbd_disconnect_arg *thd_arg = NULL; + int rc; + + if (spdk_json_decode_object(params, rpc_nbd_stop_disk_decoders, + SPDK_COUNTOF(rpc_nbd_stop_disk_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto out; + } + + if (req.nbd_device == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, "invalid nbd device"); + goto out; + } + + /* make sure nbd_device is registered */ + nbd = nbd_disk_find_by_nbd_path(req.nbd_device); + if (!nbd) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto out; + } + + /* + * thd_arg should be freed by created thread + * if thread is created successfully. 
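 * If pthread_create() fails, the error path below sends an error response
 * and frees thd_arg itself.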
+ */ + thd_arg = malloc(sizeof(*thd_arg)); + if (!thd_arg) { + SPDK_ERRLOG("could not allocate nbd disconnect thread arg\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + goto out; + } + + thd_arg->request = request; + thd_arg->nbd = nbd; + + /* + * NBD ioctl of disconnect will block until data are flushed. + * Create separate thread to execute it. + */ + rc = pthread_create(&tid, NULL, nbd_disconnect_thread, (void *)thd_arg); + if (rc != 0) { + SPDK_ERRLOG("could not create nbd disconnect thread: %s\n", spdk_strerror(rc)); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(rc)); + free(thd_arg); + goto out; + } + + rc = pthread_detach(tid); + if (rc != 0) { + SPDK_ERRLOG("could not detach nbd disconnect thread: %s\n", spdk_strerror(rc)); + goto out; + } + +out: + free_rpc_nbd_stop_disk(&req); +} + +SPDK_RPC_REGISTER("nbd_stop_disk", rpc_nbd_stop_disk, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nbd_stop_disk, stop_nbd_disk) + +static void +rpc_dump_nbd_info(struct spdk_json_write_ctx *w, + struct spdk_nbd_disk *nbd) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "nbd_device", nbd_disk_get_nbd_path(nbd)); + + spdk_json_write_named_string(w, "bdev_name", nbd_disk_get_bdev_name(nbd)); + + spdk_json_write_object_end(w); +} + +struct rpc_nbd_get_disks { + char *nbd_device; +}; + +static void +free_rpc_nbd_get_disks(struct rpc_nbd_get_disks *r) +{ + free(r->nbd_device); +} + +static const struct spdk_json_object_decoder rpc_nbd_get_disks_decoders[] = { + {"nbd_device", offsetof(struct rpc_nbd_get_disks, nbd_device), spdk_json_decode_string, true}, +}; + +static void +rpc_nbd_get_disks(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_nbd_get_disks req = {}; + struct spdk_json_write_ctx *w; + struct spdk_nbd_disk *nbd = NULL; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_nbd_get_disks_decoders, + SPDK_COUNTOF(rpc_nbd_get_disks_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + if (req.nbd_device) { + nbd = nbd_disk_find_by_nbd_path(req.nbd_device); + if (nbd == NULL) { + SPDK_ERRLOG("nbd device '%s' does not exist\n", req.nbd_device); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto invalid; + } + + free_rpc_nbd_get_disks(&req); + } + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + if (nbd != NULL) { + rpc_dump_nbd_info(w, nbd); + } else { + for (nbd = nbd_disk_first(); nbd != NULL; nbd = nbd_disk_next(nbd)) { + rpc_dump_nbd_info(w, nbd); + } + } + + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); + + return; + +invalid: + free_rpc_nbd_get_disks(&req); +} +SPDK_RPC_REGISTER("nbd_get_disks", rpc_nbd_get_disks, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nbd_get_disks, get_nbd_disks) diff --git a/src/spdk/lib/nbd/spdk_nbd.map b/src/spdk/lib/nbd/spdk_nbd.map new file mode 100644 index 000000000..0b7d8de81 --- /dev/null +++ b/src/spdk/lib/nbd/spdk_nbd.map @@ -0,0 +1,13 @@ +{ + global: + + # public functions + spdk_nbd_init; + spdk_nbd_fini; + spdk_nbd_start; + spdk_nbd_stop; + spdk_nbd_get_path; + spdk_nbd_write_config_json; + + local: *; +}; diff --git a/src/spdk/lib/net/Makefile b/src/spdk/lib/net/Makefile new file mode 100644 index 
000000000..918df6cfb --- /dev/null +++ b/src/spdk/lib/net/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = interface.c net_rpc.c + +LIBNAME = net + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_net.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/net/interface.c b/src/spdk/lib/net/interface.c new file mode 100644 index 000000000..358cbc308 --- /dev/null +++ b/src/spdk/lib/net/interface.c @@ -0,0 +1,551 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "net_internal.h" + +#include "spdk/stdinc.h" +#include "spdk/string.h" + +#include "spdk/log.h" +#include "spdk/net.h" + +#ifdef __linux__ /* Interface management is Linux-specific */ + +#include <linux/netlink.h> +#include <linux/rtnetlink.h> + +static TAILQ_HEAD(, spdk_interface) g_interface_head; + +static pthread_mutex_t interface_lock = PTHREAD_MUTEX_INITIALIZER; + +static int get_ifc_ipv4(void) +{ + int ret; + int rtattrlen; + int netlink_fd; + uint32_t ipv4_addr; + + struct { + struct nlmsghdr n; + struct ifaddrmsg r; + struct rtattr rta; + } req; + char buf[16384]; + struct nlmsghdr *nlmp; + struct ifaddrmsg *rtmp; + struct rtattr *rtatp; + struct spdk_interface *ifc; + + netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); + if (netlink_fd < 0) { + SPDK_ERRLOG("socket failed!\n"); + return 1; + } + + /* + * Prepare a message structure + */ + memset(&req, 0, sizeof(req)); + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)); + req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT; + req.n.nlmsg_type = RTM_GETADDR; + + /* IPv4 only */ + req.r.ifa_family = AF_INET; + + /* + * Fill up all the attributes for the rtnetlink header. + */ + assert(&req.rta == (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.n.nlmsg_len))); + req.rta.rta_len = RTA_LENGTH(16); + + /* Send and recv the message from kernel */ + ret = send(netlink_fd, &req, req.n.nlmsg_len, 0); + if (ret < 0) { + SPDK_ERRLOG("netlink send failed: %s\n", spdk_strerror(errno)); + ret = 1; + goto exit; + } + + ret = recv(netlink_fd, buf, sizeof(buf), 0); + if (ret <= 0) { + SPDK_ERRLOG("netlink recv failed: %s\n", spdk_strerror(errno)); + ret = 1; + goto exit; + } + + for (nlmp = (struct nlmsghdr *)buf; ret > (int)sizeof(*nlmp);) { + int len = nlmp->nlmsg_len; + int req_len = len - sizeof(*nlmp); + + if (req_len < 0 || len > ret) { + SPDK_ERRLOG("error\n"); + ret = 1; + goto exit; + } + + if (!NLMSG_OK(nlmp, (uint32_t)ret)) { + SPDK_ERRLOG("NLMSG not OK\n"); + ret = 1; + goto exit; + } + + rtmp = (struct ifaddrmsg *)NLMSG_DATA(nlmp); + rtatp = (struct rtattr *)IFA_RTA(rtmp); + + rtattrlen = IFA_PAYLOAD(nlmp); + + for (; RTA_OK(rtatp, rtattrlen); rtatp = RTA_NEXT(rtatp, rtattrlen)) { + if (rtatp->rta_type == IFA_LOCAL) { + memcpy(&ipv4_addr, (struct in_addr *)RTA_DATA(rtatp), + sizeof(struct in_addr)); + TAILQ_FOREACH(ifc, &g_interface_head, tailq) { + if (ifc->index == rtmp->ifa_index) { + /* add a new IP address to interface */ + if (ifc->num_ip_addresses >= SPDK_MAX_IP_PER_IFC) { + SPDK_ERRLOG("SPDK: number of IP addresses supported for %s exceeded. 
limit=%d\n", + ifc->name, + SPDK_MAX_IP_PER_IFC); + break; + } + ifc->ip_address[ifc->num_ip_addresses] = ipv4_addr; + ifc->num_ip_addresses++; + break; + } + } + } + } + ret -= NLMSG_ALIGN(len); + nlmp = (struct nlmsghdr *)((char *)nlmp + NLMSG_ALIGN(len)); + } + ret = 0; + +exit: + close(netlink_fd); + return ret; +} + + +static int process_new_interface_msg(struct nlmsghdr *h) +{ + int len; + struct spdk_interface *ifc; + struct ifinfomsg *iface; + struct rtattr *attribute; + + iface = (struct ifinfomsg *)NLMSG_DATA(h); + + ifc = (struct spdk_interface *) malloc(sizeof(*ifc)); + if (ifc == NULL) { + SPDK_ERRLOG("Malloc failed\n"); + return 1; + } + + memset(ifc, 0, sizeof(*ifc)); + + /* Set interface index */ + ifc->index = iface->ifi_index; + + len = h->nlmsg_len - NLMSG_LENGTH(sizeof(*iface)); + + /* Loop over all attributes for the NEWLINK message */ + for (attribute = IFLA_RTA(iface); RTA_OK(attribute, len); attribute = RTA_NEXT(attribute, len)) { + switch (attribute->rta_type) { + case IFLA_IFNAME: + if (if_indextoname(iface->ifi_index, ifc->name) == NULL) { + SPDK_ERRLOG("Indextoname failed!\n"); + free(ifc); + return 2; + } + break; + default: + break; + } + } + TAILQ_INSERT_TAIL(&g_interface_head, ifc, tailq); + return 0; +} + +static int prepare_ifc_list(void) +{ + int ret = 0; + struct nl_req_s { + struct nlmsghdr hdr; + struct rtgenmsg gen; + struct ifinfomsg ifi; + }; + int netlink_fd; + struct sockaddr_nl local; /* Our local (user space) side of the communication */ + struct sockaddr_nl kernel; /* The remote (kernel space) side of the communication */ + + struct msghdr rtnl_msg; /* Generic msghdr struct for use with sendmsg */ + struct iovec io; /* IO vector for sendmsg */ + + struct nl_req_s req; /* Structure that describes the rtnetlink packet itself */ + char reply[16384]; /* a large buffer to receive lots of link information */ + + pid_t pid = getpid(); /* Our process ID to build the correct netlink address */ + int end = 0; /* some flag to end loop parsing */ + + /* + * Prepare netlink socket for kernel/user space communication + */ + netlink_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (netlink_fd < 0) { + SPDK_ERRLOG("socket failed!\n"); + return 1; + } + + memset(&local, 0, sizeof(local)); /* Fill-in local address information */ + local.nl_family = AF_NETLINK; + local.nl_pid = pid; + local.nl_groups = 0; + + /* RTNL socket is ready to use, prepare and send L2 request. 
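The RTM_GETLINK request below carries NLM_F_DUMP, so the kernel replies with a multipart dump: one RTM_NEWLINK message per link, terminated by NLMSG_DONE, which the receive loop further down consumes. 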
*/ + memset(&rtnl_msg, 0, sizeof(rtnl_msg)); + memset(&kernel, 0, sizeof(kernel)); + memset(&req, 0, sizeof(req)); + + kernel.nl_family = AF_NETLINK; /* Fill-in kernel address (destination of our message) */ + + req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)); + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.hdr.nlmsg_seq = 1; + req.hdr.nlmsg_pid = pid; + + req.ifi.ifi_family = AF_UNSPEC; + req.ifi.ifi_type = 1; + + io.iov_base = &req; + io.iov_len = req.hdr.nlmsg_len; + rtnl_msg.msg_iov = &io; + rtnl_msg.msg_iovlen = 1; + rtnl_msg.msg_name = &kernel; + rtnl_msg.msg_namelen = sizeof(kernel); + + if (sendmsg(netlink_fd, &rtnl_msg, 0) == -1) { + SPDK_ERRLOG("Sendmsg failed!\n"); + ret = 1; + goto exit; + } + + /* Parse reply */ + while (!end) { + int len; + struct nlmsghdr *msg_ptr; /* Pointer to current message part */ + + struct msghdr rtnl_reply; /* Generic msghdr structure for use with recvmsg */ + struct iovec io_reply; + + memset(&io_reply, 0, sizeof(io_reply)); + memset(&rtnl_reply, 0, sizeof(rtnl_reply)); + + io.iov_base = reply; + io.iov_len = 8192; + rtnl_reply.msg_iov = &io; + rtnl_reply.msg_iovlen = 1; + rtnl_reply.msg_name = &kernel; + rtnl_reply.msg_namelen = sizeof(kernel); + + /* Read as much data as fits in the receive buffer */ + len = recvmsg(netlink_fd, &rtnl_reply, 0); + if (len) { + for (msg_ptr = (struct nlmsghdr *) reply; NLMSG_OK(msg_ptr, (uint32_t)len); + msg_ptr = NLMSG_NEXT(msg_ptr, len)) { + switch (msg_ptr->nlmsg_type) { + case NLMSG_DONE: /* This is the special NLMSG_DONE message we asked for by using the NLM_F_DUMP flag */ + end++; + break; + case RTM_NEWLINK: /* This is an RTM_NEWLINK message, which contains lots of information about a link */ + ret = process_new_interface_msg(msg_ptr); + if (ret != 0) { + goto exit; + } + break; + default: + break; + } + } + } + } +exit: + close(netlink_fd); + return ret; +} + +static struct spdk_interface * +interface_find_by_index(uint32_t ifc_index) +{ + struct spdk_interface *ifc_entry; + + /* Mutex must have been held by the caller */ + TAILQ_FOREACH(ifc_entry, &g_interface_head, tailq) { + if (ifc_entry->index == ifc_index) { + return ifc_entry; + } + } + + return NULL; +} + +static int netlink_addr_msg(uint32_t ifc_idx, uint32_t ip_address, uint32_t create) +{ + int fd; + struct sockaddr_nl la; + struct sockaddr_nl pa; + struct msghdr msg; + struct iovec iov; + int ifal; + struct { + struct nlmsghdr n; + struct ifaddrmsg r; + char buf[16384]; + } req; + struct rtattr *rta; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (fd < 0) { + SPDK_ERRLOG("socket failed!\n"); + return errno; + } + + /* setup local address & bind using this address. */ + bzero(&la, sizeof(la)); + la.nl_family = AF_NETLINK; + la.nl_pid = getpid(); + bind(fd, (struct sockaddr *) &la, sizeof(la)); + + /* initialize RTNETLINK request buffer. */ + bzero(&req, sizeof(req)); + + /* compute the initial length of the service request. */ + ifal = sizeof(struct ifaddrmsg); + + /* add first attrib: set IP addr and RTNETLINK buffer size. */ + rta = (struct rtattr *) req.buf; + rta->rta_type = IFA_ADDRESS; + rta->rta_len = sizeof(struct rtattr) + 4; + memcpy(((char *)rta) + sizeof(struct rtattr), &ip_address, sizeof(ip_address)); + ifal += rta->rta_len; + + /* add second attrib. 
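The second attribute, IFA_LOCAL, carries the same IPv4 address as the IFA_ADDRESS attribute above; on ordinary (non point-to-point) interfaces the kernel treats both as the interface's own address, IFA_ADDRESS differing only on point-to-point links, where it names the peer. 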
*/ + rta = (struct rtattr *)(((char *)rta) + rta->rta_len); + rta->rta_type = IFA_LOCAL; + rta->rta_len = sizeof(struct rtattr) + 4; + memcpy(((char *)rta) + sizeof(struct rtattr), &ip_address, sizeof(ip_address)); + ifal += rta->rta_len; + + /* setup the NETLINK header. */ + req.n.nlmsg_len = NLMSG_LENGTH(ifal); + if (create) { + req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_APPEND; + req.n.nlmsg_type = RTM_NEWADDR; + } else { + req.n.nlmsg_flags = NLM_F_REQUEST; + req.n.nlmsg_type = RTM_DELADDR; + } + + /* setup the service header (struct ifaddrmsg). */ + req.r.ifa_family = AF_INET; + req.r.ifa_prefixlen = 32; /* hardcoded */ + req.r.ifa_flags = IFA_F_PERMANENT | IFA_F_SECONDARY; + req.r.ifa_index = ifc_idx; + req.r.ifa_scope = 0; + + /* create the remote address to communicate. */ + bzero(&pa, sizeof(pa)); + pa.nl_family = AF_NETLINK; + + /* initialize & create the struct msghdr supplied to the sendmsg() function. */ + bzero(&msg, sizeof(msg)); + msg.msg_name = (void *) &pa; + msg.msg_namelen = sizeof(pa); + + /* place the pointer & size of the RTNETLINK message in the struct msghdr. */ + iov.iov_base = (void *) &req.n; + iov.iov_len = req.n.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + /* send the RTNETLINK message to kernel. */ + sendmsg(fd, &msg, 0); + close(fd); + return 0; +} + +static void interface_ip_update(void) +{ + struct spdk_interface *ifc_entry; + + pthread_mutex_lock(&interface_lock); + TAILQ_FOREACH(ifc_entry, &g_interface_head, tailq) { + ifc_entry->num_ip_addresses = 0; + memset(ifc_entry->ip_address, 0, sizeof(ifc_entry->ip_address)); + } + get_ifc_ipv4(); + pthread_mutex_unlock(&interface_lock); +} + +static int +interface_is_ip_address_in_use(int ifc_index, uint32_t addr, bool add) +{ + struct spdk_interface *ifc_entry; + bool in_use = false; + uint32_t idx = 0; + + interface_ip_update(); + + pthread_mutex_lock(&interface_lock); + ifc_entry = interface_find_by_index(ifc_index); + if (ifc_entry == NULL) { + pthread_mutex_unlock(&interface_lock); + return -ENODEV; + } + + for (idx = 0; idx < ifc_entry->num_ip_addresses; idx++) { + if (ifc_entry->ip_address[idx] == addr) { + in_use = true; + break; + } + } + pthread_mutex_unlock(&interface_lock); + + /* The IP address to add is already in use */ + if (add == true && in_use == true) { + return -EADDRINUSE; + } + + /* The IP address to delete is not in use */ + if (add == false && in_use == false) { + return -ENXIO; + } + + return 0; +} + +int +spdk_interface_init(void) +{ + int rc = 0; + + TAILQ_INIT(&g_interface_head); + rc = prepare_ifc_list(); + if (!rc) { + rc = get_ifc_ipv4(); + } + + return rc; +} + +void +spdk_interface_destroy(void) +{ + struct spdk_interface *ifc_entry; + + while (!TAILQ_EMPTY(&g_interface_head)) { + ifc_entry = TAILQ_FIRST(&g_interface_head); + TAILQ_REMOVE(&g_interface_head, ifc_entry, tailq); + free(ifc_entry); + } +} + +int +interface_net_interface_add_ip_address(int ifc_index, char *ip_addr) +{ + uint32_t addr; + int ret; + + addr = inet_addr(ip_addr); + + ret = interface_is_ip_address_in_use(ifc_index, addr, true); + if (ret < 0) { + return ret; + } + + return netlink_addr_msg(ifc_index, addr, 1); +} + +int +interface_net_interface_delete_ip_address(int ifc_index, char *ip_addr) +{ + uint32_t addr; + int ret; + + addr = inet_addr(ip_addr); + + ret = interface_is_ip_address_in_use(ifc_index, addr, false); + if (ret < 0) { + return ret; + } + + return netlink_addr_msg(ifc_index, addr, 0); +} + +void *interface_get_list(void) +{ + interface_ip_update(); + return 
&g_interface_head; +} + +#else /* Not Linux */ + +int +spdk_interface_init(void) +{ + return 0; +} + +void +spdk_interface_destroy(void) +{ +} + +int +interface_net_interface_add_ip_address(int ifc_index, char *ip_addr) +{ + return -1; +} + +int +interface_net_interface_delete_ip_address(int ifc_index, char *ip_addr) +{ + return -1; +} + +void * +interface_get_list(void) +{ + return NULL; +} + +#endif diff --git a/src/spdk/lib/net/net_internal.h b/src/spdk/lib/net/net_internal.h new file mode 100644 index 000000000..4a1422939 --- /dev/null +++ b/src/spdk/lib/net/net_internal.h @@ -0,0 +1,79 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_NET_INTERNAL_H +#define SPDK_NET_INTERNAL_H + +#include "spdk/stdinc.h" + +#include "spdk/queue.h" + +#define SPDK_IFNAMSIZE 32 +#define SPDK_MAX_IP_PER_IFC 32 + +struct spdk_interface { + char name[SPDK_IFNAMSIZE]; + uint32_t index; + uint32_t num_ip_addresses; /* number of IP addresses defined */ + uint32_t ip_address[SPDK_MAX_IP_PER_IFC]; + TAILQ_ENTRY(spdk_interface) tailq; +}; + +/** + * Add an ip address to the network interface. + * + * \param ifc_index Index of the network interface. + * \param ip_addr Ip address to add. + * + * \return 0 on success, -1 on failure. + */ +int interface_net_interface_add_ip_address(int ifc_index, char *ip_addr); + +/** + * Delete an ip address from the network interface. + * + * \param ifc_index Index of the network interface. + * \param ip_addr Ip address to delete. + * + * \return 0 on success, -1 on failure. + */ +int interface_net_interface_delete_ip_address(int ifc_index, char *ip_addr); + +/** + * Get the list of all the network interfaces. + * + * \return a pointer to the head of the linked list of all the network interfaces. 
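+ * + * The returned pointer is the module's internal TAILQ_HEAD(, spdk_interface) list head. + * Callers are expected to cast it and walk it with TAILQ_FOREACH(), for example (this mirrors net_rpc.c): + * + * TAILQ_HEAD(, spdk_interface) *head = interface_get_list(); + * struct spdk_interface *ifc; + * TAILQ_FOREACH(ifc, head, tailq) { ... }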
+ */ +void *interface_get_list(void); + +#endif /* SPDK_NET_INTERNAL_H */ diff --git a/src/spdk/lib/net/net_rpc.c b/src/spdk/lib/net/net_rpc.c new file mode 100644 index 000000000..47a302a6b --- /dev/null +++ b/src/spdk/lib/net/net_rpc.c @@ -0,0 +1,198 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "net_internal.h" + +#include "spdk/stdinc.h" + +#include "spdk/rpc.h" +#include "spdk/net.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +struct rpc_ip_address { + int32_t ifc_index; + char *ip_address; +}; + +static void +free_rpc_ip_address(struct rpc_ip_address *req) +{ + free(req->ip_address); +} + +static const struct spdk_json_object_decoder rpc_ip_address_decoders[] = { + {"ifc_index", offsetof(struct rpc_ip_address, ifc_index), spdk_json_decode_int32}, + {"ip_address", offsetof(struct rpc_ip_address, ip_address), spdk_json_decode_string}, +}; + +static void +rpc_net_interface_add_ip_address(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_ip_address req = {}; + struct spdk_json_write_ctx *w; + int ret_val = 0; + + if (spdk_json_decode_object(params, rpc_ip_address_decoders, + SPDK_COUNTOF(rpc_ip_address_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_NET, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + ret_val = interface_net_interface_add_ip_address(req.ifc_index, req.ip_address); + if (ret_val) { + if (ret_val == -ENODEV) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE, + "Interface %d not available", req.ifc_index); + } else if (ret_val == -EADDRINUSE) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "IP address %s is already added to interface %d", + req.ip_address, req.ifc_index); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + strerror(ret_val)); + } + goto invalid; + } + + free_rpc_ip_address(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_ip_address(&req); +} +SPDK_RPC_REGISTER("net_interface_add_ip_address", rpc_net_interface_add_ip_address, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(net_interface_add_ip_address, add_ip_address) + +static void +rpc_net_interface_delete_ip_address(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_ip_address req = {}; + struct spdk_json_write_ctx *w; + int ret_val = 0; + + if (spdk_json_decode_object(params, rpc_ip_address_decoders, + SPDK_COUNTOF(rpc_ip_address_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_NET, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto invalid; + } + + ret_val = interface_net_interface_delete_ip_address(req.ifc_index, req.ip_address); + if (ret_val) { + if (ret_val == -ENODEV) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE, + "Interface %d not available", req.ifc_index); + } else if (ret_val == -ENXIO) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "IP address %s is not found in interface %d", + req.ip_address, req.ifc_index); + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + strerror(ret_val)); + } + goto invalid; + } + + free_rpc_ip_address(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_ip_address(&req); +} +SPDK_RPC_REGISTER("net_interface_delete_ip_address", rpc_net_interface_delete_ip_address, + 
SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(net_interface_delete_ip_address, delete_ip_address) + +static void +rpc_net_get_interfaces(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + TAILQ_HEAD(, spdk_interface) *interface_head = interface_get_list(); + struct spdk_interface *ifc; + char *ip_address; + struct in_addr inaddr; + uint32_t i; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "net_get_interfaces requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + TAILQ_FOREACH(ifc, interface_head, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "name", ifc->name); + + spdk_json_write_named_int32(w, "ifc_index", ifc->index); + + spdk_json_write_named_array_begin(w, "ip_addr"); + for (i = 0; i < ifc->num_ip_addresses; i++) { + memcpy(&inaddr, &ifc->ip_address[i], sizeof(uint32_t)); + ip_address = inet_ntoa(inaddr); + spdk_json_write_string(w, ip_address); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("net_get_interfaces", rpc_net_get_interfaces, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(net_get_interfaces, get_interfaces) + +SPDK_LOG_REGISTER_COMPONENT("net", SPDK_LOG_NET) diff --git a/src/spdk/lib/net/spdk_net.map b/src/spdk/lib/net/spdk_net.map new file mode 100644 index 000000000..944bc4c6e --- /dev/null +++ b/src/spdk/lib/net/spdk_net.map @@ -0,0 +1,9 @@ +{ + global: + + # public functions + spdk_interface_init; + spdk_interface_destroy; + + local: *; +}; diff --git a/src/spdk/lib/notify/Makefile b/src/spdk/lib/notify/Makefile new file mode 100644 index 000000000..82249a5b2 --- /dev/null +++ b/src/spdk/lib/notify/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = notify.c notify_rpc.c +LIBNAME = notify + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_notify.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/notify/notify.c b/src/spdk/lib/notify/notify.c new file mode 100644 index 000000000..88c5d633b --- /dev/null +++ b/src/spdk/lib/notify/notify.c @@ -0,0 +1,150 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/queue.h> + +#include "spdk/stdinc.h" +#include "spdk/util.h" +#include "spdk/queue.h" +#include "spdk/string.h" +#include "spdk/log.h" + +#include "spdk/notify.h" + +#define SPDK_NOTIFY_MAX_EVENTS 1024 + +struct spdk_notify_type { + char name[SPDK_NOTIFY_MAX_NAME_SIZE]; + TAILQ_ENTRY(spdk_notify_type) tailq; +}; + +pthread_mutex_t g_events_lock = PTHREAD_MUTEX_INITIALIZER; +static struct spdk_notify_event g_events[SPDK_NOTIFY_MAX_EVENTS]; +static uint64_t g_events_head; + +static TAILQ_HEAD(, spdk_notify_type) g_notify_types = TAILQ_HEAD_INITIALIZER(g_notify_types); + +struct spdk_notify_type * +spdk_notify_type_register(const char *type) +{ + struct spdk_notify_type *it = NULL; + + if (!type) { + SPDK_ERRLOG("Invalid notification type %p\n", type); + return NULL; + } else if (!type[0] || strlen(type) >= SPDK_NOTIFY_MAX_NAME_SIZE) { + SPDK_ERRLOG("Notification type '%s' too short or too long\n", type); + return NULL; + } + + pthread_mutex_lock(&g_events_lock); + TAILQ_FOREACH(it, &g_notify_types, tailq) { + if (strcmp(type, it->name) == 0) { + SPDK_NOTICELOG("Notification type '%s' already registered.\n", type); + goto out; + } + } + + it = calloc(1, sizeof(*it)); + if (it == NULL) { + goto out; + } + + snprintf(it->name, sizeof(it->name), "%s", type); + TAILQ_INSERT_TAIL(&g_notify_types, it, tailq); + +out: + pthread_mutex_unlock(&g_events_lock); + return it; +} + +const char * +spdk_notify_type_get_name(const struct spdk_notify_type *type) +{ + return type->name; +} + + +void +spdk_notify_foreach_type(spdk_notify_foreach_type_cb cb, void *ctx) +{ + struct spdk_notify_type *it; + + pthread_mutex_lock(&g_events_lock); + TAILQ_FOREACH(it, &g_notify_types, tailq) { + if (cb(it, ctx)) { + break; + } + } + pthread_mutex_unlock(&g_events_lock); +} + +uint64_t +spdk_notify_send(const char *type, const char *ctx) +{ + uint64_t head; + struct spdk_notify_event *ev; + + pthread_mutex_lock(&g_events_lock); + head = g_events_head; + g_events_head++; + + ev = &g_events[head % SPDK_NOTIFY_MAX_EVENTS]; + spdk_strcpy_pad(ev->type, type, sizeof(ev->type), '\0'); + spdk_strcpy_pad(ev->ctx, ctx, sizeof(ev->ctx), '\0'); + pthread_mutex_unlock(&g_events_lock); + + return head; +} + +uint64_t +spdk_notify_foreach_event(uint64_t start_idx, uint64_t max, + spdk_notify_foreach_event_cb cb_fn, void *ctx) +{ + uint64_t i; + + pthread_mutex_lock(&g_events_lock); + + if (g_events_head > SPDK_NOTIFY_MAX_EVENTS && start_idx < g_events_head - SPDK_NOTIFY_MAX_EVENTS) { + start_idx = g_events_head - SPDK_NOTIFY_MAX_EVENTS; + } + + for (i = 0; start_idx < g_events_head && i < max; start_idx++, i++) { + if (cb_fn(start_idx, &g_events[start_idx % SPDK_NOTIFY_MAX_EVENTS], ctx)) { + break; + } + } + pthread_mutex_unlock(&g_events_lock); + + return i; +} diff --git a/src/spdk/lib/notify/notify_rpc.c b/src/spdk/lib/notify/notify_rpc.c new file mode 100644 index 000000000..fc40502c2 --- /dev/null +++ b/src/spdk/lib/notify/notify_rpc.c @@ -0,0 +1,126 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "spdk/rpc.h" +#include "spdk/string.h" +#include "spdk/notify.h" +#include "spdk/env.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +static int +notify_get_types_cb(const struct spdk_notify_type *type, void *ctx) +{ + spdk_json_write_string((struct spdk_json_write_ctx *)ctx, spdk_notify_type_get_name(type)); + return 0; +} + +static void +rpc_notify_get_types(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "No parameters required"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + spdk_notify_foreach_type(notify_get_types_cb, w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("notify_get_types", rpc_notify_get_types, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(notify_get_types, get_notification_types) + +struct rpc_notify_get_notifications { + uint64_t id; + uint64_t max; + + struct spdk_json_write_ctx *w; +}; + +static const struct spdk_json_object_decoder rpc_notify_get_notifications_decoders[] = { + {"id", offsetof(struct rpc_notify_get_notifications, id), spdk_json_decode_uint64, true}, + {"max", offsetof(struct rpc_notify_get_notifications, max), spdk_json_decode_uint64, true}, +}; + + +static int +notify_get_notifications_cb(uint64_t id, const struct spdk_notify_event *ev, void *ctx) +{ + struct rpc_notify_get_notifications *req = ctx; + + spdk_json_write_object_begin(req->w); + spdk_json_write_named_string(req->w, "type", ev->type); + spdk_json_write_named_string(req->w, "ctx", ev->ctx); + spdk_json_write_named_uint64(req->w, "id", id); + spdk_json_write_object_end(req->w); + return 0; +} + +static void +rpc_notify_get_notifications(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_notify_get_notifications req = {0, UINT64_MAX}; + + if (params && + spdk_json_decode_object(params, rpc_notify_get_notifications_decoders, + SPDK_COUNTOF(rpc_notify_get_notifications_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_NOTIFY_RPC, "spdk_json_decode_object failed\n"); + + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + 
spdk_strerror(EINVAL)); + return; + } + + + req.w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_array_begin(req.w); + spdk_notify_foreach_event(req.id, req.max, notify_get_notifications_cb, &req); + spdk_json_write_array_end(req.w); + + spdk_jsonrpc_end_result(request, req.w); +} +SPDK_RPC_REGISTER("notify_get_notifications", rpc_notify_get_notifications, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(notify_get_notifications, get_notifications) + +SPDK_LOG_REGISTER_COMPONENT("notify_rpc", SPDK_NOTIFY_RPC) diff --git a/src/spdk/lib/notify/spdk_notify.map b/src/spdk/lib/notify/spdk_notify.map new file mode 100644 index 000000000..4023a8e66 --- /dev/null +++ b/src/spdk/lib/notify/spdk_notify.map @@ -0,0 +1,10 @@ +{ + global: + spdk_notify_type_register; + spdk_notify_type_get_name; + spdk_notify_foreach_type; + spdk_notify_send; + spdk_notify_foreach_event; + + local: *; +}; diff --git a/src/spdk/lib/nvme/Makefile b/src/spdk/lib/nvme/Makefile new file mode 100644 index 000000000..1c02965f5 --- /dev/null +++ b/src/spdk/lib/nvme/Makefile @@ -0,0 +1,73 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 4 +SO_MINOR := 0 + +C_SRCS = nvme_ctrlr_cmd.c nvme_ctrlr.c nvme_fabric.c nvme_ns_cmd.c nvme_ns.c nvme_pcie.c nvme_qpair.c nvme.c nvme_quirks.c nvme_transport.c nvme_uevent.c nvme_ctrlr_ocssd_cmd.c \ + nvme_ns_ocssd_cmd.c nvme_tcp.c nvme_opal.c nvme_io_msg.c nvme_poll_group.c +C_SRCS-$(CONFIG_RDMA) += nvme_rdma.c +C_SRCS-$(CONFIG_NVME_CUSE) += nvme_cuse.c + +LIBNAME = nvme +LOCAL_SYS_LIBS = -luuid +ifeq ($(CONFIG_RDMA),y) +LOCAL_SYS_LIBS += -libverbs -lrdmacm +#Attach only if FreeBSD and RDMA is specified with configure +ifeq ($(OS),FreeBSD) +# Mellanox - MLX4 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx4.*)","") +LOCAL_SYS_LIBS += -lmlx4 +endif +# Mellanox - MLX5 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx5.*)","") +LOCAL_SYS_LIBS += -lmlx5 +endif +# Chelsio HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libcxgb4.*)","") +LOCAL_SYS_LIBS += -lcxgb4 +endif +endif +endif + +ifeq ($(CONFIG_NVME_CUSE),y) +# fuse requires to set _FILE_OFFSET_BITS to 64 bits even for 64 bit machines +CFLAGS += -D_FILE_OFFSET_BITS=64 +endif + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_nvme.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/nvme/nvme.c b/src/spdk/lib/nvme/nvme.c new file mode 100644 index 000000000..9393810a6 --- /dev/null +++ b/src/spdk/lib/nvme/nvme.c @@ -0,0 +1,1423 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "nvme_internal.h" +#include "nvme_io_msg.h" +#include "nvme_uevent.h" + +#define SPDK_NVME_DRIVER_NAME "spdk_nvme_driver" + +struct nvme_driver *g_spdk_nvme_driver; +pid_t g_spdk_nvme_pid; + +/* gross timeout of 180 seconds in milliseconds */ +static int g_nvme_driver_timeout_ms = 3 * 60 * 1000; + +/* Per-process attached controller list */ +static TAILQ_HEAD(, spdk_nvme_ctrlr) g_nvme_attached_ctrlrs = + TAILQ_HEAD_INITIALIZER(g_nvme_attached_ctrlrs); + +/* Returns true if ctrlr should be stored on the multi-process shared_attached_ctrlrs list */ +static bool +nvme_ctrlr_shared(const struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE; +} + +void +nvme_ctrlr_connected(struct spdk_nvme_probe_ctx *probe_ctx, + struct spdk_nvme_ctrlr *ctrlr) +{ + TAILQ_INSERT_TAIL(&probe_ctx->init_ctrlrs, ctrlr, tailq); +} + +int +spdk_nvme_detach(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + nvme_ctrlr_proc_put_ref(ctrlr); + + if (nvme_ctrlr_get_ref_count(ctrlr) == 0) { + nvme_io_msg_ctrlr_detach(ctrlr); + if (nvme_ctrlr_shared(ctrlr)) { + TAILQ_REMOVE(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq); + } else { + TAILQ_REMOVE(&g_nvme_attached_ctrlrs, ctrlr, tailq); + } + nvme_ctrlr_destruct(ctrlr); + } + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + return 0; +} + +void +nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_completion_poll_status *status = arg; + + if (status->timed_out) { + /* There is no routine waiting for the completion of this request, free allocated memory */ + free(status); + return; + } + + /* + * Copy status into the argument passed by the caller, so that + * the caller can check the status to determine if the + * the request passed or failed. + */ + memcpy(&status->cpl, cpl, sizeof(*cpl)); + status->done = true; +} + +/** + * Poll qpair for completions until a command completes. + * + * \param qpair queue to poll + * \param status completion status. The user must fill this structure with zeroes before calling + * this function + * \param robust_mutex optional robust mutex to lock while polling qpair + * + * \return 0 if command completed without error, + * -EIO if command completed with error, + * -ECANCELED if command is not completed due to transport/device error + * + * The command to wait upon must be submitted with nvme_completion_poll_cb as the callback + * and status as the callback argument. + */ +int +nvme_wait_for_completion_robust_lock( + struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + pthread_mutex_t *robust_mutex) +{ + int rc; + + while (status->done == false) { + if (robust_mutex) { + nvme_robust_mutex_lock(robust_mutex); + } + + rc = spdk_nvme_qpair_process_completions(qpair, 0); + + if (robust_mutex) { + nvme_robust_mutex_unlock(robust_mutex); + } + + if (rc < 0) { + status->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + status->cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + if (status->done == false) { + status->timed_out = true; + } + return -ECANCELED; + } + } + + return spdk_nvme_cpl_is_error(&status->cpl) ? -EIO : 0; +} + +int +nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status) +{ + return nvme_wait_for_completion_robust_lock(qpair, status, NULL); +} + +/** + * Poll qpair for completions until a command completes. 
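+ * Unlike nvme_wait_for_completion(), polling gives up once the optional timeout expires and the status is then marked as timed out.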
+ * + * \param qpair queue to poll + * \param status completion status. The user must fill this structure with zeroes before calling + * this function + * \param timeout_in_secs optional timeout + * + * \return 0 if command completed without error, + * -EIO if command completed with error, + * -ECANCELED if command is not completed due to transport/device error or time expired + * + * The command to wait upon must be submitted with nvme_completion_poll_cb as the callback + * and status as the callback argument. + */ +int +nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + uint64_t timeout_in_secs) +{ + uint64_t timeout_tsc = 0; + int rc = 0; + + if (timeout_in_secs) { + timeout_tsc = spdk_get_ticks() + timeout_in_secs * spdk_get_ticks_hz(); + } + + while (status->done == false) { + rc = spdk_nvme_qpair_process_completions(qpair, 0); + + if (rc < 0) { + status->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + status->cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + break; + } + if (timeout_tsc && spdk_get_ticks() > timeout_tsc) { + break; + } + } + + if (status->done == false || rc < 0) { + if (status->done == false) { + status->timed_out = true; + } + return -ECANCELED; + } + + return spdk_nvme_cpl_is_error(&status->cpl) ? -EIO : 0; +} + +static void +nvme_user_copy_cmd_complete(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = arg; + enum spdk_nvme_data_transfer xfer; + + if (req->user_buffer && req->payload_size) { + /* Copy back to the user buffer and free the contig buffer */ + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc); + if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST || + xfer == SPDK_NVME_DATA_BIDIRECTIONAL) { + assert(req->pid == getpid()); + memcpy(req->user_buffer, req->payload.contig_or_cb_arg, req->payload_size); + } + + spdk_free(req->payload.contig_or_cb_arg); + } + + /* Call the user's original callback now that the buffer has been copied */ + req->user_cb_fn(req->user_cb_arg, cpl); +} + +/** + * Allocate a request as well as a DMA-capable buffer to copy to/from the user's buffer. + * + * This is intended for use in non-fast-path functions (admin commands, reservations, etc.) + * where the overhead of a copy is not a problem. + */ +struct nvme_request * +nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, bool host_to_controller) +{ + struct nvme_request *req; + void *dma_buffer = NULL; + + if (buffer && payload_size) { + dma_buffer = spdk_zmalloc(payload_size, 4096, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!dma_buffer) { + return NULL; + } + + if (host_to_controller) { + memcpy(dma_buffer, buffer, payload_size); + } + } + + req = nvme_allocate_request_contig(qpair, dma_buffer, payload_size, nvme_user_copy_cmd_complete, + NULL); + if (!req) { + spdk_free(dma_buffer); + return NULL; + } + + req->user_cb_fn = cb_fn; + req->user_cb_arg = cb_arg; + req->user_buffer = buffer; + req->cb_arg = req; + + return req; +} + +/** + * Check if a request has exceeded the controller timeout. + * + * \param req request to check for timeout. 
+ * \param cid command ID for command submitted by req (will be passed to timeout_cb_fn) + * \param active_proc per-process data for the controller associated with req + * \param now_tick current time from spdk_get_ticks() + * \return 0 if requests submitted more recently than req should still be checked for timeouts, or + * 1 if requests newer than req need not be checked. + * + * The request's timeout callback will be called if needed; the caller is only responsible for + * calling this function on each outstanding request. + */ +int +nvme_request_check_timeout(struct nvme_request *req, uint16_t cid, + struct spdk_nvme_ctrlr_process *active_proc, + uint64_t now_tick) +{ + struct spdk_nvme_qpair *qpair = req->qpair; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + assert(active_proc->timeout_cb_fn != NULL); + + if (req->timed_out || req->submit_tick == 0) { + return 0; + } + + if (req->pid != g_spdk_nvme_pid) { + return 0; + } + + if (nvme_qpair_is_admin_queue(qpair) && + req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + return 0; + } + + if (req->submit_tick + active_proc->timeout_ticks > now_tick) { + return 1; + } + + req->timed_out = true; + + /* + * We don't want to expose the admin queue to the user, + * so when we're timing out admin commands set the + * qpair to NULL. + */ + active_proc->timeout_cb_fn(active_proc->timeout_cb_arg, ctrlr, + nvme_qpair_is_admin_queue(qpair) ? NULL : qpair, + cid); + return 0; +} + +int +nvme_robust_mutex_init_shared(pthread_mutex_t *mtx) +{ + int rc = 0; + +#ifdef __FreeBSD__ + pthread_mutex_init(mtx, NULL); +#else + pthread_mutexattr_t attr; + + if (pthread_mutexattr_init(&attr)) { + return -1; + } + if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) || + pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) || + pthread_mutex_init(mtx, &attr)) { + rc = -1; + } + pthread_mutexattr_destroy(&attr); +#endif + + return rc; +} + +int +nvme_driver_init(void) +{ + static pthread_mutex_t g_init_mutex = PTHREAD_MUTEX_INITIALIZER; + int ret = 0; + /* Any socket ID */ + int socket_id = -1; + + /* Use a special process-private mutex to ensure the global + * nvme driver object (g_spdk_nvme_driver) gets initialized by + * only one thread. Once that object is established and its + * mutex is initialized, we can unlock this mutex and use that + * one instead. + */ + pthread_mutex_lock(&g_init_mutex); + + /* Each process needs its own pid. */ + g_spdk_nvme_pid = getpid(); + + /* + * Only one thread from one process will do this driver init work. + * The primary process will reserve the shared memory and do the + * initialization. + * The secondary process will lookup the existing reserved memory. + */ + if (spdk_process_is_primary()) { + /* The unique named memzone already reserved. */ + if (g_spdk_nvme_driver != NULL) { + pthread_mutex_unlock(&g_init_mutex); + return 0; + } else { + g_spdk_nvme_driver = spdk_memzone_reserve(SPDK_NVME_DRIVER_NAME, + sizeof(struct nvme_driver), socket_id, + SPDK_MEMZONE_NO_IOVA_CONTIG); + } + + if (g_spdk_nvme_driver == NULL) { + SPDK_ERRLOG("primary process failed to reserve memory\n"); + pthread_mutex_unlock(&g_init_mutex); + return -1; + } + } else { + g_spdk_nvme_driver = spdk_memzone_lookup(SPDK_NVME_DRIVER_NAME); + + /* The unique named memzone already reserved by the primary process. */ + if (g_spdk_nvme_driver != NULL) { + int ms_waited = 0; + + /* Wait the nvme driver to get initialized. 
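A secondary process polls the shared driver object in 1 ms steps until the primary process finishes initialization or the 180 second g_nvme_driver_timeout_ms limit expires. 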
*/ + while ((g_spdk_nvme_driver->initialized == false) && + (ms_waited < g_nvme_driver_timeout_ms)) { + ms_waited++; + nvme_delay(1000); /* delay 1ms */ + } + if (g_spdk_nvme_driver->initialized == false) { + SPDK_ERRLOG("timeout waiting for primary process to init\n"); + pthread_mutex_unlock(&g_init_mutex); + return -1; + } + } else { + SPDK_ERRLOG("primary process is not started yet\n"); + pthread_mutex_unlock(&g_init_mutex); + return -1; + } + + pthread_mutex_unlock(&g_init_mutex); + return 0; + } + + /* + * At this moment, only one thread from the primary process will do + * the g_spdk_nvme_driver initialization + */ + assert(spdk_process_is_primary()); + + ret = nvme_robust_mutex_init_shared(&g_spdk_nvme_driver->lock); + if (ret != 0) { + SPDK_ERRLOG("failed to initialize mutex\n"); + spdk_memzone_free(SPDK_NVME_DRIVER_NAME); + pthread_mutex_unlock(&g_init_mutex); + return ret; + } + + /* The lock in the shared g_spdk_nvme_driver object is now ready to + * be used - so we can unlock the g_init_mutex here. + */ + pthread_mutex_unlock(&g_init_mutex); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + g_spdk_nvme_driver->initialized = false; + g_spdk_nvme_driver->hotplug_fd = nvme_uevent_connect(); + if (g_spdk_nvme_driver->hotplug_fd < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n"); + } + + TAILQ_INIT(&g_spdk_nvme_driver->shared_attached_ctrlrs); + + spdk_uuid_generate(&g_spdk_nvme_driver->default_extended_host_id); + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return ret; +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +int +nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_probe_ctx *probe_ctx, void *devhandle) +{ + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ctrlr_opts opts; + + assert(trid != NULL); + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts)); + + if (!probe_ctx->probe_cb || probe_ctx->probe_cb(probe_ctx->cb_ctx, trid, &opts)) { + ctrlr = nvme_get_ctrlr_by_trid_unsafe(trid); + if (ctrlr) { + /* This ctrlr already exists. + * Increase the ref count before calling attach_cb() as the user may + * call nvme_detach() immediately. */ + nvme_ctrlr_proc_get_ref(ctrlr); + + if (probe_ctx->attach_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + return 0; + } + + ctrlr = nvme_transport_ctrlr_construct(trid, &opts, devhandle); + if (ctrlr == NULL) { + SPDK_ERRLOG("Failed to construct NVMe controller for SSD: %s\n", trid->traddr); + return -1; + } + ctrlr->remove_cb = probe_ctx->remove_cb; + ctrlr->cb_ctx = probe_ctx->cb_ctx; + + if (ctrlr->quirks & NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE && + ctrlr->opts.io_queue_size == DEFAULT_IO_QUEUE_SIZE) { + /* If the user specifically set an IO queue size different than the + * default, use that value. Otherwise overwrite with the quirked value. + * This allows this quirk to be overridden when necessary. + * However, cap.mqes still needs to be respected. 
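+ * Note that CAP.MQES is a 0's based value, hence the + 1 in the clamp below. 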
+ */ + ctrlr->opts.io_queue_size = spdk_min(DEFAULT_IO_QUEUE_SIZE_FOR_QUIRK, ctrlr->cap.bits.mqes + 1u); + } + + nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_ENABLED); + TAILQ_INSERT_TAIL(&probe_ctx->init_ctrlrs, ctrlr, tailq); + return 0; + } + + return 1; +} + +static int +nvme_ctrlr_poll_internal(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_probe_ctx *probe_ctx) +{ + int rc = 0; + + rc = nvme_ctrlr_process_init(ctrlr); + + if (rc) { + /* Controller failed to initialize. */ + TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, tailq); + SPDK_ERRLOG("Failed to initialize SSD: %s\n", ctrlr->trid.traddr); + nvme_ctrlr_fail(ctrlr, false); + nvme_ctrlr_destruct(ctrlr); + return rc; + } + + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return 0; + } + + STAILQ_INIT(&ctrlr->io_producers); + + /* + * Controller has been initialized. + * Move it to the attached_ctrlrs list. + */ + TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, tailq); + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + if (nvme_ctrlr_shared(ctrlr)) { + TAILQ_INSERT_TAIL(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq); + } else { + TAILQ_INSERT_TAIL(&g_nvme_attached_ctrlrs, ctrlr, tailq); + } + + /* + * Increase the ref count before calling attach_cb() as the user may + * call nvme_detach() immediately. + */ + nvme_ctrlr_proc_get_ref(ctrlr); + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + if (probe_ctx->attach_cb) { + probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + return 0; + } + + return 0; +} + +static int +nvme_init_controllers(struct spdk_nvme_probe_ctx *probe_ctx) +{ + int rc = 0; + + while (true) { + rc = spdk_nvme_probe_poll_async(probe_ctx); + if (rc != -EAGAIN) { + return rc; + } + } + + return rc; +} + +/* This function must not be called while holding g_spdk_nvme_driver->lock */ +static struct spdk_nvme_ctrlr * +nvme_get_ctrlr_by_trid(const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvme_ctrlr *ctrlr; + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + ctrlr = nvme_get_ctrlr_by_trid_unsafe(trid); + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return ctrlr; +} + +/* This function must be called while holding g_spdk_nvme_driver->lock */ +struct spdk_nvme_ctrlr * +nvme_get_ctrlr_by_trid_unsafe(const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvme_ctrlr *ctrlr; + + /* Search per-process list */ + TAILQ_FOREACH(ctrlr, &g_nvme_attached_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) { + return ctrlr; + } + } + + /* Search multi-process shared list */ + TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) { + return ctrlr; + } + } + + return NULL; +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +static int +nvme_probe_internal(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + int rc; + struct spdk_nvme_ctrlr *ctrlr, *ctrlr_tmp; + + spdk_nvme_trid_populate_transport(&probe_ctx->trid, probe_ctx->trid.trtype); + if (!spdk_nvme_transport_available_by_name(probe_ctx->trid.trstring)) { + SPDK_ERRLOG("NVMe trtype %u not available\n", probe_ctx->trid.trtype); + return -1; + } + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + rc = nvme_transport_ctrlr_scan(probe_ctx, direct_connect); + if (rc != 0) { + SPDK_ERRLOG("NVMe ctrlr scan failed\n"); + TAILQ_FOREACH_SAFE(ctrlr, &probe_ctx->init_ctrlrs, tailq, ctrlr_tmp) { + TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, 
tailq); + nvme_transport_ctrlr_destruct(ctrlr); + } + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + return -1; + } + + /* + * Probe controllers on the shared_attached_ctrlrs list + */ + if (!spdk_process_is_primary() && (probe_ctx->trid.trtype == SPDK_NVME_TRANSPORT_PCIE)) { + TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) { + /* Do not attach other ctrlrs if user specify a valid trid */ + if ((strlen(probe_ctx->trid.traddr) != 0) && + (spdk_nvme_transport_id_compare(&probe_ctx->trid, &ctrlr->trid))) { + continue; + } + + /* Do not attach if we failed to initialize it in this process */ + if (nvme_ctrlr_get_current_process(ctrlr) == NULL) { + continue; + } + + nvme_ctrlr_proc_get_ref(ctrlr); + + /* + * Unlock while calling attach_cb() so the user can call other functions + * that may take the driver lock, like nvme_detach(). + */ + if (probe_ctx->attach_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return 0; +} + +static void +nvme_probe_ctx_init(struct spdk_nvme_probe_ctx *probe_ctx, + const struct spdk_nvme_transport_id *trid, + void *cb_ctx, + spdk_nvme_probe_cb probe_cb, + spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb) +{ + probe_ctx->trid = *trid; + probe_ctx->cb_ctx = cb_ctx; + probe_ctx->probe_cb = probe_cb; + probe_ctx->attach_cb = attach_cb; + probe_ctx->remove_cb = remove_cb; + TAILQ_INIT(&probe_ctx->init_ctrlrs); +} + +int +spdk_nvme_probe(const struct spdk_nvme_transport_id *trid, void *cb_ctx, + spdk_nvme_probe_cb probe_cb, spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb) +{ + struct spdk_nvme_transport_id trid_pcie; + struct spdk_nvme_probe_ctx *probe_ctx; + + if (trid == NULL) { + memset(&trid_pcie, 0, sizeof(trid_pcie)); + spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); + trid = &trid_pcie; + } + + probe_ctx = spdk_nvme_probe_async(trid, cb_ctx, probe_cb, + attach_cb, remove_cb); + if (!probe_ctx) { + SPDK_ERRLOG("Create probe context failed\n"); + return -1; + } + + /* + * Keep going even if one or more nvme_attach() calls failed, + * but maintain the value of rc to signal errors when we return. 
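+ * nvme_init_controllers() simply re-polls spdk_nvme_probe_poll_async() until it stops returning -EAGAIN. 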
+ */ + return nvme_init_controllers(probe_ctx); +} + +static bool +nvme_connect_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + struct spdk_nvme_ctrlr_opts *requested_opts = cb_ctx; + + assert(requested_opts); + memcpy(opts, requested_opts, sizeof(*opts)); + + return true; +} + +static void +nvme_ctrlr_opts_init(struct spdk_nvme_ctrlr_opts *opts, + const struct spdk_nvme_ctrlr_opts *opts_user, + size_t opts_size_user) +{ + assert(opts); + assert(opts_user); + + spdk_nvme_ctrlr_get_default_ctrlr_opts(opts, opts_size_user); + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_ctrlr_opts, field) + sizeof(opts->field) <= (opts->opts_size) + + if (FIELD_OK(num_io_queues)) { + opts->num_io_queues = opts_user->num_io_queues; + } + + if (FIELD_OK(use_cmb_sqs)) { + opts->use_cmb_sqs = opts_user->use_cmb_sqs; + } + + if (FIELD_OK(no_shn_notification)) { + opts->no_shn_notification = opts_user->no_shn_notification; + } + + if (FIELD_OK(arb_mechanism)) { + opts->arb_mechanism = opts_user->arb_mechanism; + } + + if (FIELD_OK(arbitration_burst)) { + opts->arbitration_burst = opts_user->arbitration_burst; + } + + if (FIELD_OK(low_priority_weight)) { + opts->low_priority_weight = opts_user->low_priority_weight; + } + + if (FIELD_OK(medium_priority_weight)) { + opts->medium_priority_weight = opts_user->medium_priority_weight; + } + + if (FIELD_OK(high_priority_weight)) { + opts->high_priority_weight = opts_user->high_priority_weight; + } + + if (FIELD_OK(keep_alive_timeout_ms)) { + opts->keep_alive_timeout_ms = opts_user->keep_alive_timeout_ms; + } + + if (FIELD_OK(transport_retry_count)) { + opts->transport_retry_count = opts_user->transport_retry_count; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = opts_user->io_queue_size; + } + + if (FIELD_OK(hostnqn)) { + memcpy(opts->hostnqn, opts_user->hostnqn, sizeof(opts_user->hostnqn)); + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = opts_user->io_queue_requests; + } + + if (FIELD_OK(src_addr)) { + memcpy(opts->src_addr, opts_user->src_addr, sizeof(opts_user->src_addr)); + } + + if (FIELD_OK(src_svcid)) { + memcpy(opts->src_svcid, opts_user->src_svcid, sizeof(opts_user->src_svcid)); + } + + if (FIELD_OK(host_id)) { + memcpy(opts->host_id, opts_user->host_id, sizeof(opts_user->host_id)); + } + if (FIELD_OK(extended_host_id)) { + memcpy(opts->extended_host_id, opts_user->extended_host_id, + sizeof(opts_user->extended_host_id)); + } + + if (FIELD_OK(command_set)) { + opts->command_set = opts_user->command_set; + } + + if (FIELD_OK(admin_timeout_ms)) { + opts->admin_timeout_ms = opts_user->admin_timeout_ms; + } + + if (FIELD_OK(header_digest)) { + opts->header_digest = opts_user->header_digest; + } + + if (FIELD_OK(data_digest)) { + opts->data_digest = opts_user->data_digest; + } + + if (FIELD_OK(disable_error_logging)) { + opts->disable_error_logging = opts_user->disable_error_logging; + } + + if (FIELD_OK(transport_ack_timeout)) { + opts->transport_ack_timeout = opts_user->transport_ack_timeout; + } + + if (FIELD_OK(admin_queue_size)) { + opts->admin_queue_size = opts_user->admin_queue_size; + } +#undef FIELD_OK +} + +struct spdk_nvme_ctrlr * +spdk_nvme_connect(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, size_t opts_size) +{ + int rc; + struct spdk_nvme_ctrlr *ctrlr = NULL; + struct spdk_nvme_probe_ctx *probe_ctx; + struct spdk_nvme_ctrlr_opts *opts_local_p = NULL; + struct spdk_nvme_ctrlr_opts opts_local; + + if (trid == 
NULL) { + SPDK_ERRLOG("No transport ID specified\n"); + return NULL; + } + + if (opts) { + opts_local_p = &opts_local; + nvme_ctrlr_opts_init(opts_local_p, opts, opts_size); + } + + probe_ctx = spdk_nvme_connect_async(trid, opts_local_p, NULL); + if (!probe_ctx) { + SPDK_ERRLOG("Create probe context failed\n"); + return NULL; + } + + rc = nvme_init_controllers(probe_ctx); + if (rc != 0) { + return NULL; + } + + ctrlr = nvme_get_ctrlr_by_trid(trid); + + return ctrlr; +} + +void +spdk_nvme_trid_populate_transport(struct spdk_nvme_transport_id *trid, + enum spdk_nvme_transport_type trtype) +{ + const char *trstring = ""; + + trid->trtype = trtype; + switch (trtype) { + case SPDK_NVME_TRANSPORT_FC: + trstring = SPDK_NVME_TRANSPORT_NAME_FC; + break; + case SPDK_NVME_TRANSPORT_PCIE: + trstring = SPDK_NVME_TRANSPORT_NAME_PCIE; + break; + case SPDK_NVME_TRANSPORT_RDMA: + trstring = SPDK_NVME_TRANSPORT_NAME_RDMA; + break; + case SPDK_NVME_TRANSPORT_TCP: + trstring = SPDK_NVME_TRANSPORT_NAME_TCP; + break; + case SPDK_NVME_TRANSPORT_CUSTOM: + default: + SPDK_ERRLOG("don't use this for custom transports\n"); + assert(0); + return; + } + snprintf(trid->trstring, SPDK_NVMF_TRSTRING_MAX_LEN, "%s", trstring); +} + +int +spdk_nvme_transport_id_populate_trstring(struct spdk_nvme_transport_id *trid, const char *trstring) +{ + int len, i, rc; + + if (trstring == NULL) { + return -EINVAL; + } + + len = strnlen(trstring, SPDK_NVMF_TRSTRING_MAX_LEN); + if (len == SPDK_NVMF_TRSTRING_MAX_LEN) { + return -EINVAL; + } + + rc = snprintf(trid->trstring, SPDK_NVMF_TRSTRING_MAX_LEN, "%s", trstring); + if (rc < 0) { + return rc; + } + + /* cast official trstring to uppercase version of input. */ + for (i = 0; i < len; i++) { + trid->trstring[i] = toupper(trid->trstring[i]); + } + return 0; +} + +int +spdk_nvme_transport_id_parse_trtype(enum spdk_nvme_transport_type *trtype, const char *str) +{ + if (trtype == NULL || str == NULL) { + return -EINVAL; + } + + if (strcasecmp(str, "PCIe") == 0) { + *trtype = SPDK_NVME_TRANSPORT_PCIE; + } else if (strcasecmp(str, "RDMA") == 0) { + *trtype = SPDK_NVME_TRANSPORT_RDMA; + } else if (strcasecmp(str, "FC") == 0) { + *trtype = SPDK_NVME_TRANSPORT_FC; + } else if (strcasecmp(str, "TCP") == 0) { + *trtype = SPDK_NVME_TRANSPORT_TCP; + } else { + *trtype = SPDK_NVME_TRANSPORT_CUSTOM; + } + return 0; +} + +const char * +spdk_nvme_transport_id_trtype_str(enum spdk_nvme_transport_type trtype) +{ + switch (trtype) { + case SPDK_NVME_TRANSPORT_PCIE: + return "PCIe"; + case SPDK_NVME_TRANSPORT_RDMA: + return "RDMA"; + case SPDK_NVME_TRANSPORT_FC: + return "FC"; + case SPDK_NVME_TRANSPORT_TCP: + return "TCP"; + case SPDK_NVME_TRANSPORT_CUSTOM: + return "CUSTOM"; + default: + return NULL; + } +} + +int +spdk_nvme_transport_id_parse_adrfam(enum spdk_nvmf_adrfam *adrfam, const char *str) +{ + if (adrfam == NULL || str == NULL) { + return -EINVAL; + } + + if (strcasecmp(str, "IPv4") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IPV4; + } else if (strcasecmp(str, "IPv6") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IPV6; + } else if (strcasecmp(str, "IB") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IB; + } else if (strcasecmp(str, "FC") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_FC; + } else { + return -ENOENT; + } + return 0; +} + +const char * +spdk_nvme_transport_id_adrfam_str(enum spdk_nvmf_adrfam adrfam) +{ + switch (adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + return "IPv4"; + case SPDK_NVMF_ADRFAM_IPV6: + return "IPv6"; + case SPDK_NVMF_ADRFAM_IB: + return "IB"; + case SPDK_NVMF_ADRFAM_FC: + return "FC"; + default: + return 
NULL; + } +} + +static size_t +parse_next_key(const char **str, char *key, char *val, size_t key_buf_size, size_t val_buf_size) +{ + + const char *sep, *sep1; + const char *whitespace = " \t\n"; + size_t key_len, val_len; + + *str += strspn(*str, whitespace); + + sep = strchr(*str, ':'); + if (!sep) { + sep = strchr(*str, '='); + if (!sep) { + SPDK_ERRLOG("Key without ':' or '=' separator\n"); + return 0; + } + } else { + sep1 = strchr(*str, '='); + if ((sep1 != NULL) && (sep1 < sep)) { + sep = sep1; + } + } + + key_len = sep - *str; + if (key_len >= key_buf_size) { + SPDK_ERRLOG("Key length %zu greater than maximum allowed %zu\n", + key_len, key_buf_size - 1); + return 0; + } + + memcpy(key, *str, key_len); + key[key_len] = '\0'; + + *str += key_len + 1; /* Skip key: */ + val_len = strcspn(*str, whitespace); + if (val_len == 0) { + SPDK_ERRLOG("Key without value\n"); + return 0; + } + + if (val_len >= val_buf_size) { + SPDK_ERRLOG("Value length %zu greater than maximum allowed %zu\n", + val_len, val_buf_size - 1); + return 0; + } + + memcpy(val, *str, val_len); + val[val_len] = '\0'; + + *str += val_len; + + return val_len; +} + +int +spdk_nvme_transport_id_parse(struct spdk_nvme_transport_id *trid, const char *str) +{ + size_t val_len; + char key[32]; + char val[1024]; + + if (trid == NULL || str == NULL) { + return -EINVAL; + } + + while (*str != '\0') { + + val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val)); + + if (val_len == 0) { + SPDK_ERRLOG("Failed to parse transport ID\n"); + return -EINVAL; + } + + if (strcasecmp(key, "trtype") == 0) { + if (spdk_nvme_transport_id_populate_trstring(trid, val) != 0) { + SPDK_ERRLOG("invalid transport '%s'\n", val); + return -EINVAL; + } + if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, val) != 0) { + SPDK_ERRLOG("Unknown trtype '%s'\n", val); + return -EINVAL; + } + } else if (strcasecmp(key, "adrfam") == 0) { + if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, val) != 0) { + SPDK_ERRLOG("Unknown adrfam '%s'\n", val); + return -EINVAL; + } + } else if (strcasecmp(key, "traddr") == 0) { + if (val_len > SPDK_NVMF_TRADDR_MAX_LEN) { + SPDK_ERRLOG("traddr length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRADDR_MAX_LEN); + return -EINVAL; + } + memcpy(trid->traddr, val, val_len + 1); + } else if (strcasecmp(key, "trsvcid") == 0) { + if (val_len > SPDK_NVMF_TRSVCID_MAX_LEN) { + SPDK_ERRLOG("trsvcid length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRSVCID_MAX_LEN); + return -EINVAL; + } + memcpy(trid->trsvcid, val, val_len + 1); + } else if (strcasecmp(key, "priority") == 0) { + if (val_len > SPDK_NVMF_PRIORITY_MAX_LEN) { + SPDK_ERRLOG("priority length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_PRIORITY_MAX_LEN); + return -EINVAL; + } + trid->priority = spdk_strtol(val, 10); + } else if (strcasecmp(key, "subnqn") == 0) { + if (val_len > SPDK_NVMF_NQN_MAX_LEN) { + SPDK_ERRLOG("subnqn length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_NQN_MAX_LEN); + return -EINVAL; + } + memcpy(trid->subnqn, val, val_len + 1); + } else if (strcasecmp(key, "hostaddr") == 0) { + continue; + } else if (strcasecmp(key, "hostsvcid") == 0) { + continue; + } else if (strcasecmp(key, "ns") == 0) { + /* + * Special case. The namespace id parameter may + * optionally be passed in the transport id string + * for an SPDK application (e.g. nvme/perf) + * and additionally parsed therein to limit + * targeting a specific namespace. 
For this + * scenario, just silently ignore this key + * rather than letting it default to logging + * it as an invalid key. + */ + continue; + } else if (strcasecmp(key, "alt_traddr") == 0) { + /* + * Used by applications for enabling transport ID failover. + * Please see the case above for more information on custom parameters. + */ + continue; + } else { + SPDK_ERRLOG("Unknown transport ID key '%s'\n", key); + } + } + + return 0; +} + +int +spdk_nvme_host_id_parse(struct spdk_nvme_host_id *hostid, const char *str) +{ + + size_t key_size = 32; + size_t val_size = 1024; + size_t val_len; + char key[key_size]; + char val[val_size]; + + if (hostid == NULL || str == NULL) { + return -EINVAL; + } + + while (*str != '\0') { + + val_len = parse_next_key(&str, key, val, key_size, val_size); + + if (val_len == 0) { + SPDK_ERRLOG("Failed to parse host ID\n"); + return val_len; + } + + /* Ignore the rest of the options from the transport ID. */ + if (strcasecmp(key, "trtype") == 0) { + continue; + } else if (strcasecmp(key, "adrfam") == 0) { + continue; + } else if (strcasecmp(key, "traddr") == 0) { + continue; + } else if (strcasecmp(key, "trsvcid") == 0) { + continue; + } else if (strcasecmp(key, "subnqn") == 0) { + continue; + } else if (strcasecmp(key, "priority") == 0) { + continue; + } else if (strcasecmp(key, "ns") == 0) { + continue; + } else if (strcasecmp(key, "hostaddr") == 0) { + if (val_len > SPDK_NVMF_TRADDR_MAX_LEN) { + SPDK_ERRLOG("hostaddr length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRADDR_MAX_LEN); + return -EINVAL; + } + memcpy(hostid->hostaddr, val, val_len + 1); + + } else if (strcasecmp(key, "hostsvcid") == 0) { + if (val_len > SPDK_NVMF_TRSVCID_MAX_LEN) { + SPDK_ERRLOG("trsvcid length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRSVCID_MAX_LEN); + return -EINVAL; + } + memcpy(hostid->hostsvcid, val, val_len + 1); + } else { + SPDK_ERRLOG("Unknown transport ID key '%s'\n", key); + } + } + + return 0; +} + +static int +cmp_int(int a, int b) +{ + return a - b; +} + +int +spdk_nvme_transport_id_compare(const struct spdk_nvme_transport_id *trid1, + const struct spdk_nvme_transport_id *trid2) +{ + int cmp; + + if (trid1->trtype == SPDK_NVME_TRANSPORT_CUSTOM) { + cmp = strcasecmp(trid1->trstring, trid2->trstring); + } else { + cmp = cmp_int(trid1->trtype, trid2->trtype); + } + + if (cmp) { + return cmp; + } + + if (trid1->trtype == SPDK_NVME_TRANSPORT_PCIE) { + struct spdk_pci_addr pci_addr1 = {}; + struct spdk_pci_addr pci_addr2 = {}; + + /* Normalize PCI addresses before comparing */ + if (spdk_pci_addr_parse(&pci_addr1, trid1->traddr) < 0 || + spdk_pci_addr_parse(&pci_addr2, trid2->traddr) < 0) { + return -1; + } + + /* PCIe transport ID only uses trtype and traddr */ + return spdk_pci_addr_compare(&pci_addr1, &pci_addr2); + } + + cmp = strcasecmp(trid1->traddr, trid2->traddr); + if (cmp) { + return cmp; + } + + cmp = cmp_int(trid1->adrfam, trid2->adrfam); + if (cmp) { + return cmp; + } + + cmp = strcasecmp(trid1->trsvcid, trid2->trsvcid); + if (cmp) { + return cmp; + } + + cmp = strcmp(trid1->subnqn, trid2->subnqn); + if (cmp) { + return cmp; + } + + return 0; +} + +int +spdk_nvme_prchk_flags_parse(uint32_t *prchk_flags, const char *str) +{ + size_t val_len; + char key[32]; + char val[1024]; + + if (prchk_flags == NULL || str == NULL) { + return -EINVAL; + } + + while (*str != '\0') { + val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val)); + + if (val_len == 0) { + SPDK_ERRLOG("Failed to parse prchk\n"); + return -EINVAL; + 
} + + if (strcasecmp(key, "prchk") == 0) { + if (strcasestr(val, "reftag") != NULL) { + *prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG; + } + if (strcasestr(val, "guard") != NULL) { + *prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD; + } + } else { + SPDK_ERRLOG("Unknown key '%s'\n", key); + return -EINVAL; + } + } + + return 0; +} + +const char * +spdk_nvme_prchk_flags_str(uint32_t prchk_flags) +{ + if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) { + if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) { + return "prchk:reftag|guard"; + } else { + return "prchk:reftag"; + } + } else { + if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) { + return "prchk:guard"; + } else { + return NULL; + } + } +} + +struct spdk_nvme_probe_ctx * +spdk_nvme_probe_async(const struct spdk_nvme_transport_id *trid, + void *cb_ctx, + spdk_nvme_probe_cb probe_cb, + spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb) +{ + int rc; + struct spdk_nvme_probe_ctx *probe_ctx; + + rc = nvme_driver_init(); + if (rc != 0) { + return NULL; + } + + probe_ctx = calloc(1, sizeof(*probe_ctx)); + if (!probe_ctx) { + return NULL; + } + + nvme_probe_ctx_init(probe_ctx, trid, cb_ctx, probe_cb, attach_cb, remove_cb); + rc = nvme_probe_internal(probe_ctx, false); + if (rc != 0) { + free(probe_ctx); + return NULL; + } + + return probe_ctx; +} + +int +spdk_nvme_probe_poll_async(struct spdk_nvme_probe_ctx *probe_ctx) +{ + int rc = 0; + struct spdk_nvme_ctrlr *ctrlr, *ctrlr_tmp; + + if (!spdk_process_is_primary() && probe_ctx->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + free(probe_ctx); + return 0; + } + + TAILQ_FOREACH_SAFE(ctrlr, &probe_ctx->init_ctrlrs, tailq, ctrlr_tmp) { + rc = nvme_ctrlr_poll_internal(ctrlr, probe_ctx); + if (rc != 0) { + rc = -EIO; + break; + } + } + + if (rc != 0 || TAILQ_EMPTY(&probe_ctx->init_ctrlrs)) { + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + g_spdk_nvme_driver->initialized = true; + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + free(probe_ctx); + return rc; + } + + return -EAGAIN; +} + +struct spdk_nvme_probe_ctx * +spdk_nvme_connect_async(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + spdk_nvme_attach_cb attach_cb) +{ + int rc; + spdk_nvme_probe_cb probe_cb = NULL; + struct spdk_nvme_probe_ctx *probe_ctx; + + rc = nvme_driver_init(); + if (rc != 0) { + return NULL; + } + + probe_ctx = calloc(1, sizeof(*probe_ctx)); + if (!probe_ctx) { + return NULL; + } + + if (opts) { + probe_cb = nvme_connect_probe_cb; + } + + nvme_probe_ctx_init(probe_ctx, trid, (void *)opts, probe_cb, attach_cb, NULL); + rc = nvme_probe_internal(probe_ctx, true); + if (rc != 0) { + free(probe_ctx); + return NULL; + } + + return probe_ctx; +} + +SPDK_LOG_REGISTER_COMPONENT("nvme", SPDK_LOG_NVME) diff --git a/src/spdk/lib/nvme/nvme_ctrlr.c b/src/spdk/lib/nvme/nvme_ctrlr.c new file mode 100644 index 000000000..ced02e9bb --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr.c @@ -0,0 +1,3639 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvme_internal.h" +#include "nvme_io_msg.h" + +#include "spdk/env.h" +#include "spdk/string.h" + +struct nvme_active_ns_ctx; + +static void nvme_ctrlr_destruct_namespaces(struct spdk_nvme_ctrlr *ctrlr); +static int nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_async_event_request *aer); +static void nvme_ctrlr_identify_active_ns_async(struct nvme_active_ns_ctx *ctx); +static int nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns); +static int nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns); + +static int +nvme_ctrlr_get_cc(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cc_register *cc) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + &cc->raw); +} + +static int +nvme_ctrlr_get_csts(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_csts_register *csts) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, csts.raw), + &csts->raw); +} + +int +nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap) +{ + return nvme_transport_ctrlr_get_reg_8(ctrlr, offsetof(struct spdk_nvme_registers, cap.raw), + &cap->raw); +} + +int +nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, vs.raw), + &vs->raw); +} + +static int +nvme_ctrlr_set_cc(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cc_register *cc) +{ + return nvme_transport_ctrlr_set_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + cc->raw); +} + +int +nvme_ctrlr_get_cmbsz(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cmbsz_register *cmbsz) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw), + &cmbsz->raw); +} + +/* When the field in spdk_nvme_ctrlr_opts are changed and you change this function, please + * also update the nvme_ctrl_opts_init function in nvme_ctrlr.c + */ +void +spdk_nvme_ctrlr_get_default_ctrlr_opts(struct spdk_nvme_ctrlr_opts *opts, size_t opts_size) +{ + char host_id_str[SPDK_UUID_STRING_LEN]; + + assert(opts); + + opts->opts_size = opts_size; + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_ctrlr_opts, field) + sizeof(opts->field) <= opts_size + + 
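/* Only the fields that fit within the caller-provided opts_size are initialized, so callers built against an older, smaller spdk_nvme_ctrlr_opts still get valid defaults. */ +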
if (FIELD_OK(num_io_queues)) { + opts->num_io_queues = DEFAULT_MAX_IO_QUEUES; + } + + if (FIELD_OK(use_cmb_sqs)) { + opts->use_cmb_sqs = true; + } + + if (FIELD_OK(no_shn_notification)) { + opts->no_shn_notification = false; + } + + if (FIELD_OK(arb_mechanism)) { + opts->arb_mechanism = SPDK_NVME_CC_AMS_RR; + } + + if (FIELD_OK(arbitration_burst)) { + opts->arbitration_burst = 0; + } + + if (FIELD_OK(low_priority_weight)) { + opts->low_priority_weight = 0; + } + + if (FIELD_OK(medium_priority_weight)) { + opts->medium_priority_weight = 0; + } + + if (FIELD_OK(high_priority_weight)) { + opts->high_priority_weight = 0; + } + + if (FIELD_OK(keep_alive_timeout_ms)) { + opts->keep_alive_timeout_ms = MIN_KEEP_ALIVE_TIMEOUT_IN_MS; + } + + if (FIELD_OK(transport_retry_count)) { + opts->transport_retry_count = SPDK_NVME_DEFAULT_RETRY_COUNT; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = DEFAULT_IO_QUEUE_SIZE; + } + + if (nvme_driver_init() == 0) { + if (FIELD_OK(hostnqn)) { + spdk_uuid_fmt_lower(host_id_str, sizeof(host_id_str), + &g_spdk_nvme_driver->default_extended_host_id); + snprintf(opts->hostnqn, sizeof(opts->hostnqn), "2014-08.org.nvmexpress:uuid:%s", host_id_str); + } + + if (FIELD_OK(extended_host_id)) { + memcpy(opts->extended_host_id, &g_spdk_nvme_driver->default_extended_host_id, + sizeof(opts->extended_host_id)); + } + + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = DEFAULT_IO_QUEUE_REQUESTS; + } + + if (FIELD_OK(src_addr)) { + memset(opts->src_addr, 0, sizeof(opts->src_addr)); + } + + if (FIELD_OK(src_svcid)) { + memset(opts->src_svcid, 0, sizeof(opts->src_svcid)); + } + + if (FIELD_OK(host_id)) { + memset(opts->host_id, 0, sizeof(opts->host_id)); + } + + if (FIELD_OK(command_set)) { + opts->command_set = SPDK_NVME_CC_CSS_NVM; + } + + if (FIELD_OK(admin_timeout_ms)) { + opts->admin_timeout_ms = NVME_MAX_ADMIN_TIMEOUT_IN_SECS * 1000; + } + + if (FIELD_OK(header_digest)) { + opts->header_digest = false; + } + + if (FIELD_OK(data_digest)) { + opts->data_digest = false; + } + + if (FIELD_OK(disable_error_logging)) { + opts->disable_error_logging = false; + } + + if (FIELD_OK(transport_ack_timeout)) { + opts->transport_ack_timeout = SPDK_NVME_DEFAULT_TRANSPORT_ACK_TIMEOUT; + } + + if (FIELD_OK(admin_queue_size)) { + opts->admin_queue_size = DEFAULT_ADMIN_QUEUE_SIZE; + } +#undef FIELD_OK +} + +/** + * This function will be called when the process allocates the IO qpair. + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_ctrlr_proc_add_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + TAILQ_INSERT_TAIL(&active_proc->allocated_io_qpairs, qpair, per_process_tailq); + qpair->active_proc = active_proc; + } +} + +/** + * This function will be called when the process frees the IO qpair. + * Note: the ctrlr_lock must be held when calling this function. 
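+ * The qpair is removed from the per-process allocated_io_qpairs list if it is found there.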
+ */ +static void +nvme_ctrlr_proc_remove_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_qpair *active_qpair, *tmp_qpair; + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (!active_proc) { + return; + } + + TAILQ_FOREACH_SAFE(active_qpair, &active_proc->allocated_io_qpairs, + per_process_tailq, tmp_qpair) { + if (active_qpair == qpair) { + TAILQ_REMOVE(&active_proc->allocated_io_qpairs, + active_qpair, per_process_tailq); + + break; + } + } +} + +void +spdk_nvme_ctrlr_get_default_io_qpair_opts(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_io_qpair_opts *opts, + size_t opts_size) +{ + assert(ctrlr); + + assert(opts); + + memset(opts, 0, opts_size); + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_io_qpair_opts, field) + sizeof(opts->field) <= opts_size + + if (FIELD_OK(qprio)) { + opts->qprio = SPDK_NVME_QPRIO_URGENT; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = ctrlr->opts.io_queue_size; + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = ctrlr->opts.io_queue_requests; + } + + if (FIELD_OK(delay_cmd_submit)) { + opts->delay_cmd_submit = false; + } + + if (FIELD_OK(sq.vaddr)) { + opts->sq.vaddr = NULL; + } + + if (FIELD_OK(sq.paddr)) { + opts->sq.paddr = 0; + } + + if (FIELD_OK(sq.buffer_size)) { + opts->sq.buffer_size = 0; + } + + if (FIELD_OK(cq.vaddr)) { + opts->cq.vaddr = NULL; + } + + if (FIELD_OK(cq.paddr)) { + opts->cq.paddr = 0; + } + + if (FIELD_OK(cq.buffer_size)) { + opts->cq.buffer_size = 0; + } + + if (FIELD_OK(create_only)) { + opts->create_only = false; + } + +#undef FIELD_OK +} + +static struct spdk_nvme_qpair * +nvme_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_io_qpair_opts *opts) +{ + uint32_t qid; + struct spdk_nvme_qpair *qpair; + union spdk_nvme_cc_register cc; + + if (!ctrlr) { + return NULL; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc failed\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + if (opts->qprio & ~SPDK_NVME_CREATE_IO_SQ_QPRIO_MASK) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + /* + * Only value SPDK_NVME_QPRIO_URGENT(0) is valid for the + * default round robin arbitration method. + */ + if ((cc.bits.ams == SPDK_NVME_CC_AMS_RR) && (opts->qprio != SPDK_NVME_QPRIO_URGENT)) { + SPDK_ERRLOG("invalid queue priority for default round robin arbitration method\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + /* + * Get the first available I/O queue ID. 
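+ * Queue ID 0 is reserved for the admin queue, so the search starts at bit 1.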
+ */ + qid = spdk_bit_array_find_first_set(ctrlr->free_io_qids, 1); + if (qid > ctrlr->opts.num_io_queues) { + SPDK_ERRLOG("No free I/O queue IDs\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + qpair = nvme_transport_ctrlr_create_io_qpair(ctrlr, qid, opts); + if (qpair == NULL) { + SPDK_ERRLOG("nvme_transport_ctrlr_create_io_qpair() failed\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + spdk_bit_array_clear(ctrlr->free_io_qids, qid); + TAILQ_INSERT_TAIL(&ctrlr->active_io_qpairs, qpair, tailq); + + nvme_ctrlr_proc_add_io_qpair(qpair); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return qpair; +} + +int +spdk_nvme_ctrlr_connect_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + int rc; + + if (nvme_qpair_get_state(qpair) != NVME_QPAIR_DISCONNECTED) { + return -EISCONN; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (ctrlr->quirks & NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC) { + spdk_delay_us(100); + } + + return rc; +} + +void +spdk_nvme_ctrlr_disconnect_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +struct spdk_nvme_qpair * +spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_io_qpair_opts *user_opts, + size_t opts_size) +{ + + struct spdk_nvme_qpair *qpair; + struct spdk_nvme_io_qpair_opts opts; + int rc; + + /* + * Get the default options, then overwrite them with the user-provided options + * up to opts_size. + * + * This allows for extensions of the opts structure without breaking + * ABI compatibility. 
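+ * Fields beyond the caller-provided opts_size keep their default values.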
+ */ + spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); + if (user_opts) { + memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size)); + + /* If user passes buffers, make sure they're big enough for the requested queue size */ + if (opts.sq.vaddr) { + if (opts.sq.buffer_size < (opts.io_queue_size * sizeof(struct spdk_nvme_cmd))) { + SPDK_ERRLOG("sq buffer size %lx is too small for sq size %lx\n", + opts.sq.buffer_size, (opts.io_queue_size * sizeof(struct spdk_nvme_cmd))); + return NULL; + } + } + if (opts.cq.vaddr) { + if (opts.cq.buffer_size < (opts.io_queue_size * sizeof(struct spdk_nvme_cpl))) { + SPDK_ERRLOG("cq buffer size %lx is too small for cq size %lx\n", + opts.cq.buffer_size, (opts.io_queue_size * sizeof(struct spdk_nvme_cpl))); + return NULL; + } + } + } + + qpair = nvme_ctrlr_create_io_qpair(ctrlr, &opts); + + if (qpair == NULL || opts.create_only == true) { + return qpair; + } + + rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair); + if (rc != 0) { + SPDK_ERRLOG("nvme_transport_ctrlr_connect_io_qpair() failed\n"); + nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair); + return NULL; + } + + return qpair; +} + +int +spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr; + enum nvme_qpair_state qpair_state; + int rc; + + assert(qpair != NULL); + assert(nvme_qpair_is_admin_queue(qpair) == false); + assert(qpair->ctrlr != NULL); + + ctrlr = qpair->ctrlr; + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + qpair_state = nvme_qpair_get_state(qpair); + + if (ctrlr->is_removed) { + rc = -ENODEV; + goto out; + } + + if (ctrlr->is_resetting || qpair_state == NVME_QPAIR_DISCONNECTING) { + rc = -EAGAIN; + goto out; + } + + if (ctrlr->is_failed || qpair_state == NVME_QPAIR_DESTROYING) { + rc = -ENXIO; + goto out; + } + + if (qpair_state != NVME_QPAIR_DISCONNECTED) { + rc = 0; + goto out; + } + + rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair); + if (rc) { + rc = -EAGAIN; + goto out; + } + +out: + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +spdk_nvme_qp_failure_reason +spdk_nvme_ctrlr_get_admin_qp_failure_reason(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->adminq->transport_failure_reason; +} + +/* + * This internal function will attempt to take the controller + * lock before calling disconnect on a controller qpair. + * Functions already holding the controller lock should + * call nvme_transport_ctrlr_disconnect_qpair directly. + */ +void +nvme_ctrlr_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + assert(ctrlr != NULL); + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +int +spdk_nvme_ctrlr_free_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr; + + if (qpair == NULL) { + return 0; + } + + ctrlr = qpair->ctrlr; + + if (qpair->in_completion_context) { + /* + * There are many cases where it is convenient to delete an io qpair in the context + * of that qpair's completion routine. To handle this properly, set a flag here + * so that the completion routine will perform an actual delete after the context + * unwinds. + */ + qpair->delete_after_completion_context = 1; + return 0; + } + + if (qpair->poll_group && qpair->poll_group->in_completion_context) { + /* Same as above, but in a poll group. 
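The counter tells the poll group how many qpairs still have a delete pending once its completion processing unwinds.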
*/ + qpair->poll_group->num_qpairs_to_delete++; + qpair->delete_after_completion_context = 1; + return 0; + } + + if (qpair->poll_group) { + spdk_nvme_poll_group_remove(qpair->poll_group->group, qpair); + } + + /* Do not retry. */ + nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); + + /* In the multi-process case, a process may call this function on a foreign + * I/O qpair (i.e. one that this process did not create) when that qpairs process + * exits unexpectedly. In that case, we must not try to abort any reqs associated + * with that qpair, since the callbacks will also be foreign to this process. + */ + if (qpair->active_proc == nvme_ctrlr_get_current_process(ctrlr)) { + nvme_qpair_abort_reqs(qpair, 1); + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_proc_remove_io_qpair(qpair); + + TAILQ_REMOVE(&ctrlr->active_io_qpairs, qpair, tailq); + spdk_bit_array_set(ctrlr->free_io_qids, qpair->id); + + if (nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair)) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -1; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return 0; +} + +static void +nvme_ctrlr_construct_intel_support_log_page_list(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_intel_log_page_directory *log_page_directory) +{ + if (log_page_directory == NULL) { + return; + } + + if (ctrlr->cdata.vid != SPDK_PCI_VID_INTEL) { + return; + } + + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY] = true; + + if (log_page_directory->read_latency_log_len || + (ctrlr->quirks & NVME_INTEL_QUIRK_READ_LATENCY)) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY] = true; + } + if (log_page_directory->write_latency_log_len || + (ctrlr->quirks & NVME_INTEL_QUIRK_WRITE_LATENCY)) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY] = true; + } + if (log_page_directory->temperature_statistics_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_TEMPERATURE] = true; + } + if (log_page_directory->smart_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_SMART] = true; + } + if (log_page_directory->marketing_description_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_MARKETING_DESCRIPTION] = true; + } +} + +static int nvme_ctrlr_set_intel_support_log_pages(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + struct nvme_completion_poll_status *status; + struct spdk_nvme_intel_log_page_directory *log_page_directory; + + log_page_directory = spdk_zmalloc(sizeof(struct spdk_nvme_intel_log_page_directory), + 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (log_page_directory == NULL) { + SPDK_ERRLOG("could not allocate log_page_directory\n"); + return -ENXIO; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + spdk_free(log_page_directory); + return -ENOMEM; + } + + rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY, + SPDK_NVME_GLOBAL_NS_TAG, log_page_directory, + sizeof(struct spdk_nvme_intel_log_page_directory), + 0, nvme_completion_poll_cb, status); + if (rc != 0) { + spdk_free(log_page_directory); + free(status); + return rc; + } + + if (nvme_wait_for_completion_timeout(ctrlr->adminq, status, + ctrlr->opts.admin_timeout_ms / 1000)) { + spdk_free(log_page_directory); + SPDK_WARNLOG("Intel log pages not supported on Intel drive!\n"); + if (!status->timed_out) { + free(status); + } + return 0; + } + + nvme_ctrlr_construct_intel_support_log_page_list(ctrlr, log_page_directory); + spdk_free(log_page_directory); + 
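/* The log page command completed in time, so the status tracker can be freed here. */ +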
free(status); + return 0; +} + +static int +nvme_ctrlr_set_supported_log_pages(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + + memset(ctrlr->log_page_supported, 0, sizeof(ctrlr->log_page_supported)); + /* Mandatory pages */ + ctrlr->log_page_supported[SPDK_NVME_LOG_ERROR] = true; + ctrlr->log_page_supported[SPDK_NVME_LOG_HEALTH_INFORMATION] = true; + ctrlr->log_page_supported[SPDK_NVME_LOG_FIRMWARE_SLOT] = true; + if (ctrlr->cdata.lpa.celp) { + ctrlr->log_page_supported[SPDK_NVME_LOG_COMMAND_EFFECTS_LOG] = true; + } + if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL && !(ctrlr->quirks & NVME_INTEL_QUIRK_NO_LOG_PAGES)) { + rc = nvme_ctrlr_set_intel_support_log_pages(ctrlr); + } + + return rc; +} + +static void +nvme_ctrlr_set_intel_supported_features(struct spdk_nvme_ctrlr *ctrlr) +{ + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_MAX_LBA] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_NATIVE_MAX_LBA] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_POWER_GOVERNOR_SETTING] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_SMBUS_ADDRESS] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LED_PATTERN] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_RESET_TIMED_WORKLOAD_COUNTERS] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING] = true; +} + +static void +nvme_ctrlr_set_arbitration_feature(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t cdw11; + struct nvme_completion_poll_status *status; + + if (ctrlr->opts.arbitration_burst == 0) { + return; + } + + if (ctrlr->opts.arbitration_burst > 7) { + SPDK_WARNLOG("Valid arbitration burst values are from 0-7\n"); + return; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return; + } + + cdw11 = ctrlr->opts.arbitration_burst; + + if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_WRR_SUPPORTED) { + cdw11 |= (uint32_t)ctrlr->opts.low_priority_weight << 8; + cdw11 |= (uint32_t)ctrlr->opts.medium_priority_weight << 16; + cdw11 |= (uint32_t)ctrlr->opts.high_priority_weight << 24; + } + + if (spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_ARBITRATION, + cdw11, 0, NULL, 0, + nvme_completion_poll_cb, status) < 0) { + SPDK_ERRLOG("Set arbitration feature failed\n"); + free(status); + return; + } + + if (nvme_wait_for_completion_timeout(ctrlr->adminq, status, + ctrlr->opts.admin_timeout_ms / 1000)) { + SPDK_ERRLOG("Timed out setting arbitration feature\n"); + } + + if (!status->timed_out) { + free(status); + } +} + +static void +nvme_ctrlr_set_supported_features(struct spdk_nvme_ctrlr *ctrlr) +{ + memset(ctrlr->feature_supported, 0, sizeof(ctrlr->feature_supported)); + /* Mandatory features */ + ctrlr->feature_supported[SPDK_NVME_FEAT_ARBITRATION] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_POWER_MANAGEMENT] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_ERROR_RECOVERY] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_NUMBER_OF_QUEUES] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_COALESCING] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_WRITE_ATOMICITY] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION] = true; + /* Optional features */ + if (ctrlr->cdata.vwc.present) { + ctrlr->feature_supported[SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE] = true; + } + if (ctrlr->cdata.apsta.supported) { +
ctrlr->feature_supported[SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION] = true; + } + if (ctrlr->cdata.hmpre) { + ctrlr->feature_supported[SPDK_NVME_FEAT_HOST_MEM_BUFFER] = true; + } + if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL) { + nvme_ctrlr_set_intel_supported_features(ctrlr); + } + + nvme_ctrlr_set_arbitration_feature(ctrlr); +} + +bool +spdk_nvme_ctrlr_is_failed(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->is_failed; +} + +void +nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove) +{ + /* + * Set the flag here and leave the work failure of qpairs to + * spdk_nvme_qpair_process_completions(). + */ + if (hot_remove) { + ctrlr->is_removed = true; + } + ctrlr->is_failed = true; + nvme_transport_ctrlr_disconnect_qpair(ctrlr, ctrlr->adminq); + SPDK_ERRLOG("ctrlr %s in failed state.\n", ctrlr->trid.traddr); +} + +/** + * This public API function will try to take the controller lock. + * Any private functions being called from a thread already holding + * the ctrlr lock should call nvme_ctrlr_fail directly. + */ +void +spdk_nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_ctrlr_fail(ctrlr, false); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +static void +nvme_ctrlr_shutdown(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + uint32_t ms_waited = 0; + uint32_t shutdown_timeout_ms; + + if (ctrlr->is_removed) { + return; + } + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("ctrlr %s get_cc() failed\n", ctrlr->trid.traddr); + return; + } + + cc.bits.shn = SPDK_NVME_SHN_NORMAL; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("ctrlr %s set_cc() failed\n", ctrlr->trid.traddr); + return; + } + + /* + * The NVMe specification defines RTD3E to be the time between + * setting SHN = 1 until the controller will set SHST = 10b. + * If the device doesn't report RTD3 entry latency, or if it + * reports RTD3 entry latency less than 10 seconds, pick + * 10 seconds as a reasonable amount of time to + * wait before proceeding. 
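+ * RTD3E is reported in microseconds; below it is rounded up to milliseconds and clamped to a minimum of 10 seconds.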
+ */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RTD3E = %" PRIu32 " us\n", ctrlr->cdata.rtd3e); + shutdown_timeout_ms = (ctrlr->cdata.rtd3e + 999) / 1000; + shutdown_timeout_ms = spdk_max(shutdown_timeout_ms, 10000); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "shutdown timeout = %" PRIu32 " ms\n", shutdown_timeout_ms); + + do { + if (nvme_ctrlr_get_csts(ctrlr, &csts)) { + SPDK_ERRLOG("ctrlr %s get_csts() failed\n", ctrlr->trid.traddr); + return; + } + + if (csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ctrlr %s shutdown complete in %u milliseconds\n", + ctrlr->trid.traddr, ms_waited); + return; + } + + nvme_delay(1000); + ms_waited++; + } while (ms_waited < shutdown_timeout_ms); + + SPDK_ERRLOG("ctrlr %s did not shutdown within %u milliseconds\n", + ctrlr->trid.traddr, shutdown_timeout_ms); + if (ctrlr->quirks & NVME_QUIRK_SHST_COMPLETE) { + SPDK_ERRLOG("likely due to shutdown handling in the VMWare emulated NVMe SSD\n"); + } +} + +static int +nvme_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + int rc; + + rc = nvme_transport_ctrlr_enable(ctrlr); + if (rc != 0) { + SPDK_ERRLOG("transport ctrlr_enable failed\n"); + return rc; + } + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc() failed\n"); + return -EIO; + } + + if (cc.bits.en != 0) { + SPDK_ERRLOG("called with CC.EN = 1\n"); + return -EINVAL; + } + + cc.bits.en = 1; + cc.bits.css = 0; + cc.bits.shn = 0; + cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */ + cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */ + + /* Page size is 2 ^ (12 + mps). */ + cc.bits.mps = spdk_u32log2(ctrlr->page_size) - 12; + + if (ctrlr->cap.bits.css == 0) { + SPDK_INFOLOG(SPDK_LOG_NVME, + "Drive reports no command sets supported. Assuming NVM is supported.\n"); + ctrlr->cap.bits.css = SPDK_NVME_CAP_CSS_NVM; + } + + if (!(ctrlr->cap.bits.css & (1u << ctrlr->opts.command_set))) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested I/O command set %u but supported mask is 0x%x\n", + ctrlr->opts.command_set, ctrlr->cap.bits.css); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Falling back to NVM. 
Assuming NVM is supported.\n"); + ctrlr->opts.command_set = SPDK_NVME_CC_CSS_NVM; + } + + cc.bits.css = ctrlr->opts.command_set; + + switch (ctrlr->opts.arb_mechanism) { + case SPDK_NVME_CC_AMS_RR: + break; + case SPDK_NVME_CC_AMS_WRR: + if (SPDK_NVME_CAP_AMS_WRR & ctrlr->cap.bits.ams) { + break; + } + return -EINVAL; + case SPDK_NVME_CC_AMS_VS: + if (SPDK_NVME_CAP_AMS_VS & ctrlr->cap.bits.ams) { + break; + } + return -EINVAL; + default: + return -EINVAL; + } + + cc.bits.ams = ctrlr->opts.arb_mechanism; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + + return 0; +} + +static int +nvme_ctrlr_disable(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc() failed\n"); + return -EIO; + } + + if (cc.bits.en == 0) { + return 0; + } + + cc.bits.en = 0; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + + return 0; +} + +#ifdef DEBUG +static const char * +nvme_ctrlr_state_string(enum nvme_ctrlr_state state) +{ + switch (state) { + case NVME_CTRLR_STATE_INIT_DELAY: + return "delay init"; + case NVME_CTRLR_STATE_INIT: + return "init"; + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1: + return "disable and wait for CSTS.RDY = 1"; + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0: + return "disable and wait for CSTS.RDY = 0"; + case NVME_CTRLR_STATE_ENABLE: + return "enable controller by writing CC.EN = 1"; + case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1: + return "wait for CSTS.RDY = 1"; + case NVME_CTRLR_STATE_RESET_ADMIN_QUEUE: + return "reset admin queue"; + case NVME_CTRLR_STATE_IDENTIFY: + return "identify controller"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY: + return "wait for identify controller"; + case NVME_CTRLR_STATE_SET_NUM_QUEUES: + return "set number of queues"; + case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES: + return "wait for set number of queues"; + case NVME_CTRLR_STATE_CONSTRUCT_NS: + return "construct namespaces"; + case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS: + return "identify active ns"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS: + return "wait for identify active ns"; + case NVME_CTRLR_STATE_IDENTIFY_NS: + return "identify ns"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS: + return "wait for identify ns"; + case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS: + return "identify namespace id descriptors"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS: + return "wait for identify namespace id descriptors"; + case NVME_CTRLR_STATE_CONFIGURE_AER: + return "configure AER"; + case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER: + return "wait for configure aer"; + case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES: + return "set supported log pages"; + case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES: + return "set supported features"; + case NVME_CTRLR_STATE_SET_DB_BUF_CFG: + return "set doorbell buffer config"; + case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG: + return "wait for doorbell buffer config"; + case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT: + return "set keep alive timeout"; + case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT: + return "wait for set keep alive timeout"; + case NVME_CTRLR_STATE_SET_HOST_ID: + return "set host ID"; + case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID: + return "wait for set host ID"; + case NVME_CTRLR_STATE_READY: + return "ready"; + case NVME_CTRLR_STATE_ERROR: + return "error"; + } + return "unknown"; +}; +#endif /* DEBUG */ + +static void +nvme_ctrlr_set_state(struct spdk_nvme_ctrlr *ctrlr, enum 
nvme_ctrlr_state state, + uint64_t timeout_in_ms) +{ + uint64_t ticks_per_ms, timeout_in_ticks, now_ticks; + + ctrlr->state = state; + if (timeout_in_ms == NVME_TIMEOUT_INFINITE) { + goto inf; + } + + ticks_per_ms = spdk_get_ticks_hz() / 1000; + if (timeout_in_ms > UINT64_MAX / ticks_per_ms) { + SPDK_ERRLOG("Specified timeout would cause integer overflow. Defaulting to no timeout.\n"); + goto inf; + } + + now_ticks = spdk_get_ticks(); + timeout_in_ticks = timeout_in_ms * ticks_per_ms; + if (timeout_in_ticks > UINT64_MAX - now_ticks) { + SPDK_ERRLOG("Specified timeout would cause integer overflow. Defaulting to no timeout.\n"); + goto inf; + } + + ctrlr->state_timeout_tsc = timeout_in_ticks + now_ticks; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (timeout %" PRIu64 " ms)\n", + nvme_ctrlr_state_string(ctrlr->state), timeout_in_ms); + return; +inf: + SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (no timeout)\n", + nvme_ctrlr_state_string(ctrlr->state)); + ctrlr->state_timeout_tsc = NVME_TIMEOUT_INFINITE; +} + +static void +nvme_ctrlr_free_doorbell_buffer(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->shadow_doorbell) { + spdk_free(ctrlr->shadow_doorbell); + ctrlr->shadow_doorbell = NULL; + } + + if (ctrlr->eventidx) { + spdk_free(ctrlr->eventidx); + ctrlr->eventidx = NULL; + } +} + +static void +nvme_ctrlr_set_doorbell_buffer_config_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_WARNLOG("Doorbell buffer config failed\n"); + } else { + SPDK_INFOLOG(SPDK_LOG_NVME, "NVMe controller: %s doorbell buffer config enabled\n", + ctrlr->trid.traddr); + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_set_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + uint64_t prp1, prp2, len; + + if (!ctrlr->cdata.oacs.doorbell_buffer_config) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + /* only 1 page size for doorbell buffer */ + ctrlr->shadow_doorbell = spdk_zmalloc(ctrlr->page_size, ctrlr->page_size, + NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE); + if (ctrlr->shadow_doorbell == NULL) { + rc = -ENOMEM; + goto error; + } + + len = ctrlr->page_size; + prp1 = spdk_vtophys(ctrlr->shadow_doorbell, &len); + if (prp1 == SPDK_VTOPHYS_ERROR || len != ctrlr->page_size) { + rc = -EFAULT; + goto error; + } + + ctrlr->eventidx = spdk_zmalloc(ctrlr->page_size, ctrlr->page_size, + NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE); + if (ctrlr->eventidx == NULL) { + rc = -ENOMEM; + goto error; + } + + len = ctrlr->page_size; + prp2 = spdk_vtophys(ctrlr->eventidx, &len); + if (prp2 == SPDK_VTOPHYS_ERROR || len != ctrlr->page_size) { + rc = -EFAULT; + goto error; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_doorbell_buffer_config(ctrlr, prp1, prp2, + nvme_ctrlr_set_doorbell_buffer_config_done, ctrlr); + if (rc != 0) { + goto error; + } + + return 0; + +error: + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + nvme_ctrlr_free_doorbell_buffer(ctrlr); + return rc; +} + +static void 
+nvme_ctrlr_abort_queued_aborts(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_request *req, *tmp; + struct spdk_nvme_cpl cpl = {}; + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + STAILQ_FOREACH_SAFE(req, &ctrlr->queued_aborts, stailq, tmp) { + STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); + + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, &cpl); + nvme_free_request(req); + } +} + +int +spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + struct spdk_nvme_qpair *qpair; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (ctrlr->is_resetting || ctrlr->is_removed) { + /* + * Controller is already resetting or has been removed. Return + * immediately since there is no need to kick off another + * reset in these cases. + */ + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return ctrlr->is_resetting ? 0 : -ENXIO; + } + + ctrlr->is_resetting = true; + ctrlr->is_failed = false; + + SPDK_NOTICELOG("resetting controller\n"); + + /* Abort all of the queued abort requests */ + nvme_ctrlr_abort_queued_aborts(ctrlr); + + nvme_transport_admin_qpair_abort_aers(ctrlr->adminq); + + /* Disable all queues before disabling the controller hardware. */ + TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + } + + ctrlr->adminq->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + nvme_transport_ctrlr_disconnect_qpair(ctrlr, ctrlr->adminq); + if (nvme_transport_ctrlr_connect_qpair(ctrlr, ctrlr->adminq) != 0) { + SPDK_ERRLOG("Controller reinitialization failed.\n"); + rc = -1; + goto out; + } + + /* Doorbell buffer config is invalid during reset */ + nvme_ctrlr_free_doorbell_buffer(ctrlr); + + /* Set the state back to INIT to cause a full hardware reset. */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE); + + nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_ENABLED); + while (ctrlr->state != NVME_CTRLR_STATE_READY) { + if (nvme_ctrlr_process_init(ctrlr) != 0) { + SPDK_ERRLOG("controller reinitialization failed\n"); + rc = -1; + break; + } + } + + /* + * For PCIe controllers, the memory locations of the transport qpairs + * don't change when the controller is reset. They simply need to be + * re-enabled with admin commands to the controller. For fabric + * controllers we need to disconnect and reconnect the qpair on its + * own thread outside of the context of the reset. + */ + if (rc == 0 && ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + /* Reinitialize qpairs */ + TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { + if (nvme_transport_ctrlr_connect_qpair(ctrlr, qpair) != 0) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + rc = -1; + continue; + } + } + } + +out: + if (rc) { + nvme_ctrlr_fail(ctrlr, false); + } + ctrlr->is_resetting = false; + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (!ctrlr->cdata.oaes.ns_attribute_notices) { + /* + * If the controller doesn't support ns_attribute_notices and + * namespace attributes change (e.g. the number of namespaces), + * we need to update the rest of the system while handling the device reset.
+ */ + nvme_io_msg_ctrlr_update(ctrlr); + } + + return rc; +} + +int +spdk_nvme_ctrlr_set_trid(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_transport_id *trid) +{ + int rc = 0; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (ctrlr->is_failed == false) { + rc = -EPERM; + goto out; + } + + if (trid->trtype != ctrlr->trid.trtype) { + rc = -EINVAL; + goto out; + } + + if (strncmp(trid->subnqn, ctrlr->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { + rc = -EINVAL; + goto out; + } + + ctrlr->trid = *trid; + +out: + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +static void +nvme_ctrlr_identify_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("nvme_identify_controller failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + /* + * Use MDTS to ensure our default max_xfer_size doesn't exceed what the + * controller supports. + */ + ctrlr->max_xfer_size = nvme_transport_ctrlr_get_max_xfer_size(ctrlr); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_xfer_size %u\n", ctrlr->max_xfer_size); + if (ctrlr->cdata.mdts > 0) { + ctrlr->max_xfer_size = spdk_min(ctrlr->max_xfer_size, + ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts))); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "MDTS max_xfer_size %u\n", ctrlr->max_xfer_size); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cdata.cntlid); + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + ctrlr->cntlid = ctrlr->cdata.cntlid; + } else { + /* + * Fabrics controllers should already have CNTLID from the Connect command. + * + * If CNTLID from Connect doesn't match CNTLID in the Identify Controller data, + * trust the one from Connect. + */ + if (ctrlr->cntlid != ctrlr->cdata.cntlid) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, + "Identify CNTLID 0x%04" PRIx16 " != Connect CNTLID 0x%04" PRIx16 "\n", + ctrlr->cdata.cntlid, ctrlr->cntlid); + } + } + + if (ctrlr->cdata.sgls.supported) { + assert(ctrlr->cdata.sgls.supported != 0x3); + ctrlr->flags |= SPDK_NVME_CTRLR_SGL_SUPPORTED; + if (ctrlr->cdata.sgls.supported == 0x2) { + ctrlr->flags |= SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT; + } + /* + * Use MSDBD to ensure our max_sges doesn't exceed what the + * controller supports. + */ + ctrlr->max_sges = nvme_transport_ctrlr_get_max_sges(ctrlr); + if (ctrlr->cdata.nvmf_specific.msdbd != 0) { + ctrlr->max_sges = spdk_min(ctrlr->cdata.nvmf_specific.msdbd, ctrlr->max_sges); + } else { + /* A value 0 indicates no limit. 
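The transport-reported max_sges is used as-is in that case.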
*/ + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_sges %u\n", ctrlr->max_sges); + } + + if (ctrlr->cdata.oacs.security && !(ctrlr->quirks & NVME_QUIRK_OACS_SECURITY)) { + ctrlr->flags |= SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "fuses compare and write: %d\n", ctrlr->cdata.fuses.compare_and_write); + if (ctrlr->cdata.fuses.compare_and_write) { + ctrlr->flags |= SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_NUM_QUEUES, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_identify(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0, + &ctrlr->cdata, sizeof(ctrlr->cdata), + nvme_ctrlr_identify_done, ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +enum nvme_active_ns_state { + NVME_ACTIVE_NS_STATE_IDLE, + NVME_ACTIVE_NS_STATE_PROCESSING, + NVME_ACTIVE_NS_STATE_DONE, + NVME_ACTIVE_NS_STATE_ERROR +}; + +typedef void (*nvme_active_ns_ctx_deleter)(struct nvme_active_ns_ctx *); + +struct nvme_active_ns_ctx { + struct spdk_nvme_ctrlr *ctrlr; + uint32_t page; + uint32_t num_pages; + uint32_t next_nsid; + uint32_t *new_ns_list; + nvme_active_ns_ctx_deleter deleter; + + enum nvme_active_ns_state state; +}; + +static struct nvme_active_ns_ctx * +nvme_active_ns_ctx_create(struct spdk_nvme_ctrlr *ctrlr, nvme_active_ns_ctx_deleter deleter) +{ + struct nvme_active_ns_ctx *ctx; + uint32_t num_pages = 0; + uint32_t *new_ns_list = NULL; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + SPDK_ERRLOG("Failed to allocate nvme_active_ns_ctx!\n"); + return NULL; + } + + if (ctrlr->num_ns) { + /* The allocated size must be a multiple of sizeof(struct spdk_nvme_ns_list) */ + num_pages = (ctrlr->num_ns * sizeof(new_ns_list[0]) - 1) / sizeof(struct spdk_nvme_ns_list) + 1; + new_ns_list = spdk_zmalloc(num_pages * sizeof(struct spdk_nvme_ns_list), ctrlr->page_size, + NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE); + if (!new_ns_list) { + SPDK_ERRLOG("Failed to allocate active_ns_list!\n"); + free(ctx); + return NULL; + } + } + + ctx->num_pages = num_pages; + ctx->new_ns_list = new_ns_list; + ctx->ctrlr = ctrlr; + ctx->deleter = deleter; + + return ctx; +} + +static void +nvme_active_ns_ctx_destroy(struct nvme_active_ns_ctx *ctx) +{ + spdk_free(ctx->new_ns_list); + free(ctx); +} + +static void +nvme_ctrlr_identify_active_ns_swap(struct spdk_nvme_ctrlr *ctrlr, uint32_t **new_ns_list) +{ + spdk_free(ctrlr->active_ns_list); + ctrlr->active_ns_list = *new_ns_list; + *new_ns_list = NULL; +} + +static void +nvme_ctrlr_identify_active_ns_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_active_ns_ctx *ctx = arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + ctx->state = NVME_ACTIVE_NS_STATE_ERROR; + goto out; + } + + ctx->next_nsid = ctx->new_ns_list[1024 * ctx->page + 1023]; + if (ctx->next_nsid == 0 || ++ctx->page == ctx->num_pages) { + ctx->state = NVME_ACTIVE_NS_STATE_DONE; + goto out; + } + + nvme_ctrlr_identify_active_ns_async(ctx); + return; + +out: + if (ctx->deleter) { + ctx->deleter(ctx); + } +} + +static void +nvme_ctrlr_identify_active_ns_async(struct nvme_active_ns_ctx *ctx) +{ + struct spdk_nvme_ctrlr *ctrlr = ctx->ctrlr; + uint32_t i; + int rc; + + if (ctrlr->num_ns == 0) { + ctx->state = NVME_ACTIVE_NS_STATE_DONE; + 
goto out; + } + + /* + * If controller doesn't support active ns list CNS 0x02 dummy up + * an active ns list, i.e. all namespaces report as active + */ + if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 1, 0) || ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS) { + for (i = 0; i < ctrlr->num_ns; i++) { + ctx->new_ns_list[i] = i + 1; + } + + ctx->state = NVME_ACTIVE_NS_STATE_DONE; + goto out; + } + + ctx->state = NVME_ACTIVE_NS_STATE_PROCESSING; + rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST, 0, ctx->next_nsid, + &ctx->new_ns_list[1024 * ctx->page], sizeof(struct spdk_nvme_ns_list), + nvme_ctrlr_identify_active_ns_async_done, ctx); + if (rc != 0) { + ctx->state = NVME_ACTIVE_NS_STATE_ERROR; + goto out; + } + + return; + +out: + if (ctx->deleter) { + ctx->deleter(ctx); + } +} + +static void +_nvme_active_ns_ctx_deleter(struct nvme_active_ns_ctx *ctx) +{ + struct spdk_nvme_ctrlr *ctrlr = ctx->ctrlr; + + if (ctx->state == NVME_ACTIVE_NS_STATE_ERROR) { + nvme_ctrlr_destruct_namespaces(ctrlr); + nvme_active_ns_ctx_destroy(ctx); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + assert(ctx->state == NVME_ACTIVE_NS_STATE_DONE); + nvme_ctrlr_identify_active_ns_swap(ctrlr, &ctx->new_ns_list); + nvme_active_ns_ctx_destroy(ctx); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_NS, ctrlr->opts.admin_timeout_ms); +} + +static void +_nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_active_ns_ctx *ctx; + + ctx = nvme_active_ns_ctx_create(ctrlr, _nvme_active_ns_ctx_deleter); + if (!ctx) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS, + ctrlr->opts.admin_timeout_ms); + nvme_ctrlr_identify_active_ns_async(ctx); +} + +int +nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_active_ns_ctx *ctx; + int rc; + + ctx = nvme_active_ns_ctx_create(ctrlr, NULL); + if (!ctx) { + return -ENOMEM; + } + + nvme_ctrlr_identify_active_ns_async(ctx); + while (ctx->state == NVME_ACTIVE_NS_STATE_PROCESSING) { + rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + if (rc < 0) { + ctx->state = NVME_ACTIVE_NS_STATE_ERROR; + break; + } + } + + if (ctx->state == NVME_ACTIVE_NS_STATE_ERROR) { + nvme_active_ns_ctx_destroy(ctx); + return -ENXIO; + } + + assert(ctx->state == NVME_ACTIVE_NS_STATE_DONE); + nvme_ctrlr_identify_active_ns_swap(ctrlr, &ctx->new_ns_list); + nvme_active_ns_ctx_destroy(ctx); + + return 0; +} + +static void +nvme_ctrlr_identify_ns_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg; + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + uint32_t nsid; + int rc; + + if (spdk_nvme_cpl_is_error(cpl)) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } else { + nvme_ns_set_identify_data(ns); + } + + /* move on to the next active NS */ + nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ID_DESCS, + ctrlr->opts.admin_timeout_ms); + return; + } + ns->ctrlr = ctrlr; + ns->id = nsid; + + rc = nvme_ctrlr_identify_ns_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } +} + +static int +nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + struct 
spdk_nvme_ns_data *nsdata; + + nsdata = &ctrlr->nsdata[ns->id - 1]; + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS, + ctrlr->opts.admin_timeout_ms); + return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id, + nsdata, sizeof(*nsdata), + nvme_ctrlr_identify_ns_async_done, ns); +} + +static int +nvme_ctrlr_identify_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t nsid; + struct spdk_nvme_ns *ns; + int rc; + + nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + /* No active NS, move on to the next state */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + ns->ctrlr = ctrlr; + ns->id = nsid; + + rc = nvme_ctrlr_identify_ns_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } + + return rc; +} + +static void +nvme_ctrlr_identify_id_desc_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg; + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + uint32_t nsid; + int rc; + + if (spdk_nvme_cpl_is_error(cpl)) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return; + } + + /* move on to the next active NS */ + nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return; + } + + rc = nvme_ctrlr_identify_id_desc_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } +} + +static int +nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS, + ctrlr->opts.admin_timeout_ms); + return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST, + 0, ns->id, ns->id_desc_list, sizeof(ns->id_desc_list), + nvme_ctrlr_identify_id_desc_async_done, ns); +} + +static int +nvme_ctrlr_identify_id_desc_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t nsid; + struct spdk_nvme_ns *ns; + int rc; + + if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) || + (ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + /* No active NS, move on to the next state */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + rc = nvme_ctrlr_identify_id_desc_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } + + return rc; +} + +static void +nvme_ctrlr_update_nvmf_ioccsz(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA || + ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP || + ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_FC) { + if (ctrlr->cdata.nvmf_specific.ioccsz < 4) { + SPDK_ERRLOG("Incorrect IOCCSZ %u, the minimum value should be 4\n", + ctrlr->cdata.nvmf_specific.ioccsz); + ctrlr->cdata.nvmf_specific.ioccsz = 4; + assert(0); + } + 
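+		/*
+		 * IOCCSZ is reported in 16-byte units, so the spec minimum of 4 equals a
+		 * single 64-byte SQE with no room for in-capsule data. Subtracting
+		 * sizeof(struct spdk_nvme_cmd) below yields the usable in-capsule data size.
+		 */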
ctrlr->ioccsz_bytes = ctrlr->cdata.nvmf_specific.ioccsz * 16 - sizeof(struct spdk_nvme_cmd); + ctrlr->icdoff = ctrlr->cdata.nvmf_specific.icdoff; + } +} + +static void +nvme_ctrlr_set_num_queues_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + uint32_t cq_allocated, sq_allocated, min_allocated, i; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("Set Features - Number of Queues failed!\n"); + ctrlr->opts.num_io_queues = 0; + } else { + /* + * Data in cdw0 is 0-based. + * Lower 16-bits indicate number of submission queues allocated. + * Upper 16-bits indicate number of completion queues allocated. + */ + sq_allocated = (cpl->cdw0 & 0xFFFF) + 1; + cq_allocated = (cpl->cdw0 >> 16) + 1; + + /* + * For 1:1 queue mapping, set number of allocated queues to be minimum of + * submission and completion queues. + */ + min_allocated = spdk_min(sq_allocated, cq_allocated); + + /* Set number of queues to be minimum of requested and actually allocated. */ + ctrlr->opts.num_io_queues = spdk_min(min_allocated, ctrlr->opts.num_io_queues); + } + + ctrlr->free_io_qids = spdk_bit_array_create(ctrlr->opts.num_io_queues + 1); + if (ctrlr->free_io_qids == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + /* Initialize list of free I/O queue IDs. QID 0 is the admin queue. */ + spdk_bit_array_clear(ctrlr->free_io_qids, 0); + for (i = 1; i <= ctrlr->opts.num_io_queues; i++) { + spdk_bit_array_set(ctrlr->free_io_qids, i); + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONSTRUCT_NS, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_set_num_queues(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->opts.num_io_queues > SPDK_NVME_MAX_IO_QUEUES) { + SPDK_NOTICELOG("Limiting requested num_io_queues %u to max %d\n", + ctrlr->opts.num_io_queues, SPDK_NVME_MAX_IO_QUEUES); + ctrlr->opts.num_io_queues = SPDK_NVME_MAX_IO_QUEUES; + } else if (ctrlr->opts.num_io_queues < 1) { + SPDK_NOTICELOG("Requested num_io_queues 0, increasing to 1\n"); + ctrlr->opts.num_io_queues = 1; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->opts.num_io_queues, + nvme_ctrlr_set_num_queues_done, ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_set_keep_alive_timeout_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + uint32_t keep_alive_interval_ms; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + if ((cpl->status.sct == SPDK_NVME_SCT_GENERIC) && + (cpl->status.sc == SPDK_NVME_SC_INVALID_FIELD)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Keep alive timeout Get Feature is not supported\n"); + } else { + SPDK_ERRLOG("Keep alive timeout Get Feature failed: SC %x SCT %x\n", + cpl->status.sc, cpl->status.sct); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + } else { + if (ctrlr->opts.keep_alive_timeout_ms != cpl->cdw0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller adjusted keep alive timeout to %u ms\n", + cpl->cdw0); + } + + ctrlr->opts.keep_alive_timeout_ms = cpl->cdw0; + } + + keep_alive_interval_ms = ctrlr->opts.keep_alive_timeout_ms / 2; + if (keep_alive_interval_ms == 0) { + keep_alive_interval_ms = 1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, 
"Sending keep alive every %u ms\n", keep_alive_interval_ms); + + ctrlr->keep_alive_interval_ticks = (keep_alive_interval_ms * spdk_get_ticks_hz()) / UINT64_C(1000); + + /* Schedule the first Keep Alive to be sent as soon as possible. */ + ctrlr->next_keep_alive_tick = spdk_get_ticks(); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_set_keep_alive_timeout(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->opts.keep_alive_timeout_ms == 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + if (ctrlr->cdata.kas == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller KAS is 0 - not enabling Keep Alive\n"); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); + + /* Retrieve actual keep alive timeout, since the controller may have adjusted it. */ + rc = spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, 0, NULL, 0, + nvme_ctrlr_set_keep_alive_timeout_done, ctrlr); + if (rc != 0) { + SPDK_ERRLOG("Keep alive timeout Get Feature failed: %d\n", rc); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_set_host_id_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + /* + * Treat Set Features - Host ID failure as non-fatal, since the Host ID feature + * is optional. + */ + SPDK_WARNLOG("Set Features - Host ID failed: SC 0x%x SCT 0x%x\n", + cpl->status.sc, cpl->status.sct); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Set Features - Host ID was successful\n"); + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); +} + +static int +nvme_ctrlr_set_host_id(struct spdk_nvme_ctrlr *ctrlr) +{ + uint8_t *host_id; + uint32_t host_id_size; + int rc; + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + /* + * NVMe-oF sends the host ID during Connect and doesn't allow + * Set Features - Host Identifier after Connect, so we don't need to do anything here. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "NVMe-oF transport - not sending Set Features - Host ID\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); + return 0; + } + + if (ctrlr->cdata.ctratt.host_id_exhid_supported) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 128-bit extended host identifier\n"); + host_id = ctrlr->opts.extended_host_id; + host_id_size = sizeof(ctrlr->opts.extended_host_id); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 64-bit host identifier\n"); + host_id = ctrlr->opts.host_id; + host_id_size = sizeof(ctrlr->opts.host_id); + } + + /* If the user specified an all-zeroes host identifier, don't send the command. 
*/ + if (spdk_mem_all_zero(host_id, host_id_size)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, + "User did not specify host ID - not sending Set Features - Host ID\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); + return 0; + } + + SPDK_LOGDUMP(SPDK_LOG_NVME, "host_id", host_id, host_id_size); + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_HOST_ID, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_set_host_id(ctrlr, host_id, host_id_size, nvme_ctrlr_set_host_id_done, ctrlr); + if (rc != 0) { + SPDK_ERRLOG("Set Features - Host ID failed: %d\n", rc); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_destruct_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->ns) { + uint32_t i, num_ns = ctrlr->num_ns; + + for (i = 0; i < num_ns; i++) { + nvme_ns_destruct(&ctrlr->ns[i]); + } + + spdk_free(ctrlr->ns); + ctrlr->ns = NULL; + ctrlr->num_ns = 0; + } + + if (ctrlr->nsdata) { + spdk_free(ctrlr->nsdata); + ctrlr->nsdata = NULL; + } + + spdk_free(ctrlr->active_ns_list); + ctrlr->active_ns_list = NULL; +} + +static void +nvme_ctrlr_update_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t i, nn = ctrlr->cdata.nn; + struct spdk_nvme_ns_data *nsdata; + bool ns_is_active; + + for (i = 0; i < nn; i++) { + struct spdk_nvme_ns *ns = &ctrlr->ns[i]; + uint32_t nsid = i + 1; + + nsdata = &ctrlr->nsdata[nsid - 1]; + ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid); + + if (nsdata->ncap && ns_is_active) { + if (nvme_ns_update(ns) != 0) { + SPDK_ERRLOG("Failed to update active NS %u\n", nsid); + continue; + } + } + + if ((nsdata->ncap == 0) && ns_is_active) { + if (nvme_ns_construct(ns, nsid, ctrlr) != 0) { + continue; + } + } + + if (nsdata->ncap && !ns_is_active) { + nvme_ns_destruct(ns); + } + } +} + +static int +nvme_ctrlr_construct_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + uint32_t nn = ctrlr->cdata.nn; + + /* ctrlr->num_ns may be 0 (startup) or a different number of namespaces (reset), + * so check if we need to reallocate. + */ + if (nn != ctrlr->num_ns) { + nvme_ctrlr_destruct_namespaces(ctrlr); + + if (nn == 0) { + SPDK_WARNLOG("controller has 0 namespaces\n"); + return 0; + } + + ctrlr->ns = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (ctrlr->ns == NULL) { + rc = -ENOMEM; + goto fail; + } + + ctrlr->nsdata = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns_data), 64, + NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_SHARE | SPDK_MALLOC_DMA); + if (ctrlr->nsdata == NULL) { + rc = -ENOMEM; + goto fail; + } + + ctrlr->num_ns = nn; + } + + return 0; + +fail: + nvme_ctrlr_destruct_namespaces(ctrlr); + return rc; +} + +static void +nvme_ctrlr_async_event_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_async_event_request *aer = arg; + struct spdk_nvme_ctrlr *ctrlr = aer->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + union spdk_nvme_async_event_completion event; + int rc; + + if (cpl->status.sct == SPDK_NVME_SCT_GENERIC && + cpl->status.sc == SPDK_NVME_SC_ABORTED_SQ_DELETION) { + /* + * This is simulated when controller is being shut down, to + * effectively abort outstanding asynchronous event requests + * and make sure all memory is freed. Do not repost the + * request in this case. 
+ */ + return; + } + + if (cpl->status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC && + cpl->status.sc == SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED) { + /* + * SPDK will only send as many AERs as the device says it supports, + * so this status code indicates an out-of-spec device. Do not repost + * the request in this case. + */ + SPDK_ERRLOG("Controller appears out-of-spec for asynchronous event request\n" + "handling. Do not repost this AER.\n"); + return; + } + + event.raw = cpl->cdw0; + if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && + (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { + rc = nvme_ctrlr_identify_active_ns(ctrlr); + if (rc) { + return; + } + nvme_ctrlr_update_namespaces(ctrlr); + nvme_io_msg_ctrlr_update(ctrlr); + } + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc && active_proc->aer_cb_fn) { + active_proc->aer_cb_fn(active_proc->aer_cb_arg, cpl); + } + + /* If the ctrlr was removed or in the destruct state, we should not send aer again */ + if (ctrlr->is_removed || ctrlr->is_destructed) { + return; + } + + /* + * Repost another asynchronous event request to replace the one + * that just completed. + */ + if (nvme_ctrlr_construct_and_submit_aer(ctrlr, aer)) { + /* + * We can't do anything to recover from a failure here, + * so just print a warning message and leave the AER unsubmitted. + */ + SPDK_ERRLOG("resubmitting AER failed!\n"); + } +} + +static int +nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_async_event_request *aer) +{ + struct nvme_request *req; + + aer->ctrlr = ctrlr; + req = nvme_allocate_request_null(ctrlr->adminq, nvme_ctrlr_async_event_cb, aer); + aer->req = req; + if (req == NULL) { + return -1; + } + + req->cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static void +nvme_ctrlr_configure_aer_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_async_event_request *aer; + int rc; + uint32_t i; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_NOTICELOG("nvme_ctrlr_configure_aer failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, + ctrlr->opts.admin_timeout_ms); + return; + } + + /* aerl is a zero-based value, so we need to add 1 here. 
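+	 * For example, an aerl of 3 means the controller allows up to 4 outstanding
+	 * AERs; the value is additionally capped to NVME_MAX_ASYNC_EVENTS below.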
*/ + ctrlr->num_aers = spdk_min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl + 1)); + + for (i = 0; i < ctrlr->num_aers; i++) { + aer = &ctrlr->aer[i]; + rc = nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); + if (rc) { + SPDK_ERRLOG("nvme_ctrlr_construct_and_submit_aer failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_configure_aer(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_feat_async_event_configuration config; + int rc; + + config.raw = 0; + config.bits.crit_warn.bits.available_spare = 1; + config.bits.crit_warn.bits.temperature = 1; + config.bits.crit_warn.bits.device_reliability = 1; + config.bits.crit_warn.bits.read_only = 1; + config.bits.crit_warn.bits.volatile_memory_backup = 1; + + if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 2, 0)) { + if (ctrlr->cdata.oaes.ns_attribute_notices) { + config.bits.ns_attr_notice = 1; + } + if (ctrlr->cdata.oaes.fw_activation_notices) { + config.bits.fw_activation_notice = 1; + } + } + if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 3, 0) && ctrlr->cdata.lpa.telemetry) { + config.bits.telemetry_log_notice = 1; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_set_async_event_config(ctrlr, config, + nvme_ctrlr_configure_aer_done, + ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +struct spdk_nvme_ctrlr_process * +nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, pid_t pid) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) { + if (active_proc->pid == pid) { + return active_proc; + } + } + + return NULL; +} + +struct spdk_nvme_ctrlr_process * +nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr) +{ + return nvme_ctrlr_get_process(ctrlr, getpid()); +} + +/** + * This function will be called when a process is using the controller. + * 1. For the primary process, it is called when constructing the controller. + * 2. For the secondary process, it is called at probing the controller. + * Note: will check whether the process is already added for the same process. + */ +int +nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle) +{ + struct spdk_nvme_ctrlr_process *ctrlr_proc; + pid_t pid = getpid(); + + /* Check whether the process is already added or not */ + if (nvme_ctrlr_get_process(ctrlr, pid)) { + return 0; + } + + /* Initialize the per process properties for this ctrlr */ + ctrlr_proc = spdk_zmalloc(sizeof(struct spdk_nvme_ctrlr_process), + 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (ctrlr_proc == NULL) { + SPDK_ERRLOG("failed to allocate memory to track the process props\n"); + + return -1; + } + + ctrlr_proc->is_primary = spdk_process_is_primary(); + ctrlr_proc->pid = pid; + STAILQ_INIT(&ctrlr_proc->active_reqs); + ctrlr_proc->devhandle = devhandle; + ctrlr_proc->ref = 0; + TAILQ_INIT(&ctrlr_proc->allocated_io_qpairs); + + TAILQ_INSERT_TAIL(&ctrlr->active_procs, ctrlr_proc, tailq); + + return 0; +} + +/** + * This function will be called when the process detaches the controller. + * Note: the ctrlr_lock must be held when calling this function. 
+ */ +static void +nvme_ctrlr_remove_process(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_ctrlr_process *proc) +{ + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + assert(STAILQ_EMPTY(&proc->active_reqs)); + + TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + TAILQ_REMOVE(&ctrlr->active_procs, proc, tailq); + + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + spdk_pci_device_detach(proc->devhandle); + } + + spdk_free(proc); +} + +/** + * This function will be called when the process exited unexpectedly + * in order to free any incomplete nvme request, allocated IO qpairs + * and allocated memory. + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_ctrlr_cleanup_process(struct spdk_nvme_ctrlr_process *proc) +{ + struct nvme_request *req, *tmp_req; + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { + STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); + + assert(req->pid == proc->pid); + + nvme_free_request(req); + } + + TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) { + TAILQ_REMOVE(&proc->allocated_io_qpairs, qpair, per_process_tailq); + + /* + * The process may have been killed while some qpairs were in their + * completion context. Clear that flag here to allow these IO + * qpairs to be deleted. + */ + qpair->in_completion_context = 0; + + qpair->no_deletion_notification_needed = 1; + + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + spdk_free(proc); +} + +/** + * This function will be called when destructing the controller. + * 1. There is no more admin request on this controller. + * 2. Clean up any left resource allocation when its associated process is gone. + */ +void +nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc, *tmp; + + /* Free all the processes' properties and make sure no pending admin IOs */ + TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) { + TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq); + + assert(STAILQ_EMPTY(&active_proc->active_reqs)); + + spdk_free(active_proc); + } +} + +/** + * This function will be called when any other process attaches or + * detaches the controller in order to cleanup those unexpectedly + * terminated processes. + * Note: the ctrlr_lock must be held when calling this function. 
+ */ +static int +nvme_ctrlr_remove_inactive_proc(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc, *tmp; + int active_proc_count = 0; + + TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) { + if ((kill(active_proc->pid, 0) == -1) && (errno == ESRCH)) { + SPDK_ERRLOG("process %d terminated unexpected\n", active_proc->pid); + + TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq); + + nvme_ctrlr_cleanup_process(active_proc); + } else { + active_proc_count++; + } + } + + return active_proc_count; +} + +void +nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_remove_inactive_proc(ctrlr); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->ref++; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +void +nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + int proc_count; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + proc_count = nvme_ctrlr_remove_inactive_proc(ctrlr); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->ref--; + assert(active_proc->ref >= 0); + + /* + * The last active process will be removed at the end of + * the destruction of the controller. + */ + if (active_proc->ref == 0 && proc_count != 1) { + nvme_ctrlr_remove_process(ctrlr, active_proc); + } + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +int +nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + int ref = 0; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_remove_inactive_proc(ctrlr); + + TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) { + ref += active_proc->ref; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return ref; +} + +/** + * Get the PCI device handle which is only visible to its associated process. + */ +struct spdk_pci_device * +nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_pci_device *devhandle = NULL; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + devhandle = active_proc->devhandle; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return devhandle; +} + +/** + * This function will be called repeatedly during initialization until the controller is ready. + */ +int +nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + uint32_t ready_timeout_in_ms; + int rc = 0; + + /* + * May need to avoid accessing any register on the target controller + * for a while. Return early without touching the FSM. + * Check sleep_timeout_tsc > 0 for unit test. + */ + if ((ctrlr->sleep_timeout_tsc > 0) && + (spdk_get_ticks() <= ctrlr->sleep_timeout_tsc)) { + return 0; + } + ctrlr->sleep_timeout_tsc = 0; + + if (nvme_ctrlr_get_cc(ctrlr, &cc) || + nvme_ctrlr_get_csts(ctrlr, &csts)) { + if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE) { + /* While a device is resetting, it may be unable to service MMIO reads + * temporarily. Allow for this case. 
+ */ + SPDK_ERRLOG("Get registers failed while waiting for CSTS.RDY == 0\n"); + goto init_timeout; + } + SPDK_ERRLOG("Failed to read CC and CSTS in state %d\n", ctrlr->state); + return -EIO; + } + + ready_timeout_in_ms = 500 * ctrlr->cap.bits.to; + + /* + * Check if the current initialization step is done or has timed out. + */ + switch (ctrlr->state) { + case NVME_CTRLR_STATE_INIT_DELAY: + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, ready_timeout_in_ms); + if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_INIT) { + /* + * Controller may need some delay before it's enabled. + * + * This is a workaround for an issue where the PCIe-attached NVMe controller + * is not ready after VFIO reset. We delay the initialization rather than the + * enabling itself, because this is required only for the very first enabling + * - directly after a VFIO reset. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Adding 2 second delay before initializing the controller\n"); + ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2000 * spdk_get_ticks_hz() / 1000); + } + break; + + case NVME_CTRLR_STATE_INIT: + /* Begin the hardware initialization by making sure the controller is disabled. */ + if (cc.bits.en) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1\n"); + /* + * Controller is currently enabled. We need to disable it to cause a reset. + * + * If CC.EN = 1 && CSTS.RDY = 0, the controller is in the process of becoming ready. + * Wait for the ready bit to be 1 before disabling the controller. + */ + if (csts.bits.rdy == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 0 - waiting for reset to complete\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1, ready_timeout_in_ms); + return 0; + } + + /* CC.EN = 1 && CSTS.RDY == 1, so we can immediately disable the controller. */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n"); + cc.bits.en = 0; + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + + /* + * Wait 2.5 seconds before accessing PCI registers. + * Not using sleep() to avoid blocking other controller's initialization. + */ + if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Applying quirk: delay 2.5 seconds before reading registers\n"); + ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2500 * spdk_get_ticks_hz() / 1000); + } + return 0; + } else { + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 1 - waiting for shutdown to complete\n"); + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1: + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - disabling controller\n"); + /* CC.EN = 1 && CSTS.RDY = 1, so we can set CC.EN = 0 now. */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n"); + cc.bits.en = 0; + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0: + if (csts.bits.rdy == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 0\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE, ready_timeout_in_ms); + /* + * Delay 100us before setting CC.EN = 1. 
Some NVMe SSDs miss CC.EN getting + * set to 1 if it is too soon after CSTS.RDY is reported as 0. + */ + spdk_delay_us(100); + return 0; + } + break; + + case NVME_CTRLR_STATE_ENABLE: + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 1\n"); + rc = nvme_ctrlr_enable(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1, ready_timeout_in_ms); + return rc; + + case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1: + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - controller is ready\n"); + /* + * The controller has been enabled. + * Perform the rest of initialization serially. + */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_RESET_ADMIN_QUEUE, + ctrlr->opts.admin_timeout_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_RESET_ADMIN_QUEUE: + nvme_transport_qpair_reset(ctrlr->adminq); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_IDENTIFY: + rc = nvme_ctrlr_identify(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_NUM_QUEUES: + nvme_ctrlr_update_nvmf_ioccsz(ctrlr); + rc = nvme_ctrlr_set_num_queues(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_CONSTRUCT_NS: + rc = nvme_ctrlr_construct_namespaces(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS: + _nvme_ctrlr_identify_active_ns(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_IDENTIFY_NS: + rc = nvme_ctrlr_identify_namespaces(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS: + rc = nvme_ctrlr_identify_id_desc_namespaces(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_CONFIGURE_AER: + rc = nvme_ctrlr_configure_aer(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES: + rc = nvme_ctrlr_set_supported_log_pages(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES: + nvme_ctrlr_set_supported_features(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_DB_BUF_CFG, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_SET_DB_BUF_CFG: + rc = nvme_ctrlr_set_doorbell_buffer_config(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT: + rc = nvme_ctrlr_set_keep_alive_timeout(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_HOST_ID: + rc = nvme_ctrlr_set_host_id(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_READY: + 
SPDK_DEBUGLOG(SPDK_LOG_NVME, "Ctrlr already in ready state\n"); + return 0; + + case NVME_CTRLR_STATE_ERROR: + SPDK_ERRLOG("Ctrlr %s is in error state\n", ctrlr->trid.traddr); + return -1; + + default: + assert(0); + return -1; + } + +init_timeout: + if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE && + spdk_get_ticks() > ctrlr->state_timeout_tsc) { + SPDK_ERRLOG("Initialization timed out in state %d\n", ctrlr->state); + return -1; + } + + return rc; +} + +int +nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx) +{ + pthread_mutexattr_t attr; + int rc = 0; + + if (pthread_mutexattr_init(&attr)) { + return -1; + } + if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE) || +#ifndef __FreeBSD__ + pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) || + pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) || +#endif + pthread_mutex_init(mtx, &attr)) { + rc = -1; + } + pthread_mutexattr_destroy(&attr); + return rc; +} + +int +nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT_DELAY, NVME_TIMEOUT_INFINITE); + } else { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE); + } + + if (ctrlr->opts.admin_queue_size > SPDK_NVME_ADMIN_QUEUE_MAX_ENTRIES) { + SPDK_ERRLOG("admin_queue_size %u exceeds max defined by NVMe spec, use max value\n", + ctrlr->opts.admin_queue_size); + ctrlr->opts.admin_queue_size = SPDK_NVME_ADMIN_QUEUE_MAX_ENTRIES; + } + + if (ctrlr->opts.admin_queue_size < SPDK_NVME_ADMIN_QUEUE_MIN_ENTRIES) { + SPDK_ERRLOG("admin_queue_size %u is less than minimum defined by NVMe spec, use min value\n", + ctrlr->opts.admin_queue_size); + ctrlr->opts.admin_queue_size = SPDK_NVME_ADMIN_QUEUE_MIN_ENTRIES; + } + + ctrlr->flags = 0; + ctrlr->free_io_qids = NULL; + ctrlr->is_resetting = false; + ctrlr->is_failed = false; + ctrlr->is_destructed = false; + + TAILQ_INIT(&ctrlr->active_io_qpairs); + STAILQ_INIT(&ctrlr->queued_aborts); + ctrlr->outstanding_aborts = 0; + + rc = nvme_robust_mutex_init_recursive_shared(&ctrlr->ctrlr_lock); + if (rc != 0) { + return rc; + } + + TAILQ_INIT(&ctrlr->active_procs); + + return rc; +} + +/* This function should be called once at ctrlr initialization to set up constant properties. */ +void +nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap, + const union spdk_nvme_vs_register *vs) +{ + ctrlr->cap = *cap; + ctrlr->vs = *vs; + + if (ctrlr->cap.bits.ams & SPDK_NVME_CAP_AMS_WRR) { + ctrlr->flags |= SPDK_NVME_CTRLR_WRR_SUPPORTED; + } + + ctrlr->min_page_size = 1u << (12 + ctrlr->cap.bits.mpsmin); + + /* For now, always select page_size == min_page_size. 
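+	 * With MPSMIN == 0 this works out to 4 KiB (1u << 12), the most common case.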
*/ + ctrlr->page_size = ctrlr->min_page_size; + + ctrlr->opts.io_queue_size = spdk_max(ctrlr->opts.io_queue_size, SPDK_NVME_IO_QUEUE_MIN_ENTRIES); + ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, MAX_IO_QUEUE_ENTRIES); + ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, ctrlr->cap.bits.mqes + 1u); + + ctrlr->opts.io_queue_requests = spdk_max(ctrlr->opts.io_queue_requests, ctrlr->opts.io_queue_size); +} + +void +nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr) +{ + pthread_mutex_destroy(&ctrlr->ctrlr_lock); +} + +void +nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_qpair *qpair, *tmp; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Prepare to destruct SSD: %s\n", ctrlr->trid.traddr); + + ctrlr->is_destructed = true; + + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + + nvme_ctrlr_abort_queued_aborts(ctrlr); + nvme_transport_admin_qpair_abort_aers(ctrlr->adminq); + + TAILQ_FOREACH_SAFE(qpair, &ctrlr->active_io_qpairs, tailq, tmp) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + nvme_ctrlr_free_doorbell_buffer(ctrlr); + + if (ctrlr->opts.no_shn_notification) { + SPDK_INFOLOG(SPDK_LOG_NVME, "Disable SSD: %s without shutdown notification\n", + ctrlr->trid.traddr); + nvme_ctrlr_disable(ctrlr); + } else { + nvme_ctrlr_shutdown(ctrlr); + } + + nvme_ctrlr_destruct_namespaces(ctrlr); + + spdk_bit_array_free(&ctrlr->free_io_qids); + + nvme_transport_ctrlr_destruct(ctrlr); +} + +int +nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req) +{ + return nvme_qpair_submit_request(ctrlr->adminq, req); +} + +static void +nvme_keep_alive_completion(void *cb_ctx, const struct spdk_nvme_cpl *cpl) +{ + /* Do nothing */ +} + +/* + * Check if we need to send a Keep Alive command. + * Caller must hold ctrlr->ctrlr_lock. 
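+ * The interval is half of the (possibly controller-adjusted) keep alive timeout,
+ * as computed in nvme_ctrlr_set_keep_alive_timeout_done(), which leaves headroom
+ * before the controller-side timeout expires.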
+ */ +static void +nvme_ctrlr_keep_alive(struct spdk_nvme_ctrlr *ctrlr) +{ + uint64_t now; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + now = spdk_get_ticks(); + if (now < ctrlr->next_keep_alive_tick) { + return; + } + + req = nvme_allocate_request_null(ctrlr->adminq, nvme_keep_alive_completion, NULL); + if (req == NULL) { + return; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_KEEP_ALIVE; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + if (rc != 0) { + SPDK_ERRLOG("Submitting Keep Alive failed\n"); + } + + ctrlr->next_keep_alive_tick = now + ctrlr->keep_alive_interval_ticks; +} + +int32_t +spdk_nvme_ctrlr_process_admin_completions(struct spdk_nvme_ctrlr *ctrlr) +{ + int32_t num_completions; + int32_t rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (ctrlr->keep_alive_interval_ticks) { + nvme_ctrlr_keep_alive(ctrlr); + } + + rc = nvme_io_msg_process(ctrlr); + if (rc < 0) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; + } + num_completions = rc; + + rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (rc < 0) { + num_completions = rc; + } else { + num_completions += rc; + } + + return num_completions; +} + +const struct spdk_nvme_ctrlr_data * +spdk_nvme_ctrlr_get_data(struct spdk_nvme_ctrlr *ctrlr) +{ + return &ctrlr->cdata; +} + +union spdk_nvme_csts_register spdk_nvme_ctrlr_get_regs_csts(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_csts_register csts; + + if (nvme_ctrlr_get_csts(ctrlr, &csts)) { + csts.raw = 0xFFFFFFFFu; + } + return csts; +} + +union spdk_nvme_cap_register spdk_nvme_ctrlr_get_regs_cap(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->cap; +} + +union spdk_nvme_vs_register spdk_nvme_ctrlr_get_regs_vs(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->vs; +} + +union spdk_nvme_cmbsz_register spdk_nvme_ctrlr_get_regs_cmbsz(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cmbsz_register cmbsz; + + if (nvme_ctrlr_get_cmbsz(ctrlr, &cmbsz)) { + cmbsz.raw = 0; + } + + return cmbsz; +} + +uint32_t +spdk_nvme_ctrlr_get_num_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->num_ns; +} + +static int32_t +nvme_ctrlr_active_ns_idx(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + int32_t result = -1; + + if (ctrlr->active_ns_list == NULL || nsid == 0 || nsid > ctrlr->num_ns) { + return result; + } + + int32_t lower = 0; + int32_t upper = ctrlr->num_ns - 1; + int32_t mid; + + while (lower <= upper) { + mid = lower + (upper - lower) / 2; + if (ctrlr->active_ns_list[mid] == nsid) { + result = mid; + break; + } else { + if (ctrlr->active_ns_list[mid] != 0 && ctrlr->active_ns_list[mid] < nsid) { + lower = mid + 1; + } else { + upper = mid - 1; + } + + } + } + + return result; +} + +bool +spdk_nvme_ctrlr_is_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + return nvme_ctrlr_active_ns_idx(ctrlr, nsid) != -1; +} + +uint32_t +spdk_nvme_ctrlr_get_first_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->active_ns_list ? 
ctrlr->active_ns_list[0] : 0; +} + +uint32_t +spdk_nvme_ctrlr_get_next_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t prev_nsid) +{ + int32_t nsid_idx = nvme_ctrlr_active_ns_idx(ctrlr, prev_nsid); + if (ctrlr->active_ns_list && nsid_idx >= 0 && (uint32_t)nsid_idx < ctrlr->num_ns - 1) { + return ctrlr->active_ns_list[nsid_idx + 1]; + } + return 0; +} + +struct spdk_nvme_ns * +spdk_nvme_ctrlr_get_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + if (nsid < 1 || nsid > ctrlr->num_ns) { + return NULL; + } + + return &ctrlr->ns[nsid - 1]; +} + +struct spdk_pci_device * +spdk_nvme_ctrlr_get_pci_device(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr == NULL) { + return NULL; + } + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + return NULL; + } + + return nvme_ctrlr_proc_get_devhandle(ctrlr); +} + +uint32_t +spdk_nvme_ctrlr_get_max_xfer_size(const struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->max_xfer_size; +} + +void +spdk_nvme_ctrlr_register_aer_callback(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_aer_cb aer_cb_fn, + void *aer_cb_arg) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->aer_cb_fn = aer_cb_fn; + active_proc->aer_cb_arg = aer_cb_arg; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +void +spdk_nvme_ctrlr_register_timeout_callback(struct spdk_nvme_ctrlr *ctrlr, + uint64_t timeout_us, spdk_nvme_timeout_cb cb_fn, void *cb_arg) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->timeout_ticks = timeout_us * spdk_get_ticks_hz() / 1000000ULL; + active_proc->timeout_cb_fn = cb_fn; + active_proc->timeout_cb_arg = cb_arg; + } + + ctrlr->timeout_enabled = true; + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +bool +spdk_nvme_ctrlr_is_log_page_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page) +{ + /* No bounds check necessary, since log_page is uint8_t and log_page_supported has 256 entries */ + SPDK_STATIC_ASSERT(sizeof(ctrlr->log_page_supported) == 256, "log_page_supported size mismatch"); + return ctrlr->log_page_supported[log_page]; +} + +bool +spdk_nvme_ctrlr_is_feature_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature_code) +{ + /* No bounds check necessary, since feature_code is uint8_t and feature_supported has 256 entries */ + SPDK_STATIC_ASSERT(sizeof(ctrlr->feature_supported) == 256, "feature_supported size mismatch"); + return ctrlr->feature_supported[feature_code]; +} + +int +spdk_nvme_ctrlr_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload) +{ + struct nvme_completion_poll_status *status; + int res; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_attach_ns(ctrlr, nsid, payload, + nvme_completion_poll_cb, status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_attach_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + return nvme_ns_construct(ns, nsid, ctrlr); +} + +int 
+spdk_nvme_ctrlr_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload) +{ + struct nvme_completion_poll_status *status; + int res; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_detach_ns(ctrlr, nsid, payload, + nvme_completion_poll_cb, status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_detach_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + /* Inactive NS */ + nvme_ns_destruct(ns); + + return 0; +} + +uint32_t +spdk_nvme_ctrlr_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload) +{ + struct nvme_completion_poll_status *status; + int res; + uint32_t nsid; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return 0; + } + + res = nvme_ctrlr_cmd_create_ns(ctrlr, payload, nvme_completion_poll_cb, status); + if (res) { + free(status); + return 0; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_create_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return 0; + } + + nsid = status->cpl.cdw0; + ns = &ctrlr->ns[nsid - 1]; + free(status); + /* Inactive NS */ + res = nvme_ns_construct(ns, nsid, ctrlr); + if (res) { + return 0; + } + + /* Return the namespace ID that was created */ + return nsid; +} + +int +spdk_nvme_ctrlr_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + struct nvme_completion_poll_status *status; + int res; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_delete_ns(ctrlr, nsid, nvme_completion_poll_cb, status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_delete_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + nvme_ns_destruct(ns); + + return 0; +} + +int +spdk_nvme_ctrlr_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_format *format) +{ + struct nvme_completion_poll_status *status; + int res; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_format(ctrlr, nsid, format, nvme_completion_poll_cb, + status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_format failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + return spdk_nvme_ctrlr_reset(ctrlr); +} + +int +spdk_nvme_ctrlr_update_firmware(struct spdk_nvme_ctrlr *ctrlr, void *payload, uint32_t size, + int slot, enum spdk_nvme_fw_commit_action commit_action, struct spdk_nvme_status *completion_status) +{ + struct spdk_nvme_fw_commit fw_commit; + 
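+	/*
+	 * The image is transferred with repeated Firmware Image Download commands in
+	 * min_page_size chunks, then activated with a single Firmware Commit below.
+	 */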
struct nvme_completion_poll_status *status; + int res; + unsigned int size_remaining; + unsigned int offset; + unsigned int transfer; + void *p; + + if (!completion_status) { + return -EINVAL; + } + memset(completion_status, 0, sizeof(struct spdk_nvme_status)); + if (size % 4) { + SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid size!\n"); + return -1; + } + + /* Current support only for SPDK_NVME_FW_COMMIT_REPLACE_IMG + * and SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG + */ + if ((commit_action != SPDK_NVME_FW_COMMIT_REPLACE_IMG) && + (commit_action != SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid command!\n"); + return -1; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + /* Firmware download */ + size_remaining = size; + offset = 0; + p = payload; + + while (size_remaining > 0) { + transfer = spdk_min(size_remaining, ctrlr->min_page_size); + + memset(status, 0, sizeof(*status)); + res = nvme_ctrlr_cmd_fw_image_download(ctrlr, transfer, offset, p, + nvme_completion_poll_cb, + status); + if (res) { + free(status); + return res; + } + + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_fw_image_download failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + p += transfer; + offset += transfer; + size_remaining -= transfer; + } + + /* Firmware commit */ + memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit)); + fw_commit.fs = slot; + fw_commit.ca = commit_action; + + memset(status, 0, sizeof(*status)); + res = nvme_ctrlr_cmd_fw_commit(ctrlr, &fw_commit, nvme_completion_poll_cb, + status); + if (res) { + free(status); + return res; + } + + res = nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock); + + memcpy(completion_status, &status->cpl.status, sizeof(struct spdk_nvme_status)); + + if (!status->timed_out) { + free(status); + } + + if (res) { + if (completion_status->sct != SPDK_NVME_SCT_COMMAND_SPECIFIC || + completion_status->sc != SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET) { + if (completion_status->sct == SPDK_NVME_SCT_COMMAND_SPECIFIC && + completion_status->sc == SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET) { + SPDK_NOTICELOG("firmware activation requires conventional reset to be performed. 
\n");
+			} else {
+				SPDK_ERRLOG("nvme_ctrlr_cmd_fw_commit failed!\n");
+			}
+			return -ENXIO;
+		}
+	}
+
+	return spdk_nvme_ctrlr_reset(ctrlr);
+}
+
+int
+spdk_nvme_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr)
+{
+	int rc, size;
+	union spdk_nvme_cmbsz_register cmbsz;
+
+	cmbsz = spdk_nvme_ctrlr_get_regs_cmbsz(ctrlr);
+
+	if (cmbsz.bits.rds == 0 || cmbsz.bits.wds == 0) {
+		return -ENOTSUP;
+	}
+
+	size = cmbsz.bits.sz * (0x1000 << (cmbsz.bits.szu * 4));
+
+	nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+	rc = nvme_transport_ctrlr_reserve_cmb(ctrlr);
+	nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+	if (rc < 0) {
+		return rc;
+	}
+
+	return size;
+}
+
+void *
+spdk_nvme_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size)
+{
+	void *buf;
+
+	nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+	buf = nvme_transport_ctrlr_map_cmb(ctrlr, size);
+	nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+	return buf;
+}
+
+void
+spdk_nvme_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr)
+{
+	nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+	nvme_transport_ctrlr_unmap_cmb(ctrlr);
+	nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+bool
+spdk_nvme_ctrlr_is_discovery(struct spdk_nvme_ctrlr *ctrlr)
+{
+	assert(ctrlr);
+
+	return !strncmp(ctrlr->trid.subnqn, SPDK_NVMF_DISCOVERY_NQN,
+			strlen(SPDK_NVMF_DISCOVERY_NQN));
+}
+
+int
+spdk_nvme_ctrlr_security_receive(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp,
+				 uint16_t spsp, uint8_t nssf, void *payload, size_t size)
+{
+	struct nvme_completion_poll_status *status;
+	int res;
+
+	status = calloc(1, sizeof(*status));
+	if (!status) {
+		SPDK_ERRLOG("Failed to allocate status tracker\n");
+		return -ENOMEM;
+	}
+
+	res = spdk_nvme_ctrlr_cmd_security_receive(ctrlr, secp, spsp, nssf, payload, size,
+			nvme_completion_poll_cb, status);
+	if (res) {
+		free(status);
+		return res;
+	}
+	if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+		SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_security_receive failed!\n");
+		if (!status->timed_out) {
+			free(status);
+		}
+		return -ENXIO;
+	}
+	free(status);
+
+	return 0;
+}
+
+int
+spdk_nvme_ctrlr_security_send(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp,
+			      uint16_t spsp, uint8_t nssf, void *payload, size_t size)
+{
+	struct nvme_completion_poll_status *status;
+	int res;
+
+	status = calloc(1, sizeof(*status));
+	if (!status) {
+		SPDK_ERRLOG("Failed to allocate status tracker\n");
+		return -ENOMEM;
+	}
+
+	res = spdk_nvme_ctrlr_cmd_security_send(ctrlr, secp, spsp, nssf, payload, size,
+						nvme_completion_poll_cb,
+						status);
+	if (res) {
+		free(status);
+		return res;
+	}
+	if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+		SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_security_send failed!\n");
+		if (!status->timed_out) {
+			free(status);
+		}
+		return -ENXIO;
+	}
+
+	free(status);
+
+	return 0;
+}
+
+uint64_t
+spdk_nvme_ctrlr_get_flags(struct spdk_nvme_ctrlr *ctrlr)
+{
+	return ctrlr->flags;
+}
+
+const struct spdk_nvme_transport_id *
+spdk_nvme_ctrlr_get_transport_id(struct spdk_nvme_ctrlr *ctrlr)
+{
+	return &ctrlr->trid;
+}
+
+/* FIXME need to specify max number of iovs */
+int
+spdk_nvme_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
+		   uint32_t len, size_t mps,
+		   void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len))
+{
+	uint64_t prp1, prp2;
+	void *vva;
+	uint32_t i;
+	uint32_t residue_len, nents;
+	uint64_t *prp_list;
+	int iovcnt;
+
+	prp1 = cmd->dptr.prp.prp1;
+	prp2 = cmd->dptr.prp.prp2;
+
+	/* PRP1 may start with an unaligned page address */
+	residue_len = 
mps - (prp1 % mps); + residue_len = spdk_min(len, residue_len); + + vva = gpa_to_vva(prv, prp1, residue_len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("GPA to VVA failed\n"); + return -1; + } + iovs[0].iov_base = vva; + iovs[0].iov_len = residue_len; + len -= residue_len; + + if (len) { + if (spdk_unlikely(prp2 == 0)) { + SPDK_ERRLOG("no PRP2, %d remaining\n", len); + return -1; + } + + if (len <= mps) { + /* 2 PRP used */ + iovcnt = 2; + vva = gpa_to_vva(prv, prp2, len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("no VVA for %#lx, len%#x\n", + prp2, len); + return -1; + } + iovs[1].iov_base = vva; + iovs[1].iov_len = len; + } else { + /* PRP list used */ + nents = (len + mps - 1) / mps; + vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list)); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("no VVA for %#lx, nents=%#x\n", + prp2, nents); + return -1; + } + prp_list = vva; + i = 0; + while (len != 0) { + residue_len = spdk_min(len, mps); + vva = gpa_to_vva(prv, prp_list[i], residue_len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("no VVA for %#lx, residue_len=%#x\n", + prp_list[i], residue_len); + return -1; + } + iovs[i + 1].iov_base = vva; + iovs[i + 1].iov_len = residue_len; + len -= residue_len; + i++; + } + iovcnt = i + 1; + } + } else { + /* 1 PRP used */ + iovcnt = 1; + } + + return iovcnt; +} diff --git a/src/spdk/lib/nvme/nvme_ctrlr_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c new file mode 100644 index 000000000..9b16c8d6f --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c @@ -0,0 +1,966 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "nvme_internal.h" + +int +spdk_nvme_ctrlr_io_cmd_raw_no_payload_build(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + return -EINVAL; + } + + memset(&payload, 0, sizeof(payload)); + req = nvme_allocate_request(qpair, &payload, 0, 0, cb_fn, cb_arg); + + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_io_raw(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + + req = nvme_allocate_request_contig(qpair, buf, len, cb_fn, cb_arg); + + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_io_raw_with_md(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, void *md_buf, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct nvme_payload payload; + uint32_t md_len = 0; + + payload = NVME_PAYLOAD_CONTIG(buf, md_buf); + + /* Caculate metadata length */ + if (md_buf) { + struct spdk_nvme_ns *ns = &ctrlr->ns[cmd->nsid - 1]; + + assert(ns->sector_size != 0); + md_len = len / ns->sector_size * ns->md_size; + } + + req = nvme_allocate_request(qpair, &payload, len, md_len, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_admin_raw(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_contig(ctrlr->adminq, buf, len, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, uint8_t cns, uint16_t cntid, uint32_t nsid, + void *payload, size_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, + cb_fn, cb_arg, false); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_IDENTIFY; + cmd->cdw10_bits.identify.cns = cns; + cmd->cdw10_bits.identify.cntid = cntid; + cmd->nsid = nsid; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +int +nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ctrlr_list), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = 
SPDK_NVME_OPC_NS_ATTACHMENT; + cmd->nsid = nsid; + cmd->cdw10_bits.ns_attach.sel = SPDK_NVME_NS_CTRLR_ATTACH; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ctrlr_list), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_ATTACHMENT; + cmd->nsid = nsid; + cmd->cdw10_bits.ns_attach.sel = SPDK_NVME_NS_CTRLR_DETACH; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ns_data), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT; + cmd->cdw10_bits.ns_manage.sel = SPDK_NVME_NS_MANAGEMENT_CREATE; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT; + cmd->cdw10_bits.ns_manage.sel = SPDK_NVME_NS_MANAGEMENT_DELETE; + cmd->nsid = nsid; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr, uint64_t prp1, uint64_t prp2, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG; + cmd->dptr.prp.prp1 = prp1; + cmd->dptr.prp.prp2 = prp2; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, struct spdk_nvme_format *format, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FORMAT_NVM; + cmd->nsid = nsid; + 
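+	/* struct spdk_nvme_format fits in a single 32-bit dword (LBAF, MSET, PI, PIL, SES),
+	 * so it is copied directly into CDW10 of the Format NVM command below.
+	 */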
memcpy(&cmd->cdw10, format, sizeof(uint32_t)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_set_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, uint32_t cdw12, void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SET_FEATURES; + cmd->cdw10_bits.set_features.fid = feature; + cmd->cdw11 = cdw11; + cmd->cdw12 = cdw12; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_FEATURES; + cmd->cdw10_bits.get_features.fid = feature; + cmd->cdw11 = cdw11; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, uint32_t ns_id) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_FEATURES; + cmd->cdw10_bits.get_features.fid = feature; + cmd->cdw11 = cdw11; + cmd->nsid = ns_id; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int spdk_nvme_ctrlr_cmd_set_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, uint32_t cdw12, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, uint32_t ns_id) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SET_FEATURES; + cmd->cdw10_bits.set_features.fid = feature; + cmd->cdw11 = cdw11; + cmd->cdw12 = cdw12; + cmd->nsid = ns_id; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr, + uint32_t num_queues, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + union spdk_nvme_feat_number_of_queues feat_num_queues; + + feat_num_queues.raw = 0; + feat_num_queues.bits.nsqr 
= num_queues - 1; + feat_num_queues.bits.ncqr = num_queues - 1; + + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, feat_num_queues.raw, + 0, + NULL, 0, cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, 0, NULL, 0, + cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr, + union spdk_nvme_feat_async_event_configuration config, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + uint32_t cdw11; + + cdw11 = config.raw; + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, cdw11, 0, + NULL, 0, + cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + union spdk_nvme_feat_host_identifier feat_host_identifier; + + feat_host_identifier.raw = 0; + if (host_id_size == 16) { + /* 128-bit extended host identifier */ + feat_host_identifier.bits.exhid = 1; + } else if (host_id_size == 8) { + /* 64-bit host identifier */ + feat_host_identifier.bits.exhid = 0; + } else { + SPDK_ERRLOG("Invalid host ID size %u\n", host_id_size); + return -EINVAL; + } + + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_HOST_IDENTIFIER, + feat_host_identifier.raw, 0, + host_id, host_id_size, cb_fn, cb_arg); +} + +int +spdk_nvme_ctrlr_cmd_get_log_page_ext(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page, + uint32_t nsid, void *payload, uint32_t payload_size, + uint64_t offset, uint32_t cdw10, + uint32_t cdw11, uint32_t cdw14, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint32_t numd, numdl, numdu; + uint32_t lpol, lpou; + int rc; + + if (payload_size == 0) { + return -EINVAL; + } + + if (offset & 3) { + return -EINVAL; + } + + numd = payload_size / sizeof(uint32_t) - 1u; + numdl = numd & 0xFFFFu; + numdu = (numd >> 16) & 0xFFFFu; + + lpol = (uint32_t)offset; + lpou = (uint32_t)(offset >> 32); + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (offset && !ctrlr->cdata.lpa.edlp) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -EINVAL; + } + + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_LOG_PAGE; + cmd->nsid = nsid; + cmd->cdw10 = cdw10; + cmd->cdw10_bits.get_log_page.numdl = numdl; + cmd->cdw10_bits.get_log_page.lid = log_page; + + cmd->cdw11 = cdw11; + cmd->cdw11_bits.get_log_page.numdu = numdu; + cmd->cdw12 = lpol; + cmd->cdw13 = lpou; + cmd->cdw14 = cdw14; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_log_page(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page, + uint32_t nsid, void *payload, uint32_t payload_size, + uint64_t offset, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return spdk_nvme_ctrlr_cmd_get_log_page_ext(ctrlr, log_page, nsid, payload, + payload_size, offset, 0, 0, 0, cb_fn, cb_arg); +} + +static void +nvme_ctrlr_retry_queued_abort(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_request *next, *tmp; + int rc; + + if (ctrlr->is_resetting || ctrlr->is_destructed) { + return; + } + + STAILQ_FOREACH_SAFE(next, &ctrlr->queued_aborts, stailq, tmp) { + 
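+		/* The current element is always the queue head at this point: dequeue it and
+		 * resubmit. If resubmission fails, complete the abort with an internal device
+		 * error and DNR set so its callback still fires; on success, stop after the
+		 * first resubmitted abort.
+		 */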
STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); + ctrlr->outstanding_aborts++; + rc = nvme_ctrlr_submit_admin_request(ctrlr, next); + if (rc < 0) { + SPDK_ERRLOG("Failed to submit queued abort.\n"); + memset(&next->cpl, 0, sizeof(next->cpl)); + next->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + next->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + next->cpl.status.dnr = 1; + nvme_complete_request(next->cb_fn, next->cb_arg, next->qpair, next, &next->cpl); + nvme_free_request(next); + } else { + /* If the first abort succeeds, stop iterating. */ + break; + } + } +} + +static int +_nvme_ctrlr_submit_abort_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req) +{ + /* ACL is a 0's based value. */ + if (ctrlr->outstanding_aborts >= ctrlr->cdata.acl + 1U) { + STAILQ_INSERT_TAIL(&ctrlr->queued_aborts, req, stailq); + return 0; + } else { + ctrlr->outstanding_aborts++; + return nvme_ctrlr_submit_admin_request(ctrlr, req); + } +} + +static void +nvme_ctrlr_cmd_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = ctx; + struct spdk_nvme_ctrlr *ctrlr; + + ctrlr = req->qpair->ctrlr; + + ctrlr->outstanding_aborts--; + nvme_ctrlr_retry_queued_abort(ctrlr); + + req->user_cb_fn(req->user_cb_arg, cpl); +} + +int +spdk_nvme_ctrlr_cmd_abort(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + uint16_t cid, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + int rc; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, nvme_ctrlr_cmd_abort_cpl, NULL); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + req->cb_arg = req; + req->user_cb_fn = cb_fn; + req->user_cb_arg = cb_arg; + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_ABORT; + cmd->cdw10_bits.abort.sqid = qpair->id; + cmd->cdw10_bits.abort.cid = cid; + + rc = _nvme_ctrlr_submit_abort_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +static void +nvme_complete_abort_request(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = ctx; + struct nvme_request *parent = req->parent; + struct spdk_nvme_ctrlr *ctrlr; + + ctrlr = req->qpair->ctrlr; + + ctrlr->outstanding_aborts--; + nvme_ctrlr_retry_queued_abort(ctrlr); + + nvme_request_remove_child(parent, req); + + if (!spdk_nvme_cpl_is_abort_success(cpl)) { + parent->parent_status.cdw0 |= 1U; + } + + if (parent->num_children == 0) { + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } +} + +static int +nvme_request_add_abort(struct nvme_request *req, void *arg) +{ + struct nvme_request *parent = arg; + struct nvme_request *child; + void *cmd_cb_arg; + + cmd_cb_arg = parent->user_cb_arg; + + if (req->cb_arg != cmd_cb_arg && + (req->parent == NULL || req->parent->cb_arg != cmd_cb_arg)) { + return 0; + } + + child = nvme_allocate_request_null(parent->qpair->ctrlr->adminq, + nvme_complete_abort_request, NULL); + if (child == NULL) { + return -ENOMEM; + } + + child->cb_arg = child; + + child->cmd.opc = SPDK_NVME_OPC_ABORT; + /* Copy SQID from the parent. 
*/ + child->cmd.cdw10_bits.abort.sqid = parent->cmd.cdw10_bits.abort.sqid; + child->cmd.cdw10_bits.abort.cid = req->cmd.cid; + + child->parent = parent; + + TAILQ_INSERT_TAIL(&parent->children, child, child_tailq); + parent->num_children++; + + return 0; +} + +int +spdk_nvme_ctrlr_cmd_abort_ext(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + void *cmd_cb_arg, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + int rc = 0; + struct nvme_request *parent, *child, *tmp; + bool child_failed = false; + int aborted = 0; + + if (cmd_cb_arg == NULL) { + return -EINVAL; + } + + pthread_mutex_lock(&ctrlr->ctrlr_lock); + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + parent = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (parent == NULL) { + pthread_mutex_unlock(&ctrlr->ctrlr_lock); + + return -ENOMEM; + } + + TAILQ_INIT(&parent->children); + parent->num_children = 0; + + parent->cmd.opc = SPDK_NVME_OPC_ABORT; + memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl)); + + /* Hold SQID that the requests to abort are associated with. + * This will be copied to the children. + * + * CID is not set here because the parent is not submitted directly + * and CID is not determined until request to abort is found. + */ + parent->cmd.cdw10_bits.abort.sqid = qpair->id; + + /* This is used to find request to abort. */ + parent->user_cb_arg = cmd_cb_arg; + + /* Add an abort request for each outstanding request which has cmd_cb_arg + * as its callback context. + */ + rc = nvme_transport_qpair_iterate_requests(qpair, nvme_request_add_abort, parent); + if (rc != 0) { + /* Free abort requests already added. */ + child_failed = true; + } + + TAILQ_FOREACH_SAFE(child, &parent->children, child_tailq, tmp) { + if (spdk_likely(!child_failed)) { + rc = _nvme_ctrlr_submit_abort_request(ctrlr, child); + if (spdk_unlikely(rc != 0)) { + child_failed = true; + } + } else { + /* Free remaining abort requests. */ + nvme_request_remove_child(parent, child); + nvme_free_request(child); + } + } + + if (spdk_likely(!child_failed)) { + /* There is no error so far. Abort requests were submitted successfully + * or there was no outstanding request to abort. + * + * Hence abort queued requests which has cmd_cb_arg as its callback + * context next. + */ + aborted = nvme_qpair_abort_queued_reqs(qpair, cmd_cb_arg); + if (parent->num_children == 0) { + /* There was no outstanding request to abort. */ + if (aborted > 0) { + /* The queued requests were successfully aborted. Hence + * complete the parent request with success synchronously. + */ + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } else { + /* There was no queued request to abort. */ + rc = -ENOENT; + } + } + } else { + /* Failed to add or submit abort request. */ + if (parent->num_children != 0) { + /* Return success since we must wait for those children + * to complete but set the parent request to failure. 
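+		 * Bit 0 of CDW0 set to 1 follows the NVMe Abort completion convention
+		 * ("command not aborted"), matching what nvme_complete_abort_request()
+		 * reports when a child abort fails.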
+ */ + parent->parent_status.cdw0 |= 1U; + rc = 0; + } + } + + if (rc != 0) { + nvme_free_request(parent); + } + + pthread_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_fw_commit *fw_commit, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FIRMWARE_COMMIT; + memcpy(&cmd->cdw10, fw_commit, sizeof(uint32_t)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; + +} + +int +nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr, + uint32_t size, uint32_t offset, void *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, size, cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; + cmd->cdw10 = (size >> 2) - 1; + cmd->cdw11 = offset >> 2; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_security_receive(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp, + uint16_t spsp, uint8_t nssf, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, + cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SECURITY_RECEIVE; + cmd->cdw10_bits.sec_send_recv.nssf = nssf; + cmd->cdw10_bits.sec_send_recv.spsp0 = (uint8_t)spsp; + cmd->cdw10_bits.sec_send_recv.spsp1 = (uint8_t)(spsp >> 8); + cmd->cdw10_bits.sec_send_recv.secp = secp; + cmd->cdw11 = payload_size; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_security_send(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp, + uint16_t spsp, uint8_t nssf, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SECURITY_SEND; + cmd->cdw10_bits.sec_send_recv.nssf = nssf; + cmd->cdw10_bits.sec_send_recv.spsp0 = (uint8_t)spsp; + cmd->cdw10_bits.sec_send_recv.spsp1 = (uint8_t)(spsp >> 8); + cmd->cdw10_bits.sec_send_recv.secp = secp; + cmd->cdw11 = payload_size; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +nvme_ctrlr_cmd_sanitize(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_sanitize *sanitize, uint32_t cdw11, + 
spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SANITIZE; + cmd->nsid = nsid; + cmd->cdw11 = cdw11; + memcpy(&cmd->cdw10, sanitize, sizeof(cmd->cdw10)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} diff --git a/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c new file mode 100644 index 000000000..2eba219ce --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c @@ -0,0 +1,88 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/nvme_ocssd.h" +#include "nvme_internal.h" + +bool +spdk_nvme_ctrlr_is_ocssd_supported(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->quirks & NVME_QUIRK_OCSSD) { + /* TODO: There isn't a standardized way to identify Open-Channel SSD + * different verdors may have different conditions. + */ + + /* + * Current QEMU OpenChannel Device needs to check nsdata->vs[0]. + * Here check nsdata->vs[0] of the first namespace. 
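+	 * A value of 0x1 in nsdata->vs[0] is what this device reports for an
+	 * Open-Channel namespace, which is what the check below relies on.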
+ */ + if (ctrlr->cdata.vid == SPDK_PCI_VID_CNEXLABS) { + if (ctrlr->num_ns && ctrlr->nsdata[0].vendor_specific[0] == 0x1) { + return true; + } + } + } + return false; +} + + +int +spdk_nvme_ocssd_ctrlr_cmd_geometry(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + if (!payload || (payload_size != sizeof(struct spdk_ocssd_geometry_data))) { + return -EINVAL; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_GEOMETRY; + cmd->nsid = nsid; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} diff --git a/src/spdk/lib/nvme/nvme_cuse.c b/src/spdk/lib/nvme/nvme_cuse.c new file mode 100644 index 000000000..9a5ee1f0d --- /dev/null +++ b/src/spdk/lib/nvme/nvme_cuse.c @@ -0,0 +1,1115 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#define FUSE_USE_VERSION 31 + +#include <fuse3/cuse_lowlevel.h> + +#include <linux/nvme_ioctl.h> +#include <linux/fs.h> + +#include "nvme_internal.h" +#include "nvme_io_msg.h" +#include "nvme_cuse.h" + +struct cuse_device { + bool is_started; + + char dev_name[128]; + uint32_t index; + int claim_fd; + char lock_name[64]; + + struct spdk_nvme_ctrlr *ctrlr; /**< NVMe controller */ + uint32_t nsid; /**< NVMe name space id, or 0 */ + + pthread_t tid; + struct fuse_session *session; + + struct cuse_device *ctrlr_device; + struct cuse_device *ns_devices; /**< Array of cuse ns devices */ + + TAILQ_ENTRY(cuse_device) tailq; +}; + +static pthread_mutex_t g_cuse_mtx = PTHREAD_MUTEX_INITIALIZER; +static TAILQ_HEAD(, cuse_device) g_ctrlr_ctx_head = TAILQ_HEAD_INITIALIZER(g_ctrlr_ctx_head); +static struct spdk_bit_array *g_ctrlr_started; + +struct cuse_io_ctx { + struct spdk_nvme_cmd nvme_cmd; + enum spdk_nvme_data_transfer data_transfer; + + uint64_t lba; + uint32_t lba_count; + + void *data; + int data_len; + + fuse_req_t req; +}; + +static void +cuse_io_ctx_free(struct cuse_io_ctx *ctx) +{ + spdk_free(ctx->data); + free(ctx); +} + +#define FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, val) \ + if (out_bufsz == 0) { \ + struct iovec out_iov; \ + out_iov.iov_base = (void *)arg; \ + out_iov.iov_len = sizeof(val); \ + fuse_reply_ioctl_retry(req, NULL, 0, &out_iov, 1); \ + return; \ + } + +static void +cuse_nvme_admin_cmd_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct cuse_io_ctx *ctx = arg; + struct iovec out_iov[2]; + struct spdk_nvme_cpl _cpl; + + if (ctx->data_transfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, NULL, 0); + } else { + memcpy(&_cpl, cpl, sizeof(struct spdk_nvme_cpl)); + + out_iov[0].iov_base = &_cpl.cdw0; + out_iov[0].iov_len = sizeof(_cpl.cdw0); + + if (ctx->data_len > 0) { + out_iov[1].iov_base = ctx->data; + out_iov[1].iov_len = ctx->data_len; + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, out_iov, 2); + } else { + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, out_iov, 1); + } + } + + cuse_io_ctx_free(ctx); +} + +static void +cuse_nvme_admin_cmd_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + struct cuse_io_ctx *ctx = arg; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &ctx->nvme_cmd, ctx->data, ctx->data_len, + cuse_nvme_admin_cmd_cb, (void *)ctx); + if (rc < 0) { + fuse_reply_err(ctx->req, EINVAL); + cuse_io_ctx_free(ctx); + } +} + +static void +cuse_nvme_admin_cmd_send(fuse_req_t req, struct nvme_admin_cmd *admin_cmd, + const void *data) +{ + struct cuse_io_ctx *ctx; + struct cuse_device *cuse_device = fuse_req_userdata(req); + int rv; + + ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); + if (!ctx) { + SPDK_ERRLOG("Cannot allocate memory for cuse_io_ctx\n"); + fuse_reply_err(req, ENOMEM); + return; + } + + ctx->req = req; + ctx->data_transfer = spdk_nvme_opc_get_data_transfer(admin_cmd->opcode); + + memset(&ctx->nvme_cmd, 0, sizeof(ctx->nvme_cmd)); + ctx->nvme_cmd.opc = admin_cmd->opcode; + ctx->nvme_cmd.nsid = admin_cmd->nsid; + ctx->nvme_cmd.cdw10 = admin_cmd->cdw10; + ctx->nvme_cmd.cdw11 = admin_cmd->cdw11; + ctx->nvme_cmd.cdw12 = admin_cmd->cdw12; + ctx->nvme_cmd.cdw13 = admin_cmd->cdw13; + ctx->nvme_cmd.cdw14 = admin_cmd->cdw14; + ctx->nvme_cmd.cdw15 = admin_cmd->cdw15; + + ctx->data_len = admin_cmd->data_len; + + if (ctx->data_len > 0) { + ctx->data = spdk_malloc(ctx->data_len, 0, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->data) { + 
SPDK_ERRLOG("Cannot allocate memory for data\n"); + fuse_reply_err(req, ENOMEM); + free(ctx); + return; + } + if (data != NULL) { + memcpy(ctx->data, data, ctx->data_len); + } + } + + rv = nvme_io_msg_send(cuse_device->ctrlr, 0, cuse_nvme_admin_cmd_execute, ctx); + if (rv) { + SPDK_ERRLOG("Cannot send io msg to the controller\n"); + fuse_reply_err(req, -rv); + cuse_io_ctx_free(ctx); + return; + } +} + +static void +cuse_nvme_admin_cmd(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + struct nvme_admin_cmd *admin_cmd; + struct iovec in_iov[2], out_iov[2]; + + in_iov[0].iov_base = (void *)arg; + in_iov[0].iov_len = sizeof(*admin_cmd); + if (in_bufsz == 0) { + fuse_reply_ioctl_retry(req, in_iov, 1, NULL, 0); + return; + } + + admin_cmd = (struct nvme_admin_cmd *)in_buf; + + switch (spdk_nvme_opc_get_data_transfer(admin_cmd->opcode)) { + case SPDK_NVME_DATA_NONE: + SPDK_ERRLOG("SPDK_NVME_DATA_NONE not implemented\n"); + fuse_reply_err(req, EINVAL); + return; + case SPDK_NVME_DATA_HOST_TO_CONTROLLER: + if (admin_cmd->addr != 0) { + in_iov[1].iov_base = (void *)admin_cmd->addr; + in_iov[1].iov_len = admin_cmd->data_len; + if (in_bufsz == sizeof(*admin_cmd)) { + fuse_reply_ioctl_retry(req, in_iov, 2, NULL, 0); + return; + } + cuse_nvme_admin_cmd_send(req, admin_cmd, in_buf + sizeof(*admin_cmd)); + } else { + cuse_nvme_admin_cmd_send(req, admin_cmd, NULL); + } + return; + case SPDK_NVME_DATA_CONTROLLER_TO_HOST: + if (out_bufsz == 0) { + out_iov[0].iov_base = &((struct nvme_admin_cmd *)arg)->result; + out_iov[0].iov_len = sizeof(uint32_t); + if (admin_cmd->data_len > 0) { + out_iov[1].iov_base = (void *)admin_cmd->addr; + out_iov[1].iov_len = admin_cmd->data_len; + fuse_reply_ioctl_retry(req, in_iov, 1, out_iov, 2); + } else { + fuse_reply_ioctl_retry(req, in_iov, 1, out_iov, 1); + } + return; + } + + cuse_nvme_admin_cmd_send(req, admin_cmd, NULL); + + return; + case SPDK_NVME_DATA_BIDIRECTIONAL: + fuse_reply_err(req, EINVAL); + return; + } +} + +static void +cuse_nvme_reset_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + fuse_req_t req = arg; + + rc = spdk_nvme_ctrlr_reset(ctrlr); + if (rc) { + fuse_reply_err(req, rc); + return; + } + + fuse_reply_ioctl_iov(req, 0, NULL, 0); +} + +static void +cuse_nvme_reset(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + int rv; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + if (cuse_device->nsid) { + SPDK_ERRLOG("Namespace reset not supported\n"); + fuse_reply_err(req, EINVAL); + return; + } + + rv = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_reset_execute, (void *)req); + if (rv) { + SPDK_ERRLOG("Cannot send reset\n"); + fuse_reply_err(req, EINVAL); + } +} + +/***************************************************************************** + * Namespace IO requests + */ + +static void +cuse_nvme_submit_io_write_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct cuse_io_ctx *ctx = (struct cuse_io_ctx *)ref; + + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, NULL, 0); + + cuse_io_ctx_free(ctx); +} + +static void +cuse_nvme_submit_io_write_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + struct cuse_io_ctx *ctx = arg; + struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + + rc = spdk_nvme_ns_cmd_write(ns, ctrlr->external_io_msgs_qpair, ctx->data, + ctx->lba, /* 
LBA start */ + ctx->lba_count, /* number of LBAs */ + cuse_nvme_submit_io_write_done, ctx, 0); + + if (rc != 0) { + SPDK_ERRLOG("write failed: rc = %d\n", rc); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + return; + } +} + +static void +cuse_nvme_submit_io_write(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + const struct nvme_user_io *user_io = in_buf; + struct cuse_io_ctx *ctx; + struct spdk_nvme_ns *ns; + uint32_t block_size; + int rc; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); + if (!ctx) { + SPDK_ERRLOG("Cannot allocate memory for context\n"); + fuse_reply_err(req, ENOMEM); + return; + } + + ctx->req = req; + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + block_size = spdk_nvme_ns_get_sector_size(ns); + + ctx->lba = user_io->slba; + ctx->lba_count = user_io->nblocks + 1; + ctx->data_len = ctx->lba_count * block_size; + + ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (ctx->data == NULL) { + SPDK_ERRLOG("Write buffer allocation failed\n"); + fuse_reply_err(ctx->req, ENOMEM); + free(ctx); + return; + } + + memcpy(ctx->data, in_buf + sizeof(*user_io), ctx->data_len); + + rc = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_submit_io_write_cb, + ctx); + if (rc < 0) { + SPDK_ERRLOG("Cannot send write io\n"); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + } +} + +static void +cuse_nvme_submit_io_read_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct cuse_io_ctx *ctx = (struct cuse_io_ctx *)ref; + struct iovec iov; + + iov.iov_base = ctx->data; + iov.iov_len = ctx->data_len; + + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, &iov, 1); + + cuse_io_ctx_free(ctx); +} + +static void +cuse_nvme_submit_io_read_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + struct cuse_io_ctx *ctx = arg; + struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + + rc = spdk_nvme_ns_cmd_read(ns, ctrlr->external_io_msgs_qpair, ctx->data, + ctx->lba, /* LBA start */ + ctx->lba_count, /* number of LBAs */ + cuse_nvme_submit_io_read_done, ctx, 0); + + if (rc != 0) { + SPDK_ERRLOG("read failed: rc = %d\n", rc); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + return; + } +} + +static void +cuse_nvme_submit_io_read(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + int rc; + struct cuse_io_ctx *ctx; + const struct nvme_user_io *user_io = in_buf; + struct cuse_device *cuse_device = fuse_req_userdata(req); + struct spdk_nvme_ns *ns; + uint32_t block_size; + + ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); + if (!ctx) { + SPDK_ERRLOG("Cannot allocate memory for context\n"); + fuse_reply_err(req, ENOMEM); + return; + } + + ctx->req = req; + ctx->lba = user_io->slba; + ctx->lba_count = user_io->nblocks; + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + block_size = spdk_nvme_ns_get_sector_size(ns); + + ctx->data_len = ctx->lba_count * block_size; + ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (ctx->data == NULL) { + SPDK_ERRLOG("Read buffer allocation failed\n"); + fuse_reply_err(ctx->req, ENOMEM); + free(ctx); + return; + } + + rc = nvme_io_msg_send(cuse_device->ctrlr, 
cuse_device->nsid, cuse_nvme_submit_io_read_cb, ctx); + if (rc < 0) { + SPDK_ERRLOG("Cannot send read io\n"); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + } +} + + +static void +cuse_nvme_submit_io(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + const struct nvme_user_io *user_io; + struct iovec in_iov[2], out_iov; + + in_iov[0].iov_base = (void *)arg; + in_iov[0].iov_len = sizeof(*user_io); + if (in_bufsz == 0) { + fuse_reply_ioctl_retry(req, in_iov, 1, NULL, 0); + return; + } + + user_io = in_buf; + + switch (user_io->opcode) { + case SPDK_NVME_OPC_READ: + out_iov.iov_base = (void *)user_io->addr; + out_iov.iov_len = (user_io->nblocks + 1) * 512; + if (out_bufsz == 0) { + fuse_reply_ioctl_retry(req, in_iov, 1, &out_iov, 1); + return; + } + + cuse_nvme_submit_io_read(req, cmd, arg, fi, flags, in_buf, + in_bufsz, out_bufsz); + break; + case SPDK_NVME_OPC_WRITE: + in_iov[1].iov_base = (void *)user_io->addr; + in_iov[1].iov_len = (user_io->nblocks + 1) * 512; + if (in_bufsz == sizeof(*user_io)) { + fuse_reply_ioctl_retry(req, in_iov, 2, NULL, 0); + return; + } + + cuse_nvme_submit_io_write(req, cmd, arg, fi, flags, in_buf, + in_bufsz, out_bufsz); + + break; + default: + SPDK_ERRLOG("SUBMIT_IO: opc:%d not valid\n", user_io->opcode); + fuse_reply_err(req, EINVAL); + return; + } + +} + +/***************************************************************************** + * Other namespace IOCTLs + */ +static void +cuse_blkgetsize64(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + uint64_t size; + struct spdk_nvme_ns *ns; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, size); + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + size = spdk_nvme_ns_get_num_sectors(ns); + fuse_reply_ioctl(req, 0, &size, sizeof(size)); +} + +static void +cuse_blkpbszget(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + int pbsz; + struct spdk_nvme_ns *ns; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, pbsz); + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + pbsz = spdk_nvme_ns_get_sector_size(ns); + fuse_reply_ioctl(req, 0, &pbsz, sizeof(pbsz)); +} + +static void +cuse_blkgetsize(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + long size; + struct spdk_nvme_ns *ns; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, size); + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + + /* return size in 512 bytes blocks */ + size = spdk_nvme_ns_get_num_sectors(ns) * 512 / spdk_nvme_ns_get_sector_size(ns); + fuse_reply_ioctl(req, 0, &size, sizeof(size)); +} + +static void +cuse_getid(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + struct cuse_device *cuse_device = fuse_req_userdata(req); + + fuse_reply_ioctl(req, cuse_device->nsid, NULL, 0); +} + +static void +cuse_ctrlr_ioctl(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) 
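+/* Controller-level ioctl dispatch: only NVME_IOCTL_ADMIN_CMD and NVME_IOCTL_RESET are
+ * handled; 32-bit compat ioctls get ENOSYS and anything else is rejected with EINVAL.
+ */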
+{ + if (flags & FUSE_IOCTL_COMPAT) { + fuse_reply_err(req, ENOSYS); + return; + } + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + cuse_nvme_admin_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case NVME_IOCTL_RESET: + cuse_nvme_reset(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + default: + SPDK_ERRLOG("Unsupported IOCTL 0x%X.\n", cmd); + fuse_reply_err(req, EINVAL); + } +} + +static void +cuse_ns_ioctl(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + if (flags & FUSE_IOCTL_COMPAT) { + fuse_reply_err(req, ENOSYS); + return; + } + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + cuse_nvme_admin_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case NVME_IOCTL_SUBMIT_IO: + cuse_nvme_submit_io(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case NVME_IOCTL_ID: + cuse_getid(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case BLKPBSZGET: + cuse_blkpbszget(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case BLKGETSIZE: + /* Returns the device size as a number of 512-byte blocks (returns pointer to long) */ + cuse_blkgetsize(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case BLKGETSIZE64: + /* Returns the device size in sectors (returns pointer to uint64_t) */ + cuse_blkgetsize64(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + default: + SPDK_ERRLOG("Unsupported IOCTL 0x%X.\n", cmd); + fuse_reply_err(req, EINVAL); + } +} + +/***************************************************************************** + * CUSE threads initialization. + */ + +static void cuse_open(fuse_req_t req, struct fuse_file_info *fi) +{ + fuse_reply_open(req, fi); +} + +static const struct cuse_lowlevel_ops cuse_ctrlr_clop = { + .open = cuse_open, + .ioctl = cuse_ctrlr_ioctl, +}; + +static const struct cuse_lowlevel_ops cuse_ns_clop = { + .open = cuse_open, + .ioctl = cuse_ns_ioctl, +}; + +static void * +cuse_thread(void *arg) +{ + struct cuse_device *cuse_device = arg; + char *cuse_argv[] = { "cuse", "-f" }; + int cuse_argc = SPDK_COUNTOF(cuse_argv); + char devname_arg[128 + 8]; + const char *dev_info_argv[] = { devname_arg }; + struct cuse_info ci; + int multithreaded; + int rc; + struct fuse_buf buf = { .mem = NULL }; + struct pollfd fds; + int timeout_msecs = 500; + + spdk_unaffinitize_thread(); + + snprintf(devname_arg, sizeof(devname_arg), "DEVNAME=%s", cuse_device->dev_name); + + memset(&ci, 0, sizeof(ci)); + ci.dev_info_argc = 1; + ci.dev_info_argv = dev_info_argv; + ci.flags = CUSE_UNRESTRICTED_IOCTL; + + if (cuse_device->nsid) { + cuse_device->session = cuse_lowlevel_setup(cuse_argc, cuse_argv, &ci, &cuse_ns_clop, + &multithreaded, cuse_device); + } else { + cuse_device->session = cuse_lowlevel_setup(cuse_argc, cuse_argv, &ci, &cuse_ctrlr_clop, + &multithreaded, cuse_device); + } + if (!cuse_device->session) { + SPDK_ERRLOG("Cannot create cuse session\n"); + goto err; + } + + SPDK_NOTICELOG("fuse session for device %s created\n", cuse_device->dev_name); + + /* Receive and process fuse requests */ + fds.fd = fuse_session_fd(cuse_device->session); + fds.events = POLLIN; + while (!fuse_session_exited(cuse_device->session)) { + rc = poll(&fds, 1, timeout_msecs); + if (rc <= 0) { + continue; + } + rc = fuse_session_receive_buf(cuse_device->session, &buf); + if (rc > 0) { + fuse_session_process_buf(cuse_device->session, &buf); + } + } + 
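+	/* The session has exited: release the last receive buffer and tear the CUSE
+	 * session down before the thread exits.
+	 */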
free(buf.mem); + fuse_session_reset(cuse_device->session); + cuse_lowlevel_teardown(cuse_device->session); +err: + pthread_exit(NULL); +} + +/***************************************************************************** + * CUSE devices management + */ + +static int +cuse_nvme_ns_start(struct cuse_device *ctrlr_device, uint32_t nsid) +{ + struct cuse_device *ns_device; + int rv; + + ns_device = &ctrlr_device->ns_devices[nsid - 1]; + if (ns_device->is_started) { + return 0; + } + + ns_device->ctrlr = ctrlr_device->ctrlr; + ns_device->ctrlr_device = ctrlr_device; + ns_device->nsid = nsid; + rv = snprintf(ns_device->dev_name, sizeof(ns_device->dev_name), "%sn%d", + ctrlr_device->dev_name, ns_device->nsid); + if (rv < 0) { + SPDK_ERRLOG("Device name too long.\n"); + free(ns_device); + return -ENAMETOOLONG; + } + + rv = pthread_create(&ns_device->tid, NULL, cuse_thread, ns_device); + if (rv != 0) { + SPDK_ERRLOG("pthread_create failed\n"); + return -rv; + } + + ns_device->is_started = true; + + return 0; +} + +static void +cuse_nvme_ns_stop(struct cuse_device *ctrlr_device, uint32_t nsid) +{ + struct cuse_device *ns_device; + + ns_device = &ctrlr_device->ns_devices[nsid - 1]; + if (!ns_device->is_started) { + return; + } + + fuse_session_exit(ns_device->session); + pthread_join(ns_device->tid, NULL); + ns_device->is_started = false; +} + +static int +nvme_cuse_claim(struct cuse_device *ctrlr_device, uint32_t index) +{ + int dev_fd; + int pid; + void *dev_map; + struct flock cusedev_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + }; + + snprintf(ctrlr_device->lock_name, sizeof(ctrlr_device->lock_name), + "/tmp/spdk_nvme_cuse_lock_%" PRIu32, index); + + dev_fd = open(ctrlr_device->lock_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (dev_fd == -1) { + SPDK_ERRLOG("could not open %s\n", ctrlr_device->lock_name); + return -errno; + } + + if (ftruncate(dev_fd, sizeof(int)) != 0) { + SPDK_ERRLOG("could not truncate %s\n", ctrlr_device->lock_name); + close(dev_fd); + return -errno; + } + + dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED, dev_fd, 0); + if (dev_map == MAP_FAILED) { + SPDK_ERRLOG("could not mmap dev %s (%d)\n", ctrlr_device->lock_name, errno); + close(dev_fd); + return -errno; + } + + if (fcntl(dev_fd, F_SETLK, &cusedev_lock) != 0) { + pid = *(int *)dev_map; + SPDK_ERRLOG("Cannot create lock on device %s, probably" + " process %d has claimed it\n", ctrlr_device->lock_name, pid); + munmap(dev_map, sizeof(int)); + close(dev_fd); + /* F_SETLK returns unspecified errnos, normalize them */ + return -EACCES; + } + + *(int *)dev_map = (int)getpid(); + munmap(dev_map, sizeof(int)); + ctrlr_device->claim_fd = dev_fd; + ctrlr_device->index = index; + /* Keep dev_fd open to maintain the lock. 
*/ + return 0; +} + +static void +nvme_cuse_unclaim(struct cuse_device *ctrlr_device) +{ + close(ctrlr_device->claim_fd); + ctrlr_device->claim_fd = -1; + unlink(ctrlr_device->lock_name); +} + +static void +cuse_nvme_ctrlr_stop(struct cuse_device *ctrlr_device) +{ + uint32_t i; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr_device->ctrlr); + + for (i = 1; i <= num_ns; i++) { + cuse_nvme_ns_stop(ctrlr_device, i); + } + + fuse_session_exit(ctrlr_device->session); + pthread_join(ctrlr_device->tid, NULL); + TAILQ_REMOVE(&g_ctrlr_ctx_head, ctrlr_device, tailq); + spdk_bit_array_clear(g_ctrlr_started, ctrlr_device->index); + if (spdk_bit_array_count_set(g_ctrlr_started) == 0) { + spdk_bit_array_free(&g_ctrlr_started); + } + nvme_cuse_unclaim(ctrlr_device); + free(ctrlr_device->ns_devices); + free(ctrlr_device); +} + +static int +cuse_nvme_ctrlr_update_namespaces(struct cuse_device *ctrlr_device) +{ + uint32_t nsid; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr_device->ctrlr); + + for (nsid = 1; nsid <= num_ns; nsid++) { + if (!spdk_nvme_ctrlr_is_active_ns(ctrlr_device->ctrlr, nsid)) { + cuse_nvme_ns_stop(ctrlr_device, nsid); + continue; + } + + if (cuse_nvme_ns_start(ctrlr_device, nsid) < 0) { + SPDK_ERRLOG("Cannot start CUSE namespace device."); + return -1; + } + } + + return 0; +} + +static int +nvme_cuse_start(struct spdk_nvme_ctrlr *ctrlr) +{ + int rv = 0; + struct cuse_device *ctrlr_device; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); + + SPDK_NOTICELOG("Creating cuse device for controller\n"); + + if (g_ctrlr_started == NULL) { + g_ctrlr_started = spdk_bit_array_create(128); + if (g_ctrlr_started == NULL) { + SPDK_ERRLOG("Cannot create bit array\n"); + return -ENOMEM; + } + } + + ctrlr_device = (struct cuse_device *)calloc(1, sizeof(struct cuse_device)); + if (!ctrlr_device) { + SPDK_ERRLOG("Cannot allocate memory for ctrlr_device."); + rv = -ENOMEM; + goto err2; + } + + ctrlr_device->ctrlr = ctrlr; + + /* Check if device already exists, if not increment index until success */ + ctrlr_device->index = 0; + while (1) { + ctrlr_device->index = spdk_bit_array_find_first_clear(g_ctrlr_started, ctrlr_device->index); + if (ctrlr_device->index == UINT32_MAX) { + SPDK_ERRLOG("Too many registered controllers\n"); + goto err2; + } + + if (nvme_cuse_claim(ctrlr_device, ctrlr_device->index) == 0) { + break; + } + ctrlr_device->index++; + } + spdk_bit_array_set(g_ctrlr_started, ctrlr_device->index); + snprintf(ctrlr_device->dev_name, sizeof(ctrlr_device->dev_name), "spdk/nvme%d", + ctrlr_device->index); + + rv = pthread_create(&ctrlr_device->tid, NULL, cuse_thread, ctrlr_device); + if (rv != 0) { + SPDK_ERRLOG("pthread_create failed\n"); + rv = -rv; + goto err3; + } + TAILQ_INSERT_TAIL(&g_ctrlr_ctx_head, ctrlr_device, tailq); + + ctrlr_device->ns_devices = (struct cuse_device *)calloc(num_ns, sizeof(struct cuse_device)); + /* Start all active namespaces */ + if (cuse_nvme_ctrlr_update_namespaces(ctrlr_device) < 0) { + SPDK_ERRLOG("Cannot start CUSE namespace devices."); + cuse_nvme_ctrlr_stop(ctrlr_device); + rv = -1; + goto err3; + } + + return 0; + +err3: + spdk_bit_array_clear(g_ctrlr_started, ctrlr_device->index); +err2: + free(ctrlr_device); + if (spdk_bit_array_count_set(g_ctrlr_started) == 0) { + spdk_bit_array_free(&g_ctrlr_started); + } + return rv; +} + +static struct cuse_device * +nvme_cuse_get_cuse_ctrlr_device(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device = NULL; + + TAILQ_FOREACH(ctrlr_device, &g_ctrlr_ctx_head, tailq) { + if 
(ctrlr_device->ctrlr == ctrlr) { + break; + } + } + + return ctrlr_device; +} + +static struct cuse_device * +nvme_cuse_get_cuse_ns_device(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + struct cuse_device *ctrlr_device = NULL; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); + + if (nsid < 1 || nsid > num_ns) { + return NULL; + } + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + return NULL; + } + + if (!ctrlr_device->ns_devices[nsid - 1].is_started) { + return NULL; + } + + return &ctrlr_device->ns_devices[nsid - 1]; +} + +static void +nvme_cuse_stop(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + SPDK_ERRLOG("Cannot find associated CUSE device\n"); + pthread_mutex_unlock(&g_cuse_mtx); + return; + } + + cuse_nvme_ctrlr_stop(ctrlr_device); + + pthread_mutex_unlock(&g_cuse_mtx); +} + +static void +nvme_cuse_update(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + pthread_mutex_unlock(&g_cuse_mtx); + return; + } + + cuse_nvme_ctrlr_update_namespaces(ctrlr_device); + + pthread_mutex_unlock(&g_cuse_mtx); +} + +static struct nvme_io_msg_producer cuse_nvme_io_msg_producer = { + .name = "cuse", + .stop = nvme_cuse_stop, + .update = nvme_cuse_update, +}; + +int +spdk_nvme_cuse_register(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + rc = nvme_io_msg_ctrlr_register(ctrlr, &cuse_nvme_io_msg_producer); + if (rc) { + return rc; + } + + pthread_mutex_lock(&g_cuse_mtx); + + rc = nvme_cuse_start(ctrlr); + if (rc) { + nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer); + } + + pthread_mutex_unlock(&g_cuse_mtx); + + return rc; +} + +int +spdk_nvme_cuse_unregister(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + SPDK_ERRLOG("Cannot find associated CUSE device\n"); + pthread_mutex_unlock(&g_cuse_mtx); + return -ENODEV; + } + + cuse_nvme_ctrlr_stop(ctrlr_device); + + pthread_mutex_unlock(&g_cuse_mtx); + + nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer); + + return 0; +} + +void +spdk_nvme_cuse_update_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_cuse_update(ctrlr); +} + +int +spdk_nvme_cuse_get_ctrlr_name(struct spdk_nvme_ctrlr *ctrlr, char *name, size_t *size) +{ + struct cuse_device *ctrlr_device; + size_t req_len; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + pthread_mutex_unlock(&g_cuse_mtx); + return -ENODEV; + } + + req_len = strnlen(ctrlr_device->dev_name, sizeof(ctrlr_device->dev_name)); + if (*size < req_len) { + *size = req_len; + pthread_mutex_unlock(&g_cuse_mtx); + return -ENOSPC; + } + snprintf(name, req_len + 1, "%s", ctrlr_device->dev_name); + + pthread_mutex_unlock(&g_cuse_mtx); + + return 0; +} + +int +spdk_nvme_cuse_get_ns_name(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, char *name, size_t *size) +{ + struct cuse_device *ns_device; + size_t req_len; + + pthread_mutex_lock(&g_cuse_mtx); + + ns_device = nvme_cuse_get_cuse_ns_device(ctrlr, nsid); + if (!ns_device) { + pthread_mutex_unlock(&g_cuse_mtx); + return -ENODEV; + } + + req_len = strnlen(ns_device->dev_name, sizeof(ns_device->dev_name)); + if (*size < req_len) { + 
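+		/* Caller's buffer is too small: report the required length back through
+		 * *size and fail with -ENOSPC so the caller can retry with a larger buffer.
+		 */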
*size = req_len; + pthread_mutex_unlock(&g_cuse_mtx); + return -ENOSPC; + } + snprintf(name, req_len + 1, "%s", ns_device->dev_name); + + pthread_mutex_unlock(&g_cuse_mtx); + + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_cuse.h b/src/spdk/lib/nvme/nvme_cuse.h new file mode 100644 index 000000000..92b475190 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_cuse.h @@ -0,0 +1,42 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVME_CUSE_H__ +#define __NVME_CUSE_H__ + +#include "spdk/nvme.h" + +int nvme_cuse_register(struct spdk_nvme_ctrlr *ctrlr, const char *dev_path); +void nvme_cuse_unregister(struct spdk_nvme_ctrlr *ctrlr); + +#endif /* __NVME_CUSE_H__ */ diff --git a/src/spdk/lib/nvme/nvme_fabric.c b/src/spdk/lib/nvme/nvme_fabric.c new file mode 100644 index 000000000..9fff20873 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_fabric.c @@ -0,0 +1,475 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
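For context, the exported CUSE API above follows a query-then-copy convention: spdk_nvme_cuse_get_ctrlr_name() and spdk_nvme_cuse_get_ns_name() return -ENOSPC and write the required length back through *size when the caller's buffer is too small. Below is a minimal caller-side sketch, not part of the patch itself, assuming the public declarations from spdk/nvme.h, an already-attached controller, and abbreviated error handling; the helper name is hypothetical.

#include <stdio.h>
#include "spdk/nvme.h"

static int
expose_ctrlr_via_cuse(struct spdk_nvme_ctrlr *ctrlr)
{
	char name[128];
	size_t size = sizeof(name);
	int rc;

	/* Spawns the CUSE thread and creates the character devices. */
	rc = spdk_nvme_cuse_register(ctrlr);
	if (rc != 0) {
		return rc;
	}

	/* On -ENOSPC, size now holds the required length and the call can be retried. */
	rc = spdk_nvme_cuse_get_ctrlr_name(ctrlr, name, &size);
	if (rc == 0) {
		printf("CUSE controller node: /dev/%s\n", name);	/* e.g. /dev/spdk/nvme0 */
	}

	return rc;
}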
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over Fabrics transport-independent functions + */ + +#include "nvme_internal.h" + +#include "spdk/endian.h" +#include "spdk/string.h" + +static int +nvme_fabric_prop_set_cmd(struct spdk_nvme_ctrlr *ctrlr, + uint32_t offset, uint8_t size, uint64_t value) +{ + struct spdk_nvmf_fabric_prop_set_cmd cmd = {}; + struct nvme_completion_poll_status *status; + int rc; + + assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8); + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; + cmd.ofst = offset; + cmd.attrib.size = size; + cmd.value.u64 = value; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd, + NULL, 0, + nvme_completion_poll_cb, status); + if (rc < 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + SPDK_ERRLOG("Property Set failed\n"); + return -1; + } + free(status); + + return 0; +} + +static int +nvme_fabric_prop_get_cmd(struct spdk_nvme_ctrlr *ctrlr, + uint32_t offset, uint8_t size, uint64_t *value) +{ + struct spdk_nvmf_fabric_prop_set_cmd cmd = {}; + struct nvme_completion_poll_status *status; + struct spdk_nvmf_fabric_prop_get_rsp *response; + int rc; + + assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8); + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; + cmd.ofst = offset; + cmd.attrib.size = size; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd, + NULL, 0, nvme_completion_poll_cb, + status); + if (rc < 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + SPDK_ERRLOG("Property Get failed\n"); + return -1; + } + + response = (struct spdk_nvmf_fabric_prop_get_rsp *)&status->cpl; + + if (size == SPDK_NVMF_PROP_SIZE_4) { + *value = response->value.u32.low; + } else { + *value = response->value.u64; + } + + free(status); + + return 0; +} + +int +nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, value); +} + +int +nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value); +} + +int +nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr 
*ctrlr, uint32_t offset, uint32_t *value) +{ + uint64_t tmp_value; + int rc; + rc = nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, &tmp_value); + + if (!rc) { + *value = (uint32_t)tmp_value; + } + return rc; +} + +int +nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + return nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value); +} + +static void +nvme_fabric_discover_probe(struct spdk_nvmf_discovery_log_page_entry *entry, + struct spdk_nvme_probe_ctx *probe_ctx, + int discover_priority) +{ + struct spdk_nvme_transport_id trid; + uint8_t *end; + size_t len; + + memset(&trid, 0, sizeof(trid)); + + if (entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + SPDK_WARNLOG("Skipping unsupported discovery service referral\n"); + return; + } else if (entry->subtype != SPDK_NVMF_SUBTYPE_NVME) { + SPDK_WARNLOG("Skipping unknown subtype %u\n", entry->subtype); + return; + } + + trid.trtype = entry->trtype; + spdk_nvme_transport_id_populate_trstring(&trid, spdk_nvme_transport_id_trtype_str(entry->trtype)); + if (!spdk_nvme_transport_available_by_name(trid.trstring)) { + SPDK_WARNLOG("NVMe transport type %u not available; skipping probe\n", + trid.trtype); + return; + } + + snprintf(trid.trstring, sizeof(trid.trstring), "%s", probe_ctx->trid.trstring); + trid.adrfam = entry->adrfam; + + /* Ensure that subnqn is null terminated. */ + end = memchr(entry->subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1); + if (!end) { + SPDK_ERRLOG("Discovery entry SUBNQN is not null terminated\n"); + return; + } + len = end - entry->subnqn; + memcpy(trid.subnqn, entry->subnqn, len); + trid.subnqn[len] = '\0'; + + /* Convert traddr to a null terminated string. */ + len = spdk_strlen_pad(entry->traddr, sizeof(entry->traddr), ' '); + memcpy(trid.traddr, entry->traddr, len); + if (spdk_str_chomp(trid.traddr) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRADDR\n"); + } + + /* Convert trsvcid to a null terminated string. 
*/ + len = spdk_strlen_pad(entry->trsvcid, sizeof(entry->trsvcid), ' '); + memcpy(trid.trsvcid, entry->trsvcid, len); + if (spdk_str_chomp(trid.trsvcid) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRSVCID\n"); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "subnqn=%s, trtype=%u, traddr=%s, trsvcid=%s\n", + trid.subnqn, trid.trtype, + trid.traddr, trid.trsvcid); + + /* Copy the priority from the discovery ctrlr */ + trid.priority = discover_priority; + + nvme_ctrlr_probe(&trid, probe_ctx, NULL); +} + +static int +nvme_fabric_get_discovery_log_page(struct spdk_nvme_ctrlr *ctrlr, + void *log_page, uint32_t size, uint64_t offset) +{ + struct nvme_completion_poll_status *status; + int rc; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_DISCOVERY, 0, log_page, size, offset, + nvme_completion_poll_cb, status); + if (rc < 0) { + free(status); + return -1; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + free(status); + + return 0; +} + +int +nvme_fabric_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + struct spdk_nvme_ctrlr_opts discovery_opts; + struct spdk_nvme_ctrlr *discovery_ctrlr; + union spdk_nvme_cc_register cc; + int rc; + struct nvme_completion_poll_status *status; + + if (strcmp(probe_ctx->trid.subnqn, SPDK_NVMF_DISCOVERY_NQN) != 0) { + /* It is not a discovery_ctrlr info and try to directly connect it */ + rc = nvme_ctrlr_probe(&probe_ctx->trid, probe_ctx, NULL); + return rc; + } + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&discovery_opts, sizeof(discovery_opts)); + /* For discovery_ctrlr set the timeout to 0 */ + discovery_opts.keep_alive_timeout_ms = 0; + + discovery_ctrlr = nvme_transport_ctrlr_construct(&probe_ctx->trid, &discovery_opts, NULL); + if (discovery_ctrlr == NULL) { + return -1; + } + nvme_qpair_set_state(discovery_ctrlr->adminq, NVME_QPAIR_ENABLED); + + /* TODO: this should be using the normal NVMe controller initialization process +1 */ + cc.raw = 0; + cc.bits.en = 1; + cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */ + cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */ + rc = nvme_transport_ctrlr_set_reg_4(discovery_ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + cc.raw); + if (rc < 0) { + SPDK_ERRLOG("Failed to set cc\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + return -1; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + return -ENOMEM; + } + + /* get the cdata info */ + rc = nvme_ctrlr_cmd_identify(discovery_ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0, + &discovery_ctrlr->cdata, sizeof(discovery_ctrlr->cdata), + nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to identify cdata\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + free(status); + return rc; + } + + if (nvme_wait_for_completion(discovery_ctrlr->adminq, status)) { + SPDK_ERRLOG("nvme_identify_controller failed!\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + + free(status); + + /* Direct attach through spdk_nvme_connect() API */ + if (direct_connect == true) { + /* Set the ready state to skip the normal init process */ + discovery_ctrlr->state = NVME_CTRLR_STATE_READY; + nvme_ctrlr_connected(probe_ctx, discovery_ctrlr); + 
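As an aside, the register accessors defined earlier in this file are thin wrappers: a 4-byte or 8-byte controller register access on a fabrics transport becomes a Property Get/Set capsule, with the register addressed by its byte offset into struct spdk_nvme_registers, exactly as the CC write in nvme_fabric_ctrlr_scan() does. A small sketch of how driver-internal code might poll readiness through them; this uses the internal API (nvme_internal.h) and the helper name is hypothetical.

#include "nvme_internal.h"

/* Hypothetical helper: read CSTS via the fabrics Property Get wrapper and
 * report whether the controller has become ready. */
static bool
fabric_ctrlr_is_ready(struct spdk_nvme_ctrlr *ctrlr)
{
	union spdk_nvme_csts_register csts;

	if (nvme_fabric_ctrlr_get_reg_4(ctrlr,
					offsetof(struct spdk_nvme_registers, csts.raw),
					&csts.raw) != 0) {
		return false;
	}

	return csts.bits.rdy == 1;
}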
nvme_ctrlr_add_process(discovery_ctrlr, 0); + return 0; + } + + rc = nvme_fabric_ctrlr_discover(discovery_ctrlr, probe_ctx); + nvme_ctrlr_destruct(discovery_ctrlr); + return rc; +} + +int +nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_probe_ctx *probe_ctx) +{ + struct spdk_nvmf_discovery_log_page *log_page; + struct spdk_nvmf_discovery_log_page_entry *log_page_entry; + char buffer[4096]; + int rc; + uint64_t i, numrec, buffer_max_entries_first, buffer_max_entries, log_page_offset = 0; + uint64_t remaining_num_rec = 0; + uint16_t recfmt; + + memset(buffer, 0x0, 4096); + buffer_max_entries_first = (sizeof(buffer) - offsetof(struct spdk_nvmf_discovery_log_page, + entries[0])) / + sizeof(struct spdk_nvmf_discovery_log_page_entry); + buffer_max_entries = sizeof(buffer) / sizeof(struct spdk_nvmf_discovery_log_page_entry); + do { + rc = nvme_fabric_get_discovery_log_page(ctrlr, buffer, sizeof(buffer), log_page_offset); + if (rc < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Get Log Page - Discovery error\n"); + return rc; + } + + if (!remaining_num_rec) { + log_page = (struct spdk_nvmf_discovery_log_page *)buffer; + recfmt = from_le16(&log_page->recfmt); + if (recfmt != 0) { + SPDK_ERRLOG("Unrecognized discovery log record format %" PRIu16 "\n", recfmt); + return -EPROTO; + } + remaining_num_rec = log_page->numrec; + log_page_offset = offsetof(struct spdk_nvmf_discovery_log_page, entries[0]); + log_page_entry = &log_page->entries[0]; + numrec = spdk_min(remaining_num_rec, buffer_max_entries_first); + } else { + numrec = spdk_min(remaining_num_rec, buffer_max_entries); + log_page_entry = (struct spdk_nvmf_discovery_log_page_entry *)buffer; + } + + for (i = 0; i < numrec; i++) { + nvme_fabric_discover_probe(log_page_entry++, probe_ctx, ctrlr->trid.priority); + } + remaining_num_rec -= numrec; + log_page_offset += numrec * sizeof(struct spdk_nvmf_discovery_log_page_entry); + } while (remaining_num_rec != 0); + + return 0; +} + +int +nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries) +{ + struct nvme_completion_poll_status *status; + struct spdk_nvmf_fabric_connect_rsp *rsp; + struct spdk_nvmf_fabric_connect_cmd cmd; + struct spdk_nvmf_fabric_connect_data *nvmf_data; + struct spdk_nvme_ctrlr *ctrlr; + int rc; + + if (num_entries == 0 || num_entries > SPDK_NVME_IO_QUEUE_MAX_ENTRIES) { + return -EINVAL; + } + + ctrlr = qpair->ctrlr; + if (!ctrlr) { + return -EINVAL; + } + + nvmf_data = spdk_zmalloc(sizeof(*nvmf_data), 0, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!nvmf_data) { + SPDK_ERRLOG("nvmf_data allocation error\n"); + return -ENOMEM; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + spdk_free(nvmf_data); + return -ENOMEM; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; + cmd.qid = qpair->id; + cmd.sqsize = num_entries - 1; + cmd.kato = ctrlr->opts.keep_alive_timeout_ms; + + if (nvme_qpair_is_admin_queue(qpair)) { + nvmf_data->cntlid = 0xFFFF; + } else { + nvmf_data->cntlid = ctrlr->cntlid; + } + + SPDK_STATIC_ASSERT(sizeof(nvmf_data->hostid) == sizeof(ctrlr->opts.extended_host_id), + "host ID size mismatch"); + memcpy(nvmf_data->hostid, ctrlr->opts.extended_host_id, sizeof(nvmf_data->hostid)); + snprintf(nvmf_data->hostnqn, sizeof(nvmf_data->hostnqn), "%s", ctrlr->opts.hostnqn); + snprintf(nvmf_data->subnqn, sizeof(nvmf_data->subnqn), "%s", ctrlr->trid.subnqn); + + rc = 
spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, + (struct spdk_nvme_cmd *)&cmd, + nvmf_data, sizeof(*nvmf_data), + nvme_completion_poll_cb, status); + if (rc < 0) { + SPDK_ERRLOG("Connect command failed\n"); + spdk_free(nvmf_data); + free(status); + return rc; + } + + if (nvme_wait_for_completion(qpair, status)) { + SPDK_ERRLOG("Connect command failed\n"); + spdk_free(nvmf_data); + if (!status->timed_out) { + free(status); + } + return -EIO; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + rsp = (struct spdk_nvmf_fabric_connect_rsp *)&status->cpl; + ctrlr->cntlid = rsp->status_code_specific.success.cntlid; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cntlid); + } + + spdk_free(nvmf_data); + free(status); + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_internal.h b/src/spdk/lib/nvme/nvme_internal.h new file mode 100644 index 000000000..98fec279d --- /dev/null +++ b/src/spdk/lib/nvme/nvme_internal.h @@ -0,0 +1,1233 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVME_INTERNAL_H__ +#define __NVME_INTERNAL_H__ + +#include "spdk/config.h" +#include "spdk/likely.h" +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" + +#if defined(__i386__) || defined(__x86_64__) +#include <x86intrin.h> +#endif + +#include "spdk/queue.h" +#include "spdk/barrier.h" +#include "spdk/bit_array.h" +#include "spdk/mmio.h" +#include "spdk/pci_ids.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/nvme_intel.h" +#include "spdk/nvmf_spec.h" +#include "spdk/uuid.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" + +extern pid_t g_spdk_nvme_pid; + +/* + * Some Intel devices support vendor-unique read latency log page even + * though the log page directory says otherwise. + */ +#define NVME_INTEL_QUIRK_READ_LATENCY 0x1 + +/* + * Some Intel devices support vendor-unique write latency log page even + * though the log page directory says otherwise. 
+ */ +#define NVME_INTEL_QUIRK_WRITE_LATENCY 0x2 + +/* + * The controller needs a delay before starts checking the device + * readiness, which is done by reading the NVME_CSTS_RDY bit. + */ +#define NVME_QUIRK_DELAY_BEFORE_CHK_RDY 0x4 + +/* + * The controller performs best when I/O is split on particular + * LBA boundaries. + */ +#define NVME_INTEL_QUIRK_STRIPING 0x8 + +/* + * The controller needs a delay after allocating an I/O queue pair + * before it is ready to accept I/O commands. + */ +#define NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC 0x10 + +/* + * Earlier NVMe devices do not indicate whether unmapped blocks + * will read all zeroes or not. This define indicates that the + * device does in fact read all zeroes after an unmap event + */ +#define NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE 0x20 + +/* + * The controller doesn't handle Identify value others than 0 or 1 correctly. + */ +#define NVME_QUIRK_IDENTIFY_CNS 0x40 + +/* + * The controller supports Open Channel command set if matching additional + * condition, like the first byte (value 0x1) in the vendor specific + * bits of the namespace identify structure is set. + */ +#define NVME_QUIRK_OCSSD 0x80 + +/* + * The controller has an Intel vendor ID but does not support Intel vendor-specific + * log pages. This is primarily for QEMU emulated SSDs which report an Intel vendor + * ID but do not support these log pages. + */ +#define NVME_INTEL_QUIRK_NO_LOG_PAGES 0x100 + +/* + * The controller does not set SHST_COMPLETE in a reasonable amount of time. This + * is primarily seen in virtual VMWare NVMe SSDs. This quirk merely adds an additional + * error message that on VMWare NVMe SSDs, the shutdown timeout may be expected. + */ +#define NVME_QUIRK_SHST_COMPLETE 0x200 + +/* + * The controller requires an extra delay before starting the initialization process + * during attach. + */ +#define NVME_QUIRK_DELAY_BEFORE_INIT 0x400 + +/* + * Some SSDs exhibit poor performance with the default SPDK NVMe IO queue size. + * This quirk will increase the default to 1024 which matches other operating + * systems, at the cost of some extra memory usage. Users can still override + * the increased default by changing the spdk_nvme_io_qpair_opts when allocating + * a new queue pair. + */ +#define NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE 0x800 + +/** + * The maximum access width to PCI memory space is 8 Bytes, don't use AVX2 or + * SSE instructions to optimize the memory access(memcpy or memset) larger than + * 8 Bytes. + */ +#define NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH 0x1000 + +/** + * The SSD does not support OPAL even through it sets the security bit in OACS. + */ +#define NVME_QUIRK_OACS_SECURITY 0x2000 + +#define NVME_MAX_ASYNC_EVENTS (8) + +#define NVME_MAX_ADMIN_TIMEOUT_IN_SECS (30) + +/* Maximum log page size to fetch for AERs. */ +#define NVME_MAX_AER_LOG_SIZE (4096) + +/* + * NVME_MAX_IO_QUEUES in nvme_spec.h defines the 64K spec-limit, but this + * define specifies the maximum number of queues this driver will actually + * try to configure, if available. 
+ */ +#define DEFAULT_MAX_IO_QUEUES (1024) +#define DEFAULT_ADMIN_QUEUE_SIZE (32) +#define DEFAULT_IO_QUEUE_SIZE (256) +#define DEFAULT_IO_QUEUE_SIZE_FOR_QUIRK (1024) /* Matches Linux kernel driver */ + +#define DEFAULT_IO_QUEUE_REQUESTS (512) + +#define SPDK_NVME_DEFAULT_RETRY_COUNT (4) + +#define SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED (0) +#define SPDK_NVME_DEFAULT_TRANSPORT_ACK_TIMEOUT SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED + +#define MIN_KEEP_ALIVE_TIMEOUT_IN_MS (10000) + +/* We want to fit submission and completion rings each in a single 2MB + * hugepage to ensure physical address contiguity. + */ +#define MAX_IO_QUEUE_ENTRIES (VALUE_2MB / spdk_max( \ + sizeof(struct spdk_nvme_cmd), \ + sizeof(struct spdk_nvme_cpl))) + +enum nvme_payload_type { + NVME_PAYLOAD_TYPE_INVALID = 0, + + /** nvme_request::u.payload.contig_buffer is valid for this request */ + NVME_PAYLOAD_TYPE_CONTIG, + + /** nvme_request::u.sgl is valid for this request */ + NVME_PAYLOAD_TYPE_SGL, +}; + +/** + * Descriptor for a request data payload. + */ +struct nvme_payload { + /** + * Functions for retrieving physical addresses for scattered payloads. + */ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn; + + /** + * If reset_sgl_fn == NULL, this is a contig payload, and contig_or_cb_arg contains the + * virtual memory address of a single virtually contiguous buffer. + * + * If reset_sgl_fn != NULL, this is a SGL payload, and contig_or_cb_arg contains the + * cb_arg that will be passed to the SGL callback functions. + */ + void *contig_or_cb_arg; + + /** Virtual memory address of a single virtually contiguous metadata buffer */ + void *md; +}; + +#define NVME_PAYLOAD_CONTIG(contig_, md_) \ + (struct nvme_payload) { \ + .reset_sgl_fn = NULL, \ + .next_sge_fn = NULL, \ + .contig_or_cb_arg = (contig_), \ + .md = (md_), \ + } + +#define NVME_PAYLOAD_SGL(reset_sgl_fn_, next_sge_fn_, cb_arg_, md_) \ + (struct nvme_payload) { \ + .reset_sgl_fn = (reset_sgl_fn_), \ + .next_sge_fn = (next_sge_fn_), \ + .contig_or_cb_arg = (cb_arg_), \ + .md = (md_), \ + } + +static inline enum nvme_payload_type +nvme_payload_type(const struct nvme_payload *payload) { + return payload->reset_sgl_fn ? NVME_PAYLOAD_TYPE_SGL : NVME_PAYLOAD_TYPE_CONTIG; +} + +struct nvme_error_cmd { + bool do_not_submit; + uint64_t timeout_tsc; + uint32_t err_count; + uint8_t opc; + struct spdk_nvme_status status; + TAILQ_ENTRY(nvme_error_cmd) link; +}; + +struct nvme_request { + struct spdk_nvme_cmd cmd; + + uint8_t retries; + + uint8_t timed_out : 1; + + /** + * True if the request is in the queued_req list. + */ + uint8_t queued : 1; + uint8_t reserved : 6; + + /** + * Number of children requests still outstanding for this + * request which was split into multiple child requests. + */ + uint16_t num_children; + + /** + * Offset in bytes from the beginning of payload for this request. + * This is used for I/O commands that are split into multiple requests. + */ + uint32_t payload_offset; + uint32_t md_offset; + + uint32_t payload_size; + + /** + * Timeout ticks for error injection requests, can be extended in future + * to support per-request timeout feature. + */ + uint64_t timeout_tsc; + + /** + * Data payload for this request's command. + */ + struct nvme_payload payload; + + spdk_nvme_cmd_cb cb_fn; + void *cb_arg; + STAILQ_ENTRY(nvme_request) stailq; + + struct spdk_nvme_qpair *qpair; + + /* + * The value of spdk_get_ticks() when the request was submitted to the hardware. + * Only set if ctrlr->timeout_enabled is true. 
+ */ + uint64_t submit_tick; + + /** + * The active admin request can be moved to a per process pending + * list based on the saved pid to tell which process it belongs + * to. The cpl saves the original completion information which + * is used in the completion callback. + * NOTE: these below two fields are only used for admin request. + */ + pid_t pid; + struct spdk_nvme_cpl cpl; + + uint32_t md_size; + + /** + * The following members should not be reordered with members + * above. These members are only needed when splitting + * requests which is done rarely, and the driver is careful + * to not touch the following fields until a split operation is + * needed, to avoid touching an extra cacheline. + */ + + /** + * Points to the outstanding child requests for a parent request. + * Only valid if a request was split into multiple children + * requests, and is not initialized for non-split requests. + */ + TAILQ_HEAD(, nvme_request) children; + + /** + * Linked-list pointers for a child request in its parent's list. + */ + TAILQ_ENTRY(nvme_request) child_tailq; + + /** + * Points to a parent request if part of a split request, + * NULL otherwise. + */ + struct nvme_request *parent; + + /** + * Completion status for a parent request. Initialized to all 0's + * (SUCCESS) before child requests are submitted. If a child + * request completes with error, the error status is copied here, + * to ensure that the parent request is also completed with error + * status once all child requests are completed. + */ + struct spdk_nvme_cpl parent_status; + + /** + * The user_cb_fn and user_cb_arg fields are used for holding the original + * callback data when using nvme_allocate_request_user_copy. + */ + spdk_nvme_cmd_cb user_cb_fn; + void *user_cb_arg; + void *user_buffer; +}; + +struct nvme_completion_poll_status { + struct spdk_nvme_cpl cpl; + bool done; + /* This flag indicates that the request has been timed out and the memory + must be freed in a completion callback */ + bool timed_out; +}; + +struct nvme_async_event_request { + struct spdk_nvme_ctrlr *ctrlr; + struct nvme_request *req; + struct spdk_nvme_cpl cpl; +}; + +enum nvme_qpair_state { + NVME_QPAIR_DISCONNECTED, + NVME_QPAIR_DISCONNECTING, + NVME_QPAIR_CONNECTING, + NVME_QPAIR_CONNECTED, + NVME_QPAIR_ENABLING, + NVME_QPAIR_ENABLED, + NVME_QPAIR_DESTROYING, +}; + +struct spdk_nvme_qpair { + struct spdk_nvme_ctrlr *ctrlr; + + uint16_t id; + + uint8_t qprio; + + uint8_t state : 3; + + /* + * Members for handling IO qpair deletion inside of a completion context. + * These are specifically defined as single bits, so that they do not + * push this data structure out to another cacheline. + */ + uint8_t in_completion_context : 1; + uint8_t delete_after_completion_context: 1; + + /* + * Set when no deletion notification is needed. For example, the process + * which allocated this qpair exited unexpectedly. 
+ */ + uint8_t no_deletion_notification_needed: 1; + + uint8_t first_fused_submitted: 1; + + enum spdk_nvme_transport_type trtype; + + STAILQ_HEAD(, nvme_request) free_req; + STAILQ_HEAD(, nvme_request) queued_req; + STAILQ_HEAD(, nvme_request) aborting_queued_req; + + /* List entry for spdk_nvme_transport_poll_group::qpairs */ + STAILQ_ENTRY(spdk_nvme_qpair) poll_group_stailq; + + /** Commands opcode in this list will return error */ + TAILQ_HEAD(, nvme_error_cmd) err_cmd_head; + /** Requests in this list will return error */ + STAILQ_HEAD(, nvme_request) err_req_head; + + /* List entry for spdk_nvme_ctrlr::active_io_qpairs */ + TAILQ_ENTRY(spdk_nvme_qpair) tailq; + + /* List entry for spdk_nvme_ctrlr_process::allocated_io_qpairs */ + TAILQ_ENTRY(spdk_nvme_qpair) per_process_tailq; + + struct spdk_nvme_ctrlr_process *active_proc; + + struct spdk_nvme_transport_poll_group *poll_group; + + void *poll_group_tailq_head; + + void *req_buf; + + const struct spdk_nvme_transport *transport; + + uint8_t transport_failure_reason: 2; +}; + +struct spdk_nvme_poll_group { + void *ctx; + STAILQ_HEAD(, spdk_nvme_transport_poll_group) tgroups; +}; + +struct spdk_nvme_transport_poll_group { + struct spdk_nvme_poll_group *group; + const struct spdk_nvme_transport *transport; + STAILQ_HEAD(, spdk_nvme_qpair) connected_qpairs; + STAILQ_HEAD(, spdk_nvme_qpair) disconnected_qpairs; + STAILQ_ENTRY(spdk_nvme_transport_poll_group) link; + bool in_completion_context; + uint64_t num_qpairs_to_delete; +}; + +struct spdk_nvme_ns { + struct spdk_nvme_ctrlr *ctrlr; + uint32_t sector_size; + + /* + * Size of data transferred as part of each block, + * including metadata if FLBAS indicates the metadata is transferred + * as part of the data buffer at the end of each LBA. + */ + uint32_t extended_lba_size; + + uint32_t md_size; + uint32_t pi_type; + uint32_t sectors_per_max_io; + uint32_t sectors_per_stripe; + uint32_t id; + uint16_t flags; + + /* Namespace Identification Descriptor List (CNS = 03h) */ + uint8_t id_desc_list[4096]; +}; + +/** + * State of struct spdk_nvme_ctrlr (in particular, during initialization). + */ +enum nvme_ctrlr_state { + /** + * Wait before initializing the controller. + */ + NVME_CTRLR_STATE_INIT_DELAY, + + /** + * Controller has not been initialized yet. + */ + NVME_CTRLR_STATE_INIT, + + /** + * Waiting for CSTS.RDY to transition from 0 to 1 so that CC.EN may be set to 0. + */ + NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1, + + /** + * Waiting for CSTS.RDY to transition from 1 to 0 so that CC.EN may be set to 1. + */ + NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, + + /** + * Enable the controller by writing CC.EN to 1 + */ + NVME_CTRLR_STATE_ENABLE, + + /** + * Waiting for CSTS.RDY to transition from 0 to 1 after enabling the controller. + */ + NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1, + + /** + * Reset the Admin queue of the controller. + */ + NVME_CTRLR_STATE_RESET_ADMIN_QUEUE, + + /** + * Identify Controller command will be sent to then controller. + */ + NVME_CTRLR_STATE_IDENTIFY, + + /** + * Waiting for Identify Controller command be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY, + + /** + * Set Number of Queues of the controller. + */ + NVME_CTRLR_STATE_SET_NUM_QUEUES, + + /** + * Waiting for Set Num of Queues command to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES, + + /** + * Construct Namespace data structures of the controller. + */ + NVME_CTRLR_STATE_CONSTRUCT_NS, + + /** + * Get active Namespace list of the controller. 
+ */ + NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS, + + /** + * Waiting for the Identify Active Namespace commands to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS, + + /** + * Get Identify Namespace Data structure for each NS. + */ + NVME_CTRLR_STATE_IDENTIFY_NS, + + /** + * Waiting for the Identify Namespace commands to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS, + + /** + * Get Identify Namespace Identification Descriptors. + */ + NVME_CTRLR_STATE_IDENTIFY_ID_DESCS, + + /** + * Waiting for the Identify Namespace Identification + * Descriptors to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS, + + /** + * Configure AER of the controller. + */ + NVME_CTRLR_STATE_CONFIGURE_AER, + + /** + * Waiting for the Configure AER to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER, + + /** + * Set supported log pages of the controller. + */ + NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, + + /** + * Set supported features of the controller. + */ + NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES, + + /** + * Set Doorbell Buffer Config of the controller. + */ + NVME_CTRLR_STATE_SET_DB_BUF_CFG, + + /** + * Waiting for Doorbell Buffer Config to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG, + + /** + * Set Keep Alive Timeout of the controller. + */ + NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + + /** + * Waiting for Set Keep Alive Timeout to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT, + + /** + * Set Host ID of the controller. + */ + NVME_CTRLR_STATE_SET_HOST_ID, + + /** + * Waiting for Set Host ID to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_HOST_ID, + + /** + * Controller initialization has completed and the controller is ready. + */ + NVME_CTRLR_STATE_READY, + + /** + * Controller inilialization has an error. + */ + NVME_CTRLR_STATE_ERROR +}; + +#define NVME_TIMEOUT_INFINITE 0 + +/* + * Used to track properties for all processes accessing the controller. + */ +struct spdk_nvme_ctrlr_process { + /** Whether it is the primary process */ + bool is_primary; + + /** Process ID */ + pid_t pid; + + /** Active admin requests to be completed */ + STAILQ_HEAD(, nvme_request) active_reqs; + + TAILQ_ENTRY(spdk_nvme_ctrlr_process) tailq; + + /** Per process PCI device handle */ + struct spdk_pci_device *devhandle; + + /** Reference to track the number of attachment to this controller. */ + int ref; + + /** Allocated IO qpairs */ + TAILQ_HEAD(, spdk_nvme_qpair) allocated_io_qpairs; + + spdk_nvme_aer_cb aer_cb_fn; + void *aer_cb_arg; + + /** + * A function pointer to timeout callback function + */ + spdk_nvme_timeout_cb timeout_cb_fn; + void *timeout_cb_arg; + uint64_t timeout_ticks; +}; + +/* + * One of these per allocated PCI device. + */ +struct spdk_nvme_ctrlr { + /* Hot data (accessed in I/O path) starts here. */ + + /** Array of namespaces indexed by nsid - 1 */ + struct spdk_nvme_ns *ns; + + uint32_t num_ns; + + bool is_removed; + + bool is_resetting; + + bool is_failed; + + bool is_destructed; + + bool timeout_enabled; + + uint16_t max_sges; + + uint16_t cntlid; + + /** Controller support flags */ + uint64_t flags; + + /** NVMEoF in-capsule data size in bytes */ + uint32_t ioccsz_bytes; + + /** NVMEoF in-capsule data offset in 16 byte units */ + uint16_t icdoff; + + /* Cold data (not accessed in normal I/O path) is after this point. 
*/ + + struct spdk_nvme_transport_id trid; + + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + + enum nvme_ctrlr_state state; + uint64_t state_timeout_tsc; + + uint64_t next_keep_alive_tick; + uint64_t keep_alive_interval_ticks; + + TAILQ_ENTRY(spdk_nvme_ctrlr) tailq; + + /** All the log pages supported */ + bool log_page_supported[256]; + + /** All the features supported */ + bool feature_supported[256]; + + /** maximum i/o size in bytes */ + uint32_t max_xfer_size; + + /** minimum page size supported by this controller in bytes */ + uint32_t min_page_size; + + /** selected memory page size for this controller in bytes */ + uint32_t page_size; + + uint32_t num_aers; + struct nvme_async_event_request aer[NVME_MAX_ASYNC_EVENTS]; + + /** guards access to the controller itself, including admin queues */ + pthread_mutex_t ctrlr_lock; + + struct spdk_nvme_qpair *adminq; + + /** shadow doorbell buffer */ + uint32_t *shadow_doorbell; + /** eventidx buffer */ + uint32_t *eventidx; + + /** + * Identify Controller data. + */ + struct spdk_nvme_ctrlr_data cdata; + + /** + * Keep track of active namespaces + */ + uint32_t *active_ns_list; + + /** + * Array of Identify Namespace data. + * + * Stored separately from ns since nsdata should not normally be accessed during I/O. + */ + struct spdk_nvme_ns_data *nsdata; + + struct spdk_bit_array *free_io_qids; + TAILQ_HEAD(, spdk_nvme_qpair) active_io_qpairs; + + struct spdk_nvme_ctrlr_opts opts; + + uint64_t quirks; + + /* Extra sleep time during controller initialization */ + uint64_t sleep_timeout_tsc; + + /** Track all the processes manage this controller */ + TAILQ_HEAD(, spdk_nvme_ctrlr_process) active_procs; + + + STAILQ_HEAD(, nvme_request) queued_aborts; + uint32_t outstanding_aborts; + + /* CB to notify the user when the ctrlr is removed/failed. */ + spdk_nvme_remove_cb remove_cb; + void *cb_ctx; + + struct spdk_nvme_qpair *external_io_msgs_qpair; + pthread_mutex_t external_io_msgs_lock; + struct spdk_ring *external_io_msgs; + + STAILQ_HEAD(, nvme_io_msg_producer) io_producers; +}; + +struct spdk_nvme_probe_ctx { + struct spdk_nvme_transport_id trid; + void *cb_ctx; + spdk_nvme_probe_cb probe_cb; + spdk_nvme_attach_cb attach_cb; + spdk_nvme_remove_cb remove_cb; + TAILQ_HEAD(, spdk_nvme_ctrlr) init_ctrlrs; +}; + +struct nvme_driver { + pthread_mutex_t lock; + + /** Multi-process shared attached controller list */ + TAILQ_HEAD(, spdk_nvme_ctrlr) shared_attached_ctrlrs; + + bool initialized; + struct spdk_uuid default_extended_host_id; + + /** netlink socket fd for hotplug messages */ + int hotplug_fd; +}; + +extern struct nvme_driver *g_spdk_nvme_driver; + +int nvme_driver_init(void); + +#define nvme_delay usleep + +static inline bool +nvme_qpair_is_admin_queue(struct spdk_nvme_qpair *qpair) +{ + return qpair->id == 0; +} + +static inline bool +nvme_qpair_is_io_queue(struct spdk_nvme_qpair *qpair) +{ + return qpair->id != 0; +} + +static inline int +nvme_robust_mutex_lock(pthread_mutex_t *mtx) +{ + int rc = pthread_mutex_lock(mtx); + +#ifndef __FreeBSD__ + if (rc == EOWNERDEAD) { + rc = pthread_mutex_consistent(mtx); + } +#endif + + return rc; +} + +static inline int +nvme_robust_mutex_unlock(pthread_mutex_t *mtx) +{ + return pthread_mutex_unlock(mtx); +} + +/* Poll group management functions. 
*/ +int nvme_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair); +int nvme_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair); + +/* Admin functions */ +int nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, + uint8_t cns, uint16_t cntid, uint32_t nsid, + void *payload, size_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr, + uint32_t num_queues, spdk_nvme_cmd_cb cb_fn, + void *cb_arg); +int nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr, + union spdk_nvme_feat_async_event_configuration config, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr, + uint64_t prp1, uint64_t prp2, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn, + void *cb_arg); +int nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_format *format, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_fw_commit *fw_commit, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr, + uint32_t size, uint32_t offset, void *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_sanitize(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_sanitize *sanitize, uint32_t cdw11, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +void nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl); +int nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status); +int nvme_wait_for_completion_robust_lock(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + pthread_mutex_t *robust_mutex); +int nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + uint64_t timeout_in_secs); + +struct spdk_nvme_ctrlr_process *nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, + pid_t pid); +struct spdk_nvme_ctrlr_process *nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle); +void nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr); +struct spdk_pci_device *nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr); + +int nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_probe_ctx *probe_ctx, void *devhandle); + +int nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove); +int nvme_ctrlr_reset(struct 
spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_connected(struct spdk_nvme_probe_ctx *probe_ctx, + struct spdk_nvme_ctrlr *ctrlr); + +int nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req); +int nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap); +int nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs); +int nvme_ctrlr_get_cmbsz(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cmbsz_register *cmbsz); +void nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap, + const union spdk_nvme_vs_register *vs); +void nvme_ctrlr_disconnect_qpair(struct spdk_nvme_qpair *qpair); +int nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id, + struct spdk_nvme_ctrlr *ctrlr, + enum spdk_nvme_qprio qprio, + uint32_t num_requests); +void nvme_qpair_deinit(struct spdk_nvme_qpair *qpair); +void nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair); +int nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req); +void nvme_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); +uint32_t nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg); +void nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_requests); + +int nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ns_set_identify_data(struct spdk_nvme_ns *ns); +int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id, + struct spdk_nvme_ctrlr *ctrlr); +void nvme_ns_destruct(struct spdk_nvme_ns *ns); +int nvme_ns_update(struct spdk_nvme_ns *ns); + +int nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value); +int nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value); +int nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value); +int nvme_fabric_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, bool direct_connect); +int nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value); +int nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_probe_ctx *probe_ctx); +int nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries); + +static inline struct nvme_request * +nvme_allocate_request(struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, uint32_t payload_size, uint32_t md_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + + req = STAILQ_FIRST(&qpair->free_req); + if (req == NULL) { + return req; + } + + STAILQ_REMOVE_HEAD(&qpair->free_req, stailq); + + /* + * Only memset/zero fields that need it. All other fields + * will be initialized appropriately either later in this + * function, or before they are needed later in the + * submission patch. For example, the children + * TAILQ_ENTRY and following members are + * only used as part of I/O splitting so we avoid + * memsetting them until it is actually needed. + * They will be initialized in nvme_request_add_child() + * if the request is split. 
+ */ + memset(req, 0, offsetof(struct nvme_request, payload_size)); + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->payload = *payload; + req->payload_size = payload_size; + req->md_size = md_size; + req->pid = g_spdk_nvme_pid; + req->submit_tick = 0; + + return req; +} + +static inline struct nvme_request * +nvme_allocate_request_contig(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + return nvme_allocate_request(qpair, &payload, payload_size, 0, cb_fn, cb_arg); +} + +static inline struct nvme_request * +nvme_allocate_request_null(struct spdk_nvme_qpair *qpair, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return nvme_allocate_request_contig(qpair, NULL, 0, cb_fn, cb_arg); +} + +struct nvme_request *nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, bool host_to_controller); + +static inline void +nvme_complete_request(spdk_nvme_cmd_cb cb_fn, void *cb_arg, struct spdk_nvme_qpair *qpair, + struct nvme_request *req, struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_cpl err_cpl; + struct nvme_error_cmd *cmd; + + /* error injection at completion path, + * only inject for successful completed commands + */ + if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head) && + !spdk_nvme_cpl_is_error(cpl))) { + TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) { + + if (cmd->do_not_submit) { + continue; + } + + if ((cmd->opc == req->cmd.opc) && cmd->err_count) { + + err_cpl = *cpl; + err_cpl.status.sct = cmd->status.sct; + err_cpl.status.sc = cmd->status.sc; + + cpl = &err_cpl; + cmd->err_count--; + break; + } + } + } + + if (cb_fn) { + cb_fn(cb_arg, cpl); + } +} + +static inline void +nvme_free_request(struct nvme_request *req) +{ + assert(req != NULL); + assert(req->num_children == 0); + assert(req->qpair != NULL); + + STAILQ_INSERT_HEAD(&req->qpair->free_req, req, stailq); +} + +static inline void +nvme_qpair_set_state(struct spdk_nvme_qpair *qpair, enum nvme_qpair_state state) +{ + qpair->state = state; +} + +static inline enum nvme_qpair_state +nvme_qpair_get_state(struct spdk_nvme_qpair *qpair) { + return qpair->state; +} + +static inline void +nvme_qpair_free_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + assert(req != NULL); + assert(req->num_children == 0); + + STAILQ_INSERT_HEAD(&qpair->free_req, req, stailq); +} + +static inline void +nvme_request_remove_child(struct nvme_request *parent, struct nvme_request *child) +{ + assert(parent != NULL); + assert(child != NULL); + assert(child->parent == parent); + assert(parent->num_children != 0); + + parent->num_children--; + child->parent = NULL; + TAILQ_REMOVE(&parent->children, child, child_tailq); +} + +static inline void +nvme_cb_complete_child(void *child_arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *child = child_arg; + struct nvme_request *parent = child->parent; + + nvme_request_remove_child(parent, child); + + if (spdk_nvme_cpl_is_error(cpl)) { + memcpy(&parent->parent_status, cpl, sizeof(*cpl)); + } + + if (parent->num_children == 0) { + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } +} + +static inline void +nvme_request_add_child(struct nvme_request *parent, struct nvme_request *child) +{ + assert(parent->num_children != UINT16_MAX); + + if (parent->num_children == 0) { + /* + 
* Defer initialization of the children TAILQ since it falls + * on a separate cacheline. This ensures we do not touch this + * cacheline except on request splitting cases, which are + * relatively rare. + */ + TAILQ_INIT(&parent->children); + parent->parent = NULL; + memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl)); + } + + parent->num_children++; + TAILQ_INSERT_TAIL(&parent->children, child, child_tailq); + child->parent = parent; + child->cb_fn = nvme_cb_complete_child; + child->cb_arg = child; +} + +static inline void +nvme_request_free_children(struct nvme_request *req) +{ + struct nvme_request *child, *tmp; + + if (req->num_children == 0) { + return; + } + + /* free all child nvme_request */ + TAILQ_FOREACH_SAFE(child, &req->children, child_tailq, tmp) { + nvme_request_remove_child(req, child); + nvme_request_free_children(child); + nvme_free_request(child); + } +} + +int nvme_request_check_timeout(struct nvme_request *req, uint16_t cid, + struct spdk_nvme_ctrlr_process *active_proc, uint64_t now_tick); +uint64_t nvme_get_quirks(const struct spdk_pci_id *id); + +int nvme_robust_mutex_init_shared(pthread_mutex_t *mtx); +int nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx); + +bool nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl); + +struct spdk_nvme_ctrlr *nvme_get_ctrlr_by_trid_unsafe( + const struct spdk_nvme_transport_id *trid); + +const struct spdk_nvme_transport *nvme_get_transport(const char *transport_name); +const struct spdk_nvme_transport *nvme_get_first_transport(void); +const struct spdk_nvme_transport *nvme_get_next_transport(const struct spdk_nvme_transport + *transport); + +/* Transport specific functions */ +struct spdk_nvme_ctrlr *nvme_transport_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle); +int nvme_transport_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); +int nvme_transport_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, bool direct_connect); +int nvme_transport_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr); +int nvme_transport_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value); +int nvme_transport_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value); +int nvme_transport_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value); +int nvme_transport_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value); +uint32_t nvme_transport_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr); +uint16_t nvme_transport_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr); +struct spdk_nvme_qpair *nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + uint16_t qid, const struct spdk_nvme_io_qpair_opts *opts); +int nvme_transport_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr); +void *nvme_transport_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size); +int nvme_transport_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr); +int nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); +int nvme_transport_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); +void nvme_transport_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); +void nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); +int nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair); +int nvme_transport_qpair_submit_request(struct spdk_nvme_qpair 
*qpair, struct nvme_request *req); +int32_t nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, + uint32_t max_completions); +void nvme_transport_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair); +int nvme_transport_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg); + +struct spdk_nvme_transport_poll_group *nvme_transport_poll_group_create( + const struct spdk_nvme_transport *transport); +int nvme_transport_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair); +int nvme_transport_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair); +int nvme_transport_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair); +int nvme_transport_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair); +int64_t nvme_transport_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb); +int nvme_transport_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup); +/* + * Below ref related functions must be called with the global + * driver lock held for the multi-process condition. + * Within these functions, the per ctrlr ctrlr_lock is also + * acquired for the multi-thread condition. + */ +void nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr); + +static inline bool +_is_page_aligned(uint64_t address, uint64_t page_size) +{ + return (address & (page_size - 1)) == 0; +} + +#endif /* __NVME_INTERNAL_H__ */ diff --git a/src/spdk/lib/nvme/nvme_io_msg.c b/src/spdk/lib/nvme/nvme_io_msg.c new file mode 100644 index 000000000..fb5aec3d4 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_io_msg.c @@ -0,0 +1,216 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
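The nvme_transport_poll_group_*() declarations above are the per-transport half of the poll-group machinery. A hedged sketch of the caller-facing flow follows, assuming the public spdk_nvme_poll_group_*() wrappers from spdk/nvme.h and an I/O qpair allocated with opts.create_only set so it can be added to a group before being connected; teardown is omitted and error paths are abbreviated.

#include "spdk/nvme.h"

static void
on_disconnected_qpair(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	/* Decide here whether to reconnect or free the disconnected qpair. */
}

static int
poll_group_sketch(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_poll_group *group;
	int rc;

	group = spdk_nvme_poll_group_create(NULL);
	if (group == NULL) {
		return -ENOMEM;
	}

	rc = spdk_nvme_poll_group_add(group, qpair);
	if (rc != 0) {
		return rc;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
	if (rc != 0) {
		return rc;
	}

	/* Usually run from a poller; 0 asks each qpair to drain all completions. */
	spdk_nvme_poll_group_process_completions(group, 0, on_disconnected_qpair);
	return 0;
}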
+ */ + +#include "nvme_internal.h" +#include "nvme_io_msg.h" + +#define SPDK_NVME_MSG_IO_PROCESS_SIZE 8 + +/** + * Send message to IO queue. + */ +int +nvme_io_msg_send(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_io_msg_fn fn, + void *arg) +{ + int rc; + struct spdk_nvme_io_msg *io; + + /* Protect requests ring against preemptive producers */ + pthread_mutex_lock(&ctrlr->external_io_msgs_lock); + + io = (struct spdk_nvme_io_msg *)calloc(1, sizeof(struct spdk_nvme_io_msg)); + if (!io) { + SPDK_ERRLOG("IO msg allocation failed."); + pthread_mutex_unlock(&ctrlr->external_io_msgs_lock); + return -ENOMEM; + } + + io->ctrlr = ctrlr; + io->nsid = nsid; + io->fn = fn; + io->arg = arg; + + rc = spdk_ring_enqueue(ctrlr->external_io_msgs, (void **)&io, 1, NULL); + if (rc != 1) { + assert(false); + free(io); + pthread_mutex_unlock(&ctrlr->external_io_msgs_lock); + return -ENOMEM; + } + + pthread_mutex_unlock(&ctrlr->external_io_msgs_lock); + + return 0; +} + +int +nvme_io_msg_process(struct spdk_nvme_ctrlr *ctrlr) +{ + int i; + int count; + struct spdk_nvme_io_msg *io; + void *requests[SPDK_NVME_MSG_IO_PROCESS_SIZE]; + + if (!ctrlr->external_io_msgs || !ctrlr->external_io_msgs_qpair) { + /* Not ready or pending reset */ + return 0; + } + + spdk_nvme_qpair_process_completions(ctrlr->external_io_msgs_qpair, 0); + + count = spdk_ring_dequeue(ctrlr->external_io_msgs, requests, + SPDK_NVME_MSG_IO_PROCESS_SIZE); + if (count == 0) { + return 0; + } + + for (i = 0; i < count; i++) { + io = requests[i]; + + assert(io != NULL); + + io->fn(io->ctrlr, io->nsid, io->arg); + free(io); + } + + return count; +} + +static bool +nvme_io_msg_is_producer_registered(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer) +{ + struct nvme_io_msg_producer *tmp; + + STAILQ_FOREACH(tmp, &ctrlr->io_producers, link) { + if (tmp == io_msg_producer) { + return true; + } + } + return false; +} + +int +nvme_io_msg_ctrlr_register(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer) +{ + if (io_msg_producer == NULL) { + SPDK_ERRLOG("io_msg_producer cannot be NULL\n"); + return -EINVAL; + } + + if (nvme_io_msg_is_producer_registered(ctrlr, io_msg_producer)) { + return -EEXIST; + } + + if (!STAILQ_EMPTY(&ctrlr->io_producers) || ctrlr->is_resetting) { + /* There are registered producers - IO messaging already started */ + STAILQ_INSERT_TAIL(&ctrlr->io_producers, io_msg_producer, link); + return 0; + } + + pthread_mutex_init(&ctrlr->external_io_msgs_lock, NULL); + + /** + * Initialize ring and qpair for controller + */ + ctrlr->external_io_msgs = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY); + if (!ctrlr->external_io_msgs) { + SPDK_ERRLOG("Unable to allocate memory for message ring\n"); + return -ENOMEM; + } + + ctrlr->external_io_msgs_qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0); + if (ctrlr->external_io_msgs_qpair == NULL) { + SPDK_ERRLOG("spdk_nvme_ctrlr_alloc_io_qpair() failed\n"); + spdk_ring_free(ctrlr->external_io_msgs); + ctrlr->external_io_msgs = NULL; + return -ENOMEM; + } + + STAILQ_INSERT_TAIL(&ctrlr->io_producers, io_msg_producer, link); + + return 0; +} + +void +nvme_io_msg_ctrlr_update(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_io_msg_producer *io_msg_producer; + + /* Update all producers */ + STAILQ_FOREACH(io_msg_producer, &ctrlr->io_producers, link) { + io_msg_producer->update(ctrlr); + } +} + +void +nvme_io_msg_ctrlr_detach(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_io_msg_producer *io_msg_producer, *tmp; + + /* 
Stop all producers */ + STAILQ_FOREACH_SAFE(io_msg_producer, &ctrlr->io_producers, link, tmp) { + io_msg_producer->stop(ctrlr); + STAILQ_REMOVE(&ctrlr->io_producers, io_msg_producer, nvme_io_msg_producer, link); + } + + if (ctrlr->external_io_msgs) { + spdk_ring_free(ctrlr->external_io_msgs); + ctrlr->external_io_msgs = NULL; + } + + if (ctrlr->external_io_msgs_qpair) { + spdk_nvme_ctrlr_free_io_qpair(ctrlr->external_io_msgs_qpair); + ctrlr->external_io_msgs_qpair = NULL; + } + + pthread_mutex_destroy(&ctrlr->external_io_msgs_lock); +} + +void +nvme_io_msg_ctrlr_unregister(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer) +{ + assert(io_msg_producer != NULL); + + if (!nvme_io_msg_is_producer_registered(ctrlr, io_msg_producer)) { + return; + } + + STAILQ_REMOVE(&ctrlr->io_producers, io_msg_producer, nvme_io_msg_producer, link); + if (STAILQ_EMPTY(&ctrlr->io_producers)) { + nvme_io_msg_ctrlr_detach(ctrlr); + } +} diff --git a/src/spdk/lib/nvme/nvme_io_msg.h b/src/spdk/lib/nvme/nvme_io_msg.h new file mode 100644 index 000000000..9c18261d5 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_io_msg.h @@ -0,0 +1,90 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * SPDK cuse + */ + + +#ifndef SPDK_NVME_IO_MSG_H_ +#define SPDK_NVME_IO_MSG_H_ + +typedef void (*spdk_nvme_io_msg_fn)(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + void *arg); + +struct spdk_nvme_io_msg { + struct spdk_nvme_ctrlr *ctrlr; + uint32_t nsid; + + spdk_nvme_io_msg_fn fn; + void *arg; +}; + +struct nvme_io_msg_producer { + const char *name; + void (*update)(struct spdk_nvme_ctrlr *ctrlr); + void (*stop)(struct spdk_nvme_ctrlr *ctrlr); + STAILQ_ENTRY(nvme_io_msg_producer) link; +}; + +int nvme_io_msg_send(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_io_msg_fn fn, + void *arg); + +/** + * Process IO message sent to controller from external module. 
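+ *
+ * Editor's illustration (not part of the upstream header): a typical external
+ * producer registers itself once and then arranges for a single thread to
+ * poll the controller. Only nvme_io_msg_ctrlr_register(), nvme_io_msg_send()
+ * and nvme_io_msg_process() are real interfaces from this file; every name
+ * prefixed with my_, and the polling loop itself, are hypothetical.
+ *
+ *   static void my_update(struct spdk_nvme_ctrlr *ctrlr) { }
+ *   static void my_stop(struct spdk_nvme_ctrlr *ctrlr) { }
+ *
+ *   static struct nvme_io_msg_producer g_my_producer = {
+ *           .name = "my_producer",
+ *           .update = my_update,
+ *           .stop = my_stop,
+ *   };
+ *
+ *   // The first registration on a controller creates the message ring and a
+ *   // dedicated I/O qpair (see nvme_io_msg_ctrlr_register() in nvme_io_msg.c).
+ *   rc = nvme_io_msg_ctrlr_register(ctrlr, &g_my_producer);
+ *
+ *   // From any thread: queue work against namespace 1.
+ *   rc = nvme_io_msg_send(ctrlr, 1, my_io_fn, my_arg);
+ *
+ *   // From the thread that owns the controller: drain and execute messages.
+ *   while (my_keep_running) {
+ *           nvme_io_msg_process(ctrlr);
+ *   }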
+ *
+ * This call processes requests from the ring, submitting I/O to the
+ * internally allocated qpair or executing admin commands in its own context.
+ * It is non-blocking and is intended to be polled by an SPDK thread so that
+ * NVMe requests sent to the controller by an external module complete in a
+ * safe environment.
+ *
+ * The caller must ensure that each controller is polled by only one thread at
+ * a time.
+ *
+ * This function may be called at any point while the controller is attached to
+ * the SPDK NVMe driver.
+ *
+ * \param ctrlr Opaque handle to NVMe controller.
+ *
+ * \return number of processed external IO messages.
+ */
+int nvme_io_msg_process(struct spdk_nvme_ctrlr *ctrlr);
+
+int nvme_io_msg_ctrlr_register(struct spdk_nvme_ctrlr *ctrlr,
+		struct nvme_io_msg_producer *io_msg_producer);
+void nvme_io_msg_ctrlr_unregister(struct spdk_nvme_ctrlr *ctrlr,
+		struct nvme_io_msg_producer *io_msg_producer);
+void nvme_io_msg_ctrlr_detach(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_io_msg_ctrlr_update(struct spdk_nvme_ctrlr *ctrlr);
+
+#endif /* SPDK_NVME_IO_MSG_H_ */ diff --git a/src/spdk/lib/nvme/nvme_ns.c b/src/spdk/lib/nvme/nvme_ns.c new file mode 100644 index 000000000..5d424e5c7 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns.c @@ -0,0 +1,401 @@ +/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+static inline struct spdk_nvme_ns_data *
+_nvme_ns_get_data(struct spdk_nvme_ns *ns)
+{
+	return &ns->ctrlr->nsdata[ns->id - 1];
+}
+
+/**
+ * Update Namespace flags based on Identify Controller
+ * and Identify Namespace. It can also be used for
+ * Namespace Attribute Notice events and Namespace
+ * operations such as Attach/Detach.
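+ *
+ * Editor's worked example (hypothetical format values): with lbads = 12 the
+ * sector size is 1 << 12 = 4096 bytes. If that format also carries 8 bytes
+ * of metadata and flbas.extended is set, the extended LBA size becomes
+ * 4096 + 8 = 4104 bytes, so a controller with a 128 KiB maximum transfer
+ * size ends up with sectors_per_max_io = 131072 / 4104 = 31, not 32.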
+ */ +void +nvme_ns_set_identify_data(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ns_data *nsdata; + + nsdata = _nvme_ns_get_data(ns); + + ns->flags = 0x0000; + + ns->sector_size = 1 << nsdata->lbaf[nsdata->flbas.format].lbads; + ns->extended_lba_size = ns->sector_size; + + ns->md_size = nsdata->lbaf[nsdata->flbas.format].ms; + if (nsdata->flbas.extended) { + ns->flags |= SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED; + ns->extended_lba_size += ns->md_size; + } + + ns->sectors_per_max_io = spdk_nvme_ns_get_max_io_xfer_size(ns) / ns->extended_lba_size; + + if (nsdata->noiob) { + ns->sectors_per_stripe = nsdata->noiob; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u optimal IO boundary %" PRIu32 " blocks\n", + ns->id, ns->sectors_per_stripe); + } else if (ns->ctrlr->quirks & NVME_INTEL_QUIRK_STRIPING && + ns->ctrlr->cdata.vs[3] != 0) { + ns->sectors_per_stripe = (1ULL << ns->ctrlr->cdata.vs[3]) * ns->ctrlr->min_page_size / + ns->sector_size; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u stripe size quirk %" PRIu32 " blocks\n", + ns->id, ns->sectors_per_stripe); + } else { + ns->sectors_per_stripe = 0; + } + + if (ns->ctrlr->cdata.oncs.dsm) { + ns->flags |= SPDK_NVME_NS_DEALLOCATE_SUPPORTED; + } + + if (ns->ctrlr->cdata.oncs.compare) { + ns->flags |= SPDK_NVME_NS_COMPARE_SUPPORTED; + } + + if (ns->ctrlr->cdata.vwc.present) { + ns->flags |= SPDK_NVME_NS_FLUSH_SUPPORTED; + } + + if (ns->ctrlr->cdata.oncs.write_zeroes) { + ns->flags |= SPDK_NVME_NS_WRITE_ZEROES_SUPPORTED; + } + + if (ns->ctrlr->cdata.oncs.write_unc) { + ns->flags |= SPDK_NVME_NS_WRITE_UNCORRECTABLE_SUPPORTED; + } + + if (nsdata->nsrescap.raw) { + ns->flags |= SPDK_NVME_NS_RESERVATION_SUPPORTED; + } + + ns->pi_type = SPDK_NVME_FMT_NVM_PROTECTION_DISABLE; + if (nsdata->lbaf[nsdata->flbas.format].ms && nsdata->dps.pit) { + ns->flags |= SPDK_NVME_NS_DPS_PI_SUPPORTED; + ns->pi_type = nsdata->dps.pit; + } +} + +static int +nvme_ctrlr_identify_ns(struct spdk_nvme_ns *ns) +{ + struct nvme_completion_poll_status *status; + struct spdk_nvme_ns_data *nsdata; + int rc; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + nsdata = _nvme_ns_get_data(ns); + rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id, + nsdata, sizeof(*nsdata), + nvme_completion_poll_cb, status); + if (rc != 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, status, + &ns->ctrlr->ctrlr_lock)) { + if (!status->timed_out) { + free(status); + } + /* This can occur if the namespace is not active. Simply zero the + * namespace data and continue. 
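+	 *
+	 * Editor's note: once the data is zeroed, spdk_nvme_ns_is_active() reports
+	 * the namespace as inactive because NCAP reads back as 0, so callers that
+	 * walk the namespace list simply skip it. Illustrative sketch only (the
+	 * accessors are the public SPDK API; the loop and total_sectors are
+	 * hypothetical):
+	 *
+	 *   for (uint32_t nsid = 1; nsid <= spdk_nvme_ctrlr_get_num_ns(ctrlr); nsid++) {
+	 *           struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+	 *
+	 *           if (ns == NULL || !spdk_nvme_ns_is_active(ns)) {
+	 *                   continue;   // zeroed by the error path described above
+	 *           }
+	 *           total_sectors += spdk_nvme_ns_get_num_sectors(ns);
+	 *   }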
*/ + nvme_ns_destruct(ns); + return 0; + } + free(status); + + nvme_ns_set_identify_data(ns); + + return 0; +} + +static int +nvme_ctrlr_identify_id_desc(struct spdk_nvme_ns *ns) +{ + struct nvme_completion_poll_status *status; + int rc; + + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + + if (ns->ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) || + (ns->ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n"); + return 0; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Attempting to retrieve NS ID Descriptor List\n"); + rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST, 0, ns->id, + ns->id_desc_list, sizeof(ns->id_desc_list), + nvme_completion_poll_cb, status); + if (rc < 0) { + free(status); + return rc; + } + + rc = nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, status, &ns->ctrlr->ctrlr_lock); + if (rc != 0) { + SPDK_WARNLOG("Failed to retrieve NS ID Descriptor List\n"); + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + } + + if (!status->timed_out) { + free(status); + } + + return rc; +} + +uint32_t +spdk_nvme_ns_get_id(struct spdk_nvme_ns *ns) +{ + return ns->id; +} + +bool +spdk_nvme_ns_is_active(struct spdk_nvme_ns *ns) +{ + const struct spdk_nvme_ns_data *nsdata = NULL; + + /* + * According to the spec, valid NS has non-zero id. + */ + if (ns->id == 0) { + return false; + } + + nsdata = _nvme_ns_get_data(ns); + + /* + * According to the spec, Identify Namespace will return a zero-filled structure for + * inactive namespace IDs. + * Check NCAP since it must be nonzero for an active namespace. + */ + return nsdata->ncap != 0; +} + +struct spdk_nvme_ctrlr * +spdk_nvme_ns_get_ctrlr(struct spdk_nvme_ns *ns) +{ + return ns->ctrlr; +} + +uint32_t +spdk_nvme_ns_get_max_io_xfer_size(struct spdk_nvme_ns *ns) +{ + return ns->ctrlr->max_xfer_size; +} + +uint32_t +spdk_nvme_ns_get_sector_size(struct spdk_nvme_ns *ns) +{ + return ns->sector_size; +} + +uint32_t +spdk_nvme_ns_get_extended_sector_size(struct spdk_nvme_ns *ns) +{ + return ns->extended_lba_size; +} + +uint64_t +spdk_nvme_ns_get_num_sectors(struct spdk_nvme_ns *ns) +{ + return _nvme_ns_get_data(ns)->nsze; +} + +uint64_t +spdk_nvme_ns_get_size(struct spdk_nvme_ns *ns) +{ + return spdk_nvme_ns_get_num_sectors(ns) * spdk_nvme_ns_get_sector_size(ns); +} + +uint32_t +spdk_nvme_ns_get_flags(struct spdk_nvme_ns *ns) +{ + return ns->flags; +} + +enum spdk_nvme_pi_type +spdk_nvme_ns_get_pi_type(struct spdk_nvme_ns *ns) { + return ns->pi_type; +} + +bool +spdk_nvme_ns_supports_extended_lba(struct spdk_nvme_ns *ns) +{ + return (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) ? true : false; +} + +bool +spdk_nvme_ns_supports_compare(struct spdk_nvme_ns *ns) +{ + return (ns->flags & SPDK_NVME_NS_COMPARE_SUPPORTED) ? 
true : false; +} + +uint32_t +spdk_nvme_ns_get_md_size(struct spdk_nvme_ns *ns) +{ + return ns->md_size; +} + +const struct spdk_nvme_ns_data * +spdk_nvme_ns_get_data(struct spdk_nvme_ns *ns) +{ + return _nvme_ns_get_data(ns); +} + +enum spdk_nvme_dealloc_logical_block_read_value spdk_nvme_ns_get_dealloc_logical_block_read_value( + struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + const struct spdk_nvme_ns_data *data = spdk_nvme_ns_get_data(ns); + + if (ctrlr->quirks & NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE) { + return SPDK_NVME_DEALLOC_READ_00; + } else { + return data->dlfeat.bits.read_value; + } +} + +uint32_t +spdk_nvme_ns_get_optimal_io_boundary(struct spdk_nvme_ns *ns) +{ + return ns->sectors_per_stripe; +} + +static const void * +nvme_ns_find_id_desc(const struct spdk_nvme_ns *ns, enum spdk_nvme_nidt type, size_t *length) +{ + const struct spdk_nvme_ns_id_desc *desc; + size_t offset; + + offset = 0; + while (offset + 4 < sizeof(ns->id_desc_list)) { + desc = (const struct spdk_nvme_ns_id_desc *)&ns->id_desc_list[offset]; + + if (desc->nidl == 0) { + /* End of list */ + return NULL; + } + + /* + * Check if this descriptor fits within the list. + * 4 is the fixed-size descriptor header (not counted in NIDL). + */ + if (offset + desc->nidl + 4 > sizeof(ns->id_desc_list)) { + /* Descriptor longer than remaining space in list (invalid) */ + return NULL; + } + + if (desc->nidt == type) { + *length = desc->nidl; + return &desc->nid[0]; + } + + offset += 4 + desc->nidl; + } + + return NULL; +} + +const struct spdk_uuid * +spdk_nvme_ns_get_uuid(const struct spdk_nvme_ns *ns) +{ + const struct spdk_uuid *uuid; + size_t uuid_size; + + uuid = nvme_ns_find_id_desc(ns, SPDK_NVME_NIDT_UUID, &uuid_size); + if (uuid == NULL || uuid_size != sizeof(*uuid)) { + return NULL; + } + + return uuid; +} + +int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id, + struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + assert(id > 0); + + ns->ctrlr = ctrlr; + ns->id = id; + + rc = nvme_ctrlr_identify_ns(ns); + if (rc != 0) { + return rc; + } + + return nvme_ctrlr_identify_id_desc(ns); +} + +void nvme_ns_destruct(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ns_data *nsdata; + + if (!ns->id) { + return; + } + + nsdata = _nvme_ns_get_data(ns); + memset(nsdata, 0, sizeof(*nsdata)); + ns->sector_size = 0; + ns->extended_lba_size = 0; + ns->md_size = 0; + ns->pi_type = 0; + ns->sectors_per_max_io = 0; + ns->sectors_per_stripe = 0; + ns->flags = 0; +} + +int nvme_ns_update(struct spdk_nvme_ns *ns) +{ + return nvme_ctrlr_identify_ns(ns); +} diff --git a/src/spdk/lib/nvme/nvme_ns_cmd.c b/src/spdk/lib/nvme/nvme_ns_cmd.c new file mode 100644 index 000000000..eaa825fa8 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns_cmd.c @@ -0,0 +1,1074 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+static inline struct nvme_request *_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns,
+		struct spdk_nvme_qpair *qpair,
+		const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset,
+		uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn,
+		void *cb_arg, uint32_t opc, uint32_t io_flags,
+		uint16_t apptag_mask, uint16_t apptag, bool check_sgl);
+
+
+static bool
+nvme_ns_check_request_length(uint32_t lba_count, uint32_t sectors_per_max_io,
+			     uint32_t sectors_per_stripe, uint32_t qdepth)
+{
+	uint32_t child_per_io = UINT32_MAX;
+
+	/* After a namespace is destroyed (e.g. by hotplug), all of the fields associated
+	 * with the namespace are cleared to zero. In that case this function returns true,
+	 * and the calling submission function returns -EINVAL to its caller.
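+	 *
+	 * Editor's worked example (hypothetical numbers): with no striping,
+	 * sectors_per_max_io = 256 and lba_count = 65536, the I/O would fan out into
+	 * (65536 + 255) / 256 = 256 child requests. If the queue was created with
+	 * io_queue_requests = 128, then 256 >= 128 and the submission is rejected up
+	 * front instead of failing later when the children cannot all be allocated.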
+ */ + if (sectors_per_stripe > 0) { + child_per_io = (lba_count + sectors_per_stripe - 1) / sectors_per_stripe; + } else if (sectors_per_max_io > 0) { + child_per_io = (lba_count + sectors_per_max_io - 1) / sectors_per_max_io; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "checking maximum i/o length %d\n", child_per_io); + + return child_per_io >= qdepth; +} + +static struct nvme_request * +_nvme_add_child_request(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, + struct nvme_request *parent, bool check_sgl) +{ + struct nvme_request *child; + + child = _nvme_ns_cmd_rw(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, cb_fn, + cb_arg, opc, io_flags, apptag_mask, apptag, check_sgl); + if (child == NULL) { + nvme_request_free_children(parent); + nvme_free_request(parent); + return NULL; + } + + nvme_request_add_child(parent, child); + return child; +} + +static struct nvme_request * +_nvme_ns_cmd_split_request(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint32_t sectors_per_max_io, uint32_t sector_mask, + uint16_t apptag_mask, uint16_t apptag) +{ + uint32_t sector_size; + uint32_t md_size = ns->md_size; + uint32_t remaining_lba_count = lba_count; + struct nvme_request *child; + + sector_size = ns->extended_lba_size; + + if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) && + (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) && + (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) && + (md_size == 8)) { + sector_size -= 8; + } + + while (remaining_lba_count > 0) { + lba_count = sectors_per_max_io - (lba & sector_mask); + lba_count = spdk_min(remaining_lba_count, lba_count); + + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, + io_flags, apptag_mask, apptag, req, true); + if (child == NULL) { + return NULL; + } + + remaining_lba_count -= lba_count; + lba += lba_count; + payload_offset += lba_count * sector_size; + md_offset += lba_count * md_size; + } + + return req; +} + +static inline bool +_is_io_flags_valid(uint32_t io_flags) +{ + if (io_flags & ~SPDK_NVME_IO_FLAGS_VALID_MASK) { + /* Invalid io_flags */ + SPDK_ERRLOG("Invalid io_flags 0x%x\n", io_flags); + return false; + } + + return true; +} + +static void +_nvme_ns_cmd_setup_request(struct spdk_nvme_ns *ns, struct nvme_request *req, + uint32_t opc, uint64_t lba, uint32_t lba_count, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct spdk_nvme_cmd *cmd; + + assert(_is_io_flags_valid(io_flags)); + + cmd = &req->cmd; + cmd->opc = opc; + cmd->nsid = ns->id; + + *(uint64_t *)&cmd->cdw10 = lba; + + if (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) { + switch (ns->pi_type) { + case SPDK_NVME_FMT_NVM_PROTECTION_TYPE1: + case SPDK_NVME_FMT_NVM_PROTECTION_TYPE2: + cmd->cdw14 = (uint32_t)lba; + break; + } + } + + cmd->fuse = (io_flags & SPDK_NVME_IO_FLAGS_FUSE_MASK); + + cmd->cdw12 = lba_count - 1; + cmd->cdw12 |= (io_flags & SPDK_NVME_IO_FLAGS_CDW12_MASK); + + cmd->cdw15 = apptag_mask; + cmd->cdw15 = (cmd->cdw15 << 16 | apptag); +} + +static struct nvme_request * +_nvme_ns_cmd_split_request_prp(struct 
spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint16_t apptag_mask, uint16_t apptag) +{ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn; + void *sgl_cb_arg = req->payload.contig_or_cb_arg; + bool start_valid, end_valid, last_sge, child_equals_parent; + uint64_t child_lba = lba; + uint32_t req_current_length = 0; + uint32_t child_length = 0; + uint32_t sge_length; + uint32_t page_size = qpair->ctrlr->page_size; + uintptr_t address; + + reset_sgl_fn(sgl_cb_arg, payload_offset); + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + while (req_current_length < req->payload_size) { + + if (sge_length == 0) { + continue; + } else if (req_current_length + sge_length > req->payload_size) { + sge_length = req->payload_size - req_current_length; + } + + /* + * The start of the SGE is invalid if the start address is not page aligned, + * unless it is the first SGE in the child request. + */ + start_valid = child_length == 0 || _is_page_aligned(address, page_size); + + /* Boolean for whether this is the last SGE in the parent request. */ + last_sge = (req_current_length + sge_length == req->payload_size); + + /* + * The end of the SGE is invalid if the end address is not page aligned, + * unless it is the last SGE in the parent request. + */ + end_valid = last_sge || _is_page_aligned(address + sge_length, page_size); + + /* + * This child request equals the parent request, meaning that no splitting + * was required for the parent request (the one passed into this function). + * In this case, we do not create a child request at all - we just send + * the original request as a single request at the end of this function. + */ + child_equals_parent = (child_length + sge_length == req->payload_size); + + if (start_valid) { + /* + * The start of the SGE is valid, so advance the length parameters, + * to include this SGE with previous SGEs for this child request + * (if any). If it is not valid, we do not advance the length + * parameters nor get the next SGE, because we must send what has + * been collected before this SGE as a child request. + */ + child_length += sge_length; + req_current_length += sge_length; + if (req_current_length < req->payload_size) { + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + } + /* + * If the next SGE is not page aligned, we will need to create a child + * request for what we have so far, and then start a new child request for + * the next SGE. + */ + start_valid = _is_page_aligned(address, page_size); + } + + if (start_valid && end_valid && !last_sge) { + continue; + } + + /* + * We need to create a split here. Send what we have accumulated so far as a child + * request. Checking if child_equals_parent allows us to *not* create a child request + * when no splitting is required - in that case we will fall-through and just create + * a single request with no children for the entire I/O. 
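+		 *
+		 * Editor's worked example (hypothetical 4 KiB page, 512-byte blocks): a
+		 * 16-block parent built from a page-aligned 6144-byte SGE followed by a
+		 * page-aligned 2048-byte SGE splits into children of 12 and 4 blocks. The
+		 * first element ends mid-page and is not the last SGE, so end_valid is
+		 * false and the 6144 bytes gathered so far are flushed as a child; the
+		 * second element then forms its own child. Two page-aligned 4096-byte
+		 * SGEs, by contrast, pass every check, child_equals_parent holds on the
+		 * last element, and the fall-through below sends a single request.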
+ */ + if (!child_equals_parent) { + struct nvme_request *child; + uint32_t child_lba_count; + + if ((child_length % ns->extended_lba_size) != 0) { + SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n", + child_length, ns->extended_lba_size); + return NULL; + } + child_lba_count = child_length / ns->extended_lba_size; + /* + * Note the last parameter is set to "false" - this tells the recursive + * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting + * since we have already verified it here. + */ + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + child_lba, child_lba_count, + cb_fn, cb_arg, opc, io_flags, + apptag_mask, apptag, req, false); + if (child == NULL) { + return NULL; + } + payload_offset += child_length; + md_offset += child_lba_count * ns->md_size; + child_lba += child_lba_count; + child_length = 0; + } + } + + if (child_length == req->payload_size) { + /* No splitting was required, so setup the whole payload as one request. */ + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + } + + return req; +} + +static struct nvme_request * +_nvme_ns_cmd_split_request_sgl(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint16_t apptag_mask, uint16_t apptag) +{ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn; + void *sgl_cb_arg = req->payload.contig_or_cb_arg; + uint64_t child_lba = lba; + uint32_t req_current_length = 0; + uint32_t child_length = 0; + uint32_t sge_length; + uint16_t max_sges, num_sges; + uintptr_t address; + + max_sges = ns->ctrlr->max_sges; + + reset_sgl_fn(sgl_cb_arg, payload_offset); + num_sges = 0; + + while (req_current_length < req->payload_size) { + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + + if (req_current_length + sge_length > req->payload_size) { + sge_length = req->payload_size - req_current_length; + } + + child_length += sge_length; + req_current_length += sge_length; + num_sges++; + + if (num_sges < max_sges && req_current_length < req->payload_size) { + continue; + } + + /* + * We need to create a split here. Send what we have accumulated so far as a child + * request. Checking if the child equals the full payload allows us to *not* + * create a child request when no splitting is required - in that case we will + * fall-through and just create a single request with no children for the entire I/O. + */ + if (child_length != req->payload_size) { + struct nvme_request *child; + uint32_t child_lba_count; + + if ((child_length % ns->extended_lba_size) != 0) { + SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n", + child_length, ns->extended_lba_size); + return NULL; + } + child_lba_count = child_length / ns->extended_lba_size; + /* + * Note the last parameter is set to "false" - this tells the recursive + * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting + * since we have already verified it here. 
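+			 *
+			 * Editor's worked example (hypothetical values): with max_sges = 16
+			 * and a payload built from 40 page-sized (4096-byte) elements over
+			 * 512-byte blocks, the loop cuts a child after every 16 elements:
+			 * two children of 128 blocks each and a final child of 64 blocks.
+			 * Every cut lands on a block boundary, so the
+			 * child_length % extended_lba_size guard above never fires.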
+ */ + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + child_lba, child_lba_count, + cb_fn, cb_arg, opc, io_flags, + apptag_mask, apptag, req, false); + if (child == NULL) { + return NULL; + } + payload_offset += child_length; + md_offset += child_lba_count * ns->md_size; + child_lba += child_lba_count; + child_length = 0; + num_sges = 0; + } + } + + if (child_length == req->payload_size) { + /* No splitting was required, so setup the whole payload as one request. */ + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + } + + return req; +} + +static inline struct nvme_request * +_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, bool check_sgl) +{ + struct nvme_request *req; + uint32_t sector_size; + uint32_t sectors_per_max_io; + uint32_t sectors_per_stripe; + + sector_size = ns->extended_lba_size; + sectors_per_max_io = ns->sectors_per_max_io; + sectors_per_stripe = ns->sectors_per_stripe; + + if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) && + (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) && + (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) && + (ns->md_size == 8)) { + sector_size -= 8; + } + + req = nvme_allocate_request(qpair, payload, lba_count * sector_size, lba_count * ns->md_size, + cb_fn, cb_arg); + if (req == NULL) { + return NULL; + } + + req->payload_offset = payload_offset; + req->md_offset = md_offset; + + /* + * Intel DC P3*00 NVMe controllers benefit from driver-assisted striping. + * If this controller defines a stripe boundary and this I/O spans a stripe + * boundary, split the request into multiple requests and submit each + * separately to hardware. 
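+	 *
+	 * Editor's worked example (hypothetical 256-block stripe): a 100-block I/O
+	 * starting at LBA 200 gives (200 & 255) + 100 = 300 > 256, so it crosses a
+	 * stripe boundary. _nvme_ns_cmd_split_request() then emits a 56-block child
+	 * at LBA 200 (up to the boundary at LBA 256) followed by a 44-block child
+	 * at LBA 256, and each child is submitted to the controller separately.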
+ */ + if (sectors_per_stripe > 0 && + (((lba & (sectors_per_stripe - 1)) + lba_count) > sectors_per_stripe)) { + + return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, + cb_fn, + cb_arg, opc, + io_flags, req, sectors_per_stripe, sectors_per_stripe - 1, apptag_mask, apptag); + } else if (lba_count > sectors_per_max_io) { + return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, + cb_fn, + cb_arg, opc, + io_flags, req, sectors_per_max_io, 0, apptag_mask, apptag); + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL && check_sgl) { + if (ns->ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) { + return _nvme_ns_cmd_split_request_sgl(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, io_flags, + req, apptag_mask, apptag); + } else { + return _nvme_ns_cmd_split_request_prp(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, io_flags, + req, apptag_mask, apptag); + } + } + + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + return req; +} + +int +spdk_nvme_ns_cmd_compare(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, 0, + 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_compare_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, + void *metadata, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, + apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } 
else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_comparev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_read(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, 0, + 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_read_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + void *metadata, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, + apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, 0, 0, true); + if (req != NULL) { + 
return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_readv_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + 
io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_writev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint64_t *tmp_lba; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (lba_count == 0 || lba_count > UINT16_MAX + 1) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_WRITE_ZEROES; + cmd->nsid = ns->id; + + tmp_lba = (uint64_t *)&cmd->cdw10; + *tmp_lba = lba; + cmd->cdw12 = lba_count - 1; + cmd->fuse = (io_flags & SPDK_NVME_IO_FLAGS_FUSE_MASK); + cmd->cdw12 |= (io_flags & SPDK_NVME_IO_FLAGS_CDW12_MASK); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_write_uncorrectable(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint64_t *tmp_lba; + + if (lba_count == 0 || lba_count > UINT16_MAX + 1) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_WRITE_UNCORRECTABLE; + cmd->nsid = ns->id; + + tmp_lba = (uint64_t *)&cmd->cdw10; + *tmp_lba = lba; + cmd->cdw12 = lba_count - 1; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_dataset_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint32_t type, + const struct spdk_nvme_dsm_range *ranges, uint16_t num_ranges, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (num_ranges == 0 || num_ranges > SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES) { + return -EINVAL; + } + + if (ranges == NULL) { + return -EINVAL; + } + + req = nvme_allocate_request_user_copy(qpair, (void *)ranges, + num_ranges * sizeof(struct spdk_nvme_dsm_range), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DATASET_MANAGEMENT; + cmd->nsid = 
ns->id; + + cmd->cdw10_bits.dsm.nr = num_ranges - 1; + cmd->cdw11 = type; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FLUSH; + cmd->nsid = ns->id; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_register(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_register_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_register_action action, + enum spdk_nvme_reservation_register_cptpl cptpl, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_register_data), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_REGISTER; + cmd->nsid = ns->id; + + cmd->cdw10_bits.resv_register.rrega = action; + cmd->cdw10_bits.resv_register.iekey = ignore_key; + cmd->cdw10_bits.resv_register.cptpl = cptpl; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_release(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_key_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_release_action action, + enum spdk_nvme_reservation_type type, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_key_data), cb_fn, + cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_RELEASE; + cmd->nsid = ns->id; + + cmd->cdw10_bits.resv_release.rrela = action; + cmd->cdw10_bits.resv_release.iekey = ignore_key; + cmd->cdw10_bits.resv_release.rtype = type; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_acquire(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_acquire_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_acquire_action action, + enum spdk_nvme_reservation_type type, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_acquire_data), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_ACQUIRE; + cmd->nsid = ns->id; + + cmd->cdw10_bits.resv_acquire.racqa = action; + cmd->cdw10_bits.resv_acquire.iekey = ignore_key; + cmd->cdw10_bits.resv_acquire.rtype = type; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_report(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *payload, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + uint32_t num_dwords; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (len % 4) { + return -EINVAL; + } + num_dwords = len / 4; + + req = nvme_allocate_request_user_copy(qpair, payload, len, cb_fn, cb_arg, false); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = 
SPDK_NVME_OPC_RESERVATION_REPORT; + cmd->nsid = ns->id; + + cmd->cdw10 = num_dwords; + + return nvme_qpair_submit_request(qpair, req); +} diff --git a/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c new file mode 100644 index 000000000..f60aa6789 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c @@ -0,0 +1,233 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/nvme_ocssd.h" +#include "nvme_internal.h" + +int +spdk_nvme_ocssd_ns_cmd_vector_reset(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + uint64_t *lba_list, uint32_t num_lbas, + struct spdk_ocssd_chunk_information_entry *chunk_info, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (!lba_list || (num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_VECTOR_RESET; + cmd->nsid = ns->id; + + if (chunk_info != NULL) { + cmd->mptr = spdk_vtophys(chunk_info, NULL); + } + + /* + * Dword 10 and 11 store a pointer to the list of logical block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. 
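+	 *
+	 * Editor's note: in the multi-entry case DW10/11 receive
+	 * spdk_vtophys(lba_list), so the caller must pass a DMA-able list that
+	 * stays valid until the command completes. Caller-side sketch
+	 * (illustrative only; chunk0_slba..chunk3_slba, reset_done_cb and the
+	 * 4-entry count are hypothetical):
+	 *
+	 *   uint64_t *lbas = spdk_dma_zmalloc(4 * sizeof(uint64_t), 8, NULL);
+	 *
+	 *   lbas[0] = chunk0_slba; lbas[1] = chunk1_slba;
+	 *   lbas[2] = chunk2_slba; lbas[3] = chunk3_slba;
+	 *   rc = spdk_nvme_ocssd_ns_cmd_vector_reset(ns, qpair, lbas, 4, NULL,
+	 *                                            reset_done_cb, lbas);
+	 *   // free lbas with spdk_dma_free() in reset_done_cb() on completion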
+ */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list, NULL); + } + + cmd->cdw12 = num_lbas - 1; + + return nvme_qpair_submit_request(qpair, req); +} + +static int +_nvme_ocssd_ns_cmd_vector_rw_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + enum spdk_ocssd_io_opcode opc, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + struct nvme_payload payload; + uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY; + + if (io_flags & ~valid_flags) { + return -EINVAL; + } + + if (!buffer || !lba_list || (num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = nvme_allocate_request(qpair, &payload, num_lbas * ns->sector_size, num_lbas * ns->md_size, + cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = opc; + cmd->nsid = ns->id; + + /* + * Dword 10 and 11 store a pointer to the list of logical block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. + */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list, NULL); + } + + cmd->cdw12 = num_lbas - 1; + cmd->cdw12 |= io_flags; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_write_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_write(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_read_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_read(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_copy(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + uint64_t *dst_lba_list, + uint64_t *src_lba_list, + uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY; + + if (io_flags & ~valid_flags) { + return -EINVAL; + } + + if (!dst_lba_list || !src_lba_list || 
(num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_VECTOR_COPY; + cmd->nsid = ns->id; + + /* + * Dword 10 and 11 store a pointer to the list of source logical + * block addresses. + * Dword 14 and 15 store a pointer to the list of destination logical + * block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. + */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *src_lba_list; + *(uint64_t *)&cmd->cdw14 = *dst_lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(src_lba_list, NULL); + *(uint64_t *)&cmd->cdw14 = spdk_vtophys(dst_lba_list, NULL); + } + + cmd->cdw12 = num_lbas - 1; + cmd->cdw12 |= io_flags; + + return nvme_qpair_submit_request(qpair, req); +} diff --git a/src/spdk/lib/nvme/nvme_opal.c b/src/spdk/lib/nvme/nvme_opal.c new file mode 100644 index 000000000..e0a3aa7fa --- /dev/null +++ b/src/spdk/lib/nvme/nvme_opal.c @@ -0,0 +1,2566 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "spdk/opal.h" +#include "spdk_internal/log.h" +#include "spdk/util.h" + +#include "nvme_opal_internal.h" + +static void +opal_nvme_security_recv_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct opal_session *sess = arg; + struct spdk_opal_dev *dev = sess->dev; + void *response = sess->resp; + struct spdk_opal_compacket *header = response; + int ret; + + if (spdk_nvme_cpl_is_error(cpl)) { + sess->sess_cb(sess, -EIO, sess->cb_arg); + return; + } + + if (!header->outstanding_data && !header->min_transfer) { + sess->sess_cb(sess, 0, sess->cb_arg); + return; + } + + memset(response, 0, IO_BUFFER_LENGTH); + ret = spdk_nvme_ctrlr_cmd_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG, + dev->comid, 0, sess->resp, IO_BUFFER_LENGTH, + opal_nvme_security_recv_done, sess); + if (ret) { + sess->sess_cb(sess, ret, sess->cb_arg); + } +} + +static void +opal_nvme_security_send_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct opal_session *sess = arg; + struct spdk_opal_dev *dev = sess->dev; + int ret; + + if (spdk_nvme_cpl_is_error(cpl)) { + sess->sess_cb(sess, -EIO, sess->cb_arg); + return; + } + + ret = spdk_nvme_ctrlr_cmd_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG, + dev->comid, 0, sess->resp, IO_BUFFER_LENGTH, + opal_nvme_security_recv_done, sess); + if (ret) { + sess->sess_cb(sess, ret, sess->cb_arg); + } +} + +static int +opal_nvme_security_send(struct spdk_opal_dev *dev, struct opal_session *sess, + opal_sess_cb sess_cb, void *cb_arg) +{ + sess->sess_cb = sess_cb; + sess->cb_arg = cb_arg; + + return spdk_nvme_ctrlr_cmd_security_send(dev->ctrlr, SPDK_SCSI_SECP_TCG, dev->comid, + 0, sess->cmd, IO_BUFFER_LENGTH, + opal_nvme_security_send_done, sess); +} + +static void +opal_send_recv_done(struct opal_session *sess, int status, void *ctx) +{ + sess->status = status; + sess->done = true; +} + +static int +opal_send_recv(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int ret; + + sess->done = false; + ret = opal_nvme_security_send(dev, sess, opal_send_recv_done, NULL); + if (ret) { + return ret; + } + + while (!sess->done) { + spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr); + } + + return sess->status; +} + +static struct opal_session * +opal_alloc_session(struct spdk_opal_dev *dev) +{ + struct opal_session *sess; + + sess = calloc(1, sizeof(*sess)); + if (!sess) { + return NULL; + } + sess->dev = dev; + + return sess; +} + +static void +opal_add_token_u8(int *err, struct opal_session *sess, uint8_t token) +{ + if (*err) { + return; + } + if (sess->cmd_pos >= IO_BUFFER_LENGTH - 1) { + SPDK_ERRLOG("Error adding u8: end of buffer.\n"); + *err = -ERANGE; + return; + } + sess->cmd[sess->cmd_pos++] = token; +} + +static void +opal_add_short_atom_header(struct opal_session *sess, bool bytestring, + bool has_sign, size_t len) +{ + uint8_t atom; + int err = 0; + + atom = SPDK_SHORT_ATOM_ID; + atom |= bytestring ? SPDK_SHORT_ATOM_BYTESTRING_FLAG : 0; + atom |= has_sign ? SPDK_SHORT_ATOM_SIGN_FLAG : 0; + atom |= len & SPDK_SHORT_ATOM_LEN_MASK; + + opal_add_token_u8(&err, sess, atom); +} + +static void +opal_add_medium_atom_header(struct opal_session *sess, bool bytestring, + bool has_sign, size_t len) +{ + uint8_t header; + + header = SPDK_MEDIUM_ATOM_ID; + header |= bytestring ? SPDK_MEDIUM_ATOM_BYTESTRING_FLAG : 0; + header |= has_sign ? 
SPDK_MEDIUM_ATOM_SIGN_FLAG : 0; + header |= (len >> 8) & SPDK_MEDIUM_ATOM_LEN_MASK; + sess->cmd[sess->cmd_pos++] = header; + sess->cmd[sess->cmd_pos++] = len; +} + +static void +opal_add_token_bytestring(int *err, struct opal_session *sess, + const uint8_t *bytestring, size_t len) +{ + size_t header_len = 1; + bool is_short_atom = true; + + if (*err) { + return; + } + + if (len & ~SPDK_SHORT_ATOM_LEN_MASK) { + header_len = 2; + is_short_atom = false; + } + + if (len >= IO_BUFFER_LENGTH - sess->cmd_pos - header_len) { + SPDK_ERRLOG("Error adding bytestring: end of buffer.\n"); + *err = -ERANGE; + return; + } + + if (is_short_atom) { + opal_add_short_atom_header(sess, true, false, len); + } else { + opal_add_medium_atom_header(sess, true, false, len); + } + + memcpy(&sess->cmd[sess->cmd_pos], bytestring, len); + sess->cmd_pos += len; +} + +static void +opal_add_token_u64(int *err, struct opal_session *sess, uint64_t number) +{ + int startat = 0; + + if (*err) { + return; + } + + /* add header first */ + if (number <= SPDK_TINY_ATOM_DATA_MASK) { + sess->cmd[sess->cmd_pos++] = (uint8_t) number & SPDK_TINY_ATOM_DATA_MASK; + } else { + if (number < 0x100) { + sess->cmd[sess->cmd_pos++] = 0x81; /* short atom, 1 byte length */ + startat = 0; + } else if (number < 0x10000) { + sess->cmd[sess->cmd_pos++] = 0x82; /* short atom, 2 byte length */ + startat = 1; + } else if (number < 0x100000000) { + sess->cmd[sess->cmd_pos++] = 0x84; /* short atom, 4 byte length */ + startat = 3; + } else { + sess->cmd[sess->cmd_pos++] = 0x88; /* short atom, 8 byte length */ + startat = 7; + } + + /* add number value */ + for (int i = startat; i > -1; i--) { + sess->cmd[sess->cmd_pos++] = (uint8_t)((number >> (i * 8)) & 0xff); + } + } +} + +static void +opal_add_tokens(int *err, struct opal_session *sess, int num, ...) 
+{ + int i; + va_list args_ptr; + enum spdk_opal_token tmp; + + va_start(args_ptr, num); + + for (i = 0; i < num; i++) { + tmp = va_arg(args_ptr, enum spdk_opal_token); + opal_add_token_u8(err, sess, tmp); + if (*err != 0) { break; } + } + + va_end(args_ptr); +} + +static int +opal_cmd_finalize(struct opal_session *sess, uint32_t hsn, uint32_t tsn, bool eod) +{ + struct spdk_opal_header *hdr; + int err = 0; + + if (eod) { + opal_add_tokens(&err, sess, 6, SPDK_OPAL_ENDOFDATA, + SPDK_OPAL_STARTLIST, + 0, 0, 0, + SPDK_OPAL_ENDLIST); + } + + if (err) { + SPDK_ERRLOG("Error finalizing command.\n"); + return -EFAULT; + } + + hdr = (struct spdk_opal_header *)sess->cmd; + + to_be32(&hdr->packet.session_tsn, tsn); + to_be32(&hdr->packet.session_hsn, hsn); + + to_be32(&hdr->sub_packet.length, sess->cmd_pos - sizeof(*hdr)); + while (sess->cmd_pos % 4) { + if (sess->cmd_pos >= IO_BUFFER_LENGTH) { + SPDK_ERRLOG("Error: Buffer overrun\n"); + return -ERANGE; + } + sess->cmd[sess->cmd_pos++] = 0; + } + to_be32(&hdr->packet.length, sess->cmd_pos - sizeof(hdr->com_packet) - + sizeof(hdr->packet)); + to_be32(&hdr->com_packet.length, sess->cmd_pos - sizeof(hdr->com_packet)); + + return 0; +} + +static size_t +opal_response_parse_tiny(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = 1; + token->width = OPAL_WIDTH_TINY; + + if (pos[0] & SPDK_TINY_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + token->type = OPAL_DTA_TOKENID_UINT; + token->stored.unsigned_num = pos[0] & SPDK_TINY_ATOM_DATA_MASK; + } + + return token->len; +} + +static int +opal_response_parse_short(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = (pos[0] & SPDK_SHORT_ATOM_LEN_MASK) + 1; /* plus 1-byte header */ + token->width = OPAL_WIDTH_SHORT; + + if (pos[0] & SPDK_SHORT_ATOM_BYTESTRING_FLAG) { + token->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SPDK_SHORT_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + uint64_t u_integer = 0; + size_t i, b = 0; + + token->type = OPAL_DTA_TOKENID_UINT; + if (token->len > 9) { + SPDK_ERRLOG("uint64 with more than 8 bytes\n"); + return -EINVAL; + } + for (i = token->len - 1; i > 0; i--) { + u_integer |= ((uint64_t)pos[i] << (8 * b)); + b++; + } + token->stored.unsigned_num = u_integer; + } + + return token->len; +} + +static size_t +opal_response_parse_medium(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = (((pos[0] & SPDK_MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; /* plus 2-byte header */ + token->width = OPAL_WIDTH_MEDIUM; + + if (pos[0] & SPDK_MEDIUM_ATOM_BYTESTRING_FLAG) { + token->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SPDK_MEDIUM_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + token->type = OPAL_DTA_TOKENID_UINT; + } + + return token->len; +} + +static size_t +opal_response_parse_long(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; /* plus 4-byte header */ + token->width = OPAL_WIDTH_LONG; + + if (pos[0] & SPDK_LONG_ATOM_BYTESTRING_FLAG) { + token->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SPDK_LONG_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + token->type = OPAL_DTA_TOKENID_UINT; + } + + return token->len; +} + +static size_t +opal_response_parse_token(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + 
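The helpers above (opal_add_token_u64() and opal_response_parse_short()) implement the TCG short-atom integer encoding: a header byte of 0x80 | payload-length followed by the value in big-endian order, with values small enough for a tiny atom emitted as a single byte instead. A minimal standalone round-trip sketch, separate from the diffed file and with illustrative names only:

#include <stdint.h>
#include <stddef.h>

/* Encode an unsigned value as a TCG short atom, mirroring the
 * 0x81/0x82/0x84/0x88 cases of opal_add_token_u64() (the real helper
 * emits a one-byte tiny atom for small values instead). */
static size_t
short_atom_encode(uint8_t *buf, uint64_t v)
{
	int len = (v < 0x100) ? 1 : (v < 0x10000) ? 2 :
		  (v < 0x100000000ULL) ? 4 : 8;
	size_t pos = 0;

	buf[pos++] = 0x80 | len;	/* short atom, unsigned, not a bytestring */
	for (int i = len - 1; i >= 0; i--) {
		buf[pos++] = (uint8_t)(v >> (i * 8));	/* big-endian payload */
	}
	return pos;	/* e.g. 0x1234 encodes as 0x82 0x12 0x34 */
}

/* Decode it back the way opal_response_parse_short() does: the low four
 * bits of the header byte give the payload length. */
static uint64_t
short_atom_decode(const uint8_t *buf)
{
	size_t len = buf[0] & 0x0F;
	uint64_t v = 0;

	for (size_t i = 1; i <= len; i++) {
		v = (v << 8) | buf[i];
	}
	return v;
}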
token->len = 1; + token->type = OPAL_DTA_TOKENID_TOKEN; + token->width = OPAL_WIDTH_TOKEN; + + return token->len; +} + +static int +opal_response_parse(const uint8_t *buf, size_t length, + struct spdk_opal_resp_parsed *resp) +{ + const struct spdk_opal_header *hdr; + struct spdk_opal_resp_token *token_iter; + int num_entries = 0; + int total; + size_t token_length; + const uint8_t *pos; + uint32_t clen, plen, slen; + + if (!buf || !resp) { + return -EINVAL; + } + + hdr = (struct spdk_opal_header *)buf; + pos = buf + sizeof(*hdr); + + clen = from_be32(&hdr->com_packet.length); + plen = from_be32(&hdr->packet.length); + slen = from_be32(&hdr->sub_packet.length); + SPDK_DEBUGLOG(SPDK_LOG_OPAL, "Response size: cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + SPDK_ERRLOG("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); + return -EINVAL; + } + + if (pos > buf + length) { + SPDK_ERRLOG("Pointer out of range\n"); + return -EFAULT; + } + + token_iter = resp->resp_tokens; + total = slen; + + while (total > 0) { + if (pos[0] <= SPDK_TINY_ATOM_TYPE_MAX) { /* tiny atom */ + token_length = opal_response_parse_tiny(token_iter, pos); + } else if (pos[0] <= SPDK_SHORT_ATOM_TYPE_MAX) { /* short atom */ + token_length = opal_response_parse_short(token_iter, pos); + } else if (pos[0] <= SPDK_MEDIUM_ATOM_TYPE_MAX) { /* medium atom */ + token_length = opal_response_parse_medium(token_iter, pos); + } else if (pos[0] <= SPDK_LONG_ATOM_TYPE_MAX) { /* long atom */ + token_length = opal_response_parse_long(token_iter, pos); + } else { /* TOKEN */ + token_length = opal_response_parse_token(token_iter, pos); + } + + if (token_length <= 0) { + SPDK_ERRLOG("Parse response failure.\n"); + return -EINVAL; + } + + pos += token_length; + total -= token_length; + token_iter++; + num_entries++; + + if (total < 0) { + SPDK_ERRLOG("Length not matching.\n"); + return -EINVAL; + } + } + + if (num_entries == 0) { + SPDK_ERRLOG("Couldn't parse response.\n"); + return -EINVAL; + } + resp->num = num_entries; + + return 0; +} + +static inline bool +opal_response_token_matches(const struct spdk_opal_resp_token *token, + uint8_t match) +{ + if (!token || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) { + return false; + } + return true; +} + +static const struct spdk_opal_resp_token * +opal_response_get_token(const struct spdk_opal_resp_parsed *resp, int index) +{ + const struct spdk_opal_resp_token *token; + + if (index >= resp->num) { + SPDK_ERRLOG("Token number doesn't exist: %d, resp: %d\n", + index, resp->num); + return NULL; + } + + token = &resp->resp_tokens[index]; + if (token->len == 0) { + SPDK_ERRLOG("Token length must be non-zero\n"); + return NULL; + } + + return token; +} + +static uint64_t +opal_response_get_u64(const struct spdk_opal_resp_parsed *resp, int index) +{ + if (!resp) { + SPDK_ERRLOG("Response is NULL\n"); + return 0; + } + + if (resp->resp_tokens[index].type != OPAL_DTA_TOKENID_UINT) { + SPDK_ERRLOG("Token is not unsigned int: %d\n", + resp->resp_tokens[index].type); + return 0; + } + + if (!(resp->resp_tokens[index].width == OPAL_WIDTH_TINY || + resp->resp_tokens[index].width == OPAL_WIDTH_SHORT)) { + SPDK_ERRLOG("Atom is not short or tiny: %d\n", + resp->resp_tokens[index].width); + return 0; + } + + return resp->resp_tokens[index].stored.unsigned_num; +} + +static uint16_t +opal_response_get_u16(const struct spdk_opal_resp_parsed *resp, int index) +{ + uint64_t i = 
opal_response_get_u64(resp, index); + if (i > 0xffffull) { + SPDK_ERRLOG("parse reponse u16 failed. Overflow\n"); + return 0; + } + return (uint16_t) i; +} + +static uint8_t +opal_response_get_u8(const struct spdk_opal_resp_parsed *resp, int index) +{ + uint64_t i = opal_response_get_u64(resp, index); + if (i > 0xffull) { + SPDK_ERRLOG("parse reponse u8 failed. Overflow\n"); + return 0; + } + return (uint8_t) i; +} + +static size_t +opal_response_get_string(const struct spdk_opal_resp_parsed *resp, int n, + const char **store) +{ + uint8_t header_len; + struct spdk_opal_resp_token token; + *store = NULL; + if (!resp) { + SPDK_ERRLOG("Response is NULL\n"); + return 0; + } + + if (n > resp->num) { + SPDK_ERRLOG("Response has %d tokens. Can't access %d\n", + resp->num, n); + return 0; + } + + token = resp->resp_tokens[n]; + if (token.type != OPAL_DTA_TOKENID_BYTESTRING) { + SPDK_ERRLOG("Token is not a byte string!\n"); + return 0; + } + + switch (token.width) { + case OPAL_WIDTH_SHORT: + header_len = 1; + break; + case OPAL_WIDTH_MEDIUM: + header_len = 2; + break; + case OPAL_WIDTH_LONG: + header_len = 4; + break; + default: + SPDK_ERRLOG("Can't get string from this Token\n"); + return 0; + } + + *store = token.pos + header_len; + return token.len - header_len; +} + +static int +opal_response_status(const struct spdk_opal_resp_parsed *resp) +{ + const struct spdk_opal_resp_token *tok; + + /* if we get an EOS token, just return 0 */ + tok = opal_response_get_token(resp, 0); + if (opal_response_token_matches(tok, SPDK_OPAL_ENDOFSESSION)) { + return 0; + } + + if (resp->num < 5) { + return SPDK_DTAERROR_NO_METHOD_STATUS; + } + + tok = opal_response_get_token(resp, resp->num - 5); /* the first token should be STARTLIST */ + if (!opal_response_token_matches(tok, SPDK_OPAL_STARTLIST)) { + return SPDK_DTAERROR_NO_METHOD_STATUS; + } + + tok = opal_response_get_token(resp, resp->num - 1); /* the last token should be ENDLIST */ + if (!opal_response_token_matches(tok, SPDK_OPAL_ENDLIST)) { + return SPDK_DTAERROR_NO_METHOD_STATUS; + } + + /* The second and third values in the status list are reserved, and are + defined in core spec to be 0x00 and 0x00 and SHOULD be ignored by the host. */ + return (int)opal_response_get_u64(resp, + resp->num - 4); /* We only need the first value in the status list. */ +} + +static int +opal_parse_and_check_status(struct opal_session *sess) +{ + int error; + + error = opal_response_parse(sess->resp, IO_BUFFER_LENGTH, &sess->parsed_resp); + if (error) { + SPDK_ERRLOG("Couldn't parse response.\n"); + return error; + } + return opal_response_status(&sess->parsed_resp); +} + +static inline void +opal_clear_cmd(struct opal_session *sess) +{ + sess->cmd_pos = sizeof(struct spdk_opal_header); + memset(sess->cmd, 0, IO_BUFFER_LENGTH); +} + +static inline void +opal_set_comid(struct opal_session *sess, uint16_t comid) +{ + struct spdk_opal_header *hdr = (struct spdk_opal_header *)sess->cmd; + + hdr->com_packet.comid[0] = comid >> 8; + hdr->com_packet.comid[1] = comid; + hdr->com_packet.extended_comid[0] = 0; + hdr->com_packet.extended_comid[1] = 0; +} + +static inline int +opal_init_key(struct spdk_opal_key *opal_key, const char *passwd) +{ + int len; + + if (passwd == NULL || passwd[0] == '\0') { + SPDK_ERRLOG("Password is empty. Create key failed\n"); + return -EINVAL; + } + + len = strlen(passwd); + + if (len >= OPAL_KEY_MAX) { + SPDK_ERRLOG("Password too long. 
Create key failed\n"); + return -EINVAL; + } + + opal_key->key_len = len; + memcpy(opal_key->key, passwd, opal_key->key_len); + + return 0; +} + +static void +opal_build_locking_range(uint8_t *buffer, uint8_t locking_range) +{ + memcpy(buffer, spdk_opal_uid[UID_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH); + + /* global */ + if (locking_range == 0) { + return; + } + + /* non-global */ + buffer[5] = LOCKING_RANGE_NON_GLOBAL; + buffer[7] = locking_range; +} + +static void +opal_check_tper(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_tper_feat *tper = data; + + dev->feat_info.tper = *tper; +} + +/* + * check single user mode + */ +static bool +opal_check_sum(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_single_user_mode_feat *sum = data; + uint32_t num_locking_objects = from_be32(&sum->num_locking_objects); + + if (num_locking_objects == 0) { + SPDK_NOTICELOG("Need at least one locking object.\n"); + return false; + } + + dev->feat_info.single_user = *sum; + + return true; +} + +static void +opal_check_lock(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_locking_feat *lock = data; + + dev->feat_info.locking = *lock; +} + +static void +opal_check_geometry(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_geo_feat *geo = data; + + dev->feat_info.geo = *geo; +} + +static void +opal_check_datastore(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_datastore_feat *datastore = data; + + dev->feat_info.datastore = *datastore; +} + +static uint16_t +opal_get_comid_v100(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_v100_feat *v100 = data; + uint16_t base_comid = from_be16(&v100->base_comid); + + dev->feat_info.v100 = *v100; + + return base_comid; +} + +static uint16_t +opal_get_comid_v200(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_v200_feat *v200 = data; + uint16_t base_comid = from_be16(&v200->base_comid); + + dev->feat_info.v200 = *v200; + + return base_comid; +} + +static int +opal_discovery0_end(struct spdk_opal_dev *dev, void *payload, uint32_t payload_size) +{ + bool supported = false, single_user = false; + const struct spdk_opal_d0_hdr *hdr = (struct spdk_opal_d0_hdr *)payload; + struct spdk_opal_d0_feat_hdr *feat_hdr; + const uint8_t *epos = payload, *cpos = payload; + uint16_t comid = 0; + uint32_t hlen = from_be32(&(hdr->length)); + + if (hlen > payload_size - sizeof(*hdr)) { + SPDK_ERRLOG("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, payload_size); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ + cpos += sizeof(*hdr); /* current position on buffer */ + + while (cpos < epos) { + feat_hdr = (struct spdk_opal_d0_feat_hdr *)cpos; + uint16_t feat_code = from_be16(&feat_hdr->code); + + switch (feat_code) { + case FEATURECODE_TPER: + opal_check_tper(dev, cpos); + break; + case FEATURECODE_SINGLEUSER: + single_user = opal_check_sum(dev, cpos); + break; + case FEATURECODE_GEOMETRY: + opal_check_geometry(dev, cpos); + break; + case FEATURECODE_LOCKING: + opal_check_lock(dev, cpos); + break; + case FEATURECODE_DATASTORE: + opal_check_datastore(dev, cpos); + break; + case FEATURECODE_OPALV100: + comid = opal_get_comid_v100(dev, cpos); + supported = true; + break; + case FEATURECODE_OPALV200: + comid = opal_get_comid_v200(dev, cpos); + supported = true; + break; + default: + SPDK_INFOLOG(SPDK_LOG_OPAL, "Unknow feature code: %d\n", feat_code); + } + cpos += 
feat_hdr->length + sizeof(*feat_hdr); + } + + if (supported == false) { + SPDK_ERRLOG("Opal Not Supported.\n"); + return -ENOTSUP; + } + + if (single_user == false) { + SPDK_INFOLOG(SPDK_LOG_OPAL, "Single User Mode Not Supported\n"); + } + + dev->comid = comid; + return 0; +} + +static int +opal_discovery0(struct spdk_opal_dev *dev, void *payload, uint32_t payload_size) +{ + int ret; + + ret = spdk_nvme_ctrlr_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG, LV0_DISCOVERY_COMID, + 0, payload, payload_size); + if (ret) { + return ret; + } + + return opal_discovery0_end(dev, payload, payload_size); +} + +static int +opal_end_session(struct spdk_opal_dev *dev, struct opal_session *sess, uint16_t comid) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, comid); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDOFSESSION); + + if (err < 0) { + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, false); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + sess->hsn = 0; + sess->tsn = 0; + + return opal_parse_and_check_status(sess); +} + +void +spdk_opal_dev_destruct(struct spdk_opal_dev *dev) +{ + free(dev); +} + +static int +opal_start_session_done(struct opal_session *sess) +{ + uint32_t hsn, tsn; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + hsn = opal_response_get_u64(&sess->parsed_resp, 4); + tsn = opal_response_get_u64(&sess->parsed_resp, 5); + + if (hsn == 0 && tsn == 0) { + SPDK_ERRLOG("Couldn't authenticate session\n"); + return -EPERM; + } + + sess->hsn = hsn; + sess->tsn = tsn; + + return 0; +} + +static int +opal_start_generic_session(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum opal_uid_enum auth, + enum opal_uid_enum sp_type, + const char *key, + uint8_t key_len) +{ + uint32_t hsn; + int err = 0; + int ret; + + if (key == NULL && auth != UID_ANYBODY) { + return OPAL_INVAL_PARAM; + } + + opal_clear_cmd(sess); + + opal_set_comid(sess, dev->comid); + hsn = GENERIC_HOST_SESSION_NUM; + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_SMUID], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[STARTSESSION_METHOD], + OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST); + opal_add_token_u64(&err, sess, hsn); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[sp_type], OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_TRUE); /* Write */ + + switch (auth) { + case UID_ANYBODY: + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST); + break; + case UID_ADMIN1: + case UID_SID: + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTNAME); + opal_add_token_u8(&err, sess, 0); /* HostChallenge */ + opal_add_token_bytestring(&err, sess, key, key_len); + opal_add_tokens(&err, sess, 3, /* number of token */ + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + 3);/* HostSignAuth */ + opal_add_token_bytestring(&err, sess, spdk_opal_uid[auth], + OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDNAME); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST); + break; + default: + SPDK_ERRLOG("Cannot start Admin SP session with auth %d\n", auth); + return -EINVAL; + } + + if (err) { + SPDK_ERRLOG("Error building start adminsp session command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return 
opal_start_session_done(sess); +} + +static int +opal_get_msid_cpin_pin_done(struct opal_session *sess, + struct spdk_opal_key *opal_key) +{ + const char *msid_pin; + size_t strlen; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + strlen = opal_response_get_string(&sess->parsed_resp, 4, &msid_pin); + if (!msid_pin) { + SPDK_ERRLOG("Couldn't extract PIN from response\n"); + return -EINVAL; + } + + opal_key->key_len = strlen; + memcpy(opal_key->key, msid_pin, opal_key->key_len); + + SPDK_DEBUGLOG(SPDK_LOG_OPAL, "MSID = %p\n", opal_key->key); + return 0; +} + +static int +opal_get_msid_cpin_pin(struct spdk_opal_dev *dev, struct opal_session *sess, + struct spdk_opal_key *opal_key) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_C_PIN_MSID], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_PIN, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_PIN, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building Get MSID CPIN PIN command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_msid_cpin_pin_done(sess, opal_key); +} + +static int +opal_build_generic_pw_cmd(struct opal_session *sess, uint8_t *key, size_t key_len, + uint8_t *cpin_uid, struct spdk_opal_dev *dev) +{ + int err = 0; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, cpin_uid, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 6, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_PIN); + opal_add_token_bytestring(&err, sess, key, key_len); + opal_add_tokens(&err, sess, 4, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + if (err) { + return err; + } + + return opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); +} + +static int +opal_get_locking_sp_lifecycle_done(struct opal_session *sess) +{ + uint8_t lifecycle; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + lifecycle = opal_response_get_u64(&sess->parsed_resp, 4); + if (lifecycle != OPAL_MANUFACTURED_INACTIVE) { /* status before activate */ + SPDK_ERRLOG("Couldn't determine the status of the Lifecycle state\n"); + return -EINVAL; + } + + return 0; +} + +static int +opal_get_locking_sp_lifecycle(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + 
SPDK_OPAL_LIFECYCLE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_LIFECYCLE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building GET Lifecycle Status command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_locking_sp_lifecycle_done(sess); +} + +static int +opal_activate(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[ACTIVATE_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building Activate LockingSP command.\n"); + return err; + } + + /* TODO: Single User Mode for activatation */ + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_start_auth_session(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum spdk_opal_user user, + struct spdk_opal_key *opal_key) +{ + uint8_t uid_user[OPAL_UID_LENGTH]; + int err = 0; + int ret; + uint32_t hsn = GENERIC_HOST_SESSION_NUM; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + if (user != OPAL_ADMIN1) { + memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH); + uid_user[7] = user; + } else { + memcpy(uid_user, spdk_opal_uid[UID_ADMIN1], OPAL_UID_LENGTH); + } + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_SMUID], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[STARTSESSION_METHOD], + OPAL_UID_LENGTH); + + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST); + opal_add_token_u64(&err, sess, hsn); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP], + OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 3, SPDK_OPAL_TRUE, SPDK_OPAL_STARTNAME, + 0); /* True for a Read-Write session */ + opal_add_token_bytestring(&err, sess, opal_key->key, opal_key->key_len); + opal_add_tokens(&err, sess, 3, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME, 3); /* HostSignAuth */ + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building STARTSESSION command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_start_session_done(sess); +} + +static int +opal_lock_unlock_range(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range, + enum spdk_opal_lock_state l_state) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + uint8_t read_locked, write_locked; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + switch (l_state) { + case OPAL_READONLY: + read_locked = 0; + write_locked = 1; + break; + case OPAL_READWRITE: + read_locked = 0; + 
write_locked = 0; + break; + case OPAL_RWLOCK: + read_locked = 1; + write_locked = 1; + break; + default: + SPDK_ERRLOG("Tried to set an invalid locking state.\n"); + return -EINVAL; + } + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 15, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKED, + read_locked, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKED, + write_locked, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building SET command.\n"); + return err; + } + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int opal_generic_locking_range_enable_disable(struct spdk_opal_dev *dev, + struct opal_session *sess, + uint8_t *uid, bool read_lock_enabled, bool write_lock_enabled) +{ + int err = 0; + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 23, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKENABLED, + read_lock_enabled, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKENABLED, + write_lock_enabled, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKED, + 0, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKED, + 0, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + if (err) { + SPDK_ERRLOG("Error building locking range enable/disable command.\n"); + } + return err; +} + +static int +opal_setup_locking_range(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range, + uint64_t range_start, uint64_t range_length, + bool read_lock_enabled, bool write_lock_enabled) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + if (locking_range == 0) { + err = opal_generic_locking_range_enable_disable(dev, sess, uid_locking_range, + read_lock_enabled, write_lock_enabled); + } else { + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 6, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_RANGESTART); + opal_add_token_u64(&err, sess, range_start); + opal_add_tokens(&err, sess, 3, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_RANGELENGTH); + opal_add_token_u64(&err, sess, range_length); + opal_add_tokens(&err, sess, 3, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKENABLED); + opal_add_token_u64(&err, sess, read_lock_enabled); + opal_add_tokens(&err, sess, 3, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKENABLED); + 
opal_add_token_u64(&err, sess, write_lock_enabled); + opal_add_tokens(&err, sess, 4, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + } + if (err) { + SPDK_ERRLOG("Error building Setup Locking range command.\n"); + return err; + + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_get_max_ranges_done(struct opal_session *sess) +{ + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + /* "MaxRanges" is token 4 of response */ + return opal_response_get_u16(&sess->parsed_resp, 4); +} + +static int +opal_get_max_ranges(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKING_INFO_TABLE], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_MAXRANGES, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_MAXRANGES, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building GET Lifecycle Status command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_max_ranges_done(sess); +} + +static int +opal_get_locking_range_info_done(struct opal_session *sess, + struct spdk_opal_locking_range_info *info) +{ + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + info->range_start = opal_response_get_u64(&sess->parsed_resp, 4); + info->range_length = opal_response_get_u64(&sess->parsed_resp, 8); + info->read_lock_enabled = opal_response_get_u8(&sess->parsed_resp, 12); + info->write_lock_enabled = opal_response_get_u8(&sess->parsed_resp, 16); + info->read_locked = opal_response_get_u8(&sess->parsed_resp, 20); + info->write_locked = opal_response_get_u8(&sess->parsed_resp, 24); + + return 0; +} + +static int +opal_get_locking_range_info(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum spdk_opal_locking_range locking_range_id) +{ + int err = 0; + int ret; + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + struct spdk_opal_locking_range_info *info; + + opal_build_locking_range(uid_locking_range, locking_range_id); + + assert(locking_range_id < SPDK_OPAL_MAX_LOCKING_RANGE); + info = &dev->locking_ranges[locking_range_id]; + memset(info, 0, sizeof(*info)); + info->locking_range_id = locking_range_id; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_RANGESTART, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_WRITELOCKED, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + 
SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building get locking range info command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_locking_range_info_done(sess, info); +} + +static int +opal_enable_user(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_user user) +{ + int err = 0; + int ret; + uint8_t uid_user[OPAL_UID_LENGTH]; + + memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH); + uid_user[7] = user; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 11, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_AUTH_ENABLE, + SPDK_OPAL_TRUE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building enable user command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_add_user_to_locking_range(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum spdk_opal_user user, + enum spdk_opal_locking_range locking_range, + enum spdk_opal_lock_state l_state) +{ + int err = 0; + int ret; + uint8_t uid_user[OPAL_UID_LENGTH]; + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + + memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH); + uid_user[7] = user; + + switch (l_state) { + case OPAL_READONLY: + memcpy(uid_locking_range, spdk_opal_uid[UID_LOCKINGRANGE_ACE_RDLOCKED], OPAL_UID_LENGTH); + break; + case OPAL_READWRITE: + memcpy(uid_locking_range, spdk_opal_uid[UID_LOCKINGRANGE_ACE_WRLOCKED], OPAL_UID_LENGTH); + break; + default: + SPDK_ERRLOG("locking state should only be OPAL_READONLY or OPAL_READWRITE\n"); + return -EINVAL; + } + + uid_locking_range[7] = locking_range; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 8, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_BOOLEAN_EXPR, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH / 2); + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH / 2); + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_BOOLEAN_ACE], OPAL_UID_LENGTH / 2); + opal_add_tokens(&err, sess, 7, + SPDK_OPAL_TRUE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + 
SPDK_OPAL_ENDLIST); + if (err) { + SPDK_ERRLOG("Error building add user to locking range command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_new_user_passwd(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_user user, + struct spdk_opal_key *opal_key) +{ + uint8_t uid_cpin[OPAL_UID_LENGTH]; + int ret; + + if (user == OPAL_ADMIN1) { + memcpy(uid_cpin, spdk_opal_uid[UID_C_PIN_ADMIN1], OPAL_UID_LENGTH); + } else { + memcpy(uid_cpin, spdk_opal_uid[UID_C_PIN_USER1], OPAL_UID_LENGTH); + uid_cpin[7] = user; + } + + ret = opal_build_generic_pw_cmd(sess, opal_key->key, opal_key->key_len, uid_cpin, dev); + if (ret != 0) { + SPDK_ERRLOG("Error building set password command\n"); + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_set_sid_cpin_pin(struct spdk_opal_dev *dev, struct opal_session *sess, char *new_passwd) +{ + uint8_t cpin_uid[OPAL_UID_LENGTH]; + struct spdk_opal_key opal_key = {}; + int ret; + + ret = opal_init_key(&opal_key, new_passwd); + if (ret != 0) { + return ret; + } + + memcpy(cpin_uid, spdk_opal_uid[UID_C_PIN_SID], OPAL_UID_LENGTH); + + if (opal_build_generic_pw_cmd(sess, opal_key.key, opal_key.key_len, cpin_uid, dev)) { + SPDK_ERRLOG("Error building Set SID cpin\n"); + return -ERANGE; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +int +spdk_opal_cmd_take_ownership(struct spdk_opal_dev *dev, char *new_passwd) +{ + int ret; + struct spdk_opal_key opal_key = {}; + struct opal_session *sess; + + assert(dev != NULL); + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_ANYBODY, UID_ADMINSP, NULL, 0); + if (ret) { + SPDK_ERRLOG("start admin SP session error %d\n", ret); + goto end; + } + + ret = opal_get_msid_cpin_pin(dev, sess, &opal_key); + if (ret) { + SPDK_ERRLOG("get msid error %d\n", ret); + opal_end_session(dev, sess, dev->comid); + goto end; + } + + ret = opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + goto end; + } + + /* reuse the session structure */ + memset(sess, 0, sizeof(*sess)); + sess->dev = dev; + ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("start admin SP session error %d\n", ret); + goto end; + } + memset(&opal_key, 0, sizeof(struct spdk_opal_key)); + + ret = opal_set_sid_cpin_pin(dev, sess, new_passwd); + if (ret) { + SPDK_ERRLOG("set cpin error %d\n", ret); + opal_end_session(dev, sess, dev->comid); + goto end; + } + + ret = opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + +end: + free(sess); + return ret; +} + +struct spdk_opal_dev * + spdk_opal_dev_construct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_opal_dev *dev; + void *payload; + + dev = calloc(1, sizeof(*dev)); + if (!dev) { + SPDK_ERRLOG("Memory allocation failed\n"); + return NULL; + } + + dev->ctrlr = ctrlr; + + payload = calloc(1, IO_BUFFER_LENGTH); + if (!payload) { + free(dev); + return NULL; + } + + if (opal_discovery0(dev, payload, IO_BUFFER_LENGTH)) { + SPDK_INFOLOG(SPDK_LOG_OPAL, "Opal is not supported on this device\n"); + 
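The two public entry points above, spdk_opal_cmd_take_ownership() and spdk_opal_dev_construct(), are the usual starting point for a caller. A hedged usage sketch, separate from the diffed file; how the controller was probed/attached and the error value returned are assumptions, and error handling is trimmed:

#include "spdk/stdinc.h"
#include "spdk/nvme.h"
#include "spdk/opal.h"

/* Take SID ownership of a freshly attached controller.
 * spdk_opal_dev_construct() runs Level 0 discovery and returns NULL if the
 * drive does not report Opal support. */
static int
example_take_ownership(struct spdk_nvme_ctrlr *ctrlr, char *new_passwd)
{
	struct spdk_opal_dev *opal_dev;
	int rc;

	opal_dev = spdk_opal_dev_construct(ctrlr);
	if (opal_dev == NULL) {
		return -ENOTSUP;
	}

	/* Reads the MSID PIN from the C_PIN_MSID table, authenticates as SID
	 * with it, and then sets the SID credential to new_passwd. */
	rc = spdk_opal_cmd_take_ownership(opal_dev, new_passwd);

	spdk_opal_dev_destruct(opal_dev);
	return rc;
}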
free(dev); + free(payload); + return NULL; + } + + free(payload); + return dev; +} + +static int +opal_build_revert_tper_cmd(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_ADMINSP], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[REVERT_METHOD], + OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST); + if (err) { + SPDK_ERRLOG("Error building REVERT TPER command.\n"); + return -ERANGE; + } + + return opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); +} + +static int +opal_gen_new_active_key(struct spdk_opal_dev *dev, struct opal_session *sess, + struct spdk_opal_key *active_key) +{ + uint8_t uid_data[OPAL_UID_LENGTH] = {0}; + int err = 0; + int length; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + if (active_key->key_len == 0) { + SPDK_ERRLOG("Error finding previous data to generate new active key\n"); + return -EINVAL; + } + + length = spdk_min(active_key->key_len, OPAL_UID_LENGTH); + memcpy(uid_data, active_key->key, length); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_data, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GENKEY_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building new key generation command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_get_active_key_done(struct opal_session *sess, struct spdk_opal_key *active_key) +{ + const char *key; + size_t str_len; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + str_len = opal_response_get_string(&sess->parsed_resp, 4, &key); + if (!key) { + SPDK_ERRLOG("Couldn't extract active key from response\n"); + return -EINVAL; + } + + active_key->key_len = str_len; + memcpy(active_key->key, key, active_key->key_len); + + SPDK_DEBUGLOG(SPDK_LOG_OPAL, "active key = %p\n", active_key->key); + return 0; +} + +static int +opal_get_active_key(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range, + struct spdk_opal_key *active_key) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], + OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 12, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_ACTIVEKEY, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_ACTIVEKEY, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building get active key command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = 
opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_active_key_done(sess, active_key); +} + +static int +opal_erase_locking_range(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[ERASE_METHOD], + OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building erase locking range.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +int +spdk_opal_cmd_revert_tper(struct spdk_opal_dev *dev, const char *passwd) +{ + int ret; + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret) { + SPDK_ERRLOG("Init key failed\n"); + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("Error on starting admin SP session with error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_build_revert_tper_cmd(dev, sess); + if (ret) { + opal_end_session(dev, sess, dev->comid); + SPDK_ERRLOG("Build revert tper command with error %d\n", ret); + goto end; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + opal_end_session(dev, sess, dev->comid); + SPDK_ERRLOG("Error on reverting TPer with error %d\n", ret); + goto end; + } + + ret = opal_parse_and_check_status(sess); + if (ret) { + opal_end_session(dev, sess, dev->comid); + SPDK_ERRLOG("Error on reverting TPer with error %d\n", ret); + } + /* No opal_end_session() required here for successful case */ + +end: + free(sess); + return ret; +} + +int +spdk_opal_cmd_activate_locking_sp(struct spdk_opal_dev *dev, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("Error on starting admin SP session with error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_get_locking_sp_lifecycle(dev, sess); + if (ret) { + SPDK_ERRLOG("Error on getting SP lifecycle with error %d\n", ret); + goto end; + } + + ret = opal_activate(dev, sess); + if (ret) { + SPDK_ERRLOG("Error on activation with error %d\n", ret); + } + +end: + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("Error on ending session with error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_lock_unlock(struct spdk_opal_dev *dev, enum spdk_opal_user user, + enum spdk_opal_lock_state flag, enum spdk_opal_locking_range locking_range, + const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + 
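Taken together, the admin commands above and the per-range commands that follow are normally driven in a fixed order: activate the Locking SP (only valid from the Manufactured-Inactive state checked above), describe a locking range, then toggle its lock state. A hedged end-to-end sketch, separate from the diffed file; the range number, the LBA extent, and reusing one password for both the SID and Admin1 authorities are illustrative assumptions, and error handling is trimmed:

#include "spdk/stdinc.h"
#include "spdk/opal.h"

static int
example_provision_and_lock(struct spdk_opal_dev *opal_dev, const char *passwd)
{
	enum spdk_opal_locking_range range_id = 1;	/* range 1; named constants live in spdk/opal_spec.h */
	int rc;

	/* Activate the Locking SP with the password set at take-ownership time. */
	rc = spdk_opal_cmd_activate_locking_sp(opal_dev, passwd);
	if (rc != 0) {
		return rc;
	}

	/* Describe the range: start LBA 0, 1024 blocks, read and write locking enabled. */
	rc = spdk_opal_cmd_setup_locking_range(opal_dev, OPAL_ADMIN1, range_id, 0, 1024, passwd);
	if (rc != 0) {
		return rc;
	}

	/* Lock the range for both reads and writes ... */
	rc = spdk_opal_cmd_lock_unlock(opal_dev, OPAL_ADMIN1, OPAL_RWLOCK, range_id, passwd);
	if (rc != 0) {
		return rc;
	}

	/* ... and unlock it again for normal I/O. */
	return spdk_opal_cmd_lock_unlock(opal_dev, OPAL_ADMIN1, OPAL_READWRITE, range_id, passwd);
}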
ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_lock_unlock_range(dev, sess, locking_range, flag); + if (ret) { + SPDK_ERRLOG("lock unlock range error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_setup_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user, + enum spdk_opal_locking_range locking_range_id, uint64_t range_start, + uint64_t range_length, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_setup_locking_range(dev, sess, locking_range_id, range_start, range_length, true, + true); + if (ret) { + SPDK_ERRLOG("setup locking range error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_get_max_ranges(struct spdk_opal_dev *dev, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + if (dev->max_ranges) { + return dev->max_ranges; + } + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, OPAL_ADMIN1, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_get_max_ranges(dev, sess); + if (ret > 0) { + dev->max_ranges = ret; + } + + ret = opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + + return (ret == 0 ? 
dev->max_ranges : ret); +} + +int +spdk_opal_cmd_get_locking_range_info(struct spdk_opal_dev *dev, const char *passwd, + enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user_id, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_get_locking_range_info(dev, sess, locking_range_id); + if (ret) { + SPDK_ERRLOG("get locking range info error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_enable_user(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_ADMIN1, UID_LOCKINGSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("start locking SP session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_enable_user(dev, sess, user_id); + if (ret) { + SPDK_ERRLOG("enable user error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_add_user_to_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id, + enum spdk_opal_lock_state lock_flag, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_ADMIN1, UID_LOCKINGSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("start locking SP session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_add_user_to_locking_range(dev, sess, user_id, locking_range_id, lock_flag); + if (ret) { + SPDK_ERRLOG("add user to locking range error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_set_new_passwd(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + const char *new_passwd, const char *old_passwd, bool new_user) +{ + struct opal_session *sess; + struct spdk_opal_key old_key = {}; + struct spdk_opal_key new_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&old_key, old_passwd); + if (ret != 0) { + return ret; + } + + ret = opal_init_key(&new_key, new_passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, new_user ? 
OPAL_ADMIN1 : user_id, + &old_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_new_user_passwd(dev, sess, user_id, &new_key); + if (ret) { + SPDK_ERRLOG("set new passwd error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_erase_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id, const char *password) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, password); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user_id, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_erase_locking_range(dev, sess, locking_range_id); + if (ret) { + SPDK_ERRLOG("get active key error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_secure_erase_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id, const char *password) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + struct spdk_opal_key *active_key; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, password); + if (ret != 0) { + return ret; + } + + active_key = calloc(1, sizeof(*active_key)); + if (!active_key) { + return -ENOMEM; + } + + sess = opal_alloc_session(dev); + if (!sess) { + free(active_key); + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user_id, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(active_key); + free(sess); + return ret; + } + + ret = opal_get_active_key(dev, sess, locking_range_id, active_key); + if (ret) { + SPDK_ERRLOG("get active key error %d\n", ret); + goto end; + } + + ret = opal_gen_new_active_key(dev, sess, active_key); + if (ret) { + SPDK_ERRLOG("generate new active key error %d\n", ret); + goto end; + } + memset(active_key, 0, sizeof(struct spdk_opal_key)); + +end: + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + free(active_key); + free(sess); + return ret; +} + +struct spdk_opal_d0_features_info * +spdk_opal_get_d0_features_info(struct spdk_opal_dev *dev) +{ + return &dev->feat_info; +} + +bool +spdk_opal_supported(struct spdk_opal_dev *dev) +{ + return false; +} + +struct spdk_opal_locking_range_info * +spdk_opal_get_locking_range_info(struct spdk_opal_dev *dev, enum spdk_opal_locking_range id) +{ + assert(id < SPDK_OPAL_MAX_LOCKING_RANGE); + return &dev->locking_ranges[id]; +} + +void +spdk_opal_free_locking_range_info(struct spdk_opal_dev *dev, enum spdk_opal_locking_range id) +{ + struct spdk_opal_locking_range_info *info; + + assert(id < SPDK_OPAL_MAX_LOCKING_RANGE); + info = &dev->locking_ranges[id]; + memset(info, 0, sizeof(*info)); +} + +/* Log component for opal submodule */ +SPDK_LOG_REGISTER_COMPONENT("opal", SPDK_LOG_OPAL) diff --git a/src/spdk/lib/nvme/nvme_opal_internal.h b/src/spdk/lib/nvme/nvme_opal_internal.h new file mode 100644 index 000000000..11815d435 --- /dev/null +++ 
b/src/spdk/lib/nvme/nvme_opal_internal.h @@ -0,0 +1,272 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_OPAL_INTERNAL_H +#define SPDK_OPAL_INTERNAL_H + +#include "spdk/opal_spec.h" +#include "spdk/opal.h" +#include "spdk/scsi_spec.h" + +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 +#define OPAL_KEY_MAX 256 +#define OPAL_UID_LENGTH 8 + +#define GENERIC_HOST_SESSION_NUM 0x69 + +#define OPAL_INVAL_PARAM 12 + +#define SPDK_DTAERROR_NO_METHOD_STATUS 0x89 + +enum opal_token_type { + OPAL_DTA_TOKENID_BYTESTRING = 0xE0, + OPAL_DTA_TOKENID_SINT = 0xE1, + OPAL_DTA_TOKENID_UINT = 0xE2, + OPAL_DTA_TOKENID_TOKEN = 0xE3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0, +}; + +enum opal_atom_width { + OPAL_WIDTH_TINY, /* 1 byte in length */ + OPAL_WIDTH_SHORT, /* a 1-byte header and contain up to 15 bytes of data */ + OPAL_WIDTH_MEDIUM, /* a 2-byte header and contain up to 2047 bytes of data */ + OPAL_WIDTH_LONG, /* a 4-byte header and which contain up to 16,777,215 bytes of data */ + OPAL_WIDTH_TOKEN +}; + +enum opal_uid_enum { + /* users */ + UID_SMUID, + UID_THISSP, + UID_ADMINSP, + UID_LOCKINGSP, + UID_ANYBODY, + UID_SID, + UID_ADMIN1, + UID_USER1, + UID_USER2, + + /* tables */ + UID_LOCKINGRANGE_GLOBAL, + UID_LOCKINGRANGE_ACE_RDLOCKED, + UID_LOCKINGRANGE_ACE_WRLOCKED, + UID_MBRCONTROL, + UID_MBR, + UID_AUTHORITY_TABLE, + UID_C_PIN_TABLE, + UID_LOCKING_INFO_TABLE, + UID_PSID, + + /* C_PIN_TABLE object ID's */ + UID_C_PIN_MSID, + UID_C_PIN_SID, + UID_C_PIN_ADMIN1, + UID_C_PIN_USER1, + + /* half UID's (only first 4 bytes used) */ + UID_HALF_AUTHORITY_OBJ_REF, + UID_HALF_BOOLEAN_ACE, +}; + +/* enum for indexing the spdk_opal_method array */ +enum opal_method_enum { + PROPERTIES_METHOD, + STARTSESSION_METHOD, + REVERT_METHOD, + ACTIVATE_METHOD, + NEXT_METHOD, + GETACL_METHOD, + GENKEY_METHOD, + REVERTSP_METHOD, + GET_METHOD, + SET_METHOD, + AUTHENTICATE_METHOD, + RANDOM_METHOD, + ERASE_METHOD, +}; + +struct spdk_opal_key { + 
uint8_t key_len; + uint8_t key[OPAL_KEY_MAX]; +}; + +const uint8_t spdk_opal_uid[][OPAL_UID_LENGTH] = { + /* users */ + [UID_SMUID] = /* Session Manager UID */ + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff }, + [UID_THISSP] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }, + [UID_ADMINSP] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 }, + [UID_LOCKINGSP] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 }, + [UID_ANYBODY] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 }, + [UID_SID] = /* Security Identifier UID */ + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 }, + [UID_ADMIN1] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 }, + [UID_USER1] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 }, + [UID_USER2] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 }, + + /* tables */ + [UID_LOCKINGRANGE_GLOBAL] = + { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, + [UID_LOCKINGRANGE_ACE_RDLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 }, + [UID_LOCKINGRANGE_ACE_WRLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 }, + [UID_MBRCONTROL] = + { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 }, + [UID_MBR] = + { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 }, + [UID_AUTHORITY_TABLE] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00}, + [UID_C_PIN_TABLE] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00}, + [UID_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, + [UID_PSID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 }, + + /* C_PIN_TABLE object ID's */ + [UID_C_PIN_MSID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, + [UID_C_PIN_SID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01}, + [UID_C_PIN_ADMIN1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01}, + [UID_C_PIN_USER1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x03, 0x00, 0x01}, + + /* half UID's (only first 4 bytes used) */ + [UID_HALF_AUTHORITY_OBJ_REF] = + { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff }, + [UID_HALF_BOOLEAN_ACE] = + { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff }, +}; + +/* + * TCG Storage SSC Methods. 
+ */ +const uint8_t spdk_opal_method[][OPAL_UID_LENGTH] = { + [PROPERTIES_METHOD] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 }, + [STARTSESSION_METHOD] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 }, + [REVERT_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 }, + [ACTIVATE_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 }, + [NEXT_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 }, + [GETACL_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d }, + [GENKEY_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 }, + [REVERTSP_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 }, + [GET_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 }, + [SET_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 }, + [AUTHENTICATE_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c }, + [RANDOM_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, + [ERASE_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, +}; + +/* + * Response token + */ +struct spdk_opal_resp_token { + const uint8_t *pos; + uint8_t _padding[7]; + union { + uint64_t unsigned_num; + int64_t signed_num; + } stored; + size_t len; /* header + data */ + enum opal_token_type type; + enum opal_atom_width width; +}; + +struct spdk_opal_resp_parsed { + int num; + struct spdk_opal_resp_token resp_tokens[MAX_TOKS]; +}; + +/* header of a response */ +struct spdk_opal_header { + struct spdk_opal_compacket com_packet; + struct spdk_opal_packet packet; + struct spdk_opal_data_subpacket sub_packet; +}; + +struct opal_session; +struct spdk_opal_dev; + +typedef void (*opal_sess_cb)(struct opal_session *sess, int status, void *ctx); + +struct opal_session { + uint32_t hsn; + uint32_t tsn; + size_t cmd_pos; + uint8_t cmd[IO_BUFFER_LENGTH]; + uint8_t resp[IO_BUFFER_LENGTH]; + struct spdk_opal_resp_parsed parsed_resp; + + opal_sess_cb sess_cb; + void *cb_arg; + bool done; + int status; + struct spdk_opal_dev *dev; +}; + +struct spdk_opal_dev { + struct spdk_nvme_ctrlr *ctrlr; + + uint16_t comid; + + struct spdk_opal_d0_features_info feat_info; + + uint8_t max_ranges; /* max locking range number */ + struct spdk_opal_locking_range_info locking_ranges[SPDK_OPAL_MAX_LOCKING_RANGE]; +}; + +#endif diff --git a/src/spdk/lib/nvme/nvme_pcie.c b/src/spdk/lib/nvme/nvme_pcie.c new file mode 100644 index 000000000..132e34cdc --- /dev/null +++ b/src/spdk/lib/nvme/nvme_pcie.c @@ -0,0 +1,2604 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2017, IBM Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over PCIe transport + */ + +#include "spdk/stdinc.h" +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "nvme_internal.h" +#include "nvme_uevent.h" + +/* + * Number of completion queue entries to process before ringing the + * completion queue doorbell. + */ +#define NVME_MIN_COMPLETIONS (1) +#define NVME_MAX_COMPLETIONS (128) + +/* + * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL + * segment. + */ +#define NVME_MAX_SGL_DESCRIPTORS (250) + +#define NVME_MAX_PRP_LIST_ENTRIES (503) + +struct nvme_pcie_enum_ctx { + struct spdk_nvme_probe_ctx *probe_ctx; + struct spdk_pci_addr pci_addr; + bool has_pci_addr; +}; + +/* PCIe transport extensions for spdk_nvme_ctrlr */ +struct nvme_pcie_ctrlr { + struct spdk_nvme_ctrlr ctrlr; + + /** NVMe MMIO register space */ + volatile struct spdk_nvme_registers *regs; + + /** NVMe MMIO register size */ + uint64_t regs_size; + + struct { + /* BAR mapping address which contains controller memory buffer */ + void *bar_va; + + /* BAR physical address which contains controller memory buffer */ + uint64_t bar_pa; + + /* Controller memory buffer size in Bytes */ + uint64_t size; + + /* Current offset of controller memory buffer, relative to start of BAR virt addr */ + uint64_t current_offset; + + void *mem_register_addr; + size_t mem_register_size; + } cmb; + + /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */ + uint32_t doorbell_stride_u32; + + /* Opaque handle to associated PCI device. */ + struct spdk_pci_device *devhandle; + + /* Flag to indicate the MMIO register has been remapped */ + bool is_remapped; +}; + +struct nvme_tracker { + TAILQ_ENTRY(nvme_tracker) tq_list; + + struct nvme_request *req; + uint16_t cid; + + uint16_t rsvd0; + uint32_t rsvd1; + + spdk_nvme_cmd_cb cb_fn; + void *cb_arg; + + uint64_t prp_sgl_bus_addr; + + /* Don't move, metadata SGL is always contiguous with Data Block SGL */ + struct spdk_nvme_sgl_descriptor meta_sgl; + union { + uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES]; + struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS]; + } u; +}; +/* + * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary + * and so that there is no padding required to meet alignment requirements. 
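A minimal stand-alone check of the sizing rule stated in this comment; the struct below is a simplified stand-in whose field sizes are illustrative, chosen only so the arithmetic mirrors the real tracker (72 header bytes + 503 eight-byte PRP slots = 4096):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct toy_tracker {                            /* stand-in, not the real struct nvme_tracker */
        uint8_t  header[72];                    /* list links, req pointer, cid, callbacks, meta SGL, ... */
        uint64_t prp[503];                      /* one slot per NVME_MAX_PRP_LIST_ENTRIES */
};

static_assert(sizeof(struct toy_tracker) == 4096, "tracker must be exactly 4 KiB");
static_assert(offsetof(struct toy_tracker, prp) % 8 == 0, "PRP list must be qword aligned");

Because trackers are also allocated with 4 KiB alignment (see the spdk_zmalloc() call in nvme_pcie_qpair_construct() below), plain array indexing into tr[] keeps every embedded PRP list inside a single page.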
+ */ +SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K"); +SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned"); +SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, meta_sgl) & 7) == 0, "SGL must be Qword aligned"); + +struct nvme_pcie_poll_group { + struct spdk_nvme_transport_poll_group group; +}; + +/* PCIe transport extensions for spdk_nvme_qpair */ +struct nvme_pcie_qpair { + /* Submission queue tail doorbell */ + volatile uint32_t *sq_tdbl; + + /* Completion queue head doorbell */ + volatile uint32_t *cq_hdbl; + + /* Submission queue */ + struct spdk_nvme_cmd *cmd; + + /* Completion queue */ + struct spdk_nvme_cpl *cpl; + + TAILQ_HEAD(, nvme_tracker) free_tr; + TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr; + + /* Array of trackers indexed by command ID. */ + struct nvme_tracker *tr; + + uint16_t num_entries; + + uint8_t retry_count; + + uint16_t max_completions_cap; + + uint16_t last_sq_tail; + uint16_t sq_tail; + uint16_t cq_head; + uint16_t sq_head; + + struct { + uint8_t phase : 1; + uint8_t delay_cmd_submit : 1; + uint8_t has_shadow_doorbell : 1; + } flags; + + /* + * Base qpair structure. + * This is located after the hot data in this structure so that the important parts of + * nvme_pcie_qpair are in the same cache line. + */ + struct spdk_nvme_qpair qpair; + + struct { + /* Submission queue shadow tail doorbell */ + volatile uint32_t *sq_tdbl; + + /* Completion queue shadow head doorbell */ + volatile uint32_t *cq_hdbl; + + /* Submission queue event index */ + volatile uint32_t *sq_eventidx; + + /* Completion queue event index */ + volatile uint32_t *cq_eventidx; + } shadow_doorbell; + + /* + * Fields below this point should not be touched on the normal I/O path. 
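The flags.phase bit declared above drives completion detection. A minimal sketch of the intended use, with toy_ names standing in for the real polling logic and consistent with the initialization in nvme_pcie_qpair_reset() further down:

#include <stdbool.h>
#include <stdint.h>

/* A completion entry is new only while its phase bit matches the expected phase. */
static inline bool
toy_cpl_is_new(uint8_t cpl_phase_bit, uint8_t expected_phase)
{
        return cpl_phase_bit == expected_phase;
}

/* Advance the CQ head; the expected phase flips each time the queue wraps. */
static inline void
toy_advance_cq_head(uint16_t *cq_head, uint8_t *expected_phase, uint16_t num_entries)
{
        if (++(*cq_head) == num_entries) {
                *cq_head = 0;
                *expected_phase ^= 1;
        }
}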
+ */ + + bool sq_in_cmb; + + uint64_t cmd_bus_addr; + uint64_t cpl_bus_addr; + + struct spdk_nvme_cmd *sq_vaddr; + struct spdk_nvme_cpl *cq_vaddr; +}; + +static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, + struct spdk_pci_addr *pci_addr); +static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair, + const struct spdk_nvme_io_qpair_opts *opts); +static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair); + +__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL; +static uint16_t g_signal_lock; +static bool g_sigset = false; + +static void +nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx) +{ + void *map_address; + uint16_t flag = 0; + + if (!__atomic_compare_exchange_n(&g_signal_lock, &flag, 1, false, __ATOMIC_ACQUIRE, + __ATOMIC_RELAXED)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "request g_signal_lock failed\n"); + return; + } + + assert(g_thread_mmio_ctrlr != NULL); + + if (!g_thread_mmio_ctrlr->is_remapped) { + map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (map_address == MAP_FAILED) { + SPDK_ERRLOG("mmap failed\n"); + __atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE); + return; + } + memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers)); + g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address; + g_thread_mmio_ctrlr->is_remapped = true; + } + __atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE); +} + +static void +nvme_pcie_ctrlr_setup_signal(void) +{ + struct sigaction sa; + + sa.sa_sigaction = nvme_sigbus_fault_sighandler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + sigaction(SIGBUS, &sa, NULL); +} + +static inline struct nvme_pcie_ctrlr * +nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE); + return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr); +} + +static int +_nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx) +{ + struct spdk_nvme_ctrlr *ctrlr, *tmp; + struct spdk_uevent event; + struct spdk_pci_addr pci_addr; + + if (g_spdk_nvme_driver->hotplug_fd < 0) { + return 0; + } + + while (nvme_get_uevent(g_spdk_nvme_driver->hotplug_fd, &event) > 0) { + if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO || + event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) { + if (event.action == SPDK_NVME_UEVENT_ADD) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n", + event.traddr); + if (spdk_process_is_primary()) { + if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) { + nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr); + } + } + } else if (event.action == SPDK_NVME_UEVENT_REMOVE) { + struct spdk_nvme_transport_id trid; + + memset(&trid, 0, sizeof(trid)); + spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE); + snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr); + + ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid); + if (ctrlr == NULL) { + return 0; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n", + event.traddr); + + nvme_ctrlr_fail(ctrlr, true); + + /* get the user app to clean up and stop I/O */ + if (ctrlr->remove_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + } + + /* Initiate removal of physically hotremoved PCI controllers. Even after + * they're hotremoved from the system, SPDK might still report them via RPC. 
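For context, the remove_cb invoked by the hotplug monitor above is the hot-remove callback an application registers through the public probe API. A hedged usage sketch — the app_* names are hypothetical, and the callback signatures are those declared in spdk/nvme.h:

#include "spdk/nvme.h"

static bool
app_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
             struct spdk_nvme_ctrlr_opts *opts)
{
        return true;    /* attach to every controller found */
}

static void
app_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
              struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
        /* remember ctrlr, allocate I/O qpairs, start submitting */
}

static void
app_remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
{
        /* the device was hot-removed: quiesce I/O, then detach ctrlr */
}

/* During application start-up (error handling omitted):
 *     spdk_nvme_probe(NULL, NULL, app_probe_cb, app_attach_cb, app_remove_cb);
 */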
+ */ + TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) { + bool do_remove = false; + struct nvme_pcie_ctrlr *pctrlr; + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + continue; + } + + pctrlr = nvme_pcie_ctrlr(ctrlr); + if (spdk_pci_device_is_removed(pctrlr->devhandle)) { + do_remove = true; + } + + if (do_remove) { + nvme_ctrlr_fail(ctrlr, true); + if (ctrlr->remove_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + return 0; +} + +static inline struct nvme_pcie_qpair * +nvme_pcie_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE); + return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair); +} + +static volatile void * +nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + return (volatile void *)((uintptr_t)pctrlr->regs + offset); +} + +static int +nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 4); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value); + g_thread_mmio_ctrlr = NULL; + return 0; +} + +static int +nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 8); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value); + g_thread_mmio_ctrlr = NULL; + return 0; +} + +static int +nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 4); + assert(value != NULL); + g_thread_mmio_ctrlr = pctrlr; + *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset)); + g_thread_mmio_ctrlr = NULL; + if (~(*value) == 0) { + return -1; + } + + return 0; +} + +static int +nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 8); + assert(value != NULL); + g_thread_mmio_ctrlr = pctrlr; + *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset)); + g_thread_mmio_ctrlr = NULL; + if (~(*value) == 0) { + return -1; + } + + return 0; +} + +static int +nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) +{ + return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq), + value); +} + +static int +nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) +{ + return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq), + value); +} + +static int +nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa) +{ + return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw), + aqa->raw); +} + +static int +nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc) +{ + return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw), + &cmbloc->raw); +} + +static int 
+nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz) +{ + return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw), + &cmbsz->raw); +} + +static uint32_t +nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* + * For commands requiring more than 2 PRP entries, one PRP will be + * embedded in the command (prp1), and the rest of the PRP entries + * will be in a list pointed to by the command (prp2). This means + * that real max number of PRP entries we support is 506+1, which + * results in a max xfer size of 506*ctrlr->page_size. + */ + return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size; +} + +static uint16_t +nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + return NVME_MAX_SGL_DESCRIPTORS; +} + +static void +nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc; + void *addr = NULL; + uint32_t bir; + union spdk_nvme_cmbsz_register cmbsz; + union spdk_nvme_cmbloc_register cmbloc; + uint64_t size, unit_size, offset, bar_size = 0, bar_phys_addr = 0; + + if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || + nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get registers failed\n"); + goto exit; + } + + if (!cmbsz.bits.sz) { + goto exit; + } + + bir = cmbloc.bits.bir; + /* Values 0 2 3 4 5 are valid for BAR */ + if (bir > 5 || bir == 1) { + goto exit; + } + + /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */ + unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu); + /* controller memory buffer size in Bytes */ + size = unit_size * cmbsz.bits.sz; + /* controller memory buffer offset from BAR in Bytes */ + offset = unit_size * cmbloc.bits.ofst; + + rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr, + &bar_phys_addr, &bar_size); + if ((rc != 0) || addr == NULL) { + goto exit; + } + + if (offset > bar_size) { + goto exit; + } + + if (size > bar_size - offset) { + goto exit; + } + + pctrlr->cmb.bar_va = addr; + pctrlr->cmb.bar_pa = bar_phys_addr; + pctrlr->cmb.size = size; + pctrlr->cmb.current_offset = offset; + + if (!cmbsz.bits.sqs) { + pctrlr->ctrlr.opts.use_cmb_sqs = false; + } + + return; +exit: + pctrlr->ctrlr.opts.use_cmb_sqs = false; + return; +} + +static int +nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc = 0; + union spdk_nvme_cmbloc_register cmbloc; + void *addr = pctrlr->cmb.bar_va; + + if (addr) { + if (pctrlr->cmb.mem_register_addr) { + spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size); + } + + if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get_cmbloc() failed\n"); + return -EIO; + } + rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr); + } + return rc; +} + +static int +nvme_pcie_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + if (pctrlr->cmb.bar_va == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); + return -ENOTSUP; + } + + if (ctrlr->opts.use_cmb_sqs) { + SPDK_ERRLOG("CMB is already in use for submission queues.\n"); + return -ENOTSUP; + } + + return 0; +} + +static void * +nvme_pcie_ctrlr_map_io_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + union spdk_nvme_cmbsz_register cmbsz; + union spdk_nvme_cmbloc_register cmbloc; + uint64_t mem_register_start, mem_register_end; + int rc; + + if (pctrlr->cmb.mem_register_addr != NULL) { + *size = pctrlr->cmb.mem_register_size; + return 
pctrlr->cmb.mem_register_addr; + } + + *size = 0; + + if (pctrlr->cmb.bar_va == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); + return NULL; + } + + if (ctrlr->opts.use_cmb_sqs) { + SPDK_ERRLOG("CMB is already in use for submission queues.\n"); + return NULL; + } + + if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || + nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get registers failed\n"); + return NULL; + } + + /* If only SQS is supported */ + if (!(cmbsz.bits.wds || cmbsz.bits.rds)) { + return NULL; + } + + /* If CMB is less than 4MiB in size then abort CMB mapping */ + if (pctrlr->cmb.size < (1ULL << 22)) { + return NULL; + } + + mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset + + VALUE_2MB - 1); + mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset + + pctrlr->cmb.size); + pctrlr->cmb.mem_register_addr = (void *)mem_register_start; + pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start; + + rc = spdk_mem_register((void *)mem_register_start, mem_register_end - mem_register_start); + if (rc) { + SPDK_ERRLOG("spdk_mem_register() failed\n"); + return NULL; + } + + pctrlr->cmb.mem_register_addr = (void *)mem_register_start; + pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start; + + *size = pctrlr->cmb.mem_register_size; + return pctrlr->cmb.mem_register_addr; +} + +static int +nvme_pcie_ctrlr_unmap_io_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + int rc; + + if (pctrlr->cmb.mem_register_addr == NULL) { + return 0; + } + + rc = spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size); + + if (rc == 0) { + pctrlr->cmb.mem_register_addr = NULL; + pctrlr->cmb.mem_register_size = 0; + } + + return rc; +} + +static int +nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc; + void *addr = NULL; + uint64_t phys_addr = 0, size = 0; + + rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr, + &phys_addr, &size); + + if ((addr == NULL) || (rc != 0)) { + SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n", + rc, addr); + return -1; + } + + pctrlr->regs = (volatile struct spdk_nvme_registers *)addr; + pctrlr->regs_size = size; + nvme_pcie_ctrlr_map_cmb(pctrlr); + + return 0; +} + +static int +nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc = 0; + void *addr = (void *)pctrlr->regs; + + if (pctrlr->ctrlr.is_removed) { + return rc; + } + + rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr); + if (rc != 0) { + SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc); + return -1; + } + + if (addr) { + /* NOTE: addr may have been remapped here. We're relying on DPDK to call + * munmap internally. 
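The 2 MiB rounding performed in nvme_pcie_ctrlr_map_io_cmb() above can be summarized by a small stand-alone helper; TOY_VALUE_2MB/TOY_2MB_FLOOR stand in for the VALUE_2MB/_2MB_PAGE macros, and spdk_mem_register() is assumed to accept only 2 MiB-aligned, 2 MiB-granular ranges:

#include <stddef.h>
#include <stdint.h>

#define TOY_VALUE_2MB           (1ULL << 21)
#define TOY_2MB_FLOOR(x)        ((x) & ~(TOY_VALUE_2MB - 1))

/* Shrink [bar_va + offset, bar_va + offset + size) inward to whole 2 MiB pages:
 * round the start up and the end down, and register only that window. The
 * caller above rejects CMBs smaller than 4 MiB, so the window is never empty. */
static inline void
toy_cmb_io_window(uintptr_t bar_va, uint64_t offset, uint64_t size,
                  uintptr_t *start, size_t *len)
{
        uintptr_t end;

        *start = TOY_2MB_FLOOR(bar_va + offset + TOY_VALUE_2MB - 1);
        end = TOY_2MB_FLOOR(bar_va + offset + size);
        *len = end - *start;
}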
+ */ + rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr); + } + return rc; +} + +static int +nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries) +{ + struct nvme_pcie_qpair *pqpair; + int rc; + + pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair == NULL) { + return -ENOMEM; + } + + pqpair->num_entries = num_entries; + pqpair->flags.delay_cmd_submit = 0; + + ctrlr->adminq = &pqpair->qpair; + + rc = nvme_qpair_init(ctrlr->adminq, + 0, /* qpair ID */ + ctrlr, + SPDK_NVME_QPRIO_URGENT, + num_entries); + if (rc != 0) { + return rc; + } + + return nvme_pcie_qpair_construct(ctrlr->adminq, NULL); +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +static int +pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) +{ + struct spdk_nvme_transport_id trid = {}; + struct nvme_pcie_enum_ctx *enum_ctx = ctx; + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_pci_addr pci_addr; + + pci_addr = spdk_pci_device_get_addr(pci_dev); + + spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE); + spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr); + + ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid); + if (!spdk_process_is_primary()) { + if (!ctrlr) { + SPDK_ERRLOG("Controller must be constructed in the primary process first.\n"); + return -1; + } + + return nvme_ctrlr_add_process(ctrlr, pci_dev); + } + + /* check whether user passes the pci_addr */ + if (enum_ctx->has_pci_addr && + (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) { + return 1; + } + + return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev); +} + +static int +nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + struct nvme_pcie_enum_ctx enum_ctx = {}; + + enum_ctx.probe_ctx = probe_ctx; + + if (strlen(probe_ctx->trid.traddr) != 0) { + if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) { + return -1; + } + enum_ctx.has_pci_addr = true; + } + + /* Only the primary process can monitor hotplug. 
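The traddr filter in pcie_nvme_enum_cb() above reduces to a parse-and-compare of PCI "domain:bus:dev.func" strings. A small sketch using the same env helpers (the toy_ name is illustrative):

#include <stdbool.h>
#include "spdk/env.h"

static bool
toy_traddr_matches(const char *traddr, struct spdk_pci_device *pci_dev)
{
        struct spdk_pci_addr want, have;

        if (spdk_pci_addr_parse(&want, traddr) != 0) {
                return false;           /* not a valid BDF string */
        }
        have = spdk_pci_device_get_addr(pci_dev);
        return spdk_pci_addr_compare(&want, &have) == 0;
}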
*/ + if (spdk_process_is_primary()) { + _nvme_pcie_hotplug_monitor(probe_ctx); + } + + if (enum_ctx.has_pci_addr == false) { + return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), + pcie_nvme_enum_cb, &enum_ctx); + } else { + return spdk_pci_device_attach(spdk_pci_nvme_get_driver(), + pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr); + } +} + +static int +nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr) +{ + struct nvme_pcie_enum_ctx enum_ctx; + + enum_ctx.probe_ctx = probe_ctx; + enum_ctx.has_pci_addr = true; + enum_ctx.pci_addr = *pci_addr; + + return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx); +} + +static struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + struct spdk_pci_device *pci_dev = devhandle; + struct nvme_pcie_ctrlr *pctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + uint16_t cmd_reg; + int rc; + struct spdk_pci_id pci_id; + + rc = spdk_pci_device_claim(pci_dev); + if (rc < 0) { + SPDK_ERRLOG("could not claim device %s (%s)\n", + trid->traddr, spdk_strerror(-rc)); + return NULL; + } + + pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pctrlr == NULL) { + spdk_pci_device_unclaim(pci_dev); + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + pctrlr->is_remapped = false; + pctrlr->ctrlr.is_removed = false; + pctrlr->devhandle = devhandle; + pctrlr->ctrlr.opts = *opts; + pctrlr->ctrlr.trid = *trid; + + rc = nvme_ctrlr_construct(&pctrlr->ctrlr); + if (rc != 0) { + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + rc = nvme_pcie_ctrlr_allocate_bars(pctrlr); + if (rc != 0) { + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + /* Enable PCI busmaster and disable INTx */ + spdk_pci_device_cfg_read16(pci_dev, &cmd_reg, 4); + cmd_reg |= 0x404; + spdk_pci_device_cfg_write16(pci_dev, cmd_reg, 4); + + if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs); + + /* Doorbell stride is 2 ^ (dstrd + 2), + * but we want multiples of 4, so drop the + 2 */ + pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd; + + pci_id = spdk_pci_device_get_id(pci_dev); + pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id); + + rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr, pctrlr->ctrlr.opts.admin_queue_size); + if (rc != 0) { + nvme_ctrlr_destruct(&pctrlr->ctrlr); + return NULL; + } + + /* Construct the primary process properties */ + rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev); + if (rc != 0) { + nvme_ctrlr_destruct(&pctrlr->ctrlr); + return NULL; + } + + if (g_sigset != true) { + nvme_pcie_ctrlr_setup_signal(); + g_sigset = true; + } + + return &pctrlr->ctrlr; +} + +static int +nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq); + union spdk_nvme_aqa_register aqa; + + if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) { + SPDK_ERRLOG("set_asq() failed\n"); + return -EIO; + } + + if 
(nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) { + SPDK_ERRLOG("set_acq() failed\n"); + return -EIO; + } + + aqa.raw = 0; + /* acqs and asqs are 0-based. */ + aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; + aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; + + if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) { + SPDK_ERRLOG("set_aqa() failed\n"); + return -EIO; + } + + return 0; +} + +static int +nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr); + + if (ctrlr->adminq) { + nvme_pcie_qpair_destroy(ctrlr->adminq); + } + + nvme_ctrlr_destruct_finish(ctrlr); + + nvme_ctrlr_free_processes(ctrlr); + + nvme_pcie_ctrlr_free_bars(pctrlr); + + if (devhandle) { + spdk_pci_device_unclaim(devhandle); + spdk_pci_device_detach(devhandle); + } + + spdk_free(pctrlr); + + return 0; +} + +static void +nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr) +{ + tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp); + tr->cid = cid; + tr->req = NULL; +} + +static int +nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + uint32_t i; + + /* all head/tail vals are set to 0 */ + pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0; + + /* + * First time through the completion queue, HW will set phase + * bit on completions to 1. So set this to 1 here, indicating + * we're looking for a 1 to know which entries have completed. + * we'll toggle the bit each time when the completion queue + * rolls over. + */ + pqpair->flags.phase = 1; + for (i = 0; i < pqpair->num_entries; i++) { + pqpair->cpl[i].status.p = 0; + } + + return 0; +} + +static void * +nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t size, uint64_t alignment, + uint64_t *phys_addr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + uintptr_t addr; + + if (pctrlr->cmb.mem_register_addr != NULL) { + /* BAR is mapped for data */ + return NULL; + } + + addr = (uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset; + addr = (addr + (alignment - 1)) & ~(alignment - 1); + + /* CMB may only consume part of the BAR, calculate accordingly */ + if (addr + size > ((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.size)) { + SPDK_ERRLOG("Tried to allocate past valid CMB range!\n"); + return NULL; + } + *phys_addr = pctrlr->cmb.bar_pa + addr - (uintptr_t)pctrlr->cmb.bar_va; + + pctrlr->cmb.current_offset = (addr + size) - (uintptr_t)pctrlr->cmb.bar_va; + + return (void *)addr; +} + +static int +nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + uint16_t i; + volatile uint32_t *doorbell_base; + uint16_t num_trackers; + size_t page_align = sysconf(_SC_PAGESIZE); + size_t queue_align, queue_len; + uint32_t flags = SPDK_MALLOC_DMA; + uint64_t sq_paddr = 0; + uint64_t cq_paddr = 0; + + if (opts) { + pqpair->sq_vaddr = opts->sq.vaddr; + pqpair->cq_vaddr = opts->cq.vaddr; + sq_paddr = opts->sq.paddr; + cq_paddr = opts->cq.paddr; + } + + pqpair->retry_count = ctrlr->opts.transport_retry_count; + + /* + * Limit the maximum number of completions to return per call to prevent wraparound, + 
* and calculate how many trackers can be submitted at once without overflowing the + * completion queue. + */ + pqpair->max_completions_cap = pqpair->num_entries / 4; + pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS); + pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS); + num_trackers = pqpair->num_entries - pqpair->max_completions_cap; + + SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n", + pqpair->max_completions_cap, num_trackers); + + assert(num_trackers != 0); + + pqpair->sq_in_cmb = false; + + if (nvme_qpair_is_admin_queue(&pqpair->qpair)) { + flags |= SPDK_MALLOC_SHARE; + } + + /* cmd and cpl rings must be aligned on page size boundaries. */ + if (ctrlr->opts.use_cmb_sqs) { + pqpair->cmd = nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd), + page_align, &pqpair->cmd_bus_addr); + if (pqpair->cmd != NULL) { + pqpair->sq_in_cmb = true; + } + } + + if (pqpair->sq_in_cmb == false) { + if (pqpair->sq_vaddr) { + pqpair->cmd = pqpair->sq_vaddr; + } else { + /* To ensure physical address contiguity we make each ring occupy + * a single hugepage only. See MAX_IO_QUEUE_ENTRIES. + */ + queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cmd); + queue_align = spdk_max(spdk_align32pow2(queue_len), page_align); + pqpair->cmd = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags); + if (pqpair->cmd == NULL) { + SPDK_ERRLOG("alloc qpair_cmd failed\n"); + return -ENOMEM; + } + } + if (sq_paddr) { + assert(pqpair->sq_vaddr != NULL); + pqpair->cmd_bus_addr = sq_paddr; + } else { + pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL); + if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) { + SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n"); + return -EFAULT; + } + } + } + + if (pqpair->cq_vaddr) { + pqpair->cpl = pqpair->cq_vaddr; + } else { + queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cpl); + queue_align = spdk_max(spdk_align32pow2(queue_len), page_align); + pqpair->cpl = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags); + if (pqpair->cpl == NULL) { + SPDK_ERRLOG("alloc qpair_cpl failed\n"); + return -ENOMEM; + } + } + if (cq_paddr) { + assert(pqpair->cq_vaddr != NULL); + pqpair->cpl_bus_addr = cq_paddr; + } else { + pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL); + if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) { + SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n"); + return -EFAULT; + } + } + + doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl; + pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32; + pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32; + + /* + * Reserve space for all of the trackers in a single allocation. + * struct nvme_tracker must be padded so that its size is already a power of 2. + * This ensures the PRP list embedded in the nvme_tracker object will not span a + * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing. 
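A worked example of the sizing computed above, using an illustrative io_queue_size of 256 and the NVME_MIN_COMPLETIONS/NVME_MAX_COMPLETIONS bounds defined at the top of this file:

#include <stdint.h>

static inline uint16_t
toy_num_trackers(uint16_t num_entries)
{
        uint16_t cap = num_entries / 4;         /* reserve a quarter of the entries as the per-poll budget */

        if (cap < 1) {                          /* NVME_MIN_COMPLETIONS */
                cap = 1;
        }
        if (cap > 128) {                        /* NVME_MAX_COMPLETIONS */
                cap = 128;
        }
        return num_entries - cap;               /* e.g. 256 - 64 = 192 trackers, so at most
                                                 * 192 commands outstanding on this qpair */
}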
+ */ + pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair->tr == NULL) { + SPDK_ERRLOG("nvme_tr failed\n"); + return -ENOMEM; + } + + TAILQ_INIT(&pqpair->free_tr); + TAILQ_INIT(&pqpair->outstanding_tr); + + for (i = 0; i < num_trackers; i++) { + tr = &pqpair->tr[i]; + nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL)); + TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); + } + + nvme_pcie_qpair_reset(qpair); + + return 0; +} + +/* Used when dst points to MMIO (i.e. CMB) in a virtual machine - in these cases we must + * not use wide instructions because QEMU will not emulate such instructions to MMIO space. + * So this function ensures we only copy 8 bytes at a time. + */ +static inline void +nvme_pcie_copy_command_mmio(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) +{ + uint64_t *dst64 = (uint64_t *)dst; + const uint64_t *src64 = (const uint64_t *)src; + uint32_t i; + + for (i = 0; i < sizeof(*dst) / 8; i++) { + dst64[i] = src64[i]; + } +} + +static inline void +nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) +{ + /* dst and src are known to be non-overlapping and 64-byte aligned. */ +#if defined(__SSE2__) + __m128i *d128 = (__m128i *)dst; + const __m128i *s128 = (const __m128i *)src; + + _mm_stream_si128(&d128[0], _mm_load_si128(&s128[0])); + _mm_stream_si128(&d128[1], _mm_load_si128(&s128[1])); + _mm_stream_si128(&d128[2], _mm_load_si128(&s128[2])); + _mm_stream_si128(&d128[3], _mm_load_si128(&s128[3])); +#else + *dst = *src; +#endif +} + +/** + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_request *active_req = req; + struct spdk_nvme_ctrlr_process *active_proc; + + /* + * The admin request is from another process. Move to the per + * process list for that process to handle it later. + */ + assert(nvme_qpair_is_admin_queue(qpair)); + assert(active_req->pid != getpid()); + + active_proc = nvme_ctrlr_get_process(ctrlr, active_req->pid); + if (active_proc) { + /* Save the original completion information */ + memcpy(&active_req->cpl, cpl, sizeof(*cpl)); + STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq); + } else { + SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n", + active_req->pid); + + nvme_free_request(active_req); + } +} + +/** + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_request *req, *tmp_req; + pid_t pid = getpid(); + struct spdk_nvme_ctrlr_process *proc; + + /* + * Check whether there is any pending admin request from + * other active processes. 
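A note on the SSE2 fast path in nvme_pcie_copy_command() above: _mm_stream_si128() issues weakly ordered non-temporal stores, so a store fence must separate writing the submission queue entry from ringing the doorbell; the spdk_wmb() in nvme_pcie_qpair_ring_sq_doorbell() below serves that purpose. A stand-alone sketch of the pairing:

#include <emmintrin.h>

/* Copy one 64-byte SQ entry with non-temporal stores, then fence so the entry
 * is globally visible before any subsequent doorbell MMIO write. */
static inline void
toy_publish_sq_entry(__m128i *dst, const __m128i *src)
{
        _mm_stream_si128(&dst[0], _mm_load_si128(&src[0]));
        _mm_stream_si128(&dst[1], _mm_load_si128(&src[1]));
        _mm_stream_si128(&dst[2], _mm_load_si128(&src[2]));
        _mm_stream_si128(&dst[3], _mm_load_si128(&src[3]));
        _mm_sfence();
}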
+ */ + assert(nvme_qpair_is_admin_queue(qpair)); + + proc = nvme_ctrlr_get_current_process(ctrlr); + if (!proc) { + SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid); + assert(proc); + return; + } + + STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { + STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); + + assert(req->pid == pid); + + nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl); + nvme_free_request(req); + } +} + +static inline int +nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) +{ + return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old); +} + +static bool +nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value, + volatile uint32_t *shadow_db, + volatile uint32_t *eventidx) +{ + uint16_t old; + + if (!shadow_db) { + return true; + } + + old = *shadow_db; + *shadow_db = value; + + /* + * Ensure that the doorbell is updated before reading the EventIdx from + * memory + */ + spdk_mb(); + + if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) { + return false; + } + + return true; +} + +static inline void +nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); + bool need_mmio = true; + + if (qpair->first_fused_submitted) { + /* This is first cmd of two fused commands - don't ring doorbell */ + qpair->first_fused_submitted = 0; + return; + } + + if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { + need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, + pqpair->sq_tail, + pqpair->shadow_doorbell.sq_tdbl, + pqpair->shadow_doorbell.sq_eventidx); + } + + if (spdk_likely(need_mmio)) { + spdk_wmb(); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail); + g_thread_mmio_ctrlr = NULL; + } +} + +static inline void +nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); + bool need_mmio = true; + + if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { + need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, + pqpair->cq_head, + pqpair->shadow_doorbell.cq_hdbl, + pqpair->shadow_doorbell.cq_eventidx); + } + + if (spdk_likely(need_mmio)) { + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head); + g_thread_mmio_ctrlr = NULL; + } +} + +static void +nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) +{ + struct nvme_request *req; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + req = tr->req; + assert(req != NULL); + + if (req->cmd.fuse == SPDK_NVME_IO_FLAGS_FUSE_FIRST) { + /* This is first cmd of two fused commands - don't ring doorbell */ + qpair->first_fused_submitted = 1; + } + + /* Don't use wide instructions to copy NVMe command, this is limited by QEMU + * virtual NVMe controller, the maximum access width is 8 Bytes for one time. + */ + if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) { + nvme_pcie_copy_command_mmio(&pqpair->cmd[pqpair->sq_tail], &req->cmd); + } else { + /* Copy the command from the tracker to the submission queue. 
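The unsigned 16-bit arithmetic in nvme_pcie_qpair_need_event() above decides whether a shadow-doorbell update must also be written to MMIO: the doorbell is rung only when the event index lies in the half-open window (old, new]. A few worked cases using a toy_ copy of the same expression:

#include <assert.h>
#include <stdint.h>

static inline int
toy_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
{
        return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
}

static void
toy_need_event_cases(void)
{
        assert(toy_need_event(5, 6, 4));        /* 5 lies in (4, 6]           -> MMIO write needed  */
        assert(!toy_need_event(9, 6, 4));       /* 9 lies outside (4, 6]      -> shadow update only */
        assert(toy_need_event(0, 2, 65535));    /* wrap: 0 lies in (65535, 2] -> MMIO write needed  */
}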
*/ + nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd); + } + + if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) { + pqpair->sq_tail = 0; + } + + if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) { + SPDK_ERRLOG("sq_tail is passing sq_head!\n"); + } + + if (!pqpair->flags.delay_cmd_submit) { + nvme_pcie_qpair_ring_sq_doorbell(qpair); + } +} + +static void +nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, + struct spdk_nvme_cpl *cpl, bool print_on_error) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_request *req; + bool retry, error; + bool req_from_current_proc = true; + + req = tr->req; + + assert(req != NULL); + + error = spdk_nvme_cpl_is_error(cpl); + retry = error && nvme_completion_is_retry(cpl) && + req->retries < pqpair->retry_count; + + if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) { + spdk_nvme_qpair_print_command(qpair, &req->cmd); + spdk_nvme_qpair_print_completion(qpair, cpl); + } + + assert(cpl->cid == req->cmd.cid); + + if (retry) { + req->retries++; + nvme_pcie_qpair_submit_tracker(qpair, tr); + } else { + TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list); + + /* Only check admin requests from different processes. */ + if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) { + req_from_current_proc = false; + nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl); + } else { + nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl); + } + + if (req_from_current_proc == true) { + nvme_qpair_free_request(qpair, req); + } + + tr->req = NULL; + + TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); + } +} + +static void +nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair, + struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, + bool print_on_error) +{ + struct spdk_nvme_cpl cpl; + + memset(&cpl, 0, sizeof(cpl)); + cpl.sqid = qpair->id; + cpl.cid = tr->cid; + cpl.status.sct = sct; + cpl.status.sc = sc; + cpl.status.dnr = dnr; + nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); +} + +static void +nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr, *temp, *last; + + last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head); + + /* Abort previously submitted (outstanding) trs */ + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) { + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting outstanding command\n"); + } + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); + + if (tr == last) { + break; + } + } +} + +static int +nvme_pcie_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) { + assert(tr->req != NULL); + + rc = iter_fn(tr->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static void +nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + + tr = TAILQ_FIRST(&pqpair->outstanding_tr); + while (tr != NULL) { + assert(tr->req != NULL); + if (tr->req->cmd.opc == 
SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, + SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0, + false); + tr = TAILQ_FIRST(&pqpair->outstanding_tr); + } else { + tr = TAILQ_NEXT(tr, tq_list); + } + } +} + +static void +nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + nvme_pcie_admin_qpair_abort_aers(qpair); +} + +static int +nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_pcie_admin_qpair_destroy(qpair); + } + /* + * We check sq_vaddr and cq_vaddr to see if the user specified the memory + * buffers when creating the I/O queue. + * If the user specified them, we cannot free that memory. + * Nor do we free it if it's in the CMB. + */ + if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) { + spdk_free(pqpair->cmd); + } + if (!pqpair->cq_vaddr && pqpair->cpl) { + spdk_free(pqpair->cpl); + } + if (pqpair->tr) { + spdk_free(pqpair->tr); + } + + nvme_qpair_deinit(qpair); + + spdk_free(pqpair); + + return 0; +} + +static void +nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + nvme_pcie_qpair_abort_trackers(qpair, dnr); +} + +static int +nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ; + + cmd->cdw10_bits.create_io_q.qid = io_que->id; + cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1; + + cmd->cdw11_bits.create_io_cq.pc = 1; + cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ; + + cmd->cdw10_bits.create_io_q.qid = io_que->id; + cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1; + cmd->cdw11_bits.create_io_sq.pc = 1; + cmd->cdw11_bits.create_io_sq.qprio = io_que->qprio; + cmd->cdw11_bits.create_io_sq.cqid = io_que->id; + cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ; + cmd->cdw10_bits.delete_io_q.qid = qpair->id; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) 
{ + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ; + cmd->cdw10_bits.delete_io_q.qid = qpair->id; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + uint16_t qid) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_completion_poll_status *status; + int rc; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + SPDK_ERRLOG("nvme_create_io_cq failed!\n"); + if (!status->timed_out) { + free(status); + } + return -1; + } + + memset(status, 0, sizeof(*status)); + rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + SPDK_ERRLOG("nvme_create_io_sq failed!\n"); + if (status->timed_out) { + /* Request is still queued, the memory will be freed in a completion callback. + allocate a new request */ + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + } + + memset(status, 0, sizeof(*status)); + /* Attempt to delete the completion queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + /* The originall or newly allocated status structure can be freed since + * the corresponding request has been completed of failed to submit */ + free(status); + return -1; + } + nvme_wait_for_completion(ctrlr->adminq, status); + if (!status->timed_out) { + /* status can be freed regardless of nvme_wait_for_completion return value */ + free(status); + } + return -1; + } + + if (ctrlr->shadow_doorbell) { + pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) * + pctrlr->doorbell_stride_u32; + pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) * + pctrlr->doorbell_stride_u32; + pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) * + pctrlr->doorbell_stride_u32; + pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) * + pctrlr->doorbell_stride_u32; + pqpair->flags.has_shadow_doorbell = 1; + } else { + pqpair->flags.has_shadow_doorbell = 0; + } + nvme_pcie_qpair_reset(qpair); + free(status); + + return 0; +} + +static struct spdk_nvme_qpair * +nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct nvme_pcie_qpair *pqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + assert(ctrlr != NULL); + + pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair == NULL) { + return NULL; + } + + pqpair->num_entries = opts->io_queue_size; + pqpair->flags.delay_cmd_submit = opts->delay_cmd_submit; + + qpair = &pqpair->qpair; + + rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests); + if (rc != 0) { + nvme_pcie_qpair_destroy(qpair); + return NULL; + } + + rc = nvme_pcie_qpair_construct(qpair, opts); + + if (rc != 0) { + nvme_pcie_qpair_destroy(qpair); + return NULL; + } + + return qpair; 
+} + +static int +nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + if (nvme_qpair_is_admin_queue(qpair)) { + return 0; + } else { + return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id); + } +} + +static void +nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ +} + +static int +nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_completion_poll_status *status; + int rc; + + assert(ctrlr != NULL); + + if (ctrlr->is_removed) { + goto free; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + /* Delete the I/O submission queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc); + free(status); + return rc; + } + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + + memset(status, 0, sizeof(*status)); + /* Delete the completion queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc); + free(status); + return rc; + } + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + free(status); + +free: + if (qpair->no_deletion_notification_needed == 0) { + /* Abort the rest of the I/O */ + nvme_pcie_qpair_abort_trackers(qpair, 1); + } + + nvme_pcie_qpair_destroy(qpair); + return 0; +} + +static void +nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) +{ + /* + * Bad vtophys translation, so abort this request and return + * immediately. + */ + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_INVALID_FIELD, + 1 /* do not retry */, true); +} + +/* + * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes. + * + * *prp_index will be updated to account for the number of PRP entries used. + */ +static inline int +nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len, + uint32_t page_size) +{ + struct spdk_nvme_cmd *cmd = &tr->req->cmd; + uintptr_t page_mask = page_size - 1; + uint64_t phys_addr; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n", + *prp_index, virt_addr, (uint32_t)len); + + if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + return -EFAULT; + } + + i = *prp_index; + while (len) { + uint32_t seg_len; + + /* + * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array, + * so prp_index == count is valid. 
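A worked example of the accounting this loop performs: prp1 covers from the (possibly unaligned) start of the buffer to the end of its page, and each further page consumes one PRP list entry. It assumes a 4 KiB page size; the helper is a stand-alone illustration, not part of the driver:

#include <stddef.h>
#include <stdint.h>

static inline uint32_t
toy_num_prp_entries(uintptr_t virt_addr, size_t len, uint32_t page_size)
{
        size_t first = page_size - (virt_addr & (page_size - 1));      /* bytes covered by prp1 */

        if (len <= first) {
                return 1;
        }
        /* one entry per remaining page, rounded up */
        return 1 + (uint32_t)((len - first + page_size - 1) / page_size);
}

/* e.g. a 16 KiB buffer that starts 512 bytes into a 4 KiB page needs
 * 1 + ceil((16384 - 3584) / 4096) = 1 + 4 = 5 PRP entries. */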
+ */ + if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) { + SPDK_ERRLOG("out of PRP entries\n"); + return -EFAULT; + } + + phys_addr = spdk_vtophys(virt_addr, NULL); + if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) { + SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr); + return -EFAULT; + } + + if (i == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr); + cmd->dptr.prp.prp1 = phys_addr; + seg_len = page_size - ((uintptr_t)virt_addr & page_mask); + } else { + if ((phys_addr & page_mask) != 0) { + SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr); + return -EFAULT; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr); + tr->u.prp[i - 1] = phys_addr; + seg_len = page_size; + } + + seg_len = spdk_min(seg_len, len); + virt_addr += seg_len; + len -= seg_len; + i++; + } + + cmd->psdt = SPDK_NVME_PSDT_PRP; + if (i <= 1) { + cmd->dptr.prp.prp2 = 0; + } else if (i == 2) { + cmd->dptr.prp.prp2 = tr->u.prp[0]; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2); + } else { + cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2); + } + + *prp_index = i; + return 0; +} + +static int +nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned) +{ + assert(0); + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EINVAL; +} + +/** + * Build PRP list describing physically contiguous payload buffer. + */ +static int +nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + uint32_t prp_index = 0; + int rc; + + rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset, + req->payload_size, qpair->ctrlr->page_size); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + } + + return rc; +} + +/** + * Build an SGL describing a physically contiguous payload buffer. + * + * This is more efficient than using PRP because large buffers can be + * described this way. 
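nvme_pcie_prp_list_append() above follows the standard NVMe PRP rules: PRP1 may point into the middle of a page, every further entry must be page aligned, and PRP2 is either unused, the address of the second page, or a pointer to a PRP list (the i <= 1, i == 2, and i > 2 cases). A standalone, non-SPDK illustration of how many entries a transfer needs and which PRP2 form applies:

#include <stdint.h>
#include <stdio.h>

/* Count the PRP entries needed for a buffer that starts page_offset bytes
 * into a page and is len bytes long. */
static uint32_t
prp_entries_needed(uint64_t page_offset, uint64_t len, uint32_t page_size)
{
	uint64_t first_seg = page_size - page_offset;

	if (len <= first_seg) {
		return 1;
	}
	len -= first_seg;
	return 1 + (uint32_t)((len + page_size - 1) / page_size);
}

int
main(void)
{
	const uint32_t page_size = 4096;
	uint64_t lens[] = { 512, 4096, 8192, 65536 };

	for (size_t i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
		uint32_t n = prp_entries_needed(0, lens[i], page_size);

		printf("len %6llu -> %2u PRP entr%s, PRP2 = %s\n",
		       (unsigned long long)lens[i], n, n == 1 ? "y" : "ies",
		       n == 1 ? "0" : (n == 2 ? "second page" : "PRP list pointer"));
	}
	return 0;
}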
+ */ +static int +nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + void *virt_addr; + uint64_t phys_addr, mapping_length; + uint32_t length; + struct spdk_nvme_sgl_descriptor *sgl; + uint32_t nseg = 0; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + sgl = tr->u.sgl; + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.subtype = 0; + + length = req->payload_size; + virt_addr = req->payload.contig_or_cb_arg + req->payload_offset; + mapping_length = length; + + while (length > 0) { + if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + if (dword_aligned && ((uintptr_t)virt_addr & 3)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + phys_addr = spdk_vtophys(virt_addr, &mapping_length); + if (phys_addr == SPDK_VTOPHYS_ERROR) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + mapping_length = spdk_min(length, mapping_length); + + length -= mapping_length; + virt_addr += mapping_length; + + sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + sgl->unkeyed.length = mapping_length; + sgl->address = phys_addr; + sgl->unkeyed.subtype = 0; + + sgl++; + nseg++; + } + + if (nseg == 1) { + /* + * The whole transfer can be described by a single SGL descriptor. + * Use the special case described by the spec where SGL1's type is Data Block. + * This means the SGL in the tracker is not used at all, so copy the first (and only) + * SGL element into SGL1. + */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; + req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; + } else { + /* SPDK NVMe driver supports only 1 SGL segment for now, it is enough because + * NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page. + */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; + req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); + } + + return 0; +} + +/** + * Build SGL list describing scattered payload buffer. + */ +static int +nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + int rc; + void *virt_addr; + uint64_t phys_addr; + uint32_t remaining_transfer_len, remaining_user_sge_len, length; + struct spdk_nvme_sgl_descriptor *sgl; + uint32_t nseg = 0; + + /* + * Build scattered payloads. 
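Both hardware-SGL builders finish with the same decision, shown here factored into a small sketch using the types and fields defined above: a single segment is inlined into SGL1 as a Data Block descriptor, while multiple segments turn SGL1 into a Last Segment descriptor pointing at the descriptor array in the tracker's DMA-able memory.

/* Sketch of the shared SGL1 tail logic from the builders above. */
static void
fill_sgl1_sketch(struct spdk_nvme_cmd *cmd, struct nvme_tracker *tr, uint32_t nseg)
{
	if (nseg == 1) {
		/* One descriptor: inline it directly into SGL1. */
		cmd->dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
		cmd->dptr.sgl1.address = tr->u.sgl[0].address;
		cmd->dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* Several descriptors: SGL1 points at the tracker's list. */
		cmd->dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		cmd->dptr.sgl1.address = tr->prp_sgl_bus_addr;
		cmd->dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
	}
}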
+ */ + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + sgl = tr->u.sgl; + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.subtype = 0; + + remaining_transfer_len = req->payload_size; + + while (remaining_transfer_len > 0) { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, + &virt_addr, &remaining_user_sge_len); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + /* Bit Bucket SGL descriptor */ + if ((uint64_t)virt_addr == UINT64_MAX) { + /* TODO: enable WRITE and COMPARE when necessary */ + if (req->cmd.opc != SPDK_NVME_OPC_READ) { + SPDK_ERRLOG("Only READ command can be supported\n"); + goto exit; + } + if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { + SPDK_ERRLOG("Too many SGL entries\n"); + goto exit; + } + + sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET; + /* If the SGL describes a destination data buffer, the length of data + * buffer shall be discarded by controller, and the length is included + * in Number of Logical Blocks (NLB) parameter. Otherwise, the length + * is not included in the NLB parameter. + */ + remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); + remaining_transfer_len -= remaining_user_sge_len; + + sgl->unkeyed.length = remaining_user_sge_len; + sgl->address = 0; + sgl->unkeyed.subtype = 0; + + sgl++; + nseg++; + + continue; + } + + remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); + remaining_transfer_len -= remaining_user_sge_len; + while (remaining_user_sge_len > 0) { + if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { + SPDK_ERRLOG("Too many SGL entries\n"); + goto exit; + } + + if (dword_aligned && ((uintptr_t)virt_addr & 3)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + goto exit; + } + + phys_addr = spdk_vtophys(virt_addr, NULL); + if (phys_addr == SPDK_VTOPHYS_ERROR) { + goto exit; + } + + length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr)); + remaining_user_sge_len -= length; + virt_addr += length; + + if (nseg > 0 && phys_addr == + (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) { + /* extend previous entry */ + (*(sgl - 1)).unkeyed.length += length; + continue; + } + + sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + sgl->unkeyed.length = length; + sgl->address = phys_addr; + sgl->unkeyed.subtype = 0; + + sgl++; + nseg++; + } + } + + if (nseg == 1) { + /* + * The whole transfer can be described by a single SGL descriptor. + * Use the special case described by the spec where SGL1's type is Data Block. + * This means the SGL in the tracker is not used at all, so copy the first (and only) + * SGL element into SGL1. + */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; + req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; + } else { + /* SPDK NVMe driver supports only 1 SGL segment for now, it is enough because + * NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page. 
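The "extend previous entry" branch above merges a new physical range into the previous descriptor whenever it starts exactly where the previous one ends, which keeps the descriptor count low when user SGEs happen to map to adjacent physical pages. A standalone toy version of that rule:

#include <stdint.h>
#include <stdio.h>

struct seg { uint64_t addr; uint32_t len; };

/* Append a physical range, merging it into the last descriptor when the two
 * ranges are physically contiguous. Returns the new descriptor count. */
static uint32_t
append_or_merge(struct seg *segs, uint32_t nseg, uint64_t addr, uint32_t len)
{
	if (nseg > 0 && addr == segs[nseg - 1].addr + segs[nseg - 1].len) {
		segs[nseg - 1].len += len;	/* contiguous: extend previous entry */
		return nseg;
	}
	segs[nseg].addr = addr;			/* otherwise start a new descriptor */
	segs[nseg].len = len;
	return nseg + 1;
}

int
main(void)
{
	struct seg segs[4];
	uint32_t n = 0;

	n = append_or_merge(segs, n, 0x100000, 0x1000);
	n = append_or_merge(segs, n, 0x101000, 0x1000);	/* merges with previous */
	n = append_or_merge(segs, n, 0x200000, 0x1000);	/* new descriptor */

	printf("descriptors: %u (first len 0x%x)\n", n, segs[0].len);
	return 0;
}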
+ */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; + req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); + } + + return 0; + +exit: + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; +} + +/** + * Build PRP list describing scattered payload buffer. + */ +static int +nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + int rc; + void *virt_addr; + uint32_t remaining_transfer_len, length; + uint32_t prp_index = 0; + uint32_t page_size = qpair->ctrlr->page_size; + + /* + * Build scattered payloads. + */ + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + remaining_transfer_len = req->payload_size; + while (remaining_transfer_len > 0) { + assert(req->payload.next_sge_fn != NULL); + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + length = spdk_min(remaining_transfer_len, length); + + /* + * Any incompatible sges should have been handled up in the splitting routine, + * but assert here as an additional check. + * + * All SGEs except last must end on a page boundary. + */ + assert((length == remaining_transfer_len) || + _is_page_aligned((uintptr_t)virt_addr + length, page_size)); + + rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return rc; + } + + remaining_transfer_len -= length; + } + + return 0; +} + +typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *, + bool); + +static build_req_fn const g_nvme_pcie_build_req_table[][2] = { + [NVME_PAYLOAD_TYPE_INVALID] = { + nvme_pcie_qpair_build_request_invalid, /* PRP */ + nvme_pcie_qpair_build_request_invalid /* SGL */ + }, + [NVME_PAYLOAD_TYPE_CONTIG] = { + nvme_pcie_qpair_build_contig_request, /* PRP */ + nvme_pcie_qpair_build_contig_hw_sgl_request /* SGL */ + }, + [NVME_PAYLOAD_TYPE_SGL] = { + nvme_pcie_qpair_build_prps_sgl_request, /* PRP */ + nvme_pcie_qpair_build_hw_sgl_request /* SGL */ + } +}; + +static int +nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, + bool sgl_supported, bool dword_aligned) +{ + void *md_payload; + struct nvme_request *req = tr->req; + + if (req->payload.md) { + md_payload = req->payload.md + req->md_offset; + if (dword_aligned && ((uintptr_t)md_payload & 3)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", md_payload); + goto exit; + } + + if (sgl_supported && dword_aligned) { + assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG); + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL; + tr->meta_sgl.address = spdk_vtophys(md_payload, NULL); + if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR) { + goto exit; + } + tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + tr->meta_sgl.unkeyed.length = req->md_size; + tr->meta_sgl.unkeyed.subtype = 0; + req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor); + } else { + req->cmd.mptr = spdk_vtophys(md_payload, NULL); + if (req->cmd.mptr == SPDK_VTOPHYS_ERROR) { + goto exit; + } + } + } + + return 0; + +exit: + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EINVAL; +} + +static int 
+nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + struct nvme_tracker *tr; + int rc = 0; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + enum nvme_payload_type payload_type; + bool sgl_supported; + bool dword_aligned = true; + + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + } + + tr = TAILQ_FIRST(&pqpair->free_tr); + + if (tr == NULL) { + /* Inform the upper layer to try again later. */ + rc = -EAGAIN; + goto exit; + } + + TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */ + TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list); + tr->req = req; + tr->cb_fn = req->cb_fn; + tr->cb_arg = req->cb_arg; + req->cmd.cid = tr->cid; + + if (req->payload_size != 0) { + payload_type = nvme_payload_type(&req->payload); + /* According to the specification, PRPs shall be used for all + * Admin commands for NVMe over PCIe implementations. + */ + sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 && + !nvme_qpair_is_admin_queue(qpair); + + if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) { + dword_aligned = false; + } + rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned); + if (rc < 0) { + goto exit; + } + + rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, dword_aligned); + if (rc < 0) { + goto exit; + } + } + + nvme_pcie_qpair_submit_tracker(qpair, tr); + +exit: + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } + + return rc; +} + +static void +nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct nvme_tracker *tr, *tmp; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. */ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) { + assert(tr->req != NULL); + + if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. + */ + break; + } + } +} + +static int32_t +nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + struct spdk_nvme_cpl *cpl, *next_cpl; + uint32_t num_completions = 0; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + uint16_t next_cq_head; + uint8_t next_phase; + bool next_is_valid = false; + + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + } + + if (max_completions == 0 || max_completions > pqpair->max_completions_cap) { + /* + * max_completions == 0 means unlimited, but complete at most + * max_completions_cap batch of I/O at a time so that the completion + * queue doorbells don't wrap around. 
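For reference, the builder selection in nvme_pcie_qpair_submit_request() above can be read as a small pure function: admin commands always take the PRP column of the table, I/O commands take the SGL column only when the controller advertises SGL support, and dword alignment is enforced unless SGL support without the alignment requirement is present. A sketch using only names already defined in this file:

/* Sketch of the request-builder selection performed above. */
static build_req_fn
choose_builder_sketch(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
		      struct nvme_request *req, bool *dword_aligned)
{
	enum nvme_payload_type payload_type = nvme_payload_type(&req->payload);
	bool sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 &&
			     !nvme_qpair_is_admin_queue(qpair);

	/* Alignment may be relaxed only for SGL-capable controllers that do not
	 * set SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT. */
	*dword_aligned = !(sgl_supported &&
			   !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT));

	return g_nvme_pcie_build_req_table[payload_type][sgl_supported ? 1 : 0];
}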
+ */ + max_completions = pqpair->max_completions_cap; + } + + while (1) { + cpl = &pqpair->cpl[pqpair->cq_head]; + + if (!next_is_valid && cpl->status.p != pqpair->flags.phase) { + break; + } + + if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) { + next_cq_head = pqpair->cq_head + 1; + next_phase = pqpair->flags.phase; + } else { + next_cq_head = 0; + next_phase = !pqpair->flags.phase; + } + next_cpl = &pqpair->cpl[next_cq_head]; + next_is_valid = (next_cpl->status.p == next_phase); + if (next_is_valid) { + __builtin_prefetch(&pqpair->tr[next_cpl->cid]); + } + +#ifdef __PPC64__ + /* + * This memory barrier prevents reordering of: + * - load after store from/to tr + * - load after load cpl phase and cpl cid + */ + spdk_mb(); +#elif defined(__aarch64__) + __asm volatile("dmb oshld" ::: "memory"); +#endif + + if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) { + pqpair->cq_head = 0; + pqpair->flags.phase = !pqpair->flags.phase; + } + + tr = &pqpair->tr[cpl->cid]; + /* Prefetch the req's STAILQ_ENTRY since we'll need to access it + * as part of putting the req back on the qpair's free list. + */ + __builtin_prefetch(&tr->req->stailq); + pqpair->sq_head = cpl->sqhd; + + if (tr->req) { + nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true); + } else { + SPDK_ERRLOG("cpl does not map to outstanding cmd\n"); + spdk_nvme_qpair_print_completion(qpair, cpl); + assert(0); + } + + if (++num_completions == max_completions) { + break; + } + } + + if (num_completions > 0) { + nvme_pcie_qpair_ring_cq_doorbell(qpair); + } + + if (pqpair->flags.delay_cmd_submit) { + if (pqpair->last_sq_tail != pqpair->sq_tail) { + nvme_pcie_qpair_ring_sq_doorbell(qpair); + pqpair->last_sq_tail = pqpair->sq_tail; + } + } + + if (spdk_unlikely(ctrlr->timeout_enabled)) { + /* + * User registered for timeout callback + */ + nvme_pcie_qpair_check_timeout(qpair); + } + + /* Before returning, complete any pending admin request. 
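The completion loop above relies on the NVMe phase-bit protocol: a completion entry is valid only while its phase bit matches the phase the host currently expects, and the expected phase flips every time the head wraps around the ring. A standalone miniature of that consumer logic:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RING_ENTRIES 4

struct cpl_entry { uint16_t cid; uint8_t phase; };

struct consumer {
	struct cpl_entry ring[RING_ENTRIES];
	uint16_t head;
	uint8_t phase;	/* starts at 1, like pqpair->flags.phase */
};

/* Consume one completion if a valid entry is present at the head. */
static bool
consume_one(struct consumer *c, uint16_t *cid)
{
	struct cpl_entry *e = &c->ring[c->head];

	if (e->phase != c->phase) {
		return false;		/* not yet posted by the device */
	}
	*cid = e->cid;
	if (++c->head == RING_ENTRIES) {
		c->head = 0;
		c->phase = !c->phase;	/* wrapped: expect the opposite phase */
	}
	return true;
}

int
main(void)
{
	struct consumer c = { .phase = 1 };
	uint16_t cid;

	/* The device posts two completions with the current phase. */
	c.ring[0] = (struct cpl_entry){ .cid = 7, .phase = 1 };
	c.ring[1] = (struct cpl_entry){ .cid = 9, .phase = 1 };

	while (consume_one(&c, &cid)) {
		printf("completed cid %u\n", cid);
	}
	return 0;
}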
*/ + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_pcie_qpair_complete_pending_admin_request(qpair); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } + + return num_completions; +} + +static struct spdk_nvme_transport_poll_group * +nvme_pcie_poll_group_create(void) +{ + struct nvme_pcie_poll_group *group = calloc(1, sizeof(*group)); + + if (group == NULL) { + SPDK_ERRLOG("Unable to allocate poll group.\n"); + return NULL; + } + + return &group->group; +} + +static int +nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int64_t +nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_qpair *qpair, *tmp_qpair; + int32_t local_completions = 0; + int64_t total_completions = 0; + + STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + } + + STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) { + local_completions = spdk_nvme_qpair_process_completions(qpair, completions_per_qpair); + if (local_completions < 0) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + local_completions = 0; + } + total_completions += local_completions; + } + + return total_completions; +} + +static int +nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { + return -EBUSY; + } + + free(tgroup); + + return 0; +} + +static struct spdk_pci_id nvme_pci_driver_id[] = { + { + .class_id = SPDK_PCI_CLASS_NVME, + .vendor_id = SPDK_PCI_ANY_ID, + .device_id = SPDK_PCI_ANY_ID, + .subvendor_id = SPDK_PCI_ANY_ID, + .subdevice_id = SPDK_PCI_ANY_ID, + }, + { .vendor_id = 0, /* sentinel */ }, +}; + +SPDK_PCI_DRIVER_REGISTER("nvme", nvme_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); + +const struct spdk_nvme_transport_ops pcie_ops = { + .name = "PCIE", + .type = SPDK_NVME_TRANSPORT_PCIE, + .ctrlr_construct = nvme_pcie_ctrlr_construct, + .ctrlr_scan = nvme_pcie_ctrlr_scan, + .ctrlr_destruct = nvme_pcie_ctrlr_destruct, + .ctrlr_enable = nvme_pcie_ctrlr_enable, + + .ctrlr_set_reg_4 = nvme_pcie_ctrlr_set_reg_4, + .ctrlr_set_reg_8 = nvme_pcie_ctrlr_set_reg_8, + .ctrlr_get_reg_4 = nvme_pcie_ctrlr_get_reg_4, + .ctrlr_get_reg_8 = nvme_pcie_ctrlr_get_reg_8, + + .ctrlr_get_max_xfer_size = nvme_pcie_ctrlr_get_max_xfer_size, + .ctrlr_get_max_sges = nvme_pcie_ctrlr_get_max_sges, + + .ctrlr_reserve_cmb = nvme_pcie_ctrlr_reserve_cmb, + .ctrlr_map_cmb = nvme_pcie_ctrlr_map_io_cmb, + .ctrlr_unmap_cmb = nvme_pcie_ctrlr_unmap_io_cmb, + + .ctrlr_create_io_qpair = nvme_pcie_ctrlr_create_io_qpair, + .ctrlr_delete_io_qpair = nvme_pcie_ctrlr_delete_io_qpair, + .ctrlr_connect_qpair = nvme_pcie_ctrlr_connect_qpair, + .ctrlr_disconnect_qpair = nvme_pcie_ctrlr_disconnect_qpair, + + .qpair_abort_reqs = nvme_pcie_qpair_abort_reqs, + .qpair_reset = nvme_pcie_qpair_reset, + .qpair_submit_request = 
nvme_pcie_qpair_submit_request, + .qpair_process_completions = nvme_pcie_qpair_process_completions, + .qpair_iterate_requests = nvme_pcie_qpair_iterate_requests, + .admin_qpair_abort_aers = nvme_pcie_admin_qpair_abort_aers, + + .poll_group_create = nvme_pcie_poll_group_create, + .poll_group_connect_qpair = nvme_pcie_poll_group_connect_qpair, + .poll_group_disconnect_qpair = nvme_pcie_poll_group_disconnect_qpair, + .poll_group_add = nvme_pcie_poll_group_add, + .poll_group_remove = nvme_pcie_poll_group_remove, + .poll_group_process_completions = nvme_pcie_poll_group_process_completions, + .poll_group_destroy = nvme_pcie_poll_group_destroy, +}; + +SPDK_NVME_TRANSPORT_REGISTER(pcie, &pcie_ops); diff --git a/src/spdk/lib/nvme/nvme_poll_group.c b/src/spdk/lib/nvme/nvme_poll_group.c new file mode 100644 index 000000000..291f55e63 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_poll_group.c @@ -0,0 +1,164 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include "nvme_internal.h" + +struct spdk_nvme_poll_group * +spdk_nvme_poll_group_create(void *ctx) +{ + struct spdk_nvme_poll_group *group; + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + return NULL; + } + + group->ctx = ctx; + STAILQ_INIT(&group->tgroups); + + return group; +} + +int +spdk_nvme_poll_group_add(struct spdk_nvme_poll_group *group, struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + const struct spdk_nvme_transport *transport; + + if (nvme_qpair_get_state(qpair) != NVME_QPAIR_DISCONNECTED) { + return -EINVAL; + } + + STAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + break; + } + } + + /* See if a new transport has been added (dlopen style) and we need to update the poll group */ + if (!tgroup) { + transport = nvme_get_first_transport(); + while (transport != NULL) { + if (transport == qpair->transport) { + tgroup = nvme_transport_poll_group_create(transport); + if (tgroup == NULL) { + return -ENOMEM; + } + tgroup->group = group; + STAILQ_INSERT_TAIL(&group->tgroups, tgroup, link); + break; + } + transport = nvme_get_next_transport(transport); + } + } + + return tgroup ? nvme_transport_poll_group_add(tgroup, qpair) : -ENODEV; +} + +int +spdk_nvme_poll_group_remove(struct spdk_nvme_poll_group *group, struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + + STAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + return nvme_transport_poll_group_remove(tgroup, qpair); + } + } + + return -ENODEV; +} + +int +nvme_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + return nvme_transport_poll_group_connect_qpair(qpair); +} + +int +nvme_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + return nvme_transport_poll_group_disconnect_qpair(qpair); +} + +int64_t +spdk_nvme_poll_group_process_completions(struct spdk_nvme_poll_group *group, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_transport_poll_group *tgroup; + int64_t local_completions = 0, error_reason = 0, num_completions = 0; + + if (disconnected_qpair_cb == NULL) { + return -EINVAL; + } + + STAILQ_FOREACH(tgroup, &group->tgroups, link) { + local_completions = nvme_transport_poll_group_process_completions(tgroup, completions_per_qpair, + disconnected_qpair_cb); + if (local_completions < 0 && error_reason == 0) { + error_reason = local_completions; + } else { + num_completions += local_completions; + /* Just to be safe */ + assert(num_completions >= 0); + } + } + + return error_reason ? error_reason : num_completions; +} + +void * +spdk_nvme_poll_group_get_ctx(struct spdk_nvme_poll_group *group) +{ + return group->ctx; +} + +int +spdk_nvme_poll_group_destroy(struct spdk_nvme_poll_group *group) +{ + struct spdk_nvme_transport_poll_group *tgroup, *tmp_tgroup; + + STAILQ_FOREACH_SAFE(tgroup, &group->tgroups, link, tmp_tgroup) { + STAILQ_REMOVE(&group->tgroups, tgroup, spdk_nvme_transport_poll_group, link); + if (nvme_transport_poll_group_destroy(tgroup) != 0) { + STAILQ_INSERT_TAIL(&group->tgroups, tgroup, link); + return -EBUSY; + } + + } + + free(group); + + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_qpair.c b/src/spdk/lib/nvme/nvme_qpair.c new file mode 100644 index 000000000..a3fdc2169 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_qpair.c @@ -0,0 +1,1064 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
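A hypothetical application-side sketch of the poll group API defined above. It assumes the qpair is still disconnected when added (the add path rejects anything else) and that deferred connection is available in the release being used; spdk_nvme_ctrlr_connect_io_qpair() is mentioned only as an assumption, since it lives outside this file.

#include "spdk/nvme.h"

static void
pg_disconnected_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	/* Called for every disconnected qpair found while polling the group. */
	(void)qpair;
	(void)poll_group_ctx;
}

static int
poll_group_example(struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_poll_group *group;
	int64_t completions;
	int rc;

	group = spdk_nvme_poll_group_create(NULL /* ctx */);
	if (group == NULL) {
		return -ENOMEM;
	}

	/* The qpair must still be disconnected here; connect it afterwards,
	 * e.g. with spdk_nvme_ctrlr_connect_io_qpair() where deferred
	 * connection is supported. */
	rc = spdk_nvme_poll_group_add(group, qpair);
	if (rc != 0) {
		spdk_nvme_poll_group_destroy(group);
		return rc;
	}

	/* Typically run from a poller; 0 means no per-qpair completion limit. */
	completions = spdk_nvme_poll_group_process_completions(group, 0, pg_disconnected_cb);
	(void)completions;

	/* Teardown: destroy returns -EBUSY while qpairs remain, so remove the
	 * qpair from the group first. */
	spdk_nvme_poll_group_remove(group, qpair);
	return spdk_nvme_poll_group_destroy(group);
}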
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" +#include "spdk/nvme_ocssd.h" + +#define NVME_CMD_DPTR_STR_SIZE 256 + +static int nvme_qpair_resubmit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req); + +struct nvme_string { + uint16_t value; + const char *str; +}; + +static const struct nvme_string admin_opcode[] = { + { SPDK_NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" }, + { SPDK_NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" }, + { SPDK_NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" }, + { SPDK_NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" }, + { SPDK_NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" }, + { SPDK_NVME_OPC_IDENTIFY, "IDENTIFY" }, + { SPDK_NVME_OPC_ABORT, "ABORT" }, + { SPDK_NVME_OPC_SET_FEATURES, "SET FEATURES" }, + { SPDK_NVME_OPC_GET_FEATURES, "GET FEATURES" }, + { SPDK_NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" }, + { SPDK_NVME_OPC_NS_MANAGEMENT, "NAMESPACE MANAGEMENT" }, + { SPDK_NVME_OPC_FIRMWARE_COMMIT, "FIRMWARE COMMIT" }, + { SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" }, + { SPDK_NVME_OPC_DEVICE_SELF_TEST, "DEVICE SELF-TEST" }, + { SPDK_NVME_OPC_NS_ATTACHMENT, "NAMESPACE ATTACHMENT" }, + { SPDK_NVME_OPC_KEEP_ALIVE, "KEEP ALIVE" }, + { SPDK_NVME_OPC_DIRECTIVE_SEND, "DIRECTIVE SEND" }, + { SPDK_NVME_OPC_DIRECTIVE_RECEIVE, "DIRECTIVE RECEIVE" }, + { SPDK_NVME_OPC_VIRTUALIZATION_MANAGEMENT, "VIRTUALIZATION MANAGEMENT" }, + { SPDK_NVME_OPC_NVME_MI_SEND, "NVME-MI SEND" }, + { SPDK_NVME_OPC_NVME_MI_RECEIVE, "NVME-MI RECEIVE" }, + { SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG, "DOORBELL BUFFER CONFIG" }, + { SPDK_NVME_OPC_FABRIC, "FABRIC" }, + { SPDK_NVME_OPC_FORMAT_NVM, "FORMAT NVM" }, + { SPDK_NVME_OPC_SECURITY_SEND, "SECURITY SEND" }, + { SPDK_NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" }, + { SPDK_NVME_OPC_SANITIZE, "SANITIZE" }, + { SPDK_NVME_OPC_GET_LBA_STATUS, "GET LBA STATUS" }, + { SPDK_OCSSD_OPC_GEOMETRY, "OCSSD / GEOMETRY" }, + { 0xFFFF, "ADMIN COMMAND" } +}; + +static const struct nvme_string fabric_opcode[] = { + { SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET, "PROPERTY SET" }, + { 
SPDK_NVMF_FABRIC_COMMAND_CONNECT, "CONNECT" }, + { SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET, "PROPERTY GET" }, + { SPDK_NVMF_FABRIC_COMMAND_AUTHENTICATION_SEND, "AUTHENTICATION SEND" }, + { SPDK_NVMF_FABRIC_COMMAND_AUTHENTICATION_RECV, "AUTHENTICATION RECV" }, + { 0xFFFF, "RESERVED / VENDOR SPECIFIC" } +}; + +static const struct nvme_string feat_opcode[] = { + { SPDK_NVME_FEAT_ARBITRATION, "ARBITRATION" }, + { SPDK_NVME_FEAT_POWER_MANAGEMENT, "POWER MANAGEMENT" }, + { SPDK_NVME_FEAT_LBA_RANGE_TYPE, "LBA RANGE TYPE" }, + { SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD, "TEMPERATURE THRESHOLD" }, + { SPDK_NVME_FEAT_ERROR_RECOVERY, "ERROR_RECOVERY" }, + { SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE, "VOLATILE WRITE CACHE" }, + { SPDK_NVME_FEAT_NUMBER_OF_QUEUES, "NUMBER OF QUEUES" }, + { SPDK_NVME_FEAT_INTERRUPT_COALESCING, "INTERRUPT COALESCING" }, + { SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION, "INTERRUPT VECTOR CONFIGURATION" }, + { SPDK_NVME_FEAT_WRITE_ATOMICITY, "WRITE ATOMICITY" }, + { SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, "ASYNC EVENT CONFIGURATION" }, + { SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION, "AUTONOMOUS POWER STATE TRANSITION" }, + { SPDK_NVME_FEAT_HOST_MEM_BUFFER, "HOST MEM BUFFER" }, + { SPDK_NVME_FEAT_TIMESTAMP, "TIMESTAMP" }, + { SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, "KEEP ALIVE TIMER" }, + { SPDK_NVME_FEAT_HOST_CONTROLLED_THERMAL_MANAGEMENT, "HOST CONTROLLED THERMAL MANAGEMENT" }, + { SPDK_NVME_FEAT_NON_OPERATIONAL_POWER_STATE_CONFIG, "NON OPERATIONAL POWER STATE CONFIG" }, + { SPDK_NVME_FEAT_SOFTWARE_PROGRESS_MARKER, "SOFTWARE PROGRESS MARKER" }, + { SPDK_NVME_FEAT_HOST_IDENTIFIER, "HOST IDENTIFIER" }, + { SPDK_NVME_FEAT_HOST_RESERVE_MASK, "HOST RESERVE MASK" }, + { SPDK_NVME_FEAT_HOST_RESERVE_PERSIST, "HOST RESERVE PERSIST" }, + { 0xFFFF, "RESERVED" } +}; + +static const struct nvme_string io_opcode[] = { + { SPDK_NVME_OPC_FLUSH, "FLUSH" }, + { SPDK_NVME_OPC_WRITE, "WRITE" }, + { SPDK_NVME_OPC_READ, "READ" }, + { SPDK_NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" }, + { SPDK_NVME_OPC_COMPARE, "COMPARE" }, + { SPDK_NVME_OPC_WRITE_ZEROES, "WRITE ZEROES" }, + { SPDK_NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" }, + { SPDK_NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" }, + { SPDK_NVME_OPC_RESERVATION_REPORT, "RESERVATION REPORT" }, + { SPDK_NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" }, + { SPDK_NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" }, + { SPDK_OCSSD_OPC_VECTOR_RESET, "OCSSD / VECTOR RESET" }, + { SPDK_OCSSD_OPC_VECTOR_WRITE, "OCSSD / VECTOR WRITE" }, + { SPDK_OCSSD_OPC_VECTOR_READ, "OCSSD / VECTOR READ" }, + { SPDK_OCSSD_OPC_VECTOR_COPY, "OCSSD / VECTOR COPY" }, + { 0xFFFF, "IO COMMAND" } +}; + +static const struct nvme_string sgl_type[] = { + { SPDK_NVME_SGL_TYPE_DATA_BLOCK, "DATA BLOCK" }, + { SPDK_NVME_SGL_TYPE_BIT_BUCKET, "BIT BUCKET" }, + { SPDK_NVME_SGL_TYPE_SEGMENT, "SEGMENT" }, + { SPDK_NVME_SGL_TYPE_LAST_SEGMENT, "LAST SEGMENT" }, + { SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK, "TRANSPORT DATA BLOCK" }, + { SPDK_NVME_SGL_TYPE_VENDOR_SPECIFIC, "VENDOR SPECIFIC" }, + { 0xFFFF, "RESERVED" } +}; + +static const struct nvme_string sgl_subtype[] = { + { SPDK_NVME_SGL_SUBTYPE_ADDRESS, "ADDRESS" }, + { SPDK_NVME_SGL_SUBTYPE_OFFSET, "OFFSET" }, + { SPDK_NVME_SGL_SUBTYPE_TRANSPORT, "TRANSPORT" }, + { SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY, "INVALIDATE KEY" }, + { 0xFFFF, "RESERVED" } +}; + +static const char * +nvme_get_string(const struct nvme_string *strings, uint16_t value) +{ + const struct nvme_string *entry; + + entry = strings; + + while 
(entry->value != 0xFFFF) { + if (entry->value == value) { + return entry->str; + } + entry++; + } + return entry->str; +} + +static void +nvme_get_sgl_unkeyed(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + + snprintf(buf, size, " len:0x%x", sgl->unkeyed.length); +} + +static void +nvme_get_sgl_keyed(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + + snprintf(buf, size, " len:0x%x key:0x%x", sgl->keyed.length, sgl->keyed.key); +} + +static void +nvme_get_sgl(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + int c; + + c = snprintf(buf, size, "SGL %s %s 0x%" PRIx64, nvme_get_string(sgl_type, sgl->generic.type), + nvme_get_string(sgl_subtype, sgl->generic.subtype), sgl->address); + assert(c >= 0 && (size_t)c < size); + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK) { + nvme_get_sgl_unkeyed(buf + c, size - c, cmd); + } + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { + nvme_get_sgl_keyed(buf + c, size - c, cmd); + } +} + +static void +nvme_get_prp(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + snprintf(buf, size, "PRP1 0x%" PRIx64 " PRP2 0x%" PRIx64, cmd->dptr.prp.prp1, cmd->dptr.prp.prp2); +} + +static void +nvme_get_dptr(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + if (spdk_nvme_opc_get_data_transfer(cmd->opc) != SPDK_NVME_DATA_NONE) { + switch (cmd->psdt) { + case SPDK_NVME_PSDT_PRP: + nvme_get_prp(buf, size, cmd); + break; + case SPDK_NVME_PSDT_SGL_MPTR_CONTIG: + case SPDK_NVME_PSDT_SGL_MPTR_SGL: + nvme_get_sgl(buf, size, cmd); + break; + default: + ; + } + } +} + +static void +nvme_admin_qpair_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvmf_capsule_cmd *fcmd = (void *)cmd; + char dptr[NVME_CMD_DPTR_STR_SIZE] = {'\0'}; + + assert(cmd != NULL); + + nvme_get_dptr(dptr, sizeof(dptr), cmd); + + switch ((int)cmd->opc) { + case SPDK_NVME_OPC_SET_FEATURES: + case SPDK_NVME_OPC_GET_FEATURES: + SPDK_NOTICELOG("%s %s cid:%d cdw10:%08x %s\n", + nvme_get_string(admin_opcode, cmd->opc), nvme_get_string(feat_opcode, + cmd->cdw10_bits.set_features.fid), cmd->cid, cmd->cdw10, dptr); + break; + case SPDK_NVME_OPC_FABRIC: + SPDK_NOTICELOG("%s %s qid:%d cid:%d %s\n", + nvme_get_string(admin_opcode, cmd->opc), nvme_get_string(fabric_opcode, fcmd->fctype), qid, + fcmd->cid, dptr); + break; + default: + SPDK_NOTICELOG("%s (%02x) qid:%d cid:%d nsid:%x cdw10:%08x cdw11:%08x %s\n", + nvme_get_string(admin_opcode, cmd->opc), cmd->opc, qid, cmd->cid, cmd->nsid, cmd->cdw10, + cmd->cdw11, dptr); + } +} + +static void +nvme_io_qpair_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + char dptr[NVME_CMD_DPTR_STR_SIZE] = {'\0'}; + + assert(cmd != NULL); + + nvme_get_dptr(dptr, sizeof(dptr), cmd); + + switch ((int)cmd->opc) { + case SPDK_NVME_OPC_WRITE: + case SPDK_NVME_OPC_READ: + case SPDK_NVME_OPC_WRITE_UNCORRECTABLE: + case SPDK_NVME_OPC_COMPARE: + SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d " + "lba:%llu len:%d %s\n", + nvme_get_string(io_opcode, cmd->opc), qid, cmd->cid, cmd->nsid, + ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10, + (cmd->cdw12 & 0xFFFF) + 1, dptr); + break; + case SPDK_NVME_OPC_FLUSH: + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d\n", + nvme_get_string(io_opcode, cmd->opc), qid, cmd->cid, cmd->nsid); + break; + default: + SPDK_NOTICELOG("%s (%02x) sqid:%d cid:%d nsid:%d\n", + 
nvme_get_string(io_opcode, cmd->opc), cmd->opc, qid, cmd->cid, cmd->nsid); + break; + } +} + +void +spdk_nvme_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + assert(cmd != NULL); + + if (qid == 0 || cmd->opc == SPDK_NVME_OPC_FABRIC) { + nvme_admin_qpair_print_command(qid, cmd); + } else { + nvme_io_qpair_print_command(qid, cmd); + } +} + +void +spdk_nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cmd *cmd) +{ + assert(qpair != NULL); + assert(cmd != NULL); + + spdk_nvme_print_command(qpair->id, cmd); +} + +static const struct nvme_string generic_status[] = { + { SPDK_NVME_SC_SUCCESS, "SUCCESS" }, + { SPDK_NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, + { SPDK_NVME_SC_INVALID_FIELD, "INVALID FIELD" }, + { SPDK_NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" }, + { SPDK_NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" }, + { SPDK_NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" }, + { SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" }, + { SPDK_NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" }, + { SPDK_NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" }, + { SPDK_NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" }, + { SPDK_NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" }, + { SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" }, + { SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" }, + { SPDK_NVME_SC_INVALID_SGL_SEG_DESCRIPTOR, "INVALID SGL SEGMENT DESCRIPTOR" }, + { SPDK_NVME_SC_INVALID_NUM_SGL_DESCIRPTORS, "INVALID NUMBER OF SGL DESCRIPTORS" }, + { SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" }, + { SPDK_NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" }, + { SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" }, + { SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF, "INVALID CONTROLLER MEMORY BUFFER" }, + { SPDK_NVME_SC_INVALID_PRP_OFFSET, "INVALID PRP OFFSET" }, + { SPDK_NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" }, + { SPDK_NVME_SC_OPERATION_DENIED, "OPERATION DENIED" }, + { SPDK_NVME_SC_INVALID_SGL_OFFSET, "INVALID SGL OFFSET" }, + { SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT, "HOSTID INCONSISTENT FORMAT" }, + { SPDK_NVME_SC_KEEP_ALIVE_EXPIRED, "KEEP ALIVE EXPIRED" }, + { SPDK_NVME_SC_KEEP_ALIVE_INVALID, "KEEP ALIVE INVALID" }, + { SPDK_NVME_SC_ABORTED_PREEMPT, "ABORTED - PREEMPT AND ABORT" }, + { SPDK_NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" }, + { SPDK_NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" }, + { SPDK_NVME_SC_SGL_DATA_BLOCK_GRANULARITY_INVALID, "DATA BLOCK GRANULARITY INVALID" }, + { SPDK_NVME_SC_COMMAND_INVALID_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" }, + { SPDK_NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" }, + { SPDK_NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" }, + { SPDK_NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" }, + { SPDK_NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" }, + { SPDK_NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" }, + { 0xFFFF, "GENERIC" } +}; + +static const struct nvme_string command_specific_status[] = { + { SPDK_NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" }, + { SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" }, + { SPDK_NVME_SC_INVALID_QUEUE_SIZE, "INVALID QUEUE SIZE" }, + { SPDK_NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" }, + { SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" }, + { SPDK_NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" }, + { 
SPDK_NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" }, + { SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" }, + { SPDK_NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" }, + { SPDK_NVME_SC_INVALID_FORMAT, "INVALID FORMAT" }, + { SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET, "FIRMWARE REQUIRES CONVENTIONAL RESET" }, + { SPDK_NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" }, + { SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE, "FEATURE ID NOT SAVEABLE" }, + { SPDK_NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" }, + { SPDK_NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" }, + { SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET, "FIRMWARE REQUIRES NVM RESET" }, + { SPDK_NVME_SC_FIRMWARE_REQ_RESET, "FIRMWARE REQUIRES RESET" }, + { SPDK_NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION, "FIRMWARE REQUIRES MAX TIME VIOLATION" }, + { SPDK_NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" }, + { SPDK_NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" }, + { SPDK_NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" }, + { SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE, "NAMESPACE ID UNAVAILABLE" }, + { SPDK_NVME_SC_NAMESPACE_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" }, + { SPDK_NVME_SC_NAMESPACE_IS_PRIVATE, "NAMESPACE IS PRIVATE" }, + { SPDK_NVME_SC_NAMESPACE_NOT_ATTACHED, "NAMESPACE NOT ATTACHED" }, + { SPDK_NVME_SC_THINPROVISIONING_NOT_SUPPORTED, "THINPROVISIONING NOT SUPPORTED" }, + { SPDK_NVME_SC_CONTROLLER_LIST_INVALID, "CONTROLLER LIST INVALID" }, + { SPDK_NVME_SC_DEVICE_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" }, + { SPDK_NVME_SC_BOOT_PARTITION_WRITE_PROHIBITED, "BOOT PARTITION WRITE PROHIBITED" }, + { SPDK_NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER ID" }, + { SPDK_NVME_SC_INVALID_SECONDARY_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" }, + { SPDK_NVME_SC_INVALID_NUM_CTRLR_RESOURCES, "INVALID NUMBER OF CONTROLLER RESOURCES" }, + { SPDK_NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" }, + { SPDK_NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" }, + { SPDK_NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" }, + { SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_RANGE, "WRITE TO RO RANGE" }, + { 0xFFFF, "COMMAND SPECIFIC" } +}; + +static const struct nvme_string media_error_status[] = { + { SPDK_NVME_SC_WRITE_FAULTS, "WRITE FAULTS" }, + { SPDK_NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" }, + { SPDK_NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" }, + { SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" }, + { SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" }, + { SPDK_NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" }, + { SPDK_NVME_SC_ACCESS_DENIED, "ACCESS DENIED" }, + { SPDK_NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" }, + { SPDK_OCSSD_SC_OFFLINE_CHUNK, "RESET OFFLINE CHUNK" }, + { SPDK_OCSSD_SC_INVALID_RESET, "INVALID RESET" }, + { SPDK_OCSSD_SC_WRITE_FAIL_WRITE_NEXT_UNIT, "WRITE FAIL WRITE NEXT UNIT" }, + { SPDK_OCSSD_SC_WRITE_FAIL_CHUNK_EARLY_CLOSE, "WRITE FAIL CHUNK EARLY CLOSE" }, + { SPDK_OCSSD_SC_OUT_OF_ORDER_WRITE, "OUT OF ORDER WRITE" }, + { SPDK_OCSSD_SC_READ_HIGH_ECC, "READ HIGH ECC" }, + { 0xFFFF, "MEDIA ERROR" } +}; + +static const struct nvme_string path_status[] = { + { SPDK_NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" }, + { SPDK_NVME_SC_CONTROLLER_PATH_ERROR, "CONTROLLER PATH ERROR" }, + { SPDK_NVME_SC_HOST_PATH_ERROR, "HOST PATH ERROR" }, + { SPDK_NVME_SC_ABORTED_BY_HOST, "ABORTED BY HOST" }, + 
{ 0xFFFF, "PATH ERROR" } +}; + +const char * +spdk_nvme_cpl_get_status_string(const struct spdk_nvme_status *status) +{ + const struct nvme_string *entry; + + switch (status->sct) { + case SPDK_NVME_SCT_GENERIC: + entry = generic_status; + break; + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + entry = command_specific_status; + break; + case SPDK_NVME_SCT_MEDIA_ERROR: + entry = media_error_status; + break; + case SPDK_NVME_SCT_PATH: + entry = path_status; + break; + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + return "VENDOR SPECIFIC"; + default: + return "RESERVED"; + } + + return nvme_get_string(entry, status->sc); +} + +void +spdk_nvme_print_completion(uint16_t qid, struct spdk_nvme_cpl *cpl) +{ + assert(cpl != NULL); + + /* Check that sqid matches qid. Note that sqid is reserved + * for fabrics so don't print an error when sqid is 0. */ + if (cpl->sqid != qid && cpl->sqid != 0) { + SPDK_ERRLOG("sqid %u doesn't match qid\n", cpl->sqid); + } + + SPDK_NOTICELOG("%s (%02x/%02x) qid:%d cid:%d cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n", + spdk_nvme_cpl_get_status_string(&cpl->status), + cpl->status.sct, cpl->status.sc, qid, cpl->cid, cpl->cdw0, + cpl->sqhd, cpl->status.p, cpl->status.m, cpl->status.dnr); +} + +void +spdk_nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cpl *cpl) +{ + spdk_nvme_print_completion(qpair->id, cpl); +} + +bool +nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl) +{ + /* + * TODO: spec is not clear how commands that are aborted due + * to TLER will be marked. So for now, it seems + * NAMESPACE_NOT_READY is the only case where we should + * look at the DNR bit. + */ + switch ((int)cpl->status.sct) { + case SPDK_NVME_SCT_GENERIC: + switch ((int)cpl->status.sc) { + case SPDK_NVME_SC_NAMESPACE_NOT_READY: + case SPDK_NVME_SC_FORMAT_IN_PROGRESS: + if (cpl->status.dnr) { + return false; + } else { + return true; + } + case SPDK_NVME_SC_INVALID_OPCODE: + case SPDK_NVME_SC_INVALID_FIELD: + case SPDK_NVME_SC_COMMAND_ID_CONFLICT: + case SPDK_NVME_SC_DATA_TRANSFER_ERROR: + case SPDK_NVME_SC_ABORTED_POWER_LOSS: + case SPDK_NVME_SC_INTERNAL_DEVICE_ERROR: + case SPDK_NVME_SC_ABORTED_BY_REQUEST: + case SPDK_NVME_SC_ABORTED_SQ_DELETION: + case SPDK_NVME_SC_ABORTED_FAILED_FUSED: + case SPDK_NVME_SC_ABORTED_MISSING_FUSED: + case SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT: + case SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR: + case SPDK_NVME_SC_LBA_OUT_OF_RANGE: + case SPDK_NVME_SC_CAPACITY_EXCEEDED: + default: + return false; + } + case SPDK_NVME_SCT_PATH: + /* + * Per NVMe TP 4028 (Path and Transport Error Enhancements), retries should be + * based on the setting of the DNR bit for Internal Path Error + */ + switch ((int)cpl->status.sc) { + case SPDK_NVME_SC_INTERNAL_PATH_ERROR: + return !cpl->status.dnr; + default: + return false; + } + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + case SPDK_NVME_SCT_MEDIA_ERROR: + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + default: + return false; + } +} + +static void +nvme_qpair_manual_complete_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, uint32_t sct, uint32_t sc, + uint32_t dnr, bool print_on_error) +{ + struct spdk_nvme_cpl cpl; + bool error; + + memset(&cpl, 0, sizeof(cpl)); + cpl.sqid = qpair->id; + cpl.status.sct = sct; + cpl.status.sc = sc; + cpl.status.dnr = dnr; + + error = spdk_nvme_cpl_is_error(&cpl); + + if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) { + SPDK_NOTICELOG("Command completed manually:\n"); + spdk_nvme_qpair_print_command(qpair, &req->cmd); + spdk_nvme_qpair_print_completion(qpair, 
&cpl); + } + + nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &cpl); + nvme_free_request(req); +} + +static void +_nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->queued_req)) { + req = STAILQ_FIRST(&qpair->queued_req); + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting queued i/o\n"); + } + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); + } +} + +/* The callback to a request may submit the next request which is queued and + * then the same callback may abort it immediately. This repetition may cause + * infinite recursive calls. Hence move aborting requests to another list here + * and abort them later at resubmission. + */ +static void +_nvme_qpair_complete_abort_queued_reqs(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->aborting_queued_req)) { + req = STAILQ_FIRST(&qpair->aborting_queued_req); + STAILQ_REMOVE_HEAD(&qpair->aborting_queued_req, stailq); + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, 1, true); + } +} + +uint32_t +nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg) +{ + struct nvme_request *req, *tmp; + uint32_t aborting = 0; + + STAILQ_FOREACH_SAFE(req, &qpair->queued_req, stailq, tmp) { + if (req->cb_arg == cmd_cb_arg) { + STAILQ_REMOVE(&qpair->queued_req, req, nvme_request, stailq); + STAILQ_INSERT_TAIL(&qpair->aborting_queued_req, req, stailq); + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting queued i/o\n"); + } + aborting++; + } + } + + return aborting; +} + +static inline bool +nvme_qpair_check_enabled(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + /* + * Either during initial connect or reset, the qpair should follow the given state machine. + * QPAIR_DISABLED->QPAIR_CONNECTING->QPAIR_CONNECTED->QPAIR_ENABLING->QPAIR_ENABLED. In the + * reset case, once the qpair is properly connected, we need to abort any outstanding requests + * from the old transport connection and encourage the application to retry them. We also need + * to submit any queued requests that built up while we were in the connected or enabling state. + */ + if (nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTED && !qpair->ctrlr->is_resetting) { + nvme_qpair_set_state(qpair, NVME_QPAIR_ENABLING); + /* + * PCIe is special, for fabrics transports, we can abort requests before disconnect during reset + * but we have historically not disconnected pcie qpairs during reset so we have to abort requests + * here. + */ + if (qpair->ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + nvme_qpair_abort_reqs(qpair, 0); + } + nvme_qpair_set_state(qpair, NVME_QPAIR_ENABLED); + while (!STAILQ_EMPTY(&qpair->queued_req)) { + req = STAILQ_FIRST(&qpair->queued_req); + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + if (nvme_qpair_resubmit_request(qpair, req)) { + break; + } + } + } + + /* + * When doing a reset, we must disconnect the qpair on the proper core. + * Note, reset is the only case where we set the failure reason without + * setting the qpair state since reset is done at the generic layer on the + * controller thread and we can't disconnect I/O qpairs from the controller + * thread. 
+ */ + if (qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE && + nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED) { + /* Don't disconnect PCIe qpairs. They are a special case for reset. */ + if (qpair->ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_disconnect_qpair(qpair); + } + return false; + } + + return nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED; +} + +void +nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_requests) +{ + uint32_t i; + int resubmit_rc; + struct nvme_request *req; + + for (i = 0; i < num_requests; i++) { + if (qpair->ctrlr->is_resetting) { + break; + } + if ((req = STAILQ_FIRST(&qpair->queued_req)) == NULL) { + break; + } + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + resubmit_rc = nvme_qpair_resubmit_request(qpair, req); + if (spdk_unlikely(resubmit_rc != 0)) { + SPDK_ERRLOG("Unable to resubmit as many requests as we completed.\n"); + break; + } + } + + _nvme_qpair_complete_abort_queued_reqs(qpair); +} + +int32_t +spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + int32_t ret; + struct nvme_request *req, *tmp; + + if (spdk_unlikely(qpair->ctrlr->is_failed)) { + if (qpair->ctrlr->is_removed) { + nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); + nvme_qpair_abort_reqs(qpair, 1 /* Do not retry */); + } + return -ENXIO; + } + + if (spdk_unlikely(!nvme_qpair_check_enabled(qpair) && + !(nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING))) { + /* + * qpair is not enabled, likely because a controller reset is + * in progress. + */ + return -ENXIO; + } + + /* error injection for those queued error requests */ + if (spdk_unlikely(!STAILQ_EMPTY(&qpair->err_req_head))) { + STAILQ_FOREACH_SAFE(req, &qpair->err_req_head, stailq, tmp) { + if (spdk_get_ticks() - req->submit_tick > req->timeout_tsc) { + STAILQ_REMOVE(&qpair->err_req_head, req, nvme_request, stailq); + nvme_qpair_manual_complete_request(qpair, req, + req->cpl.status.sct, + req->cpl.status.sc, 0, true); + } + } + } + + qpair->in_completion_context = 1; + ret = nvme_transport_qpair_process_completions(qpair, max_completions); + if (ret < 0) { + SPDK_ERRLOG("CQ error, abort requests after transport retry counter exceeded\n"); + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_ctrlr_fail(qpair->ctrlr, false); + } + } + qpair->in_completion_context = 0; + if (qpair->delete_after_completion_context) { + /* + * A request to delete this qpair was made in the context of this completion + * routine - so it is safe to delete it now. + */ + spdk_nvme_ctrlr_free_io_qpair(qpair); + return ret; + } + + /* + * At this point, ret must represent the number of completions we reaped. + * submit as many queued requests as we completed. 
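A hypothetical caller-side view of the completion path above: submit one read and spin on spdk_nvme_qpair_process_completions() until the callback fires. It uses the public spdk_nvme_ns_cmd_read() API from spdk/nvme.h (outside this file) and assumes ns and qpair come from an attached controller and buf is a DMA-able buffer of at least one block.

#include "spdk/nvme.h"
#include <stdbool.h>

struct read_ctx { bool done; };

static void
read_done(void *arg, const struct spdk_nvme_cpl *cpl)
{
	struct read_ctx *ctx = arg;

	ctx->done = true;
	(void)cpl;
}

/* Synchronously read one block from LBA 0 by polling the qpair. */
static int
read_lba0_sync(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buf)
{
	struct read_ctx ctx = { .done = false };
	int rc;

	rc = spdk_nvme_ns_cmd_read(ns, qpair, buf, 0 /* lba */, 1 /* lba_count */,
				   read_done, &ctx, 0 /* io_flags */);
	if (rc != 0) {
		return rc;
	}

	while (!ctx.done) {
		/* 0 = reap as many completions as the transport allows per call. */
		rc = spdk_nvme_qpair_process_completions(qpair, 0);
		if (rc < 0) {
			return rc;	/* e.g. -ENXIO while the controller resets */
		}
	}
	return 0;
}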
+ */ + nvme_qpair_resubmit_requests(qpair, ret); + + return ret; +} + +spdk_nvme_qp_failure_reason +spdk_nvme_qpair_get_failure_reason(struct spdk_nvme_qpair *qpair) +{ + return qpair->transport_failure_reason; +} + +int +nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id, + struct spdk_nvme_ctrlr *ctrlr, + enum spdk_nvme_qprio qprio, + uint32_t num_requests) +{ + size_t req_size_padded; + uint32_t i; + + qpair->id = id; + qpair->qprio = qprio; + + qpair->in_completion_context = 0; + qpair->delete_after_completion_context = 0; + qpair->no_deletion_notification_needed = 0; + + qpair->ctrlr = ctrlr; + qpair->trtype = ctrlr->trid.trtype; + + STAILQ_INIT(&qpair->free_req); + STAILQ_INIT(&qpair->queued_req); + STAILQ_INIT(&qpair->aborting_queued_req); + TAILQ_INIT(&qpair->err_cmd_head); + STAILQ_INIT(&qpair->err_req_head); + + req_size_padded = (sizeof(struct nvme_request) + 63) & ~(size_t)63; + + qpair->req_buf = spdk_zmalloc(req_size_padded * num_requests, 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (qpair->req_buf == NULL) { + SPDK_ERRLOG("no memory to allocate qpair(cntlid:0x%x sqid:%d) req_buf with %d request\n", + ctrlr->cntlid, qpair->id, num_requests); + return -ENOMEM; + } + + for (i = 0; i < num_requests; i++) { + struct nvme_request *req = qpair->req_buf + i * req_size_padded; + + req->qpair = qpair; + STAILQ_INSERT_HEAD(&qpair->free_req, req, stailq); + } + + return 0; +} + +void +nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->err_req_head)) { + req = STAILQ_FIRST(&qpair->err_req_head); + STAILQ_REMOVE_HEAD(&qpair->err_req_head, stailq); + nvme_qpair_manual_complete_request(qpair, req, + req->cpl.status.sct, + req->cpl.status.sc, 0, true); + } +} + +void +nvme_qpair_deinit(struct spdk_nvme_qpair *qpair) +{ + struct nvme_error_cmd *cmd, *entry; + + _nvme_qpair_abort_queued_reqs(qpair, 1); + _nvme_qpair_complete_abort_queued_reqs(qpair); + nvme_qpair_complete_error_reqs(qpair); + + TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) { + TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link); + spdk_free(cmd); + } + + spdk_free(qpair->req_buf); +} + +static inline int +_nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + int rc = 0; + struct nvme_request *child_req, *tmp; + struct nvme_error_cmd *cmd; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + bool child_req_failed = false; + + nvme_qpair_check_enabled(qpair); + + if (req->num_children) { + /* + * This is a split (parent) request. Submit all of the children but not the parent + * request itself, since the parent is the original unsplit request. + */ + TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) { + if (spdk_likely(!child_req_failed)) { + rc = nvme_qpair_submit_request(qpair, child_req); + if (spdk_unlikely(rc != 0)) { + child_req_failed = true; + } + } else { /* free remaining child_reqs since one child_req fails */ + nvme_request_remove_child(req, child_req); + nvme_request_free_children(child_req); + nvme_free_request(child_req); + } + } + + if (spdk_unlikely(child_req_failed)) { + /* part of children requests have been submitted, + * return success since we must wait for those children to complete, + * but set the parent request to failure. 
+ */ + if (req->num_children) { + req->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return 0; + } + goto error; + } + + return rc; + } + + /* queue those requests which matches with opcode in err_cmd list */ + if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head))) { + TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) { + if (!cmd->do_not_submit) { + continue; + } + + if ((cmd->opc == req->cmd.opc) && cmd->err_count) { + /* add to error request list and set cpl */ + req->timeout_tsc = cmd->timeout_tsc; + req->submit_tick = spdk_get_ticks(); + req->cpl.status.sct = cmd->status.sct; + req->cpl.status.sc = cmd->status.sc; + STAILQ_INSERT_TAIL(&qpair->err_req_head, req, stailq); + cmd->err_count--; + return 0; + } + } + } + + if (spdk_unlikely(ctrlr->is_failed)) { + rc = -ENXIO; + goto error; + } + + /* assign submit_tick before submitting req to specific transport */ + if (spdk_unlikely(ctrlr->timeout_enabled)) { + if (req->submit_tick == 0) { /* req submitted for the first time */ + req->submit_tick = spdk_get_ticks(); + req->timed_out = false; + } + } else { + req->submit_tick = 0; + } + + /* Allow two cases: + * 1. NVMe qpair is enabled. + * 2. Always allow fabrics commands through - these get + * the controller out of reset state. + */ + if (spdk_likely(nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED) || + (req->cmd.opc == SPDK_NVME_OPC_FABRIC && + nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING)) { + rc = nvme_transport_qpair_submit_request(qpair, req); + } else { + /* The controller is being reset - queue this request and + * submit it later when the reset is completed. + */ + return -EAGAIN; + } + + if (spdk_likely(rc == 0)) { + req->queued = false; + return 0; + } + + if (rc == -EAGAIN) { + return -EAGAIN; + } + +error: + if (req->parent != NULL) { + nvme_request_remove_child(req->parent, req); + } + + /* The request is from queued_req list we should trigger the callback from caller */ + if (spdk_unlikely(req->queued)) { + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, true, true); + return rc; + } + + nvme_free_request(req); + + return rc; +} + +int +nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + int rc; + + /* This prevents us from entering an infinite loop when freeing queued I/O in disconnect. */ + if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING || + nvme_qpair_get_state(qpair) == NVME_QPAIR_DESTROYING)) { + if (req->parent != NULL) { + nvme_request_remove_child(req->parent, req); + } + nvme_free_request(req); + return -ENXIO; + } + + if (spdk_unlikely(!STAILQ_EMPTY(&qpair->queued_req) && req->num_children == 0)) { + /* + * requests that have no children should be sent to the transport after all + * currently queued requests. Requests with chilren will be split and go back + * through this path. + */ + STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); + req->queued = true; + return 0; + } + + rc = _nvme_qpair_submit_request(qpair, req); + if (rc == -EAGAIN) { + STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); + req->queued = true; + rc = 0; + } + + return rc; +} + +static int +nvme_qpair_resubmit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + int rc; + + /* + * We should never have a request with children on the queue. + * This is necessary to preserve the 1:1 relationship between + * completions and resubmissions. 
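+ * Split parents are never placed on queued_req (only their children, or
+ * plain unsplit requests, are), so resubmitting one queued request per
+ * reaped completion keeps the accounting balanced.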
+ */ + assert(req->num_children == 0); + assert(req->queued); + rc = _nvme_qpair_submit_request(qpair, req); + if (spdk_unlikely(rc == -EAGAIN)) { + STAILQ_INSERT_HEAD(&qpair->queued_req, req, stailq); + } + + return rc; +} + +void +nvme_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + nvme_qpair_complete_error_reqs(qpair); + _nvme_qpair_abort_queued_reqs(qpair, dnr); + _nvme_qpair_complete_abort_queued_reqs(qpair); + nvme_transport_qpair_abort_reqs(qpair, dnr); +} + +int +spdk_nvme_qpair_add_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + uint8_t opc, bool do_not_submit, + uint64_t timeout_in_us, + uint32_t err_count, + uint8_t sct, uint8_t sc) +{ + struct nvme_error_cmd *entry, *cmd = NULL; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + TAILQ_FOREACH(entry, &qpair->err_cmd_head, link) { + if (entry->opc == opc) { + cmd = entry; + break; + } + } + + if (cmd == NULL) { + cmd = spdk_zmalloc(sizeof(*cmd), 64, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!cmd) { + return -ENOMEM; + } + TAILQ_INSERT_TAIL(&qpair->err_cmd_head, cmd, link); + } + + cmd->do_not_submit = do_not_submit; + cmd->err_count = err_count; + cmd->timeout_tsc = timeout_in_us * spdk_get_ticks_hz() / 1000000ULL; + cmd->opc = opc; + cmd->status.sct = sct; + cmd->status.sc = sc; + + return 0; +} + +void +spdk_nvme_qpair_remove_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + uint8_t opc) +{ + struct nvme_error_cmd *cmd, *entry; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) { + if (cmd->opc == opc) { + TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link); + spdk_free(cmd); + return; + } + } + + return; +} diff --git a/src/spdk/lib/nvme/nvme_quirks.c b/src/spdk/lib/nvme/nvme_quirks.c new file mode 100644 index 000000000..38c8f0eae --- /dev/null +++ b/src/spdk/lib/nvme/nvme_quirks.c @@ -0,0 +1,155 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" + +struct nvme_quirk { + struct spdk_pci_id id; + uint64_t flags; +}; + +static const struct nvme_quirk nvme_quirks[] = { + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0953, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_DELAY_BEFORE_INIT | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A53, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_DELAY_BEFORE_INIT | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A54, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A55, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_MEMBLAZE, 0x0540, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_SAMSUNG, 0xa821, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_SAMSUNG, 0xa822, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_VIRTUALBOX, 0x4e56, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x5845, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_IDENTIFY_CNS | + NVME_INTEL_QUIRK_NO_LOG_PAGES | + NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_CNEXLABS, 0x1f1f, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_IDENTIFY_CNS | + NVME_QUIRK_OCSSD + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_VMWARE, 0x07f0, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_SHST_COMPLETE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x2700, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_OACS_SECURITY + }, + { {0x000000, 0x0000, 0x0000, 0x0000, 0x0000}, 0} +}; + +/* Compare each field. 
SPDK_PCI_ANY_ID in s1 matches everything */ +static bool +pci_id_match(const struct spdk_pci_id *s1, const struct spdk_pci_id *s2) +{ + if ((s1->class_id == SPDK_PCI_CLASS_ANY_ID || s1->class_id == s2->class_id) && + (s1->vendor_id == SPDK_PCI_ANY_ID || s1->vendor_id == s2->vendor_id) && + (s1->device_id == SPDK_PCI_ANY_ID || s1->device_id == s2->device_id) && + (s1->subvendor_id == SPDK_PCI_ANY_ID || s1->subvendor_id == s2->subvendor_id) && + (s1->subdevice_id == SPDK_PCI_ANY_ID || s1->subdevice_id == s2->subdevice_id)) { + return true; + } + return false; +} + +uint64_t +nvme_get_quirks(const struct spdk_pci_id *id) +{ + const struct nvme_quirk *quirk = nvme_quirks; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Searching for %04x:%04x [%04x:%04x]...\n", + id->vendor_id, id->device_id, + id->subvendor_id, id->subdevice_id); + + while (quirk->id.vendor_id) { + if (pci_id_match(&quirk->id, id)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Matched quirk %04x:%04x [%04x:%04x]:\n", + quirk->id.vendor_id, quirk->id.device_id, + quirk->id.subvendor_id, quirk->id.subdevice_id); + +#define PRINT_QUIRK(quirk_flag) \ + do { \ + if (quirk->flags & (quirk_flag)) { \ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Quirk enabled: %s\n", #quirk_flag); \ + } \ + } while (0) + + PRINT_QUIRK(NVME_INTEL_QUIRK_READ_LATENCY); + PRINT_QUIRK(NVME_INTEL_QUIRK_WRITE_LATENCY); + PRINT_QUIRK(NVME_QUIRK_DELAY_BEFORE_CHK_RDY); + PRINT_QUIRK(NVME_INTEL_QUIRK_STRIPING); + PRINT_QUIRK(NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC); + PRINT_QUIRK(NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE); + PRINT_QUIRK(NVME_QUIRK_IDENTIFY_CNS); + PRINT_QUIRK(NVME_QUIRK_OCSSD); + + return quirk->flags; + } + quirk++; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "No quirks found.\n"); + + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_rdma.c b/src/spdk/lib/nvme/nvme_rdma.c new file mode 100644 index 000000000..84537c4a1 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_rdma.c @@ -0,0 +1,2852 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over RDMA transport + */ + +#include "spdk/stdinc.h" + +#include "spdk/assert.h" +#include "spdk/log.h" +#include "spdk/trace.h" +#include "spdk/queue.h" +#include "spdk/nvme.h" +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "spdk/endian.h" +#include "spdk/likely.h" +#include "spdk/config.h" + +#include "nvme_internal.h" +#include "spdk_internal/rdma.h" + +#define NVME_RDMA_TIME_OUT_IN_MS 2000 +#define NVME_RDMA_RW_BUFFER_SIZE 131072 + +/* + * NVME RDMA qpair Resource Defaults + */ +#define NVME_RDMA_DEFAULT_TX_SGE 2 +#define NVME_RDMA_DEFAULT_RX_SGE 1 + +/* Max number of NVMe-oF SGL descriptors supported by the host */ +#define NVME_RDMA_MAX_SGL_DESCRIPTORS 16 + +/* number of STAILQ entries for holding pending RDMA CM events. */ +#define NVME_RDMA_NUM_CM_EVENTS 256 + +/* CM event processing timeout */ +#define NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US 1000000 + +/* The default size for a shared rdma completion queue. */ +#define DEFAULT_NVME_RDMA_CQ_SIZE 4096 + +/* + * In the special case of a stale connection we don't expose a mechanism + * for the user to retry the connection so we need to handle it internally. + */ +#define NVME_RDMA_STALE_CONN_RETRY_MAX 5 +#define NVME_RDMA_STALE_CONN_RETRY_DELAY_US 10000 + +/* + * Maximum value of transport_retry_count used by RDMA controller + */ +#define NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT 7 + +/* + * Maximum value of transport_ack_timeout used by RDMA controller + */ +#define NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT 31 + +/* + * Number of poller cycles to keep a pointer to destroyed qpairs + * in the poll group. + */ +#define NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES 50 + +/* + * The max length of keyed SGL data block (3 bytes) + */ +#define NVME_RDMA_MAX_KEYED_SGL_LENGTH ((1u << 24u) - 1) + +#define WC_PER_QPAIR(queue_depth) (queue_depth * 2) + +enum nvme_rdma_wr_type { + RDMA_WR_TYPE_RECV, + RDMA_WR_TYPE_SEND, +}; + +struct nvme_rdma_wr { + /* Using this instead of the enum allows this struct to only occupy one byte. */ + uint8_t type; +}; + +struct spdk_nvmf_cmd { + struct spdk_nvme_cmd cmd; + struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS]; +}; + +struct spdk_nvme_rdma_hooks g_nvme_hooks = {}; + +/* Mapping from virtual address to ibv_mr pointer for a protection domain */ +struct spdk_nvme_rdma_mr_map { + struct ibv_pd *pd; + struct spdk_mem_map *map; + uint64_t ref; + LIST_ENTRY(spdk_nvme_rdma_mr_map) link; +}; + +/* STAILQ wrapper for cm events. 
*/ +struct nvme_rdma_cm_event_entry { + struct rdma_cm_event *evt; + STAILQ_ENTRY(nvme_rdma_cm_event_entry) link; +}; + +/* NVMe RDMA transport extensions for spdk_nvme_ctrlr */ +struct nvme_rdma_ctrlr { + struct spdk_nvme_ctrlr ctrlr; + + struct ibv_pd *pd; + + uint16_t max_sge; + + struct rdma_event_channel *cm_channel; + + STAILQ_HEAD(, nvme_rdma_cm_event_entry) pending_cm_events; + + STAILQ_HEAD(, nvme_rdma_cm_event_entry) free_cm_events; + + struct nvme_rdma_cm_event_entry *cm_events; +}; + +struct nvme_rdma_destroyed_qpair { + struct nvme_rdma_qpair *destroyed_qpair_tracker; + uint32_t completed_cycles; + STAILQ_ENTRY(nvme_rdma_destroyed_qpair) link; +}; + +struct nvme_rdma_poller { + struct ibv_context *device; + struct ibv_cq *cq; + int required_num_wc; + int current_num_wc; + STAILQ_ENTRY(nvme_rdma_poller) link; +}; + +struct nvme_rdma_poll_group { + struct spdk_nvme_transport_poll_group group; + STAILQ_HEAD(, nvme_rdma_poller) pollers; + int num_pollers; + STAILQ_HEAD(, nvme_rdma_destroyed_qpair) destroyed_qpairs; +}; + +struct spdk_nvme_send_wr_list { + struct ibv_send_wr *first; + struct ibv_send_wr *last; +}; + +struct spdk_nvme_recv_wr_list { + struct ibv_recv_wr *first; + struct ibv_recv_wr *last; +}; + +/* Memory regions */ +union nvme_rdma_mr { + struct ibv_mr *mr; + uint64_t key; +}; + +/* NVMe RDMA qpair extensions for spdk_nvme_qpair */ +struct nvme_rdma_qpair { + struct spdk_nvme_qpair qpair; + + struct spdk_rdma_qp *rdma_qp; + struct rdma_cm_id *cm_id; + struct ibv_cq *cq; + + struct spdk_nvme_rdma_req *rdma_reqs; + + uint32_t max_send_sge; + + uint32_t max_recv_sge; + + uint16_t num_entries; + + bool delay_cmd_submit; + + bool poll_group_disconnect_in_progress; + + uint32_t num_completions; + + /* Parallel arrays of response buffers + response SGLs of size num_entries */ + struct ibv_sge *rsp_sgls; + struct spdk_nvme_rdma_rsp *rsps; + + struct ibv_recv_wr *rsp_recv_wrs; + + struct spdk_nvme_send_wr_list sends_to_post; + struct spdk_nvme_recv_wr_list recvs_to_post; + + /* Memory region describing all rsps for this qpair */ + union nvme_rdma_mr rsp_mr; + + /* + * Array of num_entries NVMe commands registered as RDMA message buffers. + * Indexed by rdma_req->id. + */ + struct spdk_nvmf_cmd *cmds; + + /* Memory region describing all cmds for this qpair */ + union nvme_rdma_mr cmd_mr; + + struct spdk_nvme_rdma_mr_map *mr_map; + + TAILQ_HEAD(, spdk_nvme_rdma_req) free_reqs; + TAILQ_HEAD(, spdk_nvme_rdma_req) outstanding_reqs; + + /* Counts of outstanding send and recv objects */ + uint16_t current_num_recvs; + uint16_t current_num_sends; + + /* Placed at the end of the struct since it is not used frequently */ + struct rdma_cm_event *evt; + + /* Used by poll group to keep the qpair around until it is ready to remove it. */ + bool defer_deletion_to_pg; +}; + +enum NVME_RDMA_COMPLETION_FLAGS { + NVME_RDMA_SEND_COMPLETED = 1u << 0, + NVME_RDMA_RECV_COMPLETED = 1u << 1, +}; + +struct spdk_nvme_rdma_req { + uint16_t id; + uint16_t completion_flags: 2; + uint16_t reserved: 14; + /* if completion of RDMA_RECV received before RDMA_SEND, we will complete nvme request + * during processing of RDMA_SEND. 
To complete the request we must know the index + * of nvme_cpl received in RDMA_RECV, so store it in this field */ + uint16_t rsp_idx; + + struct nvme_rdma_wr rdma_wr; + + struct ibv_send_wr send_wr; + + struct nvme_request *req; + + struct ibv_sge send_sgl[NVME_RDMA_DEFAULT_TX_SGE]; + + TAILQ_ENTRY(spdk_nvme_rdma_req) link; +}; + +enum nvme_rdma_key_type { + NVME_RDMA_MR_RKEY, + NVME_RDMA_MR_LKEY +}; + +struct spdk_nvme_rdma_rsp { + struct spdk_nvme_cpl cpl; + struct nvme_rdma_qpair *rqpair; + uint16_t idx; + struct nvme_rdma_wr rdma_wr; +}; + +static const char *rdma_cm_event_str[] = { + "RDMA_CM_EVENT_ADDR_RESOLVED", + "RDMA_CM_EVENT_ADDR_ERROR", + "RDMA_CM_EVENT_ROUTE_RESOLVED", + "RDMA_CM_EVENT_ROUTE_ERROR", + "RDMA_CM_EVENT_CONNECT_REQUEST", + "RDMA_CM_EVENT_CONNECT_RESPONSE", + "RDMA_CM_EVENT_CONNECT_ERROR", + "RDMA_CM_EVENT_UNREACHABLE", + "RDMA_CM_EVENT_REJECTED", + "RDMA_CM_EVENT_ESTABLISHED", + "RDMA_CM_EVENT_DISCONNECTED", + "RDMA_CM_EVENT_DEVICE_REMOVAL", + "RDMA_CM_EVENT_MULTICAST_JOIN", + "RDMA_CM_EVENT_MULTICAST_ERROR", + "RDMA_CM_EVENT_ADDR_CHANGE", + "RDMA_CM_EVENT_TIMEWAIT_EXIT" +}; + +static LIST_HEAD(, spdk_nvme_rdma_mr_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps); +static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER; +struct nvme_rdma_qpair *nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group, + uint32_t qp_num); + +static inline void * +nvme_rdma_calloc(size_t nmemb, size_t size) +{ + if (!g_nvme_hooks.get_rkey) { + return calloc(nmemb, size); + } else { + return spdk_zmalloc(nmemb * size, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + } +} + +static inline void +nvme_rdma_free(void *buf) +{ + if (!g_nvme_hooks.get_rkey) { + free(buf); + } else { + spdk_free(buf); + } +} + +static int nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); + +static inline struct nvme_rdma_qpair * +nvme_rdma_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_RDMA); + return SPDK_CONTAINEROF(qpair, struct nvme_rdma_qpair, qpair); +} + +static inline struct nvme_rdma_poll_group * +nvme_rdma_poll_group(struct spdk_nvme_transport_poll_group *group) +{ + return (SPDK_CONTAINEROF(group, struct nvme_rdma_poll_group, group)); +} + +static inline struct nvme_rdma_ctrlr * +nvme_rdma_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA); + return SPDK_CONTAINEROF(ctrlr, struct nvme_rdma_ctrlr, ctrlr); +} + +static struct spdk_nvme_rdma_req * +nvme_rdma_req_get(struct nvme_rdma_qpair *rqpair) +{ + struct spdk_nvme_rdma_req *rdma_req; + + rdma_req = TAILQ_FIRST(&rqpair->free_reqs); + if (rdma_req) { + TAILQ_REMOVE(&rqpair->free_reqs, rdma_req, link); + TAILQ_INSERT_TAIL(&rqpair->outstanding_reqs, rdma_req, link); + } + + return rdma_req; +} + +static void +nvme_rdma_req_put(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req) +{ + rdma_req->completion_flags = 0; + rdma_req->req = NULL; + TAILQ_INSERT_HEAD(&rqpair->free_reqs, rdma_req, link); +} + +static void +nvme_rdma_req_complete(struct spdk_nvme_rdma_req *rdma_req, + struct spdk_nvme_cpl *rsp) +{ + struct nvme_request *req = rdma_req->req; + struct nvme_rdma_qpair *rqpair; + + assert(req != NULL); + + rqpair = nvme_rdma_qpair(req->qpair); + TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link); + + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp); + nvme_free_request(req); +} + +static const char * 
+nvme_rdma_cm_event_str_get(uint32_t event) +{ + if (event < SPDK_COUNTOF(rdma_cm_event_str)) { + return rdma_cm_event_str[event]; + } else { + return "Undefined"; + } +} + + +static int +nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair) +{ + struct rdma_cm_event *event = rqpair->evt; + struct spdk_nvmf_rdma_accept_private_data *accept_data; + int rc = 0; + + if (event) { + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + case RDMA_CM_EVENT_ROUTE_ERROR: + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + break; + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + rc = spdk_rdma_qp_complete_connect(rqpair->rdma_qp); + /* fall through */ + case RDMA_CM_EVENT_ESTABLISHED: + accept_data = (struct spdk_nvmf_rdma_accept_private_data *)event->param.conn.private_data; + if (accept_data == NULL) { + rc = -1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested queue depth %d. Actually got queue depth %d.\n", + rqpair->num_entries, accept_data->crqsize); + rqpair->num_entries = spdk_min(rqpair->num_entries, accept_data->crqsize); + } + break; + case RDMA_CM_EVENT_DISCONNECTED: + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE; + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + break; + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + break; + default: + SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); + break; + } + rqpair->evt = NULL; + rdma_ack_cm_event(event); + } + + return rc; +} + +/* + * This function must be called under the nvme controller's lock + * because it touches global controller variables. The lock is taken + * by the generic transport code before invoking a few of the functions + * in this file: nvme_rdma_ctrlr_connect_qpair, nvme_rdma_ctrlr_delete_io_qpair, + * and conditionally nvme_rdma_qpair_process_completions when it is calling + * completions on the admin qpair. When adding a new call to this function, please + * verify that it is in a situation where it falls under the lock. 
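+ *
+ * A hypothetical new caller would follow the same pattern, e.g. with the
+ * nvme_robust_mutex helpers from nvme_internal.h:
+ *
+ *   nvme_robust_mutex_lock(&rctrlr->ctrlr.ctrlr_lock);
+ *   rc = nvme_rdma_poll_events(rctrlr);
+ *   nvme_robust_mutex_unlock(&rctrlr->ctrlr.ctrlr_lock);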
+ */ +static int +nvme_rdma_poll_events(struct nvme_rdma_ctrlr *rctrlr) +{ + struct nvme_rdma_cm_event_entry *entry, *tmp; + struct nvme_rdma_qpair *event_qpair; + struct rdma_cm_event *event; + struct rdma_event_channel *channel = rctrlr->cm_channel; + + STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) { + event_qpair = nvme_rdma_qpair(entry->evt->id->context); + if (event_qpair->evt == NULL) { + event_qpair->evt = entry->evt; + STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link); + STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link); + } + } + + while (rdma_get_cm_event(channel, &event) == 0) { + event_qpair = nvme_rdma_qpair(event->id->context); + if (event_qpair->evt == NULL) { + event_qpair->evt = event; + } else { + assert(rctrlr == nvme_rdma_ctrlr(event_qpair->qpair.ctrlr)); + entry = STAILQ_FIRST(&rctrlr->free_cm_events); + if (entry == NULL) { + rdma_ack_cm_event(event); + return -ENOMEM; + } + STAILQ_REMOVE(&rctrlr->free_cm_events, entry, nvme_rdma_cm_event_entry, link); + entry->evt = event; + STAILQ_INSERT_TAIL(&rctrlr->pending_cm_events, entry, link); + } + } + + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } else { + return errno; + } +} + +static int +nvme_rdma_validate_cm_event(enum rdma_cm_event_type expected_evt_type, + struct rdma_cm_event *reaped_evt) +{ + int rc = -EBADMSG; + + if (expected_evt_type == reaped_evt->event) { + return 0; + } + + switch (expected_evt_type) { + case RDMA_CM_EVENT_ESTABLISHED: + /* + * There is an enum ib_cm_rej_reason in the kernel headers that sets 10 as + * IB_CM_REJ_STALE_CONN. I can't find the corresponding userspace but we get + * the same values here. + */ + if (reaped_evt->event == RDMA_CM_EVENT_REJECTED && reaped_evt->status == 10) { + rc = -ESTALE; + } else if (reaped_evt->event == RDMA_CM_EVENT_CONNECT_RESPONSE) { + /* + * If we are using a qpair which is not created using rdma cm API + * then we will receive RDMA_CM_EVENT_CONNECT_RESPONSE instead of + * RDMA_CM_EVENT_ESTABLISHED. + */ + return 0; + } + break; + default: + break; + } + + SPDK_ERRLOG("Expected %s but received %s (%d) from CM event channel (status = %d)\n", + nvme_rdma_cm_event_str_get(expected_evt_type), + nvme_rdma_cm_event_str_get(reaped_evt->event), reaped_evt->event, + reaped_evt->status); + return rc; +} + +static int +nvme_rdma_process_event(struct nvme_rdma_qpair *rqpair, + struct rdma_event_channel *channel, + enum rdma_cm_event_type evt) +{ + struct nvme_rdma_ctrlr *rctrlr; + uint64_t timeout_ticks; + int rc = 0, rc2; + + if (rqpair->evt != NULL) { + rc = nvme_rdma_qpair_process_cm_event(rqpair); + if (rc) { + return rc; + } + } + + timeout_ticks = (NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US * spdk_get_ticks_hz()) / SPDK_SEC_TO_USEC + + spdk_get_ticks(); + rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr); + assert(rctrlr != NULL); + + while (!rqpair->evt && spdk_get_ticks() < timeout_ticks && rc == 0) { + rc = nvme_rdma_poll_events(rctrlr); + } + + if (rc) { + return rc; + } + + if (rqpair->evt == NULL) { + return -EADDRNOTAVAIL; + } + + rc = nvme_rdma_validate_cm_event(evt, rqpair->evt); + + rc2 = nvme_rdma_qpair_process_cm_event(rqpair); + /* bad message takes precedence over the other error codes from processing the event. */ + return rc == 0 ? 
rc2 : rc; +} + +static int +nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair) +{ + int rc; + struct spdk_rdma_qp_init_attr attr = {}; + struct ibv_device_attr dev_attr; + struct nvme_rdma_ctrlr *rctrlr; + + rc = ibv_query_device(rqpair->cm_id->verbs, &dev_attr); + if (rc != 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + return -1; + } + + if (rqpair->qpair.poll_group) { + assert(!rqpair->cq); + rc = nvme_poll_group_connect_qpair(&rqpair->qpair); + if (rc) { + SPDK_ERRLOG("Unable to activate the rdmaqpair.\n"); + return -1; + } + assert(rqpair->cq); + } else { + rqpair->cq = ibv_create_cq(rqpair->cm_id->verbs, rqpair->num_entries * 2, rqpair, NULL, 0); + if (!rqpair->cq) { + SPDK_ERRLOG("Unable to create completion queue: errno %d: %s\n", errno, spdk_strerror(errno)); + return -1; + } + } + + rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr); + if (g_nvme_hooks.get_ibv_pd) { + rctrlr->pd = g_nvme_hooks.get_ibv_pd(&rctrlr->ctrlr.trid, rqpair->cm_id->verbs); + } else { + rctrlr->pd = NULL; + } + + attr.pd = rctrlr->pd; + attr.send_cq = rqpair->cq; + attr.recv_cq = rqpair->cq; + attr.cap.max_send_wr = rqpair->num_entries; /* SEND operations */ + attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */ + attr.cap.max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, dev_attr.max_sge); + attr.cap.max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, dev_attr.max_sge); + + rqpair->rdma_qp = spdk_rdma_qp_create(rqpair->cm_id, &attr); + + if (!rqpair->rdma_qp) { + return -1; + } + + /* ibv_create_qp will change the values in attr.cap. Make sure we store the proper value. */ + rqpair->max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, attr.cap.max_send_sge); + rqpair->max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, attr.cap.max_recv_sge); + rqpair->current_num_recvs = 0; + rqpair->current_num_sends = 0; + + rctrlr->pd = rqpair->rdma_qp->qp->pd; + + rqpair->cm_id->context = &rqpair->qpair; + + return 0; +} + +static inline int +nvme_rdma_qpair_submit_sends(struct nvme_rdma_qpair *rqpair) +{ + struct ibv_send_wr *bad_send_wr; + int rc; + + rc = spdk_rdma_qp_flush_send_wrs(rqpair->rdma_qp, &bad_send_wr); + + if (spdk_unlikely(rc)) { + SPDK_ERRLOG("Failed to post WRs on send queue, errno %d (%s), bad_wr %p\n", + rc, spdk_strerror(rc), bad_send_wr); + while (bad_send_wr != NULL) { + assert(rqpair->current_num_sends > 0); + rqpair->current_num_sends--; + bad_send_wr = bad_send_wr->next; + } + return rc; + } + + return 0; +} + +static inline int +nvme_rdma_qpair_submit_recvs(struct nvme_rdma_qpair *rqpair) +{ + struct ibv_recv_wr *bad_recv_wr; + int rc = 0; + + if (rqpair->recvs_to_post.first) { + rc = ibv_post_recv(rqpair->rdma_qp->qp, rqpair->recvs_to_post.first, &bad_recv_wr); + if (spdk_unlikely(rc)) { + SPDK_ERRLOG("Failed to post WRs on receive queue, errno %d (%s), bad_wr %p\n", + rc, spdk_strerror(rc), bad_recv_wr); + while (bad_recv_wr != NULL) { + assert(rqpair->current_num_sends > 0); + rqpair->current_num_recvs--; + bad_recv_wr = bad_recv_wr->next; + } + } + + rqpair->recvs_to_post.first = NULL; + } + return rc; +} + +/* Append the given send wr structure to the qpair's outstanding sends list. */ +/* This function accepts only a single wr. 
*/ +static inline int +nvme_rdma_qpair_queue_send_wr(struct nvme_rdma_qpair *rqpair, struct ibv_send_wr *wr) +{ + assert(wr->next == NULL); + + assert(rqpair->current_num_sends < rqpair->num_entries); + + rqpair->current_num_sends++; + spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, wr); + + if (!rqpair->delay_cmd_submit) { + return nvme_rdma_qpair_submit_sends(rqpair); + } + + return 0; +} + +/* Append the given recv wr structure to the qpair's outstanding recvs list. */ +/* This function accepts only a single wr. */ +static inline int +nvme_rdma_qpair_queue_recv_wr(struct nvme_rdma_qpair *rqpair, struct ibv_recv_wr *wr) +{ + + assert(wr->next == NULL); + assert(rqpair->current_num_recvs < rqpair->num_entries); + + rqpair->current_num_recvs++; + if (rqpair->recvs_to_post.first == NULL) { + rqpair->recvs_to_post.first = wr; + } else { + rqpair->recvs_to_post.last->next = wr; + } + + rqpair->recvs_to_post.last = wr; + + if (!rqpair->delay_cmd_submit) { + return nvme_rdma_qpair_submit_recvs(rqpair); + } + + return 0; +} + +#define nvme_rdma_trace_ibv_sge(sg_list) \ + if (sg_list) { \ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "local addr %p length 0x%x lkey 0x%x\n", \ + (void *)(sg_list)->addr, (sg_list)->length, (sg_list)->lkey); \ + } + +static int +nvme_rdma_post_recv(struct nvme_rdma_qpair *rqpair, uint16_t rsp_idx) +{ + struct ibv_recv_wr *wr; + + wr = &rqpair->rsp_recv_wrs[rsp_idx]; + wr->next = NULL; + nvme_rdma_trace_ibv_sge(wr->sg_list); + return nvme_rdma_qpair_queue_recv_wr(rqpair, wr); +} + +static int +nvme_rdma_reg_mr(struct rdma_cm_id *cm_id, union nvme_rdma_mr *mr, void *mem, size_t length) +{ + if (!g_nvme_hooks.get_rkey) { + mr->mr = rdma_reg_msgs(cm_id, mem, length); + if (mr->mr == NULL) { + SPDK_ERRLOG("Unable to register mr: %s (%d)\n", + spdk_strerror(errno), errno); + return -1; + } + } else { + mr->key = g_nvme_hooks.get_rkey(cm_id->pd, mem, length); + } + + return 0; +} + +static void +nvme_rdma_dereg_mr(union nvme_rdma_mr *mr) +{ + if (!g_nvme_hooks.get_rkey) { + if (mr->mr && rdma_dereg_mr(mr->mr)) { + SPDK_ERRLOG("Unable to de-register mr\n"); + } + } else { + if (mr->key) { + g_nvme_hooks.put_rkey(mr->key); + } + } + memset(mr, 0, sizeof(*mr)); +} + +static uint32_t +nvme_rdma_mr_get_lkey(union nvme_rdma_mr *mr) +{ + uint32_t lkey; + + if (!g_nvme_hooks.get_rkey) { + lkey = mr->mr->lkey; + } else { + lkey = *((uint64_t *) mr->key); + } + + return lkey; +} + +static void +nvme_rdma_unregister_rsps(struct nvme_rdma_qpair *rqpair) +{ + nvme_rdma_dereg_mr(&rqpair->rsp_mr); +} + +static void +nvme_rdma_free_rsps(struct nvme_rdma_qpair *rqpair) +{ + nvme_rdma_free(rqpair->rsps); + rqpair->rsps = NULL; + nvme_rdma_free(rqpair->rsp_sgls); + rqpair->rsp_sgls = NULL; + nvme_rdma_free(rqpair->rsp_recv_wrs); + rqpair->rsp_recv_wrs = NULL; +} + +static int +nvme_rdma_alloc_rsps(struct nvme_rdma_qpair *rqpair) +{ + rqpair->rsps = NULL; + rqpair->rsp_recv_wrs = NULL; + + rqpair->rsp_sgls = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsp_sgls)); + if (!rqpair->rsp_sgls) { + SPDK_ERRLOG("Failed to allocate rsp_sgls\n"); + goto fail; + } + + rqpair->rsp_recv_wrs = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsp_recv_wrs)); + if (!rqpair->rsp_recv_wrs) { + SPDK_ERRLOG("Failed to allocate rsp_recv_wrs\n"); + goto fail; + } + + rqpair->rsps = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsps)); + if (!rqpair->rsps) { + SPDK_ERRLOG("can not allocate rdma rsps\n"); + goto fail; + } + + return 0; +fail: + nvme_rdma_free_rsps(rqpair); + return -ENOMEM; +} + +static int 
+nvme_rdma_register_rsps(struct nvme_rdma_qpair *rqpair) +{ + uint16_t i; + int rc; + uint32_t lkey; + + rc = nvme_rdma_reg_mr(rqpair->cm_id, &rqpair->rsp_mr, + rqpair->rsps, rqpair->num_entries * sizeof(*rqpair->rsps)); + + if (rc < 0) { + goto fail; + } + + lkey = nvme_rdma_mr_get_lkey(&rqpair->rsp_mr); + + for (i = 0; i < rqpair->num_entries; i++) { + struct ibv_sge *rsp_sgl = &rqpair->rsp_sgls[i]; + struct spdk_nvme_rdma_rsp *rsp = &rqpair->rsps[i]; + + rsp->rqpair = rqpair; + rsp->rdma_wr.type = RDMA_WR_TYPE_RECV; + rsp->idx = i; + rsp_sgl->addr = (uint64_t)&rqpair->rsps[i]; + rsp_sgl->length = sizeof(struct spdk_nvme_cpl); + rsp_sgl->lkey = lkey; + + rqpair->rsp_recv_wrs[i].wr_id = (uint64_t)&rsp->rdma_wr; + rqpair->rsp_recv_wrs[i].next = NULL; + rqpair->rsp_recv_wrs[i].sg_list = rsp_sgl; + rqpair->rsp_recv_wrs[i].num_sge = 1; + + rc = nvme_rdma_post_recv(rqpair, i); + if (rc) { + goto fail; + } + } + + rc = nvme_rdma_qpair_submit_recvs(rqpair); + if (rc) { + goto fail; + } + + return 0; + +fail: + nvme_rdma_unregister_rsps(rqpair); + return rc; +} + +static void +nvme_rdma_unregister_reqs(struct nvme_rdma_qpair *rqpair) +{ + nvme_rdma_dereg_mr(&rqpair->cmd_mr); +} + +static void +nvme_rdma_free_reqs(struct nvme_rdma_qpair *rqpair) +{ + if (!rqpair->rdma_reqs) { + return; + } + + nvme_rdma_free(rqpair->cmds); + rqpair->cmds = NULL; + + nvme_rdma_free(rqpair->rdma_reqs); + rqpair->rdma_reqs = NULL; +} + +static int +nvme_rdma_alloc_reqs(struct nvme_rdma_qpair *rqpair) +{ + uint16_t i; + + rqpair->rdma_reqs = nvme_rdma_calloc(rqpair->num_entries, sizeof(struct spdk_nvme_rdma_req)); + if (rqpair->rdma_reqs == NULL) { + SPDK_ERRLOG("Failed to allocate rdma_reqs\n"); + goto fail; + } + + rqpair->cmds = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->cmds)); + if (!rqpair->cmds) { + SPDK_ERRLOG("Failed to allocate RDMA cmds\n"); + goto fail; + } + + + TAILQ_INIT(&rqpair->free_reqs); + TAILQ_INIT(&rqpair->outstanding_reqs); + for (i = 0; i < rqpair->num_entries; i++) { + struct spdk_nvme_rdma_req *rdma_req; + struct spdk_nvmf_cmd *cmd; + + rdma_req = &rqpair->rdma_reqs[i]; + rdma_req->rdma_wr.type = RDMA_WR_TYPE_SEND; + cmd = &rqpair->cmds[i]; + + rdma_req->id = i; + + /* The first RDMA sgl element will always point + * at this data structure. Depending on whether + * an NVMe-oF SGL is required, the length of + * this element may change. 
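+ * For example, commands that need no extra in-capsule descriptors use just
+ * sizeof(struct spdk_nvme_cmd) (64 bytes), while a multi-descriptor SGL
+ * grows this element to sizeof(struct spdk_nvme_cmd) +
+ * N * sizeof(struct spdk_nvme_sgl_descriptor) for N descriptors.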
*/ + rdma_req->send_sgl[0].addr = (uint64_t)cmd; + rdma_req->send_wr.wr_id = (uint64_t)&rdma_req->rdma_wr; + rdma_req->send_wr.next = NULL; + rdma_req->send_wr.opcode = IBV_WR_SEND; + rdma_req->send_wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->send_wr.sg_list = rdma_req->send_sgl; + rdma_req->send_wr.imm_data = 0; + + TAILQ_INSERT_TAIL(&rqpair->free_reqs, rdma_req, link); + } + + return 0; +fail: + nvme_rdma_free_reqs(rqpair); + return -ENOMEM; +} + +static int +nvme_rdma_register_reqs(struct nvme_rdma_qpair *rqpair) +{ + int i; + int rc; + uint32_t lkey; + + rc = nvme_rdma_reg_mr(rqpair->cm_id, &rqpair->cmd_mr, + rqpair->cmds, rqpair->num_entries * sizeof(*rqpair->cmds)); + + if (rc < 0) { + goto fail; + } + + lkey = nvme_rdma_mr_get_lkey(&rqpair->cmd_mr); + + for (i = 0; i < rqpair->num_entries; i++) { + rqpair->rdma_reqs[i].send_sgl[0].lkey = lkey; + } + + return 0; + +fail: + nvme_rdma_unregister_reqs(rqpair); + return -ENOMEM; +} + +static int +nvme_rdma_resolve_addr(struct nvme_rdma_qpair *rqpair, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_event_channel *cm_channel) +{ + int ret; + + ret = rdma_resolve_addr(rqpair->cm_id, src_addr, dst_addr, + NVME_RDMA_TIME_OUT_IN_MS); + if (ret) { + SPDK_ERRLOG("rdma_resolve_addr, %d\n", errno); + return ret; + } + + ret = nvme_rdma_process_event(rqpair, cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED); + if (ret) { + SPDK_ERRLOG("RDMA address resolution error\n"); + return -1; + } + + if (rqpair->qpair.ctrlr->opts.transport_ack_timeout != SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED) { +#ifdef SPDK_CONFIG_RDMA_SET_ACK_TIMEOUT + uint8_t timeout = rqpair->qpair.ctrlr->opts.transport_ack_timeout; + ret = rdma_set_option(rqpair->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_ACK_TIMEOUT, + &timeout, sizeof(timeout)); + if (ret) { + SPDK_NOTICELOG("Can't apply RDMA_OPTION_ID_ACK_TIMEOUT %d, ret %d\n", timeout, ret); + } +#else + SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport_ack_timeout is not supported\n"); +#endif + } + + + ret = rdma_resolve_route(rqpair->cm_id, NVME_RDMA_TIME_OUT_IN_MS); + if (ret) { + SPDK_ERRLOG("rdma_resolve_route\n"); + return ret; + } + + ret = nvme_rdma_process_event(rqpair, cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED); + if (ret) { + SPDK_ERRLOG("RDMA route resolution error\n"); + return -1; + } + + return 0; +} + +static int +nvme_rdma_connect(struct nvme_rdma_qpair *rqpair) +{ + struct rdma_conn_param param = {}; + struct spdk_nvmf_rdma_request_private_data request_data = {}; + struct ibv_device_attr attr; + int ret; + struct spdk_nvme_ctrlr *ctrlr; + struct nvme_rdma_ctrlr *rctrlr; + + ret = ibv_query_device(rqpair->cm_id->verbs, &attr); + if (ret != 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + return ret; + } + + param.responder_resources = spdk_min(rqpair->num_entries, attr.max_qp_rd_atom); + + ctrlr = rqpair->qpair.ctrlr; + if (!ctrlr) { + return -1; + } + rctrlr = nvme_rdma_ctrlr(ctrlr); + assert(rctrlr != NULL); + + request_data.qid = rqpair->qpair.id; + request_data.hrqsize = rqpair->num_entries; + request_data.hsqsize = rqpair->num_entries - 1; + request_data.cntlid = ctrlr->cntlid; + + param.private_data = &request_data; + param.private_data_len = sizeof(request_data); + param.retry_count = ctrlr->opts.transport_retry_count; + param.rnr_retry_count = 7; + + /* Fields below are ignored by rdma cm if qpair has been + * created using rdma cm API. 
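+ * They do matter for a QP created outside the rdma cm API (the case that
+ * also yields RDMA_CM_EVENT_CONNECT_RESPONSE instead of ESTABLISHED), where
+ * rdma_connect() has to be told the qp_num explicitly.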
*/ + param.srq = 0; + param.qp_num = rqpair->rdma_qp->qp->qp_num; + + ret = rdma_connect(rqpair->cm_id, ¶m); + if (ret) { + SPDK_ERRLOG("nvme rdma connect error\n"); + return ret; + } + + ret = nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_ESTABLISHED); + if (ret == -ESTALE) { + SPDK_NOTICELOG("Received a stale connection notice during connection.\n"); + return -EAGAIN; + } else if (ret) { + SPDK_ERRLOG("RDMA connect error %d\n", ret); + return ret; + } else { + return 0; + } +} + +static int +nvme_rdma_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service) +{ + struct addrinfo *res; + struct addrinfo hints; + int ret; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + ret = getaddrinfo(addr, service, &hints, &res); + if (ret) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret); + return ret; + } + + if (res->ai_addrlen > sizeof(*sa)) { + SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen); + ret = EINVAL; + } else { + memcpy(sa, res->ai_addr, res->ai_addrlen); + } + + freeaddrinfo(res); + return ret; +} + +static int +nvme_rdma_mr_map_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct ibv_pd *pd = cb_ctx; + struct ibv_mr *mr; + int rc; + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + if (!g_nvme_hooks.get_rkey) { + mr = ibv_reg_mr(pd, vaddr, size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (mr == NULL) { + SPDK_ERRLOG("ibv_reg_mr() failed\n"); + return -EFAULT; + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); + } + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, + g_nvme_hooks.get_rkey(pd, vaddr, size)); + } + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: + if (!g_nvme_hooks.get_rkey) { + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL); + if (mr) { + ibv_dereg_mr(mr); + } + } + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); + break; + default: + SPDK_UNREACHABLE(); + } + + return rc; +} + +static int +nvme_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2) +{ + /* Two contiguous mappings will point to the same address which is the start of the RDMA MR. 
*/ + return addr_1 == addr_2; +} + +static int +nvme_rdma_register_mem(struct nvme_rdma_qpair *rqpair) +{ + struct ibv_pd *pd = rqpair->rdma_qp->qp->pd; + struct spdk_nvme_rdma_mr_map *mr_map; + const struct spdk_mem_map_ops nvme_rdma_map_ops = { + .notify_cb = nvme_rdma_mr_map_notify, + .are_contiguous = nvme_rdma_check_contiguous_entries + }; + + pthread_mutex_lock(&g_rdma_mr_maps_mutex); + + /* Look up existing mem map registration for this pd */ + LIST_FOREACH(mr_map, &g_rdma_mr_maps, link) { + if (mr_map->pd == pd) { + mr_map->ref++; + rqpair->mr_map = mr_map; + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return 0; + } + } + + mr_map = nvme_rdma_calloc(1, sizeof(*mr_map)); + if (mr_map == NULL) { + SPDK_ERRLOG("Failed to allocate mr_map\n"); + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return -1; + } + + mr_map->ref = 1; + mr_map->pd = pd; + mr_map->map = spdk_mem_map_alloc((uint64_t)NULL, &nvme_rdma_map_ops, pd); + if (mr_map->map == NULL) { + SPDK_ERRLOG("spdk_mem_map_alloc() failed\n"); + nvme_rdma_free(mr_map); + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return -1; + } + + rqpair->mr_map = mr_map; + LIST_INSERT_HEAD(&g_rdma_mr_maps, mr_map, link); + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + + return 0; +} + +static void +nvme_rdma_unregister_mem(struct nvme_rdma_qpair *rqpair) +{ + struct spdk_nvme_rdma_mr_map *mr_map; + + mr_map = rqpair->mr_map; + rqpair->mr_map = NULL; + + if (mr_map == NULL) { + return; + } + + pthread_mutex_lock(&g_rdma_mr_maps_mutex); + + assert(mr_map->ref > 0); + mr_map->ref--; + if (mr_map->ref == 0) { + LIST_REMOVE(mr_map, link); + spdk_mem_map_free(&mr_map->map); + nvme_rdma_free(mr_map); + } + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); +} + +static int +_nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct sockaddr_storage dst_addr; + struct sockaddr_storage src_addr; + bool src_addr_specified; + int rc; + struct nvme_rdma_ctrlr *rctrlr; + struct nvme_rdma_qpair *rqpair; + int family; + + rqpair = nvme_rdma_qpair(qpair); + rctrlr = nvme_rdma_ctrlr(ctrlr); + assert(rctrlr != NULL); + + switch (ctrlr->trid.adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family); + + memset(&dst_addr, 0, sizeof(dst_addr)); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid); + rc = nvme_rdma_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid); + if (rc != 0) { + SPDK_ERRLOG("dst_addr nvme_rdma_parse_addr() failed\n"); + return -1; + } + + if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) { + memset(&src_addr, 0, sizeof(src_addr)); + rc = nvme_rdma_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid); + if (rc != 0) { + SPDK_ERRLOG("src_addr nvme_rdma_parse_addr() failed\n"); + return -1; + } + src_addr_specified = true; + } else { + src_addr_specified = false; + } + + rc = rdma_create_id(rctrlr->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP); + if (rc < 0) { + SPDK_ERRLOG("rdma_create_id() failed\n"); + return -1; + } + + rc = nvme_rdma_resolve_addr(rqpair, + src_addr_specified ? 
(struct sockaddr *)&src_addr : NULL, + (struct sockaddr *)&dst_addr, rctrlr->cm_channel); + if (rc < 0) { + SPDK_ERRLOG("nvme_rdma_resolve_addr() failed\n"); + return -1; + } + + rc = nvme_rdma_qpair_init(rqpair); + if (rc < 0) { + SPDK_ERRLOG("nvme_rdma_qpair_init() failed\n"); + return -1; + } + + rc = nvme_rdma_connect(rqpair); + if (rc != 0) { + SPDK_ERRLOG("Unable to connect the rqpair\n"); + return rc; + } + + rc = nvme_rdma_register_reqs(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc) { + SPDK_ERRLOG("Unable to register rqpair RDMA requests\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests registered\n"); + + rc = nvme_rdma_register_rsps(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc < 0) { + SPDK_ERRLOG("Unable to register rqpair RDMA responses\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses registered\n"); + + rc = nvme_rdma_register_mem(rqpair); + if (rc < 0) { + SPDK_ERRLOG("Unable to register memory for RDMA\n"); + return -1; + } + + rc = nvme_fabric_qpair_connect(&rqpair->qpair, rqpair->num_entries); + if (rc < 0) { + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; + SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); + return -1; + } + + return 0; +} + +static int +nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + int rc; + int retry_count = 0; + + rc = _nvme_rdma_ctrlr_connect_qpair(ctrlr, qpair); + + /* + * -EAGAIN represents the special case where the target side still thought it was connected. + * Most NICs will fail the first connection attempt, and the NICs will clean up whatever + * state they need to. After that, subsequent connection attempts will succeed. + */ + if (rc == -EAGAIN) { + SPDK_NOTICELOG("Detected stale connection on Target side for qpid: %d\n", qpair->id); + do { + nvme_delay(NVME_RDMA_STALE_CONN_RETRY_DELAY_US); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + rc = _nvme_rdma_ctrlr_connect_qpair(ctrlr, qpair); + retry_count++; + } while (rc == -EAGAIN && retry_count < NVME_RDMA_STALE_CONN_RETRY_MAX); + } + + return rc; +} + +/* + * Build SGL describing empty payload. + */ +static int +nvme_rdma_build_null_request(struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + /* The RDMA SGL needs one element describing the NVMe command. 
*/ + rdma_req->send_wr.num_sge = 1; + + req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + req->cmd.dptr.sgl1.keyed.length = 0; + req->cmd.dptr.sgl1.keyed.key = 0; + req->cmd.dptr.sgl1.address = 0; + + return 0; +} + +static inline bool +nvme_rdma_get_key(struct spdk_mem_map *map, void *payload, uint64_t size, + enum nvme_rdma_key_type key_type, uint32_t *key) +{ + struct ibv_mr *mr; + uint64_t real_size = size; + uint32_t _key = 0; + + if (!g_nvme_hooks.get_rkey) { + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)payload, &real_size); + + if (spdk_unlikely(!mr)) { + SPDK_ERRLOG("No translation for ptr %p, size %lu\n", payload, size); + return false; + } + switch (key_type) { + case NVME_RDMA_MR_RKEY: + _key = mr->rkey; + break; + case NVME_RDMA_MR_LKEY: + _key = mr->lkey; + break; + default: + SPDK_ERRLOG("Invalid key type %d\n", key_type); + assert(0); + return false; + } + } else { + _key = spdk_mem_map_translate(map, (uint64_t)payload, &real_size); + } + + if (spdk_unlikely(real_size < size)) { + SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n"); + return false; + } + + *key = _key; + return true; +} + +/* + * Build inline SGL describing contiguous payload buffer. + */ +static int +nvme_rdma_build_contig_inline_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + uint32_t lkey = 0; + void *payload; + + payload = req->payload.contig_or_cb_arg + req->payload_offset; + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, payload, req->payload_size, + NVME_RDMA_MR_LKEY, &lkey))) { + return -1; + } + + rdma_req->send_sgl[1].lkey = lkey; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + rdma_req->send_sgl[1].addr = (uint64_t)payload; + rdma_req->send_sgl[1].length = (uint32_t)req->payload_size; + + /* The RDMA SGL contains two elements. The first describes + * the NVMe command and the second describes the data + * payload. */ + rdma_req->send_wr.num_sge = 2; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size; + /* Inline only supported for icdoff == 0 currently. This function will + * not get called for controllers with other values. */ + req->cmd.dptr.sgl1.address = (uint64_t)0; + + return 0; +} + +/* + * Build SGL describing contiguous payload buffer. 
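+ *
+ * This is the non-inline path: the payload is described by a keyed SGL
+ * (rkey, address and length) and the target transfers the data itself with
+ * RDMA READ/WRITE rather than the data travelling inside the command capsule.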
+ */ +static int +nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + void *payload = req->payload.contig_or_cb_arg + req->payload_offset; + uint32_t rkey = 0; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + if (spdk_unlikely(req->payload_size > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) { + SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n", + req->payload_size, NVME_RDMA_MAX_KEYED_SGL_LENGTH); + return -1; + } + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, payload, req->payload_size, + NVME_RDMA_MR_RKEY, &rkey))) { + return -1; + } + + req->cmd.dptr.sgl1.keyed.key = rkey; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + /* The RDMA SGL needs one element describing the NVMe command. */ + rdma_req->send_wr.num_sge = 1; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + req->cmd.dptr.sgl1.keyed.length = req->payload_size; + req->cmd.dptr.sgl1.address = (uint64_t)payload; + + return 0; +} + +/* + * Build SGL describing scattered payload buffer. + */ +static int +nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + struct spdk_nvmf_cmd *cmd = &rqpair->cmds[rdma_req->id]; + void *virt_addr; + uint32_t remaining_size; + uint32_t sge_length; + int rc, max_num_sgl, num_sgl_desc; + uint32_t rkey = 0; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + max_num_sgl = req->qpair->ctrlr->max_sges; + + remaining_size = req->payload_size; + num_sgl_desc = 0; + do { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &sge_length); + if (rc) { + return -1; + } + + sge_length = spdk_min(remaining_size, sge_length); + + if (spdk_unlikely(sge_length > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) { + SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n", + sge_length, NVME_RDMA_MAX_KEYED_SGL_LENGTH); + return -1; + } + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, virt_addr, sge_length, + NVME_RDMA_MR_RKEY, &rkey))) { + return -1; + } + + cmd->sgl[num_sgl_desc].keyed.key = rkey; + cmd->sgl[num_sgl_desc].keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + cmd->sgl[num_sgl_desc].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + cmd->sgl[num_sgl_desc].keyed.length = sge_length; + cmd->sgl[num_sgl_desc].address = (uint64_t)virt_addr; + + remaining_size -= sge_length; + num_sgl_desc++; + } while (remaining_size > 0 && num_sgl_desc < max_num_sgl); + + + /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */ + if (remaining_size > 0) { + return -1; + } + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + + /* The RDMA SGL needs one element describing some portion + * of the spdk_nvmf_cmd structure. 
*/ + rdma_req->send_wr.num_sge = 1; + + /* + * If only one SGL descriptor is required, it can be embedded directly in the command + * as a data block descriptor. + */ + if (num_sgl_desc == 1) { + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + req->cmd.dptr.sgl1.keyed.type = cmd->sgl[0].keyed.type; + req->cmd.dptr.sgl1.keyed.subtype = cmd->sgl[0].keyed.subtype; + req->cmd.dptr.sgl1.keyed.length = cmd->sgl[0].keyed.length; + req->cmd.dptr.sgl1.keyed.key = cmd->sgl[0].keyed.key; + req->cmd.dptr.sgl1.address = cmd->sgl[0].address; + } else { + /* + * Otherwise, The SGL descriptor embedded in the command must point to the list of + * SGL descriptors used to describe the operation. In that case it is a last segment descriptor. + */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + sizeof(struct + spdk_nvme_sgl_descriptor) * num_sgl_desc; + + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = num_sgl_desc * sizeof(struct spdk_nvme_sgl_descriptor); + req->cmd.dptr.sgl1.address = (uint64_t)0; + } + + return 0; +} + +/* + * Build inline SGL describing sgl payload buffer. + */ +static int +nvme_rdma_build_sgl_inline_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + uint32_t lkey = 0; + uint32_t length; + void *virt_addr; + int rc; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length); + if (rc) { + return -1; + } + + if (length < req->payload_size) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Inline SGL request split so sending separately.\n"); + return nvme_rdma_build_sgl_request(rqpair, rdma_req); + } + + if (length > req->payload_size) { + length = req->payload_size; + } + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, virt_addr, length, + NVME_RDMA_MR_LKEY, &lkey))) { + return -1; + } + + rdma_req->send_sgl[1].addr = (uint64_t)virt_addr; + rdma_req->send_sgl[1].length = length; + rdma_req->send_sgl[1].lkey = lkey; + + rdma_req->send_wr.num_sge = 2; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size; + /* Inline only supported for icdoff == 0 currently. This function will + * not get called for controllers with other values. 
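+ * With a non-zero ICDOFF the in-capsule data would have to start at an
+ * offset past the submission queue entry instead of immediately after it,
+ * which this host does not implement, so nvme_rdma_req_init() falls back to
+ * the keyed SGL path for such controllers.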
*/ + req->cmd.dptr.sgl1.address = (uint64_t)0; + + return 0; +} + +static int +nvme_rdma_req_init(struct nvme_rdma_qpair *rqpair, struct nvme_request *req, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct spdk_nvme_ctrlr *ctrlr = rqpair->qpair.ctrlr; + enum nvme_payload_type payload_type; + bool icd_supported; + int rc; + + assert(rdma_req->req == NULL); + rdma_req->req = req; + req->cmd.cid = rdma_req->id; + payload_type = nvme_payload_type(&req->payload); + /* + * Check if icdoff is non zero, to avoid interop conflicts with + * targets with non-zero icdoff. Both SPDK and the Linux kernel + * targets use icdoff = 0. For targets with non-zero icdoff, we + * will currently just not use inline data for now. + */ + icd_supported = spdk_nvme_opc_get_data_transfer(req->cmd.opc) == SPDK_NVME_DATA_HOST_TO_CONTROLLER + && req->payload_size <= ctrlr->ioccsz_bytes && ctrlr->icdoff == 0; + + if (req->payload_size == 0) { + rc = nvme_rdma_build_null_request(rdma_req); + } else if (payload_type == NVME_PAYLOAD_TYPE_CONTIG) { + if (icd_supported) { + rc = nvme_rdma_build_contig_inline_request(rqpair, rdma_req); + } else { + rc = nvme_rdma_build_contig_request(rqpair, rdma_req); + } + } else if (payload_type == NVME_PAYLOAD_TYPE_SGL) { + if (icd_supported) { + rc = nvme_rdma_build_sgl_inline_request(rqpair, rdma_req); + } else { + rc = nvme_rdma_build_sgl_request(rqpair, rdma_req); + } + } else { + rc = -1; + } + + if (rc) { + rdma_req->req = NULL; + return rc; + } + + memcpy(&rqpair->cmds[rdma_req->id], &req->cmd, sizeof(req->cmd)); + return 0; +} + +static struct spdk_nvme_qpair * +nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr, + uint16_t qid, uint32_t qsize, + enum spdk_nvme_qprio qprio, + uint32_t num_requests, + bool delay_cmd_submit) +{ + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + rqpair = nvme_rdma_calloc(1, sizeof(struct nvme_rdma_qpair)); + if (!rqpair) { + SPDK_ERRLOG("failed to get create rqpair\n"); + return NULL; + } + + rqpair->num_entries = qsize; + rqpair->delay_cmd_submit = delay_cmd_submit; + qpair = &rqpair->qpair; + rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests); + if (rc != 0) { + return NULL; + } + + rc = nvme_rdma_alloc_reqs(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc) { + SPDK_ERRLOG("Unable to allocate rqpair RDMA requests\n"); + nvme_rdma_free(rqpair); + return NULL; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests allocated\n"); + + rc = nvme_rdma_alloc_rsps(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc < 0) { + SPDK_ERRLOG("Unable to allocate rqpair RDMA responses\n"); + nvme_rdma_free_reqs(rqpair); + nvme_rdma_free(rqpair); + return NULL; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses allocated\n"); + + return qpair; +} + +static void +nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct nvme_rdma_ctrlr *rctrlr = NULL; + struct nvme_rdma_cm_event_entry *entry, *tmp; + + nvme_rdma_unregister_mem(rqpair); + nvme_rdma_unregister_reqs(rqpair); + nvme_rdma_unregister_rsps(rqpair); + + if (rqpair->evt) { + rdma_ack_cm_event(rqpair->evt); + rqpair->evt = NULL; + } + + /* + * This works because we have the controller lock both in + * this function and in the function where we add new events. 
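
The icd_supported test in nvme_rdma_req_init() above is the whole inline-data policy: only host-to-controller transfers qualify, the payload must fit the capsule size advertised through ioccsz, and the target must report an ICDOFF of zero. Restated as a standalone predicate; the enum and can_use_inline_data() are simplified stand-ins for the SPDK types.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum data_transfer {          /* simplified stand-in for spdk_nvme_data_transfer */
	DATA_NONE,
	DATA_HOST_TO_CONTROLLER,  /* writes */
	DATA_CONTROLLER_TO_HOST,  /* reads */
};

/* Only writes can carry in-capsule data, the payload has to fit in the capsule
 * (ioccsz_bytes), and the target must report ICDOFF == 0 because this initiator
 * does not pad data out to a non-zero in-capsule data offset. */
static bool
can_use_inline_data(enum data_transfer xfer, uint32_t payload_size,
		    uint32_t ioccsz_bytes, uint16_t icdoff)
{
	return xfer == DATA_HOST_TO_CONTROLLER &&
	       payload_size <= ioccsz_bytes &&
	       icdoff == 0;
}

int
main(void)
{
	printf("%d\n", can_use_inline_data(DATA_HOST_TO_CONTROLLER, 4096, 8192, 0)); /* 1 */
	printf("%d\n", can_use_inline_data(DATA_CONTROLLER_TO_HOST, 4096, 8192, 0)); /* 0 */
	printf("%d\n", can_use_inline_data(DATA_HOST_TO_CONTROLLER, 4096, 8192, 4)); /* 0 */
	return 0;
}
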
+ */ + if (qpair->ctrlr != NULL) { + rctrlr = nvme_rdma_ctrlr(qpair->ctrlr); + STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) { + if (nvme_rdma_qpair(entry->evt->id->context) == rqpair) { + STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link); + rdma_ack_cm_event(entry->evt); + STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link); + } + } + } + + if (rqpair->cm_id) { + if (rqpair->rdma_qp) { + spdk_rdma_qp_disconnect(rqpair->rdma_qp); + if (rctrlr != NULL) { + if (nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_DISCONNECTED)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Target did not respond to qpair disconnect.\n"); + } + } + spdk_rdma_qp_destroy(rqpair->rdma_qp); + rqpair->rdma_qp = NULL; + } + + rdma_destroy_id(rqpair->cm_id); + rqpair->cm_id = NULL; + } + + if (rqpair->cq) { + ibv_destroy_cq(rqpair->cq); + rqpair->cq = NULL; + } +} + +static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); + +static int +nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair; + + rqpair = nvme_rdma_qpair(qpair); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + if (rqpair->defer_deletion_to_pg) { + nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); + return 0; + } + + nvme_rdma_qpair_abort_reqs(qpair, 1); + nvme_qpair_deinit(qpair); + + nvme_rdma_free_reqs(rqpair); + nvme_rdma_free_rsps(rqpair); + nvme_rdma_free(rqpair); + + return 0; +} + +static struct spdk_nvme_qpair * +nvme_rdma_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + return nvme_rdma_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio, + opts->io_queue_requests, + opts->delay_cmd_submit); +} + +static int +nvme_rdma_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + /* do nothing here */ + return 0; +} + +static int nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); + +static struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + struct nvme_rdma_ctrlr *rctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + struct ibv_context **contexts; + struct ibv_device_attr dev_attr; + int i, flag, rc; + + rctrlr = nvme_rdma_calloc(1, sizeof(struct nvme_rdma_ctrlr)); + if (rctrlr == NULL) { + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + rctrlr->ctrlr.opts = *opts; + rctrlr->ctrlr.trid = *trid; + + if (opts->transport_retry_count > NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT) { + SPDK_NOTICELOG("transport_retry_count exceeds max value %d, use max value\n", + NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT); + rctrlr->ctrlr.opts.transport_retry_count = NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT; + } + + if (opts->transport_ack_timeout > NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT) { + SPDK_NOTICELOG("transport_ack_timeout exceeds max value %d, use max value\n", + NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT); + rctrlr->ctrlr.opts.transport_ack_timeout = NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT; + } + + contexts = rdma_get_devices(NULL); + if (contexts == NULL) { + SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); + nvme_rdma_free(rctrlr); + return NULL; + } + + i = 0; + rctrlr->max_sge = NVME_RDMA_MAX_SGL_DESCRIPTORS; + + while (contexts[i] != NULL) { + rc = ibv_query_device(contexts[i], &dev_attr); + if (rc < 0) { + 
SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + rdma_free_devices(contexts); + nvme_rdma_free(rctrlr); + return NULL; + } + rctrlr->max_sge = spdk_min(rctrlr->max_sge, (uint16_t)dev_attr.max_sge); + i++; + } + + rdma_free_devices(contexts); + + rc = nvme_ctrlr_construct(&rctrlr->ctrlr); + if (rc != 0) { + nvme_rdma_free(rctrlr); + return NULL; + } + + STAILQ_INIT(&rctrlr->pending_cm_events); + STAILQ_INIT(&rctrlr->free_cm_events); + rctrlr->cm_events = nvme_rdma_calloc(NVME_RDMA_NUM_CM_EVENTS, sizeof(*rctrlr->cm_events)); + if (rctrlr->cm_events == NULL) { + SPDK_ERRLOG("unable to allocat buffers to hold CM events.\n"); + goto destruct_ctrlr; + } + + for (i = 0; i < NVME_RDMA_NUM_CM_EVENTS; i++) { + STAILQ_INSERT_TAIL(&rctrlr->free_cm_events, &rctrlr->cm_events[i], link); + } + + rctrlr->cm_channel = rdma_create_event_channel(); + if (rctrlr->cm_channel == NULL) { + SPDK_ERRLOG("rdma_create_event_channel() failed\n"); + goto destruct_ctrlr; + } + + flag = fcntl(rctrlr->cm_channel->fd, F_GETFL); + if (fcntl(rctrlr->cm_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("Cannot set event channel to non blocking\n"); + goto destruct_ctrlr; + } + + rctrlr->ctrlr.adminq = nvme_rdma_ctrlr_create_qpair(&rctrlr->ctrlr, 0, + rctrlr->ctrlr.opts.admin_queue_size, 0, + rctrlr->ctrlr.opts.admin_queue_size, false); + if (!rctrlr->ctrlr.adminq) { + SPDK_ERRLOG("failed to create admin qpair\n"); + goto destruct_ctrlr; + } + + rc = nvme_transport_ctrlr_connect_qpair(&rctrlr->ctrlr, rctrlr->ctrlr.adminq); + if (rc < 0) { + SPDK_ERRLOG("failed to connect admin qpair\n"); + goto destruct_ctrlr; + } + + if (nvme_ctrlr_get_cap(&rctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + goto destruct_ctrlr; + } + + if (nvme_ctrlr_get_vs(&rctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + goto destruct_ctrlr; + } + + if (nvme_ctrlr_add_process(&rctrlr->ctrlr, 0) != 0) { + SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n"); + goto destruct_ctrlr; + } + + nvme_ctrlr_init_cap(&rctrlr->ctrlr, &cap, &vs); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "successfully initialized the nvmf ctrlr\n"); + return &rctrlr->ctrlr; + +destruct_ctrlr: + nvme_ctrlr_destruct(&rctrlr->ctrlr); + return NULL; +} + +static int +nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr); + struct nvme_rdma_cm_event_entry *entry; + + if (ctrlr->adminq) { + nvme_rdma_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq); + } + + STAILQ_FOREACH(entry, &rctrlr->pending_cm_events, link) { + rdma_ack_cm_event(entry->evt); + } + + STAILQ_INIT(&rctrlr->free_cm_events); + STAILQ_INIT(&rctrlr->pending_cm_events); + nvme_rdma_free(rctrlr->cm_events); + + if (rctrlr->cm_channel) { + rdma_destroy_event_channel(rctrlr->cm_channel); + rctrlr->cm_channel = NULL; + } + + nvme_ctrlr_destruct_finish(ctrlr); + + nvme_rdma_free(rctrlr); + + return 0; +} + +static int +nvme_rdma_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req) +{ + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_rdma_req *rdma_req; + struct ibv_send_wr *wr; + + rqpair = nvme_rdma_qpair(qpair); + assert(rqpair != NULL); + assert(req != NULL); + + rdma_req = nvme_rdma_req_get(rqpair); + if (!rdma_req) { + /* Inform the upper layer to try again later. 
*/ + return -EAGAIN; + } + + if (nvme_rdma_req_init(rqpair, req, rdma_req)) { + SPDK_ERRLOG("nvme_rdma_req_init() failed\n"); + TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link); + nvme_rdma_req_put(rqpair, rdma_req); + return -1; + } + + wr = &rdma_req->send_wr; + wr->next = NULL; + nvme_rdma_trace_ibv_sge(wr->sg_list); + return nvme_rdma_qpair_queue_send_wr(rqpair, wr); +} + +static int +nvme_rdma_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + /* Currently, doing nothing here */ + return 0; +} + +static void +nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct spdk_nvme_rdma_req *rdma_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + cpl.status.dnr = dnr; + + /* + * We cannot abort requests at the RDMA layer without + * unregistering them. If we do, we can still get error + * free completions on the shared completion queue. + */ + if (nvme_qpair_get_state(qpair) > NVME_QPAIR_DISCONNECTING && + nvme_qpair_get_state(qpair) != NVME_QPAIR_DESTROYING) { + nvme_ctrlr_disconnect_qpair(qpair); + } + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + nvme_rdma_req_complete(rdma_req, &cpl); + nvme_rdma_req_put(rqpair, rdma_req); + } +} + +static void +nvme_rdma_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct spdk_nvme_rdma_req *rdma_req, *tmp; + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. */ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + if (nvme_request_check_timeout(rdma_req->req, rdma_req->id, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. 
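
The timeout scan above leans on the fact that outstanding requests are kept in submission order, so the walk can stop at the first request that has not yet expired. A standalone sketch of that early-exit scan; pending_req and count_timed_out() are illustrative, with plain tick values standing in for spdk_get_ticks().

#include <stdint.h>
#include <stdio.h>

struct pending_req {
	uint64_t submit_tick;
};

/* Requests are in submission order, so deadlines are monotonically increasing;
 * once one entry has not expired, none of the later ones have either. */
static unsigned
count_timed_out(const struct pending_req *reqs, unsigned nreqs,
		uint64_t now, uint64_t timeout_ticks)
{
	unsigned expired = 0;

	for (unsigned i = 0; i < nreqs; i++) {
		if (now - reqs[i].submit_tick < timeout_ticks) {
			break;  /* in-order list: everything after this is younger */
		}
		expired++;
	}
	return expired;
}

int
main(void)
{
	struct pending_req reqs[] = { {100}, {150}, {400}, {900} };

	printf("expired=%u\n", count_timed_out(reqs, 4, 1000, 700)); /* prints 2 */
	return 0;
}
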
+ */ + break; + } + } +} + +static inline int +nvme_rdma_request_ready(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req) +{ + nvme_rdma_req_complete(rdma_req, &rqpair->rsps[rdma_req->rsp_idx].cpl); + nvme_rdma_req_put(rqpair, rdma_req); + return nvme_rdma_post_recv(rqpair, rdma_req->rsp_idx); +} + +#define MAX_COMPLETIONS_PER_POLL 128 + +static void +nvme_rdma_fail_qpair(struct spdk_nvme_qpair *qpair, int failure_reason) +{ + if (failure_reason == IBV_WC_RETRY_EXC_ERR) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE; + } else if (qpair->transport_failure_reason == SPDK_NVME_QPAIR_FAILURE_NONE) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; + } + + nvme_ctrlr_disconnect_qpair(qpair); +} + +static void +nvme_rdma_conditional_fail_qpair(struct nvme_rdma_qpair *rqpair, struct nvme_rdma_poll_group *group) +{ + struct nvme_rdma_destroyed_qpair *qpair_tracker; + + assert(rqpair); + if (group) { + STAILQ_FOREACH(qpair_tracker, &group->destroyed_qpairs, link) { + if (qpair_tracker->destroyed_qpair_tracker == rqpair) { + return; + } + } + } + nvme_rdma_fail_qpair(&rqpair->qpair, 0); +} + +static int +nvme_rdma_cq_process_completions(struct ibv_cq *cq, uint32_t batch_size, + struct nvme_rdma_poll_group *group, + struct nvme_rdma_qpair *rdma_qpair) +{ + struct ibv_wc wc[MAX_COMPLETIONS_PER_POLL]; + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_rdma_req *rdma_req; + struct spdk_nvme_rdma_rsp *rdma_rsp; + struct nvme_rdma_wr *rdma_wr; + uint32_t reaped = 0; + int completion_rc = 0; + int rc, i; + + rc = ibv_poll_cq(cq, batch_size, wc); + if (rc < 0) { + SPDK_ERRLOG("Error polling CQ! (%d): %s\n", + errno, spdk_strerror(errno)); + return -ECANCELED; + } else if (rc == 0) { + return 0; + } + + for (i = 0; i < rc; i++) { + rdma_wr = (struct nvme_rdma_wr *)wc[i].wr_id; + switch (rdma_wr->type) { + case RDMA_WR_TYPE_RECV: + rdma_rsp = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_rsp, rdma_wr); + rqpair = rdma_rsp->rqpair; + assert(rqpair->current_num_recvs > 0); + rqpair->current_num_recvs--; + + if (wc[i].status) { + SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n", + rqpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CQ recv completion\n"); + + if (wc[i].byte_len < sizeof(struct spdk_nvme_cpl)) { + SPDK_ERRLOG("recv length %u less than expected response size\n", wc[i].byte_len); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + rdma_req = &rqpair->rdma_reqs[rdma_rsp->cpl.cid]; + rdma_req->completion_flags |= NVME_RDMA_RECV_COMPLETED; + rdma_req->rsp_idx = rdma_rsp->idx; + + if ((rdma_req->completion_flags & NVME_RDMA_SEND_COMPLETED) != 0) { + if (spdk_unlikely(nvme_rdma_request_ready(rqpair, rdma_req))) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + reaped++; + rqpair->num_completions++; + } + break; + + case RDMA_WR_TYPE_SEND: + rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_req, rdma_wr); + + /* If we are flushing I/O */ + if (wc[i].status) { + rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL; + if (!rqpair) { + rqpair = rdma_qpair != NULL ? 
rdma_qpair : nvme_rdma_poll_group_get_qpair_by_id(group, + wc[i].qp_num); + } + assert(rqpair); + assert(rqpair->current_num_sends > 0); + rqpair->current_num_sends--; + nvme_rdma_conditional_fail_qpair(rqpair, group); + SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n", + rqpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); + completion_rc = -ENXIO; + continue; + } + + rqpair = nvme_rdma_qpair(rdma_req->req->qpair); + rdma_req->completion_flags |= NVME_RDMA_SEND_COMPLETED; + rqpair->current_num_sends--; + + if ((rdma_req->completion_flags & NVME_RDMA_RECV_COMPLETED) != 0) { + if (spdk_unlikely(nvme_rdma_request_ready(rqpair, rdma_req))) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + reaped++; + rqpair->num_completions++; + } + break; + + default: + SPDK_ERRLOG("Received an unexpected opcode on the CQ: %d\n", rdma_wr->type); + return -ECANCELED; + } + } + + if (completion_rc) { + return completion_rc; + } + + return reaped; +} + +static void +dummy_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) +{ + +} + +static int +nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair, + uint32_t max_completions) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + int rc = 0, batch_size; + struct ibv_cq *cq; + struct nvme_rdma_ctrlr *rctrlr; + + /* + * This is used during the connection phase. It's possible that we are still reaping error completions + * from other qpairs so we need to call the poll group function. Also, it's more correct since the cq + * is shared. + */ + if (qpair->poll_group != NULL) { + return spdk_nvme_poll_group_process_completions(qpair->poll_group->group, max_completions, + dummy_disconnected_qpair_cb); + } + + if (max_completions == 0) { + max_completions = rqpair->num_entries; + } else { + max_completions = spdk_min(max_completions, rqpair->num_entries); + } + + if (nvme_qpair_is_admin_queue(&rqpair->qpair)) { + rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr); + nvme_rdma_poll_events(rctrlr); + } + nvme_rdma_qpair_process_cm_event(rqpair); + + if (spdk_unlikely(qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE)) { + nvme_rdma_fail_qpair(qpair, 0); + return -ENXIO; + } + + cq = rqpair->cq; + + rqpair->num_completions = 0; + do { + batch_size = spdk_min((max_completions - rqpair->num_completions), MAX_COMPLETIONS_PER_POLL); + rc = nvme_rdma_cq_process_completions(cq, batch_size, NULL, rqpair); + + if (rc == 0) { + break; + /* Handle the case where we fail to poll the cq. */ + } else if (rc == -ECANCELED) { + nvme_rdma_fail_qpair(qpair, 0); + return -ENXIO; + } else if (rc == -ENXIO) { + return rc; + } + } while (rqpair->num_completions < max_completions); + + if (spdk_unlikely(nvme_rdma_qpair_submit_sends(rqpair) || + nvme_rdma_qpair_submit_recvs(rqpair))) { + nvme_rdma_fail_qpair(qpair, 0); + return -ENXIO; + } + + if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) { + nvme_rdma_qpair_check_timeout(qpair); + } + + return rqpair->num_completions; +} + +static uint32_t +nvme_rdma_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* max_mr_size by ibv_query_device indicates the largest value that we can + * set for a registered memory region. It is independent from the actual + * I/O size and is very likely to be larger than 2 MiB which is the + * granularity we currently register memory regions. 
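
The completion processing above only hands a request back to the upper layer once both the send work request for the command and the receive work request carrying the response have been seen, tracked through completion_flags. The gating logic in miniature; the flag values and tracked_req are illustrative rather than the transport's definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SEND_COMPLETED 0x1u
#define RECV_COMPLETED 0x2u

struct tracked_req {
	uint32_t completion_flags;
};

/* A request is finished only when both completions have arrived; whichever
 * one lands second is the one that triggers the upper-layer callback. */
static bool
on_completion(struct tracked_req *req, uint32_t flag)
{
	req->completion_flags |= flag;
	return (req->completion_flags & (SEND_COMPLETED | RECV_COMPLETED)) ==
	       (SEND_COMPLETED | RECV_COMPLETED);
}

int
main(void)
{
	struct tracked_req req = { 0 };

	printf("after recv: done=%d\n", on_completion(&req, RECV_COMPLETED)); /* 0 */
	printf("after send: done=%d\n", on_completion(&req, SEND_COMPLETED)); /* 1 */
	return 0;
}
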
Hence return + * UINT32_MAX here and let the generic layer use the controller data to + * moderate this value. + */ + return UINT32_MAX; +} + +static uint16_t +nvme_rdma_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr); + + return rctrlr->max_sge; +} + +static int +nvme_rdma_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct spdk_nvme_rdma_req *rdma_req, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + rc = iter_fn(rdma_req->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static void +nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_rdma_req *rdma_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + if (rdma_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + continue; + } + + nvme_rdma_req_complete(rdma_req, &cpl); + nvme_rdma_req_put(rqpair, rdma_req); + } +} + +static int +nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *ctx) +{ + struct nvme_rdma_poller *poller; + + poller = calloc(1, sizeof(*poller)); + if (poller == NULL) { + SPDK_ERRLOG("Unable to allocate poller.\n"); + return -ENOMEM; + } + + poller->device = ctx; + poller->cq = ibv_create_cq(poller->device, DEFAULT_NVME_RDMA_CQ_SIZE, group, NULL, 0); + + if (poller->cq == NULL) { + free(poller); + return -EINVAL; + } + + STAILQ_INSERT_HEAD(&group->pollers, poller, link); + group->num_pollers++; + poller->current_num_wc = DEFAULT_NVME_RDMA_CQ_SIZE; + poller->required_num_wc = 0; + return 0; +} + +static void +nvme_rdma_poll_group_free_pollers(struct nvme_rdma_poll_group *group) +{ + struct nvme_rdma_poller *poller, *tmp_poller; + + STAILQ_FOREACH_SAFE(poller, &group->pollers, link, tmp_poller) { + if (poller->cq) { + ibv_destroy_cq(poller->cq); + } + STAILQ_REMOVE(&group->pollers, poller, nvme_rdma_poller, link); + free(poller); + } +} + +static struct spdk_nvme_transport_poll_group * +nvme_rdma_poll_group_create(void) +{ + struct nvme_rdma_poll_group *group; + struct ibv_context **contexts; + int i = 0; + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + SPDK_ERRLOG("Unable to allocate poll group.\n"); + return NULL; + } + + STAILQ_INIT(&group->pollers); + + contexts = rdma_get_devices(NULL); + if (contexts == NULL) { + SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); + free(group); + return NULL; + } + + while (contexts[i] != NULL) { + if (nvme_rdma_poller_create(group, contexts[i])) { + nvme_rdma_poll_group_free_pollers(group); + free(group); + rdma_free_devices(contexts); + return NULL; + } + i++; + } + + rdma_free_devices(contexts); + STAILQ_INIT(&group->destroyed_qpairs); + return &group->group; +} + +struct nvme_rdma_qpair * +nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group, uint32_t qp_num) +{ + struct spdk_nvme_qpair *qpair; + struct nvme_rdma_destroyed_qpair *rqpair_tracker; + struct nvme_rdma_qpair *rqpair; + + STAILQ_FOREACH(qpair, &group->group.disconnected_qpairs, poll_group_stailq) { + rqpair = 
nvme_rdma_qpair(qpair); + if (rqpair->rdma_qp->qp->qp_num == qp_num) { + return rqpair; + } + } + + STAILQ_FOREACH(qpair, &group->group.connected_qpairs, poll_group_stailq) { + rqpair = nvme_rdma_qpair(qpair); + if (rqpair->rdma_qp->qp->qp_num == qp_num) { + return rqpair; + } + } + + STAILQ_FOREACH(rqpair_tracker, &group->destroyed_qpairs, link) { + rqpair = rqpair_tracker->destroyed_qpair_tracker; + if (rqpair->rdma_qp->qp->qp_num == qp_num) { + return rqpair; + } + } + + return NULL; +} + +static int +nvme_rdma_resize_cq(struct nvme_rdma_qpair *rqpair, struct nvme_rdma_poller *poller) +{ + int current_num_wc, required_num_wc; + + required_num_wc = poller->required_num_wc + WC_PER_QPAIR(rqpair->num_entries); + current_num_wc = poller->current_num_wc; + if (current_num_wc < required_num_wc) { + current_num_wc = spdk_max(current_num_wc * 2, required_num_wc); + } + + if (poller->current_num_wc != current_num_wc) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Resize RDMA CQ from %d to %d\n", poller->current_num_wc, + current_num_wc); + if (ibv_resize_cq(poller->cq, current_num_wc)) { + SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno)); + return -1; + } + + poller->current_num_wc = current_num_wc; + } + + poller->required_num_wc = required_num_wc; + return 0; +} + +static int +nvme_rdma_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct nvme_rdma_poll_group *group = nvme_rdma_poll_group(qpair->poll_group); + struct nvme_rdma_poller *poller; + + assert(rqpair->cq == NULL); + + STAILQ_FOREACH(poller, &group->pollers, link) { + if (poller->device == rqpair->cm_id->verbs) { + if (nvme_rdma_resize_cq(rqpair, poller)) { + return -EPROTO; + } + rqpair->cq = poller->cq; + break; + } + } + + if (rqpair->cq == NULL) { + SPDK_ERRLOG("Unable to find a cq for qpair %p on poll group %p\n", qpair, qpair->poll_group); + return -EINVAL; + } + + return 0; +} + +static int +nvme_rdma_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct nvme_rdma_poll_group *group; + struct nvme_rdma_destroyed_qpair *destroyed_qpair; + enum nvme_qpair_state state; + + if (rqpair->poll_group_disconnect_in_progress) { + return -EINPROGRESS; + } + + rqpair->poll_group_disconnect_in_progress = true; + state = nvme_qpair_get_state(qpair); + group = nvme_rdma_poll_group(qpair->poll_group); + rqpair->cq = NULL; + + /* + * We want to guard against an endless recursive loop while making + * sure the qpair is disconnected before we disconnect it from the qpair. + */ + if (state > NVME_QPAIR_DISCONNECTING && state != NVME_QPAIR_DESTROYING) { + nvme_ctrlr_disconnect_qpair(qpair); + } + + /* + * If this fails, the system is in serious trouble, + * just let the qpair get cleaned up immediately. 
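
nvme_rdma_resize_cq() above grows a shared completion queue lazily: each added qpair raises required_num_wc, and the capacity jumps to the larger of double the current size or the new requirement before ibv_resize_cq() is issued. The sizing rule by itself, as a hypothetical next_cq_size() helper.

#include <stdio.h>

/* Grow to max(current * 2, required) only when the requirement no longer fits,
 * which amortizes resize calls as qpairs are added to the poller. */
static int
next_cq_size(int current_num_wc, int required_num_wc)
{
	if (current_num_wc >= required_num_wc) {
		return current_num_wc;  /* still big enough, no resize needed */
	}
	return (current_num_wc * 2 > required_num_wc) ? current_num_wc * 2 : required_num_wc;
}

int
main(void)
{
	printf("%d\n", next_cq_size(1024, 512));   /* 1024: unchanged */
	printf("%d\n", next_cq_size(1024, 1536));  /* 2048: doubled */
	printf("%d\n", next_cq_size(1024, 5000));  /* 5000: jumps to the requirement */
	return 0;
}
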
+ */ + destroyed_qpair = calloc(1, sizeof(*destroyed_qpair)); + if (destroyed_qpair == NULL) { + return 0; + } + + destroyed_qpair->destroyed_qpair_tracker = rqpair; + destroyed_qpair->completed_cycles = 0; + STAILQ_INSERT_TAIL(&group->destroyed_qpairs, destroyed_qpair, link); + + rqpair->defer_deletion_to_pg = true; + + rqpair->poll_group_disconnect_in_progress = false; + return 0; +} + +static int +nvme_rdma_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_rdma_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + return nvme_poll_group_disconnect_qpair(qpair); + } + + return 0; +} + +static void +nvme_rdma_poll_group_delete_qpair(struct nvme_rdma_poll_group *group, + struct nvme_rdma_destroyed_qpair *qpair_tracker) +{ + struct nvme_rdma_qpair *rqpair = qpair_tracker->destroyed_qpair_tracker; + + rqpair->defer_deletion_to_pg = false; + if (nvme_qpair_get_state(&rqpair->qpair) == NVME_QPAIR_DESTROYING) { + nvme_rdma_ctrlr_delete_io_qpair(rqpair->qpair.ctrlr, &rqpair->qpair); + } + STAILQ_REMOVE(&group->destroyed_qpairs, qpair_tracker, nvme_rdma_destroyed_qpair, link); + free(qpair_tracker); +} + +static int64_t +nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_qpair *qpair, *tmp_qpair; + struct nvme_rdma_destroyed_qpair *qpair_tracker, *tmp_qpair_tracker; + struct nvme_rdma_qpair *rqpair; + struct nvme_rdma_poll_group *group; + struct nvme_rdma_poller *poller; + int num_qpairs = 0, batch_size, rc; + int64_t total_completions = 0; + uint64_t completions_allowed = 0; + uint64_t completions_per_poller = 0; + uint64_t poller_completions = 0; + + + if (completions_per_qpair == 0) { + completions_per_qpair = MAX_COMPLETIONS_PER_POLL; + } + + group = nvme_rdma_poll_group(tgroup); + STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + } + + STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) { + rqpair = nvme_rdma_qpair(qpair); + rqpair->num_completions = 0; + nvme_rdma_qpair_process_cm_event(rqpair); + + if (spdk_unlikely(qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE)) { + nvme_rdma_fail_qpair(qpair, 0); + disconnected_qpair_cb(qpair, tgroup->group->ctx); + continue; + } + num_qpairs++; + } + + completions_allowed = completions_per_qpair * num_qpairs; + completions_per_poller = spdk_max(completions_allowed / group->num_pollers, 1); + + STAILQ_FOREACH(poller, &group->pollers, link) { + poller_completions = 0; + do { + batch_size = spdk_min((completions_per_poller - poller_completions), MAX_COMPLETIONS_PER_POLL); + rc = nvme_rdma_cq_process_completions(poller->cq, batch_size, group, NULL); + if (rc <= 0) { + if (rc == -ECANCELED) { + return -EIO; + } + break; + } + + poller_completions += rc; + } while (poller_completions < completions_per_poller); + total_completions += poller_completions; + } + + STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) { + rqpair = nvme_rdma_qpair(qpair); + if (spdk_unlikely(qpair->ctrlr->timeout_enabled)) { + nvme_rdma_qpair_check_timeout(qpair); + } + + nvme_rdma_qpair_submit_sends(rqpair); + nvme_rdma_qpair_submit_recvs(rqpair); + 
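
The poll-group completion path above pools the caller's per-qpair allowance across all connected qpairs and then splits it evenly between the per-device pollers, with a floor of one so every poller makes some progress even when pollers outnumber qpairs. A sketch of that budget arithmetic; completions_per_poller() here is a standalone restatement, not the SPDK function.

#include <stdint.h>
#include <stdio.h>

static uint64_t
completions_per_poller(uint32_t completions_per_qpair, int num_qpairs, int num_pollers)
{
	/* Pool the per-qpair budget, divide it across pollers, floor at one. */
	uint64_t allowed = (uint64_t)completions_per_qpair * (uint64_t)num_qpairs;
	uint64_t per_poller = allowed / (uint64_t)num_pollers;

	return per_poller > 0 ? per_poller : 1;
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)completions_per_poller(128, 4, 2)); /* 256 */
	printf("%llu\n", (unsigned long long)completions_per_poller(1, 1, 8));   /* 1 (floor) */
	return 0;
}
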
nvme_qpair_resubmit_requests(&rqpair->qpair, rqpair->num_completions); + } + + /* + * Once a qpair is disconnected, we can still get flushed completions for those disconnected qpairs. + * For most pieces of hardware, those requests will complete immediately. However, there are certain + * cases where flushed requests will linger. Default is to destroy qpair after all completions are freed, + * but have a fallback for other cases where we don't get all of our completions back. + */ + STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) { + qpair_tracker->completed_cycles++; + rqpair = qpair_tracker->destroyed_qpair_tracker; + if ((rqpair->current_num_sends == 0 && rqpair->current_num_recvs == 0) || + qpair_tracker->completed_cycles > NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES) { + nvme_rdma_poll_group_delete_qpair(group, qpair_tracker); + } + } + + return total_completions; +} + +static int +nvme_rdma_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + struct nvme_rdma_poll_group *group = nvme_rdma_poll_group(tgroup); + struct nvme_rdma_destroyed_qpair *qpair_tracker, *tmp_qpair_tracker; + struct nvme_rdma_qpair *rqpair; + + if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { + return -EBUSY; + } + + STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) { + rqpair = qpair_tracker->destroyed_qpair_tracker; + if (nvme_qpair_get_state(&rqpair->qpair) == NVME_QPAIR_DESTROYING) { + rqpair->defer_deletion_to_pg = false; + nvme_rdma_ctrlr_delete_io_qpair(rqpair->qpair.ctrlr, &rqpair->qpair); + } + + STAILQ_REMOVE(&group->destroyed_qpairs, qpair_tracker, nvme_rdma_destroyed_qpair, link); + free(qpair_tracker); + } + + nvme_rdma_poll_group_free_pollers(group); + free(group); + + return 0; +} + +void +spdk_nvme_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) +{ + g_nvme_hooks = *hooks; +} + +const struct spdk_nvme_transport_ops rdma_ops = { + .name = "RDMA", + .type = SPDK_NVME_TRANSPORT_RDMA, + .ctrlr_construct = nvme_rdma_ctrlr_construct, + .ctrlr_scan = nvme_fabric_ctrlr_scan, + .ctrlr_destruct = nvme_rdma_ctrlr_destruct, + .ctrlr_enable = nvme_rdma_ctrlr_enable, + + .ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4, + .ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8, + .ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4, + .ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8, + + .ctrlr_get_max_xfer_size = nvme_rdma_ctrlr_get_max_xfer_size, + .ctrlr_get_max_sges = nvme_rdma_ctrlr_get_max_sges, + + .ctrlr_create_io_qpair = nvme_rdma_ctrlr_create_io_qpair, + .ctrlr_delete_io_qpair = nvme_rdma_ctrlr_delete_io_qpair, + .ctrlr_connect_qpair = nvme_rdma_ctrlr_connect_qpair, + .ctrlr_disconnect_qpair = nvme_rdma_ctrlr_disconnect_qpair, + + .qpair_abort_reqs = nvme_rdma_qpair_abort_reqs, + .qpair_reset = nvme_rdma_qpair_reset, + .qpair_submit_request = nvme_rdma_qpair_submit_request, + .qpair_process_completions = nvme_rdma_qpair_process_completions, + .qpair_iterate_requests = nvme_rdma_qpair_iterate_requests, + .admin_qpair_abort_aers = nvme_rdma_admin_qpair_abort_aers, + + .poll_group_create = nvme_rdma_poll_group_create, + .poll_group_connect_qpair = nvme_rdma_poll_group_connect_qpair, + .poll_group_disconnect_qpair = nvme_rdma_poll_group_disconnect_qpair, + .poll_group_add = nvme_rdma_poll_group_add, + .poll_group_remove = nvme_rdma_poll_group_remove, + .poll_group_process_completions = nvme_rdma_poll_group_process_completions, + .poll_group_destroy = nvme_rdma_poll_group_destroy, + 
+}; + +SPDK_NVME_TRANSPORT_REGISTER(rdma, &rdma_ops); diff --git a/src/spdk/lib/nvme/nvme_tcp.c b/src/spdk/lib/nvme/nvme_tcp.c new file mode 100644 index 000000000..98e8c6827 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_tcp.c @@ -0,0 +1,1973 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * NVMe/TCP transport + */ + +#include "nvme_internal.h" + +#include "spdk/endian.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/stdinc.h" +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/assert.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/trace.h" +#include "spdk/util.h" + +#include "spdk_internal/nvme_tcp.h" + +#define NVME_TCP_RW_BUFFER_SIZE 131072 +#define NVME_TCP_TIME_OUT_IN_SECONDS 2 + +#define NVME_TCP_HPDA_DEFAULT 0 +#define NVME_TCP_MAX_R2T_DEFAULT 1 +#define NVME_TCP_PDU_H2C_MIN_DATA_SIZE 4096 +#define NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE 8192 + +/* NVMe TCP transport extensions for spdk_nvme_ctrlr */ +struct nvme_tcp_ctrlr { + struct spdk_nvme_ctrlr ctrlr; +}; + +struct nvme_tcp_poll_group { + struct spdk_nvme_transport_poll_group group; + struct spdk_sock_group *sock_group; + uint32_t completions_per_qpair; + int64_t num_completions; +}; + +/* NVMe TCP qpair extensions for spdk_nvme_qpair */ +struct nvme_tcp_qpair { + struct spdk_nvme_qpair qpair; + struct spdk_sock *sock; + + TAILQ_HEAD(, nvme_tcp_req) free_reqs; + TAILQ_HEAD(, nvme_tcp_req) outstanding_reqs; + + TAILQ_HEAD(, nvme_tcp_pdu) send_queue; + struct nvme_tcp_pdu recv_pdu; + struct nvme_tcp_pdu send_pdu; /* only for error pdu and init pdu */ + struct nvme_tcp_pdu *send_pdus; /* Used by tcp_reqs */ + enum nvme_tcp_pdu_recv_state recv_state; + + struct nvme_tcp_req *tcp_reqs; + + uint16_t num_entries; + + bool host_hdgst_enable; + bool host_ddgst_enable; + + /** Specifies the maximum number of PDU-Data bytes per H2C Data Transfer PDU */ + uint32_t maxh2cdata; + + uint32_t maxr2t; + + /* 0 based value, which is used to guide the padding */ + uint8_t cpda; + + enum nvme_tcp_qpair_state state; +}; + +enum nvme_tcp_req_state { + NVME_TCP_REQ_FREE, + NVME_TCP_REQ_ACTIVE, + NVME_TCP_REQ_ACTIVE_R2T, +}; + +struct nvme_tcp_req { + struct nvme_request *req; + enum nvme_tcp_req_state state; + uint16_t cid; + uint16_t ttag; + uint32_t datao; + uint32_t r2tl_remain; + uint32_t active_r2ts; + bool in_capsule_data; + /* It is used to track whether the req can be safely freed */ + struct { + uint8_t send_ack : 1; + uint8_t data_recv : 1; + uint8_t r2t_recv : 1; + uint8_t reserved : 5; + } ordering; + struct nvme_tcp_pdu *send_pdu; + struct iovec iov[NVME_TCP_MAX_SGL_DESCRIPTORS]; + uint32_t iovcnt; + struct nvme_tcp_qpair *tqpair; + TAILQ_ENTRY(nvme_tcp_req) link; +}; + +static void nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req); + +static inline struct nvme_tcp_qpair * +nvme_tcp_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_TCP); + return SPDK_CONTAINEROF(qpair, struct nvme_tcp_qpair, qpair); +} + +static inline struct nvme_tcp_poll_group * +nvme_tcp_poll_group(struct spdk_nvme_transport_poll_group *group) +{ + return SPDK_CONTAINEROF(group, struct nvme_tcp_poll_group, group); +} + +static inline struct nvme_tcp_ctrlr * +nvme_tcp_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP); + return SPDK_CONTAINEROF(ctrlr, struct nvme_tcp_ctrlr, ctrlr); +} + +static struct nvme_tcp_req * +nvme_tcp_req_get(struct nvme_tcp_qpair *tqpair) +{ + struct nvme_tcp_req *tcp_req; + + tcp_req = TAILQ_FIRST(&tqpair->free_reqs); + if (!tcp_req) { + return NULL; + } + + assert(tcp_req->state == NVME_TCP_REQ_FREE); + tcp_req->state = NVME_TCP_REQ_ACTIVE; + TAILQ_REMOVE(&tqpair->free_reqs, tcp_req, link); + tcp_req->datao = 0; + tcp_req->req = NULL; + tcp_req->in_capsule_data = false; + 
tcp_req->r2tl_remain = 0; + tcp_req->active_r2ts = 0; + tcp_req->iovcnt = 0; + tcp_req->ordering.send_ack = 0; + tcp_req->ordering.data_recv = 0; + tcp_req->ordering.r2t_recv = 0; + memset(tcp_req->send_pdu, 0, sizeof(struct nvme_tcp_pdu)); + TAILQ_INSERT_TAIL(&tqpair->outstanding_reqs, tcp_req, link); + + return tcp_req; +} + +static void +nvme_tcp_req_put(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) +{ + assert(tcp_req->state != NVME_TCP_REQ_FREE); + tcp_req->state = NVME_TCP_REQ_FREE; + TAILQ_INSERT_HEAD(&tqpair->free_reqs, tcp_req, link); +} + +static int +nvme_tcp_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service) +{ + struct addrinfo *res; + struct addrinfo hints; + int ret; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + ret = getaddrinfo(addr, service, &hints, &res); + if (ret) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret); + return ret; + } + + if (res->ai_addrlen > sizeof(*sa)) { + SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen); + ret = EINVAL; + } else { + memcpy(sa, res->ai_addr, res->ai_addrlen); + } + + freeaddrinfo(res); + return ret; +} + +static void +nvme_tcp_free_reqs(struct nvme_tcp_qpair *tqpair) +{ + free(tqpair->tcp_reqs); + tqpair->tcp_reqs = NULL; + + spdk_free(tqpair->send_pdus); + tqpair->send_pdus = NULL; +} + +static int +nvme_tcp_alloc_reqs(struct nvme_tcp_qpair *tqpair) +{ + uint16_t i; + struct nvme_tcp_req *tcp_req; + + tqpair->tcp_reqs = calloc(tqpair->num_entries, sizeof(struct nvme_tcp_req)); + if (tqpair->tcp_reqs == NULL) { + SPDK_ERRLOG("Failed to allocate tcp_reqs on tqpair=%p\n", tqpair); + goto fail; + } + + tqpair->send_pdus = spdk_zmalloc(tqpair->num_entries * sizeof(struct nvme_tcp_pdu), + 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + + if (tqpair->send_pdus == NULL) { + SPDK_ERRLOG("Failed to allocate send_pdus on tqpair=%p\n", tqpair); + goto fail; + } + + TAILQ_INIT(&tqpair->send_queue); + TAILQ_INIT(&tqpair->free_reqs); + TAILQ_INIT(&tqpair->outstanding_reqs); + for (i = 0; i < tqpair->num_entries; i++) { + tcp_req = &tqpair->tcp_reqs[i]; + tcp_req->cid = i; + tcp_req->tqpair = tqpair; + tcp_req->send_pdu = &tqpair->send_pdus[i]; + TAILQ_INSERT_TAIL(&tqpair->free_reqs, tcp_req, link); + } + + return 0; +fail: + nvme_tcp_free_reqs(tqpair); + return -ENOMEM; +} + +static void +nvme_tcp_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct nvme_tcp_pdu *pdu; + + spdk_sock_close(&tqpair->sock); + + /* clear the send_queue */ + while (!TAILQ_EMPTY(&tqpair->send_queue)) { + pdu = TAILQ_FIRST(&tqpair->send_queue); + /* Remove the pdu from the send_queue to prevent the wrong sending out + * in the next round connection + */ + TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); + } +} + +static void nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); + +static int +nvme_tcp_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_qpair *tqpair; + + if (!qpair) { + return -1; + } + + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_tcp_qpair_abort_reqs(qpair, 1); + nvme_qpair_deinit(qpair); + tqpair = nvme_tcp_qpair(qpair); + nvme_tcp_free_reqs(tqpair); + free(tqpair); + + return 0; +} + +static int +nvme_tcp_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + return 0; +} + 
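
nvme_tcp_parse_addr() above resolves the target address with getaddrinfo() and copies the first result into a sockaddr_storage, rejecting results that would overflow it. An equivalent standalone version of that pattern; the address and port in the demo are arbitrary values, not configuration taken from this file.

#include <errno.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static int
parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service)
{
	struct addrinfo hints, *res = NULL;
	int rc;

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = family;
	hints.ai_socktype = SOCK_STREAM;

	rc = getaddrinfo(addr, service, &hints, &res);
	if (rc != 0) {
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc));
		return rc;
	}

	if (res->ai_addrlen > sizeof(*sa)) {
		rc = EINVAL;  /* result does not fit the destination buffer */
	} else {
		memcpy(sa, res->ai_addr, res->ai_addrlen);
	}

	freeaddrinfo(res);
	return rc;
}

int
main(void)
{
	struct sockaddr_storage sa;
	int rc = parse_addr(&sa, AF_INET, "127.0.0.1", "4420");

	if (rc == 0) {
		printf("resolved, family=%d\n", sa.ss_family);
	}
	return rc == 0 ? 0 : 1;
}
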
+static int +nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_tcp_ctrlr *tctrlr = nvme_tcp_ctrlr(ctrlr); + + if (ctrlr->adminq) { + nvme_tcp_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq); + } + + nvme_ctrlr_destruct_finish(ctrlr); + + free(tctrlr); + + return 0; +} + +static void +_pdu_write_done(void *cb_arg, int err) +{ + struct nvme_tcp_pdu *pdu = cb_arg; + struct nvme_tcp_qpair *tqpair = pdu->qpair; + + TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); + + if (err != 0) { + nvme_transport_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair); + return; + } + + assert(pdu->cb_fn != NULL); + pdu->cb_fn(pdu->cb_arg); +} + +static int +nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu, + nvme_tcp_qpair_xfer_complete_cb cb_fn, + void *cb_arg) +{ + int hlen; + uint32_t crc32c; + uint32_t mapped_length = 0; + + hlen = pdu->hdr.common.hlen; + + /* Header Digest */ + if (g_nvme_tcp_hdgst[pdu->hdr.common.pdu_type] && tqpair->host_hdgst_enable) { + crc32c = nvme_tcp_pdu_calc_header_digest(pdu); + MAKE_DIGEST_WORD((uint8_t *)pdu->hdr.raw + hlen, crc32c); + } + + /* Data Digest */ + if (pdu->data_len > 0 && g_nvme_tcp_ddgst[pdu->hdr.common.pdu_type] && tqpair->host_ddgst_enable) { + crc32c = nvme_tcp_pdu_calc_data_digest(pdu); + MAKE_DIGEST_WORD(pdu->data_digest, crc32c); + } + + pdu->cb_fn = cb_fn; + pdu->cb_arg = cb_arg; + + pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, NVME_TCP_MAX_SGL_DESCRIPTORS, pdu, + tqpair->host_hdgst_enable, tqpair->host_ddgst_enable, + &mapped_length); + pdu->qpair = tqpair; + pdu->sock_req.cb_fn = _pdu_write_done; + pdu->sock_req.cb_arg = pdu; + TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq); + spdk_sock_writev_async(tqpair->sock, &pdu->sock_req); + + return 0; +} + +/* + * Build SGL describing contiguous payload buffer. + */ +static int +nvme_tcp_build_contig_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) +{ + struct nvme_request *req = tcp_req->req; + + tcp_req->iov[0].iov_base = req->payload.contig_or_cb_arg + req->payload_offset; + tcp_req->iov[0].iov_len = req->payload_size; + tcp_req->iovcnt = 1; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + return 0; +} + +/* + * Build SGL describing scattered payload buffer. + */ +static int +nvme_tcp_build_sgl_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) +{ + int rc; + uint32_t length, remaining_size, iovcnt = 0, max_num_sgl; + struct nvme_request *req = tcp_req->req; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + max_num_sgl = spdk_min(req->qpair->ctrlr->max_sges, NVME_TCP_MAX_SGL_DESCRIPTORS); + remaining_size = req->payload_size; + + do { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &tcp_req->iov[iovcnt].iov_base, + &length); + if (rc) { + return -1; + } + + length = spdk_min(length, remaining_size); + tcp_req->iov[iovcnt].iov_len = length; + remaining_size -= length; + iovcnt++; + } while (remaining_size > 0 && iovcnt < max_num_sgl); + + + /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. 
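
The TCP SGL builder above turns the request's scatter list into an iovec array, trimming the final element to the payload size and failing if the payload cannot be described within the descriptor budget. The same shape in self-contained form; build_iovs() and the MAX_IOVS cap are illustrative, not the transport's nvme_tcp_build_iovs().

#include <stdio.h>
#include <sys/uio.h>

#define MAX_IOVS 16  /* assumed cap, analogous to the transport's SGL descriptor limit */

/* Fill an iovec array from caller-provided buffers, trimming the last element
 * so the total never exceeds payload_size, and fail if the payload cannot be
 * described within the iovec budget. */
static int
build_iovs(struct iovec *iov, int max_iovs,
	   char **bufs, const size_t *lens, int nbufs,
	   size_t payload_size, int *iovcnt_out)
{
	size_t remaining = payload_size;
	int i = 0, iovcnt = 0;

	while (remaining > 0 && iovcnt < max_iovs && i < nbufs) {
		size_t len = lens[i] < remaining ? lens[i] : remaining;

		iov[iovcnt].iov_base = bufs[i];
		iov[iovcnt].iov_len = len;
		remaining -= len;
		iovcnt++;
		i++;
	}

	if (remaining > 0) {
		return -1;  /* ran out of iovec slots or source buffers */
	}

	*iovcnt_out = iovcnt;
	return 0;
}

int
main(void)
{
	static char a[4096], b[4096];
	char *bufs[] = { a, b };
	size_t lens[] = { sizeof(a), sizeof(b) };
	struct iovec iov[MAX_IOVS];
	int iovcnt = 0;

	if (build_iovs(iov, MAX_IOVS, bufs, lens, 2, 6000, &iovcnt) == 0) {
		printf("iovcnt=%d last_len=%zu\n", iovcnt, iov[iovcnt - 1].iov_len); /* 2, 1904 */
	}
	return 0;
}
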
*/ + if (remaining_size > 0) { + SPDK_ERRLOG("Failed to construct tcp_req=%p, and the iovcnt=%u, remaining_size=%u\n", + tcp_req, iovcnt, remaining_size); + return -1; + } + + tcp_req->iovcnt = iovcnt; + + return 0; +} + +static int +nvme_tcp_req_init(struct nvme_tcp_qpair *tqpair, struct nvme_request *req, + struct nvme_tcp_req *tcp_req) +{ + struct spdk_nvme_ctrlr *ctrlr = tqpair->qpair.ctrlr; + int rc = 0; + enum spdk_nvme_data_transfer xfer; + uint32_t max_incapsule_data_size; + + tcp_req->req = req; + req->cmd.cid = tcp_req->cid; + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_TRANSPORT; + req->cmd.dptr.sgl1.unkeyed.length = req->payload_size; + + if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { + rc = nvme_tcp_build_contig_request(tqpair, tcp_req); + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) { + rc = nvme_tcp_build_sgl_request(tqpair, tcp_req); + } else { + rc = -1; + } + + if (rc) { + return rc; + } + + if (req->cmd.opc == SPDK_NVME_OPC_FABRIC) { + struct spdk_nvmf_capsule_cmd *nvmf_cmd = (struct spdk_nvmf_capsule_cmd *)&req->cmd; + + xfer = spdk_nvme_opc_get_data_transfer(nvmf_cmd->fctype); + } else { + xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc); + } + if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + max_incapsule_data_size = ctrlr->ioccsz_bytes; + if ((req->cmd.opc == SPDK_NVME_OPC_FABRIC) || nvme_qpair_is_admin_queue(&tqpair->qpair)) { + max_incapsule_data_size = spdk_min(max_incapsule_data_size, NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE); + } + + if (req->payload_size <= max_incapsule_data_size) { + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.address = 0; + tcp_req->in_capsule_data = true; + } + } + + return 0; +} + +static inline void +nvme_tcp_req_put_safe(struct nvme_tcp_req *tcp_req) +{ + if (tcp_req->ordering.send_ack && tcp_req->ordering.data_recv) { + assert(tcp_req->state == NVME_TCP_REQ_ACTIVE); + assert(tcp_req->tqpair != NULL); + nvme_tcp_req_put(tcp_req->tqpair, tcp_req); + } +} + +static void +nvme_tcp_qpair_cmd_send_complete(void *cb_arg) +{ + struct nvme_tcp_req *tcp_req = cb_arg; + + tcp_req->ordering.send_ack = 1; + /* Handle the r2t case */ + if (spdk_unlikely(tcp_req->ordering.r2t_recv)) { + nvme_tcp_send_h2c_data(tcp_req); + } else { + nvme_tcp_req_put_safe(tcp_req); + } +} + +static int +nvme_tcp_qpair_capsule_cmd_send(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_req *tcp_req) +{ + struct nvme_tcp_pdu *pdu; + struct spdk_nvme_tcp_cmd *capsule_cmd; + uint32_t plen = 0, alignment; + uint8_t pdo; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + pdu = tcp_req->send_pdu; + + capsule_cmd = &pdu->hdr.capsule_cmd; + capsule_cmd->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD; + plen = capsule_cmd->common.hlen = sizeof(*capsule_cmd); + capsule_cmd->ccsqe = tcp_req->req->cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "capsule_cmd cid=%u on tqpair(%p)\n", tcp_req->req->cmd.cid, tqpair); + + if (tqpair->host_hdgst_enable) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Header digest is enabled for capsule command on tcp_req=%p\n", + tcp_req); + capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + if ((tcp_req->req->payload_size == 0) || !tcp_req->in_capsule_data) { + goto end; + } + + pdo = plen; + pdu->padding_len = 0; + if 
(tqpair->cpda) { + alignment = (tqpair->cpda + 1) << 2; + if (alignment > plen) { + pdu->padding_len = alignment - plen; + pdo = alignment; + plen = alignment; + } + } + + capsule_cmd->common.pdo = pdo; + plen += tcp_req->req->payload_size; + if (tqpair->host_ddgst_enable) { + capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + tcp_req->datao = 0; + nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt, + 0, tcp_req->req->payload_size); +end: + capsule_cmd->common.plen = plen; + return nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_qpair_cmd_send_complete, tcp_req); + +} + +static int +nvme_tcp_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req) +{ + struct nvme_tcp_qpair *tqpair; + struct nvme_tcp_req *tcp_req; + + tqpair = nvme_tcp_qpair(qpair); + assert(tqpair != NULL); + assert(req != NULL); + + tcp_req = nvme_tcp_req_get(tqpair); + if (!tcp_req) { + /* Inform the upper layer to try again later. */ + return -EAGAIN; + } + + if (nvme_tcp_req_init(tqpair, req, tcp_req)) { + SPDK_ERRLOG("nvme_tcp_req_init() failed\n"); + TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link); + nvme_tcp_req_put(tqpair, tcp_req); + return -1; + } + + return nvme_tcp_qpair_capsule_cmd_send(tqpair, tcp_req); +} + +static int +nvme_tcp_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static void +nvme_tcp_req_complete(struct nvme_tcp_req *tcp_req, + struct spdk_nvme_cpl *rsp) +{ + struct nvme_request *req; + + assert(tcp_req->req != NULL); + req = tcp_req->req; + + TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link); + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp); + nvme_free_request(req); +} + +static void +nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_tcp_req *tcp_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + cpl.status.dnr = dnr; + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + nvme_tcp_req_complete(tcp_req, &cpl); + nvme_tcp_req_put(tqpair, tcp_req); + } +} + +static void +nvme_tcp_qpair_set_recv_state(struct nvme_tcp_qpair *tqpair, + enum nvme_tcp_pdu_recv_state state) +{ + if (tqpair->recv_state == state) { + SPDK_ERRLOG("The recv state of tqpair=%p is same with the state(%d) to be set\n", + tqpair, state); + return; + } + + tqpair->recv_state = state; + switch (state) { + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: + case NVME_TCP_PDU_RECV_STATE_ERROR: + memset(&tqpair->recv_pdu, 0, sizeof(struct nvme_tcp_pdu)); + break; + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + default: + break; + } +} + +static void +nvme_tcp_qpair_send_h2c_term_req_complete(void *cb_arg) +{ + struct nvme_tcp_qpair *tqpair = cb_arg; + + tqpair->state = NVME_TCP_QPAIR_STATE_EXITING; +} + +static void +nvme_tcp_qpair_send_h2c_term_req(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, + enum spdk_nvme_tcp_term_req_fes fes, uint32_t error_offset) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_term_req_hdr *h2c_term_req; + uint32_t h2c_term_req_hdr_len = sizeof(*h2c_term_req); + uint8_t copy_len; + + rsp_pdu = &tqpair->send_pdu; + memset(rsp_pdu, 0, sizeof(*rsp_pdu)); + h2c_term_req = &rsp_pdu->hdr.term_req; + h2c_term_req->common.pdu_type = 
SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ; + h2c_term_req->common.hlen = h2c_term_req_hdr_len; + + if ((fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || + (fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { + DSET32(&h2c_term_req->fei, error_offset); + } + + copy_len = pdu->hdr.common.hlen; + if (copy_len > SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE) { + copy_len = SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE; + } + + /* Copy the error info into the buffer */ + memcpy((uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, pdu->hdr.raw, copy_len); + nvme_tcp_pdu_set_data(rsp_pdu, (uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, copy_len); + + /* Contain the header len of the wrong received pdu */ + h2c_term_req->common.plen = h2c_term_req->common.hlen + copy_len; + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_send_h2c_term_req_complete, NULL); + +} + +static void +nvme_tcp_pdu_ch_handle(struct nvme_tcp_qpair *tqpair) +{ + struct nvme_tcp_pdu *pdu; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + uint32_t expected_hlen, hd_len = 0; + bool plen_error = false; + + pdu = &tqpair->recv_pdu; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "pdu type = %d\n", pdu->hdr.common.pdu_type); + if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_RESP) { + if (tqpair->state != NVME_TCP_QPAIR_STATE_INVALID) { + SPDK_ERRLOG("Already received IC_RESP PDU, and we should reject this pdu=%p\n", pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; + goto err; + } + expected_hlen = sizeof(struct spdk_nvme_tcp_ic_resp); + if (pdu->hdr.common.plen != expected_hlen) { + plen_error = true; + } + } else { + if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) { + SPDK_ERRLOG("The TCP/IP tqpair connection is not negotitated\n"); + fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; + goto err; + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP: + expected_hlen = sizeof(struct spdk_nvme_tcp_rsp); + if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) { + hd_len = SPDK_NVME_TCP_DIGEST_LEN; + } + + if (pdu->hdr.common.plen != (expected_hlen + hd_len)) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: + expected_hlen = sizeof(struct spdk_nvme_tcp_c2h_data_hdr); + if (pdu->hdr.common.plen < pdu->hdr.common.pdo) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + expected_hlen = sizeof(struct spdk_nvme_tcp_term_req_hdr); + if ((pdu->hdr.common.plen <= expected_hlen) || + (pdu->hdr.common.plen > SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE)) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_R2T: + expected_hlen = sizeof(struct spdk_nvme_tcp_r2t_hdr); + if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) { + hd_len = SPDK_NVME_TCP_DIGEST_LEN; + } + + if (pdu->hdr.common.plen != (expected_hlen + hd_len)) { + plen_error = true; + } + break; + + default: + SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu.hdr.common.pdu_type); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdu_type); + goto err; + } + } + + if (pdu->hdr.common.hlen != expected_hlen) { + SPDK_ERRLOG("Expected PDU header length %u, got %u\n", + expected_hlen, pdu->hdr.common.hlen); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, hlen); + goto err; + + } else if (plen_error) { + 
fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen); + goto err; + } else { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); + nvme_tcp_pdu_calc_psh_len(&tqpair->recv_pdu, tqpair->host_hdgst_enable); + return; + } +err: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); +} + +static struct nvme_tcp_req * +get_nvme_active_req_by_cid(struct nvme_tcp_qpair *tqpair, uint32_t cid) +{ + assert(tqpair != NULL); + if ((cid >= tqpair->num_entries) || (tqpair->tcp_reqs[cid].state == NVME_TCP_REQ_FREE)) { + return NULL; + } + + return &tqpair->tcp_reqs[cid]; +} + +static void +nvme_tcp_c2h_data_payload_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu, uint32_t *reaped) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_c2h_data_hdr *c2h_data; + struct spdk_nvme_cpl cpl = {}; + uint8_t flags; + + tcp_req = pdu->req; + assert(tcp_req != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + c2h_data = &pdu->hdr.c2h_data; + tcp_req->datao += pdu->data_len; + flags = c2h_data->common.flags; + + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + if (flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS) { + if (tcp_req->datao == tcp_req->req->payload_size) { + cpl.status.p = 0; + } else { + cpl.status.p = 1; + } + + cpl.cid = tcp_req->cid; + cpl.sqid = tqpair->qpair.id; + nvme_tcp_req_complete(tcp_req, &cpl); + if (tcp_req->ordering.send_ack) { + (*reaped)++; + } + + tcp_req->ordering.data_recv = 1; + nvme_tcp_req_put_safe(tcp_req); + } +} + +static const char *spdk_nvme_tcp_term_req_fes_str[] = { + "Invalid PDU Header Field", + "PDU Sequence Error", + "Header Digest Error", + "Data Transfer Out of Range", + "Data Transfer Limit Exceeded", + "Unsupported parameter", +}; + +static void +nvme_tcp_c2h_term_req_dump(struct spdk_nvme_tcp_term_req_hdr *c2h_term_req) +{ + SPDK_ERRLOG("Error info of pdu(%p): %s\n", c2h_term_req, + spdk_nvme_tcp_term_req_fes_str[c2h_term_req->fes]); + if ((c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || + (c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "The offset from the start of the PDU header is %u\n", + DGET32(c2h_term_req->fei)); + } + /* we may also need to dump some other info here */ +} + +static void +nvme_tcp_c2h_term_req_payload_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + nvme_tcp_c2h_term_req_dump(&pdu->hdr.term_req); + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); +} + +static void +nvme_tcp_pdu_payload_handle(struct nvme_tcp_qpair *tqpair, + uint32_t *reaped) +{ + int rc = 0; + struct nvme_tcp_pdu *pdu; + uint32_t crc32c, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + pdu = &tqpair->recv_pdu; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + + /* check data digest if need */ + if (pdu->ddgst_enable) { + crc32c = nvme_tcp_pdu_calc_data_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; + } + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: + nvme_tcp_c2h_data_payload_handle(tqpair, pdu, reaped); + break; + + case 
SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + nvme_tcp_c2h_term_req_payload_handle(tqpair, pdu); + break; + + default: + /* The code should not go to here */ + SPDK_ERRLOG("The code should not go to here\n"); + break; + } +} + +static void +nvme_tcp_send_icreq_complete(void *cb_arg) +{ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Complete the icreq send for tqpair=%p\n", + (struct nvme_tcp_qpair *)cb_arg); +} + +static void +nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_ic_resp *ic_resp = &pdu->hdr.ic_resp; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + int recv_buf_size; + + /* Only PFV 0 is defined currently */ + if (ic_resp->pfv != 0) { + SPDK_ERRLOG("Expected ICResp PFV %u, got %u\n", 0u, ic_resp->pfv); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, pfv); + goto end; + } + + if (ic_resp->maxh2cdata < NVME_TCP_PDU_H2C_MIN_DATA_SIZE) { + SPDK_ERRLOG("Expected ICResp maxh2cdata >=%u, got %u\n", NVME_TCP_PDU_H2C_MIN_DATA_SIZE, + ic_resp->maxh2cdata); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, maxh2cdata); + goto end; + } + tqpair->maxh2cdata = ic_resp->maxh2cdata; + + if (ic_resp->cpda > SPDK_NVME_TCP_CPDA_MAX) { + SPDK_ERRLOG("Expected ICResp cpda <=%u, got %u\n", SPDK_NVME_TCP_CPDA_MAX, ic_resp->cpda); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, cpda); + goto end; + } + tqpair->cpda = ic_resp->cpda; + + tqpair->host_hdgst_enable = ic_resp->dgst.bits.hdgst_enable ? true : false; + tqpair->host_ddgst_enable = ic_resp->dgst.bits.ddgst_enable ? true : false; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_hdgst_enable: %u\n", tqpair->host_hdgst_enable); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_ddgst_enable: %u\n", tqpair->host_ddgst_enable); + + /* Now that we know whether digests are enabled, properly size the receive buffer to + * handle several incoming 4K read commands according to SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR + * parameter. */ + recv_buf_size = 0x1000 + sizeof(struct spdk_nvme_tcp_c2h_data_hdr); + + if (tqpair->host_hdgst_enable) { + recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; + } + + if (tqpair->host_ddgst_enable) { + recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; + } + + if (spdk_sock_set_recvbuf(tqpair->sock, recv_buf_size * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR) < 0) { + SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n", + tqpair, + recv_buf_size); + /* Not fatal. 
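+		 * The socket keeps whatever receive buffer size it already had; throughput may
+		 * drop slightly, but the connection remains functional.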
*/ + } + + tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING; + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + return; +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_capsule_resp_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, + uint32_t *reaped) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_rsp *capsule_resp = &pdu->hdr.capsule_resp; + uint32_t cid, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + struct spdk_nvme_cpl cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + cpl = capsule_resp->rccqe; + cid = cpl.cid; + + /* Recv the pdu again */ + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + + tcp_req = get_nvme_active_req_by_cid(tqpair, cid); + if (!tcp_req) { + SPDK_ERRLOG("no tcp_req is found with cid=%u for tqpair=%p\n", cid, tqpair); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_rsp, rccqe); + goto end; + + } + + nvme_tcp_req_complete(tcp_req, &cpl); + if (tcp_req->ordering.send_ack) { + (*reaped)++; + } + + tcp_req->ordering.data_recv = 1; + nvme_tcp_req_put_safe(tcp_req); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "complete tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair); + + return; + +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_c2h_term_req_hdr_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_term_req_hdr *c2h_term_req = &pdu->hdr.term_req; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + if (c2h_term_req->fes > SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER) { + SPDK_ERRLOG("Fatal Error Stauts(FES) is unknown for c2h_term_req pdu=%p\n", pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_term_req_hdr, fes); + goto end; + } + + /* set the data buffer */ + nvme_tcp_pdu_set_data(pdu, (uint8_t *)pdu->hdr.raw + c2h_term_req->common.hlen, + c2h_term_req->common.plen - c2h_term_req->common.hlen); + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + return; +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_c2h_data_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_c2h_data_hdr *c2h_data = &pdu->hdr.c2h_data; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "c2h_data info on tqpair(%p): datao=%u, datal=%u, cccid=%d\n", + tqpair, c2h_data->datao, c2h_data->datal, c2h_data->cccid); + tcp_req = get_nvme_active_req_by_cid(tqpair, c2h_data->cccid); + if (!tcp_req) { + SPDK_ERRLOG("no tcp_req found for c2hdata cid=%d\n", c2h_data->cccid); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, cccid); + goto end; + + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "tcp_req(%p) on tqpair(%p): datao=%u, payload_size=%u\n", + tcp_req, tqpair, tcp_req->datao, tcp_req->req->payload_size); + + if (c2h_data->datal > tcp_req->req->payload_size) { + SPDK_ERRLOG("Invalid datal for tcp_req(%p), datal(%u) exceeds payload_size(%u)\n", + tcp_req, c2h_data->datal, tcp_req->req->payload_size); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + goto end; + } + + if 
(tcp_req->datao != c2h_data->datao) { + SPDK_ERRLOG("Invalid datao for tcp_req(%p), received datal(%u) != datao(%u) in tcp_req\n", + tcp_req, c2h_data->datao, tcp_req->datao); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datao); + goto end; + } + + if ((c2h_data->datao + c2h_data->datal) > tcp_req->req->payload_size) { + SPDK_ERRLOG("Invalid data range for tcp_req(%p), received (datao(%u) + datal(%u)) > datao(%u) in tcp_req\n", + tcp_req, c2h_data->datao, c2h_data->datal, tcp_req->req->payload_size); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datal); + goto end; + + } + + nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt, + c2h_data->datao, c2h_data->datal); + pdu->req = tcp_req; + + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + return; + +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_qpair_h2c_data_send_complete(void *cb_arg) +{ + struct nvme_tcp_req *tcp_req = cb_arg; + + assert(tcp_req != NULL); + + tcp_req->ordering.send_ack = 1; + if (tcp_req->r2tl_remain) { + nvme_tcp_send_h2c_data(tcp_req); + } else { + assert(tcp_req->active_r2ts > 0); + tcp_req->active_r2ts--; + tcp_req->state = NVME_TCP_REQ_ACTIVE; + /* Need also call this function to free the resource */ + nvme_tcp_req_put_safe(tcp_req); + } +} + +static void +nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(tcp_req->req->qpair); + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_h2c_data_hdr *h2c_data; + uint32_t plen, pdo, alignment; + + /* Reinit the send_ack and r2t_recv bits */ + tcp_req->ordering.send_ack = 0; + tcp_req->ordering.r2t_recv = 0; + rsp_pdu = tcp_req->send_pdu; + memset(rsp_pdu, 0, sizeof(*rsp_pdu)); + h2c_data = &rsp_pdu->hdr.h2c_data; + + h2c_data->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_DATA; + plen = h2c_data->common.hlen = sizeof(*h2c_data); + h2c_data->cccid = tcp_req->cid; + h2c_data->ttag = tcp_req->ttag; + h2c_data->datao = tcp_req->datao; + + h2c_data->datal = spdk_min(tcp_req->r2tl_remain, tqpair->maxh2cdata); + nvme_tcp_pdu_set_data_buf(rsp_pdu, tcp_req->iov, tcp_req->iovcnt, + h2c_data->datao, h2c_data->datal); + tcp_req->r2tl_remain -= h2c_data->datal; + + if (tqpair->host_hdgst_enable) { + h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + rsp_pdu->padding_len = 0; + pdo = plen; + if (tqpair->cpda) { + alignment = (tqpair->cpda + 1) << 2; + if (alignment > plen) { + rsp_pdu->padding_len = alignment - plen; + pdo = plen = alignment; + } + } + + h2c_data->common.pdo = pdo; + plen += h2c_data->datal; + if (tqpair->host_ddgst_enable) { + h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + h2c_data->common.plen = plen; + tcp_req->datao += h2c_data->datal; + if (!tcp_req->r2tl_remain) { + h2c_data->common.flags |= SPDK_NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "h2c_data info: datao=%u, datal=%u, pdu_len=%u for tqpair=%p\n", + h2c_data->datao, h2c_data->datal, h2c_data->common.plen, tqpair); + + nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_h2c_data_send_complete, tcp_req); +} + +static void +nvme_tcp_r2t_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_r2t_hdr 
*r2t = &pdu->hdr.r2t; + uint32_t cid, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + cid = r2t->cccid; + tcp_req = get_nvme_active_req_by_cid(tqpair, cid); + if (!tcp_req) { + SPDK_ERRLOG("Cannot find tcp_req for tqpair=%p\n", tqpair); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, cccid); + goto end; + } + + tcp_req->ordering.r2t_recv = 1; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "r2t info: r2to=%u, r2tl=%u for tqpair=%p\n", r2t->r2to, r2t->r2tl, + tqpair); + + if (tcp_req->state == NVME_TCP_REQ_ACTIVE) { + assert(tcp_req->active_r2ts == 0); + tcp_req->state = NVME_TCP_REQ_ACTIVE_R2T; + } + + tcp_req->active_r2ts++; + if (tcp_req->active_r2ts > tqpair->maxr2t) { + fes = SPDK_NVME_TCP_TERM_REQ_FES_R2T_LIMIT_EXCEEDED; + SPDK_ERRLOG("Invalid R2T: it exceeds the R2T maixmal=%u for tqpair=%p\n", tqpair->maxr2t, tqpair); + goto end; + } + + if (tcp_req->datao != r2t->r2to) { + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2to); + goto end; + + } + + if ((r2t->r2tl + r2t->r2to) > tcp_req->req->payload_size) { + SPDK_ERRLOG("Invalid R2T info for tcp_req=%p: (r2to(%u) + r2tl(%u)) exceeds payload_size(%u)\n", + tcp_req, r2t->r2to, r2t->r2tl, tqpair->maxh2cdata); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2tl); + goto end; + + } + + tcp_req->ttag = r2t->ttag; + tcp_req->r2tl_remain = r2t->r2tl; + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + + if (spdk_likely(tcp_req->ordering.send_ack)) { + nvme_tcp_send_h2c_data(tcp_req); + } + return; + +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; + +} + +static void +nvme_tcp_pdu_psh_handle(struct nvme_tcp_qpair *tqpair, uint32_t *reaped) +{ + struct nvme_tcp_pdu *pdu; + int rc; + uint32_t crc32c, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); + pdu = &tqpair->recv_pdu; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter: pdu type =%u\n", pdu->hdr.common.pdu_type); + /* check header digest if needed */ + if (pdu->has_hdgst) { + crc32c = nvme_tcp_pdu_calc_header_digest(pdu); + rc = MATCH_DIGEST_WORD((uint8_t *)pdu->hdr.raw + pdu->hdr.common.hlen, crc32c); + if (rc == 0) { + SPDK_ERRLOG("header digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; + + } + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_IC_RESP: + nvme_tcp_icresp_handle(tqpair, pdu); + break; + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP: + nvme_tcp_capsule_resp_hdr_handle(tqpair, pdu, reaped); + break; + case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: + nvme_tcp_c2h_data_hdr_handle(tqpair, pdu); + break; + + case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + nvme_tcp_c2h_term_req_hdr_handle(tqpair, pdu); + break; + case SPDK_NVME_TCP_PDU_TYPE_R2T: + nvme_tcp_r2t_hdr_handle(tqpair, pdu); + break; + + default: + SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu.hdr.common.pdu_type); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = 1; + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + break; + } + +} + +static int +nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped) +{ + int rc = 0; + struct nvme_tcp_pdu 
*pdu; + uint32_t data_len; + enum nvme_tcp_pdu_recv_state prev_state; + + /* The loop here is to allow for several back-to-back state changes. */ + do { + prev_state = tqpair->recv_state; + switch (tqpair->recv_state) { + /* If in a new state */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH); + break; + /* common header */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: + pdu = &tqpair->recv_pdu; + if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) { + rc = nvme_tcp_read_data(tqpair->sock, + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes, + (uint8_t *)&pdu->hdr.common + pdu->ch_valid_bytes); + if (rc < 0) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + break; + } + pdu->ch_valid_bytes += rc; + if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) { + return NVME_TCP_PDU_IN_PROGRESS; + } + } + + /* The command header of this PDU has now been read from the socket. */ + nvme_tcp_pdu_ch_handle(tqpair); + break; + /* Wait for the pdu specific header */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: + pdu = &tqpair->recv_pdu; + rc = nvme_tcp_read_data(tqpair->sock, + pdu->psh_len - pdu->psh_valid_bytes, + (uint8_t *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes); + if (rc < 0) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + break; + } + + pdu->psh_valid_bytes += rc; + if (pdu->psh_valid_bytes < pdu->psh_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + /* All header(ch, psh, head digist) of this PDU has now been read from the socket. */ + nvme_tcp_pdu_psh_handle(tqpair, reaped); + break; + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + pdu = &tqpair->recv_pdu; + /* check whether the data is valid, if not we just return */ + if (!pdu->data_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + data_len = pdu->data_len; + /* data digest */ + if (spdk_unlikely((pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_C2H_DATA) && + tqpair->host_ddgst_enable)) { + data_len += SPDK_NVME_TCP_DIGEST_LEN; + pdu->ddgst_enable = true; + } + + rc = nvme_tcp_read_payload_data(tqpair->sock, pdu); + if (rc < 0) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + break; + } + + pdu->readv_offset += rc; + if (pdu->readv_offset < data_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + assert(pdu->readv_offset == data_len); + /* All of this PDU has now been read from the socket. */ + nvme_tcp_pdu_payload_handle(tqpair, reaped); + break; + case NVME_TCP_PDU_RECV_STATE_ERROR: + rc = NVME_TCP_PDU_FATAL; + break; + default: + assert(0); + break; + } + } while (prev_state != tqpair->recv_state); + + return rc; +} + +static void +nvme_tcp_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct nvme_tcp_req *tcp_req, *tmp; + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. 
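+	 * Without a registered callback there is nobody to notify, so walking the
+	 * outstanding request list would be wasted work. Applications opt in to this
+	 * path by registering a handler with spdk_nvme_ctrlr_register_timeout_callback().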
*/ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + assert(tcp_req->req != NULL); + + if (nvme_request_check_timeout(tcp_req->req, tcp_req->cid, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. + */ + break; + } + } +} + +static int +nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + uint32_t reaped; + int rc; + + rc = spdk_sock_flush(tqpair->sock); + if (rc < 0) { + return rc; + } + + if (max_completions == 0) { + max_completions = tqpair->num_entries; + } else { + max_completions = spdk_min(max_completions, tqpair->num_entries); + } + + reaped = 0; + do { + rc = nvme_tcp_read_pdu(tqpair, &reaped); + if (rc < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Error polling CQ! (%d): %s\n", + errno, spdk_strerror(errno)); + goto fail; + } else if (rc == 0) { + /* Partial PDU is read */ + break; + } + + } while (reaped < max_completions); + + if (spdk_unlikely(tqpair->qpair.ctrlr->timeout_enabled)) { + nvme_tcp_qpair_check_timeout(qpair); + } + + return reaped; +fail: + + /* + * Since admin queues take the ctrlr_lock before entering this function, + * we can call nvme_transport_ctrlr_disconnect_qpair. For other qpairs we need + * to call the generic function which will take the lock for us. + */ + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; + + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_transport_ctrlr_disconnect_qpair(qpair->ctrlr, qpair); + } else { + nvme_ctrlr_disconnect_qpair(qpair); + } + return -ENXIO; +} + +static void +nvme_tcp_qpair_sock_cb(void *ctx, struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_nvme_qpair *qpair = ctx; + struct nvme_tcp_poll_group *pgroup = nvme_tcp_poll_group(qpair->poll_group); + int32_t num_completions; + + num_completions = spdk_nvme_qpair_process_completions(qpair, pgroup->completions_per_qpair); + + if (pgroup->num_completions >= 0 && num_completions >= 0) { + pgroup->num_completions += num_completions; + } else { + pgroup->num_completions = -ENXIO; + } +} + +static int +nvme_tcp_qpair_icreq_send(struct nvme_tcp_qpair *tqpair) +{ + struct spdk_nvme_tcp_ic_req *ic_req; + struct nvme_tcp_pdu *pdu; + uint64_t icreq_timeout_tsc; + int rc; + + pdu = &tqpair->send_pdu; + memset(&tqpair->send_pdu, 0, sizeof(tqpair->send_pdu)); + ic_req = &pdu->hdr.ic_req; + + ic_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_REQ; + ic_req->common.hlen = ic_req->common.plen = sizeof(*ic_req); + ic_req->pfv = 0; + ic_req->maxr2t = NVME_TCP_MAX_R2T_DEFAULT - 1; + ic_req->hpda = NVME_TCP_HPDA_DEFAULT; + + ic_req->dgst.bits.hdgst_enable = tqpair->qpair.ctrlr->opts.header_digest; + ic_req->dgst.bits.ddgst_enable = tqpair->qpair.ctrlr->opts.data_digest; + + nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_send_icreq_complete, tqpair); + + icreq_timeout_tsc = spdk_get_ticks() + (NVME_TCP_TIME_OUT_IN_SECONDS * spdk_get_ticks_hz()); + do { + rc = nvme_tcp_qpair_process_completions(&tqpair->qpair, 0); + } while ((tqpair->state == NVME_TCP_QPAIR_STATE_INVALID) && + (rc == 0) && (spdk_get_ticks() <= icreq_timeout_tsc)); + + if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) { + SPDK_ERRLOG("Failed to construct the tqpair=%p via correct icresp\n", tqpair); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Succesfully construct the tqpair=%p via correct 
icresp\n", tqpair); + + return 0; +} + +static int +nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct sockaddr_storage dst_addr; + struct sockaddr_storage src_addr; + int rc; + struct nvme_tcp_qpair *tqpair; + int family; + long int port; + struct spdk_sock_opts opts; + + tqpair = nvme_tcp_qpair(qpair); + + switch (ctrlr->trid.adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family); + + memset(&dst_addr, 0, sizeof(dst_addr)); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid); + rc = nvme_tcp_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid); + if (rc != 0) { + SPDK_ERRLOG("dst_addr nvme_tcp_parse_addr() failed\n"); + return -1; + } + + if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) { + memset(&src_addr, 0, sizeof(src_addr)); + rc = nvme_tcp_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid); + if (rc != 0) { + SPDK_ERRLOG("src_addr nvme_tcp_parse_addr() failed\n"); + return -1; + } + } + + port = spdk_strtol(ctrlr->trid.trsvcid, 10); + if (port <= 0 || port >= INT_MAX) { + SPDK_ERRLOG("Invalid port: %s\n", ctrlr->trid.trsvcid); + return -1; + } + + opts.opts_size = sizeof(opts); + spdk_sock_get_default_opts(&opts); + opts.priority = ctrlr->trid.priority; + tqpair->sock = spdk_sock_connect_ext(ctrlr->trid.traddr, port, NULL, &opts); + if (!tqpair->sock) { + SPDK_ERRLOG("sock connection error of tqpair=%p with addr=%s, port=%ld\n", + tqpair, ctrlr->trid.traddr, port); + return -1; + } + + tqpair->maxr2t = NVME_TCP_MAX_R2T_DEFAULT; + /* Explicitly set the state and recv_state of tqpair */ + tqpair->state = NVME_TCP_QPAIR_STATE_INVALID; + if (tqpair->recv_state != NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + } + rc = nvme_tcp_qpair_icreq_send(tqpair); + if (rc != 0) { + SPDK_ERRLOG("Unable to connect the tqpair\n"); + return -1; + } + + rc = nvme_fabric_qpair_connect(&tqpair->qpair, tqpair->num_entries); + if (rc < 0) { + SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); + return -1; + } + + return 0; +} + +static struct spdk_nvme_qpair * +nvme_tcp_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr, + uint16_t qid, uint32_t qsize, + enum spdk_nvme_qprio qprio, + uint32_t num_requests) +{ + struct nvme_tcp_qpair *tqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + tqpair = calloc(1, sizeof(struct nvme_tcp_qpair)); + if (!tqpair) { + SPDK_ERRLOG("failed to get create tqpair\n"); + return NULL; + } + + tqpair->num_entries = qsize; + qpair = &tqpair->qpair; + rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests); + if (rc != 0) { + free(tqpair); + return NULL; + } + + rc = nvme_tcp_alloc_reqs(tqpair); + if (rc) { + nvme_tcp_ctrlr_delete_io_qpair(ctrlr, qpair); + return NULL; + } + + return qpair; +} + +static struct spdk_nvme_qpair * +nvme_tcp_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + return nvme_tcp_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio, + opts->io_queue_requests); +} + +static struct spdk_nvme_ctrlr *nvme_tcp_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void 
*devhandle) +{ + struct nvme_tcp_ctrlr *tctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + int rc; + + tctrlr = calloc(1, sizeof(*tctrlr)); + if (tctrlr == NULL) { + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + tctrlr->ctrlr.opts = *opts; + tctrlr->ctrlr.trid = *trid; + + rc = nvme_ctrlr_construct(&tctrlr->ctrlr); + if (rc != 0) { + free(tctrlr); + return NULL; + } + + tctrlr->ctrlr.adminq = nvme_tcp_ctrlr_create_qpair(&tctrlr->ctrlr, 0, + tctrlr->ctrlr.opts.admin_queue_size, 0, + tctrlr->ctrlr.opts.admin_queue_size); + if (!tctrlr->ctrlr.adminq) { + SPDK_ERRLOG("failed to create admin qpair\n"); + nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + rc = nvme_transport_ctrlr_connect_qpair(&tctrlr->ctrlr, tctrlr->ctrlr.adminq); + if (rc < 0) { + SPDK_ERRLOG("failed to connect admin qpair\n"); + nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_get_cap(&tctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + nvme_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_get_vs(&tctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + nvme_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_add_process(&tctrlr->ctrlr, 0) != 0) { + SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n"); + nvme_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + nvme_ctrlr_init_cap(&tctrlr->ctrlr, &cap, &vs); + + return &tctrlr->ctrlr; +} + +static uint32_t +nvme_tcp_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* TCP transport doens't limit maximum IO transfer size. */ + return UINT32_MAX; +} + +static uint16_t +nvme_tcp_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + /* + * We do not support >1 SGE in the initiator currently, + * so we can only return 1 here. Once that support is + * added, this should return ctrlr->cdata.nvmf_specific.msdbd + * instead. 
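+	 * MSDBD is the NVMe-oF-specific identify field that advertises how many SGL
+	 * data block descriptors the target accepts per command.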
+ */ + return 1; +} + +static int +nvme_tcp_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct nvme_tcp_req *tcp_req, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + assert(tcp_req->req != NULL); + + rc = iter_fn(tcp_req->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static void +nvme_tcp_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_req *tcp_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + assert(tcp_req->req != NULL); + if (tcp_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + continue; + } + + nvme_tcp_req_complete(tcp_req, &cpl); + nvme_tcp_req_put(tqpair, tcp_req); + } +} + +static struct spdk_nvme_transport_poll_group * +nvme_tcp_poll_group_create(void) +{ + struct nvme_tcp_poll_group *group = calloc(1, sizeof(*group)); + + if (group == NULL) { + SPDK_ERRLOG("Unable to allocate poll group.\n"); + return NULL; + } + + group->sock_group = spdk_sock_group_create(group); + if (group->sock_group == NULL) { + free(group); + SPDK_ERRLOG("Unable to allocate sock group.\n"); + return NULL; + } + + return &group->group; +} + +static int +nvme_tcp_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group); + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) { + return -EPROTO; + } + return 0; +} + +static int +nvme_tcp_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group); + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + if (tqpair->sock && group->sock_group) { + if (spdk_sock_group_remove_sock(group->sock_group, tqpair->sock)) { + return -EPROTO; + } + } + return 0; +} + +static int +nvme_tcp_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); + + /* disconnected qpairs won't have a sock to add. 
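+	 * Such qpairs are still tracked by the poll group; their sock is added to the
+	 * sock group later, when nvme_tcp_poll_group_connect_qpair() runs during connect.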
*/ + if (nvme_qpair_get_state(qpair) >= NVME_QPAIR_CONNECTED) { + if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) { + return -EPROTO; + } + } + + return 0; +} + +static int +nvme_tcp_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + return nvme_poll_group_disconnect_qpair(qpair); + } + + return 0; +} + +static int64_t +nvme_tcp_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + group->completions_per_qpair = completions_per_qpair; + group->num_completions = 0; + + spdk_sock_group_poll(group->sock_group); + + STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + } + + return group->num_completions; +} + +static int +nvme_tcp_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + int rc; + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); + + if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { + return -EBUSY; + } + + rc = spdk_sock_group_close(&group->sock_group); + if (rc != 0) { + SPDK_ERRLOG("Failed to close the sock group for a tcp poll group.\n"); + assert(false); + } + + free(tgroup); + + return 0; +} + +const struct spdk_nvme_transport_ops tcp_ops = { + .name = "TCP", + .type = SPDK_NVME_TRANSPORT_TCP, + .ctrlr_construct = nvme_tcp_ctrlr_construct, + .ctrlr_scan = nvme_fabric_ctrlr_scan, + .ctrlr_destruct = nvme_tcp_ctrlr_destruct, + .ctrlr_enable = nvme_tcp_ctrlr_enable, + + .ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4, + .ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8, + .ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4, + .ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8, + + .ctrlr_get_max_xfer_size = nvme_tcp_ctrlr_get_max_xfer_size, + .ctrlr_get_max_sges = nvme_tcp_ctrlr_get_max_sges, + + .ctrlr_create_io_qpair = nvme_tcp_ctrlr_create_io_qpair, + .ctrlr_delete_io_qpair = nvme_tcp_ctrlr_delete_io_qpair, + .ctrlr_connect_qpair = nvme_tcp_ctrlr_connect_qpair, + .ctrlr_disconnect_qpair = nvme_tcp_ctrlr_disconnect_qpair, + + .qpair_abort_reqs = nvme_tcp_qpair_abort_reqs, + .qpair_reset = nvme_tcp_qpair_reset, + .qpair_submit_request = nvme_tcp_qpair_submit_request, + .qpair_process_completions = nvme_tcp_qpair_process_completions, + .qpair_iterate_requests = nvme_tcp_qpair_iterate_requests, + .admin_qpair_abort_aers = nvme_tcp_admin_qpair_abort_aers, + + .poll_group_create = nvme_tcp_poll_group_create, + .poll_group_connect_qpair = nvme_tcp_poll_group_connect_qpair, + .poll_group_disconnect_qpair = nvme_tcp_poll_group_disconnect_qpair, + .poll_group_add = nvme_tcp_poll_group_add, + .poll_group_remove = nvme_tcp_poll_group_remove, + .poll_group_process_completions = nvme_tcp_poll_group_process_completions, + .poll_group_destroy = nvme_tcp_poll_group_destroy, +}; + +SPDK_NVME_TRANSPORT_REGISTER(tcp, &tcp_ops); diff --git a/src/spdk/lib/nvme/nvme_transport.c b/src/spdk/lib/nvme/nvme_transport.c new file mode 100644 index 000000000..76efd5966 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_transport.c @@ -0,0 +1,591 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe transport abstraction + */ + +#include "nvme_internal.h" +#include "spdk/queue.h" + +#define SPDK_MAX_NUM_OF_TRANSPORTS 16 + +struct spdk_nvme_transport { + struct spdk_nvme_transport_ops ops; + TAILQ_ENTRY(spdk_nvme_transport) link; +}; + +TAILQ_HEAD(nvme_transport_list, spdk_nvme_transport) g_spdk_nvme_transports = + TAILQ_HEAD_INITIALIZER(g_spdk_nvme_transports); + +struct spdk_nvme_transport g_spdk_transports[SPDK_MAX_NUM_OF_TRANSPORTS] = {}; +int g_current_transport_index = 0; + +const struct spdk_nvme_transport * +nvme_get_first_transport(void) +{ + return TAILQ_FIRST(&g_spdk_nvme_transports); +} + +const struct spdk_nvme_transport * +nvme_get_next_transport(const struct spdk_nvme_transport *transport) +{ + return TAILQ_NEXT(transport, link); +} + +/* + * Unfortunately, due to NVMe PCIe multiprocess support, we cannot store the + * transport object in either the controller struct or the admin qpair. THis means + * that a lot of admin related transport calls will have to call nvme_get_transport + * in order to knwo which functions to call. + * In the I/O path, we have the ability to store the transport struct in the I/O + * qpairs to avoid taking a performance hit. + */ +const struct spdk_nvme_transport * +nvme_get_transport(const char *transport_name) +{ + struct spdk_nvme_transport *registered_transport; + + TAILQ_FOREACH(registered_transport, &g_spdk_nvme_transports, link) { + if (strcasecmp(transport_name, registered_transport->ops.name) == 0) { + return registered_transport; + } + } + + return NULL; +} + +bool +spdk_nvme_transport_available(enum spdk_nvme_transport_type trtype) +{ + return nvme_get_transport(spdk_nvme_transport_id_trtype_str(trtype)) == NULL ? false : true; +} + +bool +spdk_nvme_transport_available_by_name(const char *transport_name) +{ + return nvme_get_transport(transport_name) == NULL ? 
false : true; +} + +void spdk_nvme_transport_register(const struct spdk_nvme_transport_ops *ops) +{ + struct spdk_nvme_transport *new_transport; + + if (nvme_get_transport(ops->name)) { + SPDK_ERRLOG("Double registering NVMe transport %s is prohibited.\n", ops->name); + assert(false); + } + + if (g_current_transport_index == SPDK_MAX_NUM_OF_TRANSPORTS) { + SPDK_ERRLOG("Unable to register new NVMe transport.\n"); + assert(false); + return; + } + new_transport = &g_spdk_transports[g_current_transport_index++]; + + new_transport->ops = *ops; + TAILQ_INSERT_TAIL(&g_spdk_nvme_transports, new_transport, link); +} + +struct spdk_nvme_ctrlr *nvme_transport_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(trid->trstring); + struct spdk_nvme_ctrlr *ctrlr; + + if (transport == NULL) { + SPDK_ERRLOG("Transport %s doesn't exist.", trid->trstring); + return NULL; + } + + ctrlr = transport->ops.ctrlr_construct(trid, opts, devhandle); + + return ctrlr; +} + +int +nvme_transport_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(probe_ctx->trid.trstring); + + if (transport == NULL) { + SPDK_ERRLOG("Transport %s doesn't exist.", probe_ctx->trid.trstring); + return -ENOENT; + } + + return transport->ops.ctrlr_scan(probe_ctx, direct_connect); +} + +int +nvme_transport_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_destruct(ctrlr); +} + +int +nvme_transport_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_enable(ctrlr); +} + +int +nvme_transport_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_set_reg_4(ctrlr, offset, value); +} + +int +nvme_transport_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_set_reg_8(ctrlr, offset, value); +} + +int +nvme_transport_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_reg_4(ctrlr, offset, value); +} + +int +nvme_transport_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_reg_8(ctrlr, offset, value); +} + +uint32_t +nvme_transport_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_max_xfer_size(ctrlr); +} + +uint16_t +nvme_transport_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = 
nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_max_sges(ctrlr); +} + +int +nvme_transport_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + if (transport->ops.ctrlr_reserve_cmb != NULL) { + return transport->ops.ctrlr_reserve_cmb(ctrlr); + } + + return -ENOTSUP; +} + +void * +nvme_transport_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + if (transport->ops.ctrlr_map_cmb != NULL) { + return transport->ops.ctrlr_map_cmb(ctrlr, size); + } + + return NULL; +} + +int +nvme_transport_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + if (transport->ops.ctrlr_unmap_cmb != NULL) { + return transport->ops.ctrlr_unmap_cmb(ctrlr); + } + + return 0; +} + +struct spdk_nvme_qpair * +nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct spdk_nvme_qpair *qpair; + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + qpair = transport->ops.ctrlr_create_io_qpair(ctrlr, qid, opts); + if (qpair != NULL && !nvme_qpair_is_admin_queue(qpair)) { + qpair->transport = transport; + } + + return qpair; +} + +int +nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + + /* Do not rely on qpair->transport. For multi-process cases, a foreign process may delete + * the IO qpair, in which case the transport object would be invalid (each process has their + * own unique transport objects since they contain function pointers). So we look up the + * transport object in the delete_io_qpair case. + */ + return transport->ops.ctrlr_delete_io_qpair(ctrlr, qpair); +} + +int +nvme_transport_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + uint8_t transport_failure_reason; + int rc; + + assert(transport != NULL); + if (!nvme_qpair_is_admin_queue(qpair)) { + qpair->transport = transport; + } + + transport_failure_reason = qpair->transport_failure_reason; + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_NONE; + + nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTING); + rc = transport->ops.ctrlr_connect_qpair(ctrlr, qpair); + if (rc != 0) { + goto err; + } + + nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED); + if (qpair->poll_group) { + rc = nvme_poll_group_connect_qpair(qpair); + if (rc) { + goto err; + } + } + + return rc; + +err: + /* If the qpair was unable to reconnect, restore the original failure reason. 
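+	 * That way a caller inspecting the qpair afterwards still sees the original
+	 * cause of the failure rather than a generic reconnect error.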
*/ + qpair->transport_failure_reason = transport_failure_reason; + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED); + return rc; +} + +void +nvme_transport_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING || + nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTED) { + return; + } + + nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTING); + assert(transport != NULL); + if (qpair->poll_group) { + nvme_poll_group_disconnect_qpair(qpair); + } + + transport->ops.ctrlr_disconnect_qpair(ctrlr, qpair); + + nvme_qpair_abort_reqs(qpair, 0); + nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED); +} + +void +nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + const struct spdk_nvme_transport *transport; + + assert(dnr <= 1); + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + qpair->transport->ops.qpair_abort_reqs(qpair, dnr); + } else { + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + transport->ops.qpair_abort_reqs(qpair, dnr); + } +} + +int +nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_reset(qpair); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_reset(qpair); +} + +int +nvme_transport_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_submit_request(qpair, req); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_submit_request(qpair, req); +} + +int32_t +nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_process_completions(qpair, max_completions); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_process_completions(qpair, max_completions); +} + +int +nvme_transport_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_iterate_requests(qpair, iter_fn, arg); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_iterate_requests(qpair, iter_fn, arg); +} + +void +nvme_transport_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + + assert(transport != NULL); + transport->ops.admin_qpair_abort_aers(qpair); +} + +struct spdk_nvme_transport_poll_group * +nvme_transport_poll_group_create(const struct spdk_nvme_transport *transport) +{ + struct spdk_nvme_transport_poll_group *group = NULL; + + group = transport->ops.poll_group_create(); + if (group) 
{ + group->transport = transport; + STAILQ_INIT(&group->connected_qpairs); + STAILQ_INIT(&group->disconnected_qpairs); + } + + return group; +} + +int +nvme_transport_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + int rc; + + rc = tgroup->transport->ops.poll_group_add(tgroup, qpair); + if (rc == 0) { + qpair->poll_group = tgroup; + assert(nvme_qpair_get_state(qpair) < NVME_QPAIR_CONNECTED); + qpair->poll_group_tailq_head = &tgroup->disconnected_qpairs; + STAILQ_INSERT_TAIL(&tgroup->disconnected_qpairs, qpair, poll_group_stailq); + } + + return rc; +} + +int +nvme_transport_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + int rc; + + rc = tgroup->transport->ops.poll_group_remove(tgroup, qpair); + if (rc == 0) { + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + STAILQ_REMOVE(&tgroup->connected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + } else if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) { + STAILQ_REMOVE(&tgroup->disconnected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + } else { + return -ENOENT; + } + + qpair->poll_group = NULL; + qpair->poll_group_tailq_head = NULL; + } + + return rc; +} + +int64_t +nvme_transport_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_qpair *qpair; + int64_t rc; + + tgroup->in_completion_context = true; + rc = tgroup->transport->ops.poll_group_process_completions(tgroup, completions_per_qpair, + disconnected_qpair_cb); + tgroup->in_completion_context = false; + + if (spdk_unlikely(tgroup->num_qpairs_to_delete > 0)) { + /* deleted qpairs are more likely to be in the disconnected qpairs list. */ + STAILQ_FOREACH(qpair, &tgroup->disconnected_qpairs, poll_group_stailq) { + if (spdk_unlikely(qpair->delete_after_completion_context)) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + if (--tgroup->num_qpairs_to_delete == 0) { + return rc; + } + } + } + + STAILQ_FOREACH(qpair, &tgroup->connected_qpairs, poll_group_stailq) { + if (spdk_unlikely(qpair->delete_after_completion_context)) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + if (--tgroup->num_qpairs_to_delete == 0) { + return rc; + } + } + } + /* Just in case. */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Mismatch between qpairs to delete and poll group number.\n"); + tgroup->num_qpairs_to_delete = 0; + } + + return rc; +} + +int +nvme_transport_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + return tgroup->transport->ops.poll_group_destroy(tgroup); +} + +int +nvme_transport_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + int rc; + + tgroup = qpair->poll_group; + + if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) { + return 0; + } + + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + rc = tgroup->transport->ops.poll_group_disconnect_qpair(qpair); + if (rc == 0) { + qpair->poll_group_tailq_head = &tgroup->disconnected_qpairs; + STAILQ_REMOVE(&tgroup->connected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + STAILQ_INSERT_TAIL(&tgroup->disconnected_qpairs, qpair, poll_group_stailq); + /* EINPROGRESS indicates that a call has already been made to this function. + * It just keeps us from segfaulting on a double removal/insert. 
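+			 * A second disconnect request that races with one already in flight is
+			 * therefore treated as success below instead of bubbling the error up
+			 * to the caller.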
+ */ + } else if (rc == -EINPROGRESS) { + rc = 0; + } + return rc; + } + + return -EINVAL; +} + +int +nvme_transport_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + int rc; + + tgroup = qpair->poll_group; + + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + return 0; + } + + if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) { + rc = tgroup->transport->ops.poll_group_connect_qpair(qpair); + if (rc == 0) { + qpair->poll_group_tailq_head = &tgroup->connected_qpairs; + STAILQ_REMOVE(&tgroup->disconnected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + STAILQ_INSERT_TAIL(&tgroup->connected_qpairs, qpair, poll_group_stailq); + } + + return rc == -EINPROGRESS ? 0 : rc; + } + + + return -EINVAL; +} diff --git a/src/spdk/lib/nvme/nvme_uevent.c b/src/spdk/lib/nvme/nvme_uevent.c new file mode 100644 index 000000000..1bcfff1cb --- /dev/null +++ b/src/spdk/lib/nvme/nvme_uevent.c @@ -0,0 +1,213 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/string.h" + +#include "spdk/log.h" + +#include "nvme_uevent.h" + +#ifdef __linux__ + +#include <linux/netlink.h> + +#define SPDK_UEVENT_MSG_LEN 4096 + +int +nvme_uevent_connect(void) +{ + struct sockaddr_nl addr; + int netlink_fd; + int size = 64 * 1024; + int flag; + + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; + addr.nl_pid = getpid(); + addr.nl_groups = 0xffffffff; + + netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT); + if (netlink_fd < 0) { + return -1; + } + + setsockopt(netlink_fd, SOL_SOCKET, SO_RCVBUFFORCE, &size, sizeof(size)); + + flag = fcntl(netlink_fd, F_GETFL); + if (fcntl(netlink_fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", netlink_fd, + spdk_strerror(errno)); + close(netlink_fd); + return -1; + } + + if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + close(netlink_fd); + return -1; + } + return netlink_fd; +} + +/* Note: We only parse the event from uio subsystem and will ignore + * all the event from other subsystem. the event from uio subsystem + * as below: + * action: "add" or "remove" + * subsystem: "uio" + * dev_path: "/devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0" + */ +static int +parse_event(const char *buf, struct spdk_uevent *event) +{ + char action[SPDK_UEVENT_MSG_LEN]; + char subsystem[SPDK_UEVENT_MSG_LEN]; + char dev_path[SPDK_UEVENT_MSG_LEN]; + char driver[SPDK_UEVENT_MSG_LEN]; + char vfio_pci_addr[SPDK_UEVENT_MSG_LEN]; + + memset(action, 0, SPDK_UEVENT_MSG_LEN); + memset(subsystem, 0, SPDK_UEVENT_MSG_LEN); + memset(dev_path, 0, SPDK_UEVENT_MSG_LEN); + memset(driver, 0, SPDK_UEVENT_MSG_LEN); + memset(vfio_pci_addr, 0, SPDK_UEVENT_MSG_LEN); + + while (*buf) { + if (!strncmp(buf, "ACTION=", 7)) { + buf += 7; + snprintf(action, sizeof(action), "%s", buf); + } else if (!strncmp(buf, "DEVPATH=", 8)) { + buf += 8; + snprintf(dev_path, sizeof(dev_path), "%s", buf); + } else if (!strncmp(buf, "SUBSYSTEM=", 10)) { + buf += 10; + snprintf(subsystem, sizeof(subsystem), "%s", buf); + } else if (!strncmp(buf, "DRIVER=", 7)) { + buf += 7; + snprintf(driver, sizeof(driver), "%s", buf); + } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) { + buf += 14; + snprintf(vfio_pci_addr, sizeof(vfio_pci_addr), "%s", buf); + } + while (*buf++) + ; + } + + if (!strncmp(subsystem, "uio", 3)) { + char *pci_address, *tmp; + struct spdk_pci_addr pci_addr; + + event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UIO; + if (!strncmp(action, "add", 3)) { + event->action = SPDK_NVME_UEVENT_ADD; + } + if (!strncmp(action, "remove", 6)) { + event->action = SPDK_NVME_UEVENT_REMOVE; + } + tmp = strstr(dev_path, "/uio/"); + + memset(tmp, 0, SPDK_UEVENT_MSG_LEN - (tmp - dev_path)); + + pci_address = strrchr(dev_path, '/'); + pci_address++; + if (spdk_pci_addr_parse(&pci_addr, pci_address) != 0) { + SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", pci_address); + return -1; + } + spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr); + return 1; + } + if (!strncmp(driver, "vfio-pci", 8)) { + struct spdk_pci_addr pci_addr; + + event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_VFIO; + if (!strncmp(action, "bind", 4)) { + event->action = SPDK_NVME_UEVENT_ADD; + } + if (!strncmp(action, "remove", 6)) { + event->action = SPDK_NVME_UEVENT_REMOVE; + } + if (spdk_pci_addr_parse(&pci_addr, vfio_pci_addr) != 0) { + SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", vfio_pci_addr); + return -1; + } + 
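+		/* Re-format the parsed address into canonical BDF form so the caller always
+		 * sees a consistent traddr string, regardless of how the kernel spelled it. */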
spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr); + return 1; + + } + return -1; +} + +int +nvme_get_uevent(int fd, struct spdk_uevent *uevent) +{ + int ret; + char buf[SPDK_UEVENT_MSG_LEN]; + + memset(uevent, 0, sizeof(struct spdk_uevent)); + memset(buf, 0, SPDK_UEVENT_MSG_LEN); + + ret = recv(fd, buf, SPDK_UEVENT_MSG_LEN - 1, MSG_DONTWAIT); + if (ret > 0) { + return parse_event(buf, uevent); + } + + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } else { + SPDK_ERRLOG("Socket read error(%d): %s\n", errno, spdk_strerror(errno)); + return -1; + } + } + + /* connection closed */ + if (ret == 0) { + return -1; + } + return 0; +} + +#else /* Not Linux */ + +int +nvme_uevent_connect(void) +{ + return -1; +} + +int +nvme_get_uevent(int fd, struct spdk_uevent *uevent) +{ + return -1; +} +#endif diff --git a/src/spdk/lib/nvme/nvme_uevent.h b/src/spdk/lib/nvme/nvme_uevent.h new file mode 100644 index 000000000..778d73c2a --- /dev/null +++ b/src/spdk/lib/nvme/nvme_uevent.h @@ -0,0 +1,61 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * SPDK uevent + */ + +#include "spdk/env.h" +#include "spdk/nvmf_spec.h" + +#ifndef SPDK_UEVENT_H_ +#define SPDK_UEVENT_H_ + +#define SPDK_NVME_UEVENT_SUBSYSTEM_UIO 1 +#define SPDK_NVME_UEVENT_SUBSYSTEM_VFIO 2 + +enum spdk_nvme_uevent_action { + SPDK_NVME_UEVENT_ADD = 0, + SPDK_NVME_UEVENT_REMOVE = 1, +}; + +struct spdk_uevent { + enum spdk_nvme_uevent_action action; + int subsystem; + char traddr[SPDK_NVMF_TRADDR_MAX_LEN + 1]; +}; + +int nvme_uevent_connect(void); +int nvme_get_uevent(int fd, struct spdk_uevent *uevent); + +#endif /* SPDK_UEVENT_H_ */ diff --git a/src/spdk/lib/nvme/spdk_nvme.map b/src/spdk/lib/nvme/spdk_nvme.map new file mode 100644 index 000000000..63a04eeca --- /dev/null +++ b/src/spdk/lib/nvme/spdk_nvme.map @@ -0,0 +1,185 @@ +{ + global: + + # public functions from nvme.h + spdk_nvme_transport_register; + spdk_nvme_transport_available; + spdk_nvme_transport_available_by_name; + spdk_nvme_transport_id_parse; + spdk_nvme_transport_id_populate_trstring; + spdk_nvme_transport_id_parse_trtype; + spdk_nvme_transport_id_trtype_str; + spdk_nvme_transport_id_adrfam_str; + spdk_nvme_transport_id_parse_adrfam; + spdk_nvme_transport_id_compare; + spdk_nvme_trid_populate_transport; + spdk_nvme_host_id_parse; + + spdk_nvme_prchk_flags_parse; + spdk_nvme_prchk_flags_str; + + spdk_nvme_probe; + spdk_nvme_connect; + spdk_nvme_connect_async; + spdk_nvme_probe_async; + spdk_nvme_probe_poll_async; + spdk_nvme_detach; + + spdk_nvme_ctrlr_is_discovery; + spdk_nvme_ctrlr_get_default_ctrlr_opts; + spdk_nvme_ctrlr_set_trid; + spdk_nvme_ctrlr_reset; + spdk_nvme_ctrlr_fail; + spdk_nvme_ctrlr_is_failed; + spdk_nvme_ctrlr_get_data; + spdk_nvme_ctrlr_get_regs_csts; + spdk_nvme_ctrlr_get_regs_cap; + spdk_nvme_ctrlr_get_regs_vs; + spdk_nvme_ctrlr_get_regs_cmbsz; + spdk_nvme_ctrlr_get_num_ns; + spdk_nvme_ctrlr_get_pci_device; + spdk_nvme_ctrlr_get_max_xfer_size; + spdk_nvme_ctrlr_is_active_ns; + spdk_nvme_ctrlr_get_first_active_ns; + spdk_nvme_ctrlr_get_next_active_ns; + spdk_nvme_ctrlr_is_log_page_supported; + spdk_nvme_ctrlr_is_feature_supported; + spdk_nvme_ctrlr_register_aer_callback; + spdk_nvme_ctrlr_register_timeout_callback; + spdk_nvme_ctrlr_get_default_io_qpair_opts; + spdk_nvme_ctrlr_alloc_io_qpair; + spdk_nvme_ctrlr_connect_io_qpair; + spdk_nvme_ctrlr_disconnect_io_qpair; + spdk_nvme_ctrlr_reconnect_io_qpair; + spdk_nvme_ctrlr_get_admin_qp_failure_reason; + spdk_nvme_ctrlr_free_io_qpair; + spdk_nvme_ctrlr_io_cmd_raw_no_payload_build; + spdk_nvme_ctrlr_cmd_io_raw; + spdk_nvme_ctrlr_cmd_io_raw_with_md; + spdk_nvme_ctrlr_cmd_admin_raw; + spdk_nvme_ctrlr_process_admin_completions; + spdk_nvme_ctrlr_get_ns; + spdk_nvme_ctrlr_cmd_get_log_page; + spdk_nvme_ctrlr_cmd_get_log_page_ext; + spdk_nvme_ctrlr_cmd_abort; + spdk_nvme_ctrlr_cmd_abort_ext; + spdk_nvme_ctrlr_cmd_set_feature; + spdk_nvme_ctrlr_cmd_get_feature; + spdk_nvme_ctrlr_cmd_get_feature_ns; + spdk_nvme_ctrlr_cmd_set_feature_ns; + spdk_nvme_ctrlr_cmd_security_receive; + spdk_nvme_ctrlr_cmd_security_send; + spdk_nvme_ctrlr_security_receive; + spdk_nvme_ctrlr_security_send; + spdk_nvme_ctrlr_get_flags; + spdk_nvme_ctrlr_attach_ns; + spdk_nvme_ctrlr_detach_ns; + spdk_nvme_ctrlr_create_ns; + spdk_nvme_ctrlr_delete_ns; + spdk_nvme_ctrlr_format; + spdk_nvme_ctrlr_update_firmware; + spdk_nvme_ctrlr_get_registers; + spdk_nvme_ctrlr_reserve_cmb; + spdk_nvme_ctrlr_map_cmb; + spdk_nvme_ctrlr_unmap_cmb; + spdk_nvme_ctrlr_get_transport_id; + + spdk_nvme_poll_group_create; + spdk_nvme_poll_group_add; + spdk_nvme_poll_group_remove; 
+ spdk_nvme_poll_group_destroy; + spdk_nvme_poll_group_process_completions; + spdk_nvme_poll_group_get_ctx; + + spdk_nvme_ns_get_data; + spdk_nvme_ns_get_id; + spdk_nvme_ns_get_ctrlr; + spdk_nvme_ns_is_active; + spdk_nvme_ns_get_max_io_xfer_size; + spdk_nvme_ns_get_sector_size; + spdk_nvme_ns_get_extended_sector_size; + spdk_nvme_ns_get_num_sectors; + spdk_nvme_ns_get_size; + spdk_nvme_ns_get_pi_type; + spdk_nvme_ns_get_md_size; + spdk_nvme_ns_supports_extended_lba; + spdk_nvme_ns_supports_compare; + spdk_nvme_ns_get_dealloc_logical_block_read_value; + spdk_nvme_ns_get_optimal_io_boundary; + spdk_nvme_ns_get_uuid; + spdk_nvme_ns_get_flags; + + spdk_nvme_ns_cmd_write; + spdk_nvme_ns_cmd_writev; + spdk_nvme_ns_cmd_writev_with_md; + spdk_nvme_ns_cmd_write_with_md; + spdk_nvme_ns_cmd_write_zeroes; + spdk_nvme_ns_cmd_write_uncorrectable; + spdk_nvme_ns_cmd_read; + spdk_nvme_ns_cmd_readv; + spdk_nvme_ns_cmd_readv_with_md; + spdk_nvme_ns_cmd_read_with_md; + spdk_nvme_ns_cmd_dataset_management; + spdk_nvme_ns_cmd_flush; + spdk_nvme_ns_cmd_reservation_register; + spdk_nvme_ns_cmd_reservation_release; + spdk_nvme_ns_cmd_reservation_acquire; + spdk_nvme_ns_cmd_reservation_report; + spdk_nvme_ns_cmd_compare; + spdk_nvme_ns_cmd_comparev; + spdk_nvme_ns_cmd_comparev_with_md; + spdk_nvme_ns_cmd_compare_with_md; + + spdk_nvme_qpair_process_completions; + spdk_nvme_qpair_get_failure_reason; + spdk_nvme_qpair_add_cmd_error_injection; + spdk_nvme_qpair_remove_cmd_error_injection; + spdk_nvme_qpair_print_command; + spdk_nvme_qpair_print_completion; + spdk_nvme_print_command; + spdk_nvme_print_completion; + + spdk_nvme_cpl_get_status_string; + + spdk_nvme_rdma_init_hooks; + + spdk_nvme_cuse_get_ctrlr_name; + spdk_nvme_cuse_get_ns_name; + spdk_nvme_cuse_register; + spdk_nvme_cuse_unregister; + spdk_nvme_cuse_update_namespaces; + + spdk_nvme_map_prps; + + # public functions from nvme_ocssd.h + spdk_nvme_ctrlr_is_ocssd_supported; + spdk_nvme_ocssd_ctrlr_cmd_geometry; + spdk_nvme_ocssd_ns_cmd_vector_reset; + spdk_nvme_ocssd_ns_cmd_vector_write; + spdk_nvme_ocssd_ns_cmd_vector_write_with_md; + spdk_nvme_ocssd_ns_cmd_vector_read; + spdk_nvme_ocssd_ns_cmd_vector_read_with_md; + spdk_nvme_ocssd_ns_cmd_vector_copy; + + # public functions from opal.h + spdk_opal_dev_construct; + spdk_opal_dev_destruct; + spdk_opal_get_d0_features_info; + spdk_opal_supported; + spdk_opal_cmd_take_ownership; + spdk_opal_cmd_revert_tper; + spdk_opal_cmd_activate_locking_sp; + spdk_opal_cmd_lock_unlock; + spdk_opal_cmd_setup_locking_range; + spdk_opal_cmd_get_max_ranges; + spdk_opal_cmd_get_locking_range_info; + spdk_opal_cmd_enable_user; + spdk_opal_cmd_add_user_to_locking_range; + spdk_opal_cmd_set_new_passwd; + spdk_opal_cmd_erase_locking_range; + spdk_opal_cmd_secure_erase_locking_range; + spdk_opal_get_locking_range_info; + spdk_opal_free_locking_range_info; + + local: *; +}; diff --git a/src/spdk/lib/nvmf/Makefile b/src/spdk/lib/nvmf/Makefile new file mode 100644 index 000000000..b4556564a --- /dev/null +++ b/src/spdk/lib/nvmf/Makefile @@ -0,0 +1,75 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 5 +SO_MINOR := 0 + +C_SRCS = ctrlr.c ctrlr_discovery.c ctrlr_bdev.c \ + subsystem.c nvmf.c nvmf_rpc.c transport.c tcp.c + +C_SRCS-$(CONFIG_RDMA) += rdma.c +LIBNAME = nvmf +LOCAL_SYS_LIBS = -luuid +ifeq ($(CONFIG_RDMA),y) +LOCAL_SYS_LIBS += -libverbs -lrdmacm +#Attach only if FreeBSD and RDMA is specified with configure +ifeq ($(OS),FreeBSD) +# Mellanox - MLX4 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx4.*)","") +LOCAL_SYS_LIBS += -lmlx4 +endif +# Mellanox - MLX5 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx5.*)","") +LOCAL_SYS_LIBS += -lmlx5 +endif +# Chelsio HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libcxgb4.*)","") +LOCAL_SYS_LIBS += -lcxgb4 +endif +endif +endif + +ifeq ($(CONFIG_FC),y) +C_SRCS += fc.c fc_ls.c +CFLAGS += -I$(CURDIR) +ifneq ($(strip $(CONFIG_FC_PATH)),) +CFLAGS += -I$(CONFIG_FC_PATH) +endif +endif + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_nvmf.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/nvmf/ctrlr.c b/src/spdk/lib/nvmf/ctrlr.c new file mode 100644 index 000000000..638cde9d2 --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr.c @@ -0,0 +1,3224 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/bit_array.h" +#include "spdk/endian.h" +#include "spdk/thread.h" +#include "spdk/trace.h" +#include "spdk/nvme_spec.h" +#include "spdk/nvmf_cmd.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/version.h" + +#include "spdk_internal/log.h" + +#define MIN_KEEP_ALIVE_TIMEOUT_IN_MS 10000 +#define NVMF_DISC_KATO_IN_MS 120000 +#define KAS_TIME_UNIT_IN_MS 100 +#define KAS_DEFAULT_VALUE (MIN_KEEP_ALIVE_TIMEOUT_IN_MS / KAS_TIME_UNIT_IN_MS) + +/* + * Report the SPDK version as the firmware revision. + * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts. + */ +#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING + +/* + * Support for custom admin command handlers + */ +struct spdk_nvmf_custom_admin_cmd { + spdk_nvmf_custom_cmd_hdlr hdlr; + uint32_t nsid; /* nsid to forward */ +}; + +static struct spdk_nvmf_custom_admin_cmd g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_MAX_OPC + 1]; + +static void _nvmf_request_complete(void *ctx); + +static inline void +nvmf_invalid_connect_response(struct spdk_nvmf_fabric_connect_rsp *rsp, + uint8_t iattr, uint16_t ipo) +{ + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + rsp->status_code_specific.invalid.iattr = iattr; + rsp->status_code_specific.invalid.ipo = ipo; +} + +#define SPDK_NVMF_INVALID_CONNECT_CMD(rsp, field) \ + nvmf_invalid_connect_response(rsp, 0, offsetof(struct spdk_nvmf_fabric_connect_cmd, field)) +#define SPDK_NVMF_INVALID_CONNECT_DATA(rsp, field) \ + nvmf_invalid_connect_response(rsp, 1, offsetof(struct spdk_nvmf_fabric_connect_data, field)) + +static void +nvmf_ctrlr_stop_keep_alive_timer(struct spdk_nvmf_ctrlr *ctrlr) +{ + if (!ctrlr) { + SPDK_ERRLOG("Controller is NULL\n"); + return; + } + + if (ctrlr->keep_alive_poller == NULL) { + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Stop keep alive poller\n"); + spdk_poller_unregister(&ctrlr->keep_alive_poller); +} + +static void +nvmf_ctrlr_disconnect_qpairs_done(struct spdk_io_channel_iter *i, int status) +{ + if (status == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ctrlr disconnect qpairs complete successfully\n"); + } else { + SPDK_ERRLOG("Fail to disconnect ctrlr qpairs\n"); + } +} + +static int +_nvmf_ctrlr_disconnect_qpairs_on_pg(struct spdk_io_channel_iter *i, bool include_admin) +{ + int rc = 0; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_qpair *qpair, *temp_qpair; + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + + ctrlr = spdk_io_channel_iter_get_ctx(i); + ch = spdk_io_channel_iter_get_channel(i); + group = 
spdk_io_channel_get_ctx(ch); + + TAILQ_FOREACH_SAFE(qpair, &group->qpairs, link, temp_qpair) { + if (qpair->ctrlr == ctrlr && (include_admin || !nvmf_qpair_is_admin_queue(qpair))) { + rc = spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + if (rc) { + SPDK_ERRLOG("Qpair disconnect failed\n"); + return rc; + } + } + } + + return rc; +} + +static void +nvmf_ctrlr_disconnect_qpairs_on_pg(struct spdk_io_channel_iter *i) +{ + spdk_for_each_channel_continue(i, _nvmf_ctrlr_disconnect_qpairs_on_pg(i, true)); +} + +static void +nvmf_ctrlr_disconnect_io_qpairs_on_pg(struct spdk_io_channel_iter *i) +{ + spdk_for_each_channel_continue(i, _nvmf_ctrlr_disconnect_qpairs_on_pg(i, false)); +} + +static int +nvmf_ctrlr_keep_alive_poll(void *ctx) +{ + uint64_t keep_alive_timeout_tick; + uint64_t now = spdk_get_ticks(); + struct spdk_nvmf_ctrlr *ctrlr = ctx; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Polling ctrlr keep alive timeout\n"); + + /* If the Keep alive feature is in use and the timer expires */ + keep_alive_timeout_tick = ctrlr->last_keep_alive_tick + + ctrlr->feat.keep_alive_timer.bits.kato * spdk_get_ticks_hz() / UINT64_C(1000); + if (now > keep_alive_timeout_tick) { + SPDK_NOTICELOG("Disconnecting host from subsystem %s due to keep alive timeout.\n", + ctrlr->subsys->subnqn); + /* set the Controller Fatal Status bit to '1' */ + if (ctrlr->vcprop.csts.bits.cfs == 0) { + ctrlr->vcprop.csts.bits.cfs = 1; + + /* + * disconnect qpairs, terminate Transport connection + * destroy ctrlr, break the host to controller association + * disconnect qpairs with qpair->ctrlr == ctrlr + */ + spdk_for_each_channel(ctrlr->subsys->tgt, + nvmf_ctrlr_disconnect_qpairs_on_pg, + ctrlr, + nvmf_ctrlr_disconnect_qpairs_done); + } + } + + return SPDK_POLLER_BUSY; +} + +static void +nvmf_ctrlr_start_keep_alive_timer(struct spdk_nvmf_ctrlr *ctrlr) +{ + if (!ctrlr) { + SPDK_ERRLOG("Controller is NULL\n"); + return; + } + + /* if cleared to 0 then the Keep Alive Timer is disabled */ + if (ctrlr->feat.keep_alive_timer.bits.kato != 0) { + + ctrlr->last_keep_alive_tick = spdk_get_ticks(); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Ctrlr add keep alive poller\n"); + ctrlr->keep_alive_poller = SPDK_POLLER_REGISTER(nvmf_ctrlr_keep_alive_poll, ctrlr, + ctrlr->feat.keep_alive_timer.bits.kato * 1000); + } +} + +static void +ctrlr_add_qpair_and_update_rsp(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_fabric_connect_rsp *rsp) +{ + assert(ctrlr->admin_qpair->group->thread == spdk_get_thread()); + + /* check if we would exceed ctrlr connection limit */ + if (qpair->qid >= spdk_bit_array_capacity(ctrlr->qpair_mask)) { + SPDK_ERRLOG("Requested QID %u but Max QID is %u\n", + qpair->qid, spdk_bit_array_capacity(ctrlr->qpair_mask) - 1); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return; + } + + if (spdk_bit_array_get(ctrlr->qpair_mask, qpair->qid)) { + SPDK_ERRLOG("Got I/O connect with duplicate QID %u\n", qpair->qid); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return; + } + + qpair->ctrlr = ctrlr; + spdk_bit_array_set(ctrlr->qpair_mask, qpair->qid); + + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + rsp->status_code_specific.success.cntlid = ctrlr->cntlid; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "connect capsule response: cntlid = 0x%04x\n", + rsp->status_code_specific.success.cntlid); +} + +static void +_nvmf_ctrlr_add_admin_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct 
spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + ctrlr->admin_qpair = qpair; + nvmf_ctrlr_start_keep_alive_timer(ctrlr); + ctrlr_add_qpair_and_update_rsp(qpair, ctrlr, rsp); + _nvmf_request_complete(req); +} + +static void +_nvmf_subsystem_add_ctrlr(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + if (nvmf_subsystem_add_ctrlr(ctrlr->subsys, ctrlr)) { + SPDK_ERRLOG("Unable to add controller to subsystem\n"); + spdk_bit_array_free(&ctrlr->qpair_mask); + free(ctrlr); + qpair->ctrlr = NULL; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_nvmf_request_complete(req); + return; + } + + spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_add_admin_qpair, req); +} + +static void +nvmf_ctrlr_cdata_init(struct spdk_nvmf_transport *transport, struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr_data *cdata) +{ + cdata->kas = KAS_DEFAULT_VALUE; + cdata->sgls.supported = 1; + cdata->sgls.keyed_sgl = 1; + cdata->sgls.sgl_offset = 1; + cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16; + cdata->nvmf_specific.ioccsz += transport->opts.in_capsule_data_size / 16; + cdata->nvmf_specific.iorcsz = sizeof(struct spdk_nvme_cpl) / 16; + cdata->nvmf_specific.icdoff = 0; /* offset starts directly after SQE */ + cdata->nvmf_specific.ctrattr.ctrlr_model = SPDK_NVMF_CTRLR_MODEL_DYNAMIC; + cdata->nvmf_specific.msdbd = 1; + + if (transport->ops->cdata_init) { + transport->ops->cdata_init(transport, subsystem, cdata); + } +} + +static struct spdk_nvmf_ctrlr * +nvmf_ctrlr_create(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_request *req, + struct spdk_nvmf_fabric_connect_cmd *connect_cmd, + struct spdk_nvmf_fabric_connect_data *connect_data) +{ + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_transport *transport; + + ctrlr = calloc(1, sizeof(*ctrlr)); + if (ctrlr == NULL) { + SPDK_ERRLOG("Memory allocation failed\n"); + return NULL; + } + + TAILQ_INIT(&ctrlr->log_head); + ctrlr->subsys = subsystem; + ctrlr->thread = req->qpair->group->thread; + + transport = req->qpair->transport; + ctrlr->qpair_mask = spdk_bit_array_create(transport->opts.max_qpairs_per_ctrlr); + if (!ctrlr->qpair_mask) { + SPDK_ERRLOG("Failed to allocate controller qpair mask\n"); + free(ctrlr); + return NULL; + } + + nvmf_ctrlr_cdata_init(transport, subsystem, &ctrlr->cdata); + + /* + * KAS: This field indicates the granularity of the Keep Alive Timer in 100ms units. + * If this field is cleared to 0h, then Keep Alive is not supported. + */ + if (ctrlr->cdata.kas) { + ctrlr->feat.keep_alive_timer.bits.kato = spdk_divide_round_up(connect_cmd->kato, + KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS) * + KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS; + } + + ctrlr->feat.async_event_configuration.bits.ns_attr_notice = 1; + ctrlr->feat.volatile_write_cache.bits.wce = 1; + + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + /* + * If keep-alive timeout is not set, discovery controllers use some + * arbitrary high value in order to cleanup stale discovery sessions + * + * From the 1.0a nvme-of spec: + * "The Keep Alive command is reserved for + * Discovery controllers. A transport may specify a + * fixed Discovery controller activity timeout value + * (e.g., 2 minutes). 
If no commands are received + * by a Discovery controller within that time + * period, the controller may perform the + * actions for Keep Alive Timer expiration". + * kato is in millisecond. + */ + if (ctrlr->feat.keep_alive_timer.bits.kato == 0) { + ctrlr->feat.keep_alive_timer.bits.kato = NVMF_DISC_KATO_IN_MS; + } + } + + /* Subtract 1 for admin queue, 1 for 0's based */ + ctrlr->feat.number_of_queues.bits.ncqr = transport->opts.max_qpairs_per_ctrlr - 1 - + 1; + ctrlr->feat.number_of_queues.bits.nsqr = transport->opts.max_qpairs_per_ctrlr - 1 - + 1; + + spdk_uuid_copy(&ctrlr->hostid, (struct spdk_uuid *)connect_data->hostid); + memcpy(ctrlr->hostnqn, connect_data->hostnqn, sizeof(ctrlr->hostnqn)); + + ctrlr->vcprop.cap.raw = 0; + ctrlr->vcprop.cap.bits.cqr = 1; /* NVMe-oF specification required */ + ctrlr->vcprop.cap.bits.mqes = transport->opts.max_queue_depth - + 1; /* max queue depth */ + ctrlr->vcprop.cap.bits.ams = 0; /* optional arb mechanisms */ + ctrlr->vcprop.cap.bits.to = 1; /* ready timeout - 500 msec units */ + ctrlr->vcprop.cap.bits.dstrd = 0; /* fixed to 0 for NVMe-oF */ + ctrlr->vcprop.cap.bits.css = SPDK_NVME_CAP_CSS_NVM; /* NVM command set */ + ctrlr->vcprop.cap.bits.mpsmin = 0; /* 2 ^ (12 + mpsmin) == 4k */ + ctrlr->vcprop.cap.bits.mpsmax = 0; /* 2 ^ (12 + mpsmax) == 4k */ + + /* Version Supported: 1.3 */ + ctrlr->vcprop.vs.bits.mjr = 1; + ctrlr->vcprop.vs.bits.mnr = 3; + ctrlr->vcprop.vs.bits.ter = 0; + + ctrlr->vcprop.cc.raw = 0; + ctrlr->vcprop.cc.bits.en = 0; /* Init controller disabled */ + + ctrlr->vcprop.csts.raw = 0; + ctrlr->vcprop.csts.bits.rdy = 0; /* Init controller as not ready */ + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cap 0x%" PRIx64 "\n", ctrlr->vcprop.cap.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "vs 0x%x\n", ctrlr->vcprop.vs.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cc 0x%x\n", ctrlr->vcprop.cc.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "csts 0x%x\n", ctrlr->vcprop.csts.raw); + + ctrlr->dif_insert_or_strip = transport->opts.dif_insert_or_strip; + + req->qpair->ctrlr = ctrlr; + spdk_thread_send_msg(subsystem->thread, _nvmf_subsystem_add_ctrlr, req); + + return ctrlr; +} + +static void +_nvmf_ctrlr_destruct(void *ctx) +{ + struct spdk_nvmf_ctrlr *ctrlr = ctx; + struct spdk_nvmf_reservation_log *log, *log_tmp; + + nvmf_ctrlr_stop_keep_alive_timer(ctrlr); + + TAILQ_FOREACH_SAFE(log, &ctrlr->log_head, link, log_tmp) { + TAILQ_REMOVE(&ctrlr->log_head, log, link); + free(log); + } + free(ctrlr); +} + +void +nvmf_ctrlr_destruct(struct spdk_nvmf_ctrlr *ctrlr) +{ + nvmf_subsystem_remove_ctrlr(ctrlr->subsys, ctrlr); + + spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_destruct, ctrlr); +} + +static void +nvmf_ctrlr_add_io_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + /* Unit test will check qpair->ctrlr after calling spdk_nvmf_ctrlr_connect. + * For error case, the value should be NULL. So set it to NULL at first. 
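Editor's note: a small self-contained sketch of the keep-alive granularity applied in nvmf_ctrlr_create() above. The constants mirror the ones defined at the top of this file, divide_round_up() stands in for spdk_divide_round_up(), and the 15000 ms request is a made-up host value.

#include <stdint.h>
#include <stdio.h>

#define KAS_TIME_UNIT_IN_MS 100
#define KAS_DEFAULT_VALUE   (10000 / KAS_TIME_UNIT_IN_MS)	/* 100, i.e. KAS reports 100 units of 100 ms */

static uint32_t
divide_round_up(uint32_t num, uint32_t denom)
{
	return (num + denom - 1) / denom;
}

int
main(void)
{
	uint32_t requested_kato = 15000;	/* ms, hypothetical value from the connect command */
	uint32_t granularity = KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS;	/* 10000 ms */
	uint32_t kato = divide_round_up(requested_kato, granularity) * granularity;

	printf("KATO rounded up from %u ms to %u ms\n", requested_kato, kato);	/* 20000 */
	return 0;
}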
+ */ + qpair->ctrlr = NULL; + + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + SPDK_ERRLOG("I/O connect not allowed on discovery controller\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (!ctrlr->vcprop.cc.bits.en) { + SPDK_ERRLOG("Got I/O connect before ctrlr was enabled\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (1u << ctrlr->vcprop.cc.bits.iosqes != sizeof(struct spdk_nvme_cmd)) { + SPDK_ERRLOG("Got I/O connect with invalid IOSQES %u\n", + ctrlr->vcprop.cc.bits.iosqes); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (1u << ctrlr->vcprop.cc.bits.iocqes != sizeof(struct spdk_nvme_cpl)) { + SPDK_ERRLOG("Got I/O connect with invalid IOCQES %u\n", + ctrlr->vcprop.cc.bits.iocqes); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + ctrlr_add_qpair_and_update_rsp(qpair, ctrlr, rsp); +end: + spdk_nvmf_request_complete(req); +} + +static void +_nvmf_ctrlr_add_io_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_fabric_connect_data *data = req->data; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_qpair *admin_qpair; + struct spdk_nvmf_tgt *tgt = qpair->transport->tgt; + struct spdk_nvmf_subsystem *subsystem; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect I/O Queue for controller id 0x%x\n", data->cntlid); + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, data->subnqn); + /* We already checked this in spdk_nvmf_ctrlr_connect */ + assert(subsystem != NULL); + + ctrlr = nvmf_subsystem_get_ctrlr(subsystem, data->cntlid); + if (ctrlr == NULL) { + SPDK_ERRLOG("Unknown controller ID 0x%x\n", data->cntlid); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, cntlid); + spdk_nvmf_request_complete(req); + return; + } + + admin_qpair = ctrlr->admin_qpair; + qpair->ctrlr = ctrlr; + spdk_thread_send_msg(admin_qpair->group->thread, nvmf_ctrlr_add_io_qpair, req); +} + +static bool +nvmf_qpair_access_allowed(struct spdk_nvmf_qpair *qpair, struct spdk_nvmf_subsystem *subsystem, + const char *hostnqn) +{ + struct spdk_nvme_transport_id listen_trid = {}; + + if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) { + SPDK_ERRLOG("Subsystem '%s' does not allow host '%s'\n", subsystem->subnqn, hostnqn); + return false; + } + + if (spdk_nvmf_qpair_get_listen_trid(qpair, &listen_trid)) { + SPDK_ERRLOG("Subsystem '%s' is unable to enforce access control due to an internal error.\n", + subsystem->subnqn); + return false; + } + + if (!spdk_nvmf_subsystem_listener_allowed(subsystem, &listen_trid)) { + SPDK_ERRLOG("Subsystem '%s' does not allow host '%s' to connect at this address.\n", + subsystem->subnqn, hostnqn); + return false; + } + + return true; +} + +static int +_nvmf_ctrlr_connect(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fabric_connect_data *data = req->data; + struct spdk_nvmf_fabric_connect_cmd *cmd = &req->cmd->connect_cmd; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_transport *transport = qpair->transport; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_subsystem *subsystem; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "recfmt 0x%x qid %u sqsize %u\n", + cmd->recfmt, cmd->qid, cmd->sqsize); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect data:\n"); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " cntlid: 0x%04x\n", data->cntlid); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " hostid: %08x-%04x-%04x-%02x%02x-%04x%08x ***\n", + 
ntohl(*(uint32_t *)&data->hostid[0]), + ntohs(*(uint16_t *)&data->hostid[4]), + ntohs(*(uint16_t *)&data->hostid[6]), + data->hostid[8], + data->hostid[9], + ntohs(*(uint16_t *)&data->hostid[10]), + ntohl(*(uint32_t *)&data->hostid[12])); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " subnqn: \"%s\"\n", data->subnqn); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " hostnqn: \"%s\"\n", data->hostnqn); + + subsystem = spdk_nvmf_tgt_find_subsystem(transport->tgt, data->subnqn); + if (!subsystem) { + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (cmd->recfmt != 0) { + SPDK_ERRLOG("Connect command unsupported RECFMT %u\n", cmd->recfmt); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* + * SQSIZE is a 0-based value, so it must be at least 1 (minimum queue depth is 2) and + * strictly less than max_aq_depth (admin queues) or max_queue_depth (io queues). + */ + if (cmd->sqsize == 0) { + SPDK_ERRLOG("Invalid SQSIZE = 0\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (cmd->qid == 0) { + if (cmd->sqsize >= transport->opts.max_aq_depth) { + SPDK_ERRLOG("Invalid SQSIZE for admin queue %u (min 1, max %u)\n", + cmd->sqsize, transport->opts.max_aq_depth - 1); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + } else if (cmd->sqsize >= transport->opts.max_queue_depth) { + SPDK_ERRLOG("Invalid SQSIZE %u (min 1, max %u)\n", + cmd->sqsize, transport->opts.max_queue_depth - 1); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + qpair->sq_head_max = cmd->sqsize; + qpair->qid = cmd->qid; + + if (0 == qpair->qid) { + qpair->group->stat.admin_qpairs++; + } else { + qpair->group->stat.io_qpairs++; + } + + if (cmd->qid == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect Admin Queue for controller ID 0x%x\n", data->cntlid); + + if (data->cntlid != 0xFFFF) { + /* This NVMf target only supports dynamic mode. 
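(Editor's note on the connect validation above: SQSIZE is a 0's based value, so with a hypothetical max_queue_depth of 128 the largest accepted I/O SQSIZE is 127, which describes a 128-entry queue, and SQSIZE = 0 is always rejected; the admin queue is bounded by max_aq_depth in the same way. Likewise, because only the dynamic controller model is supported, an admin-queue connect must carry CNTLID = 0xFFFF and the target allocates the controller ID itself, returning it in the connect response so that later I/O-queue connects can reference it.)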
*/ + SPDK_ERRLOG("The NVMf target only supports dynamic mode (CNTLID = 0x%x).\n", data->cntlid); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, cntlid); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* Establish a new ctrlr */ + ctrlr = nvmf_ctrlr_create(subsystem, req, cmd, data); + if (!ctrlr) { + SPDK_ERRLOG("nvmf_ctrlr_create() failed\n"); + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } else { + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + } else { + spdk_thread_send_msg(subsystem->thread, _nvmf_ctrlr_add_io_qpair, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } +} + +static inline bool +nvmf_request_is_fabric_connect(struct spdk_nvmf_request *req) +{ + return req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC && + req->cmd->nvmf_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_CONNECT; +} + +static struct spdk_nvmf_subsystem_poll_group * +nvmf_subsystem_pg_from_connect_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fabric_connect_data *data; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + assert(nvmf_request_is_fabric_connect(req)); + assert(req->qpair->ctrlr == NULL); + + data = req->data; + tgt = req->qpair->transport->tgt; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, data->subnqn); + if (subsystem == NULL) { + return NULL; + } + + return &req->qpair->group->sgroups[subsystem->id]; +} + +int +spdk_nvmf_ctrlr_connect(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup; + enum spdk_nvmf_request_exec_status status; + + sgroup = nvmf_subsystem_pg_from_connect_cmd(req); + if (!sgroup) { + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn); + status = SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + goto out; + } + + sgroup->io_outstanding++; + TAILQ_INSERT_TAIL(&qpair->outstanding, req, link); + + status = _nvmf_ctrlr_connect(req); + +out: + if (status == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { + _nvmf_request_complete(req); + } + + return status; +} + +static int +nvmf_ctrlr_cmd_connect(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fabric_connect_data *data = req->data; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_transport *transport = req->qpair->transport; + struct spdk_nvmf_subsystem *subsystem; + + if (req->length < sizeof(struct spdk_nvmf_fabric_connect_data)) { + SPDK_ERRLOG("Connect command data length 0x%x too small\n", req->length); + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(transport->tgt, data->subnqn); + if (!subsystem) { + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if ((subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE) || + (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSING) || + (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED) || + (subsystem->state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING)) { + SPDK_ERRLOG("Subsystem '%s' is not ready\n", subsystem->subnqn); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_CONTROLLER_BUSY; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* Ensure that hostnqn is null terminated */ + if (!memchr(data->hostnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1)) { + SPDK_ERRLOG("Connect HOSTNQN is not null terminated\n"); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, 
hostnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (!nvmf_qpair_access_allowed(req->qpair, subsystem, data->hostnqn)) { + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_HOST; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return _nvmf_ctrlr_connect(req); +} + +static void +nvmf_ctrlr_cc_reset_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_ctrlr *ctrlr = spdk_io_channel_iter_get_ctx(i); + + if (status < 0) { + SPDK_ERRLOG("Fail to disconnect io ctrlr qpairs\n"); + assert(false); + } + + /* Only a subset of the registers are cleared out on a reset */ + ctrlr->vcprop.cc.raw = 0; + ctrlr->vcprop.csts.raw = 0; + +} + +const struct spdk_nvmf_registers * +spdk_nvmf_ctrlr_get_regs(struct spdk_nvmf_ctrlr *ctrlr) +{ + return &ctrlr->vcprop; +} + +static uint64_t +nvmf_prop_get_cap(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.cap.raw; +} + +static uint64_t +nvmf_prop_get_vs(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.vs.raw; +} + +static uint64_t +nvmf_prop_get_cc(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.cc.raw; +} + +static bool +nvmf_prop_set_cc(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + union spdk_nvme_cc_register cc, diff; + + cc.raw = value; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cur CC: 0x%08x\n", ctrlr->vcprop.cc.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "new CC: 0x%08x\n", cc.raw); + + /* + * Calculate which bits changed between the current and new CC. + * Mark each bit as 0 once it is handled to determine if any unhandled bits were changed. + */ + diff.raw = cc.raw ^ ctrlr->vcprop.cc.raw; + + if (diff.bits.en) { + if (cc.bits.en) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Enable!\n"); + ctrlr->vcprop.cc.bits.en = 1; + ctrlr->vcprop.csts.bits.rdy = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Disable!\n"); + ctrlr->vcprop.cc.bits.en = 0; + spdk_for_each_channel(ctrlr->subsys->tgt, + nvmf_ctrlr_disconnect_io_qpairs_on_pg, + ctrlr, + nvmf_ctrlr_cc_reset_done); + } + diff.bits.en = 0; + } + + if (diff.bits.shn) { + if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || + cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Shutdown %u%ub!\n", + cc.bits.shn >> 1, cc.bits.shn & 1); + ctrlr->vcprop.cc.bits.shn = cc.bits.shn; + ctrlr->vcprop.cc.bits.en = 0; + ctrlr->vcprop.csts.bits.rdy = 0; + ctrlr->vcprop.csts.bits.shst = SPDK_NVME_SHST_COMPLETE; + } else if (cc.bits.shn == 0) { + ctrlr->vcprop.cc.bits.shn = 0; + } else { + SPDK_ERRLOG("Prop Set CC: Invalid SHN value %u%ub\n", + cc.bits.shn >> 1, cc.bits.shn & 1); + return false; + } + diff.bits.shn = 0; + } + + if (diff.bits.iosqes) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Prop Set IOSQES = %u (%u bytes)\n", + cc.bits.iosqes, 1u << cc.bits.iosqes); + ctrlr->vcprop.cc.bits.iosqes = cc.bits.iosqes; + diff.bits.iosqes = 0; + } + + if (diff.bits.iocqes) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Prop Set IOCQES = %u (%u bytes)\n", + cc.bits.iocqes, 1u << cc.bits.iocqes); + ctrlr->vcprop.cc.bits.iocqes = cc.bits.iocqes; + diff.bits.iocqes = 0; + } + + if (diff.bits.ams) { + SPDK_ERRLOG("Arbitration Mechanism Selected (AMS) 0x%x not supported!\n", cc.bits.ams); + return false; + } + + if (diff.bits.mps) { + SPDK_ERRLOG("Memory Page Size (MPS) %u KiB not supported!\n", (1 << (2 + cc.bits.mps))); + return false; + } + + if (diff.bits.css) { + SPDK_ERRLOG("I/O Command Set Selected (CSS) 0x%x not supported!\n", cc.bits.css); + return false; + } + + if (diff.raw != 0) { + 
SPDK_ERRLOG("Prop Set CC toggled reserved bits 0x%x!\n", diff.raw); + return false; + } + + return true; +} + +static uint64_t +nvmf_prop_get_csts(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.csts.raw; +} + +static uint64_t +nvmf_prop_get_aqa(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.aqa.raw; +} + +static bool +nvmf_prop_set_aqa(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + union spdk_nvme_aqa_register aqa; + + aqa.raw = value; + + if (aqa.bits.asqs > ctrlr->vcprop.cap.bits.mqes || + aqa.bits.acqs > ctrlr->vcprop.cap.bits.mqes) { + return false; + } + + ctrlr->vcprop.aqa.raw = value; + + return true; +} + +static uint64_t +nvmf_prop_get_asq(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.asq; +} + +static bool +nvmf_prop_set_asq_lower(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + ctrlr->vcprop.asq = (ctrlr->vcprop.asq & (0xFFFFFFFFULL << 32ULL)) | value; + + return true; +} + +static bool +nvmf_prop_set_asq_upper(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + ctrlr->vcprop.asq = (ctrlr->vcprop.asq & 0xFFFFFFFFULL) | ((uint64_t)value << 32ULL); + + return true; +} + +static uint64_t +nvmf_prop_get_acq(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.acq; +} + +static bool +nvmf_prop_set_acq_lower(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + ctrlr->vcprop.acq = (ctrlr->vcprop.acq & (0xFFFFFFFFULL << 32ULL)) | value; + + return true; +} + +static bool +nvmf_prop_set_acq_upper(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value) +{ + ctrlr->vcprop.acq = (ctrlr->vcprop.acq & 0xFFFFFFFFULL) | ((uint64_t)value << 32ULL); + + return true; +} + +struct nvmf_prop { + uint32_t ofst; + uint8_t size; + char name[11]; + uint64_t (*get_cb)(struct spdk_nvmf_ctrlr *ctrlr); + bool (*set_cb)(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value); + bool (*set_upper_cb)(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value); +}; + +#define PROP(field, size, get_cb, set_cb, set_upper_cb) \ + { \ + offsetof(struct spdk_nvme_registers, field), \ + size, \ + #field, \ + get_cb, set_cb, set_upper_cb \ + } + +static const struct nvmf_prop nvmf_props[] = { + PROP(cap, 8, nvmf_prop_get_cap, NULL, NULL), + PROP(vs, 4, nvmf_prop_get_vs, NULL, NULL), + PROP(cc, 4, nvmf_prop_get_cc, nvmf_prop_set_cc, NULL), + PROP(csts, 4, nvmf_prop_get_csts, NULL, NULL), + PROP(aqa, 4, nvmf_prop_get_aqa, nvmf_prop_set_aqa, NULL), + PROP(asq, 8, nvmf_prop_get_asq, nvmf_prop_set_asq_lower, nvmf_prop_set_asq_upper), + PROP(acq, 8, nvmf_prop_get_acq, nvmf_prop_set_acq_lower, nvmf_prop_set_acq_upper), +}; + +static const struct nvmf_prop * +find_prop(uint32_t ofst, uint8_t size) +{ + size_t i; + + for (i = 0; i < SPDK_COUNTOF(nvmf_props); i++) { + const struct nvmf_prop *prop = &nvmf_props[i]; + + if ((ofst >= prop->ofst) && (ofst + size <= prop->ofst + prop->size)) { + return prop; + } + } + + return NULL; +} + +static int +nvmf_property_get(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_fabric_prop_get_cmd *cmd = &req->cmd->prop_get_cmd; + struct spdk_nvmf_fabric_prop_get_rsp *response = &req->rsp->prop_get_rsp; + const struct nvmf_prop *prop; + uint8_t size; + + response->status.sc = 0; + response->value.u64 = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "size %d, offset 0x%x\n", + cmd->attrib.size, cmd->ofst); + + switch (cmd->attrib.size) { + case SPDK_NVMF_PROP_SIZE_4: + size = 4; + break; + case SPDK_NVMF_PROP_SIZE_8: + size = 8; + break; + default: + SPDK_ERRLOG("Invalid size value %d\n", cmd->attrib.size); + response->status.sct = 
SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + prop = find_prop(cmd->ofst, size); + if (prop == NULL || prop->get_cb == NULL) { + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "name: %s\n", prop->name); + + response->value.u64 = prop->get_cb(ctrlr); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "response value: 0x%" PRIx64 "\n", response->value.u64); + + if (size != prop->size) { + /* The size must be 4 and the prop->size is 8. Figure out which part of the property to read. */ + assert(size == 4); + assert(prop->size == 8); + + if (cmd->ofst == prop->ofst) { + /* Keep bottom 4 bytes only */ + response->value.u64 &= 0xFFFFFFFF; + } else { + /* Keep top 4 bytes only */ + response->value.u64 >>= 32; + } + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_property_set(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_fabric_prop_set_cmd *cmd = &req->cmd->prop_set_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + const struct nvmf_prop *prop; + uint64_t value; + uint8_t size; + bool ret; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "size %d, offset 0x%x, value 0x%" PRIx64 "\n", + cmd->attrib.size, cmd->ofst, cmd->value.u64); + + switch (cmd->attrib.size) { + case SPDK_NVMF_PROP_SIZE_4: + size = 4; + break; + case SPDK_NVMF_PROP_SIZE_8: + size = 8; + break; + default: + SPDK_ERRLOG("Invalid size value %d\n", cmd->attrib.size); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + prop = find_prop(cmd->ofst, size); + if (prop == NULL || prop->set_cb == NULL) { + SPDK_ERRLOG("Invalid offset 0x%x\n", cmd->ofst); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "name: %s\n", prop->name); + + value = cmd->value.u64; + + if (prop->size == 4) { + ret = prop->set_cb(ctrlr, (uint32_t)value); + } else if (size != prop->size) { + /* The size must be 4 and the prop->size is 8. Figure out which part of the property to write. 
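Editor's note: the 4-byte access path in nvmf_property_get() and nvmf_property_set() splits an 8-byte register on a dword boundary, selecting the half by comparing the requested offset with the property's base offset. A minimal sketch with a made-up ASQ value (ASQ occupies offsets 0x28-0x2F in the NVMe controller register map):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t asq = 0x11223344aabbccddULL;	/* hypothetical register content */

	uint32_t lower = (uint32_t)(asq & 0xFFFFFFFFu);	/* ofst == prop->ofst (0x28) */
	uint32_t upper = (uint32_t)(asq >> 32);		/* ofst == prop->ofst + 4 (0x2C) */

	printf("lower dword 0x%08x, upper dword 0x%08x\n", lower, upper);
	/* prints: lower dword 0xaabbccdd, upper dword 0x11223344 */
	return 0;
}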
*/ + assert(size == 4); + assert(prop->size == 8); + + if (cmd->ofst == prop->ofst) { + ret = prop->set_cb(ctrlr, (uint32_t)value); + } else { + ret = prop->set_upper_cb(ctrlr, (uint32_t)value); + } + } else { + ret = prop->set_cb(ctrlr, (uint32_t)value); + if (ret) { + ret = prop->set_upper_cb(ctrlr, (uint32_t)(value >> 32)); + } + } + + if (!ret) { + SPDK_ERRLOG("prop set_cb failed\n"); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_arbitration(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Arbitration (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.arbitration.raw = cmd->cdw11; + ctrlr->feat.arbitration.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_power_management(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Power Management (cdw11 = 0x%0x)\n", cmd->cdw11); + + /* Only PS = 0 is allowed, since we report NPSS = 0 */ + if (cmd->cdw11_bits.feat_power_management.bits.ps != 0) { + SPDK_ERRLOG("Invalid power state %u\n", cmd->cdw11_bits.feat_power_management.bits.ps); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ctrlr->feat.power_management.raw = cmd->cdw11; + ctrlr->feat.power_management.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static bool +temp_threshold_opts_valid(const union spdk_nvme_feat_temperature_threshold *opts) +{ + /* + * Valid TMPSEL values: + * 0000b - 1000b: temperature sensors + * 1111b: set all implemented temperature sensors + */ + if (opts->bits.tmpsel >= 9 && opts->bits.tmpsel != 15) { + /* 1001b - 1110b: reserved */ + SPDK_ERRLOG("Invalid TMPSEL %u\n", opts->bits.tmpsel); + return false; + } + + /* + * Valid THSEL values: + * 00b: over temperature threshold + * 01b: under temperature threshold + */ + if (opts->bits.thsel > 1) { + /* 10b - 11b: reserved */ + SPDK_ERRLOG("Invalid THSEL %u\n", opts->bits.thsel); + return false; + } + + return true; +} + +static int +nvmf_ctrlr_set_features_temperature_threshold(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Temperature Threshold (cdw11 = 0x%0x)\n", cmd->cdw11); + + if (!temp_threshold_opts_valid(&cmd->cdw11_bits.feat_temp_threshold)) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* TODO: no sensors implemented - ignore new values */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_get_features_temperature_threshold(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Temperature Threshold (cdw11 = 0x%0x)\n", cmd->cdw11); + + if 
(!temp_threshold_opts_valid(&cmd->cdw11_bits.feat_temp_threshold)) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* TODO: no sensors implemented - return 0 for all thresholds */ + rsp->cdw0 = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_error_recovery(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Error Recovery (cdw11 = 0x%0x)\n", cmd->cdw11); + + if (cmd->cdw11_bits.feat_error_recovery.bits.dulbe) { + /* + * Host is not allowed to set this bit, since we don't advertise it in + * Identify Namespace. + */ + SPDK_ERRLOG("Host set unsupported DULBE bit\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ctrlr->feat.error_recovery.raw = cmd->cdw11; + ctrlr->feat.error_recovery.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_volatile_write_cache(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Volatile Write Cache (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.volatile_write_cache.raw = cmd->cdw11; + ctrlr->feat.volatile_write_cache.bits.reserved = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Volatile Write Cache %s\n", + ctrlr->feat.volatile_write_cache.bits.wce ? "Enabled" : "Disabled"); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_write_atomicity(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Write Atomicity (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.write_atomicity.raw = cmd->cdw11; + ctrlr->feat.write_atomicity.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_host_identifier(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + SPDK_ERRLOG("Set Features - Host Identifier not allowed\n"); + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_get_features_host_identifier(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Host Identifier\n"); + + if (!cmd->cdw11_bits.feat_host_identifier.bits.exhid) { + /* NVMe over Fabrics requires EXHID=1 (128-bit/16-byte host ID) */ + SPDK_ERRLOG("Get Features - Host Identifier with EXHID=0 not allowed\n"); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (req->data == NULL || req->length < sizeof(ctrlr->hostid)) { + SPDK_ERRLOG("Invalid data buffer for Get Features - Host Identifier\n"); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + spdk_uuid_copy((struct spdk_uuid *)req->data, &ctrlr->hostid); + return 
SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_get_features_reservation_notification_mask(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_ns *ns; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "get Features - Reservation Notificaton Mask\n"); + + if (cmd->nsid == 0xffffffffu) { + SPDK_ERRLOG("get Features - Invalid Namespace ID\n"); + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); + if (ns == NULL) { + SPDK_ERRLOG("Set Features - Invalid Namespace ID\n"); + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + rsp->cdw0 = ns->mask; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_reservation_notification_mask(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_ns *ns; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Reservation Notificaton Mask\n"); + + if (cmd->nsid == 0xffffffffu) { + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + ns->mask = cmd->cdw11; + } + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); + if (ns == NULL) { + SPDK_ERRLOG("Set Features - Invalid Namespace ID\n"); + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + ns->mask = cmd->cdw11; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_get_features_reservation_persistence(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + struct spdk_nvmf_ns *ns; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Reservation Persistence\n"); + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); + /* NSID with 0xffffffffu also included */ + if (ns == NULL) { + SPDK_ERRLOG("Get Features - Invalid Namespace ID\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + response->cdw0 = ns->ptpl_activated; + + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_reservation_persistence(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + struct spdk_nvmf_ns *ns; + bool ptpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Reservation Persistence\n"); + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); + ptpl = cmd->cdw11_bits.feat_rsv_persistence.bits.ptpl; + + if (cmd->nsid != 0xffffffffu && ns && ns->ptpl_file) { + ns->ptpl_activated = ptpl; + } else if (cmd->nsid == 0xffffffffu) { + for (ns = spdk_nvmf_subsystem_get_first_ns(ctrlr->subsys); ns && ns->ptpl_file; + ns = spdk_nvmf_subsystem_get_next_ns(ctrlr->subsys, ns)) { + 
ns->ptpl_activated = ptpl; + } + } else { + SPDK_ERRLOG("Set Features - Invalid Namespace ID or Reservation Configuration\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* TODO: Feature not changeable for now */ + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_keep_alive_timer(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Keep Alive Timer (%u ms)\n", cmd->cdw11); + + /* + * if attempts to disable keep alive by setting kato to 0h + * a status value of keep alive invalid shall be returned + */ + if (cmd->cdw11_bits.feat_keep_alive_timer.bits.kato == 0) { + rsp->status.sc = SPDK_NVME_SC_KEEP_ALIVE_INVALID; + } else if (cmd->cdw11_bits.feat_keep_alive_timer.bits.kato < MIN_KEEP_ALIVE_TIMEOUT_IN_MS) { + ctrlr->feat.keep_alive_timer.bits.kato = MIN_KEEP_ALIVE_TIMEOUT_IN_MS; + } else { + /* round up to milliseconds */ + ctrlr->feat.keep_alive_timer.bits.kato = spdk_divide_round_up( + cmd->cdw11_bits.feat_keep_alive_timer.bits.kato, + KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS) * + KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS; + } + + /* + * if change the keep alive timeout value successfully + * update the keep alive poller. + */ + if (cmd->cdw11_bits.feat_keep_alive_timer.bits.kato != 0) { + if (ctrlr->keep_alive_poller != NULL) { + spdk_poller_unregister(&ctrlr->keep_alive_poller); + } + ctrlr->keep_alive_poller = SPDK_POLLER_REGISTER(nvmf_ctrlr_keep_alive_poll, ctrlr, + ctrlr->feat.keep_alive_timer.bits.kato * 1000); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Keep Alive Timer set to %u ms\n", + ctrlr->feat.keep_alive_timer.bits.kato); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_number_of_queues(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint32_t count; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Number of Queues, cdw11 0x%x\n", + req->cmd->nvme_cmd.cdw11); + + count = spdk_bit_array_count_set(ctrlr->qpair_mask); + /* verify that the controller is ready to process commands */ + if (count > 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Queue pairs already active!\n"); + rsp->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + } else { + /* + * Ignore the value requested by the host - + * always return the pre-configured value based on max_qpairs_allowed. 
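Editor's note: a small sketch of the value this handler returns, using a hypothetical max_qpairs_per_ctrlr of 128 and the dword layout of the NVMe Number of Queues feature (NCQA in bits 31:16, NSQA in bits 15:0, both 0's based).

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t max_qpairs_per_ctrlr = 128;	/* hypothetical transport option */

	/* As in nvmf_ctrlr_create(): one queue pair is reserved for the admin
	 * queue and the counts are 0's based, hence the "- 1 - 1". */
	uint16_t nsqr = max_qpairs_per_ctrlr - 1 - 1;	/* 126 -> 127 usable I/O SQs */
	uint16_t ncqr = max_qpairs_per_ctrlr - 1 - 1;	/* 126 -> 127 usable I/O CQs */

	uint32_t cdw0 = ((uint32_t)ncqr << 16) | nsqr;

	printf("Number of Queues cdw0 = 0x%08x\n", cdw0);	/* 0x007e007e */
	return 0;
}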
+ */ + rsp->cdw0 = ctrlr->feat.number_of_queues.raw; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_set_features_async_event_configuration(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Async Event Configuration, cdw11 0x%08x\n", + cmd->cdw11); + ctrlr->feat.async_event_configuration.raw = cmd->cdw11; + ctrlr->feat.async_event_configuration.bits.reserved = 0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_async_event_request(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_subsystem_poll_group *sgroup; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Async Event Request\n"); + + /* Four asynchronous events are supported for now */ + if (ctrlr->nr_aer_reqs >= NVMF_MAX_ASYNC_EVENTS) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "AERL exceeded\n"); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (ctrlr->notice_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) { + rsp->cdw0 = ctrlr->notice_event.raw; + ctrlr->notice_event.raw = 0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (ctrlr->reservation_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_IO) { + rsp->cdw0 = ctrlr->reservation_event.raw; + ctrlr->reservation_event.raw = 0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* AER cmd is an exception */ + sgroup = &req->qpair->group->sgroups[ctrlr->subsys->id]; + assert(sgroup != NULL); + sgroup->io_outstanding--; + + ctrlr->aer_req[ctrlr->nr_aer_reqs++] = req; + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static void +nvmf_get_firmware_slot_log_page(void *buffer, uint64_t offset, uint32_t length) +{ + struct spdk_nvme_firmware_page fw_page; + size_t copy_len; + + memset(&fw_page, 0, sizeof(fw_page)); + fw_page.afi.active_slot = 1; + fw_page.afi.next_reset_slot = 0; + spdk_strcpy_pad(fw_page.revision[0], FW_VERSION, sizeof(fw_page.revision[0]), ' '); + + if (offset < sizeof(fw_page)) { + copy_len = spdk_min(sizeof(fw_page) - offset, length); + if (copy_len > 0) { + memcpy(buffer, (const char *)&fw_page + offset, copy_len); + } + } +} + +void +nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid) +{ + uint16_t max_changes = SPDK_COUNTOF(ctrlr->changed_ns_list.ns_list); + uint16_t i; + bool found = false; + + for (i = 0; i < ctrlr->changed_ns_list_count; i++) { + if (ctrlr->changed_ns_list.ns_list[i] == nsid) { + /* nsid is already in the list */ + found = true; + break; + } + } + + if (!found) { + if (ctrlr->changed_ns_list_count == max_changes) { + /* Out of space - set first entry to FFFFFFFFh and zero-fill the rest. 
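(Editor's note: the Changed Namespace List log page defined by NVMe holds up to 1024 NSIDs, which is what SPDK_COUNTOF(ctrlr->changed_ns_list.ns_list) evaluates to here; writing FFFFFFFFh into the first entry signals the host that more namespaces changed than the list can describe, so it should re-enumerate its namespaces rather than rely on individual entries. The list is cleared each time the log page is read, as implemented in nvmf_get_changed_ns_list_log_page() below.)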
*/ + ctrlr->changed_ns_list.ns_list[0] = 0xFFFFFFFFu; + for (i = 1; i < max_changes; i++) { + ctrlr->changed_ns_list.ns_list[i] = 0; + } + } else { + ctrlr->changed_ns_list.ns_list[ctrlr->changed_ns_list_count++] = nsid; + } + } +} + +static void +nvmf_get_changed_ns_list_log_page(struct spdk_nvmf_ctrlr *ctrlr, + void *buffer, uint64_t offset, uint32_t length) +{ + size_t copy_length; + + if (offset < sizeof(ctrlr->changed_ns_list)) { + copy_length = spdk_min(length, sizeof(ctrlr->changed_ns_list) - offset); + if (copy_length) { + memcpy(buffer, (char *)&ctrlr->changed_ns_list + offset, copy_length); + } + } + + /* Clear log page each time it is read */ + ctrlr->changed_ns_list_count = 0; + memset(&ctrlr->changed_ns_list, 0, sizeof(ctrlr->changed_ns_list)); +} + +/* The structure can be modified if we provide support for other commands in future */ +static const struct spdk_nvme_cmds_and_effect_log_page g_cmds_and_effect_log_page = { + .admin_cmds_supported = { + /* CSUPP, LBCC, NCC, NIC, CCC, CSE */ + /* Get Log Page */ + [SPDK_NVME_OPC_GET_LOG_PAGE] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Identify */ + [SPDK_NVME_OPC_IDENTIFY] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Abort */ + [SPDK_NVME_OPC_ABORT] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Set Features */ + [SPDK_NVME_OPC_SET_FEATURES] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Get Features */ + [SPDK_NVME_OPC_GET_FEATURES] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Async Event Request */ + [SPDK_NVME_OPC_ASYNC_EVENT_REQUEST] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Keep Alive */ + [SPDK_NVME_OPC_KEEP_ALIVE] = {1, 0, 0, 0, 0, 0, 0, 0}, + }, + .io_cmds_supported = { + /* FLUSH */ + [SPDK_NVME_OPC_FLUSH] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* WRITE */ + [SPDK_NVME_OPC_WRITE] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* READ */ + [SPDK_NVME_OPC_READ] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* WRITE ZEROES */ + [SPDK_NVME_OPC_WRITE_ZEROES] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* DATASET MANAGEMENT */ + [SPDK_NVME_OPC_DATASET_MANAGEMENT] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* COMPARE */ + [SPDK_NVME_OPC_COMPARE] = {1, 0, 0, 0, 0, 0, 0, 0}, + }, +}; + +static void +nvmf_get_cmds_and_effects_log_page(void *buffer, + uint64_t offset, uint32_t length) +{ + uint32_t page_size = sizeof(struct spdk_nvme_cmds_and_effect_log_page); + size_t copy_len = 0; + size_t zero_len = length; + + if (offset < page_size) { + copy_len = spdk_min(page_size - offset, length); + zero_len -= copy_len; + memcpy(buffer, (char *)(&g_cmds_and_effect_log_page) + offset, copy_len); + } + + if (zero_len) { + memset((char *)buffer + copy_len, 0, zero_len); + } +} + +static void +nvmf_get_reservation_notification_log_page(struct spdk_nvmf_ctrlr *ctrlr, + void *data, uint64_t offset, uint32_t length) +{ + uint32_t unit_log_len, avail_log_len, next_pos, copy_len; + struct spdk_nvmf_reservation_log *log, *log_tmp; + uint8_t *buf = data; + + unit_log_len = sizeof(struct spdk_nvme_reservation_notification_log); + /* No available log, return 1 zeroed log page */ + if (!ctrlr->num_avail_log_pages) { + memset(buf, 0, spdk_min(length, unit_log_len)); + return; + } + + avail_log_len = ctrlr->num_avail_log_pages * unit_log_len; + if (offset >= avail_log_len) { + return; + } + + next_pos = copy_len = 0; + TAILQ_FOREACH_SAFE(log, &ctrlr->log_head, link, log_tmp) { + TAILQ_REMOVE(&ctrlr->log_head, log, link); + ctrlr->num_avail_log_pages--; + + next_pos += unit_log_len; + if (next_pos > offset) { + copy_len = spdk_min(next_pos - offset, length); + memcpy(buf, &log->log, copy_len); + length -= copy_len; + offset += copy_len; + buf += copy_len; + } + free(log); + + if (length == 0) 
{ + break; + } + } + return; +} + +static int +nvmf_ctrlr_get_log_page(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + uint64_t offset, len; + uint32_t numdl, numdu; + uint8_t lid; + + if (req->data == NULL) { + SPDK_ERRLOG("get log command with no buffer\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + offset = (uint64_t)cmd->cdw12 | ((uint64_t)cmd->cdw13 << 32); + if (offset & 3) { + SPDK_ERRLOG("Invalid log page offset 0x%" PRIx64 "\n", offset); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + numdl = cmd->cdw10_bits.get_log_page.numdl; + numdu = cmd->cdw11_bits.get_log_page.numdu; + len = ((numdu << 16) + numdl + (uint64_t)1) * 4; + if (len > req->length) { + SPDK_ERRLOG("Get log page: len (%" PRIu64 ") > buf size (%u)\n", + len, req->length); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + lid = cmd->cdw10_bits.get_log_page.lid; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get log page: LID=0x%02X offset=0x%" PRIx64 " len=0x%" PRIx64 "\n", + lid, offset, len); + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + switch (lid) { + case SPDK_NVME_LOG_DISCOVERY: + nvmf_get_discovery_log_page(subsystem->tgt, ctrlr->hostnqn, req->iov, req->iovcnt, offset, + len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + goto invalid_log_page; + } + } else { + switch (lid) { + case SPDK_NVME_LOG_ERROR: + case SPDK_NVME_LOG_HEALTH_INFORMATION: + /* TODO: actually fill out log page data */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_FIRMWARE_SLOT: + nvmf_get_firmware_slot_log_page(req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_COMMAND_EFFECTS_LOG: + nvmf_get_cmds_and_effects_log_page(req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_CHANGED_NS_LIST: + nvmf_get_changed_ns_list_log_page(ctrlr, req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_RESERVATION_NOTIFICATION: + nvmf_get_reservation_notification_log_page(ctrlr, req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + goto invalid_log_page; + } + } + +invalid_log_page: + SPDK_ERRLOG("Unsupported Get Log Page 0x%02X\n", lid); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +spdk_nvmf_ctrlr_identify_ns(struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + struct spdk_nvme_ns_data *nsdata) +{ + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvmf_ns *ns; + uint32_t max_num_blocks; + + if (cmd->nsid == 0 || cmd->nsid > subsystem->max_nsid) { + SPDK_ERRLOG("Identify Namespace for invalid NSID %u\n", cmd->nsid); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _nvmf_subsystem_get_ns(subsystem, cmd->nsid); + if (ns == NULL || ns->bdev == NULL) { + 
/* + * Inactive namespaces should return a zero filled data structure. + * The data buffer is already zeroed by nvmf_ctrlr_process_admin_cmd(), + * so we can just return early here. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Identify Namespace for inactive NSID %u\n", cmd->nsid); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + nvmf_bdev_ctrlr_identify_ns(ns, nsdata, ctrlr->dif_insert_or_strip); + + /* Due to bug in the Linux kernel NVMe driver we have to set noiob no larger than mdts */ + max_num_blocks = ctrlr->admin_qpair->transport->opts.max_io_size / + (1U << nsdata->lbaf[nsdata->flbas.format].lbads); + if (nsdata->noiob > max_num_blocks) { + nsdata->noiob = max_num_blocks; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static void +nvmf_ctrlr_populate_oacs(struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvme_ctrlr_data *cdata) +{ + cdata->oacs.virtualization_management = + g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_VIRTUALIZATION_MANAGEMENT].hdlr != NULL; + cdata->oacs.nvme_mi = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NVME_MI_SEND].hdlr != NULL + && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NVME_MI_RECEIVE].hdlr != NULL; + cdata->oacs.directives = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_DIRECTIVE_SEND].hdlr != NULL + && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_DIRECTIVE_RECEIVE].hdlr != NULL; + cdata->oacs.device_self_test = + g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_DEVICE_SELF_TEST].hdlr != NULL; + cdata->oacs.ns_manage = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NS_MANAGEMENT].hdlr != NULL + && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NS_ATTACHMENT].hdlr != NULL; + cdata->oacs.firmware = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD].hdlr != + NULL + && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_FIRMWARE_COMMIT].hdlr != NULL; + cdata->oacs.format = + g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_FORMAT_NVM].hdlr != NULL; + cdata->oacs.security = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_SECURITY_SEND].hdlr != NULL + && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_SECURITY_RECEIVE].hdlr != NULL; + cdata->oacs.get_lba_status = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_GET_LBA_STATUS].hdlr != + NULL; +} + +int +spdk_nvmf_ctrlr_identify_ctrlr(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_ctrlr_data *cdata) +{ + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvmf_transport *transport = ctrlr->admin_qpair->transport; + + /* + * Common fields for discovery and NVM subsystems + */ + spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' '); + assert((transport->opts.max_io_size % 4096) == 0); + cdata->mdts = spdk_u32log2(transport->opts.max_io_size / 4096); + cdata->cntlid = ctrlr->cntlid; + cdata->ver = ctrlr->vcprop.vs; + cdata->aerl = NVMF_MAX_ASYNC_EVENTS - 1; + cdata->lpa.edlp = 1; + cdata->elpe = 127; + cdata->maxcmd = transport->opts.max_queue_depth; + cdata->sgls = ctrlr->cdata.sgls; + cdata->fuses.compare_and_write = 1; + cdata->acwu = 1; + spdk_strcpy_pad(cdata->subnqn, subsystem->subnqn, sizeof(cdata->subnqn), '\0'); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ctrlr data: maxcmd 0x%x\n", cdata->maxcmd); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "sgls data: 0x%x\n", from_le32(&cdata->sgls)); + + /* + * NVM subsystem fields (reserved for discovery subsystems) + */ + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_NVME) { + spdk_strcpy_pad(cdata->mn, spdk_nvmf_subsystem_get_mn(subsystem), sizeof(cdata->mn), ' '); + 
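The identify paths above report MDTS as a power of two in 4 KiB units of the transport's max_io_size, and clamp NOIOB so one "optimal" I/O cannot exceed that same limit. A rough standalone sketch of the arithmetic, with sample values chosen purely for illustration:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t
    u32log2(uint32_t x)
    {
        uint32_t r = 0;

        while (x > 1) {
            x >>= 1;
            r++;
        }
        return r;
    }

    int
    main(void)
    {
        uint32_t max_io_size = 131072;  /* bytes, assumed transport limit */
        uint32_t lbads = 12;            /* 4 KiB blocks */
        uint32_t mdts = u32log2(max_io_size / 4096);
        uint32_t noiob_cap = max_io_size / (1u << lbads);

        printf("mdts=%u noiob_cap=%u blocks\n", mdts, noiob_cap); /* mdts=5, cap=32 */
        return 0;
    }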
spdk_strcpy_pad(cdata->sn, spdk_nvmf_subsystem_get_sn(subsystem), sizeof(cdata->sn), ' '); + cdata->kas = ctrlr->cdata.kas; + + cdata->rab = 6; + cdata->cmic.multi_port = 1; + cdata->cmic.multi_host = 1; + cdata->oaes.ns_attribute_notices = 1; + cdata->ctratt.host_id_exhid_supported = 1; + /* TODO: Concurrent execution of multiple abort commands. */ + cdata->acl = 0; + cdata->aerl = 0; + cdata->frmw.slot1_ro = 1; + cdata->frmw.num_slots = 1; + + cdata->lpa.celp = 1; /* Command Effects log page supported */ + + cdata->sqes.min = 6; + cdata->sqes.max = 6; + cdata->cqes.min = 4; + cdata->cqes.max = 4; + cdata->nn = subsystem->max_nsid; + cdata->vwc.present = 1; + cdata->vwc.flush_broadcast = SPDK_NVME_FLUSH_BROADCAST_NOT_SUPPORTED; + + cdata->nvmf_specific = ctrlr->cdata.nvmf_specific; + + cdata->oncs.dsm = nvmf_ctrlr_dsm_supported(ctrlr); + cdata->oncs.write_zeroes = nvmf_ctrlr_write_zeroes_supported(ctrlr); + cdata->oncs.reservations = 1; + + nvmf_ctrlr_populate_oacs(ctrlr, cdata); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: ioccsz 0x%x\n", + cdata->nvmf_specific.ioccsz); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: iorcsz 0x%x\n", + cdata->nvmf_specific.iorcsz); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: icdoff 0x%x\n", + cdata->nvmf_specific.icdoff); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: ctrattr 0x%x\n", + *(uint8_t *)&cdata->nvmf_specific.ctrattr); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: msdbd 0x%x\n", + cdata->nvmf_specific.msdbd); + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_identify_active_ns_list(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + struct spdk_nvme_ns_list *ns_list) +{ + struct spdk_nvmf_ns *ns; + uint32_t count = 0; + + if (cmd->nsid >= 0xfffffffeUL) { + SPDK_ERRLOG("Identify Active Namespace List with invalid NSID %u\n", cmd->nsid); + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + if (ns->opts.nsid <= cmd->nsid) { + continue; + } + + ns_list->ns_list[count++] = ns->opts.nsid; + if (count == SPDK_COUNTOF(ns_list->ns_list)) { + break; + } + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static void +_add_ns_id_desc(void **buf_ptr, size_t *buf_remain, + enum spdk_nvme_nidt type, + const void *data, size_t data_size) +{ + struct spdk_nvme_ns_id_desc *desc; + size_t desc_size = sizeof(*desc) + data_size; + + /* + * These should never fail in practice, since all valid NS ID descriptors + * should be defined so that they fit in the available 4096-byte buffer. 
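The Active Namespace List handler above reports only NSIDs strictly greater than the NSID given in CDW10, in ascending order, up to the entries that fit in one 4 KiB identify buffer. A standalone sketch of that filter, with a sorted plain array standing in for the subsystem's namespace iterator; build_active_ns_list() is an invented name.

    #include <stddef.h>
    #include <stdint.h>

    #define NS_LIST_ENTRIES 1024u

    static uint32_t
    build_active_ns_list(const uint32_t *active_nsids, size_t num_active,
                         uint32_t start_nsid, uint32_t out[NS_LIST_ENTRIES])
    {
        uint32_t count = 0;

        for (size_t i = 0; i < num_active && count < NS_LIST_ENTRIES; i++) {
            if (active_nsids[i] > start_nsid) {
                out[count++] = active_nsids[i];
            }
        }

        /* Entries past `count` are left alone; the admin path relies on the
         * response buffer having been zeroed beforehand. */
        return count;
    }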
+ */ + assert(data_size > 0); + assert(data_size <= UINT8_MAX); + assert(desc_size < *buf_remain); + if (data_size == 0 || data_size > UINT8_MAX || desc_size > *buf_remain) { + return; + } + + desc = *buf_ptr; + desc->nidt = type; + desc->nidl = data_size; + memcpy(desc->nid, data, data_size); + + *buf_ptr += desc_size; + *buf_remain -= desc_size; +} + +static int +nvmf_ctrlr_identify_ns_id_descriptor_list( + struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + void *id_desc_list, size_t id_desc_list_size) +{ + struct spdk_nvmf_ns *ns; + size_t buf_remain = id_desc_list_size; + void *buf_ptr = id_desc_list; + + ns = _nvmf_subsystem_get_ns(subsystem, cmd->nsid); + if (ns == NULL || ns->bdev == NULL) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + +#define ADD_ID_DESC(type, data, size) \ + do { \ + if (!spdk_mem_all_zero(data, size)) { \ + _add_ns_id_desc(&buf_ptr, &buf_remain, type, data, size); \ + } \ + } while (0) + + ADD_ID_DESC(SPDK_NVME_NIDT_EUI64, ns->opts.eui64, sizeof(ns->opts.eui64)); + ADD_ID_DESC(SPDK_NVME_NIDT_NGUID, ns->opts.nguid, sizeof(ns->opts.nguid)); + ADD_ID_DESC(SPDK_NVME_NIDT_UUID, &ns->opts.uuid, sizeof(ns->opts.uuid)); + + /* + * The list is automatically 0-terminated because controller to host buffers in + * admin commands always get zeroed in nvmf_ctrlr_process_admin_cmd(). + */ + +#undef ADD_ID_DESC + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_identify(struct spdk_nvmf_request *req) +{ + uint8_t cns; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + + if (req->data == NULL || req->length < 4096) { + SPDK_ERRLOG("identify command with invalid buffer\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + cns = cmd->cdw10_bits.identify.cns; + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY && + cns != SPDK_NVME_IDENTIFY_CTRLR) { + /* Discovery controllers only support Identify Controller */ + goto invalid_cns; + } + + switch (cns) { + case SPDK_NVME_IDENTIFY_NS: + return spdk_nvmf_ctrlr_identify_ns(ctrlr, cmd, rsp, req->data); + case SPDK_NVME_IDENTIFY_CTRLR: + return spdk_nvmf_ctrlr_identify_ctrlr(ctrlr, req->data); + case SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST: + return nvmf_ctrlr_identify_active_ns_list(subsystem, cmd, rsp, req->data); + case SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST: + return nvmf_ctrlr_identify_ns_id_descriptor_list(subsystem, cmd, rsp, req->data, req->length); + default: + goto invalid_cns; + } + +invalid_cns: + SPDK_ERRLOG("Identify command with unsupported CNS 0x%02x\n", cns); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static bool +nvmf_qpair_abort_aer(struct spdk_nvmf_qpair *qpair, uint16_t cid) +{ + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvmf_request *req; + int i; + + if (!nvmf_qpair_is_admin_queue(qpair)) { + return false; + } + + for (i = 0; i < ctrlr->nr_aer_reqs; i++) { + if (ctrlr->aer_req[i]->cmd->nvme_cmd.cid == cid) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Aborting AER request\n"); + req = ctrlr->aer_req[i]; + ctrlr->aer_req[i] = NULL; + ctrlr->nr_aer_reqs--; + + /* 
Move the last req to the aborting position for making aer_reqs + * in continuous + */ + if (i < ctrlr->nr_aer_reqs) { + ctrlr->aer_req[i] = ctrlr->aer_req[ctrlr->nr_aer_reqs]; + ctrlr->aer_req[ctrlr->nr_aer_reqs] = NULL; + } + + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + _nvmf_request_complete(req); + return true; + } + } + + return false; +} + +static void +nvmf_qpair_abort_request(struct spdk_nvmf_qpair *qpair, struct spdk_nvmf_request *req) +{ + uint16_t cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; + + if (nvmf_qpair_abort_aer(qpair, cid)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "abort ctrlr=%p sqid=%u cid=%u successful\n", + qpair->ctrlr, qpair->qid, cid); + req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command successfully aborted */ + + spdk_nvmf_request_complete(req); + return; + } + + nvmf_transport_qpair_abort_request(qpair, req); +} + +static void +nvmf_ctrlr_abort_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i); + + if (status == 0) { + /* There was no qpair whose ID matches SQID of the abort command. + * Hence call _nvmf_request_complete() here. + */ + _nvmf_request_complete(req); + } +} + +static void +nvmf_ctrlr_abort_on_pg(struct spdk_io_channel_iter *i) +{ + struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch); + uint16_t sqid = req->cmd->nvme_cmd.cdw10_bits.abort.sqid; + struct spdk_nvmf_qpair *qpair; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if (qpair->ctrlr == req->qpair->ctrlr && qpair->qid == sqid) { + /* Found the qpair */ + + nvmf_qpair_abort_request(qpair, req); + + /* Return -1 for the status so the iteration across threads stops. */ + spdk_for_each_channel_continue(i, -1); + return; + } + } + + spdk_for_each_channel_continue(i, 0); +} + +static int +nvmf_ctrlr_abort(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = 1U; /* Command not aborted */ + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + + /* Send a message to each poll group, searching for this ctrlr, sqid, and command. 
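The AER abort path above removes one entry from a small array by moving the last element into the freed slot, which keeps the array contiguous in O(1) instead of shifting every remaining element. A generic sketch of that swap-remove step; the helper name is illustrative.

    #include <stddef.h>

    static int
    swap_remove(void **array, int *count, int idx)
    {
        if (idx < 0 || idx >= *count) {
            return -1;
        }

        (*count)--;
        if (idx < *count) {
            array[idx] = array[*count];  /* fill the hole with the last entry */
        }
        array[*count] = NULL;

        return 0;
    }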
*/ + spdk_for_each_channel(req->qpair->ctrlr->subsys->tgt, + nvmf_ctrlr_abort_on_pg, + req, + nvmf_ctrlr_abort_done + ); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_ctrlr_abort_request(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_request *req_to_abort = req->req_to_abort; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_io_channel *ch; + int rc; + + assert(req_to_abort != NULL); + + if (g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_ABORT].hdlr && + nvmf_qpair_is_admin_queue(req_to_abort->qpair)) { + return g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_ABORT].hdlr(req); + } + + rc = spdk_nvmf_request_get_bdev(req_to_abort->cmd->nvme_cmd.nsid, req_to_abort, + &bdev, &desc, &ch); + if (rc != 0) { + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return spdk_nvmf_bdev_ctrlr_abort_cmd(bdev, desc, ch, req, req_to_abort); +} + +static int +get_features_generic(struct spdk_nvmf_request *req, uint32_t cdw0) +{ + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = cdw0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_ctrlr_get_features(struct spdk_nvmf_request *req) +{ + uint8_t feature; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + feature = cmd->cdw10_bits.get_features.fid; + switch (feature) { + case SPDK_NVME_FEAT_ARBITRATION: + return get_features_generic(req, ctrlr->feat.arbitration.raw); + case SPDK_NVME_FEAT_POWER_MANAGEMENT: + return get_features_generic(req, ctrlr->feat.power_management.raw); + case SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD: + return nvmf_ctrlr_get_features_temperature_threshold(req); + case SPDK_NVME_FEAT_ERROR_RECOVERY: + return get_features_generic(req, ctrlr->feat.error_recovery.raw); + case SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE: + return get_features_generic(req, ctrlr->feat.volatile_write_cache.raw); + case SPDK_NVME_FEAT_NUMBER_OF_QUEUES: + return get_features_generic(req, ctrlr->feat.number_of_queues.raw); + case SPDK_NVME_FEAT_WRITE_ATOMICITY: + return get_features_generic(req, ctrlr->feat.write_atomicity.raw); + case SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + return get_features_generic(req, ctrlr->feat.async_event_configuration.raw); + case SPDK_NVME_FEAT_KEEP_ALIVE_TIMER: + return get_features_generic(req, ctrlr->feat.keep_alive_timer.raw); + case SPDK_NVME_FEAT_HOST_IDENTIFIER: + return nvmf_ctrlr_get_features_host_identifier(req); + case SPDK_NVME_FEAT_HOST_RESERVE_MASK: + return nvmf_ctrlr_get_features_reservation_notification_mask(req); + case SPDK_NVME_FEAT_HOST_RESERVE_PERSIST: + return nvmf_ctrlr_get_features_reservation_persistence(req); + default: + SPDK_ERRLOG("Get Features command with unsupported feature ID 0x%02x\n", feature); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +static int +nvmf_ctrlr_set_features(struct spdk_nvmf_request *req) +{ + uint8_t feature, save; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + /* + * Features are not saveable by the controller as indicated by + * ONCS field of the Identify Controller data. 
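Abort above presets CDW0 bit 0 to 1 ("command not aborted") and clears it only when a matching command is found and aborted on one of the poll groups. A tiny sketch of that completion convention; abort_mark_result() is an invented helper.

    #include <stdbool.h>
    #include <stdint.h>

    static void
    abort_mark_result(uint32_t *cdw0, bool aborted)
    {
        if (aborted) {
            *cdw0 &= ~1u;  /* command was successfully aborted */
        } else {
            *cdw0 |= 1u;   /* command could not be aborted */
        }
    }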
+ * */ + save = cmd->cdw10_bits.set_features.sv; + if (save) { + response->status.sc = SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE; + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + feature = cmd->cdw10_bits.set_features.fid; + switch (feature) { + case SPDK_NVME_FEAT_ARBITRATION: + return nvmf_ctrlr_set_features_arbitration(req); + case SPDK_NVME_FEAT_POWER_MANAGEMENT: + return nvmf_ctrlr_set_features_power_management(req); + case SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD: + return nvmf_ctrlr_set_features_temperature_threshold(req); + case SPDK_NVME_FEAT_ERROR_RECOVERY: + return nvmf_ctrlr_set_features_error_recovery(req); + case SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE: + return nvmf_ctrlr_set_features_volatile_write_cache(req); + case SPDK_NVME_FEAT_NUMBER_OF_QUEUES: + return nvmf_ctrlr_set_features_number_of_queues(req); + case SPDK_NVME_FEAT_WRITE_ATOMICITY: + return nvmf_ctrlr_set_features_write_atomicity(req); + case SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + return nvmf_ctrlr_set_features_async_event_configuration(req); + case SPDK_NVME_FEAT_KEEP_ALIVE_TIMER: + return nvmf_ctrlr_set_features_keep_alive_timer(req); + case SPDK_NVME_FEAT_HOST_IDENTIFIER: + return nvmf_ctrlr_set_features_host_identifier(req); + case SPDK_NVME_FEAT_HOST_RESERVE_MASK: + return nvmf_ctrlr_set_features_reservation_notification_mask(req); + case SPDK_NVME_FEAT_HOST_RESERVE_PERSIST: + return nvmf_ctrlr_set_features_reservation_persistence(req); + default: + SPDK_ERRLOG("Set Features command with unsupported feature ID 0x%02x\n", feature); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +static int +nvmf_ctrlr_keep_alive(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Keep Alive\n"); + /* + * To handle keep alive just clear or reset the + * ctrlr based keep alive duration counter. + * When added, a separate timer based process + * will monitor if the time since last recorded + * keep alive has exceeded the max duration and + * take appropriate action. + */ + ctrlr->last_keep_alive_tick = spdk_get_ticks(); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int rc; + + if (ctrlr == NULL) { + SPDK_ERRLOG("Admin command sent before CONNECT\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (ctrlr->vcprop.cc.bits.en != 1) { + SPDK_ERRLOG("Admin command sent to disabled controller\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (req->data && spdk_nvme_opc_get_data_transfer(cmd->opc) == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + memset(req->data, 0, req->length); + } + + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + /* Discovery controllers only support Get Log Page, Identify and Keep Alive. */ + switch (cmd->opc) { + case SPDK_NVME_OPC_IDENTIFY: + case SPDK_NVME_OPC_GET_LOG_PAGE: + case SPDK_NVME_OPC_KEEP_ALIVE: + break; + default: + goto invalid_opcode; + } + } + + /* Call a custom adm cmd handler if set. 
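The admin dispatch above restricts discovery controllers to Identify, Get Log Page and Keep Alive. A standalone sketch of that allow-list, using the standard NVMe admin opcode values for those three commands; the helper name is an assumption.

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    discovery_admin_opc_allowed(uint8_t opc)
    {
        switch (opc) {
        case 0x02: /* Get Log Page */
        case 0x06: /* Identify */
        case 0x18: /* Keep Alive */
            return true;
        default:
            return false;  /* completed with Invalid Opcode */
        }
    }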
Aborts are handled in a different path (see nvmf_passthru_admin_cmd) */ + if (g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].hdlr && cmd->opc != SPDK_NVME_OPC_ABORT) { + rc = g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].hdlr(req); + if (rc >= SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { + /* The handler took care of this commmand */ + return rc; + } + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_GET_LOG_PAGE: + return nvmf_ctrlr_get_log_page(req); + case SPDK_NVME_OPC_IDENTIFY: + return nvmf_ctrlr_identify(req); + case SPDK_NVME_OPC_ABORT: + return nvmf_ctrlr_abort(req); + case SPDK_NVME_OPC_GET_FEATURES: + return nvmf_ctrlr_get_features(req); + case SPDK_NVME_OPC_SET_FEATURES: + return nvmf_ctrlr_set_features(req); + case SPDK_NVME_OPC_ASYNC_EVENT_REQUEST: + return nvmf_ctrlr_async_event_request(req); + case SPDK_NVME_OPC_KEEP_ALIVE: + return nvmf_ctrlr_keep_alive(req); + + case SPDK_NVME_OPC_CREATE_IO_SQ: + case SPDK_NVME_OPC_CREATE_IO_CQ: + case SPDK_NVME_OPC_DELETE_IO_SQ: + case SPDK_NVME_OPC_DELETE_IO_CQ: + /* Create and Delete I/O CQ/SQ not allowed in NVMe-oF */ + goto invalid_opcode; + + default: + goto invalid_opcode; + } + +invalid_opcode: + SPDK_ERRLOG("Unsupported admin opcode 0x%x\n", cmd->opc); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +nvmf_ctrlr_process_fabrics_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_capsule_cmd *cap_hdr; + + cap_hdr = &req->cmd->nvmf_cmd; + + if (qpair->ctrlr == NULL) { + /* No ctrlr established yet; the only valid command is Connect */ + if (cap_hdr->fctype == SPDK_NVMF_FABRIC_COMMAND_CONNECT) { + return nvmf_ctrlr_cmd_connect(req); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Got fctype 0x%x, expected Connect\n", + cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + } else if (nvmf_qpair_is_admin_queue(qpair)) { + /* + * Controller session is established, and this is an admin queue. + * Disallow Connect and allow other fabrics commands. 
+ */ + switch (cap_hdr->fctype) { + case SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET: + return nvmf_property_set(req); + case SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET: + return nvmf_property_get(req); + default: + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "unknown fctype 0x%02x\n", + cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + } else { + /* Controller session is established, and this is an I/O queue */ + /* For now, no I/O-specific Fabrics commands are implemented (other than Connect) */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Unexpected I/O fctype 0x%x\n", cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +static inline int +nvmf_ctrlr_async_event_notification(struct spdk_nvmf_ctrlr *ctrlr, + union spdk_nvme_async_event_completion *event) +{ + struct spdk_nvmf_request *req; + struct spdk_nvme_cpl *rsp; + + assert(ctrlr->nr_aer_reqs > 0); + + req = ctrlr->aer_req[--ctrlr->nr_aer_reqs]; + rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = event->raw; + + _nvmf_request_complete(req); + ctrlr->aer_req[ctrlr->nr_aer_reqs] = NULL; + + return 0; +} + +int +nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr) +{ + union spdk_nvme_async_event_completion event = {0}; + + /* Users may disable the event notification */ + if (!ctrlr->feat.async_event_configuration.bits.ns_attr_notice) { + return 0; + } + + event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE; + event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED; + event.bits.log_page_identifier = SPDK_NVME_LOG_CHANGED_NS_LIST; + + /* If there is no outstanding AER request, queue the event. Then + * if an AER is later submitted, this event can be sent as a + * response. + */ + if (ctrlr->nr_aer_reqs == 0) { + if (ctrlr->notice_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) { + return 0; + } + + ctrlr->notice_event.raw = event.raw; + return 0; + } + + return nvmf_ctrlr_async_event_notification(ctrlr, &event); +} + +void +nvmf_ctrlr_async_event_reservation_notification(struct spdk_nvmf_ctrlr *ctrlr) +{ + union spdk_nvme_async_event_completion event = {0}; + + if (!ctrlr->num_avail_log_pages) { + return; + } + event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_IO; + event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_RESERVATION_LOG_AVAIL; + event.bits.log_page_identifier = SPDK_NVME_LOG_RESERVATION_NOTIFICATION; + + /* If there is no outstanding AER request, queue the event. Then + * if an AER is later submitted, this event can be sent as a + * response. 
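The AER notice path above completes an outstanding Asynchronous Event Request immediately when one is available, and otherwise remembers at most one pending event per category so it can answer a later AER. A hedged sketch of that coalescing rule, with invented type and function names:

    #include <stdint.h>

    struct pending_event {
        uint32_t raw;  /* 0 means nothing queued */
    };

    static void
    post_or_queue_event(struct pending_event *pending, int aer_outstanding,
                        uint32_t event_raw, void (*complete_aer)(uint32_t cdw0))
    {
        if (aer_outstanding == 0) {
            if (pending->raw == 0) {
                pending->raw = event_raw;  /* coalesce: keep only the first */
            }
            return;
        }

        complete_aer(event_raw);  /* event is returned to the host in CDW0 */
    }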
+ */ + if (ctrlr->nr_aer_reqs == 0) { + if (ctrlr->reservation_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_IO) { + return; + } + + ctrlr->reservation_event.raw = event.raw; + return; + } + + nvmf_ctrlr_async_event_notification(ctrlr, &event); +} + +void +nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + int i; + + if (!nvmf_qpair_is_admin_queue(qpair)) { + return; + } + + for (i = 0; i < ctrlr->nr_aer_reqs; i++) { + spdk_nvmf_request_free(ctrlr->aer_req[i]); + ctrlr->aer_req[i] = NULL; + } + + ctrlr->nr_aer_reqs = 0; +} + +void +nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr) +{ + struct spdk_nvmf_request *req; + int i; + + for (i = 0; i < ctrlr->nr_aer_reqs; i++) { + req = ctrlr->aer_req[i]; + + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + _nvmf_request_complete(req); + + ctrlr->aer_req[i] = NULL; + } + + ctrlr->nr_aer_reqs = 0; +} + +static void +_nvmf_ctrlr_add_reservation_log(void *ctx) +{ + struct spdk_nvmf_reservation_log *log = (struct spdk_nvmf_reservation_log *)ctx; + struct spdk_nvmf_ctrlr *ctrlr = log->ctrlr; + + ctrlr->log_page_count++; + + /* Maximum number of queued log pages is 255 */ + if (ctrlr->num_avail_log_pages == 0xff) { + struct spdk_nvmf_reservation_log *entry; + entry = TAILQ_LAST(&ctrlr->log_head, log_page_head); + entry->log.log_page_count = ctrlr->log_page_count; + free(log); + return; + } + + log->log.log_page_count = ctrlr->log_page_count; + log->log.num_avail_log_pages = ctrlr->num_avail_log_pages++; + TAILQ_INSERT_TAIL(&ctrlr->log_head, log, link); + + nvmf_ctrlr_async_event_reservation_notification(ctrlr); +} + +void +nvmf_ctrlr_reservation_notice_log(struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_ns *ns, + enum spdk_nvme_reservation_notification_log_page_type type) +{ + struct spdk_nvmf_reservation_log *log; + + switch (type) { + case SPDK_NVME_RESERVATION_LOG_PAGE_EMPTY: + return; + case SPDK_NVME_REGISTRATION_PREEMPTED: + if (ns->mask & SPDK_NVME_REGISTRATION_PREEMPTED_MASK) { + return; + } + break; + case SPDK_NVME_RESERVATION_RELEASED: + if (ns->mask & SPDK_NVME_RESERVATION_RELEASED_MASK) { + return; + } + break; + case SPDK_NVME_RESERVATION_PREEMPTED: + if (ns->mask & SPDK_NVME_RESERVATION_PREEMPTED_MASK) { + return; + } + break; + default: + return; + } + + log = calloc(1, sizeof(*log)); + if (!log) { + SPDK_ERRLOG("Alloc log page failed, ignore the log\n"); + return; + } + log->ctrlr = ctrlr; + log->log.type = type; + log->log.nsid = ns->nsid; + + spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_add_reservation_log, log); +} + +/* Check from subsystem poll group's namespace information data structure */ +static bool +nvmf_ns_info_ctrlr_is_registrant(struct spdk_nvmf_subsystem_pg_ns_info *ns_info, + struct spdk_nvmf_ctrlr *ctrlr) +{ + uint32_t i; + + for (i = 0; i < SPDK_NVMF_MAX_NUM_REGISTRANTS; i++) { + if (!spdk_uuid_compare(&ns_info->reg_hostid[i], &ctrlr->hostid)) { + return true; + } + } + + return false; +} + +/* + * Check the NVMe command is permitted or not for current controller(Host). 
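The reservation code above treats a controller (host) as a registrant when its host identifier matches one of the per-namespace registrant slots tracked by the poll group. A standalone sketch of that lookup, with a raw 16-byte identifier standing in for spdk_uuid:

    #include <stdbool.h>
    #include <string.h>

    #define HOSTID_LEN 16

    static bool
    is_registrant(const unsigned char (*reg_hostid)[HOSTID_LEN], size_t num_slots,
                  const unsigned char *hostid)
    {
        for (size_t i = 0; i < num_slots; i++) {
            if (memcmp(reg_hostid[i], hostid, HOSTID_LEN) == 0) {
                return true;
            }
        }

        return false;
    }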
+ */ +static int +nvmf_ns_reservation_request_check(struct spdk_nvmf_subsystem_pg_ns_info *ns_info, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + enum spdk_nvme_reservation_type rtype = ns_info->rtype; + uint8_t status = SPDK_NVME_SC_SUCCESS; + uint8_t racqa; + bool is_registrant; + + /* No valid reservation */ + if (!rtype) { + return 0; + } + + is_registrant = nvmf_ns_info_ctrlr_is_registrant(ns_info, ctrlr); + /* All registrants type and current ctrlr is a valid registrant */ + if ((rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE_ALL_REGS || + rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS) && is_registrant) { + return 0; + } else if (!spdk_uuid_compare(&ns_info->holder_id, &ctrlr->hostid)) { + return 0; + } + + /* Non-holder for current controller */ + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + case SPDK_NVME_OPC_COMPARE: + if (rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + if ((rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_REG_ONLY || + rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS) && !is_registrant) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + } + break; + case SPDK_NVME_OPC_FLUSH: + case SPDK_NVME_OPC_WRITE: + case SPDK_NVME_OPC_WRITE_UNCORRECTABLE: + case SPDK_NVME_OPC_WRITE_ZEROES: + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + if (rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE || + rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + if (!is_registrant) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + } + break; + case SPDK_NVME_OPC_RESERVATION_ACQUIRE: + racqa = cmd->cdw10_bits.resv_acquire.racqa; + if (racqa == SPDK_NVME_RESERVE_ACQUIRE) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + if (!is_registrant) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + } + break; + case SPDK_NVME_OPC_RESERVATION_RELEASE: + if (!is_registrant) { + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + } + break; + default: + break; + } + +exit: + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = status; + if (status == SPDK_NVME_SC_RESERVATION_CONFLICT) { + return -EPERM; + } + + return 0; +} + +static int +nvmf_ctrlr_process_io_fused_cmd(struct spdk_nvmf_request *req, struct spdk_bdev *bdev, + struct spdk_bdev_desc *desc, struct spdk_io_channel *ch) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_request *first_fused_req = req->qpair->first_fused_req; + int rc; + + if (cmd->fuse == SPDK_NVME_CMD_FUSE_FIRST) { + /* first fused operation (should be compare) */ + if (first_fused_req != NULL) { + struct spdk_nvme_cpl *fused_response = &first_fused_req->rsp->nvme_cpl; + + SPDK_ERRLOG("Wrong sequence of fused operations\n"); + + /* abort req->qpair->first_fused_request and continue with new fused command */ + fused_response->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED; + fused_response->status.sct = SPDK_NVME_SCT_GENERIC; + _nvmf_request_complete(first_fused_req); + } else if (cmd->opc != SPDK_NVME_OPC_COMPARE) { + SPDK_ERRLOG("Wrong op code of fused operations\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + req->qpair->first_fused_req = req; + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } else if (cmd->fuse == SPDK_NVME_CMD_FUSE_SECOND) { + /* second 
fused operation (should be write) */ + if (first_fused_req == NULL) { + SPDK_ERRLOG("Wrong sequence of fused operations\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } else if (cmd->opc != SPDK_NVME_OPC_WRITE) { + struct spdk_nvme_cpl *fused_response = &first_fused_req->rsp->nvme_cpl; + + SPDK_ERRLOG("Wrong op code of fused operations\n"); + + /* abort req->qpair->first_fused_request and fail current command */ + fused_response->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED; + fused_response->status.sct = SPDK_NVME_SCT_GENERIC; + _nvmf_request_complete(first_fused_req); + + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_OPCODE; + req->qpair->first_fused_req = NULL; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* save request of first command to generate response later */ + req->first_fused_req = first_fused_req; + req->qpair->first_fused_req = NULL; + } else { + SPDK_ERRLOG("Invalid fused command fuse field.\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = nvmf_bdev_ctrlr_compare_and_write_cmd(bdev, desc, ch, req->first_fused_req, req); + + if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { + if (spdk_nvme_cpl_is_error(rsp)) { + struct spdk_nvme_cpl *fused_response = &first_fused_req->rsp->nvme_cpl; + + fused_response->status = rsp->status; + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; + /* Complete first of fused commands. Second will be completed by upper layer */ + _nvmf_request_complete(first_fused_req); + req->first_fused_req = NULL; + } + } + + return rc; +} + +int +nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req) +{ + uint32_t nsid; + struct spdk_nvmf_ns *ns; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group = req->qpair->group; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + struct spdk_nvmf_subsystem_pg_ns_info *ns_info; + + /* pre-set response details for this command */ + response->status.sc = SPDK_NVME_SC_SUCCESS; + nsid = cmd->nsid; + + if (spdk_unlikely(ctrlr == NULL)) { + SPDK_ERRLOG("I/O command sent before CONNECT\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(ctrlr->vcprop.cc.bits.en != 1)) { + SPDK_ERRLOG("I/O command sent to disabled controller\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); + if (ns == NULL || ns->bdev == NULL) { + SPDK_ERRLOG("Unsuccessful query for nsid %u\n", cmd->nsid); + response->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + response->status.dnr = 1; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* scan-build falsely reporting dereference of null pointer */ + assert(group != NULL && group->sgroups != NULL); + ns_info = &group->sgroups[ctrlr->subsys->id].ns_info[nsid - 1]; + if (nvmf_ns_reservation_request_check(ns_info, ctrlr, req)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Reservation Conflict for nsid %u, opcode %u\n", + 
cmd->nsid, cmd->opc); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + bdev = ns->bdev; + desc = ns->desc; + ch = ns_info->channel; + + if (spdk_unlikely(cmd->fuse & SPDK_NVME_CMD_FUSE_MASK)) { + return nvmf_ctrlr_process_io_fused_cmd(req, bdev, desc, ch); + } else if (spdk_unlikely(req->qpair->first_fused_req != NULL)) { + struct spdk_nvme_cpl *fused_response = &req->qpair->first_fused_req->rsp->nvme_cpl; + + SPDK_ERRLOG("Expected second of fused commands - failing first of fused commands\n"); + + /* abort req->qpair->first_fused_request and continue with new command */ + fused_response->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED; + fused_response->status.sct = SPDK_NVME_SCT_GENERIC; + _nvmf_request_complete(req->qpair->first_fused_req); + req->qpair->first_fused_req = NULL; + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + return nvmf_bdev_ctrlr_read_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_WRITE: + return nvmf_bdev_ctrlr_write_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_COMPARE: + return nvmf_bdev_ctrlr_compare_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_WRITE_ZEROES: + return nvmf_bdev_ctrlr_write_zeroes_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_FLUSH: + return nvmf_bdev_ctrlr_flush_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + return nvmf_bdev_ctrlr_dsm_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_RESERVATION_REGISTER: + case SPDK_NVME_OPC_RESERVATION_ACQUIRE: + case SPDK_NVME_OPC_RESERVATION_RELEASE: + case SPDK_NVME_OPC_RESERVATION_REPORT: + spdk_thread_send_msg(ctrlr->subsys->thread, nvmf_ns_reservation_request, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + default: + return nvmf_bdev_ctrlr_nvme_passthru_io(bdev, desc, ch, req); + } +} + +static void +nvmf_qpair_request_cleanup(struct spdk_nvmf_qpair *qpair) +{ + if (qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING) { + assert(qpair->state_cb != NULL); + + if (TAILQ_EMPTY(&qpair->outstanding)) { + qpair->state_cb(qpair->state_cb_arg, 0); + } + } +} + +int +spdk_nvmf_request_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + + TAILQ_REMOVE(&qpair->outstanding, req, link); + if (nvmf_transport_req_free(req)) { + SPDK_ERRLOG("Unable to free transport level request resources.\n"); + } + + nvmf_qpair_request_cleanup(qpair); + + return 0; +} + +static void +_nvmf_request_complete(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup = NULL; + bool is_aer = false; + + rsp->sqid = 0; + rsp->status.p = 0; + rsp->cid = req->cmd->nvme_cmd.cid; + + qpair = req->qpair; + if (qpair->ctrlr) { + sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; + assert(sgroup != NULL); + is_aer = req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; + } else if (spdk_unlikely(nvmf_request_is_fabric_connect(req))) { + sgroup = nvmf_subsystem_pg_from_connect_cmd(req); + } + + if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf")) { + spdk_nvme_print_completion(qpair->qid, rsp); + } + + TAILQ_REMOVE(&qpair->outstanding, req, link); + if (nvmf_transport_req_complete(req)) { + SPDK_ERRLOG("Transport request completion error!\n"); + } + + /* AER cmd is an exception */ + if (sgroup && !is_aer) { + assert(sgroup->io_outstanding > 0); + sgroup->io_outstanding--; + if (sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSING && + sgroup->io_outstanding == 0) { + sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSED; + sgroup->cb_fn(sgroup->cb_arg, 0); + } + } 
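Request completion above decrements the subsystem poll group's outstanding counter for every non-AER command, and the last completion while the group is pausing flips it to paused and fires the pause callback. A minimal sketch of that handshake, with invented type names:

    #include <stddef.h>

    enum pg_state { PG_ACTIVE, PG_PAUSING, PG_PAUSED };

    struct pg {
        enum pg_state state;
        unsigned int io_outstanding;
        void (*paused_cb)(void *cb_arg);
        void *cb_arg;
    };

    static void
    pg_io_complete(struct pg *g)
    {
        g->io_outstanding--;
        if (g->state == PG_PAUSING && g->io_outstanding == 0) {
            g->state = PG_PAUSED;
            if (g->paused_cb != NULL) {
                g->paused_cb(g->cb_arg);
            }
        }
    }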
+ + nvmf_qpair_request_cleanup(qpair); +} + +int +spdk_nvmf_request_complete(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + + if (spdk_likely(qpair->group->thread == spdk_get_thread())) { + _nvmf_request_complete(req); + } else { + spdk_thread_send_msg(qpair->group->thread, + _nvmf_request_complete, req); + } + + return 0; +} + +static void +_nvmf_request_exec(struct spdk_nvmf_request *req, + struct spdk_nvmf_subsystem_poll_group *sgroup) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + enum spdk_nvmf_request_exec_status status; + + if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf")) { + spdk_nvme_print_command(qpair->qid, &req->cmd->nvme_cmd); + } + + if (sgroup) { + sgroup->io_outstanding++; + } + + /* Place the request on the outstanding list so we can keep track of it */ + TAILQ_INSERT_TAIL(&qpair->outstanding, req, link); + + if (spdk_unlikely(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC)) { + status = nvmf_ctrlr_process_fabrics_cmd(req); + } else if (spdk_unlikely(nvmf_qpair_is_admin_queue(qpair))) { + status = nvmf_ctrlr_process_admin_cmd(req); + } else { + status = nvmf_ctrlr_process_io_cmd(req); + } + + if (status == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { + _nvmf_request_complete(req); + } +} + +void +spdk_nvmf_request_exec_fabrics(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup = NULL; + + assert(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC); + + if (qpair->ctrlr) { + sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; + assert(sgroup != NULL); + } else { + sgroup = nvmf_subsystem_pg_from_connect_cmd(req); + } + + _nvmf_request_exec(req, sgroup); +} + +void +spdk_nvmf_request_exec(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup = NULL; + + if (qpair->ctrlr) { + sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; + assert(sgroup != NULL); + } else if (spdk_unlikely(nvmf_request_is_fabric_connect(req))) { + sgroup = nvmf_subsystem_pg_from_connect_cmd(req); + } + + if (qpair->state != SPDK_NVMF_QPAIR_ACTIVE) { + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + /* Place the request on the outstanding list so we can keep track of it */ + TAILQ_INSERT_TAIL(&qpair->outstanding, req, link); + /* Still increment io_outstanding because request_complete decrements it */ + if (sgroup != NULL) { + sgroup->io_outstanding++; + } + _nvmf_request_complete(req); + return; + } + + /* Check if the subsystem is paused (if there is a subsystem) */ + if (sgroup != NULL) { + if (sgroup->state != SPDK_NVMF_SUBSYSTEM_ACTIVE) { + /* The subsystem is not currently active. Queue this request. 
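Request execution above gates admission in two steps: a request arriving on an inactive queue pair fails immediately with Command Sequence Error, a request against a non-active (pausing or paused) subsystem is parked on the subsystem's queued list, and only otherwise is it dispatched. A hedged sketch of that decision, with invented names:

    #include <stdbool.h>

    enum admit_action { ADMIT_FAIL_SEQ_ERROR, ADMIT_QUEUE, ADMIT_EXECUTE };

    static enum admit_action
    admit_request(bool qpair_active, bool subsystem_active)
    {
        if (!qpair_active) {
            return ADMIT_FAIL_SEQ_ERROR;  /* complete with Command Sequence Error */
        }
        if (!subsystem_active) {
            return ADMIT_QUEUE;           /* park until the subsystem resumes */
        }
        return ADMIT_EXECUTE;
    }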
*/ + TAILQ_INSERT_TAIL(&sgroup->queued, req, link); + return; + } + } + + _nvmf_request_exec(req, sgroup); +} + +static bool +nvmf_ctrlr_get_dif_ctx(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd, + struct spdk_dif_ctx *dif_ctx) +{ + struct spdk_nvmf_ns *ns; + struct spdk_bdev *bdev; + + if (ctrlr == NULL || cmd == NULL) { + return false; + } + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid); + if (ns == NULL || ns->bdev == NULL) { + return false; + } + + bdev = ns->bdev; + + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + case SPDK_NVME_OPC_WRITE: + case SPDK_NVME_OPC_COMPARE: + return nvmf_bdev_ctrlr_get_dif_ctx(bdev, cmd, dif_ctx); + default: + break; + } + + return false; +} + +bool +spdk_nvmf_request_get_dif_ctx(struct spdk_nvmf_request *req, struct spdk_dif_ctx *dif_ctx) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + if (spdk_likely(ctrlr == NULL || !ctrlr->dif_insert_or_strip)) { + return false; + } + + if (spdk_unlikely(qpair->state != SPDK_NVMF_QPAIR_ACTIVE)) { + return false; + } + + if (spdk_unlikely(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC)) { + return false; + } + + if (spdk_unlikely(nvmf_qpair_is_admin_queue(qpair))) { + return false; + } + + return nvmf_ctrlr_get_dif_ctx(ctrlr, &req->cmd->nvme_cmd, dif_ctx); +} + +void +spdk_nvmf_set_custom_admin_cmd_hdlr(uint8_t opc, spdk_nvmf_custom_cmd_hdlr hdlr) +{ + g_nvmf_custom_admin_cmd_hdlrs[opc].hdlr = hdlr; +} + +static int +nvmf_passthru_admin_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_io_channel *ch; + struct spdk_nvme_cmd *cmd = spdk_nvmf_request_get_cmd(req); + struct spdk_nvme_cpl *response = spdk_nvmf_request_get_response(req); + uint32_t bdev_nsid; + int rc; + + if (g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].nsid == 0) { + bdev_nsid = cmd->nsid; + } else { + bdev_nsid = g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].nsid; + } + + rc = spdk_nvmf_request_get_bdev(bdev_nsid, req, &bdev, &desc, &ch); + if (rc) { + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + return spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(bdev, desc, ch, req, NULL); +} + +void +spdk_nvmf_set_passthru_admin_cmd(uint8_t opc, uint32_t forward_nsid) +{ + g_nvmf_custom_admin_cmd_hdlrs[opc].hdlr = nvmf_passthru_admin_cmd; + g_nvmf_custom_admin_cmd_hdlrs[opc].nsid = forward_nsid; +} + +int +spdk_nvmf_request_get_bdev(uint32_t nsid, struct spdk_nvmf_request *req, + struct spdk_bdev **bdev, struct spdk_bdev_desc **desc, struct spdk_io_channel **ch) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_poll_group *group = req->qpair->group; + struct spdk_nvmf_subsystem_pg_ns_info *ns_info; + + *bdev = NULL; + *desc = NULL; + *ch = NULL; + + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); + if (ns == NULL || ns->bdev == NULL) { + return -EINVAL; + } + + assert(group != NULL && group->sgroups != NULL); + ns_info = &group->sgroups[ctrlr->subsys->id].ns_info[nsid - 1]; + *bdev = ns->bdev; + *desc = ns->desc; + *ch = ns_info->channel; + + return 0; +} + +struct spdk_nvmf_ctrlr *spdk_nvmf_request_get_ctrlr(struct spdk_nvmf_request *req) +{ + return req->qpair->ctrlr; +} + +struct spdk_nvme_cmd *spdk_nvmf_request_get_cmd(struct spdk_nvmf_request *req) +{ + return &req->cmd->nvme_cmd; +} + +struct spdk_nvme_cpl *spdk_nvmf_request_get_response(struct spdk_nvmf_request *req) 
+{ + return &req->rsp->nvme_cpl; +} + +struct spdk_nvmf_subsystem *spdk_nvmf_request_get_subsystem(struct spdk_nvmf_request *req) +{ + return req->qpair->ctrlr->subsys; +} + +void spdk_nvmf_request_get_data(struct spdk_nvmf_request *req, void **data, uint32_t *length) +{ + *data = req->data; + *length = req->length; +} + +struct spdk_nvmf_subsystem *spdk_nvmf_ctrlr_get_subsystem(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->subsys; +} + +uint16_t spdk_nvmf_ctrlr_get_id(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->cntlid; +} + +struct spdk_nvmf_request *spdk_nvmf_request_get_req_to_abort(struct spdk_nvmf_request *req) +{ + return req->req_to_abort; +} diff --git a/src/spdk/lib/nvmf/ctrlr_bdev.c b/src/spdk/lib/nvmf/ctrlr_bdev.c new file mode 100644 index 000000000..13e0a4309 --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr_bdev.c @@ -0,0 +1,761 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" + +#include "spdk/bdev.h" +#include "spdk/endian.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/nvme.h" +#include "spdk/nvmf_cmd.h" +#include "spdk/nvmf_spec.h" +#include "spdk/trace.h" +#include "spdk/scsi_spec.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +static bool +nvmf_subsystem_bdev_io_type_supported(struct spdk_nvmf_subsystem *subsystem, + enum spdk_bdev_io_type io_type) +{ + struct spdk_nvmf_ns *ns; + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + if (ns->bdev == NULL) { + continue; + } + + if (!spdk_bdev_io_type_supported(ns->bdev, io_type)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, + "Subsystem %s namespace %u (%s) does not support io_type %d\n", + spdk_nvmf_subsystem_get_nqn(subsystem), + ns->opts.nsid, spdk_bdev_get_name(ns->bdev), (int)io_type); + return false; + } + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "All devices in Subsystem %s support io_type %d\n", + spdk_nvmf_subsystem_get_nqn(subsystem), (int)io_type); + return true; +} + +bool +nvmf_ctrlr_dsm_supported(struct spdk_nvmf_ctrlr *ctrlr) +{ + return nvmf_subsystem_bdev_io_type_supported(ctrlr->subsys, SPDK_BDEV_IO_TYPE_UNMAP); +} + +bool +nvmf_ctrlr_write_zeroes_supported(struct spdk_nvmf_ctrlr *ctrlr) +{ + return nvmf_subsystem_bdev_io_type_supported(ctrlr->subsys, SPDK_BDEV_IO_TYPE_WRITE_ZEROES); +} + +static void +nvmf_bdev_ctrlr_complete_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_nvmf_request *req = cb_arg; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int first_sc = 0, first_sct = 0, second_sc = 0, second_sct = 0; + uint32_t cdw0 = 0; + struct spdk_nvmf_request *first_req = req->first_fused_req; + + if (spdk_unlikely(first_req != NULL)) { + /* fused commands - get status for both operations */ + struct spdk_nvme_cpl *fused_response = &first_req->rsp->nvme_cpl; + + spdk_bdev_io_get_nvme_fused_status(bdev_io, &cdw0, &second_sct, &second_sc, &first_sct, &first_sc); + fused_response->cdw0 = cdw0; + fused_response->status.sc = second_sc; + fused_response->status.sct = second_sct; + + /* first request should be completed */ + spdk_nvmf_request_complete(first_req); + req->first_fused_req = NULL; + } else { + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &first_sct, &first_sc); + } + + response->cdw0 = cdw0; + response->status.sc = first_sc; + response->status.sct = first_sct; + + spdk_nvmf_request_complete(req); + spdk_bdev_free_io(bdev_io); +} + +static void +nvmf_bdev_ctrlr_complete_admin_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_nvmf_request *req = cb_arg; + + if (req->cmd_cb_fn) { + req->cmd_cb_fn(req); + } + + nvmf_bdev_ctrlr_complete_cmd(bdev_io, success, req); +} + +void +nvmf_bdev_ctrlr_identify_ns(struct spdk_nvmf_ns *ns, struct spdk_nvme_ns_data *nsdata, + bool dif_insert_or_strip) +{ + struct spdk_bdev *bdev = ns->bdev; + uint64_t num_blocks; + + num_blocks = spdk_bdev_get_num_blocks(bdev); + + nsdata->nsze = num_blocks; + nsdata->ncap = num_blocks; + nsdata->nuse = num_blocks; + nsdata->nlbaf = 0; + nsdata->flbas.format = 0; + nsdata->nacwu = spdk_bdev_get_acwu(bdev); + if (!dif_insert_or_strip) { + nsdata->lbaf[0].ms = spdk_bdev_get_md_size(bdev); + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(bdev)); + if (nsdata->lbaf[0].ms != 0) { + nsdata->flbas.extended = 1; + nsdata->mc.extended = 1; + nsdata->mc.pointer 
= 0; + nsdata->dps.md_start = spdk_bdev_is_dif_head_of_md(bdev); + + switch (spdk_bdev_get_dif_type(bdev)) { + case SPDK_DIF_TYPE1: + nsdata->dpc.pit1 = 1; + nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_TYPE1; + break; + case SPDK_DIF_TYPE2: + nsdata->dpc.pit2 = 1; + nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_TYPE2; + break; + case SPDK_DIF_TYPE3: + nsdata->dpc.pit3 = 1; + nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_TYPE3; + break; + default: + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Protection Disabled\n"); + nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_DISABLE; + break; + } + } + } else { + nsdata->lbaf[0].ms = 0; + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_data_block_size(bdev)); + } + nsdata->noiob = spdk_bdev_get_optimal_io_boundary(bdev); + nsdata->nmic.can_share = 1; + if (ns->ptpl_file != NULL) { + nsdata->nsrescap.rescap.persist = 1; + } + nsdata->nsrescap.rescap.write_exclusive = 1; + nsdata->nsrescap.rescap.exclusive_access = 1; + nsdata->nsrescap.rescap.write_exclusive_reg_only = 1; + nsdata->nsrescap.rescap.exclusive_access_reg_only = 1; + nsdata->nsrescap.rescap.write_exclusive_all_reg = 1; + nsdata->nsrescap.rescap.exclusive_access_all_reg = 1; + nsdata->nsrescap.rescap.ignore_existing_key = 1; + + SPDK_STATIC_ASSERT(sizeof(nsdata->nguid) == sizeof(ns->opts.nguid), "size mismatch"); + memcpy(nsdata->nguid, ns->opts.nguid, sizeof(nsdata->nguid)); + + SPDK_STATIC_ASSERT(sizeof(nsdata->eui64) == sizeof(ns->opts.eui64), "size mismatch"); + memcpy(&nsdata->eui64, ns->opts.eui64, sizeof(nsdata->eui64)); +} + +static void +nvmf_bdev_ctrlr_get_rw_params(const struct spdk_nvme_cmd *cmd, uint64_t *start_lba, + uint64_t *num_blocks) +{ + /* SLBA: CDW10 and CDW11 */ + *start_lba = from_le64(&cmd->cdw10); + + /* NLB: CDW12 bits 15:00, 0's based */ + *num_blocks = (from_le32(&cmd->cdw12) & 0xFFFFu) + 1; +} + +static bool +nvmf_bdev_ctrlr_lba_in_range(uint64_t bdev_num_blocks, uint64_t io_start_lba, + uint64_t io_num_blocks) +{ + if (io_start_lba + io_num_blocks > bdev_num_blocks || + io_start_lba + io_num_blocks < io_start_lba) { + return false; + } + + return true; +} + +static void +nvmf_ctrlr_process_io_cmd_resubmit(void *arg) +{ + struct spdk_nvmf_request *req = arg; + + nvmf_ctrlr_process_io_cmd(req); +} + +static void +nvmf_ctrlr_process_admin_cmd_resubmit(void *arg) +{ + struct spdk_nvmf_request *req = arg; + + nvmf_ctrlr_process_admin_cmd(req); +} + +static void +nvmf_bdev_ctrl_queue_io(struct spdk_nvmf_request *req, struct spdk_bdev *bdev, + struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn, void *cb_arg) +{ + int rc; + + req->bdev_io_wait.bdev = bdev; + req->bdev_io_wait.cb_fn = cb_fn; + req->bdev_io_wait.cb_arg = cb_arg; + + rc = spdk_bdev_queue_io_wait(bdev, ch, &req->bdev_io_wait); + if (rc != 0) { + assert(false); + } + req->qpair->group->stat.pending_bdev_io++; +} + +int +nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = 
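A compact illustration of the read/write parameter handling defined above: the starting LBA spans CDW10-CDW11, NLB is the low 16 bits of CDW12 and is zero-based, and the range check must also guard against integer wrap-around. This is a standalone sketch over plain integers, not the SPDK command structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* NLB is a 0's-based 16-bit field, so the real block count is the field value + 1. */
static void get_rw_params(uint64_t cdw10_11, uint32_t cdw12,
			  uint64_t *start_lba, uint64_t *num_blocks)
{
	*start_lba = cdw10_11;
	*num_blocks = (cdw12 & 0xFFFFu) + 1;
}

/* Reject I/O that runs past the device or whose LBA arithmetic wraps around. */
static bool lba_in_range(uint64_t bdev_num_blocks, uint64_t lba, uint64_t count)
{
	return !(lba + count > bdev_num_blocks || lba + count < lba);
}

int main(void)
{
	uint64_t lba, count;

	get_rw_params(100, 7, &lba, &count);  /* NLB field value 7 -> 8 blocks */
	printf("lba=%llu count=%llu ok=%d\n",
	       (unsigned long long)lba, (unsigned long long)count,
	       lba_in_range(1024, lba, count));
	return 0;
}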
SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(num_blocks * block_size > req->length)) { + SPDK_ERRLOG("Read NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + num_blocks, block_size, req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_readv_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(num_blocks * block_size > req->length)) { + SPDK_ERRLOG("Write NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + num_blocks, block_size, req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_writev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_compare_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(num_blocks * block_size > req->length)) { + SPDK_ERRLOG("Compare NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + num_blocks, block_size, 
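Each submission path above handles -ENOMEM the same way: the request is parked on the bdev's io-wait queue and resubmitted later instead of being failed. A schematic version of that retry flow, with hypothetical submit_io()/queue_io_wait() helpers standing in for spdk_bdev_readv_blocks() and spdk_bdev_queue_io_wait():

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-ins; only the control flow is the point of this sketch. */
static int submit_io(void *req) { (void)req; return -ENOMEM; }
static void queue_io_wait(void *req, void (*resubmit)(void *)) { (void)req; (void)resubmit; }

static void resubmit_cb(void *req);

/* Returns 0 when the I/O was either submitted or queued for a later retry. */
static int try_submit(void *req)
{
	int rc = submit_io(req);

	if (rc == -ENOMEM) {
		/* No bdev_io available right now: wait and resubmit from the callback. */
		queue_io_wait(req, resubmit_cb);
		return 0;
	}
	if (rc != 0) {
		/* Any other error completes the command with a failure status. */
		return rc;
	}
	return 0;
}

static void resubmit_cb(void *req)
{
	try_submit(req);
}

int main(void)
{
	int dummy_req;

	printf("rc=%d\n", try_submit(&dummy_req));
	return 0;
}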
req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_comparev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_compare_and_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *cmp_req, struct spdk_nvmf_request *write_req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmp_cmd = &cmp_req->cmd->nvme_cmd; + struct spdk_nvme_cmd *write_cmd = &write_req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &write_req->rsp->nvme_cpl; + uint64_t write_start_lba, cmp_start_lba; + uint64_t write_num_blocks, cmp_num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmp_cmd, &cmp_start_lba, &cmp_num_blocks); + nvmf_bdev_ctrlr_get_rw_params(write_cmd, &write_start_lba, &write_num_blocks); + + if (spdk_unlikely(write_start_lba != cmp_start_lba || write_num_blocks != cmp_num_blocks)) { + SPDK_ERRLOG("Fused command start lba / num blocks mismatch\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, write_start_lba, + write_num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(write_num_blocks * block_size > write_req->length)) { + SPDK_ERRLOG("Write NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + write_num_blocks, block_size, write_req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_comparev_and_writev_blocks(desc, ch, cmp_req->iov, cmp_req->iovcnt, write_req->iov, + write_req->iovcnt, write_start_lba, write_num_blocks, nvmf_bdev_ctrlr_complete_cmd, write_req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(cmp_req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, cmp_req); + nvmf_bdev_ctrl_queue_io(write_req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, write_req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_write_zeroes_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, 
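For the fused compare-and-write pair above, both halves must describe exactly the same LBA range before the pair is submitted as a single bdev operation, and on completion each half receives its own NVMe status. A minimal version of the range check, using a hypothetical simplified parameter struct:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical, simplified view of the two halves of a fused command. */
struct rw_params {
	uint64_t start_lba;
	uint64_t num_blocks;
};

/* The compare half and the write half must cover the same range. */
static bool fused_ranges_match(const struct rw_params *cmp, const struct rw_params *wr)
{
	return cmp->start_lba == wr->start_lba &&
	       cmp->num_blocks == wr->num_blocks;
}

int main(void)
{
	struct rw_params cmp = { .start_lba = 16, .num_blocks = 8 };
	struct rw_params wr  = { .start_lba = 16, .num_blocks = 8 };

	printf("fused ok=%d\n", fused_ranges_match(&cmp, &wr));
	return 0;
}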
&num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_write_zeroes_blocks(desc, ch, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_flush_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int rc; + + /* As for NVMeoF controller, SPDK always set volatile write + * cache bit to 1, return success for those block devices + * which can't support FLUSH command. + */ + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_flush_blocks(desc, ch, 0, spdk_bdev_get_num_blocks(bdev), + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +struct nvmf_bdev_ctrlr_unmap { + struct spdk_nvmf_request *req; + uint32_t count; + struct spdk_bdev_desc *desc; + struct spdk_bdev *bdev; + struct spdk_io_channel *ch; + uint32_t range_index; +}; + +static void +nvmf_bdev_ctrlr_unmap_cpl(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct nvmf_bdev_ctrlr_unmap *unmap_ctx = cb_arg; + struct spdk_nvmf_request *req = unmap_ctx->req; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int sc, sct; + uint32_t cdw0; + + unmap_ctx->count--; + + if (response->status.sct == SPDK_NVME_SCT_GENERIC && + response->status.sc == SPDK_NVME_SC_SUCCESS) { + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); + response->cdw0 = cdw0; + response->status.sc = sc; + response->status.sct = sct; + } + + if (unmap_ctx->count == 0) { + spdk_nvmf_request_complete(req); + free(unmap_ctx); + } + spdk_bdev_free_io(bdev_io); +} + +static int +nvmf_bdev_ctrlr_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct nvmf_bdev_ctrlr_unmap *unmap_ctx); +static void +nvmf_bdev_ctrlr_unmap_resubmit(void *arg) +{ + struct nvmf_bdev_ctrlr_unmap *unmap_ctx = arg; + struct spdk_nvmf_request *req = unmap_ctx->req; + struct spdk_bdev_desc *desc = unmap_ctx->desc; + struct spdk_bdev *bdev = unmap_ctx->bdev; + struct spdk_io_channel *ch = unmap_ctx->ch; + + nvmf_bdev_ctrlr_unmap(bdev, desc, ch, req, unmap_ctx); +} + +static int +nvmf_bdev_ctrlr_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct nvmf_bdev_ctrlr_unmap *unmap_ctx) +{ + uint16_t nr, i; + struct 
spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + struct spdk_nvme_dsm_range *dsm_range; + uint64_t lba; + uint32_t lba_count; + int rc; + + nr = cmd->cdw10_bits.dsm.nr + 1; + if (nr * sizeof(struct spdk_nvme_dsm_range) > req->length) { + SPDK_ERRLOG("Dataset Management number of ranges > SGL length\n"); + response->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (unmap_ctx == NULL) { + unmap_ctx = calloc(1, sizeof(*unmap_ctx)); + if (!unmap_ctx) { + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + unmap_ctx->req = req; + unmap_ctx->desc = desc; + unmap_ctx->ch = ch; + unmap_ctx->bdev = bdev; + + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + } else { + unmap_ctx->count--; /* dequeued */ + } + + dsm_range = (struct spdk_nvme_dsm_range *)req->data; + for (i = unmap_ctx->range_index; i < nr; i++) { + lba = dsm_range[i].starting_lba; + lba_count = dsm_range[i].length; + + unmap_ctx->count++; + + rc = spdk_bdev_unmap_blocks(desc, ch, lba, lba_count, + nvmf_bdev_ctrlr_unmap_cpl, unmap_ctx); + if (rc) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_bdev_ctrlr_unmap_resubmit, unmap_ctx); + /* Unmap was not yet submitted to bdev */ + /* unmap_ctx->count will be decremented when the request is dequeued */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + unmap_ctx->count--; + /* We can't return here - we may have to wait for any other + * unmaps already sent to complete */ + break; + } + unmap_ctx->range_index++; + } + + if (unmap_ctx->count == 0) { + free(unmap_ctx); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +nvmf_bdev_ctrlr_dsm_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + if (cmd->cdw11_bits.dsm.ad) { + return nvmf_bdev_ctrlr_unmap(bdev, desc, ch, req, NULL); + } + + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +nvmf_bdev_ctrlr_nvme_passthru_io(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + int rc; + + rc = spdk_bdev_nvme_io_passthru(desc, ch, &req->cmd->nvme_cmd, req->data, req->length, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + spdk_nvmf_nvme_passthru_cmd_cb cb_fn) +{ + int rc; + + req->cmd_cb_fn = cb_fn; + + rc = spdk_bdev_nvme_admin_passthru(desc, ch, &req->cmd->nvme_cmd, req->data, req->length, + nvmf_bdev_ctrlr_complete_admin_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == 
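The Dataset Management path above expands one NVMe command into up to NR+1 unmap calls and completes the request only after every outstanding unmap has called back; the context's count field acts as that reference count. A standalone sketch of the range walk and completion counting, using a simplified range layout (attributes, length in blocks, starting LBA) and no SPDK calls:

#include <stdint.h>
#include <stdio.h>

/* Simplified Dataset Management range as carried in the command payload. */
struct dsm_range {
	uint32_t attributes;
	uint32_t length;        /* in blocks */
	uint64_t starting_lba;
};

struct unmap_ctx {
	uint32_t outstanding;   /* plays the role of unmap_ctx->count above */
};

/* Completion callback: only the last unmap finishes the NVMe command. */
static void unmap_done(struct unmap_ctx *ctx)
{
	if (--ctx->outstanding == 0) {
		printf("all ranges unmapped, complete the request\n");
	}
}

int main(void)
{
	/* NR is 0's based: a field value of 1 means two ranges. */
	uint16_t nr = 1 + 1;
	struct dsm_range ranges[2] = {
		{ 0, 128, 0 },
		{ 0, 64, 4096 },
	};
	struct unmap_ctx ctx = { 0 };

	for (uint16_t i = 0; i < nr; i++) {
		ctx.outstanding++;
		printf("unmap lba=%llu count=%u\n",
		       (unsigned long long)ranges[i].starting_lba,
		       (unsigned)ranges[i].length);
	}

	/* Simulate the per-range completions arriving later. */
	for (uint16_t i = 0; i < nr; i++) {
		unmap_done(&ctx);
	}
	return 0;
}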
-ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_admin_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static void +nvmf_bdev_ctrlr_complete_abort_cmd(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_nvmf_request *req = cb_arg; + + if (success) { + req->rsp->nvme_cpl.cdw0 &= ~1U; + } + + spdk_nvmf_request_complete(req); + spdk_bdev_free_io(bdev_io); +} + +int +spdk_nvmf_bdev_ctrlr_abort_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct spdk_nvmf_request *req_to_abort) +{ + int rc; + + assert((req->rsp->nvme_cpl.cdw0 & 1U) != 0); + + rc = spdk_bdev_abort(desc, ch, req_to_abort, nvmf_bdev_ctrlr_complete_abort_cmd, req); + if (spdk_likely(rc == 0)) { + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } else if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_admin_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } else { + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +bool +nvmf_bdev_ctrlr_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_nvme_cmd *cmd, + struct spdk_dif_ctx *dif_ctx) +{ + uint32_t init_ref_tag, dif_check_flags = 0; + int rc; + + if (spdk_bdev_get_md_size(bdev) == 0) { + return false; + } + + /* Initial Reference Tag is the lower 32 bits of the start LBA. */ + init_ref_tag = (uint32_t)from_le64(&cmd->cdw10); + + if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) { + dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK; + } + + if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) { + dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK; + } + + rc = spdk_dif_ctx_init(dif_ctx, + spdk_bdev_get_block_size(bdev), + spdk_bdev_get_md_size(bdev), + spdk_bdev_is_md_interleaved(bdev), + spdk_bdev_is_dif_head_of_md(bdev), + spdk_bdev_get_dif_type(bdev), + dif_check_flags, + init_ref_tag, 0, 0, 0, 0); + + return (rc == 0) ? true : false; +} diff --git a/src/spdk/lib/nvmf/ctrlr_discovery.c b/src/spdk/lib/nvmf/ctrlr_discovery.c new file mode 100644 index 000000000..ab1c46ba1 --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr_discovery.c @@ -0,0 +1,159 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over Fabrics discovery service + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/nvmf_spec.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +static struct spdk_nvmf_discovery_log_page * +nvmf_generate_discovery_log(struct spdk_nvmf_tgt *tgt, const char *hostnqn, size_t *log_page_size) +{ + uint64_t numrec = 0; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_subsystem_listener *listener; + struct spdk_nvmf_discovery_log_page_entry *entry; + struct spdk_nvmf_discovery_log_page *disc_log; + size_t cur_size; + uint32_t sid; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Generating log page for genctr %" PRIu64 "\n", + tgt->discovery_genctr); + + cur_size = sizeof(struct spdk_nvmf_discovery_log_page); + disc_log = calloc(1, cur_size); + if (disc_log == NULL) { + SPDK_ERRLOG("Discovery log page memory allocation error\n"); + return NULL; + } + + for (sid = 0; sid < tgt->max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if ((subsystem == NULL) || + (subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE) || + (subsystem->state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING)) { + continue; + } + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + continue; + } + + if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) { + continue; + } + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + size_t new_size = cur_size + sizeof(*entry); + void *new_log_page = realloc(disc_log, new_size); + + if (new_log_page == NULL) { + SPDK_ERRLOG("Discovery log page memory allocation error\n"); + break; + } + + disc_log = new_log_page; + cur_size = new_size; + + entry = &disc_log->entries[numrec]; + memset(entry, 0, sizeof(*entry)); + entry->portid = numrec; + entry->cntlid = 0xffff; + entry->asqsz = listener->transport->opts.max_aq_depth; + entry->subtype = subsystem->subtype; + snprintf(entry->subnqn, sizeof(entry->subnqn), "%s", subsystem->subnqn); + + nvmf_transport_listener_discover(listener->transport, listener->trid, entry); + + numrec++; + } + } + + disc_log->numrec = numrec; + disc_log->genctr = tgt->discovery_genctr; + *log_page_size = cur_size; + + return disc_log; +} + +void +nvmf_get_discovery_log_page(struct spdk_nvmf_tgt *tgt, const char *hostnqn, struct iovec *iov, + uint32_t iovcnt, uint64_t offset, uint32_t length) +{ + size_t copy_len = 0; + size_t zero_len = 0; + struct iovec *tmp; + size_t log_page_size = 0; + struct spdk_nvmf_discovery_log_page *discovery_log_page; + + discovery_log_page = nvmf_generate_discovery_log(tgt, hostnqn, &log_page_size); + + /* Copy the valid part of the discovery log page, if any */ + if (discovery_log_page) { + for (tmp = iov; tmp < iov + iovcnt; tmp++) { + copy_len = spdk_min(tmp->iov_len, length); + copy_len = spdk_min(log_page_size - offset, copy_len); + + 
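nvmf_generate_discovery_log() above grows the log page by one entry per allowed listener; the copy loop here then transfers only the requested offset/length window into the caller's iovec and zero-fills whatever the valid log cannot cover. A small standalone sketch of that windowed copy over plain buffers rather than iovecs:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

/* Copy log[offset .. offset+length) into buf, zero-filling the part the
 * log cannot cover.  Mirrors the intent of the iovec loop above. */
static void copy_log_window(const uint8_t *log, size_t log_size,
			    uint64_t offset, uint8_t *buf, size_t length)
{
	size_t copy_len = 0;

	if (offset < log_size) {
		copy_len = min_sz(length, log_size - offset);
		memcpy(buf, log + offset, copy_len);
	}
	memset(buf + copy_len, 0, length - copy_len);
}

int main(void)
{
	uint8_t log[16], buf[8];

	memset(log, 0xAA, sizeof(log));
	copy_log_window(log, sizeof(log), 12, buf, sizeof(buf));
	/* Bytes 0-3 come from the log, bytes 4-7 are zero padding. */
	printf("%02x %02x\n", buf[0], buf[7]);
	return 0;
}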
memcpy(tmp->iov_base, (char *)discovery_log_page + offset, copy_len); + + offset += copy_len; + length -= copy_len; + zero_len = tmp->iov_len - copy_len; + if (log_page_size <= offset || length == 0) { + break; + } + } + /* Zero out the rest of the payload */ + if (zero_len) { + memset((char *)tmp->iov_base + copy_len, 0, zero_len); + } + + for (++tmp; tmp < iov + iovcnt; tmp++) { + memset((char *)tmp->iov_base, 0, tmp->iov_len); + } + + free(discovery_log_page); + } +} diff --git a/src/spdk/lib/nvmf/fc.c b/src/spdk/lib/nvmf/fc.c new file mode 100644 index 000000000..678cfc681 --- /dev/null +++ b/src/spdk/lib/nvmf/fc.c @@ -0,0 +1,3957 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2018-2019 Broadcom. All Rights Reserved. + * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe_FC transport functions. 
+ */ + +#include "spdk/env.h" +#include "spdk/assert.h" +#include "spdk/nvmf_transport.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" +#include "spdk/likely.h" +#include "spdk/endian.h" +#include "spdk/log.h" +#include "spdk/thread.h" + +#include "spdk_internal/log.h" + +#include "nvmf_fc.h" +#include "fc_lld.h" + +#ifndef DEV_VERIFY +#define DEV_VERIFY assert +#endif + +#ifndef ASSERT_SPDK_FC_MASTER_THREAD +#define ASSERT_SPDK_FC_MASTER_THREAD() \ + DEV_VERIFY(spdk_get_thread() == nvmf_fc_get_master_thread()); +#endif + +/* + * PRLI service parameters + */ +enum spdk_nvmf_fc_service_parameters { + SPDK_NVMF_FC_FIRST_BURST_SUPPORTED = 0x0001, + SPDK_NVMF_FC_DISCOVERY_SERVICE = 0x0008, + SPDK_NVMF_FC_TARGET_FUNCTION = 0x0010, + SPDK_NVMF_FC_INITIATOR_FUNCTION = 0x0020, + SPDK_NVMF_FC_CONFIRMED_COMPLETION_SUPPORTED = 0x0080, +}; + +static char *fc_req_state_strs[] = { + "SPDK_NVMF_FC_REQ_INIT", + "SPDK_NVMF_FC_REQ_READ_BDEV", + "SPDK_NVMF_FC_REQ_READ_XFER", + "SPDK_NVMF_FC_REQ_READ_RSP", + "SPDK_NVMF_FC_REQ_WRITE_BUFFS", + "SPDK_NVMF_FC_REQ_WRITE_XFER", + "SPDK_NVMF_FC_REQ_WRITE_BDEV", + "SPDK_NVMF_FC_REQ_WRITE_RSP", + "SPDK_NVMF_FC_REQ_NONE_BDEV", + "SPDK_NVMF_FC_REQ_NONE_RSP", + "SPDK_NVMF_FC_REQ_SUCCESS", + "SPDK_NVMF_FC_REQ_FAILED", + "SPDK_NVMF_FC_REQ_ABORTED", + "SPDK_NVMF_FC_REQ_BDEV_ABORTED", + "SPDK_NVMF_FC_REQ_PENDING" +}; + +#define OBJECT_NVMF_FC_IO 0xA0 + +#define TRACE_GROUP_NVMF_FC 0x8 +#define TRACE_FC_REQ_INIT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x01) +#define TRACE_FC_REQ_READ_BDEV SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x02) +#define TRACE_FC_REQ_READ_XFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x03) +#define TRACE_FC_REQ_READ_RSP SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x04) +#define TRACE_FC_REQ_WRITE_BUFFS SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x05) +#define TRACE_FC_REQ_WRITE_XFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x06) +#define TRACE_FC_REQ_WRITE_BDEV SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x07) +#define TRACE_FC_REQ_WRITE_RSP SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x08) +#define TRACE_FC_REQ_NONE_BDEV SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x09) +#define TRACE_FC_REQ_NONE_RSP SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0A) +#define TRACE_FC_REQ_SUCCESS SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0B) +#define TRACE_FC_REQ_FAILED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0C) +#define TRACE_FC_REQ_ABORTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0D) +#define TRACE_FC_REQ_BDEV_ABORTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0E) +#define TRACE_FC_REQ_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0F) + +SPDK_TRACE_REGISTER_FN(nvmf_fc_trace, "nvmf_fc", TRACE_GROUP_NVMF_FC) +{ + spdk_trace_register_object(OBJECT_NVMF_FC_IO, 'r'); + spdk_trace_register_description("FC_REQ_NEW", + TRACE_FC_REQ_INIT, + OWNER_NONE, OBJECT_NVMF_FC_IO, 1, 1, ""); + spdk_trace_register_description("FC_REQ_READ_SUBMIT_TO_BDEV", + TRACE_FC_REQ_READ_BDEV, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_READ_XFER_DATA", + TRACE_FC_REQ_READ_XFER, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_READ_RSP", + TRACE_FC_REQ_READ_RSP, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_WRITE_NEED_BUFFER", + TRACE_FC_REQ_WRITE_BUFFS, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_WRITE_XFER_DATA", + TRACE_FC_REQ_WRITE_XFER, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_WRITE_SUBMIT_TO_BDEV", + TRACE_FC_REQ_WRITE_BDEV, + OWNER_NONE, 
OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_WRITE_RSP", + TRACE_FC_REQ_WRITE_RSP, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_NONE_SUBMIT_TO_BDEV", + TRACE_FC_REQ_NONE_BDEV, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_NONE_RSP", + TRACE_FC_REQ_NONE_RSP, + OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, ""); + spdk_trace_register_description("FC_REQ_SUCCESS", + TRACE_FC_REQ_SUCCESS, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("FC_REQ_FAILED", + TRACE_FC_REQ_FAILED, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("FC_REQ_ABORTED", + TRACE_FC_REQ_ABORTED, + OWNER_NONE, OBJECT_NONE, 0, 1, ""); + spdk_trace_register_description("FC_REQ_ABORTED_SUBMIT_TO_BDEV", + TRACE_FC_REQ_BDEV_ABORTED, + OWNER_NONE, OBJECT_NONE, 0, 1, ""); + spdk_trace_register_description("FC_REQ_PENDING", + TRACE_FC_REQ_PENDING, + OWNER_NONE, OBJECT_NONE, 0, 1, ""); +} + +/** + * The structure used by all fc adm functions + */ +struct spdk_nvmf_fc_adm_api_data { + void *api_args; + spdk_nvmf_fc_callback cb_func; +}; + +/** + * The callback structure for nport-delete + */ +struct spdk_nvmf_fc_adm_nport_del_cb_data { + struct spdk_nvmf_fc_nport *nport; + uint8_t port_handle; + spdk_nvmf_fc_callback fc_cb_func; + void *fc_cb_ctx; +}; + +/** + * The callback structure for it-delete + */ +struct spdk_nvmf_fc_adm_i_t_del_cb_data { + struct spdk_nvmf_fc_nport *nport; + struct spdk_nvmf_fc_remote_port_info *rport; + uint8_t port_handle; + spdk_nvmf_fc_callback fc_cb_func; + void *fc_cb_ctx; +}; + + +typedef void (*spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn)(void *arg, uint32_t err); + +/** + * The callback structure for the it-delete-assoc callback + */ +struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data { + struct spdk_nvmf_fc_nport *nport; + struct spdk_nvmf_fc_remote_port_info *rport; + uint8_t port_handle; + spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn cb_func; + void *cb_ctx; +}; + +/* + * Call back function pointer for HW port quiesce. 
+ */ +typedef void (*spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn)(void *ctx, int err); + +/** + * Context structure for quiescing a hardware port + */ +struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx { + int quiesce_count; + void *ctx; + spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn cb_func; +}; + +/** + * Context structure used to reset a hardware port + */ +struct spdk_nvmf_fc_adm_hw_port_reset_ctx { + void *reset_args; + spdk_nvmf_fc_callback reset_cb_func; +}; + +/** + * The callback structure for HW port link break event + */ +struct spdk_nvmf_fc_adm_port_link_break_cb_data { + struct spdk_nvmf_hw_port_link_break_args *args; + struct spdk_nvmf_fc_nport_delete_args nport_del_args; + spdk_nvmf_fc_callback cb_func; +}; + +struct spdk_nvmf_fc_transport { + struct spdk_nvmf_transport transport; + pthread_mutex_t lock; +}; + +static struct spdk_nvmf_fc_transport *g_nvmf_ftransport; + +static TAILQ_HEAD(, spdk_nvmf_fc_port) g_spdk_nvmf_fc_port_list = + TAILQ_HEAD_INITIALIZER(g_spdk_nvmf_fc_port_list); + +static struct spdk_thread *g_nvmf_fc_master_thread = NULL; + +static uint32_t g_nvmf_fgroup_count = 0; +static TAILQ_HEAD(, spdk_nvmf_fc_poll_group) g_nvmf_fgroups = + TAILQ_HEAD_INITIALIZER(g_nvmf_fgroups); + +struct spdk_thread * +nvmf_fc_get_master_thread(void) +{ + return g_nvmf_fc_master_thread; +} + +static inline void +nvmf_fc_record_req_trace_point(struct spdk_nvmf_fc_request *fc_req, + enum spdk_nvmf_fc_request_state state) +{ + uint16_t tpoint_id = SPDK_TRACE_MAX_TPOINT_ID; + + switch (state) { + case SPDK_NVMF_FC_REQ_INIT: + /* Start IO tracing */ + tpoint_id = TRACE_FC_REQ_INIT; + break; + case SPDK_NVMF_FC_REQ_READ_BDEV: + tpoint_id = TRACE_FC_REQ_READ_BDEV; + break; + case SPDK_NVMF_FC_REQ_READ_XFER: + tpoint_id = TRACE_FC_REQ_READ_XFER; + break; + case SPDK_NVMF_FC_REQ_READ_RSP: + tpoint_id = TRACE_FC_REQ_READ_RSP; + break; + case SPDK_NVMF_FC_REQ_WRITE_BUFFS: + tpoint_id = TRACE_FC_REQ_WRITE_BUFFS; + break; + case SPDK_NVMF_FC_REQ_WRITE_XFER: + tpoint_id = TRACE_FC_REQ_WRITE_XFER; + break; + case SPDK_NVMF_FC_REQ_WRITE_BDEV: + tpoint_id = TRACE_FC_REQ_WRITE_BDEV; + break; + case SPDK_NVMF_FC_REQ_WRITE_RSP: + tpoint_id = TRACE_FC_REQ_WRITE_RSP; + break; + case SPDK_NVMF_FC_REQ_NONE_BDEV: + tpoint_id = TRACE_FC_REQ_NONE_BDEV; + break; + case SPDK_NVMF_FC_REQ_NONE_RSP: + tpoint_id = TRACE_FC_REQ_NONE_RSP; + break; + case SPDK_NVMF_FC_REQ_SUCCESS: + tpoint_id = TRACE_FC_REQ_SUCCESS; + break; + case SPDK_NVMF_FC_REQ_FAILED: + tpoint_id = TRACE_FC_REQ_FAILED; + break; + case SPDK_NVMF_FC_REQ_ABORTED: + tpoint_id = TRACE_FC_REQ_ABORTED; + break; + case SPDK_NVMF_FC_REQ_BDEV_ABORTED: + tpoint_id = TRACE_FC_REQ_ABORTED; + break; + case SPDK_NVMF_FC_REQ_PENDING: + tpoint_id = TRACE_FC_REQ_PENDING; + break; + default: + assert(0); + break; + } + if (tpoint_id != SPDK_TRACE_MAX_TPOINT_ID) { + spdk_trace_record(tpoint_id, fc_req->poller_lcore, 0, + (uint64_t)(&fc_req->req), 0); + } +} + +static void +nvmf_fc_handle_connection_failure(void *arg) +{ + struct spdk_nvmf_fc_conn *fc_conn = arg; + struct spdk_nvmf_fc_ls_add_conn_api_data *api_data = NULL; + + if (!fc_conn->create_opd) { + return; + } + api_data = &fc_conn->create_opd->u.add_conn; + + nvmf_fc_ls_add_conn_failure(api_data->assoc, api_data->ls_rqst, + api_data->args.fc_conn, api_data->aq_conn); +} + +static void +nvmf_fc_handle_assoc_deletion(void *arg) +{ + struct spdk_nvmf_fc_conn *fc_conn = arg; + + nvmf_fc_delete_association(fc_conn->fc_assoc->tgtport, + fc_conn->fc_assoc->assoc_id, false, true, NULL, NULL); +} + +static int 
+nvmf_fc_create_req_mempool(struct spdk_nvmf_fc_hwqp *hwqp) +{ + uint32_t i; + struct spdk_nvmf_fc_request *fc_req; + + TAILQ_INIT(&hwqp->free_reqs); + TAILQ_INIT(&hwqp->in_use_reqs); + + hwqp->fc_reqs_buf = calloc(hwqp->rq_size, sizeof(struct spdk_nvmf_fc_request)); + if (hwqp->fc_reqs_buf == NULL) { + SPDK_ERRLOG("create fc request pool failed\n"); + return -ENOMEM; + } + + for (i = 0; i < hwqp->rq_size; i++) { + fc_req = hwqp->fc_reqs_buf + i; + + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_INIT); + TAILQ_INSERT_TAIL(&hwqp->free_reqs, fc_req, link); + } + + return 0; +} + +static inline struct spdk_nvmf_fc_request * +nvmf_fc_hwqp_alloc_fc_request(struct spdk_nvmf_fc_hwqp *hwqp) +{ + struct spdk_nvmf_fc_request *fc_req; + + if (TAILQ_EMPTY(&hwqp->free_reqs)) { + SPDK_ERRLOG("Alloc request buffer failed\n"); + return NULL; + } + + fc_req = TAILQ_FIRST(&hwqp->free_reqs); + TAILQ_REMOVE(&hwqp->free_reqs, fc_req, link); + + memset(fc_req, 0, sizeof(struct spdk_nvmf_fc_request)); + TAILQ_INSERT_TAIL(&hwqp->in_use_reqs, fc_req, link); + TAILQ_INIT(&fc_req->abort_cbs); + return fc_req; +} + +static inline void +nvmf_fc_hwqp_free_fc_request(struct spdk_nvmf_fc_hwqp *hwqp, struct spdk_nvmf_fc_request *fc_req) +{ + if (fc_req->state != SPDK_NVMF_FC_REQ_SUCCESS) { + /* Log an error for debug purpose. */ + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_FAILED); + } + + /* set the magic to mark req as no longer valid. */ + fc_req->magic = 0xDEADBEEF; + + TAILQ_REMOVE(&hwqp->in_use_reqs, fc_req, link); + TAILQ_INSERT_HEAD(&hwqp->free_reqs, fc_req, link); +} + +static inline bool +nvmf_fc_req_in_get_buff(struct spdk_nvmf_fc_request *fc_req) +{ + switch (fc_req->state) { + case SPDK_NVMF_FC_REQ_WRITE_BUFFS: + return true; + default: + return false; + } +} + +void +nvmf_fc_init_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp) +{ + nvmf_fc_init_rqpair_buffers(hwqp); +} + +struct spdk_nvmf_fc_conn * +nvmf_fc_hwqp_find_fc_conn(struct spdk_nvmf_fc_hwqp *hwqp, uint64_t conn_id) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + TAILQ_FOREACH(fc_conn, &hwqp->connection_list, link) { + if (fc_conn->conn_id == conn_id) { + return fc_conn; + } + } + + return NULL; +} + +void +nvmf_fc_hwqp_reinit_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp, void *queues_curr) +{ + struct spdk_nvmf_fc_abts_ctx *ctx; + struct spdk_nvmf_fc_poller_api_queue_sync_args *args = NULL, *tmp = NULL; + + /* Clean up any pending sync callbacks */ + TAILQ_FOREACH_SAFE(args, &hwqp->sync_cbs, link, tmp) { + TAILQ_REMOVE(&hwqp->sync_cbs, args, link); + ctx = args->cb_info.cb_data; + if (ctx) { + if (++ctx->hwqps_responded == ctx->num_hwqps) { + free(ctx->sync_poller_args); + free(ctx->abts_poller_args); + free(ctx); + } + } + } + + nvmf_fc_reinit_q(hwqp->queues, queues_curr); +} + +void +nvmf_fc_init_hwqp(struct spdk_nvmf_fc_port *fc_port, struct spdk_nvmf_fc_hwqp *hwqp) +{ + hwqp->fc_port = fc_port; + + /* clear counters */ + memset(&hwqp->counters, 0, sizeof(struct spdk_nvmf_fc_errors)); + + nvmf_fc_init_poller_queues(hwqp); + if (&fc_port->ls_queue != hwqp) { + nvmf_fc_create_req_mempool(hwqp); + } + + nvmf_fc_init_q(hwqp); + TAILQ_INIT(&hwqp->connection_list); + TAILQ_INIT(&hwqp->sync_cbs); + TAILQ_INIT(&hwqp->ls_pending_queue); +} + +static struct spdk_nvmf_fc_poll_group * +nvmf_fc_get_idlest_poll_group(void) +{ + uint32_t max_count = UINT32_MAX; + struct spdk_nvmf_fc_poll_group *fgroup; + struct spdk_nvmf_fc_poll_group *ret_fgroup = NULL; + + /* find poll group with least number of hwqp's assigned to it */ + TAILQ_FOREACH(fgroup, 
&g_nvmf_fgroups, link) { + if (fgroup->hwqp_count < max_count) { + ret_fgroup = fgroup; + max_count = fgroup->hwqp_count; + } + } + + return ret_fgroup; +} + +void +nvmf_fc_poll_group_add_hwqp(struct spdk_nvmf_fc_hwqp *hwqp) +{ + struct spdk_nvmf_fc_poll_group *fgroup = NULL; + + assert(hwqp); + if (hwqp == NULL) { + SPDK_ERRLOG("Error: hwqp is NULL\n"); + return; + } + + assert(g_nvmf_fgroup_count); + + fgroup = nvmf_fc_get_idlest_poll_group(); + if (!fgroup) { + SPDK_ERRLOG("Could not assign poll group for hwqp (%d)\n", hwqp->hwqp_id); + return; + } + + hwqp->thread = fgroup->group.group->thread; + hwqp->fgroup = fgroup; + fgroup->hwqp_count++; + nvmf_fc_poller_api_func(hwqp, SPDK_NVMF_FC_POLLER_API_ADD_HWQP, NULL); +} + +void +nvmf_fc_poll_group_remove_hwqp(struct spdk_nvmf_fc_hwqp *hwqp) +{ + assert(hwqp); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "Remove hwqp from poller: for port: %d, hwqp: %d\n", + hwqp->fc_port->port_hdl, hwqp->hwqp_id); + + if (!hwqp->fgroup) { + SPDK_ERRLOG("HWQP (%d) not assigned to poll group\n", hwqp->hwqp_id); + } else { + hwqp->fgroup->hwqp_count--; + nvmf_fc_poller_api_func(hwqp, SPDK_NVMF_FC_POLLER_API_REMOVE_HWQP, NULL); + } +} + +/* + * Note: This needs to be used only on master poller. + */ +static uint64_t +nvmf_fc_get_abts_unique_id(void) +{ + static uint32_t u_id = 0; + + return (uint64_t)(++u_id); +} + +static void +nvmf_fc_queue_synced_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct spdk_nvmf_fc_abts_ctx *ctx = cb_data; + struct spdk_nvmf_fc_poller_api_abts_recvd_args *args, *poller_arg; + + ctx->hwqps_responded++; + + if (ctx->hwqps_responded < ctx->num_hwqps) { + /* Wait for all pollers to complete. */ + return; + } + + /* Free the queue sync poller args. */ + free(ctx->sync_poller_args); + + /* Mark as queue synced */ + ctx->queue_synced = true; + + /* Reset the ctx values */ + ctx->hwqps_responded = 0; + ctx->handled = false; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "QueueSync(0x%lx) completed for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + + /* Resend ABTS to pollers */ + args = ctx->abts_poller_args; + for (int i = 0; i < ctx->num_hwqps; i++) { + poller_arg = args + i; + nvmf_fc_poller_api_func(poller_arg->hwqp, + SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED, + poller_arg); + } +} + +static int +nvmf_fc_handle_abts_notfound(struct spdk_nvmf_fc_abts_ctx *ctx) +{ + struct spdk_nvmf_fc_poller_api_queue_sync_args *args, *poller_arg; + struct spdk_nvmf_fc_poller_api_abts_recvd_args *abts_args, *abts_poller_arg; + + /* check if FC driver supports queue sync */ + if (!nvmf_fc_q_sync_available()) { + return -EPERM; + } + + assert(ctx); + if (!ctx) { + SPDK_ERRLOG("NULL ctx pointer"); + return -EINVAL; + } + + /* Reset the ctx values */ + ctx->hwqps_responded = 0; + + args = calloc(ctx->num_hwqps, + sizeof(struct spdk_nvmf_fc_poller_api_queue_sync_args)); + if (!args) { + SPDK_ERRLOG("QueueSync(0x%lx) failed for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + return -ENOMEM; + } + ctx->sync_poller_args = args; + + abts_args = ctx->abts_poller_args; + for (int i = 0; i < ctx->num_hwqps; i++) { + abts_poller_arg = abts_args + i; + poller_arg = args + i; + poller_arg->u_id = ctx->u_id; + poller_arg->hwqp = abts_poller_arg->hwqp; + poller_arg->cb_info.cb_func = nvmf_fc_queue_synced_cb; + poller_arg->cb_info.cb_data = ctx; + poller_arg->cb_info.cb_thread = spdk_get_thread(); + + /* Send a Queue sync 
message to interested pollers */ + nvmf_fc_poller_api_func(poller_arg->hwqp, + SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC, + poller_arg); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "QueueSync(0x%lx) Sent for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + + /* Post Marker to queue to track aborted request */ + nvmf_fc_issue_q_sync(ctx->ls_hwqp, ctx->u_id, ctx->fcp_rq_id); + + return 0; +} + +static void +nvmf_fc_abts_handled_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct spdk_nvmf_fc_abts_ctx *ctx = cb_data; + struct spdk_nvmf_fc_nport *nport = NULL; + + if (ret != SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND) { + ctx->handled = true; + } + + ctx->hwqps_responded++; + + if (ctx->hwqps_responded < ctx->num_hwqps) { + /* Wait for all pollers to complete. */ + return; + } + + nport = nvmf_fc_nport_find(ctx->port_hdl, ctx->nport_hdl); + + if (ctx->nport != nport) { + /* Nport can be deleted while this abort is being + * processed by the pollers. + */ + SPDK_NOTICELOG("nport_%d deleted while processing ABTS frame, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + ctx->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + } else { + if (!ctx->handled) { + /* Try syncing the queues and try one more time */ + if (!ctx->queue_synced && (nvmf_fc_handle_abts_notfound(ctx) == 0)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "QueueSync(0x%lx) for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + return; + } else { + /* Send Reject */ + nvmf_fc_xmt_bls_rsp(&ctx->nport->fc_port->ls_queue, + ctx->oxid, ctx->rxid, ctx->rpi, true, + FCNVME_BLS_REJECT_EXP_INVALID_OXID, NULL, NULL); + } + } else { + /* Send Accept */ + nvmf_fc_xmt_bls_rsp(&ctx->nport->fc_port->ls_queue, + ctx->oxid, ctx->rxid, ctx->rpi, false, + 0, NULL, NULL); + } + } + SPDK_NOTICELOG("BLS_%s sent for ABTS frame nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + (ctx->handled) ? "ACC" : "REJ", ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid); + + free(ctx->abts_poller_args); + free(ctx); +} + +void +nvmf_fc_handle_abts_frame(struct spdk_nvmf_fc_nport *nport, uint16_t rpi, + uint16_t oxid, uint16_t rxid) +{ + struct spdk_nvmf_fc_abts_ctx *ctx = NULL; + struct spdk_nvmf_fc_poller_api_abts_recvd_args *args = NULL, *poller_arg; + struct spdk_nvmf_fc_association *assoc = NULL; + struct spdk_nvmf_fc_conn *conn = NULL; + uint32_t hwqp_cnt = 0; + bool skip_hwqp_cnt; + struct spdk_nvmf_fc_hwqp **hwqps = NULL; + uint32_t i; + + SPDK_NOTICELOG("Handle ABTS frame for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + nport->nport_hdl, rpi, oxid, rxid); + + /* Allocate memory to track hwqp's with at least 1 active connection. */ + hwqps = calloc(nport->fc_port->num_io_queues, sizeof(struct spdk_nvmf_fc_hwqp *)); + if (hwqps == NULL) { + SPDK_ERRLOG("Unable to allocate temp. hwqp array for abts processing!\n"); + goto bls_rej; + } + + TAILQ_FOREACH(assoc, &nport->fc_associations, link) { + TAILQ_FOREACH(conn, &assoc->fc_conns, assoc_link) { + if (conn->rpi != rpi) { + continue; + } + + skip_hwqp_cnt = false; + for (i = 0; i < hwqp_cnt; i++) { + if (hwqps[i] == conn->hwqp) { + /* Skip. 
This is already present */ + skip_hwqp_cnt = true; + break; + } + } + if (!skip_hwqp_cnt) { + assert(hwqp_cnt < nport->fc_port->num_io_queues); + hwqps[hwqp_cnt] = conn->hwqp; + hwqp_cnt++; + } + } + } + + if (!hwqp_cnt) { + goto bls_rej; + } + + args = calloc(hwqp_cnt, + sizeof(struct spdk_nvmf_fc_poller_api_abts_recvd_args)); + if (!args) { + goto bls_rej; + } + + ctx = calloc(1, sizeof(struct spdk_nvmf_fc_abts_ctx)); + if (!ctx) { + goto bls_rej; + } + ctx->rpi = rpi; + ctx->oxid = oxid; + ctx->rxid = rxid; + ctx->nport = nport; + ctx->nport_hdl = nport->nport_hdl; + ctx->port_hdl = nport->fc_port->port_hdl; + ctx->num_hwqps = hwqp_cnt; + ctx->ls_hwqp = &nport->fc_port->ls_queue; + ctx->fcp_rq_id = nport->fc_port->fcp_rq_id; + ctx->abts_poller_args = args; + + /* Get a unique context for this ABTS */ + ctx->u_id = nvmf_fc_get_abts_unique_id(); + + for (i = 0; i < hwqp_cnt; i++) { + poller_arg = args + i; + poller_arg->hwqp = hwqps[i]; + poller_arg->cb_info.cb_func = nvmf_fc_abts_handled_cb; + poller_arg->cb_info.cb_data = ctx; + poller_arg->cb_info.cb_thread = spdk_get_thread(); + poller_arg->ctx = ctx; + + nvmf_fc_poller_api_func(poller_arg->hwqp, + SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED, + poller_arg); + } + + free(hwqps); + + return; +bls_rej: + free(args); + free(hwqps); + + /* Send Reject */ + nvmf_fc_xmt_bls_rsp(&nport->fc_port->ls_queue, oxid, rxid, rpi, + true, FCNVME_BLS_REJECT_EXP_NOINFO, NULL, NULL); + SPDK_NOTICELOG("BLS_RJT for ABTS frame for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + nport->nport_hdl, rpi, oxid, rxid); + return; +} + +/*** Accessor functions for the FC structures - BEGIN */ +/* + * Returns true if the port is in offline state. + */ +bool +nvmf_fc_port_is_offline(struct spdk_nvmf_fc_port *fc_port) +{ + if (fc_port && (fc_port->hw_port_status == SPDK_FC_PORT_OFFLINE)) { + return true; + } + + return false; +} + +/* + * Returns true if the port is in online state. 
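The ABTS handling above fans the abort out to every hwqp that carries a connection for the reporting RPI and compares hwqps_responded against num_hwqps to decide when all pollers have answered; only the last responder sends the BLS accept or reject. The same gather pattern in a standalone sketch with hypothetical callback plumbing:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Per-ABTS context: one broadcast, num_hwqps expected answers. */
struct abts_ctx {
	uint32_t num_hwqps;
	uint32_t hwqps_responded;
	bool handled;
};

/* Called once per hwqp poller; the last responder decides ACC vs RJT. */
static void abts_handled(struct abts_ctx *ctx, bool found)
{
	if (found) {
		ctx->handled = true;
	}
	if (++ctx->hwqps_responded < ctx->num_hwqps) {
		return;	/* still waiting for the other pollers */
	}
	printf("send BLS_%s\n", ctx->handled ? "ACC" : "RJT");
}

int main(void)
{
	struct abts_ctx ctx = { .num_hwqps = 3 };

	abts_handled(&ctx, false);
	abts_handled(&ctx, true);
	abts_handled(&ctx, false);	/* third answer triggers the response */
	return 0;
}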
+ */ +bool +nvmf_fc_port_is_online(struct spdk_nvmf_fc_port *fc_port) +{ + if (fc_port && (fc_port->hw_port_status == SPDK_FC_PORT_ONLINE)) { + return true; + } + + return false; +} + +int +nvmf_fc_port_set_online(struct spdk_nvmf_fc_port *fc_port) +{ + if (fc_port && (fc_port->hw_port_status != SPDK_FC_PORT_ONLINE)) { + fc_port->hw_port_status = SPDK_FC_PORT_ONLINE; + return 0; + } + + return -EPERM; +} + +int +nvmf_fc_port_set_offline(struct spdk_nvmf_fc_port *fc_port) +{ + if (fc_port && (fc_port->hw_port_status != SPDK_FC_PORT_OFFLINE)) { + fc_port->hw_port_status = SPDK_FC_PORT_OFFLINE; + return 0; + } + + return -EPERM; +} + +int +nvmf_fc_hwqp_set_online(struct spdk_nvmf_fc_hwqp *hwqp) +{ + if (hwqp && (hwqp->state != SPDK_FC_HWQP_ONLINE)) { + hwqp->state = SPDK_FC_HWQP_ONLINE; + /* reset some queue counters */ + hwqp->num_conns = 0; + return nvmf_fc_set_q_online_state(hwqp, true); + } + + return -EPERM; +} + +int +nvmf_fc_hwqp_set_offline(struct spdk_nvmf_fc_hwqp *hwqp) +{ + if (hwqp && (hwqp->state != SPDK_FC_HWQP_OFFLINE)) { + hwqp->state = SPDK_FC_HWQP_OFFLINE; + return nvmf_fc_set_q_online_state(hwqp, false); + } + + return -EPERM; +} + +void +nvmf_fc_port_add(struct spdk_nvmf_fc_port *fc_port) +{ + TAILQ_INSERT_TAIL(&g_spdk_nvmf_fc_port_list, fc_port, link); +} + +struct spdk_nvmf_fc_port * +nvmf_fc_port_lookup(uint8_t port_hdl) +{ + struct spdk_nvmf_fc_port *fc_port = NULL; + + TAILQ_FOREACH(fc_port, &g_spdk_nvmf_fc_port_list, link) { + if (fc_port->port_hdl == port_hdl) { + return fc_port; + } + } + return NULL; +} + +static void +nvmf_fc_port_cleanup(void) +{ + struct spdk_nvmf_fc_port *fc_port, *tmp; + struct spdk_nvmf_fc_hwqp *hwqp; + uint32_t i; + + TAILQ_FOREACH_SAFE(fc_port, &g_spdk_nvmf_fc_port_list, link, tmp) { + TAILQ_REMOVE(&g_spdk_nvmf_fc_port_list, fc_port, link); + for (i = 0; i < fc_port->num_io_queues; i++) { + hwqp = &fc_port->io_queues[i]; + if (hwqp->fc_reqs_buf) { + free(hwqp->fc_reqs_buf); + } + } + free(fc_port); + } +} + +uint32_t +nvmf_fc_get_prli_service_params(void) +{ + return (SPDK_NVMF_FC_DISCOVERY_SERVICE | SPDK_NVMF_FC_TARGET_FUNCTION); +} + +int +nvmf_fc_port_add_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport) +{ + if (fc_port) { + TAILQ_INSERT_TAIL(&fc_port->nport_list, nport, link); + fc_port->num_nports++; + return 0; + } + + return -EINVAL; +} + +int +nvmf_fc_port_remove_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport) +{ + if (fc_port && nport) { + TAILQ_REMOVE(&fc_port->nport_list, nport, link); + fc_port->num_nports--; + return 0; + } + + return -EINVAL; +} + +static struct spdk_nvmf_fc_nport * +nvmf_fc_nport_hdl_lookup(struct spdk_nvmf_fc_port *fc_port, uint16_t nport_hdl) +{ + struct spdk_nvmf_fc_nport *fc_nport = NULL; + + TAILQ_FOREACH(fc_nport, &fc_port->nport_list, link) { + if (fc_nport->nport_hdl == nport_hdl) { + return fc_nport; + } + } + + return NULL; +} + +struct spdk_nvmf_fc_nport * +nvmf_fc_nport_find(uint8_t port_hdl, uint16_t nport_hdl) +{ + struct spdk_nvmf_fc_port *fc_port = NULL; + + fc_port = nvmf_fc_port_lookup(port_hdl); + if (fc_port) { + return nvmf_fc_nport_hdl_lookup(fc_port, nport_hdl); + } + + return NULL; +} + +static inline int +nvmf_fc_hwqp_find_nport_and_rport(struct spdk_nvmf_fc_hwqp *hwqp, + uint32_t d_id, struct spdk_nvmf_fc_nport **nport, + uint32_t s_id, struct spdk_nvmf_fc_remote_port_info **rport) +{ + struct spdk_nvmf_fc_nport *n_port; + struct spdk_nvmf_fc_remote_port_info *r_port; + + assert(hwqp); + if (hwqp == NULL) { + 
SPDK_ERRLOG("Error: hwqp is NULL\n"); + return -EINVAL; + } + assert(nport); + if (nport == NULL) { + SPDK_ERRLOG("Error: nport is NULL\n"); + return -EINVAL; + } + assert(rport); + if (rport == NULL) { + SPDK_ERRLOG("Error: rport is NULL\n"); + return -EINVAL; + } + + TAILQ_FOREACH(n_port, &hwqp->fc_port->nport_list, link) { + if (n_port->d_id == d_id) { + TAILQ_FOREACH(r_port, &n_port->rem_port_list, link) { + if (r_port->s_id == s_id) { + *nport = n_port; + *rport = r_port; + return 0; + } + } + break; + } + } + + return -ENOENT; +} + +/* Returns true if the Nport is empty of all rem_ports */ +bool +nvmf_fc_nport_has_no_rport(struct spdk_nvmf_fc_nport *nport) +{ + if (nport && TAILQ_EMPTY(&nport->rem_port_list)) { + assert(nport->rport_count == 0); + return true; + } else { + return false; + } +} + +int +nvmf_fc_nport_set_state(struct spdk_nvmf_fc_nport *nport, + enum spdk_nvmf_fc_object_state state) +{ + if (nport) { + nport->nport_state = state; + return 0; + } else { + return -EINVAL; + } +} + +bool +nvmf_fc_nport_add_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port) +{ + if (nport && rem_port) { + TAILQ_INSERT_TAIL(&nport->rem_port_list, rem_port, link); + nport->rport_count++; + return 0; + } else { + return -EINVAL; + } +} + +bool +nvmf_fc_nport_remove_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port) +{ + if (nport && rem_port) { + TAILQ_REMOVE(&nport->rem_port_list, rem_port, link); + nport->rport_count--; + return 0; + } else { + return -EINVAL; + } +} + +int +nvmf_fc_rport_set_state(struct spdk_nvmf_fc_remote_port_info *rport, + enum spdk_nvmf_fc_object_state state) +{ + if (rport) { + rport->rport_state = state; + return 0; + } else { + return -EINVAL; + } +} +int +nvmf_fc_assoc_set_state(struct spdk_nvmf_fc_association *assoc, + enum spdk_nvmf_fc_object_state state) +{ + if (assoc) { + assoc->assoc_state = state; + return 0; + } else { + return -EINVAL; + } +} + +static struct spdk_nvmf_fc_association * +nvmf_ctrlr_get_fc_assoc(struct spdk_nvmf_ctrlr *ctrlr) +{ + struct spdk_nvmf_qpair *qpair = ctrlr->admin_qpair; + struct spdk_nvmf_fc_conn *fc_conn; + + if (!qpair) { + SPDK_ERRLOG("Controller %d has no associations\n", ctrlr->cntlid); + return NULL; + } + + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + + return fc_conn->fc_assoc; +} + +bool +nvmf_ctrlr_is_on_nport(uint8_t port_hdl, uint16_t nport_hdl, + struct spdk_nvmf_ctrlr *ctrlr) +{ + struct spdk_nvmf_fc_nport *fc_nport = NULL; + struct spdk_nvmf_fc_association *assoc = NULL; + + if (!ctrlr) { + return false; + } + + fc_nport = nvmf_fc_nport_find(port_hdl, nport_hdl); + if (!fc_nport) { + return false; + } + + assoc = nvmf_ctrlr_get_fc_assoc(ctrlr); + if (assoc && assoc->tgtport == fc_nport) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "Controller: %d corresponding to association: %p(%lu:%d) is on port: %d nport: %d\n", + ctrlr->cntlid, assoc, assoc->assoc_id, assoc->assoc_state, port_hdl, + nport_hdl); + return true; + } + return false; +} + +static inline bool +nvmf_fc_req_in_bdev(struct spdk_nvmf_fc_request *fc_req) +{ + switch (fc_req->state) { + case SPDK_NVMF_FC_REQ_READ_BDEV: + case SPDK_NVMF_FC_REQ_WRITE_BDEV: + case SPDK_NVMF_FC_REQ_NONE_BDEV: + return true; + default: + return false; + } +} + +static inline bool +nvmf_fc_req_in_pending(struct spdk_nvmf_fc_request *fc_req) +{ + struct spdk_nvmf_request *tmp = NULL; + + STAILQ_FOREACH(tmp, &fc_req->hwqp->fgroup->group.pending_buf_queue, buf_link) { + if (tmp == 
&fc_req->req) { + return true; + } + } + return false; +} + +static void +nvmf_fc_req_bdev_abort(void *arg1) +{ + struct spdk_nvmf_fc_request *fc_req = arg1; + struct spdk_nvmf_ctrlr *ctrlr = fc_req->req.qpair->ctrlr; + int i; + + /* Initial release - we don't have to abort Admin Queue or + * Fabric commands. The AQ commands supported at this time are + * Get-Log-Page, + * Identify + * Set Features + * Get Features + * AER -> Special case and handled differently. + * Every one of the above Admin commands (except AER) run + * to completion and so an Abort of such commands doesn't + * make sense. + */ + /* The Fabric commands supported are + * Property Set + * Property Get + * Connect -> Special case (async. handling). Not sure how to + * handle at this point. Let it run to completion. + */ + for (i = 0; i < NVMF_MAX_ASYNC_EVENTS; i++) { + if (ctrlr->aer_req[i] == &fc_req->req) { + SPDK_NOTICELOG("Abort AER request\n"); + nvmf_qpair_free_aer(fc_req->req.qpair); + } + } +} + +void +nvmf_fc_request_abort_complete(void *arg1) +{ + struct spdk_nvmf_fc_request *fc_req = + (struct spdk_nvmf_fc_request *)arg1; + struct spdk_nvmf_fc_caller_ctx *ctx = NULL, *tmp = NULL; + + /* Request abort completed. Notify all the callbacks */ + TAILQ_FOREACH_SAFE(ctx, &fc_req->abort_cbs, link, tmp) { + /* Notify */ + ctx->cb(fc_req->hwqp, 0, ctx->cb_args); + /* Remove */ + TAILQ_REMOVE(&fc_req->abort_cbs, ctx, link); + /* free */ + free(ctx); + } + + SPDK_NOTICELOG("FC Request(%p) in state :%s aborted\n", fc_req, + fc_req_state_strs[fc_req->state]); + + _nvmf_fc_request_free(fc_req); +} + +void +nvmf_fc_request_abort(struct spdk_nvmf_fc_request *fc_req, bool send_abts, + spdk_nvmf_fc_caller_cb cb, void *cb_args) +{ + struct spdk_nvmf_fc_caller_ctx *ctx = NULL; + bool kill_req = false; + + /* Add the cb to list */ + if (cb) { + ctx = calloc(1, sizeof(struct spdk_nvmf_fc_caller_ctx)); + if (!ctx) { + SPDK_ERRLOG("ctx alloc failed.\n"); + return; + } + ctx->cb = cb; + ctx->cb_args = cb_args; + + TAILQ_INSERT_TAIL(&fc_req->abort_cbs, ctx, link); + } + + if (!fc_req->is_aborted) { + /* Increment aborted command counter */ + fc_req->hwqp->counters.num_aborted++; + } + + /* If port is dead, skip abort wqe */ + kill_req = nvmf_fc_is_port_dead(fc_req->hwqp); + if (kill_req && nvmf_fc_req_in_xfer(fc_req)) { + fc_req->is_aborted = true; + goto complete; + } + + /* Check if the request is already marked for deletion */ + if (fc_req->is_aborted) { + return; + } + + /* Mark request as aborted */ + fc_req->is_aborted = true; + + /* If xchg is allocated, then save if we need to send abts or not. */ + if (fc_req->xchg) { + fc_req->xchg->send_abts = send_abts; + fc_req->xchg->aborted = true; + } + + if (fc_req->state == SPDK_NVMF_FC_REQ_BDEV_ABORTED) { + /* Aborted by backend */ + goto complete; + } else if (nvmf_fc_req_in_bdev(fc_req)) { + /* Notify bdev */ + spdk_thread_send_msg(fc_req->hwqp->thread, + nvmf_fc_req_bdev_abort, (void *)fc_req); + } else if (nvmf_fc_req_in_xfer(fc_req)) { + /* Notify HBA to abort this exchange */ + nvmf_fc_issue_abort(fc_req->hwqp, fc_req->xchg, NULL, NULL); + } else if (nvmf_fc_req_in_get_buff(fc_req)) { + /* Will be completed by request_complete callback. 
*/ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Abort req when getting buffers.\n"); + } else if (nvmf_fc_req_in_pending(fc_req)) { + /* Remove from pending */ + STAILQ_REMOVE(&fc_req->hwqp->fgroup->group.pending_buf_queue, &fc_req->req, + spdk_nvmf_request, buf_link); + goto complete; + } else { + /* Should never happen */ + SPDK_ERRLOG("Request in invalid state\n"); + goto complete; + } + + return; +complete: + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_ABORTED); + nvmf_fc_poller_api_func(fc_req->hwqp, SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE, + (void *)fc_req); +} + +static int +nvmf_fc_request_alloc_buffers(struct spdk_nvmf_fc_request *fc_req) +{ + uint32_t length = fc_req->req.length; + struct spdk_nvmf_fc_poll_group *fgroup = fc_req->hwqp->fgroup; + struct spdk_nvmf_transport_poll_group *group = &fgroup->group; + struct spdk_nvmf_transport *transport = group->transport; + + if (spdk_nvmf_request_get_buffers(&fc_req->req, group, transport, length)) { + return -ENOMEM; + } + + return 0; +} + +static int +nvmf_fc_request_execute(struct spdk_nvmf_fc_request *fc_req) +{ + /* Allocate an XCHG if we dont use send frame for this command. */ + if (!nvmf_fc_use_send_frame(&fc_req->req)) { + fc_req->xchg = nvmf_fc_get_xri(fc_req->hwqp); + if (!fc_req->xchg) { + fc_req->hwqp->counters.no_xchg++; + printf("NO XCHGs!\n"); + goto pending; + } + } + + if (fc_req->req.length) { + if (nvmf_fc_request_alloc_buffers(fc_req) < 0) { + fc_req->hwqp->counters.buf_alloc_err++; + goto pending; + } + fc_req->req.data = fc_req->req.iov[0].iov_base; + } + + if (fc_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "WRITE CMD.\n"); + + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_WRITE_XFER); + + if (nvmf_fc_recv_data(fc_req)) { + /* Dropped return success to caller */ + fc_req->hwqp->counters.unexpected_err++; + _nvmf_fc_request_free(fc_req); + } + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "READ/NONE CMD\n"); + + if (fc_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_READ_BDEV); + } else { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_NONE_BDEV); + } + spdk_nvmf_request_exec(&fc_req->req); + } + + return 0; + +pending: + if (fc_req->xchg) { + nvmf_fc_put_xchg(fc_req->hwqp, fc_req->xchg); + fc_req->xchg = NULL; + } + + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_PENDING); + + return -EAGAIN; +} + +static int +nvmf_fc_hwqp_handle_request(struct spdk_nvmf_fc_hwqp *hwqp, struct spdk_nvmf_fc_frame_hdr *frame, + uint32_t buf_idx, struct spdk_nvmf_fc_buffer_desc *buffer, uint32_t plen) +{ + uint16_t cmnd_len; + uint64_t rqst_conn_id; + struct spdk_nvmf_fc_request *fc_req = NULL; + struct spdk_nvmf_fc_cmnd_iu *cmd_iu = NULL; + struct spdk_nvmf_fc_conn *fc_conn = NULL; + enum spdk_nvme_data_transfer xfer; + + cmd_iu = buffer->virt; + cmnd_len = cmd_iu->cmnd_iu_len; + cmnd_len = from_be16(&cmnd_len); + + /* check for a valid cmnd_iu format */ + if ((cmd_iu->fc_id != FCNVME_CMND_IU_FC_ID) || + (cmd_iu->scsi_id != FCNVME_CMND_IU_SCSI_ID) || + (cmnd_len != sizeof(struct spdk_nvmf_fc_cmnd_iu) / 4)) { + SPDK_ERRLOG("IU CMD error\n"); + hwqp->counters.nvme_cmd_iu_err++; + return -ENXIO; + } + + xfer = spdk_nvme_opc_get_data_transfer(cmd_iu->flags); + if (xfer == SPDK_NVME_DATA_BIDIRECTIONAL) { + SPDK_ERRLOG("IU CMD xfer error\n"); + hwqp->counters.nvme_cmd_xfer_err++; + return -EPERM; + } + + rqst_conn_id = from_be64(&cmd_iu->conn_id); + + /* Check if conn id is valid */ + fc_conn = 
nvmf_fc_hwqp_find_fc_conn(hwqp, rqst_conn_id); + if (!fc_conn) { + SPDK_ERRLOG("IU CMD conn(%ld) invalid\n", rqst_conn_id); + hwqp->counters.invalid_conn_err++; + return -ENODEV; + } + + /* If association/connection is being deleted - return */ + if (fc_conn->fc_assoc->assoc_state != SPDK_NVMF_FC_OBJECT_CREATED) { + SPDK_ERRLOG("Association state not valid\n"); + return -EACCES; + } + + if (fc_conn->qpair.state == SPDK_NVMF_QPAIR_ERROR) { + return -EACCES; + } + + /* Make sure xfer len is according to mdts */ + if (from_be32(&cmd_iu->data_len) > + hwqp->fgroup->group.transport->opts.max_io_size) { + SPDK_ERRLOG("IO length requested is greater than MDTS\n"); + return -EINVAL; + } + + /* allocate a request buffer */ + fc_req = nvmf_fc_hwqp_alloc_fc_request(hwqp); + if (fc_req == NULL) { + /* Should not happen. Since fc_reqs == RQ buffers */ + return -ENOMEM; + } + + fc_req->req.length = from_be32(&cmd_iu->data_len); + fc_req->req.qpair = &fc_conn->qpair; + fc_req->req.cmd = (union nvmf_h2c_msg *)&cmd_iu->cmd; + fc_req->req.rsp = (union nvmf_c2h_msg *)&fc_req->ersp.rsp; + fc_req->oxid = frame->ox_id; + fc_req->oxid = from_be16(&fc_req->oxid); + fc_req->rpi = fc_conn->rpi; + fc_req->buf_index = buf_idx; + fc_req->poller_lcore = hwqp->lcore_id; + fc_req->poller_thread = hwqp->thread; + fc_req->hwqp = hwqp; + fc_req->fc_conn = fc_conn; + fc_req->req.xfer = xfer; + fc_req->s_id = (uint32_t)frame->s_id; + fc_req->d_id = (uint32_t)frame->d_id; + fc_req->s_id = from_be32(&fc_req->s_id) >> 8; + fc_req->d_id = from_be32(&fc_req->d_id) >> 8; + + nvmf_fc_record_req_trace_point(fc_req, SPDK_NVMF_FC_REQ_INIT); + if (nvmf_fc_request_execute(fc_req)) { + STAILQ_INSERT_TAIL(&hwqp->fgroup->group.pending_buf_queue, &fc_req->req, buf_link); + } + + return 0; +} + +/* + * These functions are called from the FC LLD + */ + +void +_nvmf_fc_request_free(struct spdk_nvmf_fc_request *fc_req) +{ + struct spdk_nvmf_fc_hwqp *hwqp = fc_req->hwqp; + struct spdk_nvmf_fc_poll_group *fgroup = hwqp->fgroup; + struct spdk_nvmf_transport_poll_group *group = &fgroup->group; + struct spdk_nvmf_transport *transport = group->transport; + + if (!fc_req) { + return; + } + + if (fc_req->xchg) { + nvmf_fc_put_xchg(hwqp, fc_req->xchg); + fc_req->xchg = NULL; + } + + /* Release IO buffers */ + if (fc_req->req.data_from_pool) { + spdk_nvmf_request_free_buffers(&fc_req->req, group, transport); + } + fc_req->req.data = NULL; + fc_req->req.iovcnt = 0; + + /* Release Q buffer */ + nvmf_fc_rqpair_buffer_release(hwqp, fc_req->buf_index); + + /* Free Fc request */ + nvmf_fc_hwqp_free_fc_request(hwqp, fc_req); +} + +void +nvmf_fc_request_set_state(struct spdk_nvmf_fc_request *fc_req, + enum spdk_nvmf_fc_request_state state) +{ + assert(fc_req->magic != 0xDEADBEEF); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "FC Request(%p):\n\tState Old:%s New:%s\n", fc_req, + nvmf_fc_request_get_state_str(fc_req->state), + nvmf_fc_request_get_state_str(state)); + nvmf_fc_record_req_trace_point(fc_req, state); + fc_req->state = state; +} + +char * +nvmf_fc_request_get_state_str(int state) +{ + static char *unk_str = "unknown"; + + return (state >= 0 && state < (int)(sizeof(fc_req_state_strs) / sizeof(char *)) ? 
+ fc_req_state_strs[state] : unk_str); +} + +int +nvmf_fc_hwqp_process_frame(struct spdk_nvmf_fc_hwqp *hwqp, + uint32_t buff_idx, + struct spdk_nvmf_fc_frame_hdr *frame, + struct spdk_nvmf_fc_buffer_desc *buffer, + uint32_t plen) +{ + int rc = 0; + uint32_t s_id, d_id; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_remote_port_info *rport = NULL; + + s_id = (uint32_t)frame->s_id; + d_id = (uint32_t)frame->d_id; + s_id = from_be32(&s_id) >> 8; + d_id = from_be32(&d_id) >> 8; + + /* Note: In tracelog below, we directly do endian conversion on rx_id and. + * ox_id Since these are fields, we can't pass address to from_be16(). + * Since ox_id and rx_id are only needed for tracelog, assigning to local + * vars. and doing conversion is a waste of time in non-debug builds. */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, + "Process NVME frame s_id:0x%x d_id:0x%x oxid:0x%x rxid:0x%x.\n", + s_id, d_id, + ((frame->ox_id << 8) & 0xff00) | ((frame->ox_id >> 8) & 0xff), + ((frame->rx_id << 8) & 0xff00) | ((frame->rx_id >> 8) & 0xff)); + + rc = nvmf_fc_hwqp_find_nport_and_rport(hwqp, d_id, &nport, s_id, &rport); + if (rc) { + if (nport == NULL) { + SPDK_ERRLOG("Nport not found. Dropping\n"); + /* increment invalid nport counter */ + hwqp->counters.nport_invalid++; + } else if (rport == NULL) { + SPDK_ERRLOG("Rport not found. Dropping\n"); + /* increment invalid rport counter */ + hwqp->counters.rport_invalid++; + } + return rc; + } + + if (nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED || + rport->rport_state != SPDK_NVMF_FC_OBJECT_CREATED) { + SPDK_ERRLOG("%s state not created. Dropping\n", + nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED ? + "Nport" : "Rport"); + return -EACCES; + } + + if ((frame->r_ctl == FCNVME_R_CTL_LS_REQUEST) && + (frame->type == FCNVME_TYPE_NVMF_DATA)) { + struct spdk_nvmf_fc_rq_buf_ls_request *req_buf = buffer->virt; + struct spdk_nvmf_fc_ls_rqst *ls_rqst; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Process LS NVME frame\n"); + + /* Use the RQ buffer for holding LS request. */ + ls_rqst = (struct spdk_nvmf_fc_ls_rqst *)&req_buf->ls_rqst; + + /* Fill in the LS request structure */ + ls_rqst->rqstbuf.virt = (void *)&req_buf->rqst; + ls_rqst->rqstbuf.phys = buffer->phys + + offsetof(struct spdk_nvmf_fc_rq_buf_ls_request, rqst); + ls_rqst->rqstbuf.buf_index = buff_idx; + ls_rqst->rqst_len = plen; + + ls_rqst->rspbuf.virt = (void *)&req_buf->resp; + ls_rqst->rspbuf.phys = buffer->phys + + offsetof(struct spdk_nvmf_fc_rq_buf_ls_request, resp); + ls_rqst->rsp_len = FCNVME_MAX_LS_RSP_SIZE; + + ls_rqst->private_data = (void *)hwqp; + ls_rqst->rpi = rport->rpi; + ls_rqst->oxid = (uint16_t)frame->ox_id; + ls_rqst->oxid = from_be16(&ls_rqst->oxid); + ls_rqst->s_id = s_id; + ls_rqst->d_id = d_id; + ls_rqst->nport = nport; + ls_rqst->rport = rport; + ls_rqst->nvmf_tgt = g_nvmf_ftransport->transport.tgt; + + ls_rqst->xchg = nvmf_fc_get_xri(hwqp); + if (ls_rqst->xchg) { + /* Handover the request to LS module */ + nvmf_fc_handle_ls_rqst(ls_rqst); + } else { + /* No XCHG available. Add to pending list. */ + hwqp->counters.no_xchg++; + TAILQ_INSERT_TAIL(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link); + } + } else if ((frame->r_ctl == FCNVME_R_CTL_CMD_REQ) && + (frame->type == FCNVME_TYPE_FC_EXCHANGE)) { + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Process IO NVME frame\n"); + rc = nvmf_fc_hwqp_handle_request(hwqp, frame, buff_idx, buffer, plen); + } else { + + SPDK_ERRLOG("Unknown frame received. 
Dropping\n"); + hwqp->counters.unknown_frame++; + rc = -EINVAL; + } + + return rc; +} + +void +nvmf_fc_hwqp_process_pending_reqs(struct spdk_nvmf_fc_hwqp *hwqp) +{ + struct spdk_nvmf_request *req = NULL, *tmp; + struct spdk_nvmf_fc_request *fc_req; + int budget = 64; + + if (!hwqp->fgroup) { + /* LS queue is tied to acceptor_poll group and LS pending requests + * are stagged and processed using hwqp->ls_pending_queue. + */ + return; + } + + STAILQ_FOREACH_SAFE(req, &hwqp->fgroup->group.pending_buf_queue, buf_link, tmp) { + fc_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_fc_request, req); + if (!nvmf_fc_request_execute(fc_req)) { + /* Succesfuly posted, Delete from pending. */ + STAILQ_REMOVE_HEAD(&hwqp->fgroup->group.pending_buf_queue, buf_link); + } + + if (budget) { + budget--; + } else { + return; + } + } +} + +void +nvmf_fc_hwqp_process_pending_ls_rqsts(struct spdk_nvmf_fc_hwqp *hwqp) +{ + struct spdk_nvmf_fc_ls_rqst *ls_rqst = NULL, *tmp; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_remote_port_info *rport = NULL; + + TAILQ_FOREACH_SAFE(ls_rqst, &hwqp->ls_pending_queue, ls_pending_link, tmp) { + /* lookup nport and rport again - make sure they are still valid */ + int rc = nvmf_fc_hwqp_find_nport_and_rport(hwqp, ls_rqst->d_id, &nport, ls_rqst->s_id, &rport); + if (rc) { + if (nport == NULL) { + SPDK_ERRLOG("Nport not found. Dropping\n"); + /* increment invalid nport counter */ + hwqp->counters.nport_invalid++; + } else if (rport == NULL) { + SPDK_ERRLOG("Rport not found. Dropping\n"); + /* increment invalid rport counter */ + hwqp->counters.rport_invalid++; + } + TAILQ_REMOVE(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link); + /* Return buffer to chip */ + nvmf_fc_rqpair_buffer_release(hwqp, ls_rqst->rqstbuf.buf_index); + continue; + } + if (nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED || + rport->rport_state != SPDK_NVMF_FC_OBJECT_CREATED) { + SPDK_ERRLOG("%s state not created. Dropping\n", + nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED ? + "Nport" : "Rport"); + TAILQ_REMOVE(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link); + /* Return buffer to chip */ + nvmf_fc_rqpair_buffer_release(hwqp, ls_rqst->rqstbuf.buf_index); + continue; + } + + ls_rqst->xchg = nvmf_fc_get_xri(hwqp); + if (ls_rqst->xchg) { + /* Got an XCHG */ + TAILQ_REMOVE(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link); + /* Handover the request to LS module */ + nvmf_fc_handle_ls_rqst(ls_rqst); + } else { + /* No more XCHGs. Stop processing. 
*/ + hwqp->counters.no_xchg++; + return; + } + } +} + +int +nvmf_fc_handle_rsp(struct spdk_nvmf_fc_request *fc_req) +{ + int rc = 0; + struct spdk_nvmf_request *req = &fc_req->req; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_fc_conn *fc_conn = nvmf_fc_get_conn(qpair); + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint16_t ersp_len = 0; + + /* set sq head value in resp */ + rsp->sqhd = nvmf_fc_advance_conn_sqhead(qpair); + + /* Increment connection responses */ + fc_conn->rsp_count++; + + if (nvmf_fc_send_ersp_required(fc_req, fc_conn->rsp_count, + fc_req->transfered_len)) { + /* Fill ERSP Len */ + to_be16(&ersp_len, (sizeof(struct spdk_nvmf_fc_ersp_iu) / + sizeof(uint32_t))); + fc_req->ersp.ersp_len = ersp_len; + + /* Fill RSN */ + to_be32(&fc_req->ersp.response_seq_no, fc_conn->rsn); + fc_conn->rsn++; + + /* Fill transfer length */ + to_be32(&fc_req->ersp.transferred_data_len, fc_req->transfered_len); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Posting ERSP.\n"); + rc = nvmf_fc_xmt_rsp(fc_req, (uint8_t *)&fc_req->ersp, + sizeof(struct spdk_nvmf_fc_ersp_iu)); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Posting RSP.\n"); + rc = nvmf_fc_xmt_rsp(fc_req, NULL, 0); + } + + return rc; +} + +bool +nvmf_fc_send_ersp_required(struct spdk_nvmf_fc_request *fc_req, + uint32_t rsp_cnt, uint32_t xfer_len) +{ + struct spdk_nvmf_request *req = &fc_req->req; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_fc_conn *fc_conn = nvmf_fc_get_conn(qpair); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint16_t status = *((uint16_t *)&rsp->status); + + /* + * Check if we need to send ERSP + * 1) For every N responses where N == ersp_ratio + * 2) Fabric commands. + * 3) Completion status failed or Completion dw0 or dw1 valid. + * 4) SQ == 90% full. + * 5) Transfer length not equal to CMD IU length + */ + + if (!(rsp_cnt % fc_conn->esrp_ratio) || + (cmd->opc == SPDK_NVME_OPC_FABRIC) || + (status & 0xFFFE) || rsp->cdw0 || rsp->rsvd1 || + (req->length != xfer_len)) { + return true; + } + return false; +} + +static int +nvmf_fc_request_complete(struct spdk_nvmf_request *req) +{ + int rc = 0; + struct spdk_nvmf_fc_request *fc_req = nvmf_fc_get_fc_req(req); + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + if (fc_req->is_aborted) { + /* Defer this to make sure we dont call io cleanup in same context. 
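+ * REQ_ABORT_COMPLETE ends in nvmf_fc_request_abort_complete() above, which notifies the registered abort callbacks and frees the request.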
*/ + nvmf_fc_poller_api_func(fc_req->hwqp, SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE, + (void *)fc_req); + } else if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && + req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_READ_XFER); + + rc = nvmf_fc_send_data(fc_req); + } else { + if (req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_WRITE_RSP); + } else if (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_READ_RSP); + } else { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_NONE_RSP); + } + + rc = nvmf_fc_handle_rsp(fc_req); + } + + if (rc) { + SPDK_ERRLOG("Error in request complete.\n"); + _nvmf_fc_request_free(fc_req); + } + return 0; +} + +struct spdk_nvmf_tgt * +nvmf_fc_get_tgt(void) +{ + if (g_nvmf_ftransport) { + return g_nvmf_ftransport->transport.tgt; + } + return NULL; +} + +/* + * FC Transport Public API begins here + */ + +#define SPDK_NVMF_FC_DEFAULT_MAX_QUEUE_DEPTH 128 +#define SPDK_NVMF_FC_DEFAULT_AQ_DEPTH 32 +#define SPDK_NVMF_FC_DEFAULT_MAX_QPAIRS_PER_CTRLR 5 +#define SPDK_NVMF_FC_DEFAULT_IN_CAPSULE_DATA_SIZE 0 +#define SPDK_NVMF_FC_DEFAULT_MAX_IO_SIZE 65536 +#define SPDK_NVMF_FC_DEFAULT_IO_UNIT_SIZE 4096 +#define SPDK_NVMF_FC_DEFAULT_NUM_SHARED_BUFFERS 8192 +#define SPDK_NVMF_FC_DEFAULT_MAX_SGE (SPDK_NVMF_FC_DEFAULT_MAX_IO_SIZE / \ + SPDK_NVMF_FC_DEFAULT_IO_UNIT_SIZE) + +static void +nvmf_fc_opts_init(struct spdk_nvmf_transport_opts *opts) +{ + opts->max_queue_depth = SPDK_NVMF_FC_DEFAULT_MAX_QUEUE_DEPTH; + opts->max_qpairs_per_ctrlr = SPDK_NVMF_FC_DEFAULT_MAX_QPAIRS_PER_CTRLR; + opts->in_capsule_data_size = SPDK_NVMF_FC_DEFAULT_IN_CAPSULE_DATA_SIZE; + opts->max_io_size = SPDK_NVMF_FC_DEFAULT_MAX_IO_SIZE; + opts->io_unit_size = SPDK_NVMF_FC_DEFAULT_IO_UNIT_SIZE; + opts->max_aq_depth = SPDK_NVMF_FC_DEFAULT_AQ_DEPTH; + opts->num_shared_buffers = SPDK_NVMF_FC_DEFAULT_NUM_SHARED_BUFFERS; +} + +static struct spdk_nvmf_transport * +nvmf_fc_create(struct spdk_nvmf_transport_opts *opts) +{ + uint32_t sge_count; + + SPDK_INFOLOG(SPDK_LOG_NVMF_FC, "*** FC Transport Init ***\n" + " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" + " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" + " max_aq_depth=%d\n", + opts->max_queue_depth, + opts->max_io_size, + opts->max_qpairs_per_ctrlr - 1, + opts->io_unit_size, + opts->max_aq_depth); + + if (g_nvmf_ftransport) { + SPDK_ERRLOG("Duplicate NVMF-FC transport create request!\n"); + return NULL; + } + + if (spdk_env_get_last_core() < 1) { + SPDK_ERRLOG("Not enough cores/threads (%d) to run NVMF-FC transport!\n", + spdk_env_get_last_core() + 1); + return NULL; + } + + sge_count = opts->max_io_size / opts->io_unit_size; + if (sge_count > SPDK_NVMF_FC_DEFAULT_MAX_SGE) { + SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); + return NULL; + } + + g_nvmf_fc_master_thread = spdk_get_thread(); + g_nvmf_fgroup_count = 0; + g_nvmf_ftransport = calloc(1, sizeof(*g_nvmf_ftransport)); + + if (!g_nvmf_ftransport) { + SPDK_ERRLOG("Failed to allocate NVMF-FC transport\n"); + return NULL; + } + + if (pthread_mutex_init(&g_nvmf_ftransport->lock, NULL)) { + SPDK_ERRLOG("pthread_mutex_init() failed\n"); + free(g_nvmf_ftransport); + g_nvmf_ftransport = NULL; + return NULL; + } + + /* initialize the low level FC driver */ + nvmf_fc_lld_init(); + + return &g_nvmf_ftransport->transport; +} + +static int +nvmf_fc_destroy(struct spdk_nvmf_transport *transport) +{ + if (transport) 
{ + struct spdk_nvmf_fc_transport *ftransport; + struct spdk_nvmf_fc_poll_group *fgroup, *pg_tmp; + + ftransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_fc_transport, transport); + + free(ftransport); + + /* clean up any FC poll groups still around */ + TAILQ_FOREACH_SAFE(fgroup, &g_nvmf_fgroups, link, pg_tmp) { + TAILQ_REMOVE(&g_nvmf_fgroups, fgroup, link); + free(fgroup); + } + g_nvmf_fgroup_count = 0; + + /* low level FC driver clean up */ + nvmf_fc_lld_fini(); + + nvmf_fc_port_cleanup(); + } + + return 0; +} + +static int +nvmf_fc_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + return 0; +} + +static void +nvmf_fc_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *_trid) +{ +} + +static uint32_t +nvmf_fc_accept(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_fc_port *fc_port = NULL; + uint32_t count = 0; + static bool start_lld = false; + + if (spdk_unlikely(!start_lld)) { + start_lld = true; + nvmf_fc_lld_start(); + } + + /* poll the LS queue on each port */ + TAILQ_FOREACH(fc_port, &g_spdk_nvmf_fc_port_list, link) { + if (fc_port->hw_port_status == SPDK_FC_PORT_ONLINE) { + count += nvmf_fc_process_queue(&fc_port->ls_queue); + } + } + + return count; +} + +static void +nvmf_fc_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + entry->trtype = (enum spdk_nvme_transport_type) SPDK_NVMF_TRTYPE_FC; + entry->adrfam = trid->adrfam; + entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED; + + spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); + spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); +} + +static struct spdk_nvmf_transport_poll_group * +nvmf_fc_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_fc_poll_group *fgroup; + struct spdk_nvmf_fc_transport *ftransport = + SPDK_CONTAINEROF(transport, struct spdk_nvmf_fc_transport, transport); + + fgroup = calloc(1, sizeof(struct spdk_nvmf_fc_poll_group)); + if (!fgroup) { + SPDK_ERRLOG("Unable to alloc FC poll group\n"); + return NULL; + } + + TAILQ_INIT(&fgroup->hwqp_list); + + pthread_mutex_lock(&ftransport->lock); + TAILQ_INSERT_TAIL(&g_nvmf_fgroups, fgroup, link); + g_nvmf_fgroup_count++; + pthread_mutex_unlock(&ftransport->lock); + + return &fgroup->group; +} + +static void +nvmf_fc_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_fc_poll_group *fgroup; + struct spdk_nvmf_fc_transport *ftransport = + SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_fc_transport, transport); + + fgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_fc_poll_group, group); + pthread_mutex_lock(&ftransport->lock); + TAILQ_REMOVE(&g_nvmf_fgroups, fgroup, link); + g_nvmf_fgroup_count--; + pthread_mutex_unlock(&ftransport->lock); + + free(fgroup); +} + +static int +nvmf_fc_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_fc_poll_group *fgroup; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_hwqp *hwqp = NULL; + struct spdk_nvmf_fc_ls_add_conn_api_data *api_data = NULL; + bool hwqp_found = false; + + fgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_fc_poll_group, group); + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + + TAILQ_FOREACH(hwqp, &fgroup->hwqp_list, link) { + if (fc_conn->fc_assoc->tgtport->fc_port == hwqp->fc_port) { + 
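+ /* The new connection's association and this hwqp belong to the same physical FC port. */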
hwqp_found = true; + break; + } + } + + if (!hwqp_found) { + SPDK_ERRLOG("No valid hwqp found for new QP.\n"); + goto err; + } + + if (!nvmf_fc_assign_conn_to_hwqp(hwqp, + &fc_conn->conn_id, + fc_conn->max_queue_depth)) { + SPDK_ERRLOG("Failed to get a connection id for new QP.\n"); + goto err; + } + + fc_conn->hwqp = hwqp; + + /* If this is for ADMIN connection, then update assoc ID. */ + if (fc_conn->qpair.qid == 0) { + fc_conn->fc_assoc->assoc_id = fc_conn->conn_id; + } + + api_data = &fc_conn->create_opd->u.add_conn; + nvmf_fc_poller_api_func(hwqp, SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION, &api_data->args); + return 0; +err: + return -1; +} + +static int +nvmf_fc_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + uint32_t count = 0; + struct spdk_nvmf_fc_poll_group *fgroup; + struct spdk_nvmf_fc_hwqp *hwqp; + + fgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_fc_poll_group, group); + + TAILQ_FOREACH(hwqp, &fgroup->hwqp_list, link) { + if (hwqp->state == SPDK_FC_HWQP_ONLINE) { + count += nvmf_fc_process_queue(hwqp); + } + } + + return (int) count; +} + +static int +nvmf_fc_request_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fc_request *fc_req = nvmf_fc_get_fc_req(req); + + if (!fc_req->is_aborted) { + nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_BDEV_ABORTED); + nvmf_fc_request_abort(fc_req, true, NULL, NULL); + } else { + nvmf_fc_request_abort_complete(fc_req); + } + return 0; +} + + +static void +nvmf_fc_close_qpair(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + + if (fc_conn->conn_id == NVMF_FC_INVALID_CONN_ID) { + /* QP creation failure in FC tranport. Cleanup. */ + spdk_thread_send_msg(nvmf_fc_get_master_thread(), + nvmf_fc_handle_connection_failure, fc_conn); + } else if (fc_conn->fc_assoc->assoc_id == fc_conn->conn_id && + fc_conn->fc_assoc->assoc_state != SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + /* Admin connection */ + spdk_thread_send_msg(nvmf_fc_get_master_thread(), + nvmf_fc_handle_assoc_deletion, fc_conn); + } +} + +static int +nvmf_fc_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + memcpy(trid, &fc_conn->trid, sizeof(struct spdk_nvme_transport_id)); + return 0; +} + +static int +nvmf_fc_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + memcpy(trid, &fc_conn->trid, sizeof(struct spdk_nvme_transport_id)); + return 0; +} + +static int +nvmf_fc_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair); + memcpy(trid, &fc_conn->trid, sizeof(struct spdk_nvme_transport_id)); + return 0; +} + +static void +nvmf_fc_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + spdk_nvmf_request_complete(req); +} + +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_fc = { + .name = "FC", + .type = (enum spdk_nvme_transport_type) SPDK_NVMF_TRTYPE_FC, + .opts_init = nvmf_fc_opts_init, + .create = nvmf_fc_create, + .destroy = nvmf_fc_destroy, + + .listen = nvmf_fc_listen, + .stop_listen = nvmf_fc_stop_listen, + .accept = nvmf_fc_accept, + + .listener_discover = 
nvmf_fc_discover, + + .poll_group_create = nvmf_fc_poll_group_create, + .poll_group_destroy = nvmf_fc_poll_group_destroy, + .poll_group_add = nvmf_fc_poll_group_add, + .poll_group_poll = nvmf_fc_poll_group_poll, + + .req_complete = nvmf_fc_request_complete, + .req_free = nvmf_fc_request_free, + .qpair_fini = nvmf_fc_close_qpair, + .qpair_get_peer_trid = nvmf_fc_qpair_get_peer_trid, + .qpair_get_local_trid = nvmf_fc_qpair_get_local_trid, + .qpair_get_listen_trid = nvmf_fc_qpair_get_listen_trid, + .qpair_abort_request = nvmf_fc_qpair_abort_request, +}; + +/* + * Re-initialize the FC-Port after an offline event. + * Only the queue information needs to be populated. XCHG, lcore and other hwqp information remains + * unchanged after the first initialization. + * + */ +static int +nvmf_fc_adm_hw_port_reinit_validate(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_hw_port_init_args *args) +{ + uint32_t i; + + /* Verify that the port was previously in offline or quiesced state */ + if (nvmf_fc_port_is_online(fc_port)) { + SPDK_ERRLOG("SPDK FC port %d already initialized and online.\n", args->port_handle); + return -EINVAL; + } + + /* Reinit information in new LS queue from previous queue */ + nvmf_fc_hwqp_reinit_poller_queues(&fc_port->ls_queue, args->ls_queue); + + fc_port->fcp_rq_id = args->fcp_rq_id; + + /* Initialize the LS queue */ + fc_port->ls_queue.queues = args->ls_queue; + nvmf_fc_init_poller_queues(fc_port->ls_queue.queues); + + for (i = 0; i < fc_port->num_io_queues; i++) { + /* Reinit information in new IO queue from previous queue */ + nvmf_fc_hwqp_reinit_poller_queues(&fc_port->io_queues[i], + args->io_queues[i]); + fc_port->io_queues[i].queues = args->io_queues[i]; + /* Initialize the IO queues */ + nvmf_fc_init_poller_queues(fc_port->io_queues[i].queues); + } + + fc_port->hw_port_status = SPDK_FC_PORT_OFFLINE; + + /* Validate the port information */ + DEV_VERIFY(TAILQ_EMPTY(&fc_port->nport_list)); + DEV_VERIFY(fc_port->num_nports == 0); + if (!TAILQ_EMPTY(&fc_port->nport_list) || (fc_port->num_nports != 0)) { + return -EINVAL; + } + + return 0; +} + +/* Initializes the data for the creation of a FC-Port object in the SPDK + * library. The spdk_nvmf_fc_port is a well defined structure that is part of + * the API to the library. The contents added to this well defined structure + * is private to each vendors implementation. + */ +static int +nvmf_fc_adm_hw_port_data_init(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_hw_port_init_args *args) +{ + /* Used a high number for the LS HWQP so that it does not clash with the + * IO HWQP's and immediately shows a LS queue during tracing. + */ + uint32_t i; + + fc_port->port_hdl = args->port_handle; + fc_port->hw_port_status = SPDK_FC_PORT_OFFLINE; + fc_port->fcp_rq_id = args->fcp_rq_id; + fc_port->num_io_queues = args->io_queue_cnt; + + /* + * Set port context from init args. Used for FCP port stats. + */ + fc_port->port_ctx = args->port_ctx; + + /* + * Initialize the LS queue wherever needed. + */ + fc_port->ls_queue.queues = args->ls_queue; + fc_port->ls_queue.thread = nvmf_fc_get_master_thread(); + fc_port->ls_queue.hwqp_id = SPDK_MAX_NUM_OF_FC_PORTS * fc_port->num_io_queues; + + /* + * Initialize the LS queue. + */ + nvmf_fc_init_hwqp(fc_port, &fc_port->ls_queue); + + /* + * Initialize the IO queues. 
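+ * Each IO hwqp takes its hwqp_id, queue pointers and RQ size from the init args.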
+ */ + for (i = 0; i < args->io_queue_cnt; i++) { + struct spdk_nvmf_fc_hwqp *hwqp = &fc_port->io_queues[i]; + hwqp->hwqp_id = i; + hwqp->queues = args->io_queues[i]; + hwqp->rq_size = args->io_queue_size; + nvmf_fc_init_hwqp(fc_port, hwqp); + } + + /* + * Initialize the LS processing for port + */ + nvmf_fc_ls_init(fc_port); + + /* + * Initialize the list of nport on this HW port. + */ + TAILQ_INIT(&fc_port->nport_list); + fc_port->num_nports = 0; + + return 0; +} + +static void +nvmf_fc_adm_port_hwqp_offline_del_poller(struct spdk_nvmf_fc_port *fc_port) +{ + struct spdk_nvmf_fc_hwqp *hwqp = NULL; + int i = 0; + + hwqp = &fc_port->ls_queue; + (void)nvmf_fc_hwqp_set_offline(hwqp); + + /* Remove poller for all the io queues. */ + for (i = 0; i < (int)fc_port->num_io_queues; i++) { + hwqp = &fc_port->io_queues[i]; + (void)nvmf_fc_hwqp_set_offline(hwqp); + nvmf_fc_poll_group_remove_hwqp(hwqp); + } +} + +/* + * Callback function for HW port link break operation. + * + * Notice that this callback is being triggered when spdk_fc_nport_delete() + * completes, if that spdk_fc_nport_delete() called is issued by + * nvmf_fc_adm_evnt_hw_port_link_break(). + * + * Since nvmf_fc_adm_evnt_hw_port_link_break() can invoke spdk_fc_nport_delete() multiple + * times (one per nport in the HW port's nport_list), a single call to + * nvmf_fc_adm_evnt_hw_port_link_break() can result in multiple calls to this callback function. + * + * As a result, this function only invokes a callback to the caller of + * nvmf_fc_adm_evnt_hw_port_link_break() only when the HW port's nport_list is empty. + */ +static void +nvmf_fc_adm_hw_port_link_break_cb(uint8_t port_handle, + enum spdk_fc_event event_type, void *cb_args, int spdk_err) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_port_link_break_cb_data *offline_cb_args = cb_args; + struct spdk_nvmf_hw_port_link_break_args *offline_args = NULL; + spdk_nvmf_fc_callback cb_func = NULL; + int err = 0; + struct spdk_nvmf_fc_port *fc_port = NULL; + int num_nports = 0; + char log_str[256]; + + if (0 != spdk_err) { + DEV_VERIFY(!"port link break cb: spdk_err not success."); + SPDK_ERRLOG("port link break cb: spdk_err:%d.\n", spdk_err); + goto out; + } + + if (!offline_cb_args) { + DEV_VERIFY(!"port link break cb: port_offline_args is NULL."); + err = -EINVAL; + goto out; + } + + offline_args = offline_cb_args->args; + if (!offline_args) { + DEV_VERIFY(!"port link break cb: offline_args is NULL."); + err = -EINVAL; + goto out; + } + + if (port_handle != offline_args->port_handle) { + DEV_VERIFY(!"port link break cb: port_handle mismatch."); + err = -EINVAL; + goto out; + } + + cb_func = offline_cb_args->cb_func; + if (!cb_func) { + DEV_VERIFY(!"port link break cb: cb_func is NULL."); + err = -EINVAL; + goto out; + } + + fc_port = nvmf_fc_port_lookup(port_handle); + if (!fc_port) { + DEV_VERIFY(!"port link break cb: fc_port is NULL."); + SPDK_ERRLOG("port link break cb: Unable to find port:%d\n", + offline_args->port_handle); + err = -EINVAL; + goto out; + } + + num_nports = fc_port->num_nports; + if (!TAILQ_EMPTY(&fc_port->nport_list)) { + /* + * Don't call the callback unless all nports have been deleted. + */ + goto out; + } + + if (num_nports != 0) { + DEV_VERIFY(!"port link break cb: num_nports in non-zero."); + SPDK_ERRLOG("port link break cb: # of ports should be 0. Instead, num_nports:%d\n", + num_nports); + err = -EINVAL; + } + + /* + * Mark the hwqps as offline and unregister the pollers. 
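+ * This mirrors the per-queue teardown performed by the explicit HW port offline event.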
+ */ + (void)nvmf_fc_adm_port_hwqp_offline_del_poller(fc_port); + + /* + * Since there are no more nports, execute the callback(s). + */ + (void)cb_func(port_handle, SPDK_FC_LINK_BREAK, + (void *)offline_args->cb_ctx, spdk_err); + +out: + free(offline_cb_args); + + snprintf(log_str, sizeof(log_str), + "port link break cb: port:%d evt_type:%d num_nports:%d err:%d spdk_err:%d.\n", + port_handle, event_type, num_nports, err, spdk_err); + + if (err != 0) { + SPDK_ERRLOG("%s", log_str); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } + return; +} + +/* + * FC port must have all its nports deleted before transitioning to offline state. + */ +static void +nvmf_fc_adm_hw_port_offline_nport_delete(struct spdk_nvmf_fc_port *fc_port) +{ + struct spdk_nvmf_fc_nport *nport = NULL; + /* All nports must have been deleted at this point for this fc port */ + DEV_VERIFY(fc_port && TAILQ_EMPTY(&fc_port->nport_list)); + DEV_VERIFY(fc_port->num_nports == 0); + /* Mark the nport states to be zombie, if they exist */ + if (fc_port && !TAILQ_EMPTY(&fc_port->nport_list)) { + TAILQ_FOREACH(nport, &fc_port->nport_list, link) { + (void)nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_ZOMBIE); + } + } +} + +static void +nvmf_fc_adm_i_t_delete_cb(void *args, uint32_t err) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_i_t_del_cb_data *cb_data = args; + struct spdk_nvmf_fc_nport *nport = cb_data->nport; + struct spdk_nvmf_fc_remote_port_info *rport = cb_data->rport; + spdk_nvmf_fc_callback cb_func = cb_data->fc_cb_func; + int spdk_err = 0; + uint8_t port_handle = cb_data->port_handle; + uint32_t s_id = rport->s_id; + uint32_t rpi = rport->rpi; + uint32_t assoc_count = rport->assoc_count; + uint32_t nport_hdl = nport->nport_hdl; + uint32_t d_id = nport->d_id; + char log_str[256]; + + /* + * Assert on any delete failure. + */ + if (0 != err) { + DEV_VERIFY(!"Error in IT Delete callback."); + goto out; + } + + if (cb_func != NULL) { + (void)cb_func(port_handle, SPDK_FC_IT_DELETE, cb_data->fc_cb_ctx, spdk_err); + } + +out: + free(cb_data); + + snprintf(log_str, sizeof(log_str), + "IT delete assoc_cb on nport %d done, port_handle:%d s_id:%d d_id:%d rpi:%d rport_assoc_count:%d rc = %d.\n", + nport_hdl, port_handle, s_id, d_id, rpi, assoc_count, err); + + if (err != 0) { + SPDK_ERRLOG("%s", log_str); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } +} + +static void +nvmf_fc_adm_i_t_delete_assoc_cb(void *args, uint32_t err) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data *cb_data = args; + struct spdk_nvmf_fc_nport *nport = cb_data->nport; + struct spdk_nvmf_fc_remote_port_info *rport = cb_data->rport; + spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn cb_func = cb_data->cb_func; + uint32_t s_id = rport->s_id; + uint32_t rpi = rport->rpi; + uint32_t assoc_count = rport->assoc_count; + uint32_t nport_hdl = nport->nport_hdl; + uint32_t d_id = nport->d_id; + char log_str[256]; + + /* + * Assert on any association delete failure. We continue to delete other + * associations in promoted builds. + */ + if (0 != err) { + DEV_VERIFY(!"Nport's association delete callback returned error"); + if (nport->assoc_count > 0) { + nport->assoc_count--; + } + if (rport->assoc_count > 0) { + rport->assoc_count--; + } + } + + /* + * If this is the last association being deleted for the ITN, + * execute the callback(s). + */ + if (0 == rport->assoc_count) { + /* Remove the rport from the remote port list. 
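+ * nvmf_fc_nport_remove_rem_port() also decrements the nport's rport_count.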
*/ + if (nvmf_fc_nport_remove_rem_port(nport, rport) != 0) { + SPDK_ERRLOG("Error while removing rport from list.\n"); + DEV_VERIFY(!"Error while removing rport from list."); + } + + if (cb_func != NULL) { + /* + * Callback function is provided by the caller + * of nvmf_fc_adm_i_t_delete_assoc(). + */ + (void)cb_func(cb_data->cb_ctx, 0); + } + free(rport); + free(args); + } + + snprintf(log_str, sizeof(log_str), + "IT delete assoc_cb on nport %d done, s_id:%d d_id:%d rpi:%d rport_assoc_count:%d err = %d.\n", + nport_hdl, s_id, d_id, rpi, assoc_count, err); + + if (err != 0) { + SPDK_ERRLOG("%s", log_str); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } +} + +/** + * Process a IT delete. + */ +static void +nvmf_fc_adm_i_t_delete_assoc(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rport, + spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn cb_func, + void *cb_ctx) +{ + int err = 0; + struct spdk_nvmf_fc_association *assoc = NULL; + int assoc_err = 0; + uint32_t num_assoc = 0; + uint32_t num_assoc_del_scheduled = 0; + struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data *cb_data = NULL; + uint8_t port_hdl = nport->port_hdl; + uint32_t s_id = rport->s_id; + uint32_t rpi = rport->rpi; + uint32_t assoc_count = rport->assoc_count; + char log_str[256]; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "IT delete associations on nport:%d begin.\n", + nport->nport_hdl); + + /* + * Allocate memory for callback data. + * This memory will be freed by the callback function. + */ + cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data)); + if (NULL == cb_data) { + SPDK_ERRLOG("Failed to allocate memory for cb_data on nport:%d.\n", nport->nport_hdl); + err = -ENOMEM; + goto out; + } + cb_data->nport = nport; + cb_data->rport = rport; + cb_data->port_handle = port_hdl; + cb_data->cb_func = cb_func; + cb_data->cb_ctx = cb_ctx; + + /* + * Delete all associations, if any, related with this ITN/remote_port. + */ + TAILQ_FOREACH(assoc, &nport->fc_associations, link) { + num_assoc++; + if (assoc->s_id == s_id) { + assoc_err = nvmf_fc_delete_association(nport, + assoc->assoc_id, + false /* send abts */, false, + nvmf_fc_adm_i_t_delete_assoc_cb, cb_data); + if (0 != assoc_err) { + /* + * Mark this association as zombie. + */ + err = -EINVAL; + DEV_VERIFY(!"Error while deleting association"); + (void)nvmf_fc_assoc_set_state(assoc, SPDK_NVMF_FC_OBJECT_ZOMBIE); + } else { + num_assoc_del_scheduled++; + } + } + } + +out: + if ((cb_data) && (num_assoc_del_scheduled == 0)) { + /* + * Since there are no association_delete calls + * successfully scheduled, the association_delete + * callback function will never be called. + * In this case, call the callback function now. + */ + nvmf_fc_adm_i_t_delete_assoc_cb(cb_data, 0); + } + + snprintf(log_str, sizeof(log_str), + "IT delete associations on nport:%d end. 
" + "s_id:%d rpi:%d assoc_count:%d assoc:%d assoc_del_scheduled:%d rc:%d.\n", + nport->nport_hdl, s_id, rpi, assoc_count, num_assoc, num_assoc_del_scheduled, err); + + if (err == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } else { + SPDK_ERRLOG("%s", log_str); + } +} + +static void +nvmf_fc_adm_queue_quiesce_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_poller_api_quiesce_queue_args *quiesce_api_data = NULL; + struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx *port_quiesce_ctx = NULL; + struct spdk_nvmf_fc_hwqp *hwqp = NULL; + struct spdk_nvmf_fc_port *fc_port = NULL; + int err = 0; + + quiesce_api_data = (struct spdk_nvmf_fc_poller_api_quiesce_queue_args *)cb_data; + hwqp = quiesce_api_data->hwqp; + fc_port = hwqp->fc_port; + port_quiesce_ctx = (struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx *)quiesce_api_data->ctx; + spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn cb_func = port_quiesce_ctx->cb_func; + + /* + * Decrement the callback/quiesced queue count. + */ + port_quiesce_ctx->quiesce_count--; + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Queue%d Quiesced\n", quiesce_api_data->hwqp->hwqp_id); + + free(quiesce_api_data); + /* + * Wait for call backs i.e. max_ioq_queues + LS QUEUE. + */ + if (port_quiesce_ctx->quiesce_count > 0) { + return; + } + + if (fc_port->hw_port_status == SPDK_FC_PORT_QUIESCED) { + SPDK_ERRLOG("Port %d already in quiesced state.\n", fc_port->port_hdl); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d quiesced.\n", fc_port->port_hdl); + fc_port->hw_port_status = SPDK_FC_PORT_QUIESCED; + } + + if (cb_func) { + /* + * Callback function for the called of quiesce. + */ + cb_func(port_quiesce_ctx->ctx, err); + } + + /* + * Free the context structure. + */ + free(port_quiesce_ctx); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d quiesce done, rc = %d.\n", fc_port->port_hdl, + err); +} + +static int +nvmf_fc_adm_hw_queue_quiesce(struct spdk_nvmf_fc_hwqp *fc_hwqp, void *ctx, + spdk_nvmf_fc_poller_api_cb cb_func) +{ + struct spdk_nvmf_fc_poller_api_quiesce_queue_args *args; + enum spdk_nvmf_fc_poller_api_ret rc = SPDK_NVMF_FC_POLLER_API_SUCCESS; + int err = 0; + + args = calloc(1, sizeof(struct spdk_nvmf_fc_poller_api_quiesce_queue_args)); + + if (args == NULL) { + err = -ENOMEM; + SPDK_ERRLOG("Failed to allocate memory for poller quiesce args, hwqp:%d\n", fc_hwqp->hwqp_id); + goto done; + } + args->hwqp = fc_hwqp; + args->ctx = ctx; + args->cb_info.cb_func = cb_func; + args->cb_info.cb_data = args; + args->cb_info.cb_thread = spdk_get_thread(); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Quiesce queue %d\n", fc_hwqp->hwqp_id); + rc = nvmf_fc_poller_api_func(fc_hwqp, SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE, args); + if (rc) { + free(args); + err = -EINVAL; + } + +done: + return err; +} + +/* + * Hw port Quiesce + */ +static int +nvmf_fc_adm_hw_port_quiesce(struct spdk_nvmf_fc_port *fc_port, void *ctx, + spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn cb_func) +{ + struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx *port_quiesce_ctx = NULL; + uint32_t i = 0; + int err = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port:%d is being quiesced.\n", fc_port->port_hdl); + + /* + * If the port is in an OFFLINE state, set the state to QUIESCED + * and execute the callback. 
+ */ + if (fc_port->hw_port_status == SPDK_FC_PORT_OFFLINE) { + fc_port->hw_port_status = SPDK_FC_PORT_QUIESCED; + } + + if (fc_port->hw_port_status == SPDK_FC_PORT_QUIESCED) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Port %d already in quiesced state.\n", + fc_port->port_hdl); + /* + * Execute the callback function directly. + */ + cb_func(ctx, err); + goto out; + } + + port_quiesce_ctx = calloc(1, sizeof(struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx)); + + if (port_quiesce_ctx == NULL) { + err = -ENOMEM; + SPDK_ERRLOG("Failed to allocate memory for LS queue quiesce ctx, port:%d\n", + fc_port->port_hdl); + goto out; + } + + port_quiesce_ctx->quiesce_count = 0; + port_quiesce_ctx->ctx = ctx; + port_quiesce_ctx->cb_func = cb_func; + + /* + * Quiesce the LS queue. + */ + err = nvmf_fc_adm_hw_queue_quiesce(&fc_port->ls_queue, port_quiesce_ctx, + nvmf_fc_adm_queue_quiesce_cb); + if (err != 0) { + SPDK_ERRLOG("Failed to quiesce the LS queue.\n"); + goto out; + } + port_quiesce_ctx->quiesce_count++; + + /* + * Quiesce the IO queues. + */ + for (i = 0; i < fc_port->num_io_queues; i++) { + err = nvmf_fc_adm_hw_queue_quiesce(&fc_port->io_queues[i], + port_quiesce_ctx, + nvmf_fc_adm_queue_quiesce_cb); + if (err != 0) { + DEV_VERIFY(0); + SPDK_ERRLOG("Failed to quiesce the IO queue:%d.\n", fc_port->io_queues[i].hwqp_id); + } + port_quiesce_ctx->quiesce_count++; + } + +out: + if (port_quiesce_ctx && err != 0) { + free(port_quiesce_ctx); + } + return err; +} + +/* + * Initialize and add a HW port entry to the global + * HW port list. + */ +static void +nvmf_fc_adm_evnt_hw_port_init(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_port *fc_port = NULL; + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_hw_port_init_args *args = (struct spdk_nvmf_fc_hw_port_init_args *) + api_data->api_args; + int err = 0; + + if (args->io_queue_cnt > spdk_env_get_core_count()) { + SPDK_ERRLOG("IO queues count greater than cores for %d.\n", args->port_handle); + err = EINVAL; + goto abort_port_init; + } + + /* + * 1. Check for duplicate initialization. + */ + fc_port = nvmf_fc_port_lookup(args->port_handle); + if (fc_port != NULL) { + /* Port already exists, check if it has to be re-initialized */ + err = nvmf_fc_adm_hw_port_reinit_validate(fc_port, args); + if (err) { + /* + * In case of an error we do not want to free the fc_port + * so we set that pointer to NULL. + */ + fc_port = NULL; + } + goto abort_port_init; + } + + /* + * 2. Get the memory to instantiate a fc port. + */ + fc_port = calloc(1, sizeof(struct spdk_nvmf_fc_port) + + (args->io_queue_cnt * sizeof(struct spdk_nvmf_fc_hwqp))); + if (fc_port == NULL) { + SPDK_ERRLOG("Failed to allocate memory for fc_port %d.\n", args->port_handle); + err = -ENOMEM; + goto abort_port_init; + } + + /* assign the io_queues array */ + fc_port->io_queues = (struct spdk_nvmf_fc_hwqp *)((uint8_t *)fc_port + sizeof( + struct spdk_nvmf_fc_port)); + + /* + * 3. Initialize the contents for the FC-port + */ + err = nvmf_fc_adm_hw_port_data_init(fc_port, args); + + if (err != 0) { + SPDK_ERRLOG("Data initialization failed for fc_port %d.\n", args->port_handle); + DEV_VERIFY(!"Data initialization failed for fc_port"); + goto abort_port_init; + } + + /* + * 4. Add this port to the global fc port list in the library. 
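+ * The port stays in SPDK_FC_PORT_OFFLINE state until a subsequent port online event brings it up.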
+ */ + nvmf_fc_port_add(fc_port); + +abort_port_init: + if (err && fc_port) { + free(fc_port); + } + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_INIT, args->cb_ctx, err); + } + + free(arg); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d initialize done, rc = %d.\n", + args->port_handle, err); +} + +/* + * Online a HW port. + */ +static void +nvmf_fc_adm_evnt_hw_port_online(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_port *fc_port = NULL; + struct spdk_nvmf_fc_hwqp *hwqp = NULL; + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_hw_port_online_args *args = (struct spdk_nvmf_fc_hw_port_online_args *) + api_data->api_args; + int i = 0; + int err = 0; + + fc_port = nvmf_fc_port_lookup(args->port_handle); + if (fc_port) { + /* Set the port state to online */ + err = nvmf_fc_port_set_online(fc_port); + if (err != 0) { + SPDK_ERRLOG("Hw port %d online failed. err = %d\n", fc_port->port_hdl, err); + DEV_VERIFY(!"Hw port online failed"); + goto out; + } + + hwqp = &fc_port->ls_queue; + hwqp->context = NULL; + (void)nvmf_fc_hwqp_set_online(hwqp); + + /* Cycle through all the io queues and setup a hwqp poller for each. */ + for (i = 0; i < (int)fc_port->num_io_queues; i++) { + hwqp = &fc_port->io_queues[i]; + hwqp->context = NULL; + (void)nvmf_fc_hwqp_set_online(hwqp); + nvmf_fc_poll_group_add_hwqp(hwqp); + } + } else { + SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle); + err = -EINVAL; + } + +out: + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_ONLINE, args->cb_ctx, err); + } + + free(arg); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d online done, rc = %d.\n", args->port_handle, + err); +} + +/* + * Offline a HW port. + */ +static void +nvmf_fc_adm_evnt_hw_port_offline(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_port *fc_port = NULL; + struct spdk_nvmf_fc_hwqp *hwqp = NULL; + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_hw_port_offline_args *args = (struct spdk_nvmf_fc_hw_port_offline_args *) + api_data->api_args; + int i = 0; + int err = 0; + + fc_port = nvmf_fc_port_lookup(args->port_handle); + if (fc_port) { + /* Set the port state to offline, if it is not already. */ + err = nvmf_fc_port_set_offline(fc_port); + if (err != 0) { + SPDK_ERRLOG("Hw port %d already offline. err = %d\n", fc_port->port_hdl, err); + err = 0; + goto out; + } + + hwqp = &fc_port->ls_queue; + (void)nvmf_fc_hwqp_set_offline(hwqp); + + /* Remove poller for all the io queues. */ + for (i = 0; i < (int)fc_port->num_io_queues; i++) { + hwqp = &fc_port->io_queues[i]; + (void)nvmf_fc_hwqp_set_offline(hwqp); + nvmf_fc_poll_group_remove_hwqp(hwqp); + } + + /* + * Delete all the nports. Ideally, the nports should have been purged + * before the offline event, in which case, only a validation is required. 
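+ * Any nport still on the list is only marked SPDK_NVMF_FC_OBJECT_ZOMBIE here; no memory is freed.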
+ */ + nvmf_fc_adm_hw_port_offline_nport_delete(fc_port); + } else { + SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle); + err = -EINVAL; + } +out: + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_OFFLINE, args->cb_ctx, err); + } + + free(arg); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d offline done, rc = %d.\n", args->port_handle, + err); +} + +struct nvmf_fc_add_rem_listener_ctx { + struct spdk_nvmf_subsystem *subsystem; + bool add_listener; + struct spdk_nvme_transport_id trid; +}; + +static void +nvmf_fc_adm_subsystem_resume_cb(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct nvmf_fc_add_rem_listener_ctx *ctx = (struct nvmf_fc_add_rem_listener_ctx *)cb_arg; + free(ctx); +} + +static void +nvmf_fc_adm_listen_done(void *cb_arg, int status) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct nvmf_fc_add_rem_listener_ctx *ctx = cb_arg; + + if (spdk_nvmf_subsystem_resume(ctx->subsystem, nvmf_fc_adm_subsystem_resume_cb, ctx)) { + SPDK_ERRLOG("Failed to resume subsystem: %s\n", ctx->subsystem->subnqn); + free(ctx); + } +} + +static void +nvmf_fc_adm_subsystem_paused_cb(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct nvmf_fc_add_rem_listener_ctx *ctx = (struct nvmf_fc_add_rem_listener_ctx *)cb_arg; + + if (ctx->add_listener) { + spdk_nvmf_subsystem_add_listener(subsystem, &ctx->trid, nvmf_fc_adm_listen_done, ctx); + } else { + spdk_nvmf_subsystem_remove_listener(subsystem, &ctx->trid); + nvmf_fc_adm_listen_done(ctx, 0); + } +} + +static int +nvmf_fc_adm_add_rem_nport_listener(struct spdk_nvmf_fc_nport *nport, bool add) +{ + struct spdk_nvmf_tgt *tgt = nvmf_fc_get_tgt(); + struct spdk_nvmf_subsystem *subsystem; + + if (!tgt) { + SPDK_ERRLOG("No nvmf target defined\n"); + return -EINVAL; + } + + subsystem = spdk_nvmf_subsystem_get_first(tgt); + while (subsystem) { + struct nvmf_fc_add_rem_listener_ctx *ctx; + + if (spdk_nvmf_subsytem_any_listener_allowed(subsystem) == true) { + ctx = calloc(1, sizeof(struct nvmf_fc_add_rem_listener_ctx)); + if (ctx) { + ctx->add_listener = add; + ctx->subsystem = subsystem; + nvmf_fc_create_trid(&ctx->trid, + nport->fc_nodename.u.wwn, + nport->fc_portname.u.wwn); + + if (spdk_nvmf_tgt_listen(subsystem->tgt, &ctx->trid)) { + SPDK_ERRLOG("Failed to add transport address %s to tgt listeners\n", + ctx->trid.traddr); + free(ctx); + } else if (spdk_nvmf_subsystem_pause(subsystem, + nvmf_fc_adm_subsystem_paused_cb, + ctx)) { + SPDK_ERRLOG("Failed to pause subsystem: %s\n", + subsystem->subnqn); + free(ctx); + } + } + } + + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + } + + return 0; +} + +/* + * Create a Nport. + */ +static void +nvmf_fc_adm_evnt_nport_create(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_nport_create_args *args = (struct spdk_nvmf_fc_nport_create_args *) + api_data->api_args; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_port *fc_port = NULL; + int err = 0; + + /* + * Get the physical port. + */ + fc_port = nvmf_fc_port_lookup(args->port_handle); + if (fc_port == NULL) { + err = -EINVAL; + goto out; + } + + /* + * Check for duplicate initialization. 
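+ * An nport handle that already exists on this FC port is rejected with -EINVAL.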
+ */ + nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle); + if (nport != NULL) { + SPDK_ERRLOG("Duplicate SPDK FC nport %d exists for FC port:%d.\n", args->nport_handle, + args->port_handle); + err = -EINVAL; + goto out; + } + + /* + * Get the memory to instantiate a fc nport. + */ + nport = calloc(1, sizeof(struct spdk_nvmf_fc_nport)); + if (nport == NULL) { + SPDK_ERRLOG("Failed to allocate memory for nport %d.\n", + args->nport_handle); + err = -ENOMEM; + goto out; + } + + /* + * Initialize the contents for the nport + */ + nport->nport_hdl = args->nport_handle; + nport->port_hdl = args->port_handle; + nport->nport_state = SPDK_NVMF_FC_OBJECT_CREATED; + nport->fc_nodename = args->fc_nodename; + nport->fc_portname = args->fc_portname; + nport->d_id = args->d_id; + nport->fc_port = nvmf_fc_port_lookup(args->port_handle); + + (void)nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_CREATED); + TAILQ_INIT(&nport->rem_port_list); + nport->rport_count = 0; + TAILQ_INIT(&nport->fc_associations); + nport->assoc_count = 0; + + /* + * Populate the nport address (as listening address) to the nvmf subsystems. + */ + err = nvmf_fc_adm_add_rem_nport_listener(nport, true); + + (void)nvmf_fc_port_add_nport(fc_port, nport); +out: + if (err && nport) { + free(nport); + } + + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_NPORT_CREATE, args->cb_ctx, err); + } + + free(arg); +} + +static void +nvmf_fc_adm_delete_nport_cb(uint8_t port_handle, enum spdk_fc_event event_type, + void *cb_args, int spdk_err) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_nport_del_cb_data *cb_data = cb_args; + struct spdk_nvmf_fc_nport *nport = cb_data->nport; + spdk_nvmf_fc_callback cb_func = cb_data->fc_cb_func; + int err = 0; + uint16_t nport_hdl = 0; + char log_str[256]; + + /* + * Assert on any delete failure. + */ + if (nport == NULL) { + SPDK_ERRLOG("Nport delete callback returned null nport"); + DEV_VERIFY(!"nport is null."); + goto out; + } + + nport_hdl = nport->nport_hdl; + if (0 != spdk_err) { + SPDK_ERRLOG("Nport delete callback returned error. FC Port: " + "%d, Nport: %d\n", + nport->port_hdl, nport->nport_hdl); + DEV_VERIFY(!"nport delete callback error."); + } + + /* + * Free the nport if this is the last rport being deleted and + * execute the callback(s). + */ + if (nvmf_fc_nport_has_no_rport(nport)) { + if (0 != nport->assoc_count) { + SPDK_ERRLOG("association count != 0\n"); + DEV_VERIFY(!"association count != 0"); + } + + err = nvmf_fc_port_remove_nport(nport->fc_port, nport); + if (0 != err) { + SPDK_ERRLOG("Nport delete callback: Failed to remove " + "nport from nport list. FC Port:%d Nport:%d\n", + nport->port_hdl, nport->nport_hdl); + } + /* Free the nport */ + free(nport); + + if (cb_func != NULL) { + (void)cb_func(cb_data->port_handle, SPDK_FC_NPORT_DELETE, cb_data->fc_cb_ctx, spdk_err); + } + free(cb_data); + } +out: + snprintf(log_str, sizeof(log_str), + "port:%d nport:%d delete cb exit, evt_type:%d rc:%d.\n", + port_handle, nport_hdl, event_type, spdk_err); + + if (err != 0) { + SPDK_ERRLOG("%s", log_str); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } +} + +/* + * Delete Nport. 
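+ * Removes the nport's listen addresses from all subsystems and schedules an IT delete for every remote port still attached to it.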
+ */ +static void +nvmf_fc_adm_evnt_nport_delete(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_nport_delete_args *args = (struct spdk_nvmf_fc_nport_delete_args *) + api_data->api_args; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_adm_nport_del_cb_data *cb_data = NULL; + struct spdk_nvmf_fc_remote_port_info *rport_iter = NULL; + int err = 0; + uint32_t rport_cnt = 0; + int rc = 0; + + /* + * Make sure that the nport exists. + */ + nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle); + if (nport == NULL) { + SPDK_ERRLOG("Unable to find the SPDK FC nport %d for FC Port: %d.\n", args->nport_handle, + args->port_handle); + err = -EINVAL; + goto out; + } + + /* + * Allocate memory for callback data. + */ + cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_nport_del_cb_data)); + if (NULL == cb_data) { + SPDK_ERRLOG("Failed to allocate memory for cb_data %d.\n", args->nport_handle); + err = -ENOMEM; + goto out; + } + + cb_data->nport = nport; + cb_data->port_handle = args->port_handle; + cb_data->fc_cb_func = api_data->cb_func; + cb_data->fc_cb_ctx = args->cb_ctx; + + /* + * Begin nport tear down + */ + if (nport->nport_state == SPDK_NVMF_FC_OBJECT_CREATED) { + (void)nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_TO_BE_DELETED); + } else if (nport->nport_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + /* + * Deletion of this nport already in progress. Register callback + * and return. + */ + /* TODO: Register callback in callback vector. For now, set the error and return. */ + err = -ENODEV; + goto out; + } else { + /* nport partially created/deleted */ + DEV_VERIFY(nport->nport_state == SPDK_NVMF_FC_OBJECT_ZOMBIE); + DEV_VERIFY(0 != "Nport in zombie state"); + err = -ENODEV; + goto out; + } + + /* + * Remove this nport from listening addresses across subsystems + */ + rc = nvmf_fc_adm_add_rem_nport_listener(nport, false); + + if (0 != rc) { + err = nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_ZOMBIE); + SPDK_ERRLOG("Unable to remove the listen addr in the subsystems for nport %d.\n", + nport->nport_hdl); + goto out; + } + + /* + * Delete all the remote ports (if any) for the nport + */ + /* TODO - Need to do this with a "first" and a "next" accessor function + * for completeness. Look at app-subsystem as examples. + */ + if (nvmf_fc_nport_has_no_rport(nport)) { + /* No rports to delete. Complete the nport deletion. 
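+ * The delete callback frees the nport and notifies the caller.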
*/ + nvmf_fc_adm_delete_nport_cb(nport->port_hdl, SPDK_FC_NPORT_DELETE, cb_data, 0); + goto out; + } + + TAILQ_FOREACH(rport_iter, &nport->rem_port_list, link) { + struct spdk_nvmf_fc_hw_i_t_delete_args *it_del_args = calloc( + 1, sizeof(struct spdk_nvmf_fc_hw_i_t_delete_args)); + + if (it_del_args == NULL) { + err = -ENOMEM; + SPDK_ERRLOG("SPDK_FC_IT_DELETE no mem to delete rport with rpi:%d s_id:%d.\n", + rport_iter->rpi, rport_iter->s_id); + DEV_VERIFY(!"SPDK_FC_IT_DELETE failed, cannot allocate memory"); + goto out; + } + + rport_cnt++; + it_del_args->port_handle = nport->port_hdl; + it_del_args->nport_handle = nport->nport_hdl; + it_del_args->cb_ctx = (void *)cb_data; + it_del_args->rpi = rport_iter->rpi; + it_del_args->s_id = rport_iter->s_id; + + nvmf_fc_master_enqueue_event(SPDK_FC_IT_DELETE, (void *)it_del_args, + nvmf_fc_adm_delete_nport_cb); + } + +out: + /* On failure, execute the callback function now */ + if ((err != 0) || (rc != 0)) { + SPDK_ERRLOG("NPort %d delete failed, error:%d, fc port:%d, " + "rport_cnt:%d rc:%d.\n", + args->nport_handle, err, args->port_handle, + rport_cnt, rc); + if (cb_data) { + free(cb_data); + } + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_NPORT_DELETE, args->cb_ctx, err); + } + + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, + "NPort %d delete done succesfully, fc port:%d. " + "rport_cnt:%d\n", + args->nport_handle, args->port_handle, rport_cnt); + } + + free(arg); +} + +/* + * Process an PRLI/IT add. + */ +static void +nvmf_fc_adm_evnt_i_t_add(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_hw_i_t_add_args *args = (struct spdk_nvmf_fc_hw_i_t_add_args *) + api_data->api_args; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_remote_port_info *rport_iter = NULL; + struct spdk_nvmf_fc_remote_port_info *rport = NULL; + int err = 0; + + /* + * Make sure the nport port exists. + */ + nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle); + if (nport == NULL) { + SPDK_ERRLOG("Unable to find the SPDK FC nport %d\n", args->nport_handle); + err = -EINVAL; + goto out; + } + + /* + * Check for duplicate i_t_add. + */ + TAILQ_FOREACH(rport_iter, &nport->rem_port_list, link) { + if ((rport_iter->s_id == args->s_id) && (rport_iter->rpi == args->rpi)) { + SPDK_ERRLOG("Duplicate rport found for FC nport %d: sid:%d rpi:%d\n", + args->nport_handle, rport_iter->s_id, rport_iter->rpi); + err = -EEXIST; + goto out; + } + } + + /* + * Get the memory to instantiate the remote port + */ + rport = calloc(1, sizeof(struct spdk_nvmf_fc_remote_port_info)); + if (rport == NULL) { + SPDK_ERRLOG("Memory allocation for rem port failed.\n"); + err = -ENOMEM; + goto out; + } + + /* + * Initialize the contents for the rport + */ + (void)nvmf_fc_rport_set_state(rport, SPDK_NVMF_FC_OBJECT_CREATED); + rport->s_id = args->s_id; + rport->rpi = args->rpi; + rport->fc_nodename = args->fc_nodename; + rport->fc_portname = args->fc_portname; + + /* + * Add remote port to nport + */ + if (nvmf_fc_nport_add_rem_port(nport, rport) != 0) { + DEV_VERIFY(!"Error while adding rport to list"); + }; + + /* + * TODO: Do we validate the initiators service parameters? + */ + + /* + * Get the targets service parameters from the library + * to return back to the driver. 
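+ *
+ * The PRLI service-parameter word comes from
+ * nvmf_fc_get_prli_service_params() and is handed back through
+ * args->target_prli_info; presumably the low-level driver uses it when
+ * responding to the initiator's PRLI (driver-side behavior is an
+ * assumption, not defined in this file).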
+ */ + args->target_prli_info = nvmf_fc_get_prli_service_params(); + +out: + if (api_data->cb_func != NULL) { + /* + * Passing pointer to the args struct as the first argument. + * The cb_func should handle this appropriately. + */ + (void)api_data->cb_func(args->port_handle, SPDK_FC_IT_ADD, args->cb_ctx, err); + } + + free(arg); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, + "IT add on nport %d done, rc = %d.\n", + args->nport_handle, err); +} + +/** + * Process a IT delete. + */ +static void +nvmf_fc_adm_evnt_i_t_delete(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_hw_i_t_delete_args *args = (struct spdk_nvmf_fc_hw_i_t_delete_args *) + api_data->api_args; + int rc = 0; + struct spdk_nvmf_fc_nport *nport = NULL; + struct spdk_nvmf_fc_adm_i_t_del_cb_data *cb_data = NULL; + struct spdk_nvmf_fc_remote_port_info *rport_iter = NULL; + struct spdk_nvmf_fc_remote_port_info *rport = NULL; + uint32_t num_rport = 0; + char log_str[256]; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "IT delete on nport:%d begin.\n", args->nport_handle); + + /* + * Make sure the nport port exists. If it does not, error out. + */ + nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle); + if (nport == NULL) { + SPDK_ERRLOG("Unable to find the SPDK FC nport:%d\n", args->nport_handle); + rc = -EINVAL; + goto out; + } + + /* + * Find this ITN / rport (remote port). + */ + TAILQ_FOREACH(rport_iter, &nport->rem_port_list, link) { + num_rport++; + if ((rport_iter->s_id == args->s_id) && + (rport_iter->rpi == args->rpi) && + (rport_iter->rport_state == SPDK_NVMF_FC_OBJECT_CREATED)) { + rport = rport_iter; + break; + } + } + + /* + * We should find either zero or exactly one rport. + * + * If we find zero rports, that means that a previous request has + * removed the rport by the time we reached here. In this case, + * simply return out. + */ + if (rport == NULL) { + rc = -ENODEV; + goto out; + } + + /* + * We have found exactly one rport. Allocate memory for callback data. + */ + cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_i_t_del_cb_data)); + if (NULL == cb_data) { + SPDK_ERRLOG("Failed to allocate memory for cb_data for nport:%d.\n", args->nport_handle); + rc = -ENOMEM; + goto out; + } + + cb_data->nport = nport; + cb_data->rport = rport; + cb_data->port_handle = args->port_handle; + cb_data->fc_cb_func = api_data->cb_func; + cb_data->fc_cb_ctx = args->cb_ctx; + + /* + * Validate rport object state. + */ + if (rport->rport_state == SPDK_NVMF_FC_OBJECT_CREATED) { + (void)nvmf_fc_rport_set_state(rport, SPDK_NVMF_FC_OBJECT_TO_BE_DELETED); + } else if (rport->rport_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + /* + * Deletion of this rport already in progress. Register callback + * and return. + */ + /* TODO: Register callback in callback vector. For now, set the error and return. */ + rc = -ENODEV; + goto out; + } else { + /* rport partially created/deleted */ + DEV_VERIFY(rport->rport_state == SPDK_NVMF_FC_OBJECT_ZOMBIE); + DEV_VERIFY(!"Invalid rport_state"); + rc = -ENODEV; + goto out; + } + + /* + * We have successfully found a rport to delete. Call + * nvmf_fc_i_t_delete_assoc(), which will perform further + * IT-delete processing as well as free the cb_data. + */ + nvmf_fc_adm_i_t_delete_assoc(nport, rport, nvmf_fc_adm_i_t_delete_cb, + (void *)cb_data); + +out: + if (rc != 0) { + /* + * We have entered here because either we encountered an + * error, or we did not find a rport to delete. 
+ * As a result, we will not call the function + * nvmf_fc_i_t_delete_assoc() for further IT-delete + * processing. Therefore, execute the callback function now. + */ + if (cb_data) { + free(cb_data); + } + if (api_data->cb_func != NULL) { + (void)api_data->cb_func(args->port_handle, SPDK_FC_IT_DELETE, args->cb_ctx, rc); + } + } + + snprintf(log_str, sizeof(log_str), + "IT delete on nport:%d end. num_rport:%d rc = %d.\n", + args->nport_handle, num_rport, rc); + + if (rc != 0) { + SPDK_ERRLOG("%s", log_str); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str); + } + + free(arg); +} + +/* + * Process ABTS received + */ +static void +nvmf_fc_adm_evnt_abts_recv(void *arg) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg; + struct spdk_nvmf_fc_abts_args *args = (struct spdk_nvmf_fc_abts_args *)api_data->api_args; + struct spdk_nvmf_fc_nport *nport = NULL; + int err = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "FC ABTS received. RPI:%d, oxid:%d, rxid:%d\n", args->rpi, + args->oxid, args->rxid); + + /* + * 1. Make sure the nport port exists. + */ + nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle); + if (nport == NULL) { + SPDK_ERRLOG("Unable to find the SPDK FC nport %d\n", args->nport_handle); + err = -EINVAL; + goto out; + } + + /* + * 2. If the nport is in the process of being deleted, drop the ABTS. + */ + if (nport->nport_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, + "FC ABTS dropped because the nport is being deleted; RPI:%d, oxid:%d, rxid:%d\n", + args->rpi, args->oxid, args->rxid); + err = 0; + goto out; + + } + + /* + * 3. Pass the received ABTS-LS to the library for handling. + */ + nvmf_fc_handle_abts_frame(nport, args->rpi, args->oxid, args->rxid); + +out: + if (api_data->cb_func != NULL) { + /* + * Passing pointer to the args struct as the first argument. + * The cb_func should handle this appropriately. + */ + (void)api_data->cb_func(args->port_handle, SPDK_FC_ABTS_RECV, args, err); + } else { + /* No callback set, free the args */ + free(args); + } + + free(arg); +} + +/* + * Callback function for hw port quiesce. + */ +static void +nvmf_fc_adm_hw_port_quiesce_reset_cb(void *ctx, int err) +{ + ASSERT_SPDK_FC_MASTER_THREAD(); + struct spdk_nvmf_fc_adm_hw_port_reset_ctx *reset_ctx = + (struct spdk_nvmf_fc_adm_hw_port_reset_ctx *)ctx; + struct spdk_nvmf_fc_hw_port_reset_args *args = reset_ctx->reset_args; + spdk_nvmf_fc_callback cb_func = reset_ctx->reset_cb_func; + struct spdk_nvmf_fc_queue_dump_info dump_info; + struct spdk_nvmf_fc_port *fc_port = NULL; + char *dump_buf = NULL; + uint32_t dump_buf_size = SPDK_FC_HW_DUMP_BUF_SIZE; + + /* + * Free the callback context struct. + */ + free(ctx); + + if (err != 0) { + SPDK_ERRLOG("Port %d quiesce operation failed.\n", args->port_handle); + goto out; + } + + if (args->dump_queues == false) { + /* + * Queues need not be dumped. + */ + goto out; + } + + SPDK_ERRLOG("Dumping queues for HW port %d\n", args->port_handle); + + /* + * Get the fc port. + */ + fc_port = nvmf_fc_port_lookup(args->port_handle); + if (fc_port == NULL) { + SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle); + err = -EINVAL; + goto out; + } + + /* + * Allocate memory for the dump buffer. + * This memory will be freed by FCT. 
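+ *
+ * The buffer is SPDK_FC_HW_DUMP_BUF_SIZE bytes; its address is returned
+ * to the caller through *args->dump_buf, so ownership leaves this
+ * function once the dump completes.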
+ */
+ dump_buf = (char *)calloc(1, dump_buf_size);
+ if (dump_buf == NULL) {
+ err = -ENOMEM;
+ SPDK_ERRLOG("Memory allocation for dump buffer failed, SPDK FC port %d\n", args->port_handle);
+ goto out;
+ }
+ *args->dump_buf = (uint32_t *)dump_buf;
+ dump_info.buffer = dump_buf;
+ dump_info.offset = 0;
+
+ /*
+ * Add the dump reason to the top of the buffer.
+ */
+ nvmf_fc_dump_buf_print(&dump_info, "%s\n", args->reason);
+
+ /*
+ * Dump the hwqp.
+ */
+ nvmf_fc_dump_all_queues(&fc_port->ls_queue, fc_port->io_queues,
+ fc_port->num_io_queues, &dump_info);
+
+out:
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d reset done, queues_dumped = %d, rc = %d.\n",
+ args->port_handle, args->dump_queues, err);
+
+ if (cb_func != NULL) {
+ (void)cb_func(args->port_handle, SPDK_FC_HW_PORT_RESET, args->cb_ctx, err);
+ }
+}
+
+/*
+ * HW port reset.
+ */
+static void
+nvmf_fc_adm_evnt_hw_port_reset(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_hw_port_reset_args *args = (struct spdk_nvmf_fc_hw_port_reset_args *)
+ api_data->api_args;
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ struct spdk_nvmf_fc_adm_hw_port_reset_ctx *ctx = NULL;
+ int err = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d dump\n", args->port_handle);
+
+ /*
+ * Make sure the physical port exists.
+ */
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (fc_port == NULL) {
+ SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Save the reset event args and the callback in a context struct.
+ */
+ ctx = calloc(1, sizeof(struct spdk_nvmf_fc_adm_hw_port_reset_ctx));
+
+ if (ctx == NULL) {
+ err = -ENOMEM;
+ SPDK_ERRLOG("Memory allocation for reset ctx failed, SPDK FC port %d\n", args->port_handle);
+ goto fail;
+ }
+
+ ctx->reset_args = arg;
+ ctx->reset_cb_func = api_data->cb_func;
+
+ /*
+ * Quiesce the hw port.
+ */
+ err = nvmf_fc_adm_hw_port_quiesce(fc_port, ctx, nvmf_fc_adm_hw_port_quiesce_reset_cb);
+ if (err != 0) {
+ goto fail;
+ }
+
+ /*
+ * Once the port is successfully quiesced, reset processing continues
+ * in the callback function nvmf_fc_adm_hw_port_quiesce_reset_cb().
+ */
+ return;
+fail:
+ free(ctx);
+
+out:
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d dump done, rc = %d.\n", args->port_handle,
+ err);
+
+ if (api_data->cb_func != NULL) {
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_RESET, args->cb_ctx, err);
+ }
+
+ free(arg);
+}
+
+/*
+ * Process a link break event on a HW port.
+ */
+static void
+nvmf_fc_adm_evnt_hw_port_link_break(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_hw_port_link_break_args *args = (struct spdk_nvmf_hw_port_link_break_args *)
+ api_data->api_args;
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ int err = 0;
+ struct spdk_nvmf_fc_adm_port_link_break_cb_data *cb_data = NULL;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ uint32_t nport_deletes_sent = 0;
+ uint32_t nport_deletes_skipped = 0;
+ struct spdk_nvmf_fc_nport_delete_args *nport_del_args = NULL;
+ char log_str[256];
+
+ /*
+ * Get the fc port using the port handle.
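+ *
+ * Link-break handling, as implemented below: take the port offline, then
+ * enqueue one SPDK_FC_NPORT_DELETE event for every nport still in the
+ * CREATED state; completions are handled by
+ * nvmf_fc_adm_hw_port_link_break_cb. If no deletes needed to be sent,
+ * the hwqps are marked offline and their pollers removed directly here.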
+ */
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (!fc_port) {
+ SPDK_ERRLOG("port link break: Unable to find the SPDK FC port %d\n",
+ args->port_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Set the port state to offline, if it is not already.
+ */
+ err = nvmf_fc_port_set_offline(fc_port);
+ if (err != 0) {
+ SPDK_ERRLOG("port link break: HW port %d already offline. rc = %d\n",
+ fc_port->port_hdl, err);
+ err = 0;
+ goto out;
+ }
+
+ /*
+ * Delete all the nports, if any.
+ */
+ if (!TAILQ_EMPTY(&fc_port->nport_list)) {
+ TAILQ_FOREACH(nport, &fc_port->nport_list, link) {
+ /* Skip nports that are not in the CREATED state. */
+ if (nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED) {
+ nport_deletes_skipped++;
+ continue;
+ }
+
+ /* Allocate memory for callback data. */
+ cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_port_link_break_cb_data));
+ if (NULL == cb_data) {
+ SPDK_ERRLOG("port link break: Failed to allocate memory for cb_data %d.\n",
+ args->port_handle);
+ err = -ENOMEM;
+ goto out;
+ }
+ cb_data->args = args;
+ cb_data->cb_func = api_data->cb_func;
+ nport_del_args = &cb_data->nport_del_args;
+ nport_del_args->port_handle = args->port_handle;
+ nport_del_args->nport_handle = nport->nport_hdl;
+ nport_del_args->cb_ctx = cb_data;
+
+ nvmf_fc_master_enqueue_event(SPDK_FC_NPORT_DELETE,
+ (void *)nport_del_args,
+ nvmf_fc_adm_hw_port_link_break_cb);
+
+ nport_deletes_sent++;
+ }
+ }
+
+ if (nport_deletes_sent == 0 && err == 0) {
+ /*
+ * Mark the hwqps as offline and unregister the pollers.
+ */
+ (void)nvmf_fc_adm_port_hwqp_offline_del_poller(fc_port);
+ }
+
+out:
+ snprintf(log_str, sizeof(log_str),
+ "port link break done: port:%d nport_deletes_sent:%d nport_deletes_skipped:%d rc:%d.\n",
+ args->port_handle, nport_deletes_sent, nport_deletes_skipped, err);
+
+ if (err != 0) {
+ SPDK_ERRLOG("%s", log_str);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str);
+ }
+
+ if ((api_data->cb_func != NULL) && (nport_deletes_sent == 0)) {
+ /*
+ * No nport deletes were sent, so nothing else will eventually
+ * invoke the port_link_break callback. Therefore, call it here.
+ */
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_LINK_BREAK, args->cb_ctx, err);
+ }
+
+ free(arg);
+}
+
+static inline void
+nvmf_fc_adm_run_on_master_thread(spdk_msg_fn fn, void *args)
+{
+ if (nvmf_fc_get_master_thread()) {
+ spdk_thread_send_msg(nvmf_fc_get_master_thread(), fn, args);
+ }
+}
+
+/*
+ * Queue up an event in the SPDK master's event queue.
+ * Used by the FC driver to notify the SPDK master of FC-related events.
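+ *
+ * Returns 0 if the event was accepted; on failure nothing has been
+ * queued and the caller retains ownership of 'args'.
+ *
+ * Illustrative usage sketch only (the driver-side variables 'port_handle',
+ * 'nport_handle', 'lld_ctx' and the completion callback
+ * 'lld_nport_delete_done' are assumptions, not part of this file):
+ *
+ *	struct spdk_nvmf_fc_nport_delete_args *del = calloc(1, sizeof(*del));
+ *
+ *	if (del != NULL) {
+ *		del->port_handle = port_handle;
+ *		del->nport_handle = nport_handle;
+ *		del->cb_ctx = lld_ctx;
+ *		if (nvmf_fc_master_enqueue_event(SPDK_FC_NPORT_DELETE, del,
+ *						 lld_nport_delete_done) != 0) {
+ *			free(del);
+ *		}
+ *	}
+ *
+ * On success 'del' is handed to the SPDK_FC_NPORT_DELETE handler and
+ * 'lld_ctx' comes back through the completion callback; how 'del' itself
+ * is released afterwards is a driver-side contract (assumption).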
+ */ +int +nvmf_fc_master_enqueue_event(enum spdk_fc_event event_type, void *args, + spdk_nvmf_fc_callback cb_func) +{ + int err = 0; + struct spdk_nvmf_fc_adm_api_data *api_data = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Enqueue event %d.\n", event_type); + + if (event_type >= SPDK_FC_EVENT_MAX) { + SPDK_ERRLOG("Invalid spdk_fc_event_t %d.\n", event_type); + err = -EINVAL; + goto done; + } + + if (args == NULL) { + SPDK_ERRLOG("Null args for event %d.\n", event_type); + err = -EINVAL; + goto done; + } + + api_data = calloc(1, sizeof(*api_data)); + + if (api_data == NULL) { + SPDK_ERRLOG("Failed to alloc api data for event %d.\n", event_type); + err = -ENOMEM; + goto done; + } + + api_data->api_args = args; + api_data->cb_func = cb_func; + + switch (event_type) { + case SPDK_FC_HW_PORT_INIT: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_init, + (void *)api_data); + break; + + case SPDK_FC_HW_PORT_ONLINE: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_online, + (void *)api_data); + break; + + case SPDK_FC_HW_PORT_OFFLINE: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_offline, + (void *)api_data); + break; + + case SPDK_FC_NPORT_CREATE: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_nport_create, + (void *)api_data); + break; + + case SPDK_FC_NPORT_DELETE: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_nport_delete, + (void *)api_data); + break; + + case SPDK_FC_IT_ADD: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_i_t_add, + (void *)api_data); + break; + + case SPDK_FC_IT_DELETE: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_i_t_delete, + (void *)api_data); + break; + + case SPDK_FC_ABTS_RECV: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_abts_recv, + (void *)api_data); + break; + + case SPDK_FC_LINK_BREAK: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_link_break, + (void *)api_data); + break; + + case SPDK_FC_HW_PORT_RESET: + nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_reset, + (void *)api_data); + break; + + case SPDK_FC_UNRECOVERABLE_ERR: + default: + SPDK_ERRLOG("Invalid spdk_fc_event_t: %d\n", event_type); + err = -EINVAL; + break; + } + +done: + + if (err == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Enqueue event %d done successfully\n", event_type); + } else { + SPDK_ERRLOG("Enqueue event %d failed, err = %d\n", event_type, err); + if (api_data) { + free(api_data); + } + } + + return err; +} + +SPDK_NVMF_TRANSPORT_REGISTER(fc, &spdk_nvmf_transport_fc); +SPDK_LOG_REGISTER_COMPONENT("nvmf_fc_adm_api", SPDK_LOG_NVMF_FC_ADM_API); +SPDK_LOG_REGISTER_COMPONENT("nvmf_fc", SPDK_LOG_NVMF_FC) diff --git a/src/spdk/lib/nvmf/fc_ls.c b/src/spdk/lib/nvmf/fc_ls.c new file mode 100644 index 000000000..1aa06bd45 --- /dev/null +++ b/src/spdk/lib/nvmf/fc_ls.c @@ -0,0 +1,1678 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2018-2019 Broadcom. All Rights Reserved. + * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/env.h" +#include "spdk/assert.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" +#include "spdk/endian.h" +#include "spdk_internal/log.h" +#include "nvmf_internal.h" +#include "transport.h" + +#include "nvmf_fc.h" +#include "fc_lld.h" + +/* set to 1 to send ls disconnect in response to ls disconnect from host (per standard) */ +#define NVMF_FC_LS_SEND_LS_DISCONNECT 0 + +/* Validation Error indexes into the string table below */ +enum { + VERR_NO_ERROR = 0, + VERR_CR_ASSOC_LEN = 1, + VERR_CR_ASSOC_RQST_LEN = 2, + VERR_CR_ASSOC_CMD = 3, + VERR_CR_ASSOC_CMD_LEN = 4, + VERR_ERSP_RATIO = 5, + VERR_ASSOC_ALLOC_FAIL = 6, + VERR_CONN_ALLOC_FAIL = 7, + VERR_CR_CONN_LEN = 8, + VERR_CR_CONN_RQST_LEN = 9, + VERR_ASSOC_ID = 10, + VERR_ASSOC_ID_LEN = 11, + VERR_NO_ASSOC = 12, + VERR_CONN_ID = 13, + VERR_CONN_ID_LEN = 14, + VERR_NO_CONN = 15, + VERR_CR_CONN_CMD = 16, + VERR_CR_CONN_CMD_LEN = 17, + VERR_DISCONN_LEN = 18, + VERR_DISCONN_RQST_LEN = 19, + VERR_DISCONN_CMD = 20, + VERR_DISCONN_CMD_LEN = 21, + VERR_DISCONN_SCOPE = 22, + VERR_RS_LEN = 23, + VERR_RS_RQST_LEN = 24, + VERR_RS_CMD = 25, + VERR_RS_CMD_LEN = 26, + VERR_RS_RCTL = 27, + VERR_RS_RO = 28, + VERR_CONN_TOO_MANY = 29, + VERR_SUBNQN = 30, + VERR_HOSTNQN = 31, + VERR_SQSIZE = 32, + VERR_NO_RPORT = 33, + VERR_SUBLISTENER = 34, +}; + +static char *validation_errors[] = { + "OK", + "Bad CR_ASSOC Length", + "Bad CR_ASSOC Rqst Length", + "Not CR_ASSOC Cmd", + "Bad CR_ASSOC Cmd Length", + "Bad Ersp Ratio", + "Association Allocation Failed", + "Queue Allocation Failed", + "Bad CR_CONN Length", + "Bad CR_CONN Rqst Length", + "Not Association ID", + "Bad Association ID Length", + "No Association", + "Not Connection ID", + "Bad Connection ID Length", + "No Connection", + "Not CR_CONN Cmd", + "Bad CR_CONN Cmd Length", + "Bad DISCONN Length", + "Bad DISCONN Rqst Length", + "Not DISCONN Cmd", + "Bad DISCONN Cmd Length", + "Bad Disconnect Scope", + "Bad RS Length", + "Bad RS Rqst Length", + "Not RS Cmd", + "Bad RS Cmd Length", + "Bad RS R_CTL", + "Bad RS Relative Offset", + "Too many connections for association", + "Invalid subnqn or subsystem not found", + "Invalid hostnqn or subsystem doesn't allow host", + "SQ size = 0 or too big", + "No Remote Port", + "Bad Subsystem Port", +}; + +static inline void +nvmf_fc_add_assoc_to_tgt_port(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_remote_port_info *rport); + +static 
inline FCNVME_BE32 cpu_to_be32(uint32_t in) +{ + uint32_t t; + + to_be32(&t, in); + return (FCNVME_BE32)t; +} + +static inline FCNVME_BE32 nvmf_fc_lsdesc_len(size_t sz) +{ + uint32_t t; + + to_be32(&t, sz - (2 * sizeof(uint32_t))); + return (FCNVME_BE32)t; +} + +static void +nvmf_fc_ls_format_rsp_hdr(void *buf, uint8_t ls_cmd, uint32_t desc_len, + uint8_t rqst_ls_cmd) +{ + struct spdk_nvmf_fc_ls_acc_hdr *acc_hdr = buf; + + acc_hdr->w0.ls_cmd = ls_cmd; + acc_hdr->desc_list_len = desc_len; + to_be32(&acc_hdr->rqst.desc_tag, FCNVME_LSDESC_RQST); + acc_hdr->rqst.desc_len = + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_rqst)); + acc_hdr->rqst.w0.ls_cmd = rqst_ls_cmd; +} + +static int +nvmf_fc_ls_format_rjt(void *buf, uint16_t buflen, uint8_t ls_cmd, + uint8_t reason, uint8_t explanation, uint8_t vendor) +{ + struct spdk_nvmf_fc_ls_rjt *rjt = buf; + + bzero(buf, sizeof(struct spdk_nvmf_fc_ls_rjt)); + nvmf_fc_ls_format_rsp_hdr(buf, FCNVME_LSDESC_RQST, + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_ls_rjt)), + ls_cmd); + to_be32(&rjt->rjt.desc_tag, FCNVME_LSDESC_RJT); + rjt->rjt.desc_len = nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_rjt)); + rjt->rjt.reason_code = reason; + rjt->rjt.reason_explanation = explanation; + rjt->rjt.vendor = vendor; + + return sizeof(struct spdk_nvmf_fc_ls_rjt); +} + +/* ************************************************** */ +/* Allocators/Deallocators (assocations, connections, */ +/* poller API data) */ + +static inline void +nvmf_fc_ls_free_association(struct spdk_nvmf_fc_association *assoc) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + /* return the q slots of the conns for the association */ + TAILQ_FOREACH(fc_conn, &assoc->avail_fc_conns, assoc_avail_link) { + if (fc_conn->conn_id != NVMF_FC_INVALID_CONN_ID) { + nvmf_fc_release_conn(fc_conn->hwqp, fc_conn->conn_id, + fc_conn->max_queue_depth); + } + } + + /* free assocation's send disconnect buffer */ + if (assoc->snd_disconn_bufs) { + nvmf_fc_free_srsr_bufs(assoc->snd_disconn_bufs); + } + + /* free assocation's connections */ + free(assoc->conns_buf); + + /* free the association */ + free(assoc); +} + +static int +nvmf_fc_ls_alloc_connections(struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_transport *nvmf_transport) +{ + uint32_t i; + struct spdk_nvmf_fc_conn *fc_conn; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Pre-alloc %d qpairs for host NQN %s\n", + nvmf_transport->opts.max_qpairs_per_ctrlr, assoc->host_nqn); + + /* allocate memory for all connections at once */ + assoc->conns_buf = calloc(nvmf_transport->opts.max_qpairs_per_ctrlr + 1, + sizeof(struct spdk_nvmf_fc_conn)); + if (assoc->conns_buf == NULL) { + SPDK_ERRLOG("Out of memory for connections for new association\n"); + return -ENOMEM; + } + + for (i = 0; i < nvmf_transport->opts.max_qpairs_per_ctrlr; i++) { + fc_conn = assoc->conns_buf + (i * sizeof(struct spdk_nvmf_fc_conn)); + fc_conn->conn_id = NVMF_FC_INVALID_CONN_ID; + fc_conn->qpair.state = SPDK_NVMF_QPAIR_UNINITIALIZED; + fc_conn->qpair.transport = nvmf_transport; + + TAILQ_INSERT_TAIL(&assoc->avail_fc_conns, fc_conn, assoc_avail_link); + } + + return 0; +} + +static struct spdk_nvmf_fc_association * +nvmf_fc_ls_new_association(uint32_t s_id, + struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_remote_port_info *rport, + struct spdk_nvmf_fc_lsdesc_cr_assoc_cmd *a_cmd, + struct spdk_nvmf_subsystem *subsys, + uint16_t rpi, + struct spdk_nvmf_transport *nvmf_transport) +{ + struct spdk_nvmf_fc_association *assoc; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "New 
Association request for port %d nport %d rpi 0x%x\n", + tgtport->fc_port->port_hdl, tgtport->nport_hdl, rpi); + + assert(rport); + if (!rport) { + SPDK_ERRLOG("rport is null.\n"); + return NULL; + } + + assoc = calloc(1, sizeof(struct spdk_nvmf_fc_association)); + if (!assoc) { + SPDK_ERRLOG("unable to allocate memory for new association\n"); + return NULL; + } + + /* initialize association */ +#if (NVMF_FC_LS_SEND_LS_DISCONNECT == 1) + /* allocate buffers to send LS disconnect command to host */ + assoc->snd_disconn_bufs = + nvmf_fc_alloc_srsr_bufs(sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst), + sizeof(struct spdk_nvmf_fc_ls_rjt)); + if (!assoc->snd_disconn_bufs) { + SPDK_ERRLOG("no dma memory for association's ls disconnect bufs\n"); + free(assoc); + return NULL; + } + + assoc->snd_disconn_bufs->rpi = rpi; +#endif + assoc->s_id = s_id; + assoc->tgtport = tgtport; + assoc->rport = rport; + assoc->subsystem = subsys; + assoc->assoc_state = SPDK_NVMF_FC_OBJECT_CREATED; + memcpy(assoc->host_id, a_cmd->hostid, FCNVME_ASSOC_HOSTID_LEN); + memcpy(assoc->host_nqn, a_cmd->hostnqn, SPDK_NVME_NQN_FIELD_SIZE); + memcpy(assoc->sub_nqn, a_cmd->subnqn, SPDK_NVME_NQN_FIELD_SIZE); + TAILQ_INIT(&assoc->fc_conns); + TAILQ_INIT(&assoc->avail_fc_conns); + assoc->ls_del_op_ctx = NULL; + + /* allocate and assign connections for association */ + rc = nvmf_fc_ls_alloc_connections(assoc, nvmf_transport); + if (rc != 0) { + nvmf_fc_ls_free_association(assoc); + return NULL; + } + + /* add association to target port's association list */ + nvmf_fc_add_assoc_to_tgt_port(tgtport, assoc, rport); + return assoc; +} + +static inline void +nvmf_fc_ls_append_del_cb_ctx(struct spdk_nvmf_fc_association *assoc, + struct nvmf_fc_ls_op_ctx *opd) +{ + /* append to delete assoc callback list */ + if (!assoc->ls_del_op_ctx) { + assoc->ls_del_op_ctx = (void *)opd; + } else { + struct nvmf_fc_ls_op_ctx *nxt = + (struct nvmf_fc_ls_op_ctx *) assoc->ls_del_op_ctx; + while (nxt->next_op_ctx) { + nxt = nxt->next_op_ctx; + } + nxt->next_op_ctx = opd; + } +} + +static struct spdk_nvmf_fc_conn * +nvmf_fc_ls_new_connection(struct spdk_nvmf_fc_association *assoc, uint16_t qid, + uint16_t esrp_ratio, uint16_t rpi, uint16_t sq_size, + struct spdk_nvmf_fc_nport *tgtport) +{ + struct spdk_nvmf_fc_conn *fc_conn; + + fc_conn = TAILQ_FIRST(&assoc->avail_fc_conns); + if (!fc_conn) { + SPDK_ERRLOG("out of connections for association %p\n", assoc); + return NULL; + } + + /* Remove from avail list and add to in use. */ + TAILQ_REMOVE(&assoc->avail_fc_conns, fc_conn, assoc_avail_link); + TAILQ_INSERT_TAIL(&assoc->fc_conns, fc_conn, assoc_link); + + if (qid == 0) { + /* AdminQ connection. 
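+ * qid 0 is the NVMe over Fabrics admin queue, created together with the
+ * association (LS create association); I/O queues (qid > 0) are created
+ * later through LS create connection and do not set aq_conn.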
*/ + assoc->aq_conn = fc_conn; + } + + fc_conn->qpair.qid = qid; + fc_conn->qpair.sq_head_max = sq_size; + TAILQ_INIT(&fc_conn->qpair.outstanding); + fc_conn->esrp_ratio = esrp_ratio; + fc_conn->fc_assoc = assoc; + fc_conn->rpi = rpi; + fc_conn->max_queue_depth = sq_size + 1; + + /* save target port trid in connection (for subsystem + * listener validation in fabric connect command) + */ + nvmf_fc_create_trid(&fc_conn->trid, tgtport->fc_nodename.u.wwn, + tgtport->fc_portname.u.wwn); + + return fc_conn; +} + +static inline void +nvmf_fc_ls_free_connection(struct spdk_nvmf_fc_conn *fc_conn) +{ + TAILQ_INSERT_TAIL(&fc_conn->fc_assoc->avail_fc_conns, fc_conn, assoc_avail_link); +} + +/* End - Allocators/Deallocators (assocations, connections, */ +/* poller API data) */ +/* ******************************************************** */ + +static inline struct spdk_nvmf_fc_association * +nvmf_fc_ls_find_assoc(struct spdk_nvmf_fc_nport *tgtport, uint64_t assoc_id) +{ + struct spdk_nvmf_fc_association *assoc = NULL; + + TAILQ_FOREACH(assoc, &tgtport->fc_associations, link) { + if (assoc->assoc_id == assoc_id) { + if (assoc->assoc_state == SPDK_NVMF_FC_OBJECT_ZOMBIE) { + assoc = NULL; + } + break; + } + } + return assoc; +} + +static inline void +nvmf_fc_add_assoc_to_tgt_port(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_remote_port_info *rport) +{ + TAILQ_INSERT_TAIL(&tgtport->fc_associations, assoc, link); + tgtport->assoc_count++; + rport->assoc_count++; +} + +static inline void +nvmf_fc_del_assoc_from_tgt_port(struct spdk_nvmf_fc_association *assoc) +{ + struct spdk_nvmf_fc_nport *tgtport = assoc->tgtport; + + TAILQ_REMOVE(&tgtport->fc_associations, assoc, link); + tgtport->assoc_count--; + assoc->rport->assoc_count--; +} + +static void +nvmf_fc_ls_rsp_fail_del_conn_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct nvmf_fc_ls_op_ctx *opd = + (struct nvmf_fc_ls_op_ctx *)cb_data; + struct spdk_nvmf_fc_ls_del_conn_api_data *dp = &opd->u.del_conn; + struct spdk_nvmf_fc_association *assoc = dp->assoc; + struct spdk_nvmf_fc_conn *fc_conn = dp->args.fc_conn; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Delete Connection callback " + "for assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id, + fc_conn->conn_id); + + if (dp->aq_conn) { + /* delete association */ + nvmf_fc_del_assoc_from_tgt_port(assoc); + nvmf_fc_ls_free_association(assoc); + } else { + /* remove connection from association's connection list */ + TAILQ_REMOVE(&assoc->fc_conns, fc_conn, assoc_link); + nvmf_fc_ls_free_connection(fc_conn); + } + + free(opd); +} + +static void +nvmf_fc_handle_xmt_ls_rsp_failure(struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_conn *fc_conn, + bool aq_conn) +{ + struct spdk_nvmf_fc_ls_del_conn_api_data *api_data; + struct nvmf_fc_ls_op_ctx *opd = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Transmit LS response failure " + "for assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id, + fc_conn->conn_id); + + + /* create context for delete connection API */ + opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx)); + if (!opd) { /* hopefully this doesn't happen - if so, we leak the connection */ + SPDK_ERRLOG("Mem alloc failed for del conn op data"); + return; + } + + api_data = &opd->u.del_conn; + api_data->assoc = assoc; + api_data->ls_rqst = NULL; + api_data->aq_conn = aq_conn; + api_data->args.fc_conn = fc_conn; + api_data->args.send_abts = false; + api_data->args.hwqp = fc_conn->hwqp; + api_data->args.cb_info.cb_thread = spdk_get_thread(); + 
api_data->args.cb_info.cb_func = nvmf_fc_ls_rsp_fail_del_conn_cb; + api_data->args.cb_info.cb_data = opd; + + nvmf_fc_poller_api_func(api_data->args.hwqp, + SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION, + &api_data->args); +} + +/* callback from poller's ADD_Connection event */ +static void +nvmf_fc_ls_add_conn_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct nvmf_fc_ls_op_ctx *opd = + (struct nvmf_fc_ls_op_ctx *)cb_data; + struct spdk_nvmf_fc_ls_add_conn_api_data *dp = &opd->u.add_conn; + struct spdk_nvmf_fc_association *assoc = dp->assoc; + struct spdk_nvmf_fc_nport *tgtport = assoc->tgtport; + struct spdk_nvmf_fc_conn *fc_conn = dp->args.fc_conn; + struct spdk_nvmf_fc_ls_rqst *ls_rqst = dp->ls_rqst; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "add_conn_cb: assoc_id = 0x%lx, conn_id = 0x%lx\n", + assoc->assoc_id, fc_conn->conn_id); + + fc_conn->create_opd = NULL; + + if (assoc->assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + /* association is already being deleted - don't continue */ + free(opd); + return; + } + + if (dp->aq_conn) { + struct spdk_nvmf_fc_ls_cr_assoc_acc *assoc_acc = + (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt; + /* put connection and association ID in response */ + to_be64(&assoc_acc->conn_id.connection_id, fc_conn->conn_id); + assoc_acc->assoc_id.association_id = assoc_acc->conn_id.connection_id; + } else { + struct spdk_nvmf_fc_ls_cr_conn_acc *conn_acc = + (struct spdk_nvmf_fc_ls_cr_conn_acc *)ls_rqst->rspbuf.virt; + /* put connection ID in response */ + to_be64(&conn_acc->conn_id.connection_id, fc_conn->conn_id); + } + + /* send LS response */ + if (nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst) != 0) { + SPDK_ERRLOG("Send LS response for %s failed - cleaning up\n", + dp->aq_conn ? "association" : "connection"); + nvmf_fc_handle_xmt_ls_rsp_failure(assoc, fc_conn, + dp->aq_conn); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "LS response (conn_id 0x%lx) sent\n", fc_conn->conn_id); + } + + free(opd); +} + +void +nvmf_fc_ls_add_conn_failure( + struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_ls_rqst *ls_rqst, + struct spdk_nvmf_fc_conn *fc_conn, + bool aq_conn) +{ + struct spdk_nvmf_fc_ls_cr_assoc_rqst *rqst; + struct spdk_nvmf_fc_ls_cr_assoc_acc *acc; + struct spdk_nvmf_fc_nport *tgtport = assoc->tgtport; + + if (fc_conn->create_opd) { + free(fc_conn->create_opd); + fc_conn->create_opd = NULL; + } + + rqst = (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt; + acc = (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt; + + /* send failure response */ + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, + FCNVME_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, + FCNVME_RJT_RC_INSUFF_RES, + FCNVME_RJT_EXP_NONE, 0); + + nvmf_fc_ls_free_connection(fc_conn); + if (aq_conn) { + nvmf_fc_del_assoc_from_tgt_port(assoc); + nvmf_fc_ls_free_association(assoc); + } + + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); +} + + +static void +nvmf_fc_ls_add_conn_to_poller( + struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_ls_rqst *ls_rqst, + struct spdk_nvmf_fc_conn *fc_conn, + bool aq_conn) +{ + struct nvmf_fc_ls_op_ctx *opd; + struct spdk_nvmf_fc_ls_add_conn_api_data *api_data; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Add Connection to poller for " + "assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id, + fc_conn->conn_id); + + opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx)); + if (!opd) { + SPDK_ERRLOG("allocate api data for add conn op failed\n"); + nvmf_fc_ls_add_conn_failure(assoc, ls_rqst, fc_conn, aq_conn); + return; + } + + /* insert 
conn in association's connection list */ + api_data = &opd->u.add_conn; + assoc->conn_count++; + + api_data->args.fc_conn = fc_conn; + api_data->args.cb_info.cb_thread = spdk_get_thread(); + api_data->args.cb_info.cb_func = nvmf_fc_ls_add_conn_cb; + api_data->args.cb_info.cb_data = (void *)opd; + api_data->assoc = assoc; + api_data->ls_rqst = ls_rqst; + api_data->aq_conn = aq_conn; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "New QP callback called.\n"); + + /* Let the nvmf_tgt decide which pollgroup to use. */ + fc_conn->create_opd = opd; + spdk_nvmf_tgt_new_qpair(ls_rqst->nvmf_tgt, &fc_conn->qpair); +} + +/* Delete association functions */ + +static void +nvmf_fc_do_del_assoc_cbs(struct nvmf_fc_ls_op_ctx *opd, int ret) +{ + struct nvmf_fc_ls_op_ctx *nxt; + struct spdk_nvmf_fc_delete_assoc_api_data *dp; + + while (opd) { + dp = &opd->u.del_assoc; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "performing delete assoc. callback\n"); + dp->del_assoc_cb(dp->del_assoc_cb_data, ret); + + nxt = opd->next_op_ctx; + free(opd); + opd = nxt; + } +} + +static void +nvmf_fs_send_ls_disconnect_cb(void *hwqp, int32_t status, void *args) +{ + if (args) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "free disconnect buffers\n"); + nvmf_fc_free_srsr_bufs((struct spdk_nvmf_fc_srsr_bufs *)args); + } +} + +static void +nvmf_fc_del_all_conns_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct nvmf_fc_ls_op_ctx *opd = (struct nvmf_fc_ls_op_ctx *)cb_data; + struct spdk_nvmf_fc_delete_assoc_api_data *dp = &opd->u.del_assoc; + struct spdk_nvmf_fc_association *assoc = dp->assoc; + struct spdk_nvmf_fc_conn *fc_conn = dp->args.fc_conn; + + /* Assumption here is that there will be no error (i.e. ret=success). + * Since connections are deleted in parallel, nothing can be + * done anyway if there is an error because we need to complete + * all connection deletes and callback to caller */ + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "Delete all connections for assoc_id 0x%lx, conn_id = %lx\n", + assoc->assoc_id, fc_conn->conn_id); + + /* remove connection from association's connection list */ + TAILQ_REMOVE(&assoc->fc_conns, fc_conn, assoc_link); + nvmf_fc_ls_free_connection(fc_conn); + + if (--assoc->conn_count == 0) { + /* last connection - remove association from target port's association list */ + struct nvmf_fc_ls_op_ctx *cb_opd = (struct nvmf_fc_ls_op_ctx *)assoc->ls_del_op_ctx; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "remove assoc. 
%lx\n", assoc->assoc_id); + nvmf_fc_del_assoc_from_tgt_port(assoc); + + if (assoc->snd_disconn_bufs && + assoc->tgtport->fc_port->hw_port_status == SPDK_FC_PORT_ONLINE) { + + struct spdk_nvmf_fc_ls_disconnect_rqst *dc_rqst; + struct spdk_nvmf_fc_srsr_bufs *srsr_bufs; + + dc_rqst = (struct spdk_nvmf_fc_ls_disconnect_rqst *) + assoc->snd_disconn_bufs->rqst; + + bzero(dc_rqst, sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst)); + + /* fill in request descriptor */ + dc_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT; + to_be32(&dc_rqst->desc_list_len, + sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst) - + (2 * sizeof(uint32_t))); + + /* fill in disconnect command descriptor */ + to_be32(&dc_rqst->disconn_cmd.desc_tag, FCNVME_LSDESC_DISCONN_CMD); + to_be32(&dc_rqst->disconn_cmd.desc_len, + sizeof(struct spdk_nvmf_fc_lsdesc_disconn_cmd) - + (2 * sizeof(uint32_t))); + + /* fill in association id descriptor */ + to_be32(&dc_rqst->assoc_id.desc_tag, FCNVME_LSDESC_ASSOC_ID), + to_be32(&dc_rqst->assoc_id.desc_len, + sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id) - + (2 * sizeof(uint32_t))); + to_be64(&dc_rqst->assoc_id.association_id, assoc->assoc_id); + + srsr_bufs = assoc->snd_disconn_bufs; + assoc->snd_disconn_bufs = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Send LS disconnect\n"); + if (nvmf_fc_xmt_srsr_req(&assoc->tgtport->fc_port->ls_queue, + srsr_bufs, nvmf_fs_send_ls_disconnect_cb, + (void *)srsr_bufs)) { + SPDK_ERRLOG("Error sending LS disconnect\n"); + assoc->snd_disconn_bufs = srsr_bufs; + } + } + + nvmf_fc_ls_free_association(assoc); + + /* perform callbacks to all callers to delete association */ + nvmf_fc_do_del_assoc_cbs(cb_opd, 0); + + } + + free(opd); +} + +static void +nvmf_fc_kill_io_del_all_conns_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret) +{ + struct nvmf_fc_ls_op_ctx *opd = (struct nvmf_fc_ls_op_ctx *)cb_data; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Callback after killing outstanding ABTS."); + /* + * NOTE: We should not access any connection or association related data + * structures here. 
+ */ + free(opd); +} + + +/* Disconnect/delete (association) request functions */ + +static int +_nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport, + uint64_t assoc_id, bool send_abts, bool backend_initiated, + spdk_nvmf_fc_del_assoc_cb del_assoc_cb, + void *cb_data, bool from_ls_rqst) +{ + + struct nvmf_fc_ls_op_ctx *opd, *opd_tail, *opd_head = NULL; + struct spdk_nvmf_fc_delete_assoc_api_data *api_data; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_association *assoc = + nvmf_fc_ls_find_assoc(tgtport, assoc_id); + struct spdk_nvmf_fc_port *fc_port = tgtport->fc_port; + enum spdk_nvmf_fc_object_state assoc_state; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Delete association, " + "assoc_id 0x%lx\n", assoc_id); + + if (!assoc) { + SPDK_ERRLOG("Delete association failed: %s\n", + validation_errors[VERR_NO_ASSOC]); + return VERR_NO_ASSOC; + } + + /* create cb context to put in association's list of + * callbacks to call when delete association is done */ + opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx)); + if (!opd) { + SPDK_ERRLOG("Mem alloc failed for del assoc cb data"); + return -ENOMEM; + } + + api_data = &opd->u.del_assoc; + api_data->assoc = assoc; + api_data->from_ls_rqst = from_ls_rqst; + api_data->del_assoc_cb = del_assoc_cb; + api_data->del_assoc_cb_data = cb_data; + api_data->args.cb_info.cb_data = opd; + nvmf_fc_ls_append_del_cb_ctx(assoc, opd); + + assoc_state = assoc->assoc_state; + if ((assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) && + (fc_port->hw_port_status != SPDK_FC_PORT_QUIESCED)) { + /* association already being deleted */ + return 0; + } + + /* mark assoc. to be deleted */ + assoc->assoc_state = SPDK_NVMF_FC_OBJECT_TO_BE_DELETED; + + /* create a list of all connection to delete */ + TAILQ_FOREACH(fc_conn, &assoc->fc_conns, assoc_link) { + opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx)); + if (!opd) { /* hopefully this doesn't happen */ + SPDK_ERRLOG("Mem alloc failed for del conn op data"); + while (opd_head) { /* free any contexts already allocated */ + opd = opd_head; + opd_head = opd->next_op_ctx; + free(opd); + } + return -ENOMEM; + } + + api_data = &opd->u.del_assoc; + api_data->args.fc_conn = fc_conn; + api_data->assoc = assoc; + api_data->args.send_abts = send_abts; + api_data->args.backend_initiated = backend_initiated; + api_data->args.hwqp = nvmf_fc_get_hwqp_from_conn_id( + assoc->tgtport->fc_port->io_queues, + assoc->tgtport->fc_port->num_io_queues, + fc_conn->conn_id); + api_data->args.cb_info.cb_thread = spdk_get_thread(); + if ((fc_port->hw_port_status == SPDK_FC_PORT_QUIESCED) && + (assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED)) { + /* + * If there are any connections deletes or IO abts that are + * stuck because of firmware reset, a second invocation of + * SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION will result in + * outstanding connections & requests being killed and + * their corresponding callbacks being executed. 
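+ * In that case nvmf_fc_kill_io_del_all_conns_cb (above) is used as the
+ * completion callback; it only frees the per-call context and does not
+ * touch association bookkeeping.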
+ */ + api_data->args.cb_info.cb_func = nvmf_fc_kill_io_del_all_conns_cb; + } else { + api_data->args.cb_info.cb_func = nvmf_fc_del_all_conns_cb; + } + api_data->args.cb_info.cb_data = opd; + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "conn_id = %lx\n", fc_conn->conn_id); + + if (!opd_head) { + opd_head = opd; + } else { + opd_tail->next_op_ctx = opd; + } + opd_tail = opd; + } + + /* make poller api calls to delete connetions */ + while (opd_head) { + opd = opd_head; + opd_head = opd->next_op_ctx; + api_data = &opd->u.del_assoc; + nvmf_fc_poller_api_func(api_data->args.hwqp, + SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION, + &api_data->args); + } + + return 0; +} + +static void +nvmf_fc_ls_disconnect_assoc_cb(void *cb_data, uint32_t err) +{ + struct nvmf_fc_ls_op_ctx *opd = (struct nvmf_fc_ls_op_ctx *)cb_data; + struct spdk_nvmf_fc_ls_disconn_assoc_api_data *dp = &opd->u.disconn_assoc; + struct spdk_nvmf_fc_nport *tgtport = dp->tgtport; + struct spdk_nvmf_fc_ls_rqst *ls_rqst = dp->ls_rqst; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Disconnect association callback begin " + "nport %d\n", tgtport->nport_hdl); + if (err != 0) { + /* send failure response */ + struct spdk_nvmf_fc_ls_cr_assoc_rqst *rqst = + (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt; + struct spdk_nvmf_fc_ls_cr_assoc_acc *acc = + (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt; + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, + FCNVME_MAX_LS_BUFFER_SIZE, + rqst->w0.ls_cmd, + FCNVME_RJT_RC_UNAB, + FCNVME_RJT_EXP_NONE, + 0); + } + + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); + + free(opd); + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Disconnect association callback complete " + "nport %d err %d\n", tgtport->nport_hdl, err); +} + +static void +nvmf_fc_ls_disconnect_assoc(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_ls_rqst *ls_rqst, uint64_t assoc_id) +{ + struct nvmf_fc_ls_op_ctx *opd; + struct spdk_nvmf_fc_ls_cr_assoc_rqst *rqst = + (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt; + struct spdk_nvmf_fc_ls_cr_assoc_acc *acc = + (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt; + struct spdk_nvmf_fc_ls_disconn_assoc_api_data *api_data; + int ret; + uint8_t reason = 0; + + opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx)); + if (!opd) { + /* send failure response */ + SPDK_ERRLOG("Allocate disconn assoc op data failed\n"); + reason = FCNVME_RJT_RC_INSUFF_RES; + goto send_rjt; + } + + api_data = &opd->u.disconn_assoc; + api_data->tgtport = tgtport; + api_data->ls_rqst = ls_rqst; + ret = _nvmf_fc_delete_association(tgtport, assoc_id, + false, false, + nvmf_fc_ls_disconnect_assoc_cb, + api_data, true); + if (!ret) { + return; + } + + /* delete association failed */ + switch (ret) { + case VERR_NO_ASSOC: + reason = FCNVME_RJT_RC_INV_ASSOC; + break; + case -ENOMEM: + reason = FCNVME_RJT_RC_INSUFF_RES; + break; + default: + reason = FCNVME_RJT_RC_LOGIC; + } + + free(opd); + +send_rjt: + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, + FCNVME_MAX_LS_BUFFER_SIZE, + rqst->w0.ls_cmd, reason, + FCNVME_RJT_EXP_NONE, 0); + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); +} + +static int +nvmf_fc_ls_validate_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + + if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) { + return -EPERM; + } + + return 0; +} + +/* **************************** */ +/* LS Reqeust Handler Functions */ + +static void +nvmf_fc_ls_process_cass(uint32_t s_id, + struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_ls_rqst *ls_rqst) +{ + struct 
spdk_nvmf_fc_ls_cr_assoc_rqst *rqst = + (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt; + struct spdk_nvmf_fc_ls_cr_assoc_acc *acc = + (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt; + struct spdk_nvmf_fc_association *assoc; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_subsystem *subsystem = NULL; + const char *hostnqn = (const char *)rqst->assoc_cmd.hostnqn; + int errmsg_ind = 0; + uint8_t rc = FCNVME_RJT_RC_NONE; + uint8_t ec = FCNVME_RJT_EXP_NONE; + struct spdk_nvmf_transport *transport = spdk_nvmf_tgt_get_transport(ls_rqst->nvmf_tgt, + SPDK_NVME_TRANSPORT_NAME_FC); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "LS_CASS: ls_rqst_len=%d, desc_list_len=%d, cmd_len=%d, sq_size=%d, " + "Subnqn: %s, Hostnqn: %s, Tgtport nn:%lx, pn:%lx\n", + ls_rqst->rqst_len, from_be32(&rqst->desc_list_len), + from_be32(&rqst->assoc_cmd.desc_len), + from_be32(&rqst->assoc_cmd.sqsize), + rqst->assoc_cmd.subnqn, hostnqn, + tgtport->fc_nodename.u.wwn, tgtport->fc_portname.u.wwn); + + if (ls_rqst->rqst_len < FCNVME_LS_CA_CMD_MIN_LEN) { + SPDK_ERRLOG("assoc_cmd req len = %d, should be at least %d\n", + ls_rqst->rqst_len, FCNVME_LS_CA_CMD_MIN_LEN); + errmsg_ind = VERR_CR_ASSOC_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (from_be32(&rqst->desc_list_len) < + FCNVME_LS_CA_DESC_LIST_MIN_LEN) { + SPDK_ERRLOG("assoc_cmd desc list len = %d, should be at least %d\n", + from_be32(&rqst->desc_list_len), + FCNVME_LS_CA_DESC_LIST_MIN_LEN); + errmsg_ind = VERR_CR_ASSOC_RQST_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->assoc_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD)) { + errmsg_ind = VERR_CR_ASSOC_CMD; + rc = FCNVME_RJT_RC_INV_PARAM; + } else if (from_be32(&rqst->assoc_cmd.desc_len) < + FCNVME_LS_CA_DESC_MIN_LEN) { + SPDK_ERRLOG("assoc_cmd desc len = %d, should be at least %d\n", + from_be32(&rqst->assoc_cmd.desc_len), + FCNVME_LS_CA_DESC_MIN_LEN); + errmsg_ind = VERR_CR_ASSOC_CMD_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (!rqst->assoc_cmd.ersp_ratio || + (from_be16(&rqst->assoc_cmd.ersp_ratio) >= + from_be16(&rqst->assoc_cmd.sqsize))) { + errmsg_ind = VERR_ERSP_RATIO; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_ESRP; + } else if (from_be16(&rqst->assoc_cmd.sqsize) == 0 || + from_be16(&rqst->assoc_cmd.sqsize) > transport->opts.max_aq_depth) { + errmsg_ind = VERR_SQSIZE; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_SQ_SIZE; + } + + if (rc != FCNVME_RJT_RC_NONE) { + goto rjt_cass; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(ls_rqst->nvmf_tgt, rqst->assoc_cmd.subnqn); + if (subsystem == NULL) { + errmsg_ind = VERR_SUBNQN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_SUBNQN; + goto rjt_cass; + } + + if (nvmf_fc_ls_validate_host(subsystem, hostnqn)) { + errmsg_ind = VERR_HOSTNQN; + rc = FCNVME_RJT_RC_INV_HOST; + ec = FCNVME_RJT_EXP_INV_HOSTNQN; + goto rjt_cass; + } + + /* get new association */ + assoc = nvmf_fc_ls_new_association(s_id, tgtport, ls_rqst->rport, + &rqst->assoc_cmd, subsystem, + ls_rqst->rpi, transport); + if (!assoc) { + errmsg_ind = VERR_ASSOC_ALLOC_FAIL; + rc = FCNVME_RJT_RC_INSUFF_RES; + ec = FCNVME_RJT_EXP_NONE; + goto rjt_cass; + } + + /* alloc admin q (i.e. 
connection) */ + fc_conn = nvmf_fc_ls_new_connection(assoc, 0, + from_be16(&rqst->assoc_cmd.ersp_ratio), + ls_rqst->rpi, + from_be16(&rqst->assoc_cmd.sqsize), + tgtport); + if (!fc_conn) { + nvmf_fc_ls_free_association(assoc); + errmsg_ind = VERR_CONN_ALLOC_FAIL; + rc = FCNVME_RJT_RC_INSUFF_RES; + ec = FCNVME_RJT_EXP_NONE; + goto rjt_cass; + } + + /* format accept response */ + bzero(acc, sizeof(*acc)); + ls_rqst->rsp_len = sizeof(*acc); + + nvmf_fc_ls_format_rsp_hdr(acc, FCNVME_LS_ACC, + nvmf_fc_lsdesc_len( + sizeof(struct spdk_nvmf_fc_ls_cr_assoc_acc)), + FCNVME_LS_CREATE_ASSOCIATION); + to_be32(&acc->assoc_id.desc_tag, FCNVME_LSDESC_ASSOC_ID); + acc->assoc_id.desc_len = + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id)); + to_be32(&acc->conn_id.desc_tag, FCNVME_LSDESC_CONN_ID); + acc->conn_id.desc_len = + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_conn_id)); + + /* assign connection to HWQP poller - also sends response */ + nvmf_fc_ls_add_conn_to_poller(assoc, ls_rqst, fc_conn, true); + + return; + +rjt_cass: + SPDK_ERRLOG("Create Association LS failed: %s\n", validation_errors[errmsg_ind]); + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, FCNVME_MAX_LS_BUFFER_SIZE, + rqst->w0.ls_cmd, rc, ec, 0); + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); +} + +static void +nvmf_fc_ls_process_cioc(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_ls_rqst *ls_rqst) +{ + struct spdk_nvmf_fc_ls_cr_conn_rqst *rqst = + (struct spdk_nvmf_fc_ls_cr_conn_rqst *)ls_rqst->rqstbuf.virt; + struct spdk_nvmf_fc_ls_cr_conn_acc *acc = + (struct spdk_nvmf_fc_ls_cr_conn_acc *)ls_rqst->rspbuf.virt; + struct spdk_nvmf_fc_association *assoc; + struct spdk_nvmf_fc_conn *fc_conn = NULL; + int errmsg_ind = 0; + uint8_t rc = FCNVME_RJT_RC_NONE; + uint8_t ec = FCNVME_RJT_EXP_NONE; + struct spdk_nvmf_transport *transport = spdk_nvmf_tgt_get_transport(ls_rqst->nvmf_tgt, + SPDK_NVME_TRANSPORT_NAME_FC); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "LS_CIOC: ls_rqst_len=%d, desc_list_len=%d, cmd_len=%d, " + "assoc_id=0x%lx, sq_size=%d, esrp=%d, Tgtport nn:%lx, pn:%lx\n", + ls_rqst->rqst_len, from_be32(&rqst->desc_list_len), + from_be32(&rqst->connect_cmd.desc_len), + from_be64(&rqst->assoc_id.association_id), + from_be32(&rqst->connect_cmd.sqsize), + from_be32(&rqst->connect_cmd.ersp_ratio), + tgtport->fc_nodename.u.wwn, tgtport->fc_portname.u.wwn); + + if (ls_rqst->rqst_len < sizeof(struct spdk_nvmf_fc_ls_cr_conn_rqst)) { + errmsg_ind = VERR_CR_CONN_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->desc_list_len != + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_ls_cr_conn_rqst))) { + errmsg_ind = VERR_CR_CONN_RQST_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->assoc_id.desc_tag != + cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) { + errmsg_ind = VERR_ASSOC_ID; + rc = FCNVME_RJT_RC_INV_PARAM; + } else if (rqst->assoc_id.desc_len != + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id))) { + errmsg_ind = VERR_ASSOC_ID_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->connect_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CREATE_CONN_CMD)) { + errmsg_ind = VERR_CR_CONN_CMD; + rc = FCNVME_RJT_RC_INV_PARAM; + } else if (rqst->connect_cmd.desc_len != + nvmf_fc_lsdesc_len( + sizeof(struct spdk_nvmf_fc_lsdesc_cr_conn_cmd))) { + errmsg_ind = VERR_CR_CONN_CMD_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (!rqst->connect_cmd.ersp_ratio || + 
(from_be16(&rqst->connect_cmd.ersp_ratio) >= + from_be16(&rqst->connect_cmd.sqsize))) { + errmsg_ind = VERR_ERSP_RATIO; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_ESRP; + } else if (from_be16(&rqst->connect_cmd.sqsize) == 0 || + from_be16(&rqst->connect_cmd.sqsize) > transport->opts.max_queue_depth) { + errmsg_ind = VERR_SQSIZE; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_SQ_SIZE; + } + + if (rc != FCNVME_RJT_RC_NONE) { + goto rjt_cioc; + } + + /* find association */ + assoc = nvmf_fc_ls_find_assoc(tgtport, + from_be64(&rqst->assoc_id.association_id)); + if (!assoc) { + errmsg_ind = VERR_NO_ASSOC; + rc = FCNVME_RJT_RC_INV_ASSOC; + } else if (assoc->assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) { + /* association is being deleted - don't allow more connections */ + errmsg_ind = VERR_NO_ASSOC; + rc = FCNVME_RJT_RC_INV_ASSOC; + } else if (assoc->conn_count >= transport->opts.max_qpairs_per_ctrlr) { + errmsg_ind = VERR_CONN_TOO_MANY; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_Q_ID; + } + + if (rc != FCNVME_RJT_RC_NONE) { + goto rjt_cioc; + } + + fc_conn = nvmf_fc_ls_new_connection(assoc, from_be16(&rqst->connect_cmd.qid), + from_be16(&rqst->connect_cmd.ersp_ratio), + ls_rqst->rpi, + from_be16(&rqst->connect_cmd.sqsize), + tgtport); + if (!fc_conn) { + errmsg_ind = VERR_CONN_ALLOC_FAIL; + rc = FCNVME_RJT_RC_INSUFF_RES; + ec = FCNVME_RJT_EXP_NONE; + goto rjt_cioc; + } + + /* format accept response */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Formatting LS accept response for " + "assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id, + fc_conn->conn_id); + bzero(acc, sizeof(*acc)); + ls_rqst->rsp_len = sizeof(*acc); + nvmf_fc_ls_format_rsp_hdr(acc, FCNVME_LS_ACC, + nvmf_fc_lsdesc_len( + sizeof(struct spdk_nvmf_fc_ls_cr_conn_acc)), + FCNVME_LS_CREATE_CONNECTION); + to_be32(&acc->conn_id.desc_tag, FCNVME_LSDESC_CONN_ID); + acc->conn_id.desc_len = + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_conn_id)); + + /* assign connection to HWQP poller - also sends response */ + nvmf_fc_ls_add_conn_to_poller(assoc, ls_rqst, fc_conn, false); + + return; + +rjt_cioc: + SPDK_ERRLOG("Create Connection LS failed: %s\n", validation_errors[errmsg_ind]); + + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, FCNVME_MAX_LS_BUFFER_SIZE, + rqst->w0.ls_cmd, rc, ec, 0); + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); +} + +static void +nvmf_fc_ls_process_disc(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_ls_rqst *ls_rqst) +{ + struct spdk_nvmf_fc_ls_disconnect_rqst *rqst = + (struct spdk_nvmf_fc_ls_disconnect_rqst *)ls_rqst->rqstbuf.virt; + struct spdk_nvmf_fc_ls_disconnect_acc *acc = + (struct spdk_nvmf_fc_ls_disconnect_acc *)ls_rqst->rspbuf.virt; + struct spdk_nvmf_fc_association *assoc; + int errmsg_ind = 0; + uint8_t rc = FCNVME_RJT_RC_NONE; + uint8_t ec = FCNVME_RJT_EXP_NONE; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, + "LS_DISC: ls_rqst_len=%d, desc_list_len=%d, cmd_len=%d," + "assoc_id=0x%lx\n", + ls_rqst->rqst_len, from_be32(&rqst->desc_list_len), + from_be32(&rqst->disconn_cmd.desc_len), + from_be64(&rqst->assoc_id.association_id)); + + if (ls_rqst->rqst_len < sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst)) { + errmsg_ind = VERR_DISCONN_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->desc_list_len != + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst))) { + errmsg_ind = VERR_DISCONN_RQST_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->assoc_id.desc_tag != + 
cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) { + errmsg_ind = VERR_ASSOC_ID; + rc = FCNVME_RJT_RC_INV_PARAM; + } else if (rqst->assoc_id.desc_len != + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id))) { + errmsg_ind = VERR_ASSOC_ID_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } else if (rqst->disconn_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_DISCONN_CMD)) { + rc = FCNVME_RJT_RC_INV_PARAM; + errmsg_ind = VERR_DISCONN_CMD; + } else if (rqst->disconn_cmd.desc_len != + nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_disconn_cmd))) { + errmsg_ind = VERR_DISCONN_CMD_LEN; + rc = FCNVME_RJT_RC_INV_PARAM; + ec = FCNVME_RJT_EXP_INV_LEN; + } + + if (rc != FCNVME_RJT_RC_NONE) { + goto rjt_disc; + } + + /* match an active association */ + assoc = nvmf_fc_ls_find_assoc(tgtport, + from_be64(&rqst->assoc_id.association_id)); + if (!assoc) { + errmsg_ind = VERR_NO_ASSOC; + rc = FCNVME_RJT_RC_INV_ASSOC; + goto rjt_disc; + } + + /* format response */ + bzero(acc, sizeof(*acc)); + ls_rqst->rsp_len = sizeof(*acc); + + nvmf_fc_ls_format_rsp_hdr(acc, FCNVME_LS_ACC, + nvmf_fc_lsdesc_len( + sizeof(struct spdk_nvmf_fc_ls_disconnect_acc)), + FCNVME_LS_DISCONNECT); + + nvmf_fc_ls_disconnect_assoc(tgtport, ls_rqst, assoc->assoc_id); + return; + +rjt_disc: + SPDK_ERRLOG("Disconnect LS failed: %s\n", validation_errors[errmsg_ind]); + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, FCNVME_MAX_LS_BUFFER_SIZE, + rqst->w0.ls_cmd, rc, ec, 0); + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); +} + +/* ************************ */ +/* external functions */ + +void +nvmf_fc_ls_init(struct spdk_nvmf_fc_port *fc_port) +{ +} + +void +nvmf_fc_ls_fini(struct spdk_nvmf_fc_port *fc_port) +{ +} + +void +nvmf_fc_handle_ls_rqst(struct spdk_nvmf_fc_ls_rqst *ls_rqst) +{ + struct spdk_nvmf_fc_ls_rqst_w0 *w0 = + (struct spdk_nvmf_fc_ls_rqst_w0 *)ls_rqst->rqstbuf.virt; + uint32_t s_id = ls_rqst->s_id; + struct spdk_nvmf_fc_nport *tgtport = ls_rqst->nport; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "LS cmd=%d\n", w0->ls_cmd); + + switch (w0->ls_cmd) { + case FCNVME_LS_CREATE_ASSOCIATION: + nvmf_fc_ls_process_cass(s_id, tgtport, ls_rqst); + break; + case FCNVME_LS_CREATE_CONNECTION: + nvmf_fc_ls_process_cioc(tgtport, ls_rqst); + break; + case FCNVME_LS_DISCONNECT: + nvmf_fc_ls_process_disc(tgtport, ls_rqst); + break; + default: + SPDK_ERRLOG("Invalid LS cmd=%d\n", w0->ls_cmd); + ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(ls_rqst->rspbuf.virt, + FCNVME_MAX_LS_BUFFER_SIZE, w0->ls_cmd, + FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0); + nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst); + } +} + +int +nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport, + uint64_t assoc_id, bool send_abts, bool backend_initiated, + spdk_nvmf_fc_del_assoc_cb del_assoc_cb, + void *cb_data) +{ + return _nvmf_fc_delete_association(tgtport, assoc_id, send_abts, backend_initiated, + del_assoc_cb, cb_data, false); +} + +static void +nvmf_fc_poller_api_cb_event(void *arg) +{ + struct spdk_nvmf_fc_poller_api_cb_info *cb_info = + (struct spdk_nvmf_fc_poller_api_cb_info *) arg; + + assert(cb_info != NULL); + cb_info->cb_func(cb_info->cb_data, cb_info->ret); +} + +static void +nvmf_fc_poller_api_perform_cb(struct spdk_nvmf_fc_poller_api_cb_info *cb_info, + enum spdk_nvmf_fc_poller_api_ret ret) +{ + if (cb_info->cb_func && cb_info->cb_thread) { + cb_info->ret = ret; + /* callback to master thread */ + spdk_thread_send_msg(cb_info->cb_thread, nvmf_fc_poller_api_cb_event, + (void *) cb_info); + } +} + +static void +nvmf_fc_poller_api_add_connection(void *arg) +{ + 
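+ /* This handler runs on the hwqp's thread, dispatched via spdk_thread_send_msg()
+  * from nvmf_fc_poller_api_func(). It rejects the add if a connection with the
+  * same conn_id is already on the hwqp's connection_list; otherwise it links the
+  * new fc_conn in, then reports the result back to the requesting thread through
+  * nvmf_fc_poller_api_perform_cb(). */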
enum spdk_nvmf_fc_poller_api_ret ret = SPDK_NVMF_FC_POLLER_API_SUCCESS; + struct spdk_nvmf_fc_poller_api_add_connection_args *conn_args = + (struct spdk_nvmf_fc_poller_api_add_connection_args *)arg; + struct spdk_nvmf_fc_conn *fc_conn; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Poller add connection, conn_id 0x%lx\n", + conn_args->fc_conn->conn_id); + + /* make sure connection is not already in poller's list */ + fc_conn = nvmf_fc_hwqp_find_fc_conn(conn_args->fc_conn->hwqp, + conn_args->fc_conn->conn_id); + if (fc_conn) { + SPDK_ERRLOG("duplicate connection found"); + ret = SPDK_NVMF_FC_POLLER_API_DUP_CONN_ID; + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, + "conn_id=%lx", fc_conn->conn_id); + TAILQ_INSERT_TAIL(&conn_args->fc_conn->hwqp->connection_list, + conn_args->fc_conn, link); + } + + /* perform callback */ + nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, ret); +} + +static void +nvmf_fc_poller_api_quiesce_queue(void *arg) +{ + struct spdk_nvmf_fc_poller_api_quiesce_queue_args *q_args = + (struct spdk_nvmf_fc_poller_api_quiesce_queue_args *) arg; + struct spdk_nvmf_fc_request *fc_req = NULL, *tmp; + + /* should be already, but make sure queue is quiesced */ + q_args->hwqp->state = SPDK_FC_HWQP_OFFLINE; + + /* + * Kill all the outstanding commands that are in the transfer state and + * in the process of being aborted. + * We can run into this situation if an adapter reset happens when an I_T Nexus delete + * is in progress. + */ + TAILQ_FOREACH_SAFE(fc_req, &q_args->hwqp->in_use_reqs, link, tmp) { + if (nvmf_fc_req_in_xfer(fc_req) && fc_req->is_aborted == true) { + nvmf_fc_poller_api_func(q_args->hwqp, SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE, + (void *)fc_req); + } + } + + /* perform callback */ + nvmf_fc_poller_api_perform_cb(&q_args->cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS); +} + +static void +nvmf_fc_poller_api_activate_queue(void *arg) +{ + struct spdk_nvmf_fc_poller_api_quiesce_queue_args *q_args = + (struct spdk_nvmf_fc_poller_api_quiesce_queue_args *) arg; + + q_args->hwqp->state = SPDK_FC_HWQP_ONLINE; + + /* perform callback */ + nvmf_fc_poller_api_perform_cb(&q_args->cb_info, 0); +} + +static void +nvmf_fc_disconnect_qpair_cb(void *ctx) +{ + struct spdk_nvmf_fc_poller_api_cb_info *cb_info = ctx; + /* perform callback */ + nvmf_fc_poller_api_perform_cb(cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS); +} + +static void +nvmf_fc_poller_conn_abort_done(void *hwqp, int32_t status, void *cb_args) +{ + struct spdk_nvmf_fc_poller_api_del_connection_args *conn_args = cb_args; + + if (conn_args->fc_request_cnt) { + conn_args->fc_request_cnt -= 1; + } + + if (!conn_args->fc_request_cnt) { + if (!TAILQ_EMPTY(&conn_args->hwqp->connection_list)) { + /* All the requests for this connection are aborted. */ + TAILQ_REMOVE(&conn_args->hwqp->connection_list, conn_args->fc_conn, link); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Connection deleted, conn_id 0x%lx\n", + conn_args->fc_conn->conn_id); + + if (!conn_args->backend_initiated) { + /* disconnect qpair from nvmf controller */ + spdk_nvmf_qpair_disconnect(&conn_args->fc_conn->qpair, + nvmf_fc_disconnect_qpair_cb, &conn_args->cb_info); + } + } else { + /* + * Duplicate connection delete can happen if one is + * coming in via an association disconnect and the other + * is initiated by a port reset. 
+ */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Duplicate conn delete."); + /* perform callback */ + nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS); + } + } +} + +static void +nvmf_fc_poller_api_del_connection(void *arg) +{ + struct spdk_nvmf_fc_poller_api_del_connection_args *conn_args = + (struct spdk_nvmf_fc_poller_api_del_connection_args *)arg; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_request *fc_req = NULL, *tmp; + struct spdk_nvmf_fc_hwqp *hwqp = conn_args->hwqp; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Poller delete connection, conn_id 0x%lx\n", + conn_args->fc_conn->conn_id); + + /* find the connection in poller's list */ + fc_conn = nvmf_fc_hwqp_find_fc_conn(hwqp, conn_args->fc_conn->conn_id); + if (!fc_conn) { + /* perform callback */ + nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, SPDK_NVMF_FC_POLLER_API_NO_CONN_ID); + return; + } + + conn_args->fc_request_cnt = 0; + + TAILQ_FOREACH_SAFE(fc_req, &hwqp->in_use_reqs, link, tmp) { + if (fc_req->fc_conn->conn_id == fc_conn->conn_id) { + if (nvmf_qpair_is_admin_queue(&fc_conn->qpair) && + (fc_req->req.cmd->nvme_cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST)) { + /* AER will be cleaned by spdk_nvmf_qpair_disconnect. */ + continue; + } + + conn_args->fc_request_cnt += 1; + nvmf_fc_request_abort(fc_req, conn_args->send_abts, + nvmf_fc_poller_conn_abort_done, + conn_args); + } + } + + if (!conn_args->fc_request_cnt) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Connection deleted.\n"); + TAILQ_REMOVE(&hwqp->connection_list, fc_conn, link); + + if (!conn_args->backend_initiated) { + /* disconnect qpair from nvmf controller */ + spdk_nvmf_qpair_disconnect(&fc_conn->qpair, nvmf_fc_disconnect_qpair_cb, + &conn_args->cb_info); + } + } +} + +static void +nvmf_fc_poller_abts_done(void *hwqp, int32_t status, void *cb_args) +{ + struct spdk_nvmf_fc_poller_api_abts_recvd_args *args = cb_args; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, + "ABTS poller done, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n", + args->ctx->rpi, args->ctx->oxid, args->ctx->rxid); + + nvmf_fc_poller_api_perform_cb(&args->cb_info, + SPDK_NVMF_FC_POLLER_API_SUCCESS); +} + +static void +nvmf_fc_poller_api_abts_received(void *arg) +{ + struct spdk_nvmf_fc_poller_api_abts_recvd_args *args = arg; + struct spdk_nvmf_fc_request *fc_req = NULL; + struct spdk_nvmf_fc_hwqp *hwqp = args->hwqp; + + TAILQ_FOREACH(fc_req, &hwqp->in_use_reqs, link) { + if ((fc_req->rpi == args->ctx->rpi) && + (fc_req->oxid == args->ctx->oxid)) { + nvmf_fc_request_abort(fc_req, false, + nvmf_fc_poller_abts_done, args); + return; + } + } + + nvmf_fc_poller_api_perform_cb(&args->cb_info, + SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND); +} + +static void +nvmf_fc_poller_api_queue_sync(void *arg) +{ + struct spdk_nvmf_fc_poller_api_queue_sync_args *args = arg; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, + "HWQP sync requested for u_id = 0x%lx\n", args->u_id); + + /* Add this args to hwqp sync_cb list */ + TAILQ_INSERT_TAIL(&args->hwqp->sync_cbs, args, link); +} + +static void +nvmf_fc_poller_api_queue_sync_done(void *arg) +{ + struct spdk_nvmf_fc_poller_api_queue_sync_done_args *args = arg; + struct spdk_nvmf_fc_hwqp *hwqp = args->hwqp; + uint64_t tag = args->tag; + struct spdk_nvmf_fc_poller_api_queue_sync_args *sync_args = NULL, *tmp = NULL; + + assert(args != NULL); + + TAILQ_FOREACH_SAFE(sync_args, &hwqp->sync_cbs, link, tmp) { + if (sync_args->u_id == tag) { + /* Queue successfully synced. 
Remove from cb list */ + TAILQ_REMOVE(&hwqp->sync_cbs, sync_args, link); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, + "HWQP sync done for u_id = 0x%lx\n", sync_args->u_id); + + /* Return the status to poller */ + nvmf_fc_poller_api_perform_cb(&sync_args->cb_info, + SPDK_NVMF_FC_POLLER_API_SUCCESS); + return; + } + } + + free(arg); + /* note: no callback from this api */ +} + +static void +nvmf_fc_poller_api_add_hwqp(void *arg) +{ + struct spdk_nvmf_fc_hwqp *hwqp = (struct spdk_nvmf_fc_hwqp *)arg; + + hwqp->lcore_id = spdk_env_get_current_core(); /* for tracing purposes only */ + TAILQ_INSERT_TAIL(&hwqp->fgroup->hwqp_list, hwqp, link); + /* note: no callback from this api */ +} + +static void +nvmf_fc_poller_api_remove_hwqp(void *arg) +{ + struct spdk_nvmf_fc_hwqp *hwqp = (struct spdk_nvmf_fc_hwqp *)arg; + struct spdk_nvmf_fc_poll_group *fgroup = hwqp->fgroup; + + TAILQ_REMOVE(&fgroup->hwqp_list, hwqp, link); + hwqp->fgroup = NULL; + /* note: no callback from this api */ +} + +enum spdk_nvmf_fc_poller_api_ret +nvmf_fc_poller_api_func(struct spdk_nvmf_fc_hwqp *hwqp, enum spdk_nvmf_fc_poller_api api, + void *api_args) { + switch (api) + { + case SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_add_connection, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_del_connection, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE: + /* quiesce q polling now, don't wait for poller to do it */ + hwqp->state = SPDK_FC_HWQP_OFFLINE; + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_quiesce_queue, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_ACTIVATE_QUEUE: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_activate_queue, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_abts_received, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_request_abort_complete, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_queue_sync, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC_DONE: + spdk_thread_send_msg(hwqp->thread, + nvmf_fc_poller_api_queue_sync_done, api_args); + break; + + case SPDK_NVMF_FC_POLLER_API_ADD_HWQP: + spdk_thread_send_msg(hwqp->thread, nvmf_fc_poller_api_add_hwqp, (void *) hwqp); + break; + + case SPDK_NVMF_FC_POLLER_API_REMOVE_HWQP: + spdk_thread_send_msg(hwqp->thread, nvmf_fc_poller_api_remove_hwqp, (void *) hwqp); + break; + + case SPDK_NVMF_FC_POLLER_API_ADAPTER_EVENT: + case SPDK_NVMF_FC_POLLER_API_AEN: + default: + SPDK_ERRLOG("BAD ARG!"); + return SPDK_NVMF_FC_POLLER_API_INVALID_ARG; + } + + return SPDK_NVMF_FC_POLLER_API_SUCCESS; +} + +SPDK_LOG_REGISTER_COMPONENT("nvmf_fc_poller_api", SPDK_LOG_NVMF_FC_POLLER_API) +SPDK_LOG_REGISTER_COMPONENT("nvmf_fc_ls", SPDK_LOG_NVMF_FC_LS) diff --git a/src/spdk/lib/nvmf/nvmf.c b/src/spdk/lib/nvmf/nvmf.c new file mode 100644 index 000000000..73fa0742e --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf.c @@ -0,0 +1,1457 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2018-2019 Mellanox Technologies LTD. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/bit_array.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/nvmf.h" +#include "spdk/trace.h" +#include "spdk/endian.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "nvmf_internal.h" +#include "transport.h" + +SPDK_LOG_REGISTER_COMPONENT("nvmf", SPDK_LOG_NVMF) + +#define SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS 1024 + +static TAILQ_HEAD(, spdk_nvmf_tgt) g_nvmf_tgts = TAILQ_HEAD_INITIALIZER(g_nvmf_tgts); + +typedef void (*nvmf_qpair_disconnect_cpl)(void *ctx, int status); +static void nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf); + +/* supplied to a single call to nvmf_qpair_disconnect */ +struct nvmf_qpair_disconnect_ctx { + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_ctrlr *ctrlr; + nvmf_qpair_disconnect_cb cb_fn; + struct spdk_thread *thread; + void *ctx; + uint16_t qid; +}; + +/* + * There are several times when we need to iterate through the list of all qpairs and selectively delete them. + * In order to do this sequentially without overlap, we must provide a context to recover the next qpair from + * to enable calling nvmf_qpair_disconnect on the next desired qpair. + */ +struct nvmf_qpair_disconnect_many_ctx { + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + spdk_nvmf_poll_group_mod_done cpl_fn; + void *cpl_ctx; +}; + +static void +nvmf_qpair_set_state(struct spdk_nvmf_qpair *qpair, + enum spdk_nvmf_qpair_state state) +{ + assert(qpair != NULL); + assert(qpair->group->thread == spdk_get_thread()); + + qpair->state = state; +} + +static int +nvmf_poll_group_poll(void *ctx) +{ + struct spdk_nvmf_poll_group *group = ctx; + int rc; + int count = 0; + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + rc = nvmf_transport_poll_group_poll(tgroup); + if (rc < 0) { + return SPDK_POLLER_BUSY; + } + count += rc; + } + + return count > 0 ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +} + +static int +nvmf_tgt_create_poll_group(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_tgt *tgt = io_device; + struct spdk_nvmf_poll_group *group = ctx_buf; + struct spdk_nvmf_transport *transport; + uint32_t sid; + + TAILQ_INIT(&group->tgroups); + TAILQ_INIT(&group->qpairs); + + TAILQ_FOREACH(transport, &tgt->transports, link) { + nvmf_poll_group_add_transport(group, transport); + } + + group->num_sgroups = tgt->max_subsystems; + group->sgroups = calloc(tgt->max_subsystems, sizeof(struct spdk_nvmf_subsystem_poll_group)); + if (!group->sgroups) { + return -ENOMEM; + } + + for (sid = 0; sid < tgt->max_subsystems; sid++) { + struct spdk_nvmf_subsystem *subsystem; + + subsystem = tgt->subsystems[sid]; + if (!subsystem) { + continue; + } + + if (nvmf_poll_group_add_subsystem(group, subsystem, NULL, NULL) != 0) { + nvmf_tgt_destroy_poll_group(io_device, ctx_buf); + return -1; + } + } + + pthread_mutex_lock(&tgt->mutex); + TAILQ_INSERT_TAIL(&tgt->poll_groups, group, link); + pthread_mutex_unlock(&tgt->mutex); + + group->poller = SPDK_POLLER_REGISTER(nvmf_poll_group_poll, group, 0); + group->thread = spdk_get_thread(); + + return 0; +} + +static void +nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_tgt *tgt = io_device; + struct spdk_nvmf_poll_group *group = ctx_buf; + struct spdk_nvmf_transport_poll_group *tgroup, *tmp; + struct spdk_nvmf_subsystem_poll_group *sgroup; + uint32_t sid, nsid; + + pthread_mutex_lock(&tgt->mutex); + TAILQ_REMOVE(&tgt->poll_groups, group, link); + pthread_mutex_unlock(&tgt->mutex); + + TAILQ_FOREACH_SAFE(tgroup, &group->tgroups, link, tmp) { + TAILQ_REMOVE(&group->tgroups, tgroup, link); + nvmf_transport_poll_group_destroy(tgroup); + } + + for (sid = 0; sid < group->num_sgroups; sid++) { + sgroup = &group->sgroups[sid]; + + for (nsid = 0; nsid < sgroup->num_ns; nsid++) { + if (sgroup->ns_info[nsid].channel) { + spdk_put_io_channel(sgroup->ns_info[nsid].channel); + sgroup->ns_info[nsid].channel = NULL; + } + } + + free(sgroup->ns_info); + } + + free(group->sgroups); + + if (group->destroy_cb_fn) { + group->destroy_cb_fn(group->destroy_cb_arg, 0); + } +} + +static void +_nvmf_tgt_disconnect_next_qpair(void *ctx) +{ + struct spdk_nvmf_qpair *qpair; + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx; + struct spdk_nvmf_poll_group *group = qpair_ctx->group; + struct spdk_io_channel *ch; + int rc = 0; + + qpair = TAILQ_FIRST(&group->qpairs); + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_tgt_disconnect_next_qpair, ctx); + } + + if (!qpair || rc != 0) { + /* When the refcount from the channels reaches 0, nvmf_tgt_destroy_poll_group will be called. 
*/ + ch = spdk_io_channel_from_ctx(group); + spdk_put_io_channel(ch); + free(qpair_ctx); + } +} + +static void +nvmf_tgt_destroy_poll_group_qpairs(struct spdk_nvmf_poll_group *group) +{ + struct nvmf_qpair_disconnect_many_ctx *ctx; + + ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx)); + + if (!ctx) { + SPDK_ERRLOG("Failed to allocate memory for destroy poll group ctx\n"); + return; + } + + spdk_poller_unregister(&group->poller); + + ctx->group = group; + _nvmf_tgt_disconnect_next_qpair(ctx); +} + +struct spdk_nvmf_tgt * +spdk_nvmf_tgt_create(struct spdk_nvmf_target_opts *opts) +{ + struct spdk_nvmf_tgt *tgt, *tmp_tgt; + + if (strnlen(opts->name, NVMF_TGT_NAME_MAX_LENGTH) == NVMF_TGT_NAME_MAX_LENGTH) { + SPDK_ERRLOG("Provided target name exceeds the max length of %u.\n", NVMF_TGT_NAME_MAX_LENGTH); + return NULL; + } + + TAILQ_FOREACH(tmp_tgt, &g_nvmf_tgts, link) { + if (!strncmp(opts->name, tmp_tgt->name, NVMF_TGT_NAME_MAX_LENGTH)) { + SPDK_ERRLOG("Provided target name must be unique.\n"); + return NULL; + } + } + + tgt = calloc(1, sizeof(*tgt)); + if (!tgt) { + return NULL; + } + + snprintf(tgt->name, NVMF_TGT_NAME_MAX_LENGTH, "%s", opts->name); + + if (!opts || !opts->max_subsystems) { + tgt->max_subsystems = SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS; + } else { + tgt->max_subsystems = opts->max_subsystems; + } + + tgt->discovery_genctr = 0; + TAILQ_INIT(&tgt->transports); + TAILQ_INIT(&tgt->poll_groups); + + tgt->subsystems = calloc(tgt->max_subsystems, sizeof(struct spdk_nvmf_subsystem *)); + if (!tgt->subsystems) { + free(tgt); + return NULL; + } + + pthread_mutex_init(&tgt->mutex, NULL); + + TAILQ_INSERT_HEAD(&g_nvmf_tgts, tgt, link); + + spdk_io_device_register(tgt, + nvmf_tgt_create_poll_group, + nvmf_tgt_destroy_poll_group, + sizeof(struct spdk_nvmf_poll_group), + tgt->name); + + return tgt; +} + +static void +nvmf_tgt_destroy_cb(void *io_device) +{ + struct spdk_nvmf_tgt *tgt = io_device; + struct spdk_nvmf_transport *transport, *transport_tmp; + spdk_nvmf_tgt_destroy_done_fn *destroy_cb_fn; + void *destroy_cb_arg; + uint32_t i; + + if (tgt->subsystems) { + for (i = 0; i < tgt->max_subsystems; i++) { + if (tgt->subsystems[i]) { + nvmf_subsystem_remove_all_listeners(tgt->subsystems[i], true); + spdk_nvmf_subsystem_destroy(tgt->subsystems[i]); + } + } + free(tgt->subsystems); + } + + TAILQ_FOREACH_SAFE(transport, &tgt->transports, link, transport_tmp) { + TAILQ_REMOVE(&tgt->transports, transport, link); + spdk_nvmf_transport_destroy(transport); + } + + destroy_cb_fn = tgt->destroy_cb_fn; + destroy_cb_arg = tgt->destroy_cb_arg; + + free(tgt); + + if (destroy_cb_fn) { + destroy_cb_fn(destroy_cb_arg, 0); + } +} + +void +spdk_nvmf_tgt_destroy(struct spdk_nvmf_tgt *tgt, + spdk_nvmf_tgt_destroy_done_fn cb_fn, + void *cb_arg) +{ + tgt->destroy_cb_fn = cb_fn; + tgt->destroy_cb_arg = cb_arg; + + TAILQ_REMOVE(&g_nvmf_tgts, tgt, link); + + spdk_io_device_unregister(tgt, nvmf_tgt_destroy_cb); +} + +const char * +spdk_nvmf_tgt_get_name(struct spdk_nvmf_tgt *tgt) +{ + return tgt->name; +} + +struct spdk_nvmf_tgt * +spdk_nvmf_get_tgt(const char *name) +{ + struct spdk_nvmf_tgt *tgt; + uint32_t num_targets = 0; + + TAILQ_FOREACH(tgt, &g_nvmf_tgts, link) { + if (name) { + if (!strncmp(tgt->name, name, NVMF_TGT_NAME_MAX_LENGTH)) { + return tgt; + } + } + num_targets++; + } + + /* + * special case. If there is only one target and + * no name was specified, return the only available + * target. If there is more than one target, name must + * be specified. 
+ */ + if (!name && num_targets == 1) { + return TAILQ_FIRST(&g_nvmf_tgts); + } + + return NULL; +} + +struct spdk_nvmf_tgt * +spdk_nvmf_get_first_tgt(void) +{ + return TAILQ_FIRST(&g_nvmf_tgts); +} + +struct spdk_nvmf_tgt * +spdk_nvmf_get_next_tgt(struct spdk_nvmf_tgt *prev) +{ + return TAILQ_NEXT(prev, link); +} + +static void +nvmf_write_subsystem_config_json(struct spdk_json_write_ctx *w, + struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_host *host; + struct spdk_nvmf_subsystem_listener *listener; + const struct spdk_nvme_transport_id *trid; + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_ns_opts ns_opts; + uint32_t max_namespaces; + char uuid_str[SPDK_UUID_STRING_LEN]; + const char *adrfam; + + if (spdk_nvmf_subsystem_get_type(subsystem) != SPDK_NVMF_SUBTYPE_NVME) { + return; + } + + /* { */ + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_create_subsystem"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_named_bool(w, "allow_any_host", spdk_nvmf_subsystem_get_allow_any_host(subsystem)); + spdk_json_write_named_string(w, "serial_number", spdk_nvmf_subsystem_get_sn(subsystem)); + spdk_json_write_named_string(w, "model_number", spdk_nvmf_subsystem_get_mn(subsystem)); + + max_namespaces = spdk_nvmf_subsystem_get_max_namespaces(subsystem); + if (max_namespaces != 0) { + spdk_json_write_named_uint32(w, "max_namespaces", max_namespaces); + } + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + trid = spdk_nvmf_subsystem_listener_get_trid(listener); + + adrfam = spdk_nvme_transport_id_adrfam_str(trid->adrfam); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_listener"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + + /* "listen_address" : { */ + spdk_json_write_named_object_begin(w, "listen_address"); + + spdk_json_write_named_string(w, "trtype", trid->trstring); + if (adrfam) { + spdk_json_write_named_string(w, "adrfam", adrfam); + } + + spdk_json_write_named_string(w, "traddr", trid->traddr); + spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); + /* } "listen_address" */ + spdk_json_write_object_end(w); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } + + for (host = spdk_nvmf_subsystem_get_first_host(subsystem); host != NULL; + host = spdk_nvmf_subsystem_get_next_host(subsystem, host)) { + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_host"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_named_string(w, "host", spdk_nvmf_host_get_nqn(host)); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + spdk_nvmf_ns_get_opts(ns, &ns_opts, sizeof(ns_opts)); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", 
"nvmf_subsystem_add_ns"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + + /* "namespace" : { */ + spdk_json_write_named_object_begin(w, "namespace"); + + spdk_json_write_named_uint32(w, "nsid", spdk_nvmf_ns_get_id(ns)); + spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns))); + + if (!spdk_mem_all_zero(ns_opts.nguid, sizeof(ns_opts.nguid))) { + SPDK_STATIC_ASSERT(sizeof(ns_opts.nguid) == sizeof(uint64_t) * 2, "size mismatch"); + spdk_json_write_named_string_fmt(w, "nguid", "%016"PRIX64"%016"PRIX64, from_be64(&ns_opts.nguid[0]), + from_be64(&ns_opts.nguid[8])); + } + + if (!spdk_mem_all_zero(ns_opts.eui64, sizeof(ns_opts.eui64))) { + SPDK_STATIC_ASSERT(sizeof(ns_opts.eui64) == sizeof(uint64_t), "size mismatch"); + spdk_json_write_named_string_fmt(w, "eui64", "%016"PRIX64, from_be64(&ns_opts.eui64)); + } + + if (!spdk_mem_all_zero(&ns_opts.uuid, sizeof(ns_opts.uuid))) { + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &ns_opts.uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + } + + /* "namespace" */ + spdk_json_write_object_end(w); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } +} + +void +spdk_nvmf_tgt_write_config_json(struct spdk_json_write_ctx *w, struct spdk_nvmf_tgt *tgt) +{ + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_transport *transport; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_set_max_subsystems"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_uint32(w, "max_subsystems", tgt->max_subsystems); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + /* write transports */ + TAILQ_FOREACH(transport, &tgt->transports, link) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_create_transport"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "trtype", spdk_nvme_transport_id_trtype_str(transport->ops->type)); + spdk_json_write_named_uint32(w, "max_queue_depth", transport->opts.max_queue_depth); + spdk_json_write_named_uint32(w, "max_io_qpairs_per_ctrlr", + transport->opts.max_qpairs_per_ctrlr - 1); + spdk_json_write_named_uint32(w, "in_capsule_data_size", transport->opts.in_capsule_data_size); + spdk_json_write_named_uint32(w, "max_io_size", transport->opts.max_io_size); + spdk_json_write_named_uint32(w, "io_unit_size", transport->opts.io_unit_size); + spdk_json_write_named_uint32(w, "max_aq_depth", transport->opts.max_aq_depth); + if (transport->ops->type == SPDK_NVME_TRANSPORT_RDMA) { + spdk_json_write_named_uint32(w, "max_srq_depth", transport->opts.max_srq_depth); + } + spdk_json_write_named_uint32(w, "abort_timeout_sec", transport->opts.abort_timeout_sec); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + subsystem = spdk_nvmf_subsystem_get_first(tgt); + while (subsystem) { + nvmf_write_subsystem_config_json(w, subsystem); + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + } +} + +int +spdk_nvmf_tgt_listen(struct spdk_nvmf_tgt *tgt, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_transport *transport; + const char *trtype; + int rc; + + transport = spdk_nvmf_tgt_get_transport(tgt, trid->trstring); + if (!transport) { + trtype = spdk_nvme_transport_id_trtype_str(trid->trtype); + if (trtype != NULL) { + SPDK_ERRLOG("Unable to listen on 
transport %s. The transport must be created first.\n", trtype); + } else { + SPDK_ERRLOG("The specified trtype %d is unknown. Please make sure that it is properly registered.\n", + trid->trtype); + } + + return -EINVAL; + } + + rc = spdk_nvmf_transport_listen(transport, trid); + if (rc < 0) { + SPDK_ERRLOG("Unable to listen on address '%s'\n", trid->traddr); + } + + return rc; +} + +int +spdk_nvmf_tgt_stop_listen(struct spdk_nvmf_tgt *tgt, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_transport *transport; + const char *trtype; + int rc; + + transport = spdk_nvmf_tgt_get_transport(tgt, trid->trstring); + if (!transport) { + trtype = spdk_nvme_transport_id_trtype_str(trid->trtype); + if (trtype != NULL) { + SPDK_ERRLOG("Unable to stop listen on transport %s. The transport must be created first.\n", + trtype); + } else { + SPDK_ERRLOG("The specified trtype %d is unknown. Please make sure that it is properly registered.\n", + trid->trtype); + } + return -EINVAL; + } + + rc = spdk_nvmf_transport_stop_listen(transport, trid); + if (rc < 0) { + SPDK_ERRLOG("Failed to stop listening on address '%s'\n", trid->traddr); + return rc; + } + return 0; +} + +struct spdk_nvmf_tgt_add_transport_ctx { + struct spdk_nvmf_tgt *tgt; + struct spdk_nvmf_transport *transport; + spdk_nvmf_tgt_add_transport_done_fn cb_fn; + void *cb_arg; +}; + +static void +_nvmf_tgt_add_transport_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cb_fn(ctx->cb_arg, status); + + free(ctx); +} + +static void +_nvmf_tgt_add_transport(struct spdk_io_channel_iter *i) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch); + int rc; + + rc = nvmf_poll_group_add_transport(group, ctx->transport); + spdk_for_each_channel_continue(i, rc); +} + +void spdk_nvmf_tgt_add_transport(struct spdk_nvmf_tgt *tgt, + struct spdk_nvmf_transport *transport, + spdk_nvmf_tgt_add_transport_done_fn cb_fn, + void *cb_arg) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx; + + if (spdk_nvmf_tgt_get_transport(tgt, transport->ops->name)) { + cb_fn(cb_arg, -EEXIST); + return; /* transport already created */ + } + + transport->tgt = tgt; + TAILQ_INSERT_TAIL(&tgt->transports, transport, link); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->tgt = tgt; + ctx->transport = transport; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(tgt, + _nvmf_tgt_add_transport, + ctx, + _nvmf_tgt_add_transport_done); +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_tgt_find_subsystem(struct spdk_nvmf_tgt *tgt, const char *subnqn) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + if (!subnqn) { + return NULL; + } + + /* Ensure that subnqn is null terminated */ + if (!memchr(subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1)) { + SPDK_ERRLOG("Connect SUBNQN is not null terminated\n"); + return NULL; + } + + for (sid = 0; sid < tgt->max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem == NULL) { + continue; + } + + if (strcmp(subnqn, subsystem->subnqn) == 0) { + return subsystem; + } + } + + return NULL; +} + +struct spdk_nvmf_transport * +spdk_nvmf_tgt_get_transport(struct spdk_nvmf_tgt *tgt, const char *transport_name) +{ + struct spdk_nvmf_transport *transport; + + TAILQ_FOREACH(transport, &tgt->transports, link) { + if 
(!strncasecmp(transport->ops->name, transport_name, SPDK_NVMF_TRSTRING_MAX_LEN)) { + return transport; + } + } + return NULL; +} + +struct nvmf_new_qpair_ctx { + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_poll_group *group; +}; + +static void +_nvmf_poll_group_add(void *_ctx) +{ + struct nvmf_new_qpair_ctx *ctx = _ctx; + struct spdk_nvmf_qpair *qpair = ctx->qpair; + struct spdk_nvmf_poll_group *group = ctx->group; + + free(_ctx); + + if (spdk_nvmf_poll_group_add(group, qpair) != 0) { + SPDK_ERRLOG("Unable to add the qpair to a poll group.\n"); + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + } +} + +void +spdk_nvmf_tgt_new_qpair(struct spdk_nvmf_tgt *tgt, struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_poll_group *group; + struct nvmf_new_qpair_ctx *ctx; + + group = spdk_nvmf_get_optimal_poll_group(qpair); + if (group == NULL) { + if (tgt->next_poll_group == NULL) { + tgt->next_poll_group = TAILQ_FIRST(&tgt->poll_groups); + if (tgt->next_poll_group == NULL) { + SPDK_ERRLOG("No poll groups exist.\n"); + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + return; + } + } + group = tgt->next_poll_group; + tgt->next_poll_group = TAILQ_NEXT(group, link); + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + SPDK_ERRLOG("Unable to send message to poll group.\n"); + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + return; + } + + ctx->qpair = qpair; + ctx->group = group; + + spdk_thread_send_msg(group->thread, _nvmf_poll_group_add, ctx); +} + +uint32_t +spdk_nvmf_tgt_accept(struct spdk_nvmf_tgt *tgt) +{ + struct spdk_nvmf_transport *transport, *tmp; + uint32_t count = 0; + + TAILQ_FOREACH_SAFE(transport, &tgt->transports, link, tmp) { + count += nvmf_transport_accept(transport); + } + + return count; +} + +struct spdk_nvmf_poll_group * +spdk_nvmf_poll_group_create(struct spdk_nvmf_tgt *tgt) +{ + struct spdk_io_channel *ch; + + ch = spdk_get_io_channel(tgt); + if (!ch) { + SPDK_ERRLOG("Unable to get I/O channel for target\n"); + return NULL; + } + + return spdk_io_channel_get_ctx(ch); +} + +void +spdk_nvmf_poll_group_destroy(struct spdk_nvmf_poll_group *group, + spdk_nvmf_poll_group_destroy_done_fn cb_fn, + void *cb_arg) +{ + assert(group->destroy_cb_fn == NULL); + group->destroy_cb_fn = cb_fn; + group->destroy_cb_arg = cb_arg; + + /* This function will put the io_channel associated with this poll group */ + nvmf_tgt_destroy_poll_group_qpairs(group); +} + +int +spdk_nvmf_poll_group_add(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + int rc = -1; + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_INIT(&qpair->outstanding); + qpair->group = group; + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + rc = nvmf_transport_poll_group_add(tgroup, qpair); + break; + } + } + + /* We add the qpair to the group only it is succesfully added into the tgroup */ + if (rc == 0) { + TAILQ_INSERT_TAIL(&group->qpairs, qpair, link); + nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ACTIVE); + } + + return rc; +} + +static +void _nvmf_ctrlr_destruct(void *ctx) +{ + struct spdk_nvmf_ctrlr *ctrlr = ctx; + + nvmf_ctrlr_destruct(ctrlr); +} + +static void +_nvmf_transport_qpair_fini(void *ctx) +{ + struct spdk_nvmf_qpair *qpair = ctx; + + nvmf_transport_qpair_fini(qpair); +} + +static void +_nvmf_ctrlr_free_from_qpair(void *ctx) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx; + struct spdk_nvmf_ctrlr *ctrlr = qpair_ctx->ctrlr; + uint32_t count; + + spdk_bit_array_clear(ctrlr->qpair_mask, qpair_ctx->qid); + count = 
spdk_bit_array_count_set(ctrlr->qpair_mask); + if (count == 0) { + spdk_bit_array_free(&ctrlr->qpair_mask); + + spdk_thread_send_msg(ctrlr->subsys->thread, _nvmf_ctrlr_destruct, ctrlr); + } + + spdk_thread_send_msg(qpair_ctx->thread, _nvmf_transport_qpair_fini, qpair_ctx->qpair); + if (qpair_ctx->cb_fn) { + spdk_thread_send_msg(qpair_ctx->thread, qpair_ctx->cb_fn, qpair_ctx->ctx); + } + free(qpair_ctx); +} + +void +spdk_nvmf_poll_group_remove(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvmf_transport_poll_group *tgroup; + struct spdk_nvmf_request *req, *tmp; + struct spdk_nvmf_subsystem_poll_group *sgroup; + int rc; + + nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ERROR); + + /* Find the tgroup and remove the qpair from the tgroup */ + TAILQ_FOREACH(tgroup, &qpair->group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + rc = nvmf_transport_poll_group_remove(tgroup, qpair); + if (rc && (rc != ENOTSUP)) { + SPDK_ERRLOG("Cannot remove qpair=%p from transport group=%p\n", + qpair, tgroup); + } + break; + } + } + + if (ctrlr) { + sgroup = &qpair->group->sgroups[ctrlr->subsys->id]; + TAILQ_FOREACH_SAFE(req, &sgroup->queued, link, tmp) { + if (req->qpair == qpair) { + TAILQ_REMOVE(&sgroup->queued, req, link); + if (nvmf_transport_req_free(req)) { + SPDK_ERRLOG("Transport request free error!\n"); + } + } + } + } + + TAILQ_REMOVE(&qpair->group->qpairs, qpair, link); + qpair->group = NULL; +} + +static void +_nvmf_qpair_destroy(void *ctx, int status) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx; + struct spdk_nvmf_qpair *qpair = qpair_ctx->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + assert(qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING); + qpair_ctx->qid = qpair->qid; + + spdk_nvmf_poll_group_remove(qpair); + + if (!ctrlr || !ctrlr->thread) { + nvmf_transport_qpair_fini(qpair); + if (qpair_ctx->cb_fn) { + spdk_thread_send_msg(qpair_ctx->thread, qpair_ctx->cb_fn, qpair_ctx->ctx); + } + free(qpair_ctx); + return; + } + + qpair_ctx->ctrlr = ctrlr; + spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_free_from_qpair, qpair_ctx); +} + +int +spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_cb cb_fn, void *ctx) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx; + + /* If we get a qpair in the uninitialized state, we can just destroy it immediately */ + if (qpair->state == SPDK_NVMF_QPAIR_UNINITIALIZED) { + nvmf_transport_qpair_fini(qpair); + if (cb_fn) { + cb_fn(ctx); + } + return 0; + } + + /* The queue pair must be disconnected from the thread that owns it */ + assert(qpair->group->thread == spdk_get_thread()); + + if (qpair->state != SPDK_NVMF_QPAIR_ACTIVE) { + /* This can occur if the connection is killed by the target, + * which results in a notification that the connection + * died. Send a message to defer the processing of this + * callback. This allows the stack to unwind in the case + * where a bunch of connections are disconnected in + * a loop. 
*/ + if (cb_fn) { + spdk_thread_send_msg(qpair->group->thread, cb_fn, ctx); + } + return 0; + } + + assert(qpair->state == SPDK_NVMF_QPAIR_ACTIVE); + nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_DEACTIVATING); + + qpair_ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_ctx)); + if (!qpair_ctx) { + SPDK_ERRLOG("Unable to allocate context for nvmf_qpair_disconnect\n"); + return -ENOMEM; + } + + qpair_ctx->qpair = qpair; + qpair_ctx->cb_fn = cb_fn; + qpair_ctx->thread = qpair->group->thread; + qpair_ctx->ctx = ctx; + + /* Check for outstanding I/O */ + if (!TAILQ_EMPTY(&qpair->outstanding)) { + qpair->state_cb = _nvmf_qpair_destroy; + qpair->state_cb_arg = qpair_ctx; + nvmf_qpair_free_aer(qpair); + return 0; + } + + _nvmf_qpair_destroy(qpair_ctx, 0); + + return 0; +} + +int +spdk_nvmf_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_transport_qpair_get_peer_trid(qpair, trid); +} + +int +spdk_nvmf_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_transport_qpair_get_local_trid(qpair, trid); +} + +int +spdk_nvmf_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_transport_qpair_get_listen_trid(qpair, trid); +} + +int +nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == transport) { + /* Transport already in the poll group */ + return 0; + } + } + + tgroup = nvmf_transport_poll_group_create(transport); + if (!tgroup) { + SPDK_ERRLOG("Unable to create poll group for transport\n"); + return -1; + } + + tgroup->group = group; + TAILQ_INSERT_TAIL(&group->tgroups, tgroup, link); + + return 0; +} + +static int +poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_subsystem_poll_group *sgroup; + uint32_t new_num_ns, old_num_ns; + uint32_t i, j; + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_registrant *reg, *tmp; + struct spdk_io_channel *ch; + struct spdk_nvmf_subsystem_pg_ns_info *ns_info; + struct spdk_nvmf_ctrlr *ctrlr; + bool ns_changed; + + /* Make sure our poll group has memory for this subsystem allocated */ + if (subsystem->id >= group->num_sgroups) { + return -ENOMEM; + } + + sgroup = &group->sgroups[subsystem->id]; + + /* Make sure the array of namespace information is the correct size */ + new_num_ns = subsystem->max_nsid; + old_num_ns = sgroup->num_ns; + + ns_changed = false; + + if (old_num_ns == 0) { + if (new_num_ns > 0) { + /* First allocation */ + sgroup->ns_info = calloc(new_num_ns, sizeof(struct spdk_nvmf_subsystem_pg_ns_info)); + if (!sgroup->ns_info) { + return -ENOMEM; + } + } + } else if (new_num_ns > old_num_ns) { + void *buf; + + /* Make the array larger */ + buf = realloc(sgroup->ns_info, new_num_ns * sizeof(struct spdk_nvmf_subsystem_pg_ns_info)); + if (!buf) { + return -ENOMEM; + } + + sgroup->ns_info = buf; + + /* Null out the new namespace information slots */ + for (i = old_num_ns; i < new_num_ns; i++) { + memset(&sgroup->ns_info[i], 0, sizeof(struct spdk_nvmf_subsystem_pg_ns_info)); + } + } else if (new_num_ns < old_num_ns) { + void *buf; + + /* Free the extra I/O channels */ + for (i = new_num_ns; i < old_num_ns; i++) { + ns_info = &sgroup->ns_info[i]; + + if (ns_info->channel) { + spdk_put_io_channel(ns_info->channel); + ns_info->channel = 
NULL; + } + } + + /* Make the array smaller */ + if (new_num_ns > 0) { + buf = realloc(sgroup->ns_info, new_num_ns * sizeof(struct spdk_nvmf_subsystem_pg_ns_info)); + if (!buf) { + return -ENOMEM; + } + sgroup->ns_info = buf; + } else { + free(sgroup->ns_info); + sgroup->ns_info = NULL; + } + } + + sgroup->num_ns = new_num_ns; + + /* Detect bdevs that were added or removed */ + for (i = 0; i < sgroup->num_ns; i++) { + ns = subsystem->ns[i]; + ns_info = &sgroup->ns_info[i]; + ch = ns_info->channel; + + if (ns == NULL && ch == NULL) { + /* Both NULL. Leave empty */ + } else if (ns == NULL && ch != NULL) { + /* There was a channel here, but the namespace is gone. */ + ns_changed = true; + spdk_put_io_channel(ch); + ns_info->channel = NULL; + } else if (ns != NULL && ch == NULL) { + /* A namespace appeared but there is no channel yet */ + ns_changed = true; + ch = spdk_bdev_get_io_channel(ns->desc); + if (ch == NULL) { + SPDK_ERRLOG("Could not allocate I/O channel.\n"); + return -ENOMEM; + } + ns_info->channel = ch; + } else if (spdk_uuid_compare(&ns_info->uuid, spdk_bdev_get_uuid(ns->bdev)) != 0) { + /* A namespace was here before, but was replaced by a new one. */ + ns_changed = true; + spdk_put_io_channel(ns_info->channel); + memset(ns_info, 0, sizeof(*ns_info)); + + ch = spdk_bdev_get_io_channel(ns->desc); + if (ch == NULL) { + SPDK_ERRLOG("Could not allocate I/O channel.\n"); + return -ENOMEM; + } + ns_info->channel = ch; + } else if (ns_info->num_blocks != spdk_bdev_get_num_blocks(ns->bdev)) { + /* Namespace is still there but size has changed */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Namespace resized: subsystem_id %d," + " nsid %u, pg %p, old %lu, new %lu\n", + subsystem->id, + ns->nsid, + group, + ns_info->num_blocks, + spdk_bdev_get_num_blocks(ns->bdev)); + ns_changed = true; + } + + if (ns == NULL) { + memset(ns_info, 0, sizeof(*ns_info)); + } else { + ns_info->uuid = *spdk_bdev_get_uuid(ns->bdev); + ns_info->num_blocks = spdk_bdev_get_num_blocks(ns->bdev); + ns_info->crkey = ns->crkey; + ns_info->rtype = ns->rtype; + if (ns->holder) { + ns_info->holder_id = ns->holder->hostid; + } + + memset(&ns_info->reg_hostid, 0, SPDK_NVMF_MAX_NUM_REGISTRANTS * sizeof(struct spdk_uuid)); + j = 0; + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + if (j >= SPDK_NVMF_MAX_NUM_REGISTRANTS) { + SPDK_ERRLOG("Maximum %u registrants can support.\n", SPDK_NVMF_MAX_NUM_REGISTRANTS); + return -EINVAL; + } + ns_info->reg_hostid[j++] = reg->hostid; + } + } + } + + if (ns_changed) { + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + if (ctrlr->admin_qpair->group == group) { + nvmf_ctrlr_async_event_ns_notice(ctrlr); + } + } + } + + return 0; +} + +int +nvmf_poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem) +{ + return poll_group_update_subsystem(group, subsystem); +} + +int +nvmf_poll_group_add_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + int rc = 0; + struct spdk_nvmf_subsystem_poll_group *sgroup = &group->sgroups[subsystem->id]; + + TAILQ_INIT(&sgroup->queued); + + rc = poll_group_update_subsystem(group, subsystem); + if (rc) { + nvmf_poll_group_remove_subsystem(group, subsystem, NULL, NULL); + goto fini; + } + + sgroup->state = SPDK_NVMF_SUBSYSTEM_ACTIVE; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } + + return rc; +} + +static void +_nvmf_poll_group_remove_subsystem_cb(void *ctx, int status) +{ + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = 
ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_subsystem_poll_group *sgroup; + spdk_nvmf_poll_group_mod_done cpl_fn = NULL; + void *cpl_ctx = NULL; + uint32_t nsid; + + group = qpair_ctx->group; + subsystem = qpair_ctx->subsystem; + cpl_fn = qpair_ctx->cpl_fn; + cpl_ctx = qpair_ctx->cpl_ctx; + sgroup = &group->sgroups[subsystem->id]; + + if (status) { + goto fini; + } + + for (nsid = 0; nsid < sgroup->num_ns; nsid++) { + if (sgroup->ns_info[nsid].channel) { + spdk_put_io_channel(sgroup->ns_info[nsid].channel); + sgroup->ns_info[nsid].channel = NULL; + } + } + + sgroup->num_ns = 0; + free(sgroup->ns_info); + sgroup->ns_info = NULL; +fini: + free(qpair_ctx); + if (cpl_fn) { + cpl_fn(cpl_ctx, status); + } +} + +static void +_nvmf_subsystem_disconnect_next_qpair(void *ctx) +{ + struct spdk_nvmf_qpair *qpair; + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + int rc = 0; + + group = qpair_ctx->group; + subsystem = qpair_ctx->subsystem; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if ((qpair->ctrlr != NULL) && (qpair->ctrlr->subsys == subsystem)) { + break; + } + } + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_subsystem_disconnect_next_qpair, qpair_ctx); + } + + if (!qpair || rc != 0) { + _nvmf_poll_group_remove_subsystem_cb(ctx, rc); + } + return; +} + +void +nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup; + struct nvmf_qpair_disconnect_many_ctx *ctx; + int rc = 0; + + ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx)); + + if (!ctx) { + SPDK_ERRLOG("Unable to allocate memory for context to remove poll subsystem\n"); + goto fini; + } + + ctx->group = group; + ctx->subsystem = subsystem; + ctx->cpl_fn = cb_fn; + ctx->cpl_ctx = cb_arg; + + sgroup = &group->sgroups[subsystem->id]; + sgroup->state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if ((qpair->ctrlr != NULL) && (qpair->ctrlr->subsys == subsystem)) { + break; + } + } + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_subsystem_disconnect_next_qpair, ctx); + } else { + /* call the callback immediately. 
It will handle any channel iteration */ + _nvmf_poll_group_remove_subsystem_cb(ctx, 0); + } + + if (rc != 0) { + free(ctx); + goto fini; + } + + return; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} + +void +nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_subsystem_poll_group *sgroup; + int rc = 0; + + if (subsystem->id >= group->num_sgroups) { + rc = -1; + goto fini; + } + + sgroup = &group->sgroups[subsystem->id]; + if (sgroup == NULL) { + rc = -1; + goto fini; + } + + assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_ACTIVE); + sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSING; + + if (sgroup->io_outstanding > 0) { + sgroup->cb_fn = cb_fn; + sgroup->cb_arg = cb_arg; + return; + } + + assert(sgroup->io_outstanding == 0); + sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSED; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} + +void +nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_request *req, *tmp; + struct spdk_nvmf_subsystem_poll_group *sgroup; + int rc = 0; + + if (subsystem->id >= group->num_sgroups) { + rc = -1; + goto fini; + } + + sgroup = &group->sgroups[subsystem->id]; + + assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSED); + + rc = poll_group_update_subsystem(group, subsystem); + if (rc) { + goto fini; + } + + sgroup->state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + + /* Release all queued requests */ + TAILQ_FOREACH_SAFE(req, &sgroup->queued, link, tmp) { + TAILQ_REMOVE(&sgroup->queued, req, link); + spdk_nvmf_request_exec(req); + } +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} + + +struct spdk_nvmf_poll_group * +spdk_nvmf_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_transport_poll_group *tgroup; + + tgroup = nvmf_transport_get_optimal_poll_group(qpair->transport, qpair); + + if (tgroup == NULL) { + return NULL; + } + + return tgroup->group; +} + +int +spdk_nvmf_poll_group_get_stat(struct spdk_nvmf_tgt *tgt, + struct spdk_nvmf_poll_group_stat *stat) +{ + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + + if (tgt == NULL || stat == NULL) { + return -EINVAL; + } + + ch = spdk_get_io_channel(tgt); + group = spdk_io_channel_get_ctx(ch); + *stat = group->stat; + spdk_put_io_channel(ch); + return 0; +} diff --git a/src/spdk/lib/nvmf/nvmf_fc.h b/src/spdk/lib/nvmf/nvmf_fc.h new file mode 100644 index 000000000..10d3ef9cf --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf_fc.h @@ -0,0 +1,999 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2018-2019 Broadcom. All Rights Reserved. + * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVMF_FC_H__ +#define __NVMF_FC_H__ + +#include "spdk/nvme.h" +#include "spdk/nvmf.h" +#include "spdk/assert.h" +#include "spdk/nvme_spec.h" +#include "spdk/nvmf_fc_spec.h" +#include "spdk/thread.h" +#include "nvmf_internal.h" + +#define SPDK_NVMF_FC_TR_ADDR_LEN 64 +#define NVMF_FC_INVALID_CONN_ID UINT64_MAX + +#define SPDK_FC_HW_DUMP_REASON_STR_MAX_SIZE 256 +#define SPDK_MAX_NUM_OF_FC_PORTS 32 +#define SPDK_NVMF_PORT_ID_MAX_LEN 32 + +/* + * FC HWQP pointer + */ +typedef void *spdk_nvmf_fc_lld_hwqp_t; + +/* + * FC HW port states. + */ +enum spdk_fc_port_state { + SPDK_FC_PORT_OFFLINE = 0, + SPDK_FC_PORT_ONLINE = 1, + SPDK_FC_PORT_QUIESCED = 2, +}; + +enum spdk_fc_hwqp_state { + SPDK_FC_HWQP_OFFLINE = 0, + SPDK_FC_HWQP_ONLINE = 1, +}; + +/* + * NVMF FC Object state + * Add all the generic states of the object here. + * Specific object states can be added separately + */ +enum spdk_nvmf_fc_object_state { + SPDK_NVMF_FC_OBJECT_CREATED = 0, + SPDK_NVMF_FC_OBJECT_TO_BE_DELETED = 1, + SPDK_NVMF_FC_OBJECT_ZOMBIE = 2, /* Partial Create or Delete */ +}; + +/* + * FC request state + */ +enum spdk_nvmf_fc_request_state { + SPDK_NVMF_FC_REQ_INIT = 0, + SPDK_NVMF_FC_REQ_READ_BDEV, + SPDK_NVMF_FC_REQ_READ_XFER, + SPDK_NVMF_FC_REQ_READ_RSP, + SPDK_NVMF_FC_REQ_WRITE_BUFFS, + SPDK_NVMF_FC_REQ_WRITE_XFER, + SPDK_NVMF_FC_REQ_WRITE_BDEV, + SPDK_NVMF_FC_REQ_WRITE_RSP, + SPDK_NVMF_FC_REQ_NONE_BDEV, + SPDK_NVMF_FC_REQ_NONE_RSP, + SPDK_NVMF_FC_REQ_SUCCESS, + SPDK_NVMF_FC_REQ_FAILED, + SPDK_NVMF_FC_REQ_ABORTED, + SPDK_NVMF_FC_REQ_BDEV_ABORTED, + SPDK_NVMF_FC_REQ_PENDING, + SPDK_NVMF_FC_REQ_MAX_STATE, +}; + +/* + * Generic DMA buffer descriptor + */ +struct spdk_nvmf_fc_buffer_desc { + void *virt; + uint64_t phys; + size_t len; + + /* Internal */ + uint32_t buf_index; +}; + +/* + * ABTS hadling context + */ +struct spdk_nvmf_fc_abts_ctx { + bool handled; + uint16_t hwqps_responded; + uint16_t rpi; + uint16_t oxid; + uint16_t rxid; + struct spdk_nvmf_fc_nport *nport; + uint16_t nport_hdl; + uint8_t port_hdl; + void *abts_poller_args; + void *sync_poller_args; + int num_hwqps; + bool queue_synced; + uint64_t u_id; + struct spdk_nvmf_fc_hwqp *ls_hwqp; + uint16_t fcp_rq_id; +}; + +/* + * NVME FC transport errors + */ +struct spdk_nvmf_fc_errors { + uint32_t no_xchg; + uint32_t nport_invalid; + uint32_t unknown_frame; + uint32_t wqe_cmplt_err; + uint32_t wqe_write_err; + uint32_t rq_status_err; + uint32_t rq_buf_len_err; + uint32_t rq_id_err; + uint32_t rq_index_err; + uint32_t invalid_cq_type; + uint32_t invalid_cq_id; + uint32_t fc_req_buf_err; + uint32_t buf_alloc_err; + uint32_t unexpected_err; + uint32_t nvme_cmd_iu_err; + uint32_t nvme_cmd_xfer_err; + uint32_t queue_entry_invalid; + uint32_t invalid_conn_err; 
+ uint32_t fcp_rsp_failure; + uint32_t write_failed; + uint32_t read_failed; + uint32_t rport_invalid; + uint32_t num_aborted; + uint32_t num_abts_sent; +}; + +/* + * Send Single Request/Response Sequence. + */ +struct spdk_nvmf_fc_srsr_bufs { + void *rqst; + size_t rqst_len; + void *rsp; + size_t rsp_len; + uint16_t rpi; +}; + +/* + * Struct representing a nport + */ +struct spdk_nvmf_fc_nport { + + uint16_t nport_hdl; + uint8_t port_hdl; + uint32_t d_id; + enum spdk_nvmf_fc_object_state nport_state; + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + + /* list of remote ports (i.e. initiators) connected to nport */ + TAILQ_HEAD(, spdk_nvmf_fc_remote_port_info) rem_port_list; + uint32_t rport_count; + + void *vendor_data; /* available for vendor use */ + + /* list of associations to nport */ + TAILQ_HEAD(, spdk_nvmf_fc_association) fc_associations; + uint32_t assoc_count; + struct spdk_nvmf_fc_port *fc_port; + TAILQ_ENTRY(spdk_nvmf_fc_nport) link; /* list of nports on a hw port. */ +}; + +/* + * NVMF FC Connection + */ +struct spdk_nvmf_fc_conn { + struct spdk_nvmf_qpair qpair; + struct spdk_nvme_transport_id trid; + + uint64_t conn_id; + struct spdk_nvmf_fc_hwqp *hwqp; + uint16_t esrp_ratio; + uint16_t rsp_count; + uint32_t rsn; + + /* The maximum number of I/O outstanding on this connection at one time */ + uint16_t max_queue_depth; + uint16_t max_rw_depth; + /* The current number of I/O outstanding on this connection. This number + * includes all I/O from the time the capsule is first received until it is + * completed. + */ + uint16_t cur_queue_depth; + + /* number of read/write requests that are outstanding */ + uint16_t cur_fc_rw_depth; + + struct spdk_nvmf_fc_association *fc_assoc; + + uint16_t rpi; + + /* for association's connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) assoc_link; + + /* for assocations's available connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) assoc_avail_link; + + /* for hwqp's connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) link; + + /* New QP create context. */ + struct nvmf_fc_ls_op_ctx *create_opd; +}; + +/* + * Structure for maintaining the FC exchanges + */ +struct spdk_nvmf_fc_xchg { + uint32_t xchg_id; /* The actual xchg identifier */ + + /* Internal */ + TAILQ_ENTRY(spdk_nvmf_fc_xchg) link; + bool active; + bool aborted; + bool send_abts; /* Valid if is_aborted is set. 
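Here "is_aborted" refers to the 'aborted' flag declared above.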
*/ +}; + +/* + * FC poll group structure + */ +struct spdk_nvmf_fc_poll_group { + struct spdk_nvmf_transport_poll_group group; + struct spdk_nvmf_tgt *nvmf_tgt; + uint32_t hwqp_count; /* number of hwqp's assigned to this pg */ + TAILQ_HEAD(, spdk_nvmf_fc_hwqp) hwqp_list; + + TAILQ_ENTRY(spdk_nvmf_fc_poll_group) link; +}; + +/* + * HWQP poller structure passed from Master thread + */ +struct spdk_nvmf_fc_hwqp { + enum spdk_fc_hwqp_state state; /* queue state (for poller) */ + uint32_t lcore_id; /* core hwqp is running on (for tracing purposes only) */ + struct spdk_thread *thread; /* thread hwqp is running on */ + uint32_t hwqp_id; /* A unique id (per physical port) for a hwqp */ + uint32_t rq_size; /* receive queue size */ + spdk_nvmf_fc_lld_hwqp_t queues; /* vendor HW queue set */ + struct spdk_nvmf_fc_port *fc_port; /* HW port structure for these queues */ + struct spdk_nvmf_fc_poll_group *fgroup; + + /* qpair (fc_connection) list */ + TAILQ_HEAD(, spdk_nvmf_fc_conn) connection_list; + uint32_t num_conns; /* number of connections to queue */ + + struct spdk_nvmf_fc_request *fc_reqs_buf; + TAILQ_HEAD(, spdk_nvmf_fc_request) free_reqs; + TAILQ_HEAD(, spdk_nvmf_fc_request) in_use_reqs; + + struct spdk_nvmf_fc_errors counters; + + /* Pending LS request waiting for FC resource */ + TAILQ_HEAD(, spdk_nvmf_fc_ls_rqst) ls_pending_queue; + + /* Sync req list */ + TAILQ_HEAD(, spdk_nvmf_fc_poller_api_queue_sync_args) sync_cbs; + + TAILQ_ENTRY(spdk_nvmf_fc_hwqp) link; + + void *context; /* Vendor specific context data */ +}; + +/* + * FC HW port. + */ +struct spdk_nvmf_fc_port { + uint8_t port_hdl; + enum spdk_fc_port_state hw_port_status; + uint16_t fcp_rq_id; + struct spdk_nvmf_fc_hwqp ls_queue; + + uint32_t num_io_queues; + struct spdk_nvmf_fc_hwqp *io_queues; + /* + * List of nports on this HW port. 
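+	 * Entries are added and removed with nvmf_fc_port_add_nport() and
+	 * nvmf_fc_port_remove_nport(), declared later in this header.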
+ */ + TAILQ_HEAD(, spdk_nvmf_fc_nport)nport_list; + int num_nports; + TAILQ_ENTRY(spdk_nvmf_fc_port) link; + + struct spdk_mempool *io_resource_pool; /* Pools to store bdev_io's for this port */ + void *port_ctx; +}; + +/* + * NVMF FC Request + */ +struct spdk_nvmf_fc_request { + struct spdk_nvmf_request req; + struct spdk_nvmf_fc_ersp_iu ersp; + uint32_t poller_lcore; /* for tracing purposes only */ + struct spdk_thread *poller_thread; + uint16_t buf_index; + struct spdk_nvmf_fc_xchg *xchg; + uint16_t oxid; + uint16_t rpi; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_hwqp *hwqp; + int state; + uint32_t transfered_len; + bool is_aborted; + uint32_t magic; + uint32_t s_id; + uint32_t d_id; + TAILQ_ENTRY(spdk_nvmf_fc_request) link; + STAILQ_ENTRY(spdk_nvmf_fc_request) pending_link; + TAILQ_HEAD(, spdk_nvmf_fc_caller_ctx) abort_cbs; +}; + +SPDK_STATIC_ASSERT(!offsetof(struct spdk_nvmf_fc_request, req), + "FC request and NVMF request address don't match."); + + +/* + * NVMF FC Association + */ +struct spdk_nvmf_fc_association { + uint64_t assoc_id; + uint32_t s_id; + struct spdk_nvmf_fc_nport *tgtport; + struct spdk_nvmf_fc_remote_port_info *rport; + struct spdk_nvmf_subsystem *subsystem; + enum spdk_nvmf_fc_object_state assoc_state; + + char host_id[FCNVME_ASSOC_HOSTID_LEN]; + char host_nqn[SPDK_NVME_NQN_FIELD_SIZE]; + char sub_nqn[SPDK_NVME_NQN_FIELD_SIZE]; + + struct spdk_nvmf_fc_conn *aq_conn; /* connection for admin queue */ + + uint16_t conn_count; + TAILQ_HEAD(, spdk_nvmf_fc_conn) fc_conns; + + void *conns_buf; + TAILQ_HEAD(, spdk_nvmf_fc_conn) avail_fc_conns; + + TAILQ_ENTRY(spdk_nvmf_fc_association) link; + + /* for port's association free list */ + TAILQ_ENTRY(spdk_nvmf_fc_association) port_free_assoc_list_link; + + void *ls_del_op_ctx; /* delete assoc. 
callback list */ + + /* disconnect cmd buffers (sent to initiator) */ + struct spdk_nvmf_fc_srsr_bufs *snd_disconn_bufs; +}; + +/* + * FC Remote Port + */ +struct spdk_nvmf_fc_remote_port_info { + uint32_t s_id; + uint32_t rpi; + uint32_t assoc_count; + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + enum spdk_nvmf_fc_object_state rport_state; + TAILQ_ENTRY(spdk_nvmf_fc_remote_port_info) link; +}; + +/* + * Poller API error codes + */ +enum spdk_nvmf_fc_poller_api_ret { + SPDK_NVMF_FC_POLLER_API_SUCCESS = 0, + SPDK_NVMF_FC_POLLER_API_ERROR, + SPDK_NVMF_FC_POLLER_API_INVALID_ARG, + SPDK_NVMF_FC_POLLER_API_NO_CONN_ID, + SPDK_NVMF_FC_POLLER_API_DUP_CONN_ID, + SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND, +}; + +/* + * Poller API definitions + */ +enum spdk_nvmf_fc_poller_api { + SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION, + SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION, + SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE, + SPDK_NVMF_FC_POLLER_API_ACTIVATE_QUEUE, + SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED, + SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE, + SPDK_NVMF_FC_POLLER_API_ADAPTER_EVENT, + SPDK_NVMF_FC_POLLER_API_AEN, + SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC, + SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC_DONE, + SPDK_NVMF_FC_POLLER_API_ADD_HWQP, + SPDK_NVMF_FC_POLLER_API_REMOVE_HWQP, +}; + +/* + * Poller API callback function proto + */ +typedef void (*spdk_nvmf_fc_poller_api_cb)(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret); + +/* + * Poller API callback data + */ +struct spdk_nvmf_fc_poller_api_cb_info { + struct spdk_thread *cb_thread; + spdk_nvmf_fc_poller_api_cb cb_func; + void *cb_data; + enum spdk_nvmf_fc_poller_api_ret ret; +}; + +/* + * Poller API structures + */ +struct spdk_nvmf_fc_poller_api_add_connection_args { + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_del_connection_args { + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; + bool send_abts; + /* internal */ + int fc_request_cnt; + bool backend_initiated; +}; + +struct spdk_nvmf_fc_poller_api_quiesce_queue_args { + void *ctx; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_activate_queue_args { + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_abts_recvd_args { + struct spdk_nvmf_fc_abts_ctx *ctx; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_queue_sync_done_args { + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; + uint64_t tag; +}; + +/* + * NVMF LS request structure + */ +struct spdk_nvmf_fc_ls_rqst { + struct spdk_nvmf_fc_buffer_desc rqstbuf; + struct spdk_nvmf_fc_buffer_desc rspbuf; + uint32_t rqst_len; + uint32_t rsp_len; + uint32_t rpi; + struct spdk_nvmf_fc_xchg *xchg; + uint16_t oxid; + void *private_data; /* for LLD only (LS does not touch) */ + TAILQ_ENTRY(spdk_nvmf_fc_ls_rqst) ls_pending_link; + uint32_t s_id; + uint32_t d_id; + struct spdk_nvmf_fc_nport *nport; + struct spdk_nvmf_fc_remote_port_info *rport; + struct spdk_nvmf_tgt *nvmf_tgt; +}; + +/* + * RQ Buffer LS Overlay Structure + */ +#define FCNVME_LS_RSVD_SIZE (FCNVME_MAX_LS_BUFFER_SIZE - \ + (sizeof(struct spdk_nvmf_fc_ls_rqst) + FCNVME_MAX_LS_REQ_SIZE + FCNVME_MAX_LS_RSP_SIZE)) + +struct spdk_nvmf_fc_rq_buf_ls_request { + uint8_t rqst[FCNVME_MAX_LS_REQ_SIZE]; 
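+	/* the LS response area follows the request area within the same RQ buffer */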
+ uint8_t resp[FCNVME_MAX_LS_RSP_SIZE]; + struct spdk_nvmf_fc_ls_rqst ls_rqst; + uint8_t rsvd[FCNVME_LS_RSVD_SIZE]; +}; + +SPDK_STATIC_ASSERT(sizeof(struct spdk_nvmf_fc_rq_buf_ls_request) == + FCNVME_MAX_LS_BUFFER_SIZE, "LS RQ Buffer overflow"); + +/* Poller API structures (arguments and callback data */ +typedef void (*spdk_nvmf_fc_del_assoc_cb)(void *arg, uint32_t err); + +struct spdk_nvmf_fc_ls_add_conn_api_data { + struct spdk_nvmf_fc_poller_api_add_connection_args args; + struct spdk_nvmf_fc_ls_rqst *ls_rqst; + struct spdk_nvmf_fc_association *assoc; + bool aq_conn; /* true if adding connection for new association */ +}; + +/* Disconnect (connection) request functions */ +struct spdk_nvmf_fc_ls_del_conn_api_data { + struct spdk_nvmf_fc_poller_api_del_connection_args args; + struct spdk_nvmf_fc_ls_rqst *ls_rqst; + struct spdk_nvmf_fc_association *assoc; + bool aq_conn; /* true if deleting AQ connection */ +}; + +/* used by LS disconnect association cmd handling */ +struct spdk_nvmf_fc_ls_disconn_assoc_api_data { + struct spdk_nvmf_fc_nport *tgtport; + struct spdk_nvmf_fc_ls_rqst *ls_rqst; +}; + +/* used by delete association call */ +struct spdk_nvmf_fc_delete_assoc_api_data { + struct spdk_nvmf_fc_poller_api_del_connection_args args; + struct spdk_nvmf_fc_association *assoc; + bool from_ls_rqst; /* true = request came for LS */ + spdk_nvmf_fc_del_assoc_cb del_assoc_cb; + void *del_assoc_cb_data; +}; + +struct nvmf_fc_ls_op_ctx { + union { + struct spdk_nvmf_fc_ls_add_conn_api_data add_conn; + struct spdk_nvmf_fc_ls_del_conn_api_data del_conn; + struct spdk_nvmf_fc_ls_disconn_assoc_api_data disconn_assoc; + struct spdk_nvmf_fc_delete_assoc_api_data del_assoc; + } u; + struct nvmf_fc_ls_op_ctx *next_op_ctx; +}; + +struct spdk_nvmf_fc_poller_api_queue_sync_args { + uint64_t u_id; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; + + /* Used internally by poller */ + TAILQ_ENTRY(spdk_nvmf_fc_poller_api_queue_sync_args) link; +}; + +/** + * Following defines and structures are used to pass messages between master thread + * and FCT driver. + */ +enum spdk_fc_event { + SPDK_FC_HW_PORT_INIT, + SPDK_FC_HW_PORT_ONLINE, + SPDK_FC_HW_PORT_OFFLINE, + SPDK_FC_HW_PORT_RESET, + SPDK_FC_NPORT_CREATE, + SPDK_FC_NPORT_DELETE, + SPDK_FC_IT_ADD, /* PRLI */ + SPDK_FC_IT_DELETE, /* PRLI */ + SPDK_FC_ABTS_RECV, + SPDK_FC_LINK_BREAK, + SPDK_FC_HW_PORT_DUMP, + SPDK_FC_UNRECOVERABLE_ERR, + SPDK_FC_EVENT_MAX, +}; + +/** + * Arguments for to dump assoc id + */ +struct spdk_nvmf_fc_dump_assoc_id_args { + uint8_t pport_handle; + uint16_t nport_handle; + uint32_t assoc_id; +}; + +/** + * Arguments for HW port init event. + */ +struct spdk_nvmf_fc_hw_port_init_args { + uint32_t ls_queue_size; + spdk_nvmf_fc_lld_hwqp_t ls_queue; + uint32_t io_queue_size; + uint32_t io_queue_cnt; + spdk_nvmf_fc_lld_hwqp_t *io_queues; + void *cb_ctx; + void *port_ctx; + uint8_t port_handle; + uint8_t nvme_aq_index; /* io_queue used for nvme admin queue */ + uint16_t fcp_rq_id; /* Base rq ID of SCSI queue */ +}; + +/** + * Arguments for HW port link break event. + */ +struct spdk_nvmf_hw_port_link_break_args { + uint8_t port_handle; + void *cb_ctx; +}; + +/** + * Arguments for HW port online event. + */ +struct spdk_nvmf_fc_hw_port_online_args { + uint8_t port_handle; + void *cb_ctx; +}; + +/** + * Arguments for HW port offline event. + */ +struct spdk_nvmf_fc_hw_port_offline_args { + uint8_t port_handle; + void *cb_ctx; +}; + +/** + * Arguments for n-port add event. 
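+ *
+ * A minimal usage sketch (illustrative only): a low-level driver fills this structure
+ * and queues it to the master thread with nvmf_fc_master_enqueue_event(). The handle
+ * and d_id values below are made up, and lld_event_done stands for a driver-supplied
+ * spdk_nvmf_fc_callback; it assumes the physical port was already set up via
+ * SPDK_FC_HW_PORT_INIT.
+ *
+ *     struct spdk_nvmf_fc_nport_create_args args = { 0 };
+ *
+ *     args.port_handle  = 0;
+ *     args.nport_handle = 1;
+ *     args.d_id         = 0x010203;
+ *     nvmf_fc_master_enqueue_event(SPDK_FC_NPORT_CREATE, &args, lld_event_done);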
+ */ +struct spdk_nvmf_fc_nport_create_args { + uint8_t port_handle; + uint16_t nport_handle; + struct spdk_uuid container_uuid; /* UUID of the nports container */ + struct spdk_uuid nport_uuid; /* Unique UUID for the nport */ + uint32_t d_id; + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + uint32_t subsys_id; /* Subsystemid */ + char port_id[SPDK_NVMF_PORT_ID_MAX_LEN]; + void *cb_ctx; +}; + +/** + * Arguments for n-port delete event. + */ +struct spdk_nvmf_fc_nport_delete_args { + uint8_t port_handle; + uint32_t nport_handle; + uint32_t subsys_id; /* Subsystem id */ + void *cb_ctx; +}; + +/** + * Arguments for I_T add event. + */ +struct spdk_nvmf_fc_hw_i_t_add_args { + uint8_t port_handle; + uint32_t nport_handle; + uint16_t itn_handle; + uint32_t rpi; + uint32_t s_id; + uint32_t initiator_prli_info; + uint32_t target_prli_info; /* populated by the SPDK master */ + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + void *cb_ctx; +}; + +/** + * Arguments for I_T delete event. + */ +struct spdk_nvmf_fc_hw_i_t_delete_args { + uint8_t port_handle; + uint32_t nport_handle; + uint16_t itn_handle; /* Only used by FC LLD driver; unused in SPDK */ + uint32_t rpi; + uint32_t s_id; + void *cb_ctx; +}; + +/** + * Arguments for ABTS event. + */ +struct spdk_nvmf_fc_abts_args { + uint8_t port_handle; + uint32_t nport_handle; + uint32_t rpi; + uint16_t oxid, rxid; + void *cb_ctx; +}; + +/** + * Arguments for link break event. + */ +struct spdk_nvmf_fc_link_break_args { + uint8_t port_handle; +}; + +/** + * Arguments for port reset event. + */ +struct spdk_nvmf_fc_hw_port_reset_args { + uint8_t port_handle; + bool dump_queues; + char reason[SPDK_FC_HW_DUMP_REASON_STR_MAX_SIZE]; + uint32_t **dump_buf; + void *cb_ctx; +}; + +/** + * Arguments for unrecoverable error event + */ +struct spdk_nvmf_fc_unrecoverable_error_event_args { +}; + +/** + * Callback function to the FCT driver. + */ +typedef void (*spdk_nvmf_fc_callback)(uint8_t port_handle, + enum spdk_fc_event event_type, + void *arg, int err); + +/** + * Enqueue an FCT event to master thread + * + * \param event_type Type of the event. + * \param args Pointer to the argument structure. + * \param cb_func Callback function into fc driver. + * + * \return 0 on success, non-zero on failure. + */ +int +nvmf_fc_master_enqueue_event(enum spdk_fc_event event_type, + void *args, + spdk_nvmf_fc_callback cb_func); + +/* + * dump info + */ +struct spdk_nvmf_fc_queue_dump_info { + char *buffer; + int offset; +}; +#define SPDK_FC_HW_DUMP_BUF_SIZE (10 * 4096) + +static inline void +nvmf_fc_dump_buf_print(struct spdk_nvmf_fc_queue_dump_info *dump_info, char *fmt, ...) 
+{ + uint64_t buffer_size = SPDK_FC_HW_DUMP_BUF_SIZE; + int32_t avail = (int32_t)(buffer_size - dump_info->offset); + + if (avail > 0) { + va_list ap; + int32_t written; + + va_start(ap, fmt); + written = vsnprintf(dump_info->buffer + dump_info->offset, avail, fmt, ap); + if (written >= avail) { + dump_info->offset += avail; + } else { + dump_info->offset += written; + } + va_end(ap); + } +} + +/* + * NVMF FC caller callback definitions + */ +typedef void (*spdk_nvmf_fc_caller_cb)(void *hwqp, int32_t status, void *args); + +struct spdk_nvmf_fc_caller_ctx { + void *ctx; + spdk_nvmf_fc_caller_cb cb; + void *cb_args; + TAILQ_ENTRY(spdk_nvmf_fc_caller_ctx) link; +}; + +/* + * NVMF FC Exchange Info (for debug) + */ +struct spdk_nvmf_fc_xchg_info { + uint32_t xchg_base; + uint32_t xchg_total_count; + uint32_t xchg_avail_count; + uint32_t send_frame_xchg_id; + uint8_t send_frame_seqid; +}; + +/* + * NVMF FC inline and function prototypes + */ + +static inline struct spdk_nvmf_fc_request * +nvmf_fc_get_fc_req(struct spdk_nvmf_request *req) +{ + return (struct spdk_nvmf_fc_request *) + ((uintptr_t)req - offsetof(struct spdk_nvmf_fc_request, req)); +} + +static inline bool +nvmf_fc_is_port_dead(struct spdk_nvmf_fc_hwqp *hwqp) +{ + switch (hwqp->fc_port->hw_port_status) { + case SPDK_FC_PORT_QUIESCED: + return true; + default: + return false; + } +} + +static inline bool +nvmf_fc_req_in_xfer(struct spdk_nvmf_fc_request *fc_req) +{ + switch (fc_req->state) { + case SPDK_NVMF_FC_REQ_READ_XFER: + case SPDK_NVMF_FC_REQ_READ_RSP: + case SPDK_NVMF_FC_REQ_WRITE_XFER: + case SPDK_NVMF_FC_REQ_WRITE_RSP: + case SPDK_NVMF_FC_REQ_NONE_RSP: + return true; + default: + return false; + } +} + +static inline void +nvmf_fc_create_trid(struct spdk_nvme_transport_id *trid, uint64_t n_wwn, uint64_t p_wwn) +{ + spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_FC); + trid->adrfam = SPDK_NVMF_ADRFAM_FC; + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "none"); + snprintf(trid->traddr, sizeof(trid->traddr), "nn-0x%lx:pn-0x%lx", n_wwn, p_wwn); +} + +void nvmf_fc_ls_init(struct spdk_nvmf_fc_port *fc_port); + +void nvmf_fc_ls_fini(struct spdk_nvmf_fc_port *fc_port); + +void nvmf_fc_handle_ls_rqst(struct spdk_nvmf_fc_ls_rqst *ls_rqst); +void nvmf_fc_ls_add_conn_failure( + struct spdk_nvmf_fc_association *assoc, + struct spdk_nvmf_fc_ls_rqst *ls_rqst, + struct spdk_nvmf_fc_conn *fc_conn, + bool aq_conn); + +void nvmf_fc_init_hwqp(struct spdk_nvmf_fc_port *fc_port, struct spdk_nvmf_fc_hwqp *hwqp); + +void nvmf_fc_init_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp); + +struct spdk_nvmf_fc_conn *nvmf_fc_hwqp_find_fc_conn(struct spdk_nvmf_fc_hwqp *hwqp, + uint64_t conn_id); + +void nvmf_fc_hwqp_reinit_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp, void *queues_curr); + +struct spdk_nvmf_fc_port *nvmf_fc_port_lookup(uint8_t port_hdl); + +bool nvmf_fc_port_is_offline(struct spdk_nvmf_fc_port *fc_port); + +int nvmf_fc_port_set_offline(struct spdk_nvmf_fc_port *fc_port); + +bool nvmf_fc_port_is_online(struct spdk_nvmf_fc_port *fc_port); + +int nvmf_fc_port_set_online(struct spdk_nvmf_fc_port *fc_port); + +int nvmf_fc_rport_set_state(struct spdk_nvmf_fc_remote_port_info *rport, + enum spdk_nvmf_fc_object_state state); + +void nvmf_fc_port_add(struct spdk_nvmf_fc_port *fc_port); + +int nvmf_fc_port_add_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport); + +int nvmf_fc_port_remove_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport); + +struct spdk_nvmf_fc_nport 
*nvmf_fc_nport_find(uint8_t port_hdl, uint16_t nport_hdl); + +int nvmf_fc_nport_set_state(struct spdk_nvmf_fc_nport *nport, + enum spdk_nvmf_fc_object_state state); + +bool nvmf_fc_nport_add_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port); + +bool nvmf_fc_nport_remove_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port); + +bool nvmf_fc_nport_has_no_rport(struct spdk_nvmf_fc_nport *nport); + +int nvmf_fc_assoc_set_state(struct spdk_nvmf_fc_association *assoc, + enum spdk_nvmf_fc_object_state state); + +int nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport, + uint64_t assoc_id, bool send_abts, bool backend_initiated, + spdk_nvmf_fc_del_assoc_cb del_assoc_cb, + void *cb_data); + +bool nvmf_ctrlr_is_on_nport(uint8_t port_hdl, uint16_t nport_hdl, + struct spdk_nvmf_ctrlr *ctrlr); + +void nvmf_fc_assign_queue_to_master_thread(struct spdk_nvmf_fc_hwqp *hwqp); + +void nvmf_fc_poll_group_add_hwqp(struct spdk_nvmf_fc_hwqp *hwqp); + +void nvmf_fc_poll_group_remove_hwqp(struct spdk_nvmf_fc_hwqp *hwqp); + +int nvmf_fc_hwqp_set_online(struct spdk_nvmf_fc_hwqp *hwqp); + +int nvmf_fc_hwqp_set_offline(struct spdk_nvmf_fc_hwqp *hwqp); + +uint32_t nvmf_fc_get_prli_service_params(void); + +void nvmf_fc_handle_abts_frame(struct spdk_nvmf_fc_nport *nport, uint16_t rpi, uint16_t oxid, + uint16_t rxid); + +void nvmf_fc_request_abort(struct spdk_nvmf_fc_request *fc_req, bool send_abts, + spdk_nvmf_fc_caller_cb cb, void *cb_args); + +struct spdk_nvmf_tgt *nvmf_fc_get_tgt(void); + +struct spdk_thread *nvmf_fc_get_master_thread(void); + +/* + * These functions are called by low level FC driver + */ + +static inline struct spdk_nvmf_fc_conn * +nvmf_fc_get_conn(struct spdk_nvmf_qpair *qpair) +{ + return (struct spdk_nvmf_fc_conn *) + ((uintptr_t)qpair - offsetof(struct spdk_nvmf_fc_conn, qpair)); +} + +static inline uint16_t +nvmf_fc_advance_conn_sqhead(struct spdk_nvmf_qpair *qpair) +{ + /* advance sq_head pointer - wrap if needed */ + qpair->sq_head = (qpair->sq_head == qpair->sq_head_max) ? + 0 : (qpair->sq_head + 1); + return qpair->sq_head; +} + +static inline bool +nvmf_fc_use_send_frame(struct spdk_nvmf_request *req) +{ + /* For now use for only keepalives. 
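Only Keep Alive commands arriving on the admin queue (qid 0) take the send-frame path, as checked below.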
*/ + if (req->qpair->qid == 0 && + (req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_KEEP_ALIVE)) { + return true; + } + return false; +} + +enum spdk_nvmf_fc_poller_api_ret nvmf_fc_poller_api_func( + struct spdk_nvmf_fc_hwqp *hwqp, + enum spdk_nvmf_fc_poller_api api, + void *api_args); + +int nvmf_fc_hwqp_process_frame(struct spdk_nvmf_fc_hwqp *hwqp, uint32_t buff_idx, + struct spdk_nvmf_fc_frame_hdr *frame, + struct spdk_nvmf_fc_buffer_desc *buffer, uint32_t plen); + +void nvmf_fc_hwqp_process_pending_reqs(struct spdk_nvmf_fc_hwqp *hwqp); + +void nvmf_fc_hwqp_process_pending_ls_rqsts(struct spdk_nvmf_fc_hwqp *hwqp); + +void nvmf_fc_request_set_state(struct spdk_nvmf_fc_request *fc_req, + enum spdk_nvmf_fc_request_state state); + +char *nvmf_fc_request_get_state_str(int state); + +void _nvmf_fc_request_free(struct spdk_nvmf_fc_request *fc_req); + +void nvmf_fc_request_abort_complete(void *arg1); + +bool nvmf_fc_send_ersp_required(struct spdk_nvmf_fc_request *fc_req, + uint32_t rsp_cnt, uint32_t xfer_len); + +int nvmf_fc_handle_rsp(struct spdk_nvmf_fc_request *req); + +#endif diff --git a/src/spdk/lib/nvmf/nvmf_internal.h b/src/spdk/lib/nvmf/nvmf_internal.h new file mode 100644 index 000000000..f1f3837d5 --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf_internal.h @@ -0,0 +1,371 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __NVMF_INTERNAL_H__ +#define __NVMF_INTERNAL_H__ + +#include "spdk/stdinc.h" + +#include "spdk/likely.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_cmd.h" +#include "spdk/nvmf_transport.h" +#include "spdk/nvmf_spec.h" +#include "spdk/assert.h" +#include "spdk/bdev.h" +#include "spdk/queue.h" +#include "spdk/util.h" +#include "spdk/thread.h" + +#define NVMF_MAX_ASYNC_EVENTS (4) + +enum spdk_nvmf_subsystem_state { + SPDK_NVMF_SUBSYSTEM_INACTIVE = 0, + SPDK_NVMF_SUBSYSTEM_ACTIVATING, + SPDK_NVMF_SUBSYSTEM_ACTIVE, + SPDK_NVMF_SUBSYSTEM_PAUSING, + SPDK_NVMF_SUBSYSTEM_PAUSED, + SPDK_NVMF_SUBSYSTEM_RESUMING, + SPDK_NVMF_SUBSYSTEM_DEACTIVATING, +}; + +struct spdk_nvmf_tgt { + char name[NVMF_TGT_NAME_MAX_LENGTH]; + + pthread_mutex_t mutex; + + uint64_t discovery_genctr; + + uint32_t max_subsystems; + + /* Array of subsystem pointers of size max_subsystems indexed by sid */ + struct spdk_nvmf_subsystem **subsystems; + + TAILQ_HEAD(, spdk_nvmf_transport) transports; + TAILQ_HEAD(, spdk_nvmf_poll_group) poll_groups; + + /* Used for round-robin assignment of connections to poll groups */ + struct spdk_nvmf_poll_group *next_poll_group; + + spdk_nvmf_tgt_destroy_done_fn *destroy_cb_fn; + void *destroy_cb_arg; + + TAILQ_ENTRY(spdk_nvmf_tgt) link; +}; + +struct spdk_nvmf_host { + char nqn[SPDK_NVMF_NQN_MAX_LEN + 1]; + TAILQ_ENTRY(spdk_nvmf_host) link; +}; + +struct spdk_nvmf_subsystem_listener { + struct spdk_nvmf_subsystem *subsystem; + spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn; + void *cb_arg; + struct spdk_nvme_transport_id *trid; + struct spdk_nvmf_transport *transport; + TAILQ_ENTRY(spdk_nvmf_subsystem_listener) link; +}; + +/* Maximum number of registrants supported per namespace */ +#define SPDK_NVMF_MAX_NUM_REGISTRANTS 16 + +struct spdk_nvmf_registrant_info { + uint64_t rkey; + char host_uuid[SPDK_UUID_STRING_LEN]; +}; + +struct spdk_nvmf_reservation_info { + bool ptpl_activated; + enum spdk_nvme_reservation_type rtype; + uint64_t crkey; + char bdev_uuid[SPDK_UUID_STRING_LEN]; + char holder_uuid[SPDK_UUID_STRING_LEN]; + uint32_t num_regs; + struct spdk_nvmf_registrant_info registrants[SPDK_NVMF_MAX_NUM_REGISTRANTS]; +}; + +struct spdk_nvmf_subsystem_pg_ns_info { + struct spdk_io_channel *channel; + struct spdk_uuid uuid; + /* current reservation key, no reservation if the value is 0 */ + uint64_t crkey; + /* reservation type */ + enum spdk_nvme_reservation_type rtype; + /* Host ID which holds the reservation */ + struct spdk_uuid holder_id; + /* Host ID for the registrants with the namespace */ + struct spdk_uuid reg_hostid[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint64_t num_blocks; +}; + +typedef void(*spdk_nvmf_poll_group_mod_done)(void *cb_arg, int status); + +struct spdk_nvmf_subsystem_poll_group { + /* Array of namespace information for each namespace indexed by nsid - 1 */ + struct spdk_nvmf_subsystem_pg_ns_info *ns_info; + uint32_t num_ns; + + uint64_t io_outstanding; + spdk_nvmf_poll_group_mod_done cb_fn; + void *cb_arg; + + enum spdk_nvmf_subsystem_state state; + + TAILQ_HEAD(, spdk_nvmf_request) queued; +}; + +struct spdk_nvmf_registrant { + TAILQ_ENTRY(spdk_nvmf_registrant) link; + struct spdk_uuid hostid; + /* Registration key */ + uint64_t rkey; +}; + +struct spdk_nvmf_ns { + uint32_t nsid; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_nvmf_ns_opts opts; + /* reservation notificaton mask */ + uint32_t mask; + /* generation code */ + uint32_t gen; + /* registrants head */ + TAILQ_HEAD(, 
spdk_nvmf_registrant) registrants; + /* current reservation key */ + uint64_t crkey; + /* reservation type */ + enum spdk_nvme_reservation_type rtype; + /* current reservation holder, only valid if reservation type can only have one holder */ + struct spdk_nvmf_registrant *holder; + /* Persist Through Power Loss file which contains the persistent reservation */ + char *ptpl_file; + /* Persist Through Power Loss feature is enabled */ + bool ptpl_activated; +}; + +struct spdk_nvmf_ctrlr_feat { + union spdk_nvme_feat_arbitration arbitration; + union spdk_nvme_feat_power_management power_management; + union spdk_nvme_feat_error_recovery error_recovery; + union spdk_nvme_feat_volatile_write_cache volatile_write_cache; + union spdk_nvme_feat_number_of_queues number_of_queues; + union spdk_nvme_feat_write_atomicity write_atomicity; + union spdk_nvme_feat_async_event_configuration async_event_configuration; + union spdk_nvme_feat_keep_alive_timer keep_alive_timer; +}; + +/* + * NVMf reservation notificaton log page. + */ +struct spdk_nvmf_reservation_log { + struct spdk_nvme_reservation_notification_log log; + TAILQ_ENTRY(spdk_nvmf_reservation_log) link; + struct spdk_nvmf_ctrlr *ctrlr; +}; + +/* + * This structure represents an NVMe-oF controller, + * which is like a "session" in networking terms. + */ +struct spdk_nvmf_ctrlr { + uint16_t cntlid; + char hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1]; + struct spdk_nvmf_subsystem *subsys; + + struct spdk_nvmf_ctrlr_data cdata; + + struct spdk_nvmf_registers vcprop; + + struct spdk_nvmf_ctrlr_feat feat; + + struct spdk_nvmf_qpair *admin_qpair; + struct spdk_thread *thread; + struct spdk_bit_array *qpair_mask; + + struct spdk_nvmf_request *aer_req[NVMF_MAX_ASYNC_EVENTS]; + union spdk_nvme_async_event_completion notice_event; + union spdk_nvme_async_event_completion reservation_event; + uint8_t nr_aer_reqs; + struct spdk_uuid hostid; + + uint16_t changed_ns_list_count; + struct spdk_nvme_ns_list changed_ns_list; + uint64_t log_page_count; + uint8_t num_avail_log_pages; + TAILQ_HEAD(log_page_head, spdk_nvmf_reservation_log) log_head; + + /* Time to trigger keep-alive--poller_time = now_tick + period */ + uint64_t last_keep_alive_tick; + struct spdk_poller *keep_alive_poller; + + bool dif_insert_or_strip; + + TAILQ_ENTRY(spdk_nvmf_ctrlr) link; +}; + +struct spdk_nvmf_subsystem { + struct spdk_thread *thread; + uint32_t id; + enum spdk_nvmf_subsystem_state state; + + char subnqn[SPDK_NVMF_NQN_MAX_LEN + 1]; + enum spdk_nvmf_subtype subtype; + uint16_t next_cntlid; + bool allow_any_host; + bool allow_any_listener; + + struct spdk_nvmf_tgt *tgt; + + char sn[SPDK_NVME_CTRLR_SN_LEN + 1]; + char mn[SPDK_NVME_CTRLR_MN_LEN + 1]; + + /* Array of pointers to namespaces of size max_nsid indexed by nsid - 1 */ + struct spdk_nvmf_ns **ns; + uint32_t max_nsid; + /* This is the maximum allowed nsid to a subsystem */ + uint32_t max_allowed_nsid; + + TAILQ_HEAD(, spdk_nvmf_ctrlr) ctrlrs; + TAILQ_HEAD(, spdk_nvmf_host) hosts; + TAILQ_HEAD(, spdk_nvmf_subsystem_listener) listeners; + + TAILQ_ENTRY(spdk_nvmf_subsystem) entries; +}; + +int nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_transport *transport); +int nvmf_poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem); +int nvmf_poll_group_add_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void nvmf_poll_group_remove_subsystem(struct 
spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); + +void nvmf_get_discovery_log_page(struct spdk_nvmf_tgt *tgt, const char *hostnqn, + struct iovec *iov, + uint32_t iovcnt, uint64_t offset, uint32_t length); + +void nvmf_ctrlr_destruct(struct spdk_nvmf_ctrlr *ctrlr); +int nvmf_ctrlr_process_fabrics_cmd(struct spdk_nvmf_request *req); +int nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req); +int nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req); +bool nvmf_ctrlr_dsm_supported(struct spdk_nvmf_ctrlr *ctrlr); +bool nvmf_ctrlr_write_zeroes_supported(struct spdk_nvmf_ctrlr *ctrlr); +void nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid); + +void nvmf_bdev_ctrlr_identify_ns(struct spdk_nvmf_ns *ns, struct spdk_nvme_ns_data *nsdata, + bool dif_insert_or_strip); +int nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_compare_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_compare_and_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *cmp_req, struct spdk_nvmf_request *write_req); +int nvmf_bdev_ctrlr_write_zeroes_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_flush_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_dsm_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +int nvmf_bdev_ctrlr_nvme_passthru_io(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req); +bool nvmf_bdev_ctrlr_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_nvme_cmd *cmd, + struct spdk_dif_ctx *dif_ctx); + +int nvmf_subsystem_add_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr); +void nvmf_subsystem_remove_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr); +void nvmf_subsystem_remove_all_listeners(struct spdk_nvmf_subsystem *subsystem, + bool stop); +struct spdk_nvmf_ctrlr *nvmf_subsystem_get_ctrlr(struct spdk_nvmf_subsystem *subsystem, + uint16_t cntlid); +struct spdk_nvmf_subsystem_listener *nvmf_subsystem_find_listener( + struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid); +struct spdk_nvmf_listener *nvmf_transport_find_listener( + struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid); + +int nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr); +void nvmf_ctrlr_async_event_reservation_notification(struct spdk_nvmf_ctrlr *ctrlr); +void nvmf_ns_reservation_request(void *ctx); +void nvmf_ctrlr_reservation_notice_log(struct spdk_nvmf_ctrlr *ctrlr, + 
struct spdk_nvmf_ns *ns, + enum spdk_nvme_reservation_notification_log_page_type type); + +/* + * Abort aer is sent on a per controller basis and sends a completion for the aer to the host. + * This function should be called when attempting to recover in error paths when it is OK for + * the host to send a subsequent AER. + */ +void nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr); + +/* + * Free aer simply frees the rdma resources for the aer without informing the host. + * This function should be called when deleting a qpair when one wants to make sure + * the qpair is completely empty before freeing the request. The reason we free the + * AER without sending a completion is to prevent the host from sending another AER. + */ +void nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair); + +int nvmf_ctrlr_abort_request(struct spdk_nvmf_request *req); + +static inline struct spdk_nvmf_ns * +_nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + /* NOTE: This implicitly also checks for 0, since 0 - 1 wraps around to UINT32_MAX. */ + if (spdk_unlikely(nsid - 1 >= subsystem->max_nsid)) { + return NULL; + } + + return subsystem->ns[nsid - 1]; +} + +static inline bool +nvmf_qpair_is_admin_queue(struct spdk_nvmf_qpair *qpair) +{ + return qpair->qid == 0; +} + +#endif /* __NVMF_INTERNAL_H__ */ diff --git a/src/spdk/lib/nvmf/nvmf_rpc.c b/src/spdk/lib/nvmf/nvmf_rpc.c new file mode 100644 index 000000000..5dc9f42f0 --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf_rpc.c @@ -0,0 +1,2012 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2018-2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/bdev.h" +#include "spdk/log.h" +#include "spdk/rpc.h" +#include "spdk/env.h" +#include "spdk/nvme.h" +#include "spdk/nvmf.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" +#include "spdk_internal/assert.h" + +#include "nvmf_internal.h" + +static int +json_write_hex_str(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + static const char hex_char[16] = "0123456789ABCDEF"; + const uint8_t *buf = data; + char *str, *out; + int rc; + + str = malloc(size * 2 + 1); + if (str == NULL) { + return -1; + } + + out = str; + while (size--) { + unsigned byte = *buf++; + + out[0] = hex_char[(byte >> 4) & 0xF]; + out[1] = hex_char[byte & 0xF]; + + out += 2; + } + *out = '\0'; + + rc = spdk_json_write_string(w, str); + free(str); + + return rc; +} + +static int +hex_nybble_to_num(char c) +{ + if (c >= '0' && c <= '9') { + return c - '0'; + } + + if (c >= 'a' && c <= 'f') { + return c - 'a' + 0xA; + } + + if (c >= 'A' && c <= 'F') { + return c - 'A' + 0xA; + } + + return -1; +} + +static int +hex_byte_to_num(const char *str) +{ + int hi, lo; + + hi = hex_nybble_to_num(str[0]); + if (hi < 0) { + return hi; + } + + lo = hex_nybble_to_num(str[1]); + if (lo < 0) { + return lo; + } + + return hi * 16 + lo; +} + +static int +decode_hex_string_be(const char *str, uint8_t *out, size_t size) +{ + size_t i; + + /* Decode a string in "ABCDEF012345" format to its binary representation */ + for (i = 0; i < size; i++) { + int num = hex_byte_to_num(str); + + if (num < 0) { + /* Invalid hex byte or end of string */ + return -1; + } + + out[i] = (uint8_t)num; + str += 2; + } + + if (i != size || *str != '\0') { + /* Length mismatch */ + return -1; + } + + return 0; +} + +static int +decode_ns_nguid(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + /* 16-byte NGUID */ + rc = decode_hex_string_be(str, out, 16); + } + + free(str); + return rc; +} + +static int +decode_ns_eui64(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + /* 8-byte EUI-64 */ + rc = decode_hex_string_be(str, out, 8); + } + + free(str); + return rc; +} + +static int +decode_ns_uuid(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + rc = spdk_uuid_parse(out, str); + } + + free(str); + return rc; +} + +struct rpc_get_subsystem { + char *tgt_name; +}; + +static const struct spdk_json_object_decoder rpc_get_subsystem_decoders[] = { + {"tgt_name", offsetof(struct rpc_get_subsystem, tgt_name), spdk_json_decode_string, true}, +}; + +static void +dump_nvmf_subsystem(struct spdk_json_write_ctx *w, struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_host *host; + struct spdk_nvmf_subsystem_listener *listener; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_name(w, "subtype"); + if (spdk_nvmf_subsystem_get_type(subsystem) == SPDK_NVMF_SUBTYPE_NVME) { + spdk_json_write_string(w, "NVMe"); + } else { + spdk_json_write_string(w, "Discovery"); + } + + spdk_json_write_named_array_begin(w, "listen_addresses"); + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + const struct spdk_nvme_transport_id *trid; + const char 
*adrfam; + + trid = spdk_nvmf_subsystem_listener_get_trid(listener); + + spdk_json_write_object_begin(w); + adrfam = spdk_nvme_transport_id_adrfam_str(trid->adrfam); + if (adrfam == NULL) { + adrfam = "unknown"; + } + /* NOTE: "transport" is kept for compatibility; new code should use "trtype" */ + spdk_json_write_named_string(w, "transport", trid->trstring); + spdk_json_write_named_string(w, "trtype", trid->trstring); + spdk_json_write_named_string(w, "adrfam", adrfam); + spdk_json_write_named_string(w, "traddr", trid->traddr); + spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_json_write_named_bool(w, "allow_any_host", + spdk_nvmf_subsystem_get_allow_any_host(subsystem)); + + spdk_json_write_named_array_begin(w, "hosts"); + + for (host = spdk_nvmf_subsystem_get_first_host(subsystem); host != NULL; + host = spdk_nvmf_subsystem_get_next_host(subsystem, host)) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "nqn", spdk_nvmf_host_get_nqn(host)); + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + if (spdk_nvmf_subsystem_get_type(subsystem) == SPDK_NVMF_SUBTYPE_NVME) { + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_ns_opts ns_opts; + uint32_t max_namespaces; + + spdk_json_write_named_string(w, "serial_number", spdk_nvmf_subsystem_get_sn(subsystem)); + + spdk_json_write_named_string(w, "model_number", spdk_nvmf_subsystem_get_mn(subsystem)); + + max_namespaces = spdk_nvmf_subsystem_get_max_namespaces(subsystem); + if (max_namespaces != 0) { + spdk_json_write_named_uint32(w, "max_namespaces", max_namespaces); + } + + spdk_json_write_named_array_begin(w, "namespaces"); + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + spdk_nvmf_ns_get_opts(ns, &ns_opts, sizeof(ns_opts)); + spdk_json_write_object_begin(w); + spdk_json_write_named_int32(w, "nsid", spdk_nvmf_ns_get_id(ns)); + spdk_json_write_named_string(w, "bdev_name", + spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns))); + /* NOTE: "name" is kept for compatibility only - new code should use bdev_name. 
*/ + spdk_json_write_named_string(w, "name", + spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns))); + + if (!spdk_mem_all_zero(ns_opts.nguid, sizeof(ns_opts.nguid))) { + spdk_json_write_name(w, "nguid"); + json_write_hex_str(w, ns_opts.nguid, sizeof(ns_opts.nguid)); + } + + if (!spdk_mem_all_zero(ns_opts.eui64, sizeof(ns_opts.eui64))) { + spdk_json_write_name(w, "eui64"); + json_write_hex_str(w, ns_opts.eui64, sizeof(ns_opts.eui64)); + } + + if (!spdk_mem_all_zero(&ns_opts.uuid, sizeof(ns_opts.uuid))) { + char uuid_str[SPDK_UUID_STRING_LEN]; + + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &ns_opts.uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + } + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + } + spdk_json_write_object_end(w); +} + +static void +rpc_nvmf_get_subsystems(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_subsystem req = { 0 }; + struct spdk_json_write_ctx *w; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + if (params) { + if (spdk_json_decode_object(params, rpc_get_subsystem_decoders, + SPDK_COUNTOF(rpc_get_subsystem_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + return; + } + } + + tgt = spdk_nvmf_get_tgt(req.tgt_name); + if (!tgt) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + free(req.tgt_name); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + subsystem = spdk_nvmf_subsystem_get_first(tgt); + while (subsystem) { + dump_nvmf_subsystem(w, subsystem); + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + free(req.tgt_name); +} +SPDK_RPC_REGISTER("nvmf_get_subsystems", rpc_nvmf_get_subsystems, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_get_subsystems, get_nvmf_subsystems) + +struct rpc_subsystem_create { + char *nqn; + char *serial_number; + char *model_number; + char *tgt_name; + uint32_t max_namespaces; + bool allow_any_host; +}; + +static const struct spdk_json_object_decoder rpc_subsystem_create_decoders[] = { + {"nqn", offsetof(struct rpc_subsystem_create, nqn), spdk_json_decode_string}, + {"serial_number", offsetof(struct rpc_subsystem_create, serial_number), spdk_json_decode_string, true}, + {"model_number", offsetof(struct rpc_subsystem_create, model_number), spdk_json_decode_string, true}, + {"tgt_name", offsetof(struct rpc_subsystem_create, tgt_name), spdk_json_decode_string, true}, + {"max_namespaces", offsetof(struct rpc_subsystem_create, max_namespaces), spdk_json_decode_uint32, true}, + {"allow_any_host", offsetof(struct rpc_subsystem_create, allow_any_host), spdk_json_decode_bool, true}, +}; + +static void +rpc_nvmf_subsystem_started(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + + if (!status) { + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } else { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Subsystem %s start failed", + subsystem->subnqn); + spdk_nvmf_subsystem_destroy(subsystem); + } +} + +static void +rpc_nvmf_create_subsystem(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + 
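/*
	 * Handler for the "nvmf_create_subsystem" RPC: decode the parameters, create the
	 * subsystem on the requested target, apply the optional SN/MN/allow_any_host
	 * settings and start it asynchronously (rpc_nvmf_subsystem_started sends the
	 * JSON-RPC response). An illustrative request, with parameter names taken from
	 * the decoder table above and made-up values:
	 *
	 *   { "method": "nvmf_create_subsystem",
	 *     "params": { "nqn": "nqn.2016-06.io.spdk:cnode1",
	 *                 "serial_number": "SPDK00000000000001",
	 *                 "allow_any_host": true } }
	 */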
struct rpc_subsystem_create *req; + struct spdk_nvmf_subsystem *subsystem = NULL; + struct spdk_nvmf_tgt *tgt; + int rc = -1; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Memory allocation failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Memory allocation failed"); + return; + } + + if (spdk_json_decode_object(params, rpc_subsystem_create_decoders, + SPDK_COUNTOF(rpc_subsystem_create_decoders), + req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto cleanup; + } + + tgt = spdk_nvmf_get_tgt(req->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find target %s\n", req->tgt_name); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find target %s", req->tgt_name); + goto cleanup; + } + + subsystem = spdk_nvmf_subsystem_create(tgt, req->nqn, SPDK_NVMF_SUBTYPE_NVME, + req->max_namespaces); + if (!subsystem) { + SPDK_ERRLOG("Unable to create subsystem %s\n", req->nqn); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to create subsystem %s", req->nqn); + goto cleanup; + } + + if (req->serial_number) { + if (spdk_nvmf_subsystem_set_sn(subsystem, req->serial_number)) { + SPDK_ERRLOG("Subsystem %s: invalid serial number '%s'\n", req->nqn, req->serial_number); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid SN %s", req->serial_number); + goto cleanup; + } + } + + if (req->model_number) { + if (spdk_nvmf_subsystem_set_mn(subsystem, req->model_number)) { + SPDK_ERRLOG("Subsystem %s: invalid model number '%s'\n", req->nqn, req->model_number); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid MN %s", req->model_number); + goto cleanup; + } + } + + spdk_nvmf_subsystem_set_allow_any_host(subsystem, req->allow_any_host); + + rc = spdk_nvmf_subsystem_start(subsystem, + rpc_nvmf_subsystem_started, + request); + +cleanup: + free(req->nqn); + free(req->tgt_name); + free(req->serial_number); + free(req->model_number); + free(req); + + if (rc && subsystem) { + spdk_nvmf_subsystem_destroy(subsystem); + } +} +SPDK_RPC_REGISTER("nvmf_create_subsystem", rpc_nvmf_create_subsystem, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_create_subsystem, nvmf_subsystem_create) + +struct rpc_delete_subsystem { + char *nqn; + char *tgt_name; +}; + +static void +free_rpc_delete_subsystem(struct rpc_delete_subsystem *r) +{ + free(r->nqn); + free(r->tgt_name); +} + +static void +rpc_nvmf_subsystem_stopped(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + nvmf_subsystem_remove_all_listeners(subsystem, true); + spdk_nvmf_subsystem_destroy(subsystem); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static const struct spdk_json_object_decoder rpc_delete_subsystem_decoders[] = { + {"nqn", offsetof(struct rpc_delete_subsystem, nqn), spdk_json_decode_string}, + {"tgt_name", offsetof(struct rpc_delete_subsystem, tgt_name), spdk_json_decode_string, true}, +}; + +static void +rpc_nvmf_delete_subsystem(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_subsystem req = { 0 }; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + if 
(spdk_json_decode_object(params, rpc_delete_subsystem_decoders, + SPDK_COUNTOF(rpc_delete_subsystem_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.nqn == NULL) { + SPDK_ERRLOG("missing name param\n"); + goto invalid; + } + + tgt = spdk_nvmf_get_tgt(req.tgt_name); + if (!tgt) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + goto invalid_custom_response; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, req.nqn); + if (!subsystem) { + goto invalid; + } + + free_rpc_delete_subsystem(&req); + + spdk_nvmf_subsystem_stop(subsystem, + rpc_nvmf_subsystem_stopped, + request); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +invalid_custom_response: + free_rpc_delete_subsystem(&req); +} +SPDK_RPC_REGISTER("nvmf_delete_subsystem", rpc_nvmf_delete_subsystem, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_delete_subsystem, delete_nvmf_subsystem) + +struct rpc_listen_address { + char *transport; + char *adrfam; + char *traddr; + char *trsvcid; +}; + +#define RPC_MAX_LISTEN_ADDRESSES 255 +#define RPC_MAX_NAMESPACES 255 + +struct rpc_listen_addresses { + size_t num_listen_address; + struct rpc_listen_address addresses[RPC_MAX_LISTEN_ADDRESSES]; +}; + +static const struct spdk_json_object_decoder rpc_listen_address_decoders[] = { + /* NOTE: "transport" is kept for compatibility; new code should use "trtype" */ + {"transport", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true}, + {"trtype", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true}, + {"adrfam", offsetof(struct rpc_listen_address, adrfam), spdk_json_decode_string, true}, + {"traddr", offsetof(struct rpc_listen_address, traddr), spdk_json_decode_string}, + {"trsvcid", offsetof(struct rpc_listen_address, trsvcid), spdk_json_decode_string}, +}; + +static int +decode_rpc_listen_address(const struct spdk_json_val *val, void *out) +{ + struct rpc_listen_address *req = (struct rpc_listen_address *)out; + if (spdk_json_decode_object(val, rpc_listen_address_decoders, + SPDK_COUNTOF(rpc_listen_address_decoders), + req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + return -1; + } + return 0; +} + +static void +free_rpc_listen_address(struct rpc_listen_address *r) +{ + free(r->transport); + free(r->adrfam); + free(r->traddr); + free(r->trsvcid); +} + +enum nvmf_rpc_listen_op { + NVMF_RPC_LISTEN_ADD, + NVMF_RPC_LISTEN_REMOVE, +}; + +struct nvmf_rpc_listener_ctx { + char *nqn; + char *tgt_name; + struct spdk_nvmf_tgt *tgt; + struct spdk_nvmf_subsystem *subsystem; + struct rpc_listen_address address; + + struct spdk_jsonrpc_request *request; + struct spdk_nvme_transport_id trid; + enum nvmf_rpc_listen_op op; + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_listener_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_listener_ctx, nqn), spdk_json_decode_string}, + {"listen_address", offsetof(struct nvmf_rpc_listener_ctx, address), decode_rpc_listen_address}, + {"tgt_name", offsetof(struct nvmf_rpc_listener_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +nvmf_rpc_listener_ctx_free(struct nvmf_rpc_listener_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->tgt_name); + free_rpc_listen_address(&ctx->address); + free(ctx); +} + +static void +nvmf_rpc_listen_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct 
nvmf_rpc_listener_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; + + request = ctx->request; + if (ctx->response_sent) { + /* If an error occurred, the response has already been sent. */ + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + nvmf_rpc_listener_ctx_free(ctx); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_subsystem_listen(void *cb_arg, int status) +{ + struct nvmf_rpc_listener_ctx *ctx = cb_arg; + + if (status) { + /* Destroy the listener that we just created. Ignore the error code because + * the RPC is failing already anyway. */ + spdk_nvmf_tgt_stop_listen(ctx->tgt, &ctx->trid); + + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(ctx->subsystem, nvmf_rpc_listen_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_listener_ctx_free(ctx); + /* Can't really do anything to recover here - subsystem will remain paused. */ + } +} + +static void +nvmf_rpc_listen_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_listener_ctx *ctx = cb_arg; + int rc; + + if (ctx->op == NVMF_RPC_LISTEN_ADD) { + if (!nvmf_subsystem_find_listener(subsystem, &ctx->trid)) { + rc = spdk_nvmf_tgt_listen(ctx->tgt, &ctx->trid); + if (rc == 0) { + spdk_nvmf_subsystem_add_listener(ctx->subsystem, &ctx->trid, nvmf_rpc_subsystem_listen, ctx); + return; + } + + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + } else if (ctx->op == NVMF_RPC_LISTEN_REMOVE) { + if (spdk_nvmf_subsystem_remove_listener(subsystem, &ctx->trid)) { + SPDK_ERRLOG("Unable to remove listener.\n"); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + spdk_nvmf_tgt_stop_listen(ctx->tgt, &ctx->trid); + } else { + SPDK_UNREACHABLE(); + } + + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_listen_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_listener_ctx_free(ctx); + /* Can't really do anything to recover here - subsystem will remain paused. 
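The listener add/remove itself has already been handled above; only the resume step failed.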
*/ + } +} + +static int +rpc_listen_address_to_trid(const struct rpc_listen_address *address, + struct spdk_nvme_transport_id *trid) +{ + size_t len; + + memset(trid, 0, sizeof(*trid)); + + if (spdk_nvme_transport_id_populate_trstring(trid, address->transport)) { + SPDK_ERRLOG("Invalid transport string: %s\n", address->transport); + return -EINVAL; + } + + if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, address->transport)) { + SPDK_ERRLOG("Invalid transport type: %s\n", address->transport); + return -EINVAL; + } + + if (address->adrfam) { + if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, address->adrfam)) { + SPDK_ERRLOG("Invalid adrfam: %s\n", address->adrfam); + return -EINVAL; + } + } else { + trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; + } + + len = strlen(address->traddr); + if (len > sizeof(trid->traddr) - 1) { + SPDK_ERRLOG("Transport address longer than %zu characters: %s\n", + sizeof(trid->traddr) - 1, address->traddr); + return -EINVAL; + } + memcpy(trid->traddr, address->traddr, len + 1); + + len = strlen(address->trsvcid); + if (len > sizeof(trid->trsvcid) - 1) { + SPDK_ERRLOG("Transport service id longer than %zu characters: %s\n", + sizeof(trid->trsvcid) - 1, address->trsvcid); + return -EINVAL; + } + memcpy(trid->trsvcid, address->trsvcid, len + 1); + + return 0; +} + +static void +rpc_nvmf_subsystem_add_listener(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_listener_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + ctx->request = request; + + if (spdk_json_decode_object(params, nvmf_rpc_listener_decoder, + SPDK_COUNTOF(nvmf_rpc_listener_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + ctx->tgt = tgt; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->subsystem = subsystem; + + if (rpc_listen_address_to_trid(&ctx->address, &ctx->trid)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->op = NVMF_RPC_LISTEN_ADD; + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_listener_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_add_listener", rpc_nvmf_subsystem_add_listener, + SPDK_RPC_RUNTIME); + +static void +rpc_nvmf_subsystem_remove_listener(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_listener_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + 
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + ctx->request = request; + + if (spdk_json_decode_object(params, nvmf_rpc_listener_decoder, + SPDK_COUNTOF(nvmf_rpc_listener_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + ctx->tgt = tgt; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->subsystem = subsystem; + + if (rpc_listen_address_to_trid(&ctx->address, &ctx->trid)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->op = NVMF_RPC_LISTEN_REMOVE; + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_listener_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_remove_listener", rpc_nvmf_subsystem_remove_listener, + SPDK_RPC_RUNTIME); + +struct spdk_nvmf_ns_params { + char *bdev_name; + char *ptpl_file; + uint32_t nsid; + char nguid[16]; + char eui64[8]; + struct spdk_uuid uuid; +}; + +struct rpc_namespaces { + size_t num_ns; + struct spdk_nvmf_ns_params ns_params[RPC_MAX_NAMESPACES]; +}; + + +static const struct spdk_json_object_decoder rpc_ns_params_decoders[] = { + {"nsid", offsetof(struct spdk_nvmf_ns_params, nsid), spdk_json_decode_uint32, true}, + {"bdev_name", offsetof(struct spdk_nvmf_ns_params, bdev_name), spdk_json_decode_string}, + {"ptpl_file", offsetof(struct spdk_nvmf_ns_params, ptpl_file), spdk_json_decode_string, true}, + {"nguid", offsetof(struct spdk_nvmf_ns_params, nguid), decode_ns_nguid, true}, + {"eui64", offsetof(struct spdk_nvmf_ns_params, eui64), decode_ns_eui64, true}, + {"uuid", offsetof(struct spdk_nvmf_ns_params, uuid), decode_ns_uuid, true}, +}; + +static int +decode_rpc_ns_params(const struct spdk_json_val *val, void *out) +{ + struct spdk_nvmf_ns_params *ns_params = out; + + return spdk_json_decode_object(val, rpc_ns_params_decoders, + SPDK_COUNTOF(rpc_ns_params_decoders), + ns_params); +} + +struct nvmf_rpc_ns_ctx { + char *nqn; + char *tgt_name; + struct spdk_nvmf_ns_params ns_params; + + struct spdk_jsonrpc_request *request; + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_ns_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_ns_ctx, nqn), spdk_json_decode_string}, + {"namespace", offsetof(struct nvmf_rpc_ns_ctx, ns_params), decode_rpc_ns_params}, + {"tgt_name", offsetof(struct nvmf_rpc_ns_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +nvmf_rpc_ns_ctx_free(struct nvmf_rpc_ns_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->tgt_name); + free(ctx->ns_params.bdev_name); + free(ctx->ns_params.ptpl_file); + free(ctx); +} + +static void +nvmf_rpc_ns_resumed(struct spdk_nvmf_subsystem *subsystem, + void 
*cb_arg, int status) +{ + struct nvmf_rpc_ns_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request = ctx->request; + uint32_t nsid = ctx->ns_params.nsid; + bool response_sent = ctx->response_sent; + struct spdk_json_write_ctx *w; + + nvmf_rpc_ns_ctx_free(ctx); + + if (response_sent) { + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_uint32(w, nsid); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_ns_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_ns_ctx *ctx = cb_arg; + struct spdk_nvmf_ns_opts ns_opts; + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(ctx->ns_params.bdev_name); + if (!bdev) { + SPDK_ERRLOG("No bdev with name %s\n", ctx->ns_params.bdev_name); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + goto resume; + } + + spdk_nvmf_ns_opts_get_defaults(&ns_opts, sizeof(ns_opts)); + ns_opts.nsid = ctx->ns_params.nsid; + + SPDK_STATIC_ASSERT(sizeof(ns_opts.nguid) == sizeof(ctx->ns_params.nguid), "size mismatch"); + memcpy(ns_opts.nguid, ctx->ns_params.nguid, sizeof(ns_opts.nguid)); + + SPDK_STATIC_ASSERT(sizeof(ns_opts.eui64) == sizeof(ctx->ns_params.eui64), "size mismatch"); + memcpy(ns_opts.eui64, ctx->ns_params.eui64, sizeof(ns_opts.eui64)); + + if (!spdk_mem_all_zero(&ctx->ns_params.uuid, sizeof(ctx->ns_params.uuid))) { + ns_opts.uuid = ctx->ns_params.uuid; + } + + ctx->ns_params.nsid = spdk_nvmf_subsystem_add_ns(subsystem, bdev, &ns_opts, sizeof(ns_opts), + ctx->ns_params.ptpl_file); + if (ctx->ns_params.nsid == 0) { + SPDK_ERRLOG("Unable to add namespace\n"); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + goto resume; + } + +resume: + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_ns_resumed, ctx)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_ns_ctx_free(ctx); + } +} + +static void +rpc_nvmf_subsystem_add_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_ns_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_ns_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_ns_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_ns_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->response_sent = false; + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_ns_ctx_free(ctx); + return; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_ns_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_ns_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal 
error"); + nvmf_rpc_ns_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_add_ns", rpc_nvmf_subsystem_add_ns, SPDK_RPC_RUNTIME) + +struct nvmf_rpc_remove_ns_ctx { + char *nqn; + char *tgt_name; + uint32_t nsid; + + struct spdk_jsonrpc_request *request; + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_remove_ns_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_remove_ns_ctx, nqn), spdk_json_decode_string}, + {"nsid", offsetof(struct nvmf_rpc_remove_ns_ctx, nsid), spdk_json_decode_uint32}, + {"tgt_name", offsetof(struct nvmf_rpc_remove_ns_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +nvmf_rpc_remove_ns_ctx_free(struct nvmf_rpc_remove_ns_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->tgt_name); + free(ctx); +} + +static void +nvmf_rpc_remove_ns_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_remove_ns_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request = ctx->request; + bool response_sent = ctx->response_sent; + struct spdk_json_write_ctx *w; + + nvmf_rpc_remove_ns_ctx_free(ctx); + + if (response_sent) { + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_remove_ns_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_remove_ns_ctx *ctx = cb_arg; + int ret; + + ret = spdk_nvmf_subsystem_remove_ns(subsystem, ctx->nsid); + if (ret < 0) { + SPDK_ERRLOG("Unable to remove namespace ID %u\n", ctx->nsid); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_remove_ns_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_remove_ns_ctx_free(ctx); + } +} + +static void +rpc_nvmf_subsystem_remove_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_remove_ns_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_remove_ns_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_remove_ns_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_remove_ns_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_remove_ns_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_remove_ns_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_remove_ns_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + 
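	/*
	 * Editor's note -- a sketch of the control flow shared by the mutating
	 * subsystem RPCs in this file (namespace, host and listener operations),
	 * not new code: the subsystem must be quiesced before it is modified,
	 * so each handler chains three steps through callbacks:
	 *
	 *   spdk_nvmf_subsystem_pause(subsystem, <op>_paused, ctx);
	 *       -> <op>_paused(): apply the change (e.g. spdk_nvmf_subsystem_remove_ns)
	 *   spdk_nvmf_subsystem_resume(subsystem, <op>_resumed, ctx);
	 *       -> <op>_resumed(): send the JSON-RPC result and free ctx
	 *
	 * Only a failure of the initial pause is reported synchronously, as in
	 * the surrounding branch; later failures set ctx->response_sent so the
	 * resumed callback does not answer the request twice.
	 */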
nvmf_rpc_remove_ns_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_remove_ns", rpc_nvmf_subsystem_remove_ns, SPDK_RPC_RUNTIME) + +enum nvmf_rpc_host_op { + NVMF_RPC_HOST_ADD, + NVMF_RPC_HOST_REMOVE, + NVMF_RPC_HOST_ALLOW_ANY, +}; + +struct nvmf_rpc_host_ctx { + struct spdk_jsonrpc_request *request; + + char *nqn; + char *host; + char *tgt_name; + + enum nvmf_rpc_host_op op; + + bool allow_any_host; + + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_host_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_host_ctx, nqn), spdk_json_decode_string}, + {"host", offsetof(struct nvmf_rpc_host_ctx, host), spdk_json_decode_string}, + {"tgt_name", offsetof(struct nvmf_rpc_host_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +nvmf_rpc_host_ctx_free(struct nvmf_rpc_host_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->host); + free(ctx->tgt_name); + free(ctx); +} + +static void +nvmf_rpc_host_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_host_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; + bool response_sent = ctx->response_sent; + + request = ctx->request; + nvmf_rpc_host_ctx_free(ctx); + + if (response_sent) { + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_host_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_host_ctx *ctx = cb_arg; + int rc = -1; + + switch (ctx->op) { + case NVMF_RPC_HOST_ADD: + rc = spdk_nvmf_subsystem_add_host(subsystem, ctx->host); + break; + case NVMF_RPC_HOST_REMOVE: + rc = spdk_nvmf_subsystem_remove_host(subsystem, ctx->host); + break; + case NVMF_RPC_HOST_ALLOW_ANY: + rc = spdk_nvmf_subsystem_set_allow_any_host(subsystem, ctx->allow_any_host); + break; + } + + if (rc != 0) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_host_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_host_ctx_free(ctx); + } +} + +static void +rpc_nvmf_subsystem_add_host(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_host_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_host_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_host_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->op = NVMF_RPC_HOST_ADD; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + 
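	/*
	 * Editor's note -- illustrative only, not part of the patch: the
	 * nvmf_subsystem_add_host and nvmf_subsystem_remove_host RPCs share
	 * nvmf_rpc_subsystem_host_decoder, so their parameters look like
	 * (both NQN values are made-up examples):
	 *
	 *   {
	 *     "nqn": "nqn.2016-06.io.spdk:cnode1",
	 *     "host": "nqn.2014-08.org.nvmexpress:uuid:11111111-2222-3333-4444-555555555555"
	 *   }
	 *
	 * with an optional "tgt_name" to address a non-default target.
	 */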
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_host_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_add_host", rpc_nvmf_subsystem_add_host, SPDK_RPC_RUNTIME) + +static void +rpc_nvmf_subsystem_remove_host(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_host_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_host_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_host_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->op = NVMF_RPC_HOST_REMOVE; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_host_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_remove_host", rpc_nvmf_subsystem_remove_host, + SPDK_RPC_RUNTIME) + + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_any_host_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_host_ctx, nqn), spdk_json_decode_string}, + {"allow_any_host", offsetof(struct nvmf_rpc_host_ctx, allow_any_host), spdk_json_decode_bool}, + {"tgt_name", offsetof(struct nvmf_rpc_host_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +rpc_nvmf_subsystem_allow_any_host(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_host_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_any_host_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_any_host_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->op = NVMF_RPC_HOST_ALLOW_ANY; 
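	/*
	 * Editor's note -- not part of the patch: all three host operations
	 * (NVMF_RPC_HOST_ADD, NVMF_RPC_HOST_REMOVE, NVMF_RPC_HOST_ALLOW_ANY)
	 * funnel into the same nvmf_rpc_host_paused callback, with ctx->op
	 * selecting the action. A minimal request for this RPC might look like
	 * (the NQN is a made-up example):
	 *
	 *   { "nqn": "nqn.2016-06.io.spdk:cnode1", "allow_any_host": true }
	 */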
+ ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_host_ctx_free(ctx); + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_allow_any_host", rpc_nvmf_subsystem_allow_any_host, + SPDK_RPC_RUNTIME) + +struct nvmf_rpc_target_ctx { + char *name; + uint32_t max_subsystems; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_create_target_decoder[] = { + {"name", offsetof(struct nvmf_rpc_target_ctx, name), spdk_json_decode_string}, + {"max_subsystems", offsetof(struct nvmf_rpc_target_ctx, max_subsystems), spdk_json_decode_uint32, true}, +}; + +static void +rpc_nvmf_create_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_nvmf_target_opts opts; + struct nvmf_rpc_target_ctx ctx = {0}; + struct spdk_nvmf_tgt *tgt; + struct spdk_json_write_ctx *w; + + /* Decode parameters the first time to get the transport type */ + if (spdk_json_decode_object(params, nvmf_rpc_create_target_decoder, + SPDK_COUNTOF(nvmf_rpc_create_target_decoder), + &ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free(ctx.name); + return; + } + + snprintf(opts.name, NVMF_TGT_NAME_MAX_LENGTH, "%s", ctx.name); + opts.max_subsystems = ctx.max_subsystems; + + if (spdk_nvmf_get_tgt(opts.name) != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Target already exists."); + free(ctx.name); + return; + } + + tgt = spdk_nvmf_tgt_create(&opts); + + if (tgt == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to create the requested target."); + free(ctx.name); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, spdk_nvmf_tgt_get_name(tgt)); + spdk_jsonrpc_end_result(request, w); + free(ctx.name); +} +SPDK_RPC_REGISTER("nvmf_create_target", rpc_nvmf_create_target, SPDK_RPC_RUNTIME); + +static const struct spdk_json_object_decoder nvmf_rpc_destroy_target_decoder[] = { + {"name", offsetof(struct nvmf_rpc_target_ctx, name), spdk_json_decode_string}, +}; + +static void +nvmf_rpc_destroy_target_done(void *ctx, int status) +{ + struct spdk_jsonrpc_request *request = ctx; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_nvmf_delete_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_target_ctx ctx = {0}; + struct spdk_nvmf_tgt *tgt; + + /* Decode parameters the first time to get the transport type */ + if (spdk_json_decode_object(params, nvmf_rpc_destroy_target_decoder, + SPDK_COUNTOF(nvmf_rpc_destroy_target_decoder), + &ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free(ctx.name); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx.name); + + if (tgt == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + 
"The specified target doesn't exist, cannot delete it."); + free(ctx.name); + return; + } + + spdk_nvmf_tgt_destroy(tgt, nvmf_rpc_destroy_target_done, request); + free(ctx.name); +} +SPDK_RPC_REGISTER("nvmf_delete_target", rpc_nvmf_delete_target, SPDK_RPC_RUNTIME); + +static void +rpc_nvmf_get_targets(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_nvmf_tgt *tgt; + const char *name; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "nvmf_get_targets has no parameters."); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + tgt = spdk_nvmf_get_first_tgt(); + + while (tgt != NULL) { + name = spdk_nvmf_tgt_get_name(tgt); + spdk_json_write_string(w, name); + tgt = spdk_nvmf_get_next_tgt(tgt); + } + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("nvmf_get_targets", rpc_nvmf_get_targets, SPDK_RPC_RUNTIME); + +struct nvmf_rpc_create_transport_ctx { + char *trtype; + char *tgt_name; + struct spdk_nvmf_transport_opts opts; + struct spdk_jsonrpc_request *request; +}; + +/** + * `max_qpairs_per_ctrlr` represents both admin and IO qpairs, that confuses + * users when they configure a transport using RPC. So it was decided to + * deprecate `max_qpairs_per_ctrlr` RPC parameter and use `max_io_qpairs_per_ctrlr` + * But internal logic remains unchanged and SPDK expects that + * spdk_nvmf_transport_opts::max_qpairs_per_ctrlr includes an admin qpair. + * This function parses the number of IO qpairs and adds +1 for admin qpair. + */ +static int +nvmf_rpc_decode_max_io_qpairs(const struct spdk_json_val *val, void *out) +{ + uint16_t *i = out; + int rc; + + rc = spdk_json_number_to_uint16(val, i); + if (rc == 0) { + (*i)++; + } + + return rc; +} + +/** + * This function parses deprecated `max_qpairs_per_ctrlr` and warns the user to use + * the new parameter `max_io_qpairs_per_ctrlr` + */ +static int +nvmf_rpc_decode_max_qpairs(const struct spdk_json_val *val, void *out) +{ + uint16_t *i = out; + int rc; + + rc = spdk_json_number_to_uint16(val, i); + if (rc == 0) { + SPDK_WARNLOG("Parameter max_qpairs_per_ctrlr is deprecated, use max_io_qpairs_per_ctrlr instead.\n"); + } + + return rc; +} + +static const struct spdk_json_object_decoder nvmf_rpc_create_transport_decoder[] = { + { "trtype", offsetof(struct nvmf_rpc_create_transport_ctx, trtype), spdk_json_decode_string}, + { + "max_queue_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_queue_depth), + spdk_json_decode_uint16, true + }, + { + "max_qpairs_per_ctrlr", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_qpairs_per_ctrlr), + nvmf_rpc_decode_max_qpairs, true + }, + { + "max_io_qpairs_per_ctrlr", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_qpairs_per_ctrlr), + nvmf_rpc_decode_max_io_qpairs, true + }, + { + "in_capsule_data_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.in_capsule_data_size), + spdk_json_decode_uint32, true + }, + { + "max_io_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_io_size), + spdk_json_decode_uint32, true + }, + { + "io_unit_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.io_unit_size), + spdk_json_decode_uint32, true + }, + { + "max_aq_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_aq_depth), + spdk_json_decode_uint32, true + }, + { + "num_shared_buffers", offsetof(struct nvmf_rpc_create_transport_ctx, 
opts.num_shared_buffers), + spdk_json_decode_uint32, true + }, + { + "buf_cache_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.buf_cache_size), + spdk_json_decode_uint32, true + }, + { + "max_srq_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_srq_depth), + spdk_json_decode_uint32, true + }, + { + "no_srq", offsetof(struct nvmf_rpc_create_transport_ctx, opts.no_srq), + spdk_json_decode_bool, true + }, + { + "c2h_success", offsetof(struct nvmf_rpc_create_transport_ctx, opts.c2h_success), + spdk_json_decode_bool, true + }, + { + "dif_insert_or_strip", offsetof(struct nvmf_rpc_create_transport_ctx, opts.dif_insert_or_strip), + spdk_json_decode_bool, true + }, + { + "sock_priority", offsetof(struct nvmf_rpc_create_transport_ctx, opts.sock_priority), + spdk_json_decode_uint32, true + }, + { + "acceptor_backlog", offsetof(struct nvmf_rpc_create_transport_ctx, opts.acceptor_backlog), + spdk_json_decode_int32, true + }, + { + "abort_timeout_sec", offsetof(struct nvmf_rpc_create_transport_ctx, opts.abort_timeout_sec), + spdk_json_decode_uint32, true + }, + { + "tgt_name", offsetof(struct nvmf_rpc_create_transport_ctx, tgt_name), + spdk_json_decode_string, true + }, +}; + +static void +nvmf_rpc_create_transport_ctx_free(struct nvmf_rpc_create_transport_ctx *ctx) +{ + free(ctx->trtype); + free(ctx->tgt_name); + free(ctx); +} + +static void +nvmf_rpc_tgt_add_transport_done(void *cb_arg, int status) +{ + struct nvmf_rpc_create_transport_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; + + request = ctx->request; + nvmf_rpc_create_transport_ctx_free(ctx); + + if (status) { + SPDK_ERRLOG("Failed to add transport to tgt.(%d)\n", status); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to add transport to tgt.(%d)\n", + status); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_nvmf_create_transport(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_create_transport_ctx *ctx; + enum spdk_nvme_transport_type trtype; + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_tgt *tgt; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + /* Decode parameters the first time to get the transport type */ + if (spdk_json_decode_object(params, nvmf_rpc_create_transport_decoder, + SPDK_COUNTOF(nvmf_rpc_create_transport_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!tgt) { + SPDK_ERRLOG("Unable to find a target object.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + if (spdk_nvme_transport_id_parse_trtype(&trtype, ctx->trtype)) { + SPDK_ERRLOG("Invalid transport type '%s'\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid transport type '%s'\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + /* Initialize all the transport options (based on transport type) and decode the + * parameters again to update 
any options passed in rpc create transport call. + */ + if (!spdk_nvmf_transport_opts_init(ctx->trtype, &ctx->opts)) { + /* This can happen if user specifies PCIE transport type which isn't valid for + * NVMe-oF. + */ + SPDK_ERRLOG("Invalid transport type '%s'\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid transport type '%s'\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_create_transport_decoder, + SPDK_COUNTOF(nvmf_rpc_create_transport_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + if (spdk_nvmf_tgt_get_transport(tgt, ctx->trtype)) { + SPDK_ERRLOG("Transport type '%s' already exists\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Transport type '%s' already exists\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + transport = spdk_nvmf_transport_create(ctx->trtype, &ctx->opts); + + if (!transport) { + SPDK_ERRLOG("Transport type '%s' create failed\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Transport type '%s' create failed\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + /* add transport to target */ + ctx->request = request; + spdk_nvmf_tgt_add_transport(tgt, transport, nvmf_rpc_tgt_add_transport_done, ctx); +} +SPDK_RPC_REGISTER("nvmf_create_transport", rpc_nvmf_create_transport, SPDK_RPC_RUNTIME) + +static void +dump_nvmf_transport(struct spdk_json_write_ctx *w, struct spdk_nvmf_transport *transport) +{ + const struct spdk_nvmf_transport_opts *opts = spdk_nvmf_get_transport_opts(transport); + spdk_nvme_transport_type_t type = spdk_nvmf_get_transport_type(transport); + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "trtype", spdk_nvmf_get_transport_name(transport)); + spdk_json_write_named_uint32(w, "max_queue_depth", opts->max_queue_depth); + spdk_json_write_named_uint32(w, "max_io_qpairs_per_ctrlr", opts->max_qpairs_per_ctrlr - 1); + spdk_json_write_named_uint32(w, "in_capsule_data_size", opts->in_capsule_data_size); + spdk_json_write_named_uint32(w, "max_io_size", opts->max_io_size); + spdk_json_write_named_uint32(w, "io_unit_size", opts->io_unit_size); + spdk_json_write_named_uint32(w, "max_aq_depth", opts->max_aq_depth); + spdk_json_write_named_uint32(w, "num_shared_buffers", opts->num_shared_buffers); + spdk_json_write_named_uint32(w, "buf_cache_size", opts->buf_cache_size); + spdk_json_write_named_bool(w, "dif_insert_or_strip", opts->dif_insert_or_strip); + if (type == SPDK_NVME_TRANSPORT_RDMA) { + spdk_json_write_named_uint32(w, "max_srq_depth", opts->max_srq_depth); + spdk_json_write_named_bool(w, "no_srq", opts->no_srq); + spdk_json_write_named_int32(w, "acceptor_backlog", opts->acceptor_backlog); + } else if (type == SPDK_NVME_TRANSPORT_TCP) { + spdk_json_write_named_bool(w, "c2h_success", opts->c2h_success); + spdk_json_write_named_uint32(w, "sock_priority", opts->sock_priority); + } + spdk_json_write_named_uint32(w, "abort_timeout_sec", opts->abort_timeout_sec); + + spdk_json_write_object_end(w); +} + +struct rpc_get_transport { + char *tgt_name; +}; + +static const struct spdk_json_object_decoder rpc_get_transport_decoders[] = { + {"tgt_name", 
offsetof(struct rpc_get_transport, tgt_name), spdk_json_decode_string, true}, +}; + +static void +rpc_nvmf_get_transports(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_transport req = { 0 }; + struct spdk_json_write_ctx *w; + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_tgt *tgt; + + if (params) { + if (spdk_json_decode_object(params, rpc_get_transport_decoders, + SPDK_COUNTOF(rpc_get_transport_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + return; + } + } + + tgt = spdk_nvmf_get_tgt(req.tgt_name); + if (!tgt) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + free(req.tgt_name); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + transport = spdk_nvmf_transport_get_first(tgt); + while (transport) { + dump_nvmf_transport(w, transport); + transport = spdk_nvmf_transport_get_next(transport); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + free(req.tgt_name); +} +SPDK_RPC_REGISTER("nvmf_get_transports", rpc_nvmf_get_transports, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_get_transports, get_nvmf_transports) + +struct rpc_nvmf_get_stats_ctx { + char *tgt_name; + struct spdk_nvmf_tgt *tgt; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; +}; + +static const struct spdk_json_object_decoder rpc_get_stats_decoders[] = { + {"tgt_name", offsetof(struct rpc_nvmf_get_stats_ctx, tgt_name), spdk_json_decode_string, true}, +}; + +static void +free_get_stats_ctx(struct rpc_nvmf_get_stats_ctx *ctx) +{ + free(ctx->tgt_name); + free(ctx); +} + +static void +rpc_nvmf_get_stats_done(struct spdk_io_channel_iter *i, int status) +{ + struct rpc_nvmf_get_stats_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + spdk_json_write_array_end(ctx->w); + spdk_json_write_object_end(ctx->w); + spdk_jsonrpc_end_result(ctx->request, ctx->w); + free_get_stats_ctx(ctx); +} + +static void +write_nvmf_transport_stats(struct spdk_json_write_ctx *w, + struct spdk_nvmf_transport_poll_group_stat *stat) +{ + uint64_t i; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "trtype", + spdk_nvme_transport_id_trtype_str(stat->trtype)); + switch (stat->trtype) { + case SPDK_NVME_TRANSPORT_RDMA: + spdk_json_write_named_uint64(w, "pending_data_buffer", stat->rdma.pending_data_buffer); + spdk_json_write_named_array_begin(w, "devices"); + for (i = 0; i < stat->rdma.num_devices; ++i) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "name", stat->rdma.devices[i].name); + spdk_json_write_named_uint64(w, "polls", stat->rdma.devices[i].polls); + spdk_json_write_named_uint64(w, "completions", stat->rdma.devices[i].completions); + spdk_json_write_named_uint64(w, "requests", + stat->rdma.devices[i].requests); + spdk_json_write_named_uint64(w, "request_latency", + stat->rdma.devices[i].request_latency); + spdk_json_write_named_uint64(w, "pending_free_request", + stat->rdma.devices[i].pending_free_request); + spdk_json_write_named_uint64(w, "pending_rdma_read", + stat->rdma.devices[i].pending_rdma_read); + spdk_json_write_named_uint64(w, "pending_rdma_write", + stat->rdma.devices[i].pending_rdma_write); + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + break; + default: + break; + } + spdk_json_write_object_end(w); +} + +static void 
+_rpc_nvmf_get_stats(struct spdk_io_channel_iter *i) +{ + struct rpc_nvmf_get_stats_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_poll_group_stat stat; + struct spdk_nvmf_transport_poll_group_stat *trstat; + int rc; + + if (0 == spdk_nvmf_poll_group_get_stat(ctx->tgt, &stat)) { + spdk_json_write_object_begin(ctx->w); + spdk_json_write_named_string(ctx->w, "name", spdk_thread_get_name(spdk_get_thread())); + spdk_json_write_named_uint32(ctx->w, "admin_qpairs", stat.admin_qpairs); + spdk_json_write_named_uint32(ctx->w, "io_qpairs", stat.io_qpairs); + spdk_json_write_named_uint64(ctx->w, "pending_bdev_io", stat.pending_bdev_io); + + spdk_json_write_named_array_begin(ctx->w, "transports"); + transport = spdk_nvmf_transport_get_first(ctx->tgt); + while (transport) { + rc = spdk_nvmf_transport_poll_group_get_stat(ctx->tgt, transport, &trstat); + if (0 == rc) { + write_nvmf_transport_stats(ctx->w, trstat); + spdk_nvmf_transport_poll_group_free_stat(transport, trstat); + } else if (-ENOTSUP != rc) { + SPDK_ERRLOG("Failed to get poll group statistics for transport %s, errno %d\n", + spdk_nvme_transport_id_trtype_str(spdk_nvmf_get_transport_type(transport)), + rc); + } + transport = spdk_nvmf_transport_get_next(transport); + } + spdk_json_write_array_end(ctx->w); + spdk_json_write_object_end(ctx->w); + } + + spdk_for_each_channel_continue(i, 0); +} + + +static void +rpc_nvmf_get_stats(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_nvmf_get_stats_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Memory allocation error"); + return; + } + ctx->request = request; + + if (params) { + if (spdk_json_decode_object(params, rpc_get_stats_decoders, + SPDK_COUNTOF(rpc_get_stats_decoders), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_get_stats_ctx(ctx); + return; + } + } + + ctx->tgt = spdk_nvmf_get_tgt(ctx->tgt_name); + if (!ctx->tgt) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to find a target."); + free_get_stats_ctx(ctx); + return; + } + + ctx->w = spdk_jsonrpc_begin_result(ctx->request); + spdk_json_write_object_begin(ctx->w); + spdk_json_write_named_uint64(ctx->w, "tick_rate", spdk_get_ticks_hz()); + spdk_json_write_named_array_begin(ctx->w, "poll_groups"); + + spdk_for_each_channel(ctx->tgt, + _rpc_nvmf_get_stats, + ctx, + rpc_nvmf_get_stats_done); +} + +SPDK_RPC_REGISTER("nvmf_get_stats", rpc_nvmf_get_stats, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/nvmf/rdma.c b/src/spdk/lib/nvmf/rdma.c new file mode 100644 index 000000000..4a4de4374 --- /dev/null +++ b/src/spdk/lib/nvmf/rdma.c @@ -0,0 +1,4313 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/config.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/nvmf_transport.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" +#include "spdk_internal/rdma.h" + +#include "nvmf_internal.h" + +struct spdk_nvme_rdma_hooks g_nvmf_hooks = {}; +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma; + +/* + RDMA Connection Resource Defaults + */ +#define NVMF_DEFAULT_TX_SGE SPDK_NVMF_MAX_SGL_ENTRIES +#define NVMF_DEFAULT_RSP_SGE 1 +#define NVMF_DEFAULT_RX_SGE 2 + +/* The RDMA completion queue size */ +#define DEFAULT_NVMF_RDMA_CQ_SIZE 4096 +#define MAX_WR_PER_QP(queue_depth) (queue_depth * 3 + 2) + +/* Timeout for destroying defunct rqpairs */ +#define NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US 4000000 + +static int g_spdk_nvmf_ibv_query_mask = + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER | + IBV_QP_SQ_PSN | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_MAX_QP_RD_ATOMIC; + +enum spdk_nvmf_rdma_request_state { + /* The request is not currently in use */ + RDMA_REQUEST_STATE_FREE = 0, + + /* Initial state when request first received */ + RDMA_REQUEST_STATE_NEW, + + /* The request is queued until a data buffer is available. */ + RDMA_REQUEST_STATE_NEED_BUFFER, + + /* The request is waiting on RDMA queue depth availability + * to transfer data from the host to the controller. + */ + RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, + + /* The request is currently transferring data from the host to the controller. */ + RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + + /* The request is ready to execute at the block device */ + RDMA_REQUEST_STATE_READY_TO_EXECUTE, + + /* The request is currently executing at the block device */ + RDMA_REQUEST_STATE_EXECUTING, + + /* The request finished executing at the block device */ + RDMA_REQUEST_STATE_EXECUTED, + + /* The request is waiting on RDMA queue depth availability + * to transfer data from the controller to the host. 
+ */ + RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, + + /* The request is ready to send a completion */ + RDMA_REQUEST_STATE_READY_TO_COMPLETE, + + /* The request is currently transferring data from the controller to the host. */ + RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + + /* The request currently has an outstanding completion without an + * associated data transfer. + */ + RDMA_REQUEST_STATE_COMPLETING, + + /* The request completed and can be marked free. */ + RDMA_REQUEST_STATE_COMPLETED, + + /* Terminator */ + RDMA_REQUEST_NUM_STATES, +}; + +#define OBJECT_NVMF_RDMA_IO 0x40 + +#define TRACE_GROUP_NVMF_RDMA 0x4 +#define TRACE_RDMA_REQUEST_STATE_NEW SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0) +#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1) +#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2) +#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3) +#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4) +#define TRACE_RDMA_REQUEST_STATE_EXECUTING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5) +#define TRACE_RDMA_REQUEST_STATE_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6) +#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7) +#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8) +#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9) +#define TRACE_RDMA_REQUEST_STATE_COMPLETING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA) +#define TRACE_RDMA_REQUEST_STATE_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xB) +#define TRACE_RDMA_QP_CREATE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xC) +#define TRACE_RDMA_IBV_ASYNC_EVENT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xD) +#define TRACE_RDMA_CM_ASYNC_EVENT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xE) +#define TRACE_RDMA_QP_STATE_CHANGE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xF) +#define TRACE_RDMA_QP_DISCONNECT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x10) +#define TRACE_RDMA_QP_DESTROY SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x11) + +SPDK_TRACE_REGISTER_FN(nvmf_trace, "nvmf_rdma", TRACE_GROUP_NVMF_RDMA) +{ + spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r'); + spdk_trace_register_description("RDMA_REQ_NEW", TRACE_RDMA_REQUEST_STATE_NEW, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_TX_PENDING_C2H", + TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_TX_PENDING_H2C", + TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_TX_H2C", + TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE", + TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_EXECUTING", + TRACE_RDMA_REQUEST_STATE_EXECUTING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_EXECUTED", + 
TRACE_RDMA_REQUEST_STATE_EXECUTED, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPL", + TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETING_C2H", + TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETING", + TRACE_RDMA_REQUEST_STATE_COMPLETING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETED", + TRACE_RDMA_REQUEST_STATE_COMPLETED, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + + spdk_trace_register_description("RDMA_QP_CREATE", TRACE_RDMA_QP_CREATE, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", TRACE_RDMA_IBV_ASYNC_EVENT, + OWNER_NONE, OBJECT_NONE, 0, 0, "type: "); + spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", TRACE_RDMA_CM_ASYNC_EVENT, + OWNER_NONE, OBJECT_NONE, 0, 0, "type: "); + spdk_trace_register_description("RDMA_QP_STATE_CHANGE", TRACE_RDMA_QP_STATE_CHANGE, + OWNER_NONE, OBJECT_NONE, 0, 1, "state: "); + spdk_trace_register_description("RDMA_QP_DISCONNECT", TRACE_RDMA_QP_DISCONNECT, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("RDMA_QP_DESTROY", TRACE_RDMA_QP_DESTROY, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); +} + +enum spdk_nvmf_rdma_wr_type { + RDMA_WR_TYPE_RECV, + RDMA_WR_TYPE_SEND, + RDMA_WR_TYPE_DATA, +}; + +struct spdk_nvmf_rdma_wr { + enum spdk_nvmf_rdma_wr_type type; +}; + +/* This structure holds commands as they are received off the wire. + * It must be dynamically paired with a full request object + * (spdk_nvmf_rdma_request) to service a request. It is separate + * from the request because RDMA does not appear to order + * completions, so occasionally we'll get a new incoming + * command when there aren't any free request objects. + */ +struct spdk_nvmf_rdma_recv { + struct ibv_recv_wr wr; + struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE]; + + struct spdk_nvmf_rdma_qpair *qpair; + + /* In-capsule data buffer */ + uint8_t *buf; + + struct spdk_nvmf_rdma_wr rdma_wr; + uint64_t receive_tsc; + + STAILQ_ENTRY(spdk_nvmf_rdma_recv) link; +}; + +struct spdk_nvmf_rdma_request_data { + struct spdk_nvmf_rdma_wr rdma_wr; + struct ibv_send_wr wr; + struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES]; +}; + +struct spdk_nvmf_rdma_request { + struct spdk_nvmf_request req; + + enum spdk_nvmf_rdma_request_state state; + + struct spdk_nvmf_rdma_recv *recv; + + struct { + struct spdk_nvmf_rdma_wr rdma_wr; + struct ibv_send_wr wr; + struct ibv_sge sgl[NVMF_DEFAULT_RSP_SGE]; + } rsp; + + struct spdk_nvmf_rdma_request_data data; + + uint32_t iovpos; + + uint32_t num_outstanding_data_wr; + uint64_t receive_tsc; + + STAILQ_ENTRY(spdk_nvmf_rdma_request) state_link; +}; + +enum spdk_nvmf_rdma_qpair_disconnect_flags { + RDMA_QP_DISCONNECTING = 1, + RDMA_QP_RECV_DRAINED = 1 << 1, + RDMA_QP_SEND_DRAINED = 1 << 2 +}; + +struct spdk_nvmf_rdma_resource_opts { + struct spdk_nvmf_rdma_qpair *qpair; + /* qp points either to an ibv_qp object or an ibv_srq object depending on the value of shared. 
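 * (Editor's note, not part of the patch: when "shared" is true the receive
 * resources built from these opts are posted to a poller-owned SRQ and are
 * shared by every qpair on that poller, instead of belonging to one qpair.)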
*/ + void *qp; + struct ibv_pd *pd; + uint32_t max_queue_depth; + uint32_t in_capsule_data_size; + bool shared; +}; + +struct spdk_nvmf_send_wr_list { + struct ibv_send_wr *first; + struct ibv_send_wr *last; +}; + +struct spdk_nvmf_recv_wr_list { + struct ibv_recv_wr *first; + struct ibv_recv_wr *last; +}; + +struct spdk_nvmf_rdma_resources { + /* Array of size "max_queue_depth" containing RDMA requests. */ + struct spdk_nvmf_rdma_request *reqs; + + /* Array of size "max_queue_depth" containing RDMA recvs. */ + struct spdk_nvmf_rdma_recv *recvs; + + /* Array of size "max_queue_depth" containing 64 byte capsules + * used for receive. + */ + union nvmf_h2c_msg *cmds; + struct ibv_mr *cmds_mr; + + /* Array of size "max_queue_depth" containing 16 byte completions + * to be sent back to the user. + */ + union nvmf_c2h_msg *cpls; + struct ibv_mr *cpls_mr; + + /* Array of size "max_queue_depth * InCapsuleDataSize" containing + * buffers to be used for in capsule data. + */ + void *bufs; + struct ibv_mr *bufs_mr; + + /* The list of pending recvs to transfer */ + struct spdk_nvmf_recv_wr_list recvs_to_post; + + /* Receives that are waiting for a request object */ + STAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue; + + /* Queue to track free requests */ + STAILQ_HEAD(, spdk_nvmf_rdma_request) free_queue; +}; + +typedef void (*spdk_nvmf_rdma_qpair_ibv_event)(struct spdk_nvmf_rdma_qpair *rqpair); + +struct spdk_nvmf_rdma_ibv_event_ctx { + struct spdk_nvmf_rdma_qpair *rqpair; + spdk_nvmf_rdma_qpair_ibv_event cb_fn; + /* Link to other ibv events associated with this qpair */ + STAILQ_ENTRY(spdk_nvmf_rdma_ibv_event_ctx) link; +}; + +struct spdk_nvmf_rdma_qpair { + struct spdk_nvmf_qpair qpair; + + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poller *poller; + + struct spdk_rdma_qp *rdma_qp; + struct rdma_cm_id *cm_id; + struct ibv_srq *srq; + struct rdma_cm_id *listen_id; + + /* The maximum number of I/O outstanding on this connection at one time */ + uint16_t max_queue_depth; + + /* The maximum number of active RDMA READ and ATOMIC operations at one time */ + uint16_t max_read_depth; + + /* The maximum number of RDMA SEND operations at one time */ + uint32_t max_send_depth; + + /* The current number of outstanding WRs from this qpair's + * recv queue. Should not exceed device->attr.max_queue_depth. + */ + uint16_t current_recv_depth; + + /* The current number of active RDMA READ operations */ + uint16_t current_read_depth; + + /* The current number of posted WRs from this qpair's + * send queue. Should not exceed max_send_depth. + */ + uint32_t current_send_depth; + + /* The maximum number of SGEs per WR on the send queue */ + uint32_t max_send_sge; + + /* The maximum number of SGEs per WR on the recv queue */ + uint32_t max_recv_sge; + + struct spdk_nvmf_rdma_resources *resources; + + STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_read_queue; + + STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_write_queue; + + /* Number of requests not in the free state */ + uint32_t qd; + + TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link; + + STAILQ_ENTRY(spdk_nvmf_rdma_qpair) recv_link; + + STAILQ_ENTRY(spdk_nvmf_rdma_qpair) send_link; + + /* IBV queue pair attributes: they are used to manage + * qp state and recover from errors. + */ + enum ibv_qp_state ibv_state; + + uint32_t disconnect_flags; + + /* Poller registered in case the qpair doesn't properly + * complete the qpair destruct process and becomes defunct. 
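 * (Editor's note, not part of the patch: the poller appears to be armed with
 * NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US, defined above, so a qpair whose drain
 * never completes is still torn down after that timeout.)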
+ */ + + struct spdk_poller *destruct_poller; + + /* + * io_channel which is used to destroy qpair when it is removed from poll group + */ + struct spdk_io_channel *destruct_channel; + + /* List of ibv async events */ + STAILQ_HEAD(, spdk_nvmf_rdma_ibv_event_ctx) ibv_events; + + /* There are several ways a disconnect can start on a qpair + * and they are not all mutually exclusive. It is important + * that we only initialize one of these paths. + */ + bool disconnect_started; + /* Lets us know that we have received the last_wqe event. */ + bool last_wqe_reached; +}; + +struct spdk_nvmf_rdma_poller_stat { + uint64_t completions; + uint64_t polls; + uint64_t requests; + uint64_t request_latency; + uint64_t pending_free_request; + uint64_t pending_rdma_read; + uint64_t pending_rdma_write; +}; + +struct spdk_nvmf_rdma_poller { + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poll_group *group; + + int num_cqe; + int required_num_wr; + struct ibv_cq *cq; + + /* The maximum number of I/O outstanding on the shared receive queue at one time */ + uint16_t max_srq_depth; + + /* Shared receive queue */ + struct ibv_srq *srq; + + struct spdk_nvmf_rdma_resources *resources; + struct spdk_nvmf_rdma_poller_stat stat; + + TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs; + + STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_recv; + + STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_send; + + TAILQ_ENTRY(spdk_nvmf_rdma_poller) link; +}; + +struct spdk_nvmf_rdma_poll_group_stat { + uint64_t pending_data_buffer; +}; + +struct spdk_nvmf_rdma_poll_group { + struct spdk_nvmf_transport_poll_group group; + struct spdk_nvmf_rdma_poll_group_stat stat; + TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers; + TAILQ_ENTRY(spdk_nvmf_rdma_poll_group) link; + /* + * buffers which are split across multiple RDMA + * memory regions cannot be used by this transport. + */ + STAILQ_HEAD(, spdk_nvmf_transport_pg_cache_buf) retired_bufs; +}; + +struct spdk_nvmf_rdma_conn_sched { + struct spdk_nvmf_rdma_poll_group *next_admin_pg; + struct spdk_nvmf_rdma_poll_group *next_io_pg; +}; + +/* Assuming rdma_cm uses just one protection domain per ibv_context. 
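 * (Editor's note, not part of the patch: this assumption is why the
 * protection domain is cached once in spdk_nvmf_rdma_device below and
 * reused for the memory registrations and qpairs created on that device.)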
*/ +struct spdk_nvmf_rdma_device { + struct ibv_device_attr attr; + struct ibv_context *context; + + struct spdk_mem_map *map; + struct ibv_pd *pd; + + int num_srq; + + TAILQ_ENTRY(spdk_nvmf_rdma_device) link; +}; + +struct spdk_nvmf_rdma_port { + const struct spdk_nvme_transport_id *trid; + struct rdma_cm_id *id; + struct spdk_nvmf_rdma_device *device; + TAILQ_ENTRY(spdk_nvmf_rdma_port) link; +}; + +struct spdk_nvmf_rdma_transport { + struct spdk_nvmf_transport transport; + + struct spdk_nvmf_rdma_conn_sched conn_sched; + + struct rdma_event_channel *event_channel; + + struct spdk_mempool *data_wr_pool; + + pthread_mutex_t lock; + + /* fields used to poll RDMA/IB events */ + nfds_t npoll_fds; + struct pollfd *poll_fds; + + TAILQ_HEAD(, spdk_nvmf_rdma_device) devices; + TAILQ_HEAD(, spdk_nvmf_rdma_port) ports; + TAILQ_HEAD(, spdk_nvmf_rdma_poll_group) poll_groups; +}; + +static inline void +nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair); + +static bool +nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_request *rdma_req); + +static inline int +nvmf_rdma_check_ibv_state(enum ibv_qp_state state) +{ + switch (state) { + case IBV_QPS_RESET: + case IBV_QPS_INIT: + case IBV_QPS_RTR: + case IBV_QPS_RTS: + case IBV_QPS_SQD: + case IBV_QPS_SQE: + case IBV_QPS_ERR: + return 0; + default: + return -1; + } +} + +static inline enum spdk_nvme_media_error_status_code +nvmf_rdma_dif_error_to_compl_status(uint8_t err_type) { + enum spdk_nvme_media_error_status_code result; + switch (err_type) + { + case SPDK_DIF_REFTAG_ERROR: + result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR; + break; + case SPDK_DIF_APPTAG_ERROR: + result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR; + break; + case SPDK_DIF_GUARD_ERROR: + result = SPDK_NVME_SC_GUARD_CHECK_ERROR; + break; + default: + SPDK_UNREACHABLE(); + } + + return result; +} + +static enum ibv_qp_state +nvmf_rdma_update_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair) { + enum ibv_qp_state old_state, new_state; + struct ibv_qp_attr qp_attr; + struct ibv_qp_init_attr init_attr; + int rc; + + old_state = rqpair->ibv_state; + rc = ibv_query_qp(rqpair->rdma_qp->qp, &qp_attr, + g_spdk_nvmf_ibv_query_mask, &init_attr); + + if (rc) + { + SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n"); + return IBV_QPS_ERR + 1; + } + + new_state = qp_attr.qp_state; + rqpair->ibv_state = new_state; + qp_attr.ah_attr.port_num = qp_attr.port_num; + + rc = nvmf_rdma_check_ibv_state(new_state); + if (rc) + { + SPDK_ERRLOG("QP#%d: bad state updated: %u, maybe hardware issue\n", rqpair->qpair.qid, new_state); + /* + * IBV_QPS_UNKNOWN undefined if lib version smaller than libibverbs-1.1.8 + * IBV_QPS_UNKNOWN is the enum element after IBV_QPS_ERR + */ + return IBV_QPS_ERR + 1; + } + + if (old_state != new_state) + { + spdk_trace_record(TRACE_RDMA_QP_STATE_CHANGE, 0, 0, + (uintptr_t)rqpair->cm_id, new_state); + } + return new_state; +} + +static void +nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req, + struct spdk_nvmf_rdma_transport *rtransport) +{ + struct spdk_nvmf_rdma_request_data *data_wr; + struct ibv_send_wr *next_send_wr; + uint64_t req_wrid; + + rdma_req->num_outstanding_data_wr = 0; + data_wr = &rdma_req->data; + req_wrid = data_wr->wr.wr_id; + while (data_wr && data_wr->wr.wr_id == req_wrid) { + memset(data_wr->sgl, 0, sizeof(data_wr->wr.sg_list[0]) * data_wr->wr.num_sge); + data_wr->wr.num_sge = 0; + next_send_wr = data_wr->wr.next; + if (data_wr != &rdma_req->data) { + 
spdk_mempool_put(rtransport->data_wr_pool, data_wr); + } + data_wr = (!next_send_wr || next_send_wr == &rdma_req->rsp.wr) ? NULL : + SPDK_CONTAINEROF(next_send_wr, struct spdk_nvmf_rdma_request_data, wr); + } +} + +static void +nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req) +{ + SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", req->req.data_from_pool); + if (req->req.cmd) { + SPDK_ERRLOG("\t\tRequest opcode: %d\n", req->req.cmd->nvmf_cmd.opcode); + } + if (req->recv) { + SPDK_ERRLOG("\t\tRequest recv wr_id%lu\n", req->recv->wr.wr_id); + } +} + +static void +nvmf_rdma_dump_qpair_contents(struct spdk_nvmf_rdma_qpair *rqpair) +{ + int i; + + SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", rqpair->qpair.qid); + for (i = 0; i < rqpair->max_queue_depth; i++) { + if (rqpair->resources->reqs[i].state != RDMA_REQUEST_STATE_FREE) { + nvmf_rdma_dump_request(&rqpair->resources->reqs[i]); + } + } +} + +static void +nvmf_rdma_resources_destroy(struct spdk_nvmf_rdma_resources *resources) +{ + if (resources->cmds_mr) { + ibv_dereg_mr(resources->cmds_mr); + } + + if (resources->cpls_mr) { + ibv_dereg_mr(resources->cpls_mr); + } + + if (resources->bufs_mr) { + ibv_dereg_mr(resources->bufs_mr); + } + + spdk_free(resources->cmds); + spdk_free(resources->cpls); + spdk_free(resources->bufs); + free(resources->reqs); + free(resources->recvs); + free(resources); +} + + +static struct spdk_nvmf_rdma_resources * +nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts) +{ + struct spdk_nvmf_rdma_resources *resources; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_rdma_recv *rdma_recv; + struct ibv_qp *qp; + struct ibv_srq *srq; + uint32_t i; + int rc; + + resources = calloc(1, sizeof(struct spdk_nvmf_rdma_resources)); + if (!resources) { + SPDK_ERRLOG("Unable to allocate resources for receive queue.\n"); + return NULL; + } + + resources->reqs = calloc(opts->max_queue_depth, sizeof(*resources->reqs)); + resources->recvs = calloc(opts->max_queue_depth, sizeof(*resources->recvs)); + resources->cmds = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cmds), + 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + resources->cpls = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cpls), + 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + + if (opts->in_capsule_data_size > 0) { + resources->bufs = spdk_zmalloc(opts->max_queue_depth * opts->in_capsule_data_size, + 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); + } + + if (!resources->reqs || !resources->recvs || !resources->cmds || + !resources->cpls || (opts->in_capsule_data_size && !resources->bufs)) { + SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n"); + goto cleanup; + } + + resources->cmds_mr = ibv_reg_mr(opts->pd, resources->cmds, + opts->max_queue_depth * sizeof(*resources->cmds), + IBV_ACCESS_LOCAL_WRITE); + resources->cpls_mr = ibv_reg_mr(opts->pd, resources->cpls, + opts->max_queue_depth * sizeof(*resources->cpls), + 0); + + if (opts->in_capsule_data_size) { + resources->bufs_mr = ibv_reg_mr(opts->pd, resources->bufs, + opts->max_queue_depth * + opts->in_capsule_data_size, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + } + + if (!resources->cmds_mr || !resources->cpls_mr || + (opts->in_capsule_data_size && + !resources->bufs_mr)) { + goto cleanup; + } + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n", + resources->cmds, opts->max_queue_depth * sizeof(*resources->cmds), + resources->cmds_mr->lkey); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, 
"Completion Array: %p Length: %lx LKey: %x\n", + resources->cpls, opts->max_queue_depth * sizeof(*resources->cpls), + resources->cpls_mr->lkey); + if (resources->bufs && resources->bufs_mr) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n", + resources->bufs, opts->max_queue_depth * + opts->in_capsule_data_size, resources->bufs_mr->lkey); + } + + /* Initialize queues */ + STAILQ_INIT(&resources->incoming_queue); + STAILQ_INIT(&resources->free_queue); + + for (i = 0; i < opts->max_queue_depth; i++) { + struct ibv_recv_wr *bad_wr = NULL; + + rdma_recv = &resources->recvs[i]; + rdma_recv->qpair = opts->qpair; + + /* Set up memory to receive commands */ + if (resources->bufs) { + rdma_recv->buf = (void *)((uintptr_t)resources->bufs + (i * + opts->in_capsule_data_size)); + } + + rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV; + + rdma_recv->sgl[0].addr = (uintptr_t)&resources->cmds[i]; + rdma_recv->sgl[0].length = sizeof(resources->cmds[i]); + rdma_recv->sgl[0].lkey = resources->cmds_mr->lkey; + rdma_recv->wr.num_sge = 1; + + if (rdma_recv->buf && resources->bufs_mr) { + rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf; + rdma_recv->sgl[1].length = opts->in_capsule_data_size; + rdma_recv->sgl[1].lkey = resources->bufs_mr->lkey; + rdma_recv->wr.num_sge++; + } + + rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr; + rdma_recv->wr.sg_list = rdma_recv->sgl; + if (opts->shared) { + srq = (struct ibv_srq *)opts->qp; + rc = ibv_post_srq_recv(srq, &rdma_recv->wr, &bad_wr); + } else { + qp = (struct ibv_qp *)opts->qp; + rc = ibv_post_recv(qp, &rdma_recv->wr, &bad_wr); + } + if (rc) { + goto cleanup; + } + } + + for (i = 0; i < opts->max_queue_depth; i++) { + rdma_req = &resources->reqs[i]; + + if (opts->qpair != NULL) { + rdma_req->req.qpair = &opts->qpair->qpair; + } else { + rdma_req->req.qpair = NULL; + } + rdma_req->req.cmd = NULL; + + /* Set up memory to send responses */ + rdma_req->req.rsp = &resources->cpls[i]; + + rdma_req->rsp.sgl[0].addr = (uintptr_t)&resources->cpls[i]; + rdma_req->rsp.sgl[0].length = sizeof(resources->cpls[i]); + rdma_req->rsp.sgl[0].lkey = resources->cpls_mr->lkey; + + rdma_req->rsp.rdma_wr.type = RDMA_WR_TYPE_SEND; + rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp.rdma_wr; + rdma_req->rsp.wr.next = NULL; + rdma_req->rsp.wr.opcode = IBV_WR_SEND; + rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl; + rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl); + + /* Set up memory for data buffers */ + rdma_req->data.rdma_wr.type = RDMA_WR_TYPE_DATA; + rdma_req->data.wr.wr_id = (uintptr_t)&rdma_req->data.rdma_wr; + rdma_req->data.wr.next = NULL; + rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->data.wr.sg_list = rdma_req->data.sgl; + rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl); + + /* Initialize request state to FREE */ + rdma_req->state = RDMA_REQUEST_STATE_FREE; + STAILQ_INSERT_TAIL(&resources->free_queue, rdma_req, state_link); + } + + return resources; + +cleanup: + nvmf_rdma_resources_destroy(resources); + return NULL; +} + +static void +nvmf_rdma_qpair_clean_ibv_events(struct spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_ibv_event_ctx *ctx, *tctx; + STAILQ_FOREACH_SAFE(ctx, &rqpair->ibv_events, link, tctx) { + ctx->rqpair = NULL; + /* Memory allocated for ctx is freed in nvmf_rdma_qpair_process_ibv_event */ + STAILQ_REMOVE(&rqpair->ibv_events, ctx, spdk_nvmf_rdma_ibv_event_ctx, link); + } +} + +static void +nvmf_rdma_qpair_destroy(struct 
spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp; + struct ibv_recv_wr *bad_recv_wr = NULL; + int rc; + + spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair->cm_id, 0); + + spdk_poller_unregister(&rqpair->destruct_poller); + + if (rqpair->qd != 0) { + struct spdk_nvmf_qpair *qpair = &rqpair->qpair; + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + struct spdk_nvmf_rdma_request *req; + uint32_t i, max_req_count = 0; + + SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", rqpair->qd); + + if (rqpair->srq == NULL) { + nvmf_rdma_dump_qpair_contents(rqpair); + max_req_count = rqpair->max_queue_depth; + } else if (rqpair->poller && rqpair->resources) { + max_req_count = rqpair->poller->max_srq_depth; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Release incomplete requests\n"); + for (i = 0; i < max_req_count; i++) { + req = &rqpair->resources->reqs[i]; + if (req->req.qpair == qpair && req->state != RDMA_REQUEST_STATE_FREE) { + /* nvmf_rdma_request_process checks qpair ibv and internal state + * and completes a request */ + nvmf_rdma_request_process(rtransport, req); + } + } + assert(rqpair->qd == 0); + } + + if (rqpair->poller) { + TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link); + + if (rqpair->srq != NULL && rqpair->resources != NULL) { + /* Drop all received but unprocessed commands for this queue and return them to SRQ */ + STAILQ_FOREACH_SAFE(rdma_recv, &rqpair->resources->incoming_queue, link, recv_tmp) { + if (rqpair == rdma_recv->qpair) { + STAILQ_REMOVE(&rqpair->resources->incoming_queue, rdma_recv, spdk_nvmf_rdma_recv, link); + rc = ibv_post_srq_recv(rqpair->srq, &rdma_recv->wr, &bad_recv_wr); + if (rc) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + } + } + } + } + } + + if (rqpair->cm_id) { + if (rqpair->rdma_qp != NULL) { + spdk_rdma_qp_destroy(rqpair->rdma_qp); + rqpair->rdma_qp = NULL; + } + rdma_destroy_id(rqpair->cm_id); + + if (rqpair->poller != NULL && rqpair->srq == NULL) { + rqpair->poller->required_num_wr -= MAX_WR_PER_QP(rqpair->max_queue_depth); + } + } + + if (rqpair->srq == NULL && rqpair->resources != NULL) { + nvmf_rdma_resources_destroy(rqpair->resources); + } + + nvmf_rdma_qpair_clean_ibv_events(rqpair); + + if (rqpair->destruct_channel) { + spdk_put_io_channel(rqpair->destruct_channel); + rqpair->destruct_channel = NULL; + } + + free(rqpair); +} + +static int +nvmf_rdma_resize_cq(struct spdk_nvmf_rdma_qpair *rqpair, struct spdk_nvmf_rdma_device *device) +{ + struct spdk_nvmf_rdma_poller *rpoller; + int rc, num_cqe, required_num_wr; + + /* Enlarge CQ size dynamically */ + rpoller = rqpair->poller; + required_num_wr = rpoller->required_num_wr + MAX_WR_PER_QP(rqpair->max_queue_depth); + num_cqe = rpoller->num_cqe; + if (num_cqe < required_num_wr) { + num_cqe = spdk_max(num_cqe * 2, required_num_wr); + num_cqe = spdk_min(num_cqe, device->attr.max_cqe); + } + + if (rpoller->num_cqe != num_cqe) { + if (required_num_wr > device->attr.max_cqe) { + SPDK_ERRLOG("RDMA CQE requirement (%d) exceeds device max_cqe limitation (%d)\n", + required_num_wr, device->attr.max_cqe); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Resize RDMA CQ from %d to %d\n", rpoller->num_cqe, num_cqe); + rc = ibv_resize_cq(rpoller->cq, num_cqe); + if (rc) { + SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno)); + return -1; + } + + rpoller->num_cqe = num_cqe; + } + + rpoller->required_num_wr = required_num_wr; + return 0; +} + 
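nvmf_rdma_resize_cq() above grows the poller's completion queue geometrically (doubling) whenever a new qpair's worst-case WR count would exceed the current CQ size, clamps the result to the device's max_cqe, and fails if the hard requirement itself cannot fit within that limit. A minimal, self-contained sketch of that sizing arithmetic follows; the helper name and parameters are illustrative only and are not part of SPDK or of this patch:

/*
 * Simplified sketch of the CQ sizing policy used by nvmf_rdma_resize_cq().
 * Hypothetical helper for illustration; names are not from the SPDK tree.
 */
static int
sketch_cq_size_for_new_qpair(int cur_num_cqe, int cur_required_wr,
			     int wr_per_new_qp, int device_max_cqe)
{
	int required_wr = cur_required_wr + wr_per_new_qp;
	int num_cqe = cur_num_cqe;

	if (num_cqe < required_wr) {
		/* Grow geometrically so that repeated qpair creation does not
		 * force a resize every time, then clamp to the device limit. */
		num_cqe = (num_cqe * 2 > required_wr) ? num_cqe * 2 : required_wr;
		if (num_cqe > device_max_cqe) {
			num_cqe = device_max_cqe;
		}
	}

	if (required_wr > device_max_cqe) {
		/* The hard requirement cannot be satisfied by this device. */
		return -1;
	}

	return num_cqe;
}

For example, a poller currently at 4096 CQEs that needs a few hundred more WRs would jump to 8192 (or to max_cqe if the device supports less), amortizing the cost of ibv_resize_cq() across many qpair creations.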
+static int +nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_rdma_resource_opts opts; + struct spdk_nvmf_rdma_device *device; + struct spdk_rdma_qp_init_attr qp_init_attr = {}; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + device = rqpair->device; + + qp_init_attr.qp_context = rqpair; + qp_init_attr.pd = device->pd; + qp_init_attr.send_cq = rqpair->poller->cq; + qp_init_attr.recv_cq = rqpair->poller->cq; + + if (rqpair->srq) { + qp_init_attr.srq = rqpair->srq; + } else { + qp_init_attr.cap.max_recv_wr = rqpair->max_queue_depth; + } + + /* SEND, READ, and WRITE operations */ + qp_init_attr.cap.max_send_wr = (uint32_t)rqpair->max_queue_depth * 2; + qp_init_attr.cap.max_send_sge = spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_TX_SGE); + qp_init_attr.cap.max_recv_sge = spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_RX_SGE); + + if (rqpair->srq == NULL && nvmf_rdma_resize_cq(rqpair, device) < 0) { + SPDK_ERRLOG("Failed to resize the completion queue. Cannot initialize qpair.\n"); + goto error; + } + + rqpair->rdma_qp = spdk_rdma_qp_create(rqpair->cm_id, &qp_init_attr); + if (!rqpair->rdma_qp) { + goto error; + } + + rqpair->max_send_depth = spdk_min((uint32_t)(rqpair->max_queue_depth * 2), + qp_init_attr.cap.max_send_wr); + rqpair->max_send_sge = spdk_min(NVMF_DEFAULT_TX_SGE, qp_init_attr.cap.max_send_sge); + rqpair->max_recv_sge = spdk_min(NVMF_DEFAULT_RX_SGE, qp_init_attr.cap.max_recv_sge); + spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair); + + if (rqpair->poller->srq == NULL) { + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + transport = &rtransport->transport; + + opts.qp = rqpair->rdma_qp->qp; + opts.pd = rqpair->cm_id->pd; + opts.qpair = rqpair; + opts.shared = false; + opts.max_queue_depth = rqpair->max_queue_depth; + opts.in_capsule_data_size = transport->opts.in_capsule_data_size; + + rqpair->resources = nvmf_rdma_resources_create(&opts); + + if (!rqpair->resources) { + SPDK_ERRLOG("Unable to allocate resources for receive queue.\n"); + rdma_destroy_qp(rqpair->cm_id); + goto error; + } + } else { + rqpair->resources = rqpair->poller->resources; + } + + rqpair->current_recv_depth = 0; + STAILQ_INIT(&rqpair->pending_rdma_read_queue); + STAILQ_INIT(&rqpair->pending_rdma_write_queue); + + return 0; + +error: + rdma_destroy_id(rqpair->cm_id); + rqpair->cm_id = NULL; + return -1; +} + +/* Append the given recv wr structure to the resource structs outstanding recvs list. */ +/* This function accepts either a single wr or the first wr in a linked list. 
*/ +static void +nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first) +{ + struct ibv_recv_wr *last; + + last = first; + while (last->next != NULL) { + last = last->next; + } + + if (rqpair->resources->recvs_to_post.first == NULL) { + rqpair->resources->recvs_to_post.first = first; + rqpair->resources->recvs_to_post.last = last; + if (rqpair->srq == NULL) { + STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_recv, rqpair, recv_link); + } + } else { + rqpair->resources->recvs_to_post.last->next = first; + rqpair->resources->recvs_to_post.last = last; + } +} + +static int +request_transfer_in(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + + qpair = req->qpair; + rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER); + assert(rdma_req != NULL); + + if (spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, &rdma_req->data.wr)) { + STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link); + } + + rqpair->current_read_depth += rdma_req->num_outstanding_data_wr; + rqpair->current_send_depth += rdma_req->num_outstanding_data_wr; + return 0; +} + +static int +request_transfer_out(struct spdk_nvmf_request *req, int *data_posted) +{ + int num_outstanding_data_wr = 0; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvme_cpl *rsp; + struct ibv_send_wr *first = NULL; + + *data_posted = 0; + qpair = req->qpair; + rsp = &req->rsp->nvme_cpl; + rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + /* Advance our sq_head pointer */ + if (qpair->sq_head == qpair->sq_head_max) { + qpair->sq_head = 0; + } else { + qpair->sq_head++; + } + rsp->sqhd = qpair->sq_head; + + /* queue the capsule for the recv buffer */ + assert(rdma_req->recv != NULL); + + nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr); + + rdma_req->recv = NULL; + assert(rqpair->current_recv_depth > 0); + rqpair->current_recv_depth--; + + /* Build the response which consists of optional + * RDMA WRITEs to transfer data, plus an RDMA SEND + * containing the response. + */ + first = &rdma_req->rsp.wr; + + if (rsp->status.sc != SPDK_NVME_SC_SUCCESS) { + /* On failure, data was not read from the controller. So clear the + * number of outstanding data WRs to zero. 
+ */ + rdma_req->num_outstanding_data_wr = 0; + } else if (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + first = &rdma_req->data.wr; + *data_posted = 1; + num_outstanding_data_wr = rdma_req->num_outstanding_data_wr; + } + if (spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, first)) { + STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link); + } + + /* +1 for the rsp wr */ + rqpair->current_send_depth += num_outstanding_data_wr + 1; + + return 0; +} + +static int +nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_accept_private_data accept_data; + struct rdma_conn_param ctrlr_event_data = {}; + int rc; + + accept_data.recfmt = 0; + accept_data.crqsize = rqpair->max_queue_depth; + + ctrlr_event_data.private_data = &accept_data; + ctrlr_event_data.private_data_len = sizeof(accept_data); + if (id->ps == RDMA_PS_TCP) { + ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */ + ctrlr_event_data.initiator_depth = rqpair->max_read_depth; + } + + /* Configure infinite retries for the initiator side qpair. + * When using a shared receive queue on the target side, + * we need to pass this value to the initiator to prevent the + * initiator side NIC from completing SEND requests back to the + * initiator with status rnr_retry_count_exceeded. */ + if (rqpair->srq != NULL) { + ctrlr_event_data.rnr_retry_count = 0x7; + } + + /* When qpair is created without use of rdma cm API, an additional + * information must be provided to initiator in the connection response: + * whether qpair is using SRQ and its qp_num + * Fields below are ignored by rdma cm if qpair has been + * created using rdma cm API. */ + ctrlr_event_data.srq = rqpair->srq ? 1 : 0; + ctrlr_event_data.qp_num = rqpair->rdma_qp->qp->qp_num; + + rc = spdk_rdma_qp_accept(rqpair->rdma_qp, &ctrlr_event_data); + if (rc) { + SPDK_ERRLOG("Error %d on spdk_rdma_qp_accept\n", errno); + } else { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n"); + } + + return rc; +} + +static void +nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error) +{ + struct spdk_nvmf_rdma_reject_private_data rej_data; + + rej_data.recfmt = 0; + rej_data.sts = error; + + rdma_reject(id, &rej_data, sizeof(rej_data)); +} + +static int +nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_qpair *rqpair = NULL; + struct spdk_nvmf_rdma_port *port; + struct rdma_conn_param *rdma_param = NULL; + const struct spdk_nvmf_rdma_request_private_data *private_data = NULL; + uint16_t max_queue_depth; + uint16_t max_read_depth; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + assert(event->id != NULL); /* Impossible. Can't even reject the connection. */ + assert(event->id->verbs != NULL); /* Impossible. No way to handle this. 
*/ + + rdma_param = &event->param.conn; + if (rdma_param->private_data == NULL || + rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) { + SPDK_ERRLOG("connect request: no private data provided\n"); + nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH); + return -1; + } + + private_data = rdma_param->private_data; + if (private_data->recfmt != 0) { + SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n"); + nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n", + event->id->verbs->device->name, event->id->verbs->device->dev_name); + + port = event->listen_id->context; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n", + event->listen_id, event->listen_id->verbs, port); + + /* Figure out the supported queue depth. This is a multi-step process + * that takes into account hardware maximums, host provided values, + * and our target's internal memory limits */ + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n"); + + /* Start with the maximum queue depth allowed by the target */ + max_queue_depth = rtransport->transport.opts.max_queue_depth; + max_read_depth = rtransport->transport.opts.max_queue_depth; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n", + rtransport->transport.opts.max_queue_depth); + + /* Next check the local NIC's hardware limitations */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, + "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n", + port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom); + max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr); + max_read_depth = spdk_min(max_read_depth, port->device->attr.max_qp_init_rd_atom); + + /* Next check the remote NIC's hardware limitations */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, + "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n", + rdma_param->initiator_depth, rdma_param->responder_resources); + if (rdma_param->initiator_depth > 0) { + max_read_depth = spdk_min(max_read_depth, rdma_param->initiator_depth); + } + + /* Finally check for the host software requested values, which are + * optional. 
*/ + if (rdma_param->private_data != NULL && + rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize); + max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize); + max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1); + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n", + max_queue_depth, max_read_depth); + + rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair)); + if (rqpair == NULL) { + SPDK_ERRLOG("Could not allocate new connection.\n"); + nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); + return -1; + } + + rqpair->device = port->device; + rqpair->max_queue_depth = max_queue_depth; + rqpair->max_read_depth = max_read_depth; + rqpair->cm_id = event->id; + rqpair->listen_id = event->listen_id; + rqpair->qpair.transport = transport; + STAILQ_INIT(&rqpair->ibv_events); + /* use qid from the private data to determine the qpair type + qid will be set to the appropriate value when the controller is created */ + rqpair->qpair.qid = private_data->qid; + + event->id->context = &rqpair->qpair; + + spdk_nvmf_tgt_new_qpair(transport->tgt, &rqpair->qpair); + + return 0; +} + +static int +nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct ibv_pd *pd = cb_ctx; + struct ibv_mr *mr; + int rc; + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + if (!g_nvmf_hooks.get_rkey) { + mr = ibv_reg_mr(pd, vaddr, size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (mr == NULL) { + SPDK_ERRLOG("ibv_reg_mr() failed\n"); + return -1; + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); + } + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, + g_nvmf_hooks.get_rkey(pd, vaddr, size)); + } + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: + if (!g_nvmf_hooks.get_rkey) { + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL); + if (mr) { + ibv_dereg_mr(mr); + } + } + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); + break; + default: + SPDK_UNREACHABLE(); + } + + return rc; +} + +static int +nvmf_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2) +{ + /* Two contiguous mappings will point to the same address which is the start of the RDMA MR. 
*/ + return addr_1 == addr_2; +} + +static inline void +nvmf_rdma_setup_wr(struct ibv_send_wr *wr, struct ibv_send_wr *next, + enum spdk_nvme_data_transfer xfer) +{ + if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + wr->opcode = IBV_WR_RDMA_WRITE; + wr->send_flags = 0; + wr->next = next; + } else if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + wr->opcode = IBV_WR_RDMA_READ; + wr->send_flags = IBV_SEND_SIGNALED; + wr->next = NULL; + } else { + assert(0); + } +} + +static int +nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_request *rdma_req, + uint32_t num_sgl_descriptors) +{ + struct spdk_nvmf_rdma_request_data *work_requests[SPDK_NVMF_MAX_SGL_ENTRIES]; + struct spdk_nvmf_rdma_request_data *current_data_wr; + uint32_t i; + + if (num_sgl_descriptors > SPDK_NVMF_MAX_SGL_ENTRIES) { + SPDK_ERRLOG("Requested too much entries (%u), the limit is %u\n", + num_sgl_descriptors, SPDK_NVMF_MAX_SGL_ENTRIES); + return -EINVAL; + } + + if (spdk_mempool_get_bulk(rtransport->data_wr_pool, (void **)work_requests, num_sgl_descriptors)) { + return -ENOMEM; + } + + current_data_wr = &rdma_req->data; + + for (i = 0; i < num_sgl_descriptors; i++) { + nvmf_rdma_setup_wr(&current_data_wr->wr, &work_requests[i]->wr, rdma_req->req.xfer); + current_data_wr->wr.next = &work_requests[i]->wr; + current_data_wr = work_requests[i]; + current_data_wr->wr.sg_list = current_data_wr->sgl; + current_data_wr->wr.wr_id = rdma_req->data.wr.wr_id; + } + + nvmf_rdma_setup_wr(&current_data_wr->wr, &rdma_req->rsp.wr, rdma_req->req.xfer); + + return 0; +} + +static inline void +nvmf_rdma_setup_request(struct spdk_nvmf_rdma_request *rdma_req) +{ + struct ibv_send_wr *wr = &rdma_req->data.wr; + struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1; + + wr->wr.rdma.rkey = sgl->keyed.key; + wr->wr.rdma.remote_addr = sgl->address; + nvmf_rdma_setup_wr(wr, &rdma_req->rsp.wr, rdma_req->req.xfer); +} + +static inline void +nvmf_rdma_update_remote_addr(struct spdk_nvmf_rdma_request *rdma_req, uint32_t num_wrs) +{ + struct ibv_send_wr *wr = &rdma_req->data.wr; + struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1; + uint32_t i; + int j; + uint64_t remote_addr_offset = 0; + + for (i = 0; i < num_wrs; ++i) { + wr->wr.rdma.rkey = sgl->keyed.key; + wr->wr.rdma.remote_addr = sgl->address + remote_addr_offset; + for (j = 0; j < wr->num_sge; ++j) { + remote_addr_offset += wr->sg_list[j].length; + } + wr = wr->next; + } +} + +/* This function is used in the rare case that we have a buffer split over multiple memory regions. 
*/ +static int +nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf) +{ + struct spdk_nvmf_transport_poll_group *group = &rgroup->group; + struct spdk_nvmf_transport *transport = group->transport; + struct spdk_nvmf_transport_pg_cache_buf *old_buf; + void *new_buf; + + if (!(STAILQ_EMPTY(&group->buf_cache))) { + group->buf_cache_count--; + new_buf = STAILQ_FIRST(&group->buf_cache); + STAILQ_REMOVE_HEAD(&group->buf_cache, link); + assert(*buf != NULL); + } else { + new_buf = spdk_mempool_get(transport->data_buf_pool); + } + + if (*buf == NULL) { + return -ENOMEM; + } + + old_buf = *buf; + STAILQ_INSERT_HEAD(&rgroup->retired_bufs, old_buf, link); + *buf = new_buf; + return 0; +} + +static bool +nvmf_rdma_get_lkey(struct spdk_nvmf_rdma_device *device, struct iovec *iov, + uint32_t *_lkey) +{ + uint64_t translation_len; + uint32_t lkey; + + translation_len = iov->iov_len; + + if (!g_nvmf_hooks.get_rkey) { + lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map, + (uint64_t)iov->iov_base, &translation_len))->lkey; + } else { + lkey = spdk_mem_map_translate(device->map, + (uint64_t)iov->iov_base, &translation_len); + } + + if (spdk_unlikely(translation_len < iov->iov_len)) { + return false; + } + + *_lkey = lkey; + return true; +} + +static bool +nvmf_rdma_fill_wr_sge(struct spdk_nvmf_rdma_device *device, + struct iovec *iov, struct ibv_send_wr **_wr, + uint32_t *_remaining_data_block, uint32_t *_offset, + uint32_t *_num_extra_wrs, + const struct spdk_dif_ctx *dif_ctx) +{ + struct ibv_send_wr *wr = *_wr; + struct ibv_sge *sg_ele = &wr->sg_list[wr->num_sge]; + uint32_t lkey = 0; + uint32_t remaining, data_block_size, md_size, sge_len; + + if (spdk_unlikely(!nvmf_rdma_get_lkey(device, iov, &lkey))) { + /* This is a very rare case that can occur when using DPDK version < 19.05 */ + SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. 
Removing it from circulation.\n"); + return false; + } + + if (spdk_likely(!dif_ctx)) { + sg_ele->lkey = lkey; + sg_ele->addr = (uintptr_t)(iov->iov_base); + sg_ele->length = iov->iov_len; + wr->num_sge++; + } else { + remaining = iov->iov_len - *_offset; + data_block_size = dif_ctx->block_size - dif_ctx->md_size; + md_size = dif_ctx->md_size; + + while (remaining) { + if (wr->num_sge >= SPDK_NVMF_MAX_SGL_ENTRIES) { + if (*_num_extra_wrs > 0 && wr->next) { + *_wr = wr->next; + wr = *_wr; + wr->num_sge = 0; + sg_ele = &wr->sg_list[wr->num_sge]; + (*_num_extra_wrs)--; + } else { + break; + } + } + sg_ele->lkey = lkey; + sg_ele->addr = (uintptr_t)((char *)iov->iov_base + *_offset); + sge_len = spdk_min(remaining, *_remaining_data_block); + sg_ele->length = sge_len; + remaining -= sge_len; + *_remaining_data_block -= sge_len; + *_offset += sge_len; + + sg_ele++; + wr->num_sge++; + + if (*_remaining_data_block == 0) { + /* skip metadata */ + *_offset += md_size; + /* Metadata that do not fit this IO buffer will be included in the next IO buffer */ + remaining -= spdk_min(remaining, md_size); + *_remaining_data_block = data_block_size; + } + + if (remaining == 0) { + /* By subtracting the size of the last IOV from the offset, we ensure that we skip + the remaining metadata bits at the beginning of the next buffer */ + *_offset -= iov->iov_len; + } + } + } + + return true; +} + +static int +nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_poll_group *rgroup, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req, + struct ibv_send_wr *wr, + uint32_t length, + uint32_t num_extra_wrs) +{ + struct spdk_nvmf_request *req = &rdma_req->req; + struct spdk_dif_ctx *dif_ctx = NULL; + uint32_t remaining_data_block = 0; + uint32_t offset = 0; + + if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) { + dif_ctx = &rdma_req->req.dif.dif_ctx; + remaining_data_block = dif_ctx->block_size - dif_ctx->md_size; + } + + wr->num_sge = 0; + + while (length && (num_extra_wrs || wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES)) { + while (spdk_unlikely(!nvmf_rdma_fill_wr_sge(device, &req->iov[rdma_req->iovpos], &wr, + &remaining_data_block, &offset, &num_extra_wrs, dif_ctx))) { + if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[rdma_req->iovpos]) == -ENOMEM) { + return -ENOMEM; + } + req->iov[rdma_req->iovpos].iov_base = (void *)((uintptr_t)(req->buffers[rdma_req->iovpos] + + NVMF_DATA_BUFFER_MASK) & + ~NVMF_DATA_BUFFER_MASK); + } + + length -= req->iov[rdma_req->iovpos].iov_len; + rdma_req->iovpos++; + } + + if (length) { + SPDK_ERRLOG("Not enough SG entries to hold data buffer\n"); + return -EINVAL; + } + + return 0; +} + +static inline uint32_t +nvmf_rdma_calc_num_wrs(uint32_t length, uint32_t io_unit_size, uint32_t block_size) +{ + /* estimate the number of SG entries and WRs needed to process the request */ + uint32_t num_sge = 0; + uint32_t i; + uint32_t num_buffers = SPDK_CEIL_DIV(length, io_unit_size); + + for (i = 0; i < num_buffers && length > 0; i++) { + uint32_t buffer_len = spdk_min(length, io_unit_size); + uint32_t num_sge_in_block = SPDK_CEIL_DIV(buffer_len, block_size); + + if (num_sge_in_block * block_size > buffer_len) { + ++num_sge_in_block; + } + num_sge += num_sge_in_block; + length -= buffer_len; + } + return SPDK_CEIL_DIV(num_sge, SPDK_NVMF_MAX_SGL_ENTRIES); +} + +static int +nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req, + uint32_t length) +{ + struct 
spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_request *req = &rdma_req->req; + struct ibv_send_wr *wr = &rdma_req->data.wr; + int rc; + uint32_t num_wrs = 1; + + rqpair = SPDK_CONTAINEROF(req->qpair, struct spdk_nvmf_rdma_qpair, qpair); + rgroup = rqpair->poller->group; + + /* rdma wr specifics */ + nvmf_rdma_setup_request(rdma_req); + + rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport, + length); + if (rc != 0) { + return rc; + } + + assert(req->iovcnt <= rqpair->max_send_sge); + + rdma_req->iovpos = 0; + + if (spdk_unlikely(req->dif.dif_insert_or_strip)) { + num_wrs = nvmf_rdma_calc_num_wrs(length, rtransport->transport.opts.io_unit_size, + req->dif.dif_ctx.block_size); + if (num_wrs > 1) { + rc = nvmf_request_alloc_wrs(rtransport, rdma_req, num_wrs - 1); + if (rc != 0) { + goto err_exit; + } + } + } + + rc = nvmf_rdma_fill_wr_sgl(rgroup, device, rdma_req, wr, length, num_wrs - 1); + if (spdk_unlikely(rc != 0)) { + goto err_exit; + } + + if (spdk_unlikely(num_wrs > 1)) { + nvmf_rdma_update_remote_addr(rdma_req, num_wrs); + } + + /* set the number of outstanding data WRs for this request. */ + rdma_req->num_outstanding_data_wr = num_wrs; + + return rc; + +err_exit: + spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport); + nvmf_rdma_request_free_data(rdma_req, rtransport); + req->iovcnt = 0; + return rc; +} + +static int +nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct ibv_send_wr *current_wr; + struct spdk_nvmf_request *req = &rdma_req->req; + struct spdk_nvme_sgl_descriptor *inline_segment, *desc; + uint32_t num_sgl_descriptors; + uint32_t lengths[SPDK_NVMF_MAX_SGL_ENTRIES]; + uint32_t i; + int rc; + + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + rgroup = rqpair->poller->group; + + inline_segment = &req->cmd->nvme_cmd.dptr.sgl1; + assert(inline_segment->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT); + assert(inline_segment->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET); + + num_sgl_descriptors = inline_segment->unkeyed.length / sizeof(struct spdk_nvme_sgl_descriptor); + assert(num_sgl_descriptors <= SPDK_NVMF_MAX_SGL_ENTRIES); + + if (nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1) != 0) { + return -ENOMEM; + } + + desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address; + for (i = 0; i < num_sgl_descriptors; i++) { + if (spdk_likely(!req->dif.dif_insert_or_strip)) { + lengths[i] = desc->keyed.length; + } else { + req->dif.orig_length += desc->keyed.length; + lengths[i] = spdk_dif_get_length_with_md(desc->keyed.length, &req->dif.dif_ctx); + req->dif.elba_length += lengths[i]; + } + desc++; + } + + rc = spdk_nvmf_request_get_buffers_multi(req, &rgroup->group, &rtransport->transport, + lengths, num_sgl_descriptors); + if (rc != 0) { + nvmf_rdma_request_free_data(rdma_req, rtransport); + return rc; + } + + /* The first WR must always be the embedded data WR. This is how we unwind them later. 
*/ + current_wr = &rdma_req->data.wr; + assert(current_wr != NULL); + + req->length = 0; + rdma_req->iovpos = 0; + desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address; + for (i = 0; i < num_sgl_descriptors; i++) { + /* The descriptors must be keyed data block descriptors with an address, not an offset. */ + if (spdk_unlikely(desc->generic.type != SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK || + desc->keyed.subtype != SPDK_NVME_SGL_SUBTYPE_ADDRESS)) { + rc = -EINVAL; + goto err_exit; + } + + current_wr->num_sge = 0; + + rc = nvmf_rdma_fill_wr_sgl(rgroup, device, rdma_req, current_wr, lengths[i], 0); + if (rc != 0) { + rc = -ENOMEM; + goto err_exit; + } + + req->length += desc->keyed.length; + current_wr->wr.rdma.rkey = desc->keyed.key; + current_wr->wr.rdma.remote_addr = desc->address; + current_wr = current_wr->next; + desc++; + } + +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + /* Go back to the last descriptor in the list. */ + desc--; + if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { + if (desc->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { + rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; + rdma_req->rsp.wr.imm_data = desc->keyed.key; + } + } +#endif + + rdma_req->num_outstanding_data_wr = num_sgl_descriptors; + + return 0; + +err_exit: + spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport); + nvmf_rdma_request_free_data(rdma_req, rtransport); + return rc; +} + +static int +nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req) +{ + struct spdk_nvmf_request *req = &rdma_req->req; + struct spdk_nvme_cpl *rsp; + struct spdk_nvme_sgl_descriptor *sgl; + int rc; + uint32_t length; + + rsp = &req->rsp->nvme_cpl; + sgl = &req->cmd->nvme_cmd.dptr.sgl1; + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK && + (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS || + sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) { + + length = sgl->keyed.length; + if (length > rtransport->transport.opts.max_io_size) { + SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", + length, rtransport->transport.opts.max_io_size); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { + if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { + rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; + rdma_req->rsp.wr.imm_data = sgl->keyed.key; + } + } +#endif + + /* fill request length and populate iovs */ + req->length = length; + + if (spdk_unlikely(req->dif.dif_insert_or_strip)) { + req->dif.orig_length = length; + length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx); + req->dif.elba_length = length; + } + + rc = nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req, length); + if (spdk_unlikely(rc < 0)) { + if (rc == -EINVAL) { + SPDK_ERRLOG("SGL length exceeds the max I/O size\n"); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + /* No available buffers. Queue this request up. */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. 
Queueing request %p\n", rdma_req); + return 0; + } + + /* backward compatible */ + req->data = req->iov[0].iov_base; + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, + req->iovcnt); + + return 0; + } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { + uint64_t offset = sgl->address; + uint32_t max_len = rtransport->transport.opts.in_capsule_data_size; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", + offset, sgl->unkeyed.length); + + if (offset > max_len) { + SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", + offset, max_len); + rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; + return -1; + } + max_len -= (uint32_t)offset; + + if (sgl->unkeyed.length > max_len) { + SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", + sgl->unkeyed.length, max_len); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + + rdma_req->num_outstanding_data_wr = 0; + req->data = rdma_req->recv->buf + offset; + req->data_from_pool = false; + req->length = sgl->unkeyed.length; + + req->iov[0].iov_base = req->data; + req->iov[0].iov_len = req->length; + req->iovcnt = 1; + + return 0; + } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT && + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { + + rc = nvmf_rdma_request_fill_iovs_multi_sgl(rtransport, device, rdma_req); + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req); + return 0; + } else if (rc == -EINVAL) { + SPDK_ERRLOG("Multi SGL element request length exceeds the max I/O size\n"); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + + /* backward compatible */ + req->data = req->iov[0].iov_base; + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, + req->iovcnt); + + return 0; + } + + SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", + sgl->generic.type, sgl->generic.subtype); + rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; + return -1; +} + +static void +_nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req, + struct spdk_nvmf_rdma_transport *rtransport) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_poll_group *rgroup; + + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + if (rdma_req->req.data_from_pool) { + rgroup = rqpair->poller->group; + + spdk_nvmf_request_free_buffers(&rdma_req->req, &rgroup->group, &rtransport->transport); + } + nvmf_rdma_request_free_data(rdma_req, rtransport); + rdma_req->req.length = 0; + rdma_req->req.iovcnt = 0; + rdma_req->req.data = NULL; + rdma_req->rsp.wr.next = NULL; + rdma_req->data.wr.next = NULL; + memset(&rdma_req->req.dif, 0, sizeof(rdma_req->req.dif)); + rqpair->qd--; + + STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link); + rdma_req->state = RDMA_REQUEST_STATE_FREE; +} + +bool +nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_request *rdma_req) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; + int rc; + struct spdk_nvmf_rdma_recv *rdma_recv; + enum spdk_nvmf_rdma_request_state prev_state; + bool progress = false; + int data_posted; + uint32_t 
num_blocks; + + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + device = rqpair->device; + rgroup = rqpair->poller->group; + + assert(rdma_req->state != RDMA_REQUEST_STATE_FREE); + + /* If the queue pair is in an error state, force the request to the completed state + * to release resources. */ + if (rqpair->ibv_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + if (rdma_req->state == RDMA_REQUEST_STATE_NEED_BUFFER) { + STAILQ_REMOVE(&rgroup->group.pending_buf_queue, &rdma_req->req, spdk_nvmf_request, buf_link); + } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING) { + STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link); + } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING) { + STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); + } + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + } + + /* The loop here is to allow for several back-to-back state changes. */ + do { + prev_state = rdma_req->state; + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state); + + switch (rdma_req->state) { + case RDMA_REQUEST_STATE_FREE: + /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_NEW: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + rdma_recv = rdma_req->recv; + + /* The first element of the SGL is the NVMe command */ + rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr; + memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp)); + + if (rqpair->ibv_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + break; + } + + if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&rdma_req->req, &rdma_req->req.dif.dif_ctx))) { + rdma_req->req.dif.dif_insert_or_strip = true; + } + +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + rdma_req->rsp.wr.opcode = IBV_WR_SEND; + rdma_req->rsp.wr.imm_data = 0; +#endif + + /* The next state transition depends on the data transfer needs of this request. */ + rdma_req->req.xfer = spdk_nvmf_req_get_xfer(&rdma_req->req); + + /* If no data to transfer, ready to execute. */ + if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; + break; + } + + rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER; + STAILQ_INSERT_TAIL(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link); + break; + case RDMA_REQUEST_STATE_NEED_BUFFER: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE); + + if (&rdma_req->req != STAILQ_FIRST(&rgroup->group.pending_buf_queue)) { + /* This request needs to wait in line to obtain a buffer */ + break; + } + + /* Try to get a data buffer */ + rc = nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req); + if (rc < 0) { + STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link); + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + break; + } + + if (!rdma_req->req.data) { + /* No buffers available. 
*/ + rgroup->stat.pending_data_buffer++; + break; + } + + STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link); + + /* If data is transferring from host to controller and the data didn't + * arrive using in capsule data, we need to do a transfer from the host. + */ + if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && + rdma_req->req.data_from_pool) { + STAILQ_INSERT_TAIL(&rqpair->pending_rdma_read_queue, rdma_req, state_link); + rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING; + break; + } + + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; + break; + case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_read_queue)) { + /* This request needs to wait in line to perform RDMA */ + break; + } + if (rqpair->current_send_depth + rdma_req->num_outstanding_data_wr > rqpair->max_send_depth + || rqpair->current_read_depth + rdma_req->num_outstanding_data_wr > rqpair->max_read_depth) { + /* We can only have so many WRs outstanding. we have to wait until some finish. */ + rqpair->poller->stat.pending_rdma_read++; + break; + } + + /* We have already verified that this request is the head of the queue. */ + STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link); + + rc = request_transfer_in(&rdma_req->req); + if (!rc) { + rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER; + } else { + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + } + break; + case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_READY_TO_EXECUTE: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) { + if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + /* generate DIF for write operation */ + num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size); + assert(num_blocks > 0); + + rc = spdk_dif_generate(rdma_req->req.iov, rdma_req->req.iovcnt, + num_blocks, &rdma_req->req.dif.dif_ctx); + if (rc != 0) { + SPDK_ERRLOG("DIF generation failed\n"); + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + nvmf_rdma_start_disconnect(rqpair); + break; + } + } + + assert(rdma_req->req.dif.elba_length >= rdma_req->req.length); + /* set extended length before IO operation */ + rdma_req->req.length = rdma_req->req.dif.elba_length; + } + + rdma_req->state = RDMA_REQUEST_STATE_EXECUTING; + spdk_nvmf_request_exec(&rdma_req->req); + break; + case RDMA_REQUEST_STATE_EXECUTING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED + * to escape this state. 
*/ + break; + case RDMA_REQUEST_STATE_EXECUTED: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && + rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + STAILQ_INSERT_TAIL(&rqpair->pending_rdma_write_queue, rdma_req, state_link); + rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING; + } else { + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + } + if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) { + /* restore the original length */ + rdma_req->req.length = rdma_req->req.dif.orig_length; + + if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + struct spdk_dif_error error_blk; + + num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size); + + rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks, + &rdma_req->req.dif.dif_ctx, &error_blk); + if (rc) { + struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; + + SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", error_blk.err_type, + error_blk.err_offset); + rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR; + rsp->status.sc = nvmf_rdma_dif_error_to_compl_status(error_blk.err_type); + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link); + } + } + } + break; + case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_write_queue)) { + /* This request needs to wait in line to perform RDMA */ + break; + } + if ((rqpair->current_send_depth + rdma_req->num_outstanding_data_wr + 1) > + rqpair->max_send_depth) { + /* We can only have so many WRs outstanding. we have to wait until some finish. + * +1 since each request has an additional wr in the resp. */ + rqpair->poller->stat.pending_rdma_write++; + break; + } + + /* We have already verified that this request is the head of the queue. */ + STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_write_queue, state_link); + + /* The data transfer will be kicked off from + * RDMA_REQUEST_STATE_READY_TO_COMPLETE state. + */ + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + break; + case RDMA_REQUEST_STATE_READY_TO_COMPLETE: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + rc = request_transfer_out(&rdma_req->req, &data_posted); + assert(rc == 0); /* No good way to handle this currently */ + if (rc) { + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + } else { + rdma_req->state = data_posted ? RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST : + RDMA_REQUEST_STATE_COMPLETING; + } + break; + case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_COMPLETING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED + * to escape this state. 
*/ + break; + case RDMA_REQUEST_STATE_COMPLETED: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + rqpair->poller->stat.request_latency += spdk_get_ticks() - rdma_req->receive_tsc; + _nvmf_rdma_request_free(rdma_req, rtransport); + break; + case RDMA_REQUEST_NUM_STATES: + default: + assert(0); + break; + } + + if (rdma_req->state != prev_state) { + progress = true; + } + } while (rdma_req->state != prev_state); + + return progress; +} + +/* Public API callbacks begin here */ + +#define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128 +#define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128 +#define SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH 4096 +#define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 128 +#define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 +#define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072 +#define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES) +#define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4095 +#define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32 +#define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false +#define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false +#define SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG 100 +#define SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC 1 + +static void +nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) +{ + opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH; + opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR; + opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE; + opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE; + opts->io_unit_size = SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE; + opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH; + opts->num_shared_buffers = SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS; + opts->buf_cache_size = SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE; + opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH; + opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ; + opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP; + opts->acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG; + opts->abort_timeout_sec = SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC; +} + +const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = { + .notify_cb = nvmf_rdma_mem_notify, + .are_contiguous = nvmf_rdma_check_contiguous_entries +}; + +static int nvmf_rdma_destroy(struct spdk_nvmf_transport *transport); + +static struct spdk_nvmf_transport * +nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) +{ + int rc; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device, *tmp; + struct ibv_context **contexts; + uint32_t i; + int flag; + uint32_t sge_count; + uint32_t min_shared_buffers; + int max_device_sge = SPDK_NVMF_MAX_SGL_ENTRIES; + pthread_mutexattr_t attr; + + rtransport = calloc(1, sizeof(*rtransport)); + if (!rtransport) { + return NULL; + } + + if (pthread_mutexattr_init(&attr)) { + SPDK_ERRLOG("pthread_mutexattr_init() failed\n"); + free(rtransport); + return NULL; + } + + if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) { + SPDK_ERRLOG("pthread_mutexattr_settype() failed\n"); + pthread_mutexattr_destroy(&attr); + free(rtransport); + return NULL; + } + + if (pthread_mutex_init(&rtransport->lock, &attr)) { + SPDK_ERRLOG("pthread_mutex_init() failed\n"); + pthread_mutexattr_destroy(&attr); + free(rtransport); + return NULL; + } + + pthread_mutexattr_destroy(&attr); + + TAILQ_INIT(&rtransport->devices); + TAILQ_INIT(&rtransport->ports); + TAILQ_INIT(&rtransport->poll_groups); + + 
rtransport->transport.ops = &spdk_nvmf_transport_rdma; + + SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n" + " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" + " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" + " in_capsule_data_size=%d, max_aq_depth=%d,\n" + " num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d," + " acceptor_backlog=%d, abort_timeout_sec=%d\n", + opts->max_queue_depth, + opts->max_io_size, + opts->max_qpairs_per_ctrlr - 1, + opts->io_unit_size, + opts->in_capsule_data_size, + opts->max_aq_depth, + opts->num_shared_buffers, + opts->max_srq_depth, + opts->no_srq, + opts->acceptor_backlog, + opts->abort_timeout_sec); + + /* I/O unit size cannot be larger than max I/O size */ + if (opts->io_unit_size > opts->max_io_size) { + opts->io_unit_size = opts->max_io_size; + } + + if (opts->acceptor_backlog <= 0) { + SPDK_ERRLOG("The acceptor backlog cannot be less than 1, setting to the default value of (%d).\n", + SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG); + opts->acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG; + } + + if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) { + SPDK_ERRLOG("The number of shared data buffers (%d) is less than" + "the minimum number required to guarantee that forward progress can be made (%d)\n", + opts->num_shared_buffers, (SPDK_NVMF_MAX_SGL_ENTRIES * 2)); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size; + if (min_shared_buffers > opts->num_shared_buffers) { + SPDK_ERRLOG("There are not enough buffers to satisfy" + "per-poll group caches for each thread. (%" PRIu32 ")" + "supplied. (%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers); + SPDK_ERRLOG("Please specify a larger number of shared buffers\n"); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + sge_count = opts->max_io_size / opts->io_unit_size; + if (sge_count > NVMF_DEFAULT_TX_SGE) { + SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->event_channel = rdma_create_event_channel(); + if (rtransport->event_channel == NULL) { + SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno)); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + flag = fcntl(rtransport->event_channel->fd, F_GETFL); + if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + rtransport->event_channel->fd, spdk_strerror(errno)); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->data_wr_pool = spdk_mempool_create("spdk_nvmf_rdma_wr_data", + opts->max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES, + sizeof(struct spdk_nvmf_rdma_request_data), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!rtransport->data_wr_pool) { + SPDK_ERRLOG("Unable to allocate work request pool for poll group\n"); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + contexts = rdma_get_devices(NULL); + if (contexts == NULL) { + SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + i = 0; + rc = 0; + while (contexts[i] != NULL) { + device = calloc(1, sizeof(*device)); + if (!device) { + SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); + rc = -ENOMEM; + break; + } + device->context = contexts[i]; + rc = 
ibv_query_device(device->context, &device->attr); + if (rc < 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + free(device); + break; + + } + + max_device_sge = spdk_min(max_device_sge, device->attr.max_sge); + +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) { + SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,"); + SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id); + } + + /** + * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE. + * The Soft-RoCE RXE driver does not currently support send with invalidate, + * but incorrectly reports that it does. There are changes making their way + * through the kernel now that will enable this feature. When they are merged, + * we can conditionally enable this feature. + * + * TODO: enable this for versions of the kernel rxe driver that support it. + */ + if (device->attr.vendor_id == 0) { + device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS); + } +#endif + + /* set up device context async ev fd as NON_BLOCKING */ + flag = fcntl(device->context->async_fd, F_GETFL); + rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); + if (rc < 0) { + SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); + free(device); + break; + } + + TAILQ_INSERT_TAIL(&rtransport->devices, device, link); + i++; + + if (g_nvmf_hooks.get_ibv_pd) { + device->pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context); + } else { + device->pd = ibv_alloc_pd(device->context); + } + + if (!device->pd) { + SPDK_ERRLOG("Unable to allocate protection domain.\n"); + rc = -ENOMEM; + break; + } + + assert(device->map == NULL); + + device->map = spdk_mem_map_alloc(0, &g_nvmf_rdma_map_ops, device->pd); + if (!device->map) { + SPDK_ERRLOG("Unable to allocate memory map for listen address\n"); + rc = -ENOMEM; + break; + } + + assert(device->map != NULL); + assert(device->pd != NULL); + } + rdma_free_devices(contexts); + + if (opts->io_unit_size * max_device_sge < opts->max_io_size) { + /* divide and round up. */ + opts->io_unit_size = (opts->max_io_size + max_device_sge - 1) / max_device_sge; + + /* round up to the nearest 4k. */ + opts->io_unit_size = (opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT - 1) & ~NVMF_DATA_BUFFER_MASK; + + opts->io_unit_size = spdk_max(opts->io_unit_size, SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE); + SPDK_NOTICELOG("Adjusting the io unit size to fit the device's maximum I/O size. 
New I/O unit size %u\n", + opts->io_unit_size); + } + + if (rc < 0) { + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + /* Set up poll descriptor array to monitor events from RDMA and IB + * in a single poll syscall + */ + rtransport->npoll_fds = i + 1; + i = 0; + rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); + if (rtransport->poll_fds == NULL) { + SPDK_ERRLOG("poll_fds allocation failed\n"); + nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->poll_fds[i].fd = rtransport->event_channel->fd; + rtransport->poll_fds[i++].events = POLLIN; + + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { + rtransport->poll_fds[i].fd = device->context->async_fd; + rtransport->poll_fds[i++].events = POLLIN; + } + + return &rtransport->transport; +} + +static int +nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_port *port, *port_tmp; + struct spdk_nvmf_rdma_device *device, *device_tmp; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { + TAILQ_REMOVE(&rtransport->ports, port, link); + rdma_destroy_id(port->id); + free(port); + } + + if (rtransport->poll_fds != NULL) { + free(rtransport->poll_fds); + } + + if (rtransport->event_channel != NULL) { + rdma_destroy_event_channel(rtransport->event_channel); + } + + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { + TAILQ_REMOVE(&rtransport->devices, device, link); + if (device->map) { + spdk_mem_map_free(&device->map); + } + if (device->pd) { + if (!g_nvmf_hooks.get_ibv_pd) { + ibv_dealloc_pd(device->pd); + } + } + free(device); + } + + if (rtransport->data_wr_pool != NULL) { + if (spdk_mempool_count(rtransport->data_wr_pool) != + (transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES)) { + SPDK_ERRLOG("transport wr pool count is %zu but should be %u\n", + spdk_mempool_count(rtransport->data_wr_pool), + transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES); + } + } + + spdk_mempool_free(rtransport->data_wr_pool); + + pthread_mutex_destroy(&rtransport->lock); + free(rtransport); + + return 0; +} + +static int +nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, + struct spdk_nvme_transport_id *trid, + bool peer); + +static int +nvmf_rdma_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_port *port; + struct addrinfo *res; + struct addrinfo hints; + int family; + int rc; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + assert(rtransport->event_channel != NULL); + + pthread_mutex_lock(&rtransport->lock); + port = calloc(1, sizeof(*port)); + if (!port) { + SPDK_ERRLOG("Port allocation failed\n"); + pthread_mutex_unlock(&rtransport->lock); + return -ENOMEM; + } + + port->trid = trid; + + switch (trid->adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", trid->adrfam); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_flags = AI_NUMERICSERV; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + rc = getaddrinfo(trid->traddr, 
trid->trsvcid, &hints, &res); + if (rc) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); + if (rc < 0) { + SPDK_ERRLOG("rdma_create_id() failed\n"); + freeaddrinfo(res); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + rc = rdma_bind_addr(port->id, res->ai_addr); + freeaddrinfo(res); + + if (rc < 0) { + SPDK_ERRLOG("rdma_bind_addr() failed\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + if (!port->id->verbs) { + SPDK_ERRLOG("ibv_context is null\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -1; + } + + rc = rdma_listen(port->id, transport->opts.acceptor_backlog); + if (rc < 0) { + SPDK_ERRLOG("rdma_listen() failed\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + TAILQ_FOREACH(device, &rtransport->devices, link) { + if (device->context == port->id->verbs) { + port->device = device; + break; + } + } + if (!port->device) { + SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", + port->id->verbs); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + SPDK_NOTICELOG("*** NVMe/RDMA Target Listening on %s port %s ***\n", + trid->traddr, trid->trsvcid); + + TAILQ_INSERT_TAIL(&rtransport->ports, port, link); + pthread_mutex_unlock(&rtransport->lock); + return 0; +} + +static void +nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_port *port, *tmp; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + pthread_mutex_lock(&rtransport->lock); + TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { + if (spdk_nvme_transport_id_compare(port->trid, trid) == 0) { + TAILQ_REMOVE(&rtransport->ports, port, link); + rdma_destroy_id(port->id); + free(port); + break; + } + } + + pthread_mutex_unlock(&rtransport->lock); +} + +static void +nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_qpair *rqpair, bool drain) +{ + struct spdk_nvmf_request *req, *tmp; + struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; + struct spdk_nvmf_rdma_resources *resources; + + /* We process I/O in the data transfer pending queue at the highest priority. RDMA reads first */ + STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_read_queue, state_link, req_tmp) { + if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { + break; + } + } + + /* Then RDMA writes since reads have stronger restrictions than writes */ + STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_write_queue, state_link, req_tmp) { + if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { + break; + } + } + + /* The second highest priority is I/O waiting on memory buffers. 
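+ * Finally, newly arrived commands in the incoming queue are paired with free request structures and started below.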
*/ + STAILQ_FOREACH_SAFE(req, &rqpair->poller->group->group.pending_buf_queue, buf_link, tmp) { + rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) { + break; + } + } + + resources = rqpair->resources; + while (!STAILQ_EMPTY(&resources->free_queue) && !STAILQ_EMPTY(&resources->incoming_queue)) { + rdma_req = STAILQ_FIRST(&resources->free_queue); + STAILQ_REMOVE_HEAD(&resources->free_queue, state_link); + rdma_req->recv = STAILQ_FIRST(&resources->incoming_queue); + STAILQ_REMOVE_HEAD(&resources->incoming_queue, link); + + if (rqpair->srq != NULL) { + rdma_req->req.qpair = &rdma_req->recv->qpair->qpair; + rdma_req->recv->qpair->qd++; + } else { + rqpair->qd++; + } + + rdma_req->receive_tsc = rdma_req->recv->receive_tsc; + rdma_req->state = RDMA_REQUEST_STATE_NEW; + if (nvmf_rdma_request_process(rtransport, rdma_req) == false) { + break; + } + } + if (!STAILQ_EMPTY(&resources->incoming_queue) && STAILQ_EMPTY(&resources->free_queue)) { + rqpair->poller->stat.pending_free_request++; + } +} + +static void +_nvmf_rdma_qpair_disconnect(void *ctx) +{ + struct spdk_nvmf_qpair *qpair = ctx; + + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); +} + +static void +_nvmf_rdma_try_disconnect(void *ctx) +{ + struct spdk_nvmf_qpair *qpair = ctx; + struct spdk_nvmf_poll_group *group; + + /* Read the group out of the qpair. This is normally set and accessed only from + * the thread that created the group. Here, we're not on that thread necessarily. + * The data member qpair->group begins it's life as NULL and then is assigned to + * a pointer and never changes. So fortunately reading this and checking for + * non-NULL is thread safe in the x86_64 memory model. */ + group = qpair->group; + + if (group == NULL) { + /* The qpair hasn't been assigned to a group yet, so we can't + * process a disconnect. Send a message to ourself and try again. */ + spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_try_disconnect, qpair); + return; + } + + spdk_thread_send_msg(group->thread, _nvmf_rdma_qpair_disconnect, qpair); +} + +static inline void +nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair) +{ + if (!__atomic_test_and_set(&rqpair->disconnect_started, __ATOMIC_RELAXED)) { + _nvmf_rdma_try_disconnect(&rqpair->qpair); + } +} + +static void nvmf_rdma_destroy_drained_qpair(void *ctx) +{ + struct spdk_nvmf_rdma_qpair *rqpair = ctx; + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, + struct spdk_nvmf_rdma_transport, transport); + + /* In non SRQ path, we will reach rqpair->max_queue_depth. In SRQ path, we will get the last_wqe event. 
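+ * In both cases every outstanding send must also have completed (current_send_depth == 0) before the qpair can be torn down.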
*/ + if (rqpair->current_send_depth != 0) { + return; + } + + if (rqpair->srq == NULL && rqpair->current_recv_depth != rqpair->max_queue_depth) { + return; + } + + if (rqpair->srq != NULL && rqpair->last_wqe_reached == false) { + return; + } + + nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); + + /* Qpair will be destroyed after nvmf layer closes this qpair */ + if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ERROR) { + return; + } + + nvmf_rdma_qpair_destroy(rqpair); +} + + +static int +nvmf_rdma_disconnect(struct rdma_cm_event *evt) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + + if (evt->id == NULL) { + SPDK_ERRLOG("disconnect request: missing cm_id\n"); + return -1; + } + + qpair = evt->id->context; + if (qpair == NULL) { + SPDK_ERRLOG("disconnect request: no active connection\n"); + return -1; + } + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0); + + nvmf_rdma_start_disconnect(rqpair); + + return 0; +} + +#ifdef DEBUG +static const char *CM_EVENT_STR[] = { + "RDMA_CM_EVENT_ADDR_RESOLVED", + "RDMA_CM_EVENT_ADDR_ERROR", + "RDMA_CM_EVENT_ROUTE_RESOLVED", + "RDMA_CM_EVENT_ROUTE_ERROR", + "RDMA_CM_EVENT_CONNECT_REQUEST", + "RDMA_CM_EVENT_CONNECT_RESPONSE", + "RDMA_CM_EVENT_CONNECT_ERROR", + "RDMA_CM_EVENT_UNREACHABLE", + "RDMA_CM_EVENT_REJECTED", + "RDMA_CM_EVENT_ESTABLISHED", + "RDMA_CM_EVENT_DISCONNECTED", + "RDMA_CM_EVENT_DEVICE_REMOVAL", + "RDMA_CM_EVENT_MULTICAST_JOIN", + "RDMA_CM_EVENT_MULTICAST_ERROR", + "RDMA_CM_EVENT_ADDR_CHANGE", + "RDMA_CM_EVENT_TIMEWAIT_EXIT" +}; +#endif /* DEBUG */ + +static void +nvmf_rdma_disconnect_qpairs_on_port(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_port *port) +{ + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *rpoller; + struct spdk_nvmf_rdma_qpair *rqpair; + + TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) { + TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { + TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) { + if (rqpair->listen_id == port->id) { + nvmf_rdma_start_disconnect(rqpair); + } + } + } + } +} + +static bool +nvmf_rdma_handle_cm_event_addr_change(struct spdk_nvmf_transport *transport, + struct rdma_cm_event *event) +{ + const struct spdk_nvme_transport_id *trid; + struct spdk_nvmf_rdma_port *port; + struct spdk_nvmf_rdma_transport *rtransport; + bool event_acked = false; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + TAILQ_FOREACH(port, &rtransport->ports, link) { + if (port->id == event->id) { + SPDK_ERRLOG("ADDR_CHANGE: IP %s:%s migrated\n", port->trid->traddr, port->trid->trsvcid); + rdma_ack_cm_event(event); + event_acked = true; + trid = port->trid; + break; + } + } + + if (event_acked) { + nvmf_rdma_disconnect_qpairs_on_port(rtransport, port); + + nvmf_rdma_stop_listen(transport, trid); + nvmf_rdma_listen(transport, trid); + } + + return event_acked; +} + +static void +nvmf_rdma_handle_cm_event_port_removal(struct spdk_nvmf_transport *transport, + struct rdma_cm_event *event) +{ + struct spdk_nvmf_rdma_port *port; + struct spdk_nvmf_rdma_transport *rtransport; + + port = event->id->context; + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + SPDK_NOTICELOG("Port %s:%s is being removed\n", port->trid->traddr, port->trid->trsvcid); + + nvmf_rdma_disconnect_qpairs_on_port(rtransport, port); + + rdma_ack_cm_event(event); + + while 
(spdk_nvmf_transport_stop_listen(transport, port->trid) == 0) { + ; + } +} + +static void +nvmf_process_cm_event(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct rdma_cm_event *event; + int rc; + bool event_acked; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + if (rtransport->event_channel == NULL) { + return; + } + + while (1) { + event_acked = false; + rc = rdma_get_cm_event(rtransport->event_channel, &event); + if (rc) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); + } + break; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); + + spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event); + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + case RDMA_CM_EVENT_ROUTE_ERROR: + /* No action required. The target never attempts to resolve routes. */ + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + rc = nvmf_rdma_connect(transport, event); + if (rc < 0) { + SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); + break; + } + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + /* The target never initiates a new connection. So this will not occur. */ + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + /* Can this happen? The docs say it can, but not sure what causes it. */ + break; + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + /* These only occur on the client side. */ + break; + case RDMA_CM_EVENT_ESTABLISHED: + /* TODO: Should we be waiting for this event anywhere? */ + break; + case RDMA_CM_EVENT_DISCONNECTED: + rc = nvmf_rdma_disconnect(event); + if (rc < 0) { + SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); + break; + } + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* In case of device removal, kernel IB part triggers IBV_EVENT_DEVICE_FATAL + * which triggers RDMA_CM_EVENT_DEVICE_REMOVAL on all cma_id’s. + * Once these events are sent to SPDK, we should release all IB resources and + * don't make attempts to call any ibv_query/modify/create functions. We can only call + * ibv_destory* functions to release user space memory allocated by IB. All kernel + * resources are already cleaned. */ + if (event->id->qp) { + /* If rdma_cm event has a valid `qp` pointer then the event refers to the + * corresponding qpair. Otherwise the event refers to a listening device */ + rc = nvmf_rdma_disconnect(event); + if (rc < 0) { + SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); + break; + } + } else { + nvmf_rdma_handle_cm_event_port_removal(transport, event); + event_acked = true; + } + break; + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + /* Multicast is not used */ + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + event_acked = nvmf_rdma_handle_cm_event_addr_change(transport, event); + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + /* For now, do nothing. The target never re-uses queue pairs. 
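+ * There is therefore nothing to clean up when one of them exits the time-wait state.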
*/ + break; + default: + SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); + break; + } + if (!event_acked) { + rdma_ack_cm_event(event); + } + } +} + +static void +nvmf_rdma_handle_qp_fatal(struct spdk_nvmf_rdma_qpair *rqpair) +{ + nvmf_rdma_update_ibv_state(rqpair); + nvmf_rdma_start_disconnect(rqpair); +} + +static void +nvmf_rdma_handle_last_wqe_reached(struct spdk_nvmf_rdma_qpair *rqpair) +{ + rqpair->last_wqe_reached = true; + nvmf_rdma_destroy_drained_qpair(rqpair); +} + +static void +nvmf_rdma_handle_sq_drained(struct spdk_nvmf_rdma_qpair *rqpair) +{ + nvmf_rdma_start_disconnect(rqpair); +} + +static void +nvmf_rdma_qpair_process_ibv_event(void *ctx) +{ + struct spdk_nvmf_rdma_ibv_event_ctx *event_ctx = ctx; + + if (event_ctx->rqpair) { + STAILQ_REMOVE(&event_ctx->rqpair->ibv_events, event_ctx, spdk_nvmf_rdma_ibv_event_ctx, link); + if (event_ctx->cb_fn) { + event_ctx->cb_fn(event_ctx->rqpair); + } + } + free(event_ctx); +} + +static int +nvmf_rdma_send_qpair_async_event(struct spdk_nvmf_rdma_qpair *rqpair, + spdk_nvmf_rdma_qpair_ibv_event fn) +{ + struct spdk_nvmf_rdma_ibv_event_ctx *ctx; + struct spdk_thread *thr = NULL; + int rc; + + if (rqpair->qpair.group) { + thr = rqpair->qpair.group->thread; + } else if (rqpair->destruct_channel) { + thr = spdk_io_channel_get_thread(rqpair->destruct_channel); + } + + if (!thr) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "rqpair %p has no thread\n", rqpair); + return -EINVAL; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + return -ENOMEM; + } + + ctx->rqpair = rqpair; + ctx->cb_fn = fn; + STAILQ_INSERT_TAIL(&rqpair->ibv_events, ctx, link); + + rc = spdk_thread_send_msg(thr, nvmf_rdma_qpair_process_ibv_event, ctx); + if (rc) { + STAILQ_REMOVE(&rqpair->ibv_events, ctx, spdk_nvmf_rdma_ibv_event_ctx, link); + free(ctx); + } + + return rc; +} + +static void +nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) +{ + int rc; + struct spdk_nvmf_rdma_qpair *rqpair = NULL; + struct ibv_async_event event; + + rc = ibv_get_async_event(device->context, &event); + + if (rc) { + SPDK_ERRLOG("Failed to get async_event (%d): %s\n", + errno, spdk_strerror(errno)); + return; + } + + switch (event.event_type) { + case IBV_EVENT_QP_FATAL: + rqpair = event.element.qp->qp_context; + SPDK_ERRLOG("Fatal event received for rqpair %p\n", rqpair); + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + rc = nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_qp_fatal); + if (rc) { + SPDK_WARNLOG("Failed to send QP_FATAL event. rqpair %p, err %d\n", rqpair, rc); + nvmf_rdma_handle_qp_fatal(rqpair); + } + break; + case IBV_EVENT_QP_LAST_WQE_REACHED: + /* This event only occurs for shared receive queues. */ + rqpair = event.element.qp->qp_context; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last WQE reached event received for rqpair %p\n", rqpair); + rc = nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_last_wqe_reached); + if (rc) { + SPDK_WARNLOG("Failed to send LAST_WQE_REACHED event. rqpair %p, err %d\n", rqpair, rc); + rqpair->last_wqe_reached = true; + } + break; + case IBV_EVENT_SQ_DRAINED: + /* This event occurs frequently in both error and non-error states. + * Check if the qpair is in an error state before sending a message. 
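+ * Only a drain observed while the qpair is already in the error state (checked below via nvmf_rdma_update_ibv_state()) triggers the disconnect path.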
*/ + rqpair = event.element.qp->qp_context; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last sq drained event received for rqpair %p\n", rqpair); + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + if (nvmf_rdma_update_ibv_state(rqpair) == IBV_QPS_ERR) { + rc = nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_sq_drained); + if (rc) { + SPDK_WARNLOG("Failed to send SQ_DRAINED event. rqpair %p, err %d\n", rqpair, rc); + nvmf_rdma_handle_sq_drained(rqpair); + } + } + break; + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + SPDK_NOTICELOG("Async event: %s\n", + ibv_event_type_str(event.event_type)); + rqpair = event.element.qp->qp_context; + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + nvmf_rdma_update_ibv_state(rqpair); + break; + case IBV_EVENT_CQ_ERR: + case IBV_EVENT_DEVICE_FATAL: + case IBV_EVENT_PORT_ACTIVE: + case IBV_EVENT_PORT_ERR: + case IBV_EVENT_LID_CHANGE: + case IBV_EVENT_PKEY_CHANGE: + case IBV_EVENT_SM_CHANGE: + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + case IBV_EVENT_CLIENT_REREGISTER: + case IBV_EVENT_GID_CHANGE: + default: + SPDK_NOTICELOG("Async event: %s\n", + ibv_event_type_str(event.event_type)); + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type); + break; + } + ibv_ack_async_event(&event); +} + +static uint32_t +nvmf_rdma_accept(struct spdk_nvmf_transport *transport) +{ + int nfds, i = 0; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device, *tmp; + uint32_t count; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + count = nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); + + if (nfds <= 0) { + return 0; + } + + /* The first poll descriptor is RDMA CM event */ + if (rtransport->poll_fds[i++].revents & POLLIN) { + nvmf_process_cm_event(transport); + nfds--; + } + + if (nfds == 0) { + return count; + } + + /* Second and subsequent poll descriptors are IB async events */ + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { + if (rtransport->poll_fds[i++].revents & POLLIN) { + nvmf_process_ib_event(device); + nfds--; + } + } + /* check all flagged fd's have been served */ + assert(nfds == 0); + + return count; +} + +static void +nvmf_rdma_cdata_init(struct spdk_nvmf_transport *transport, struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr_data *cdata) +{ + cdata->nvmf_specific.msdbd = SPDK_NVMF_MAX_SGL_ENTRIES; + + /* Disable in-capsule data transfer for RDMA controller when dif_insert_or_strip is enabled + since in-capsule data only works with NVME drives that support SGL memory layout */ + if (transport->opts.dif_insert_or_strip) { + cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16; + } +} + +static void +nvmf_rdma_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + entry->trtype = SPDK_NVMF_TRTYPE_RDMA; + entry->adrfam = trid->adrfam; + entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED; + + spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); + spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); + + entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; + entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; + 
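+ /* The remaining TSAS field advertises RDMA CM as the connection management service, matching the rdma_cm listener set up in nvmf_rdma_listen(). */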
entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; +} + +static void +nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group); + +static struct spdk_nvmf_transport_poll_group * +nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *poller; + struct spdk_nvmf_rdma_device *device; + struct ibv_srq_init_attr srq_init_attr; + struct spdk_nvmf_rdma_resource_opts opts; + int num_cqe; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + rgroup = calloc(1, sizeof(*rgroup)); + if (!rgroup) { + return NULL; + } + + TAILQ_INIT(&rgroup->pollers); + STAILQ_INIT(&rgroup->retired_bufs); + + pthread_mutex_lock(&rtransport->lock); + TAILQ_FOREACH(device, &rtransport->devices, link) { + poller = calloc(1, sizeof(*poller)); + if (!poller) { + SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); + nvmf_rdma_poll_group_destroy(&rgroup->group); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + + poller->device = device; + poller->group = rgroup; + + TAILQ_INIT(&poller->qpairs); + STAILQ_INIT(&poller->qpairs_pending_send); + STAILQ_INIT(&poller->qpairs_pending_recv); + + TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); + if (transport->opts.no_srq == false && device->num_srq < device->attr.max_srq) { + poller->max_srq_depth = transport->opts.max_srq_depth; + + device->num_srq++; + memset(&srq_init_attr, 0, sizeof(struct ibv_srq_init_attr)); + srq_init_attr.attr.max_wr = poller->max_srq_depth; + srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE); + poller->srq = ibv_create_srq(device->pd, &srq_init_attr); + if (!poller->srq) { + SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno); + nvmf_rdma_poll_group_destroy(&rgroup->group); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + + opts.qp = poller->srq; + opts.pd = device->pd; + opts.qpair = NULL; + opts.shared = true; + opts.max_queue_depth = poller->max_srq_depth; + opts.in_capsule_data_size = transport->opts.in_capsule_data_size; + + poller->resources = nvmf_rdma_resources_create(&opts); + if (!poller->resources) { + SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n"); + nvmf_rdma_poll_group_destroy(&rgroup->group); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + } + + /* + * When using an srq, we can limit the completion queue at startup. + * The following formula represents the calculation: + * num_cqe = num_recv + num_data_wr + num_send_wr. 
+ * where num_recv=num_data_wr=and num_send_wr=poller->max_srq_depth + */ + if (poller->srq) { + num_cqe = poller->max_srq_depth * 3; + } else { + num_cqe = DEFAULT_NVMF_RDMA_CQ_SIZE; + } + + poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0); + if (!poller->cq) { + SPDK_ERRLOG("Unable to create completion queue\n"); + nvmf_rdma_poll_group_destroy(&rgroup->group); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + poller->num_cqe = num_cqe; + } + + TAILQ_INSERT_TAIL(&rtransport->poll_groups, rgroup, link); + if (rtransport->conn_sched.next_admin_pg == NULL) { + rtransport->conn_sched.next_admin_pg = rgroup; + rtransport->conn_sched.next_io_pg = rgroup; + } + + pthread_mutex_unlock(&rtransport->lock); + return &rgroup->group; +} + +static struct spdk_nvmf_transport_poll_group * +nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group **pg; + struct spdk_nvmf_transport_poll_group *result; + + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + + pthread_mutex_lock(&rtransport->lock); + + if (TAILQ_EMPTY(&rtransport->poll_groups)) { + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + + if (qpair->qid == 0) { + pg = &rtransport->conn_sched.next_admin_pg; + } else { + pg = &rtransport->conn_sched.next_io_pg; + } + + assert(*pg != NULL); + + result = &(*pg)->group; + + *pg = TAILQ_NEXT(*pg, link); + if (*pg == NULL) { + *pg = TAILQ_FIRST(&rtransport->poll_groups); + } + + pthread_mutex_unlock(&rtransport->lock); + + return result; +} + +static void +nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_rdma_poll_group *rgroup, *next_rgroup; + struct spdk_nvmf_rdma_poller *poller, *tmp; + struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; + struct spdk_nvmf_transport_pg_cache_buf *buf, *tmp_buf; + struct spdk_nvmf_rdma_transport *rtransport; + + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + if (!rgroup) { + return; + } + + /* free all retired buffers back to the transport so we don't short the mempool. */ + STAILQ_FOREACH_SAFE(buf, &rgroup->retired_bufs, link, tmp_buf) { + STAILQ_REMOVE(&rgroup->retired_bufs, buf, spdk_nvmf_transport_pg_cache_buf, link); + assert(group->transport != NULL); + spdk_mempool_put(group->transport->data_buf_pool, buf); + } + + TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { + TAILQ_REMOVE(&rgroup->pollers, poller, link); + + TAILQ_FOREACH_SAFE(qpair, &poller->qpairs, link, tmp_qpair) { + nvmf_rdma_qpair_destroy(qpair); + } + + if (poller->srq) { + if (poller->resources) { + nvmf_rdma_resources_destroy(poller->resources); + } + ibv_destroy_srq(poller->srq); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Destroyed RDMA shared queue %p\n", poller->srq); + } + + if (poller->cq) { + ibv_destroy_cq(poller->cq); + } + + free(poller); + } + + if (rgroup->group.transport == NULL) { + /* Transport can be NULL when nvmf_rdma_poll_group_create() + * calls this function directly in a failure path. 
*/ + free(rgroup); + return; + } + + rtransport = SPDK_CONTAINEROF(rgroup->group.transport, struct spdk_nvmf_rdma_transport, transport); + + pthread_mutex_lock(&rtransport->lock); + next_rgroup = TAILQ_NEXT(rgroup, link); + TAILQ_REMOVE(&rtransport->poll_groups, rgroup, link); + if (next_rgroup == NULL) { + next_rgroup = TAILQ_FIRST(&rtransport->poll_groups); + } + if (rtransport->conn_sched.next_admin_pg == rgroup) { + rtransport->conn_sched.next_admin_pg = next_rgroup; + } + if (rtransport->conn_sched.next_io_pg == rgroup) { + rtransport->conn_sched.next_io_pg = next_rgroup; + } + pthread_mutex_unlock(&rtransport->lock); + + free(rgroup); +} + +static void +nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair) +{ + if (rqpair->cm_id != NULL) { + nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); + } + nvmf_rdma_qpair_destroy(rqpair); +} + +static int +nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poller *poller; + int rc; + + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + device = rqpair->device; + + TAILQ_FOREACH(poller, &rgroup->pollers, link) { + if (poller->device == device) { + break; + } + } + + if (!poller) { + SPDK_ERRLOG("No poller found for device.\n"); + return -1; + } + + TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); + rqpair->poller = poller; + rqpair->srq = rqpair->poller->srq; + + rc = nvmf_rdma_qpair_initialize(qpair); + if (rc < 0) { + SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); + return -1; + } + + rc = nvmf_rdma_event_accept(rqpair->cm_id, rqpair); + if (rc) { + /* Try to reject, but we probably can't */ + nvmf_rdma_qpair_reject_connection(rqpair); + return -1; + } + + nvmf_rdma_update_ibv_state(rqpair); + + return 0; +} + +static int +nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + assert(group->transport->tgt != NULL); + + rqpair->destruct_channel = spdk_get_io_channel(group->transport->tgt); + + if (!rqpair->destruct_channel) { + SPDK_WARNLOG("failed to get io_channel, qpair %p\n", qpair); + return 0; + } + + /* Sanity check that we get io_channel on the correct thread */ + if (qpair->group) { + assert(qpair->group->thread == spdk_io_channel_get_thread(rqpair->destruct_channel)); + } + + return 0; +} + +static int +nvmf_rdma_request_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, + struct spdk_nvmf_rdma_qpair, qpair); + + /* + * AER requests are freed when a qpair is destroyed. The recv corresponding to that request + * needs to be returned to the shared receive queue or the poll group will eventually be + * starved of RECV structures. 
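+ * That is why the recv WR is re-posted to the SRQ below before the request itself is released.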
+ */ + if (rqpair->srq && rdma_req->recv) { + int rc; + struct ibv_recv_wr *bad_recv_wr; + + rc = ibv_post_srq_recv(rqpair->srq, &rdma_req->recv->wr, &bad_recv_wr); + if (rc) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + } + } + + _nvmf_rdma_request_free(rdma_req, rtransport); + return 0; +} + +static int +nvmf_rdma_request_complete(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, + struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, + struct spdk_nvmf_rdma_qpair, qpair); + + if (rqpair->ibv_state != IBV_QPS_ERR) { + /* The connection is alive, so process the request as normal */ + rdma_req->state = RDMA_REQUEST_STATE_EXECUTED; + } else { + /* The connection is dead. Move the request directly to the completed state. */ + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + } + + nvmf_rdma_request_process(rtransport, rdma_req); + + return 0; +} + +static int +nvmf_rdma_destroy_defunct_qpair(void *ctx) +{ + struct spdk_nvmf_rdma_qpair *rqpair = ctx; + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, + struct spdk_nvmf_rdma_transport, transport); + + SPDK_INFOLOG(SPDK_LOG_RDMA, "QP#%d hasn't been drained as expected, manually destroy it\n", + rqpair->qpair.qid); + + nvmf_rdma_qpair_process_pending(rtransport, rqpair, true); + nvmf_rdma_qpair_destroy(rqpair); + + return SPDK_POLLER_BUSY; +} + +static void +nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + if (rqpair->disconnect_flags & RDMA_QP_DISCONNECTING) { + return; + } + + rqpair->disconnect_flags |= RDMA_QP_DISCONNECTING; + + /* This happens only when the qpair is disconnected before + * it is added to the poll group. Since there is no poll group, + * the RDMA qp has not been initialized yet and the RDMA CM + * event has not yet been acknowledged, so we need to reject it. 
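+ * nvmf_rdma_qpair_reject_connection() below both sends the reject and destroys the qpair.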
+ */ + if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) { + nvmf_rdma_qpair_reject_connection(rqpair); + return; + } + + if (rqpair->rdma_qp) { + spdk_rdma_qp_disconnect(rqpair->rdma_qp); + } + + rqpair->destruct_poller = SPDK_POLLER_REGISTER(nvmf_rdma_destroy_defunct_qpair, (void *)rqpair, + NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US); +} + +static struct spdk_nvmf_rdma_qpair * +get_rdma_qpair_from_wc(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_wc *wc) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + /* @todo: improve QP search */ + TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) { + if (wc->qp_num == rqpair->rdma_qp->qp->qp_num) { + return rqpair; + } + } + SPDK_ERRLOG("Didn't find QP with qp_num %u\n", wc->qp_num); + return NULL; +} + +#ifdef DEBUG +static int +nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req) +{ + return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST || + rdma_req->state == RDMA_REQUEST_STATE_COMPLETING; +} +#endif + +static void +_poller_reset_failed_recvs(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_recv_wr *bad_recv_wr, + int rc) +{ + struct spdk_nvmf_rdma_recv *rdma_recv; + struct spdk_nvmf_rdma_wr *bad_rdma_wr; + + SPDK_ERRLOG("Failed to post a recv for the poller %p with errno %d\n", rpoller, -rc); + while (bad_recv_wr != NULL) { + bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_recv_wr->wr_id; + rdma_recv = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); + + rdma_recv->qpair->current_recv_depth++; + bad_recv_wr = bad_recv_wr->next; + SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rdma_recv->qpair, -rc); + nvmf_rdma_start_disconnect(rdma_recv->qpair); + } +} + +static void +_qp_reset_failed_recvs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *bad_recv_wr, int rc) +{ + SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rqpair, -rc); + while (bad_recv_wr != NULL) { + bad_recv_wr = bad_recv_wr->next; + rqpair->current_recv_depth++; + } + nvmf_rdma_start_disconnect(rqpair); +} + +static void +_poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_poller *rpoller) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct ibv_recv_wr *bad_recv_wr; + int rc; + + if (rpoller->srq) { + if (rpoller->resources->recvs_to_post.first != NULL) { + rc = ibv_post_srq_recv(rpoller->srq, rpoller->resources->recvs_to_post.first, &bad_recv_wr); + if (rc) { + _poller_reset_failed_recvs(rpoller, bad_recv_wr, rc); + } + rpoller->resources->recvs_to_post.first = NULL; + rpoller->resources->recvs_to_post.last = NULL; + } + } else { + while (!STAILQ_EMPTY(&rpoller->qpairs_pending_recv)) { + rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_recv); + assert(rqpair->resources->recvs_to_post.first != NULL); + rc = ibv_post_recv(rqpair->rdma_qp->qp, rqpair->resources->recvs_to_post.first, &bad_recv_wr); + if (rc) { + _qp_reset_failed_recvs(rqpair, bad_recv_wr, rc); + } + rqpair->resources->recvs_to_post.first = NULL; + rqpair->resources->recvs_to_post.last = NULL; + STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_recv, recv_link); + } + } +} + +static void +_qp_reset_failed_sends(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *bad_wr, int rc) +{ + struct spdk_nvmf_rdma_wr *bad_rdma_wr; + struct spdk_nvmf_rdma_request *prev_rdma_req = NULL, *cur_rdma_req = NULL; + + SPDK_ERRLOG("Failed to post a send for the qpair %p with errno %d\n", rqpair, -rc); + for (; bad_wr != NULL; bad_wr = bad_wr->next) { + 
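+ /* Walk the chain starting at the first send WR that failed to post: each WR gives back one unit of send (and, for RDMA_READ, read) depth, and its owning request is transitioned out of its in-flight state. A request's data WRs and response WR are contiguous in this list, so a request already handled via an earlier WR is skipped. */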
bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_wr->wr_id; + assert(rqpair->current_send_depth > 0); + rqpair->current_send_depth--; + switch (bad_rdma_wr->type) { + case RDMA_WR_TYPE_DATA: + cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); + if (bad_wr->opcode == IBV_WR_RDMA_READ) { + assert(rqpair->current_read_depth > 0); + rqpair->current_read_depth--; + } + break; + case RDMA_WR_TYPE_SEND: + cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); + break; + default: + SPDK_ERRLOG("Found a RECV in the list of pending SEND requests for qpair %p\n", rqpair); + prev_rdma_req = cur_rdma_req; + continue; + } + + if (prev_rdma_req == cur_rdma_req) { + /* this request was handled by an earlier wr. i.e. we were performing an nvme read. */ + /* We only have to check against prev_wr since each requests wrs are contiguous in this list. */ + continue; + } + + switch (cur_rdma_req->state) { + case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + cur_rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + cur_rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + break; + case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: + case RDMA_REQUEST_STATE_COMPLETING: + cur_rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + break; + default: + SPDK_ERRLOG("Found a request in a bad state %d when draining pending SEND requests for qpair %p\n", + cur_rdma_req->state, rqpair); + continue; + } + + nvmf_rdma_request_process(rtransport, cur_rdma_req); + prev_rdma_req = cur_rdma_req; + } + + if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { + /* Disconnect the connection. */ + nvmf_rdma_start_disconnect(rqpair); + } + +} + +static void +_poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_poller *rpoller) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct ibv_send_wr *bad_wr = NULL; + int rc; + + while (!STAILQ_EMPTY(&rpoller->qpairs_pending_send)) { + rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_send); + rc = spdk_rdma_qp_flush_send_wrs(rqpair->rdma_qp, &bad_wr); + + /* bad wr always points to the first wr that failed. */ + if (rc) { + _qp_reset_failed_sends(rtransport, rqpair, bad_wr, rc); + } + STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_send, send_link); + } +} + +static int +nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_poller *rpoller) +{ + struct ibv_wc wc[32]; + struct spdk_nvmf_rdma_wr *rdma_wr; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_rdma_recv *rdma_recv; + struct spdk_nvmf_rdma_qpair *rqpair; + int reaped, i; + int count = 0; + bool error = false; + uint64_t poll_tsc = spdk_get_ticks(); + + /* Poll for completing operations. */ + reaped = ibv_poll_cq(rpoller->cq, 32, wc); + if (reaped < 0) { + SPDK_ERRLOG("Error polling CQ! (%d): %s\n", + errno, spdk_strerror(errno)); + return -1; + } + + rpoller->stat.polls++; + rpoller->stat.completions += reaped; + + for (i = 0; i < reaped; i++) { + + rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id; + + switch (rdma_wr->type) { + case RDMA_WR_TYPE_SEND: + rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + if (!wc[i].status) { + count++; + assert(wc[i].opcode == IBV_WC_SEND); + assert(nvmf_rdma_req_is_completing(rdma_req)); + } + + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + /* RDMA_WRITE operation completed. 
+1 since it was chained with rsp WR */ + rqpair->current_send_depth -= rdma_req->num_outstanding_data_wr + 1; + rdma_req->num_outstanding_data_wr = 0; + + nvmf_rdma_request_process(rtransport, rdma_req); + break; + case RDMA_WR_TYPE_RECV: + /* rdma_recv->qpair will be invalid if using an SRQ. In that case we have to get the qpair from the wc. */ + rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr); + if (rpoller->srq != NULL) { + rdma_recv->qpair = get_rdma_qpair_from_wc(rpoller, &wc[i]); + /* It is possible that there are still some completions for destroyed QP + * associated with SRQ. We just ignore these late completions and re-post + * receive WRs back to SRQ. + */ + if (spdk_unlikely(NULL == rdma_recv->qpair)) { + struct ibv_recv_wr *bad_wr; + int rc; + + rdma_recv->wr.next = NULL; + rc = ibv_post_srq_recv(rpoller->srq, + &rdma_recv->wr, + &bad_wr); + if (rc) { + SPDK_ERRLOG("Failed to re-post recv WR to SRQ, err %d\n", rc); + } + continue; + } + } + rqpair = rdma_recv->qpair; + + assert(rqpair != NULL); + if (!wc[i].status) { + assert(wc[i].opcode == IBV_WC_RECV); + if (rqpair->current_recv_depth >= rqpair->max_queue_depth) { + nvmf_rdma_start_disconnect(rqpair); + break; + } + } + + rdma_recv->wr.next = NULL; + rqpair->current_recv_depth++; + rdma_recv->receive_tsc = poll_tsc; + rpoller->stat.requests++; + STAILQ_INSERT_TAIL(&rqpair->resources->incoming_queue, rdma_recv, link); + break; + case RDMA_WR_TYPE_DATA: + rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(rdma_req->num_outstanding_data_wr > 0); + + rqpair->current_send_depth--; + rdma_req->num_outstanding_data_wr--; + if (!wc[i].status) { + assert(wc[i].opcode == IBV_WC_RDMA_READ); + rqpair->current_read_depth--; + /* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */ + if (rdma_req->num_outstanding_data_wr == 0) { + rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE; + nvmf_rdma_request_process(rtransport, rdma_req); + } + } else { + /* If the data transfer fails still force the queue into the error state, + * if we were performing an RDMA_READ, we need to force the request into a + * completed state since it wasn't linked to a send. However, in the RDMA_WRITE + * case, we should wait for the SEND to complete. */ + if (rdma_req->data.wr.opcode == IBV_WR_RDMA_READ) { + rqpair->current_read_depth--; + if (rdma_req->num_outstanding_data_wr == 0) { + rdma_req->state = RDMA_REQUEST_STATE_COMPLETED; + } + } + } + break; + default: + SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); + continue; + } + + /* Handle error conditions */ + if (wc[i].status) { + if ((rdma_wr->type == RDMA_WR_TYPE_RECV && !rpoller->srq)) { + /* When we don't use SRQ and close a qpair, we will receive completions with error + * status for all posted ibv_recv_wrs. This is expected and we don't want to log + * an error in that case. */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Error on CQ %p, request 0x%lu, type %d, status: (%d): %s\n", + rpoller->cq, wc[i].wr_id, rdma_wr->type, wc[i].status, ibv_wc_status_str(wc[i].status)); + } else { + SPDK_ERRLOG("Error on CQ %p, request 0x%lu, type %d, status: (%d): %s\n", + rpoller->cq, wc[i].wr_id, rdma_wr->type, wc[i].status, ibv_wc_status_str(wc[i].status)); + } + + error = true; + + if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) { + /* Disconnect the connection. 
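+ * A qpair that is already out of the ACTIVE state is instead handed to nvmf_rdma_destroy_drained_qpair() in the else branch.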
*/ + nvmf_rdma_start_disconnect(rqpair); + } else { + nvmf_rdma_destroy_drained_qpair(rqpair); + } + continue; + } + + nvmf_rdma_qpair_process_pending(rtransport, rqpair, false); + + if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + nvmf_rdma_destroy_drained_qpair(rqpair); + } + } + + if (error == true) { + return -1; + } + + /* submit outstanding work requests. */ + _poller_submit_recvs(rtransport, rpoller); + _poller_submit_sends(rtransport, rpoller); + + return count; +} + +static int +nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *rpoller; + int count, rc; + + rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + + count = 0; + TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { + rc = nvmf_rdma_poller_poll(rtransport, rpoller); + if (rc < 0) { + return rc; + } + count += rc; + } + + return count; +} + +static int +nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, + struct spdk_nvme_transport_id *trid, + bool peer) +{ + struct sockaddr *saddr; + uint16_t port; + + spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_RDMA); + + if (peer) { + saddr = rdma_get_peer_addr(id); + } else { + saddr = rdma_get_local_addr(id); + } + switch (saddr->sa_family) { + case AF_INET: { + struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr; + + trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; + inet_ntop(AF_INET, &saddr_in->sin_addr, + trid->traddr, sizeof(trid->traddr)); + if (peer) { + port = ntohs(rdma_get_dst_port(id)); + } else { + port = ntohs(rdma_get_src_port(id)); + } + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); + break; + } + case AF_INET6: { + struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr; + trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; + inet_ntop(AF_INET6, &saddr_in->sin6_addr, + trid->traddr, sizeof(trid->traddr)); + if (peer) { + port = ntohs(rdma_get_dst_port(id)); + } else { + port = ntohs(rdma_get_src_port(id)); + } + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); + break; + } + default: + return -1; + + } + + return 0; +} + +static int +nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true); +} + +static int +nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false); +} + +static int +nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false); +} + +void +spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) +{ + g_nvmf_hooks = *hooks; +} + +static void +nvmf_rdma_request_set_abort_status(struct spdk_nvmf_request *req, + struct spdk_nvmf_rdma_request *rdma_req_to_abort) +{ + rdma_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + rdma_req_to_abort->req.rsp->nvme_cpl.status.sc = 
SPDK_NVME_SC_ABORTED_BY_REQUEST; + + rdma_req_to_abort->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE; + + req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command was successfully aborted. */ +} + +static int +_nvmf_rdma_qpair_abort_request(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_rdma_request *rdma_req_to_abort = SPDK_CONTAINEROF( + req->req_to_abort, struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair, + struct spdk_nvmf_rdma_qpair, qpair); + int rc; + + spdk_poller_unregister(&req->poller); + + switch (rdma_req_to_abort->state) { + case RDMA_REQUEST_STATE_EXECUTING: + rc = nvmf_ctrlr_abort_request(req); + if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) { + return SPDK_POLLER_BUSY; + } + break; + + case RDMA_REQUEST_STATE_NEED_BUFFER: + STAILQ_REMOVE(&rqpair->poller->group->group.pending_buf_queue, + &rdma_req_to_abort->req, spdk_nvmf_request, buf_link); + + nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort); + break; + + case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING: + STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req_to_abort, + spdk_nvmf_rdma_request, state_link); + + nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort); + break; + + case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING: + STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req_to_abort, + spdk_nvmf_rdma_request, state_link); + + nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort); + break; + + case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + if (spdk_get_ticks() < req->timeout_tsc) { + req->poller = SPDK_POLLER_REGISTER(_nvmf_rdma_qpair_abort_request, req, 0); + return SPDK_POLLER_BUSY; + } + break; + + default: + break; + } + + spdk_nvmf_request_complete(req); + return SPDK_POLLER_BUSY; +} + +static void +nvmf_rdma_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_transport *transport; + uint16_t cid; + uint32_t i; + struct spdk_nvmf_rdma_request *rdma_req_to_abort = NULL; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + transport = &rtransport->transport; + + cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; + + for (i = 0; i < rqpair->max_queue_depth; i++) { + rdma_req_to_abort = &rqpair->resources->reqs[i]; + + if (rdma_req_to_abort->state != RDMA_REQUEST_STATE_FREE && + rdma_req_to_abort->req.cmd->nvme_cmd.cid == cid) { + break; + } + } + + if (rdma_req_to_abort == NULL) { + spdk_nvmf_request_complete(req); + return; + } + + req->req_to_abort = &rdma_req_to_abort->req; + req->timeout_tsc = spdk_get_ticks() + + transport->opts.abort_timeout_sec * spdk_get_ticks_hz(); + req->poller = NULL; + + _nvmf_rdma_qpair_abort_request(req); +} + +static int +nvmf_rdma_poll_group_get_stat(struct spdk_nvmf_tgt *tgt, + struct spdk_nvmf_transport_poll_group_stat **stat) +{ + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_transport_poll_group *tgroup; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *rpoller; + struct spdk_nvmf_rdma_device_stat *device_stat; + uint64_t num_devices = 0; + + if (tgt == NULL || stat == NULL) { + return -EINVAL; + } + + ch = spdk_get_io_channel(tgt); + group = spdk_io_channel_get_ctx(ch);; + spdk_put_io_channel(ch); + TAILQ_FOREACH(tgroup, 
&group->tgroups, link) { + if (SPDK_NVME_TRANSPORT_RDMA == tgroup->transport->ops->type) { + *stat = calloc(1, sizeof(struct spdk_nvmf_transport_poll_group_stat)); + if (!*stat) { + SPDK_ERRLOG("Failed to allocate memory for NVMf RDMA statistics\n"); + return -ENOMEM; + } + (*stat)->trtype = SPDK_NVME_TRANSPORT_RDMA; + + rgroup = SPDK_CONTAINEROF(tgroup, struct spdk_nvmf_rdma_poll_group, group); + /* Count devices to allocate enough memory */ + TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { + ++num_devices; + } + (*stat)->rdma.devices = calloc(num_devices, sizeof(struct spdk_nvmf_rdma_device_stat)); + if (!(*stat)->rdma.devices) { + SPDK_ERRLOG("Failed to allocate NVMf RDMA devices statistics\n"); + free(*stat); + return -ENOMEM; + } + + (*stat)->rdma.pending_data_buffer = rgroup->stat.pending_data_buffer; + (*stat)->rdma.num_devices = num_devices; + num_devices = 0; + TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { + device_stat = &(*stat)->rdma.devices[num_devices++]; + device_stat->name = ibv_get_device_name(rpoller->device->context->device); + device_stat->polls = rpoller->stat.polls; + device_stat->completions = rpoller->stat.completions; + device_stat->requests = rpoller->stat.requests; + device_stat->request_latency = rpoller->stat.request_latency; + device_stat->pending_free_request = rpoller->stat.pending_free_request; + device_stat->pending_rdma_read = rpoller->stat.pending_rdma_read; + device_stat->pending_rdma_write = rpoller->stat.pending_rdma_write; + } + return 0; + } + } + return -ENOENT; +} + +static void +nvmf_rdma_poll_group_free_stat(struct spdk_nvmf_transport_poll_group_stat *stat) +{ + if (stat) { + free(stat->rdma.devices); + } + free(stat); +} + +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { + .name = "RDMA", + .type = SPDK_NVME_TRANSPORT_RDMA, + .opts_init = nvmf_rdma_opts_init, + .create = nvmf_rdma_create, + .destroy = nvmf_rdma_destroy, + + .listen = nvmf_rdma_listen, + .stop_listen = nvmf_rdma_stop_listen, + .accept = nvmf_rdma_accept, + .cdata_init = nvmf_rdma_cdata_init, + + .listener_discover = nvmf_rdma_discover, + + .poll_group_create = nvmf_rdma_poll_group_create, + .get_optimal_poll_group = nvmf_rdma_get_optimal_poll_group, + .poll_group_destroy = nvmf_rdma_poll_group_destroy, + .poll_group_add = nvmf_rdma_poll_group_add, + .poll_group_remove = nvmf_rdma_poll_group_remove, + .poll_group_poll = nvmf_rdma_poll_group_poll, + + .req_free = nvmf_rdma_request_free, + .req_complete = nvmf_rdma_request_complete, + + .qpair_fini = nvmf_rdma_close_qpair, + .qpair_get_peer_trid = nvmf_rdma_qpair_get_peer_trid, + .qpair_get_local_trid = nvmf_rdma_qpair_get_local_trid, + .qpair_get_listen_trid = nvmf_rdma_qpair_get_listen_trid, + .qpair_abort_request = nvmf_rdma_qpair_abort_request, + + .poll_group_get_stat = nvmf_rdma_poll_group_get_stat, + .poll_group_free_stat = nvmf_rdma_poll_group_free_stat, +}; + +SPDK_NVMF_TRANSPORT_REGISTER(rdma, &spdk_nvmf_transport_rdma); +SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA) diff --git a/src/spdk/lib/nvmf/spdk_nvmf.map b/src/spdk/lib/nvmf/spdk_nvmf.map new file mode 100644 index 000000000..994e7437b --- /dev/null +++ b/src/spdk/lib/nvmf/spdk_nvmf.map @@ -0,0 +1,118 @@ +{ + global: + + # public functions in nvmf.h + spdk_nvmf_tgt_create; + spdk_nvmf_tgt_destroy; + spdk_nvmf_tgt_get_name; + spdk_nvmf_get_tgt; + spdk_nvmf_get_first_tgt; + spdk_nvmf_get_next_tgt; + spdk_nvmf_tgt_write_config_json; + spdk_nvmf_tgt_listen; + spdk_nvmf_tgt_stop_listen; + spdk_nvmf_tgt_accept; + spdk_nvmf_poll_group_create; + 
spdk_nvmf_get_optimal_poll_group; + spdk_nvmf_poll_group_destroy; + spdk_nvmf_poll_group_add; + spdk_nvmf_poll_group_get_stat; + spdk_nvmf_qpair_disconnect; + spdk_nvmf_qpair_get_peer_trid; + spdk_nvmf_qpair_get_local_trid; + spdk_nvmf_qpair_get_listen_trid; + spdk_nvmf_subsystem_create; + spdk_nvmf_subsystem_destroy; + spdk_nvmf_subsystem_start; + spdk_nvmf_subsystem_stop; + spdk_nvmf_subsystem_pause; + spdk_nvmf_subsystem_resume; + spdk_nvmf_tgt_find_subsystem; + spdk_nvmf_subsystem_get_first; + spdk_nvmf_subsystem_get_next; + spdk_nvmf_subsystem_add_host; + spdk_nvmf_subsystem_remove_host; + spdk_nvmf_subsystem_set_allow_any_host; + spdk_nvmf_subsystem_get_allow_any_host; + spdk_nvmf_subsystem_host_allowed; + spdk_nvmf_subsystem_get_first_host; + spdk_nvmf_subsystem_get_next_host; + spdk_nvmf_host_get_nqn; + spdk_nvmf_subsystem_add_listener; + spdk_nvmf_subsystem_remove_listener; + spdk_nvmf_subsystem_listener_allowed; + spdk_nvmf_subsystem_get_first_listener; + spdk_nvmf_subsystem_get_next_listener; + spdk_nvmf_subsystem_listener_get_trid; + spdk_nvmf_subsystem_allow_any_listener; + spdk_nvmf_subsytem_any_listener_allowed; + spdk_nvmf_ns_opts_get_defaults; + spdk_nvmf_subsystem_add_ns; + spdk_nvmf_subsystem_remove_ns; + spdk_nvmf_subsystem_get_first_ns; + spdk_nvmf_subsystem_get_next_ns; + spdk_nvmf_subsystem_get_ns; + spdk_nvmf_subsystem_get_max_namespaces; + spdk_nvmf_ns_get_id; + spdk_nvmf_ns_get_bdev; + spdk_nvmf_ns_get_opts; + spdk_nvmf_subsystem_get_sn; + spdk_nvmf_subsystem_set_sn; + spdk_nvmf_subsystem_get_mn; + spdk_nvmf_subsystem_set_mn; + spdk_nvmf_subsystem_get_nqn; + spdk_nvmf_subsystem_get_type; + spdk_nvmf_subsystem_get_max_nsid; + spdk_nvmf_transport_opts_init; + spdk_nvmf_transport_create; + spdk_nvmf_transport_destroy; + spdk_nvmf_tgt_get_transport; + spdk_nvmf_transport_get_first; + spdk_nvmf_transport_get_next; + spdk_nvmf_get_transport_opts; + spdk_nvmf_get_transport_type; + spdk_nvmf_get_transport_name; + spdk_nvmf_tgt_add_transport; + spdk_nvmf_transport_listen; + spdk_nvmf_transport_stop_listen; + spdk_nvmf_transport_poll_group_get_stat; + spdk_nvmf_transport_poll_group_free_stat; + spdk_nvmf_rdma_init_hooks; + + # public functions in nvmf_cmd.h + spdk_nvmf_ctrlr_identify_ctrlr; + spdk_nvmf_ctrlr_identify_ns; + spdk_nvmf_set_custom_admin_cmd_hdlr; + spdk_nvmf_set_passthru_admin_cmd; + spdk_nvmf_bdev_ctrlr_nvme_passthru_admin; + spdk_nvmf_request_get_bdev; + spdk_nvmf_request_get_ctrlr; + spdk_nvmf_request_get_subsystem; + spdk_nvmf_request_get_data; + spdk_nvmf_request_get_cmd; + spdk_nvmf_request_get_response; + spdk_nvmf_request_get_req_to_abort; + spdk_nvmf_bdev_ctrlr_abort_cmd; + + # public functions in nvmf_transport.h + spdk_nvmf_transport_register; + spdk_nvmf_tgt_new_qpair; + spdk_nvmf_ctrlr_connect; + spdk_nvmf_ctrlr_data_init; + spdk_nvmf_ctrlr_get_regs; + spdk_nvmf_request_free_buffers; + spdk_nvmf_request_get_buffers; + spdk_nvmf_request_get_buffers_multi; + spdk_nvmf_request_get_dif_ctx; + spdk_nvmf_request_exec; + spdk_nvmf_request_exec_fabrics; + spdk_nvmf_request_free; + spdk_nvmf_request_complete; + spdk_nvmf_ctrlr_get_subsystem; + spdk_nvmf_ctrlr_get_id; + spdk_nvmf_req_get_xfer; + spdk_nvmf_poll_group_remove; + + + local: *; +}; diff --git a/src/spdk/lib/nvmf/subsystem.c b/src/spdk/lib/nvmf/subsystem.c new file mode 100644 index 000000000..ebe8d9a8e --- /dev/null +++ b/src/spdk/lib/nvmf/subsystem.c @@ -0,0 +1,2515 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. 
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/nvmf_spec.h" +#include "spdk/uuid.h" +#include "spdk/json.h" +#include "spdk/file.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk_internal/utf.h" + +#define MODEL_NUMBER_DEFAULT "SPDK bdev Controller" + +/* + * States for parsing valid domains in NQNs according to RFC 1034 + */ +enum spdk_nvmf_nqn_domain_states { + /* First character of a domain must be a letter */ + SPDK_NVMF_DOMAIN_ACCEPT_LETTER = 0, + + /* Subsequent characters can be any of letter, digit, or hyphen */ + SPDK_NVMF_DOMAIN_ACCEPT_LDH = 1, + + /* A domain label must end with either a letter or digit */ + SPDK_NVMF_DOMAIN_ACCEPT_ANY = 2 +}; + +/* Returns true if is a valid ASCII string as defined by the NVMe spec */ +static bool +nvmf_valid_ascii_string(const void *buf, size_t size) +{ + const uint8_t *str = buf; + size_t i; + + for (i = 0; i < size; i++) { + if (str[i] < 0x20 || str[i] > 0x7E) { + return false; + } + } + + return true; +} + +static bool +nvmf_valid_nqn(const char *nqn) +{ + size_t len; + struct spdk_uuid uuid_value; + uint32_t i; + int bytes_consumed; + uint32_t domain_label_length; + char *reverse_domain_end; + uint32_t reverse_domain_end_index; + enum spdk_nvmf_nqn_domain_states domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LETTER; + + /* Check for length requirements */ + len = strlen(nqn); + if (len > SPDK_NVMF_NQN_MAX_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": length %zu > max %d\n", nqn, len, SPDK_NVMF_NQN_MAX_LEN); + return false; + } + + /* The nqn must be at least as long as SPDK_NVMF_NQN_MIN_LEN to contain the necessary prefix. 
*/ + if (len < SPDK_NVMF_NQN_MIN_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": length %zu < min %d\n", nqn, len, SPDK_NVMF_NQN_MIN_LEN); + return false; + } + + /* Check for discovery controller nqn */ + if (!strcmp(nqn, SPDK_NVMF_DISCOVERY_NQN)) { + return true; + } + + /* Check for equality with the generic nqn structure of the form "nqn.2014-08.org.nvmexpress:uuid:11111111-2222-3333-4444-555555555555" */ + if (!strncmp(nqn, SPDK_NVMF_NQN_UUID_PRE, SPDK_NVMF_NQN_UUID_PRE_LEN)) { + if (len != SPDK_NVMF_NQN_UUID_PRE_LEN + SPDK_NVMF_UUID_STRING_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": uuid is not the correct length\n", nqn); + return false; + } + + if (spdk_uuid_parse(&uuid_value, &nqn[SPDK_NVMF_NQN_UUID_PRE_LEN])) { + SPDK_ERRLOG("Invalid NQN \"%s\": uuid is not formatted correctly\n", nqn); + return false; + } + return true; + } + + /* If the nqn does not match the uuid structure, the next several checks validate the form "nqn.yyyy-mm.reverse.domain:user-string" */ + + if (strncmp(nqn, "nqn.", 4) != 0) { + SPDK_ERRLOG("Invalid NQN \"%s\": NQN must begin with \"nqn.\".\n", nqn); + return false; + } + + /* Check for yyyy-mm. */ + if (!(isdigit(nqn[4]) && isdigit(nqn[5]) && isdigit(nqn[6]) && isdigit(nqn[7]) && + nqn[8] == '-' && isdigit(nqn[9]) && isdigit(nqn[10]) && nqn[11] == '.')) { + SPDK_ERRLOG("Invalid date code in NQN \"%s\"\n", nqn); + return false; + } + + reverse_domain_end = strchr(nqn, ':'); + if (reverse_domain_end != NULL && (reverse_domain_end_index = reverse_domain_end - nqn) < len - 1) { + } else { + SPDK_ERRLOG("Invalid NQN \"%s\". NQN must contain user specified name with a ':' as a prefix.\n", + nqn); + return false; + } + + /* Check for valid reverse domain */ + domain_label_length = 0; + for (i = 12; i < reverse_domain_end_index; i++) { + if (domain_label_length > SPDK_DOMAIN_LABEL_MAX_LEN) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". At least one Label is too long.\n", nqn); + return false; + } + + switch (domain_state) { + + case SPDK_NVMF_DOMAIN_ACCEPT_LETTER: { + if (isalpha(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must start with a letter.\n", nqn); + return false; + } + } + + case SPDK_NVMF_DOMAIN_ACCEPT_LDH: { + if (isalpha(nqn[i]) || isdigit(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else if (nqn[i] == '-') { + if (i == reverse_domain_end_index - 1) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LDH; + domain_label_length++; + break; + } else if (nqn[i] == '.') { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only [a-z,A-Z,0-9,'-','.'].\n", + nqn); + return false; + } + } + + case SPDK_NVMF_DOMAIN_ACCEPT_ANY: { + if (isalpha(nqn[i]) || isdigit(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else if (nqn[i] == '-') { + if (i == reverse_domain_end_index - 1) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". 
Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LDH; + domain_label_length++; + break; + } else if (nqn[i] == '.') { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LETTER; + domain_label_length = 0; + break; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only [a-z,A-Z,0-9,'-','.'].\n", + nqn); + return false; + } + } + } + } + + i = reverse_domain_end_index + 1; + while (i < len) { + bytes_consumed = utf8_valid(&nqn[i], &nqn[len]); + if (bytes_consumed <= 0) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only valid utf-8.\n", nqn); + return false; + } + + i += bytes_consumed; + } + return true; +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_create(struct spdk_nvmf_tgt *tgt, + const char *nqn, + enum spdk_nvmf_subtype type, + uint32_t num_ns) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + if (spdk_nvmf_tgt_find_subsystem(tgt, nqn)) { + SPDK_ERRLOG("Subsystem NQN '%s' already exists\n", nqn); + return NULL; + } + + if (!nvmf_valid_nqn(nqn)) { + return NULL; + } + + if (type == SPDK_NVMF_SUBTYPE_DISCOVERY && num_ns != 0) { + SPDK_ERRLOG("Discovery subsystem cannot have namespaces.\n"); + return NULL; + } + + /* Find a free subsystem id (sid) */ + for (sid = 0; sid < tgt->max_subsystems; sid++) { + if (tgt->subsystems[sid] == NULL) { + break; + } + } + if (sid >= tgt->max_subsystems) { + return NULL; + } + + subsystem = calloc(1, sizeof(struct spdk_nvmf_subsystem)); + if (subsystem == NULL) { + return NULL; + } + + subsystem->thread = spdk_get_thread(); + subsystem->state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + subsystem->tgt = tgt; + subsystem->id = sid; + subsystem->subtype = type; + subsystem->max_nsid = num_ns; + subsystem->max_allowed_nsid = num_ns; + subsystem->next_cntlid = 0; + snprintf(subsystem->subnqn, sizeof(subsystem->subnqn), "%s", nqn); + TAILQ_INIT(&subsystem->listeners); + TAILQ_INIT(&subsystem->hosts); + TAILQ_INIT(&subsystem->ctrlrs); + + if (num_ns != 0) { + subsystem->ns = calloc(num_ns, sizeof(struct spdk_nvmf_ns *)); + if (subsystem->ns == NULL) { + SPDK_ERRLOG("Namespace memory allocation failed\n"); + free(subsystem); + return NULL; + } + } + + memset(subsystem->sn, '0', sizeof(subsystem->sn) - 1); + subsystem->sn[sizeof(subsystem->sn) - 1] = '\0'; + + snprintf(subsystem->mn, sizeof(subsystem->mn), "%s", + MODEL_NUMBER_DEFAULT); + + tgt->subsystems[sid] = subsystem; + tgt->discovery_genctr++; + + return subsystem; +} + +static void +nvmf_subsystem_remove_host(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_host *host) +{ + TAILQ_REMOVE(&subsystem->hosts, host, link); + free(host); +} + +static void +_nvmf_subsystem_remove_listener(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_subsystem_listener *listener, + bool stop) +{ + struct spdk_nvmf_transport *transport; + + if (stop) { + transport = spdk_nvmf_tgt_get_transport(subsystem->tgt, listener->trid->trstring); + if (transport != NULL) { + spdk_nvmf_transport_stop_listen(transport, listener->trid); + } + } + + TAILQ_REMOVE(&subsystem->listeners, listener, link); + free(listener); +} + +void +spdk_nvmf_subsystem_destroy(struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_host *host, *host_tmp; + struct spdk_nvmf_ctrlr *ctrlr, *ctrlr_tmp; + struct spdk_nvmf_ns *ns; + + if (!subsystem) { + return; + } + + assert(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "subsystem is %p\n", subsystem); + + 
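+	/*
+	 * Editorial note on the teardown that follows (not in the original patch):
+	 * listeners and allowed hosts are dropped first, any remaining controllers
+	 * are destructed, and each namespace is removed (releasing its bdev claim
+	 * and closing its descriptor) before the ns array, the target's subsystem
+	 * slot, and the subsystem structure itself are freed.
+	 */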
nvmf_subsystem_remove_all_listeners(subsystem, false); + + TAILQ_FOREACH_SAFE(host, &subsystem->hosts, link, host_tmp) { + nvmf_subsystem_remove_host(subsystem, host); + } + + TAILQ_FOREACH_SAFE(ctrlr, &subsystem->ctrlrs, link, ctrlr_tmp) { + nvmf_ctrlr_destruct(ctrlr); + } + + ns = spdk_nvmf_subsystem_get_first_ns(subsystem); + while (ns != NULL) { + struct spdk_nvmf_ns *next_ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns); + + spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid); + ns = next_ns; + } + + free(subsystem->ns); + + subsystem->tgt->subsystems[subsystem->id] = NULL; + subsystem->tgt->discovery_genctr++; + + free(subsystem); +} + +static int +nvmf_subsystem_set_state(struct spdk_nvmf_subsystem *subsystem, + enum spdk_nvmf_subsystem_state state) +{ + enum spdk_nvmf_subsystem_state actual_old_state, expected_old_state; + bool exchanged; + + switch (state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + expected_old_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVATING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_PAUSING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + expected_old_state = SPDK_NVMF_SUBSYSTEM_PAUSING; + break; + case SPDK_NVMF_SUBSYSTEM_RESUMING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_PAUSED; + break; + case SPDK_NVMF_SUBSYSTEM_DEACTIVATING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + break; + default: + assert(false); + return -1; + } + + actual_old_state = expected_old_state; + exchanged = __atomic_compare_exchange_n(&subsystem->state, &actual_old_state, state, false, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); + if (spdk_unlikely(exchanged == false)) { + if (actual_old_state == SPDK_NVMF_SUBSYSTEM_RESUMING && + state == SPDK_NVMF_SUBSYSTEM_ACTIVE) { + expected_old_state = SPDK_NVMF_SUBSYSTEM_RESUMING; + } + /* This is for the case when activating the subsystem fails. 
*/ + if (actual_old_state == SPDK_NVMF_SUBSYSTEM_ACTIVATING && + state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING) { + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + } + actual_old_state = expected_old_state; + __atomic_compare_exchange_n(&subsystem->state, &actual_old_state, state, false, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); + } + assert(actual_old_state == expected_old_state); + return actual_old_state - expected_old_state; +} + +struct subsystem_state_change_ctx { + struct spdk_nvmf_subsystem *subsystem; + + enum spdk_nvmf_subsystem_state requested_state; + + spdk_nvmf_subsystem_state_change_done cb_fn; + void *cb_arg; +}; + +static void +subsystem_state_change_done(struct spdk_io_channel_iter *i, int status) +{ + struct subsystem_state_change_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (status == 0) { + status = nvmf_subsystem_set_state(ctx->subsystem, ctx->requested_state); + if (status) { + status = -1; + } + } + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status); + } + free(ctx); +} + +static void +subsystem_state_change_continue(void *ctx, int status) +{ + struct spdk_io_channel_iter *i = ctx; + spdk_for_each_channel_continue(i, status); +} + +static void +subsystem_state_change_on_pg(struct spdk_io_channel_iter *i) +{ + struct subsystem_state_change_ctx *ctx; + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + + ctx = spdk_io_channel_iter_get_ctx(i); + ch = spdk_io_channel_iter_get_channel(i); + group = spdk_io_channel_get_ctx(ch); + + switch (ctx->requested_state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + nvmf_poll_group_remove_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + if (ctx->subsystem->state == SPDK_NVMF_SUBSYSTEM_ACTIVATING) { + nvmf_poll_group_add_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + } else if (ctx->subsystem->state == SPDK_NVMF_SUBSYSTEM_RESUMING) { + nvmf_poll_group_resume_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + } + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + nvmf_poll_group_pause_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + break; + default: + assert(false); + break; + } +} + +static int +nvmf_subsystem_state_change(struct spdk_nvmf_subsystem *subsystem, + enum spdk_nvmf_subsystem_state requested_state, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + struct subsystem_state_change_ctx *ctx; + enum spdk_nvmf_subsystem_state intermediate_state; + int rc; + + switch (requested_state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + intermediate_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + if (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED) { + intermediate_state = SPDK_NVMF_SUBSYSTEM_RESUMING; + } else { + intermediate_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + } + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + intermediate_state = SPDK_NVMF_SUBSYSTEM_PAUSING; + break; + default: + assert(false); + return -EINVAL; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + return -ENOMEM; + } + + rc = nvmf_subsystem_set_state(subsystem, intermediate_state); + if (rc) { + free(ctx); + return rc; + } + + ctx->subsystem = subsystem; + ctx->requested_state = requested_state; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(subsystem->tgt, + subsystem_state_change_on_pg, + ctx, + subsystem_state_change_done); + + return 0; +} + +int +spdk_nvmf_subsystem_start(struct spdk_nvmf_subsystem *subsystem, + 
spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_ACTIVE, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_stop(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_INACTIVE, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_pause(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_PAUSED, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_resume(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_ACTIVE, cb_fn, cb_arg); +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_get_first(struct spdk_nvmf_tgt *tgt) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + for (sid = 0; sid < tgt->max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem) { + return subsystem; + } + } + + return NULL; +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_get_next(struct spdk_nvmf_subsystem *subsystem) +{ + uint32_t sid; + struct spdk_nvmf_tgt *tgt; + + if (!subsystem) { + return NULL; + } + + tgt = subsystem->tgt; + + for (sid = subsystem->id + 1; sid < tgt->max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem) { + return subsystem; + } + } + + return NULL; +} + +static struct spdk_nvmf_host * +nvmf_subsystem_find_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host = NULL; + + TAILQ_FOREACH(host, &subsystem->hosts, link) { + if (strcmp(hostnqn, host->nqn) == 0) { + return host; + } + } + + return NULL; +} + +int +spdk_nvmf_subsystem_add_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host; + + if (!nvmf_valid_nqn(hostnqn)) { + return -EINVAL; + } + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + if (nvmf_subsystem_find_host(subsystem, hostnqn)) { + /* This subsystem already allows the specified host. 
*/ + return 0; + } + + host = calloc(1, sizeof(*host)); + if (!host) { + return -ENOMEM; + } + + snprintf(host->nqn, sizeof(host->nqn), "%s", hostnqn); + + TAILQ_INSERT_HEAD(&subsystem->hosts, host, link); + subsystem->tgt->discovery_genctr++; + + return 0; +} + +int +spdk_nvmf_subsystem_remove_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + host = nvmf_subsystem_find_host(subsystem, hostnqn); + if (host == NULL) { + return -ENOENT; + } + + nvmf_subsystem_remove_host(subsystem, host); + return 0; +} + +int +spdk_nvmf_subsystem_set_allow_any_host(struct spdk_nvmf_subsystem *subsystem, bool allow_any_host) +{ + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + subsystem->allow_any_host = allow_any_host; + + return 0; +} + +bool +spdk_nvmf_subsystem_get_allow_any_host(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->allow_any_host; +} + +bool +spdk_nvmf_subsystem_host_allowed(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + if (!hostnqn) { + return false; + } + + if (subsystem->allow_any_host) { + return true; + } + + return nvmf_subsystem_find_host(subsystem, hostnqn) != NULL; +} + +struct spdk_nvmf_host * +spdk_nvmf_subsystem_get_first_host(struct spdk_nvmf_subsystem *subsystem) +{ + return TAILQ_FIRST(&subsystem->hosts); +} + + +struct spdk_nvmf_host * +spdk_nvmf_subsystem_get_next_host(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_host *prev_host) +{ + return TAILQ_NEXT(prev_host, link); +} + +const char * +spdk_nvmf_host_get_nqn(const struct spdk_nvmf_host *host) +{ + return host->nqn; +} + +struct spdk_nvmf_subsystem_listener * +nvmf_subsystem_find_listener(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_subsystem_listener *listener; + + TAILQ_FOREACH(listener, &subsystem->listeners, link) { + if (spdk_nvme_transport_id_compare(listener->trid, trid) == 0) { + return listener; + } + } + + return NULL; +} + +/** + * Function to be called once the target is listening. + * + * \param ctx Context argument passed to this function. + * \param status 0 if it completed successfully, or negative errno if it failed. 
+ */ +static void +_nvmf_subsystem_add_listener_done(void *ctx, int status) +{ + struct spdk_nvmf_subsystem_listener *listener = ctx; + + if (status) { + listener->cb_fn(listener->cb_arg, status); + free(listener); + return; + } + + TAILQ_INSERT_HEAD(&listener->subsystem->listeners, listener, link); + listener->subsystem->tgt->discovery_genctr++; + listener->cb_fn(listener->cb_arg, status); +} + +void +spdk_nvmf_subsystem_add_listener(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_transport_id *trid, + spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn, + void *cb_arg) +{ + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_subsystem_listener *listener; + struct spdk_nvmf_listener *tr_listener; + + assert(cb_fn != NULL); + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + cb_fn(cb_arg, -EAGAIN); + return; + } + + if (nvmf_subsystem_find_listener(subsystem, trid)) { + /* Listener already exists in this subsystem */ + cb_fn(cb_arg, 0); + return; + } + + transport = spdk_nvmf_tgt_get_transport(subsystem->tgt, trid->trstring); + if (transport == NULL) { + SPDK_ERRLOG("Unknown transport type %d\n", trid->trtype); + cb_fn(cb_arg, -EINVAL); + return; + } + + tr_listener = nvmf_transport_find_listener(transport, trid); + if (!tr_listener) { + SPDK_ERRLOG("Cannot find transport listener for %s\n", trid->traddr); + cb_fn(cb_arg, -EINVAL); + return; + } + + listener = calloc(1, sizeof(*listener)); + if (!listener) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + listener->trid = &tr_listener->trid; + listener->transport = transport; + listener->cb_fn = cb_fn; + listener->cb_arg = cb_arg; + listener->subsystem = subsystem; + + if (transport->ops->listen_associate != NULL) { + transport->ops->listen_associate(transport, subsystem, trid, + _nvmf_subsystem_add_listener_done, + listener); + } else { + _nvmf_subsystem_add_listener_done(listener, 0); + } +} + +int +spdk_nvmf_subsystem_remove_listener(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_subsystem_listener *listener; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + listener = nvmf_subsystem_find_listener(subsystem, trid); + if (listener == NULL) { + return -ENOENT; + } + + _nvmf_subsystem_remove_listener(subsystem, listener, false); + + return 0; +} + +void +nvmf_subsystem_remove_all_listeners(struct spdk_nvmf_subsystem *subsystem, + bool stop) +{ + struct spdk_nvmf_subsystem_listener *listener, *listener_tmp; + + TAILQ_FOREACH_SAFE(listener, &subsystem->listeners, link, listener_tmp) { + _nvmf_subsystem_remove_listener(subsystem, listener, stop); + } +} + +bool +spdk_nvmf_subsystem_listener_allowed(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_subsystem_listener *listener; + + if (!strcmp(subsystem->subnqn, SPDK_NVMF_DISCOVERY_NQN)) { + return true; + } + + TAILQ_FOREACH(listener, &subsystem->listeners, link) { + if (spdk_nvme_transport_id_compare(listener->trid, trid) == 0) { + return true; + } + } + + return false; +} + +struct spdk_nvmf_subsystem_listener * +spdk_nvmf_subsystem_get_first_listener(struct spdk_nvmf_subsystem *subsystem) +{ + return TAILQ_FIRST(&subsystem->listeners); +} + +struct spdk_nvmf_subsystem_listener * +spdk_nvmf_subsystem_get_next_listener(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_subsystem_listener *prev_listener) +{ + 
return TAILQ_NEXT(prev_listener, link); +} + +const struct spdk_nvme_transport_id * +spdk_nvmf_subsystem_listener_get_trid(struct spdk_nvmf_subsystem_listener *listener) +{ + return listener->trid; +} + +void +spdk_nvmf_subsystem_allow_any_listener(struct spdk_nvmf_subsystem *subsystem, + bool allow_any_listener) +{ + subsystem->allow_any_listener = allow_any_listener; +} + +bool +spdk_nvmf_subsytem_any_listener_allowed(struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->allow_any_listener; +} + + +struct subsystem_update_ns_ctx { + struct spdk_nvmf_subsystem *subsystem; + + spdk_nvmf_subsystem_state_change_done cb_fn; + void *cb_arg; +}; + +static void +subsystem_update_ns_done(struct spdk_io_channel_iter *i, int status) +{ + struct subsystem_update_ns_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status); + } + free(ctx); +} + +static void +subsystem_update_ns_on_pg(struct spdk_io_channel_iter *i) +{ + int rc; + struct subsystem_update_ns_ctx *ctx; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_subsystem *subsystem; + + ctx = spdk_io_channel_iter_get_ctx(i); + group = spdk_io_channel_get_ctx(spdk_io_channel_iter_get_channel(i)); + subsystem = ctx->subsystem; + + rc = nvmf_poll_group_update_subsystem(group, subsystem); + spdk_for_each_channel_continue(i, rc); +} + +static int +nvmf_subsystem_update_ns(struct spdk_nvmf_subsystem *subsystem, spdk_channel_for_each_cpl cpl, + void *ctx) +{ + spdk_for_each_channel(subsystem->tgt, + subsystem_update_ns_on_pg, + ctx, + cpl); + + return 0; +} + +static void +nvmf_subsystem_ns_changed(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + struct spdk_nvmf_ctrlr *ctrlr; + + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + nvmf_ctrlr_ns_changed(ctrlr, nsid); + } +} + +int +spdk_nvmf_subsystem_remove_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_registrant *reg, *reg_tmp; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + assert(false); + return -1; + } + + if (nsid == 0 || nsid > subsystem->max_nsid) { + return -1; + } + + ns = subsystem->ns[nsid - 1]; + if (!ns) { + return -1; + } + + subsystem->ns[nsid - 1] = NULL; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, reg_tmp) { + TAILQ_REMOVE(&ns->registrants, reg, link); + free(reg); + } + spdk_bdev_module_release_bdev(ns->bdev); + spdk_bdev_close(ns->desc); + if (ns->ptpl_file) { + free(ns->ptpl_file); + } + free(ns); + + nvmf_subsystem_ns_changed(subsystem, nsid); + + return 0; +} + +static void +_nvmf_ns_hot_remove(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_nvmf_ns *ns = cb_arg; + int rc; + + rc = spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid); + if (rc != 0) { + SPDK_ERRLOG("Failed to make changes to NVME-oF subsystem with id: %u\n", subsystem->id); + } + + spdk_nvmf_subsystem_resume(subsystem, NULL, NULL); +} + +static void +nvmf_ns_hot_remove(void *remove_ctx) +{ + struct spdk_nvmf_ns *ns = remove_ctx; + int rc; + + rc = spdk_nvmf_subsystem_pause(ns->subsystem, _nvmf_ns_hot_remove, ns); + if (rc) { + SPDK_ERRLOG("Unable to pause subsystem to process namespace removal!\n"); + } +} + +static void +_nvmf_ns_resize(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) +{ + struct spdk_nvmf_ns *ns = cb_arg; + + nvmf_subsystem_ns_changed(subsystem, ns->opts.nsid); + spdk_nvmf_subsystem_resume(subsystem, NULL, NULL); +} + 
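The bdev event handlers here (_nvmf_ns_hot_remove()/nvmf_ns_hot_remove() above, nvmf_ns_resize() just below) all follow the same quiesce pattern: call spdk_nvmf_subsystem_pause() with a completion callback, touch the namespace list only from that callback once every poll group has stopped dispatching I/O for the subsystem, and always call spdk_nvmf_subsystem_resume() afterwards. A minimal sketch of that pattern is shown next, using only the public API exported in spdk_nvmf.map earlier in this patch; the helper names, the nsid-through-cb_arg encoding, and the error handling are illustrative assumptions, not part of this change.

    #include "spdk/stdinc.h"
    #include "spdk/log.h"
    #include "spdk/nvmf.h"

    /* Hypothetical completion callback: runs once the subsystem is paused. */
    static void
    remove_ns_paused_cb(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status)
    {
            uint32_t nsid = (uint32_t)(uintptr_t)cb_arg;

            if (status == 0) {
                    /* Poll groups are quiesced, so the ns array may be modified safely. */
                    if (spdk_nvmf_subsystem_remove_ns(subsystem, nsid) != 0) {
                            SPDK_ERRLOG("Failed to remove nsid %u\n", nsid);
                    }
            }

            /* Resume unconditionally, mirroring _nvmf_ns_hot_remove() above. */
            spdk_nvmf_subsystem_resume(subsystem, NULL, NULL);
    }

    /* Hypothetical entry point: remove a namespace only while quiesced. */
    static int
    remove_ns_quiesced(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid)
    {
            /* Three-argument pause, as declared in this version of the code. */
            return spdk_nvmf_subsystem_pause(subsystem, remove_ns_paused_cb,
                                             (void *)(uintptr_t)nsid);
    }

Encoding the nsid in cb_arg avoids a heap allocation for the context; the real handlers instead pass the struct spdk_nvmf_ns pointer they already hold as the callback argument.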
+static void +nvmf_ns_resize(void *event_ctx) +{ + struct spdk_nvmf_ns *ns = event_ctx; + int rc; + + rc = spdk_nvmf_subsystem_pause(ns->subsystem, _nvmf_ns_resize, ns); + if (rc) { + SPDK_ERRLOG("Unable to pause subsystem to process namespace resize!\n"); + } +} + +static void +nvmf_ns_event(enum spdk_bdev_event_type type, + struct spdk_bdev *bdev, + void *event_ctx) +{ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Bdev event: type %d, name %s, subsystem_id %d, ns_id %d\n", + type, + bdev->name, + ((struct spdk_nvmf_ns *)event_ctx)->subsystem->id, + ((struct spdk_nvmf_ns *)event_ctx)->nsid); + + switch (type) { + case SPDK_BDEV_EVENT_REMOVE: + nvmf_ns_hot_remove(event_ctx); + break; + case SPDK_BDEV_EVENT_RESIZE: + nvmf_ns_resize(event_ctx); + break; + default: + SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); + break; + } +} + +void +spdk_nvmf_ns_opts_get_defaults(struct spdk_nvmf_ns_opts *opts, size_t opts_size) +{ + /* All current fields are set to 0 by default. */ + memset(opts, 0, opts_size); +} + +/* Dummy bdev module used to to claim bdevs. */ +static struct spdk_bdev_module ns_bdev_module = { + .name = "NVMe-oF Target", +}; + +static int +nvmf_ns_load_reservation(const char *file, struct spdk_nvmf_reservation_info *info); +static int +nvmf_ns_reservation_restore(struct spdk_nvmf_ns *ns, struct spdk_nvmf_reservation_info *info); + +uint32_t +spdk_nvmf_subsystem_add_ns(struct spdk_nvmf_subsystem *subsystem, struct spdk_bdev *bdev, + const struct spdk_nvmf_ns_opts *user_opts, size_t opts_size, + const char *ptpl_file) +{ + struct spdk_nvmf_ns_opts opts; + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_reservation_info info = {0}; + int rc; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return 0; + } + + if (spdk_bdev_get_md_size(bdev) != 0 && !spdk_bdev_is_md_interleaved(bdev)) { + SPDK_ERRLOG("Can't attach bdev with separate metadata.\n"); + return 0; + } + + spdk_nvmf_ns_opts_get_defaults(&opts, sizeof(opts)); + if (user_opts) { + memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size)); + } + + if (spdk_mem_all_zero(&opts.uuid, sizeof(opts.uuid))) { + opts.uuid = *spdk_bdev_get_uuid(bdev); + } + + if (opts.nsid == SPDK_NVME_GLOBAL_NS_TAG) { + SPDK_ERRLOG("Invalid NSID %" PRIu32 "\n", opts.nsid); + return 0; + } + + if (opts.nsid == 0) { + /* + * NSID not specified - find a free index. + * + * If no free slots are found, opts.nsid will be subsystem->max_nsid + 1, which will + * expand max_nsid if possible. + */ + for (opts.nsid = 1; opts.nsid <= subsystem->max_nsid; opts.nsid++) { + if (_nvmf_subsystem_get_ns(subsystem, opts.nsid) == NULL) { + break; + } + } + } + + if (_nvmf_subsystem_get_ns(subsystem, opts.nsid)) { + SPDK_ERRLOG("Requested NSID %" PRIu32 " already in use\n", opts.nsid); + return 0; + } + + if (opts.nsid > subsystem->max_nsid) { + struct spdk_nvmf_ns **new_ns_array; + + /* If MaxNamespaces was specified, we can't extend max_nsid beyond it. */ + if (subsystem->max_allowed_nsid > 0 && opts.nsid > subsystem->max_allowed_nsid) { + SPDK_ERRLOG("Can't extend NSID range above MaxNamespaces\n"); + return 0; + } + + /* If a controller is connected, we can't change NN. 
*/ + if (!TAILQ_EMPTY(&subsystem->ctrlrs)) { + SPDK_ERRLOG("Can't extend NSID range while controllers are connected\n"); + return 0; + } + + new_ns_array = realloc(subsystem->ns, sizeof(struct spdk_nvmf_ns *) * opts.nsid); + if (new_ns_array == NULL) { + SPDK_ERRLOG("Memory allocation error while resizing namespace array.\n"); + return 0; + } + + memset(new_ns_array + subsystem->max_nsid, 0, + sizeof(struct spdk_nvmf_ns *) * (opts.nsid - subsystem->max_nsid)); + subsystem->ns = new_ns_array; + subsystem->max_nsid = opts.nsid; + } + + ns = calloc(1, sizeof(*ns)); + if (ns == NULL) { + SPDK_ERRLOG("Namespace allocation failed\n"); + return 0; + } + + ns->bdev = bdev; + ns->opts = opts; + ns->subsystem = subsystem; + rc = spdk_bdev_open_ext(bdev->name, true, nvmf_ns_event, ns, &ns->desc); + if (rc != 0) { + SPDK_ERRLOG("Subsystem %s: bdev %s cannot be opened, error=%d\n", + subsystem->subnqn, spdk_bdev_get_name(bdev), rc); + free(ns); + return 0; + } + rc = spdk_bdev_module_claim_bdev(bdev, ns->desc, &ns_bdev_module); + if (rc != 0) { + spdk_bdev_close(ns->desc); + free(ns); + return 0; + } + subsystem->ns[opts.nsid - 1] = ns; + ns->nsid = opts.nsid; + TAILQ_INIT(&ns->registrants); + + if (ptpl_file) { + rc = nvmf_ns_load_reservation(ptpl_file, &info); + if (!rc) { + rc = nvmf_ns_reservation_restore(ns, &info); + if (rc) { + SPDK_ERRLOG("Subsystem restore reservation failed\n"); + subsystem->ns[opts.nsid - 1] = NULL; + spdk_bdev_close(ns->desc); + free(ns); + return 0; + } + } + ns->ptpl_file = strdup(ptpl_file); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Subsystem %s: bdev %s assigned nsid %" PRIu32 "\n", + spdk_nvmf_subsystem_get_nqn(subsystem), + spdk_bdev_get_name(bdev), + opts.nsid); + + nvmf_subsystem_ns_changed(subsystem, opts.nsid); + + return opts.nsid; +} + +static uint32_t +nvmf_subsystem_get_next_allocated_nsid(struct spdk_nvmf_subsystem *subsystem, + uint32_t prev_nsid) +{ + uint32_t nsid; + + if (prev_nsid >= subsystem->max_nsid) { + return 0; + } + + for (nsid = prev_nsid + 1; nsid <= subsystem->max_nsid; nsid++) { + if (subsystem->ns[nsid - 1]) { + return nsid; + } + } + + return 0; +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_first_ns(struct spdk_nvmf_subsystem *subsystem) +{ + uint32_t first_nsid; + + first_nsid = nvmf_subsystem_get_next_allocated_nsid(subsystem, 0); + return _nvmf_subsystem_get_ns(subsystem, first_nsid); +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_next_ns(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ns *prev_ns) +{ + uint32_t next_nsid; + + next_nsid = nvmf_subsystem_get_next_allocated_nsid(subsystem, prev_ns->opts.nsid); + return _nvmf_subsystem_get_ns(subsystem, next_nsid); +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + return _nvmf_subsystem_get_ns(subsystem, nsid); +} + +uint32_t +spdk_nvmf_ns_get_id(const struct spdk_nvmf_ns *ns) +{ + return ns->opts.nsid; +} + +struct spdk_bdev * +spdk_nvmf_ns_get_bdev(struct spdk_nvmf_ns *ns) +{ + return ns->bdev; +} + +void +spdk_nvmf_ns_get_opts(const struct spdk_nvmf_ns *ns, struct spdk_nvmf_ns_opts *opts, + size_t opts_size) +{ + memset(opts, 0, opts_size); + memcpy(opts, &ns->opts, spdk_min(sizeof(ns->opts), opts_size)); +} + +const char * +spdk_nvmf_subsystem_get_sn(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->sn; +} + +int +spdk_nvmf_subsystem_set_sn(struct spdk_nvmf_subsystem *subsystem, const char *sn) +{ + size_t len, max_len; + + max_len = sizeof(subsystem->sn) - 1; + len = strlen(sn); + if 
(len > max_len) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Invalid sn \"%s\": length %zu > max %zu\n", + sn, len, max_len); + return -1; + } + + if (!nvmf_valid_ascii_string(sn, len)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Non-ASCII sn\n"); + SPDK_LOGDUMP(SPDK_LOG_NVMF, "sn", sn, len); + return -1; + } + + snprintf(subsystem->sn, sizeof(subsystem->sn), "%s", sn); + + return 0; +} + +const char * +spdk_nvmf_subsystem_get_mn(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->mn; +} + +int +spdk_nvmf_subsystem_set_mn(struct spdk_nvmf_subsystem *subsystem, const char *mn) +{ + size_t len, max_len; + + if (mn == NULL) { + mn = MODEL_NUMBER_DEFAULT; + } + max_len = sizeof(subsystem->mn) - 1; + len = strlen(mn); + if (len > max_len) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Invalid mn \"%s\": length %zu > max %zu\n", + mn, len, max_len); + return -1; + } + + if (!nvmf_valid_ascii_string(mn, len)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Non-ASCII mn\n"); + SPDK_LOGDUMP(SPDK_LOG_NVMF, "mn", mn, len); + return -1; + } + + snprintf(subsystem->mn, sizeof(subsystem->mn), "%s", mn); + + return 0; +} + +const char * +spdk_nvmf_subsystem_get_nqn(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->subnqn; +} + +enum spdk_nvmf_subtype spdk_nvmf_subsystem_get_type(struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->subtype; +} + +uint32_t +spdk_nvmf_subsystem_get_max_nsid(struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->max_nsid; +} + +static uint16_t +nvmf_subsystem_gen_cntlid(struct spdk_nvmf_subsystem *subsystem) +{ + int count; + + /* + * In the worst case, we might have to try all CNTLID values between 1 and 0xFFF0 - 1 + * before we find one that is unused (or find that all values are in use). + */ + for (count = 0; count < 0xFFF0 - 1; count++) { + subsystem->next_cntlid++; + if (subsystem->next_cntlid >= 0xFFF0) { + /* The spec reserves cntlid values in the range FFF0h to FFFFh. */ + subsystem->next_cntlid = 1; + } + + /* Check if a controller with this cntlid currently exists. */ + if (nvmf_subsystem_get_ctrlr(subsystem, subsystem->next_cntlid) == NULL) { + /* Found unused cntlid */ + return subsystem->next_cntlid; + } + } + + /* All valid cntlid values are in use. 
*/ + return 0xFFFF; +} + +int +nvmf_subsystem_add_ctrlr(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_ctrlr *ctrlr) +{ + ctrlr->cntlid = nvmf_subsystem_gen_cntlid(subsystem); + if (ctrlr->cntlid == 0xFFFF) { + /* Unable to get a cntlid */ + SPDK_ERRLOG("Reached max simultaneous ctrlrs\n"); + return -EBUSY; + } + + TAILQ_INSERT_TAIL(&subsystem->ctrlrs, ctrlr, link); + + return 0; +} + +void +nvmf_subsystem_remove_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr) +{ + assert(subsystem == ctrlr->subsys); + TAILQ_REMOVE(&subsystem->ctrlrs, ctrlr, link); +} + +struct spdk_nvmf_ctrlr * +nvmf_subsystem_get_ctrlr(struct spdk_nvmf_subsystem *subsystem, uint16_t cntlid) +{ + struct spdk_nvmf_ctrlr *ctrlr; + + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + if (ctrlr->cntlid == cntlid) { + return ctrlr; + } + } + + return NULL; +} + +uint32_t +spdk_nvmf_subsystem_get_max_namespaces(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->max_allowed_nsid; +} + +struct _nvmf_ns_registrant { + uint64_t rkey; + char *host_uuid; +}; + +struct _nvmf_ns_registrants { + size_t num_regs; + struct _nvmf_ns_registrant reg[SPDK_NVMF_MAX_NUM_REGISTRANTS]; +}; + +struct _nvmf_ns_reservation { + bool ptpl_activated; + enum spdk_nvme_reservation_type rtype; + uint64_t crkey; + char *bdev_uuid; + char *holder_uuid; + struct _nvmf_ns_registrants regs; +}; + +static const struct spdk_json_object_decoder nvmf_ns_pr_reg_decoders[] = { + {"rkey", offsetof(struct _nvmf_ns_registrant, rkey), spdk_json_decode_uint64}, + {"host_uuid", offsetof(struct _nvmf_ns_registrant, host_uuid), spdk_json_decode_string}, +}; + +static int +nvmf_decode_ns_pr_reg(const struct spdk_json_val *val, void *out) +{ + struct _nvmf_ns_registrant *reg = out; + + return spdk_json_decode_object(val, nvmf_ns_pr_reg_decoders, + SPDK_COUNTOF(nvmf_ns_pr_reg_decoders), reg); +} + +static int +nvmf_decode_ns_pr_regs(const struct spdk_json_val *val, void *out) +{ + struct _nvmf_ns_registrants *regs = out; + + return spdk_json_decode_array(val, nvmf_decode_ns_pr_reg, regs->reg, + SPDK_NVMF_MAX_NUM_REGISTRANTS, ®s->num_regs, + sizeof(struct _nvmf_ns_registrant)); +} + +static const struct spdk_json_object_decoder nvmf_ns_pr_decoders[] = { + {"ptpl", offsetof(struct _nvmf_ns_reservation, ptpl_activated), spdk_json_decode_bool, true}, + {"rtype", offsetof(struct _nvmf_ns_reservation, rtype), spdk_json_decode_uint32, true}, + {"crkey", offsetof(struct _nvmf_ns_reservation, crkey), spdk_json_decode_uint64, true}, + {"bdev_uuid", offsetof(struct _nvmf_ns_reservation, bdev_uuid), spdk_json_decode_string}, + {"holder_uuid", offsetof(struct _nvmf_ns_reservation, holder_uuid), spdk_json_decode_string, true}, + {"registrants", offsetof(struct _nvmf_ns_reservation, regs), nvmf_decode_ns_pr_regs}, +}; + +static int +nvmf_ns_load_reservation(const char *file, struct spdk_nvmf_reservation_info *info) +{ + FILE *fd; + size_t json_size; + ssize_t values_cnt, rc; + void *json = NULL, *end; + struct spdk_json_val *values = NULL; + struct _nvmf_ns_reservation res = {}; + uint32_t i; + + fd = fopen(file, "r"); + /* It's not an error if the file does not exist */ + if (!fd) { + SPDK_NOTICELOG("File %s does not exist\n", file); + return -ENOENT; + } + + /* Load all persist file contents into a local buffer */ + json = spdk_posix_file_load(fd, &json_size); + fclose(fd); + if (!json) { + SPDK_ERRLOG("Load persit file %s failed\n", file); + return -ENOMEM; + } + + rc = spdk_json_parse(json, json_size, NULL, 0, &end, 0); + if (rc < 
0) { + SPDK_NOTICELOG("Parsing JSON configuration failed (%zd)\n", rc); + goto exit; + } + + values_cnt = rc; + values = calloc(values_cnt, sizeof(struct spdk_json_val)); + if (values == NULL) { + goto exit; + } + + rc = spdk_json_parse(json, json_size, values, values_cnt, &end, 0); + if (rc != values_cnt) { + SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc); + goto exit; + } + + /* Decode json */ + if (spdk_json_decode_object(values, nvmf_ns_pr_decoders, + SPDK_COUNTOF(nvmf_ns_pr_decoders), + &res)) { + SPDK_ERRLOG("Invalid objects in the persist file %s\n", file); + rc = -EINVAL; + goto exit; + } + + if (res.regs.num_regs > SPDK_NVMF_MAX_NUM_REGISTRANTS) { + SPDK_ERRLOG("Can only support up to %u registrants\n", SPDK_NVMF_MAX_NUM_REGISTRANTS); + rc = -ERANGE; + goto exit; + } + + rc = 0; + info->ptpl_activated = res.ptpl_activated; + info->rtype = res.rtype; + info->crkey = res.crkey; + snprintf(info->bdev_uuid, sizeof(info->bdev_uuid), "%s", res.bdev_uuid); + snprintf(info->holder_uuid, sizeof(info->holder_uuid), "%s", res.holder_uuid); + info->num_regs = res.regs.num_regs; + for (i = 0; i < res.regs.num_regs; i++) { + info->registrants[i].rkey = res.regs.reg[i].rkey; + snprintf(info->registrants[i].host_uuid, sizeof(info->registrants[i].host_uuid), "%s", + res.regs.reg[i].host_uuid); + } + +exit: + free(json); + free(values); + free(res.bdev_uuid); + free(res.holder_uuid); + for (i = 0; i < res.regs.num_regs; i++) { + free(res.regs.reg[i].host_uuid); + } + + return rc; +} + +static bool +nvmf_ns_reservation_all_registrants_type(struct spdk_nvmf_ns *ns); + +static int +nvmf_ns_reservation_restore(struct spdk_nvmf_ns *ns, struct spdk_nvmf_reservation_info *info) +{ + uint32_t i; + struct spdk_nvmf_registrant *reg, *holder = NULL; + struct spdk_uuid bdev_uuid, holder_uuid; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "NSID %u, PTPL %u, Number of registrants %u\n", + ns->nsid, info->ptpl_activated, info->num_regs); + + /* it's not an error */ + if (!info->ptpl_activated || !info->num_regs) { + return 0; + } + + spdk_uuid_parse(&bdev_uuid, info->bdev_uuid); + if (spdk_uuid_compare(&bdev_uuid, spdk_bdev_get_uuid(ns->bdev))) { + SPDK_ERRLOG("Existing bdev UUID is not same with configuration file\n"); + return -EINVAL; + } + + ns->crkey = info->crkey; + ns->rtype = info->rtype; + ns->ptpl_activated = info->ptpl_activated; + spdk_uuid_parse(&holder_uuid, info->holder_uuid); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Bdev UUID %s\n", info->bdev_uuid); + if (info->rtype) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Holder UUID %s, RTYPE %u, RKEY 0x%"PRIx64"\n", + info->holder_uuid, info->rtype, info->crkey); + } + + for (i = 0; i < info->num_regs; i++) { + reg = calloc(1, sizeof(*reg)); + if (!reg) { + return -ENOMEM; + } + spdk_uuid_parse(®->hostid, info->registrants[i].host_uuid); + reg->rkey = info->registrants[i].rkey; + TAILQ_INSERT_TAIL(&ns->registrants, reg, link); + if (!spdk_uuid_compare(&holder_uuid, ®->hostid)) { + holder = reg; + } + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Registrant RKEY 0x%"PRIx64", Host UUID %s\n", + info->registrants[i].rkey, info->registrants[i].host_uuid); + } + + if (nvmf_ns_reservation_all_registrants_type(ns)) { + ns->holder = TAILQ_FIRST(&ns->registrants); + } else { + ns->holder = holder; + } + + return 0; +} + +static int +nvmf_ns_json_write_cb(void *cb_ctx, const void *data, size_t size) +{ + char *file = cb_ctx; + size_t rc; + FILE *fd; + + fd = fopen(file, "w"); + if (!fd) { + SPDK_ERRLOG("Can't open file %s for write\n", file); + return -ENOENT; + } + rc = fwrite(data, 1, size, 
fd); + fclose(fd); + + return rc == size ? 0 : -1; +} + +static int +nvmf_ns_reservation_update(const char *file, struct spdk_nvmf_reservation_info *info) +{ + struct spdk_json_write_ctx *w; + uint32_t i; + int rc = 0; + + w = spdk_json_write_begin(nvmf_ns_json_write_cb, (void *)file, 0); + if (w == NULL) { + return -ENOMEM; + } + /* clear the configuration file */ + if (!info->ptpl_activated) { + goto exit; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_bool(w, "ptpl", info->ptpl_activated); + spdk_json_write_named_uint32(w, "rtype", info->rtype); + spdk_json_write_named_uint64(w, "crkey", info->crkey); + spdk_json_write_named_string(w, "bdev_uuid", info->bdev_uuid); + spdk_json_write_named_string(w, "holder_uuid", info->holder_uuid); + + spdk_json_write_named_array_begin(w, "registrants"); + for (i = 0; i < info->num_regs; i++) { + spdk_json_write_object_begin(w); + spdk_json_write_named_uint64(w, "rkey", info->registrants[i].rkey); + spdk_json_write_named_string(w, "host_uuid", info->registrants[i].host_uuid); + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + +exit: + rc = spdk_json_write_end(w); + return rc; +} + +static int +nvmf_ns_update_reservation_info(struct spdk_nvmf_ns *ns) +{ + struct spdk_nvmf_reservation_info info; + struct spdk_nvmf_registrant *reg, *tmp; + uint32_t i = 0; + + assert(ns != NULL); + + if (!ns->bdev || !ns->ptpl_file) { + return 0; + } + + memset(&info, 0, sizeof(info)); + spdk_uuid_fmt_lower(info.bdev_uuid, sizeof(info.bdev_uuid), spdk_bdev_get_uuid(ns->bdev)); + + if (ns->rtype) { + info.rtype = ns->rtype; + info.crkey = ns->crkey; + if (!nvmf_ns_reservation_all_registrants_type(ns)) { + assert(ns->holder != NULL); + spdk_uuid_fmt_lower(info.holder_uuid, sizeof(info.holder_uuid), &ns->holder->hostid); + } + } + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + spdk_uuid_fmt_lower(info.registrants[i].host_uuid, sizeof(info.registrants[i].host_uuid), + ®->hostid); + info.registrants[i++].rkey = reg->rkey; + } + + info.num_regs = i; + info.ptpl_activated = ns->ptpl_activated; + + return nvmf_ns_reservation_update(ns->ptpl_file, &info); +} + +static struct spdk_nvmf_registrant * +nvmf_ns_reservation_get_registrant(struct spdk_nvmf_ns *ns, + struct spdk_uuid *uuid) +{ + struct spdk_nvmf_registrant *reg, *tmp; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + if (!spdk_uuid_compare(®->hostid, uuid)) { + return reg; + } + } + + return NULL; +} + +/* Generate reservation notice log to registered HostID controllers */ +static void +nvmf_subsystem_gen_ctrlr_notification(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ns *ns, + struct spdk_uuid *hostid_list, + uint32_t num_hostid, + enum spdk_nvme_reservation_notification_log_page_type type) +{ + struct spdk_nvmf_ctrlr *ctrlr; + uint32_t i; + + for (i = 0; i < num_hostid; i++) { + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + if (!spdk_uuid_compare(&ctrlr->hostid, &hostid_list[i])) { + nvmf_ctrlr_reservation_notice_log(ctrlr, ns, type); + } + } + } +} + +/* Get all registrants' hostid other than the controller who issued the command */ +static uint32_t +nvmf_ns_reservation_get_all_other_hostid(struct spdk_nvmf_ns *ns, + struct spdk_uuid *hostid_list, + uint32_t max_num_hostid, + struct spdk_uuid *current_hostid) +{ + struct spdk_nvmf_registrant *reg, *tmp; + uint32_t num_hostid = 0; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + if (spdk_uuid_compare(®->hostid, current_hostid)) { + if (num_hostid == 
max_num_hostid) { + assert(false); + return max_num_hostid; + } + hostid_list[num_hostid++] = reg->hostid; + } + } + + return num_hostid; +} + +/* Calculate the unregistered HostID list according to list + * prior to execute preempt command and list after executing + * preempt command. + */ +static uint32_t +nvmf_ns_reservation_get_unregistered_hostid(struct spdk_uuid *old_hostid_list, + uint32_t old_num_hostid, + struct spdk_uuid *remaining_hostid_list, + uint32_t remaining_num_hostid) +{ + struct spdk_uuid temp_hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint32_t i, j, num_hostid = 0; + bool found; + + if (!remaining_num_hostid) { + return old_num_hostid; + } + + for (i = 0; i < old_num_hostid; i++) { + found = false; + for (j = 0; j < remaining_num_hostid; j++) { + if (!spdk_uuid_compare(&old_hostid_list[i], &remaining_hostid_list[j])) { + found = true; + break; + } + } + if (!found) { + spdk_uuid_copy(&temp_hostid_list[num_hostid++], &old_hostid_list[i]); + } + } + + if (num_hostid) { + memcpy(old_hostid_list, temp_hostid_list, sizeof(struct spdk_uuid) * num_hostid); + } + + return num_hostid; +} + +/* current reservation type is all registrants or not */ +static bool +nvmf_ns_reservation_all_registrants_type(struct spdk_nvmf_ns *ns) +{ + return (ns->rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE_ALL_REGS || + ns->rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS); +} + +/* current registrant is reservation holder or not */ +static bool +nvmf_ns_reservation_registrant_is_holder(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_registrant *reg) +{ + if (!reg) { + return false; + } + + if (nvmf_ns_reservation_all_registrants_type(ns)) { + return true; + } + + return (ns->holder == reg); +} + +static int +nvmf_ns_reservation_add_registrant(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_ctrlr *ctrlr, + uint64_t nrkey) +{ + struct spdk_nvmf_registrant *reg; + + reg = calloc(1, sizeof(*reg)); + if (!reg) { + return -ENOMEM; + } + + reg->rkey = nrkey; + /* set hostid for the registrant */ + spdk_uuid_copy(®->hostid, &ctrlr->hostid); + TAILQ_INSERT_TAIL(&ns->registrants, reg, link); + ns->gen++; + + return 0; +} + +static void +nvmf_ns_reservation_release_reservation(struct spdk_nvmf_ns *ns) +{ + ns->rtype = 0; + ns->crkey = 0; + ns->holder = NULL; +} + +/* release the reservation if the last registrant was removed */ +static void +nvmf_ns_reservation_check_release_on_remove_registrant(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_registrant *reg) +{ + struct spdk_nvmf_registrant *next_reg; + + /* no reservation holder */ + if (!ns->holder) { + assert(ns->rtype == 0); + return; + } + + next_reg = TAILQ_FIRST(&ns->registrants); + if (next_reg && nvmf_ns_reservation_all_registrants_type(ns)) { + /* the next valid registrant is the new holder now */ + ns->holder = next_reg; + } else if (nvmf_ns_reservation_registrant_is_holder(ns, reg)) { + /* release the reservation */ + nvmf_ns_reservation_release_reservation(ns); + } +} + +static void +nvmf_ns_reservation_remove_registrant(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_registrant *reg) +{ + TAILQ_REMOVE(&ns->registrants, reg, link); + nvmf_ns_reservation_check_release_on_remove_registrant(ns, reg); + free(reg); + ns->gen++; + return; +} + +static uint32_t +nvmf_ns_reservation_remove_registrants_by_key(struct spdk_nvmf_ns *ns, + uint64_t rkey) +{ + struct spdk_nvmf_registrant *reg, *tmp; + uint32_t count = 0; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + if (reg->rkey == rkey) { + nvmf_ns_reservation_remove_registrant(ns, reg); + count++; + } 
+ } + return count; +} + +static uint32_t +nvmf_ns_reservation_remove_all_other_registrants(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_registrant *reg) +{ + struct spdk_nvmf_registrant *reg_tmp, *reg_tmp2; + uint32_t count = 0; + + TAILQ_FOREACH_SAFE(reg_tmp, &ns->registrants, link, reg_tmp2) { + if (reg_tmp != reg) { + nvmf_ns_reservation_remove_registrant(ns, reg_tmp); + count++; + } + } + return count; +} + +static uint32_t +nvmf_ns_reservation_clear_all_registrants(struct spdk_nvmf_ns *ns) +{ + struct spdk_nvmf_registrant *reg, *reg_tmp; + uint32_t count = 0; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, reg_tmp) { + nvmf_ns_reservation_remove_registrant(ns, reg); + count++; + } + return count; +} + +static void +nvmf_ns_reservation_acquire_reservation(struct spdk_nvmf_ns *ns, uint64_t rkey, + enum spdk_nvme_reservation_type rtype, + struct spdk_nvmf_registrant *holder) +{ + ns->rtype = rtype; + ns->crkey = rkey; + assert(ns->holder == NULL); + ns->holder = holder; +} + +static bool +nvmf_ns_reservation_register(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + uint8_t rrega, iekey, cptpl, rtype; + struct spdk_nvme_reservation_register_data key; + struct spdk_nvmf_registrant *reg; + uint8_t status = SPDK_NVME_SC_SUCCESS; + bool update_sgroup = false; + struct spdk_uuid hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint32_t num_hostid = 0; + int rc; + + rrega = cmd->cdw10_bits.resv_register.rrega; + iekey = cmd->cdw10_bits.resv_register.iekey; + cptpl = cmd->cdw10_bits.resv_register.cptpl; + + if (req->data && req->length >= sizeof(key)) { + memcpy(&key, req->data, sizeof(key)); + } else { + SPDK_ERRLOG("No key provided. Failing request.\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "REGISTER: RREGA %u, IEKEY %u, CPTPL %u, " + "NRKEY 0x%"PRIx64", NRKEY 0x%"PRIx64"\n", + rrega, iekey, cptpl, key.crkey, key.nrkey); + + if (cptpl == SPDK_NVME_RESERVE_PTPL_CLEAR_POWER_ON) { + /* Ture to OFF state, and need to be updated in the configuration file */ + if (ns->ptpl_activated) { + ns->ptpl_activated = 0; + update_sgroup = true; + } + } else if (cptpl == SPDK_NVME_RESERVE_PTPL_PERSIST_POWER_LOSS) { + if (ns->ptpl_file == NULL) { + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } else if (ns->ptpl_activated == 0) { + ns->ptpl_activated = 1; + update_sgroup = true; + } + } + + /* current Host Identifier has registrant or not */ + reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr->hostid); + + switch (rrega) { + case SPDK_NVME_RESERVE_REGISTER_KEY: + if (!reg) { + /* register new controller */ + if (key.nrkey == 0) { + SPDK_ERRLOG("Can't register zeroed new key\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + rc = nvmf_ns_reservation_add_registrant(ns, ctrlr, key.nrkey); + if (rc < 0) { + status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + goto exit; + } + update_sgroup = true; + } else { + /* register with same key is not an error */ + if (reg->rkey != key.nrkey) { + SPDK_ERRLOG("The same host already register a " + "key with 0x%"PRIx64"\n", + reg->rkey); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + } + break; + case SPDK_NVME_RESERVE_UNREGISTER_KEY: + if (!reg || (!iekey && reg->rkey != key.crkey)) { + SPDK_ERRLOG("No registrant or current key doesn't match " + "with existing registrant key\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + + rtype = ns->rtype; + num_hostid = 
nvmf_ns_reservation_get_all_other_hostid(ns, hostid_list, + SPDK_NVMF_MAX_NUM_REGISTRANTS, + &ctrlr->hostid); + + nvmf_ns_reservation_remove_registrant(ns, reg); + + if (!ns->rtype && num_hostid && (rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE_REG_ONLY || + rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_REG_ONLY)) { + nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns, + hostid_list, + num_hostid, + SPDK_NVME_RESERVATION_RELEASED); + } + update_sgroup = true; + break; + case SPDK_NVME_RESERVE_REPLACE_KEY: + if (!reg || (!iekey && reg->rkey != key.crkey)) { + SPDK_ERRLOG("No registrant or current key doesn't match " + "with existing registrant key\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + goto exit; + } + if (key.nrkey == 0) { + SPDK_ERRLOG("Can't register zeroed new key\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + reg->rkey = key.nrkey; + update_sgroup = true; + break; + default: + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + +exit: + if (update_sgroup) { + rc = nvmf_ns_update_reservation_info(ns); + if (rc != 0) { + status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = status; + return update_sgroup; +} + +static bool +nvmf_ns_reservation_acquire(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + uint8_t racqa, iekey, rtype; + struct spdk_nvme_reservation_acquire_data key; + struct spdk_nvmf_registrant *reg; + bool all_regs = false; + uint32_t count = 0; + bool update_sgroup = true; + struct spdk_uuid hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint32_t num_hostid = 0; + struct spdk_uuid new_hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint32_t new_num_hostid = 0; + bool reservation_released = false; + uint8_t status = SPDK_NVME_SC_SUCCESS; + + racqa = cmd->cdw10_bits.resv_acquire.racqa; + iekey = cmd->cdw10_bits.resv_acquire.iekey; + rtype = cmd->cdw10_bits.resv_acquire.rtype; + + if (req->data && req->length >= sizeof(key)) { + memcpy(&key, req->data, sizeof(key)); + } else { + SPDK_ERRLOG("No key provided. 
Failing request.\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ACQUIRE: RACQA %u, IEKEY %u, RTYPE %u, " + "NRKEY 0x%"PRIx64", PRKEY 0x%"PRIx64"\n", + racqa, iekey, rtype, key.crkey, key.prkey); + + if (iekey || rtype > SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS) { + SPDK_ERRLOG("Ignore existing key field set to 1\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + update_sgroup = false; + goto exit; + } + + reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr->hostid); + /* must be registrant and CRKEY must match */ + if (!reg || reg->rkey != key.crkey) { + SPDK_ERRLOG("No registrant or current key doesn't match " + "with existing registrant key\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + update_sgroup = false; + goto exit; + } + + all_regs = nvmf_ns_reservation_all_registrants_type(ns); + + switch (racqa) { + case SPDK_NVME_RESERVE_ACQUIRE: + /* it's not an error for the holder to acquire same reservation type again */ + if (nvmf_ns_reservation_registrant_is_holder(ns, reg) && ns->rtype == rtype) { + /* do nothing */ + update_sgroup = false; + } else if (ns->holder == NULL) { + /* fisrt time to acquire the reservation */ + nvmf_ns_reservation_acquire_reservation(ns, key.crkey, rtype, reg); + } else { + SPDK_ERRLOG("Invalid rtype or current registrant is not holder\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + update_sgroup = false; + goto exit; + } + break; + case SPDK_NVME_RESERVE_PREEMPT: + /* no reservation holder */ + if (!ns->holder) { + /* unregister with PRKEY */ + nvmf_ns_reservation_remove_registrants_by_key(ns, key.prkey); + break; + } + num_hostid = nvmf_ns_reservation_get_all_other_hostid(ns, hostid_list, + SPDK_NVMF_MAX_NUM_REGISTRANTS, + &ctrlr->hostid); + + /* only 1 reservation holder and reservation key is valid */ + if (!all_regs) { + /* preempt itself */ + if (nvmf_ns_reservation_registrant_is_holder(ns, reg) && + ns->crkey == key.prkey) { + ns->rtype = rtype; + reservation_released = true; + break; + } + + if (ns->crkey == key.prkey) { + nvmf_ns_reservation_remove_registrant(ns, ns->holder); + nvmf_ns_reservation_acquire_reservation(ns, key.crkey, rtype, reg); + reservation_released = true; + } else if (key.prkey != 0) { + nvmf_ns_reservation_remove_registrants_by_key(ns, key.prkey); + } else { + /* PRKEY is zero */ + SPDK_ERRLOG("Current PRKEY is zero\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + update_sgroup = false; + goto exit; + } + } else { + /* release all other registrants except for the current one */ + if (key.prkey == 0) { + nvmf_ns_reservation_remove_all_other_registrants(ns, reg); + assert(ns->holder == reg); + } else { + count = nvmf_ns_reservation_remove_registrants_by_key(ns, key.prkey); + if (count == 0) { + SPDK_ERRLOG("PRKEY doesn't match any registrant\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + update_sgroup = false; + goto exit; + } + } + } + break; + default: + status = SPDK_NVME_SC_INVALID_FIELD; + update_sgroup = false; + break; + } + +exit: + if (update_sgroup && racqa == SPDK_NVME_RESERVE_PREEMPT) { + new_num_hostid = nvmf_ns_reservation_get_all_other_hostid(ns, new_hostid_list, + SPDK_NVMF_MAX_NUM_REGISTRANTS, + &ctrlr->hostid); + /* Preempt notification occurs on the unregistered controllers + * other than the controller who issued the command. 
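/*
 * [Editorial worked example, not part of the diff] Suppose hosts A, B and C are
 * registered and A issues PREEMPT with a PRKEY that matches only B's key. The
 * hostid_list captured before the switch is {B, C}; after B is removed the remaining
 * "other" hosts are {C}, so the unregistered list computed below is {B}. B then receives
 * SPDK_NVME_REGISTRATION_PREEMPTED, and C receives SPDK_NVME_RESERVATION_RELEASED only
 * if B happened to hold the reservation. On the host side the acquire payload for this
 * case would be filled along these lines (preempt_data and the 0xA/0xB keys are made up):
 */
struct spdk_nvme_reservation_acquire_data preempt_data = {
	.crkey = 0xA,	/* issuer's own registered key (CRKEY) */
	.prkey = 0xB,	/* key of the registrant being preempted (PRKEY) */
};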
+ */ + num_hostid = nvmf_ns_reservation_get_unregistered_hostid(hostid_list, + num_hostid, + new_hostid_list, + new_num_hostid); + if (num_hostid) { + nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns, + hostid_list, + num_hostid, + SPDK_NVME_REGISTRATION_PREEMPTED); + + } + /* Reservation released notification occurs on the + * controllers which are the remaining registrants other than + * the controller who issued the command. + */ + if (reservation_released && new_num_hostid) { + nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns, + new_hostid_list, + new_num_hostid, + SPDK_NVME_RESERVATION_RELEASED); + + } + } + if (update_sgroup && ns->ptpl_activated) { + if (nvmf_ns_update_reservation_info(ns)) { + status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = status; + return update_sgroup; +} + +static bool +nvmf_ns_reservation_release(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + uint8_t rrela, iekey, rtype; + struct spdk_nvmf_registrant *reg; + uint64_t crkey; + uint8_t status = SPDK_NVME_SC_SUCCESS; + bool update_sgroup = true; + struct spdk_uuid hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS]; + uint32_t num_hostid = 0; + + rrela = cmd->cdw10_bits.resv_release.rrela; + iekey = cmd->cdw10_bits.resv_release.iekey; + rtype = cmd->cdw10_bits.resv_release.rtype; + + if (req->data && req->length >= sizeof(crkey)) { + memcpy(&crkey, req->data, sizeof(crkey)); + } else { + SPDK_ERRLOG("No key provided. Failing request.\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "RELEASE: RRELA %u, IEKEY %u, RTYPE %u, " + "CRKEY 0x%"PRIx64"\n", rrela, iekey, rtype, crkey); + + if (iekey) { + SPDK_ERRLOG("Ignore existing key field set to 1\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + update_sgroup = false; + goto exit; + } + + reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr->hostid); + if (!reg || reg->rkey != crkey) { + SPDK_ERRLOG("No registrant or current key doesn't match " + "with existing registrant key\n"); + status = SPDK_NVME_SC_RESERVATION_CONFLICT; + update_sgroup = false; + goto exit; + } + + num_hostid = nvmf_ns_reservation_get_all_other_hostid(ns, hostid_list, + SPDK_NVMF_MAX_NUM_REGISTRANTS, + &ctrlr->hostid); + + switch (rrela) { + case SPDK_NVME_RESERVE_RELEASE: + if (!ns->holder) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "RELEASE: no holder\n"); + update_sgroup = false; + goto exit; + } + if (ns->rtype != rtype) { + SPDK_ERRLOG("Type doesn't match\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + update_sgroup = false; + goto exit; + } + if (!nvmf_ns_reservation_registrant_is_holder(ns, reg)) { + /* not the reservation holder, this isn't an error */ + update_sgroup = false; + goto exit; + } + + rtype = ns->rtype; + nvmf_ns_reservation_release_reservation(ns); + + if (num_hostid && rtype != SPDK_NVME_RESERVE_WRITE_EXCLUSIVE && + rtype != SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS) { + nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns, + hostid_list, + num_hostid, + SPDK_NVME_RESERVATION_RELEASED); + } + break; + case SPDK_NVME_RESERVE_CLEAR: + nvmf_ns_reservation_clear_all_registrants(ns); + if (num_hostid) { + nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns, + hostid_list, + num_hostid, + SPDK_NVME_RESERVATION_PREEMPTED); + } + break; + default: + status = SPDK_NVME_SC_INVALID_FIELD; + update_sgroup = false; + goto exit; + } + +exit: + if (update_sgroup 
&& ns->ptpl_activated) { + if (nvmf_ns_update_reservation_info(ns)) { + status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = status; + return update_sgroup; +} + +static void +nvmf_ns_reservation_report(struct spdk_nvmf_ns *ns, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvmf_ctrlr *ctrlr_tmp; + struct spdk_nvmf_registrant *reg, *tmp; + struct spdk_nvme_reservation_status_extended_data *status_data; + struct spdk_nvme_registered_ctrlr_extended_data *ctrlr_data; + uint8_t *payload; + uint32_t len, count = 0; + uint32_t regctl = 0; + uint8_t status = SPDK_NVME_SC_SUCCESS; + + if (req->data == NULL) { + SPDK_ERRLOG("No data transfer specified for request. " + " Unable to transfer back response.\n"); + status = SPDK_NVME_SC_INVALID_FIELD; + goto exit; + } + + if (!cmd->cdw11_bits.resv_report.eds) { + SPDK_ERRLOG("NVMeoF uses extended controller data structure, " + "please set EDS bit in cdw11 and try again\n"); + status = SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT; + goto exit; + } + + /* Get number of registerd controllers, one Host may have more than + * one controller based on different ports. + */ + TAILQ_FOREACH(ctrlr_tmp, &subsystem->ctrlrs, link) { + reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr_tmp->hostid); + if (reg) { + regctl++; + } + } + + len = sizeof(*status_data) + sizeof(*ctrlr_data) * regctl; + payload = calloc(1, len); + if (!payload) { + status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + goto exit; + } + + status_data = (struct spdk_nvme_reservation_status_extended_data *)payload; + status_data->data.gen = ns->gen; + status_data->data.rtype = ns->rtype; + status_data->data.regctl = regctl; + status_data->data.ptpls = ns->ptpl_activated; + + TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) { + assert(count <= regctl); + ctrlr_data = (struct spdk_nvme_registered_ctrlr_extended_data *) + (payload + sizeof(*status_data) + sizeof(*ctrlr_data) * count); + /* Set to 0xffffh for dynamic controller */ + ctrlr_data->cntlid = 0xffff; + ctrlr_data->rcsts.status = (ns->holder == reg) ? 
true : false; + ctrlr_data->rkey = reg->rkey; + spdk_uuid_copy((struct spdk_uuid *)ctrlr_data->hostid, ®->hostid); + count++; + } + + memcpy(req->data, payload, spdk_min(len, (cmd->cdw10 + 1) * sizeof(uint32_t))); + free(payload); + +exit: + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = status; + return; +} + +static void +nvmf_ns_reservation_complete(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + + spdk_nvmf_request_complete(req); +} + +static void +_nvmf_ns_reservation_update_done(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)cb_arg; + struct spdk_nvmf_poll_group *group = req->qpair->group; + + spdk_thread_send_msg(group->thread, nvmf_ns_reservation_complete, req); +} + +void +nvmf_ns_reservation_request(void *ctx) +{ + struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)ctx; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct subsystem_update_ns_ctx *update_ctx; + uint32_t nsid; + struct spdk_nvmf_ns *ns; + bool update_sgroup = false; + + nsid = cmd->nsid; + ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid); + assert(ns != NULL); + + switch (cmd->opc) { + case SPDK_NVME_OPC_RESERVATION_REGISTER: + update_sgroup = nvmf_ns_reservation_register(ns, ctrlr, req); + break; + case SPDK_NVME_OPC_RESERVATION_ACQUIRE: + update_sgroup = nvmf_ns_reservation_acquire(ns, ctrlr, req); + break; + case SPDK_NVME_OPC_RESERVATION_RELEASE: + update_sgroup = nvmf_ns_reservation_release(ns, ctrlr, req); + break; + case SPDK_NVME_OPC_RESERVATION_REPORT: + nvmf_ns_reservation_report(ns, ctrlr, req); + break; + default: + break; + } + + /* update reservation information to subsystem's poll group */ + if (update_sgroup) { + update_ctx = calloc(1, sizeof(*update_ctx)); + if (update_ctx == NULL) { + SPDK_ERRLOG("Can't alloc subsystem poll group update context\n"); + goto update_done; + } + update_ctx->subsystem = ctrlr->subsys; + update_ctx->cb_fn = _nvmf_ns_reservation_update_done; + update_ctx->cb_arg = req; + + nvmf_subsystem_update_ns(ctrlr->subsys, subsystem_update_ns_done, update_ctx); + return; + } + +update_done: + _nvmf_ns_reservation_update_done(ctrlr->subsys, (void *)req, 0); +} diff --git a/src/spdk/lib/nvmf/tcp.c b/src/spdk/lib/nvmf/tcp.c new file mode 100644 index 000000000..391d4bcf1 --- /dev/null +++ b/src/spdk/lib/nvmf/tcp.c @@ -0,0 +1,2631 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/assert.h" +#include "spdk/thread.h" +#include "spdk/nvmf_transport.h" +#include "spdk/sock.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" +#include "spdk_internal/nvme_tcp.h" + +#include "nvmf_internal.h" + +#define NVMF_TCP_MAX_ACCEPT_SOCK_ONE_TIME 16 +#define SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY 6 + +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_tcp; + +/* spdk nvmf related structure */ +enum spdk_nvmf_tcp_req_state { + + /* The request is not currently in use */ + TCP_REQUEST_STATE_FREE = 0, + + /* Initial state when request first received */ + TCP_REQUEST_STATE_NEW, + + /* The request is queued until a data buffer is available. */ + TCP_REQUEST_STATE_NEED_BUFFER, + + /* The request is currently transferring data from the host to the controller. */ + TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + + /* The request is waiting for the R2T send acknowledgement. */ + TCP_REQUEST_STATE_AWAITING_R2T_ACK, + + /* The request is ready to execute at the block device */ + TCP_REQUEST_STATE_READY_TO_EXECUTE, + + /* The request is currently executing at the block device */ + TCP_REQUEST_STATE_EXECUTING, + + /* The request finished executing at the block device */ + TCP_REQUEST_STATE_EXECUTED, + + /* The request is ready to send a completion */ + TCP_REQUEST_STATE_READY_TO_COMPLETE, + + /* The request is currently transferring final pdus from the controller to the host. */ + TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + + /* The request completed and can be marked free. 
*/ + TCP_REQUEST_STATE_COMPLETED, + + /* Terminator */ + TCP_REQUEST_NUM_STATES, +}; + +static const char *spdk_nvmf_tcp_term_req_fes_str[] = { + "Invalid PDU Header Field", + "PDU Sequence Error", + "Header Digiest Error", + "Data Transfer Out of Range", + "R2T Limit Exceeded", + "Unsupported parameter", +}; + +#define OBJECT_NVMF_TCP_IO 0x80 + +#define TRACE_GROUP_NVMF_TCP 0x5 +#define TRACE_TCP_REQUEST_STATE_NEW SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x0) +#define TRACE_TCP_REQUEST_STATE_NEED_BUFFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x1) +#define TRACE_TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x2) +#define TRACE_TCP_REQUEST_STATE_READY_TO_EXECUTE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x3) +#define TRACE_TCP_REQUEST_STATE_EXECUTING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x4) +#define TRACE_TCP_REQUEST_STATE_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x5) +#define TRACE_TCP_REQUEST_STATE_READY_TO_COMPLETE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x6) +#define TRACE_TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x7) +#define TRACE_TCP_REQUEST_STATE_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x8) +#define TRACE_TCP_FLUSH_WRITEBUF_START SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x9) +#define TRACE_TCP_FLUSH_WRITEBUF_DONE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0xA) +#define TRACE_TCP_READ_FROM_SOCKET_DONE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0xB) +#define TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0xC) + +SPDK_TRACE_REGISTER_FN(nvmf_tcp_trace, "nvmf_tcp", TRACE_GROUP_NVMF_TCP) +{ + spdk_trace_register_object(OBJECT_NVMF_TCP_IO, 'r'); + spdk_trace_register_description("TCP_REQ_NEW", + TRACE_TCP_REQUEST_STATE_NEW, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 1, 1, ""); + spdk_trace_register_description("TCP_REQ_NEED_BUFFER", + TRACE_TCP_REQUEST_STATE_NEED_BUFFER, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_TX_H_TO_C", + TRACE_TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_RDY_TO_EXECUTE", + TRACE_TCP_REQUEST_STATE_READY_TO_EXECUTE, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_EXECUTING", + TRACE_TCP_REQUEST_STATE_EXECUTING, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_EXECUTED", + TRACE_TCP_REQUEST_STATE_EXECUTED, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_RDY_TO_COMPLETE", + TRACE_TCP_REQUEST_STATE_READY_TO_COMPLETE, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_TRANSFER_C2H", + TRACE_TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_REQ_COMPLETED", + TRACE_TCP_REQUEST_STATE_COMPLETED, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); + spdk_trace_register_description("TCP_WRITE_START", + TRACE_TCP_FLUSH_WRITEBUF_START, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("TCP_WRITE_DONE", + TRACE_TCP_FLUSH_WRITEBUF_DONE, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("TCP_READ_DONE", + TRACE_TCP_READ_FROM_SOCKET_DONE, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("TCP_REQ_AWAIT_R2T_ACK", + TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK, + OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, ""); +} + +struct spdk_nvmf_tcp_req { + struct spdk_nvmf_request req; + 
struct spdk_nvme_cpl rsp; + struct spdk_nvme_cmd cmd; + + /* A PDU that can be used for sending responses. This is + * not the incoming PDU! */ + struct nvme_tcp_pdu *pdu; + + /* + * The PDU for a request may be used multiple times in serial over + * the request's lifetime. For example, first to send an R2T, then + * to send a completion. To catch mistakes where the PDU is used + * twice at the same time, add a debug flag here for init/fini. + */ + bool pdu_in_use; + + /* In-capsule data buffer */ + uint8_t *buf; + + bool has_incapsule_data; + + /* transfer_tag */ + uint16_t ttag; + + enum spdk_nvmf_tcp_req_state state; + + /* + * h2c_offset is used when we receive the h2c_data PDU. + */ + uint32_t h2c_offset; + + STAILQ_ENTRY(spdk_nvmf_tcp_req) link; + TAILQ_ENTRY(spdk_nvmf_tcp_req) state_link; +}; + +struct spdk_nvmf_tcp_qpair { + struct spdk_nvmf_qpair qpair; + struct spdk_nvmf_tcp_poll_group *group; + struct spdk_nvmf_tcp_port *port; + struct spdk_sock *sock; + + enum nvme_tcp_pdu_recv_state recv_state; + enum nvme_tcp_qpair_state state; + + /* PDU being actively received */ + struct nvme_tcp_pdu pdu_in_progress; + uint32_t recv_buf_size; + + /* This is a spare PDU used for sending special management + * operations. Primarily, this is used for the initial + * connection response and c2h termination request. */ + struct nvme_tcp_pdu mgmt_pdu; + + TAILQ_HEAD(, nvme_tcp_pdu) send_queue; + + /* Arrays of in-capsule buffers, requests, and pdus. + * Each array is 'resource_count' number of elements */ + void *bufs; + struct spdk_nvmf_tcp_req *reqs; + struct nvme_tcp_pdu *pdus; + uint32_t resource_count; + + /* Queues to track the requests in all states */ + TAILQ_HEAD(, spdk_nvmf_tcp_req) state_queue[TCP_REQUEST_NUM_STATES]; + /* Number of requests in each state */ + uint32_t state_cntr[TCP_REQUEST_NUM_STATES]; + + uint8_t cpda; + + bool host_hdgst_enable; + bool host_ddgst_enable; + + /* IP address */ + char initiator_addr[SPDK_NVMF_TRADDR_MAX_LEN]; + char target_addr[SPDK_NVMF_TRADDR_MAX_LEN]; + + /* IP port */ + uint16_t initiator_port; + uint16_t target_port; + + /* Timer used to destroy qpair after detecting transport error issue if initiator does + * not close the connection. 
+ */ + struct spdk_poller *timeout_poller; + + TAILQ_ENTRY(spdk_nvmf_tcp_qpair) link; +}; + +struct spdk_nvmf_tcp_poll_group { + struct spdk_nvmf_transport_poll_group group; + struct spdk_sock_group *sock_group; + + TAILQ_HEAD(, spdk_nvmf_tcp_qpair) qpairs; + TAILQ_HEAD(, spdk_nvmf_tcp_qpair) await_req; +}; + +struct spdk_nvmf_tcp_port { + const struct spdk_nvme_transport_id *trid; + struct spdk_sock *listen_sock; + TAILQ_ENTRY(spdk_nvmf_tcp_port) link; +}; + +struct spdk_nvmf_tcp_transport { + struct spdk_nvmf_transport transport; + + pthread_mutex_t lock; + + TAILQ_HEAD(, spdk_nvmf_tcp_port) ports; +}; + +static bool nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_req *tcp_req); + +static void +nvmf_tcp_req_set_state(struct spdk_nvmf_tcp_req *tcp_req, + enum spdk_nvmf_tcp_req_state state) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_tcp_qpair *tqpair; + + qpair = tcp_req->req.qpair; + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + + TAILQ_REMOVE(&tqpair->state_queue[tcp_req->state], tcp_req, state_link); + assert(tqpair->state_cntr[tcp_req->state] > 0); + tqpair->state_cntr[tcp_req->state]--; + + TAILQ_INSERT_TAIL(&tqpair->state_queue[state], tcp_req, state_link); + tqpair->state_cntr[state]++; + + tcp_req->state = state; +} + +static inline struct nvme_tcp_pdu * +nvmf_tcp_req_pdu_init(struct spdk_nvmf_tcp_req *tcp_req) +{ + assert(tcp_req->pdu_in_use == false); + tcp_req->pdu_in_use = true; + + memset(tcp_req->pdu, 0, sizeof(*tcp_req->pdu)); + tcp_req->pdu->qpair = SPDK_CONTAINEROF(tcp_req->req.qpair, struct spdk_nvmf_tcp_qpair, qpair); + + return tcp_req->pdu; +} + +static inline void +nvmf_tcp_req_pdu_fini(struct spdk_nvmf_tcp_req *tcp_req) +{ + tcp_req->pdu_in_use = false; +} + +static struct spdk_nvmf_tcp_req * +nvmf_tcp_req_get(struct spdk_nvmf_tcp_qpair *tqpair) +{ + struct spdk_nvmf_tcp_req *tcp_req; + + tcp_req = TAILQ_FIRST(&tqpair->state_queue[TCP_REQUEST_STATE_FREE]); + if (!tcp_req) { + return NULL; + } + + memset(&tcp_req->rsp, 0, sizeof(tcp_req->rsp)); + tcp_req->h2c_offset = 0; + tcp_req->has_incapsule_data = false; + tcp_req->req.dif.dif_insert_or_strip = false; + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_NEW); + return tcp_req; +} + +static void +nvmf_tcp_request_free(struct spdk_nvmf_tcp_req *tcp_req) +{ + struct spdk_nvmf_tcp_transport *ttransport; + + assert(tcp_req != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tcp_req=%p will be freed\n", tcp_req); + ttransport = SPDK_CONTAINEROF(tcp_req->req.qpair->transport, + struct spdk_nvmf_tcp_transport, transport); + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_COMPLETED); + nvmf_tcp_req_process(ttransport, tcp_req); +} + +static int +nvmf_tcp_req_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_tcp_req *tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req); + + nvmf_tcp_request_free(tcp_req); + + return 0; +} + +static void +nvmf_tcp_drain_state_queue(struct spdk_nvmf_tcp_qpair *tqpair, + enum spdk_nvmf_tcp_req_state state) +{ + struct spdk_nvmf_tcp_req *tcp_req, *req_tmp; + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->state_queue[state], state_link, req_tmp) { + nvmf_tcp_request_free(tcp_req); + } +} + +static void +nvmf_tcp_cleanup_all_states(struct spdk_nvmf_tcp_qpair *tqpair) +{ + struct spdk_nvmf_tcp_req *tcp_req, *req_tmp; + + assert(TAILQ_EMPTY(&tqpair->send_queue)); + + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST); + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_NEW); 
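/*
 * [Editorial sketch, not part of the diff] Every tcp_req sits on exactly one
 * state_queue[] list and is counted once in state_cntr[], so the counters always sum to
 * resource_count. A hypothetical debug helper that checks this invariant, assuming only
 * the spdk_nvmf_tcp_qpair layout defined above:
 */
static void
tcp_qpair_assert_state_counters(struct spdk_nvmf_tcp_qpair *tqpair)
{
	uint32_t i, total = 0;

	for (i = 0; i < TCP_REQUEST_NUM_STATES; i++) {
		total += tqpair->state_cntr[i];
	}
	/* every request is in exactly one state at any point in time */
	assert(total == tqpair->resource_count);
}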
+ + /* Wipe the requests waiting for buffer from the global list */ + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->state_queue[TCP_REQUEST_STATE_NEED_BUFFER], state_link, + req_tmp) { + STAILQ_REMOVE(&tqpair->group->group.pending_buf_queue, &tcp_req->req, + spdk_nvmf_request, buf_link); + } + + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_NEED_BUFFER); + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_EXECUTING); + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_AWAITING_R2T_ACK); +} + +static void +nvmf_tcp_dump_qpair_req_contents(struct spdk_nvmf_tcp_qpair *tqpair) +{ + int i; + struct spdk_nvmf_tcp_req *tcp_req; + + SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", tqpair->qpair.qid); + for (i = 1; i < TCP_REQUEST_NUM_STATES; i++) { + SPDK_ERRLOG("\tNum of requests in state[%d] = %u\n", i, tqpair->state_cntr[i]); + TAILQ_FOREACH(tcp_req, &tqpair->state_queue[i], state_link) { + SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", tcp_req->req.data_from_pool); + SPDK_ERRLOG("\t\tRequest opcode: %d\n", tcp_req->req.cmd->nvmf_cmd.opcode); + } + } +} + +static void +nvmf_tcp_qpair_destroy(struct spdk_nvmf_tcp_qpair *tqpair) +{ + int err = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n"); + + err = spdk_sock_close(&tqpair->sock); + assert(err == 0); + nvmf_tcp_cleanup_all_states(tqpair); + + if (tqpair->state_cntr[TCP_REQUEST_STATE_FREE] != tqpair->resource_count) { + SPDK_ERRLOG("tqpair(%p) free tcp request num is %u but should be %u\n", tqpair, + tqpair->state_cntr[TCP_REQUEST_STATE_FREE], + tqpair->resource_count); + err++; + } + + if (err > 0) { + nvmf_tcp_dump_qpair_req_contents(tqpair); + } + + spdk_dma_free(tqpair->pdus); + free(tqpair->reqs); + spdk_free(tqpair->bufs); + free(tqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Leave\n"); +} + +static int +nvmf_tcp_destroy(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_tcp_transport *ttransport; + + assert(transport != NULL); + ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport); + + pthread_mutex_destroy(&ttransport->lock); + free(ttransport); + return 0; +} + +static struct spdk_nvmf_transport * +nvmf_tcp_create(struct spdk_nvmf_transport_opts *opts) +{ + struct spdk_nvmf_tcp_transport *ttransport; + uint32_t sge_count; + uint32_t min_shared_buffers; + + ttransport = calloc(1, sizeof(*ttransport)); + if (!ttransport) { + return NULL; + } + + TAILQ_INIT(&ttransport->ports); + + ttransport->transport.ops = &spdk_nvmf_transport_tcp; + + SPDK_NOTICELOG("*** TCP Transport Init ***\n"); + + SPDK_INFOLOG(SPDK_LOG_NVMF_TCP, "*** TCP Transport Init ***\n" + " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" + " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" + " in_capsule_data_size=%d, max_aq_depth=%d\n" + " num_shared_buffers=%d, c2h_success=%d,\n" + " dif_insert_or_strip=%d, sock_priority=%d\n" + " abort_timeout_sec=%d\n", + opts->max_queue_depth, + opts->max_io_size, + opts->max_qpairs_per_ctrlr - 1, + opts->io_unit_size, + opts->in_capsule_data_size, + opts->max_aq_depth, + opts->num_shared_buffers, + opts->c2h_success, + opts->dif_insert_or_strip, + opts->sock_priority, + opts->abort_timeout_sec); + + if (opts->sock_priority > SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY) { + SPDK_ERRLOG("Unsupported socket_priority=%d, the current range is: 0 to %d\n" + "you can use man 7 socket to view the range of priority under SO_PRIORITY item\n", + opts->sock_priority, 
SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY); + free(ttransport); + return NULL; + } + + /* I/O unit size cannot be larger than max I/O size */ + if (opts->io_unit_size > opts->max_io_size) { + opts->io_unit_size = opts->max_io_size; + } + + sge_count = opts->max_io_size / opts->io_unit_size; + if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) { + SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); + free(ttransport); + return NULL; + } + + min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size; + if (min_shared_buffers > opts->num_shared_buffers) { + SPDK_ERRLOG("There are not enough buffers to satisfy" + "per-poll group caches for each thread. (%" PRIu32 ")" + "supplied. (%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers); + SPDK_ERRLOG("Please specify a larger number of shared buffers\n"); + nvmf_tcp_destroy(&ttransport->transport); + return NULL; + } + + pthread_mutex_init(&ttransport->lock, NULL); + + return &ttransport->transport; +} + +static int +nvmf_tcp_trsvcid_to_int(const char *trsvcid) +{ + unsigned long long ull; + char *end = NULL; + + ull = strtoull(trsvcid, &end, 10); + if (end == NULL || end == trsvcid || *end != '\0') { + return -1; + } + + /* Valid TCP/IP port numbers are in [0, 65535] */ + if (ull > 65535) { + return -1; + } + + return (int)ull; +} + +/** + * Canonicalize a listen address trid. + */ +static int +nvmf_tcp_canon_listen_trid(struct spdk_nvme_transport_id *canon_trid, + const struct spdk_nvme_transport_id *trid) +{ + int trsvcid_int; + + trsvcid_int = nvmf_tcp_trsvcid_to_int(trid->trsvcid); + if (trsvcid_int < 0) { + return -EINVAL; + } + + memset(canon_trid, 0, sizeof(*canon_trid)); + spdk_nvme_trid_populate_transport(canon_trid, SPDK_NVME_TRANSPORT_TCP); + canon_trid->adrfam = trid->adrfam; + snprintf(canon_trid->traddr, sizeof(canon_trid->traddr), "%s", trid->traddr); + snprintf(canon_trid->trsvcid, sizeof(canon_trid->trsvcid), "%d", trsvcid_int); + + return 0; +} + +/** + * Find an existing listening port. + * + * Caller must hold ttransport->lock. 
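/*
 * [Editorial note, not part of the diff] nvmf_tcp_trsvcid_to_int() above accepts only a
 * pure decimal string in [0, 65535], and nvmf_tcp_canon_listen_trid() re-prints the
 * accepted value with "%d" so that differently spelled but equal ports compare equal.
 * A few illustrative inputs:
 *
 *     nvmf_tcp_trsvcid_to_int("4420")    -> 4420
 *     nvmf_tcp_trsvcid_to_int("004420")  -> 4420  (later canonicalized to "4420")
 *     nvmf_tcp_trsvcid_to_int("4420x")   -> -1    (*end != '\0')
 *     nvmf_tcp_trsvcid_to_int("")        -> -1    (end == trsvcid)
 *     nvmf_tcp_trsvcid_to_int("70000")   -> -1    (> 65535)
 */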
+ */ +static struct spdk_nvmf_tcp_port * +nvmf_tcp_find_port(struct spdk_nvmf_tcp_transport *ttransport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvme_transport_id canon_trid; + struct spdk_nvmf_tcp_port *port; + + if (nvmf_tcp_canon_listen_trid(&canon_trid, trid) != 0) { + return NULL; + } + + TAILQ_FOREACH(port, &ttransport->ports, link) { + if (spdk_nvme_transport_id_compare(&canon_trid, port->trid) == 0) { + return port; + } + } + + return NULL; +} + +static int +nvmf_tcp_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_tcp_port *port; + int trsvcid_int; + uint8_t adrfam; + struct spdk_sock_opts opts; + + ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport); + + trsvcid_int = nvmf_tcp_trsvcid_to_int(trid->trsvcid); + if (trsvcid_int < 0) { + SPDK_ERRLOG("Invalid trsvcid '%s'\n", trid->trsvcid); + return -EINVAL; + } + + pthread_mutex_lock(&ttransport->lock); + port = calloc(1, sizeof(*port)); + if (!port) { + SPDK_ERRLOG("Port allocation failed\n"); + pthread_mutex_unlock(&ttransport->lock); + return -ENOMEM; + } + + port->trid = trid; + opts.opts_size = sizeof(opts); + spdk_sock_get_default_opts(&opts); + opts.priority = transport->opts.sock_priority; + port->listen_sock = spdk_sock_listen_ext(trid->traddr, trsvcid_int, + NULL, &opts); + if (port->listen_sock == NULL) { + SPDK_ERRLOG("spdk_sock_listen(%s, %d) failed: %s (%d)\n", + trid->traddr, trsvcid_int, + spdk_strerror(errno), errno); + free(port); + pthread_mutex_unlock(&ttransport->lock); + return -errno; + } + + if (spdk_sock_is_ipv4(port->listen_sock)) { + adrfam = SPDK_NVMF_ADRFAM_IPV4; + } else if (spdk_sock_is_ipv6(port->listen_sock)) { + adrfam = SPDK_NVMF_ADRFAM_IPV6; + } else { + SPDK_ERRLOG("Unhandled socket type\n"); + adrfam = 0; + } + + if (adrfam != trid->adrfam) { + SPDK_ERRLOG("Socket address family mismatch\n"); + spdk_sock_close(&port->listen_sock); + free(port); + pthread_mutex_unlock(&ttransport->lock); + return -EINVAL; + } + + SPDK_NOTICELOG("*** NVMe/TCP Target Listening on %s port %s ***\n", + trid->traddr, trid->trsvcid); + + TAILQ_INSERT_TAIL(&ttransport->ports, port, link); + pthread_mutex_unlock(&ttransport->lock); + return 0; +} + +static void +nvmf_tcp_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_tcp_port *port; + + ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Removing listen address %s port %s\n", + trid->traddr, trid->trsvcid); + + pthread_mutex_lock(&ttransport->lock); + port = nvmf_tcp_find_port(ttransport, trid); + if (port) { + TAILQ_REMOVE(&ttransport->ports, port, link); + spdk_sock_close(&port->listen_sock); + free(port); + } + + pthread_mutex_unlock(&ttransport->lock); +} + +static void nvmf_tcp_qpair_set_recv_state(struct spdk_nvmf_tcp_qpair *tqpair, + enum nvme_tcp_pdu_recv_state state); + +static void +nvmf_tcp_qpair_disconnect(struct spdk_nvmf_tcp_qpair *tqpair) +{ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Disconnecting qpair %p\n", tqpair); + + if (tqpair->state <= NVME_TCP_QPAIR_STATE_RUNNING) { + tqpair->state = NVME_TCP_QPAIR_STATE_EXITING; + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + spdk_poller_unregister(&tqpair->timeout_poller); + + /* This will end up calling nvmf_tcp_close_qpair */ + 
spdk_nvmf_qpair_disconnect(&tqpair->qpair, NULL, NULL); + } +} + +static void +_pdu_write_done(void *_pdu, int err) +{ + struct nvme_tcp_pdu *pdu = _pdu; + struct spdk_nvmf_tcp_qpair *tqpair = pdu->qpair; + + TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); + + if (err != 0) { + nvmf_tcp_qpair_disconnect(tqpair); + return; + } + + assert(pdu->cb_fn != NULL); + pdu->cb_fn(pdu->cb_arg); +} + +static void +nvmf_tcp_qpair_write_pdu(struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu, + nvme_tcp_qpair_xfer_complete_cb cb_fn, + void *cb_arg) +{ + int hlen; + uint32_t crc32c; + uint32_t mapped_length = 0; + ssize_t rc; + + assert(&tqpair->pdu_in_progress != pdu); + + hlen = pdu->hdr.common.hlen; + + /* Header Digest */ + if (g_nvme_tcp_hdgst[pdu->hdr.common.pdu_type] && tqpair->host_hdgst_enable) { + crc32c = nvme_tcp_pdu_calc_header_digest(pdu); + MAKE_DIGEST_WORD((uint8_t *)pdu->hdr.raw + hlen, crc32c); + } + + /* Data Digest */ + if (pdu->data_len > 0 && g_nvme_tcp_ddgst[pdu->hdr.common.pdu_type] && tqpair->host_ddgst_enable) { + crc32c = nvme_tcp_pdu_calc_data_digest(pdu); + MAKE_DIGEST_WORD(pdu->data_digest, crc32c); + } + + pdu->cb_fn = cb_fn; + pdu->cb_arg = cb_arg; + + pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, SPDK_COUNTOF(pdu->iov), pdu, + tqpair->host_hdgst_enable, tqpair->host_ddgst_enable, + &mapped_length); + pdu->sock_req.cb_fn = _pdu_write_done; + pdu->sock_req.cb_arg = pdu; + TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq); + if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_RESP || + pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ) { + rc = spdk_sock_writev(tqpair->sock, pdu->iov, pdu->sock_req.iovcnt); + if (rc == mapped_length) { + _pdu_write_done(pdu, 0); + } else { + SPDK_ERRLOG("IC_RESP or TERM_REQ could not write to socket.\n"); + _pdu_write_done(pdu, -1); + } + } else { + spdk_sock_writev_async(tqpair->sock, &pdu->sock_req); + } +} + +static int +nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair) +{ + uint32_t i; + struct spdk_nvmf_transport_opts *opts; + uint32_t in_capsule_data_size; + + opts = &tqpair->qpair.transport->opts; + + in_capsule_data_size = opts->in_capsule_data_size; + if (opts->dif_insert_or_strip) { + in_capsule_data_size = SPDK_BDEV_BUF_SIZE_WITH_MD(in_capsule_data_size); + } + + tqpair->resource_count = opts->max_queue_depth; + + tqpair->mgmt_pdu.qpair = tqpair; + + tqpair->reqs = calloc(tqpair->resource_count, sizeof(*tqpair->reqs)); + if (!tqpair->reqs) { + SPDK_ERRLOG("Unable to allocate reqs on tqpair=%p\n", tqpair); + return -1; + } + + if (in_capsule_data_size) { + tqpair->bufs = spdk_zmalloc(tqpair->resource_count * in_capsule_data_size, 0x1000, + NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); + if (!tqpair->bufs) { + SPDK_ERRLOG("Unable to allocate bufs on tqpair=%p.\n", tqpair); + return -1; + } + } + + tqpair->pdus = spdk_dma_malloc(tqpair->resource_count * sizeof(*tqpair->pdus), 0x1000, NULL); + if (!tqpair->pdus) { + SPDK_ERRLOG("Unable to allocate pdu pool on tqpair =%p.\n", tqpair); + return -1; + } + + for (i = 0; i < tqpair->resource_count; i++) { + struct spdk_nvmf_tcp_req *tcp_req = &tqpair->reqs[i]; + + tcp_req->ttag = i + 1; + tcp_req->req.qpair = &tqpair->qpair; + + tcp_req->pdu = &tqpair->pdus[i]; + tcp_req->pdu->qpair = tqpair; + + /* Set up memory to receive commands */ + if (tqpair->bufs) { + tcp_req->buf = (void *)((uintptr_t)tqpair->bufs + (i * in_capsule_data_size)); + } + + /* Set the cmdn and rsp */ + tcp_req->req.rsp = (union nvmf_c2h_msg *)&tcp_req->rsp; + 
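/*
 * [Editorial note, not part of the diff] The reqs, pdus and bufs arrays allocated above
 * are parallel: request i owns pdus[i], the in-capsule slice at
 * bufs + i * in_capsule_data_size, and transfer tag i + 1 (so ttags start at 1).
 * For example, with illustrative values max_queue_depth = 128 and
 * in_capsule_data_size = 4096, the qpair reserves 128 * 4096 = 512 KiB of in-capsule
 * buffer space and request 0 uses bytes [0, 4096) with ttag 1.
 */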
tcp_req->req.cmd = (union nvmf_h2c_msg *)&tcp_req->cmd; + + /* Initialize request state to FREE */ + tcp_req->state = TCP_REQUEST_STATE_FREE; + TAILQ_INSERT_TAIL(&tqpair->state_queue[tcp_req->state], tcp_req, state_link); + tqpair->state_cntr[TCP_REQUEST_STATE_FREE]++; + } + + tqpair->recv_buf_size = (in_capsule_data_size + sizeof(struct spdk_nvme_tcp_cmd) + 2 * + SPDK_NVME_TCP_DIGEST_LEN) * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR; + + return 0; +} + +static int +nvmf_tcp_qpair_init(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + int i; + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "New TCP Connection: %p\n", qpair); + + TAILQ_INIT(&tqpair->send_queue); + + /* Initialise request state queues of the qpair */ + for (i = TCP_REQUEST_STATE_FREE; i < TCP_REQUEST_NUM_STATES; i++) { + TAILQ_INIT(&tqpair->state_queue[i]); + } + + tqpair->host_hdgst_enable = true; + tqpair->host_ddgst_enable = true; + + return 0; +} + +static int +nvmf_tcp_qpair_sock_init(struct spdk_nvmf_tcp_qpair *tqpair) +{ + int rc; + + /* set low water mark */ + rc = spdk_sock_set_recvlowat(tqpair->sock, sizeof(struct spdk_nvme_tcp_common_pdu_hdr)); + if (rc != 0) { + SPDK_ERRLOG("spdk_sock_set_recvlowat() failed\n"); + return rc; + } + + return 0; +} + +static void +nvmf_tcp_handle_connect(struct spdk_nvmf_transport *transport, + struct spdk_nvmf_tcp_port *port, + struct spdk_sock *sock) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "New connection accepted on %s port %s\n", + port->trid->traddr, port->trid->trsvcid); + + tqpair = calloc(1, sizeof(struct spdk_nvmf_tcp_qpair)); + if (tqpair == NULL) { + SPDK_ERRLOG("Could not allocate new connection.\n"); + spdk_sock_close(&sock); + return; + } + + tqpair->sock = sock; + tqpair->state_cntr[TCP_REQUEST_STATE_FREE] = 0; + tqpair->port = port; + tqpair->qpair.transport = transport; + + rc = spdk_sock_getaddr(tqpair->sock, tqpair->target_addr, + sizeof(tqpair->target_addr), &tqpair->target_port, + tqpair->initiator_addr, sizeof(tqpair->initiator_addr), + &tqpair->initiator_port); + if (rc < 0) { + SPDK_ERRLOG("spdk_sock_getaddr() failed of tqpair=%p\n", tqpair); + nvmf_tcp_qpair_destroy(tqpair); + return; + } + + spdk_nvmf_tgt_new_qpair(transport->tgt, &tqpair->qpair); +} + +static uint32_t +nvmf_tcp_port_accept(struct spdk_nvmf_transport *transport, struct spdk_nvmf_tcp_port *port) +{ + struct spdk_sock *sock; + uint32_t count = 0; + int i; + + for (i = 0; i < NVMF_TCP_MAX_ACCEPT_SOCK_ONE_TIME; i++) { + sock = spdk_sock_accept(port->listen_sock); + if (sock == NULL) { + break; + } + count++; + nvmf_tcp_handle_connect(transport, port, sock); + } + + return count; +} + +static uint32_t +nvmf_tcp_accept(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_tcp_port *port; + uint32_t count = 0; + + ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport); + + TAILQ_FOREACH(port, &ttransport->ports, link) { + count += nvmf_tcp_port_accept(transport, port); + } + + return count; +} + +static void +nvmf_tcp_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + entry->trtype = SPDK_NVMF_TRTYPE_TCP; + entry->adrfam = trid->adrfam; + entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED; + + spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); + 
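/*
 * [Editorial note, not part of the diff] Discovery log page entries use fixed-width,
 * space-padded ASCII fields rather than NUL-terminated strings, which is why
 * spdk_strcpy_pad() is used here instead of snprintf(). For instance, "4420" fills the
 * 32-byte TRSVCID field as '4','4','2','0' followed by 28 spaces and no terminator
 * (trsvcid_field below is a made-up local used only for illustration):
 */
char trsvcid_field[32];
spdk_strcpy_pad(trsvcid_field, "4420", sizeof(trsvcid_field), ' ');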
spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); + + entry->tsas.tcp.sectype = SPDK_NVME_TCP_SECURITY_NONE; +} + +static struct spdk_nvmf_transport_poll_group * +nvmf_tcp_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_tcp_poll_group *tgroup; + + tgroup = calloc(1, sizeof(*tgroup)); + if (!tgroup) { + return NULL; + } + + tgroup->sock_group = spdk_sock_group_create(&tgroup->group); + if (!tgroup->sock_group) { + goto cleanup; + } + + TAILQ_INIT(&tgroup->qpairs); + TAILQ_INIT(&tgroup->await_req); + + return &tgroup->group; + +cleanup: + free(tgroup); + return NULL; +} + +static struct spdk_nvmf_transport_poll_group * +nvmf_tcp_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + struct spdk_sock_group *group = NULL; + int rc; + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + rc = spdk_sock_get_optimal_sock_group(tqpair->sock, &group); + if (!rc && group != NULL) { + return spdk_sock_group_get_ctx(group); + } + + return NULL; +} + +static void +nvmf_tcp_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_tcp_poll_group *tgroup; + + tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group); + spdk_sock_group_close(&tgroup->sock_group); + + free(tgroup); +} + +static void +nvmf_tcp_qpair_set_recv_state(struct spdk_nvmf_tcp_qpair *tqpair, + enum nvme_tcp_pdu_recv_state state) +{ + if (tqpair->recv_state == state) { + SPDK_ERRLOG("The recv state of tqpair=%p is same with the state(%d) to be set\n", + tqpair, state); + return; + } + + if (tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_REQ) { + /* When leaving the await req state, move the qpair to the main list */ + TAILQ_REMOVE(&tqpair->group->await_req, tqpair, link); + TAILQ_INSERT_TAIL(&tqpair->group->qpairs, tqpair, link); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tqpair(%p) recv state=%d\n", tqpair, state); + tqpair->recv_state = state; + + switch (state) { + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + break; + case NVME_TCP_PDU_RECV_STATE_AWAIT_REQ: + TAILQ_REMOVE(&tqpair->group->qpairs, tqpair, link); + TAILQ_INSERT_TAIL(&tqpair->group->await_req, tqpair, link); + break; + case NVME_TCP_PDU_RECV_STATE_ERROR: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: + memset(&tqpair->pdu_in_progress, 0, sizeof(tqpair->pdu_in_progress)); + break; + default: + SPDK_ERRLOG("The state(%d) is invalid\n", state); + abort(); + break; + } +} + +static int +nvmf_tcp_qpair_handle_timeout(void *ctx) +{ + struct spdk_nvmf_tcp_qpair *tqpair = ctx; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_ERROR); + + SPDK_ERRLOG("No pdu coming for tqpair=%p within %d seconds\n", tqpair, + SPDK_NVME_TCP_QPAIR_EXIT_TIMEOUT); + + nvmf_tcp_qpair_disconnect(tqpair); + return SPDK_POLLER_BUSY; +} + +static void +nvmf_tcp_send_c2h_term_req_complete(void *cb_arg) +{ + struct spdk_nvmf_tcp_qpair *tqpair = (struct spdk_nvmf_tcp_qpair *)cb_arg; + + if (!tqpair->timeout_poller) { + tqpair->timeout_poller = SPDK_POLLER_REGISTER(nvmf_tcp_qpair_handle_timeout, tqpair, + SPDK_NVME_TCP_QPAIR_EXIT_TIMEOUT * 1000000); + } +} + +static void +nvmf_tcp_send_c2h_term_req(struct spdk_nvmf_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, + enum spdk_nvme_tcp_term_req_fes fes, uint32_t error_offset) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_term_req_hdr *c2h_term_req; + uint32_t c2h_term_req_hdr_len = 
sizeof(*c2h_term_req); + uint32_t copy_len; + + rsp_pdu = &tqpair->mgmt_pdu; + + c2h_term_req = &rsp_pdu->hdr.term_req; + c2h_term_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ; + c2h_term_req->common.hlen = c2h_term_req_hdr_len; + + if ((fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || + (fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { + DSET32(&c2h_term_req->fei, error_offset); + } + + copy_len = spdk_min(pdu->hdr.common.hlen, SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE); + + /* Copy the error info into the buffer */ + memcpy((uint8_t *)rsp_pdu->hdr.raw + c2h_term_req_hdr_len, pdu->hdr.raw, copy_len); + nvme_tcp_pdu_set_data(rsp_pdu, (uint8_t *)rsp_pdu->hdr.raw + c2h_term_req_hdr_len, copy_len); + + /* Contain the header of the wrong received pdu */ + c2h_term_req->common.plen = c2h_term_req->common.hlen + copy_len; + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_send_c2h_term_req_complete, tqpair); +} + +static void +nvmf_tcp_capsule_cmd_hdr_handle(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvmf_tcp_req *tcp_req; + + assert(pdu->psh_valid_bytes == pdu->psh_len); + assert(pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD); + + tcp_req = nvmf_tcp_req_get(tqpair); + if (!tcp_req) { + /* Directly return and make the allocation retry again */ + if (tqpair->state_cntr[TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST] > 0) { + return; + } + + /* The host sent more commands than the maximum queue depth. */ + SPDK_ERRLOG("Cannot allocate tcp_req on tqpair=%p\n", tqpair); + nvmf_tcp_qpair_disconnect(tqpair); + return; + } + + pdu->req = tcp_req; + assert(tcp_req->state == TCP_REQUEST_STATE_NEW); + nvmf_tcp_req_process(ttransport, tcp_req); +} + +static void +nvmf_tcp_capsule_cmd_payload_handle(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvmf_tcp_req *tcp_req; + struct spdk_nvme_tcp_cmd *capsule_cmd; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + capsule_cmd = &pdu->hdr.capsule_cmd; + tcp_req = pdu->req; + assert(tcp_req != NULL); + if (capsule_cmd->common.pdo > SPDK_NVME_TCP_PDU_PDO_MAX_OFFSET) { + SPDK_ERRLOG("Expected ICReq capsule_cmd pdu offset <= %d, got %c\n", + SPDK_NVME_TCP_PDU_PDO_MAX_OFFSET, capsule_cmd->common.pdo); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdo); + goto err; + } + + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE); + nvmf_tcp_req_process(ttransport, tcp_req); + + return; +err: + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); +} + +static int +nvmf_tcp_find_req_in_state(struct spdk_nvmf_tcp_qpair *tqpair, + enum spdk_nvmf_tcp_req_state state, + uint16_t cid, uint16_t tag, + struct spdk_nvmf_tcp_req **req) +{ + struct spdk_nvmf_tcp_req *tcp_req = NULL; + + TAILQ_FOREACH(tcp_req, &tqpair->state_queue[state], state_link) { + if (tcp_req->req.cmd->nvme_cmd.cid != cid) { + continue; + } + + if (tcp_req->ttag == tag) { + *req = tcp_req; + return 0; + } + + *req = NULL; + return -1; + } + + /* Didn't find it, but not an error */ + *req = NULL; + return 0; +} + +static void +nvmf_tcp_h2c_data_hdr_handle(struct spdk_nvmf_tcp_transport *ttransport, + struct 
spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvmf_tcp_req *tcp_req; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes = 0; + struct spdk_nvme_tcp_h2c_data_hdr *h2c_data; + int rc; + + h2c_data = &pdu->hdr.h2c_data; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tqpair=%p, r2t_info: datao=%u, datal=%u, cccid=%u, ttag=%u\n", + tqpair, h2c_data->datao, h2c_data->datal, h2c_data->cccid, h2c_data->ttag); + + rc = nvmf_tcp_find_req_in_state(tqpair, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + h2c_data->cccid, h2c_data->ttag, &tcp_req); + if (rc == 0 && tcp_req == NULL) { + rc = nvmf_tcp_find_req_in_state(tqpair, TCP_REQUEST_STATE_AWAITING_R2T_ACK, h2c_data->cccid, + h2c_data->ttag, &tcp_req); + } + + if (!tcp_req) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tcp_req is not found for tqpair=%p\n", tqpair); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER; + if (rc == 0) { + error_offset = offsetof(struct spdk_nvme_tcp_h2c_data_hdr, cccid); + } else { + error_offset = offsetof(struct spdk_nvme_tcp_h2c_data_hdr, ttag); + } + goto err; + } + + if (tcp_req->h2c_offset != h2c_data->datao) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, + "tcp_req(%p), tqpair=%p, expected data offset %u, but data offset is %u\n", + tcp_req, tqpair, tcp_req->h2c_offset, h2c_data->datao); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + goto err; + } + + if ((h2c_data->datao + h2c_data->datal) > tcp_req->req.length) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, + "tcp_req(%p), tqpair=%p, (datao=%u + datal=%u) execeeds requested length=%u\n", + tcp_req, tqpair, h2c_data->datao, h2c_data->datal, tcp_req->req.length); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + goto err; + } + + pdu->req = tcp_req; + + if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) { + pdu->dif_ctx = &tcp_req->req.dif.dif_ctx; + } + + nvme_tcp_pdu_set_data_buf(pdu, tcp_req->req.iov, tcp_req->req.iovcnt, + h2c_data->datao, h2c_data->datal); + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + return; + +err: + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); +} + +static void +nvmf_tcp_pdu_cmd_complete(void *cb_arg) +{ + struct spdk_nvmf_tcp_req *tcp_req = cb_arg; + nvmf_tcp_request_free(tcp_req); +} + +static void +nvmf_tcp_send_capsule_resp_pdu(struct spdk_nvmf_tcp_req *tcp_req, + struct spdk_nvmf_tcp_qpair *tqpair) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_rsp *capsule_resp; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter, tqpair=%p\n", tqpair); + + rsp_pdu = nvmf_tcp_req_pdu_init(tcp_req); + assert(rsp_pdu != NULL); + + capsule_resp = &rsp_pdu->hdr.capsule_resp; + capsule_resp->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP; + capsule_resp->common.plen = capsule_resp->common.hlen = sizeof(*capsule_resp); + capsule_resp->rccqe = tcp_req->req.rsp->nvme_cpl; + if (tqpair->host_hdgst_enable) { + capsule_resp->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + capsule_resp->common.plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_pdu_cmd_complete, tcp_req); +} + +static void +nvmf_tcp_pdu_c2h_data_complete(void *cb_arg) +{ + struct spdk_nvmf_tcp_req *tcp_req = cb_arg; + struct spdk_nvmf_tcp_qpair *tqpair = SPDK_CONTAINEROF(tcp_req->req.qpair, + struct spdk_nvmf_tcp_qpair, qpair); + + assert(tqpair != NULL); + if (tqpair->qpair.transport->opts.c2h_success) { + nvmf_tcp_request_free(tcp_req); + } else { + nvmf_tcp_req_pdu_fini(tcp_req); + 
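/*
 * [Editorial worked example, not part of the diff] For the bounds checks in
 * nvmf_tcp_h2c_data_hdr_handle() above: if a 16 KiB write arrives as two H2C DATA PDUs,
 * the target expects datao = 0 and then datao = 8192, each equal to tcp_req->h2c_offset
 * at the time, and any PDU where datao + datal > 16384 is answered with a C2H
 * termination request carrying FES "Data Transfer Out of Range". Condensed, the check is:
 *
 *     if (h2c_data->datao != tcp_req->h2c_offset ||
 *         h2c_data->datao + h2c_data->datal > tcp_req->req.length) {
 *             nvmf_tcp_send_c2h_term_req(tqpair, pdu,
 *                 SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0);
 *     }
 */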
nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair); + } +} + +static void +nvmf_tcp_r2t_complete(void *cb_arg) +{ + struct spdk_nvmf_tcp_req *tcp_req = cb_arg; + struct spdk_nvmf_tcp_transport *ttransport; + + nvmf_tcp_req_pdu_fini(tcp_req); + + ttransport = SPDK_CONTAINEROF(tcp_req->req.qpair->transport, + struct spdk_nvmf_tcp_transport, transport); + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + + if (tcp_req->h2c_offset == tcp_req->req.length) { + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE); + nvmf_tcp_req_process(ttransport, tcp_req); + } +} + +static void +nvmf_tcp_send_r2t_pdu(struct spdk_nvmf_tcp_qpair *tqpair, + struct spdk_nvmf_tcp_req *tcp_req) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_r2t_hdr *r2t; + + rsp_pdu = nvmf_tcp_req_pdu_init(tcp_req); + assert(rsp_pdu != NULL); + + r2t = &rsp_pdu->hdr.r2t; + r2t->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_R2T; + r2t->common.plen = r2t->common.hlen = sizeof(*r2t); + + if (tqpair->host_hdgst_enable) { + r2t->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + r2t->common.plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + r2t->cccid = tcp_req->req.cmd->nvme_cmd.cid; + r2t->ttag = tcp_req->ttag; + r2t->r2to = tcp_req->h2c_offset; + r2t->r2tl = tcp_req->req.length; + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_AWAITING_R2T_ACK); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, + "tcp_req(%p) on tqpair(%p), r2t_info: cccid=%u, ttag=%u, r2to=%u, r2tl=%u\n", + tcp_req, tqpair, r2t->cccid, r2t->ttag, r2t->r2to, r2t->r2tl); + nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_r2t_complete, tcp_req); +} + +static void +nvmf_tcp_h2c_data_payload_handle(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvmf_tcp_req *tcp_req; + + tcp_req = pdu->req; + assert(tcp_req != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n"); + + tcp_req->h2c_offset += pdu->data_len; + + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + + /* Wait for all of the data to arrive AND for the initial R2T PDU send to be + * acknowledged before moving on. 
*/ + if (tcp_req->h2c_offset == tcp_req->req.length && + tcp_req->state == TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER) { + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE); + nvmf_tcp_req_process(ttransport, tcp_req); + } +} + +static void +nvmf_tcp_h2c_term_req_dump(struct spdk_nvme_tcp_term_req_hdr *h2c_term_req) +{ + SPDK_ERRLOG("Error info of pdu(%p): %s\n", h2c_term_req, + spdk_nvmf_tcp_term_req_fes_str[h2c_term_req->fes]); + if ((h2c_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || + (h2c_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "The offset from the start of the PDU header is %u\n", + DGET32(h2c_term_req->fei)); + } +} + +static void +nvmf_tcp_h2c_term_req_hdr_handle(struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_term_req_hdr *h2c_term_req = &pdu->hdr.term_req; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + + if (h2c_term_req->fes > SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER) { + SPDK_ERRLOG("Fatal Error Stauts(FES) is unknown for h2c_term_req pdu=%p\n", pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_term_req_hdr, fes); + goto end; + } + + /* set the data buffer */ + nvme_tcp_pdu_set_data(pdu, (uint8_t *)pdu->hdr.raw + h2c_term_req->common.hlen, + h2c_term_req->common.plen - h2c_term_req->common.hlen); + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + return; +end: + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); +} + +static void +nvmf_tcp_h2c_term_req_payload_handle(struct spdk_nvmf_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_term_req_hdr *h2c_term_req = &pdu->hdr.term_req; + + nvmf_tcp_h2c_term_req_dump(h2c_term_req); + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); +} + +static void +nvmf_tcp_pdu_payload_handle(struct spdk_nvmf_tcp_qpair *tqpair, + struct spdk_nvmf_tcp_transport *ttransport) +{ + int rc = 0; + struct nvme_tcp_pdu *pdu; + uint32_t crc32c, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + pdu = &tqpair->pdu_in_progress; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n"); + /* check data digest if need */ + if (pdu->ddgst_enable) { + crc32c = nvme_tcp_pdu_calc_data_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("Data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); + return; + + } + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD: + nvmf_tcp_capsule_cmd_payload_handle(ttransport, tqpair, pdu); + break; + case SPDK_NVME_TCP_PDU_TYPE_H2C_DATA: + nvmf_tcp_h2c_data_payload_handle(ttransport, tqpair, pdu); + break; + + case SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ: + nvmf_tcp_h2c_term_req_payload_handle(tqpair, pdu); + break; + + default: + /* The code should not go to here */ + SPDK_ERRLOG("The code should not go to here\n"); + break; + } +} + +static void +nvmf_tcp_send_icresp_complete(void *cb_arg) +{ + struct spdk_nvmf_tcp_qpair *tqpair = cb_arg; + + tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING; +} + +static void +nvmf_tcp_icreq_handle(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_qpair *tqpair, + 
struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_ic_req *ic_req = &pdu->hdr.ic_req; + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_ic_resp *ic_resp; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + /* Only PFV 0 is defined currently */ + if (ic_req->pfv != 0) { + SPDK_ERRLOG("Expected ICReq PFV %u, got %u\n", 0u, ic_req->pfv); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_req, pfv); + goto end; + } + + /* MAXR2T is 0's based */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "maxr2t =%u\n", (ic_req->maxr2t + 1u)); + + tqpair->host_hdgst_enable = ic_req->dgst.bits.hdgst_enable ? true : false; + if (!tqpair->host_hdgst_enable) { + tqpair->recv_buf_size -= SPDK_NVME_TCP_DIGEST_LEN * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR; + } + + tqpair->host_ddgst_enable = ic_req->dgst.bits.ddgst_enable ? true : false; + if (!tqpair->host_ddgst_enable) { + tqpair->recv_buf_size -= SPDK_NVME_TCP_DIGEST_LEN * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR; + } + + /* Now that we know whether digests are enabled, properly size the receive buffer */ + if (spdk_sock_set_recvbuf(tqpair->sock, tqpair->recv_buf_size) < 0) { + SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n", + tqpair, + tqpair->recv_buf_size); + /* Not fatal. */ + } + + tqpair->cpda = spdk_min(ic_req->hpda, SPDK_NVME_TCP_CPDA_MAX); + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "cpda of tqpair=(%p) is : %u\n", tqpair, tqpair->cpda); + + rsp_pdu = &tqpair->mgmt_pdu; + + ic_resp = &rsp_pdu->hdr.ic_resp; + ic_resp->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_RESP; + ic_resp->common.hlen = ic_resp->common.plen = sizeof(*ic_resp); + ic_resp->pfv = 0; + ic_resp->cpda = tqpair->cpda; + ic_resp->maxh2cdata = ttransport->transport.opts.max_io_size; + ic_resp->dgst.bits.hdgst_enable = tqpair->host_hdgst_enable ? 1 : 0; + ic_resp->dgst.bits.ddgst_enable = tqpair->host_ddgst_enable ? 
1 : 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "host_hdgst_enable: %u\n", tqpair->host_hdgst_enable); + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "host_ddgst_enable: %u\n", tqpair->host_ddgst_enable); + + tqpair->state = NVME_TCP_QPAIR_STATE_INITIALIZING; + nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_send_icresp_complete, tqpair); + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + return; +end: + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); +} + +static void +nvmf_tcp_pdu_psh_handle(struct spdk_nvmf_tcp_qpair *tqpair, + struct spdk_nvmf_tcp_transport *ttransport) +{ + struct nvme_tcp_pdu *pdu; + int rc; + uint32_t crc32c, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); + pdu = &tqpair->pdu_in_progress; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "pdu type of tqpair(%p) is %d\n", tqpair, + pdu->hdr.common.pdu_type); + /* check header digest if needed */ + if (pdu->has_hdgst) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Compare the header of pdu=%p on tqpair=%p\n", pdu, tqpair); + crc32c = nvme_tcp_pdu_calc_header_digest(pdu); + rc = MATCH_DIGEST_WORD((uint8_t *)pdu->hdr.raw + pdu->hdr.common.hlen, crc32c); + if (rc == 0) { + SPDK_ERRLOG("Header digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); + return; + + } + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_IC_REQ: + nvmf_tcp_icreq_handle(ttransport, tqpair, pdu); + break; + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD: + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_REQ); + break; + case SPDK_NVME_TCP_PDU_TYPE_H2C_DATA: + nvmf_tcp_h2c_data_hdr_handle(ttransport, tqpair, pdu); + break; + + case SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ: + nvmf_tcp_h2c_term_req_hdr_handle(tqpair, pdu); + break; + + default: + SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->pdu_in_progress.hdr.common.pdu_type); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = 1; + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); + break; + } +} + +static void +nvmf_tcp_pdu_ch_handle(struct spdk_nvmf_tcp_qpair *tqpair) +{ + struct nvme_tcp_pdu *pdu; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + uint8_t expected_hlen, pdo; + bool plen_error = false, pdo_error = false; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH); + pdu = &tqpair->pdu_in_progress; + + if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_REQ) { + if (tqpair->state != NVME_TCP_QPAIR_STATE_INVALID) { + SPDK_ERRLOG("Already received ICreq PDU, and reject this pdu=%p\n", pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; + goto err; + } + expected_hlen = sizeof(struct spdk_nvme_tcp_ic_req); + if (pdu->hdr.common.plen != expected_hlen) { + plen_error = true; + } + } else { + if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) { + SPDK_ERRLOG("The TCP/IP connection is not negotitated\n"); + fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; + goto err; + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD: + expected_hlen = sizeof(struct spdk_nvme_tcp_cmd); + pdo = pdu->hdr.common.pdo; + if ((tqpair->cpda != 0) && (pdo != ((tqpair->cpda + 1) << 2))) { + pdo_error = true; + break; + } + + if (pdu->hdr.common.plen < expected_hlen) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_H2C_DATA: + expected_hlen = 
sizeof(struct spdk_nvme_tcp_h2c_data_hdr); + pdo = pdu->hdr.common.pdo; + if ((tqpair->cpda != 0) && (pdo != ((tqpair->cpda + 1) << 2))) { + pdo_error = true; + break; + } + if (pdu->hdr.common.plen < expected_hlen) { + plen_error = true; + } + break; + + case SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ: + expected_hlen = sizeof(struct spdk_nvme_tcp_term_req_hdr); + if ((pdu->hdr.common.plen <= expected_hlen) || + (pdu->hdr.common.plen > SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE)) { + plen_error = true; + } + break; + + default: + SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", pdu->hdr.common.pdu_type); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdu_type); + goto err; + } + } + + if (pdu->hdr.common.hlen != expected_hlen) { + SPDK_ERRLOG("PDU type=0x%02x, Expected ICReq header length %u, got %u on tqpair=%p\n", + pdu->hdr.common.pdu_type, + expected_hlen, pdu->hdr.common.hlen, tqpair); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, hlen); + goto err; + } else if (pdo_error) { + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdo); + } else if (plen_error) { + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen); + goto err; + } else { + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); + nvme_tcp_pdu_calc_psh_len(&tqpair->pdu_in_progress, tqpair->host_hdgst_enable); + return; + } +err: + nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset); +} + +static int +nvmf_tcp_pdu_payload_insert_dif(struct nvme_tcp_pdu *pdu, uint32_t read_offset, + int read_len) +{ + int rc; + + rc = spdk_dif_generate_stream(pdu->data_iov, pdu->data_iovcnt, + read_offset, read_len, pdu->dif_ctx); + if (rc != 0) { + SPDK_ERRLOG("DIF generate failed\n"); + } + + return rc; +} + +static int +nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair) +{ + int rc = 0; + struct nvme_tcp_pdu *pdu; + enum nvme_tcp_pdu_recv_state prev_state; + uint32_t data_len; + struct spdk_nvmf_tcp_transport *ttransport = SPDK_CONTAINEROF(tqpair->qpair.transport, + struct spdk_nvmf_tcp_transport, transport); + + /* The loop here is to allow for several back-to-back state changes. 
*/ + do { + prev_state = tqpair->recv_state; + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tqpair(%p) recv pdu entering state %d\n", tqpair, prev_state); + + pdu = &tqpair->pdu_in_progress; + switch (tqpair->recv_state) { + /* Wait for the common header */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: + if (spdk_unlikely(tqpair->state == NVME_TCP_QPAIR_STATE_INITIALIZING)) { + return rc; + } + + rc = nvme_tcp_read_data(tqpair->sock, + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes, + (void *)&pdu->hdr.common + pdu->ch_valid_bytes); + if (rc < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "will disconnect tqpair=%p\n", tqpair); + return NVME_TCP_PDU_FATAL; + } else if (rc > 0) { + pdu->ch_valid_bytes += rc; + spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, 0, rc, 0, 0); + if (spdk_likely(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY)) { + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH); + } + } + + if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + /* The command header of this PDU has now been read from the socket. */ + nvmf_tcp_pdu_ch_handle(tqpair); + break; + /* Wait for the pdu specific header */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: + rc = nvme_tcp_read_data(tqpair->sock, + pdu->psh_len - pdu->psh_valid_bytes, + (void *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes); + if (rc < 0) { + return NVME_TCP_PDU_FATAL; + } else if (rc > 0) { + spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, + 0, rc, 0, 0); + pdu->psh_valid_bytes += rc; + } + + if (pdu->psh_valid_bytes < pdu->psh_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + /* All header(ch, psh, head digist) of this PDU has now been read from the socket. */ + nvmf_tcp_pdu_psh_handle(tqpair, ttransport); + break; + /* Wait for the req slot */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_REQ: + nvmf_tcp_capsule_cmd_hdr_handle(ttransport, tqpair, pdu); + break; + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + /* check whether the data is valid, if not we just return */ + if (!pdu->data_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + data_len = pdu->data_len; + /* data digest */ + if (spdk_unlikely((pdu->hdr.common.pdu_type != SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ) && + tqpair->host_ddgst_enable)) { + data_len += SPDK_NVME_TCP_DIGEST_LEN; + pdu->ddgst_enable = true; + } + + rc = nvme_tcp_read_payload_data(tqpair->sock, pdu); + if (rc < 0) { + return NVME_TCP_PDU_FATAL; + } + pdu->readv_offset += rc; + + if (spdk_unlikely(pdu->dif_ctx != NULL)) { + rc = nvmf_tcp_pdu_payload_insert_dif(pdu, pdu->readv_offset - rc, rc); + if (rc != 0) { + return NVME_TCP_PDU_FATAL; + } + } + + if (pdu->readv_offset < data_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + /* All of this PDU has now been read from the socket. 
*/ + nvmf_tcp_pdu_payload_handle(tqpair, ttransport); + break; + case NVME_TCP_PDU_RECV_STATE_ERROR: + if (!spdk_sock_is_connected(tqpair->sock)) { + return NVME_TCP_PDU_FATAL; + } + break; + default: + assert(0); + SPDK_ERRLOG("code should not come to here"); + break; + } + } while (tqpair->recv_state != prev_state); + + return rc; +} + +static int +nvmf_tcp_req_parse_sgl(struct spdk_nvmf_tcp_req *tcp_req, + struct spdk_nvmf_transport *transport, + struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_request *req = &tcp_req->req; + struct spdk_nvme_cmd *cmd; + struct spdk_nvme_cpl *rsp; + struct spdk_nvme_sgl_descriptor *sgl; + uint32_t length; + + cmd = &req->cmd->nvme_cmd; + rsp = &req->rsp->nvme_cpl; + sgl = &cmd->dptr.sgl1; + + length = sgl->unkeyed.length; + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK && + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_TRANSPORT) { + if (length > transport->opts.max_io_size) { + SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", + length, transport->opts.max_io_size); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + + /* fill request length and populate iovs */ + req->length = length; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Data requested length= 0x%x\n", length); + + if (spdk_unlikely(req->dif.dif_insert_or_strip)) { + req->dif.orig_length = length; + length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx); + req->dif.elba_length = length; + } + + if (spdk_nvmf_request_get_buffers(req, group, transport, length)) { + /* No available buffers. Queue this request up. */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "No available large data buffers. Queueing request %p\n", + tcp_req); + return 0; + } + + /* backward compatible */ + req->data = req->iov[0].iov_base; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Request %p took %d buffer/s from central pool, and data=%p\n", + tcp_req, req->iovcnt, req->data); + + return 0; + } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { + uint64_t offset = sgl->address; + uint32_t max_len = transport->opts.in_capsule_data_size; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", + offset, length); + + if (offset > max_len) { + SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", + offset, max_len); + rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; + return -1; + } + max_len -= (uint32_t)offset; + + if (length > max_len) { + SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", + length, max_len); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + + req->data = tcp_req->buf + offset; + req->data_from_pool = false; + req->length = length; + + if (spdk_unlikely(req->dif.dif_insert_or_strip)) { + length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx); + req->dif.elba_length = length; + } + + req->iov[0].iov_base = req->data; + req->iov[0].iov_len = length; + req->iovcnt = 1; + + return 0; + } + + SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", + sgl->generic.type, sgl->generic.subtype); + rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; + return -1; +} + +static inline enum spdk_nvme_media_error_status_code +nvmf_tcp_dif_error_to_compl_status(uint8_t err_type) { + enum spdk_nvme_media_error_status_code result; + + switch (err_type) + { + case SPDK_DIF_REFTAG_ERROR: + result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR; + break; + case 
SPDK_DIF_APPTAG_ERROR: + result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR; + break; + case SPDK_DIF_GUARD_ERROR: + result = SPDK_NVME_SC_GUARD_CHECK_ERROR; + break; + default: + SPDK_UNREACHABLE(); + break; + } + + return result; +} + +static void +nvmf_tcp_send_c2h_data(struct spdk_nvmf_tcp_qpair *tqpair, + struct spdk_nvmf_tcp_req *tcp_req) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_c2h_data_hdr *c2h_data; + uint32_t plen, pdo, alignment; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n"); + + rsp_pdu = nvmf_tcp_req_pdu_init(tcp_req); + assert(rsp_pdu != NULL); + + c2h_data = &rsp_pdu->hdr.c2h_data; + c2h_data->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_C2H_DATA; + plen = c2h_data->common.hlen = sizeof(*c2h_data); + + if (tqpair->host_hdgst_enable) { + plen += SPDK_NVME_TCP_DIGEST_LEN; + c2h_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + } + + /* set the psh */ + c2h_data->cccid = tcp_req->req.cmd->nvme_cmd.cid; + c2h_data->datal = tcp_req->req.length; + c2h_data->datao = 0; + + /* set the padding */ + rsp_pdu->padding_len = 0; + pdo = plen; + if (tqpair->cpda) { + alignment = (tqpair->cpda + 1) << 2; + if (alignment > plen) { + rsp_pdu->padding_len = alignment - plen; + pdo = plen = alignment; + } + } + + c2h_data->common.pdo = pdo; + plen += c2h_data->datal; + if (tqpair->host_ddgst_enable) { + c2h_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + c2h_data->common.plen = plen; + + if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) { + rsp_pdu->dif_ctx = &tcp_req->req.dif.dif_ctx; + } + + nvme_tcp_pdu_set_data_buf(rsp_pdu, tcp_req->req.iov, tcp_req->req.iovcnt, + c2h_data->datao, c2h_data->datal); + + if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) { + struct spdk_nvme_cpl *rsp = &tcp_req->req.rsp->nvme_cpl; + struct spdk_dif_error err_blk = {}; + + rc = spdk_dif_verify_stream(rsp_pdu->data_iov, rsp_pdu->data_iovcnt, + 0, rsp_pdu->data_len, rsp_pdu->dif_ctx, &err_blk); + if (rc != 0) { + SPDK_ERRLOG("DIF error detected. 
type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR; + rsp->status.sc = nvmf_tcp_dif_error_to_compl_status(err_blk.err_type); + nvmf_tcp_req_pdu_fini(tcp_req); + nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair); + return; + } + } + + c2h_data->common.flags |= SPDK_NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; + if (tqpair->qpair.transport->opts.c2h_success) { + c2h_data->common.flags |= SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS; + } + + nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_pdu_c2h_data_complete, tcp_req); +} + +static int +request_transfer_out(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_tcp_req *tcp_req; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_tcp_qpair *tqpair; + struct spdk_nvme_cpl *rsp; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n"); + + qpair = req->qpair; + rsp = &req->rsp->nvme_cpl; + tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req); + + /* Advance our sq_head pointer */ + if (qpair->sq_head == qpair->sq_head_max) { + qpair->sq_head = 0; + } else { + qpair->sq_head++; + } + rsp->sqhd = qpair->sq_head; + + tqpair = SPDK_CONTAINEROF(tcp_req->req.qpair, struct spdk_nvmf_tcp_qpair, qpair); + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST); + if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + nvmf_tcp_send_c2h_data(tqpair, tcp_req); + } else { + nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair); + } + + return 0; +} + +static void +nvmf_tcp_set_incapsule_data(struct spdk_nvmf_tcp_qpair *tqpair, + struct spdk_nvmf_tcp_req *tcp_req) +{ + struct nvme_tcp_pdu *pdu; + uint32_t plen = 0; + + pdu = &tqpair->pdu_in_progress; + plen = pdu->hdr.common.hlen; + + if (tqpair->host_hdgst_enable) { + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + if (pdu->hdr.common.plen != plen) { + tcp_req->has_incapsule_data = true; + } +} + +static bool +nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport, + struct spdk_nvmf_tcp_req *tcp_req) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + int rc; + enum spdk_nvmf_tcp_req_state prev_state; + bool progress = false; + struct spdk_nvmf_transport *transport = &ttransport->transport; + struct spdk_nvmf_transport_poll_group *group; + + tqpair = SPDK_CONTAINEROF(tcp_req->req.qpair, struct spdk_nvmf_tcp_qpair, qpair); + group = &tqpair->group->group; + assert(tcp_req->state != TCP_REQUEST_STATE_FREE); + + /* If the qpair is not active, we need to abort the outstanding requests. */ + if (tqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + if (tcp_req->state == TCP_REQUEST_STATE_NEED_BUFFER) { + STAILQ_REMOVE(&group->pending_buf_queue, &tcp_req->req, spdk_nvmf_request, buf_link); + } + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_COMPLETED); + } + + /* The loop here is to allow for several back-to-back state changes. */ + do { + prev_state = tcp_req->state; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Request %p entering state %d on tqpair=%p\n", tcp_req, prev_state, + tqpair); + + switch (tcp_req->state) { + case TCP_REQUEST_STATE_FREE: + /* Some external code must kick a request into TCP_REQUEST_STATE_NEW + * to escape this state. 
*/ + break; + case TCP_REQUEST_STATE_NEW: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_NEW, 0, 0, (uintptr_t)tcp_req, 0); + + /* copy the cmd from the receive pdu */ + tcp_req->cmd = tqpair->pdu_in_progress.hdr.capsule_cmd.ccsqe; + + if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&tcp_req->req, &tcp_req->req.dif.dif_ctx))) { + tcp_req->req.dif.dif_insert_or_strip = true; + tqpair->pdu_in_progress.dif_ctx = &tcp_req->req.dif.dif_ctx; + } + + /* The next state transition depends on the data transfer needs of this request. */ + tcp_req->req.xfer = spdk_nvmf_req_get_xfer(&tcp_req->req); + + /* If no data to transfer, ready to execute. */ + if (tcp_req->req.xfer == SPDK_NVME_DATA_NONE) { + /* Reset the tqpair receving pdu state */ + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE); + break; + } + + nvmf_tcp_set_incapsule_data(tqpair, tcp_req); + + if (!tcp_req->has_incapsule_data) { + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + } + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_NEED_BUFFER); + STAILQ_INSERT_TAIL(&group->pending_buf_queue, &tcp_req->req, buf_link); + break; + case TCP_REQUEST_STATE_NEED_BUFFER: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_NEED_BUFFER, 0, 0, (uintptr_t)tcp_req, 0); + + assert(tcp_req->req.xfer != SPDK_NVME_DATA_NONE); + + if (!tcp_req->has_incapsule_data && (&tcp_req->req != STAILQ_FIRST(&group->pending_buf_queue))) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, + "Not the first element to wait for the buf for tcp_req(%p) on tqpair=%p\n", + tcp_req, tqpair); + /* This request needs to wait in line to obtain a buffer */ + break; + } + + /* Try to get a data buffer */ + rc = nvmf_tcp_req_parse_sgl(tcp_req, transport, group); + if (rc < 0) { + STAILQ_REMOVE_HEAD(&group->pending_buf_queue, buf_link); + /* Reset the tqpair receving pdu state */ + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_COMPLETE); + break; + } + + if (!tcp_req->req.data) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "No buffer allocated for tcp_req(%p) on tqpair(%p\n)", + tcp_req, tqpair); + /* No buffers available. */ + break; + } + + STAILQ_REMOVE(&group->pending_buf_queue, &tcp_req->req, spdk_nvmf_request, buf_link); + + /* If data is transferring from host to controller, we need to do a transfer from the host. */ + if (tcp_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + if (tcp_req->req.data_from_pool) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Sending R2T for tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair); + nvmf_tcp_send_r2t_pdu(tqpair, tcp_req); + } else { + struct nvme_tcp_pdu *pdu; + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + + pdu = &tqpair->pdu_in_progress; + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Not need to send r2t for tcp_req(%p) on tqpair=%p\n", tcp_req, + tqpair); + /* No need to send r2t, contained in the capsuled data */ + nvme_tcp_pdu_set_data_buf(pdu, tcp_req->req.iov, tcp_req->req.iovcnt, + 0, tcp_req->req.length); + nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + } + break; + } + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE); + break; + case TCP_REQUEST_STATE_AWAITING_R2T_ACK: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK, 0, 0, (uintptr_t)tcp_req, 0); + /* The R2T completion or the h2c data incoming will kick it out of this state. 
*/ + break; + case TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + + spdk_trace_record(TRACE_TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, + (uintptr_t)tcp_req, 0); + /* Some external code must kick a request into TCP_REQUEST_STATE_READY_TO_EXECUTE + * to escape this state. */ + break; + case TCP_REQUEST_STATE_READY_TO_EXECUTE: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, (uintptr_t)tcp_req, 0); + + if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) { + assert(tcp_req->req.dif.elba_length >= tcp_req->req.length); + tcp_req->req.length = tcp_req->req.dif.elba_length; + } + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTING); + spdk_nvmf_request_exec(&tcp_req->req); + break; + case TCP_REQUEST_STATE_EXECUTING: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_EXECUTING, 0, 0, (uintptr_t)tcp_req, 0); + /* Some external code must kick a request into TCP_REQUEST_STATE_EXECUTED + * to escape this state. */ + break; + case TCP_REQUEST_STATE_EXECUTED: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_EXECUTED, 0, 0, (uintptr_t)tcp_req, 0); + + if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) { + tcp_req->req.length = tcp_req->req.dif.orig_length; + } + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_COMPLETE); + break; + case TCP_REQUEST_STATE_READY_TO_COMPLETE: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, (uintptr_t)tcp_req, 0); + rc = request_transfer_out(&tcp_req->req); + assert(rc == 0); /* No good way to handle this currently */ + break; + case TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, + (uintptr_t)tcp_req, + 0); + /* Some external code must kick a request into TCP_REQUEST_STATE_COMPLETED + * to escape this state. 
*/ + break; + case TCP_REQUEST_STATE_COMPLETED: + spdk_trace_record(TRACE_TCP_REQUEST_STATE_COMPLETED, 0, 0, (uintptr_t)tcp_req, 0); + if (tcp_req->req.data_from_pool) { + spdk_nvmf_request_free_buffers(&tcp_req->req, group, transport); + } + tcp_req->req.length = 0; + tcp_req->req.iovcnt = 0; + tcp_req->req.data = NULL; + + nvmf_tcp_req_pdu_fini(tcp_req); + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_FREE); + break; + case TCP_REQUEST_NUM_STATES: + default: + assert(0); + break; + } + + if (tcp_req->state != prev_state) { + progress = true; + } + } while (tcp_req->state != prev_state); + + return progress; +} + +static void +nvmf_tcp_sock_cb(void *arg, struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_nvmf_tcp_qpair *tqpair = arg; + int rc; + + assert(tqpair != NULL); + rc = nvmf_tcp_sock_process(tqpair); + + /* If there was a new socket error, disconnect */ + if (rc < 0) { + nvmf_tcp_qpair_disconnect(tqpair); + } +} + +static int +nvmf_tcp_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_tcp_poll_group *tgroup; + struct spdk_nvmf_tcp_qpair *tqpair; + int rc; + + tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group); + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + + rc = spdk_sock_group_add_sock(tgroup->sock_group, tqpair->sock, + nvmf_tcp_sock_cb, tqpair); + if (rc != 0) { + SPDK_ERRLOG("Could not add sock to sock_group: %s (%d)\n", + spdk_strerror(errno), errno); + return -1; + } + + rc = nvmf_tcp_qpair_sock_init(tqpair); + if (rc != 0) { + SPDK_ERRLOG("Cannot set sock opt for tqpair=%p\n", tqpair); + return -1; + } + + rc = nvmf_tcp_qpair_init(&tqpair->qpair); + if (rc < 0) { + SPDK_ERRLOG("Cannot init tqpair=%p\n", tqpair); + return -1; + } + + rc = nvmf_tcp_qpair_init_mem_resource(tqpair); + if (rc < 0) { + SPDK_ERRLOG("Cannot init memory resource info for tqpair=%p\n", tqpair); + return -1; + } + + tqpair->group = tgroup; + tqpair->state = NVME_TCP_QPAIR_STATE_INVALID; + TAILQ_INSERT_TAIL(&tgroup->qpairs, tqpair, link); + + return 0; +} + +static int +nvmf_tcp_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_tcp_poll_group *tgroup; + struct spdk_nvmf_tcp_qpair *tqpair; + int rc; + + tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group); + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + + assert(tqpair->group == tgroup); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "remove tqpair=%p from the tgroup=%p\n", tqpair, tgroup); + if (tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_REQ) { + TAILQ_REMOVE(&tgroup->await_req, tqpair, link); + } else { + TAILQ_REMOVE(&tgroup->qpairs, tqpair, link); + } + + rc = spdk_sock_group_remove_sock(tgroup->sock_group, tqpair->sock); + if (rc != 0) { + SPDK_ERRLOG("Could not remove sock from sock_group: %s (%d)\n", + spdk_strerror(errno), errno); + } + + return rc; +} + +static int +nvmf_tcp_req_complete(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_tcp_req *tcp_req; + + ttransport = SPDK_CONTAINEROF(req->qpair->transport, struct spdk_nvmf_tcp_transport, transport); + tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req); + + nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTED); + nvmf_tcp_req_process(ttransport, tcp_req); + + return 0; +} + +static void +nvmf_tcp_close_qpair(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_tcp_qpair 
*tqpair; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Qpair: %p\n", qpair); + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + tqpair->state = NVME_TCP_QPAIR_STATE_EXITED; + nvmf_tcp_qpair_destroy(tqpair); +} + +static int +nvmf_tcp_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_tcp_poll_group *tgroup; + int rc; + struct spdk_nvmf_request *req, *req_tmp; + struct spdk_nvmf_tcp_req *tcp_req; + struct spdk_nvmf_tcp_qpair *tqpair, *tqpair_tmp; + struct spdk_nvmf_tcp_transport *ttransport = SPDK_CONTAINEROF(group->transport, + struct spdk_nvmf_tcp_transport, transport); + + tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group); + + if (spdk_unlikely(TAILQ_EMPTY(&tgroup->qpairs) && TAILQ_EMPTY(&tgroup->await_req))) { + return 0; + } + + STAILQ_FOREACH_SAFE(req, &group->pending_buf_queue, buf_link, req_tmp) { + tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req); + if (nvmf_tcp_req_process(ttransport, tcp_req) == false) { + break; + } + } + + rc = spdk_sock_group_poll(tgroup->sock_group); + if (rc < 0) { + SPDK_ERRLOG("Failed to poll sock_group=%p\n", tgroup->sock_group); + } + + TAILQ_FOREACH_SAFE(tqpair, &tgroup->await_req, link, tqpair_tmp) { + nvmf_tcp_sock_process(tqpair); + } + + return rc; +} + +static int +nvmf_tcp_qpair_get_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid, bool peer) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + uint16_t port; + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_TCP); + + if (peer) { + snprintf(trid->traddr, sizeof(trid->traddr), "%s", tqpair->initiator_addr); + port = tqpair->initiator_port; + } else { + snprintf(trid->traddr, sizeof(trid->traddr), "%s", tqpair->target_addr); + port = tqpair->target_port; + } + + if (spdk_sock_is_ipv4(tqpair->sock)) { + trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; + } else if (spdk_sock_is_ipv6(tqpair->sock)) { + trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; + } else { + return -1; + } + + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%d", port); + return 0; +} + +static int +nvmf_tcp_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_tcp_qpair_get_trid(qpair, trid, 0); +} + +static int +nvmf_tcp_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_tcp_qpair_get_trid(qpair, trid, 1); +} + +static int +nvmf_tcp_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return nvmf_tcp_qpair_get_trid(qpair, trid, 0); +} + +static void +nvmf_tcp_req_set_abort_status(struct spdk_nvmf_request *req, + struct spdk_nvmf_tcp_req *tcp_req_to_abort) +{ + tcp_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + tcp_req_to_abort->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + + nvmf_tcp_req_set_state(tcp_req_to_abort, TCP_REQUEST_STATE_READY_TO_COMPLETE); + + req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command was successfully aborted. 
*/ +} + +static int +_nvmf_tcp_qpair_abort_request(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_tcp_req *tcp_req_to_abort = SPDK_CONTAINEROF(req->req_to_abort, + struct spdk_nvmf_tcp_req, req); + struct spdk_nvmf_tcp_qpair *tqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair, + struct spdk_nvmf_tcp_qpair, qpair); + int rc; + + spdk_poller_unregister(&req->poller); + + switch (tcp_req_to_abort->state) { + case TCP_REQUEST_STATE_EXECUTING: + rc = nvmf_ctrlr_abort_request(req); + if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) { + return SPDK_POLLER_BUSY; + } + break; + + case TCP_REQUEST_STATE_NEED_BUFFER: + STAILQ_REMOVE(&tqpair->group->group.pending_buf_queue, + &tcp_req_to_abort->req, spdk_nvmf_request, buf_link); + + nvmf_tcp_req_set_abort_status(req, tcp_req_to_abort); + break; + + case TCP_REQUEST_STATE_AWAITING_R2T_ACK: + nvmf_tcp_req_set_abort_status(req, tcp_req_to_abort); + break; + + case TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + if (spdk_get_ticks() < req->timeout_tsc) { + req->poller = SPDK_POLLER_REGISTER(_nvmf_tcp_qpair_abort_request, req, 0); + return SPDK_POLLER_BUSY; + } + break; + + default: + break; + } + + spdk_nvmf_request_complete(req); + return SPDK_POLLER_BUSY; +} + +static void +nvmf_tcp_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_tcp_qpair *tqpair; + struct spdk_nvmf_tcp_transport *ttransport; + struct spdk_nvmf_transport *transport; + uint16_t cid; + uint32_t i; + struct spdk_nvmf_tcp_req *tcp_req_to_abort = NULL; + + tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair); + ttransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_tcp_transport, transport); + transport = &ttransport->transport; + + cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid; + + for (i = 0; i < tqpair->resource_count; i++) { + tcp_req_to_abort = &tqpair->reqs[i]; + + if (tcp_req_to_abort->state != TCP_REQUEST_STATE_FREE && + tcp_req_to_abort->req.cmd->nvme_cmd.cid == cid) { + break; + } + } + + if (tcp_req_to_abort == NULL) { + spdk_nvmf_request_complete(req); + return; + } + + req->req_to_abort = &tcp_req_to_abort->req; + req->timeout_tsc = spdk_get_ticks() + + transport->opts.abort_timeout_sec * spdk_get_ticks_hz(); + req->poller = NULL; + + _nvmf_tcp_qpair_abort_request(req); +} + +#define SPDK_NVMF_TCP_DEFAULT_MAX_QUEUE_DEPTH 128 +#define SPDK_NVMF_TCP_DEFAULT_AQ_DEPTH 128 +#define SPDK_NVMF_TCP_DEFAULT_MAX_QPAIRS_PER_CTRLR 128 +#define SPDK_NVMF_TCP_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 +#define SPDK_NVMF_TCP_DEFAULT_MAX_IO_SIZE 131072 +#define SPDK_NVMF_TCP_DEFAULT_IO_UNIT_SIZE 131072 +#define SPDK_NVMF_TCP_DEFAULT_NUM_SHARED_BUFFERS 511 +#define SPDK_NVMF_TCP_DEFAULT_BUFFER_CACHE_SIZE 32 +#define SPDK_NVMF_TCP_DEFAULT_SUCCESS_OPTIMIZATION true +#define SPDK_NVMF_TCP_DEFAULT_DIF_INSERT_OR_STRIP false +#define SPDK_NVMF_TCP_DEFAULT_SOCK_PRIORITY 0 +#define SPDK_NVMF_TCP_DEFAULT_ABORT_TIMEOUT_SEC 1 + +static void +nvmf_tcp_opts_init(struct spdk_nvmf_transport_opts *opts) +{ + opts->max_queue_depth = SPDK_NVMF_TCP_DEFAULT_MAX_QUEUE_DEPTH; + opts->max_qpairs_per_ctrlr = SPDK_NVMF_TCP_DEFAULT_MAX_QPAIRS_PER_CTRLR; + opts->in_capsule_data_size = SPDK_NVMF_TCP_DEFAULT_IN_CAPSULE_DATA_SIZE; + opts->max_io_size = SPDK_NVMF_TCP_DEFAULT_MAX_IO_SIZE; + opts->io_unit_size = SPDK_NVMF_TCP_DEFAULT_IO_UNIT_SIZE; + opts->max_aq_depth = SPDK_NVMF_TCP_DEFAULT_AQ_DEPTH; + opts->num_shared_buffers = SPDK_NVMF_TCP_DEFAULT_NUM_SHARED_BUFFERS; + opts->buf_cache_size = 
SPDK_NVMF_TCP_DEFAULT_BUFFER_CACHE_SIZE; + opts->c2h_success = SPDK_NVMF_TCP_DEFAULT_SUCCESS_OPTIMIZATION; + opts->dif_insert_or_strip = SPDK_NVMF_TCP_DEFAULT_DIF_INSERT_OR_STRIP; + opts->sock_priority = SPDK_NVMF_TCP_DEFAULT_SOCK_PRIORITY; + opts->abort_timeout_sec = SPDK_NVMF_TCP_DEFAULT_ABORT_TIMEOUT_SEC; +} + +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_tcp = { + .name = "TCP", + .type = SPDK_NVME_TRANSPORT_TCP, + .opts_init = nvmf_tcp_opts_init, + .create = nvmf_tcp_create, + .destroy = nvmf_tcp_destroy, + + .listen = nvmf_tcp_listen, + .stop_listen = nvmf_tcp_stop_listen, + .accept = nvmf_tcp_accept, + + .listener_discover = nvmf_tcp_discover, + + .poll_group_create = nvmf_tcp_poll_group_create, + .get_optimal_poll_group = nvmf_tcp_get_optimal_poll_group, + .poll_group_destroy = nvmf_tcp_poll_group_destroy, + .poll_group_add = nvmf_tcp_poll_group_add, + .poll_group_remove = nvmf_tcp_poll_group_remove, + .poll_group_poll = nvmf_tcp_poll_group_poll, + + .req_free = nvmf_tcp_req_free, + .req_complete = nvmf_tcp_req_complete, + + .qpair_fini = nvmf_tcp_close_qpair, + .qpair_get_local_trid = nvmf_tcp_qpair_get_local_trid, + .qpair_get_peer_trid = nvmf_tcp_qpair_get_peer_trid, + .qpair_get_listen_trid = nvmf_tcp_qpair_get_listen_trid, + .qpair_abort_request = nvmf_tcp_qpair_abort_request, +}; + +SPDK_NVMF_TRANSPORT_REGISTER(tcp, &spdk_nvmf_transport_tcp); +SPDK_LOG_REGISTER_COMPONENT("nvmf_tcp", SPDK_LOG_NVMF_TCP) diff --git a/src/spdk/lib/nvmf/transport.c b/src/spdk/lib/nvmf/transport.c new file mode 100644 index 000000000..11bb152df --- /dev/null +++ b/src/spdk/lib/nvmf/transport.c @@ -0,0 +1,572 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2018-2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/config.h" +#include "spdk/log.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_transport.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +#define MAX_MEMPOOL_NAME_LENGTH 40 + +struct nvmf_transport_ops_list_element { + struct spdk_nvmf_transport_ops ops; + TAILQ_ENTRY(nvmf_transport_ops_list_element) link; +}; + +TAILQ_HEAD(nvmf_transport_ops_list, nvmf_transport_ops_list_element) +g_spdk_nvmf_transport_ops = TAILQ_HEAD_INITIALIZER(g_spdk_nvmf_transport_ops); + +static inline const struct spdk_nvmf_transport_ops * +nvmf_get_transport_ops(const char *transport_name) +{ + struct nvmf_transport_ops_list_element *ops; + TAILQ_FOREACH(ops, &g_spdk_nvmf_transport_ops, link) { + if (strcasecmp(transport_name, ops->ops.name) == 0) { + return &ops->ops; + } + } + return NULL; +} + +void +spdk_nvmf_transport_register(const struct spdk_nvmf_transport_ops *ops) +{ + struct nvmf_transport_ops_list_element *new_ops; + + if (nvmf_get_transport_ops(ops->name) != NULL) { + SPDK_ERRLOG("Double registering nvmf transport type %s.\n", ops->name); + assert(false); + return; + } + + new_ops = calloc(1, sizeof(*new_ops)); + if (new_ops == NULL) { + SPDK_ERRLOG("Unable to allocate memory to register new transport type %s.\n", ops->name); + assert(false); + return; + } + + new_ops->ops = *ops; + + TAILQ_INSERT_TAIL(&g_spdk_nvmf_transport_ops, new_ops, link); +} + +const struct spdk_nvmf_transport_opts * +spdk_nvmf_get_transport_opts(struct spdk_nvmf_transport *transport) +{ + return &transport->opts; +} + +spdk_nvme_transport_type_t +spdk_nvmf_get_transport_type(struct spdk_nvmf_transport *transport) +{ + return transport->ops->type; +} + +const char * +spdk_nvmf_get_transport_name(struct spdk_nvmf_transport *transport) +{ + return transport->ops->name; +} + +struct spdk_nvmf_transport * +spdk_nvmf_transport_create(const char *transport_name, struct spdk_nvmf_transport_opts *opts) +{ + const struct spdk_nvmf_transport_ops *ops = NULL; + struct spdk_nvmf_transport *transport; + char spdk_mempool_name[MAX_MEMPOOL_NAME_LENGTH]; + int chars_written; + + ops = nvmf_get_transport_ops(transport_name); + if (!ops) { + SPDK_ERRLOG("Transport type '%s' unavailable.\n", transport_name); + return NULL; + } + + if (opts->max_aq_depth < SPDK_NVMF_MIN_ADMIN_MAX_SQ_SIZE) { + SPDK_ERRLOG("max_aq_depth %u is less than minimum defined by NVMf spec, use min value\n", + opts->max_aq_depth); + opts->max_aq_depth = SPDK_NVMF_MIN_ADMIN_MAX_SQ_SIZE; + } + + transport = ops->create(opts); + if (!transport) { + SPDK_ERRLOG("Unable to create new transport of type %s\n", transport_name); + return NULL; + } + + TAILQ_INIT(&transport->listeners); + + transport->ops = ops; + transport->opts = *opts; + chars_written = snprintf(spdk_mempool_name, MAX_MEMPOOL_NAME_LENGTH, "%s_%s_%s", "spdk_nvmf", + transport_name, "data"); + if (chars_written < 0) { + SPDK_ERRLOG("Unable to generate transport data buffer pool name.\n"); + ops->destroy(transport); + return NULL; + } + + transport->data_buf_pool = spdk_mempool_create(spdk_mempool_name, + opts->num_shared_buffers, + opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + + if (!transport->data_buf_pool) { + SPDK_ERRLOG("Unable to allocate buffer pool for poll group\n"); + ops->destroy(transport); + return NULL; + } + + return transport; +} + +struct spdk_nvmf_transport * +spdk_nvmf_transport_get_first(struct spdk_nvmf_tgt *tgt) +{ + 
return TAILQ_FIRST(&tgt->transports); +} + +struct spdk_nvmf_transport * +spdk_nvmf_transport_get_next(struct spdk_nvmf_transport *transport) +{ + return TAILQ_NEXT(transport, link); +} + +int +spdk_nvmf_transport_destroy(struct spdk_nvmf_transport *transport) +{ + if (transport->data_buf_pool != NULL) { + if (spdk_mempool_count(transport->data_buf_pool) != + transport->opts.num_shared_buffers) { + SPDK_ERRLOG("transport buffer pool count is %zu but should be %u\n", + spdk_mempool_count(transport->data_buf_pool), + transport->opts.num_shared_buffers); + } + } + + spdk_mempool_free(transport->data_buf_pool); + + return transport->ops->destroy(transport); +} + +struct spdk_nvmf_listener * +nvmf_transport_find_listener(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + + TAILQ_FOREACH(listener, &transport->listeners, link) { + if (spdk_nvme_transport_id_compare(&listener->trid, trid) == 0) { + return listener; + } + } + + return NULL; +} + +int +spdk_nvmf_transport_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + int rc; + + listener = nvmf_transport_find_listener(transport, trid); + if (!listener) { + listener = calloc(1, sizeof(*listener)); + if (!listener) { + return -ENOMEM; + } + + listener->ref = 1; + listener->trid = *trid; + TAILQ_INSERT_TAIL(&transport->listeners, listener, link); + + rc = transport->ops->listen(transport, &listener->trid); + if (rc != 0) { + TAILQ_REMOVE(&transport->listeners, listener, link); + free(listener); + } + return rc; + } + + ++listener->ref; + + return 0; +} + +int +spdk_nvmf_transport_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + + listener = nvmf_transport_find_listener(transport, trid); + if (!listener) { + return -ENOENT; + } + + if (--listener->ref == 0) { + TAILQ_REMOVE(&transport->listeners, listener, link); + transport->ops->stop_listen(transport, trid); + free(listener); + } + + return 0; +} + +uint32_t +nvmf_transport_accept(struct spdk_nvmf_transport *transport) +{ + return transport->ops->accept(transport); +} + +void +nvmf_transport_listener_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + transport->ops->listener_discover(transport, trid, entry); +} + +struct spdk_nvmf_transport_poll_group * +nvmf_transport_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_transport_poll_group *group; + struct spdk_nvmf_transport_pg_cache_buf *buf; + + group = transport->ops->poll_group_create(transport); + if (!group) { + return NULL; + } + group->transport = transport; + + STAILQ_INIT(&group->pending_buf_queue); + STAILQ_INIT(&group->buf_cache); + + if (transport->opts.buf_cache_size) { + group->buf_cache_count = 0; + group->buf_cache_size = transport->opts.buf_cache_size; + while (group->buf_cache_count < group->buf_cache_size) { + buf = (struct spdk_nvmf_transport_pg_cache_buf *)spdk_mempool_get(transport->data_buf_pool); + if (!buf) { + SPDK_NOTICELOG("Unable to reserve the full number of buffers for the pg buffer cache.\n"); + break; + } + STAILQ_INSERT_HEAD(&group->buf_cache, buf, link); + group->buf_cache_count++; + } + } + return group; +} + +struct spdk_nvmf_transport_poll_group * +nvmf_transport_get_optimal_poll_group(struct spdk_nvmf_transport *transport, + struct 
spdk_nvmf_qpair *qpair) +{ + if (transport->ops->get_optimal_poll_group) { + return transport->ops->get_optimal_poll_group(qpair); + } else { + return NULL; + } +} + +void +nvmf_transport_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_transport_pg_cache_buf *buf, *tmp; + + if (!STAILQ_EMPTY(&group->pending_buf_queue)) { + SPDK_ERRLOG("Pending I/O list wasn't empty on poll group destruction\n"); + } + + STAILQ_FOREACH_SAFE(buf, &group->buf_cache, link, tmp) { + STAILQ_REMOVE(&group->buf_cache, buf, spdk_nvmf_transport_pg_cache_buf, link); + spdk_mempool_put(group->transport->data_buf_pool, buf); + } + group->transport->ops->poll_group_destroy(group); +} + +int +nvmf_transport_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + if (qpair->transport) { + assert(qpair->transport == group->transport); + if (qpair->transport != group->transport) { + return -1; + } + } else { + qpair->transport = group->transport; + } + + return group->transport->ops->poll_group_add(group, qpair); +} + +int +nvmf_transport_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + int rc = ENOTSUP; + + assert(qpair->transport == group->transport); + if (group->transport->ops->poll_group_remove) { + rc = group->transport->ops->poll_group_remove(group, qpair); + } + + return rc; +} + +int +nvmf_transport_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + return group->transport->ops->poll_group_poll(group); +} + +int +nvmf_transport_req_free(struct spdk_nvmf_request *req) +{ + return req->qpair->transport->ops->req_free(req); +} + +int +nvmf_transport_req_complete(struct spdk_nvmf_request *req) +{ + return req->qpair->transport->ops->req_complete(req); +} + +void +nvmf_transport_qpair_fini(struct spdk_nvmf_qpair *qpair) +{ + qpair->transport->ops->qpair_fini(qpair); +} + +int +nvmf_transport_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_peer_trid(qpair, trid); +} + +int +nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_local_trid(qpair, trid); +} + +int +nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_listen_trid(qpair, trid); +} + +void +nvmf_transport_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req) +{ + qpair->transport->ops->qpair_abort_request(qpair, req); +} + +bool +spdk_nvmf_transport_opts_init(const char *transport_name, + struct spdk_nvmf_transport_opts *opts) +{ + const struct spdk_nvmf_transport_ops *ops; + + ops = nvmf_get_transport_ops(transport_name); + if (!ops) { + SPDK_ERRLOG("Transport type %s unavailable.\n", transport_name); + return false; + } + + ops->opts_init(opts); + return true; +} + +int +spdk_nvmf_transport_poll_group_get_stat(struct spdk_nvmf_tgt *tgt, + struct spdk_nvmf_transport *transport, + struct spdk_nvmf_transport_poll_group_stat **stat) +{ + if (transport->ops->poll_group_get_stat) { + return transport->ops->poll_group_get_stat(tgt, stat); + } else { + return -ENOTSUP; + } +} + +void +spdk_nvmf_transport_poll_group_free_stat(struct spdk_nvmf_transport *transport, + struct spdk_nvmf_transport_poll_group_stat *stat) +{ + if (transport->ops->poll_group_free_stat) { + transport->ops->poll_group_free_stat(stat); + } 
+} + +void +spdk_nvmf_request_free_buffers(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport) +{ + uint32_t i; + + for (i = 0; i < req->iovcnt; i++) { + if (group->buf_cache_count < group->buf_cache_size) { + STAILQ_INSERT_HEAD(&group->buf_cache, + (struct spdk_nvmf_transport_pg_cache_buf *)req->buffers[i], + link); + group->buf_cache_count++; + } else { + spdk_mempool_put(transport->data_buf_pool, req->buffers[i]); + } + req->iov[i].iov_base = NULL; + req->buffers[i] = NULL; + req->iov[i].iov_len = 0; + } + req->data_from_pool = false; +} + +static inline int +nvmf_request_set_buffer(struct spdk_nvmf_request *req, void *buf, uint32_t length, + uint32_t io_unit_size) +{ + req->buffers[req->iovcnt] = buf; + req->iov[req->iovcnt].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) & + ~NVMF_DATA_BUFFER_MASK); + req->iov[req->iovcnt].iov_len = spdk_min(length, io_unit_size); + length -= req->iov[req->iovcnt].iov_len; + req->iovcnt++; + + return length; +} + +static int +nvmf_request_get_buffers(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport, + uint32_t length) +{ + uint32_t io_unit_size = transport->opts.io_unit_size; + uint32_t num_buffers; + uint32_t i = 0, j; + void *buffer, *buffers[NVMF_REQ_MAX_BUFFERS]; + + /* If the number of buffers is too large, then we know the I/O is larger than allowed. + * Fail it. + */ + num_buffers = SPDK_CEIL_DIV(length, io_unit_size); + if (num_buffers + req->iovcnt > NVMF_REQ_MAX_BUFFERS) { + return -EINVAL; + } + + while (i < num_buffers) { + if (!(STAILQ_EMPTY(&group->buf_cache))) { + group->buf_cache_count--; + buffer = STAILQ_FIRST(&group->buf_cache); + STAILQ_REMOVE_HEAD(&group->buf_cache, link); + assert(buffer != NULL); + + length = nvmf_request_set_buffer(req, buffer, length, io_unit_size); + i++; + } else { + if (spdk_mempool_get_bulk(transport->data_buf_pool, buffers, + num_buffers - i)) { + return -ENOMEM; + } + for (j = 0; j < num_buffers - i; j++) { + length = nvmf_request_set_buffer(req, buffers[j], length, io_unit_size); + } + i += num_buffers - i; + } + } + + assert(length == 0); + + req->data_from_pool = true; + return 0; +} + +int +spdk_nvmf_request_get_buffers(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport, + uint32_t length) +{ + int rc; + + req->iovcnt = 0; + + rc = nvmf_request_get_buffers(req, group, transport, length); + if (rc == -ENOMEM) { + spdk_nvmf_request_free_buffers(req, group, transport); + } + + return rc; +} + +int +spdk_nvmf_request_get_buffers_multi(struct spdk_nvmf_request *req, + struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_transport *transport, + uint32_t *lengths, uint32_t num_lengths) +{ + int rc = 0; + uint32_t i; + + req->iovcnt = 0; + + for (i = 0; i < num_lengths; i++) { + rc = nvmf_request_get_buffers(req, group, transport, lengths[i]); + if (rc != 0) { + goto err_exit; + } + } + + return 0; + +err_exit: + spdk_nvmf_request_free_buffers(req, group, transport); + return rc; +} diff --git a/src/spdk/lib/nvmf/transport.h b/src/spdk/lib/nvmf/transport.h new file mode 100644 index 000000000..38b5d8db3 --- /dev/null +++ b/src/spdk/lib/nvmf/transport.h @@ -0,0 +1,82 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_NVMF_TRANSPORT_H +#define SPDK_NVMF_TRANSPORT_H + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_transport.h" + +uint32_t nvmf_transport_accept(struct spdk_nvmf_transport *transport); + +void nvmf_transport_listener_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry); + +struct spdk_nvmf_transport_poll_group *nvmf_transport_poll_group_create( + struct spdk_nvmf_transport *transport); +struct spdk_nvmf_transport_poll_group *nvmf_transport_get_optimal_poll_group( + struct spdk_nvmf_transport *transport, struct spdk_nvmf_qpair *qpair); + +void nvmf_transport_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group); + +int nvmf_transport_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair); + +int nvmf_transport_poll_group_remove(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair); + +int nvmf_transport_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); + +int nvmf_transport_req_free(struct spdk_nvmf_request *req); + +int nvmf_transport_req_complete(struct spdk_nvmf_request *req); + +void nvmf_transport_qpair_fini(struct spdk_nvmf_qpair *qpair); + +int nvmf_transport_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +int nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +int nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +void nvmf_transport_qpair_abort_request(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_request *req); + +#endif /* SPDK_NVMF_TRANSPORT_H */ diff --git a/src/spdk/lib/rdma/Makefile b/src/spdk/lib/rdma/Makefile new file mode 100644 index 000000000..e6374557d --- /dev/null +++ b/src/spdk/lib/rdma/Makefile @@ -0,0 +1,70 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. 
All rights reserved. +# Copyright (c) Mellanox Technologies LTD. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 1 +SO_MINOR := 0 + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_rdma.map) + +LIBNAME = rdma + +ifeq ($(CONFIG_RDMA_PROV),verbs) +C_SRCS = rdma_verbs.c +else ifeq ($(CONFIG_RDMA_PROV),mlx5_dv) +C_SRCS = rdma_mlx5_dv.c +LOCAL_SYS_LIBS += -lmlx5 +else +$(error Wrong RDMA provider specified: $(CONFIG_RDMA_PROV)) +endif + +LOCAL_SYS_LIBS += -libverbs -lrdmacm +#Attach only if FreeBSD and RDMA is specified with configure +ifeq ($(OS),FreeBSD) +# Mellanox - MLX4 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx4.*)","") +LOCAL_SYS_LIBS += -lmlx4 +endif +# Mellanox - MLX5 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx5.*)","") +LOCAL_SYS_LIBS += -lmlx5 +endif +# Chelsio HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libcxgb4.*)","") +LOCAL_SYS_LIBS += -lcxgb4 +endif +endif + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/rdma/rdma_mlx5_dv.c b/src/spdk/lib/rdma/rdma_mlx5_dv.c new file mode 100644 index 000000000..bae3afdda --- /dev/null +++ b/src/spdk/lib/rdma/rdma_mlx5_dv.c @@ -0,0 +1,316 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <rdma/rdma_cma.h> +#include <infiniband/mlx5dv.h> + +#include "spdk/stdinc.h" +#include "spdk/string.h" +#include "spdk/likely.h" + +#include "spdk_internal/rdma.h" +#include "spdk_internal/log.h" + +struct spdk_rdma_mlx5_dv_qp { + struct spdk_rdma_qp common; + struct ibv_qp_ex *qpex; +}; + +static int +rdma_mlx5_dv_init_qpair(struct spdk_rdma_mlx5_dv_qp *mlx5_qp) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, rc; + + qp_attr.qp_state = IBV_QPS_INIT; + rc = rdma_init_qp_attr(mlx5_qp->common.cm_id, &qp_attr, &qp_attr_mask); + if (rc) { + SPDK_ERRLOG("Failed to init attr IBV_QPS_INIT, errno %s (%d)\n", spdk_strerror(errno), errno); + return rc; + } + + rc = ibv_modify_qp(mlx5_qp->common.qp, &qp_attr, qp_attr_mask); + if (rc) { + SPDK_ERRLOG("ibv_modify_qp(IBV_QPS_INIT) failed, rc %d\n", rc); + return rc; + } + + qp_attr.qp_state = IBV_QPS_RTR; + rc = rdma_init_qp_attr(mlx5_qp->common.cm_id, &qp_attr, &qp_attr_mask); + if (rc) { + SPDK_ERRLOG("Failed to init attr IBV_QPS_RTR, errno %s (%d)\n", spdk_strerror(errno), errno); + return rc; + } + + rc = ibv_modify_qp(mlx5_qp->common.qp, &qp_attr, qp_attr_mask); + if (rc) { + SPDK_ERRLOG("ibv_modify_qp(IBV_QPS_RTR) failed, rc %d\n", rc); + return rc; + } + + qp_attr.qp_state = IBV_QPS_RTS; + rc = rdma_init_qp_attr(mlx5_qp->common.cm_id, &qp_attr, &qp_attr_mask); + if (rc) { + SPDK_ERRLOG("Failed to init attr IBV_QPS_RTR, errno %s (%d)\n", spdk_strerror(errno), errno); + return rc; + } + + rc = ibv_modify_qp(mlx5_qp->common.qp, &qp_attr, qp_attr_mask); + if (rc) { + SPDK_ERRLOG("ibv_modify_qp(IBV_QPS_RTS) failed, rc %d\n", rc); + } + + return rc; +} + +struct spdk_rdma_qp * +spdk_rdma_qp_create(struct rdma_cm_id *cm_id, struct spdk_rdma_qp_init_attr *qp_attr) +{ + assert(cm_id); + assert(qp_attr); + + struct ibv_qp *qp; + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + struct ibv_qp_init_attr_ex dv_qp_attr = { + .qp_context = qp_attr->qp_context, + .send_cq = qp_attr->send_cq, + .recv_cq = qp_attr->recv_cq, + .srq = qp_attr->srq, + .cap = qp_attr->cap, + .qp_type = IBV_QPT_RC, + .comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS, + .pd = qp_attr->pd ? 
qp_attr->pd : cm_id->pd + }; + + assert(dv_qp_attr.pd); + + mlx5_qp = calloc(1, sizeof(*mlx5_qp)); + if (!mlx5_qp) { + SPDK_ERRLOG("qp memory allocation failed\n"); + return NULL; + } + + qp = mlx5dv_create_qp(cm_id->verbs, &dv_qp_attr, NULL); + + if (!qp) { + SPDK_ERRLOG("Failed to create qpair, errno %s (%d)\n", spdk_strerror(errno), errno); + free(mlx5_qp); + return NULL; + } + + mlx5_qp->common.qp = qp; + mlx5_qp->common.cm_id = cm_id; + mlx5_qp->qpex = ibv_qp_to_qp_ex(qp); + + if (!mlx5_qp->qpex) { + spdk_rdma_qp_destroy(&mlx5_qp->common); + return NULL; + } + + qp_attr->cap = dv_qp_attr.cap; + + return &mlx5_qp->common; +} + +int +spdk_rdma_qp_accept(struct spdk_rdma_qp *spdk_rdma_qp, struct rdma_conn_param *conn_param) +{ + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + + assert(spdk_rdma_qp != NULL); + assert(spdk_rdma_qp->cm_id != NULL); + + mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common); + + /* NVMEoF target must move qpair to RTS state */ + if (rdma_mlx5_dv_init_qpair(mlx5_qp) != 0) { + SPDK_ERRLOG("Failed to initialize qpair\n"); + /* Set errno to be compliant with rdma_accept behaviour */ + errno = ECONNABORTED; + return -1; + } + + return rdma_accept(spdk_rdma_qp->cm_id, conn_param); +} + +int +spdk_rdma_qp_complete_connect(struct spdk_rdma_qp *spdk_rdma_qp) +{ + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + int rc; + + assert(spdk_rdma_qp); + + mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common); + + rc = rdma_mlx5_dv_init_qpair(mlx5_qp); + if (rc) { + SPDK_ERRLOG("Failed to initialize qpair\n"); + return rc; + } + + rc = rdma_establish(mlx5_qp->common.cm_id); + if (rc) { + SPDK_ERRLOG("rdma_establish failed, errno %s (%d)\n", spdk_strerror(errno), errno); + } + + return rc; +} + +void +spdk_rdma_qp_destroy(struct spdk_rdma_qp *spdk_rdma_qp) +{ + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + int rc; + + assert(spdk_rdma_qp != NULL); + + mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common); + + if (spdk_rdma_qp->send_wrs.first != NULL) { + SPDK_WARNLOG("Destroying qpair with queued Work Requests\n"); + } + + if (mlx5_qp->common.qp) { + rc = ibv_destroy_qp(mlx5_qp->common.qp); + if (rc) { + SPDK_ERRLOG("Failed to destroy ibv qp %p, rc %d\n", mlx5_qp->common.qp, rc); + } + } + + free(mlx5_qp); +} + +int +spdk_rdma_qp_disconnect(struct spdk_rdma_qp *spdk_rdma_qp) +{ + int rc = 0; + + assert(spdk_rdma_qp != NULL); + + if (spdk_rdma_qp->qp) { + struct ibv_qp_attr qp_attr = {.qp_state = IBV_QPS_ERR}; + + rc = ibv_modify_qp(spdk_rdma_qp->qp, &qp_attr, IBV_QP_STATE); + if (rc) { + SPDK_ERRLOG("Failed to modify ibv qp %p state to ERR, rc %d\n", spdk_rdma_qp->qp, rc); + return rc; + } + } + + if (spdk_rdma_qp->cm_id) { + rc = rdma_disconnect(spdk_rdma_qp->cm_id); + if (rc) { + SPDK_ERRLOG("rdma_disconnect failed, errno %s (%d)\n", spdk_strerror(errno), errno); + } + } + + return rc; +} + +bool +spdk_rdma_qp_queue_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr *first) +{ + struct ibv_send_wr *tmp; + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + bool is_first; + + assert(spdk_rdma_qp); + assert(first); + + is_first = spdk_rdma_qp->send_wrs.first == NULL; + mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common); + + if (is_first) { + ibv_wr_start(mlx5_qp->qpex); + spdk_rdma_qp->send_wrs.first = first; + } else { + spdk_rdma_qp->send_wrs.last->next = first; + } + + for (tmp = first; tmp != NULL; tmp = tmp->next) { + mlx5_qp->qpex->wr_id = tmp->wr_id; + mlx5_qp->qpex->wr_flags = tmp->send_flags; 
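spdk_rdma_qp_accept() and spdk_rdma_qp_complete_connect() above exist because, with the mlx5_dv provider, the QP is created by mlx5dv_create_qp() rather than by librdmacm, so the INIT -> RTR -> RTS transitions must be driven explicitly (rdma_mlx5_dv_init_qpair()) and the active side has to finish the handshake with rdma_establish(). A rough usage sketch against the RDMA CM event loop follows; the event handling is an assumption for illustration, not code from this patch.

#include <rdma/rdma_cma.h>
#include "spdk_internal/rdma.h"

static void
my_on_cm_event(struct rdma_cm_event *event, struct spdk_rdma_qp *rqp,
	       struct rdma_conn_param *conn_param)
{
	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		/* Passive (target) side: move the QP to RTS, then send the accept. */
		spdk_rdma_qp_accept(rqp, conn_param);
		break;
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
		/* Active side with an externally created QP: move it to RTS and
		 * complete the handshake (rdma_establish() under the hood here). */
		spdk_rdma_qp_complete_connect(rqp);
		break;
	case RDMA_CM_EVENT_DISCONNECTED:
		spdk_rdma_qp_disconnect(rqp);
		break;
	default:
		break;
	}
}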
+ + switch (tmp->opcode) { + case IBV_WR_SEND: + ibv_wr_send(mlx5_qp->qpex); + break; + case IBV_WR_SEND_WITH_INV: + ibv_wr_send_inv(mlx5_qp->qpex, tmp->invalidate_rkey); + break; + case IBV_WR_RDMA_READ: + ibv_wr_rdma_read(mlx5_qp->qpex, tmp->wr.rdma.rkey, tmp->wr.rdma.remote_addr); + break; + case IBV_WR_RDMA_WRITE: + ibv_wr_rdma_write(mlx5_qp->qpex, tmp->wr.rdma.rkey, tmp->wr.rdma.remote_addr); + break; + default: + SPDK_ERRLOG("Unexpected opcode %d\n", tmp->opcode); + assert(0); + } + + ibv_wr_set_sge_list(mlx5_qp->qpex, tmp->num_sge, tmp->sg_list); + + spdk_rdma_qp->send_wrs.last = tmp; + } + + return is_first; +} + +int +spdk_rdma_qp_flush_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr **bad_wr) +{ + struct spdk_rdma_mlx5_dv_qp *mlx5_qp; + int rc; + + assert(bad_wr); + assert(spdk_rdma_qp); + + mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common); + + if (spdk_unlikely(spdk_rdma_qp->send_wrs.first == NULL)) { + return 0; + } + + rc = ibv_wr_complete(mlx5_qp->qpex); + + if (spdk_unlikely(rc)) { + /* If ibv_wr_complete reports an error that means that no WRs are posted to NIC */ + *bad_wr = spdk_rdma_qp->send_wrs.first; + } + + spdk_rdma_qp->send_wrs.first = NULL; + + return rc; +} diff --git a/src/spdk/lib/rdma/rdma_verbs.c b/src/spdk/lib/rdma/rdma_verbs.c new file mode 100644 index 000000000..66be5bf60 --- /dev/null +++ b/src/spdk/lib/rdma/rdma_verbs.c @@ -0,0 +1,167 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
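Both providers expose the same queue/flush pair for the send path: spdk_rdma_qp_queue_send_wrs() chains work requests (translating them to the ibv_wr_* extended-QP calls in the mlx5_dv case) and returns true only for the first chain queued since the last flush, while spdk_rdma_qp_flush_send_wrs() rings the doorbell once, via ibv_wr_complete() here or ibv_post_send() in the plain verbs provider that follows. A short usage sketch with hypothetical names:

#include <infiniband/verbs.h>
#include "spdk_internal/rdma.h"

static void
my_submit_and_flush(struct spdk_rdma_qp *rqp, struct ibv_send_wr *wr_chain)
{
	struct ibv_send_wr *bad_wr = NULL;

	if (spdk_rdma_qp_queue_send_wrs(rqp, wr_chain)) {
		/* First WRs since the last flush - e.g. put the qpair on a
		 * "needs flush" list so the poller flushes it once per iteration. */
	}

	if (spdk_rdma_qp_flush_send_wrs(rqp, &bad_wr) != 0) {
		/* Nothing (mlx5_dv) or only part of the chain (verbs) was posted;
		 * bad_wr points at the first work request that was not posted. */
	}
}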
+ */ + +#include <rdma/rdma_cma.h> + +#include "spdk/stdinc.h" +#include "spdk/string.h" +#include "spdk/likely.h" + +#include "spdk_internal/rdma.h" +#include "spdk_internal/log.h" + +struct spdk_rdma_qp * +spdk_rdma_qp_create(struct rdma_cm_id *cm_id, struct spdk_rdma_qp_init_attr *qp_attr) +{ + struct spdk_rdma_qp *spdk_rdma_qp; + int rc; + struct ibv_qp_init_attr attr = { + .qp_context = qp_attr->qp_context, + .send_cq = qp_attr->send_cq, + .recv_cq = qp_attr->recv_cq, + .srq = qp_attr->srq, + .cap = qp_attr->cap, + .qp_type = IBV_QPT_RC + }; + + spdk_rdma_qp = calloc(1, sizeof(*spdk_rdma_qp)); + if (!spdk_rdma_qp) { + SPDK_ERRLOG("qp memory allocation failed\n"); + return NULL; + } + + rc = rdma_create_qp(cm_id, qp_attr->pd, &attr); + if (rc) { + SPDK_ERRLOG("Failed to create qp, errno %s (%d)\n", spdk_strerror(errno), errno); + free(spdk_rdma_qp); + return NULL; + } + + qp_attr->cap = attr.cap; + spdk_rdma_qp->qp = cm_id->qp; + spdk_rdma_qp->cm_id = cm_id; + + return spdk_rdma_qp; +} + +int +spdk_rdma_qp_accept(struct spdk_rdma_qp *spdk_rdma_qp, struct rdma_conn_param *conn_param) +{ + assert(spdk_rdma_qp != NULL); + assert(spdk_rdma_qp->cm_id != NULL); + + return rdma_accept(spdk_rdma_qp->cm_id, conn_param); +} + +int +spdk_rdma_qp_complete_connect(struct spdk_rdma_qp *spdk_rdma_qp) +{ + /* Nothing to be done for Verbs */ + return 0; +} + +void +spdk_rdma_qp_destroy(struct spdk_rdma_qp *spdk_rdma_qp) +{ + assert(spdk_rdma_qp != NULL); + + if (spdk_rdma_qp->send_wrs.first != NULL) { + SPDK_WARNLOG("Destroying qpair with queued Work Requests\n"); + } + + if (spdk_rdma_qp->qp) { + rdma_destroy_qp(spdk_rdma_qp->cm_id); + } + + free(spdk_rdma_qp); +} + +int +spdk_rdma_qp_disconnect(struct spdk_rdma_qp *spdk_rdma_qp) +{ + int rc = 0; + + assert(spdk_rdma_qp != NULL); + + if (spdk_rdma_qp->cm_id) { + rc = rdma_disconnect(spdk_rdma_qp->cm_id); + if (rc) { + SPDK_ERRLOG("rdma_disconnect failed, errno %s (%d)\n", spdk_strerror(errno), errno); + } + } + + return rc; +} + +bool +spdk_rdma_qp_queue_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr *first) +{ + struct ibv_send_wr *last; + + assert(spdk_rdma_qp); + assert(first); + + last = first; + while (last->next != NULL) { + last = last->next; + } + + if (spdk_rdma_qp->send_wrs.first == NULL) { + spdk_rdma_qp->send_wrs.first = first; + spdk_rdma_qp->send_wrs.last = last; + return true; + } else { + spdk_rdma_qp->send_wrs.last->next = first; + spdk_rdma_qp->send_wrs.last = last; + return false; + } +} + +int +spdk_rdma_qp_flush_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr **bad_wr) +{ + int rc; + + assert(spdk_rdma_qp); + assert(bad_wr); + + if (spdk_unlikely(!spdk_rdma_qp->send_wrs.first)) { + return 0; + } + + rc = ibv_post_send(spdk_rdma_qp->qp, spdk_rdma_qp->send_wrs.first, bad_wr); + + spdk_rdma_qp->send_wrs.first = NULL; + + return rc; +} diff --git a/src/spdk/lib/rdma/spdk_rdma.map b/src/spdk/lib/rdma/spdk_rdma.map new file mode 100644 index 000000000..9268a2191 --- /dev/null +++ b/src/spdk/lib/rdma/spdk_rdma.map @@ -0,0 +1,14 @@ +{ + global: + + # Public functions + spdk_rdma_qp_create; + spdk_rdma_qp_accept; + spdk_rdma_qp_complete_connect; + spdk_rdma_qp_destroy; + spdk_rdma_qp_disconnect; + spdk_rdma_qp_queue_send_wrs; + spdk_rdma_qp_flush_send_wrs; + + local: *; +}; diff --git a/src/spdk/lib/reduce/Makefile b/src/spdk/lib/reduce/Makefile new file mode 100644 index 000000000..fb417cd57 --- /dev/null +++ b/src/spdk/lib/reduce/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel 
Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = reduce.c +LIBNAME = reduce + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_reduce.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/reduce/reduce.c b/src/spdk/lib/reduce/reduce.c new file mode 100644 index 000000000..6188f6c6c --- /dev/null +++ b/src/spdk/lib/reduce/reduce.c @@ -0,0 +1,1625 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/reduce.h" +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/bit_array.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" + +#include "libpmem.h" + +/* Always round up the size of the PM region to the nearest cacheline. */ +#define REDUCE_PM_SIZE_ALIGNMENT 64 + +/* Offset into the backing device where the persistent memory file's path is stored. */ +#define REDUCE_BACKING_DEV_PATH_OFFSET 4096 + +#define REDUCE_EMPTY_MAP_ENTRY -1ULL + +#define REDUCE_NUM_VOL_REQUESTS 256 + +/* Structure written to offset 0 of both the pm file and the backing device. */ +struct spdk_reduce_vol_superblock { + uint8_t signature[8]; + struct spdk_reduce_vol_params params; + uint8_t reserved[4048]; +}; +SPDK_STATIC_ASSERT(sizeof(struct spdk_reduce_vol_superblock) == 4096, "size incorrect"); + +#define SPDK_REDUCE_SIGNATURE "SPDKREDU" +/* null terminator counts one */ +SPDK_STATIC_ASSERT(sizeof(SPDK_REDUCE_SIGNATURE) - 1 == + sizeof(((struct spdk_reduce_vol_superblock *)0)->signature), "size incorrect"); + +#define REDUCE_PATH_MAX 4096 + +#define REDUCE_ZERO_BUF_SIZE 0x100000 + +/** + * Describes a persistent memory file used to hold metadata associated with a + * compressed volume. + */ +struct spdk_reduce_pm_file { + char path[REDUCE_PATH_MAX]; + void *pm_buf; + int pm_is_pmem; + uint64_t size; +}; + +#define REDUCE_IO_READV 1 +#define REDUCE_IO_WRITEV 2 + +struct spdk_reduce_chunk_map { + uint32_t compressed_size; + uint32_t reserved; + uint64_t io_unit_index[0]; +}; + +struct spdk_reduce_vol_request { + /** + * Scratch buffer used for uncompressed chunk. This is used for: + * 1) source buffer for compression operations + * 2) destination buffer for decompression operations + * 3) data buffer when writing uncompressed chunk to disk + * 4) data buffer when reading uncompressed chunk from disk + */ + uint8_t *decomp_buf; + struct iovec *decomp_buf_iov; + + /** + * These are used to construct the iovecs that are sent to + * the decomp engine, they point to a mix of the scratch buffer + * and user buffer + */ + struct iovec decomp_iov[REDUCE_MAX_IOVECS + 2]; + int decomp_iovcnt; + + /** + * Scratch buffer used for compressed chunk. 
This is used for: + * 1) destination buffer for compression operations + * 2) source buffer for decompression operations + * 3) data buffer when writing compressed chunk to disk + * 4) data buffer when reading compressed chunk from disk + */ + uint8_t *comp_buf; + struct iovec *comp_buf_iov; + struct iovec *iov; + bool rmw; + struct spdk_reduce_vol *vol; + int type; + int reduce_errno; + int iovcnt; + int num_backing_ops; + uint32_t num_io_units; + bool chunk_is_compressed; + uint64_t offset; + uint64_t logical_map_index; + uint64_t length; + uint64_t chunk_map_index; + struct spdk_reduce_chunk_map *chunk; + spdk_reduce_vol_op_complete cb_fn; + void *cb_arg; + TAILQ_ENTRY(spdk_reduce_vol_request) tailq; + struct spdk_reduce_vol_cb_args backing_cb_args; +}; + +struct spdk_reduce_vol { + struct spdk_reduce_vol_params params; + uint32_t backing_io_units_per_chunk; + uint32_t backing_lba_per_io_unit; + uint32_t logical_blocks_per_chunk; + struct spdk_reduce_pm_file pm_file; + struct spdk_reduce_backing_dev *backing_dev; + struct spdk_reduce_vol_superblock *backing_super; + struct spdk_reduce_vol_superblock *pm_super; + uint64_t *pm_logical_map; + uint64_t *pm_chunk_maps; + + struct spdk_bit_array *allocated_chunk_maps; + struct spdk_bit_array *allocated_backing_io_units; + + struct spdk_reduce_vol_request *request_mem; + TAILQ_HEAD(, spdk_reduce_vol_request) free_requests; + TAILQ_HEAD(, spdk_reduce_vol_request) executing_requests; + TAILQ_HEAD(, spdk_reduce_vol_request) queued_requests; + + /* Single contiguous buffer used for all request buffers for this volume. */ + uint8_t *buf_mem; + struct iovec *buf_iov_mem; +}; + +static void _start_readv_request(struct spdk_reduce_vol_request *req); +static void _start_writev_request(struct spdk_reduce_vol_request *req); +static uint8_t *g_zero_buf; +static int g_vol_count = 0; + +/* + * Allocate extra metadata chunks and corresponding backing io units to account for + * outstanding IO in worst case scenario where logical map is completely allocated + * and no data can be compressed. We need extra chunks in this case to handle + * in-flight writes since reduce never writes data in place. + */ +#define REDUCE_NUM_EXTRA_CHUNKS 128 + +static void +_reduce_persist(struct spdk_reduce_vol *vol, const void *addr, size_t len) +{ + if (vol->pm_file.pm_is_pmem) { + pmem_persist(addr, len); + } else { + pmem_msync(addr, len); + } +} + +static uint64_t +_get_pm_logical_map_size(uint64_t vol_size, uint64_t chunk_size) +{ + uint64_t chunks_in_logical_map, logical_map_size; + + chunks_in_logical_map = vol_size / chunk_size; + logical_map_size = chunks_in_logical_map * sizeof(uint64_t); + + /* Round up to next cacheline. 
*/ + return spdk_divide_round_up(logical_map_size, REDUCE_PM_SIZE_ALIGNMENT) * + REDUCE_PM_SIZE_ALIGNMENT; +} + +static uint64_t +_get_total_chunks(uint64_t vol_size, uint64_t chunk_size) +{ + uint64_t num_chunks; + + num_chunks = vol_size / chunk_size; + num_chunks += REDUCE_NUM_EXTRA_CHUNKS; + + return num_chunks; +} + +static inline uint32_t +_reduce_vol_get_chunk_struct_size(uint64_t backing_io_units_per_chunk) +{ + return sizeof(struct spdk_reduce_chunk_map) + sizeof(uint64_t) * backing_io_units_per_chunk; +} + +static uint64_t +_get_pm_total_chunks_size(uint64_t vol_size, uint64_t chunk_size, uint64_t backing_io_unit_size) +{ + uint64_t io_units_per_chunk, num_chunks, total_chunks_size; + + num_chunks = _get_total_chunks(vol_size, chunk_size); + io_units_per_chunk = chunk_size / backing_io_unit_size; + + total_chunks_size = num_chunks * _reduce_vol_get_chunk_struct_size(io_units_per_chunk); + + return spdk_divide_round_up(total_chunks_size, REDUCE_PM_SIZE_ALIGNMENT) * + REDUCE_PM_SIZE_ALIGNMENT; +} + +static struct spdk_reduce_chunk_map * +_reduce_vol_get_chunk_map(struct spdk_reduce_vol *vol, uint64_t chunk_map_index) +{ + uintptr_t chunk_map_addr; + + assert(chunk_map_index < _get_total_chunks(vol->params.vol_size, vol->params.chunk_size)); + + chunk_map_addr = (uintptr_t)vol->pm_chunk_maps; + chunk_map_addr += chunk_map_index * + _reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk); + + return (struct spdk_reduce_chunk_map *)chunk_map_addr; +} + +static int +_validate_vol_params(struct spdk_reduce_vol_params *params) +{ + if (params->vol_size > 0) { + /** + * User does not pass in the vol size - it gets calculated by libreduce from + * values in this structure plus the size of the backing device. + */ + return -EINVAL; + } + + if (params->chunk_size == 0 || params->backing_io_unit_size == 0 || + params->logical_block_size == 0) { + return -EINVAL; + } + + /* Chunk size must be an even multiple of the backing io unit size. */ + if ((params->chunk_size % params->backing_io_unit_size) != 0) { + return -EINVAL; + } + + /* Chunk size must be an even multiple of the logical block size. */ + if ((params->chunk_size % params->logical_block_size) != 0) { + return -1; + } + + return 0; +} + +static uint64_t +_get_vol_size(uint64_t chunk_size, uint64_t backing_dev_size) +{ + uint64_t num_chunks; + + num_chunks = backing_dev_size / chunk_size; + if (num_chunks <= REDUCE_NUM_EXTRA_CHUNKS) { + return 0; + } + + num_chunks -= REDUCE_NUM_EXTRA_CHUNKS; + return num_chunks * chunk_size; +} + +static uint64_t +_get_pm_file_size(struct spdk_reduce_vol_params *params) +{ + uint64_t total_pm_size; + + total_pm_size = sizeof(struct spdk_reduce_vol_superblock); + total_pm_size += _get_pm_logical_map_size(params->vol_size, params->chunk_size); + total_pm_size += _get_pm_total_chunks_size(params->vol_size, params->chunk_size, + params->backing_io_unit_size); + return total_pm_size; +} + +const struct spdk_uuid * +spdk_reduce_vol_get_uuid(struct spdk_reduce_vol *vol) +{ + return &vol->params.uuid; +} + +static void +_initialize_vol_pm_pointers(struct spdk_reduce_vol *vol) +{ + uint64_t logical_map_size; + + /* Superblock is at the beginning of the pm file. */ + vol->pm_super = (struct spdk_reduce_vol_superblock *)vol->pm_file.pm_buf; + + /* Logical map immediately follows the super block. */ + vol->pm_logical_map = (uint64_t *)(vol->pm_super + 1); + + /* Chunks maps follow the logical map. 
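The sizing helpers above fix the persistent-memory file layout: a 4 KiB superblock, then the logical map (one 64-bit chunk-map index per chunk), then the chunk maps themselves (compressed_size plus one 64-bit backing-io-unit index per io unit), each region rounded up to REDUCE_PM_SIZE_ALIGNMENT, with REDUCE_NUM_EXTRA_CHUNKS (128) spare chunk maps so in-flight writes never run out. A worked example of the same arithmetic, as a standalone sketch (ALIGN_UP_64 stands in for the 64-byte rounding; not code from this patch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP_64(x) (((x) + 63) / 64 * 64)

int
main(void)
{
	uint64_t vol_size = 1ULL << 30;                 /* 1 GiB of logical space  */
	uint64_t chunk_size = 16 * 1024;                /* 16 KiB chunks           */
	uint64_t io_unit_size = 4 * 1024;               /* 4 KiB backing io units  */

	uint64_t chunks = vol_size / chunk_size;                      /* 65536        */
	uint64_t total_chunks = chunks + 128;                         /* extra chunks */
	uint64_t chunk_struct = 8 + (chunk_size / io_unit_size) * 8;  /* 40 bytes     */

	uint64_t logical_map = ALIGN_UP_64(chunks * 8);               /* 512 KiB      */
	uint64_t chunk_maps = ALIGN_UP_64(total_chunks * chunk_struct);
	uint64_t pm_file_size = 4096 + logical_map + chunk_maps;      /* superblock + maps */

	printf("pm file size: %" PRIu64 " bytes (~3 MiB)\n", pm_file_size); /* 3154944 */
	return 0;
}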
*/ + logical_map_size = _get_pm_logical_map_size(vol->params.vol_size, vol->params.chunk_size); + vol->pm_chunk_maps = (uint64_t *)((uint8_t *)vol->pm_logical_map + logical_map_size); +} + +/* We need 2 iovs during load - one for the superblock, another for the path */ +#define LOAD_IOV_COUNT 2 + +struct reduce_init_load_ctx { + struct spdk_reduce_vol *vol; + struct spdk_reduce_vol_cb_args backing_cb_args; + spdk_reduce_vol_op_with_handle_complete cb_fn; + void *cb_arg; + struct iovec iov[LOAD_IOV_COUNT]; + void *path; +}; + +static int +_allocate_vol_requests(struct spdk_reduce_vol *vol) +{ + struct spdk_reduce_vol_request *req; + int i; + + /* Allocate 2x since we need buffers for both read/write and compress/decompress + * intermediate buffers. + */ + vol->buf_mem = spdk_malloc(2 * REDUCE_NUM_VOL_REQUESTS * vol->params.chunk_size, + 64, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vol->buf_mem == NULL) { + return -ENOMEM; + } + + vol->request_mem = calloc(REDUCE_NUM_VOL_REQUESTS, sizeof(*req)); + if (vol->request_mem == NULL) { + spdk_free(vol->buf_mem); + vol->buf_mem = NULL; + return -ENOMEM; + } + + /* Allocate 2x since we need iovs for both read/write and compress/decompress intermediate + * buffers. + */ + vol->buf_iov_mem = calloc(REDUCE_NUM_VOL_REQUESTS, + 2 * sizeof(struct iovec) * vol->backing_io_units_per_chunk); + if (vol->buf_iov_mem == NULL) { + free(vol->request_mem); + spdk_free(vol->buf_mem); + vol->request_mem = NULL; + vol->buf_mem = NULL; + return -ENOMEM; + } + + for (i = 0; i < REDUCE_NUM_VOL_REQUESTS; i++) { + req = &vol->request_mem[i]; + TAILQ_INSERT_HEAD(&vol->free_requests, req, tailq); + req->decomp_buf_iov = &vol->buf_iov_mem[(2 * i) * vol->backing_io_units_per_chunk]; + req->decomp_buf = vol->buf_mem + (2 * i) * vol->params.chunk_size; + req->comp_buf_iov = &vol->buf_iov_mem[(2 * i + 1) * vol->backing_io_units_per_chunk]; + req->comp_buf = vol->buf_mem + (2 * i + 1) * vol->params.chunk_size; + } + + return 0; +} + +static void +_init_load_cleanup(struct spdk_reduce_vol *vol, struct reduce_init_load_ctx *ctx) +{ + if (ctx != NULL) { + spdk_free(ctx->path); + free(ctx); + } + + if (vol != NULL) { + if (vol->pm_file.pm_buf != NULL) { + pmem_unmap(vol->pm_file.pm_buf, vol->pm_file.size); + } + + spdk_free(vol->backing_super); + spdk_bit_array_free(&vol->allocated_chunk_maps); + spdk_bit_array_free(&vol->allocated_backing_io_units); + free(vol->request_mem); + free(vol->buf_iov_mem); + spdk_free(vol->buf_mem); + free(vol); + } +} + +static int +_alloc_zero_buff(void) +{ + int rc = 0; + + /* The zero buffer is shared between all volumnes and just used + * for reads so allocate one global instance here if not already + * allocated when another vol init'd or loaded. 
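_allocate_vol_requests() above carves one large DMA-able allocation into two chunk-sized scratch buffers per request, because a request may need the uncompressed image of a chunk (decomp_buf) and its compressed form (comp_buf) at the same time; the iovec arrays are interleaved the same way, one entry per backing io unit. The relationship, written out as a small consistency check against the structures defined in this file (a sketch, not part of the patch):

static void
check_request_layout(struct spdk_reduce_vol *vol, uint32_t i)
{
	struct spdk_reduce_vol_request *req = &vol->request_mem[i];

	assert(req->decomp_buf == vol->buf_mem + (2 * i) * vol->params.chunk_size);
	assert(req->comp_buf == vol->buf_mem + (2 * i + 1) * vol->params.chunk_size);
	assert(req->decomp_buf_iov ==
	       &vol->buf_iov_mem[(2 * i) * vol->backing_io_units_per_chunk]);
	assert(req->comp_buf_iov ==
	       &vol->buf_iov_mem[(2 * i + 1) * vol->backing_io_units_per_chunk]);
}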
+ */ + if (g_vol_count++ == 0) { + g_zero_buf = spdk_zmalloc(REDUCE_ZERO_BUF_SIZE, + 64, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); + if (g_zero_buf == NULL) { + rc = -ENOMEM; + } + } + return rc; +} + +static void +_init_write_super_cpl(void *cb_arg, int reduce_errno) +{ + struct reduce_init_load_ctx *init_ctx = cb_arg; + int rc; + + rc = _allocate_vol_requests(init_ctx->vol); + if (rc != 0) { + init_ctx->cb_fn(init_ctx->cb_arg, NULL, rc); + _init_load_cleanup(init_ctx->vol, init_ctx); + return; + } + + rc = _alloc_zero_buff(); + if (rc != 0) { + init_ctx->cb_fn(init_ctx->cb_arg, NULL, rc); + _init_load_cleanup(init_ctx->vol, init_ctx); + return; + } + + init_ctx->cb_fn(init_ctx->cb_arg, init_ctx->vol, reduce_errno); + /* Only clean up the ctx - the vol has been passed to the application + * for use now that initialization was successful. + */ + _init_load_cleanup(NULL, init_ctx); +} + +static void +_init_write_path_cpl(void *cb_arg, int reduce_errno) +{ + struct reduce_init_load_ctx *init_ctx = cb_arg; + struct spdk_reduce_vol *vol = init_ctx->vol; + + init_ctx->iov[0].iov_base = vol->backing_super; + init_ctx->iov[0].iov_len = sizeof(*vol->backing_super); + init_ctx->backing_cb_args.cb_fn = _init_write_super_cpl; + init_ctx->backing_cb_args.cb_arg = init_ctx; + vol->backing_dev->writev(vol->backing_dev, init_ctx->iov, 1, + 0, sizeof(*vol->backing_super) / vol->backing_dev->blocklen, + &init_ctx->backing_cb_args); +} + +static int +_allocate_bit_arrays(struct spdk_reduce_vol *vol) +{ + uint64_t total_chunks, total_backing_io_units; + uint32_t i, num_metadata_io_units; + + total_chunks = _get_total_chunks(vol->params.vol_size, vol->params.chunk_size); + vol->allocated_chunk_maps = spdk_bit_array_create(total_chunks); + total_backing_io_units = total_chunks * (vol->params.chunk_size / vol->params.backing_io_unit_size); + vol->allocated_backing_io_units = spdk_bit_array_create(total_backing_io_units); + + if (vol->allocated_chunk_maps == NULL || vol->allocated_backing_io_units == NULL) { + return -ENOMEM; + } + + /* Set backing io unit bits associated with metadata. */ + num_metadata_io_units = (sizeof(*vol->backing_super) + REDUCE_PATH_MAX) / + vol->backing_dev->blocklen; + for (i = 0; i < num_metadata_io_units; i++) { + spdk_bit_array_set(vol->allocated_backing_io_units, i); + } + + return 0; +} + +void +spdk_reduce_vol_init(struct spdk_reduce_vol_params *params, + struct spdk_reduce_backing_dev *backing_dev, + const char *pm_file_dir, + spdk_reduce_vol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_reduce_vol *vol; + struct reduce_init_load_ctx *init_ctx; + uint64_t backing_dev_size; + size_t mapped_len; + int dir_len, max_dir_len, rc; + + /* We need to append a path separator and the UUID to the supplied + * path. + */ + max_dir_len = REDUCE_PATH_MAX - SPDK_UUID_STRING_LEN - 1; + dir_len = strnlen(pm_file_dir, max_dir_len); + /* Strip trailing slash if the user provided one - we will add it back + * later when appending the filename. 
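spdk_reduce_vol_init(), which begins here, is the public entry point for creating a compressed volume. A usage sketch follows; it sets only the spdk_reduce_vol_params fields this file validates (vol_size must stay 0 so libreduce derives it from the backing device), assumes the backing_dev already has blockcnt/blocklen and the readv/writev/unmap/compress/decompress callbacks wired up, and the "/tmp/pmem" directory and my_* names are hypothetical.

static void
my_init_done(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno)
{
	(void)cb_arg;
	if (reduce_errno != 0) {
		SPDK_ERRLOG("reduce vol init failed: %d\n", reduce_errno);
		return;
	}
	/* vol is ready for I/O; release it later with spdk_reduce_vol_unload(). */
	(void)vol;
}

static void
my_create_compressed_vol(struct spdk_reduce_backing_dev *backing_dev)
{
	struct spdk_reduce_vol_params params;

	memset(&params, 0, sizeof(params));
	params.vol_size = 0;                 /* derived from the backing device size */
	params.chunk_size = 16 * 1024;
	params.backing_io_unit_size = 4 * 1024;
	params.logical_block_size = 512;
	/* params.uuid left zeroed: spdk_reduce_vol_init() generates one. */

	spdk_reduce_vol_init(&params, backing_dev, "/tmp/pmem", my_init_done, NULL);
}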
+ */ + if (pm_file_dir[dir_len - 1] == '/') { + dir_len--; + } + if (dir_len == max_dir_len) { + SPDK_ERRLOG("pm_file_dir (%s) too long\n", pm_file_dir); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = _validate_vol_params(params); + if (rc != 0) { + SPDK_ERRLOG("invalid vol params\n"); + cb_fn(cb_arg, NULL, rc); + return; + } + + backing_dev_size = backing_dev->blockcnt * backing_dev->blocklen; + params->vol_size = _get_vol_size(params->chunk_size, backing_dev_size); + if (params->vol_size == 0) { + SPDK_ERRLOG("backing device is too small\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + if (backing_dev->readv == NULL || backing_dev->writev == NULL || + backing_dev->unmap == NULL) { + SPDK_ERRLOG("backing_dev function pointer not specified\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + vol = calloc(1, sizeof(*vol)); + if (vol == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + TAILQ_INIT(&vol->free_requests); + TAILQ_INIT(&vol->executing_requests); + TAILQ_INIT(&vol->queued_requests); + + vol->backing_super = spdk_zmalloc(sizeof(*vol->backing_super), 0, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vol->backing_super == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + _init_load_cleanup(vol, NULL); + return; + } + + init_ctx = calloc(1, sizeof(*init_ctx)); + if (init_ctx == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + _init_load_cleanup(vol, NULL); + return; + } + + init_ctx->path = spdk_zmalloc(REDUCE_PATH_MAX, 0, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (init_ctx->path == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + _init_load_cleanup(vol, init_ctx); + return; + } + + if (spdk_mem_all_zero(¶ms->uuid, sizeof(params->uuid))) { + spdk_uuid_generate(¶ms->uuid); + } + + memcpy(vol->pm_file.path, pm_file_dir, dir_len); + vol->pm_file.path[dir_len] = '/'; + spdk_uuid_fmt_lower(&vol->pm_file.path[dir_len + 1], SPDK_UUID_STRING_LEN, + ¶ms->uuid); + vol->pm_file.size = _get_pm_file_size(params); + vol->pm_file.pm_buf = pmem_map_file(vol->pm_file.path, vol->pm_file.size, + PMEM_FILE_CREATE | PMEM_FILE_EXCL, 0600, + &mapped_len, &vol->pm_file.pm_is_pmem); + if (vol->pm_file.pm_buf == NULL) { + SPDK_ERRLOG("could not pmem_map_file(%s): %s\n", + vol->pm_file.path, strerror(errno)); + cb_fn(cb_arg, NULL, -errno); + _init_load_cleanup(vol, init_ctx); + return; + } + + if (vol->pm_file.size != mapped_len) { + SPDK_ERRLOG("could not map entire pmem file (size=%" PRIu64 " mapped=%" PRIu64 ")\n", + vol->pm_file.size, mapped_len); + cb_fn(cb_arg, NULL, -ENOMEM); + _init_load_cleanup(vol, init_ctx); + return; + } + + vol->backing_io_units_per_chunk = params->chunk_size / params->backing_io_unit_size; + vol->logical_blocks_per_chunk = params->chunk_size / params->logical_block_size; + vol->backing_lba_per_io_unit = params->backing_io_unit_size / backing_dev->blocklen; + memcpy(&vol->params, params, sizeof(*params)); + + vol->backing_dev = backing_dev; + + rc = _allocate_bit_arrays(vol); + if (rc != 0) { + cb_fn(cb_arg, NULL, rc); + _init_load_cleanup(vol, init_ctx); + return; + } + + memcpy(vol->backing_super->signature, SPDK_REDUCE_SIGNATURE, + sizeof(vol->backing_super->signature)); + memcpy(&vol->backing_super->params, params, sizeof(*params)); + + _initialize_vol_pm_pointers(vol); + + memcpy(vol->pm_super, vol->backing_super, sizeof(*vol->backing_super)); + /* Writing 0xFF's is equivalent of filling it all with SPDK_EMPTY_MAP_ENTRY. + * Note that this writes 0xFF to not just the logical map but the chunk maps as well. 
+ */ + memset(vol->pm_logical_map, 0xFF, vol->pm_file.size - sizeof(*vol->backing_super)); + _reduce_persist(vol, vol->pm_file.pm_buf, vol->pm_file.size); + + init_ctx->vol = vol; + init_ctx->cb_fn = cb_fn; + init_ctx->cb_arg = cb_arg; + + memcpy(init_ctx->path, vol->pm_file.path, REDUCE_PATH_MAX); + init_ctx->iov[0].iov_base = init_ctx->path; + init_ctx->iov[0].iov_len = REDUCE_PATH_MAX; + init_ctx->backing_cb_args.cb_fn = _init_write_path_cpl; + init_ctx->backing_cb_args.cb_arg = init_ctx; + /* Write path to offset 4K on backing device - just after where the super + * block will be written. We wait until this is committed before writing the + * super block to guarantee we don't get the super block written without the + * the path if the system crashed in the middle of a write operation. + */ + vol->backing_dev->writev(vol->backing_dev, init_ctx->iov, 1, + REDUCE_BACKING_DEV_PATH_OFFSET / vol->backing_dev->blocklen, + REDUCE_PATH_MAX / vol->backing_dev->blocklen, + &init_ctx->backing_cb_args); +} + +static void destroy_load_cb(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno); + +static void +_load_read_super_and_path_cpl(void *cb_arg, int reduce_errno) +{ + struct reduce_init_load_ctx *load_ctx = cb_arg; + struct spdk_reduce_vol *vol = load_ctx->vol; + uint64_t backing_dev_size; + uint64_t i, num_chunks, logical_map_index; + struct spdk_reduce_chunk_map *chunk; + size_t mapped_len; + uint32_t j; + int rc; + + rc = _alloc_zero_buff(); + if (rc) { + goto error; + } + + if (memcmp(vol->backing_super->signature, + SPDK_REDUCE_SIGNATURE, + sizeof(vol->backing_super->signature)) != 0) { + /* This backing device isn't a libreduce backing device. */ + rc = -EILSEQ; + goto error; + } + + /* If the cb_fn is destroy_load_cb, it means we are wanting to destroy this compress bdev. + * So don't bother getting the volume ready to use - invoke the callback immediately + * so destroy_load_cb can delete the metadata off of the block device and delete the + * persistent memory file if it exists. 
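The init path above deliberately writes its two pieces of backing-device metadata in a fixed order: the persistent-memory file path goes to byte offset 4096 (REDUCE_BACKING_DEV_PATH_OFFSET) first, and only after that write completes is the superblock written at offset 0, so a crash in between can never leave a superblock that points at an unknown pm file. The resulting layout, expressed as LBAs the way the writev calls compute them (a sketch reusing the constants from this file):

static void
show_metadata_layout(uint32_t blocklen)
{
	/* Byte 0..4095:    struct spdk_reduce_vol_superblock (written second)  */
	/* Byte 4096..8191: pm file path, REDUCE_PATH_MAX bytes (written first) */
	uint64_t super_lba = 0;
	uint64_t super_lba_count = sizeof(struct spdk_reduce_vol_superblock) / blocklen;
	uint64_t path_lba = REDUCE_BACKING_DEV_PATH_OFFSET / blocklen;
	uint64_t path_lba_count = REDUCE_PATH_MAX / blocklen;

	printf("superblock: LBA %" PRIu64 " (+%" PRIu64 "), path: LBA %" PRIu64 " (+%" PRIu64 ")\n",
	       super_lba, super_lba_count, path_lba, path_lba_count);
}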
+ */ + memcpy(vol->pm_file.path, load_ctx->path, sizeof(vol->pm_file.path)); + if (load_ctx->cb_fn == (*destroy_load_cb)) { + load_ctx->cb_fn(load_ctx->cb_arg, vol, 0); + _init_load_cleanup(NULL, load_ctx); + return; + } + + memcpy(&vol->params, &vol->backing_super->params, sizeof(vol->params)); + vol->backing_io_units_per_chunk = vol->params.chunk_size / vol->params.backing_io_unit_size; + vol->logical_blocks_per_chunk = vol->params.chunk_size / vol->params.logical_block_size; + vol->backing_lba_per_io_unit = vol->params.backing_io_unit_size / vol->backing_dev->blocklen; + + rc = _allocate_bit_arrays(vol); + if (rc != 0) { + goto error; + } + + backing_dev_size = vol->backing_dev->blockcnt * vol->backing_dev->blocklen; + if (_get_vol_size(vol->params.chunk_size, backing_dev_size) < vol->params.vol_size) { + SPDK_ERRLOG("backing device size %" PRIi64 " smaller than expected\n", + backing_dev_size); + rc = -EILSEQ; + goto error; + } + + vol->pm_file.size = _get_pm_file_size(&vol->params); + vol->pm_file.pm_buf = pmem_map_file(vol->pm_file.path, 0, 0, 0, &mapped_len, + &vol->pm_file.pm_is_pmem); + if (vol->pm_file.pm_buf == NULL) { + SPDK_ERRLOG("could not pmem_map_file(%s): %s\n", vol->pm_file.path, strerror(errno)); + rc = -errno; + goto error; + } + + if (vol->pm_file.size != mapped_len) { + SPDK_ERRLOG("could not map entire pmem file (size=%" PRIu64 " mapped=%" PRIu64 ")\n", + vol->pm_file.size, mapped_len); + rc = -ENOMEM; + goto error; + } + + rc = _allocate_vol_requests(vol); + if (rc != 0) { + goto error; + } + + _initialize_vol_pm_pointers(vol); + + num_chunks = vol->params.vol_size / vol->params.chunk_size; + for (i = 0; i < num_chunks; i++) { + logical_map_index = vol->pm_logical_map[i]; + if (logical_map_index == REDUCE_EMPTY_MAP_ENTRY) { + continue; + } + spdk_bit_array_set(vol->allocated_chunk_maps, logical_map_index); + chunk = _reduce_vol_get_chunk_map(vol, logical_map_index); + for (j = 0; j < vol->backing_io_units_per_chunk; j++) { + if (chunk->io_unit_index[j] != REDUCE_EMPTY_MAP_ENTRY) { + spdk_bit_array_set(vol->allocated_backing_io_units, chunk->io_unit_index[j]); + } + } + } + + load_ctx->cb_fn(load_ctx->cb_arg, vol, 0); + /* Only clean up the ctx - the vol has been passed to the application + * for use now that volume load was successful. 
+ */ + _init_load_cleanup(NULL, load_ctx); + return; + +error: + load_ctx->cb_fn(load_ctx->cb_arg, NULL, rc); + _init_load_cleanup(vol, load_ctx); +} + +void +spdk_reduce_vol_load(struct spdk_reduce_backing_dev *backing_dev, + spdk_reduce_vol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_reduce_vol *vol; + struct reduce_init_load_ctx *load_ctx; + + if (backing_dev->readv == NULL || backing_dev->writev == NULL || + backing_dev->unmap == NULL) { + SPDK_ERRLOG("backing_dev function pointer not specified\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + vol = calloc(1, sizeof(*vol)); + if (vol == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + TAILQ_INIT(&vol->free_requests); + TAILQ_INIT(&vol->executing_requests); + TAILQ_INIT(&vol->queued_requests); + + vol->backing_super = spdk_zmalloc(sizeof(*vol->backing_super), 64, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vol->backing_super == NULL) { + _init_load_cleanup(vol, NULL); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + vol->backing_dev = backing_dev; + + load_ctx = calloc(1, sizeof(*load_ctx)); + if (load_ctx == NULL) { + _init_load_cleanup(vol, NULL); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + load_ctx->path = spdk_zmalloc(REDUCE_PATH_MAX, 64, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (load_ctx->path == NULL) { + _init_load_cleanup(vol, load_ctx); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + load_ctx->vol = vol; + load_ctx->cb_fn = cb_fn; + load_ctx->cb_arg = cb_arg; + + load_ctx->iov[0].iov_base = vol->backing_super; + load_ctx->iov[0].iov_len = sizeof(*vol->backing_super); + load_ctx->iov[1].iov_base = load_ctx->path; + load_ctx->iov[1].iov_len = REDUCE_PATH_MAX; + load_ctx->backing_cb_args.cb_fn = _load_read_super_and_path_cpl; + load_ctx->backing_cb_args.cb_arg = load_ctx; + vol->backing_dev->readv(vol->backing_dev, load_ctx->iov, LOAD_IOV_COUNT, 0, + (sizeof(*vol->backing_super) + REDUCE_PATH_MAX) / + vol->backing_dev->blocklen, + &load_ctx->backing_cb_args); +} + +void +spdk_reduce_vol_unload(struct spdk_reduce_vol *vol, + spdk_reduce_vol_op_complete cb_fn, void *cb_arg) +{ + if (vol == NULL) { + /* This indicates a programming error. */ + assert(false); + cb_fn(cb_arg, -EINVAL); + return; + } + + if (--g_vol_count == 0) { + spdk_free(g_zero_buf); + } + assert(g_vol_count >= 0); + _init_load_cleanup(vol, NULL); + cb_fn(cb_arg, 0); +} + +struct reduce_destroy_ctx { + spdk_reduce_vol_op_complete cb_fn; + void *cb_arg; + struct spdk_reduce_vol *vol; + struct spdk_reduce_vol_superblock *super; + struct iovec iov; + struct spdk_reduce_vol_cb_args backing_cb_args; + int reduce_errno; + char pm_path[REDUCE_PATH_MAX]; +}; + +static void +destroy_unload_cpl(void *cb_arg, int reduce_errno) +{ + struct reduce_destroy_ctx *destroy_ctx = cb_arg; + + if (destroy_ctx->reduce_errno == 0) { + if (unlink(destroy_ctx->pm_path)) { + SPDK_ERRLOG("%s could not be unlinked: %s\n", + destroy_ctx->pm_path, strerror(errno)); + } + } + + /* Even if the unload somehow failed, we still pass the destroy_ctx + * reduce_errno since that indicates whether or not the volume was + * actually destroyed. 
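spdk_reduce_vol_load() and spdk_reduce_vol_unload() above are the counterparts to init for an existing volume, and spdk_reduce_vol_destroy() reuses the load path via destroy_load_cb to zero the superblock and unlink the pm file. A usage sketch with hypothetical callback names:

static void
my_unload_done(void *cb_arg, int reduce_errno)
{
	(void)cb_arg;
	(void)reduce_errno;	/* 0 on success */
}

static void
my_load_done(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno)
{
	(void)cb_arg;
	if (reduce_errno != 0) {
		/* e.g. -EILSEQ: the backing device carries no libreduce superblock */
		return;
	}

	/* ... issue I/O against vol ..., then release it: */
	spdk_reduce_vol_unload(vol, my_unload_done, NULL);
}

static void
my_reopen_compressed_vol(struct spdk_reduce_backing_dev *backing_dev)
{
	spdk_reduce_vol_load(backing_dev, my_load_done, NULL);
}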
+ */ + destroy_ctx->cb_fn(destroy_ctx->cb_arg, destroy_ctx->reduce_errno); + spdk_free(destroy_ctx->super); + free(destroy_ctx); +} + +static void +_destroy_zero_super_cpl(void *cb_arg, int reduce_errno) +{ + struct reduce_destroy_ctx *destroy_ctx = cb_arg; + struct spdk_reduce_vol *vol = destroy_ctx->vol; + + destroy_ctx->reduce_errno = reduce_errno; + spdk_reduce_vol_unload(vol, destroy_unload_cpl, destroy_ctx); +} + +static void +destroy_load_cb(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno) +{ + struct reduce_destroy_ctx *destroy_ctx = cb_arg; + + if (reduce_errno != 0) { + destroy_ctx->cb_fn(destroy_ctx->cb_arg, reduce_errno); + spdk_free(destroy_ctx->super); + free(destroy_ctx); + return; + } + + destroy_ctx->vol = vol; + memcpy(destroy_ctx->pm_path, vol->pm_file.path, sizeof(destroy_ctx->pm_path)); + destroy_ctx->iov.iov_base = destroy_ctx->super; + destroy_ctx->iov.iov_len = sizeof(*destroy_ctx->super); + destroy_ctx->backing_cb_args.cb_fn = _destroy_zero_super_cpl; + destroy_ctx->backing_cb_args.cb_arg = destroy_ctx; + vol->backing_dev->writev(vol->backing_dev, &destroy_ctx->iov, 1, 0, + sizeof(*destroy_ctx->super) / vol->backing_dev->blocklen, + &destroy_ctx->backing_cb_args); +} + +void +spdk_reduce_vol_destroy(struct spdk_reduce_backing_dev *backing_dev, + spdk_reduce_vol_op_complete cb_fn, void *cb_arg) +{ + struct reduce_destroy_ctx *destroy_ctx; + + destroy_ctx = calloc(1, sizeof(*destroy_ctx)); + if (destroy_ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + destroy_ctx->super = spdk_zmalloc(sizeof(*destroy_ctx->super), 64, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (destroy_ctx->super == NULL) { + free(destroy_ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + destroy_ctx->cb_fn = cb_fn; + destroy_ctx->cb_arg = cb_arg; + spdk_reduce_vol_load(backing_dev, destroy_load_cb, destroy_ctx); +} + +static bool +_request_spans_chunk_boundary(struct spdk_reduce_vol *vol, uint64_t offset, uint64_t length) +{ + uint64_t start_chunk, end_chunk; + + start_chunk = offset / vol->logical_blocks_per_chunk; + end_chunk = (offset + length - 1) / vol->logical_blocks_per_chunk; + + return (start_chunk != end_chunk); +} + +typedef void (*reduce_request_fn)(void *_req, int reduce_errno); + +static void +_reduce_vol_complete_req(struct spdk_reduce_vol_request *req, int reduce_errno) +{ + struct spdk_reduce_vol_request *next_req; + struct spdk_reduce_vol *vol = req->vol; + + req->cb_fn(req->cb_arg, reduce_errno); + TAILQ_REMOVE(&vol->executing_requests, req, tailq); + + TAILQ_FOREACH(next_req, &vol->queued_requests, tailq) { + if (next_req->logical_map_index == req->logical_map_index) { + TAILQ_REMOVE(&vol->queued_requests, next_req, tailq); + if (next_req->type == REDUCE_IO_READV) { + _start_readv_request(next_req); + } else { + assert(next_req->type == REDUCE_IO_WRITEV); + _start_writev_request(next_req); + } + break; + } + } + + TAILQ_INSERT_HEAD(&vol->free_requests, req, tailq); +} + +static void +_write_write_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + struct spdk_reduce_vol *vol = req->vol; + uint64_t old_chunk_map_index; + struct spdk_reduce_chunk_map *old_chunk; + uint32_t i; + + if (reduce_errno != 0) { + req->reduce_errno = reduce_errno; + } + + assert(req->num_backing_ops > 0); + if (--req->num_backing_ops > 0) { + return; + } + + if (req->reduce_errno != 0) { + _reduce_vol_complete_req(req, req->reduce_errno); + return; + } + + old_chunk_map_index = vol->pm_logical_map[req->logical_map_index]; + if 
(old_chunk_map_index != REDUCE_EMPTY_MAP_ENTRY) { + old_chunk = _reduce_vol_get_chunk_map(vol, old_chunk_map_index); + for (i = 0; i < vol->backing_io_units_per_chunk; i++) { + if (old_chunk->io_unit_index[i] == REDUCE_EMPTY_MAP_ENTRY) { + break; + } + assert(spdk_bit_array_get(vol->allocated_backing_io_units, old_chunk->io_unit_index[i]) == true); + spdk_bit_array_clear(vol->allocated_backing_io_units, old_chunk->io_unit_index[i]); + old_chunk->io_unit_index[i] = REDUCE_EMPTY_MAP_ENTRY; + } + spdk_bit_array_clear(vol->allocated_chunk_maps, old_chunk_map_index); + } + + /* + * We don't need to persist the clearing of the old chunk map here. The old chunk map + * becomes invalid after we update the logical map, since the old chunk map will no + * longer have a reference to it in the logical map. + */ + + /* Persist the new chunk map. This must be persisted before we update the logical map. */ + _reduce_persist(vol, req->chunk, + _reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk)); + + vol->pm_logical_map[req->logical_map_index] = req->chunk_map_index; + + _reduce_persist(vol, &vol->pm_logical_map[req->logical_map_index], sizeof(uint64_t)); + + _reduce_vol_complete_req(req, 0); +} + +static void +_issue_backing_ops(struct spdk_reduce_vol_request *req, struct spdk_reduce_vol *vol, + reduce_request_fn next_fn, bool is_write) +{ + struct iovec *iov; + uint8_t *buf; + uint32_t i; + + if (req->chunk_is_compressed) { + iov = req->comp_buf_iov; + buf = req->comp_buf; + } else { + iov = req->decomp_buf_iov; + buf = req->decomp_buf; + } + + req->num_backing_ops = req->num_io_units; + req->backing_cb_args.cb_fn = next_fn; + req->backing_cb_args.cb_arg = req; + for (i = 0; i < req->num_io_units; i++) { + iov[i].iov_base = buf + i * vol->params.backing_io_unit_size; + iov[i].iov_len = vol->params.backing_io_unit_size; + if (is_write) { + vol->backing_dev->writev(vol->backing_dev, &iov[i], 1, + req->chunk->io_unit_index[i] * vol->backing_lba_per_io_unit, + vol->backing_lba_per_io_unit, &req->backing_cb_args); + } else { + vol->backing_dev->readv(vol->backing_dev, &iov[i], 1, + req->chunk->io_unit_index[i] * vol->backing_lba_per_io_unit, + vol->backing_lba_per_io_unit, &req->backing_cb_args); + } + } +} + +static void +_reduce_vol_write_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn, + uint32_t compressed_size) +{ + struct spdk_reduce_vol *vol = req->vol; + uint32_t i; + uint64_t chunk_offset, remainder, total_len = 0; + uint8_t *buf; + int j; + + req->chunk_map_index = spdk_bit_array_find_first_clear(vol->allocated_chunk_maps, 0); + + /* TODO: fail if no chunk map found - but really this should not happen if we + * size the number of requests similarly to number of extra chunk maps + */ + assert(req->chunk_map_index != UINT32_MAX); + spdk_bit_array_set(vol->allocated_chunk_maps, req->chunk_map_index); + + req->chunk = _reduce_vol_get_chunk_map(vol, req->chunk_map_index); + req->num_io_units = spdk_divide_round_up(compressed_size, + vol->params.backing_io_unit_size); + req->chunk_is_compressed = (req->num_io_units != vol->backing_io_units_per_chunk); + req->chunk->compressed_size = + req->chunk_is_compressed ? compressed_size : vol->params.chunk_size; + + /* if the chunk is uncompressed we need to copy the data from the host buffers. 
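The tail of _write_write_done() above encodes the crash-consistency rule of the write path: the freshly written chunk map is persisted before the logical map entry is flipped to point at it, and only then is the old chunk released, and only in the in-memory bitmaps, because once the logical map has moved nothing references the old map any more. Condensed into one place as a sketch that simply mirrors the code above (it adds nothing new; the point is the ordering):

static void
commit_new_chunk(struct spdk_reduce_vol *vol, struct spdk_reduce_vol_request *req)
{
	/* 1. The compressed data has already landed in the new backing io units. */

	/* 2. Make the new chunk map durable first ...                            */
	_reduce_persist(vol, req->chunk,
			_reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk));

	/* 3. ... then flip and persist the logical map entry - the commit point. */
	vol->pm_logical_map[req->logical_map_index] = req->chunk_map_index;
	_reduce_persist(vol, &vol->pm_logical_map[req->logical_map_index], sizeof(uint64_t));

	/* 4. Release the old chunk map and io units in the DRAM bitmaps only; no
	 *    persist is needed since nothing references them after step 3.       */
}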
*/ + if (req->chunk_is_compressed == false) { + chunk_offset = req->offset % vol->logical_blocks_per_chunk; + buf = req->decomp_buf; + total_len = chunk_offset * vol->params.logical_block_size; + + /* zero any offset into chunk */ + if (req->rmw == false && chunk_offset) { + memset(buf, 0, total_len); + } + buf += total_len; + + /* copy the data */ + for (j = 0; j < req->iovcnt; j++) { + memcpy(buf, req->iov[j].iov_base, req->iov[j].iov_len); + buf += req->iov[j].iov_len; + total_len += req->iov[j].iov_len; + } + + /* zero any remainder */ + remainder = vol->params.chunk_size - total_len; + total_len += remainder; + if (req->rmw == false && remainder) { + memset(buf, 0, remainder); + } + assert(total_len == vol->params.chunk_size); + } + + for (i = 0; i < req->num_io_units; i++) { + req->chunk->io_unit_index[i] = spdk_bit_array_find_first_clear(vol->allocated_backing_io_units, 0); + /* TODO: fail if no backing block found - but really this should also not + * happen (see comment above). + */ + assert(req->chunk->io_unit_index[i] != UINT32_MAX); + spdk_bit_array_set(vol->allocated_backing_io_units, req->chunk->io_unit_index[i]); + } + + _issue_backing_ops(req, vol, next_fn, true /* write */); +} + +static void +_write_compress_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + + /* Negative reduce_errno indicates failure for compression operations. + * Just write the uncompressed data instead. Force this to happen + * by just passing the full chunk size to _reduce_vol_write_chunk. + * When it sees the data couldn't be compressed, it will just write + * the uncompressed buffer to disk. + */ + if (reduce_errno < 0) { + reduce_errno = req->vol->params.chunk_size; + } + + /* Positive reduce_errno indicates number of bytes in compressed buffer. 
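The compress/decompress hooks called above follow a simple completion contract: the backing device (typically the compress bdev's compression engine) invokes the supplied spdk_reduce_vol_cb_args callback with the number of output bytes on success, or a negative errno on failure, which is why _write_compress_done() treats a negative value as "store the chunk uncompressed". A mock compress callback illustrating the contract (a sketch; the signature is inferred from the call sites in this file, and a real engine would actually compress rather than copy):

static void
my_backing_compress(struct spdk_reduce_backing_dev *dev,
		    struct iovec *src_iov, int src_iovcnt,
		    struct iovec *dst_iov, int dst_iovcnt,
		    struct spdk_reduce_vol_cb_args *args)
{
	uint8_t *dst = dst_iov[0].iov_base;
	uint64_t total = 0;
	int i;

	(void)dev;
	(void)dst_iovcnt;

	/* Stand-in "compression": copy the source iovecs into the destination
	 * buffer. Reporting total == chunk_size makes libreduce store the chunk
	 * uncompressed, exactly as it does after a real compression failure.    */
	for (i = 0; i < src_iovcnt; i++) {
		memcpy(dst + total, src_iov[i].iov_base, src_iov[i].iov_len);
		total += src_iov[i].iov_len;
	}

	args->cb_fn(args->cb_arg, (int)total);
}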
*/ + _reduce_vol_write_chunk(req, _write_write_done, (uint32_t)reduce_errno); +} + +static void +_reduce_vol_compress_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn) +{ + struct spdk_reduce_vol *vol = req->vol; + + req->backing_cb_args.cb_fn = next_fn; + req->backing_cb_args.cb_arg = req; + req->comp_buf_iov[0].iov_base = req->comp_buf; + req->comp_buf_iov[0].iov_len = vol->params.chunk_size; + vol->backing_dev->compress(vol->backing_dev, + &req->decomp_iov[0], req->decomp_iovcnt, req->comp_buf_iov, 1, + &req->backing_cb_args); +} + +static void +_reduce_vol_decompress_chunk_scratch(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn) +{ + struct spdk_reduce_vol *vol = req->vol; + + req->backing_cb_args.cb_fn = next_fn; + req->backing_cb_args.cb_arg = req; + req->comp_buf_iov[0].iov_base = req->comp_buf; + req->comp_buf_iov[0].iov_len = req->chunk->compressed_size; + req->decomp_buf_iov[0].iov_base = req->decomp_buf; + req->decomp_buf_iov[0].iov_len = vol->params.chunk_size; + vol->backing_dev->decompress(vol->backing_dev, + req->comp_buf_iov, 1, req->decomp_buf_iov, 1, + &req->backing_cb_args); +} + +static void +_reduce_vol_decompress_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn) +{ + struct spdk_reduce_vol *vol = req->vol; + uint64_t chunk_offset, remainder = 0; + uint64_t ttl_len = 0; + int i; + + req->decomp_iovcnt = 0; + chunk_offset = req->offset % vol->logical_blocks_per_chunk; + + if (chunk_offset) { + /* first iov point to our scratch buffer for any offset into the chunk */ + req->decomp_iov[0].iov_base = req->decomp_buf; + req->decomp_iov[0].iov_len = chunk_offset * vol->params.logical_block_size; + ttl_len += req->decomp_iov[0].iov_len; + req->decomp_iovcnt = 1; + } + + /* now the user data iov, direct to the user buffer */ + for (i = 0; i < req->iovcnt; i++) { + req->decomp_iov[i + req->decomp_iovcnt].iov_base = req->iov[i].iov_base; + req->decomp_iov[i + req->decomp_iovcnt].iov_len = req->iov[i].iov_len; + ttl_len += req->decomp_iov[i + req->decomp_iovcnt].iov_len; + } + req->decomp_iovcnt += req->iovcnt; + + /* send the rest of the chunk to our scratch buffer */ + remainder = vol->params.chunk_size - ttl_len; + if (remainder) { + req->decomp_iov[req->decomp_iovcnt].iov_base = req->decomp_buf + ttl_len; + req->decomp_iov[req->decomp_iovcnt].iov_len = remainder; + ttl_len += req->decomp_iov[req->decomp_iovcnt].iov_len; + req->decomp_iovcnt++; + } + assert(ttl_len == vol->params.chunk_size); + + req->backing_cb_args.cb_fn = next_fn; + req->backing_cb_args.cb_arg = req; + req->comp_buf_iov[0].iov_base = req->comp_buf; + req->comp_buf_iov[0].iov_len = req->chunk->compressed_size; + vol->backing_dev->decompress(vol->backing_dev, + req->comp_buf_iov, 1, &req->decomp_iov[0], req->decomp_iovcnt, + &req->backing_cb_args); +} + +static void +_write_decompress_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + struct spdk_reduce_vol *vol = req->vol; + uint64_t chunk_offset, remainder, ttl_len = 0; + int i; + + /* Negative reduce_errno indicates failure for compression operations. */ + if (reduce_errno < 0) { + _reduce_vol_complete_req(req, reduce_errno); + return; + } + + /* Positive reduce_errno indicates number of bytes in decompressed + * buffer. This should equal the chunk size - otherwise that's another + * type of failure. 
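+ * Anything other than exactly the chunk size is reported back to the caller as -EIO.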
+ */ + if ((uint32_t)reduce_errno != vol->params.chunk_size) { + _reduce_vol_complete_req(req, -EIO); + return; + } + + req->decomp_iovcnt = 0; + chunk_offset = req->offset % vol->logical_blocks_per_chunk; + + if (chunk_offset) { + req->decomp_iov[0].iov_base = req->decomp_buf; + req->decomp_iov[0].iov_len = chunk_offset * vol->params.logical_block_size; + ttl_len += req->decomp_iov[0].iov_len; + req->decomp_iovcnt = 1; + } + + for (i = 0; i < req->iovcnt; i++) { + req->decomp_iov[i + req->decomp_iovcnt].iov_base = req->iov[i].iov_base; + req->decomp_iov[i + req->decomp_iovcnt].iov_len = req->iov[i].iov_len; + ttl_len += req->decomp_iov[i + req->decomp_iovcnt].iov_len; + } + req->decomp_iovcnt += req->iovcnt; + + remainder = vol->params.chunk_size - ttl_len; + if (remainder) { + req->decomp_iov[req->decomp_iovcnt].iov_base = req->decomp_buf + ttl_len; + req->decomp_iov[req->decomp_iovcnt].iov_len = remainder; + ttl_len += req->decomp_iov[req->decomp_iovcnt].iov_len; + req->decomp_iovcnt++; + } + assert(ttl_len == vol->params.chunk_size); + + _reduce_vol_compress_chunk(req, _write_compress_done); +} + +static void +_write_read_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + + if (reduce_errno != 0) { + req->reduce_errno = reduce_errno; + } + + assert(req->num_backing_ops > 0); + if (--req->num_backing_ops > 0) { + return; + } + + if (req->reduce_errno != 0) { + _reduce_vol_complete_req(req, req->reduce_errno); + return; + } + + if (req->chunk_is_compressed) { + _reduce_vol_decompress_chunk_scratch(req, _write_decompress_done); + } else { + _write_decompress_done(req, req->chunk->compressed_size); + } +} + +static void +_read_decompress_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + struct spdk_reduce_vol *vol = req->vol; + + /* Negative reduce_errno indicates failure for compression operations. */ + if (reduce_errno < 0) { + _reduce_vol_complete_req(req, reduce_errno); + return; + } + + /* Positive reduce_errno indicates number of bytes in decompressed + * buffer. This should equal the chunk size - otherwise that's another + * type of failure. + */ + if ((uint32_t)reduce_errno != vol->params.chunk_size) { + _reduce_vol_complete_req(req, -EIO); + return; + } + + _reduce_vol_complete_req(req, 0); +} + +static void +_read_read_done(void *_req, int reduce_errno) +{ + struct spdk_reduce_vol_request *req = _req; + uint64_t chunk_offset; + uint8_t *buf; + int i; + + if (reduce_errno != 0) { + req->reduce_errno = reduce_errno; + } + + assert(req->num_backing_ops > 0); + if (--req->num_backing_ops > 0) { + return; + } + + if (req->reduce_errno != 0) { + _reduce_vol_complete_req(req, req->reduce_errno); + return; + } + + if (req->chunk_is_compressed) { + _reduce_vol_decompress_chunk(req, _read_decompress_done); + } else { + + /* If the chunk was compressed, the data would have been sent to the + * host buffers by the decompression operation, if not we need to memcpy here. 
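+ * The copy starts at this request's offset within the chunk in the decomp scratch buffer and walks the caller's iovecs in order.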
+ */ + chunk_offset = req->offset % req->vol->logical_blocks_per_chunk; + buf = req->decomp_buf + chunk_offset * req->vol->params.logical_block_size; + for (i = 0; i < req->iovcnt; i++) { + memcpy(req->iov[i].iov_base, buf, req->iov[i].iov_len); + buf += req->iov[i].iov_len; + } + + _read_decompress_done(req, req->chunk->compressed_size); + } +} + +static void +_reduce_vol_read_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn) +{ + struct spdk_reduce_vol *vol = req->vol; + + req->chunk_map_index = vol->pm_logical_map[req->logical_map_index]; + assert(req->chunk_map_index != UINT32_MAX); + + req->chunk = _reduce_vol_get_chunk_map(vol, req->chunk_map_index); + req->num_io_units = spdk_divide_round_up(req->chunk->compressed_size, + vol->params.backing_io_unit_size); + req->chunk_is_compressed = (req->num_io_units != vol->backing_io_units_per_chunk); + + _issue_backing_ops(req, vol, next_fn, false /* read */); +} + +static bool +_iov_array_is_valid(struct spdk_reduce_vol *vol, struct iovec *iov, int iovcnt, + uint64_t length) +{ + uint64_t size = 0; + int i; + + if (iovcnt > REDUCE_MAX_IOVECS) { + return false; + } + + for (i = 0; i < iovcnt; i++) { + size += iov[i].iov_len; + } + + return size == (length * vol->params.logical_block_size); +} + +static bool +_check_overlap(struct spdk_reduce_vol *vol, uint64_t logical_map_index) +{ + struct spdk_reduce_vol_request *req; + + TAILQ_FOREACH(req, &vol->executing_requests, tailq) { + if (logical_map_index == req->logical_map_index) { + return true; + } + } + + return false; +} + +static void +_start_readv_request(struct spdk_reduce_vol_request *req) +{ + TAILQ_INSERT_TAIL(&req->vol->executing_requests, req, tailq); + _reduce_vol_read_chunk(req, _read_read_done); +} + +void +spdk_reduce_vol_readv(struct spdk_reduce_vol *vol, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_reduce_vol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_reduce_vol_request *req; + uint64_t logical_map_index; + bool overlapped; + int i; + + if (length == 0) { + cb_fn(cb_arg, 0); + return; + } + + if (_request_spans_chunk_boundary(vol, offset, length)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + if (!_iov_array_is_valid(vol, iov, iovcnt, length)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + logical_map_index = offset / vol->logical_blocks_per_chunk; + overlapped = _check_overlap(vol, logical_map_index); + + if (!overlapped && vol->pm_logical_map[logical_map_index] == REDUCE_EMPTY_MAP_ENTRY) { + /* + * This chunk hasn't been allocated. So treat the data as all + * zeroes for this chunk - do the memset and immediately complete + * the operation. 
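+ * No request structure is taken from the free list on this path, so it cannot fail with -ENOMEM.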
+ */ + for (i = 0; i < iovcnt; i++) { + memset(iov[i].iov_base, 0, iov[i].iov_len); + } + cb_fn(cb_arg, 0); + return; + } + + req = TAILQ_FIRST(&vol->free_requests); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + TAILQ_REMOVE(&vol->free_requests, req, tailq); + req->type = REDUCE_IO_READV; + req->vol = vol; + req->iov = iov; + req->iovcnt = iovcnt; + req->offset = offset; + req->logical_map_index = logical_map_index; + req->length = length; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + if (!overlapped) { + _start_readv_request(req); + } else { + TAILQ_INSERT_TAIL(&vol->queued_requests, req, tailq); + } +} + +static void +_start_writev_request(struct spdk_reduce_vol_request *req) +{ + struct spdk_reduce_vol *vol = req->vol; + uint64_t chunk_offset, ttl_len = 0; + uint64_t remainder = 0; + uint32_t lbsize; + int i; + + TAILQ_INSERT_TAIL(&req->vol->executing_requests, req, tailq); + if (vol->pm_logical_map[req->logical_map_index] != REDUCE_EMPTY_MAP_ENTRY) { + if ((req->length * vol->params.logical_block_size) < vol->params.chunk_size) { + /* Read old chunk, then overwrite with data from this write + * operation. + */ + req->rmw = true; + _reduce_vol_read_chunk(req, _write_read_done); + return; + } + } + + lbsize = vol->params.logical_block_size; + req->decomp_iovcnt = 0; + req->rmw = false; + + /* Note: point to our zero buf for offset into the chunk. */ + chunk_offset = req->offset % vol->logical_blocks_per_chunk; + if (chunk_offset != 0) { + ttl_len += chunk_offset * lbsize; + req->decomp_iov[0].iov_base = g_zero_buf; + req->decomp_iov[0].iov_len = ttl_len; + req->decomp_iovcnt = 1; + } + + /* now the user data iov, direct from the user buffer */ + for (i = 0; i < req->iovcnt; i++) { + req->decomp_iov[i + req->decomp_iovcnt].iov_base = req->iov[i].iov_base; + req->decomp_iov[i + req->decomp_iovcnt].iov_len = req->iov[i].iov_len; + ttl_len += req->decomp_iov[i + req->decomp_iovcnt].iov_len; + } + req->decomp_iovcnt += req->iovcnt; + + remainder = vol->params.chunk_size - ttl_len; + if (remainder) { + req->decomp_iov[req->decomp_iovcnt].iov_base = g_zero_buf; + req->decomp_iov[req->decomp_iovcnt].iov_len = remainder; + ttl_len += req->decomp_iov[req->decomp_iovcnt].iov_len; + req->decomp_iovcnt++; + } + assert(ttl_len == req->vol->params.chunk_size); + + _reduce_vol_compress_chunk(req, _write_compress_done); +} + +void +spdk_reduce_vol_writev(struct spdk_reduce_vol *vol, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_reduce_vol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_reduce_vol_request *req; + uint64_t logical_map_index; + bool overlapped; + + if (length == 0) { + cb_fn(cb_arg, 0); + return; + } + + if (_request_spans_chunk_boundary(vol, offset, length)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + if (!_iov_array_is_valid(vol, iov, iovcnt, length)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + logical_map_index = offset / vol->logical_blocks_per_chunk; + overlapped = _check_overlap(vol, logical_map_index); + + req = TAILQ_FIRST(&vol->free_requests); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + TAILQ_REMOVE(&vol->free_requests, req, tailq); + req->type = REDUCE_IO_WRITEV; + req->vol = vol; + req->iov = iov; + req->iovcnt = iovcnt; + req->offset = offset; + req->logical_map_index = logical_map_index; + req->length = length; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + if (!overlapped) { + _start_writev_request(req); + } else { + TAILQ_INSERT_TAIL(&vol->queued_requests, req, tailq); + } +} + +const struct 
spdk_reduce_vol_params * +spdk_reduce_vol_get_params(struct spdk_reduce_vol *vol) +{ + return &vol->params; +} + +void spdk_reduce_vol_print_info(struct spdk_reduce_vol *vol) +{ + uint64_t logical_map_size, num_chunks, ttl_chunk_sz; + uint32_t struct_size; + uint64_t chunk_map_size; + + SPDK_NOTICELOG("vol info:\n"); + SPDK_NOTICELOG("\tvol->params.backing_io_unit_size = 0x%x\n", vol->params.backing_io_unit_size); + SPDK_NOTICELOG("\tvol->params.logical_block_size = 0x%x\n", vol->params.logical_block_size); + SPDK_NOTICELOG("\tvol->params.chunk_size = 0x%x\n", vol->params.chunk_size); + SPDK_NOTICELOG("\tvol->params.vol_size = 0x%" PRIx64 "\n", vol->params.vol_size); + num_chunks = _get_total_chunks(vol->params.vol_size, vol->params.chunk_size); + SPDK_NOTICELOG("\ttotal chunks (including extra) = 0x%" PRIx64 "\n", num_chunks); + SPDK_NOTICELOG("\ttotal chunks (excluding extra) = 0x%" PRIx64 "\n", + vol->params.vol_size / vol->params.chunk_size); + ttl_chunk_sz = _get_pm_total_chunks_size(vol->params.vol_size, vol->params.chunk_size, + vol->params.backing_io_unit_size); + SPDK_NOTICELOG("\ttotal_chunks_size = 0x%" PRIx64 "\n", ttl_chunk_sz); + struct_size = _reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk); + SPDK_NOTICELOG("\tchunk_struct_size = 0x%x\n", struct_size); + + SPDK_NOTICELOG("pmem info:\n"); + SPDK_NOTICELOG("\tvol->pm_file.size = 0x%" PRIx64 "\n", vol->pm_file.size); + SPDK_NOTICELOG("\tvol->pm_file.pm_buf = %p\n", (void *)vol->pm_file.pm_buf); + SPDK_NOTICELOG("\tvol->pm_super = %p\n", (void *)vol->pm_super); + SPDK_NOTICELOG("\tvol->pm_logical_map = %p\n", (void *)vol->pm_logical_map); + logical_map_size = _get_pm_logical_map_size(vol->params.vol_size, + vol->params.chunk_size); + SPDK_NOTICELOG("\tlogical_map_size = 0x%" PRIx64 "\n", logical_map_size); + SPDK_NOTICELOG("\tvol->pm_chunk_maps = %p\n", (void *)vol->pm_chunk_maps); + chunk_map_size = _get_pm_total_chunks_size(vol->params.vol_size, vol->params.chunk_size, + vol->params.backing_io_unit_size); + SPDK_NOTICELOG("\tchunk_map_size = 0x%" PRIx64 "\n", chunk_map_size); +} + +SPDK_LOG_REGISTER_COMPONENT("reduce", SPDK_LOG_REDUCE) diff --git a/src/spdk/lib/reduce/spdk_reduce.map b/src/spdk/lib/reduce/spdk_reduce.map new file mode 100644 index 000000000..c53792710 --- /dev/null +++ b/src/spdk/lib/reduce/spdk_reduce.map @@ -0,0 +1,16 @@ +{ + global: + + # public functions + spdk_reduce_vol_get_uuid; + spdk_reduce_vol_init; + spdk_reduce_vol_load; + spdk_reduce_vol_unload; + spdk_reduce_vol_destroy; + spdk_reduce_vol_readv; + spdk_reduce_vol_writev; + spdk_reduce_vol_get_params; + spdk_reduce_vol_print_info; + + local: *; +}; diff --git a/src/spdk/lib/rocksdb/env_spdk.cc b/src/spdk/lib/rocksdb/env_spdk.cc new file mode 100644 index 000000000..8695acca6 --- /dev/null +++ b/src/spdk/lib/rocksdb/env_spdk.cc @@ -0,0 +1,798 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "rocksdb/env.h" +#include <set> +#include <iostream> +#include <stdexcept> + +extern "C" { +#include "spdk/env.h" +#include "spdk/event.h" +#include "spdk/blob.h" +#include "spdk/blobfs.h" +#include "spdk/blob_bdev.h" +#include "spdk/log.h" +#include "spdk/thread.h" +#include "spdk/bdev.h" + +#include "spdk_internal/thread.h" +} + +namespace rocksdb +{ + +struct spdk_filesystem *g_fs = NULL; +struct spdk_bs_dev *g_bs_dev; +uint32_t g_lcore = 0; +std::string g_bdev_name; +volatile bool g_spdk_ready = false; +volatile bool g_spdk_start_failure = false; + +void SpdkInitializeThread(void); + +class SpdkThreadCtx +{ +public: + struct spdk_fs_thread_ctx *channel; + + SpdkThreadCtx(void) : channel(NULL) + { + SpdkInitializeThread(); + } + + ~SpdkThreadCtx(void) + { + if (channel) { + spdk_fs_free_thread_ctx(channel); + channel = NULL; + } + } + +private: + SpdkThreadCtx(const SpdkThreadCtx &); + SpdkThreadCtx &operator=(const SpdkThreadCtx &); +}; + +thread_local SpdkThreadCtx g_sync_args; + +static void +set_channel() +{ + struct spdk_thread *thread; + + if (g_fs != NULL && g_sync_args.channel == NULL) { + thread = spdk_thread_create("spdK_rocksdb", NULL); + spdk_set_thread(thread); + g_sync_args.channel = spdk_fs_alloc_thread_ctx(g_fs); + } +} + +static void +__call_fn(void *arg1, void *arg2) +{ + fs_request_fn fn; + + fn = (fs_request_fn)arg1; + fn(arg2); +} + +static void +__send_request(fs_request_fn fn, void *arg) +{ + struct spdk_event *event; + + event = spdk_event_allocate(g_lcore, __call_fn, (void *)fn, arg); + spdk_event_call(event); +} + +static std::string +sanitize_path(const std::string &input, const std::string &mount_directory) +{ + int index = 0; + std::string name; + std::string input_tmp; + + input_tmp = input.substr(mount_directory.length(), input.length()); + for (const char &c : input_tmp) { + if (index == 0) { + if (c != '/') { + name = name.insert(index, 1, '/'); + index++; + } + name = name.insert(index, 1, c); + index++; + } else { + if (name[index - 1] == '/' && c == '/') { + continue; + } else { + name = name.insert(index, 1, c); + index++; + } + } + } + + if (name[name.size() - 1] == '/') { + name = name.erase(name.size() - 1, 1); + } + return name; +} + +class SpdkSequentialFile : public SequentialFile +{ + struct spdk_file *mFile; + uint64_t mOffset; +public: + SpdkSequentialFile(struct spdk_file *file) : mFile(file), mOffset(0) {} + virtual ~SpdkSequentialFile(); + + virtual Status Read(size_t n, Slice *result, char *scratch) override; + virtual Status 
Skip(uint64_t n) override; + virtual Status InvalidateCache(size_t offset, size_t length) override; +}; + +SpdkSequentialFile::~SpdkSequentialFile(void) +{ + set_channel(); + spdk_file_close(mFile, g_sync_args.channel); +} + +Status +SpdkSequentialFile::Read(size_t n, Slice *result, char *scratch) +{ + int64_t ret; + + set_channel(); + ret = spdk_file_read(mFile, g_sync_args.channel, scratch, mOffset, n); + if (ret >= 0) { + mOffset += ret; + *result = Slice(scratch, ret); + return Status::OK(); + } else { + errno = -ret; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } +} + +Status +SpdkSequentialFile::Skip(uint64_t n) +{ + mOffset += n; + return Status::OK(); +} + +Status +SpdkSequentialFile::InvalidateCache(__attribute__((unused)) size_t offset, + __attribute__((unused)) size_t length) +{ + return Status::OK(); +} + +class SpdkRandomAccessFile : public RandomAccessFile +{ + struct spdk_file *mFile; +public: + SpdkRandomAccessFile(struct spdk_file *file) : mFile(file) {} + virtual ~SpdkRandomAccessFile(); + + virtual Status Read(uint64_t offset, size_t n, Slice *result, char *scratch) const override; + virtual Status InvalidateCache(size_t offset, size_t length) override; +}; + +SpdkRandomAccessFile::~SpdkRandomAccessFile(void) +{ + set_channel(); + spdk_file_close(mFile, g_sync_args.channel); +} + +Status +SpdkRandomAccessFile::Read(uint64_t offset, size_t n, Slice *result, char *scratch) const +{ + int64_t rc; + + set_channel(); + rc = spdk_file_read(mFile, g_sync_args.channel, scratch, offset, n); + if (rc >= 0) { + *result = Slice(scratch, n); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } +} + +Status +SpdkRandomAccessFile::InvalidateCache(__attribute__((unused)) size_t offset, + __attribute__((unused)) size_t length) +{ + return Status::OK(); +} + +class SpdkWritableFile : public WritableFile +{ + struct spdk_file *mFile; + uint64_t mSize; + +public: + SpdkWritableFile(struct spdk_file *file) : mFile(file), mSize(0) {} + ~SpdkWritableFile() + { + if (mFile != NULL) { + Close(); + } + } + + virtual void SetIOPriority(Env::IOPriority pri) + { + if (pri == Env::IO_HIGH) { + spdk_file_set_priority(mFile, SPDK_FILE_PRIORITY_HIGH); + } + } + + virtual Status Truncate(uint64_t size) override + { + int rc; + + set_channel(); + rc = spdk_file_truncate(mFile, g_sync_args.channel, size); + if (!rc) { + mSize = size; + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual Status Close() override + { + set_channel(); + spdk_file_close(mFile, g_sync_args.channel); + mFile = NULL; + return Status::OK(); + } + virtual Status Append(const Slice &data) override; + virtual Status Flush() override + { + return Status::OK(); + } + virtual Status Sync() override + { + int rc; + + set_channel(); + rc = spdk_file_sync(mFile, g_sync_args.channel); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual Status Fsync() override + { + int rc; + + set_channel(); + rc = spdk_file_sync(mFile, g_sync_args.channel); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual bool IsSyncThreadSafe() const override + { + return true; + } + virtual uint64_t GetFileSize() override + { + return mSize; + } + virtual Status 
InvalidateCache(__attribute__((unused)) size_t offset, + __attribute__((unused)) size_t length) override + { + return Status::OK(); + } + virtual Status Allocate(uint64_t offset, uint64_t len) override + { + int rc; + + set_channel(); + rc = spdk_file_truncate(mFile, g_sync_args.channel, offset + len); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual Status RangeSync(__attribute__((unused)) uint64_t offset, + __attribute__((unused)) uint64_t nbytes) override + { + int rc; + + /* + * SPDK BlobFS does not have a range sync operation yet, so just sync + * the whole file. + */ + set_channel(); + rc = spdk_file_sync(mFile, g_sync_args.channel); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual size_t GetUniqueId(char *id, size_t max_size) const override + { + int rc; + + rc = spdk_file_get_id(mFile, id, max_size); + if (rc < 0) { + return 0; + } else { + return rc; + } + } +}; + +Status +SpdkWritableFile::Append(const Slice &data) +{ + int64_t rc; + + set_channel(); + rc = spdk_file_write(mFile, g_sync_args.channel, (void *)data.data(), mSize, data.size()); + if (rc >= 0) { + mSize += data.size(); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } +} + +class SpdkDirectory : public Directory +{ +public: + SpdkDirectory() {} + ~SpdkDirectory() {} + Status Fsync() override + { + return Status::OK(); + } +}; + +class SpdkAppStartException : public std::runtime_error +{ +public: + SpdkAppStartException(std::string mess): std::runtime_error(mess) {} +}; + +class SpdkEnv : public EnvWrapper +{ +private: + pthread_t mSpdkTid; + std::string mDirectory; + std::string mConfig; + std::string mBdev; + +public: + SpdkEnv(Env *base_env, const std::string &dir, const std::string &conf, + const std::string &bdev, uint64_t cache_size_in_mb); + + virtual ~SpdkEnv(); + + virtual Status NewSequentialFile(const std::string &fname, + unique_ptr<SequentialFile> *result, + const EnvOptions &options) override + { + if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { + struct spdk_file *file; + int rc; + + std::string name = sanitize_path(fname, mDirectory); + set_channel(); + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, + name.c_str(), 0, &file); + if (rc == 0) { + result->reset(new SpdkSequentialFile(file)); + return Status::OK(); + } else { + /* Myrocks engine uses errno(ENOENT) as one + * special condition, for the purpose to + * support MySQL, set the errno to right value. 
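+ * spdk_fs_open_file() returns a negated errno on failure, so negate it back before building the Status.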
+ */ + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } else { + return EnvWrapper::NewSequentialFile(fname, result, options); + } + } + + virtual Status NewRandomAccessFile(const std::string &fname, + unique_ptr<RandomAccessFile> *result, + const EnvOptions &options) override + { + if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { + std::string name = sanitize_path(fname, mDirectory); + struct spdk_file *file; + int rc; + + set_channel(); + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, + name.c_str(), 0, &file); + if (rc == 0) { + result->reset(new SpdkRandomAccessFile(file)); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } else { + return EnvWrapper::NewRandomAccessFile(fname, result, options); + } + } + + virtual Status NewWritableFile(const std::string &fname, + unique_ptr<WritableFile> *result, + const EnvOptions &options) override + { + if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { + std::string name = sanitize_path(fname, mDirectory); + struct spdk_file *file; + int rc; + + set_channel(); + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, name.c_str(), + SPDK_BLOBFS_OPEN_CREATE, &file); + if (rc == 0) { + result->reset(new SpdkWritableFile(file)); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } else { + return EnvWrapper::NewWritableFile(fname, result, options); + } + } + + virtual Status ReuseWritableFile(const std::string &fname, + const std::string &old_fname, + unique_ptr<WritableFile> *result, + const EnvOptions &options) override + { + return EnvWrapper::ReuseWritableFile(fname, old_fname, result, options); + } + + virtual Status NewDirectory(__attribute__((unused)) const std::string &name, + unique_ptr<Directory> *result) override + { + result->reset(new SpdkDirectory()); + return Status::OK(); + } + virtual Status FileExists(const std::string &fname) override + { + struct spdk_file_stat stat; + int rc; + std::string name = sanitize_path(fname, mDirectory); + + set_channel(); + rc = spdk_fs_file_stat(g_fs, g_sync_args.channel, name.c_str(), &stat); + if (rc == 0) { + return Status::OK(); + } + return EnvWrapper::FileExists(fname); + } + virtual Status RenameFile(const std::string &src, const std::string &t) override + { + int rc; + std::string src_name = sanitize_path(src, mDirectory); + std::string target_name = sanitize_path(t, mDirectory); + + set_channel(); + rc = spdk_fs_rename_file(g_fs, g_sync_args.channel, + src_name.c_str(), target_name.c_str()); + if (rc == -ENOENT) { + return EnvWrapper::RenameFile(src, t); + } + return Status::OK(); + } + virtual Status LinkFile(__attribute__((unused)) const std::string &src, + __attribute__((unused)) const std::string &t) override + { + return Status::NotSupported("SpdkEnv does not support LinkFile"); + } + virtual Status GetFileSize(const std::string &fname, uint64_t *size) override + { + struct spdk_file_stat stat; + int rc; + std::string name = sanitize_path(fname, mDirectory); + + set_channel(); + rc = spdk_fs_file_stat(g_fs, g_sync_args.channel, name.c_str(), &stat); + if (rc == -ENOENT) { + return EnvWrapper::GetFileSize(fname, size); + } + *size = stat.size; + return Status::OK(); + } + virtual Status DeleteFile(const std::string &fname) override + { + int rc; + std::string name = sanitize_path(fname, mDirectory); + + set_channel(); + rc = spdk_fs_delete_file(g_fs, g_sync_args.channel, name.c_str()); + if (rc == -ENOENT) { + return 
EnvWrapper::DeleteFile(fname); + } + return Status::OK(); + } + virtual Status LockFile(const std::string &fname, FileLock **lock) override + { + std::string name = sanitize_path(fname, mDirectory); + int64_t rc; + + set_channel(); + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, name.c_str(), + SPDK_BLOBFS_OPEN_CREATE, (struct spdk_file **)lock); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } + virtual Status UnlockFile(FileLock *lock) override + { + set_channel(); + spdk_file_close((struct spdk_file *)lock, g_sync_args.channel); + return Status::OK(); + } + virtual Status GetChildren(const std::string &dir, + std::vector<std::string> *result) override + { + std::string::size_type pos; + std::set<std::string> dir_and_file_set; + std::string full_path; + std::string filename; + std::string dir_name; + + if (dir.find("archive") != std::string::npos) { + return Status::OK(); + } + if (dir.compare(0, mDirectory.length(), mDirectory) == 0) { + spdk_fs_iter iter; + struct spdk_file *file; + dir_name = sanitize_path(dir, mDirectory); + + iter = spdk_fs_iter_first(g_fs); + while (iter != NULL) { + file = spdk_fs_iter_get_file(iter); + full_path = spdk_file_get_name(file); + if (strncmp(dir_name.c_str(), full_path.c_str(), dir_name.length())) { + iter = spdk_fs_iter_next(iter); + continue; + } + pos = full_path.find("/", dir_name.length() + 1); + + if (pos != std::string::npos) { + filename = full_path.substr(dir_name.length() + 1, pos - dir_name.length() - 1); + } else { + filename = full_path.substr(dir_name.length() + 1); + } + dir_and_file_set.insert(filename); + iter = spdk_fs_iter_next(iter); + } + + for (auto &s : dir_and_file_set) { + result->push_back(s); + } + + result->push_back("."); + result->push_back(".."); + + return Status::OK(); + } + return EnvWrapper::GetChildren(dir, result); + } +}; + +/* The thread local constructor doesn't work for the main thread, since + * the filesystem hasn't been loaded yet. So we break out this + * SpdkInitializeThread function, so that the main thread can explicitly + * call it after the filesystem has been loaded. 
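+ * Calling it on a thread that already has a channel frees the old channel before allocating a new one.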
+ */ +void SpdkInitializeThread(void) +{ + struct spdk_thread *thread; + + if (g_fs != NULL) { + if (g_sync_args.channel) { + spdk_fs_free_thread_ctx(g_sync_args.channel); + } + thread = spdk_thread_create("spdk_rocksdb", NULL); + spdk_set_thread(thread); + g_sync_args.channel = spdk_fs_alloc_thread_ctx(g_fs); + } +} + +static void +fs_load_cb(__attribute__((unused)) void *ctx, + struct spdk_filesystem *fs, int fserrno) +{ + if (fserrno == 0) { + g_fs = fs; + } + g_spdk_ready = true; +} + +static void +rocksdb_run(__attribute__((unused)) void *arg1) +{ + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(g_bdev_name.c_str()); + + if (bdev == NULL) { + SPDK_ERRLOG("bdev %s not found\n", g_bdev_name.c_str()); + exit(1); + } + + g_lcore = spdk_env_get_first_core(); + + g_bs_dev = spdk_bdev_create_bs_dev(bdev, NULL, NULL); + printf("using bdev %s\n", g_bdev_name.c_str()); + spdk_fs_load(g_bs_dev, __send_request, fs_load_cb, NULL); +} + +static void +fs_unload_cb(__attribute__((unused)) void *ctx, + __attribute__((unused)) int fserrno) +{ + assert(fserrno == 0); + + spdk_app_stop(0); +} + +static void +rocksdb_shutdown(void) +{ + if (g_fs != NULL) { + spdk_fs_unload(g_fs, fs_unload_cb, NULL); + } else { + fs_unload_cb(NULL, 0); + } +} + +static void * +initialize_spdk(void *arg) +{ + struct spdk_app_opts *opts = (struct spdk_app_opts *)arg; + int rc; + + rc = spdk_app_start(opts, rocksdb_run, NULL); + /* + * TODO: Revisit for case of internal failure of + * spdk_app_start(), itself. At this time, it's known + * the only application's use of spdk_app_stop() passes + * a zero; i.e. no fail (non-zero) cases so here we + * assume there was an internal failure and flag it + * so we can throw an exception. + */ + if (rc) { + g_spdk_start_failure = true; + } else { + spdk_app_fini(); + delete opts; + } + pthread_exit(NULL); + +} + +SpdkEnv::SpdkEnv(Env *base_env, const std::string &dir, const std::string &conf, + const std::string &bdev, uint64_t cache_size_in_mb) + : EnvWrapper(base_env), mDirectory(dir), mConfig(conf), mBdev(bdev) +{ + struct spdk_app_opts *opts = new struct spdk_app_opts; + + spdk_app_opts_init(opts); + opts->name = "rocksdb"; + opts->config_file = mConfig.c_str(); + opts->shutdown_cb = rocksdb_shutdown; + + spdk_fs_set_cache_size(cache_size_in_mb); + g_bdev_name = mBdev; + + pthread_create(&mSpdkTid, NULL, &initialize_spdk, opts); + while (!g_spdk_ready && !g_spdk_start_failure) + ; + if (g_spdk_start_failure) { + delete opts; + throw SpdkAppStartException("spdk_app_start() unable to start rocksdb_run()"); + } + + SpdkInitializeThread(); +} + +SpdkEnv::~SpdkEnv() +{ + /* This is a workaround for rocksdb test, we close the files if the rocksdb not + * do the work before the test quit. 
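+ * In other words: if RocksDB exits without closing all of its files, close them here so the filesystem can be unloaded cleanly.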
+ */ + if (g_fs != NULL) { + spdk_fs_iter iter; + struct spdk_file *file; + + if (!g_sync_args.channel) { + SpdkInitializeThread(); + } + + iter = spdk_fs_iter_first(g_fs); + while (iter != NULL) { + file = spdk_fs_iter_get_file(iter); + spdk_file_close(file, g_sync_args.channel); + iter = spdk_fs_iter_next(iter); + } + } + + spdk_app_start_shutdown(); + pthread_join(mSpdkTid, NULL); +} + +Env *NewSpdkEnv(Env *base_env, const std::string &dir, const std::string &conf, + const std::string &bdev, uint64_t cache_size_in_mb) +{ + try { + SpdkEnv *spdk_env = new SpdkEnv(base_env, dir, conf, bdev, cache_size_in_mb); + if (g_fs != NULL) { + return spdk_env; + } else { + delete spdk_env; + return NULL; + } + } catch (SpdkAppStartException &e) { + SPDK_ERRLOG("NewSpdkEnv: exception caught: %s", e.what()); + return NULL; + } catch (...) { + SPDK_ERRLOG("NewSpdkEnv: default exception caught"); + return NULL; + } +} + +} // namespace rocksdb diff --git a/src/spdk/lib/rocksdb/spdk.rocksdb.mk b/src/spdk/lib/rocksdb/spdk.rocksdb.mk new file mode 100644 index 000000000..fe498cc39 --- /dev/null +++ b/src/spdk/lib/rocksdb/spdk.rocksdb.mk @@ -0,0 +1,70 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# This snippet will be included into the RocksDB Makefile + +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.app.mk +include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk + +CXXFLAGS += -I$(SPDK_DIR)/include -Iinclude/ + +# The SPDK makefiles turn this on, but RocksDB won't compile with it. So +# turn it off after including the SPDK makefiles. +CXXFLAGS += -Wno-missing-declarations + +# The SPDK Makefiles may turn these options on but we do not want to enable +# them for the RocksDB source files. 
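+# Coverage instrumentation is turned off unconditionally; each sanitizer is only turned off when the matching CONFIG flag enabled it in the first place.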
+CXXFLAGS += -fno-profile-arcs -fno-test-coverage +ifeq ($(CONFIG_UBSAN),y) +CXXFLAGS += -fno-sanitize=undefined +endif +ifeq ($(CONFIG_ASAN),y) +CXXFLAGS += -fno-sanitize=address +endif + +SPDK_LIB_LIST = $(ALL_MODULES_LIST) +SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM) +SPDK_LIB_LIST += bdev accel event util conf trace log jsonrpc json rpc sock thread notify +SPDK_LIB_LIST += bdev_rpc blobfs_bdev + +AM_LINK += $(SPDK_LIB_LINKER_ARGS) $(ENV_LINKER_ARGS) +AM_LINK += $(SYS_LIBS) + +ifeq ($(CONFIG_UBSAN),y) +AM_LINK += -fsanitize=undefined +endif + +ifeq ($(CONFIG_COVERAGE),y) +AM_LINK += -fprofile-arcs -ftest-coverage +endif diff --git a/src/spdk/lib/rpc/Makefile b/src/spdk/lib/rpc/Makefile new file mode 100644 index 000000000..ead36f6ba --- /dev/null +++ b/src/spdk/lib/rpc/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = rpc.c +LIBNAME = rpc + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_rpc.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/rpc/rpc.c b/src/spdk/lib/rpc/rpc.c new file mode 100644 index 000000000..7182f41e9 --- /dev/null +++ b/src/spdk/lib/rpc/rpc.c @@ -0,0 +1,392 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/file.h> + +#include "spdk/stdinc.h" + +#include "spdk/queue.h" +#include "spdk/rpc.h" +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/version.h" + +static struct sockaddr_un g_rpc_listen_addr_unix = {}; +static char g_rpc_lock_path[sizeof(g_rpc_listen_addr_unix.sun_path) + sizeof(".lock")]; +static int g_rpc_lock_fd = -1; + +static struct spdk_jsonrpc_server *g_jsonrpc_server = NULL; +static uint32_t g_rpc_state; +static bool g_rpcs_correct = true; + +struct spdk_rpc_method { + const char *name; + spdk_rpc_method_handler func; + SLIST_ENTRY(spdk_rpc_method) slist; + uint32_t state_mask; + bool is_deprecated; + struct spdk_rpc_method *is_alias_of; + bool deprecation_warning_printed; +}; + +static SLIST_HEAD(, spdk_rpc_method) g_rpc_methods = SLIST_HEAD_INITIALIZER(g_rpc_methods); + +void +spdk_rpc_set_state(uint32_t state) +{ + g_rpc_state = state; +} + +uint32_t +spdk_rpc_get_state(void) +{ + return g_rpc_state; +} + +static struct spdk_rpc_method * +_get_rpc_method(const struct spdk_json_val *method) +{ + struct spdk_rpc_method *m; + + SLIST_FOREACH(m, &g_rpc_methods, slist) { + if (spdk_json_strequal(method, m->name)) { + return m; + } + } + + return NULL; +} + +static struct spdk_rpc_method * +_get_rpc_method_raw(const char *method) +{ + struct spdk_json_val method_val; + + method_val.type = SPDK_JSON_VAL_STRING; + method_val.len = strlen(method); + method_val.start = (char *)method; + + return _get_rpc_method(&method_val); +} + +static void +jsonrpc_handler(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *method, + const struct spdk_json_val *params) +{ + struct spdk_rpc_method *m; + + assert(method != NULL); + + m = _get_rpc_method(method); + if (m == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_METHOD_NOT_FOUND, "Method not found"); + return; + } + + if (m->is_alias_of != NULL) { + if (m->is_deprecated && !m->deprecation_warning_printed) { + SPDK_WARNLOG("RPC method %s is deprecated. 
Use %s instead.\n", m->name, m->is_alias_of->name); + m->deprecation_warning_printed = true; + } + m = m->is_alias_of; + } + + if ((m->state_mask & g_rpc_state) == g_rpc_state) { + m->func(request, params); + } else { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE, + "Method is allowed in any state in the mask (%"PRIx32")," + " but current state is (%"PRIx32")", + m->state_mask, g_rpc_state); + } +} + +int +spdk_rpc_listen(const char *listen_addr) +{ + int rc; + + memset(&g_rpc_listen_addr_unix, 0, sizeof(g_rpc_listen_addr_unix)); + + g_rpc_listen_addr_unix.sun_family = AF_UNIX; + rc = snprintf(g_rpc_listen_addr_unix.sun_path, + sizeof(g_rpc_listen_addr_unix.sun_path), + "%s", listen_addr); + if (rc < 0 || (size_t)rc >= sizeof(g_rpc_listen_addr_unix.sun_path)) { + SPDK_ERRLOG("RPC Listen address Unix socket path too long\n"); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + return -1; + } + + rc = snprintf(g_rpc_lock_path, sizeof(g_rpc_lock_path), "%s.lock", + g_rpc_listen_addr_unix.sun_path); + if (rc < 0 || (size_t)rc >= sizeof(g_rpc_lock_path)) { + SPDK_ERRLOG("RPC lock path too long\n"); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + g_rpc_lock_path[0] = '\0'; + return -1; + } + + g_rpc_lock_fd = open(g_rpc_lock_path, O_RDONLY | O_CREAT, 0600); + if (g_rpc_lock_fd == -1) { + SPDK_ERRLOG("Cannot open lock file %s: %s\n", + g_rpc_lock_path, spdk_strerror(errno)); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + g_rpc_lock_path[0] = '\0'; + return -1; + } + + rc = flock(g_rpc_lock_fd, LOCK_EX | LOCK_NB); + if (rc != 0) { + SPDK_ERRLOG("RPC Unix domain socket path %s in use. Specify another.\n", + g_rpc_listen_addr_unix.sun_path); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + g_rpc_lock_path[0] = '\0'; + return -1; + } + + /* + * Since we acquired the lock, it is safe to delete the Unix socket file + * if it still exists from a previous process. 
+ */ + unlink(g_rpc_listen_addr_unix.sun_path); + + g_jsonrpc_server = spdk_jsonrpc_server_listen(AF_UNIX, 0, + (struct sockaddr *)&g_rpc_listen_addr_unix, + sizeof(g_rpc_listen_addr_unix), + jsonrpc_handler); + if (g_jsonrpc_server == NULL) { + SPDK_ERRLOG("spdk_jsonrpc_server_listen() failed\n"); + close(g_rpc_lock_fd); + g_rpc_lock_fd = -1; + unlink(g_rpc_lock_path); + g_rpc_lock_path[0] = '\0'; + return -1; + } + + return 0; +} + +void +spdk_rpc_accept(void) +{ + spdk_jsonrpc_server_poll(g_jsonrpc_server); +} + +void +spdk_rpc_register_method(const char *method, spdk_rpc_method_handler func, uint32_t state_mask) +{ + struct spdk_rpc_method *m; + + m = _get_rpc_method_raw(method); + if (m != NULL) { + SPDK_ERRLOG("duplicate RPC %s registered...\n", method); + g_rpcs_correct = false; + return; + } + + m = calloc(1, sizeof(struct spdk_rpc_method)); + assert(m != NULL); + + m->name = strdup(method); + assert(m->name != NULL); + + m->func = func; + m->state_mask = state_mask; + + /* TODO: use a hash table or sorted list */ + SLIST_INSERT_HEAD(&g_rpc_methods, m, slist); +} + +void +spdk_rpc_register_alias_deprecated(const char *method, const char *alias) +{ + struct spdk_rpc_method *m, *base; + + base = _get_rpc_method_raw(method); + if (base == NULL) { + SPDK_ERRLOG("cannot create alias %s - method %s does not exist\n", + alias, method); + g_rpcs_correct = false; + return; + } + + if (base->is_alias_of != NULL) { + SPDK_ERRLOG("cannot create alias %s of alias %s\n", alias, method); + g_rpcs_correct = false; + return; + } + + m = calloc(1, sizeof(struct spdk_rpc_method)); + assert(m != NULL); + + m->name = strdup(alias); + assert(m->name != NULL); + + m->is_alias_of = base; + m->is_deprecated = true; + m->state_mask = base->state_mask; + + /* TODO: use a hash table or sorted list */ + SLIST_INSERT_HEAD(&g_rpc_methods, m, slist); +} + +bool +spdk_rpc_verify_methods(void) +{ + return g_rpcs_correct; +} + +int +spdk_rpc_is_method_allowed(const char *method, uint32_t state_mask) +{ + struct spdk_rpc_method *m; + + SLIST_FOREACH(m, &g_rpc_methods, slist) { + if (strcmp(m->name, method) != 0) { + continue; + } + + if ((m->state_mask & state_mask) == state_mask) { + return 0; + } else { + return -EPERM; + } + } + + return -ENOENT; +} + +void +spdk_rpc_close(void) +{ + if (g_jsonrpc_server) { + if (g_rpc_listen_addr_unix.sun_path[0]) { + /* Delete the Unix socket file */ + unlink(g_rpc_listen_addr_unix.sun_path); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + } + + spdk_jsonrpc_server_shutdown(g_jsonrpc_server); + g_jsonrpc_server = NULL; + + if (g_rpc_lock_fd != -1) { + close(g_rpc_lock_fd); + g_rpc_lock_fd = -1; + } + + if (g_rpc_lock_path[0]) { + unlink(g_rpc_lock_path); + g_rpc_lock_path[0] = '\0'; + } + } +} + +struct rpc_get_methods { + bool current; + bool include_aliases; +}; + +static const struct spdk_json_object_decoder rpc_get_methods_decoders[] = { + {"current", offsetof(struct rpc_get_methods, current), spdk_json_decode_bool, true}, + {"include_aliases", offsetof(struct rpc_get_methods, include_aliases), spdk_json_decode_bool, true}, +}; + +static void +rpc_get_methods(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct rpc_get_methods req = {}; + struct spdk_json_write_ctx *w; + struct spdk_rpc_method *m; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_get_methods_decoders, + SPDK_COUNTOF(rpc_get_methods_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, 
SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + SLIST_FOREACH(m, &g_rpc_methods, slist) { + if (m->is_alias_of != NULL && !req.include_aliases) { + continue; + } + if (req.current && ((m->state_mask & g_rpc_state) != g_rpc_state)) { + continue; + } + spdk_json_write_string(w, m->name); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("rpc_get_methods", rpc_get_methods, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(rpc_get_methods, get_rpc_methods) + +static void +rpc_spdk_get_version(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "spdk_get_version method requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_object_begin(w); + + spdk_json_write_named_string_fmt(w, "version", "%s", SPDK_VERSION_STRING); + spdk_json_write_named_object_begin(w, "fields"); + spdk_json_write_named_uint32(w, "major", SPDK_VERSION_MAJOR); + spdk_json_write_named_uint32(w, "minor", SPDK_VERSION_MINOR); + spdk_json_write_named_uint32(w, "patch", SPDK_VERSION_PATCH); + spdk_json_write_named_string_fmt(w, "suffix", "%s", SPDK_VERSION_SUFFIX); +#ifdef SPDK_GIT_COMMIT + spdk_json_write_named_string_fmt(w, "commit", "%s", SPDK_GIT_COMMIT_STRING); +#endif + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("spdk_get_version", rpc_spdk_get_version, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(spdk_get_version, get_spdk_version) diff --git a/src/spdk/lib/rpc/spdk_rpc.map b/src/spdk/lib/rpc/spdk_rpc.map new file mode 100644 index 000000000..e15ff8b53 --- /dev/null +++ b/src/spdk/lib/rpc/spdk_rpc.map @@ -0,0 +1,16 @@ +{ + global: + + # public functions + spdk_rpc_verify_methods; + spdk_rpc_listen; + spdk_rpc_accept; + spdk_rpc_close; + spdk_rpc_register_method; + spdk_rpc_register_alias_deprecated; + spdk_rpc_is_method_allowed; + spdk_rpc_set_state; + spdk_rpc_get_state; + + local: *; +}; diff --git a/src/spdk/lib/rte_vhost/Makefile b/src/spdk/lib/rte_vhost/Makefile new file mode 100644 index 000000000..aa073c6ca --- /dev/null +++ b/src/spdk/lib/rte_vhost/Makefile @@ -0,0 +1,50 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += -I. +CFLAGS += $(ENV_CFLAGS) +CFLAGS += -include rte_config.h +CFLAGS += -Wno-address-of-packed-member + +# These are the DPDK vhost files copied (for now) into SPDK +C_SRCS += fd_man.c socket.c vhost_user.c vhost.c + +LIBNAME = rte_vhost + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/rte_vhost/fd_man.c b/src/spdk/lib/rte_vhost/fd_man.c new file mode 100644 index 000000000..2ceacc9ab --- /dev/null +++ b/src/spdk/lib/rte_vhost/fd_man.c @@ -0,0 +1,300 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> +#include <string.h> + +#include <rte_common.h> +#include <rte_log.h> + +#include "fd_man.h" + +#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL) + +static int +get_last_valid_idx(struct fdset *pfdset, int last_valid_idx) +{ + int i; + + for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--) + ; + + return i; +} + +static void +fdset_move(struct fdset *pfdset, int dst, int src) +{ + pfdset->fd[dst] = pfdset->fd[src]; + pfdset->rwfds[dst] = pfdset->rwfds[src]; +} + +static void +fdset_shrink_nolock(struct fdset *pfdset) +{ + int i; + int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1); + + for (i = 0; i < last_valid_idx; i++) { + if (pfdset->fd[i].fd != -1) + continue; + + fdset_move(pfdset, i, last_valid_idx); + last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1); + } + pfdset->num = last_valid_idx + 1; +} + +/* + * Find deleted fd entries and remove them + */ +static void +fdset_shrink(struct fdset *pfdset) +{ + pthread_mutex_lock(&pfdset->fd_mutex); + fdset_shrink_nolock(pfdset); + pthread_mutex_unlock(&pfdset->fd_mutex); +} + +/** + * Returns the index in the fdset for a given fd. + * @return + * index for the fd, or -1 if fd isn't in the fdset. + */ +static int +fdset_find_fd(struct fdset *pfdset, int fd) +{ + int i; + + for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++) + ; + + return i == pfdset->num ? -1 : i; +} + +static void +fdset_add_fd(struct fdset *pfdset, int idx, int fd, + fd_cb rcb, fd_cb wcb, void *dat) +{ + struct fdentry *pfdentry = &pfdset->fd[idx]; + struct pollfd *pfd = &pfdset->rwfds[idx]; + + pfdentry->fd = fd; + pfdentry->rcb = rcb; + pfdentry->wcb = wcb; + pfdentry->dat = dat; + + pfd->fd = fd; + pfd->events = rcb ? POLLIN : 0; + pfd->events |= wcb ? POLLOUT : 0; + pfd->revents = 0; +} + +void +fdset_init(struct fdset *pfdset) +{ + int i; + + if (pfdset == NULL) + return; + + for (i = 0; i < MAX_FDS; i++) { + pfdset->fd[i].fd = -1; + pfdset->fd[i].dat = NULL; + } + pfdset->num = 0; +} + +/** + * Register the fd in the fdset with read/write handler and context. + */ +int +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) +{ + int i; + + if (pfdset == NULL || fd == -1) + return -1; + + pthread_mutex_lock(&pfdset->fd_mutex); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + fdset_shrink_nolock(pfdset); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; + } + } + + fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); + pthread_mutex_unlock(&pfdset->fd_mutex); + + return 0; +} + +/** + * Unregister the fd from the fdset. + * Returns context of a given fd or NULL. + */ +void * +fdset_del(struct fdset *pfdset, int fd) +{ + int i; + void *dat = NULL; + + if (pfdset == NULL || fd == -1) + return NULL; + + do { + pthread_mutex_lock(&pfdset->fd_mutex); + + i = fdset_find_fd(pfdset, fd); + if (i != -1 && pfdset->fd[i].busy == 0) { + /* busy indicates r/wcb is executing! */ + dat = pfdset->fd[i].dat; + pfdset->fd[i].fd = -1; + pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; + pfdset->fd[i].dat = NULL; + i = -1; + } + pthread_mutex_unlock(&pfdset->fd_mutex); + } while (i != -1); + + return dat; +} + + +/** + * This functions runs in infinite blocking loop until there is no fd in + * pfdset. It calls corresponding r/w handler if there is event on the fd. 
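+ * Each iteration polls with a one second timeout, so fds registered while poll() is blocked are picked up on the next pass.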
+ * + * Before the callback is called, we set the flag to busy status; If other + * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it + * will wait until the flag is reset to zero(which indicates the callback is + * finished), then it could free the context after fdset_del. + */ +void * +fdset_event_dispatch(void *arg) +{ + int i; + struct pollfd *pfd; + struct fdentry *pfdentry; + fd_cb rcb, wcb; + void *dat; + int fd, numfds; + int remove1, remove2; + int need_shrink; + struct fdset *pfdset = arg; + + if (pfdset == NULL) + return NULL; + + while (1) { + + /* + * When poll is blocked, other threads might unregister + * listenfds from and register new listenfds into fdset. + * When poll returns, the entries for listenfds in the fdset + * might have been updated. It is ok if there is unwanted call + * for new listenfds. + */ + pthread_mutex_lock(&pfdset->fd_mutex); + numfds = pfdset->num; + pthread_mutex_unlock(&pfdset->fd_mutex); + + poll(pfdset->rwfds, numfds, 1000 /* millisecs */); + + need_shrink = 0; + for (i = 0; i < numfds; i++) { + pthread_mutex_lock(&pfdset->fd_mutex); + + pfdentry = &pfdset->fd[i]; + fd = pfdentry->fd; + pfd = &pfdset->rwfds[i]; + + if (fd < 0) { + need_shrink = 1; + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + if (!pfd->revents) { + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + remove1 = remove2 = 0; + + rcb = pfdentry->rcb; + wcb = pfdentry->wcb; + dat = pfdentry->dat; + pfdentry->busy = 1; + + pthread_mutex_unlock(&pfdset->fd_mutex); + + if (rcb && pfd->revents & (POLLIN | FDPOLLERR)) + rcb(fd, dat, &remove1); + if (wcb && pfd->revents & (POLLOUT | FDPOLLERR)) + wcb(fd, dat, &remove2); + pfdentry->busy = 0; + /* + * fdset_del needs to check busy flag. + * We don't allow fdset_del to be called in callback + * directly. + */ + /* + * When we are to clean up the fd from fdset, + * because the fd is closed in the cb, + * the old fd val could be reused by when creates new + * listen fd in another thread, we couldn't call + * fd_set_del. + */ + if (remove1 || remove2) { + pfdentry->fd = -1; + need_shrink = 1; + } + } + + if (need_shrink) + fdset_shrink(pfdset); + } + + return NULL; +} diff --git a/src/spdk/lib/rte_vhost/fd_man.h b/src/spdk/lib/rte_vhost/fd_man.h new file mode 100644 index 000000000..3a9d269b3 --- /dev/null +++ b/src/spdk/lib/rte_vhost/fd_man.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _FD_MAN_H_ +#define _FD_MAN_H_ +#include <stdint.h> +#include <pthread.h> +#include <poll.h> + +#define MAX_FDS 1024 + +typedef void (*fd_cb)(int fd, void *dat, int *remove); + +struct fdentry { + int fd; /* -1 indicates this entry is empty */ + fd_cb rcb; /* callback when this fd is readable. */ + fd_cb wcb; /* callback when this fd is writeable. */ + void *dat; /* fd context */ + int busy; /* whether this entry is being used in cb. */ +}; + +struct fdset { + struct pollfd rwfds[MAX_FDS]; + struct fdentry fd[MAX_FDS]; + pthread_mutex_t fd_mutex; + int num; /* current fd number of this fdset */ +}; + + +void fdset_init(struct fdset *pfdset); + +int fdset_add(struct fdset *pfdset, int fd, + fd_cb rcb, fd_cb wcb, void *dat); + +void *fdset_del(struct fdset *pfdset, int fd); + +void *fdset_event_dispatch(void *arg); + +#endif diff --git a/src/spdk/lib/rte_vhost/rte_vhost.h b/src/spdk/lib/rte_vhost/rte_vhost.h new file mode 100644 index 000000000..b1b7f2cd8 --- /dev/null +++ b/src/spdk/lib/rte_vhost/rte_vhost.h @@ -0,0 +1,635 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _RTE_VHOST_H_ +#define _RTE_VHOST_H_ + +/** + * @file + * Interface to vhost-user + */ + +#include <stdint.h> +#include <linux/vhost.h> +#include <linux/virtio_ring.h> +#include <sys/eventfd.h> + +#include <rte_config.h> +#include <rte_memory.h> +#include <rte_mempool.h> + +#define RTE_VHOST_USER_CLIENT (1ULL << 0) +#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) +#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) + +/** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. + */ +struct rte_vhost_mem_region { + uint64_t guest_phys_addr; + uint64_t guest_user_addr; + uint64_t host_user_addr; + uint64_t size; + void *mmap_addr; + uint64_t mmap_size; + int fd; +}; + +/** + * Memory structure includes region and mapping information. + */ +struct rte_vhost_memory { + uint32_t nregions; + struct rte_vhost_mem_region regions[0]; +}; + +struct rte_vhost_inflight_desc_split { + uint8_t inflight; + uint8_t padding[5]; + uint16_t next; + uint64_t counter; +}; + +struct rte_vhost_inflight_info_split { + uint64_t features; + uint16_t version; + uint16_t desc_num; + uint16_t last_inflight_io; + uint16_t used_idx; + struct rte_vhost_inflight_desc_split desc[0]; +}; + +struct rte_vhost_resubmit_desc { + uint16_t index; + uint64_t counter; +}; + +struct rte_vhost_resubmit_info { + struct rte_vhost_resubmit_desc *resubmit_list; + uint16_t resubmit_num; +}; + +struct rte_vhost_ring_inflight { + struct rte_vhost_inflight_info_split *inflight_split; + struct rte_vhost_resubmit_info *resubmit_inflight; +}; + +struct rte_vhost_vring { + union { + struct vring_desc *desc; + struct vring_packed_desc *desc_packed; + }; + union { + struct vring_avail *avail; + struct vring_packed_desc_event *driver_event; + }; + union { + struct vring_used *used; + struct vring_packed_desc_event *device_event; + }; + uint64_t log_guest_addr; + + int callfd; + int kickfd; + uint16_t size; +}; + +/** + * Device and vring operations. + */ +struct vhost_device_ops { + int (*new_device)(int vid); /**< Add device. */ + void (*destroy_device)(int vid); /**< Remove device. */ + + int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */ + + /** + * Features could be changed after the feature negotiation. + * For example, VHOST_F_LOG_ALL will be set/cleared at the + * start/end of live migration, respectively. This callback + * is used to inform the application on such change. 
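+ *
+ * A minimal handler might look like this (application-side sketch;
+ * the handler name is hypothetical):
+ *
+ *	static int
+ *	app_features_changed(int vid, uint64_t features)
+ *	{
+ *		// e.g. start or stop dirty page logging when
+ *		// VHOST_F_LOG_ALL toggles
+ *		return 0;
+ *	}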
+ */ + int (*features_changed)(int vid, uint64_t features); + int (*vhost_nvme_admin_passthrough)(int vid, void *cmd, void *cqe, void *buf); + int (*vhost_nvme_set_cq_call)(int vid, uint16_t qid, int fd); + int (*vhost_nvme_set_bar_mr)(int vid, void *bar_addr, uint64_t bar_size); + int (*vhost_nvme_get_cap)(int vid, uint64_t *cap); + + int (*new_connection)(int vid); + void (*destroy_connection)(int vid); + + int (*get_config)(int vid, uint8_t *config, uint32_t config_len); + int (*set_config)(int vid, uint8_t *config, uint32_t offset, + uint32_t len, uint32_t flags); + + void *reserved[2]; /**< Reserved for future extension */ +}; + +/** + * Convert guest physical address to host virtual address + * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @return + * the host virtual address on success, 0 on failure + */ +static inline uint64_t __attribute__((always_inline)) +rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + reg = &mem->regions[i]; + if (gpa >= reg->guest_phys_addr && + gpa < reg->guest_phys_addr + reg->size) { + return gpa - reg->guest_phys_addr + + reg->host_user_addr; + } + } + + return 0; +} + +/** + * Convert guest physical address to host virtual address safely + * + * This variant of rte_vhost_gpa_to_vva() takes care all the + * requested length is mapped and contiguous in process address + * space. + * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @param len + * the size of the requested area to map, + * updated with actual size mapped + * @return + * the host virtual address on success, 0 on failure */ +static inline uint64_t +rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, + uint64_t gpa, uint64_t *len) +{ + struct rte_vhost_mem_region *r; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + r = &mem->regions[i]; + if (gpa >= r->guest_phys_addr && + gpa < r->guest_phys_addr + r->size) { + + if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) + *len = r->guest_phys_addr + r->size - gpa; + + return gpa - r->guest_phys_addr + + r->host_user_addr; + } + } + *len = 0; + + return 0; +} + +#define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL)) + +/** + * Log the memory write start with given address. + * + * This function only need be invoked when the live migration starts. + * Therefore, we won't need call it at all in the most of time. For + * making the performance impact be minimum, it's suggested to do a + * check before calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_write(vid, addr, len); + * + * @param vid + * vhost device ID + * @param addr + * the starting address for write + * @param len + * the length to write + */ +void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len); + +/** + * Log the used ring update start at given offset. 
+ * + * Same as rte_vhost_log_write, it's suggested to do a check before + * calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_used_vring(vid, vring_idx, offset, len); + * + * @param vid + * vhost device ID + * @param vring_idx + * the vring index + * @param offset + * the offset inside the used ring + * @param len + * the length to write + */ +void rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len); + +int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); + +/** + * Register vhost driver. path could be different for multiple + * instance support. + */ +int rte_vhost_driver_register(const char *path, uint64_t flags); + +/* Unregister vhost driver. This is only meaningful to vhost user. */ +int rte_vhost_driver_unregister(const char *path); + +/** + * Set the feature bits the vhost-user driver supports. + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_set_features(const char *path, uint64_t features); + +/** + * Enable vhost-user driver features. + * + * Note that + * - the param @features should be a subset of the feature bits provided + * by rte_vhost_driver_set_features(). + * - it must be invoked before vhost-user negotiation starts. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to enable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_enable_features(const char *path, uint64_t features); + +/** + * Disable vhost-user driver features. + * + * The two notes at rte_vhost_driver_enable_features() also apply here. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to disable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_disable_features(const char *path, uint64_t features); + +/** + * Get the feature bits before feature negotiation. + * + * @param path + * The vhost-user socket file path + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_get_features(const char *path, uint64_t *features); + +/** + * Get the feature bits after negotiation + * + * @param vid + * Vhost device ID + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_negotiated_features(int vid, uint64_t *features); + +/* Register callbacks. */ +int rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops); + +/** + * + * Start the vhost-user driver. + * + * This function triggers the vhost-user negotiation. + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_start(const char *path); + +/** + * Get the MTU value of the device if set in QEMU. + * + * @param vid + * virtio-net device ID + * @param mtu + * The variable to store the MTU value + * + * @return + * 0: success + * -EAGAIN: device not yet started + * -ENOTSUP: device does not support MTU feature + */ +int rte_vhost_get_mtu(int vid, uint16_t *mtu); + +/** + * Get the numa node from which the virtio net device's memory + * is allocated. + * + * @param vid + * vhost device ID + * + * @return + * The numa node, -1 on failure + */ +int rte_vhost_get_numa_node(int vid); + +/** + * Get the virtio net device's ifname, which is the vhost-user socket + * file path. 
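+ *
+ * Example (sketch only; the buffer size is arbitrary):
+ *
+ *	char ifname[256];
+ *	if (rte_vhost_get_ifname(vid, ifname, sizeof(ifname)) == 0)
+ *		printf("vhost-user socket: %s\n", ifname);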
+ * + * @param vid + * vhost device ID + * @param buf + * The buffer to stored the queried ifname + * @param len + * The length of buf + * + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_ifname(int vid, char *buf, size_t len); + +/** + * Get how many avail entries are left in the queue + * + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index + * + * @return + * num of avail entires left + */ +uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id); + +struct rte_mbuf; +struct rte_mempool; +/** + * This function adds buffers to the virtio devices RX virtqueue. Buffers can + * be received from the physical port or from another virtual device. A packet + * count is returned to indicate the number of packets that were succesfully + * added to the RX queue. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param pkts + * array to contain packets to be enqueued + * @param count + * packets num to be enqueued + * @return + * num of packets enqueued + */ +uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count); + +/** + * This function gets guest buffers from the virtio device TX virtqueue, + * construct host mbufs, copies guest buffer content to host mbufs and + * store them in pkts to be processed. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param mbuf_pool + * mbuf_pool where host mbuf is allocated. + * @param pkts + * array to contain packets to be dequeued + * @param count + * packets num to be dequeued + * @return + * num of packets dequeued + */ +uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); + +/** + * Get guest mem table: a list of memory regions. + * + * An rte_vhost_vhost_memory object will be allocated internaly, to hold the + * guest memory regions. Application should free it at destroy_device() + * callback. + * + * @param vid + * vhost device ID + * @param mem + * To store the returned mem regions + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); + +/** + * Get guest vring info, including the vring address, vring size, etc. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param vring + * the structure to hold the requested vring info + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring); + +/** + * Set id of the last descriptors in avail and used guest vrings. + * + * In case user application operates directly on buffers, it should use this + * function on device destruction to retrieve the same values later on in device + * creation via rte_vhost_get_vhost_vring(int, uint16_t, struct rte_vhost_vring *) + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param last_avail_idx + * id of the last descriptor in avail ring to be set + * @param last_used_idx + * id of the last descriptor in used ring to be set + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_set_vring_base(int vid, uint16_t queue_id, + uint16_t last_avail_idx, uint16_t last_used_idx); + +int rte_vhost_get_vring_base(int vid, uint16_t queue_id, + uint16_t *last_avail_idx, uint16_t *last_used_idx); + +/** + * Notify the guest that used descriptors have been added to the vring. 
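+ *
+ * Typical call site once the backend has updated the used ring
+ * (illustrative sketch; "features", "offset" and "len" come from the
+ * surrounding backend code):
+ *
+ *	if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ *		rte_vhost_log_used_vring(vid, vring_idx, offset, len);
+ *	rte_vhost_vring_call(vid, vring_idx);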
+ * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_vring_call(int vid, uint16_t vring_idx); + +/** + * Get guest inflight vring info, including inflight ring and resubmit list. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param vring + * the structure to hold the requested inflight vring info + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx, + struct rte_vhost_ring_inflight *vring); + +/** + * Set split inflight descriptor. + * + * This function save descriptors that has been comsumed in available + * ring + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param idx + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t idx); + +/** + * Save the head of list that the last batch of used descriptors. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param idx + * descriptor entry index + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_set_last_inflight_io_split(int vid, + uint16_t vring_idx, uint16_t idx); + +/** + * Clear the split inflight status. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param last_used_idx + * last used idx of used ring + * @param idx + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t last_used_idx, uint16_t idx); + +/** + * Save the head of list that the last batch of used descriptors. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param idx + * descriptor entry index + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_set_last_inflight_io_split(int vid, + uint16_t vring_idx, uint16_t idx); + +/** + * Clear the split inflight status. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param last_used_idx + * last used idx of used ring + * @param idx + * inflight entry index + * @return + * 0 on success, -1 on failure + */ +__rte_experimental +int +rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t last_used_idx, uint16_t idx); +#endif /* _RTE_VHOST_H_ */ diff --git a/src/spdk/lib/rte_vhost/socket.c b/src/spdk/lib/rte_vhost/socket.c new file mode 100644 index 000000000..ec923518b --- /dev/null +++ b/src/spdk/lib/rte_vhost/socket.c @@ -0,0 +1,841 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <stdio.h> +#include <stdbool.h> +#include <limits.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/queue.h> +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> + +#include <rte_log.h> + +#include "fd_man.h" +#include "vhost.h" +#include "vhost_user.h" + + +TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection); + +/* + * Every time rte_vhost_driver_register() is invoked, an associated + * vhost_user_socket struct will be created. + */ +struct vhost_user_socket { + struct vhost_user_connection_list conn_list; + pthread_mutex_t conn_mutex; + char *path; + int socket_fd; + struct sockaddr_un un; + bool is_server; + bool reconnect; + bool dequeue_zero_copy; + + /* + * The "supported_features" indicates the feature bits the + * vhost driver supports. The "features" indicates the feature + * bits after the rte_vhost_driver_features_disable/enable(). + * It is also the final feature bits used for vhost-user + * features negotiation. + */ + uint64_t supported_features; + uint64_t features; + + struct vhost_device_ops const *notify_ops; +}; + +struct vhost_user_connection { + struct vhost_user_socket *vsocket; + int connfd; + int vid; + + TAILQ_ENTRY(vhost_user_connection) next; +}; + +#define MAX_VHOST_SOCKET 1024 +struct vhost_user { + struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; + struct fdset fdset; + int vsocket_cnt; + pthread_mutex_t mutex; +}; + +#define MAX_VIRTIO_BACKLOG 128 + +static void vhost_user_server_new_connection(int fd, void *data, int *remove); +static void vhost_user_read_cb(int fd, void *dat, int *remove); +static int create_unix_socket(struct vhost_user_socket *vsocket); +static int vhost_user_start_client(struct vhost_user_socket *vsocket); + +static struct vhost_user vhost_user = { + .fdset = { + .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, + .fd_mutex = PTHREAD_MUTEX_INITIALIZER, + .num = 0 + }, + .vsocket_cnt = 0, + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +/* return bytes# of read on success or negative val on failure. 
*/ +int +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { + if (ret) + RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed, %s\n", strerror(errno)); + else + RTE_LOG(INFO, VHOST_CONFIG, "peer closed\n"); + return ret; + } + + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n"); + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(fds, CMSG_DATA(cmsg), fdsize); + break; + } + } + + return ret; +} + +int +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + if (cmsg == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n"); + errno = EINVAL; + return -1; + } + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + ret = sendmsg(sockfd, &msgh, 0); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); + return ret; + } + + return ret; +} + +static void +vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) +{ + int vid; + size_t size; + struct vhost_user_connection *conn; + int ret; + + conn = malloc(sizeof(*conn)); + if (conn == NULL) { + close(fd); + return; + } + + vid = vhost_new_device(vsocket->features, vsocket->notify_ops); + if (vid == -1) { + goto err; + } + + size = strnlen(vsocket->path, PATH_MAX); + vhost_set_ifname(vid, vsocket->path, size); + + if (vsocket->dequeue_zero_copy) + vhost_enable_dequeue_zero_copy(vid); + + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); + + if (vsocket->notify_ops->new_connection) { + ret = vsocket->notify_ops->new_connection(vid); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add vhost user connection with fd %d\n", + fd); + goto err; + } + } + + conn->connfd = fd; + conn->vsocket = vsocket; + conn->vid = vid; + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, + NULL, conn); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add fd %d into vhost server fdset\n", + fd); + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + goto err; + } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + return; + +err: + free(conn); + close(fd); +} + +/* call back when there is new vhost-user connection from client */ +static void +vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) +{ + 
struct vhost_user_socket *vsocket = dat; + + fd = accept(fd, NULL, NULL); + if (fd < 0) + return; + + RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); + vhost_user_add_connection(fd, vsocket); +} + +static void +vhost_user_read_cb(int connfd, void *dat, int *remove) +{ + struct vhost_user_connection *conn = dat; + struct vhost_user_socket *vsocket = conn->vsocket; + int ret; + + ret = vhost_user_msg_handler(conn->vid, connfd); + if (ret < 0) { + *remove = 1; + vhost_destroy_device(conn->vid); + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + if (conn->connfd != -1) { + close(conn->connfd); + conn->connfd = -1; + } + pthread_mutex_unlock(&vsocket->conn_mutex); + + free(conn); + + if (vsocket->reconnect) { + create_unix_socket(vsocket); + vhost_user_start_client(vsocket); + } + } +} + +static int +create_unix_socket(struct vhost_user_socket *vsocket) +{ + int fd; + struct sockaddr_un *un = &vsocket->un; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", + vsocket->is_server ? "server" : "client", fd); + + if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "vhost-user: can't set nonblocking mode for socket, fd: " + "%d (%s)\n", fd, strerror(errno)); + close(fd); + return -1; + } + + memset(un, 0, sizeof(*un)); + un->sun_family = AF_UNIX; + strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path)); + un->sun_path[sizeof(un->sun_path) - 1] = '\0'; + + vsocket->socket_fd = fd; + return 0; +} + +static int +vhost_user_start_server(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + + ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un)); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to bind to %s: %s; remove it and try again\n", + path, strerror(errno)); + goto err; + } + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); + + ret = listen(fd, MAX_VIRTIO_BACKLOG); + if (ret < 0) + goto err; + + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, + NULL, vsocket); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add listen fd %d to vhost server fdset\n", + fd); + goto err; + } + + return 0; + +err: + close(fd); + return -1; +} + +struct vhost_user_reconnect { + struct sockaddr_un un; + int fd; + struct vhost_user_socket *vsocket; + + TAILQ_ENTRY(vhost_user_reconnect) next; +}; + +TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); +struct vhost_user_reconnect_list { + struct vhost_user_reconnect_tailq_list head; + pthread_mutex_t mutex; +}; + +static struct vhost_user_reconnect_list reconn_list; +static pthread_t reconn_tid; + +static int +vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz) +{ + int ret, flags; + + ret = connect(fd, un, sz); + if (ret < 0 && errno != EISCONN) + return -1; + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't get flags for connfd %d\n", fd); + return -2; + } + if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't disable nonblocking on fd %d\n", fd); + return -2; + } + return 0; +} + +static void * +vhost_user_client_reconnect(void *arg __rte_unused) +{ + int ret; + struct vhost_user_reconnect *reconn, *next; + + while (1) { + 
pthread_mutex_lock(&reconn_list.mutex); + + /* + * An equal implementation of TAILQ_FOREACH_SAFE, + * which does not exist on all platforms. + */ + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + ret = vhost_user_connect_nonblock(reconn->fd, + (struct sockaddr *)&reconn->un, + sizeof(reconn->un)); + if (ret == -2) { + close(reconn->fd); + RTE_LOG(ERR, VHOST_CONFIG, + "reconnection for fd %d failed\n", + reconn->fd); + goto remove_fd; + } + if (ret == -1) + continue; + + RTE_LOG(INFO, VHOST_CONFIG, + "%s: connected\n", reconn->vsocket->path); + vhost_user_add_connection(reconn->fd, reconn->vsocket); +remove_fd: + TAILQ_REMOVE(&reconn_list.head, reconn, next); + free(reconn); + } + + pthread_mutex_unlock(&reconn_list.mutex); + sleep(1); + } + + return NULL; +} + +static int +vhost_user_reconnect_init(void) +{ + int ret; + + pthread_mutex_init(&reconn_list.mutex, NULL); + TAILQ_INIT(&reconn_list.head); + + ret = pthread_create(&reconn_tid, NULL, + vhost_user_client_reconnect, NULL); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); + + return ret; +} + +static int +vhost_user_start_client(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + struct vhost_user_reconnect *reconn; + + ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un, + sizeof(vsocket->un)); + if (ret == 0) { + vhost_user_add_connection(fd, vsocket); + return 0; + } + + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to connect to %s: %s\n", + path, strerror(errno)); + + if (ret == -2 || !vsocket->reconnect) { + close(fd); + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path); + reconn = malloc(sizeof(*reconn)); + if (reconn == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for reconnect\n"); + close(fd); + return -1; + } + reconn->un = vsocket->un; + reconn->fd = fd; + reconn->vsocket = vsocket; + pthread_mutex_lock(&reconn_list.mutex); + TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); + pthread_mutex_unlock(&reconn_list.mutex); + + return 0; +} + +static struct vhost_user_socket * +find_vhost_user_socket(const char *path) +{ + int i; + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) + return vsocket; + } + + return NULL; +} + +int +rte_vhost_driver_disable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->features &= ~features; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_enable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + if ((vsocket->supported_features & features) != features) { + /* + * trying to enable features the driver doesn't + * support. + */ + pthread_mutex_unlock(&vhost_user.mutex); + return -1; + } + vsocket->features |= features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
0 : -1; +} + +int +rte_vhost_driver_set_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + vsocket->supported_features = features; + vsocket->features = features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_get_features(const char *path, uint64_t *features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + *features = vsocket->features; + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + return -1; + } else { + return 0; + } +} + +/* + * Register a new vhost-user socket; here we could act as server + * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag + * is set. + */ +int +rte_vhost_driver_register(const char *path, uint64_t flags) +{ + int ret = -1; + struct vhost_user_socket *vsocket; + + if (!path) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + + if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: the number of vhost sockets reaches maximum\n"); + goto out; + } + + vsocket = malloc(sizeof(struct vhost_user_socket)); + if (!vsocket) + goto out; + memset(vsocket, 0, sizeof(struct vhost_user_socket)); + vsocket->path = strdup(path); + if (!vsocket->path) { + free(vsocket); + goto out; + } + TAILQ_INIT(&vsocket->conn_list); + vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + + /* + * Set the supported features correctly for the builtin vhost-user + * net driver. + * + * Applications know nothing about features the builtin virtio net + * driver (virtio_net.c) supports, thus it's not possible for them + * to invoke rte_vhost_driver_set_features(). To workaround it, here + * we set it unconditionally. If the application want to implement + * another vhost-user driver (say SCSI), it should call the + * rte_vhost_driver_set_features(), which will overwrite following + * two values. 
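+	 *
+	 * For example, a non-net backend would typically do the following
+	 * right after registering the socket (application-side sketch;
+	 * "my_features" is a placeholder for the backend's feature mask):
+	 *
+	 *	rte_vhost_driver_register(path, 0);
+	 *	rte_vhost_driver_set_features(path, my_features);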
+ */ + vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; + vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { + vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); + if (vsocket->reconnect && reconn_tid == 0) { + if (vhost_user_reconnect_init() < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + } + } else { + vsocket->is_server = true; + } + ret = create_unix_socket(vsocket); + if (ret < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + + pthread_mutex_init(&vsocket->conn_mutex, NULL); + vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; + +out: + pthread_mutex_unlock(&vhost_user.mutex); + + return ret; +} + +static bool +vhost_user_remove_reconnect(struct vhost_user_socket *vsocket) +{ + int found = false; + struct vhost_user_reconnect *reconn, *next; + + pthread_mutex_lock(&reconn_list.mutex); + + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + if (reconn->vsocket == vsocket) { + TAILQ_REMOVE(&reconn_list.head, reconn, next); + close(reconn->fd); + free(reconn); + found = true; + break; + } + } + pthread_mutex_unlock(&reconn_list.mutex); + return found; +} + +/** + * Unregister the specified vhost socket + */ +int +rte_vhost_driver_unregister(const char *path) +{ + int i; + int count; + struct vhost_user_connection *conn; + + pthread_mutex_lock(&vhost_user.mutex); + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) { + if (vsocket->is_server) { + fdset_del(&vhost_user.fdset, vsocket->socket_fd); + close(vsocket->socket_fd); + unlink(path); + } else if (vsocket->reconnect) { + vhost_user_remove_reconnect(vsocket); + } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_FOREACH(conn, &vsocket->conn_list, next) { + close(conn->connfd); + conn->connfd = -1; + } + pthread_mutex_unlock(&vsocket->conn_mutex); + + do { + pthread_mutex_lock(&vsocket->conn_mutex); + conn = TAILQ_FIRST(&vsocket->conn_list); + pthread_mutex_unlock(&vsocket->conn_mutex); + } while (conn != NULL); + + free(vsocket->path); + free(vsocket); + + count = --vhost_user.vsocket_cnt; + vhost_user.vsockets[i] = vhost_user.vsockets[count]; + vhost_user.vsockets[count] = NULL; + pthread_mutex_unlock(&vhost_user.mutex); + + return 0; + } + } + pthread_mutex_unlock(&vhost_user.mutex); + + return -1; +} + +/* + * Register ops so that we can add/remove device to data core. + */ +int +rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->notify_ops = ops; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +struct vhost_device_ops const * +vhost_driver_callback_get(const char *path) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
vsocket->notify_ops : NULL; +} + +int +rte_vhost_driver_start(const char *path) +{ + struct vhost_user_socket *vsocket; + static pthread_t fdset_tid; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) + return -1; + + if (fdset_tid == 0) { + rte_cpuset_t orig_cpuset; + rte_cpuset_t tmp_cpuset; + long num_cores, i; + int ret; + + CPU_ZERO(&tmp_cpuset); + num_cores = sysconf(_SC_NPROCESSORS_CONF); + /* Create a mask containing all CPUs */ + for (i = 0; i < num_cores; i++) { + CPU_SET(i, &tmp_cpuset); + } + + rte_thread_get_affinity(&orig_cpuset); + rte_thread_set_affinity(&tmp_cpuset); + ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch, + &vhost_user.fdset); + rte_thread_set_affinity(&orig_cpuset); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "failed to create fdset handling thread"); + } + + if (vsocket->is_server) + return vhost_user_start_server(vsocket); + else + return vhost_user_start_client(vsocket); +} diff --git a/src/spdk/lib/rte_vhost/vhost.c b/src/spdk/lib/rte_vhost/vhost.c new file mode 100644 index 000000000..8e875c585 --- /dev/null +++ b/src/spdk/lib/rte_vhost/vhost.c @@ -0,0 +1,565 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#ifdef RTE_LIBRTE_VHOST_NUMA +#include <numaif.h> +#endif + +#include <rte_ethdev.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include <rte_memory.h> +#include <rte_malloc.h> +#include <rte_vhost.h> + +#include "vhost.h" + +struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +struct virtio_net * +get_device(int vid) +{ + struct virtio_net *dev = vhost_devices[vid]; + + if (unlikely(!dev)) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) device not found.\n", vid); + } + + return dev; +} + +static void +cleanup_vq(struct vhost_virtqueue *vq, int destroy) +{ + if ((vq->callfd >= 0) && (destroy != 0)) + close(vq->callfd); + if (vq->kickfd >= 0) + close(vq->kickfd); +} + +/* + * Unmap any memory, close any file descriptors and + * free any memory owned by a device. + */ +void +cleanup_device(struct virtio_net *dev, int destroy) +{ + uint32_t i; + + vhost_backend_cleanup(dev); + + for (i = 0; i < dev->nr_vring; i++) + cleanup_vq(dev->virtqueue[i], destroy); +} + +/* + * Release virtqueues and device memory. + */ +static void +free_device(struct virtio_net *dev) +{ + uint32_t i; + struct vhost_virtqueue *vq; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + rte_free(vq->shadow_used_ring); + + rte_free(vq); + } + + rte_free(dev); +} + +static void +init_vring_queue(struct vhost_virtqueue *vq) +{ + memset(vq, 0, sizeof(struct vhost_virtqueue)); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + /* Backends are set to -1 indicating an inactive device. */ + vq->backend = -1; + + /* + * always set the vq to enabled; this is to keep compatibility + * with the old QEMU, whereas there is no SET_VRING_ENABLE message. + */ + vq->enabled = 1; + + TAILQ_INIT(&vq->zmbuf_list); +} + +static void +reset_vring_queue(struct vhost_virtqueue *vq) +{ + int callfd; + + callfd = vq->callfd; + init_vring_queue(vq); + vq->callfd = callfd; +} + +int +alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) +{ + struct vhost_virtqueue *vq; + + vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0); + if (vq == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for vring:%u.\n", vring_idx); + return -1; + } + + dev->virtqueue[vring_idx] = vq; + init_vring_queue(vq); + + dev->nr_vring += 1; + + return 0; +} + +/* + * Reset some variables in device structure, while keeping few + * others untouched, such as vid, ifname, nr_vring: they + * should be same unless the device is removed. + */ +void +reset_device(struct virtio_net *dev) +{ + uint32_t i; + + dev->negotiated_features = 0; + dev->protocol_features = 0; + dev->flags = 0; + + for (i = 0; i < dev->nr_vring; i++) + reset_vring_queue(dev->virtqueue[i]); +} + +/* + * Invoked when there is a new vhost-user connection established (when + * there is a new virtio device being attached). 
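+ *
+ * Returns the new device ID (vid) on success, or -1 if allocation fails
+ * or no free slot is left in vhost_devices[].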
+ */ +int +vhost_new_device(uint64_t features, struct vhost_device_ops const *ops) +{ + struct virtio_net *dev; + int i; + + dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); + if (dev == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for new dev.\n"); + return -1; + } + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + if (vhost_devices[i] == NULL) + break; + } + if (i == MAX_VHOST_DEVICE) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to find a free slot for new device.\n"); + rte_free(dev); + return -1; + } + + vhost_devices[i] = dev; + dev->vid = i; + dev->features = features; + dev->notify_ops = ops; + + return i; +} + +/* + * Invoked when there is the vhost-user connection is broken (when + * the virtio device is being detached). + */ +void +vhost_destroy_device(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(vid); + } + + cleanup_device(dev, 1); + free_device(dev); + + vhost_devices[vid] = NULL; +} + +void +vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) +{ + struct virtio_net *dev; + unsigned int len; + + dev = get_device(vid); + if (dev == NULL) + return; + + len = if_len > sizeof(dev->ifname) ? + sizeof(dev->ifname) : if_len; + + strncpy(dev->ifname, if_name, len); + dev->ifname[sizeof(dev->ifname) - 1] = '\0'; +} + +void +vhost_enable_dequeue_zero_copy(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + dev->dequeue_zero_copy = 1; +} + +int +rte_vhost_get_mtu(int vid, uint16_t *mtu) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -ENODEV; + + if (!(dev->flags & VIRTIO_DEV_READY)) + return -EAGAIN; + + if (!(dev->negotiated_features & VIRTIO_NET_F_MTU)) + return -ENOTSUP; + + *mtu = dev->mtu; + + return 0; +} + +int +rte_vhost_get_numa_node(int vid) +{ +#ifdef RTE_LIBRTE_VHOST_NUMA + struct virtio_net *dev = get_device(vid); + int numa_node; + int ret; + + if (dev == NULL) + return -1; + + ret = get_mempolicy(&numa_node, NULL, 0, dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to query numa node: %d\n", vid, ret); + return -1; + } + + return numa_node; +#else + RTE_SET_USED(vid); + return -1; +#endif +} + +int +rte_vhost_get_ifname(int vid, char *buf, size_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + len = RTE_MIN(len, sizeof(dev->ifname)); + + strncpy(buf, dev->ifname, len); + buf[len - 1] = '\0'; + + return 0; +} + +int +rte_vhost_get_negotiated_features(int vid, uint64_t *features) +{ + struct virtio_net *dev; + + dev = get_device(vid); + if (!dev) + return -1; + + *features = dev->negotiated_features; + return 0; +} + +int +rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + struct virtio_net *dev; + struct rte_vhost_memory *m; + size_t size; + + dev = get_device(vid); + if (!dev) + return -1; + + size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region); + m = malloc(sizeof(struct rte_vhost_memory) + size); + if (!m) + return -1; + + m->nregions = dev->mem->nregions; + memcpy(m->regions, dev->mem->regions, size); + *mem = m; + + return 0; +} + +int +rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = 
dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vring->desc = vq->desc; + vring->avail = vq->avail; + vring->used = vq->used; + vring->log_guest_addr = vq->log_guest_addr; + + vring->callfd = vq->callfd; + vring->kickfd = vq->kickfd; + vring->size = vq->size; + + return 0; +} + +uint16_t +rte_vhost_avail_entries(int vid, uint16_t queue_id) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + vq = dev->virtqueue[queue_id]; + if (!vq->enabled) + return 0; + + return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; +} + +int +rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + if (enable) { + RTE_LOG(ERR, VHOST_CONFIG, + "guest notification isn't supported.\n"); + return -1; + } + + dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY; + return 0; +} + +void +rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + vhost_log_write(dev, addr, len); +} + +void +rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (dev == NULL) + return; + + if (vring_idx >= VHOST_MAX_VRING) + return; + vq = dev->virtqueue[vring_idx]; + if (!vq) + return; + + vhost_log_used_vring(dev, vq, offset, len); +} + +int +rte_vhost_set_vring_base(int vid, uint16_t vring_idx, + uint16_t last_avail_idx, uint16_t last_used_idx) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vq->last_avail_idx = last_avail_idx; + vq->last_used_idx = last_used_idx; + + return 0; +} + +int +rte_vhost_get_vring_base(int vid, uint16_t vring_idx, + uint16_t *last_avail_idx, uint16_t *last_used_idx) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + *last_avail_idx = vq->last_avail_idx; + *last_used_idx = vq->last_used_idx; + + return 0; +} + +int +rte_vhost_vring_call(int vid, uint16_t vring_idx) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if(!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + /* Ensure all our used ring changes are visible to the guest at the time + * of interrupt. + * TODO: this is currently an sfence on x86. For other architectures we + * will most likely need an smp_mb(), but smp_mb() is an overkill for x86. 
+ */ + rte_wmb(); + + if (vq->callfd != -1) { + eventfd_write(vq->callfd, (eventfd_t)1); + return 0; + } + + return -1; +} + +int +rte_vhost_set_last_inflight_io_split(int vid, uint16_t vring_idx, + uint16_t idx) +{ + return 0; +} + +int +rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t last_used_idx, uint16_t idx) +{ + return 0; +} + +int +rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx, + uint16_t idx) +{ + return 0; +} + +int +rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx, + struct rte_vhost_ring_inflight *vring) +{ + return 0; +} diff --git a/src/spdk/lib/rte_vhost/vhost.h b/src/spdk/lib/rte_vhost/vhost.h new file mode 100644 index 000000000..d738dba7f --- /dev/null +++ b/src/spdk/lib/rte_vhost/vhost.h @@ -0,0 +1,330 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_CDEV_H_ +#define _VHOST_NET_CDEV_H_ +#include <stdint.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/queue.h> +#include <unistd.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <sys/socket.h> +#include <linux/if.h> + +#include <rte_log.h> +#include <rte_ether.h> + +#include "rte_vhost.h" +#include "vhost_user.h" + +/* Used to indicate that the device is running on a data core */ +#define VIRTIO_DEV_RUNNING 1 +/* Used to indicate that the device is ready to operate */ +#define VIRTIO_DEV_READY 2 + +/* Backend value set by guest. */ +#define VIRTIO_DEV_STOPPED -1 + +#define BUF_VECTOR_MAX 256 + +/** + * Structure contains buffer address, length and descriptor index + * from vring to do scatter RX. + */ +struct buf_vector { + uint64_t buf_addr; + uint32_t buf_len; + uint32_t desc_idx; +}; + +/* + * A structure to hold some fields needed in zero copy code path, + * mainly for associating an mbuf with the right desc_idx. 
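+ *
+ * Entries live in the per-virtqueue zmbufs array and are linked on
+ * zmbuf_list (see struct vhost_virtqueue below) while the corresponding
+ * descriptor is still in flight.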
+ */ +struct zcopy_mbuf { + struct rte_mbuf *mbuf; + uint32_t desc_idx; + uint16_t in_use; + + TAILQ_ENTRY(zcopy_mbuf) next; +}; +TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf); + +/** + * Structure contains variables relevant to RX/TX virtqueues. + */ +struct vhost_virtqueue { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint32_t size; + + uint16_t last_avail_idx; + uint16_t last_used_idx; +#define VIRTIO_INVALID_EVENTFD (-1) +#define VIRTIO_UNINITIALIZED_EVENTFD (-2) + + /* Backend value to determine if device should started/stopped */ + int backend; + /* Used to notify the guest (trigger interrupt) */ + int callfd; + /* Currently unused as polling mode is enabled */ + int kickfd; + int enabled; + + /* Physical address of used ring, for logging */ + uint64_t log_guest_addr; + + uint16_t nr_zmbuf; + uint16_t zmbuf_size; + uint16_t last_zmbuf_idx; + struct zcopy_mbuf *zmbufs; + struct zcopy_mbuf_list zmbuf_list; + + struct vring_used_elem *shadow_used_ring; + uint16_t shadow_used_idx; +} __rte_cache_aligned; + +/* Old kernels have no such macros defined */ +#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE + #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 +#endif + +#ifndef VIRTIO_NET_F_MQ + #define VIRTIO_NET_F_MQ 22 +#endif + +#define VHOST_MAX_VRING 0x100 +#define VHOST_MAX_QUEUE_PAIRS 0x80 + +#ifndef VIRTIO_NET_F_MTU + #define VIRTIO_NET_F_MTU 3 +#endif + +/* + * Define virtio 1.0 for older kernels + */ +#ifndef VIRTIO_F_VERSION_1 + #define VIRTIO_F_VERSION_1 32 +#endif + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +/* Features supported by this builtin vhost-user net driver. */ +#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_RX) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + (1ULL << VIRTIO_NET_F_MQ) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_NET_F_MTU)) + + +struct guest_page { + uint64_t guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; +}; + +/* struct ether_addr was renamed to struct rte_ether_addr at one point */ +#ifdef RTE_ETHER_ADDR_LEN +struct ether_addr { + uint8_t addr_bytes[RTE_ETHER_ADDR_LEN]; +} __attribute__((__packed__)); +#endif + +/** + * Device structure contains all configuration information relating + * to the device. + */ +struct virtio_net { + /* Frontend (QEMU) memory and memory region information */ + struct rte_vhost_memory *mem; + uint64_t features; + uint64_t negotiated_features; + uint64_t protocol_features; + int vid; + uint32_t is_nvme; + uint32_t flags; + uint16_t vhost_hlen; + /* to tell if we need broadcast rarp packet */ + rte_atomic16_t broadcast_rarp; + uint32_t nr_vring; + int dequeue_zero_copy; + struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; +#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? 
PATH_MAX : IFNAMSIZ) + char ifname[IF_NAME_SZ]; + uint64_t log_size; + uint64_t log_base; + uint64_t log_addr; + struct ether_addr mac; + uint16_t mtu; + + struct vhost_device_ops const *notify_ops; + + uint32_t nr_guest_pages; + uint32_t max_guest_pages; + struct guest_page *guest_pages; + int has_new_mem_table; + void *bar_addr; + uint64_t bar_size; + struct VhostUserMemory mem_table; + int mem_table_fds[VHOST_MEMORY_MAX_NREGIONS]; +} __rte_cache_aligned; + + +#define VHOST_LOG_PAGE 4096 + +static inline void __attribute__((always_inline)) +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + log_base[page / 8] |= 1 << (page % 8); +} + +static inline void __attribute__((always_inline)) +vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->negotiated_features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} + +static inline void __attribute__((always_inline)) +vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_write(dev, vq->log_guest_addr + offset, len); +} + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 +#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VHOST_MAX_PRINT_BUFF 6072 +#define VHOST_LOG_LEVEL RTE_LOG_DEBUG +#define VHOST_LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) +#define PRINT_PACKET(device, addr, size, header) do { \ + char *pkt_addr = (char *)(addr); \ + unsigned int index; \ + char packet[VHOST_MAX_PRINT_BUFF]; \ + \ + if ((header)) \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ + else \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ + for (index = 0; index < (size); index++) { \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ + "%02hhx ", pkt_addr[index]); \ + } \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ + \ + VHOST_LOG_DEBUG(VHOST_DATA, "%s", packet); \ +} while (0) +#else +#define VHOST_LOG_LEVEL RTE_LOG_INFO +#define VHOST_LOG_DEBUG(log_type, fmt, args...) 
do {} while (0) +#define PRINT_PACKET(device, addr, size, header) do {} while (0) +#endif + +extern uint64_t VHOST_FEATURES; +#define MAX_VHOST_DEVICE 1024 +extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +/* Convert guest physical address to host physical address */ +static inline phys_addr_t __attribute__((always_inline)) +gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + if (gpa >= page->guest_phys_addr && + gpa + size < page->guest_phys_addr + page->size) { + return gpa - page->guest_phys_addr + + page->host_phys_addr; + } + } + + return 0; +} + +struct virtio_net *get_device(int vid); + +int vhost_new_device(uint64_t features, struct vhost_device_ops const *ops); +void cleanup_device(struct virtio_net *dev, int destroy); +void reset_device(struct virtio_net *dev); +void vhost_destroy_device(int); + +int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); + +void vhost_set_ifname(int, const char *if_name, unsigned int if_len); +void vhost_enable_dequeue_zero_copy(int vid); + +struct vhost_device_ops const *vhost_driver_callback_get(const char *path); + +/* + * Backend-specific cleanup. + * + * TODO: fix it; we have one backend now + */ +void vhost_backend_cleanup(struct virtio_net *dev); + +#endif /* _VHOST_NET_CDEV_H_ */ diff --git a/src/spdk/lib/rte_vhost/vhost_user.c b/src/spdk/lib/rte_vhost/vhost_user.c new file mode 100644 index 000000000..a07483fcf --- /dev/null +++ b/src/spdk/lib/rte_vhost/vhost_user.c @@ -0,0 +1,1426 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <unistd.h> +#include <sys/mman.h> +#include <asm/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <assert.h> +#ifdef RTE_LIBRTE_VHOST_NUMA +#include <numaif.h> +#endif + +#include <rte_common.h> +#include <rte_malloc.h> +#include <rte_log.h> + +#include "vhost.h" +#include "vhost_user.h" + +#define VIRTIO_MIN_MTU 68 +#define VIRTIO_MAX_MTU 65535 + +static const char *vhost_message_str[VHOST_USER_MAX] = { + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", + [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", + [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG", + [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG", + [VHOST_USER_NVME_ADMIN] = "VHOST_USER_NVME_ADMIN", + [VHOST_USER_NVME_SET_CQ_CALL] = "VHOST_USER_NVME_SET_CQ_CALL", + [VHOST_USER_NVME_GET_CAP] = "VHOST_USER_NVME_GET_CAP", + [VHOST_USER_NVME_START_STOP] = "VHOST_USER_NVME_START_STOP", + [VHOST_USER_NVME_SET_BAR_MR] = "VHOST_USER_NVME_SET_BAR_MR" +}; + +static uint64_t +get_blk_size(int fd) +{ + struct stat stat; + int ret; + + ret = fstat(fd, &stat); + return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; +} + +static void +free_mem_region(struct virtio_net *dev) +{ + uint32_t i; + struct rte_vhost_mem_region *reg; + + if (!dev || !dev->mem) + return; + + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + if (reg->host_user_addr) { + munmap(reg->mmap_addr, reg->mmap_size); + close(reg->fd); + } + } +} + +void +vhost_backend_cleanup(struct virtio_net *dev) +{ + uint32_t i; + + if (dev->has_new_mem_table) { + for (i = 0; i < dev->mem_table.nregions; i++) { + close(dev->mem_table_fds[i]); + } + dev->has_new_mem_table = 0; + } + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + free(dev->guest_pages); + dev->guest_pages = NULL; + + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + dev->log_addr = 0; + } + if (dev->bar_addr) { + munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size); + dev->bar_addr = NULL; + dev->bar_size = 0; + } +} + +/* + * This function just returns success at the moment unless + * the device hasn't been initialised. 
+ */ +static int +vhost_user_set_owner(void) +{ + return 0; +} + +static int +vhost_user_reset_owner(struct virtio_net *dev) +{ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + cleanup_device(dev, 0); + reset_device(dev); + return 0; +} + +/* + * The features that we support are requested. + */ +static uint64_t +vhost_user_get_features(struct virtio_net *dev) +{ + return dev->features; +} + +/* + * We receive the negotiated features supported by us and the virtio device. + */ +static int +vhost_user_set_features(struct virtio_net *dev, uint64_t features) +{ + uint64_t vhost_features = 0; + + vhost_features = vhost_user_get_features(dev); + if (features & ~vhost_features) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) received invalid negotiated features.\n", + dev->vid); + return -1; + } + + if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->negotiated_features != features) { + if (dev->notify_ops->features_changed) { + dev->notify_ops->features_changed(dev->vid, features); + } else { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + } + + dev->negotiated_features = features; + if (dev->negotiated_features & + ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { + dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + dev->vhost_hlen = sizeof(struct virtio_net_hdr); + } + VHOST_LOG_DEBUG(VHOST_CONFIG, + "(%d) mergeable RX buffers %s, virtio 1 %s\n", + dev->vid, + (dev->negotiated_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", + (dev->negotiated_features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); + + return 0; +} + +/* + * The virtio device sends us the size of the descriptor ring. + */ +static int +vhost_user_set_vring_num(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + vq->size = msg->payload.state.num; + + if (dev->dequeue_zero_copy) { + vq->nr_zmbuf = 0; + vq->last_zmbuf_idx = 0; + vq->zmbuf_size = vq->size; + vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * + sizeof(struct zcopy_mbuf), 0); + if (vq->zmbufs == NULL) { + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to allocate mem for zero copy; " + "zero copy is force disabled\n"); + dev->dequeue_zero_copy = 0; + } + } + + vq->shadow_used_ring = rte_malloc(NULL, + vq->size * sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_ring) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + + return 0; +} + +/* + * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the + * same numa node as the memory of vring descriptor. 
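+ *
+ * Illustrative sketch of the check done below (assuming the build enables
+ * RTE_LIBRTE_VHOST_NUMA):
+ *
+ *   get_mempolicy(&newnode, NULL, 0, vq->desc, MPOL_F_NODE | MPOL_F_ADDR);
+ *   get_mempolicy(&oldnode, NULL, 0, vq,       MPOL_F_NODE | MPOL_F_ADDR);
+ *   if (oldnode != newnode)
+ *           vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode);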
+ */ +#ifdef RTE_LIBRTE_VHOST_NUMA +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index) +{ + int oldnode, newnode; + struct virtio_net *old_dev; + struct vhost_virtqueue *old_vq, *vq; + int ret; + + old_dev = dev; + vq = old_vq = dev->virtqueue[index]; + + ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, + MPOL_F_NODE | MPOL_F_ADDR); + + /* check if we need to reallocate vq */ + ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get vq numa information.\n"); + return dev; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate vq from %d to %d node\n", oldnode, newnode); + vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); + if (!vq) + return dev; + + memcpy(vq, old_vq, sizeof(*vq)); + rte_free(old_vq); + } + + /* check if we need to reallocate dev */ + ret = get_mempolicy(&oldnode, NULL, 0, old_dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get dev numa information.\n"); + goto out; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate dev from %d to %d node\n", + oldnode, newnode); + dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); + if (!dev) { + dev = old_dev; + goto out; + } + + memcpy(dev, old_dev, sizeof(*dev)); + rte_free(old_dev); + } + +out: + dev->virtqueue[index] = vq; + vhost_devices[dev->vid] = dev; + + return dev; +} +#else +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index __rte_unused) +{ + return dev; +} +#endif + +/* + * Converts QEMU virtual address to Vhost virtual address. This function is + * used to convert the ring addresses to our address space. + */ +static uint64_t +qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + /* Find the region where the address lives. */ + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + + if (qva >= reg->guest_user_addr && + qva < reg->guest_user_addr + reg->size) { + + if (unlikely(*len > reg->guest_user_addr + reg->size - qva)) + *len = reg->guest_user_addr + reg->size - qva; + + return qva - reg->guest_user_addr + + reg->host_user_addr; + } + } + + return 0; +} + +static int vhost_setup_mem_table(struct virtio_net *dev); + +/* + * The virtio device sends us the desc, used and avail ring addresses. + * This function then converts these to our address space. + */ +static int +vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq; + uint64_t len; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + if (dev->has_new_mem_table) { + vhost_setup_mem_table(dev); + dev->has_new_mem_table = 0; + } + + if (dev->mem == NULL) + return -1; + + /* addr->index refers to the queue index. The txq 1, rxq is 0. */ + vq = dev->virtqueue[msg->payload.addr.index]; + + /* The addresses are converted from QEMU virtual to Vhost virtual. 
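+ * In other words, for a ring address qva that falls inside region reg the
+ * usable pointer is roughly
+ *
+ *   vva = reg->host_user_addr + (qva - reg->guest_user_addr);
+ *
+ * which is what qva_to_vva() above computes, clamping *len to the bytes
+ * left in the region.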
*/ + len = sizeof(struct vring_desc) * vq->size; + vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.desc_user_addr, &len); + if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to map desc ring.\n", + dev->vid); + return -1; + } + + dev = numa_realloc(dev, msg->payload.addr.index); + vq = dev->virtqueue[msg->payload.addr.index]; + + len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; + vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.avail_user_addr, &len); + if (vq->avail == 0 || + len != sizeof(struct vring_avail) + + sizeof(uint16_t) * vq->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find avail ring address.\n", + dev->vid); + return -1; + } + + len = sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size; + vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.used_user_addr, &len); + if (vq->used == 0 || len != sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size) { + + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find used ring address.\n", + dev->vid); + return -1; + } + + if (vq->last_used_idx != vq->used->idx) { + RTE_LOG(WARNING, VHOST_CONFIG, + "last_used_idx (%u) and vq->used->idx (%u) mismatches; " + "some packets maybe resent for Tx and dropped for Rx\n", + vq->last_used_idx, vq->used->idx); + vq->last_used_idx = vq->used->idx; + vq->last_avail_idx = vq->used->idx; + } + + vq->log_guest_addr = msg->payload.addr.log_guest_addr; + + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", + dev->vid, vq->desc); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", + dev->vid, vq->avail); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", + dev->vid, vq->used); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", + dev->vid, vq->log_guest_addr); + + return 0; +} + +/* + * The virtio device sends us the available ring last used index. + */ +static int +vhost_user_set_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + /* Remove from the data plane. 
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num; + dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num; + + return 0; +} + +static void +add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, + uint64_t host_phys_addr, uint64_t size) +{ + struct guest_page *page, *last_page; + + if (dev->nr_guest_pages == dev->max_guest_pages) { + dev->max_guest_pages = RTE_MAX(8U, dev->max_guest_pages * 2); + dev->guest_pages = realloc(dev->guest_pages, + dev->max_guest_pages * sizeof(*page)); + } + + if (dev->nr_guest_pages > 0) { + last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; + /* merge if the two pages are continuous */ + if (host_phys_addr == last_page->host_phys_addr + + last_page->size) { + last_page->size += size; + return; + } + } + + page = &dev->guest_pages[dev->nr_guest_pages++]; + page->guest_phys_addr = guest_phys_addr; + page->host_phys_addr = host_phys_addr; + page->size = size; +} + +static void +add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, + uint64_t page_size) +{ + uint64_t reg_size = reg->size; + uint64_t host_user_addr = reg->host_user_addr; + uint64_t guest_phys_addr = reg->guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; + + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr); + size = page_size - (guest_phys_addr & (page_size - 1)); + size = RTE_MIN(size, reg_size); + + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + + while (reg_size > 0) { + size = RTE_MIN(reg_size, page_size); + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t) + host_user_addr); + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + } +} + +#ifdef RTE_LIBRTE_VHOST_DEBUG +/* TODO: enable it only in debug mode? */ +static void +dump_guest_pages(struct virtio_net *dev) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + RTE_LOG(INFO, VHOST_CONFIG, + "guest physical page region %u\n" + "\t guest_phys_addr: %" PRIx64 "\n" + "\t host_phys_addr : %" PRIx64 "\n" + "\t size : %" PRIx64 "\n", + i, + page->guest_phys_addr, + page->host_phys_addr, + page->size); + } +} +#else +#define dump_guest_pages(dev) +#endif + +static int +vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + uint32_t i; + + if (dev->has_new_mem_table) { + /* + * The previous mem table was not consumed, so close the + * file descriptors from that mem table before copying + * the new one. + */ + for (i = 0; i < dev->mem_table.nregions; i++) { + close(dev->mem_table_fds[i]); + } + } + + memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table)); + memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds)); + dev->has_new_mem_table = 1; + /* vhost-user-nvme will not send + * set vring addr message, enable + * memory address table now. 
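+ *
+ * For the non-NVMe path the table is only latched here and applied lazily
+ * on the next vring address message; sketch of that flow:
+ *
+ *   VHOST_USER_SET_MEM_TABLE  -> dev->has_new_mem_table = 1
+ *   VHOST_USER_SET_VRING_ADDR -> vhost_setup_mem_table(dev);
+ *                                dev->has_new_mem_table = 0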
+ */ + if (dev->has_new_mem_table && dev->is_nvme) { + vhost_setup_mem_table(dev); + dev->has_new_mem_table = 0; + } + + return 0; +} + + static int +vhost_setup_mem_table(struct virtio_net *dev) +{ + struct VhostUserMemory memory = dev->mem_table; + struct rte_vhost_mem_region *reg; + struct vhost_virtqueue *vq; + void *mmap_addr; + uint64_t mmap_size; + uint64_t mmap_offset; + uint64_t alignment; + uint32_t i; + int fd; + + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + /* Those addresses won't be valid anymore in host address space + * after setting new mem table. Initiator need to resend these + * addresses. + */ + vq->desc = NULL; + vq->avail = NULL; + vq->used = NULL; + } + + dev->nr_guest_pages = 0; + if (!dev->guest_pages) { + dev->max_guest_pages = 8; + dev->guest_pages = malloc(dev->max_guest_pages * + sizeof(struct guest_page)); + } + + dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + + sizeof(struct rte_vhost_mem_region) * memory.nregions, 0); + if (dev->mem == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to allocate memory for dev->mem\n", + dev->vid); + return -1; + } + dev->mem->nregions = memory.nregions; + + for (i = 0; i < memory.nregions; i++) { + fd = dev->mem_table_fds[i]; + reg = &dev->mem->regions[i]; + + reg->guest_phys_addr = memory.regions[i].guest_phys_addr; + reg->guest_user_addr = memory.regions[i].userspace_addr; + reg->size = memory.regions[i].memory_size; + reg->fd = fd; + + mmap_offset = memory.regions[i].mmap_offset; + mmap_size = reg->size + mmap_offset; + + /* mmap() without flag of MAP_ANONYMOUS, should be called + * with length argument aligned with hugepagesz at older + * longterm version Linux, like 2.6.32 and 3.2.72, or + * mmap() will fail with EINVAL. + * + * to avoid failure, make sure in caller to keep length + * aligned. 
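+ *
+ * Example (sizes depend on the hugepage mount backing the fd): with 2 MB
+ * hugepages get_blk_size() reports 0x200000, so
+ *
+ *   mmap_size = RTE_ALIGN_CEIL(reg->size + mmap_offset, 0x200000);
+ *
+ * rounds the mapping length up to the next hugepage boundary before mmap().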
+ */ + alignment = get_blk_size(fd); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + goto err_mmap; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (mmap_addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap region %u failed.\n", i); + goto err_mmap; + } + + if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) { + RTE_LOG(INFO, VHOST_CONFIG, + "MADV_DONTDUMP advice setting failed.\n"); + } + + reg->mmap_addr = mmap_addr; + reg->mmap_size = mmap_size; + reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + + mmap_offset; + + if (dev->dequeue_zero_copy) + add_guest_pages(dev, reg, alignment); + + RTE_LOG(INFO, VHOST_CONFIG, + "guest memory region %u, size: 0x%" PRIx64 "\n" + "\t guest physical addr: 0x%" PRIx64 "\n" + "\t guest virtual addr: 0x%" PRIx64 "\n" + "\t host virtual addr: 0x%" PRIx64 "\n" + "\t mmap addr : 0x%" PRIx64 "\n" + "\t mmap size : 0x%" PRIx64 "\n" + "\t mmap align: 0x%" PRIx64 "\n" + "\t mmap off : 0x%" PRIx64 "\n", + i, reg->size, + reg->guest_phys_addr, + reg->guest_user_addr, + reg->host_user_addr, + (uint64_t)(uintptr_t)mmap_addr, + mmap_size, + alignment, + mmap_offset); + } + + dump_guest_pages(dev); + + return 0; + +err_mmap: + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + return -1; +} + +static int +vq_is_ready(struct vhost_virtqueue *vq) +{ + return vq && vq->desc && + vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->kickfd != VIRTIO_INVALID_EVENTFD && + vq->callfd != VIRTIO_INVALID_EVENTFD; +} + +static int +virtio_is_ready(struct virtio_net *dev) +{ + struct vhost_virtqueue *vq; + uint32_t i; + + if (dev->nr_vring == 0) + return 0; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + if (vq_is_ready(vq)) { + RTE_LOG(INFO, VHOST_CONFIG, + "virtio is now ready for processing.\n"); + return 1; + } + } + + return 0; +} + +static void +vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring call idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = file.fd; +} + +static void +vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + /* Remove from the data plane. 
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring kick idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->kickfd >= 0) + close(vq->kickfd); + vq->kickfd = file.fd; +} + +static void +free_zmbufs(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *zmbuf, *next; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + rte_pktmbuf_free(zmbuf->mbuf); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + } + + rte_free(vq->zmbufs); +} + +/* + * when virtio is stopped, qemu will send us the GET_VRING_BASE message. + */ +static int +vhost_user_get_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + /* We have to stop the queue (virtio) if it is running. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->flags &= ~VIRTIO_DEV_READY; + + /* Here we are safe to get the last used index */ + msg->payload.state.num = vq->last_used_idx; + + RTE_LOG(INFO, VHOST_CONFIG, + "vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num); + /* + * Based on current qemu vhost-user implementation, this message is + * sent and only sent in vhost_vring_stop. + * TODO: cleanup the vring, it isn't usable since here. + */ + if (vq->kickfd >= 0) + close(vq->kickfd); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (dev->dequeue_zero_copy) + free_zmbufs(vq); + rte_free(vq->shadow_used_ring); + vq->shadow_used_ring = NULL; + + return 0; +} + +/* + * when virtio queues are ready to work, qemu will send us to + * enable the virtio queue pair. + */ +static int +vhost_user_set_vring_enable(struct virtio_net *dev, + VhostUserMsg *msg) +{ + int enable = (int)msg->payload.state.num; + + RTE_LOG(INFO, VHOST_CONFIG, + "set queue enable: %d to qp idx: %d\n", + enable, msg->payload.state.index); + + if (dev->notify_ops->vring_state_changed) + dev->notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable); + + dev->virtqueue[msg->payload.state.index]->enabled = enable; + + return 0; +} + +static void +vhost_user_set_protocol_features(struct virtio_net *dev, + uint64_t protocol_features) +{ + if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) + return; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->protocol_features = protocol_features; +} + +static int +vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + int fd = msg->fds[0]; + uint64_t size, off; + void *addr; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); + return -1; + } + + if (msg->size != sizeof(VhostUserLog)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid log base msg size: %"PRId32" != %d\n", + msg->size, (int)sizeof(VhostUserLog)); + return -1; + } + + /* Remove from the data plane. 
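+ *
+ * The mapping set up here backs the dirty-page bitmap used for live
+ * migration: once VHOST_F_LOG_ALL is negotiated, vhost_log_write() marks
+ * one bit per VHOST_LOG_PAGE (4 KiB) of guest memory, roughly
+ *
+ *   log_base[(gpa / 4096) / 8] |= 1 << ((gpa / 4096) % 8);
+ *
+ * (illustrative form of vhost_log_page() in vhost.h).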
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + size = msg->payload.log.mmap_size; + off = msg->payload.log.mmap_offset; + RTE_LOG(INFO, VHOST_CONFIG, + "log mmap size: %"PRId64", offset: %"PRId64"\n", + size, off); + + /* + * mmap from 0 to workaround a hugepage mmap bug: mmap will + * fail when offset is not page size aligned. + */ + addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); + return -1; + } + + /* + * Free previously mapped log memory on occasionally + * multiple VHOST_USER_SET_LOG_BASE. + */ + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + } + dev->log_addr = (uint64_t)(uintptr_t)addr; + dev->log_base = dev->log_addr + off; + dev->log_size = size; + + return 0; +} + +/* + * An rarp packet is constructed and broadcasted to notify switches about + * the new location of the migrated VM, so that packets from outside will + * not be lost after migration. + * + * However, we don't actually "send" a rarp packet here, instead, we set + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. + */ +static int +vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + uint8_t *mac = (uint8_t *)&msg->payload.u64; + + RTE_LOG(DEBUG, VHOST_CONFIG, + ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + memcpy(dev->mac.addr_bytes, mac, 6); + + /* + * Set the flag to inject a RARP broadcast packet at + * rte_vhost_dequeue_burst(). + * + * rte_smp_wmb() is for making sure the mac is copied + * before the flag is set. + */ + rte_smp_wmb(); + rte_atomic16_set(&dev->broadcast_rarp, 1); + + return 0; +} + +static int +vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + if (msg->payload.u64 < VIRTIO_MIN_MTU || + msg->payload.u64 > VIRTIO_MAX_MTU) { + RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", + msg->payload.u64); + + return -1; + } + + dev->mtu = msg->payload.u64; + + return 0; +} + +/* return bytes# of read on success or negative val on failure. 
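+ *
+ * Each vhost-user message is read in two steps: a fixed header of
+ * VHOST_USER_HDR_SIZE bytes (request, flags, size), which may carry
+ * ancillary file descriptors, followed by msg->size bytes of payload.
+ * Rough view of the wire format (see struct VhostUserMsg in vhost_user.h):
+ *
+ *   u32 request | u32 flags | u32 size | payload[size]  (+ SCM_RIGHTS fds)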
*/ +static int +read_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + msg->fds, VHOST_MEMORY_MAX_NREGIONS); + if (ret <= 0) + return ret; + + if (msg && msg->size) { + if (msg->size > sizeof(msg->payload)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid msg size: %d\n", msg->size); + return -1; + } + ret = read(sockfd, &msg->payload, msg->size); + if (ret <= 0) + return ret; + if (ret != (int)msg->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "read control message failed\n"); + return -1; + } + } + + return ret; +} + +static int +send_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + if (!msg) + return 0; + + msg->flags &= ~VHOST_USER_VERSION_MASK; + msg->flags &= ~VHOST_USER_NEED_REPLY; + msg->flags |= VHOST_USER_VERSION; + msg->flags |= VHOST_USER_REPLY_MASK; + + ret = send_fd_message(sockfd, (char *)msg, + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); + + return ret; +} + +/* + * Allocate a queue pair if it hasn't been allocated yet + */ +static int +vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) +{ + uint16_t vring_idx; + + switch (msg->request) { + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + break; + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + vring_idx = msg->payload.state.index; + break; + case VHOST_USER_SET_VRING_ADDR: + vring_idx = msg->payload.addr.index; + break; + default: + return 0; + } + + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid vring index: %u\n", vring_idx); + return -1; + } + + if (dev->virtqueue[vring_idx]) + return 0; + + return alloc_vring_queue(dev, vring_idx); +} + +static int +vhost_user_nvme_admin_passthrough(struct virtio_net *dev, + void *cmd, void *cqe, void *buf) +{ + if (dev->notify_ops->vhost_nvme_admin_passthrough) { + return dev->notify_ops->vhost_nvme_admin_passthrough(dev->vid, cmd, cqe, buf); + } + + return -1; +} + +static int +vhost_user_nvme_set_cq_call(struct virtio_net *dev, uint16_t qid, int fd) +{ + if (dev->notify_ops->vhost_nvme_set_cq_call) { + return dev->notify_ops->vhost_nvme_set_cq_call(dev->vid, qid, fd); + } + + return -1; +} + +static int +vhost_user_nvme_get_cap(struct virtio_net *dev, uint64_t *cap) +{ + if (dev->notify_ops->vhost_nvme_get_cap) { + return dev->notify_ops->vhost_nvme_get_cap(dev->vid, cap); + } + + return -1; +} + +static int +vhost_user_nvme_set_bar_mr(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct VhostUserMemory mem_table; + int fd = pmsg->fds[0]; + void *mmap_addr; + uint64_t mmap_size; + uint64_t mmap_offset; + uint64_t alignment; + struct rte_vhost_mem_region reg; + int ret = 0; + + memcpy(&mem_table, &pmsg->payload.memory, sizeof(mem_table)); + + reg.guest_phys_addr = mem_table.regions[0].guest_phys_addr; + reg.guest_user_addr = mem_table.regions[0].userspace_addr; + reg.size = mem_table.regions[0].memory_size; + reg.fd = fd; + mmap_offset = mem_table.regions[0].mmap_offset; + mmap_size = reg.size + mmap_offset; + + alignment = get_blk_size(fd); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + return -1; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (mmap_addr == MAP_FAILED) { + 
RTE_LOG(ERR, VHOST_CONFIG, + "mmap region failed.\n"); + return -1; + } + + if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) { + RTE_LOG(INFO, VHOST_CONFIG, + "MADV_DONTDUMP advice setting failed.\n"); + } + + reg.mmap_addr = mmap_addr; + reg.mmap_size = mmap_size; + reg.host_user_addr = (uint64_t)(uintptr_t)mmap_addr + + mmap_offset; + + RTE_LOG(INFO, VHOST_CONFIG, + "BAR memory region %u, size: 0x%" PRIx64 "\n" + "\t guest physical addr: 0x%" PRIx64 "\n" + "\t guest virtual addr: 0x%" PRIx64 "\n" + "\t host virtual addr: 0x%" PRIx64 "\n" + "\t mmap addr : 0x%" PRIx64 "\n" + "\t mmap size : 0x%" PRIx64 "\n" + "\t mmap align: 0x%" PRIx64 "\n" + "\t mmap off : 0x%" PRIx64 "\n", + 0, reg.size, + reg.guest_phys_addr, + reg.guest_user_addr, + reg.host_user_addr, + (uint64_t)(uintptr_t)mmap_addr, + mmap_size, + alignment, + mmap_offset); + + if (dev->bar_addr) { + munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size); + } + dev->bar_addr = (void *)(uintptr_t)reg.host_user_addr; + dev->bar_size = reg.mmap_size; + + if (dev->notify_ops->vhost_nvme_set_bar_mr) { + ret = dev->notify_ops->vhost_nvme_set_bar_mr(dev->vid, dev->bar_addr, dev->bar_size); + if (ret) { + munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size); + dev->bar_addr = NULL; + dev->bar_size = 0; + } + } + + return ret; +} + +int +vhost_user_msg_handler(int vid, int fd) +{ + struct virtio_net *dev; + struct VhostUserMsg msg; + struct vhost_vring_file file; + int ret; + uint64_t cap; + uint64_t enable; + uint8_t cqe[16]; + uint8_t cmd[64]; + uint8_t buf[4096]; + + dev = get_device(vid); + if (dev == NULL) + return -1; + + ret = read_vhost_message(fd, &msg); + if (ret <= 0 || msg.request >= VHOST_USER_MAX) { + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read message failed\n"); + else if (ret == 0) + RTE_LOG(INFO, VHOST_CONFIG, + "vhost peer closed\n"); + else + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read incorrect message\n"); + + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "%s: read message %s\n", + dev->ifname, vhost_message_str[msg.request]); + + ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to alloc queue\n"); + return -1; + } + + switch (msg.request) { + case VHOST_USER_GET_CONFIG: + if (dev->notify_ops->get_config(dev->vid, + msg.payload.config.region, + msg.payload.config.size) != 0) { + msg.size = sizeof(uint64_t); + } + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_CONFIG: + if ((dev->notify_ops->set_config(dev->vid, + msg.payload.config.region, + msg.payload.config.offset, + msg.payload.config.size, + msg.payload.config.flags)) != 0) { + ret = 1; + } else { + ret = 0; + } + break; + case VHOST_USER_NVME_ADMIN: + if (!dev->is_nvme) { + dev->is_nvme = 1; + } + memcpy(cmd, msg.payload.nvme.cmd.req, sizeof(cmd)); + ret = vhost_user_nvme_admin_passthrough(dev, cmd, cqe, buf); + memcpy(msg.payload.nvme.cmd.cqe, cqe, sizeof(cqe)); + msg.size = sizeof(cqe); + /* NVMe Identify Command */ + if (cmd[0] == 0x06) { + memcpy(msg.payload.nvme.buf, &buf, 4096); + msg.size += 4096; + } + send_vhost_message(fd, &msg); + break; + case VHOST_USER_NVME_SET_CQ_CALL: + file.index = msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; + file.fd = msg.fds[0]; + ret = vhost_user_nvme_set_cq_call(dev, file.index, file.fd); + break; + case VHOST_USER_NVME_GET_CAP: + ret = vhost_user_nvme_get_cap(dev, &cap); + if (!ret) + msg.payload.u64 = cap; + else + msg.payload.u64 = 0; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case 
VHOST_USER_NVME_START_STOP: + enable = msg.payload.u64; + /* device must be started before set cq call */ + if (enable) { + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } else { + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + } + break; + case VHOST_USER_NVME_SET_BAR_MR: + ret = vhost_user_nvme_set_bar_mr(dev, &msg); + break; + case VHOST_USER_GET_FEATURES: + msg.payload.u64 = vhost_user_get_features(dev); + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_FEATURES: + vhost_user_set_features(dev, msg.payload.u64); + break; + + case VHOST_USER_GET_PROTOCOL_FEATURES: + msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + vhost_user_set_protocol_features(dev, msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + vhost_user_set_owner(); + break; + case VHOST_USER_RESET_OWNER: + vhost_user_reset_owner(dev); + break; + + case VHOST_USER_SET_MEM_TABLE: + ret = vhost_user_set_mem_table(dev, &msg); + break; + + case VHOST_USER_SET_LOG_BASE: + vhost_user_set_log_base(dev, &msg); + + /* it needs a reply */ + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_LOG_FD: + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); + break; + + case VHOST_USER_SET_VRING_NUM: + vhost_user_set_vring_num(dev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + vhost_user_set_vring_addr(dev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + vhost_user_set_vring_base(dev, &msg); + break; + + case VHOST_USER_GET_VRING_BASE: + vhost_user_get_vring_base(dev, &msg); + msg.size = sizeof(msg.payload.state); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_KICK: + vhost_user_set_vring_kick(dev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + vhost_user_set_vring_call(dev, &msg); + break; + + case VHOST_USER_SET_VRING_ERR: + if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); + break; + + case VHOST_USER_GET_QUEUE_NUM: + msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_ENABLE: + vhost_user_set_vring_enable(dev, &msg); + break; + case VHOST_USER_SEND_RARP: + vhost_user_send_rarp(dev, &msg); + break; + + case VHOST_USER_NET_SET_MTU: + ret = vhost_user_net_set_mtu(dev, &msg); + break; + + default: + ret = -1; + break; + + } + + if (msg.flags & VHOST_USER_NEED_REPLY) { + msg.payload.u64 = !!ret; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + } + + if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) { + dev->flags |= VIRTIO_DEV_READY; + + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->dequeue_zero_copy) { + RTE_LOG(INFO, VHOST_CONFIG, + "dequeue zero copy is enabled\n"); + } + + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } + + return 0; +} diff --git a/src/spdk/lib/rte_vhost/vhost_user.h b/src/spdk/lib/rte_vhost/vhost_user.h new file mode 100644 index 000000000..d20574b64 --- /dev/null +++ b/src/spdk/lib/rte_vhost/vhost_user.h @@ -0,0 +1,171 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include <stdint.h> +#include <linux/vhost.h> + +#include "rte_vhost.h" + +/* refer to hw/virtio/vhost-user.c */ + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +/* + * Maximum size of virtio device config space + */ +#define VHOST_USER_MAX_CONFIG_SIZE 256 + +#define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 +#define VHOST_USER_PROTOCOL_F_RARP 2 +#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 +#define VHOST_USER_PROTOCOL_F_NET_MTU 4 +#define VHOST_USER_PROTOCOL_F_CONFIG 9 + +#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ + (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \ + (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ + (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \ + (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_NVME_ADMIN = 80, + VHOST_USER_NVME_SET_CQ_CALL = 81, + VHOST_USER_NVME_GET_CAP = 82, + VHOST_USER_NVME_START_STOP = 83, + VHOST_USER_NVME_IO_CMD = 84, + VHOST_USER_NVME_SET_BAR_MR = 85, + VHOST_USER_MAX +} VhostUserRequest; + +typedef enum VhostUserSlaveRequest { + VHOST_USER_SLAVE_NONE = 0, + VHOST_USER_SLAVE_IOTLB_MSG = 1, + 
VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, + VHOST_USER_SLAVE_MAX +} VhostUserSlaveRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +typedef struct VhostUserConfig { + uint32_t offset; + uint32_t size; + uint32_t flags; + uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; +} VhostUserConfig; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + VhostUserConfig config; + struct nvme { + union { + uint8_t req[64]; + uint8_t cqe[16]; + } cmd; + uint8_t buf[4096]; + } nvme; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)) VhostUserMsg; + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + + +/* vhost_user.c */ +int vhost_user_msg_handler(int vid, int fd); + +/* socket.c */ +int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); +int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); + +#endif diff --git a/src/spdk/lib/scsi/Makefile b/src/spdk/lib/scsi/Makefile new file mode 100644 index 000000000..8f8a8c326 --- /dev/null +++ b/src/spdk/lib/scsi/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = dev.c lun.c port.c scsi.c scsi_bdev.c scsi_pr.c scsi_rpc.c task.c +LIBNAME = scsi + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_scsi.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/scsi/dev.c b/src/spdk/lib/scsi/dev.c new file mode 100644 index 000000000..6d3cfdf31 --- /dev/null +++ b/src/spdk/lib/scsi/dev.c @@ -0,0 +1,436 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +static struct spdk_scsi_dev g_devs[SPDK_SCSI_MAX_DEVS]; + +struct spdk_scsi_dev * +scsi_dev_get_list(void) +{ + return g_devs; +} + +static struct spdk_scsi_dev * +allocate_dev(void) +{ + struct spdk_scsi_dev *dev; + int i; + + for (i = 0; i < SPDK_SCSI_MAX_DEVS; i++) { + dev = &g_devs[i]; + if (!dev->is_allocated) { + memset(dev, 0, sizeof(*dev)); + dev->id = i; + dev->is_allocated = 1; + return dev; + } + } + + return NULL; +} + +static void +free_dev(struct spdk_scsi_dev *dev) +{ + assert(dev->is_allocated == 1); + assert(dev->removed == true); + + dev->is_allocated = 0; + + if (dev->remove_cb) { + dev->remove_cb(dev->remove_ctx, 0); + dev->remove_cb = NULL; + } +} + +void +spdk_scsi_dev_destruct(struct spdk_scsi_dev *dev, + spdk_scsi_dev_destruct_cb_t cb_fn, void *cb_arg) +{ + int lun_cnt; + int i; + + if (dev == NULL) { + if (cb_fn) { + cb_fn(cb_arg, -EINVAL); + } + return; + } + + if (dev->removed) { + if (cb_fn) { + cb_fn(cb_arg, -EINVAL); + } + return; + } + + dev->removed = true; + dev->remove_cb = cb_fn; + dev->remove_ctx = cb_arg; + lun_cnt = 0; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + + /* + * LUN will remove itself from this dev when all outstanding IO + * is done. When no more LUNs, dev will be deleted. 
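+ *
+ * A caller tearing the device down might therefore look roughly like this
+ * (the callback name below is a placeholder):
+ *
+ *   static void destruct_done(void *cb_arg, int rc) { ... }
+ *   ...
+ *   spdk_scsi_dev_destruct(dev, destruct_done, ctx);
+ *   // destruct_done() runs from free_dev() once the last LUN is gone.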
+ */ + scsi_lun_destruct(dev->lun[i]); + lun_cnt++; + } + + if (lun_cnt == 0) { + free_dev(dev); + return; + } +} + +static int +scsi_dev_find_lowest_free_lun_id(struct spdk_scsi_dev *dev) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + return i; + } + } + + return -1; +} + +int +spdk_scsi_dev_add_lun(struct spdk_scsi_dev *dev, const char *bdev_name, int lun_id, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx) +{ + struct spdk_bdev *bdev; + struct spdk_scsi_lun *lun; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (bdev == NULL) { + SPDK_ERRLOG("device %s: cannot find bdev '%s' (target %d)\n", + dev->name, bdev_name, lun_id); + return -1; + } + + /* Search the lowest free LUN ID if LUN ID is default */ + if (lun_id == -1) { + lun_id = scsi_dev_find_lowest_free_lun_id(dev); + if (lun_id == -1) { + SPDK_ERRLOG("Free LUN ID is not found\n"); + return -1; + } + } + + lun = scsi_lun_construct(bdev, hotremove_cb, hotremove_ctx); + if (lun == NULL) { + return -1; + } + + lun->id = lun_id; + lun->dev = dev; + dev->lun[lun_id] = lun; + return 0; +} + +void +spdk_scsi_dev_delete_lun(struct spdk_scsi_dev *dev, + struct spdk_scsi_lun *lun) +{ + int lun_cnt = 0; + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == lun) { + dev->lun[i] = NULL; + } + + if (dev->lun[i]) { + lun_cnt++; + } + } + + if (dev->removed == true && lun_cnt == 0) { + free_dev(dev); + } +} + +struct spdk_scsi_dev *spdk_scsi_dev_construct(const char *name, const char *bdev_name_list[], + int *lun_id_list, int num_luns, uint8_t protocol_id, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx) +{ + struct spdk_scsi_dev *dev; + size_t name_len; + bool found_lun_0; + int i, rc; + + name_len = strlen(name); + if (name_len > sizeof(dev->name) - 1) { + SPDK_ERRLOG("device %s: name longer than maximum allowed length %zu\n", + name, sizeof(dev->name) - 1); + return NULL; + } + + if (num_luns == 0) { + SPDK_ERRLOG("device %s: no LUNs specified\n", name); + return NULL; + } + + found_lun_0 = false; + for (i = 0; i < num_luns; i++) { + if (lun_id_list[i] == 0) { + found_lun_0 = true; + break; + } + } + + if (!found_lun_0) { + SPDK_ERRLOG("device %s: no LUN 0 specified\n", name); + return NULL; + } + + for (i = 0; i < num_luns; i++) { + if (bdev_name_list[i] == NULL) { + SPDK_ERRLOG("NULL spdk_scsi_lun for LUN %d\n", + lun_id_list[i]); + return NULL; + } + } + + dev = allocate_dev(); + if (dev == NULL) { + return NULL; + } + + memcpy(dev->name, name, name_len + 1); + + dev->num_ports = 0; + dev->protocol_id = protocol_id; + + for (i = 0; i < num_luns; i++) { + rc = spdk_scsi_dev_add_lun(dev, bdev_name_list[i], lun_id_list[i], + hotremove_cb, hotremove_ctx); + if (rc < 0) { + spdk_scsi_dev_destruct(dev, NULL, NULL); + return NULL; + } + } + + return dev; +} + +void +spdk_scsi_dev_queue_mgmt_task(struct spdk_scsi_dev *dev, + struct spdk_scsi_task *task) +{ + assert(task != NULL); + + scsi_lun_execute_mgmt_task(task->lun, task); +} + +void +spdk_scsi_dev_queue_task(struct spdk_scsi_dev *dev, + struct spdk_scsi_task *task) +{ + assert(task != NULL); + + scsi_lun_execute_task(task->lun, task); +} + +static struct spdk_scsi_port * +scsi_dev_find_free_port(struct spdk_scsi_dev *dev) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) { + if (!dev->port[i].is_used) { + return &dev->port[i]; + } + } + + return NULL; +} + +int +spdk_scsi_dev_add_port(struct spdk_scsi_dev *dev, uint64_t id, const char 
*name) +{ + struct spdk_scsi_port *port; + int rc; + + if (dev->num_ports == SPDK_SCSI_DEV_MAX_PORTS) { + SPDK_ERRLOG("device already has %d ports\n", SPDK_SCSI_DEV_MAX_PORTS); + return -1; + } + + port = spdk_scsi_dev_find_port_by_id(dev, id); + if (port != NULL) { + SPDK_ERRLOG("device already has port(%" PRIu64 ")\n", id); + return -1; + } + + port = scsi_dev_find_free_port(dev); + if (port == NULL) { + assert(false); + return -1; + } + + rc = scsi_port_construct(port, id, dev->num_ports, name); + if (rc != 0) { + return rc; + } + + dev->num_ports++; + return 0; +} + +int +spdk_scsi_dev_delete_port(struct spdk_scsi_dev *dev, uint64_t id) +{ + struct spdk_scsi_port *port; + + port = spdk_scsi_dev_find_port_by_id(dev, id); + if (port == NULL) { + SPDK_ERRLOG("device does not have specified port(%" PRIu64 ")\n", id); + return -1; + } + + scsi_port_destruct(port); + + dev->num_ports--; + + return 0; +} + +struct spdk_scsi_port * +spdk_scsi_dev_find_port_by_id(struct spdk_scsi_dev *dev, uint64_t id) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) { + if (!dev->port[i].is_used) { + continue; + } + if (dev->port[i].id == id) { + return &dev->port[i]; + } + } + + /* No matching port found. */ + return NULL; +} + +void +spdk_scsi_dev_free_io_channels(struct spdk_scsi_dev *dev) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + scsi_lun_free_io_channel(dev->lun[i]); + } +} + +int +spdk_scsi_dev_allocate_io_channels(struct spdk_scsi_dev *dev) +{ + int i, rc; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + rc = scsi_lun_allocate_io_channel(dev->lun[i]); + if (rc < 0) { + spdk_scsi_dev_free_io_channels(dev); + return -1; + } + } + + return 0; +} + +const char * +spdk_scsi_dev_get_name(const struct spdk_scsi_dev *dev) +{ + return dev->name; +} + +int +spdk_scsi_dev_get_id(const struct spdk_scsi_dev *dev) +{ + return dev->id; +} + +struct spdk_scsi_lun * +spdk_scsi_dev_get_lun(struct spdk_scsi_dev *dev, int lun_id) +{ + struct spdk_scsi_lun *lun; + + if (lun_id < 0 || lun_id >= SPDK_SCSI_DEV_MAX_LUN) { + return NULL; + } + + lun = dev->lun[lun_id]; + + if (lun != NULL && !spdk_scsi_lun_is_removing(lun)) { + return lun; + } else { + return NULL; + } +} + +bool +spdk_scsi_dev_has_pending_tasks(const struct spdk_scsi_dev *dev, + const struct spdk_scsi_port *initiator_port) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; ++i) { + if (dev->lun[i] && + (scsi_lun_has_pending_tasks(dev->lun[i], initiator_port) || + scsi_lun_has_pending_mgmt_tasks(dev->lun[i], initiator_port))) { + return true; + } + } + + return false; +} diff --git a/src/spdk/lib/scsi/lun.c b/src/spdk/lib/scsi/lun.c new file mode 100644 index 000000000..262137d80 --- /dev/null +++ b/src/spdk/lib/scsi/lun.c @@ -0,0 +1,623 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/util.h" +#include "spdk/likely.h" + +static void scsi_lun_execute_tasks(struct spdk_scsi_lun *lun); +static void _scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun); + +void +scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + if (lun) { + TAILQ_REMOVE(&lun->tasks, task, scsi_link); + spdk_trace_record(TRACE_SCSI_TASK_DONE, lun->dev->id, 0, (uintptr_t)task, 0); + } + task->cpl_fn(task); +} + +static void +scsi_lun_complete_mgmt_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + TAILQ_REMOVE(&lun->mgmt_tasks, task, scsi_link); + + task->cpl_fn(task); + + /* Try to execute the first pending mgmt task if it exists. */ + _scsi_lun_execute_mgmt_task(lun); +} + +static bool +_scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun) +{ + return !TAILQ_EMPTY(&lun->pending_mgmt_tasks); +} + +static bool +scsi_lun_has_outstanding_mgmt_tasks(const struct spdk_scsi_lun *lun) +{ + return !TAILQ_EMPTY(&lun->mgmt_tasks); +} + +static bool +_scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun) +{ + return !TAILQ_EMPTY(&lun->pending_tasks); +} + +static bool +scsi_lun_has_outstanding_tasks(const struct spdk_scsi_lun *lun) +{ + return !TAILQ_EMPTY(&lun->tasks); +} + +/* Reset task have to wait until all prior outstanding tasks complete. 
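+ * scsi_lun_complete_reset_task() registers the poller below when a reset + * completes while IO is still outstanding; the poller finishes the reset + * management task once lun->tasks has drained.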
*/ +static int +scsi_lun_reset_check_outstanding_tasks(void *arg) +{ + struct spdk_scsi_task *task = (struct spdk_scsi_task *)arg; + struct spdk_scsi_lun *lun = task->lun; + + if (scsi_lun_has_outstanding_tasks(lun)) { + return SPDK_POLLER_BUSY; + } + spdk_poller_unregister(&lun->reset_poller); + + scsi_lun_complete_mgmt_task(lun, task); + return SPDK_POLLER_BUSY; +} + +void +scsi_lun_complete_reset_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + if (task->status == SPDK_SCSI_STATUS_GOOD) { + if (scsi_lun_has_outstanding_tasks(lun)) { + lun->reset_poller = + SPDK_POLLER_REGISTER(scsi_lun_reset_check_outstanding_tasks, + task, 10); + return; + } + } + + scsi_lun_complete_mgmt_task(lun, task); +} + +static void +scsi_lun_append_mgmt_task(struct spdk_scsi_lun *lun, + struct spdk_scsi_task *task) +{ + TAILQ_INSERT_TAIL(&lun->pending_mgmt_tasks, task, scsi_link); +} + +static void +_scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun) +{ + struct spdk_scsi_task *task; + + if (!TAILQ_EMPTY(&lun->mgmt_tasks)) { + return; + } + + task = TAILQ_FIRST(&lun->pending_mgmt_tasks); + if (spdk_likely(task == NULL)) { + /* Try to execute all pending tasks */ + scsi_lun_execute_tasks(lun); + return; + } + TAILQ_REMOVE(&lun->pending_mgmt_tasks, task, scsi_link); + + TAILQ_INSERT_TAIL(&lun->mgmt_tasks, task, scsi_link); + + if (lun->removed) { + task->response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN; + scsi_lun_complete_mgmt_task(lun, task); + return; + } + + switch (task->function) { + case SPDK_SCSI_TASK_FUNC_ABORT_TASK: + task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_ERRLOG("ABORT_TASK failed\n"); + break; + + case SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET: + task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_ERRLOG("ABORT_TASK_SET failed\n"); + break; + + case SPDK_SCSI_TASK_FUNC_LUN_RESET: + bdev_scsi_reset(task); + return; + + default: + SPDK_ERRLOG("Unknown Task Management Function!\n"); + /* + * Task management functions other than those above should never + * reach this point having been filtered by the frontend. Reject + * the task as being unsupported. 
+ */ + task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + break; + } + + scsi_lun_complete_mgmt_task(lun, task); +} + +void +scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun, + struct spdk_scsi_task *task) +{ + scsi_lun_append_mgmt_task(lun, task); + _scsi_lun_execute_mgmt_task(lun); +} + +static void +_scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + int rc; + + task->status = SPDK_SCSI_STATUS_GOOD; + spdk_trace_record(TRACE_SCSI_TASK_START, lun->dev->id, task->length, (uintptr_t)task, 0); + TAILQ_INSERT_TAIL(&lun->tasks, task, scsi_link); + if (!lun->removed) { + /* Check the command is allowed or not when reservation is exist */ + if (spdk_unlikely(lun->reservation.flags & SCSI_SPC2_RESERVE)) { + rc = scsi2_reserve_check(task); + } else { + rc = scsi_pr_check(task); + } + if (spdk_unlikely(rc < 0)) { + /* Reservation Conflict */ + rc = SPDK_SCSI_TASK_COMPLETE; + } else { + rc = bdev_scsi_execute(task); + } + } else { + spdk_scsi_task_process_abort(task); + rc = SPDK_SCSI_TASK_COMPLETE; + } + + switch (rc) { + case SPDK_SCSI_TASK_PENDING: + break; + + case SPDK_SCSI_TASK_COMPLETE: + scsi_lun_complete_task(lun, task); + break; + + default: + abort(); + } +} + +static void +scsi_lun_append_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + TAILQ_INSERT_TAIL(&lun->pending_tasks, task, scsi_link); +} + +static void +scsi_lun_execute_tasks(struct spdk_scsi_lun *lun) +{ + struct spdk_scsi_task *task, *task_tmp; + + TAILQ_FOREACH_SAFE(task, &lun->pending_tasks, scsi_link, task_tmp) { + TAILQ_REMOVE(&lun->pending_tasks, task, scsi_link); + _scsi_lun_execute_task(lun, task); + } +} + +void +scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + if (spdk_unlikely(_scsi_lun_has_pending_mgmt_tasks(lun))) { + /* Add the IO task to pending list and wait for completion of + * existing mgmt tasks. + */ + scsi_lun_append_task(lun, task); + } else if (spdk_unlikely(_scsi_lun_has_pending_tasks(lun))) { + /* If there is any pending IO task, append the IO task to the + * tail of the pending list, and then execute all pending IO tasks + * from the head to submit IO tasks in order. + */ + scsi_lun_append_task(lun, task); + scsi_lun_execute_tasks(lun); + } else { + /* Execute the IO task directly. 
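+ * Nothing is pending for this LUN, so submitting the task immediately + * still preserves submission order.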
*/ + _scsi_lun_execute_task(lun, task); + } +} + +static void +_scsi_lun_remove(void *arg) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; + + spdk_bdev_close(lun->bdev_desc); + spdk_scsi_dev_delete_lun(lun->dev, lun); + free(lun); +} + +static void +scsi_lun_remove(struct spdk_scsi_lun *lun) +{ + struct spdk_scsi_pr_registrant *reg, *tmp; + struct spdk_thread *thread; + + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + TAILQ_REMOVE(&lun->reg_head, reg, link); + free(reg); + } + + thread = spdk_get_thread(); + if (thread != lun->thread) { + spdk_thread_send_msg(lun->thread, _scsi_lun_remove, lun); + } else { + _scsi_lun_remove(lun); + } +} + +static int +scsi_lun_check_io_channel(void *arg) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; + + if (lun->io_channel) { + return SPDK_POLLER_BUSY; + } + spdk_poller_unregister(&lun->hotremove_poller); + + scsi_lun_remove(lun); + return SPDK_POLLER_BUSY; +} + +static void +scsi_lun_notify_hot_remove(struct spdk_scsi_lun *lun) +{ + struct spdk_scsi_lun_desc *desc, *tmp; + + if (lun->hotremove_cb) { + lun->hotremove_cb(lun, lun->hotremove_ctx); + } + + TAILQ_FOREACH_SAFE(desc, &lun->open_descs, link, tmp) { + if (desc->hotremove_cb) { + desc->hotremove_cb(lun, desc->hotremove_ctx); + } else { + spdk_scsi_lun_close(desc); + } + } + + if (lun->io_channel) { + lun->hotremove_poller = SPDK_POLLER_REGISTER(scsi_lun_check_io_channel, + lun, 10); + } else { + scsi_lun_remove(lun); + } +} + +static int +scsi_lun_check_outstanding_tasks(void *arg) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; + + if (scsi_lun_has_outstanding_tasks(lun) || + scsi_lun_has_outstanding_mgmt_tasks(lun)) { + return SPDK_POLLER_BUSY; + } + spdk_poller_unregister(&lun->hotremove_poller); + + scsi_lun_notify_hot_remove(lun); + return SPDK_POLLER_BUSY; +} + +static void +_scsi_lun_hot_remove(void *arg1) +{ + struct spdk_scsi_lun *lun = arg1; + + /* If lun->removed is set, no new task can be submitted to the LUN. + * Execute previously queued tasks, which will be immediately aborted. + */ + scsi_lun_execute_tasks(lun); + + /* Then we only need to wait for all outstanding tasks to be completed + * before notifying the upper layer about the removal. + */ + if (scsi_lun_has_outstanding_tasks(lun) || + scsi_lun_has_outstanding_mgmt_tasks(lun)) { + lun->hotremove_poller = SPDK_POLLER_REGISTER(scsi_lun_check_outstanding_tasks, + lun, 10); + } else { + scsi_lun_notify_hot_remove(lun); + } +} + +static void +scsi_lun_hot_remove(void *remove_ctx) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)remove_ctx; + struct spdk_thread *thread; + + if (lun->removed) { + return; + } + + lun->removed = true; + if (lun->io_channel == NULL) { + _scsi_lun_hot_remove(lun); + return; + } + + thread = spdk_io_channel_get_thread(lun->io_channel); + if (thread != spdk_get_thread()) { + spdk_thread_send_msg(thread, _scsi_lun_hot_remove, lun); + } else { + _scsi_lun_hot_remove(lun); + } +} + +/** + * \brief Constructs a new spdk_scsi_lun object based on the provided parameters. 
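+ * The LUN opens the bdev with write access and registers + * scsi_lun_hot_remove() as the bdev's hot-remove callback.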
+ * + * \param bdev bdev associated with this LUN + * + * \return NULL if bdev == NULL + * \return pointer to the new spdk_scsi_lun object otherwise + */ +struct spdk_scsi_lun *scsi_lun_construct(struct spdk_bdev *bdev, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx) +{ + struct spdk_scsi_lun *lun; + int rc; + + if (bdev == NULL) { + SPDK_ERRLOG("bdev must be non-NULL\n"); + return NULL; + } + + lun = calloc(1, sizeof(*lun)); + if (lun == NULL) { + SPDK_ERRLOG("could not allocate lun\n"); + return NULL; + } + + rc = spdk_bdev_open(bdev, true, scsi_lun_hot_remove, lun, &lun->bdev_desc); + + if (rc != 0) { + SPDK_ERRLOG("bdev %s cannot be opened, error=%d\n", spdk_bdev_get_name(bdev), rc); + free(lun); + return NULL; + } + + lun->thread = spdk_get_thread(); + + TAILQ_INIT(&lun->tasks); + TAILQ_INIT(&lun->pending_tasks); + TAILQ_INIT(&lun->mgmt_tasks); + TAILQ_INIT(&lun->pending_mgmt_tasks); + + lun->bdev = bdev; + lun->io_channel = NULL; + lun->hotremove_cb = hotremove_cb; + lun->hotremove_ctx = hotremove_ctx; + TAILQ_INIT(&lun->open_descs); + TAILQ_INIT(&lun->reg_head); + + return lun; +} + +void +scsi_lun_destruct(struct spdk_scsi_lun *lun) +{ + scsi_lun_hot_remove(lun); +} + +int +spdk_scsi_lun_open(struct spdk_scsi_lun *lun, spdk_scsi_lun_remove_cb_t hotremove_cb, + void *hotremove_ctx, struct spdk_scsi_lun_desc **_desc) +{ + struct spdk_scsi_lun_desc *desc; + + desc = calloc(1, sizeof(*desc)); + if (desc == NULL) { + SPDK_ERRLOG("calloc() failed for LUN descriptor.\n"); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&lun->open_descs, desc, link); + + desc->lun = lun; + desc->hotremove_cb = hotremove_cb; + desc->hotremove_ctx = hotremove_ctx; + *_desc = desc; + + return 0; +} + +void +spdk_scsi_lun_close(struct spdk_scsi_lun_desc *desc) +{ + struct spdk_scsi_lun *lun = desc->lun; + + TAILQ_REMOVE(&lun->open_descs, desc, link); + free(desc); + + assert(!TAILQ_EMPTY(&lun->open_descs) || lun->io_channel == NULL); +} + +int +scsi_lun_allocate_io_channel(struct spdk_scsi_lun *lun) +{ + if (lun->io_channel != NULL) { + if (spdk_get_thread() == spdk_io_channel_get_thread(lun->io_channel)) { + lun->ref++; + return 0; + } + SPDK_ERRLOG("io_channel already allocated for lun %s\n", + spdk_bdev_get_name(lun->bdev)); + return -1; + } + + lun->io_channel = spdk_bdev_get_io_channel(lun->bdev_desc); + if (lun->io_channel == NULL) { + return -1; + } + lun->ref = 1; + return 0; +} + +void +scsi_lun_free_io_channel(struct spdk_scsi_lun *lun) +{ + if (lun->io_channel == NULL) { + return; + } + + if (spdk_get_thread() != spdk_io_channel_get_thread(lun->io_channel)) { + SPDK_ERRLOG("io_channel was freed by different thread\n"); + return; + } + + lun->ref--; + if (lun->ref == 0) { + spdk_put_io_channel(lun->io_channel); + lun->io_channel = NULL; + } +} + +int +spdk_scsi_lun_allocate_io_channel(struct spdk_scsi_lun_desc *desc) +{ + struct spdk_scsi_lun *lun = desc->lun; + + return scsi_lun_allocate_io_channel(lun); +} + +void +spdk_scsi_lun_free_io_channel(struct spdk_scsi_lun_desc *desc) +{ + struct spdk_scsi_lun *lun = desc->lun; + + scsi_lun_free_io_channel(lun); +} + +int +spdk_scsi_lun_get_id(const struct spdk_scsi_lun *lun) +{ + return lun->id; +} + +const char * +spdk_scsi_lun_get_bdev_name(const struct spdk_scsi_lun *lun) +{ + return spdk_bdev_get_name(lun->bdev); +} + +const struct spdk_scsi_dev * +spdk_scsi_lun_get_dev(const struct spdk_scsi_lun *lun) +{ + return lun->dev; +} + +bool +scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun, + const struct 
spdk_scsi_port *initiator_port) +{ + struct spdk_scsi_task *task; + + if (initiator_port == NULL) { + return _scsi_lun_has_pending_mgmt_tasks(lun) || + scsi_lun_has_outstanding_mgmt_tasks(lun); + } + + TAILQ_FOREACH(task, &lun->pending_mgmt_tasks, scsi_link) { + if (task->initiator_port == initiator_port) { + return true; + } + } + + TAILQ_FOREACH(task, &lun->mgmt_tasks, scsi_link) { + if (task->initiator_port == initiator_port) { + return true; + } + } + + return false; +} +/* This check includes both pending and submitted (outstanding) tasks. */ +bool +scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun, + const struct spdk_scsi_port *initiator_port) +{ + struct spdk_scsi_task *task; + + if (initiator_port == NULL) { + return _scsi_lun_has_pending_tasks(lun) || + scsi_lun_has_outstanding_tasks(lun); + } + + TAILQ_FOREACH(task, &lun->pending_tasks, scsi_link) { + if (task->initiator_port == initiator_port) { + return true; + } + } + + TAILQ_FOREACH(task, &lun->tasks, scsi_link) { + if (task->initiator_port == initiator_port) { + return true; + } + } + + return false; +} + +bool +spdk_scsi_lun_is_removing(const struct spdk_scsi_lun *lun) +{ + return lun->removed; +} + +bool +spdk_scsi_lun_get_dif_ctx(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task, + struct spdk_dif_ctx *dif_ctx) +{ + return bdev_scsi_get_dif_ctx(lun->bdev, task, dif_ctx); +} diff --git a/src/spdk/lib/scsi/port.c b/src/spdk/lib/scsi/port.c new file mode 100644 index 000000000..09311bac2 --- /dev/null +++ b/src/spdk/lib/scsi/port.c @@ -0,0 +1,134 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "scsi_internal.h" + +#include "spdk/endian.h" + +struct spdk_scsi_port * +spdk_scsi_port_create(uint64_t id, uint16_t index, const char *name) +{ + struct spdk_scsi_port *port; + + port = calloc(1, sizeof(struct spdk_scsi_port)); + + if (!port) { + return NULL; + } + + if (scsi_port_construct(port, id, index, name) != 0) { + spdk_scsi_port_free(&port); + return NULL; + } + + return port; +} + +void +spdk_scsi_port_free(struct spdk_scsi_port **pport) +{ + struct spdk_scsi_port *port; + + if (!pport) { + return; + } + + port = *pport; + *pport = NULL; + free(port); +} + +int +scsi_port_construct(struct spdk_scsi_port *port, uint64_t id, uint16_t index, + const char *name) +{ + if (strlen(name) >= sizeof(port->name)) { + SPDK_ERRLOG("port name too long\n"); + return -1; + } + + port->is_used = 1; + port->id = id; + port->index = index; + snprintf(port->name, sizeof(port->name), "%s", name); + return 0; +} + +void +scsi_port_destruct(struct spdk_scsi_port *port) +{ + memset(port, 0, sizeof(struct spdk_scsi_port)); +} + +const char * +spdk_scsi_port_get_name(const struct spdk_scsi_port *port) +{ + return port->name; +} + +/* + * spc3r23 7.5.4.6 iSCSI initiator port TransportID, + * using code format 0x01. + */ +void +spdk_scsi_port_set_iscsi_transport_id(struct spdk_scsi_port *port, char *iscsi_name, + uint64_t isid) +{ + struct spdk_scsi_iscsi_transport_id *data; + uint32_t len; + char *name; + + memset(port->transport_id, 0, sizeof(port->transport_id)); + port->transport_id_len = 0; + + data = (struct spdk_scsi_iscsi_transport_id *)port->transport_id; + + data->protocol_id = (uint8_t)SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI; + data->format = 0x1; + + name = data->name; + len = snprintf(name, SPDK_SCSI_MAX_TRANSPORT_ID_LENGTH - sizeof(*data), + "%s,i,0x%12.12" PRIx64, iscsi_name, isid); + do { + name[len++] = '\0'; + } while (len & 3); + + if (len < 20) { + SPDK_ERRLOG("The length of Transport ID should >= 20 bytes\n"); + return; + } + + to_be16(&data->additional_len, len); + port->transport_id_len = len + sizeof(*data); +} diff --git a/src/spdk/lib/scsi/scsi.c b/src/spdk/lib/scsi/scsi.c new file mode 100644 index 000000000..c18192e37 --- /dev/null +++ b/src/spdk/lib/scsi/scsi.c @@ -0,0 +1,110 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +struct spdk_scsi_globals g_scsi; + +int +spdk_scsi_init(void) +{ + int rc; + + rc = pthread_mutex_init(&g_scsi.mutex, NULL); + if (rc != 0) { + SPDK_ERRLOG("mutex_init() failed\n"); + return -1; + } + + return 0; +} + +void +spdk_scsi_fini(void) +{ + pthread_mutex_destroy(&g_scsi.mutex); +} + +SPDK_TRACE_REGISTER_FN(scsi_trace, "scsi", TRACE_GROUP_SCSI) +{ + spdk_trace_register_owner(OWNER_SCSI_DEV, 'd'); + spdk_trace_register_object(OBJECT_SCSI_TASK, 't'); + spdk_trace_register_description("SCSI_TASK_DONE", TRACE_SCSI_TASK_DONE, + OWNER_SCSI_DEV, OBJECT_SCSI_TASK, 0, 0, ""); + spdk_trace_register_description("SCSI_TASK_START", TRACE_SCSI_TASK_START, + OWNER_SCSI_DEV, OBJECT_SCSI_TASK, 0, 0, ""); +} + +uint64_t +spdk_scsi_lun_id_int_to_fmt(int lun_id) +{ + uint64_t fmt_lun, method; + + if (SPDK_SCSI_DEV_MAX_LUN <= 0x0100) { + /* below 256 */ + method = 0x00U; + fmt_lun = (method & 0x03U) << 62; + fmt_lun |= ((uint64_t)lun_id & 0x00ffU) << 48; + } else if (SPDK_SCSI_DEV_MAX_LUN <= 0x4000) { + /* below 16384 */ + method = 0x01U; + fmt_lun = (method & 0x03U) << 62; + fmt_lun |= ((uint64_t)lun_id & 0x3fffU) << 48; + } else { + /* XXX */ + fmt_lun = 0; + } + + return fmt_lun; +} + +int +spdk_scsi_lun_id_fmt_to_int(uint64_t fmt_lun) +{ + uint64_t method; + int lun_i; + + method = (fmt_lun >> 62) & 0x03U; + fmt_lun = fmt_lun >> 48; + if (method == 0x00U) { + lun_i = (int)(fmt_lun & 0x00ffU); + } else if (method == 0x01U) { + lun_i = (int)(fmt_lun & 0x3fffU); + } else { + lun_i = 0xffffU; + } + return lun_i; +} + +SPDK_LOG_REGISTER_COMPONENT("scsi", SPDK_LOG_SCSI) diff --git a/src/spdk/lib/scsi/scsi_bdev.c b/src/spdk/lib/scsi/scsi_bdev.c new file mode 100644 index 000000000..bf0fb5af7 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_bdev.c @@ -0,0 +1,2067 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +/* + * TODO: move bdev SCSI error code translation tests to bdev unit test + * and remove this include. + */ +#include "spdk/bdev_module.h" + +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/endian.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#define SPDK_WORK_BLOCK_SIZE (4ULL * 1024ULL * 1024ULL) +#define SPDK_WORK_ATS_BLOCK_SIZE (1ULL * 1024ULL * 1024ULL) +#define MAX_SERIAL_STRING 32 + +#define DEFAULT_DISK_VENDOR "INTEL" +#define DEFAULT_DISK_REVISION "0001" +#define DEFAULT_DISK_ROTATION_RATE 1 /* Non-rotating medium */ +#define DEFAULT_DISK_FORM_FACTOR 0x02 /* 3.5 inch */ +#define DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT 256 + +#define INQUIRY_OFFSET(field) offsetof(struct spdk_scsi_cdb_inquiry_data, field) + \ + sizeof(((struct spdk_scsi_cdb_inquiry_data *)0x0)->field) + +static void bdev_scsi_process_block_resubmit(void *arg); + +static int +hex2bin(char ch) +{ + if ((ch >= '0') && (ch <= '9')) { + return ch - '0'; + } + ch = tolower(ch); + if ((ch >= 'a') && (ch <= 'f')) { + return ch - 'a' + 10; + } + return (int)ch; +} + +static void +bdev_scsi_set_naa_ieee_extended(const char *name, uint8_t *buf) +{ + int i, value, count = 0; + uint64_t local_value; + + for (i = 0; (i < 16) && (name[i] != '\0'); i++) { + value = hex2bin(name[i]); + if (i % 2) { + buf[count++] |= value << 4; + } else { + buf[count] = value; + } + } + + local_value = *(uint64_t *)buf; + /* + * see spc3r23 7.6.3.6.2, + * NAA IEEE Extended identifer format + */ + local_value &= 0x0fff000000ffffffull; + /* NAA 02, and 00 03 47 for IEEE Intel */ + local_value |= 0x2000000347000000ull; + + to_be64((void *)buf, local_value); +} + +static int +bdev_scsi_report_luns(struct spdk_scsi_lun *lun, + int sel, uint8_t *data, int alloc_len) +{ + struct spdk_scsi_dev *dev; + uint64_t fmt_lun; + int hlen, len = 0; + int i; + + if (alloc_len < 8) { + return -1; + } + + if (sel == 0x00) { + /* logical unit with addressing method */ + } else if (sel == 0x01) { + /* well known logical unit */ + } else if (sel == 0x02) { + /* logical unit */ + } else { + return -1; + } + + /* LUN LIST LENGTH */ + memset(data, 0, 4); + + /* Reserved */ + memset(&data[4], 0, 4); + hlen = 8; + + dev = lun->dev; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + + if (alloc_len - (hlen + len) < 8) { + return -1; + } + + fmt_lun = spdk_scsi_lun_id_int_to_fmt(i); + + /* LUN */ + to_be64(&data[hlen + len], fmt_lun); + len += 8; + } + + /* LUN LIST LENGTH */ + to_be32(data, len); + + return hlen + len; +} + +static int +bdev_scsi_pad_scsi_name(char *dst, const char *name) +{ + size_t len; + + len = strlen(name); + memcpy(dst, name, len); + do { + dst[len++] = '\0'; + } while (len & 3); + + return len; +} + +static int +bdev_scsi_inquiry(struct spdk_bdev *bdev, struct spdk_scsi_task *task, + uint8_t *cdb, uint8_t *data, uint16_t alloc_len) +{ + struct spdk_scsi_lun *lun; + struct spdk_scsi_dev 
*dev; + struct spdk_scsi_port *port; + uint32_t blocks, optimal_blocks; + int hlen = 0, plen, plen2; + uint16_t len = 0; + int pc; + int pd; + int evpd; + int i; + struct spdk_scsi_cdb_inquiry *inq = (struct spdk_scsi_cdb_inquiry *)cdb; + + /* standard inquiry command at lease with 36 Bytes */ + if (alloc_len < 0x24) { + goto inq_error; + } + + lun = task->lun; + dev = lun->dev; + port = task->target_port; + + pd = SPDK_SPC_PERIPHERAL_DEVICE_TYPE_DISK; + pc = inq->page_code; + evpd = inq->evpd & 0x1; + + if (!evpd && pc) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + if (evpd) { + struct spdk_scsi_vpd_page *vpage = (struct spdk_scsi_vpd_page *)data; + + /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */ + vpage->peripheral_device_type = pd; + vpage->peripheral_qualifier = SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED; + /* PAGE CODE */ + vpage->page_code = pc; + + /* Vital product data */ + switch (pc) { + case SPDK_SPC_VPD_SUPPORTED_VPD_PAGES: + hlen = 4; + + vpage->params[0] = SPDK_SPC_VPD_SUPPORTED_VPD_PAGES; + vpage->params[1] = SPDK_SPC_VPD_UNIT_SERIAL_NUMBER; + vpage->params[2] = SPDK_SPC_VPD_DEVICE_IDENTIFICATION; + vpage->params[3] = SPDK_SPC_VPD_MANAGEMENT_NETWORK_ADDRESSES; + vpage->params[4] = SPDK_SPC_VPD_EXTENDED_INQUIRY_DATA; + vpage->params[5] = SPDK_SPC_VPD_MODE_PAGE_POLICY; + vpage->params[6] = SPDK_SPC_VPD_SCSI_PORTS; + vpage->params[7] = SPDK_SPC_VPD_BLOCK_LIMITS; + vpage->params[8] = SPDK_SPC_VPD_BLOCK_DEV_CHARS; + len = 9; + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + vpage->params[9] = SPDK_SPC_VPD_BLOCK_THIN_PROVISION; + len++; + } + + /* PAGE LENGTH */ + to_be16(vpage->alloc_len, len); + break; + + case SPDK_SPC_VPD_UNIT_SERIAL_NUMBER: { + const char *name = spdk_bdev_get_name(bdev); + + hlen = 4; + + /* PRODUCT SERIAL NUMBER */ + len = strlen(name) + 1; + if (len > MAX_SERIAL_STRING) { + len = MAX_SERIAL_STRING; + } + + memcpy(vpage->params, name, len - 1); + vpage->params[len - 1] = 0; + + /* PAGE LENGTH */ + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_DEVICE_IDENTIFICATION: { + const char *name = spdk_bdev_get_name(bdev); + const char *product_name = spdk_bdev_get_product_name(bdev); + uint8_t protocol_id = dev->protocol_id; + uint8_t *buf = vpage->params; + struct spdk_scsi_desig_desc *desig; + + hlen = 4; + + /* Check total length by calculated how much space all entries take */ + len = sizeof(struct spdk_scsi_desig_desc) + 8; + len += sizeof(struct spdk_scsi_desig_desc) + 8 + 16 + MAX_SERIAL_STRING; + len += sizeof(struct spdk_scsi_desig_desc) + SPDK_SCSI_DEV_MAX_NAME + 1; + len += sizeof(struct spdk_scsi_desig_desc) + SPDK_SCSI_PORT_MAX_NAME_LENGTH; + len += sizeof(struct spdk_scsi_desig_desc) + 4; + len += sizeof(struct spdk_scsi_desig_desc) + 4; + len += sizeof(struct spdk_scsi_desig_desc) + 4; + if (sizeof(struct spdk_scsi_vpd_page) + len > alloc_len) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + /* Now fill out the designator array */ + + /* NAA designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_NAA; + desig->association = 
SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 8; + bdev_scsi_set_naa_ieee_extended(name, desig->desig); + len = sizeof(struct spdk_scsi_desig_desc) + 8; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* T10 Vendor ID designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_ASCII; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_T10_VENDOR_ID; + desig->association = SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 8 + 16 + MAX_SERIAL_STRING; + spdk_strcpy_pad(desig->desig, DEFAULT_DISK_VENDOR, 8, ' '); + spdk_strcpy_pad(&desig->desig[8], product_name, 16, ' '); + spdk_strcpy_pad(&desig->desig[24], name, MAX_SERIAL_STRING, ' '); + len += sizeof(struct spdk_scsi_desig_desc) + 8 + 16 + MAX_SERIAL_STRING; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* SCSI Device Name designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_UTF8; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_DEVICE; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = bdev_scsi_pad_scsi_name(desig->desig, dev->name); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* SCSI Port Name designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_UTF8; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = snprintf(desig->desig, SPDK_SCSI_PORT_MAX_NAME_LENGTH, "%s", port->name); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* Relative Target Port designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_RELATIVE_TARGET_PORT; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 4; + memset(desig->desig, 0, 2); /* Reserved */ + to_be16(&desig->desig[2], port->index); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* Target port group designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_TARGET_PORT_GROUP; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 4; + memset(desig->desig, 0, 4); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* Logical unit group designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_LOGICAL_UNIT_GROUP; + desig->association = SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT; + desig->reserved0 = 0; + 
desig->piv = 1; + desig->reserved1 = 0; + desig->len = 4; + memset(desig->desig, 0, 2); /* Reserved */ + to_be16(&desig->desig[2], dev->id); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + to_be16(vpage->alloc_len, len); + + break; + } + + case SPDK_SPC_VPD_EXTENDED_INQUIRY_DATA: { + struct spdk_scsi_vpd_ext_inquiry *vext = (struct spdk_scsi_vpd_ext_inquiry *)vpage; + + hlen = 4; + memset((uint8_t *)vext + hlen, 0, sizeof(*vext) - hlen); + + /* RTO(3) GRD_CHK(2) APP_CHK(1) REF_CHK(0) */ + + /* GROUP_SUP(4) PRIOR_SUP(3) HEADSUP(2) ORDSUP(1) SIMPSUP(0) */ + vext->sup = SPDK_SCSI_VEXT_HEADSUP | SPDK_SCSI_VEXT_SIMPSUP; + + /* NV_SUP(1) V_SUP(0) */ + + /* Reserved[7-63] */ + + len = 64 - hlen; + + /* PAGE LENGTH */ + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_MANAGEMENT_NETWORK_ADDRESSES: + /* PAGE LENGTH */ + hlen = 4; + + to_be16(vpage->alloc_len, len); + break; + + case SPDK_SPC_VPD_MODE_PAGE_POLICY: { + struct spdk_scsi_mpage_policy_desc *pdesc = + (struct spdk_scsi_mpage_policy_desc *)vpage->params; + + hlen = 4; + + /* Mode page policy descriptor 1 */ + + /* POLICY PAGE CODE(5-0) */ + /* all page code */ + pdesc->page_code = 0x3f; + + /* POLICY SUBPAGE CODE */ + /* all sub page */ + pdesc->sub_page_code = 0xff; + + /* MLUS(7) MODE PAGE POLICY(1-0) */ + /* MLUS own copy */ + /* Shared MODE PAGE policy */ + pdesc->policy = 0; + /* Reserved */ + pdesc->reserved = 0; + + len += 4; + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_SCSI_PORTS: { + /* PAGE LENGTH */ + hlen = 4; + + /* Identification descriptor list */ + for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) { + struct spdk_scsi_port_desc *sdesc; + struct spdk_scsi_tgt_port_desc *pdesc; + + if (!dev->port[i].is_used) { + continue; + } + + /* Identification descriptor N */ + sdesc = (struct spdk_scsi_port_desc *)&vpage->params[len]; + + /* Reserved */ + sdesc->reserved = 0; + + /* RELATIVE PORT IDENTIFIER */ + to_be16(&sdesc->rel_port_id, dev->port[i].index); + + /* Reserved */ + sdesc->reserved2 = 0; + + /* INITIATOR PORT TRANSPORTID LENGTH */ + sdesc->init_port_len = 0; + + /* Reserved */ + sdesc->init_port_id = 0; + + /* TARGET PORT DESCRIPTORS LENGTH */ + sdesc->tgt_desc_len = 0; + + len += 12; + + plen2 = 0; + /* Target port descriptor 1 */ + pdesc = (struct spdk_scsi_tgt_port_desc *)sdesc->tgt_desc; + + /* PROTOCOL IDENTIFIER(7-4) CODE SET(3-0) */ + pdesc->code_set = + SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI << 4 | + SPDK_SPC_VPD_CODE_SET_UTF8; + + /* PIV(7) ASSOCIATION(5-4) IDENTIFIER TYPE(3-0) */ + pdesc->desig_type = SPDK_SPC_VPD_DESIG_PIV | + SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT << 4 | + SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME; + + /* Reserved */ + pdesc->reserved = 0; + + /* IDENTIFIER */ + plen = snprintf((char *)pdesc->designator, + SPDK_SCSI_PORT_MAX_NAME_LENGTH, "%s", + dev->port[i].name); + pdesc->len = plen; + + plen2 += 4 + plen; + + /* TARGET PORT DESCRIPTORS LENGTH */ + to_be16(&sdesc->tgt_desc_len, plen2); + + len += plen2; + } + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_BLOCK_LIMITS: { + uint32_t block_size = spdk_bdev_get_data_block_size(bdev); + + /* PAGE LENGTH */ + memset(&data[4], 0, 60); + + hlen = 4; + + /* WSNZ(0) */ + /* support zero length in WRITE SAME */ + + /* MAXIMUM COMPARE AND WRITE LENGTH */ + blocks = SPDK_WORK_ATS_BLOCK_SIZE / block_size; + + if (blocks > 0xff) { + blocks = 0xff; + } + + data[5] = (uint8_t)blocks; + + /* force align to 4KB */ + if (block_size < 4096) { + optimal_blocks = 4096 / block_size; + } else { + 
optimal_blocks = 1; + } + + /* OPTIMAL TRANSFER LENGTH GRANULARITY */ + to_be16(&data[6], optimal_blocks); + + blocks = SPDK_WORK_BLOCK_SIZE / block_size; + + /* MAXIMUM TRANSFER LENGTH */ + to_be32(&data[8], blocks); + /* OPTIMAL TRANSFER LENGTH */ + to_be32(&data[12], blocks); + + /* MAXIMUM PREFETCH XDREAD XDWRITE TRANSFER LENGTH */ + + len = 20 - hlen; + + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + /* + * MAXIMUM UNMAP LBA COUNT: indicates the + * maximum number of LBAs that may be + * unmapped by an UNMAP command. + */ + /* For now, choose 4MB as the maximum. */ + to_be32(&data[20], 4194304); + + /* + * MAXIMUM UNMAP BLOCK DESCRIPTOR COUNT: + * indicates the maximum number of UNMAP + * block descriptors that shall be contained + * in the parameter data transferred to the + * device server for an UNMAP command. + * The bdev layer automatically splits unmap + * requests, so pick an arbitrary high number here. + */ + to_be32(&data[24], DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT); + + /* + * The UGAVALID bit is left as 0 which means neither the + * OPTIMAL UNMAP GRANULARITY nor the UNMAP GRANULARITY + * ALIGNMENT fields are valid. + */ + + /* + * MAXIMUM WRITE SAME LENGTH: indicates the + * maximum number of contiguous logical blocks + * that the device server allows to be unmapped + * or written in a single WRITE SAME command. + */ + to_be64(&data[36], 512); + + /* Reserved */ + /* not specified */ + len = 64 - hlen; + } + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_BLOCK_DEV_CHARS: { + /* PAGE LENGTH */ + hlen = 4; + len = 64 - hlen; + + to_be16(&data[4], DEFAULT_DISK_ROTATION_RATE); + + /* Reserved */ + data[6] = 0; + /* NOMINAL FORM FACTOR(3-0) */ + data[7] = DEFAULT_DISK_FORM_FACTOR << 4; + /* Reserved */ + memset(&data[8], 0, 64 - 8); + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_BLOCK_THIN_PROVISION: { + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + goto inq_error; + } + + hlen = 4; + len = 7; + + /* + * PAGE LENGTH : if the DP bit is set to one, then the + * page length shall be set 0004h. + */ + to_be16(&data[2], 0x0004); + + /* + * THRESHOLD EXPONENT : it indicates the threshold set + * size in LBAs as a power of 2( i.e., the threshold + * set size = 2 ^ (threshold exponent). + */ + data[4] = 0; + + /* + * Set the LBPU bit to indicate the support for UNMAP + * command. + */ + data[5] |= SPDK_SCSI_UNMAP_LBPU; + + /* + * Set the provisioning type to thin provision. 
+ */ + data[6] = SPDK_SCSI_UNMAP_THIN_PROVISIONING; + + to_be16(vpage->alloc_len, len); + break; + } + + default: + if (pc >= 0xc0 && pc <= 0xff) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "Vendor specific INQUIRY VPD page 0x%x\n", pc); + } else { + SPDK_ERRLOG("unsupported INQUIRY VPD page 0x%x\n", pc); + } + goto inq_error; + } + } else { + struct spdk_scsi_cdb_inquiry_data *inqdata = + (struct spdk_scsi_cdb_inquiry_data *)data; + + /* Standard INQUIRY data */ + /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */ + inqdata->peripheral_device_type = pd; + inqdata->peripheral_qualifier = SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED; + + /* RMB(7) */ + inqdata->rmb = 0; + + /* VERSION */ + /* See SPC3/SBC2/MMC4/SAM2 for more details */ + inqdata->version = SPDK_SPC_VERSION_SPC3; + + /* NORMACA(5) HISUP(4) RESPONSE DATA FORMAT(3-0) */ + /* format 2 */ /* hierarchical support */ + inqdata->response = 2 | 1 << 4; + + hlen = 5; + + /* SCCS(7) ACC(6) TPGS(5-4) 3PC(3) PROTECT(0) */ + /* Not support TPGS */ + inqdata->flags = 0; + + /* MULTIP */ + inqdata->flags2 = 0x10; + + /* WBUS16(5) SYNC(4) LINKED(3) CMDQUE(1) VS(0) */ + /* CMDQUE */ + inqdata->flags3 = 0x2; + + /* T10 VENDOR IDENTIFICATION */ + spdk_strcpy_pad(inqdata->t10_vendor_id, DEFAULT_DISK_VENDOR, 8, ' '); + + /* PRODUCT IDENTIFICATION */ + spdk_strcpy_pad(inqdata->product_id, spdk_bdev_get_product_name(bdev), 16, ' '); + + /* PRODUCT REVISION LEVEL */ + spdk_strcpy_pad(inqdata->product_rev, DEFAULT_DISK_REVISION, 4, ' '); + + /* + * Standard inquiry data ends here. Only populate remaining fields if alloc_len + * indicates enough space to hold it. + */ + len = INQUIRY_OFFSET(product_rev) - 5; + + if (alloc_len >= INQUIRY_OFFSET(vendor)) { + /* Vendor specific */ + memset(inqdata->vendor, 0x20, 20); + len += sizeof(inqdata->vendor); + } + + if (alloc_len >= INQUIRY_OFFSET(ius)) { + /* CLOCKING(3-2) QAS(1) IUS(0) */ + inqdata->ius = 0; + len += sizeof(inqdata->ius); + } + + if (alloc_len >= INQUIRY_OFFSET(reserved)) { + /* Reserved */ + inqdata->reserved = 0; + len += sizeof(inqdata->reserved); + } + + /* VERSION DESCRIPTOR 1-8 */ + if (alloc_len >= INQUIRY_OFFSET(reserved) + 2) { + to_be16(&inqdata->desc[0], 0x0960); + len += 2; + } + + if (alloc_len >= INQUIRY_OFFSET(reserved) + 4) { + to_be16(&inqdata->desc[2], 0x0300); /* SPC-3 (no version claimed) */ + len += 2; + } + + if (alloc_len >= INQUIRY_OFFSET(reserved) + 6) { + to_be16(&inqdata->desc[4], 0x320); /* SBC-2 (no version claimed) */ + len += 2; + } + + if (alloc_len >= INQUIRY_OFFSET(reserved) + 8) { + to_be16(&inqdata->desc[6], 0x0040); /* SAM-2 (no version claimed) */ + len += 2; + } + + /* + * We only fill out 4 descriptors, but if the allocation length goes past + * that, zero the remaining bytes. This fixes some SCSI compliance tests + * which expect a full 96 bytes to be returned, including the unpopulated + * version descriptors 5-8 (4 * 2 = 8 bytes) plus the 22 bytes of reserved + * space (bytes 74-95) - for a total of 30 bytes. 
+ */ + if (alloc_len > INQUIRY_OFFSET(reserved) + 8) { + i = alloc_len - (INQUIRY_OFFSET(reserved) + 8); + if (i > 30) { + i = 30; + } + memset(&inqdata->desc[8], 0, i); + len += i; + } + + /* ADDITIONAL LENGTH */ + inqdata->add_len = len; + } + + return hlen + len; + +inq_error: + task->data_transferred = 0; + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} + +static void +mode_sense_page_init(uint8_t *buf, int len, int page, int subpage) +{ + if (!buf) { + return; + } + + memset(buf, 0, len); + if (subpage != 0) { + buf[0] = page | 0x40; /* PAGE + SPF=1 */ + buf[1] = subpage; + to_be16(&buf[2], len - 4); + } else { + buf[0] = page; + buf[1] = len - 2; + } +} + +static int +bdev_scsi_mode_sense_page(struct spdk_bdev *bdev, + uint8_t *cdb, int pc, int page, int subpage, + uint8_t *data, struct spdk_scsi_task *task) +{ + uint8_t *cp = data; + int len = 0; + int plen; + int i; + + if (pc == 0x00) { + /* Current values */ + } else if (pc == 0x01) { + /* Changeable values */ + /* As we currently do not support changeable values, + all parameters are reported as zero. */ + } else if (pc == 0x02) { + /* Default values */ + } else { + /* Saved values not supported */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_SAVING_PARAMETERS_NOT_SUPPORTED, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + switch (page) { + case 0x00: + /* Vendor specific */ + break; + case 0x01: + /* Read-Write Error Recovery */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Read-Write Error Recovery\n"); + if (subpage != 0x00) { + break; + } + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x02: + /* Disconnect-Reconnect */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Disconnect-Reconnect\n"); + if (subpage != 0x00) { + break; + } + plen = 0x0e + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x03: + /* Obsolete (Format Device) */ + break; + case 0x04: + /* Obsolete (Rigid Disk Geometry) */ + break; + case 0x05: + /* Obsolete (Rigid Disk Geometry) */ + break; + case 0x06: + /* Reserved */ + break; + case 0x07: + /* Verify Error Recovery */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Verify Error Recovery\n"); + + if (subpage != 0x00) { + break; + } + + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x08: { + /* Caching */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "MODE_SENSE Caching\n"); + if (subpage != 0x00) { + break; + } + + plen = 0x12 + 2; + mode_sense_page_init(cp, plen, page, subpage); + + if (cp && spdk_bdev_has_write_cache(bdev) && pc != 0x01) { + cp[2] |= 0x4; /* WCE */ + } + + /* Read Cache Disable (RCD) = 1 */ + if (cp && pc != 0x01) { + cp[2] |= 0x1; + } + + len += plen; + break; + } + case 0x09: + /* Obsolete */ + break; + case 0x0a: + switch (subpage) { + case 0x00: + /* Control */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Control\n"); + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x01: + /* Control Extension */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Control Extension\n"); + plen = 0x1c + 4; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0xff: + /* All subpages */ + len += bdev_scsi_mode_sense_page(bdev, + cdb, pc, page, + 0x00, + cp ? 
&cp[len] : NULL, task); + len += bdev_scsi_mode_sense_page(bdev, + cdb, pc, page, + 0x01, + cp ? &cp[len] : NULL, task); + break; + default: + /* 0x02-0x3e: Reserved */ + break; + } + break; + case 0x0b: + /* Obsolete (Medium Types Supported) */ + break; + case 0x0c: + /* Obsolete (Notch And Partitio) */ + break; + case 0x0d: + /* Obsolete */ + break; + case 0x0e: + case 0x0f: + /* Reserved */ + break; + case 0x10: + /* XOR Control */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "MODE_SENSE XOR Control\n"); + if (subpage != 0x00) { + break; + } + plen = 0x16 + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x11: + case 0x12: + case 0x13: + /* Reserved */ + break; + case 0x14: + /* Enclosure Services Management */ + break; + case 0x15: + case 0x16: + case 0x17: + /* Reserved */ + break; + case 0x18: + /* Protocol-Specific LUN */ + break; + case 0x19: + /* Protocol-Specific Port */ + break; + case 0x1a: + /* Power Condition */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Power Condition\n"); + if (subpage != 0x00) { + break; + } + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x1b: + /* Reserved */ + break; + case 0x1c: + /* Informational Exceptions Control */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Informational Exceptions Control\n"); + if (subpage != 0x00) { + break; + } + + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x1d: + case 0x1e: + case 0x1f: + /* Reserved */ + break; + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2a: + case 0x2b: + case 0x2c: + case 0x2d: + case 0x2e: + case 0x2f: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3a: + case 0x3b: + case 0x3c: + case 0x3d: + case 0x3e: + /* Vendor-specific */ + break; + case 0x3f: + switch (subpage) { + case 0x00: + /* All mode pages */ + for (i = 0x00; i < 0x3e; i ++) { + len += bdev_scsi_mode_sense_page( + bdev, cdb, pc, i, 0x00, + cp ? &cp[len] : NULL, task); + } + break; + case 0xff: + /* All mode pages and subpages */ + for (i = 0x00; i < 0x3e; i ++) { + len += bdev_scsi_mode_sense_page( + bdev, cdb, pc, i, 0x00, + cp ? &cp[len] : NULL, task); + } + for (i = 0x00; i < 0x3e; i ++) { + len += bdev_scsi_mode_sense_page( + bdev, cdb, pc, i, 0xff, + cp ? &cp[len] : NULL, task); + } + break; + default: + /* 0x01-0x3e: Reserved */ + break; + } + } + + return len; +} + +static int +bdev_scsi_mode_sense(struct spdk_bdev *bdev, int md, + uint8_t *cdb, int dbd, int llbaa, int pc, + int page, int subpage, uint8_t *data, struct spdk_scsi_task *task) +{ + uint64_t num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_data_block_size(bdev); + uint8_t *hdr, *bdesc, *pages; + int hlen; + int blen; + int plen, total; + + assert(md == 6 || md == 10); + + if (md == 6) { + hlen = 4; + blen = 8; /* For MODE SENSE 6 only short LBA */ + } else { + hlen = 8; + blen = llbaa ? 16 : 8; + } + + if (dbd) { + blen = 0; + } + + pages = data ? 
&data[hlen + blen] : NULL; + plen = bdev_scsi_mode_sense_page(bdev, cdb, pc, page, + subpage, + pages, task); + if (plen < 0) { + return -1; + } + + total = hlen + blen + plen; + if (data == NULL) { + return total; + } + + hdr = &data[0]; + if (hlen == 4) { + hdr[0] = total - 1; /* Mode Data Length */ + hdr[1] = 0; /* Medium Type */ + hdr[2] = 0; /* Device-Specific Parameter */ + hdr[3] = blen; /* Block Descripter Length */ + } else { + to_be16(&hdr[0], total - 2); /* Mode Data Length */ + hdr[2] = 0; /* Medium Type */ + hdr[3] = 0; /* Device-Specific Parameter */ + hdr[4] = llbaa ? 0x1 : 0; /* Long/short LBA */ + hdr[5] = 0; /* Reserved */ + to_be16(&hdr[6], blen); /* Block Descripter Length */ + } + + bdesc = &data[hlen]; + if (blen == 16) { + /* Number of Blocks */ + to_be64(&bdesc[0], num_blocks); + /* Reserved */ + memset(&bdesc[8], 0, 4); + /* Block Length */ + to_be32(&bdesc[12], block_size); + } else if (blen == 8) { + /* Number of Blocks */ + if (num_blocks > 0xffffffffULL) { + memset(&bdesc[0], 0xff, 4); + } else { + to_be32(&bdesc[0], num_blocks); + } + + /* Block Length */ + to_be32(&bdesc[4], block_size); + } + + return total; +} + +static void +bdev_scsi_task_complete_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_scsi_task *task = cb_arg; + int sc, sk, asc, ascq; + + spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq); + + spdk_bdev_free_io(bdev_io); + + spdk_scsi_task_set_status(task, sc, sk, asc, ascq); + scsi_lun_complete_task(task->lun, task); +} + +static void +bdev_scsi_read_task_complete_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_scsi_task *task = cb_arg; + int sc, sk, asc, ascq; + + task->bdev_io = bdev_io; + + spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq); + + spdk_scsi_task_set_status(task, sc, sk, asc, ascq); + scsi_lun_complete_task(task->lun, task); +} + +static void +bdev_scsi_task_complete_reset(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_scsi_task *task = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (success) { + task->response = SPDK_SCSI_TASK_MGMT_RESP_SUCCESS; + } + + scsi_lun_complete_reset_task(task->lun, task); +} + +static void +bdev_scsi_queue_io(struct spdk_scsi_task *task, spdk_bdev_io_wait_cb cb_fn, void *cb_arg) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_bdev *bdev = lun->bdev; + struct spdk_io_channel *ch = lun->io_channel; + int rc; + + task->bdev_io_wait.bdev = bdev; + task->bdev_io_wait.cb_fn = cb_fn; + task->bdev_io_wait.cb_arg = cb_arg; + + rc = spdk_bdev_queue_io_wait(bdev, ch, &task->bdev_io_wait); + if (rc != 0) { + assert(false); + } +} + +static int +bdev_scsi_sync(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + uint64_t lba, uint32_t num_blocks) +{ + uint64_t bdev_num_blocks; + int rc; + + if (num_blocks == 0) { + return SPDK_SCSI_TASK_COMPLETE; + } + + bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + + if (lba >= bdev_num_blocks || num_blocks > bdev_num_blocks || + lba > (bdev_num_blocks - num_blocks)) { + SPDK_ERRLOG("end of media\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + rc = spdk_bdev_flush_blocks(bdev_desc, bdev_ch, lba, num_blocks, + bdev_scsi_task_complete_cmd, task); + + if (rc) { + if (rc == -ENOMEM) { + bdev_scsi_queue_io(task, 
bdev_scsi_process_block_resubmit, task); + return SPDK_SCSI_TASK_PENDING; + } + SPDK_ERRLOG("spdk_bdev_flush_blocks() failed\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + task->data_transferred = 0; + return SPDK_SCSI_TASK_PENDING; +} + +static uint64_t +_bytes_to_blocks(uint32_t block_size, uint64_t offset_bytes, uint64_t *offset_blocks, + uint64_t num_bytes, uint64_t *num_blocks) +{ + uint8_t shift_cnt; + + /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ + if (spdk_likely(spdk_u32_is_pow2(block_size))) { + shift_cnt = spdk_u32log2(block_size); + *offset_blocks = offset_bytes >> shift_cnt; + *num_blocks = num_bytes >> shift_cnt; + return (offset_bytes - (*offset_blocks << shift_cnt)) | + (num_bytes - (*num_blocks << shift_cnt)); + } else { + *offset_blocks = offset_bytes / block_size; + *num_blocks = num_bytes / block_size; + return (offset_bytes % block_size) | (num_bytes % block_size); + } +} + +static int +bdev_scsi_readwrite(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + uint64_t lba, uint32_t xfer_len, bool is_read) +{ + uint64_t bdev_num_blocks, offset_blocks, num_blocks; + uint32_t max_xfer_len, block_size; + int sk = SPDK_SCSI_SENSE_NO_SENSE, asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + int rc; + + task->data_transferred = 0; + + if (spdk_unlikely(task->dxfer_dir != SPDK_SCSI_DIR_NONE && + task->dxfer_dir != (is_read ? SPDK_SCSI_DIR_FROM_DEV : SPDK_SCSI_DIR_TO_DEV))) { + SPDK_ERRLOG("Incorrect data direction\n"); + goto check_condition; + } + + bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + if (spdk_unlikely(bdev_num_blocks <= lba || bdev_num_blocks - lba < xfer_len)) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "end of media\n"); + sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_ADDRESS_OUT_OF_RANGE; + goto check_condition; + } + + if (spdk_unlikely(xfer_len == 0)) { + task->status = SPDK_SCSI_STATUS_GOOD; + return SPDK_SCSI_TASK_COMPLETE; + } + + block_size = spdk_bdev_get_data_block_size(bdev); + + /* Transfer Length is limited to the Block Limits VPD page Maximum Transfer Length */ + max_xfer_len = SPDK_WORK_BLOCK_SIZE / block_size; + if (spdk_unlikely(xfer_len > max_xfer_len)) { + SPDK_ERRLOG("xfer_len %" PRIu32 " > maximum transfer length %" PRIu32 "\n", + xfer_len, max_xfer_len); + sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + goto check_condition; + } + + if (!is_read) { + /* Additional check for Transfer Length */ + if (xfer_len * block_size > task->transfer_len) { + SPDK_ERRLOG("xfer_len %" PRIu32 " * block_size %" PRIu32 " > transfer_len %u\n", + xfer_len, block_size, task->transfer_len); + goto check_condition; + } + } + + if (_bytes_to_blocks(block_size, task->offset, &offset_blocks, task->length, &num_blocks) != 0) { + SPDK_ERRLOG("task's offset %" PRIu64 " or length %" PRIu32 " is not block multiple\n", + task->offset, task->length); + goto check_condition; + } + + offset_blocks += lba; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "%s: lba=%"PRIu64", len=%"PRIu64"\n", + is_read ? 
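/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * _bytes_to_blocks() above leans on the usual power-of-two identities to avoid
 * 64-bit division on the hot path: when block_size == 1 << k,
 * bytes / block_size == bytes >> k and bytes % block_size == bytes & (block_size - 1).
 * The two remainders are OR-ed together so one non-zero check tells the caller
 * that either the offset or the length is not block aligned.  The same trick
 * in isolation (example_split_bytes is a hypothetical helper; block_size is
 * assumed non-zero):
 */
static inline uint64_t
example_split_bytes(uint32_t block_size, uint64_t bytes, uint64_t *blocks)
{
        if ((block_size & (block_size - 1)) == 0) {
                /* Power of two: shift/mask instead of divide/modulo. */
                uint32_t shift = __builtin_ctz(block_size);     /* log2(block_size) */

                *blocks = bytes >> shift;
                return bytes & (block_size - 1);                /* remainder */
        }
        *blocks = bytes / block_size;
        return bytes % block_size;
}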
"Read" : "Write", offset_blocks, num_blocks); + + if (is_read) { + rc = spdk_bdev_readv_blocks(bdev_desc, bdev_ch, task->iovs, task->iovcnt, + offset_blocks, num_blocks, + bdev_scsi_read_task_complete_cmd, task); + } else { + rc = spdk_bdev_writev_blocks(bdev_desc, bdev_ch, task->iovs, task->iovcnt, + offset_blocks, num_blocks, + bdev_scsi_task_complete_cmd, task); + } + + if (rc) { + if (rc == -ENOMEM) { + bdev_scsi_queue_io(task, bdev_scsi_process_block_resubmit, task); + return SPDK_SCSI_TASK_PENDING; + } + SPDK_ERRLOG("spdk_bdev_%s_blocks() failed\n", is_read ? "readv" : "writev"); + goto check_condition; + } + + task->data_transferred = task->length; + return SPDK_SCSI_TASK_PENDING; + +check_condition: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, sk, asc, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; +} + +struct spdk_bdev_scsi_unmap_ctx { + struct spdk_scsi_task *task; + struct spdk_scsi_unmap_bdesc desc[DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT]; + uint32_t count; +}; + +static int bdev_scsi_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + struct spdk_bdev_scsi_unmap_ctx *ctx); + +static void +bdev_scsi_task_complete_unmap_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_bdev_scsi_unmap_ctx *ctx = cb_arg; + struct spdk_scsi_task *task = ctx->task; + int sc, sk, asc, ascq; + + ctx->count--; + + task->bdev_io = bdev_io; + + if (task->status == SPDK_SCSI_STATUS_GOOD) { + spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq); + spdk_scsi_task_set_status(task, sc, sk, asc, ascq); + } + + if (ctx->count == 0) { + scsi_lun_complete_task(task->lun, task); + free(ctx); + } +} + +static int +__copy_desc(struct spdk_bdev_scsi_unmap_ctx *ctx, uint8_t *data, size_t data_len) +{ + uint16_t desc_data_len; + uint16_t desc_count; + + if (!data) { + return -EINVAL; + } + + if (data_len < 8) { + /* We can't even get the reported length, so fail. 
*/ + return -EINVAL; + } + + desc_data_len = from_be16(&data[2]); + desc_count = desc_data_len / 16; + + if (desc_data_len > (data_len - 8)) { + SPDK_ERRLOG("Error - desc_data_len (%u) > data_len (%lu) - 8\n", + desc_data_len, data_len); + return -EINVAL; + } + + if (desc_count > DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT) { + SPDK_ERRLOG("desc_count (%u) greater than max allowed (%u)\n", + desc_count, DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT); + return -EINVAL; + } + + memcpy(ctx->desc, &data[8], desc_data_len); + return desc_count; +} + +static void +bdev_scsi_unmap_resubmit(void *arg) +{ + struct spdk_bdev_scsi_unmap_ctx *ctx = arg; + struct spdk_scsi_task *task = ctx->task; + struct spdk_scsi_lun *lun = task->lun; + + bdev_scsi_unmap(lun->bdev, lun->bdev_desc, lun->io_channel, task, ctx); +} + +static int +bdev_scsi_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + struct spdk_bdev_scsi_unmap_ctx *ctx) +{ + uint8_t *data; + int i, desc_count = -1; + int data_len; + int rc; + + assert(task->status == SPDK_SCSI_STATUS_GOOD); + + if (ctx == NULL) { + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + ctx->task = task; + ctx->count = 0; + } + + + if (task->iovcnt == 1) { + data = (uint8_t *)task->iovs[0].iov_base; + data_len = task->iovs[0].iov_len; + desc_count = __copy_desc(ctx, data, data_len); + } else { + data = spdk_scsi_task_gather_data(task, &data_len); + if (data) { + desc_count = __copy_desc(ctx, data, data_len); + free(data); + } + } + + if (desc_count < 0) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + free(ctx); + return SPDK_SCSI_TASK_COMPLETE; + } + + for (i = ctx->count; i < desc_count; i++) { + struct spdk_scsi_unmap_bdesc *desc; + uint64_t offset_blocks; + uint64_t num_blocks; + + desc = &ctx->desc[i]; + + offset_blocks = from_be64(&desc->lba); + num_blocks = from_be32(&desc->block_count); + + if (num_blocks == 0) { + continue; + } + + ctx->count++; + rc = spdk_bdev_unmap_blocks(bdev_desc, bdev_ch, offset_blocks, num_blocks, + bdev_scsi_task_complete_unmap_cmd, ctx); + + if (rc) { + if (rc == -ENOMEM) { + bdev_scsi_queue_io(task, bdev_scsi_unmap_resubmit, ctx); + /* Unmap was not yet submitted to bdev */ + ctx->count--; + return SPDK_SCSI_TASK_PENDING; + } + SPDK_ERRLOG("SCSI Unmapping failed\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + ctx->count--; + /* We can't complete here - we may have to wait for previously + * submitted unmaps to complete */ + break; + } + } + + if (ctx->count == 0) { + free(ctx); + return SPDK_SCSI_TASK_COMPLETE; + } + + return SPDK_SCSI_TASK_PENDING; +} + +static int +bdev_scsi_process_block(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_bdev *bdev = lun->bdev; + uint64_t lba; + uint32_t xfer_len; + uint32_t len = 0; + uint8_t *cdb = task->cdb; + + /* XXX: We need to support FUA bit for writes! 
*/ + switch (cdb[0]) { + case SPDK_SBC_READ_6: + case SPDK_SBC_WRITE_6: + lba = (uint64_t)cdb[1] << 16; + lba |= (uint64_t)cdb[2] << 8; + lba |= (uint64_t)cdb[3]; + xfer_len = cdb[4]; + if (xfer_len == 0) { + xfer_len = 256; + } + return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel, + task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_6); + + case SPDK_SBC_READ_10: + case SPDK_SBC_WRITE_10: + lba = from_be32(&cdb[2]); + xfer_len = from_be16(&cdb[7]); + return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel, + task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_10); + + case SPDK_SBC_READ_12: + case SPDK_SBC_WRITE_12: + lba = from_be32(&cdb[2]); + xfer_len = from_be32(&cdb[6]); + return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel, + task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_12); + case SPDK_SBC_READ_16: + case SPDK_SBC_WRITE_16: + lba = from_be64(&cdb[2]); + xfer_len = from_be32(&cdb[10]); + return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel, + task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_16); + + case SPDK_SBC_READ_CAPACITY_10: { + uint64_t num_blocks = spdk_bdev_get_num_blocks(bdev); + uint8_t buffer[8]; + + if (num_blocks - 1 > 0xffffffffULL) { + memset(buffer, 0xff, 4); + } else { + to_be32(buffer, num_blocks - 1); + } + to_be32(&buffer[4], spdk_bdev_get_data_block_size(bdev)); + + len = spdk_min(task->length, sizeof(buffer)); + if (spdk_scsi_task_scatter_data(task, buffer, len) < 0) { + break; + } + + task->data_transferred = len; + task->status = SPDK_SCSI_STATUS_GOOD; + break; + } + + case SPDK_SPC_SERVICE_ACTION_IN_16: + switch (cdb[1] & 0x1f) { /* SERVICE ACTION */ + case SPDK_SBC_SAI_READ_CAPACITY_16: { + uint8_t buffer[32] = {0}; + + to_be64(&buffer[0], spdk_bdev_get_num_blocks(bdev) - 1); + to_be32(&buffer[8], spdk_bdev_get_data_block_size(bdev)); + /* + * Set the TPE bit to 1 to indicate thin provisioning. + * The position of TPE bit is the 7th bit in 14th byte + * in READ CAPACITY (16) parameter data. 
+ */ + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + buffer[14] |= 1 << 7; + } + + len = spdk_min(from_be32(&cdb[10]), sizeof(buffer)); + if (spdk_scsi_task_scatter_data(task, buffer, len) < 0) { + break; + } + + task->data_transferred = len; + task->status = SPDK_SCSI_STATUS_GOOD; + break; + } + + default: + return SPDK_SCSI_TASK_UNKNOWN; + } + break; + + case SPDK_SBC_SYNCHRONIZE_CACHE_10: + case SPDK_SBC_SYNCHRONIZE_CACHE_16: + if (cdb[0] == SPDK_SBC_SYNCHRONIZE_CACHE_10) { + lba = from_be32(&cdb[2]); + len = from_be16(&cdb[7]); + } else { + lba = from_be64(&cdb[2]); + len = from_be32(&cdb[10]); + } + + if (len == 0) { + len = spdk_bdev_get_num_blocks(bdev) - lba; + } + + return bdev_scsi_sync(bdev, lun->bdev_desc, lun->io_channel, task, lba, len); + break; + + case SPDK_SBC_UNMAP: + return bdev_scsi_unmap(bdev, lun->bdev_desc, lun->io_channel, task, NULL); + + default: + return SPDK_SCSI_TASK_UNKNOWN; + } + + return SPDK_SCSI_TASK_COMPLETE; +} + +static void +bdev_scsi_process_block_resubmit(void *arg) +{ + struct spdk_scsi_task *task = arg; + + bdev_scsi_process_block(task); +} + +static int +bdev_scsi_check_len(struct spdk_scsi_task *task, int len, int min_len) +{ + if (len >= min_len) { + return 0; + } + + /* INVALID FIELD IN CDB */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} + +static int +bdev_scsi_process_primary(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_bdev *bdev = lun->bdev; + int alloc_len = -1; + int data_len = -1; + uint8_t *cdb = task->cdb; + uint8_t *data = NULL; + int rc = 0; + int pllen, md = 0; + int llba; + int dbd, pc, page, subpage; + int cmd_parsed = 0; + + switch (cdb[0]) { + case SPDK_SPC_INQUIRY: + alloc_len = from_be16(&cdb[3]); + data_len = spdk_max(4096, alloc_len); + data = calloc(1, data_len); + assert(data != NULL); + rc = bdev_scsi_inquiry(bdev, task, cdb, data, data_len); + data_len = spdk_min(rc, data_len); + if (rc < 0) { + break; + } + + SPDK_LOGDUMP(SPDK_LOG_SCSI, "INQUIRY", data, data_len); + break; + + case SPDK_SPC_REPORT_LUNS: { + int sel; + + sel = cdb[2]; + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "sel=%x\n", sel); + + alloc_len = from_be32(&cdb[6]); + rc = bdev_scsi_check_len(task, alloc_len, 16); + if (rc < 0) { + break; + } + + data_len = spdk_max(4096, alloc_len); + data = calloc(1, data_len); + assert(data != NULL); + rc = bdev_scsi_report_luns(task->lun, sel, data, data_len); + data_len = rc; + if (rc < 0) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + SPDK_LOGDUMP(SPDK_LOG_SCSI, "REPORT LUNS", data, data_len); + break; + } + + case SPDK_SPC_MODE_SELECT_6: + case SPDK_SPC_MODE_SELECT_10: + if (cdb[0] == SPDK_SPC_MODE_SELECT_6) { + /* MODE_SELECT(6) must have at least a 4 byte header. */ + md = 4; + pllen = cdb[4]; + } else { + /* MODE_SELECT(10) must have at least an 8 byte header. 
*/ + md = 8; + pllen = from_be16(&cdb[7]); + } + + if (pllen == 0) { + break; + } + + rc = bdev_scsi_check_len(task, pllen, md); + if (rc < 0) { + break; + } + + data = spdk_scsi_task_gather_data(task, &rc); + if (rc < 0) { + break; + } + data_len = rc; + + rc = bdev_scsi_check_len(task, data_len, spdk_max(pllen, md)); + if (rc < 0) { + break; + } + + rc = pllen; + data_len = 0; + break; + + case SPDK_SPC_MODE_SENSE_6: + alloc_len = cdb[4]; + md = 6; + /* FALLTHROUGH */ + case SPDK_SPC_MODE_SENSE_10: + llba = 0; + + if (md == 0) { + alloc_len = from_be16(&cdb[7]); + llba = !!(cdb[1] & 0x10); + md = 10; + } + + dbd = !!(cdb[1] & 0x8); + pc = (cdb[2] & 0xc0) >> 6; + page = cdb[2] & 0x3f; + subpage = cdb[3]; + + /* First call with no buffer to discover needed buffer size */ + rc = bdev_scsi_mode_sense(bdev, md, + cdb, dbd, llba, pc, + page, subpage, + NULL, task); + if (rc < 0) { + break; + } + + data_len = rc; + data = calloc(1, data_len); + assert(data != NULL); + + /* First call with no buffer to discover needed buffer size */ + rc = bdev_scsi_mode_sense(bdev, md, + cdb, dbd, llba, pc, + page, subpage, + data, task); + if (rc < 0) { + /* INVALID FIELD IN CDB */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + break; + + case SPDK_SPC_REQUEST_SENSE: { + int desc; + int sk, asc, ascq; + + desc = cdb[1] & 0x1; + if (desc != 0) { + /* INVALID FIELD IN CDB */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + alloc_len = cdb[4]; + + /* NO ADDITIONAL SENSE INFORMATION */ + sk = SPDK_SCSI_SENSE_NO_SENSE; + asc = 0x00; + ascq = 0x00; + + spdk_scsi_task_build_sense_data(task, sk, asc, ascq); + + data_len = task->sense_data_len; + data = calloc(1, data_len); + assert(data != NULL); + memcpy(data, task->sense_data, data_len); + break; + } + + case SPDK_SPC_LOG_SELECT: + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "LOG_SELECT\n"); + cmd_parsed = 1; + /* FALLTHROUGH */ + case SPDK_SPC_LOG_SENSE: + if (!cmd_parsed) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "LOG_SENSE\n"); + } + + /* INVALID COMMAND OPERATION CODE */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + rc = -1; + break; + + case SPDK_SPC_TEST_UNIT_READY: + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "TEST_UNIT_READY\n"); + cmd_parsed = 1; + /* FALLTHROUGH */ + case SPDK_SBC_START_STOP_UNIT: + if (!cmd_parsed) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "START_STOP_UNIT\n"); + } + + rc = 0; + break; + + case SPDK_SPC_PERSISTENT_RESERVE_OUT: + pllen = from_be32(&cdb[5]); + rc = bdev_scsi_check_len(task, pllen, 24); + if (rc < 0) { + break; + } + + data = spdk_scsi_task_gather_data(task, &rc); + if (rc < 0) { + break; + } + data_len = rc; + if (data_len < 24) { + rc = -1; + break; + } + + rc = scsi_pr_out(task, cdb, data, data_len); + if (rc < 0) { + break; + } + rc = pllen; + data_len = 0; + break; + + case SPDK_SPC_PERSISTENT_RESERVE_IN: + alloc_len = from_be16(&cdb[7]); + data_len = alloc_len; + data = calloc(1, data_len); + assert(data != NULL); + rc = scsi_pr_in(task, cdb, data, data_len); + break; + + case SPDK_SPC2_RESERVE_6: + case SPDK_SPC2_RESERVE_10: + rc = scsi2_reserve(task, cdb); + if (rc == 0) { + if (cdb[0] == SPDK_SPC2_RESERVE_10) { + rc = 
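/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * The MODE SENSE handling above is a two-pass pattern: bdev_scsi_mode_sense()
 * is first called with data == NULL only to learn the full size of the mode
 * parameter list, the buffer is allocated, and the call is repeated to fill it
 * (the second call fills the buffer even though its comment is a copy of the
 * first).  The result is finally truncated to the ALLOCATION LENGTH from the
 * CDB, which is how the "report"-style SPC commands in this function behave in
 * general.  The shape of that pattern, condensed (all "example_" names are
 * hypothetical):
 */
typedef int (*example_fill_fn)(uint8_t *buf, void *ctx);       /* NULL buf => size only */

static int
example_size_then_fill(example_fill_fn fill, void *ctx, int alloc_len,
                       struct spdk_scsi_task *task)
{
        uint8_t *buf;
        int len;

        len = fill(NULL, ctx);                  /* pass 1: compute length */
        if (len <= 0) {
                return len;
        }
        buf = calloc(1, len);
        if (buf == NULL) {
                return -ENOMEM;
        }
        fill(buf, ctx);                         /* pass 2: build the data */
        /* Never return more than the initiator asked for. */
        len = spdk_min(len, alloc_len);
        spdk_scsi_task_scatter_data(task, buf, len);
        free(buf);
        return len;
}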
from_be16(&cdb[7]); + } + data_len = 0; + } + break; + + case SPDK_SPC2_RELEASE_6: + case SPDK_SPC2_RELEASE_10: + rc = scsi2_release(task); + break; + + default: + return SPDK_SCSI_TASK_UNKNOWN; + } + + if (rc >= 0 && data_len > 0) { + assert(alloc_len >= 0); + spdk_scsi_task_scatter_data(task, data, spdk_min(alloc_len, data_len)); + rc = spdk_min(data_len, alloc_len); + } + + if (rc >= 0) { + task->data_transferred = rc; + task->status = SPDK_SCSI_STATUS_GOOD; + } + + if (data) { + free(data); + } + + return SPDK_SCSI_TASK_COMPLETE; +} + +int +bdev_scsi_execute(struct spdk_scsi_task *task) +{ + int rc; + + if ((rc = bdev_scsi_process_block(task)) == SPDK_SCSI_TASK_UNKNOWN) { + if ((rc = bdev_scsi_process_primary(task)) == SPDK_SCSI_TASK_UNKNOWN) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "unsupported SCSI OP=0x%x\n", task->cdb[0]); + /* INVALID COMMAND OPERATION CODE */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + } + + return rc; +} + +static void +bdev_scsi_reset_resubmit(void *arg) +{ + struct spdk_scsi_task *task = arg; + + bdev_scsi_reset(task); +} + +void +bdev_scsi_reset(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + int rc; + + rc = spdk_bdev_reset(lun->bdev_desc, lun->io_channel, bdev_scsi_task_complete_reset, + task); + if (rc == -ENOMEM) { + bdev_scsi_queue_io(task, bdev_scsi_reset_resubmit, task); + } +} + +bool +bdev_scsi_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_scsi_task *task, + struct spdk_dif_ctx *dif_ctx) +{ + uint32_t ref_tag = 0, dif_check_flags = 0, data_offset; + uint8_t *cdb; + int rc; + + if (spdk_likely(spdk_bdev_get_md_size(bdev) == 0)) { + return false; + } + + cdb = task->cdb; + data_offset = task->offset; + + /* We use lower 32 bits of LBA as Reference. Tag */ + switch (cdb[0]) { + case SPDK_SBC_READ_6: + case SPDK_SBC_WRITE_6: + ref_tag = (uint32_t)cdb[1] << 16; + ref_tag |= (uint32_t)cdb[2] << 8; + ref_tag |= (uint32_t)cdb[3]; + break; + case SPDK_SBC_READ_10: + case SPDK_SBC_WRITE_10: + case SPDK_SBC_READ_12: + case SPDK_SBC_WRITE_12: + ref_tag = from_be32(&cdb[2]); + break; + case SPDK_SBC_READ_16: + case SPDK_SBC_WRITE_16: + ref_tag = (uint32_t)from_be64(&cdb[2]); + break; + default: + return false; + } + + if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) { + dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK; + } + + if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) { + dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK; + } + + rc = spdk_dif_ctx_init(dif_ctx, + spdk_bdev_get_block_size(bdev), + spdk_bdev_get_md_size(bdev), + spdk_bdev_is_md_interleaved(bdev), + spdk_bdev_is_dif_head_of_md(bdev), + spdk_bdev_get_dif_type(bdev), + dif_check_flags, + ref_tag, 0, 0, data_offset, 0); + + return (rc == 0) ? true : false; +} diff --git a/src/spdk/lib/scsi/scsi_internal.h b/src/spdk/lib/scsi/scsi_internal.h new file mode 100644 index 000000000..2da3a99a8 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_internal.h @@ -0,0 +1,214 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_SCSI_INTERNAL_H +#define SPDK_SCSI_INTERNAL_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/scsi.h" +#include "spdk/scsi_spec.h" +#include "spdk/trace.h" +#include "spdk/dif.h" + +#include "spdk_internal/log.h" + +enum { + SPDK_SCSI_TASK_UNKNOWN = -1, + SPDK_SCSI_TASK_COMPLETE, + SPDK_SCSI_TASK_PENDING, +}; + +struct spdk_scsi_port { + uint8_t is_used; + uint64_t id; + uint16_t index; + uint16_t transport_id_len; + char transport_id[SPDK_SCSI_MAX_TRANSPORT_ID_LENGTH]; + char name[SPDK_SCSI_PORT_MAX_NAME_LENGTH]; +}; + +/* Registrant with I_T nextus */ +struct spdk_scsi_pr_registrant { + uint64_t rkey; + uint16_t relative_target_port_id; + uint16_t transport_id_len; + char transport_id[SPDK_SCSI_MAX_TRANSPORT_ID_LENGTH]; + char initiator_port_name[SPDK_SCSI_PORT_MAX_NAME_LENGTH]; + char target_port_name[SPDK_SCSI_PORT_MAX_NAME_LENGTH]; + struct spdk_scsi_port *initiator_port; + struct spdk_scsi_port *target_port; + TAILQ_ENTRY(spdk_scsi_pr_registrant) link; +}; + +#define SCSI_SPC2_RESERVE 0x00000001U + +/* Reservation with LU_SCOPE */ +struct spdk_scsi_pr_reservation { + uint32_t flags; + struct spdk_scsi_pr_registrant *holder; + enum spdk_scsi_pr_type_code rtype; + uint64_t crkey; +}; + +struct spdk_scsi_dev { + int id; + int is_allocated; + bool removed; + spdk_scsi_dev_destruct_cb_t remove_cb; + void *remove_ctx; + + char name[SPDK_SCSI_DEV_MAX_NAME + 1]; + + struct spdk_scsi_lun *lun[SPDK_SCSI_DEV_MAX_LUN]; + + int num_ports; + struct spdk_scsi_port port[SPDK_SCSI_DEV_MAX_PORTS]; + + uint8_t protocol_id; +}; + +struct spdk_scsi_lun_desc { + struct spdk_scsi_lun *lun; + spdk_scsi_lun_remove_cb_t hotremove_cb; + void *hotremove_ctx; + TAILQ_ENTRY(spdk_scsi_lun_desc) link; +}; + +struct spdk_scsi_lun { + /** LUN id for this logical unit. */ + int id; + + /** Pointer to the SCSI device containing this LUN. */ + struct spdk_scsi_dev *dev; + + /** The bdev associated with this LUN. */ + struct spdk_bdev *bdev; + + /** Descriptor for opened block device. 
*/ + struct spdk_bdev_desc *bdev_desc; + + /** The thread which opens this LUN. */ + struct spdk_thread *thread; + + /** I/O channel for the bdev associated with this LUN. */ + struct spdk_io_channel *io_channel; + + /** The reference number for this LUN, thus we can correctly free the io_channel */ + uint32_t ref; + + /** Poller to release the resource of the lun when it is hot removed */ + struct spdk_poller *hotremove_poller; + + /** The LUN is removed */ + bool removed; + + /** Callback to be fired when LUN removal is first triggered. */ + void (*hotremove_cb)(const struct spdk_scsi_lun *lun, void *arg); + + /** Argument for hotremove_cb */ + void *hotremove_ctx; + + /** Registrant head for I_T nexus */ + TAILQ_HEAD(, spdk_scsi_pr_registrant) reg_head; + /** Persistent Reservation Generation */ + uint32_t pr_generation; + /** Reservation for the LUN */ + struct spdk_scsi_pr_reservation reservation; + /** Reservation holder for SPC2 RESERVE(6) and RESERVE(10) */ + struct spdk_scsi_pr_registrant scsi2_holder; + + /** List of open descriptors for this LUN. */ + TAILQ_HEAD(, spdk_scsi_lun_desc) open_descs; + + /** submitted tasks */ + TAILQ_HEAD(tasks, spdk_scsi_task) tasks; + + /** pending tasks */ + TAILQ_HEAD(pending_tasks, spdk_scsi_task) pending_tasks; + + /** submitted management tasks */ + TAILQ_HEAD(mgmt_tasks, spdk_scsi_task) mgmt_tasks; + + /** pending management tasks */ + TAILQ_HEAD(pending_mgmt_tasks, spdk_scsi_task) pending_mgmt_tasks; + + /** poller to check completion of tasks prior to reset */ + struct spdk_poller *reset_poller; +}; + +struct spdk_scsi_lun *scsi_lun_construct(struct spdk_bdev *bdev, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx); +void scsi_lun_destruct(struct spdk_scsi_lun *lun); + +void scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +void scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +bool scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun, + const struct spdk_scsi_port *initiator_port); +void scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +void scsi_lun_complete_reset_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +bool scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun, + const struct spdk_scsi_port *initiator_port); +int scsi_lun_allocate_io_channel(struct spdk_scsi_lun *lun); +void scsi_lun_free_io_channel(struct spdk_scsi_lun *lun); + +struct spdk_scsi_dev *scsi_dev_get_list(void); + +int scsi_port_construct(struct spdk_scsi_port *port, uint64_t id, + uint16_t index, const char *name); +void scsi_port_destruct(struct spdk_scsi_port *port); + +int bdev_scsi_execute(struct spdk_scsi_task *task); +void bdev_scsi_reset(struct spdk_scsi_task *task); + +bool bdev_scsi_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_scsi_task *task, + struct spdk_dif_ctx *dif_ctx); + +int scsi_pr_out(struct spdk_scsi_task *task, uint8_t *cdb, uint8_t *data, uint16_t data_len); +int scsi_pr_in(struct spdk_scsi_task *task, uint8_t *cdb, uint8_t *data, uint16_t data_len); +int scsi_pr_check(struct spdk_scsi_task *task); + +int scsi2_reserve(struct spdk_scsi_task *task, uint8_t *cdb); +int scsi2_release(struct spdk_scsi_task *task); +int scsi2_reserve_check(struct spdk_scsi_task *task); + +struct spdk_scsi_globals { + pthread_mutex_t mutex; +}; + +extern struct spdk_scsi_globals g_scsi; + +#endif /* SPDK_SCSI_INTERNAL_H */ diff --git a/src/spdk/lib/scsi/scsi_pr.c b/src/spdk/lib/scsi/scsi_pr.c new file 
mode 100644 index 000000000..4e17cc2c6 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_pr.c @@ -0,0 +1,1067 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +#include "spdk/endian.h" + +/* Get registrant by I_T nexus */ +static struct spdk_scsi_pr_registrant * +scsi_pr_get_registrant(struct spdk_scsi_lun *lun, + struct spdk_scsi_port *initiator_port, + struct spdk_scsi_port *target_port) +{ + struct spdk_scsi_pr_registrant *reg, *tmp; + + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + if (initiator_port == reg->initiator_port && + target_port == reg->target_port) { + return reg; + } + } + + return NULL; +} + +static bool +scsi2_it_nexus_is_holder(struct spdk_scsi_lun *lun, + struct spdk_scsi_port *initiator_port, + struct spdk_scsi_port *target_port) +{ + struct spdk_scsi_pr_registrant *reg = lun->reservation.holder; + + assert(reg != NULL); + + if ((reg->initiator_port == initiator_port) && + (reg->target_port == target_port)) { + return true; + } + + return false; +} + +/* Reservation type is all registrants or not */ +static inline bool +scsi_pr_is_all_registrants_type(struct spdk_scsi_lun *lun) +{ + return (lun->reservation.rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS || + lun->reservation.rtype == SPDK_SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS); +} + +/* Registrant is reservation holder or not */ +static inline bool +scsi_pr_registrant_is_holder(struct spdk_scsi_lun *lun, + struct spdk_scsi_pr_registrant *reg) +{ + if (scsi_pr_is_all_registrants_type(lun)) { + return true; + } + + return (lun->reservation.holder == reg); +} + +/* LUN holds a reservation or not */ +static inline bool +scsi_pr_has_reservation(struct spdk_scsi_lun *lun) +{ + return !(lun->reservation.holder == NULL); +} + +static int +scsi_pr_register_registrant(struct spdk_scsi_lun *lun, + struct spdk_scsi_port *initiator_port, + struct spdk_scsi_port *target_port, + uint64_t sa_rkey) +{ + struct spdk_scsi_pr_registrant *reg; + + /* Register sa_rkey with 
the I_T nexus */ + reg = calloc(1, sizeof(*reg)); + if (!reg) { + return -ENOMEM; + } + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: new registrant registered " + "with key 0x%"PRIx64"\n", sa_rkey); + + /* New I_T nexus */ + reg->initiator_port = initiator_port; + if (initiator_port) { + snprintf(reg->initiator_port_name, sizeof(reg->initiator_port_name), "%s", + initiator_port->name); + reg->transport_id_len = initiator_port->transport_id_len; + memcpy(reg->transport_id, initiator_port->transport_id, reg->transport_id_len); + } + reg->target_port = target_port; + if (target_port) { + snprintf(reg->target_port_name, sizeof(reg->target_port_name), "%s", + target_port->name); + reg->relative_target_port_id = target_port->index; + } + reg->rkey = sa_rkey; + TAILQ_INSERT_TAIL(&lun->reg_head, reg, link); + lun->pr_generation++; + + return 0; +} + +static void +scsi_pr_release_reservation(struct spdk_scsi_lun *lun, struct spdk_scsi_pr_registrant *reg) +{ + bool all_regs = false; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: release reservation " + "with type %u\n", lun->reservation.rtype); + + /* TODO: Unit Attention */ + all_regs = scsi_pr_is_all_registrants_type(lun); + if (all_regs && !TAILQ_EMPTY(&lun->reg_head)) { + lun->reservation.holder = TAILQ_FIRST(&lun->reg_head); + return; + } + + memset(&lun->reservation, 0, sizeof(struct spdk_scsi_pr_reservation)); +} + +static void +scsi_pr_reserve_reservation(struct spdk_scsi_lun *lun, + enum spdk_scsi_pr_type_code type, + uint64_t rkey, + struct spdk_scsi_pr_registrant *holder) +{ + lun->reservation.rtype = type; + lun->reservation.crkey = rkey; + lun->reservation.holder = holder; +} + +static void +scsi_pr_unregister_registrant(struct spdk_scsi_lun *lun, + struct spdk_scsi_pr_registrant *reg) +{ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: unregister registrant\n"); + + TAILQ_REMOVE(&lun->reg_head, reg, link); + if (scsi_pr_registrant_is_holder(lun, reg)) { + scsi_pr_release_reservation(lun, reg); + } + + free(reg); + lun->pr_generation++; +} + +static void +scsi_pr_replace_registrant_key(struct spdk_scsi_lun *lun, + struct spdk_scsi_pr_registrant *reg, + uint64_t sa_rkey) +{ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: replace with new " + "reservation key 0x%"PRIx64"\n", sa_rkey); + reg->rkey = sa_rkey; + lun->pr_generation++; +} + +static int +scsi_pr_out_reserve(struct spdk_scsi_task *task, + enum spdk_scsi_pr_type_code rtype, uint64_t rkey, + uint8_t spec_i_pt, uint8_t all_tg_pt, uint8_t aptpl) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT RESERVE: rkey 0x%"PRIx64", requested " + "reservation type %u, type %u\n", rkey, rtype, lun->reservation.rtype); + + /* TODO: don't support now */ + if (spec_i_pt || all_tg_pt || aptpl) { + SPDK_ERRLOG("Unspported spec_i_pt/all_tg_pt fields " + "or invalid aptpl field\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; + } + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + /* No registration for the I_T nexus */ + if (!reg) { + SPDK_ERRLOG("No registration\n"); + goto conflict; + } + + /* invalid reservation key */ + if (reg->rkey != rkey) { + SPDK_ERRLOG("Reservation key 0x%"PRIx64" don't match 0x%"PRIx64"\n", + rkey, reg->rkey); + goto conflict; + } + + /* reservation holder already exists */ + if (scsi_pr_has_reservation(lun)) { + if (rtype != 
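/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * The per-LUN persistent reservation state reduces to a registrant list (one
 * entry per I_T nexus, keyed by initiator and target port) plus at most one
 * reservation.  For the two "all registrants" types every registrant counts as
 * a holder; otherwise only reservation.holder does.  The RESERVE service
 * action handled above therefore boils down to this decision (hypothetical
 * helper, mirroring the checks in scsi_pr_out_reserve()):
 */
static inline bool
example_reserve_allowed(const struct spdk_scsi_pr_reservation *res,
                        const struct spdk_scsi_pr_registrant *reg,
                        enum spdk_scsi_pr_type_code rtype, uint64_t rkey)
{
        bool all_regs;

        if (reg == NULL || reg->rkey != rkey) {
                return false;           /* unregistered nexus or wrong key: conflict */
        }
        if (res->holder == NULL) {
                return true;            /* no reservation yet: this nexus may take it */
        }
        all_regs = (res->rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS ||
                    res->rtype == SPDK_SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS);
        /* Re-reserving is only allowed for the current holder, with the same type. */
        return (all_regs || res->holder == reg) && res->rtype == rtype;
}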
lun->reservation.rtype) { + SPDK_ERRLOG("Reservation type doesn't match\n"); + goto conflict; + } + + if (!scsi_pr_registrant_is_holder(lun, reg)) { + SPDK_ERRLOG("Only 1 holder is allowed for type %u\n", rtype); + goto conflict; + } + } else { + /* current I_T nexus is the first reservation holder */ + scsi_pr_reserve_reservation(lun, rtype, rkey, reg); + } + + return 0; + +conflict: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +static int +scsi_pr_out_register(struct spdk_scsi_task *task, + enum spdk_scsi_pr_out_service_action_code action, + uint64_t rkey, uint64_t sa_rkey, + uint8_t spec_i_pt, uint8_t all_tg_pt, uint8_t aptpl) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + int sc, sk, asc; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT REGISTER: rkey 0x%"PRIx64", " + "sa_key 0x%"PRIx64", reservation type %u\n", rkey, sa_rkey, lun->reservation.rtype); + + /* TODO: don't support now */ + if (spec_i_pt || all_tg_pt || aptpl) { + SPDK_ERRLOG("Unsupported spec_i_pt/all_tg_pt/aptpl field\n"); + sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + goto error_exit; + } + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + /* an unregistered I_T nexus session */ + if (!reg) { + if (rkey && (action == SPDK_SCSI_PR_OUT_REGISTER)) { + SPDK_ERRLOG("Reservation key field is not empty\n"); + sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT; + sk = SPDK_SCSI_SENSE_NO_SENSE; + asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + goto error_exit; + } + + if (!sa_rkey) { + /* Do nothing except return GOOD status */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: service action " + "reservation key is zero, do noting\n"); + return 0; + } + /* Add a new registrant for the I_T nexus */ + return scsi_pr_register_registrant(lun, task->initiator_port, + task->target_port, sa_rkey); + } else { + /* a registered I_T nexus */ + if (rkey != reg->rkey && action == SPDK_SCSI_PR_OUT_REGISTER) { + SPDK_ERRLOG("Reservation key 0x%"PRIx64" don't match " + "registrant's key 0x%"PRIx64"\n", rkey, reg->rkey); + sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT; + sk = SPDK_SCSI_SENSE_NO_SENSE; + asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + goto error_exit; + } + + if (!sa_rkey) { + /* unregister */ + scsi_pr_unregister_registrant(lun, reg); + } else { + /* replace */ + scsi_pr_replace_registrant_key(lun, reg, sa_rkey); + } + } + + return 0; + +error_exit: + spdk_scsi_task_set_status(task, sc, sk, asc, SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE); + return -EINVAL; +} + +static int +scsi_pr_out_release(struct spdk_scsi_task *task, + enum spdk_scsi_pr_type_code rtype, uint64_t rkey) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + int sk, asc; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT RELEASE: rkey 0x%"PRIx64", " + "reservation type %u\n", rkey, rtype); + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (!reg) { + SPDK_ERRLOG("No registration\n"); + sk = SPDK_SCSI_SENSE_NOT_READY; + asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + goto check_condition; + } + + /* no reservation holder */ + if (!scsi_pr_has_reservation(lun)) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "RELEASE: no reservation holder\n"); + return 0; + } + + if (lun->reservation.rtype != rtype || rkey != lun->reservation.crkey) { + sk = 
SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + goto check_condition; + } + + /* I_T nexus is not a persistent reservation holder */ + if (!scsi_pr_registrant_is_holder(lun, reg)) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "RELEASE: current I_T nexus is not holder\n"); + return 0; + } + + scsi_pr_release_reservation(lun, reg); + + return 0; + +check_condition: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, sk, asc, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +static int +scsi_pr_out_clear(struct spdk_scsi_task *task, uint64_t rkey) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg, *tmp; + int sc, sk, asc; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT CLEAR: rkey 0x%"PRIx64"\n", rkey); + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (!reg) { + SPDK_ERRLOG("No registration\n"); + sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + sk = SPDK_SCSI_SENSE_NOT_READY; + asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + goto error_exit; + } + + if (rkey != reg->rkey) { + SPDK_ERRLOG("Reservation key 0x%"PRIx64" doesn't match " + "registrant's key 0x%"PRIx64"\n", rkey, reg->rkey); + sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT; + sk = SPDK_SCSI_SENSE_NO_SENSE; + asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + goto error_exit; + } + + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + scsi_pr_unregister_registrant(lun, reg); + } + + return 0; + +error_exit: + spdk_scsi_task_set_status(task, sc, sk, asc, SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +static void +scsi_pr_remove_all_regs_by_key(struct spdk_scsi_lun *lun, uint64_t sa_rkey) +{ + struct spdk_scsi_pr_registrant *reg, *tmp; + + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + if (reg->rkey == sa_rkey) { + scsi_pr_unregister_registrant(lun, reg); + } + } +} + +static void +scsi_pr_remove_all_other_regs(struct spdk_scsi_lun *lun, struct spdk_scsi_pr_registrant *reg) +{ + struct spdk_scsi_pr_registrant *reg_tmp, *reg_tmp2; + + TAILQ_FOREACH_SAFE(reg_tmp, &lun->reg_head, link, reg_tmp2) { + if (reg_tmp != reg) { + scsi_pr_unregister_registrant(lun, reg_tmp); + } + } +} + +static int +scsi_pr_out_preempt(struct spdk_scsi_task *task, + enum spdk_scsi_pr_out_service_action_code action, + enum spdk_scsi_pr_type_code rtype, + uint64_t rkey, uint64_t sa_rkey) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + bool all_regs = false; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT PREEMPT: rkey 0x%"PRIx64", sa_rkey 0x%"PRIx64" " + "action %u, type %u, reservation type %u\n", + rkey, sa_rkey, action, rtype, lun->reservation.rtype); + + /* I_T nexus is not registered */ + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (!reg) { + SPDK_ERRLOG("No registration\n"); + goto conflict; + } + if (rkey != reg->rkey) { + SPDK_ERRLOG("Reservation key 0x%"PRIx64" doesn't match " + "registrant's key 0x%"PRIx64"\n", rkey, reg->rkey); + goto conflict; + } + + /* no persistent reservation */ + if (!scsi_pr_has_reservation(lun)) { + scsi_pr_remove_all_regs_by_key(lun, sa_rkey); + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: no persistent reservation\n"); + goto exit; + } + + all_regs = scsi_pr_is_all_registrants_type(lun); + + if (all_regs) { + if (sa_rkey != 0) { + scsi_pr_remove_all_regs_by_key(lun, sa_rkey); + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: All registrants type with sa_rkey\n"); + } else { + /* remove all other registrants and release persistent reservation if any */ + 
scsi_pr_remove_all_other_regs(lun, reg); + /* create persistent reservation using new type and scope */ + scsi_pr_reserve_reservation(lun, rtype, 0, reg); + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: All registrants type with sa_rkey zeroed\n"); + } + goto exit; + } + + assert(lun->reservation.crkey != 0); + + if (sa_rkey != lun->reservation.crkey) { + if (!sa_rkey) { + SPDK_ERRLOG("Zeroed sa_rkey\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; + } + scsi_pr_remove_all_regs_by_key(lun, sa_rkey); + goto exit; + } + + if (scsi_pr_registrant_is_holder(lun, reg)) { + scsi_pr_reserve_reservation(lun, rtype, rkey, reg); + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: preempt itself with type %u\n", rtype); + goto exit; + } + + /* unregister registrants if any */ + scsi_pr_remove_all_regs_by_key(lun, sa_rkey); + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (!reg) { + SPDK_ERRLOG("Current I_T nexus registrant was removed\n"); + goto conflict; + } + + /* preempt the holder */ + scsi_pr_reserve_reservation(lun, rtype, rkey, reg); + +exit: + lun->pr_generation++; + return 0; + +conflict: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +int +scsi_pr_out(struct spdk_scsi_task *task, uint8_t *cdb, + uint8_t *data, uint16_t data_len) +{ + int rc = -1; + uint64_t rkey, sa_rkey; + uint8_t spec_i_pt, all_tg_pt, aptpl; + enum spdk_scsi_pr_out_service_action_code action; + enum spdk_scsi_pr_scope_code scope; + enum spdk_scsi_pr_type_code rtype; + struct spdk_scsi_pr_out_param_list *param = (struct spdk_scsi_pr_out_param_list *)data; + + action = cdb[1] & 0x0f; + scope = (cdb[2] >> 4) & 0x0f; + rtype = cdb[2] & 0x0f; + + rkey = from_be64(¶m->rkey); + sa_rkey = from_be64(¶m->sa_rkey); + aptpl = param->aptpl; + spec_i_pt = param->spec_i_pt; + all_tg_pt = param->all_tg_pt; + + switch (action) { + case SPDK_SCSI_PR_OUT_REGISTER: + case SPDK_SCSI_PR_OUT_REG_AND_IGNORE_KEY: + rc = scsi_pr_out_register(task, action, rkey, sa_rkey, + spec_i_pt, all_tg_pt, aptpl); + break; + case SPDK_SCSI_PR_OUT_RESERVE: + if (scope != SPDK_SCSI_PR_LU_SCOPE) { + goto invalid; + } + rc = scsi_pr_out_reserve(task, rtype, rkey, + spec_i_pt, all_tg_pt, aptpl); + break; + case SPDK_SCSI_PR_OUT_RELEASE: + if (scope != SPDK_SCSI_PR_LU_SCOPE) { + goto invalid; + } + rc = scsi_pr_out_release(task, rtype, rkey); + break; + case SPDK_SCSI_PR_OUT_CLEAR: + rc = scsi_pr_out_clear(task, rkey); + break; + case SPDK_SCSI_PR_OUT_PREEMPT: + if (scope != SPDK_SCSI_PR_LU_SCOPE) { + goto invalid; + } + rc = scsi_pr_out_preempt(task, action, rtype, rkey, sa_rkey); + break; + default: + SPDK_ERRLOG("Invalid service action code %u\n", action); + goto invalid; + } + + return rc; + +invalid: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +static int +scsi_pr_in_read_keys(struct spdk_scsi_task *task, uint8_t *data, + uint16_t data_len) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_in_read_keys_data *keys; + struct spdk_scsi_pr_registrant *reg, *tmp; + uint16_t count = 0; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN READ KEYS\n"); + keys = (struct 
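/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * scsi_pr_out() below relies on struct spdk_scsi_pr_out_param_list to mirror
 * the SPC "PERSISTENT RESERVE OUT parameter list".  Decoded by hand from the
 * raw 24-byte buffer, the same fields sit at the positions sketched here
 * (byte/bit offsets per SPC-4; the authoritative layout is the struct in
 * scsi_spec.h, and "example_" names are hypothetical):
 */
struct example_pr_out_params {
        uint64_t rkey;          /* bytes 0-7:   RESERVATION KEY */
        uint64_t sa_rkey;       /* bytes 8-15:  SERVICE ACTION RESERVATION KEY */
        bool     spec_i_pt;     /* byte 20, bit 3 */
        bool     all_tg_pt;     /* byte 20, bit 2 */
        bool     aptpl;         /* byte 20, bit 0 */
};

static void
example_parse_pr_out(const uint8_t *data /* at least 24 bytes */,
                     struct example_pr_out_params *p)
{
        p->rkey      = from_be64(&data[0]);
        p->sa_rkey   = from_be64(&data[8]);
        p->spec_i_pt = (data[20] >> 3) & 0x1;
        p->all_tg_pt = (data[20] >> 2) & 0x1;
        p->aptpl     = data[20] & 0x1;
}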
spdk_scsi_pr_in_read_keys_data *)data; + + to_be32(&keys->header.pr_generation, lun->pr_generation); + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + if (((count + 1) * 8 + sizeof(keys->header)) > data_len) { + break; + } + to_be64(&keys->rkeys[count], reg->rkey); + count++; + } + to_be32(&keys->header.additional_len, count * 8); + + return (sizeof(keys->header) + count * 8); +} + +static int +scsi_pr_in_read_reservations(struct spdk_scsi_task *task, + uint8_t *data, uint16_t data_len) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_in_read_reservations_data *param; + bool all_regs = false; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN READ RESERVATIONS\n"); + param = (struct spdk_scsi_pr_in_read_reservations_data *)(data); + + to_be32(¶m->header.pr_generation, lun->pr_generation); + if (scsi_pr_has_reservation(lun)) { + all_regs = scsi_pr_is_all_registrants_type(lun); + if (all_regs) { + to_be64(¶m->rkey, 0); + } else { + to_be64(¶m->rkey, lun->reservation.crkey); + } + to_be32(¶m->header.additional_len, 16); + param->scope = SPDK_SCSI_PR_LU_SCOPE; + param->type = lun->reservation.rtype; + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "READ RESERVATIONS with valid reservation\n"); + return sizeof(*param); + } + + /* no reservation */ + to_be32(¶m->header.additional_len, 0); + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "READ RESERVATIONS no reservation\n"); + return sizeof(param->header); +} + +static int +scsi_pr_in_report_capabilities(struct spdk_scsi_task *task, + uint8_t *data, uint16_t data_len) +{ + struct spdk_scsi_pr_in_report_capabilities_data *param; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN REPORT CAPABILITIES\n"); + param = (struct spdk_scsi_pr_in_report_capabilities_data *)data; + + memset(param, 0, sizeof(*param)); + to_be16(¶m->length, sizeof(*param)); + /* Compatible reservation handling to support RESERVE/RELEASE defined in SPC-2 */ + param->crh = 1; + param->tmv = 1; + param->wr_ex = 1; + param->ex_ac = 1; + param->wr_ex_ro = 1; + param->ex_ac_ro = 1; + param->wr_ex_ar = 1; + param->ex_ac_ar = 1; + + return sizeof(*param); +} + +static int +scsi_pr_in_read_full_status(struct spdk_scsi_task *task, + uint8_t *data, uint16_t data_len) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_in_full_status_data *param; + struct spdk_scsi_pr_in_full_status_desc *desc; + struct spdk_scsi_pr_registrant *reg, *tmp; + bool all_regs = false; + uint32_t add_len = 0; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN READ FULL STATUS\n"); + + all_regs = scsi_pr_is_all_registrants_type(lun); + param = (struct spdk_scsi_pr_in_full_status_data *)data; + to_be32(¶m->header.pr_generation, lun->pr_generation); + + TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { + desc = (struct spdk_scsi_pr_in_full_status_desc *) + ((uint8_t *)param->desc_list + add_len); + if (add_len + sizeof(*desc) + sizeof(param->header) > data_len) { + break; + } + add_len += sizeof(*desc); + desc->rkey = reg->rkey; + if (all_regs || lun->reservation.holder == reg) { + desc->r_holder = true; + desc->type = lun->reservation.rtype; + } else { + desc->r_holder = false; + desc->type = 0; + } + desc->all_tg_pt = 0; + desc->scope = SPDK_SCSI_PR_LU_SCOPE; + desc->relative_target_port_id = reg->relative_target_port_id; + if (add_len + reg->transport_id_len + sizeof(param->header) > data_len) { + break; + } + add_len += reg->transport_id_len; + memcpy(&desc->transport_id, reg->transport_id, reg->transport_id_len); + to_be32(&desc->desc_len, reg->transport_id_len); + } + to_be32(¶m->header.additional_len, add_len); + + return 
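/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * Every PERSISTENT RESERVE IN response built in these functions starts with
 * the same 8-byte header: a 4-byte PRGENERATION counter followed by a 4-byte
 * ADDITIONAL LENGTH that counts only the payload after the header.  For READ
 * KEYS the payload is simply an array of 8-byte reservation keys, so an
 * initiator-side decoder could look like this (hypothetical helper, shown for
 * orientation only):
 */
static int
example_decode_read_keys(const uint8_t *resp, size_t resp_len,
                         uint64_t *keys, size_t max_keys)
{
        uint32_t add_len, nkeys, i;

        if (resp_len < 8) {
                return -EINVAL;                 /* header missing */
        }
        add_len = from_be32(&resp[4]);          /* bytes following the 8-byte header */
        nkeys = add_len / 8;
        for (i = 0; i < nkeys && i < max_keys; i++) {
                keys[i] = from_be64(&resp[8 + i * 8]);
        }
        return (int)i;                          /* number of keys decoded */
}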
(sizeof(param->header) + add_len); +} + +int +scsi_pr_in(struct spdk_scsi_task *task, uint8_t *cdb, + uint8_t *data, uint16_t data_len) +{ + enum spdk_scsi_pr_in_action_code action; + int rc = 0; + + action = cdb[1] & 0x1f; + if (data_len < sizeof(struct spdk_scsi_pr_in_read_header)) { + goto invalid; + } + + switch (action) { + case SPDK_SCSI_PR_IN_READ_KEYS: + rc = scsi_pr_in_read_keys(task, data, data_len); + break; + case SPDK_SCSI_PR_IN_READ_RESERVATION: + if (data_len < sizeof(struct spdk_scsi_pr_in_read_reservations_data)) { + goto invalid; + } + rc = scsi_pr_in_read_reservations(task, data, data_len); + break; + case SPDK_SCSI_PR_IN_REPORT_CAPABILITIES: + rc = scsi_pr_in_report_capabilities(task, data, data_len); + break; + case SPDK_SCSI_PR_IN_READ_FULL_STATUS: + rc = scsi_pr_in_read_full_status(task, data, data_len); + break; + default: + goto invalid; + } + + return rc; + +invalid: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -EINVAL; +} + +int +scsi_pr_check(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + uint8_t *cdb = task->cdb; + enum spdk_scsi_pr_type_code rtype; + enum spdk_scsi_pr_out_service_action_code action; + struct spdk_scsi_pr_registrant *reg; + bool dma_to_device = false; + + /* no reservation holders */ + if (!scsi_pr_has_reservation(lun)) { + return 0; + } + + rtype = lun->reservation.rtype; + assert(rtype != 0); + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + /* current I_T nexus hold the reservation */ + if (scsi_pr_registrant_is_holder(lun, reg)) { + return 0; + } + + /* reservation is held by other I_T nexus */ + switch (cdb[0]) { + case SPDK_SPC_INQUIRY: + case SPDK_SPC_REPORT_LUNS: + case SPDK_SPC_REQUEST_SENSE: + case SPDK_SPC_LOG_SENSE: + case SPDK_SPC_TEST_UNIT_READY: + case SPDK_SBC_START_STOP_UNIT: + case SPDK_SBC_READ_CAPACITY_10: + case SPDK_SPC_PERSISTENT_RESERVE_IN: + case SPDK_SPC_SERVICE_ACTION_IN_16: + /* CRH enabled, processed by scsi2_reserve() */ + case SPDK_SPC2_RESERVE_6: + case SPDK_SPC2_RESERVE_10: + /* CRH enabled, processed by scsi2_release() */ + case SPDK_SPC2_RELEASE_6: + case SPDK_SPC2_RELEASE_10: + return 0; + case SPDK_SPC_MODE_SELECT_6: + case SPDK_SPC_MODE_SELECT_10: + case SPDK_SPC_MODE_SENSE_6: + case SPDK_SPC_MODE_SENSE_10: + case SPDK_SPC_LOG_SELECT: + /* I_T nexus is registrant but not holder */ + if (!reg) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "CHECK: current I_T nexus " + "is not registered, cdb 0x%x\n", cdb[0]); + goto conflict; + } + return 0; + case SPDK_SPC_PERSISTENT_RESERVE_OUT: + action = cdb[1] & 0x1f; + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "CHECK: PR OUT action %u\n", action); + switch (action) { + case SPDK_SCSI_PR_OUT_RELEASE: + case SPDK_SCSI_PR_OUT_CLEAR: + case SPDK_SCSI_PR_OUT_PREEMPT: + case SPDK_SCSI_PR_OUT_PREEMPT_AND_ABORT: + if (!reg) { + SPDK_ERRLOG("CHECK: PR OUT action %u\n", action); + goto conflict; + } + return 0; + case SPDK_SCSI_PR_OUT_REGISTER: + case SPDK_SCSI_PR_OUT_REG_AND_IGNORE_KEY: + return 0; + case SPDK_SCSI_PR_OUT_REG_AND_MOVE: + SPDK_ERRLOG("CHECK: PR OUT action %u\n", action); + goto conflict; + default: + SPDK_ERRLOG("CHECK: PR OUT invalid action %u\n", action); + goto conflict; + } + + /* For most SBC R/W commands */ + default: + break; + } + + switch (cdb[0]) { + case SPDK_SBC_READ_6: + case SPDK_SBC_READ_10: + case SPDK_SBC_READ_12: + case SPDK_SBC_READ_16: + break; + case 
SPDK_SBC_WRITE_6: + case SPDK_SBC_WRITE_10: + case SPDK_SBC_WRITE_12: + case SPDK_SBC_WRITE_16: + case SPDK_SBC_UNMAP: + case SPDK_SBC_SYNCHRONIZE_CACHE_10: + case SPDK_SBC_SYNCHRONIZE_CACHE_16: + dma_to_device = true; + break; + default: + SPDK_ERRLOG("CHECK: unsupported SCSI command cdb 0x%x\n", cdb[0]); + goto conflict; + } + + switch (rtype) { + case SPDK_SCSI_PR_WRITE_EXCLUSIVE: + if (dma_to_device) { + SPDK_ERRLOG("CHECK: Write Exclusive reservation type " + "rejects command 0x%x\n", cdb[0]); + goto conflict; + } + break; + case SPDK_SCSI_PR_EXCLUSIVE_ACCESS: + SPDK_ERRLOG("CHECK: Exclusive Access reservation type " + "rejects command 0x%x\n", cdb[0]); + goto conflict; + case SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY: + case SPDK_SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS: + if (!reg && dma_to_device) { + SPDK_ERRLOG("CHECK: Registrants only reservation " + "type reject command 0x%x\n", cdb[0]); + goto conflict; + } + break; + case SPDK_SCSI_PR_EXCLUSIVE_ACCESS_REGS_ONLY: + case SPDK_SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS: + if (!reg) { + SPDK_ERRLOG("CHECK: All Registrants reservation " + "type reject command 0x%x\n", cdb[0]); + goto conflict; + } + break; + default: + break; + } + + return 0; + +conflict: + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} + +static int +scsi2_check_reservation_conflict(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg; + bool conflict = false; + + reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port); + if (reg) { + /* + * From spc4r31 5.9.3 Exceptions to SPC-2 RESERVE and RELEASE + * behavior + * + * A RESERVE(6) or RESERVE(10) command shall complete with GOOD + * status, but no reservation shall be established and the + * persistent reservation shall not be changed, if the command + * is received from a) and b) below. + * + * A RELEASE(6) or RELEASE(10) command shall complete with GOOD + * status, but the persistent reservation shall not be released, + * if the command is received from a) and b) + * + * a) An I_T nexus that is a persistent reservation holder; or + * b) An I_T nexus that is registered if a registrants only or + * all registrants type persistent reservation is present. + * + * In all other cases, a RESERVE(6) command, RESERVE(10) command, + * RELEASE(6) command, or RELEASE(10) command shall be processed + * as defined in SPC-2. + */ + if (scsi_pr_registrant_is_holder(lun, reg)) { + return 1; + } + + if (lun->reservation.rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY || + lun->reservation.rtype == SPDK_SCSI_PR_EXCLUSIVE_ACCESS_REGS_ONLY) { + return 1; + } + + conflict = true; + } else { + /* + * From spc2r20 5.5.1 Reservations overview: + * + * If a logical unit has executed a PERSISTENT RESERVE OUT + * command with the REGISTER or the REGISTER AND IGNORE + * EXISTING KEY service action and is still registered by any + * initiator, all RESERVE commands and all RELEASE commands + * regardless of initiator shall conflict and shall terminate + * with a RESERVATION CONFLICT status. + */ + conflict = TAILQ_EMPTY(&lun->reg_head) ? 
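/*
 * Illustrative sketch, not part of the patch hunks above or below.
 * The reservation-type switch below implements the SPC conflict rules for
 * commands arriving from an I_T nexus that is NOT the reservation holder:
 * WRITE EXCLUSIVE lets reads through and rejects media-modifying commands,
 * EXCLUSIVE ACCESS rejects everything, and the REGISTRANTS ONLY / ALL
 * REGISTRANTS variants treat registered nexuses like holders.  Condensed into
 * one predicate (true means RESERVATION CONFLICT; hypothetical helper):
 */
static inline bool
example_pr_conflict(enum spdk_scsi_pr_type_code rtype, bool is_registrant,
                    bool modifies_medium)
{
        switch (rtype) {
        case SPDK_SCSI_PR_WRITE_EXCLUSIVE:
                return modifies_medium;
        case SPDK_SCSI_PR_EXCLUSIVE_ACCESS:
                return true;
        case SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY:
        case SPDK_SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS:
                return !is_registrant && modifies_medium;
        case SPDK_SCSI_PR_EXCLUSIVE_ACCESS_REGS_ONLY:
        case SPDK_SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS:
                return !is_registrant;
        default:
                return false;
        }
}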
false : true; + } + + if (conflict) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + return 0; +} + +int +scsi2_reserve(struct spdk_scsi_task *task, uint8_t *cdb) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_scsi_pr_registrant *reg = &lun->scsi2_holder; + int ret; + + /* Obsolete Bits and LongID set, returning ILLEGAL_REQUEST */ + if (cdb[1] & 0x3) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + ret = scsi2_check_reservation_conflict(task); + /* PERSISTENT RESERVE is enabled */ + if (ret == 1) { + return 0; + } else if (ret < 0) { + return ret; + } + + /* SPC2 RESERVE */ + reg->initiator_port = task->initiator_port; + if (task->initiator_port) { + snprintf(reg->initiator_port_name, sizeof(reg->initiator_port_name), "%s", + task->initiator_port->name); + reg->transport_id_len = task->initiator_port->transport_id_len; + memcpy(reg->transport_id, task->initiator_port->transport_id, + reg->transport_id_len); + } + reg->target_port = task->target_port; + if (task->target_port) { + snprintf(reg->target_port_name, sizeof(reg->target_port_name), "%s", + task->target_port->name); + } + + lun->reservation.flags = SCSI_SPC2_RESERVE; + lun->reservation.holder = &lun->scsi2_holder; + + return 0; +} + +int +scsi2_release(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + int ret; + + ret = scsi2_check_reservation_conflict(task); + /* PERSISTENT RESERVE is enabled */ + if (ret == 1) { + return 0; + } else if (ret < 0) { + return ret; + } + + assert(lun->reservation.flags & SCSI_SPC2_RESERVE); + + memset(&lun->reservation, 0, sizeof(struct spdk_scsi_pr_reservation)); + memset(&lun->scsi2_holder, 0, sizeof(struct spdk_scsi_pr_registrant)); + + return 0; +} + +int scsi2_reserve_check(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + uint8_t *cdb = task->cdb; + + switch (cdb[0]) { + case SPDK_SPC_INQUIRY: + case SPDK_SPC2_RELEASE_6: + case SPDK_SPC2_RELEASE_10: + return 0; + + default: + break; + } + + /* no reservation holders */ + if (!scsi_pr_has_reservation(lun)) { + return 0; + } + + if (scsi2_it_nexus_is_holder(lun, task->initiator_port, task->target_port)) { + return 0; + } + + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} diff --git a/src/spdk/lib/scsi/scsi_rpc.c b/src/spdk/lib/scsi/scsi_rpc.c new file mode 100644 index 000000000..1938ddac7 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_rpc.c @@ -0,0 +1,77 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" + +static void +rpc_scsi_get_devices(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_scsi_dev *devs = scsi_dev_get_list(); + int i; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "scsi_get_devices requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + for (i = 0; i < SPDK_SCSI_MAX_DEVS; i++) { + struct spdk_scsi_dev *dev = &devs[i]; + + if (!dev->is_allocated) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "id", dev->id); + + spdk_json_write_named_string(w, "device_name", dev->name); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("scsi_get_devices", rpc_scsi_get_devices, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(scsi_get_devices, get_scsi_devices) diff --git a/src/spdk/lib/scsi/spdk_scsi.map b/src/spdk/lib/scsi/spdk_scsi.map new file mode 100644 index 000000000..643372699 --- /dev/null +++ b/src/spdk/lib/scsi/spdk_scsi.map @@ -0,0 +1,49 @@ +{ + global: + + # Public functions + spdk_scsi_init; + spdk_scsi_fini; + spdk_scsi_lun_get_id; + spdk_scsi_lun_get_bdev_name; + spdk_scsi_lun_get_dev; + spdk_scsi_lun_is_removing; + spdk_scsi_dev_get_name; + spdk_scsi_dev_get_id; + spdk_scsi_dev_get_lun; + spdk_scsi_dev_has_pending_tasks; + spdk_scsi_dev_destruct; + spdk_scsi_dev_queue_mgmt_task; + spdk_scsi_dev_queue_task; + spdk_scsi_dev_add_port; + spdk_scsi_dev_delete_port; + spdk_scsi_dev_find_port_by_id; + spdk_scsi_dev_allocate_io_channels; + spdk_scsi_dev_free_io_channels; + spdk_scsi_dev_construct; + spdk_scsi_dev_delete_lun; + spdk_scsi_dev_add_lun; + spdk_scsi_port_create; + spdk_scsi_port_free; + spdk_scsi_port_get_name; + spdk_scsi_task_construct; + spdk_scsi_task_put; + spdk_scsi_task_set_data; + spdk_scsi_task_scatter_data; + spdk_scsi_task_gather_data; + spdk_scsi_task_build_sense_data; + spdk_scsi_task_set_status; + spdk_scsi_task_copy_status; + spdk_scsi_task_process_null_lun; + spdk_scsi_task_process_abort; + spdk_scsi_lun_open; + spdk_scsi_lun_close; + spdk_scsi_lun_allocate_io_channel; + spdk_scsi_lun_free_io_channel; + spdk_scsi_lun_get_dif_ctx; + spdk_scsi_port_set_iscsi_transport_id; + spdk_scsi_lun_id_int_to_fmt; + spdk_scsi_lun_id_fmt_to_int; + + local: *; +}; diff --git 
a/src/spdk/lib/scsi/task.c b/src/spdk/lib/scsi/task.c new file mode 100644 index 000000000..7fd8305ec --- /dev/null +++ b/src/spdk/lib/scsi/task.c @@ -0,0 +1,300 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "scsi_internal.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/util.h" + +static void +scsi_task_free_data(struct spdk_scsi_task *task) +{ + if (task->alloc_len != 0) { + spdk_dma_free(task->iov.iov_base); + task->alloc_len = 0; + } + + task->iov.iov_base = NULL; + task->iov.iov_len = 0; +} + +void +spdk_scsi_task_put(struct spdk_scsi_task *task) +{ + if (!task) { + return; + } + + assert(task->ref > 0); + task->ref--; + + if (task->ref == 0) { + struct spdk_bdev_io *bdev_io = task->bdev_io; + + if (bdev_io) { + spdk_bdev_free_io(bdev_io); + } + + scsi_task_free_data(task); + + task->free_fn(task); + } +} + +void +spdk_scsi_task_construct(struct spdk_scsi_task *task, + spdk_scsi_task_cpl cpl_fn, + spdk_scsi_task_free free_fn) +{ + assert(task != NULL); + assert(cpl_fn != NULL); + assert(free_fn != NULL); + + task->cpl_fn = cpl_fn; + task->free_fn = free_fn; + + task->ref++; + + /* + * Pre-fill the iov_buffers to point to the embedded iov + */ + assert(task->iov.iov_base == NULL); + task->iovs = &task->iov; + task->iovcnt = 1; +} + +static void * +scsi_task_alloc_data(struct spdk_scsi_task *task, uint32_t alloc_len) +{ + assert(task->alloc_len == 0); + + task->iov.iov_base = spdk_dma_zmalloc(alloc_len, 0, NULL); + task->iov.iov_len = alloc_len; + task->alloc_len = alloc_len; + + return task->iov.iov_base; +} + +int +spdk_scsi_task_scatter_data(struct spdk_scsi_task *task, const void *src, size_t buf_len) +{ + size_t len = 0; + size_t buf_left = buf_len; + int i; + struct iovec *iovs = task->iovs; + const uint8_t *pos; + + if (buf_len == 0) { + return 0; + } + + if (task->iovcnt == 1 && iovs[0].iov_base == NULL) { + scsi_task_alloc_data(task, buf_len); + iovs[0] = task->iov; + } + + for (i = 0; i < task->iovcnt; i++) { + assert(iovs[i].iov_base != NULL); + len += iovs[i].iov_len; + } + + if (len < buf_len) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + pos = src; + + for (i = 0; i < task->iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_left); + buf_left -= len; + memcpy(iovs[i].iov_base, pos, len); + pos += len; + } + + return buf_len; +} + +void * +spdk_scsi_task_gather_data(struct spdk_scsi_task *task, int *len) +{ + int i; + struct iovec *iovs = task->iovs; + size_t buf_len = 0; + uint8_t *buf, *pos; + + for (i = 0; i < task->iovcnt; i++) { + assert(iovs[i].iov_base != NULL); + buf_len += iovs[i].iov_len; + } + + if (buf_len == 0) { + *len = 0; + return NULL; + } + + buf = calloc(1, buf_len); + if (buf == NULL) { + *len = -1; + return NULL; + } + + pos = buf; + for (i = 0; i < task->iovcnt; i++) { + memcpy(pos, iovs[i].iov_base, iovs[i].iov_len); + pos += iovs[i].iov_len; + } + + *len = buf_len; + return buf; +} + +void +spdk_scsi_task_set_data(struct spdk_scsi_task *task, void *data, uint32_t len) +{ + assert(task->iovcnt == 1); + assert(task->alloc_len == 0); + + task->iovs[0].iov_base = data; + task->iovs[0].iov_len = len; +} + +void +spdk_scsi_task_build_sense_data(struct spdk_scsi_task *task, int sk, int asc, int ascq) +{ + uint8_t *cp; + int resp_code; + + resp_code = 0x70; /* Current + Fixed format */ + + /* Sense Data */ + cp = task->sense_data; + + /* VALID(7) RESPONSE CODE(6-0) */ + cp[0] = 0x80 | resp_code; + /* Obsolete */ + cp[1] = 0; + /* FILEMARK(7) EOM(6) ILI(5) SENSE KEY(3-0) */ + cp[2] = sk & 0xf; + /* INFORMATION */ + memset(&cp[3], 0, 4); + + /* ADDITIONAL SENSE LENGTH */ + 
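 /* Fixed-format sense data is 18 bytes in total here; the ADDITIONAL SENSE
  * LENGTH field counts the bytes that follow byte 7, hence 18 - 8 = 10. */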
cp[7] = 10; + + /* COMMAND-SPECIFIC INFORMATION */ + memset(&cp[8], 0, 4); + /* ADDITIONAL SENSE CODE */ + cp[12] = asc; + /* ADDITIONAL SENSE CODE QUALIFIER */ + cp[13] = ascq; + /* FIELD REPLACEABLE UNIT CODE */ + cp[14] = 0; + + /* SKSV(7) SENSE KEY SPECIFIC(6-0,7-0,7-0) */ + cp[15] = 0; + cp[16] = 0; + cp[17] = 0; + + /* SenseLength */ + task->sense_data_len = 18; +} + +void +spdk_scsi_task_set_status(struct spdk_scsi_task *task, int sc, int sk, + int asc, int ascq) +{ + if (sc == SPDK_SCSI_STATUS_CHECK_CONDITION) { + spdk_scsi_task_build_sense_data(task, sk, asc, ascq); + } + task->status = sc; +} + +void +spdk_scsi_task_copy_status(struct spdk_scsi_task *dst, + struct spdk_scsi_task *src) +{ + memcpy(dst->sense_data, src->sense_data, src->sense_data_len); + dst->sense_data_len = src->sense_data_len; + dst->status = src->status; +} + +void +spdk_scsi_task_process_null_lun(struct spdk_scsi_task *task) +{ + uint8_t buffer[36]; + uint32_t allocation_len; + uint32_t data_len; + + task->length = task->transfer_len; + if (task->cdb[0] == SPDK_SPC_INQUIRY) { + /* + * SPC-4 states that INQUIRY commands to an unsupported LUN + * must be served with PERIPHERAL QUALIFIER = 0x3 and + * PERIPHERAL DEVICE TYPE = 0x1F. + */ + data_len = sizeof(buffer); + + memset(buffer, 0, data_len); + /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */ + buffer[0] = 0x03 << 5 | 0x1f; + /* ADDITIONAL LENGTH */ + buffer[4] = data_len - 5; + + allocation_len = from_be16(&task->cdb[3]); + if (spdk_scsi_task_scatter_data(task, buffer, spdk_min(allocation_len, data_len)) >= 0) { + task->data_transferred = data_len; + task->status = SPDK_SCSI_STATUS_GOOD; + } + } else { + /* LOGICAL UNIT NOT SUPPORTED */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_SUPPORTED, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + task->data_transferred = 0; + } +} + +void +spdk_scsi_task_process_abort(struct spdk_scsi_task *task) +{ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ABORTED_COMMAND, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); +} diff --git a/src/spdk/lib/sock/Makefile b/src/spdk/lib/sock/Makefile new file mode 100644 index 000000000..82fe41e90 --- /dev/null +++ b/src/spdk/lib/sock/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
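
Editor's note: a hedged sketch of the task helper API defined in task.c above, showing the intended lifecycle of a small device-to-initiator payload. The callbacks and the 6-byte buffer are illustrative; only the spdk_scsi_task_* calls come from the file itself.

    #include "spdk/stdinc.h"
    #include "spdk/scsi.h"
    #include "spdk/scsi_spec.h"

    static void
    example_task_cpl(struct spdk_scsi_task *task)
    {
            /* Completion hook: status (and sense data, if any) are already set. */
    }

    static void
    example_task_free(struct spdk_scsi_task *task)
    {
            /* Invoked by spdk_scsi_task_put() once the reference count hits zero. */
            free(task);
    }

    static void
    example_task_lifecycle(void)
    {
            struct spdk_scsi_task *task = calloc(1, sizeof(*task));
            uint8_t response[6] = { 0 };

            if (task == NULL) {
                    return;
            }

            /* Takes the initial reference and points task->iovs at the embedded iov. */
            spdk_scsi_task_construct(task, example_task_cpl, example_task_free);

            /* No iov was supplied, so a DMA-able buffer is allocated and filled. */
            spdk_scsi_task_scatter_data(task, response, sizeof(response));

            spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_GOOD, 0, 0, 0);

            /* Drops the reference: frees the scattered buffer, then calls example_task_free(). */
            spdk_scsi_task_put(task);
    }
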
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 1 + +C_SRCS = sock.c net_framework.c sock_rpc.c + +LIBNAME = sock + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_sock.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/sock/net_framework.c b/src/spdk/lib/sock/net_framework.c new file mode 100644 index 000000000..45d52d162 --- /dev/null +++ b/src/spdk/lib/sock/net_framework.c @@ -0,0 +1,107 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/log.h" +#include "spdk/net.h" +#include "spdk/queue.h" + +static STAILQ_HEAD(, spdk_net_framework) g_net_frameworks = + STAILQ_HEAD_INITIALIZER(g_net_frameworks); + +static spdk_net_init_cb g_init_cb_fn = NULL; +static void *g_init_cb_arg = NULL; + +static spdk_net_fini_cb g_fini_cb_fn = NULL; +static void *g_fini_cb_arg = NULL; + +struct spdk_net_framework *g_next_net_framework = NULL; + +static inline struct spdk_net_framework * +get_next_net_framework(struct spdk_net_framework *net) +{ + return net ? 
STAILQ_NEXT(net, link) : STAILQ_FIRST(&g_net_frameworks); +} + +void +spdk_net_framework_init_next(int rc) +{ + if (rc) { + SPDK_ERRLOG("Net framework %s failed to initalize with error %d\n", g_next_net_framework->name, rc); + g_init_cb_fn(g_init_cb_arg, rc); + return; + } + + g_next_net_framework = get_next_net_framework(g_next_net_framework); + if (g_next_net_framework == NULL) { + g_init_cb_fn(g_init_cb_arg, 0); + return; + } + + g_next_net_framework->init(); +} + +void +spdk_net_framework_start(spdk_net_init_cb cb_fn, void *cb_arg) +{ + g_init_cb_fn = cb_fn; + g_init_cb_arg = cb_arg; + + spdk_net_framework_init_next(0); +} + +void +spdk_net_framework_fini_next(void) +{ + g_next_net_framework = get_next_net_framework(g_next_net_framework); + if (g_next_net_framework == NULL) { + g_fini_cb_fn(g_fini_cb_arg); + return; + } + + g_next_net_framework->fini(); +} + +void +spdk_net_framework_fini(spdk_net_fini_cb cb_fn, void *cb_arg) +{ + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + spdk_net_framework_fini_next(); +} + +void +spdk_net_framework_register(struct spdk_net_framework *frame) +{ + STAILQ_INSERT_TAIL(&g_net_frameworks, frame, link); +} diff --git a/src/spdk/lib/sock/sock.c b/src/spdk/lib/sock/sock.c new file mode 100644 index 000000000..5ea90385c --- /dev/null +++ b/src/spdk/lib/sock/sock.c @@ -0,0 +1,809 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
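
Editor's note: net_framework.c above walks the registered frameworks one at a time, expecting each init/fini hook to call spdk_net_framework_init_next()/spdk_net_framework_fini_next() when it is done. A hedged sketch of a registration follows; the "example" framework and its callbacks are made up.

    #include "spdk/net.h"

    static void
    example_net_init(void)
    {
            /* ... set up framework-wide resources ... */

            /* Hand control back so the next registered framework can initialize. */
            spdk_net_framework_init_next(0);
    }

    static void
    example_net_fini(void)
    {
            /* ... tear down resources ... */
            spdk_net_framework_fini_next();
    }

    static struct spdk_net_framework g_example_net_framework = {
            .name = "example",
            .init = example_net_init,
            .fini = example_net_fini,
    };

    /* Typically called from a constructor or an explicit setup path. */
    static void
    example_register_net_framework(void)
    {
            spdk_net_framework_register(&g_example_net_framework);
    }
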
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/log.h" +#include "spdk/sock.h" +#include "spdk_internal/sock.h" +#include "spdk/queue.h" + +#define SPDK_SOCK_DEFAULT_PRIORITY 0 +#define SPDK_SOCK_OPTS_FIELD_OK(opts, field) (offsetof(struct spdk_sock_opts, field) + sizeof(opts->field) <= (opts->opts_size)) + +static STAILQ_HEAD(, spdk_net_impl) g_net_impls = STAILQ_HEAD_INITIALIZER(g_net_impls); + +struct spdk_sock_placement_id_entry { + int placement_id; + uint32_t ref; + struct spdk_sock_group *group; + STAILQ_ENTRY(spdk_sock_placement_id_entry) link; +}; + +static STAILQ_HEAD(, spdk_sock_placement_id_entry) g_placement_id_map = STAILQ_HEAD_INITIALIZER( + g_placement_id_map); +static pthread_mutex_t g_map_table_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* Insert a group into the placement map. + * If the group is already in the map, take a reference. + */ +static int +sock_map_insert(int placement_id, struct spdk_sock_group *group) +{ + struct spdk_sock_placement_id_entry *entry; + + pthread_mutex_lock(&g_map_table_mutex); + STAILQ_FOREACH(entry, &g_placement_id_map, link) { + if (placement_id == entry->placement_id) { + /* The mapping already exists, it means that different sockets have + * the same placement_ids. + */ + entry->ref++; + pthread_mutex_unlock(&g_map_table_mutex); + return 0; + } + } + + entry = calloc(1, sizeof(*entry)); + if (!entry) { + SPDK_ERRLOG("Cannot allocate an entry for placement_id=%u\n", placement_id); + pthread_mutex_unlock(&g_map_table_mutex); + return -ENOMEM; + } + + entry->placement_id = placement_id; + entry->group = group; + entry->ref++; + + STAILQ_INSERT_TAIL(&g_placement_id_map, entry, link); + pthread_mutex_unlock(&g_map_table_mutex); + + return 0; +} + +/* Release a reference to the group for a given placement_id. + * If the reference count is 0, remove the group. + */ +static void +sock_map_release(int placement_id) +{ + struct spdk_sock_placement_id_entry *entry; + + pthread_mutex_lock(&g_map_table_mutex); + STAILQ_FOREACH(entry, &g_placement_id_map, link) { + if (placement_id == entry->placement_id) { + assert(entry->ref > 0); + entry->ref--; + break; + } + } + + pthread_mutex_unlock(&g_map_table_mutex); +} + +/* Look up the group for a placement_id. 
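 Writes NULL through *group when no entry exists; spdk_sock_get_optimal_sock_group() relies on this lookup to steer sockets that report the same placement_id onto the same poll group.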
*/ +static void +sock_map_lookup(int placement_id, struct spdk_sock_group **group) +{ + struct spdk_sock_placement_id_entry *entry; + + *group = NULL; + pthread_mutex_lock(&g_map_table_mutex); + STAILQ_FOREACH(entry, &g_placement_id_map, link) { + if (placement_id == entry->placement_id) { + assert(entry->group != NULL); + *group = entry->group; + break; + } + } + pthread_mutex_unlock(&g_map_table_mutex); +} + +/* Remove the socket group from the map table */ +static void +sock_remove_sock_group_from_map_table(struct spdk_sock_group *group) +{ + struct spdk_sock_placement_id_entry *entry, *tmp; + + pthread_mutex_lock(&g_map_table_mutex); + STAILQ_FOREACH_SAFE(entry, &g_placement_id_map, link, tmp) { + if (entry->group == group) { + STAILQ_REMOVE(&g_placement_id_map, entry, spdk_sock_placement_id_entry, link); + free(entry); + } + } + pthread_mutex_unlock(&g_map_table_mutex); + +} + +int +spdk_sock_get_optimal_sock_group(struct spdk_sock *sock, struct spdk_sock_group **group) +{ + int placement_id = 0, rc; + + rc = sock->net_impl->get_placement_id(sock, &placement_id); + if (!rc && (placement_id != 0)) { + sock_map_lookup(placement_id, group); + return 0; + } else { + return -1; + } +} + +int +spdk_sock_getaddr(struct spdk_sock *sock, char *saddr, int slen, uint16_t *sport, + char *caddr, int clen, uint16_t *cport) +{ + return sock->net_impl->getaddr(sock, saddr, slen, sport, caddr, clen, cport); +} + +void +spdk_sock_get_default_opts(struct spdk_sock_opts *opts) +{ + assert(opts); + + if (SPDK_SOCK_OPTS_FIELD_OK(opts, priority)) { + opts->priority = SPDK_SOCK_DEFAULT_PRIORITY; + } +} + +/* + * opts The opts allocated in the current library. + * opts_user The opts passed by the caller. + * */ +static void +sock_init_opts(struct spdk_sock_opts *opts, struct spdk_sock_opts *opts_user) +{ + assert(opts); + assert(opts_user); + + opts->opts_size = sizeof(*opts); + spdk_sock_get_default_opts(opts); + + /* reset the size according to the user */ + opts->opts_size = opts_user->opts_size; + if (SPDK_SOCK_OPTS_FIELD_OK(opts, priority)) { + opts->priority = opts_user->priority; + } +} + +struct spdk_sock * +spdk_sock_connect(const char *ip, int port, char *impl_name) +{ + struct spdk_sock_opts opts; + + opts.opts_size = sizeof(opts); + spdk_sock_get_default_opts(&opts); + return spdk_sock_connect_ext(ip, port, impl_name, &opts); +} + +struct spdk_sock * +spdk_sock_connect_ext(const char *ip, int port, char *impl_name, struct spdk_sock_opts *opts) +{ + struct spdk_net_impl *impl = NULL; + struct spdk_sock *sock; + struct spdk_sock_opts opts_local; + + if (opts == NULL) { + SPDK_ERRLOG("the opts should not be NULL pointer\n"); + return NULL; + } + + STAILQ_FOREACH_FROM(impl, &g_net_impls, link) { + if (impl_name && strncmp(impl_name, impl->name, strlen(impl->name) + 1)) { + continue; + } + + sock_init_opts(&opts_local, opts); + sock = impl->connect(ip, port, &opts_local); + if (sock != NULL) { + /* Copy the contents, both the two structures are the same ABI version */ + memcpy(&sock->opts, &opts_local, sizeof(sock->opts)); + sock->net_impl = impl; + TAILQ_INIT(&sock->queued_reqs); + TAILQ_INIT(&sock->pending_reqs); + return sock; + } + } + + return NULL; +} + +struct spdk_sock * +spdk_sock_listen(const char *ip, int port, char *impl_name) +{ + struct spdk_sock_opts opts; + + opts.opts_size = sizeof(opts); + spdk_sock_get_default_opts(&opts); + return spdk_sock_listen_ext(ip, port, impl_name, &opts); +} + +struct spdk_sock * +spdk_sock_listen_ext(const char *ip, int port, char *impl_name, struct 
spdk_sock_opts *opts) +{ + struct spdk_net_impl *impl = NULL; + struct spdk_sock *sock; + struct spdk_sock_opts opts_local; + + if (opts == NULL) { + SPDK_ERRLOG("the opts should not be NULL pointer\n"); + return NULL; + } + + STAILQ_FOREACH_FROM(impl, &g_net_impls, link) { + if (impl_name && strncmp(impl_name, impl->name, strlen(impl->name) + 1)) { + continue; + } + + sock_init_opts(&opts_local, opts); + sock = impl->listen(ip, port, &opts_local); + if (sock != NULL) { + /* Copy the contents, both the two structures are the same ABI version */ + memcpy(&sock->opts, &opts_local, sizeof(sock->opts)); + sock->net_impl = impl; + /* Don't need to initialize the request queues for listen + * sockets. */ + return sock; + } + } + + return NULL; +} + +struct spdk_sock * +spdk_sock_accept(struct spdk_sock *sock) +{ + struct spdk_sock *new_sock; + + new_sock = sock->net_impl->accept(sock); + if (new_sock != NULL) { + /* Inherit the opts from the "accept sock" */ + new_sock->opts = sock->opts; + memcpy(&new_sock->opts, &sock->opts, sizeof(new_sock->opts)); + new_sock->net_impl = sock->net_impl; + TAILQ_INIT(&new_sock->queued_reqs); + TAILQ_INIT(&new_sock->pending_reqs); + } + + return new_sock; +} + +int +spdk_sock_close(struct spdk_sock **_sock) +{ + struct spdk_sock *sock = *_sock; + int rc; + + if (sock == NULL) { + errno = EBADF; + return -1; + } + + if (sock->cb_fn != NULL) { + /* This sock is still part of a sock_group. */ + errno = EBUSY; + return -1; + } + + sock->flags.closed = true; + + if (sock->cb_cnt > 0) { + /* Let the callback unwind before destroying the socket */ + return 0; + } + + spdk_sock_abort_requests(sock); + + rc = sock->net_impl->close(sock); + if (rc == 0) { + *_sock = NULL; + } + + return rc; +} + +ssize_t +spdk_sock_recv(struct spdk_sock *sock, void *buf, size_t len) +{ + if (sock == NULL) { + errno = EBADF; + return -1; + } + + if (sock->flags.closed) { + errno = EBADF; + return -1; + } + + return sock->net_impl->recv(sock, buf, len); +} + +ssize_t +spdk_sock_readv(struct spdk_sock *sock, struct iovec *iov, int iovcnt) +{ + if (sock == NULL) { + errno = EBADF; + return -1; + } + + if (sock->flags.closed) { + errno = EBADF; + return -1; + } + + return sock->net_impl->readv(sock, iov, iovcnt); +} + +ssize_t +spdk_sock_writev(struct spdk_sock *sock, struct iovec *iov, int iovcnt) +{ + if (sock == NULL) { + errno = EBADF; + return -1; + } + + if (sock->flags.closed) { + errno = EBADF; + return -1; + } + + return sock->net_impl->writev(sock, iov, iovcnt); +} + +void +spdk_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req) +{ + assert(req->cb_fn != NULL); + + if (sock == NULL) { + req->cb_fn(req->cb_arg, -EBADF); + return; + } + + if (sock->flags.closed) { + req->cb_fn(req->cb_arg, -EBADF); + return; + } + + sock->net_impl->writev_async(sock, req); +} + +int +spdk_sock_flush(struct spdk_sock *sock) +{ + if (sock == NULL) { + return -EBADF; + } + + if (sock->flags.closed) { + return -EBADF; + } + + return sock->net_impl->flush(sock); +} + +int +spdk_sock_set_recvlowat(struct spdk_sock *sock, int nbytes) +{ + return sock->net_impl->set_recvlowat(sock, nbytes); +} + +int +spdk_sock_set_recvbuf(struct spdk_sock *sock, int sz) +{ + return sock->net_impl->set_recvbuf(sock, sz); +} + +int +spdk_sock_set_sendbuf(struct spdk_sock *sock, int sz) +{ + return sock->net_impl->set_sendbuf(sock, sz); +} + +bool +spdk_sock_is_ipv6(struct spdk_sock *sock) +{ + return sock->net_impl->is_ipv6(sock); +} + +bool +spdk_sock_is_ipv4(struct spdk_sock *sock) +{ + return 
sock->net_impl->is_ipv4(sock); +} + +bool +spdk_sock_is_connected(struct spdk_sock *sock) +{ + return sock->net_impl->is_connected(sock); +} + +struct spdk_sock_group * +spdk_sock_group_create(void *ctx) +{ + struct spdk_net_impl *impl = NULL; + struct spdk_sock_group *group; + struct spdk_sock_group_impl *group_impl; + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + return NULL; + } + + STAILQ_INIT(&group->group_impls); + + STAILQ_FOREACH_FROM(impl, &g_net_impls, link) { + group_impl = impl->group_impl_create(); + if (group_impl != NULL) { + STAILQ_INSERT_TAIL(&group->group_impls, group_impl, link); + TAILQ_INIT(&group_impl->socks); + group_impl->num_removed_socks = 0; + group_impl->net_impl = impl; + } + } + + group->ctx = ctx; + return group; +} + +void * +spdk_sock_group_get_ctx(struct spdk_sock_group *group) +{ + if (group == NULL) { + return NULL; + } + + return group->ctx; +} + +int +spdk_sock_group_add_sock(struct spdk_sock_group *group, struct spdk_sock *sock, + spdk_sock_cb cb_fn, void *cb_arg) +{ + struct spdk_sock_group_impl *group_impl = NULL; + int rc, placement_id = 0; + + if (cb_fn == NULL) { + errno = EINVAL; + return -1; + } + + if (sock->group_impl != NULL) { + /* + * This sock is already part of a sock_group. Currently we don't + * support this. + */ + errno = EBUSY; + return -1; + } + + rc = sock->net_impl->get_placement_id(sock, &placement_id); + if (!rc && (placement_id != 0)) { + rc = sock_map_insert(placement_id, group); + if (rc < 0) { + return -1; + } + } + + STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) { + if (sock->net_impl == group_impl->net_impl) { + break; + } + } + + if (group_impl == NULL) { + errno = EINVAL; + return -1; + } + + rc = group_impl->net_impl->group_impl_add_sock(group_impl, sock); + if (rc == 0) { + TAILQ_INSERT_TAIL(&group_impl->socks, sock, link); + sock->group_impl = group_impl; + sock->cb_fn = cb_fn; + sock->cb_arg = cb_arg; + } + + return rc; +} + +int +spdk_sock_group_remove_sock(struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_sock_group_impl *group_impl = NULL; + int rc, placement_id = 0; + + STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) { + if (sock->net_impl == group_impl->net_impl) { + break; + } + } + + if (group_impl == NULL) { + errno = EINVAL; + return -1; + } + + assert(group_impl == sock->group_impl); + + rc = sock->net_impl->get_placement_id(sock, &placement_id); + if (!rc && (placement_id != 0)) { + sock_map_release(placement_id); + } + + rc = group_impl->net_impl->group_impl_remove_sock(group_impl, sock); + if (rc == 0) { + TAILQ_REMOVE(&group_impl->socks, sock, link); + assert(group_impl->num_removed_socks < MAX_EVENTS_PER_POLL); + group_impl->removed_socks[group_impl->num_removed_socks] = (uintptr_t)sock; + group_impl->num_removed_socks++; + sock->group_impl = NULL; + sock->cb_fn = NULL; + sock->cb_arg = NULL; + } + + return rc; +} + +int +spdk_sock_group_poll(struct spdk_sock_group *group) +{ + return spdk_sock_group_poll_count(group, MAX_EVENTS_PER_POLL); +} + +static int +sock_group_impl_poll_count(struct spdk_sock_group_impl *group_impl, + struct spdk_sock_group *group, + int max_events) +{ + struct spdk_sock *socks[MAX_EVENTS_PER_POLL]; + int num_events, i; + + if (TAILQ_EMPTY(&group_impl->socks)) { + return 0; + } + + /* The number of removed sockets should be reset for each call to poll. 
*/ + group_impl->num_removed_socks = 0; + + num_events = group_impl->net_impl->group_impl_poll(group_impl, max_events, socks); + if (num_events == -1) { + return -1; + } + + for (i = 0; i < num_events; i++) { + struct spdk_sock *sock = socks[i]; + int j; + bool valid = true; + for (j = 0; j < group_impl->num_removed_socks; j++) { + if ((uintptr_t)sock == group_impl->removed_socks[j]) { + valid = false; + break; + } + } + + if (valid) { + assert(sock->cb_fn != NULL); + sock->cb_fn(sock->cb_arg, group, sock); + } + } + + return num_events; +} + +int +spdk_sock_group_poll_count(struct spdk_sock_group *group, int max_events) +{ + struct spdk_sock_group_impl *group_impl = NULL; + int rc, num_events = 0; + + if (max_events < 1) { + errno = -EINVAL; + return -1; + } + + /* + * Only poll for up to 32 events at a time - if more events are pending, + * the next call to this function will reap them. + */ + if (max_events > MAX_EVENTS_PER_POLL) { + max_events = MAX_EVENTS_PER_POLL; + } + + STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) { + rc = sock_group_impl_poll_count(group_impl, group, max_events); + if (rc < 0) { + num_events = -1; + SPDK_ERRLOG("group_impl_poll_count for net(%s) failed\n", + group_impl->net_impl->name); + } else if (num_events >= 0) { + num_events += rc; + } + } + + return num_events; +} + +int +spdk_sock_group_close(struct spdk_sock_group **group) +{ + struct spdk_sock_group_impl *group_impl = NULL, *tmp; + int rc; + + if (*group == NULL) { + errno = EBADF; + return -1; + } + + STAILQ_FOREACH_SAFE(group_impl, &(*group)->group_impls, link, tmp) { + if (!TAILQ_EMPTY(&group_impl->socks)) { + errno = EBUSY; + return -1; + } + } + + STAILQ_FOREACH_SAFE(group_impl, &(*group)->group_impls, link, tmp) { + rc = group_impl->net_impl->group_impl_close(group_impl); + if (rc != 0) { + SPDK_ERRLOG("group_impl_close for net(%s) failed\n", + group_impl->net_impl->name); + } + } + + sock_remove_sock_group_from_map_table(*group); + free(*group); + *group = NULL; + + return 0; +} + +static inline struct spdk_net_impl * +sock_get_impl_by_name(const char *impl_name) +{ + struct spdk_net_impl *impl; + + assert(impl_name != NULL); + STAILQ_FOREACH(impl, &g_net_impls, link) { + if (0 == strcmp(impl_name, impl->name)) { + return impl; + } + } + + return NULL; +} + +int +spdk_sock_impl_get_opts(const char *impl_name, struct spdk_sock_impl_opts *opts, size_t *len) +{ + struct spdk_net_impl *impl; + + if (!impl_name || !opts || !len) { + errno = EINVAL; + return -1; + } + + impl = sock_get_impl_by_name(impl_name); + if (!impl) { + errno = EINVAL; + return -1; + } + + if (!impl->get_opts) { + errno = ENOTSUP; + return -1; + } + + return impl->get_opts(opts, len); +} + +int +spdk_sock_impl_set_opts(const char *impl_name, const struct spdk_sock_impl_opts *opts, size_t len) +{ + struct spdk_net_impl *impl; + + if (!impl_name || !opts) { + errno = EINVAL; + return -1; + } + + impl = sock_get_impl_by_name(impl_name); + if (!impl) { + errno = EINVAL; + return -1; + } + + if (!impl->set_opts) { + errno = ENOTSUP; + return -1; + } + + return impl->set_opts(opts, len); +} + +void +spdk_sock_write_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_net_impl *impl; + struct spdk_sock_impl_opts opts; + size_t len; + + assert(w != NULL); + + spdk_json_write_array_begin(w); + + STAILQ_FOREACH(impl, &g_net_impls, link) { + if (!impl->get_opts) { + continue; + } + + len = sizeof(opts); + if (impl->get_opts(&opts, &len) == 0) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, 
"method", "sock_impl_set_options"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "impl_name", impl->name); + spdk_json_write_named_uint32(w, "recv_buf_size", opts.recv_buf_size); + spdk_json_write_named_uint32(w, "send_buf_size", opts.send_buf_size); + spdk_json_write_named_bool(w, "enable_recv_pipe", opts.enable_recv_pipe); + spdk_json_write_named_bool(w, "enable_zerocopy_send", opts.enable_zerocopy_send); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + } else { + SPDK_ERRLOG("Failed to get socket options for socket implementation %s\n", impl->name); + } + } + + spdk_json_write_array_end(w); +} + +void +spdk_net_impl_register(struct spdk_net_impl *impl, int priority) +{ + struct spdk_net_impl *cur, *prev; + + impl->priority = priority; + prev = NULL; + STAILQ_FOREACH(cur, &g_net_impls, link) { + if (impl->priority > cur->priority) { + break; + } + prev = cur; + } + + if (prev) { + STAILQ_INSERT_AFTER(&g_net_impls, prev, impl, link); + } else { + STAILQ_INSERT_HEAD(&g_net_impls, impl, link); + } +} diff --git a/src/spdk/lib/sock/sock_rpc.c b/src/spdk/lib/sock/sock_rpc.c new file mode 100644 index 000000000..c8686a068 --- /dev/null +++ b/src/spdk/lib/sock/sock_rpc.c @@ -0,0 +1,161 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/sock.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + + +static const struct spdk_json_object_decoder rpc_sock_impl_get_opts_decoders[] = { + { "impl_name", 0, spdk_json_decode_string, false }, +}; + +static void +rpc_sock_impl_get_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + char *impl_name = NULL; + struct spdk_sock_impl_opts sock_opts = {}; + struct spdk_json_write_ctx *w; + size_t len; + int rc; + + if (spdk_json_decode_object(params, rpc_sock_impl_get_opts_decoders, + SPDK_COUNTOF(rpc_sock_impl_get_opts_decoders), &impl_name)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + len = sizeof(sock_opts); + rc = spdk_sock_impl_get_opts(impl_name, &sock_opts, &len); + if (rc) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_object_begin(w); + spdk_json_write_named_uint32(w, "recv_buf_size", sock_opts.recv_buf_size); + spdk_json_write_named_uint32(w, "send_buf_size", sock_opts.send_buf_size); + spdk_json_write_named_bool(w, "enable_recv_pipe", sock_opts.enable_recv_pipe); + spdk_json_write_named_bool(w, "enable_zerocopy_send", sock_opts.enable_zerocopy_send); + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); + free(impl_name); +} +SPDK_RPC_REGISTER("sock_impl_get_options", rpc_sock_impl_get_options, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) + +struct spdk_rpc_sock_impl_set_opts { + char *impl_name; + struct spdk_sock_impl_opts sock_opts; +}; + +static const struct spdk_json_object_decoder rpc_sock_impl_set_opts_decoders[] = { + { + "impl_name", offsetof(struct spdk_rpc_sock_impl_set_opts, impl_name), + spdk_json_decode_string, false + }, + { + "recv_buf_size", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.recv_buf_size), + spdk_json_decode_uint32, true + }, + { + "send_buf_size", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.send_buf_size), + spdk_json_decode_uint32, true + }, + { + "enable_recv_pipe", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_recv_pipe), + spdk_json_decode_bool, true + }, + { + "enable_zerocopy_send", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_zerocopy_send), + spdk_json_decode_bool, true + }, +}; + +static void +rpc_sock_impl_set_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_rpc_sock_impl_set_opts opts = {}; + struct spdk_json_write_ctx *w; + size_t len; + int rc; + + /* Get type */ + if (spdk_json_decode_object(params, rpc_sock_impl_set_opts_decoders, + SPDK_COUNTOF(rpc_sock_impl_set_opts_decoders), &opts)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + /* Retrieve default opts for requested socket implementation */ + len = sizeof(opts.sock_opts); + rc = spdk_sock_impl_get_opts(opts.impl_name, &opts.sock_opts, &len); + if (rc) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + /* Decode opts */ + if (spdk_json_decode_object(params, rpc_sock_impl_set_opts_decoders, + SPDK_COUNTOF(rpc_sock_impl_set_opts_decoders), &opts)) { + 
SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + rc = spdk_sock_impl_set_opts(opts.impl_name, &opts.sock_opts, sizeof(opts.sock_opts)); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + free(opts.impl_name); +} +SPDK_RPC_REGISTER("sock_impl_set_options", rpc_sock_impl_set_options, SPDK_RPC_STARTUP) diff --git a/src/spdk/lib/sock/spdk_sock.map b/src/spdk/lib/sock/spdk_sock.map new file mode 100644 index 000000000..e3fb44281 --- /dev/null +++ b/src/spdk/lib/sock/spdk_sock.map @@ -0,0 +1,47 @@ +{ + global: + + # public functions in spdk/sock.h + spdk_sock_get_default_opts; + spdk_sock_getaddr; + spdk_sock_connect; + spdk_sock_connect_ext; + spdk_sock_listen; + spdk_sock_listen_ext; + spdk_sock_accept; + spdk_sock_close; + spdk_sock_flush; + spdk_sock_recv; + spdk_sock_writev; + spdk_sock_writev_async; + spdk_sock_readv; + spdk_sock_set_recvlowat; + spdk_sock_set_recvbuf; + spdk_sock_set_sendbuf; + spdk_sock_is_ipv6; + spdk_sock_is_ipv4; + spdk_sock_is_connected; + spdk_sock_group_create; + spdk_sock_group_get_ctx; + spdk_sock_group_add_sock; + spdk_sock_group_remove_sock; + spdk_sock_group_poll; + spdk_sock_group_poll_count; + spdk_sock_group_close; + spdk_sock_get_optimal_sock_group; + spdk_sock_impl_get_opts; + spdk_sock_impl_set_opts; + spdk_sock_write_config_json; + + # public functions in spdk/net.h + spdk_net_framework_register; + spdk_net_framework_start; + spdk_net_framework_fini; + spdk_net_framework_init_next; + spdk_net_framework_fini_next; + + # internal function in spdk_internal/sock.h + spdk_net_impl_register; + + local: *; +}; diff --git a/src/spdk/lib/thread/Makefile b/src/spdk/lib/thread/Makefile new file mode 100644 index 000000000..ceb7a394e --- /dev/null +++ b/src/spdk/lib/thread/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = thread.c +LIBNAME = thread + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_thread.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/thread/spdk_thread.map b/src/spdk/lib/thread/spdk_thread.map new file mode 100644 index 000000000..b71fa06eb --- /dev/null +++ b/src/spdk/lib/thread/spdk_thread.map @@ -0,0 +1,55 @@ +{ + global: + + # public functions in spdk/thread.h + spdk_thread_lib_init; + spdk_thread_lib_init_ext; + spdk_thread_lib_fini; + spdk_thread_create; + spdk_set_thread; + spdk_thread_exit; + spdk_thread_is_exited; + spdk_thread_destroy; + spdk_thread_get_ctx; + spdk_thread_get_cpumask; + spdk_thread_set_cpumask; + spdk_thread_get_from_ctx; + spdk_thread_poll; + spdk_thread_next_poller_expiration; + spdk_thread_has_active_pollers; + spdk_thread_has_pollers; + spdk_thread_is_idle; + spdk_thread_get_count; + spdk_get_thread; + spdk_thread_get_name; + spdk_thread_get_id; + spdk_thread_get_by_id; + spdk_thread_get_stats; + spdk_thread_get_last_tsc; + spdk_thread_send_msg; + spdk_thread_send_critical_msg; + spdk_for_each_thread; + spdk_poller_register; + spdk_poller_register_named; + spdk_poller_unregister; + spdk_poller_pause; + spdk_poller_resume; + spdk_io_device_register; + spdk_io_device_unregister; + spdk_get_io_channel; + spdk_put_io_channel; + spdk_io_channel_get_ctx; + spdk_io_channel_from_ctx; + spdk_io_channel_get_thread; + spdk_for_each_channel; + spdk_io_channel_iter_get_io_device; + spdk_io_channel_iter_get_channel; + spdk_io_channel_iter_get_ctx; + spdk_for_each_channel_continue; + + # internal functions in spdk_internal/thread.h + spdk_poller_state_str; + spdk_io_device_get_name; + + local: *; +}; diff --git a/src/spdk/lib/thread/thread.c b/src/spdk/lib/thread/thread.c new file mode 100644 index 000000000..65d91ce35 --- /dev/null +++ b/src/spdk/lib/thread/thread.c @@ -0,0 +1,1636 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
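
Editor's note: a hedged sketch of the public thread API listed in spdk_thread.map: initialize the library, create a thread, queue a message and drive the thread by polling it. It assumes the SPDK environment has already been set up (the message pool comes from spdk_mempool_create()), and the exit handling relies on spdk_thread_poll() advancing an EXITING thread, which happens further down in thread.c.

    #include "spdk/stdinc.h"
    #include "spdk/env.h"
    #include "spdk/thread.h"

    static void
    example_msg(void *ctx)
    {
            /* Runs on the target thread the next time it is polled. */
    }

    static void
    example_thread_usage(void)
    {
            struct spdk_thread *thread;

            /* No framework hook, no per-thread context. */
            spdk_thread_lib_init(NULL, 0);

            thread = spdk_thread_create("example", NULL);
            if (thread == NULL) {
                    return;
            }

            spdk_thread_send_msg(thread, example_msg, NULL);

            /* The caller owns scheduling: make the thread current, then poll it. */
            spdk_set_thread(thread);
            spdk_thread_poll(thread, 0, spdk_get_ticks());

            /* Ask the thread to exit and keep polling until it reaches EXITED. */
            spdk_thread_exit(thread);
            while (!spdk_thread_is_exited(thread)) {
                    spdk_thread_poll(thread, 0, spdk_get_ticks());
            }
            spdk_thread_destroy(thread);

            spdk_thread_lib_fini();
    }
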
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" +#include "spdk_internal/thread.h" + +#define SPDK_MSG_BATCH_SIZE 8 +#define SPDK_MAX_DEVICE_NAME_LEN 256 +#define SPDK_THREAD_EXIT_TIMEOUT_SEC 5 + +static pthread_mutex_t g_devlist_mutex = PTHREAD_MUTEX_INITIALIZER; + +static spdk_new_thread_fn g_new_thread_fn = NULL; +static spdk_thread_op_fn g_thread_op_fn = NULL; +static spdk_thread_op_supported_fn g_thread_op_supported_fn; +static size_t g_ctx_sz = 0; +/* Monotonic increasing ID is set to each created thread beginning at 1. Once the + * ID exceeds UINT64_MAX, further thread creation is not allowed and restarting + * SPDK application is required. + */ +static uint64_t g_thread_id = 1; + +struct io_device { + void *io_device; + char name[SPDK_MAX_DEVICE_NAME_LEN + 1]; + spdk_io_channel_create_cb create_cb; + spdk_io_channel_destroy_cb destroy_cb; + spdk_io_device_unregister_cb unregister_cb; + struct spdk_thread *unregister_thread; + uint32_t ctx_size; + uint32_t for_each_count; + TAILQ_ENTRY(io_device) tailq; + + uint32_t refcnt; + + bool unregistered; +}; + +static TAILQ_HEAD(, io_device) g_io_devices = TAILQ_HEAD_INITIALIZER(g_io_devices); + +struct spdk_msg { + spdk_msg_fn fn; + void *arg; + + SLIST_ENTRY(spdk_msg) link; +}; + +#define SPDK_MSG_MEMPOOL_CACHE_SIZE 1024 +static struct spdk_mempool *g_spdk_msg_mempool = NULL; + +static TAILQ_HEAD(, spdk_thread) g_threads = TAILQ_HEAD_INITIALIZER(g_threads); +static uint32_t g_thread_count = 0; + +static __thread struct spdk_thread *tls_thread = NULL; + +static inline struct spdk_thread * +_get_thread(void) +{ + return tls_thread; +} + +static int +_thread_lib_init(size_t ctx_sz) +{ + char mempool_name[SPDK_MAX_MEMZONE_NAME_LEN]; + + g_ctx_sz = ctx_sz; + + snprintf(mempool_name, sizeof(mempool_name), "msgpool_%d", getpid()); + g_spdk_msg_mempool = spdk_mempool_create(mempool_name, + 262144 - 1, /* Power of 2 minus 1 is optimal for memory consumption */ + sizeof(struct spdk_msg), + 0, /* No cache. We do our own. 
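 Each thread instead keeps a private SLIST cache of up to SPDK_MSG_MEMPOOL_CACHE_SIZE messages, pre-filled in spdk_thread_create() and replenished as messages are consumed.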
*/ + SPDK_ENV_SOCKET_ID_ANY); + + if (!g_spdk_msg_mempool) { + return -1; + } + + return 0; +} + +int +spdk_thread_lib_init(spdk_new_thread_fn new_thread_fn, size_t ctx_sz) +{ + assert(g_new_thread_fn == NULL); + assert(g_thread_op_fn == NULL); + + if (new_thread_fn == NULL) { + SPDK_INFOLOG(SPDK_LOG_THREAD, "new_thread_fn was not specified at spdk_thread_lib_init\n"); + } else { + g_new_thread_fn = new_thread_fn; + } + + return _thread_lib_init(ctx_sz); +} + +int +spdk_thread_lib_init_ext(spdk_thread_op_fn thread_op_fn, + spdk_thread_op_supported_fn thread_op_supported_fn, + size_t ctx_sz) +{ + assert(g_new_thread_fn == NULL); + assert(g_thread_op_fn == NULL); + assert(g_thread_op_supported_fn == NULL); + + if ((thread_op_fn != NULL) != (thread_op_supported_fn != NULL)) { + SPDK_ERRLOG("Both must be defined or undefined together.\n"); + return -EINVAL; + } + + if (thread_op_fn == NULL && thread_op_supported_fn == NULL) { + SPDK_INFOLOG(SPDK_LOG_THREAD, "thread_op_fn and thread_op_supported_fn were not specified\n"); + } else { + g_thread_op_fn = thread_op_fn; + g_thread_op_supported_fn = thread_op_supported_fn; + } + + return _thread_lib_init(ctx_sz); +} + +void +spdk_thread_lib_fini(void) +{ + struct io_device *dev; + + TAILQ_FOREACH(dev, &g_io_devices, tailq) { + SPDK_ERRLOG("io_device %s not unregistered\n", dev->name); + } + + if (g_spdk_msg_mempool) { + spdk_mempool_free(g_spdk_msg_mempool); + g_spdk_msg_mempool = NULL; + } + + g_new_thread_fn = NULL; + g_thread_op_fn = NULL; + g_thread_op_supported_fn = NULL; + g_ctx_sz = 0; +} + +static void +_free_thread(struct spdk_thread *thread) +{ + struct spdk_io_channel *ch; + struct spdk_msg *msg; + struct spdk_poller *poller, *ptmp; + + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + SPDK_ERRLOG("thread %s still has channel for io_device %s\n", + thread->name, ch->dev->name); + } + + TAILQ_FOREACH_SAFE(poller, &thread->active_pollers, tailq, ptmp) { + if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) { + SPDK_WARNLOG("poller %s still registered at thread exit\n", + poller->name); + } + TAILQ_REMOVE(&thread->active_pollers, poller, tailq); + free(poller); + } + + TAILQ_FOREACH_SAFE(poller, &thread->timed_pollers, tailq, ptmp) { + if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) { + SPDK_WARNLOG("poller %s still registered at thread exit\n", + poller->name); + } + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + free(poller); + } + + TAILQ_FOREACH_SAFE(poller, &thread->paused_pollers, tailq, ptmp) { + SPDK_WARNLOG("poller %s still registered at thread exit\n", poller->name); + TAILQ_REMOVE(&thread->paused_pollers, poller, tailq); + free(poller); + } + + pthread_mutex_lock(&g_devlist_mutex); + assert(g_thread_count > 0); + g_thread_count--; + TAILQ_REMOVE(&g_threads, thread, tailq); + pthread_mutex_unlock(&g_devlist_mutex); + + msg = SLIST_FIRST(&thread->msg_cache); + while (msg != NULL) { + SLIST_REMOVE_HEAD(&thread->msg_cache, link); + + assert(thread->msg_cache_count > 0); + thread->msg_cache_count--; + spdk_mempool_put(g_spdk_msg_mempool, msg); + + msg = SLIST_FIRST(&thread->msg_cache); + } + + assert(thread->msg_cache_count == 0); + + spdk_ring_free(thread->messages); + free(thread); +} + +struct spdk_thread * +spdk_thread_create(const char *name, struct spdk_cpuset *cpumask) +{ + struct spdk_thread *thread; + struct spdk_msg *msgs[SPDK_MSG_MEMPOOL_CACHE_SIZE]; + int rc = 0, i; + + thread = calloc(1, sizeof(*thread) + g_ctx_sz); + if (!thread) { + SPDK_ERRLOG("Unable to allocate memory for thread\n"); + return NULL; + } + 
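 /* With no cpumask supplied, negating the zero-initialized set below selects
  * every CPU, i.e. the new thread may be scheduled anywhere. */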
+ if (cpumask) { + spdk_cpuset_copy(&thread->cpumask, cpumask); + } else { + spdk_cpuset_negate(&thread->cpumask); + } + + TAILQ_INIT(&thread->io_channels); + TAILQ_INIT(&thread->active_pollers); + TAILQ_INIT(&thread->timed_pollers); + TAILQ_INIT(&thread->paused_pollers); + SLIST_INIT(&thread->msg_cache); + thread->msg_cache_count = 0; + + thread->tsc_last = spdk_get_ticks(); + + thread->messages = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY); + if (!thread->messages) { + SPDK_ERRLOG("Unable to allocate memory for message ring\n"); + free(thread); + return NULL; + } + + /* Fill the local message pool cache. */ + rc = spdk_mempool_get_bulk(g_spdk_msg_mempool, (void **)msgs, SPDK_MSG_MEMPOOL_CACHE_SIZE); + if (rc == 0) { + /* If we can't populate the cache it's ok. The cache will get filled + * up organically as messages are passed to the thread. */ + for (i = 0; i < SPDK_MSG_MEMPOOL_CACHE_SIZE; i++) { + SLIST_INSERT_HEAD(&thread->msg_cache, msgs[i], link); + thread->msg_cache_count++; + } + } + + if (name) { + snprintf(thread->name, sizeof(thread->name), "%s", name); + } else { + snprintf(thread->name, sizeof(thread->name), "%p", thread); + } + + pthread_mutex_lock(&g_devlist_mutex); + if (g_thread_id == 0) { + SPDK_ERRLOG("Thread ID rolled over. Further thread creation is not allowed.\n"); + pthread_mutex_unlock(&g_devlist_mutex); + _free_thread(thread); + return NULL; + } + thread->id = g_thread_id++; + TAILQ_INSERT_TAIL(&g_threads, thread, tailq); + g_thread_count++; + pthread_mutex_unlock(&g_devlist_mutex); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Allocating new thread (%" PRIu64 ", %s)\n", + thread->id, thread->name); + + if (g_new_thread_fn) { + rc = g_new_thread_fn(thread); + } else if (g_thread_op_supported_fn && g_thread_op_supported_fn(SPDK_THREAD_OP_NEW)) { + rc = g_thread_op_fn(thread, SPDK_THREAD_OP_NEW); + } + + if (rc != 0) { + _free_thread(thread); + return NULL; + } + + thread->state = SPDK_THREAD_STATE_RUNNING; + + return thread; +} + +void +spdk_set_thread(struct spdk_thread *thread) +{ + tls_thread = thread; +} + +static void +thread_exit(struct spdk_thread *thread, uint64_t now) +{ + struct spdk_poller *poller; + struct spdk_io_channel *ch; + + if (now >= thread->exit_timeout_tsc) { + SPDK_ERRLOG("thread %s got timeout, and move it to the exited state forcefully\n", + thread->name); + goto exited; + } + + TAILQ_FOREACH(poller, &thread->active_pollers, tailq) { + if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) { + SPDK_INFOLOG(SPDK_LOG_THREAD, + "thread %s still has active poller %s\n", + thread->name, poller->name); + return; + } + } + + TAILQ_FOREACH(poller, &thread->timed_pollers, tailq) { + if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) { + SPDK_INFOLOG(SPDK_LOG_THREAD, + "thread %s still has active timed poller %s\n", + thread->name, poller->name); + return; + } + } + + TAILQ_FOREACH(poller, &thread->paused_pollers, tailq) { + SPDK_INFOLOG(SPDK_LOG_THREAD, + "thread %s still has paused poller %s\n", + thread->name, poller->name); + return; + } + + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + SPDK_INFOLOG(SPDK_LOG_THREAD, + "thread %s still has channel for io_device %s\n", + thread->name, ch->dev->name); + return; + } + +exited: + thread->state = SPDK_THREAD_STATE_EXITED; +} + +int +spdk_thread_exit(struct spdk_thread *thread) +{ + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Exit thread %s\n", thread->name); + + assert(tls_thread == thread); + + if (thread->state >= SPDK_THREAD_STATE_EXITING) { + SPDK_INFOLOG(SPDK_LOG_THREAD, + "thread %s is 
already exiting\n", + thread->name); + return 0; + } + + thread->exit_timeout_tsc = spdk_get_ticks() + (spdk_get_ticks_hz() * + SPDK_THREAD_EXIT_TIMEOUT_SEC); + thread->state = SPDK_THREAD_STATE_EXITING; + return 0; +} + +bool +spdk_thread_is_exited(struct spdk_thread *thread) +{ + return thread->state == SPDK_THREAD_STATE_EXITED; +} + +void +spdk_thread_destroy(struct spdk_thread *thread) +{ + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Destroy thread %s\n", thread->name); + + assert(thread->state == SPDK_THREAD_STATE_EXITED); + + if (tls_thread == thread) { + tls_thread = NULL; + } + + _free_thread(thread); +} + +void * +spdk_thread_get_ctx(struct spdk_thread *thread) +{ + if (g_ctx_sz > 0) { + return thread->ctx; + } + + return NULL; +} + +struct spdk_cpuset * +spdk_thread_get_cpumask(struct spdk_thread *thread) +{ + return &thread->cpumask; +} + +int +spdk_thread_set_cpumask(struct spdk_cpuset *cpumask) +{ + struct spdk_thread *thread; + + if (!g_thread_op_supported_fn || !g_thread_op_supported_fn(SPDK_THREAD_OP_RESCHED)) { + SPDK_ERRLOG("Framework does not support reschedule operation.\n"); + assert(false); + return -ENOTSUP; + } + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("Called from non-SPDK thread\n"); + assert(false); + return -EINVAL; + } + + spdk_cpuset_copy(&thread->cpumask, cpumask); + + /* Invoke framework's reschedule operation. If this function is called multiple times + * in a single spdk_thread_poll() context, the last cpumask will be used in the + * reschedule operation. + */ + g_thread_op_fn(thread, SPDK_THREAD_OP_RESCHED); + + return 0; +} + +struct spdk_thread * +spdk_thread_get_from_ctx(void *ctx) +{ + if (ctx == NULL) { + assert(false); + return NULL; + } + + assert(g_ctx_sz > 0); + + return SPDK_CONTAINEROF(ctx, struct spdk_thread, ctx); +} + +static inline uint32_t +msg_queue_run_batch(struct spdk_thread *thread, uint32_t max_msgs) +{ + unsigned count, i; + void *messages[SPDK_MSG_BATCH_SIZE]; + +#ifdef DEBUG + /* + * spdk_ring_dequeue() fills messages and returns how many entries it wrote, + * so we will never actually read uninitialized data from events, but just to be sure + * (and to silence a static analyzer false positive), initialize the array to NULL pointers. + */ + memset(messages, 0, sizeof(messages)); +#endif + + if (max_msgs > 0) { + max_msgs = spdk_min(max_msgs, SPDK_MSG_BATCH_SIZE); + } else { + max_msgs = SPDK_MSG_BATCH_SIZE; + } + + count = spdk_ring_dequeue(thread->messages, messages, max_msgs); + if (count == 0) { + return 0; + } + + for (i = 0; i < count; i++) { + struct spdk_msg *msg = messages[i]; + + assert(msg != NULL); + msg->fn(msg->arg); + + if (thread->msg_cache_count < SPDK_MSG_MEMPOOL_CACHE_SIZE) { + /* Insert the messages at the head. We want to re-use the hot + * ones. */ + SLIST_INSERT_HEAD(&thread->msg_cache, msg, link); + thread->msg_cache_count++; + } else { + spdk_mempool_put(g_spdk_msg_mempool, msg); + } + } + + return count; +} + +static void +poller_insert_timer(struct spdk_thread *thread, struct spdk_poller *poller, uint64_t now) +{ + struct spdk_poller *iter; + + poller->next_run_tick = now + poller->period_ticks; + + /* + * Insert poller in the thread's timed_pollers list in sorted order by next scheduled + * run time. 
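+ * The list is kept sorted ascending with the earliest expiration at the head,
+ * so thread_poll() can stop scanning timed pollers at the first entry whose
+ * next_run_tick is still in the future. Iterating in reverse finds the
+ * insertion point quickly in the common case where a freshly rescheduled
+ * poller expires later than most pollers already in the list.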
+ */ + TAILQ_FOREACH_REVERSE(iter, &thread->timed_pollers, timed_pollers_head, tailq) { + if (iter->next_run_tick <= poller->next_run_tick) { + TAILQ_INSERT_AFTER(&thread->timed_pollers, iter, poller, tailq); + return; + } + } + + /* No earlier pollers were found, so this poller must be the new head */ + TAILQ_INSERT_HEAD(&thread->timed_pollers, poller, tailq); +} + +static void +thread_insert_poller(struct spdk_thread *thread, struct spdk_poller *poller) +{ + if (poller->period_ticks) { + poller_insert_timer(thread, poller, spdk_get_ticks()); + } else { + TAILQ_INSERT_TAIL(&thread->active_pollers, poller, tailq); + } +} + +static inline void +thread_update_stats(struct spdk_thread *thread, uint64_t end, + uint64_t start, int rc) +{ + if (rc == 0) { + /* Poller status idle */ + thread->stats.idle_tsc += end - start; + } else if (rc > 0) { + /* Poller status busy */ + thread->stats.busy_tsc += end - start; + } + /* Store end time to use it as start time of the next spdk_thread_poll(). */ + thread->tsc_last = end; +} + +static int +thread_poll(struct spdk_thread *thread, uint32_t max_msgs, uint64_t now) +{ + uint32_t msg_count; + struct spdk_poller *poller, *tmp; + spdk_msg_fn critical_msg; + int rc = 0; + + critical_msg = thread->critical_msg; + if (spdk_unlikely(critical_msg != NULL)) { + critical_msg(NULL); + thread->critical_msg = NULL; + } + + msg_count = msg_queue_run_batch(thread, max_msgs); + if (msg_count) { + rc = 1; + } + + TAILQ_FOREACH_REVERSE_SAFE(poller, &thread->active_pollers, + active_pollers_head, tailq, tmp) { + int poller_rc; + + if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) { + TAILQ_REMOVE(&thread->active_pollers, poller, tailq); + free(poller); + continue; + } else if (poller->state == SPDK_POLLER_STATE_PAUSING) { + TAILQ_REMOVE(&thread->active_pollers, poller, tailq); + TAILQ_INSERT_TAIL(&thread->paused_pollers, poller, tailq); + poller->state = SPDK_POLLER_STATE_PAUSED; + continue; + } + + poller->state = SPDK_POLLER_STATE_RUNNING; + poller_rc = poller->fn(poller->arg); + + poller->run_count++; + if (poller_rc > 0) { + poller->busy_count++; + } + +#ifdef DEBUG + if (poller_rc == -1) { + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Poller %s returned -1\n", poller->name); + } +#endif + + if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) { + TAILQ_REMOVE(&thread->active_pollers, poller, tailq); + free(poller); + } else if (poller->state != SPDK_POLLER_STATE_PAUSED) { + poller->state = SPDK_POLLER_STATE_WAITING; + } + + if (poller_rc > rc) { + rc = poller_rc; + } + } + + TAILQ_FOREACH_SAFE(poller, &thread->timed_pollers, tailq, tmp) { + int timer_rc = 0; + + if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) { + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + free(poller); + continue; + } else if (poller->state == SPDK_POLLER_STATE_PAUSING) { + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + TAILQ_INSERT_TAIL(&thread->paused_pollers, poller, tailq); + poller->state = SPDK_POLLER_STATE_PAUSED; + continue; + } + + if (now < poller->next_run_tick) { + break; + } + + poller->state = SPDK_POLLER_STATE_RUNNING; + timer_rc = poller->fn(poller->arg); + + poller->run_count++; + if (timer_rc > 0) { + poller->busy_count++; + } + +#ifdef DEBUG + if (timer_rc == -1) { + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Timed poller %s returned -1\n", poller->name); + } +#endif + + if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) { + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + free(poller); + } else if (poller->state != SPDK_POLLER_STATE_PAUSED) { + poller->state = 
SPDK_POLLER_STATE_WAITING; + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + poller_insert_timer(thread, poller, now); + } + + if (timer_rc > rc) { + rc = timer_rc; + } + } + + return rc; +} + +int +spdk_thread_poll(struct spdk_thread *thread, uint32_t max_msgs, uint64_t now) +{ + struct spdk_thread *orig_thread; + int rc; + + orig_thread = _get_thread(); + tls_thread = thread; + + if (now == 0) { + now = spdk_get_ticks(); + } + + rc = thread_poll(thread, max_msgs, now); + + if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITING)) { + thread_exit(thread, now); + } + + thread_update_stats(thread, spdk_get_ticks(), now, rc); + + tls_thread = orig_thread; + + return rc; +} + +uint64_t +spdk_thread_next_poller_expiration(struct spdk_thread *thread) +{ + struct spdk_poller *poller; + + poller = TAILQ_FIRST(&thread->timed_pollers); + if (poller) { + return poller->next_run_tick; + } + + return 0; +} + +int +spdk_thread_has_active_pollers(struct spdk_thread *thread) +{ + return !TAILQ_EMPTY(&thread->active_pollers); +} + +static bool +thread_has_unpaused_pollers(struct spdk_thread *thread) +{ + if (TAILQ_EMPTY(&thread->active_pollers) && + TAILQ_EMPTY(&thread->timed_pollers)) { + return false; + } + + return true; +} + +bool +spdk_thread_has_pollers(struct spdk_thread *thread) +{ + if (!thread_has_unpaused_pollers(thread) && + TAILQ_EMPTY(&thread->paused_pollers)) { + return false; + } + + return true; +} + +bool +spdk_thread_is_idle(struct spdk_thread *thread) +{ + if (spdk_ring_count(thread->messages) || + thread_has_unpaused_pollers(thread) || + thread->critical_msg != NULL) { + return false; + } + + return true; +} + +uint32_t +spdk_thread_get_count(void) +{ + /* + * Return cached value of the current thread count. We could acquire the + * lock and iterate through the TAILQ of threads to count them, but that + * count could still be invalidated after we release the lock. 
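+ * The returned value is therefore only a point-in-time snapshot suitable for
+ * reporting; it should not be relied on for synchronization decisions.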
+ */ + return g_thread_count; +} + +struct spdk_thread * +spdk_get_thread(void) +{ + return _get_thread(); +} + +const char * +spdk_thread_get_name(const struct spdk_thread *thread) +{ + return thread->name; +} + +uint64_t +spdk_thread_get_id(const struct spdk_thread *thread) +{ + return thread->id; +} + +struct spdk_thread * +spdk_thread_get_by_id(uint64_t id) +{ + struct spdk_thread *thread; + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(thread, &g_threads, tailq) { + if (thread->id == id) { + pthread_mutex_unlock(&g_devlist_mutex); + + return thread; + } + } + pthread_mutex_unlock(&g_devlist_mutex); + + return NULL; +} + +int +spdk_thread_get_stats(struct spdk_thread_stats *stats) +{ + struct spdk_thread *thread; + + thread = _get_thread(); + if (!thread) { + SPDK_ERRLOG("No thread allocated\n"); + return -EINVAL; + } + + if (stats == NULL) { + return -EINVAL; + } + + *stats = thread->stats; + + return 0; +} + +uint64_t +spdk_thread_get_last_tsc(struct spdk_thread *thread) +{ + return thread->tsc_last; +} + +int +spdk_thread_send_msg(const struct spdk_thread *thread, spdk_msg_fn fn, void *ctx) +{ + struct spdk_thread *local_thread; + struct spdk_msg *msg; + int rc; + + assert(thread != NULL); + + if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITED)) { + SPDK_ERRLOG("Thread %s is marked as exited.\n", thread->name); + return -EIO; + } + + local_thread = _get_thread(); + + msg = NULL; + if (local_thread != NULL) { + if (local_thread->msg_cache_count > 0) { + msg = SLIST_FIRST(&local_thread->msg_cache); + assert(msg != NULL); + SLIST_REMOVE_HEAD(&local_thread->msg_cache, link); + local_thread->msg_cache_count--; + } + } + + if (msg == NULL) { + msg = spdk_mempool_get(g_spdk_msg_mempool); + if (!msg) { + SPDK_ERRLOG("msg could not be allocated\n"); + return -ENOMEM; + } + } + + msg->fn = fn; + msg->arg = ctx; + + rc = spdk_ring_enqueue(thread->messages, (void **)&msg, 1, NULL); + if (rc != 1) { + SPDK_ERRLOG("msg could not be enqueued\n"); + spdk_mempool_put(g_spdk_msg_mempool, msg); + return -EIO; + } + + return 0; +} + +int +spdk_thread_send_critical_msg(struct spdk_thread *thread, spdk_msg_fn fn) +{ + spdk_msg_fn expected = NULL; + + if (__atomic_compare_exchange_n(&thread->critical_msg, &expected, fn, false, __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST)) { + return 0; + } + + return -EIO; +} + +static struct spdk_poller * +poller_register(spdk_poller_fn fn, + void *arg, + uint64_t period_microseconds, + const char *name) +{ + struct spdk_thread *thread; + struct spdk_poller *poller; + uint64_t quotient, remainder, ticks; + + thread = spdk_get_thread(); + if (!thread) { + assert(false); + return NULL; + } + + if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITED)) { + SPDK_ERRLOG("thread %s is marked as exited\n", thread->name); + return NULL; + } + + poller = calloc(1, sizeof(*poller)); + if (poller == NULL) { + SPDK_ERRLOG("Poller memory allocation failed\n"); + return NULL; + } + + if (name) { + snprintf(poller->name, sizeof(poller->name), "%s", name); + } else { + snprintf(poller->name, sizeof(poller->name), "%p", fn); + } + + poller->state = SPDK_POLLER_STATE_WAITING; + poller->fn = fn; + poller->arg = arg; + poller->thread = thread; + + if (period_microseconds) { + quotient = period_microseconds / SPDK_SEC_TO_USEC; + remainder = period_microseconds % SPDK_SEC_TO_USEC; + ticks = spdk_get_ticks_hz(); + + poller->period_ticks = ticks * quotient + (ticks * remainder) / SPDK_SEC_TO_USEC; + } else { + poller->period_ticks = 0; + } + + thread_insert_poller(thread, 
poller); + + return poller; +} + +struct spdk_poller * +spdk_poller_register(spdk_poller_fn fn, + void *arg, + uint64_t period_microseconds) +{ + return poller_register(fn, arg, period_microseconds, NULL); +} + +struct spdk_poller * +spdk_poller_register_named(spdk_poller_fn fn, + void *arg, + uint64_t period_microseconds, + const char *name) +{ + return poller_register(fn, arg, period_microseconds, name); +} + +void +spdk_poller_unregister(struct spdk_poller **ppoller) +{ + struct spdk_thread *thread; + struct spdk_poller *poller; + + poller = *ppoller; + if (poller == NULL) { + return; + } + + *ppoller = NULL; + + thread = spdk_get_thread(); + if (!thread) { + assert(false); + return; + } + + if (poller->thread != thread) { + SPDK_ERRLOG("different from the thread that called spdk_poller_register()\n"); + assert(false); + return; + } + + /* If the poller was paused, put it on the active_pollers list so that + * its unregistration can be processed by spdk_thread_poll(). + */ + if (poller->state == SPDK_POLLER_STATE_PAUSED) { + TAILQ_REMOVE(&thread->paused_pollers, poller, tailq); + TAILQ_INSERT_TAIL(&thread->active_pollers, poller, tailq); + poller->period_ticks = 0; + } + + /* Simply set the state to unregistered. The poller will get cleaned up + * in a subsequent call to spdk_thread_poll(). + */ + poller->state = SPDK_POLLER_STATE_UNREGISTERED; +} + +void +spdk_poller_pause(struct spdk_poller *poller) +{ + struct spdk_thread *thread; + + if (poller->state == SPDK_POLLER_STATE_PAUSED || + poller->state == SPDK_POLLER_STATE_PAUSING) { + return; + } + + thread = spdk_get_thread(); + if (!thread) { + assert(false); + return; + } + + /* If a poller is paused from within itself, we can immediately move it + * on the paused_pollers list. Otherwise we just set its state to + * SPDK_POLLER_STATE_PAUSING and let spdk_thread_poll() move it. It + * allows a poller to be paused from another one's context without + * breaking the TAILQ_FOREACH_REVERSE_SAFE iteration. + */ + if (poller->state != SPDK_POLLER_STATE_RUNNING) { + poller->state = SPDK_POLLER_STATE_PAUSING; + } else { + if (poller->period_ticks > 0) { + TAILQ_REMOVE(&thread->timed_pollers, poller, tailq); + } else { + TAILQ_REMOVE(&thread->active_pollers, poller, tailq); + } + + TAILQ_INSERT_TAIL(&thread->paused_pollers, poller, tailq); + poller->state = SPDK_POLLER_STATE_PAUSED; + } +} + +void +spdk_poller_resume(struct spdk_poller *poller) +{ + struct spdk_thread *thread; + + if (poller->state != SPDK_POLLER_STATE_PAUSED && + poller->state != SPDK_POLLER_STATE_PAUSING) { + return; + } + + thread = spdk_get_thread(); + if (!thread) { + assert(false); + return; + } + + /* If a poller is paused it has to be removed from the paused pollers + * list and put on the active / timer list depending on its + * period_ticks. If a poller is still in the process of being paused, + * we just need to flip its state back to waiting, as it's already on + * the appropriate list. 
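+ * Typical usage (illustrative sketch only): a module that created a poller
+ * with spdk_poller_register() may call spdk_poller_pause(poller) around a
+ * temporarily disruptive operation and spdk_poller_resume(poller) once it
+ * has completed. Both calls are expected to be made from the poller's
+ * owning SPDK thread.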
+ */ + if (poller->state == SPDK_POLLER_STATE_PAUSED) { + TAILQ_REMOVE(&thread->paused_pollers, poller, tailq); + thread_insert_poller(thread, poller); + } + + poller->state = SPDK_POLLER_STATE_WAITING; +} + +const char * +spdk_poller_state_str(enum spdk_poller_state state) +{ + switch (state) { + case SPDK_POLLER_STATE_WAITING: + return "waiting"; + case SPDK_POLLER_STATE_RUNNING: + return "running"; + case SPDK_POLLER_STATE_UNREGISTERED: + return "unregistered"; + case SPDK_POLLER_STATE_PAUSING: + return "pausing"; + case SPDK_POLLER_STATE_PAUSED: + return "paused"; + default: + return NULL; + } +} + +struct call_thread { + struct spdk_thread *cur_thread; + spdk_msg_fn fn; + void *ctx; + + struct spdk_thread *orig_thread; + spdk_msg_fn cpl; +}; + +static void +_on_thread(void *ctx) +{ + struct call_thread *ct = ctx; + int rc __attribute__((unused)); + + ct->fn(ct->ctx); + + pthread_mutex_lock(&g_devlist_mutex); + ct->cur_thread = TAILQ_NEXT(ct->cur_thread, tailq); + pthread_mutex_unlock(&g_devlist_mutex); + + if (!ct->cur_thread) { + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Completed thread iteration\n"); + + rc = spdk_thread_send_msg(ct->orig_thread, ct->cpl, ct->ctx); + free(ctx); + } else { + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Continuing thread iteration to %s\n", + ct->cur_thread->name); + + rc = spdk_thread_send_msg(ct->cur_thread, _on_thread, ctx); + } + assert(rc == 0); +} + +void +spdk_for_each_thread(spdk_msg_fn fn, void *ctx, spdk_msg_fn cpl) +{ + struct call_thread *ct; + struct spdk_thread *thread; + int rc __attribute__((unused)); + + ct = calloc(1, sizeof(*ct)); + if (!ct) { + SPDK_ERRLOG("Unable to perform thread iteration\n"); + cpl(ctx); + return; + } + + ct->fn = fn; + ct->ctx = ctx; + ct->cpl = cpl; + + thread = _get_thread(); + if (!thread) { + SPDK_ERRLOG("No thread allocated\n"); + free(ct); + cpl(ctx); + return; + } + ct->orig_thread = thread; + + pthread_mutex_lock(&g_devlist_mutex); + ct->cur_thread = TAILQ_FIRST(&g_threads); + pthread_mutex_unlock(&g_devlist_mutex); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Starting thread iteration from %s\n", + ct->orig_thread->name); + + rc = spdk_thread_send_msg(ct->cur_thread, _on_thread, ct); + assert(rc == 0); +} + +void +spdk_io_device_register(void *io_device, spdk_io_channel_create_cb create_cb, + spdk_io_channel_destroy_cb destroy_cb, uint32_t ctx_size, + const char *name) +{ + struct io_device *dev, *tmp; + struct spdk_thread *thread; + + assert(io_device != NULL); + assert(create_cb != NULL); + assert(destroy_cb != NULL); + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("called from non-SPDK thread\n"); + assert(false); + return; + } + + dev = calloc(1, sizeof(struct io_device)); + if (dev == NULL) { + SPDK_ERRLOG("could not allocate io_device\n"); + return; + } + + dev->io_device = io_device; + if (name) { + snprintf(dev->name, sizeof(dev->name), "%s", name); + } else { + snprintf(dev->name, sizeof(dev->name), "%p", dev); + } + dev->create_cb = create_cb; + dev->destroy_cb = destroy_cb; + dev->unregister_cb = NULL; + dev->ctx_size = ctx_size; + dev->for_each_count = 0; + dev->unregistered = false; + dev->refcnt = 0; + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Registering io_device %s (%p) on thread %s\n", + dev->name, dev->io_device, thread->name); + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(tmp, &g_io_devices, tailq) { + if (tmp->io_device == io_device) { + SPDK_ERRLOG("io_device %p already registered (old:%s new:%s)\n", + io_device, tmp->name, dev->name); + free(dev); + pthread_mutex_unlock(&g_devlist_mutex); + 
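+ /* Duplicate registration is only logged: the dev allocated above has
+  * already been freed and the original registration stays in effect, so
+  * the caller gets no error indication from this void function.
+  */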
return; + } + } + TAILQ_INSERT_TAIL(&g_io_devices, dev, tailq); + pthread_mutex_unlock(&g_devlist_mutex); +} + +static void +_finish_unregister(void *arg) +{ + struct io_device *dev = arg; + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Finishing unregistration of io_device %s (%p) on thread %s\n", + dev->name, dev->io_device, dev->unregister_thread->name); + + dev->unregister_cb(dev->io_device); + free(dev); +} + +static void +io_device_free(struct io_device *dev) +{ + int rc __attribute__((unused)); + + if (dev->unregister_cb == NULL) { + free(dev); + } else { + assert(dev->unregister_thread != NULL); + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "io_device %s (%p) needs to unregister from thread %s\n", + dev->name, dev->io_device, dev->unregister_thread->name); + rc = spdk_thread_send_msg(dev->unregister_thread, _finish_unregister, dev); + assert(rc == 0); + } +} + +void +spdk_io_device_unregister(void *io_device, spdk_io_device_unregister_cb unregister_cb) +{ + struct io_device *dev; + uint32_t refcnt; + struct spdk_thread *thread; + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("called from non-SPDK thread\n"); + assert(false); + return; + } + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(dev, &g_io_devices, tailq) { + if (dev->io_device == io_device) { + break; + } + } + + if (!dev) { + SPDK_ERRLOG("io_device %p not found\n", io_device); + assert(false); + pthread_mutex_unlock(&g_devlist_mutex); + return; + } + + if (dev->for_each_count > 0) { + SPDK_ERRLOG("io_device %s (%p) has %u for_each calls outstanding\n", + dev->name, io_device, dev->for_each_count); + pthread_mutex_unlock(&g_devlist_mutex); + return; + } + + dev->unregister_cb = unregister_cb; + dev->unregistered = true; + TAILQ_REMOVE(&g_io_devices, dev, tailq); + refcnt = dev->refcnt; + dev->unregister_thread = thread; + pthread_mutex_unlock(&g_devlist_mutex); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Unregistering io_device %s (%p) from thread %s\n", + dev->name, dev->io_device, thread->name); + + if (refcnt > 0) { + /* defer deletion */ + return; + } + + io_device_free(dev); +} + +const char * +spdk_io_device_get_name(struct io_device *dev) +{ + return dev->name; +} + +struct spdk_io_channel * +spdk_get_io_channel(void *io_device) +{ + struct spdk_io_channel *ch; + struct spdk_thread *thread; + struct io_device *dev; + int rc; + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(dev, &g_io_devices, tailq) { + if (dev->io_device == io_device) { + break; + } + } + if (dev == NULL) { + SPDK_ERRLOG("could not find io_device %p\n", io_device); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + thread = _get_thread(); + if (!thread) { + SPDK_ERRLOG("No thread allocated\n"); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITED)) { + SPDK_ERRLOG("Thread %s is marked as exited\n", thread->name); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + if (ch->dev == dev) { + ch->ref++; + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Get io_channel %p for io_device %s (%p) on thread %s refcnt %u\n", + ch, dev->name, dev->io_device, thread->name, ch->ref); + + /* + * An I/O channel already exists for this device on this + * thread, so return it. 
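+ * Every successful spdk_get_io_channel() call on a given thread bumps
+ * ch->ref, so each call has to be balanced by an spdk_put_io_channel() on
+ * the same thread before the channel (and, once unregistered, the
+ * io_device) can be torn down. Illustrative pattern (sketch only):
+ *
+ *     ch = spdk_get_io_channel(io_device);
+ *     ... use spdk_io_channel_get_ctx(ch) to submit work ...
+ *     spdk_put_io_channel(ch);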
+ */ + pthread_mutex_unlock(&g_devlist_mutex); + return ch; + } + } + + ch = calloc(1, sizeof(*ch) + dev->ctx_size); + if (ch == NULL) { + SPDK_ERRLOG("could not calloc spdk_io_channel\n"); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + ch->dev = dev; + ch->destroy_cb = dev->destroy_cb; + ch->thread = thread; + ch->ref = 1; + ch->destroy_ref = 0; + TAILQ_INSERT_TAIL(&thread->io_channels, ch, tailq); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Get io_channel %p for io_device %s (%p) on thread %s refcnt %u\n", + ch, dev->name, dev->io_device, thread->name, ch->ref); + + dev->refcnt++; + + pthread_mutex_unlock(&g_devlist_mutex); + + rc = dev->create_cb(io_device, (uint8_t *)ch + sizeof(*ch)); + if (rc != 0) { + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_REMOVE(&ch->thread->io_channels, ch, tailq); + dev->refcnt--; + free(ch); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + return ch; +} + +static void +put_io_channel(void *arg) +{ + struct spdk_io_channel *ch = arg; + bool do_remove_dev = true; + struct spdk_thread *thread; + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("called from non-SPDK thread\n"); + assert(false); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, + "Releasing io_channel %p for io_device %s (%p) on thread %s\n", + ch, ch->dev->name, ch->dev->io_device, thread->name); + + assert(ch->thread == thread); + + ch->destroy_ref--; + + if (ch->ref > 0 || ch->destroy_ref > 0) { + /* + * Another reference to the associated io_device was requested + * after this message was sent but before it had a chance to + * execute. + */ + return; + } + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_REMOVE(&ch->thread->io_channels, ch, tailq); + pthread_mutex_unlock(&g_devlist_mutex); + + /* Don't hold the devlist mutex while the destroy_cb is called. 
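+ * The callback may re-enter this library (for example by acquiring new
+ * channels or unregistering io_devices) and take g_devlist_mutex itself,
+ * which would deadlock if the mutex were still held here.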
*/ + ch->destroy_cb(ch->dev->io_device, spdk_io_channel_get_ctx(ch)); + + pthread_mutex_lock(&g_devlist_mutex); + ch->dev->refcnt--; + + if (!ch->dev->unregistered) { + do_remove_dev = false; + } + + if (ch->dev->refcnt > 0) { + do_remove_dev = false; + } + + pthread_mutex_unlock(&g_devlist_mutex); + + if (do_remove_dev) { + io_device_free(ch->dev); + } + free(ch); +} + +void +spdk_put_io_channel(struct spdk_io_channel *ch) +{ + struct spdk_thread *thread; + int rc __attribute__((unused)); + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("called from non-SPDK thread\n"); + assert(false); + return; + } + + if (ch->thread != thread) { + SPDK_ERRLOG("different from the thread that called get_io_channel()\n"); + assert(false); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, + "Putting io_channel %p for io_device %s (%p) on thread %s refcnt %u\n", + ch, ch->dev->name, ch->dev->io_device, thread->name, ch->ref); + + ch->ref--; + + if (ch->ref == 0) { + ch->destroy_ref++; + rc = spdk_thread_send_msg(thread, put_io_channel, ch); + assert(rc == 0); + } +} + +struct spdk_io_channel * +spdk_io_channel_from_ctx(void *ctx) +{ + return (struct spdk_io_channel *)((uint8_t *)ctx - sizeof(struct spdk_io_channel)); +} + +struct spdk_thread * +spdk_io_channel_get_thread(struct spdk_io_channel *ch) +{ + return ch->thread; +} + +struct spdk_io_channel_iter { + void *io_device; + struct io_device *dev; + spdk_channel_msg fn; + int status; + void *ctx; + struct spdk_io_channel *ch; + + struct spdk_thread *cur_thread; + + struct spdk_thread *orig_thread; + spdk_channel_for_each_cpl cpl; +}; + +void * +spdk_io_channel_iter_get_io_device(struct spdk_io_channel_iter *i) +{ + return i->io_device; +} + +struct spdk_io_channel * +spdk_io_channel_iter_get_channel(struct spdk_io_channel_iter *i) +{ + return i->ch; +} + +void * +spdk_io_channel_iter_get_ctx(struct spdk_io_channel_iter *i) +{ + return i->ctx; +} + +static void +_call_completion(void *ctx) +{ + struct spdk_io_channel_iter *i = ctx; + + if (i->cpl != NULL) { + i->cpl(i, i->status); + } + free(i); +} + +static void +_call_channel(void *ctx) +{ + struct spdk_io_channel_iter *i = ctx; + struct spdk_io_channel *ch; + + /* + * It is possible that the channel was deleted before this + * message had a chance to execute. If so, skip calling + * the fn() on this thread. 
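+ * The iteration is not aborted in that case: spdk_for_each_channel_continue()
+ * is still called below, so the walk simply moves on to the next thread.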
+ */ + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(ch, &i->cur_thread->io_channels, tailq) { + if (ch->dev->io_device == i->io_device) { + break; + } + } + pthread_mutex_unlock(&g_devlist_mutex); + + if (ch) { + i->fn(i); + } else { + spdk_for_each_channel_continue(i, 0); + } +} + +void +spdk_for_each_channel(void *io_device, spdk_channel_msg fn, void *ctx, + spdk_channel_for_each_cpl cpl) +{ + struct spdk_thread *thread; + struct spdk_io_channel *ch; + struct spdk_io_channel_iter *i; + int rc __attribute__((unused)); + + i = calloc(1, sizeof(*i)); + if (!i) { + SPDK_ERRLOG("Unable to allocate iterator\n"); + return; + } + + i->io_device = io_device; + i->fn = fn; + i->ctx = ctx; + i->cpl = cpl; + + pthread_mutex_lock(&g_devlist_mutex); + i->orig_thread = _get_thread(); + + TAILQ_FOREACH(thread, &g_threads, tailq) { + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + if (ch->dev->io_device == io_device) { + ch->dev->for_each_count++; + i->dev = ch->dev; + i->cur_thread = thread; + i->ch = ch; + pthread_mutex_unlock(&g_devlist_mutex); + rc = spdk_thread_send_msg(thread, _call_channel, i); + assert(rc == 0); + return; + } + } + } + + pthread_mutex_unlock(&g_devlist_mutex); + + rc = spdk_thread_send_msg(i->orig_thread, _call_completion, i); + assert(rc == 0); +} + +void +spdk_for_each_channel_continue(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_thread *thread; + struct spdk_io_channel *ch; + int rc __attribute__((unused)); + + assert(i->cur_thread == spdk_get_thread()); + + i->status = status; + + pthread_mutex_lock(&g_devlist_mutex); + if (status) { + goto end; + } + thread = TAILQ_NEXT(i->cur_thread, tailq); + while (thread) { + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + if (ch->dev->io_device == i->io_device) { + i->cur_thread = thread; + i->ch = ch; + pthread_mutex_unlock(&g_devlist_mutex); + rc = spdk_thread_send_msg(thread, _call_channel, i); + assert(rc == 0); + return; + } + } + thread = TAILQ_NEXT(thread, tailq); + } + +end: + i->dev->for_each_count--; + i->ch = NULL; + pthread_mutex_unlock(&g_devlist_mutex); + + rc = spdk_thread_send_msg(i->orig_thread, _call_completion, i); + assert(rc == 0); +} + + +SPDK_LOG_REGISTER_COMPONENT("thread", SPDK_LOG_THREAD) diff --git a/src/spdk/lib/trace/Makefile b/src/spdk/lib/trace/Makefile new file mode 100644 index 000000000..9102c320a --- /dev/null +++ b/src/spdk/lib/trace/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = trace.c trace_flags.c trace_rpc.c +LIBNAME = trace + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_trace.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/trace/spdk_trace.map b/src/spdk/lib/trace/spdk_trace.map new file mode 100644 index 000000000..14a03b337 --- /dev/null +++ b/src/spdk/lib/trace/spdk_trace.map @@ -0,0 +1,29 @@ +{ + global: + + # public functions + _spdk_trace_record; + spdk_trace_get_tpoint_mask; + spdk_trace_set_tpoints; + spdk_trace_clear_tpoints; + spdk_trace_get_tpoint_group_mask; + spdk_trace_set_tpoint_group_mask; + spdk_trace_clear_tpoint_group_mask; + spdk_trace_init; + spdk_trace_cleanup; + spdk_trace_flags_init; + spdk_trace_register_owner; + spdk_trace_register_object; + spdk_trace_register_description; + spdk_trace_get_first_register_fn; + spdk_trace_get_next_register_fn; + spdk_trace_enable_tpoint_group; + spdk_trace_disable_tpoint_group; + spdk_trace_mask_usage; + spdk_trace_add_register_fn; + + # public variables + g_trace_histories; + + local: *; +}; diff --git a/src/spdk/lib/trace/trace.c b/src/spdk/lib/trace/trace.c new file mode 100644 index 000000000..621c52aae --- /dev/null +++ b/src/spdk/lib/trace/trace.c @@ -0,0 +1,201 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" +#include "spdk/barrier.h" +#include "spdk/log.h" + +static int g_trace_fd = -1; +static char g_shm_name[64]; + +struct spdk_trace_histories *g_trace_histories; + +void +_spdk_trace_record(uint64_t tsc, uint16_t tpoint_id, uint16_t poller_id, uint32_t size, + uint64_t object_id, uint64_t arg1) +{ + struct spdk_trace_history *lcore_history; + struct spdk_trace_entry *next_entry; + unsigned lcore; + uint64_t next_circular_entry; + + lcore = spdk_env_get_current_core(); + if (lcore >= SPDK_TRACE_MAX_LCORE) { + return; + } + + lcore_history = spdk_get_per_lcore_history(g_trace_histories, lcore); + if (tsc == 0) { + tsc = spdk_get_ticks(); + } + + lcore_history->tpoint_count[tpoint_id]++; + + /* Get next entry index in the circular buffer */ + next_circular_entry = lcore_history->next_entry & (lcore_history->num_entries - 1); + next_entry = &lcore_history->entries[next_circular_entry]; + next_entry->tsc = tsc; + next_entry->tpoint_id = tpoint_id; + next_entry->poller_id = poller_id; + next_entry->size = size; + next_entry->object_id = object_id; + next_entry->arg1 = arg1; + + /* Ensure all elements of the trace entry are visible to outside trace tools */ + spdk_smp_wmb(); + lcore_history->next_entry++; +} + +int +spdk_trace_init(const char *shm_name, uint64_t num_entries) +{ + int i = 0; + int histories_size; + uint64_t lcore_offsets[SPDK_TRACE_MAX_LCORE + 1]; + + /* 0 entries requested - skip trace initialization */ + if (num_entries == 0) { + return 0; + } + + lcore_offsets[0] = sizeof(struct spdk_trace_flags); + for (i = 1; i < (int)SPDK_COUNTOF(lcore_offsets); i++) { + lcore_offsets[i] = spdk_get_trace_history_size(num_entries) + lcore_offsets[i - 1]; + } + histories_size = lcore_offsets[SPDK_TRACE_MAX_LCORE]; + + snprintf(g_shm_name, sizeof(g_shm_name), "%s", shm_name); + + g_trace_fd = shm_open(shm_name, O_RDWR | O_CREAT, 0600); + if (g_trace_fd == -1) { + SPDK_ERRLOG("could not shm_open spdk_trace\n"); + SPDK_ERRLOG("errno=%d %s\n", errno, spdk_strerror(errno)); + return 1; + } + + if (ftruncate(g_trace_fd, histories_size) != 0) { + SPDK_ERRLOG("could not truncate shm\n"); + goto trace_init_err; + } + + g_trace_histories = mmap(NULL, histories_size, PROT_READ | PROT_WRITE, + MAP_SHARED, g_trace_fd, 0); + if (g_trace_histories == MAP_FAILED) { + SPDK_ERRLOG("could not mmap shm\n"); + goto trace_init_err; + } + + /* TODO: On FreeBSD, mlock on shm_open'd memory doesn't seem to work. Docs say that kern.ipc.shm_use_phys=1 + * should allow it, but forcing that doesn't seem to work either. So for now just skip mlock on FreeBSD + * altogether. 
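+ * The mlock keeps the trace history resident so that recording an entry from
+ * a hot code path is not slowed down by page faults.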
+ */ +#if defined(__linux__) + if (mlock(g_trace_histories, histories_size) != 0) { + SPDK_ERRLOG("Could not mlock shm for tracing - %s.\n", spdk_strerror(errno)); + if (errno == ENOMEM) { + SPDK_ERRLOG("Check /dev/shm for old tracing files that can be deleted.\n"); + } + goto trace_init_err; + } +#endif + + memset(g_trace_histories, 0, histories_size); + + g_trace_flags = &g_trace_histories->flags; + + g_trace_flags->tsc_rate = spdk_get_ticks_hz(); + + for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) { + struct spdk_trace_history *lcore_history; + + g_trace_flags->lcore_history_offsets[i] = lcore_offsets[i]; + lcore_history = spdk_get_per_lcore_history(g_trace_histories, i); + lcore_history->lcore = i; + lcore_history->num_entries = num_entries; + } + g_trace_flags->lcore_history_offsets[SPDK_TRACE_MAX_LCORE] = lcore_offsets[SPDK_TRACE_MAX_LCORE]; + + spdk_trace_flags_init(); + + return 0; + +trace_init_err: + if (g_trace_histories != MAP_FAILED) { + munmap(g_trace_histories, histories_size); + } + close(g_trace_fd); + g_trace_fd = -1; + shm_unlink(shm_name); + g_trace_histories = NULL; + + return 1; + +} + +void +spdk_trace_cleanup(void) +{ + bool unlink; + int i; + struct spdk_trace_history *lcore_history; + + if (g_trace_histories == NULL) { + return; + } + + /* + * Only unlink the shm if there were no trace_entry recorded. This ensures the file + * can be used after this process exits/crashes for debugging. + * Note that we have to calculate this value before g_trace_histories gets unmapped. + */ + for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) { + lcore_history = spdk_get_per_lcore_history(g_trace_histories, i); + unlink = lcore_history->entries[0].tsc == 0; + if (!unlink) { + break; + } + } + + munmap(g_trace_histories, sizeof(struct spdk_trace_histories)); + g_trace_histories = NULL; + close(g_trace_fd); + + if (unlink) { + shm_unlink(g_shm_name); + } +} diff --git a/src/spdk/lib/trace/trace_flags.c b/src/spdk/lib/trace/trace_flags.c new file mode 100644 index 000000000..615afe355 --- /dev/null +++ b/src/spdk/lib/trace/trace_flags.c @@ -0,0 +1,323 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/trace.h" +#include "spdk/log.h" +#include "spdk_internal/log.h" + +struct spdk_trace_flags *g_trace_flags = NULL; +static struct spdk_trace_register_fn *g_reg_fn_head = NULL; + +SPDK_LOG_REGISTER_COMPONENT("trace", SPDK_LOG_TRACE) + +uint64_t +spdk_trace_get_tpoint_mask(uint32_t group_id) +{ + if (group_id >= SPDK_TRACE_MAX_GROUP_ID) { + SPDK_ERRLOG("invalid group ID %d\n", group_id); + return 0ULL; + } + + return g_trace_flags->tpoint_mask[group_id]; +} + +void +spdk_trace_set_tpoints(uint32_t group_id, uint64_t tpoint_mask) +{ + if (group_id >= SPDK_TRACE_MAX_GROUP_ID) { + SPDK_ERRLOG("invalid group ID %d\n", group_id); + return; + } + + g_trace_flags->tpoint_mask[group_id] |= tpoint_mask; +} + +void +spdk_trace_clear_tpoints(uint32_t group_id, uint64_t tpoint_mask) +{ + if (group_id >= SPDK_TRACE_MAX_GROUP_ID) { + SPDK_ERRLOG("invalid group ID %d\n", group_id); + return; + } + + g_trace_flags->tpoint_mask[group_id] &= ~tpoint_mask; +} + +uint64_t +spdk_trace_get_tpoint_group_mask(void) +{ + uint64_t mask = 0x0; + int i; + + for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) { + if (spdk_trace_get_tpoint_mask(i) != 0) { + mask |= (1ULL << i); + } + } + + return mask; +} + +void +spdk_trace_set_tpoint_group_mask(uint64_t tpoint_group_mask) +{ + int i; + + for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) { + if (tpoint_group_mask & (1ULL << i)) { + spdk_trace_set_tpoints(i, -1ULL); + } + } +} + +void +spdk_trace_clear_tpoint_group_mask(uint64_t tpoint_group_mask) +{ + int i; + + for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) { + if (tpoint_group_mask & (1ULL << i)) { + spdk_trace_clear_tpoints(i, -1ULL); + } + } +} + +struct spdk_trace_register_fn * +spdk_trace_get_first_register_fn(void) +{ + return g_reg_fn_head; +} + +struct spdk_trace_register_fn * +spdk_trace_get_next_register_fn(struct spdk_trace_register_fn *register_fn) +{ + return register_fn->next; +} + +static uint64_t +trace_create_tpoint_group_mask(const char *group_name) +{ + uint64_t tpoint_group_mask = 0; + struct spdk_trace_register_fn *register_fn; + + register_fn = spdk_trace_get_first_register_fn(); + if (strcmp(group_name, "all") == 0) { + while (register_fn) { + tpoint_group_mask |= (1UL << register_fn->tgroup_id); + + register_fn = spdk_trace_get_next_register_fn(register_fn); + } + } else { + while (register_fn) { + if (strcmp(group_name, register_fn->name) == 0) { + break; + } + + register_fn = spdk_trace_get_next_register_fn(register_fn); + } + + if (register_fn != NULL) { + tpoint_group_mask |= (1UL << register_fn->tgroup_id); + } + } + + return tpoint_group_mask; +} + +int +spdk_trace_enable_tpoint_group(const char *group_name) +{ + uint64_t tpoint_group_mask = 0; + + tpoint_group_mask = trace_create_tpoint_group_mask(group_name); + if (tpoint_group_mask == 0) { + return -1; + } + + spdk_trace_set_tpoint_group_mask(tpoint_group_mask); + return 0; +} + +int +spdk_trace_disable_tpoint_group(const char *group_name) +{ + 
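+ /* Mirrors spdk_trace_enable_tpoint_group(): resolve the group name (or "all")
+  * into a tpoint group mask and clear, rather than set, the matching tpoints.
+  */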
uint64_t tpoint_group_mask = 0; + + tpoint_group_mask = trace_create_tpoint_group_mask(group_name); + if (tpoint_group_mask == 0) { + return -1; + } + + spdk_trace_clear_tpoint_group_mask(tpoint_group_mask); + return 0; +} + +void +spdk_trace_mask_usage(FILE *f, const char *tmask_arg) +{ + struct spdk_trace_register_fn *register_fn; + + fprintf(f, " %s, --tpoint-group-mask <mask>\n", tmask_arg); + fprintf(f, " tracepoint group mask for spdk trace buffers (default 0x0"); + + register_fn = g_reg_fn_head; + while (register_fn) { + fprintf(f, ", %s 0x%x", register_fn->name, 1 << register_fn->tgroup_id); + register_fn = register_fn->next; + } + + fprintf(f, ", all 0xffff)\n"); +} + +void +spdk_trace_register_owner(uint8_t type, char id_prefix) +{ + struct spdk_trace_owner *owner; + + assert(type != OWNER_NONE); + + /* 'owner' has 256 entries and since 'type' is a uint8_t, it + * can't overrun the array. + */ + owner = &g_trace_flags->owner[type]; + assert(owner->type == 0); + + owner->type = type; + owner->id_prefix = id_prefix; +} + +void +spdk_trace_register_object(uint8_t type, char id_prefix) +{ + struct spdk_trace_object *object; + + assert(type != OBJECT_NONE); + + /* 'object' has 256 entries and since 'type' is a uint8_t, it + * can't overrun the array. + */ + object = &g_trace_flags->object[type]; + assert(object->type == 0); + + object->type = type; + object->id_prefix = id_prefix; +} + +void +spdk_trace_register_description(const char *name, uint16_t tpoint_id, uint8_t owner_type, + uint8_t object_type, uint8_t new_object, + uint8_t arg1_type, const char *arg1_name) +{ + struct spdk_trace_tpoint *tpoint; + + assert(tpoint_id != 0); + assert(tpoint_id < SPDK_TRACE_MAX_TPOINT_ID); + + if (strnlen(name, sizeof(tpoint->name)) == sizeof(tpoint->name)) { + SPDK_ERRLOG("name (%s) too long\n", name); + } + + tpoint = &g_trace_flags->tpoint[tpoint_id]; + assert(tpoint->tpoint_id == 0); + + snprintf(tpoint->name, sizeof(tpoint->name), "%s", name); + tpoint->tpoint_id = tpoint_id; + tpoint->object_type = object_type; + tpoint->owner_type = owner_type; + tpoint->new_object = new_object; + tpoint->arg1_type = arg1_type; + snprintf(tpoint->arg1_name, sizeof(tpoint->arg1_name), "%s", arg1_name); +} + +void +spdk_trace_add_register_fn(struct spdk_trace_register_fn *reg_fn) +{ + struct spdk_trace_register_fn *_reg_fn; + + if (reg_fn->name == NULL) { + SPDK_ERRLOG("missing name for registering spdk trace tpoint group\n"); + assert(false); + return; + } + + if (strcmp(reg_fn->name, "all") == 0) { + SPDK_ERRLOG("illegal name (%s) for tpoint group\n", reg_fn->name); + assert(false); + return; + } + + /* Ensure that no trace point group IDs and names are ever duplicated */ + for (_reg_fn = g_reg_fn_head; _reg_fn; _reg_fn = _reg_fn->next) { + if (reg_fn->tgroup_id == _reg_fn->tgroup_id) { + SPDK_ERRLOG("duplicate tgroup_id (%d) with %s\n", _reg_fn->tgroup_id, _reg_fn->name); + assert(false); + return; + } + + if (strcmp(reg_fn->name, _reg_fn->name) == 0) { + SPDK_ERRLOG("duplicate name with %s\n", _reg_fn->name); + assert(false); + return; + } + } + + /* Arrange trace registration in order on tgroup_id */ + if (g_reg_fn_head == NULL || reg_fn->tgroup_id < g_reg_fn_head->tgroup_id) { + reg_fn->next = g_reg_fn_head; + g_reg_fn_head = reg_fn; + return; + } + + for (_reg_fn = g_reg_fn_head; _reg_fn; _reg_fn = _reg_fn->next) { + if (_reg_fn->next == NULL || reg_fn->tgroup_id < _reg_fn->next->tgroup_id) { + reg_fn->next = _reg_fn->next; + _reg_fn->next = reg_fn; + return; + } + } +} + +void 
+spdk_trace_flags_init(void) +{ + struct spdk_trace_register_fn *reg_fn; + + reg_fn = g_reg_fn_head; + while (reg_fn) { + reg_fn->reg_fn(); + reg_fn = reg_fn->next; + } +} diff --git a/src/spdk/lib/trace/trace_rpc.c b/src/spdk/lib/trace/trace_rpc.c new file mode 100644 index 000000000..90dbfbc60 --- /dev/null +++ b/src/spdk/lib/trace/trace_rpc.c @@ -0,0 +1,170 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/trace.h" +#include "spdk_internal/log.h" + +struct rpc_tpoint_group { + char *name; +}; + +static void +free_rpc_tpoint_group(struct rpc_tpoint_group *p) +{ + free(p->name); +} + +static const struct spdk_json_object_decoder rpc_tpoint_group_decoders[] = { + {"name", offsetof(struct rpc_tpoint_group, name), spdk_json_decode_string}, +}; + +static void +rpc_trace_enable_tpoint_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_tpoint_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_tpoint_group_decoders, + SPDK_COUNTOF(rpc_tpoint_group_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_TRACE, "spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.name == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_TRACE, "flag was NULL\n"); + goto invalid; + } + + if (spdk_trace_enable_tpoint_group(req.name)) { + goto invalid; + } + + free_rpc_tpoint_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_tpoint_group(&req); +} +SPDK_RPC_REGISTER("trace_enable_tpoint_group", rpc_trace_enable_tpoint_group, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(trace_enable_tpoint_group, enable_tpoint_group) + +static void +rpc_trace_disable_tpoint_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_tpoint_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_tpoint_group_decoders, + SPDK_COUNTOF(rpc_tpoint_group_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_TRACE, "spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.name == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_TRACE, "flag was NULL\n"); + goto invalid; + } + + if (spdk_trace_disable_tpoint_group(req.name)) { + goto invalid; + } + + free_rpc_tpoint_group(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_tpoint_group(&req); +} +SPDK_RPC_REGISTER("trace_disable_tpoint_group", rpc_trace_disable_tpoint_group, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(trace_disable_tpoint_group, disable_tpoint_group) + +static void +rpc_trace_get_tpoint_group_mask(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + uint64_t tpoint_group_mask; + char mask_str[7]; + bool enabled; + struct spdk_json_write_ctx *w; + struct spdk_trace_register_fn *register_fn; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "trace_get_tpoint_group_mask requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + tpoint_group_mask = spdk_trace_get_tpoint_group_mask(); + + spdk_json_write_object_begin(w); + + snprintf(mask_str, sizeof(mask_str), "0x%lx", tpoint_group_mask); + spdk_json_write_named_string(w, "tpoint_group_mask", mask_str); + + register_fn = spdk_trace_get_first_register_fn(); + while (register_fn) { + enabled = spdk_trace_get_tpoint_mask(register_fn->tgroup_id) != 0; + + spdk_json_write_named_object_begin(w, register_fn->name); + spdk_json_write_named_bool(w, 
"enabled", enabled); + + snprintf(mask_str, sizeof(mask_str), "0x%lx", (1UL << register_fn->tgroup_id)); + spdk_json_write_named_string(w, "mask", mask_str); + spdk_json_write_object_end(w); + + register_fn = spdk_trace_get_next_register_fn(register_fn); + } + + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("trace_get_tpoint_group_mask", rpc_trace_get_tpoint_group_mask, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(trace_get_tpoint_group_mask, get_tpoint_group_mask) diff --git a/src/spdk/lib/ut_mock/Makefile b/src/spdk/lib/ut_mock/Makefile new file mode 100644 index 000000000..f4087807f --- /dev/null +++ b/src/spdk/lib/ut_mock/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = mock.c +LIBNAME = ut_mock + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/ut_mock/mock.c b/src/spdk/lib/ut_mock/mock.c new file mode 100644 index 000000000..cfe51c1d5 --- /dev/null +++ b/src/spdk/lib/ut_mock/mock.c @@ -0,0 +1,71 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk_internal/mock.h" + +DEFINE_WRAPPER(calloc, void *, (size_t nmemb, size_t size), (nmemb, size)) + +DEFINE_WRAPPER(pthread_mutex_init, int, + (pthread_mutex_t *mtx, const pthread_mutexattr_t *attr), + (mtx, attr)) + +DEFINE_WRAPPER(pthread_mutexattr_init, int, + (pthread_mutexattr_t *attr), (attr)) + +DEFINE_WRAPPER(recvmsg, ssize_t, (int sockfd, struct msghdr *msg, int flags), (sockfd, msg, flags)) + +DEFINE_WRAPPER(sendmsg, ssize_t, (int sockfd, const struct msghdr *msg, int flags), (sockfd, msg, + flags)) + +DEFINE_WRAPPER(writev, ssize_t, (int fd, const struct iovec *iov, int iovcnt), (fd, iov, iovcnt)) + +char *g_unlink_path; +void (*g_unlink_callback)(void); + +int +__attribute__((used)) +__wrap_unlink(const char *path) +{ + if (g_unlink_path == NULL) { + return ENOENT; + } + + if (strcmp(g_unlink_path, path) != 0) { + return ENOENT; + } + + if (g_unlink_callback) { + g_unlink_callback(); + } + return 0; +} diff --git a/src/spdk/lib/util/Makefile b/src/spdk/lib/util/Makefile new file mode 100644 index 000000000..23f8db6d0 --- /dev/null +++ b/src/spdk/lib/util/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = base64.c bit_array.c cpuset.c crc16.c crc32.c crc32c.c crc32_ieee.c \ + dif.c fd.c file.c iov.c math.c pipe.c strerror_tls.c string.c uuid.c +LIBNAME = util +LOCAL_SYS_LIBS = -luuid + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_util.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/util/base64.c b/src/spdk/lib/util/base64.c new file mode 100644 index 000000000..adc5e15da --- /dev/null +++ b/src/spdk/lib/util/base64.c @@ -0,0 +1,262 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/endian.h" +#include "spdk/base64.h" + +#ifdef __aarch64__ +#include "base64_neon.c" +#endif + +#define BASE64_ENC_BITMASK 0x3FUL +#define BASE64_PADDING_CHAR '=' + +static const char base64_enc_table[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +static const char base64_urfsafe_enc_table[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789-_"; + +static const uint8_t +base64_dec_table[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255, + 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +}; + +static const uint8_t +base64_urlsafe_dec_table[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 63, + 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +}; + +static int +base64_encode(char *dst, const char *enc_table, const void *src, size_t src_len) +{ + uint32_t raw_u32; + + if (!dst || !src || src_len <= 0) { + return -EINVAL; + } + +#ifdef __aarch64__ + base64_encode_neon64(&dst, enc_table, &src, &src_len); +#endif + + while (src_len >= 4) { + raw_u32 = from_be32(src); + + *dst++ = enc_table[(raw_u32 >> 26) & BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 20) & BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 14) & 
BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 8) & BASE64_ENC_BITMASK]; + + src_len -= 3; + src += 3; + } + + if (src_len == 0) { + goto out; + } + + raw_u32 = 0; + memcpy(&raw_u32, src, src_len); + raw_u32 = from_be32(&raw_u32); + + *dst++ = enc_table[(raw_u32 >> 26) & BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 20) & BASE64_ENC_BITMASK]; + *dst++ = (src_len >= 2) ? enc_table[(raw_u32 >> 14) & BASE64_ENC_BITMASK] : BASE64_PADDING_CHAR; + *dst++ = (src_len == 3) ? enc_table[(raw_u32 >> 8) & BASE64_ENC_BITMASK] : BASE64_PADDING_CHAR; + +out: + *dst = '\0'; + + return 0; +} + +int +spdk_base64_encode(char *dst, const void *src, size_t src_len) +{ + return base64_encode(dst, base64_enc_table, src, src_len); +} + +int +spdk_base64_urlsafe_encode(char *dst, const void *src, size_t src_len) +{ + return base64_encode(dst, base64_urfsafe_enc_table, src, src_len); +} + +#ifdef __aarch64__ +static int +base64_decode(void *dst, size_t *_dst_len, const uint8_t *dec_table, + const uint8_t *dec_table_opt, const char *src) +#else +static int +base64_decode(void *dst, size_t *_dst_len, const uint8_t *dec_table, const char *src) +#endif +{ + size_t src_strlen; + size_t tail_len = 0; + const uint8_t *src_in; + uint32_t tmp[4]; + int i; + + if (!src) { + return -EINVAL; + } + + src_strlen = strlen(src); + + /* strlen of src should be 4n */ + if (src_strlen == 0 || src_strlen % 4 != 0) { + return -EINVAL; + } + + /* Consider Base64 padding, it at most has 2 padding characters. */ + for (i = 0; i < 2; i++) { + if (src[src_strlen - 1] != BASE64_PADDING_CHAR) { + break; + } + src_strlen--; + } + + /* strlen of src without padding shouldn't be 4n+1 */ + if (src_strlen == 0 || src_strlen % 4 == 1) { + return -EINVAL; + } + + if (_dst_len) { + *_dst_len = spdk_base64_get_decoded_len(src_strlen); + } + + /* If dst is NULL, the client is only concerned w/ _dst_len, return */ + if (!dst) { + return 0; + } + + src_in = (const uint8_t *) src; + +#ifdef __aarch64__ + base64_decode_neon64(&dst, dec_table_opt, &src_in, &src_strlen); + + if (src_strlen == 0) { + return 0; + } +#endif + + /* space of dst can be used by to_be32 */ + while (src_strlen > 4) { + tmp[0] = dec_table[*src_in++]; + tmp[1] = dec_table[*src_in++]; + tmp[2] = dec_table[*src_in++]; + tmp[3] = dec_table[*src_in++]; + + if (tmp[0] == 255 || tmp[1] == 255 || tmp[2] == 255 || tmp[3] == 255) { + return -EINVAL; + } + + to_be32(dst, tmp[3] << 8 | tmp[2] << 14 | tmp[1] << 20 | tmp[0] << 26); + + dst += 3; + src_strlen -= 4; + } + + /* space of dst is not enough to be used by to_be32 */ + tmp[0] = dec_table[src_in[0]]; + tmp[1] = dec_table[src_in[1]]; + tmp[2] = (src_strlen >= 3) ? dec_table[src_in[2]] : 0; + tmp[3] = (src_strlen == 4) ? 
dec_table[src_in[3]] : 0; + tail_len = src_strlen - 1; + + if (tmp[0] == 255 || tmp[1] == 255 || tmp[2] == 255 || tmp[3] == 255) { + return -EINVAL; + } + + to_be32(&tmp[3], tmp[3] << 8 | tmp[2] << 14 | tmp[1] << 20 | tmp[0] << 26); + memcpy(dst, (uint8_t *)&tmp[3], tail_len); + + return 0; +} + +int +spdk_base64_decode(void *dst, size_t *dst_len, const char *src) +{ +#ifdef __aarch64__ + return base64_decode(dst, dst_len, base64_dec_table, base64_dec_table_neon64, src); +#else + return base64_decode(dst, dst_len, base64_dec_table, src); +#endif +} + +int +spdk_base64_urlsafe_decode(void *dst, size_t *dst_len, const char *src) +{ +#ifdef __aarch64__ + return base64_decode(dst, dst_len, base64_urlsafe_dec_table, base64_urlsafe_dec_table_neon64, + src); +#else + return base64_decode(dst, dst_len, base64_urlsafe_dec_table, src); +#endif +} diff --git a/src/spdk/lib/util/base64_neon.c b/src/spdk/lib/util/base64_neon.c new file mode 100644 index 000000000..971cff06c --- /dev/null +++ b/src/spdk/lib/util/base64_neon.c @@ -0,0 +1,225 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2005-2007, Nick Galbreath + * Copyright (c) 2013-2017, Alfred Klomp + * Copyright (c) 2015-2017, Wojciech Mula + * Copyright (c) 2016-2017, Matthieu Darbois + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __aarch64__ +#error Unsupported hardware +#endif + +#include "spdk/stdinc.h" +/* + * Encoding + * Use a 64-byte lookup to do the encoding. + * Reuse existing base64_dec_table and base64_dec_table. + + * Decoding + * The input consists of five valid character sets in the Base64 alphabet, + * which we need to map back to the 6-bit values they represent. + * There are three ranges, two singles, and then there's the rest. 
+ * + * LUT1[0-63] = base64_dec_table_neon64[0-63] + * LUT2[0-63] = base64_dec_table_neon64[64-127] + * # From To LUT Characters + * 1 [0..42] [255] #1 invalid input + * 2 [43] [62] #1 + + * 3 [44..46] [255] #1 invalid input + * 4 [47] [63] #1 / + * 5 [48..57] [52..61] #1 0..9 + * 6 [58..63] [255] #1 invalid input + * 7 [64] [255] #2 invalid input + * 8 [65..90] [0..25] #2 A..Z + * 9 [91..96] [255] #2 invalid input + * 10 [97..122] [26..51] #2 a..z + * 11 [123..126] [255] #2 invalid input + * (12) Everything else => invalid input + */ +static const uint8_t base64_dec_table_neon64[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, + 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, + 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255 +}; + +/* + * LUT1[0-63] = base64_urlsafe_dec_table_neon64[0-63] + * LUT2[0-63] = base64_urlsafe_dec_table_neon64[64-127] + * # From To LUT Characters + * 1 [0..44] [255] #1 invalid input + * 2 [45] [62] #1 - + * 3 [46..47] [255] #1 invalid input + * 5 [48..57] [52..61] #1 0..9 + * 6 [58..63] [255] #1 invalid input + * 7 [64] [255] #2 invalid input + * 8 [65..90] [0..25] #2 A..Z + * 9 [91..94] [255] #2 invalid input + * 10 [95] [63] #2 _ + * 11 [96] [255] #2 invalid input + * 12 [97..122] [26..51] #2 a..z + * 13 [123..126] [255] #2 invalid input + * (14) Everything else => invalid input + */ +static const uint8_t base64_urlsafe_dec_table_neon64[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, + 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, + 63, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255 +}; + +#include <arm_neon.h> +#define CMPGT(s,n) vcgtq_u8((s), vdupq_n_u8(n)) + +static inline uint8x16x4_t +load_64byte_table(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} + +static void +base64_encode_neon64(char **dst, const char *enc_table, const void **src, size_t *src_len) +{ + const uint8x16x4_t tbl_enc = load_64byte_table(enc_table); + + while (*src_len >= 48) { + uint8x16x3_t str; + uint8x16x4_t res; + + /* Load 48 bytes and deinterleave */ + str = vld3q_u8((uint8_t *)*src); + + /* Divide bits of three input bytes over four output bytes and clear top two bits */ + res.val[0] = vshrq_n_u8(str.val[0], 2); + res.val[1] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[1], 4), vshlq_n_u8(str.val[0], 4)), + vdupq_n_u8(0x3F)); + res.val[2] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[2], 6), vshlq_n_u8(str.val[1], 2)), + vdupq_n_u8(0x3F)); + res.val[3] = vandq_u8(str.val[2], vdupq_n_u8(0x3F)); + + /* + * The bits have now been shifted to the right locations; + * translate their values 0..63 to the 
Base64 alphabet. + * Use a 64-byte table lookup: + */ + res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]); + res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]); + res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]); + res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]); + + /* Interleave and store result */ + vst4q_u8((uint8_t *)*dst, res); + + *src += 48; /* 3 * 16 bytes of input */ + *dst += 64; /* 4 * 16 bytes of output */ + *src_len -= 48; + } +} + +static void +base64_decode_neon64(void **dst, const uint8_t *dec_table_neon64, const uint8_t **src, + size_t *src_len) +{ + /* + * First LUT tbl_dec1 will use VTBL instruction (out of range indices are set to 0 in destination). + * Second LUT tbl_dec2 will use VTBX instruction (out of range indices will be unchanged in destination). + * Input [64..126] will be mapped to index [1..63] in tb1_dec2. Index 0 means that value comes from tb1_dec1. + */ + const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_table_neon64); + const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_table_neon64 + 64); + const uint8x16_t offset = vdupq_n_u8(63U); + + while (*src_len >= 64) { + + uint8x16x4_t dec1, dec2; + uint8x16x3_t dec; + + /* Load 64 bytes and deinterleave */ + uint8x16x4_t str = vld4q_u8((uint8_t *)*src); + + /* Get indices for 2nd LUT */ + dec2.val[0] = vqsubq_u8(str.val[0], offset); + dec2.val[1] = vqsubq_u8(str.val[1], offset); + dec2.val[2] = vqsubq_u8(str.val[2], offset); + dec2.val[3] = vqsubq_u8(str.val[3], offset); + + /* Get values from 1st LUT */ + dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]); + dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]); + dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]); + dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]); + + /* Get values from 2nd LUT */ + dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]); + dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]); + dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]); + dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]); + + /* Get final values */ + str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]); + str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]); + str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]); + str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]); + + /* Check for invalid input, any value larger than 63 */ + uint8x16_t classified = CMPGT(str.val[0], 63); + classified = vorrq_u8(classified, CMPGT(str.val[1], 63)); + classified = vorrq_u8(classified, CMPGT(str.val[2], 63)); + classified = vorrq_u8(classified, CMPGT(str.val[3], 63)); + + /* check that all bits are zero */ + if (vmaxvq_u8(classified) != 0U) { + break; + } + + /* Compress four bytes into three */ + dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4)); + dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2)); + dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]); + + /* Interleave and store decoded result */ + vst3q_u8((uint8_t *)*dst, dec); + + *src += 64; + *dst += 48; + *src_len -= 64; + } +} diff --git a/src/spdk/lib/util/bit_array.c b/src/spdk/lib/util/bit_array.c new file mode 100644 index 000000000..43c1a4d9b --- /dev/null +++ b/src/spdk/lib/util/bit_array.c @@ -0,0 +1,363 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bit_array.h" +#include "spdk/env.h" + +#include "spdk/likely.h" +#include "spdk/util.h" + +typedef uint64_t spdk_bit_array_word; +#define SPDK_BIT_ARRAY_WORD_TZCNT(x) (__builtin_ctzll(x)) +#define SPDK_BIT_ARRAY_WORD_POPCNT(x) (__builtin_popcountll(x)) +#define SPDK_BIT_ARRAY_WORD_C(x) ((spdk_bit_array_word)(x)) +#define SPDK_BIT_ARRAY_WORD_BYTES sizeof(spdk_bit_array_word) +#define SPDK_BIT_ARRAY_WORD_BITS (SPDK_BIT_ARRAY_WORD_BYTES * 8) +#define SPDK_BIT_ARRAY_WORD_INDEX_SHIFT spdk_u32log2(SPDK_BIT_ARRAY_WORD_BITS) +#define SPDK_BIT_ARRAY_WORD_INDEX_MASK ((1u << SPDK_BIT_ARRAY_WORD_INDEX_SHIFT) - 1) + +struct spdk_bit_array { + uint32_t bit_count; + spdk_bit_array_word words[]; +}; + +struct spdk_bit_array * +spdk_bit_array_create(uint32_t num_bits) +{ + struct spdk_bit_array *ba = NULL; + + spdk_bit_array_resize(&ba, num_bits); + + return ba; +} + +void +spdk_bit_array_free(struct spdk_bit_array **bap) +{ + struct spdk_bit_array *ba; + + if (!bap) { + return; + } + + ba = *bap; + *bap = NULL; + spdk_free(ba); +} + +static inline uint32_t +bit_array_word_count(uint32_t num_bits) +{ + return (num_bits + SPDK_BIT_ARRAY_WORD_BITS - 1) >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT; +} + +static inline spdk_bit_array_word +bit_array_word_mask(uint32_t num_bits) +{ + assert(num_bits < SPDK_BIT_ARRAY_WORD_BITS); + return (SPDK_BIT_ARRAY_WORD_C(1) << num_bits) - 1; +} + +int +spdk_bit_array_resize(struct spdk_bit_array **bap, uint32_t num_bits) +{ + struct spdk_bit_array *new_ba; + uint32_t old_word_count, new_word_count; + size_t new_size; + + /* + * Max number of bits allowed is UINT32_MAX - 1, because we use UINT32_MAX to denote + * when a set or cleared bit cannot be found. 
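+ * For example, spdk_bit_array_find_first_set() below returns UINT32_MAX when
+ * no set bit is found, so a valid bit index can never be confused with it.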
+ */ + if (!bap || num_bits == UINT32_MAX) { + return -EINVAL; + } + + new_word_count = bit_array_word_count(num_bits); + new_size = offsetof(struct spdk_bit_array, words) + new_word_count * SPDK_BIT_ARRAY_WORD_BYTES; + + /* + * Always keep one extra word with a 0 and a 1 past the actual required size so that the + * find_first functions can just keep going until they match. + */ + new_size += SPDK_BIT_ARRAY_WORD_BYTES; + + new_ba = (struct spdk_bit_array *)spdk_realloc(*bap, new_size, 64); + if (!new_ba) { + return -ENOMEM; + } + + /* + * Set up special extra word (see above comment about find_first_clear). + * + * This is set to 0b10 so that find_first_clear will find a 0 at the very first + * bit past the end of the buffer, and find_first_set will find a 1 at the next bit + * past that. + */ + new_ba->words[new_word_count] = 0x2; + + if (*bap == NULL) { + old_word_count = 0; + new_ba->bit_count = 0; + } else { + old_word_count = bit_array_word_count(new_ba->bit_count); + } + + if (new_word_count > old_word_count) { + /* Zero out new entries */ + memset(&new_ba->words[old_word_count], 0, + (new_word_count - old_word_count) * SPDK_BIT_ARRAY_WORD_BYTES); + } else if (new_word_count == old_word_count && num_bits < new_ba->bit_count) { + /* Make sure any existing partial last word is cleared beyond the new num_bits. */ + uint32_t last_word_bits; + spdk_bit_array_word mask; + + last_word_bits = num_bits & SPDK_BIT_ARRAY_WORD_INDEX_MASK; + mask = bit_array_word_mask(last_word_bits); + new_ba->words[old_word_count - 1] &= mask; + } + + new_ba->bit_count = num_bits; + *bap = new_ba; + return 0; +} + +uint32_t +spdk_bit_array_capacity(const struct spdk_bit_array *ba) +{ + return ba->bit_count; +} + +static inline int +bit_array_get_word(const struct spdk_bit_array *ba, uint32_t bit_index, + uint32_t *word_index, uint32_t *word_bit_index) +{ + if (spdk_unlikely(bit_index >= ba->bit_count)) { + return -EINVAL; + } + + *word_index = bit_index >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT; + *word_bit_index = bit_index & SPDK_BIT_ARRAY_WORD_INDEX_MASK; + + return 0; +} + +bool +spdk_bit_array_get(const struct spdk_bit_array *ba, uint32_t bit_index) +{ + uint32_t word_index, word_bit_index; + + if (bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) { + return false; + } + + return (ba->words[word_index] >> word_bit_index) & 1U; +} + +int +spdk_bit_array_set(struct spdk_bit_array *ba, uint32_t bit_index) +{ + uint32_t word_index, word_bit_index; + + if (bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) { + return -EINVAL; + } + + ba->words[word_index] |= (SPDK_BIT_ARRAY_WORD_C(1) << word_bit_index); + return 0; +} + +void +spdk_bit_array_clear(struct spdk_bit_array *ba, uint32_t bit_index) +{ + uint32_t word_index, word_bit_index; + + if (bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) { + /* + * Clearing past the end of the bit array is a no-op, since bit past the end + * are implicitly 0. 
+ */ + return; + } + + ba->words[word_index] &= ~(SPDK_BIT_ARRAY_WORD_C(1) << word_bit_index); +} + +static inline uint32_t +bit_array_find_first(const struct spdk_bit_array *ba, uint32_t start_bit_index, + spdk_bit_array_word xor_mask) +{ + uint32_t word_index, first_word_bit_index; + spdk_bit_array_word word, first_word_mask; + const spdk_bit_array_word *words, *cur_word; + + if (spdk_unlikely(start_bit_index >= ba->bit_count)) { + return ba->bit_count; + } + + word_index = start_bit_index >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT; + words = ba->words; + cur_word = &words[word_index]; + + /* + * Special case for first word: skip start_bit_index % SPDK_BIT_ARRAY_WORD_BITS bits + * within the first word. + */ + first_word_bit_index = start_bit_index & SPDK_BIT_ARRAY_WORD_INDEX_MASK; + first_word_mask = bit_array_word_mask(first_word_bit_index); + + word = (*cur_word ^ xor_mask) & ~first_word_mask; + + /* + * spdk_bit_array_resize() guarantees that an extra word with a 1 and a 0 will always be + * at the end of the words[] array, so just keep going until a word matches. + */ + while (word == 0) { + word = *++cur_word ^ xor_mask; + } + + return ((uintptr_t)cur_word - (uintptr_t)words) * 8 + SPDK_BIT_ARRAY_WORD_TZCNT(word); +} + + +uint32_t +spdk_bit_array_find_first_set(const struct spdk_bit_array *ba, uint32_t start_bit_index) +{ + uint32_t bit_index; + + bit_index = bit_array_find_first(ba, start_bit_index, 0); + + /* + * If we ran off the end of the array and found the 1 bit in the extra word, + * return UINT32_MAX to indicate no actual 1 bits were found. + */ + if (bit_index >= ba->bit_count) { + bit_index = UINT32_MAX; + } + + return bit_index; +} + +uint32_t +spdk_bit_array_find_first_clear(const struct spdk_bit_array *ba, uint32_t start_bit_index) +{ + uint32_t bit_index; + + bit_index = bit_array_find_first(ba, start_bit_index, SPDK_BIT_ARRAY_WORD_C(-1)); + + /* + * If we ran off the end of the array and found the 0 bit in the extra word, + * return UINT32_MAX to indicate no actual 0 bits were found. + */ + if (bit_index >= ba->bit_count) { + bit_index = UINT32_MAX; + } + + return bit_index; +} + +uint32_t +spdk_bit_array_count_set(const struct spdk_bit_array *ba) +{ + const spdk_bit_array_word *cur_word = ba->words; + uint32_t word_count = bit_array_word_count(ba->bit_count); + uint32_t set_count = 0; + + while (word_count--) { + /* + * No special treatment is needed for the last (potentially partial) word, since + * spdk_bit_array_resize() makes sure the bits past bit_count are cleared. 
+ */ + set_count += SPDK_BIT_ARRAY_WORD_POPCNT(*cur_word++); + } + + return set_count; +} + +uint32_t +spdk_bit_array_count_clear(const struct spdk_bit_array *ba) +{ + return ba->bit_count - spdk_bit_array_count_set(ba); +} + +void +spdk_bit_array_store_mask(const struct spdk_bit_array *ba, void *mask) +{ + uint32_t size, i; + uint32_t num_bits = spdk_bit_array_capacity(ba); + + size = num_bits / CHAR_BIT; + memcpy(mask, ba->words, size); + + for (i = 0; i < num_bits % CHAR_BIT; i++) { + if (spdk_bit_array_get(ba, i + size * CHAR_BIT)) { + ((uint8_t *)mask)[size] |= (1U << i); + } else { + ((uint8_t *)mask)[size] &= ~(1U << i); + } + } +} + +void +spdk_bit_array_load_mask(struct spdk_bit_array *ba, const void *mask) +{ + uint32_t size, i; + uint32_t num_bits = spdk_bit_array_capacity(ba); + + size = num_bits / CHAR_BIT; + memcpy(ba->words, mask, size); + + for (i = 0; i < num_bits % CHAR_BIT; i++) { + if (((uint8_t *)mask)[size] & (1U << i)) { + spdk_bit_array_set(ba, i + size * CHAR_BIT); + } else { + spdk_bit_array_clear(ba, i + size * CHAR_BIT); + } + } +} + +void +spdk_bit_array_clear_mask(struct spdk_bit_array *ba) +{ + uint32_t size, i; + uint32_t num_bits = spdk_bit_array_capacity(ba); + + size = num_bits / CHAR_BIT; + memset(ba->words, 0, size); + + for (i = 0; i < num_bits % CHAR_BIT; i++) { + spdk_bit_array_clear(ba, i + size * CHAR_BIT); + } +} diff --git a/src/spdk/lib/util/cpuset.c b/src/spdk/lib/util/cpuset.c new file mode 100644 index 000000000..8d7c8dc89 --- /dev/null +++ b/src/spdk/lib/util/cpuset.c @@ -0,0 +1,336 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/cpuset.h" +#include "spdk/log.h" + +struct spdk_cpuset * +spdk_cpuset_alloc(void) +{ + return (struct spdk_cpuset *)calloc(sizeof(struct spdk_cpuset), 1); +} + +void +spdk_cpuset_free(struct spdk_cpuset *set) +{ + free(set); +} + +bool +spdk_cpuset_equal(const struct spdk_cpuset *set1, const struct spdk_cpuset *set2) +{ + assert(set1 != NULL); + assert(set2 != NULL); + return memcmp(set1->cpus, set2->cpus, sizeof(set2->cpus)) == 0; +} + +void +spdk_cpuset_copy(struct spdk_cpuset *dst, const struct spdk_cpuset *src) +{ + assert(dst != NULL); + assert(src != NULL); + memcpy(&dst->cpus, &src->cpus, sizeof(src->cpus)); +} + +void +spdk_cpuset_negate(struct spdk_cpuset *set) +{ + unsigned int i; + assert(set != NULL); + for (i = 0; i < sizeof(set->cpus); i++) { + set->cpus[i] = ~set->cpus[i]; + } +} + +void +spdk_cpuset_and(struct spdk_cpuset *dst, const struct spdk_cpuset *src) +{ + unsigned int i; + assert(dst != NULL); + assert(src != NULL); + for (i = 0; i < sizeof(src->cpus); i++) { + dst->cpus[i] &= src->cpus[i]; + } +} + +void +spdk_cpuset_or(struct spdk_cpuset *dst, const struct spdk_cpuset *src) +{ + unsigned int i; + assert(dst != NULL); + assert(src != NULL); + for (i = 0; i < sizeof(src->cpus); i++) { + dst->cpus[i] |= src->cpus[i]; + } +} + +void +spdk_cpuset_xor(struct spdk_cpuset *dst, const struct spdk_cpuset *src) +{ + unsigned int i; + assert(dst != NULL); + assert(src != NULL); + for (i = 0; i < sizeof(src->cpus); i++) { + dst->cpus[i] ^= src->cpus[i]; + } +} + +void +spdk_cpuset_zero(struct spdk_cpuset *set) +{ + assert(set != NULL); + memset(set->cpus, 0, sizeof(set->cpus)); +} + +void +spdk_cpuset_set_cpu(struct spdk_cpuset *set, uint32_t cpu, bool state) +{ + assert(set != NULL); + assert(cpu < sizeof(set->cpus) * 8); + if (state) { + set->cpus[cpu / 8] |= (1U << (cpu % 8)); + } else { + set->cpus[cpu / 8] &= ~(1U << (cpu % 8)); + } +} + +bool +spdk_cpuset_get_cpu(const struct spdk_cpuset *set, uint32_t cpu) +{ + assert(set != NULL); + assert(cpu < sizeof(set->cpus) * 8); + return (set->cpus[cpu / 8] >> (cpu % 8)) & 1U; +} + +uint32_t +spdk_cpuset_count(const struct spdk_cpuset *set) +{ + uint32_t count = 0; + uint8_t n; + unsigned int i; + for (i = 0; i < sizeof(set->cpus); i++) { + n = set->cpus[i]; + while (n) { + n &= (n - 1); + count++; + } + } + return count; +} + +const char * +spdk_cpuset_fmt(struct spdk_cpuset *set) +{ + uint32_t lcore, lcore_max = 0; + int val, i, n; + char *ptr; + static const char *hex = "0123456789abcdef"; + + assert(set != NULL); + + for (lcore = 0; lcore < sizeof(set->cpus) * 8; lcore++) { + if (spdk_cpuset_get_cpu(set, lcore)) { + lcore_max = lcore; + } + } + + ptr = set->str; + n = lcore_max / 8; + val = set->cpus[n]; + + /* Store first number only if it is not leading zero */ + if ((val & 0xf0) != 0) { + *(ptr++) = hex[(val & 0xf0) >> 4]; + } + *(ptr++) = hex[val & 0x0f]; + + for (i = n - 1; i >= 0; i--) { + val = set->cpus[i]; + *(ptr++) = hex[(val & 0xf0) >> 4]; + *(ptr++) = hex[val & 0x0f]; + } + *ptr = '\0'; + + return set->str; +} + +static int +hex_value(uint8_t c) +{ +#define V(x, y) [x] = y + 1 + static const int8_t val[256] = { + V('0', 0), V('1', 1), V('2', 2), V('3', 3), V('4', 4), + V('5', 5), V('6', 6), V('7', 7), V('8', 8), V('9', 9), + V('A', 0xA), V('B', 0xB), V('C', 0xC), V('D', 0xD), V('E', 0xE), V('F', 0xF), + V('a', 0xA), V('b', 0xB), V('c', 0xC), V('d', 0xD), V('e', 0xE), V('f', 0xF), + }; +#undef V + + return val[c] - 1; +} + +static int +parse_list(const char *mask, struct spdk_cpuset *set) 
+{ + char *end; + const char *ptr = mask; + uint32_t lcore; + uint32_t lcore_min, lcore_max; + + spdk_cpuset_zero(set); + lcore_min = UINT32_MAX; + + ptr++; + end = (char *)ptr; + do { + while (isblank(*ptr)) { + ptr++; + } + if (*ptr == '\0' || *ptr == ']' || *ptr == '-' || *ptr == ',') { + goto invalid_character; + } + + errno = 0; + lcore = strtoul(ptr, &end, 10); + if (errno) { + SPDK_ERRLOG("Conversion of core mask in '%s' failed\n", mask); + return -1; + } + + if (lcore >= sizeof(set->cpus) * 8) { + SPDK_ERRLOG("Core number %" PRIu32 " is out of range in '%s'\n", lcore, mask); + return -1; + } + + while (isblank(*end)) { + end++; + } + + if (*end == '-') { + lcore_min = lcore; + } else if (*end == ',' || *end == ']') { + lcore_max = lcore; + if (lcore_min == UINT32_MAX) { + lcore_min = lcore; + } + if (lcore_min > lcore_max) { + SPDK_ERRLOG("Invalid range of CPUs (%" PRIu32 " > %" PRIu32 ")\n", + lcore_min, lcore_max); + return -1; + } + for (lcore = lcore_min; lcore <= lcore_max; lcore++) { + spdk_cpuset_set_cpu(set, lcore, true); + } + lcore_min = UINT32_MAX; + } else { + goto invalid_character; + } + + ptr = end + 1; + + } while (*end != ']'); + + return 0; + +invalid_character: + if (*end == '\0') { + SPDK_ERRLOG("Unexpected end of core list '%s'\n", mask); + } else { + SPDK_ERRLOG("Parsing of core list '%s' failed on character '%c'\n", mask, *end); + } + return -1; +} + +static int +parse_mask(const char *mask, struct spdk_cpuset *set, size_t len) +{ + int i, j; + char c; + int val; + uint32_t lcore = 0; + + if (mask[0] == '0' && (mask[1] == 'x' || mask[1] == 'X')) { + mask += 2; + len -= 2; + } + + spdk_cpuset_zero(set); + for (i = len - 1; i >= 0; i--) { + c = mask[i]; + val = hex_value(c); + if (val < 0) { + /* Invalid character */ + SPDK_ERRLOG("Invalid character in core mask '%s' (%c)\n", mask, c); + return -1; + } + for (j = 0; j < 4 && lcore < sizeof(set->cpus); j++, lcore++) { + if ((1 << j) & val) { + spdk_cpuset_set_cpu(set, lcore, true); + } + } + } + + return 0; +} + +int +spdk_cpuset_parse(struct spdk_cpuset *set, const char *mask) +{ + int ret; + size_t len; + + if (mask == NULL || set == NULL) { + return -1; + } + + while (isblank(*mask)) { + mask++; + } + + len = strlen(mask); + while (len > 0 && isblank(mask[len - 1])) { + len--; + } + + if (len == 0) { + return -1; + } + + if (mask[0] == '[') { + ret = parse_list(mask, set); + } else { + ret = parse_mask(mask, set, len); + } + + return ret; +} diff --git a/src/spdk/lib/util/crc16.c b/src/spdk/lib/util/crc16.c new file mode 100644 index 000000000..2ba168c4b --- /dev/null +++ b/src/spdk/lib/util/crc16.c @@ -0,0 +1,668 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/crc16.h" +#include "spdk/config.h" + +/* + * Use Intelligent Storage Acceleration Library for line speed CRC + */ + +#ifdef SPDK_CONFIG_ISAL +#include "isa-l/include/crc.h" + +uint16_t +spdk_crc16_t10dif(uint16_t init_crc, const void *buf, size_t len) +{ + return (crc16_t10dif(init_crc, buf, len)); +} + +uint16_t +spdk_crc16_t10dif_copy(uint16_t init_crc, uint8_t *dst, uint8_t *src, + size_t len) +{ + return (crc16_t10dif_copy(init_crc, dst, src, len)); +} + +#else +/* + * Use table-driven (somewhat faster) CRC + */ + +/* + * Static tables used for the table_driven implementation. + */ + +static const uint16_t crc_table_fast[16][256] = { + { + 0x0000u, 0x8BB7u, 0x9CD9u, 0x176Eu, 0xB205u, 0x39B2u, 0x2EDCu, 0xA56Bu, + 0xEFBDu, 0x640Au, 0x7364u, 0xF8D3u, 0x5DB8u, 0xD60Fu, 0xC161u, 0x4AD6u, + 0x54CDu, 0xDF7Au, 0xC814u, 0x43A3u, 0xE6C8u, 0x6D7Fu, 0x7A11u, 0xF1A6u, + 0xBB70u, 0x30C7u, 0x27A9u, 0xAC1Eu, 0x0975u, 0x82C2u, 0x95ACu, 0x1E1Bu, + 0xA99Au, 0x222Du, 0x3543u, 0xBEF4u, 0x1B9Fu, 0x9028u, 0x8746u, 0x0CF1u, + 0x4627u, 0xCD90u, 0xDAFEu, 0x5149u, 0xF422u, 0x7F95u, 0x68FBu, 0xE34Cu, + 0xFD57u, 0x76E0u, 0x618Eu, 0xEA39u, 0x4F52u, 0xC4E5u, 0xD38Bu, 0x583Cu, + 0x12EAu, 0x995Du, 0x8E33u, 0x0584u, 0xA0EFu, 0x2B58u, 0x3C36u, 0xB781u, + 0xD883u, 0x5334u, 0x445Au, 0xCFEDu, 0x6A86u, 0xE131u, 0xF65Fu, 0x7DE8u, + 0x373Eu, 0xBC89u, 0xABE7u, 0x2050u, 0x853Bu, 0x0E8Cu, 0x19E2u, 0x9255u, + 0x8C4Eu, 0x07F9u, 0x1097u, 0x9B20u, 0x3E4Bu, 0xB5FCu, 0xA292u, 0x2925u, + 0x63F3u, 0xE844u, 0xFF2Au, 0x749Du, 0xD1F6u, 0x5A41u, 0x4D2Fu, 0xC698u, + 0x7119u, 0xFAAEu, 0xEDC0u, 0x6677u, 0xC31Cu, 0x48ABu, 0x5FC5u, 0xD472u, + 0x9EA4u, 0x1513u, 0x027Du, 0x89CAu, 0x2CA1u, 0xA716u, 0xB078u, 0x3BCFu, + 0x25D4u, 0xAE63u, 0xB90Du, 0x32BAu, 0x97D1u, 0x1C66u, 0x0B08u, 0x80BFu, + 0xCA69u, 0x41DEu, 0x56B0u, 0xDD07u, 0x786Cu, 0xF3DBu, 0xE4B5u, 0x6F02u, + 0x3AB1u, 0xB106u, 0xA668u, 0x2DDFu, 0x88B4u, 0x0303u, 0x146Du, 0x9FDAu, + 0xD50Cu, 0x5EBBu, 0x49D5u, 0xC262u, 0x6709u, 0xECBEu, 0xFBD0u, 0x7067u, + 0x6E7Cu, 0xE5CBu, 0xF2A5u, 0x7912u, 0xDC79u, 0x57CEu, 0x40A0u, 0xCB17u, + 0x81C1u, 0x0A76u, 0x1D18u, 0x96AFu, 0x33C4u, 0xB873u, 0xAF1Du, 0x24AAu, + 0x932Bu, 0x189Cu, 0x0FF2u, 0x8445u, 0x212Eu, 0xAA99u, 0xBDF7u, 0x3640u, + 0x7C96u, 0xF721u, 0xE04Fu, 0x6BF8u, 0xCE93u, 0x4524u, 0x524Au, 0xD9FDu, + 0xC7E6u, 0x4C51u, 0x5B3Fu, 0xD088u, 0x75E3u, 0xFE54u, 0xE93Au, 0x628Du, + 0x285Bu, 0xA3ECu, 0xB482u, 0x3F35u, 0x9A5Eu, 0x11E9u, 0x0687u, 0x8D30u, + 0xE232u, 0x6985u, 0x7EEBu, 0xF55Cu, 0x5037u, 0xDB80u, 0xCCEEu, 0x4759u, + 0x0D8Fu, 0x8638u, 0x9156u, 0x1AE1u, 0xBF8Au, 0x343Du, 0x2353u, 0xA8E4u, + 0xB6FFu, 0x3D48u, 0x2A26u, 0xA191u, 0x04FAu, 0x8F4Du, 0x9823u, 0x1394u, + 0x5942u, 0xD2F5u, 0xC59Bu, 0x4E2Cu, 0xEB47u, 0x60F0u, 0x779Eu, 0xFC29u, + 
0x4BA8u, 0xC01Fu, 0xD771u, 0x5CC6u, 0xF9ADu, 0x721Au, 0x6574u, 0xEEC3u, + 0xA415u, 0x2FA2u, 0x38CCu, 0xB37Bu, 0x1610u, 0x9DA7u, 0x8AC9u, 0x017Eu, + 0x1F65u, 0x94D2u, 0x83BCu, 0x080Bu, 0xAD60u, 0x26D7u, 0x31B9u, 0xBA0Eu, + 0xF0D8u, 0x7B6Fu, 0x6C01u, 0xE7B6u, 0x42DDu, 0xC96Au, 0xDE04u, 0x55B3u + }, + { + 0x0000u, 0x7562u, 0xEAC4u, 0x9FA6u, 0x5E3Fu, 0x2B5Du, 0xB4FBu, 0xC199u, + 0xBC7Eu, 0xC91Cu, 0x56BAu, 0x23D8u, 0xE241u, 0x9723u, 0x0885u, 0x7DE7u, + 0xF34Bu, 0x8629u, 0x198Fu, 0x6CEDu, 0xAD74u, 0xD816u, 0x47B0u, 0x32D2u, + 0x4F35u, 0x3A57u, 0xA5F1u, 0xD093u, 0x110Au, 0x6468u, 0xFBCEu, 0x8EACu, + 0x6D21u, 0x1843u, 0x87E5u, 0xF287u, 0x331Eu, 0x467Cu, 0xD9DAu, 0xACB8u, + 0xD15Fu, 0xA43Du, 0x3B9Bu, 0x4EF9u, 0x8F60u, 0xFA02u, 0x65A4u, 0x10C6u, + 0x9E6Au, 0xEB08u, 0x74AEu, 0x01CCu, 0xC055u, 0xB537u, 0x2A91u, 0x5FF3u, + 0x2214u, 0x5776u, 0xC8D0u, 0xBDB2u, 0x7C2Bu, 0x0949u, 0x96EFu, 0xE38Du, + 0xDA42u, 0xAF20u, 0x3086u, 0x45E4u, 0x847Du, 0xF11Fu, 0x6EB9u, 0x1BDBu, + 0x663Cu, 0x135Eu, 0x8CF8u, 0xF99Au, 0x3803u, 0x4D61u, 0xD2C7u, 0xA7A5u, + 0x2909u, 0x5C6Bu, 0xC3CDu, 0xB6AFu, 0x7736u, 0x0254u, 0x9DF2u, 0xE890u, + 0x9577u, 0xE015u, 0x7FB3u, 0x0AD1u, 0xCB48u, 0xBE2Au, 0x218Cu, 0x54EEu, + 0xB763u, 0xC201u, 0x5DA7u, 0x28C5u, 0xE95Cu, 0x9C3Eu, 0x0398u, 0x76FAu, + 0x0B1Du, 0x7E7Fu, 0xE1D9u, 0x94BBu, 0x5522u, 0x2040u, 0xBFE6u, 0xCA84u, + 0x4428u, 0x314Au, 0xAEECu, 0xDB8Eu, 0x1A17u, 0x6F75u, 0xF0D3u, 0x85B1u, + 0xF856u, 0x8D34u, 0x1292u, 0x67F0u, 0xA669u, 0xD30Bu, 0x4CADu, 0x39CFu, + 0x3F33u, 0x4A51u, 0xD5F7u, 0xA095u, 0x610Cu, 0x146Eu, 0x8BC8u, 0xFEAAu, + 0x834Du, 0xF62Fu, 0x6989u, 0x1CEBu, 0xDD72u, 0xA810u, 0x37B6u, 0x42D4u, + 0xCC78u, 0xB91Au, 0x26BCu, 0x53DEu, 0x9247u, 0xE725u, 0x7883u, 0x0DE1u, + 0x7006u, 0x0564u, 0x9AC2u, 0xEFA0u, 0x2E39u, 0x5B5Bu, 0xC4FDu, 0xB19Fu, + 0x5212u, 0x2770u, 0xB8D6u, 0xCDB4u, 0x0C2Du, 0x794Fu, 0xE6E9u, 0x938Bu, + 0xEE6Cu, 0x9B0Eu, 0x04A8u, 0x71CAu, 0xB053u, 0xC531u, 0x5A97u, 0x2FF5u, + 0xA159u, 0xD43Bu, 0x4B9Du, 0x3EFFu, 0xFF66u, 0x8A04u, 0x15A2u, 0x60C0u, + 0x1D27u, 0x6845u, 0xF7E3u, 0x8281u, 0x4318u, 0x367Au, 0xA9DCu, 0xDCBEu, + 0xE571u, 0x9013u, 0x0FB5u, 0x7AD7u, 0xBB4Eu, 0xCE2Cu, 0x518Au, 0x24E8u, + 0x590Fu, 0x2C6Du, 0xB3CBu, 0xC6A9u, 0x0730u, 0x7252u, 0xEDF4u, 0x9896u, + 0x163Au, 0x6358u, 0xFCFEu, 0x899Cu, 0x4805u, 0x3D67u, 0xA2C1u, 0xD7A3u, + 0xAA44u, 0xDF26u, 0x4080u, 0x35E2u, 0xF47Bu, 0x8119u, 0x1EBFu, 0x6BDDu, + 0x8850u, 0xFD32u, 0x6294u, 0x17F6u, 0xD66Fu, 0xA30Du, 0x3CABu, 0x49C9u, + 0x342Eu, 0x414Cu, 0xDEEAu, 0xAB88u, 0x6A11u, 0x1F73u, 0x80D5u, 0xF5B7u, + 0x7B1Bu, 0x0E79u, 0x91DFu, 0xE4BDu, 0x2524u, 0x5046u, 0xCFE0u, 0xBA82u, + 0xC765u, 0xB207u, 0x2DA1u, 0x58C3u, 0x995Au, 0xEC38u, 0x739Eu, 0x06FCu + }, + { + 0x0000u, 0x7E66u, 0xFCCCu, 0x82AAu, 0x722Fu, 0x0C49u, 0x8EE3u, 0xF085u, + 0xE45Eu, 0x9A38u, 0x1892u, 0x66F4u, 0x9671u, 0xE817u, 0x6ABDu, 0x14DBu, + 0x430Bu, 0x3D6Du, 0xBFC7u, 0xC1A1u, 0x3124u, 0x4F42u, 0xCDE8u, 0xB38Eu, + 0xA755u, 0xD933u, 0x5B99u, 0x25FFu, 0xD57Au, 0xAB1Cu, 0x29B6u, 0x57D0u, + 0x8616u, 0xF870u, 0x7ADAu, 0x04BCu, 0xF439u, 0x8A5Fu, 0x08F5u, 0x7693u, + 0x6248u, 0x1C2Eu, 0x9E84u, 0xE0E2u, 0x1067u, 0x6E01u, 0xECABu, 0x92CDu, + 0xC51Du, 0xBB7Bu, 0x39D1u, 0x47B7u, 0xB732u, 0xC954u, 0x4BFEu, 0x3598u, + 0x2143u, 0x5F25u, 0xDD8Fu, 0xA3E9u, 0x536Cu, 0x2D0Au, 0xAFA0u, 0xD1C6u, + 0x879Bu, 0xF9FDu, 0x7B57u, 0x0531u, 0xF5B4u, 0x8BD2u, 0x0978u, 0x771Eu, + 0x63C5u, 0x1DA3u, 0x9F09u, 0xE16Fu, 0x11EAu, 0x6F8Cu, 0xED26u, 0x9340u, + 0xC490u, 0xBAF6u, 0x385Cu, 0x463Au, 0xB6BFu, 0xC8D9u, 0x4A73u, 0x3415u, + 0x20CEu, 0x5EA8u, 0xDC02u, 0xA264u, 0x52E1u, 0x2C87u, 
0xAE2Du, 0xD04Bu, + 0x018Du, 0x7FEBu, 0xFD41u, 0x8327u, 0x73A2u, 0x0DC4u, 0x8F6Eu, 0xF108u, + 0xE5D3u, 0x9BB5u, 0x191Fu, 0x6779u, 0x97FCu, 0xE99Au, 0x6B30u, 0x1556u, + 0x4286u, 0x3CE0u, 0xBE4Au, 0xC02Cu, 0x30A9u, 0x4ECFu, 0xCC65u, 0xB203u, + 0xA6D8u, 0xD8BEu, 0x5A14u, 0x2472u, 0xD4F7u, 0xAA91u, 0x283Bu, 0x565Du, + 0x8481u, 0xFAE7u, 0x784Du, 0x062Bu, 0xF6AEu, 0x88C8u, 0x0A62u, 0x7404u, + 0x60DFu, 0x1EB9u, 0x9C13u, 0xE275u, 0x12F0u, 0x6C96u, 0xEE3Cu, 0x905Au, + 0xC78Au, 0xB9ECu, 0x3B46u, 0x4520u, 0xB5A5u, 0xCBC3u, 0x4969u, 0x370Fu, + 0x23D4u, 0x5DB2u, 0xDF18u, 0xA17Eu, 0x51FBu, 0x2F9Du, 0xAD37u, 0xD351u, + 0x0297u, 0x7CF1u, 0xFE5Bu, 0x803Du, 0x70B8u, 0x0EDEu, 0x8C74u, 0xF212u, + 0xE6C9u, 0x98AFu, 0x1A05u, 0x6463u, 0x94E6u, 0xEA80u, 0x682Au, 0x164Cu, + 0x419Cu, 0x3FFAu, 0xBD50u, 0xC336u, 0x33B3u, 0x4DD5u, 0xCF7Fu, 0xB119u, + 0xA5C2u, 0xDBA4u, 0x590Eu, 0x2768u, 0xD7EDu, 0xA98Bu, 0x2B21u, 0x5547u, + 0x031Au, 0x7D7Cu, 0xFFD6u, 0x81B0u, 0x7135u, 0x0F53u, 0x8DF9u, 0xF39Fu, + 0xE744u, 0x9922u, 0x1B88u, 0x65EEu, 0x956Bu, 0xEB0Du, 0x69A7u, 0x17C1u, + 0x4011u, 0x3E77u, 0xBCDDu, 0xC2BBu, 0x323Eu, 0x4C58u, 0xCEF2u, 0xB094u, + 0xA44Fu, 0xDA29u, 0x5883u, 0x26E5u, 0xD660u, 0xA806u, 0x2AACu, 0x54CAu, + 0x850Cu, 0xFB6Au, 0x79C0u, 0x07A6u, 0xF723u, 0x8945u, 0x0BEFu, 0x7589u, + 0x6152u, 0x1F34u, 0x9D9Eu, 0xE3F8u, 0x137Du, 0x6D1Bu, 0xEFB1u, 0x91D7u, + 0xC607u, 0xB861u, 0x3ACBu, 0x44ADu, 0xB428u, 0xCA4Eu, 0x48E4u, 0x3682u, + 0x2259u, 0x5C3Fu, 0xDE95u, 0xA0F3u, 0x5076u, 0x2E10u, 0xACBAu, 0xD2DCu + }, + { + 0x0000u, 0x82B5u, 0x8EDDu, 0x0C68u, 0x960Du, 0x14B8u, 0x18D0u, 0x9A65u, + 0xA7ADu, 0x2518u, 0x2970u, 0xABC5u, 0x31A0u, 0xB315u, 0xBF7Du, 0x3DC8u, + 0xC4EDu, 0x4658u, 0x4A30u, 0xC885u, 0x52E0u, 0xD055u, 0xDC3Du, 0x5E88u, + 0x6340u, 0xE1F5u, 0xED9Du, 0x6F28u, 0xF54Du, 0x77F8u, 0x7B90u, 0xF925u, + 0x026Du, 0x80D8u, 0x8CB0u, 0x0E05u, 0x9460u, 0x16D5u, 0x1ABDu, 0x9808u, + 0xA5C0u, 0x2775u, 0x2B1Du, 0xA9A8u, 0x33CDu, 0xB178u, 0xBD10u, 0x3FA5u, + 0xC680u, 0x4435u, 0x485Du, 0xCAE8u, 0x508Du, 0xD238u, 0xDE50u, 0x5CE5u, + 0x612Du, 0xE398u, 0xEFF0u, 0x6D45u, 0xF720u, 0x7595u, 0x79FDu, 0xFB48u, + 0x04DAu, 0x866Fu, 0x8A07u, 0x08B2u, 0x92D7u, 0x1062u, 0x1C0Au, 0x9EBFu, + 0xA377u, 0x21C2u, 0x2DAAu, 0xAF1Fu, 0x357Au, 0xB7CFu, 0xBBA7u, 0x3912u, + 0xC037u, 0x4282u, 0x4EEAu, 0xCC5Fu, 0x563Au, 0xD48Fu, 0xD8E7u, 0x5A52u, + 0x679Au, 0xE52Fu, 0xE947u, 0x6BF2u, 0xF197u, 0x7322u, 0x7F4Au, 0xFDFFu, + 0x06B7u, 0x8402u, 0x886Au, 0x0ADFu, 0x90BAu, 0x120Fu, 0x1E67u, 0x9CD2u, + 0xA11Au, 0x23AFu, 0x2FC7u, 0xAD72u, 0x3717u, 0xB5A2u, 0xB9CAu, 0x3B7Fu, + 0xC25Au, 0x40EFu, 0x4C87u, 0xCE32u, 0x5457u, 0xD6E2u, 0xDA8Au, 0x583Fu, + 0x65F7u, 0xE742u, 0xEB2Au, 0x699Fu, 0xF3FAu, 0x714Fu, 0x7D27u, 0xFF92u, + 0x09B4u, 0x8B01u, 0x8769u, 0x05DCu, 0x9FB9u, 0x1D0Cu, 0x1164u, 0x93D1u, + 0xAE19u, 0x2CACu, 0x20C4u, 0xA271u, 0x3814u, 0xBAA1u, 0xB6C9u, 0x347Cu, + 0xCD59u, 0x4FECu, 0x4384u, 0xC131u, 0x5B54u, 0xD9E1u, 0xD589u, 0x573Cu, + 0x6AF4u, 0xE841u, 0xE429u, 0x669Cu, 0xFCF9u, 0x7E4Cu, 0x7224u, 0xF091u, + 0x0BD9u, 0x896Cu, 0x8504u, 0x07B1u, 0x9DD4u, 0x1F61u, 0x1309u, 0x91BCu, + 0xAC74u, 0x2EC1u, 0x22A9u, 0xA01Cu, 0x3A79u, 0xB8CCu, 0xB4A4u, 0x3611u, + 0xCF34u, 0x4D81u, 0x41E9u, 0xC35Cu, 0x5939u, 0xDB8Cu, 0xD7E4u, 0x5551u, + 0x6899u, 0xEA2Cu, 0xE644u, 0x64F1u, 0xFE94u, 0x7C21u, 0x7049u, 0xF2FCu, + 0x0D6Eu, 0x8FDBu, 0x83B3u, 0x0106u, 0x9B63u, 0x19D6u, 0x15BEu, 0x970Bu, + 0xAAC3u, 0x2876u, 0x241Eu, 0xA6ABu, 0x3CCEu, 0xBE7Bu, 0xB213u, 0x30A6u, + 0xC983u, 0x4B36u, 0x475Eu, 0xC5EBu, 0x5F8Eu, 0xDD3Bu, 0xD153u, 0x53E6u, + 0x6E2Eu, 0xEC9Bu, 0xE0F3u, 0x6246u, 0xF823u, 
0x7A96u, 0x76FEu, 0xF44Bu, + 0x0F03u, 0x8DB6u, 0x81DEu, 0x036Bu, 0x990Eu, 0x1BBBu, 0x17D3u, 0x9566u, + 0xA8AEu, 0x2A1Bu, 0x2673u, 0xA4C6u, 0x3EA3u, 0xBC16u, 0xB07Eu, 0x32CBu, + 0xCBEEu, 0x495Bu, 0x4533u, 0xC786u, 0x5DE3u, 0xDF56u, 0xD33Eu, 0x518Bu, + 0x6C43u, 0xEEF6u, 0xE29Eu, 0x602Bu, 0xFA4Eu, 0x78FBu, 0x7493u, 0xF626u + }, + { + 0x0000u, 0x1368u, 0x26D0u, 0x35B8u, 0x4DA0u, 0x5EC8u, 0x6B70u, 0x7818u, + 0x9B40u, 0x8828u, 0xBD90u, 0xAEF8u, 0xD6E0u, 0xC588u, 0xF030u, 0xE358u, + 0xBD37u, 0xAE5Fu, 0x9BE7u, 0x888Fu, 0xF097u, 0xE3FFu, 0xD647u, 0xC52Fu, + 0x2677u, 0x351Fu, 0x00A7u, 0x13CFu, 0x6BD7u, 0x78BFu, 0x4D07u, 0x5E6Fu, + 0xF1D9u, 0xE2B1u, 0xD709u, 0xC461u, 0xBC79u, 0xAF11u, 0x9AA9u, 0x89C1u, + 0x6A99u, 0x79F1u, 0x4C49u, 0x5F21u, 0x2739u, 0x3451u, 0x01E9u, 0x1281u, + 0x4CEEu, 0x5F86u, 0x6A3Eu, 0x7956u, 0x014Eu, 0x1226u, 0x279Eu, 0x34F6u, + 0xD7AEu, 0xC4C6u, 0xF17Eu, 0xE216u, 0x9A0Eu, 0x8966u, 0xBCDEu, 0xAFB6u, + 0x6805u, 0x7B6Du, 0x4ED5u, 0x5DBDu, 0x25A5u, 0x36CDu, 0x0375u, 0x101Du, + 0xF345u, 0xE02Du, 0xD595u, 0xC6FDu, 0xBEE5u, 0xAD8Du, 0x9835u, 0x8B5Du, + 0xD532u, 0xC65Au, 0xF3E2u, 0xE08Au, 0x9892u, 0x8BFAu, 0xBE42u, 0xAD2Au, + 0x4E72u, 0x5D1Au, 0x68A2u, 0x7BCAu, 0x03D2u, 0x10BAu, 0x2502u, 0x366Au, + 0x99DCu, 0x8AB4u, 0xBF0Cu, 0xAC64u, 0xD47Cu, 0xC714u, 0xF2ACu, 0xE1C4u, + 0x029Cu, 0x11F4u, 0x244Cu, 0x3724u, 0x4F3Cu, 0x5C54u, 0x69ECu, 0x7A84u, + 0x24EBu, 0x3783u, 0x023Bu, 0x1153u, 0x694Bu, 0x7A23u, 0x4F9Bu, 0x5CF3u, + 0xBFABu, 0xACC3u, 0x997Bu, 0x8A13u, 0xF20Bu, 0xE163u, 0xD4DBu, 0xC7B3u, + 0xD00Au, 0xC362u, 0xF6DAu, 0xE5B2u, 0x9DAAu, 0x8EC2u, 0xBB7Au, 0xA812u, + 0x4B4Au, 0x5822u, 0x6D9Au, 0x7EF2u, 0x06EAu, 0x1582u, 0x203Au, 0x3352u, + 0x6D3Du, 0x7E55u, 0x4BEDu, 0x5885u, 0x209Du, 0x33F5u, 0x064Du, 0x1525u, + 0xF67Du, 0xE515u, 0xD0ADu, 0xC3C5u, 0xBBDDu, 0xA8B5u, 0x9D0Du, 0x8E65u, + 0x21D3u, 0x32BBu, 0x0703u, 0x146Bu, 0x6C73u, 0x7F1Bu, 0x4AA3u, 0x59CBu, + 0xBA93u, 0xA9FBu, 0x9C43u, 0x8F2Bu, 0xF733u, 0xE45Bu, 0xD1E3u, 0xC28Bu, + 0x9CE4u, 0x8F8Cu, 0xBA34u, 0xA95Cu, 0xD144u, 0xC22Cu, 0xF794u, 0xE4FCu, + 0x07A4u, 0x14CCu, 0x2174u, 0x321Cu, 0x4A04u, 0x596Cu, 0x6CD4u, 0x7FBCu, + 0xB80Fu, 0xAB67u, 0x9EDFu, 0x8DB7u, 0xF5AFu, 0xE6C7u, 0xD37Fu, 0xC017u, + 0x234Fu, 0x3027u, 0x059Fu, 0x16F7u, 0x6EEFu, 0x7D87u, 0x483Fu, 0x5B57u, + 0x0538u, 0x1650u, 0x23E8u, 0x3080u, 0x4898u, 0x5BF0u, 0x6E48u, 0x7D20u, + 0x9E78u, 0x8D10u, 0xB8A8u, 0xABC0u, 0xD3D8u, 0xC0B0u, 0xF508u, 0xE660u, + 0x49D6u, 0x5ABEu, 0x6F06u, 0x7C6Eu, 0x0476u, 0x171Eu, 0x22A6u, 0x31CEu, + 0xD296u, 0xC1FEu, 0xF446u, 0xE72Eu, 0x9F36u, 0x8C5Eu, 0xB9E6u, 0xAA8Eu, + 0xF4E1u, 0xE789u, 0xD231u, 0xC159u, 0xB941u, 0xAA29u, 0x9F91u, 0x8CF9u, + 0x6FA1u, 0x7CC9u, 0x4971u, 0x5A19u, 0x2201u, 0x3169u, 0x04D1u, 0x17B9u + }, + { + 0x0000u, 0x2BA3u, 0x5746u, 0x7CE5u, 0xAE8Cu, 0x852Fu, 0xF9CAu, 0xD269u, + 0xD6AFu, 0xFD0Cu, 0x81E9u, 0xAA4Au, 0x7823u, 0x5380u, 0x2F65u, 0x04C6u, + 0x26E9u, 0x0D4Au, 0x71AFu, 0x5A0Cu, 0x8865u, 0xA3C6u, 0xDF23u, 0xF480u, + 0xF046u, 0xDBE5u, 0xA700u, 0x8CA3u, 0x5ECAu, 0x7569u, 0x098Cu, 0x222Fu, + 0x4DD2u, 0x6671u, 0x1A94u, 0x3137u, 0xE35Eu, 0xC8FDu, 0xB418u, 0x9FBBu, + 0x9B7Du, 0xB0DEu, 0xCC3Bu, 0xE798u, 0x35F1u, 0x1E52u, 0x62B7u, 0x4914u, + 0x6B3Bu, 0x4098u, 0x3C7Du, 0x17DEu, 0xC5B7u, 0xEE14u, 0x92F1u, 0xB952u, + 0xBD94u, 0x9637u, 0xEAD2u, 0xC171u, 0x1318u, 0x38BBu, 0x445Eu, 0x6FFDu, + 0x9BA4u, 0xB007u, 0xCCE2u, 0xE741u, 0x3528u, 0x1E8Bu, 0x626Eu, 0x49CDu, + 0x4D0Bu, 0x66A8u, 0x1A4Du, 0x31EEu, 0xE387u, 0xC824u, 0xB4C1u, 0x9F62u, + 0xBD4Du, 0x96EEu, 0xEA0Bu, 0xC1A8u, 0x13C1u, 0x3862u, 0x4487u, 0x6F24u, + 0x6BE2u, 0x4041u, 0x3CA4u, 
0x1707u, 0xC56Eu, 0xEECDu, 0x9228u, 0xB98Bu, + 0xD676u, 0xFDD5u, 0x8130u, 0xAA93u, 0x78FAu, 0x5359u, 0x2FBCu, 0x041Fu, + 0x00D9u, 0x2B7Au, 0x579Fu, 0x7C3Cu, 0xAE55u, 0x85F6u, 0xF913u, 0xD2B0u, + 0xF09Fu, 0xDB3Cu, 0xA7D9u, 0x8C7Au, 0x5E13u, 0x75B0u, 0x0955u, 0x22F6u, + 0x2630u, 0x0D93u, 0x7176u, 0x5AD5u, 0x88BCu, 0xA31Fu, 0xDFFAu, 0xF459u, + 0xBCFFu, 0x975Cu, 0xEBB9u, 0xC01Au, 0x1273u, 0x39D0u, 0x4535u, 0x6E96u, + 0x6A50u, 0x41F3u, 0x3D16u, 0x16B5u, 0xC4DCu, 0xEF7Fu, 0x939Au, 0xB839u, + 0x9A16u, 0xB1B5u, 0xCD50u, 0xE6F3u, 0x349Au, 0x1F39u, 0x63DCu, 0x487Fu, + 0x4CB9u, 0x671Au, 0x1BFFu, 0x305Cu, 0xE235u, 0xC996u, 0xB573u, 0x9ED0u, + 0xF12Du, 0xDA8Eu, 0xA66Bu, 0x8DC8u, 0x5FA1u, 0x7402u, 0x08E7u, 0x2344u, + 0x2782u, 0x0C21u, 0x70C4u, 0x5B67u, 0x890Eu, 0xA2ADu, 0xDE48u, 0xF5EBu, + 0xD7C4u, 0xFC67u, 0x8082u, 0xAB21u, 0x7948u, 0x52EBu, 0x2E0Eu, 0x05ADu, + 0x016Bu, 0x2AC8u, 0x562Du, 0x7D8Eu, 0xAFE7u, 0x8444u, 0xF8A1u, 0xD302u, + 0x275Bu, 0x0CF8u, 0x701Du, 0x5BBEu, 0x89D7u, 0xA274u, 0xDE91u, 0xF532u, + 0xF1F4u, 0xDA57u, 0xA6B2u, 0x8D11u, 0x5F78u, 0x74DBu, 0x083Eu, 0x239Du, + 0x01B2u, 0x2A11u, 0x56F4u, 0x7D57u, 0xAF3Eu, 0x849Du, 0xF878u, 0xD3DBu, + 0xD71Du, 0xFCBEu, 0x805Bu, 0xABF8u, 0x7991u, 0x5232u, 0x2ED7u, 0x0574u, + 0x6A89u, 0x412Au, 0x3DCFu, 0x166Cu, 0xC405u, 0xEFA6u, 0x9343u, 0xB8E0u, + 0xBC26u, 0x9785u, 0xEB60u, 0xC0C3u, 0x12AAu, 0x3909u, 0x45ECu, 0x6E4Fu, + 0x4C60u, 0x67C3u, 0x1B26u, 0x3085u, 0xE2ECu, 0xC94Fu, 0xB5AAu, 0x9E09u, + 0x9ACFu, 0xB16Cu, 0xCD89u, 0xE62Au, 0x3443u, 0x1FE0u, 0x6305u, 0x48A6u + }, + { + 0x0000u, 0xF249u, 0x6F25u, 0x9D6Cu, 0xDE4Au, 0x2C03u, 0xB16Fu, 0x4326u, + 0x3723u, 0xC56Au, 0x5806u, 0xAA4Fu, 0xE969u, 0x1B20u, 0x864Cu, 0x7405u, + 0x6E46u, 0x9C0Fu, 0x0163u, 0xF32Au, 0xB00Cu, 0x4245u, 0xDF29u, 0x2D60u, + 0x5965u, 0xAB2Cu, 0x3640u, 0xC409u, 0x872Fu, 0x7566u, 0xE80Au, 0x1A43u, + 0xDC8Cu, 0x2EC5u, 0xB3A9u, 0x41E0u, 0x02C6u, 0xF08Fu, 0x6DE3u, 0x9FAAu, + 0xEBAFu, 0x19E6u, 0x848Au, 0x76C3u, 0x35E5u, 0xC7ACu, 0x5AC0u, 0xA889u, + 0xB2CAu, 0x4083u, 0xDDEFu, 0x2FA6u, 0x6C80u, 0x9EC9u, 0x03A5u, 0xF1ECu, + 0x85E9u, 0x77A0u, 0xEACCu, 0x1885u, 0x5BA3u, 0xA9EAu, 0x3486u, 0xC6CFu, + 0x32AFu, 0xC0E6u, 0x5D8Au, 0xAFC3u, 0xECE5u, 0x1EACu, 0x83C0u, 0x7189u, + 0x058Cu, 0xF7C5u, 0x6AA9u, 0x98E0u, 0xDBC6u, 0x298Fu, 0xB4E3u, 0x46AAu, + 0x5CE9u, 0xAEA0u, 0x33CCu, 0xC185u, 0x82A3u, 0x70EAu, 0xED86u, 0x1FCFu, + 0x6BCAu, 0x9983u, 0x04EFu, 0xF6A6u, 0xB580u, 0x47C9u, 0xDAA5u, 0x28ECu, + 0xEE23u, 0x1C6Au, 0x8106u, 0x734Fu, 0x3069u, 0xC220u, 0x5F4Cu, 0xAD05u, + 0xD900u, 0x2B49u, 0xB625u, 0x446Cu, 0x074Au, 0xF503u, 0x686Fu, 0x9A26u, + 0x8065u, 0x722Cu, 0xEF40u, 0x1D09u, 0x5E2Fu, 0xAC66u, 0x310Au, 0xC343u, + 0xB746u, 0x450Fu, 0xD863u, 0x2A2Au, 0x690Cu, 0x9B45u, 0x0629u, 0xF460u, + 0x655Eu, 0x9717u, 0x0A7Bu, 0xF832u, 0xBB14u, 0x495Du, 0xD431u, 0x2678u, + 0x527Du, 0xA034u, 0x3D58u, 0xCF11u, 0x8C37u, 0x7E7Eu, 0xE312u, 0x115Bu, + 0x0B18u, 0xF951u, 0x643Du, 0x9674u, 0xD552u, 0x271Bu, 0xBA77u, 0x483Eu, + 0x3C3Bu, 0xCE72u, 0x531Eu, 0xA157u, 0xE271u, 0x1038u, 0x8D54u, 0x7F1Du, + 0xB9D2u, 0x4B9Bu, 0xD6F7u, 0x24BEu, 0x6798u, 0x95D1u, 0x08BDu, 0xFAF4u, + 0x8EF1u, 0x7CB8u, 0xE1D4u, 0x139Du, 0x50BBu, 0xA2F2u, 0x3F9Eu, 0xCDD7u, + 0xD794u, 0x25DDu, 0xB8B1u, 0x4AF8u, 0x09DEu, 0xFB97u, 0x66FBu, 0x94B2u, + 0xE0B7u, 0x12FEu, 0x8F92u, 0x7DDBu, 0x3EFDu, 0xCCB4u, 0x51D8u, 0xA391u, + 0x57F1u, 0xA5B8u, 0x38D4u, 0xCA9Du, 0x89BBu, 0x7BF2u, 0xE69Eu, 0x14D7u, + 0x60D2u, 0x929Bu, 0x0FF7u, 0xFDBEu, 0xBE98u, 0x4CD1u, 0xD1BDu, 0x23F4u, + 0x39B7u, 0xCBFEu, 0x5692u, 0xA4DBu, 0xE7FDu, 0x15B4u, 0x88D8u, 0x7A91u, + 0x0E94u, 0xFCDDu, 
0x61B1u, 0x93F8u, 0xD0DEu, 0x2297u, 0xBFFBu, 0x4DB2u, + 0x8B7Du, 0x7934u, 0xE458u, 0x1611u, 0x5537u, 0xA77Eu, 0x3A12u, 0xC85Bu, + 0xBC5Eu, 0x4E17u, 0xD37Bu, 0x2132u, 0x6214u, 0x905Du, 0x0D31u, 0xFF78u, + 0xE53Bu, 0x1772u, 0x8A1Eu, 0x7857u, 0x3B71u, 0xC938u, 0x5454u, 0xA61Du, + 0xD218u, 0x2051u, 0xBD3Du, 0x4F74u, 0x0C52u, 0xFE1Bu, 0x6377u, 0x913Eu + }, + { + 0x0000u, 0xCABCu, 0x1ECFu, 0xD473u, 0x3D9Eu, 0xF722u, 0x2351u, 0xE9EDu, + 0x7B3Cu, 0xB180u, 0x65F3u, 0xAF4Fu, 0x46A2u, 0x8C1Eu, 0x586Du, 0x92D1u, + 0xF678u, 0x3CC4u, 0xE8B7u, 0x220Bu, 0xCBE6u, 0x015Au, 0xD529u, 0x1F95u, + 0x8D44u, 0x47F8u, 0x938Bu, 0x5937u, 0xB0DAu, 0x7A66u, 0xAE15u, 0x64A9u, + 0x6747u, 0xADFBu, 0x7988u, 0xB334u, 0x5AD9u, 0x9065u, 0x4416u, 0x8EAAu, + 0x1C7Bu, 0xD6C7u, 0x02B4u, 0xC808u, 0x21E5u, 0xEB59u, 0x3F2Au, 0xF596u, + 0x913Fu, 0x5B83u, 0x8FF0u, 0x454Cu, 0xACA1u, 0x661Du, 0xB26Eu, 0x78D2u, + 0xEA03u, 0x20BFu, 0xF4CCu, 0x3E70u, 0xD79Du, 0x1D21u, 0xC952u, 0x03EEu, + 0xCE8Eu, 0x0432u, 0xD041u, 0x1AFDu, 0xF310u, 0x39ACu, 0xEDDFu, 0x2763u, + 0xB5B2u, 0x7F0Eu, 0xAB7Du, 0x61C1u, 0x882Cu, 0x4290u, 0x96E3u, 0x5C5Fu, + 0x38F6u, 0xF24Au, 0x2639u, 0xEC85u, 0x0568u, 0xCFD4u, 0x1BA7u, 0xD11Bu, + 0x43CAu, 0x8976u, 0x5D05u, 0x97B9u, 0x7E54u, 0xB4E8u, 0x609Bu, 0xAA27u, + 0xA9C9u, 0x6375u, 0xB706u, 0x7DBAu, 0x9457u, 0x5EEBu, 0x8A98u, 0x4024u, + 0xD2F5u, 0x1849u, 0xCC3Au, 0x0686u, 0xEF6Bu, 0x25D7u, 0xF1A4u, 0x3B18u, + 0x5FB1u, 0x950Du, 0x417Eu, 0x8BC2u, 0x622Fu, 0xA893u, 0x7CE0u, 0xB65Cu, + 0x248Du, 0xEE31u, 0x3A42u, 0xF0FEu, 0x1913u, 0xD3AFu, 0x07DCu, 0xCD60u, + 0x16ABu, 0xDC17u, 0x0864u, 0xC2D8u, 0x2B35u, 0xE189u, 0x35FAu, 0xFF46u, + 0x6D97u, 0xA72Bu, 0x7358u, 0xB9E4u, 0x5009u, 0x9AB5u, 0x4EC6u, 0x847Au, + 0xE0D3u, 0x2A6Fu, 0xFE1Cu, 0x34A0u, 0xDD4Du, 0x17F1u, 0xC382u, 0x093Eu, + 0x9BEFu, 0x5153u, 0x8520u, 0x4F9Cu, 0xA671u, 0x6CCDu, 0xB8BEu, 0x7202u, + 0x71ECu, 0xBB50u, 0x6F23u, 0xA59Fu, 0x4C72u, 0x86CEu, 0x52BDu, 0x9801u, + 0x0AD0u, 0xC06Cu, 0x141Fu, 0xDEA3u, 0x374Eu, 0xFDF2u, 0x2981u, 0xE33Du, + 0x8794u, 0x4D28u, 0x995Bu, 0x53E7u, 0xBA0Au, 0x70B6u, 0xA4C5u, 0x6E79u, + 0xFCA8u, 0x3614u, 0xE267u, 0x28DBu, 0xC136u, 0x0B8Au, 0xDFF9u, 0x1545u, + 0xD825u, 0x1299u, 0xC6EAu, 0x0C56u, 0xE5BBu, 0x2F07u, 0xFB74u, 0x31C8u, + 0xA319u, 0x69A5u, 0xBDD6u, 0x776Au, 0x9E87u, 0x543Bu, 0x8048u, 0x4AF4u, + 0x2E5Du, 0xE4E1u, 0x3092u, 0xFA2Eu, 0x13C3u, 0xD97Fu, 0x0D0Cu, 0xC7B0u, + 0x5561u, 0x9FDDu, 0x4BAEu, 0x8112u, 0x68FFu, 0xA243u, 0x7630u, 0xBC8Cu, + 0xBF62u, 0x75DEu, 0xA1ADu, 0x6B11u, 0x82FCu, 0x4840u, 0x9C33u, 0x568Fu, + 0xC45Eu, 0x0EE2u, 0xDA91u, 0x102Du, 0xF9C0u, 0x337Cu, 0xE70Fu, 0x2DB3u, + 0x491Au, 0x83A6u, 0x57D5u, 0x9D69u, 0x7484u, 0xBE38u, 0x6A4Bu, 0xA0F7u, + 0x3226u, 0xF89Au, 0x2CE9u, 0xE655u, 0x0FB8u, 0xC504u, 0x1177u, 0xDBCBu + }, + { + 0x0000u, 0x2D56u, 0x5AACu, 0x77FAu, 0xB558u, 0x980Eu, 0xEFF4u, 0xC2A2u, + 0xE107u, 0xCC51u, 0xBBABu, 0x96FDu, 0x545Fu, 0x7909u, 0x0EF3u, 0x23A5u, + 0x49B9u, 0x64EFu, 0x1315u, 0x3E43u, 0xFCE1u, 0xD1B7u, 0xA64Du, 0x8B1Bu, + 0xA8BEu, 0x85E8u, 0xF212u, 0xDF44u, 0x1DE6u, 0x30B0u, 0x474Au, 0x6A1Cu, + 0x9372u, 0xBE24u, 0xC9DEu, 0xE488u, 0x262Au, 0x0B7Cu, 0x7C86u, 0x51D0u, + 0x7275u, 0x5F23u, 0x28D9u, 0x058Fu, 0xC72Du, 0xEA7Bu, 0x9D81u, 0xB0D7u, + 0xDACBu, 0xF79Du, 0x8067u, 0xAD31u, 0x6F93u, 0x42C5u, 0x353Fu, 0x1869u, + 0x3BCCu, 0x169Au, 0x6160u, 0x4C36u, 0x8E94u, 0xA3C2u, 0xD438u, 0xF96Eu, + 0xAD53u, 0x8005u, 0xF7FFu, 0xDAA9u, 0x180Bu, 0x355Du, 0x42A7u, 0x6FF1u, + 0x4C54u, 0x6102u, 0x16F8u, 0x3BAEu, 0xF90Cu, 0xD45Au, 0xA3A0u, 0x8EF6u, + 0xE4EAu, 0xC9BCu, 0xBE46u, 0x9310u, 0x51B2u, 0x7CE4u, 0x0B1Eu, 0x2648u, + 
0x05EDu, 0x28BBu, 0x5F41u, 0x7217u, 0xB0B5u, 0x9DE3u, 0xEA19u, 0xC74Fu, + 0x3E21u, 0x1377u, 0x648Du, 0x49DBu, 0x8B79u, 0xA62Fu, 0xD1D5u, 0xFC83u, + 0xDF26u, 0xF270u, 0x858Au, 0xA8DCu, 0x6A7Eu, 0x4728u, 0x30D2u, 0x1D84u, + 0x7798u, 0x5ACEu, 0x2D34u, 0x0062u, 0xC2C0u, 0xEF96u, 0x986Cu, 0xB53Au, + 0x969Fu, 0xBBC9u, 0xCC33u, 0xE165u, 0x23C7u, 0x0E91u, 0x796Bu, 0x543Du, + 0xD111u, 0xFC47u, 0x8BBDu, 0xA6EBu, 0x6449u, 0x491Fu, 0x3EE5u, 0x13B3u, + 0x3016u, 0x1D40u, 0x6ABAu, 0x47ECu, 0x854Eu, 0xA818u, 0xDFE2u, 0xF2B4u, + 0x98A8u, 0xB5FEu, 0xC204u, 0xEF52u, 0x2DF0u, 0x00A6u, 0x775Cu, 0x5A0Au, + 0x79AFu, 0x54F9u, 0x2303u, 0x0E55u, 0xCCF7u, 0xE1A1u, 0x965Bu, 0xBB0Du, + 0x4263u, 0x6F35u, 0x18CFu, 0x3599u, 0xF73Bu, 0xDA6Du, 0xAD97u, 0x80C1u, + 0xA364u, 0x8E32u, 0xF9C8u, 0xD49Eu, 0x163Cu, 0x3B6Au, 0x4C90u, 0x61C6u, + 0x0BDAu, 0x268Cu, 0x5176u, 0x7C20u, 0xBE82u, 0x93D4u, 0xE42Eu, 0xC978u, + 0xEADDu, 0xC78Bu, 0xB071u, 0x9D27u, 0x5F85u, 0x72D3u, 0x0529u, 0x287Fu, + 0x7C42u, 0x5114u, 0x26EEu, 0x0BB8u, 0xC91Au, 0xE44Cu, 0x93B6u, 0xBEE0u, + 0x9D45u, 0xB013u, 0xC7E9u, 0xEABFu, 0x281Du, 0x054Bu, 0x72B1u, 0x5FE7u, + 0x35FBu, 0x18ADu, 0x6F57u, 0x4201u, 0x80A3u, 0xADF5u, 0xDA0Fu, 0xF759u, + 0xD4FCu, 0xF9AAu, 0x8E50u, 0xA306u, 0x61A4u, 0x4CF2u, 0x3B08u, 0x165Eu, + 0xEF30u, 0xC266u, 0xB59Cu, 0x98CAu, 0x5A68u, 0x773Eu, 0x00C4u, 0x2D92u, + 0x0E37u, 0x2361u, 0x549Bu, 0x79CDu, 0xBB6Fu, 0x9639u, 0xE1C3u, 0xCC95u, + 0xA689u, 0x8BDFu, 0xFC25u, 0xD173u, 0x13D1u, 0x3E87u, 0x497Du, 0x642Bu, + 0x478Eu, 0x6AD8u, 0x1D22u, 0x3074u, 0xF2D6u, 0xDF80u, 0xA87Au, 0x852Cu + }, + { + 0x0000u, 0x2995u, 0x532Au, 0x7ABFu, 0xA654u, 0x8FC1u, 0xF57Eu, 0xDCEBu, + 0xC71Fu, 0xEE8Au, 0x9435u, 0xBDA0u, 0x614Bu, 0x48DEu, 0x3261u, 0x1BF4u, + 0x0589u, 0x2C1Cu, 0x56A3u, 0x7F36u, 0xA3DDu, 0x8A48u, 0xF0F7u, 0xD962u, + 0xC296u, 0xEB03u, 0x91BCu, 0xB829u, 0x64C2u, 0x4D57u, 0x37E8u, 0x1E7Du, + 0x0B12u, 0x2287u, 0x5838u, 0x71ADu, 0xAD46u, 0x84D3u, 0xFE6Cu, 0xD7F9u, + 0xCC0Du, 0xE598u, 0x9F27u, 0xB6B2u, 0x6A59u, 0x43CCu, 0x3973u, 0x10E6u, + 0x0E9Bu, 0x270Eu, 0x5DB1u, 0x7424u, 0xA8CFu, 0x815Au, 0xFBE5u, 0xD270u, + 0xC984u, 0xE011u, 0x9AAEu, 0xB33Bu, 0x6FD0u, 0x4645u, 0x3CFAu, 0x156Fu, + 0x1624u, 0x3FB1u, 0x450Eu, 0x6C9Bu, 0xB070u, 0x99E5u, 0xE35Au, 0xCACFu, + 0xD13Bu, 0xF8AEu, 0x8211u, 0xAB84u, 0x776Fu, 0x5EFAu, 0x2445u, 0x0DD0u, + 0x13ADu, 0x3A38u, 0x4087u, 0x6912u, 0xB5F9u, 0x9C6Cu, 0xE6D3u, 0xCF46u, + 0xD4B2u, 0xFD27u, 0x8798u, 0xAE0Du, 0x72E6u, 0x5B73u, 0x21CCu, 0x0859u, + 0x1D36u, 0x34A3u, 0x4E1Cu, 0x6789u, 0xBB62u, 0x92F7u, 0xE848u, 0xC1DDu, + 0xDA29u, 0xF3BCu, 0x8903u, 0xA096u, 0x7C7Du, 0x55E8u, 0x2F57u, 0x06C2u, + 0x18BFu, 0x312Au, 0x4B95u, 0x6200u, 0xBEEBu, 0x977Eu, 0xEDC1u, 0xC454u, + 0xDFA0u, 0xF635u, 0x8C8Au, 0xA51Fu, 0x79F4u, 0x5061u, 0x2ADEu, 0x034Bu, + 0x2C48u, 0x05DDu, 0x7F62u, 0x56F7u, 0x8A1Cu, 0xA389u, 0xD936u, 0xF0A3u, + 0xEB57u, 0xC2C2u, 0xB87Du, 0x91E8u, 0x4D03u, 0x6496u, 0x1E29u, 0x37BCu, + 0x29C1u, 0x0054u, 0x7AEBu, 0x537Eu, 0x8F95u, 0xA600u, 0xDCBFu, 0xF52Au, + 0xEEDEu, 0xC74Bu, 0xBDF4u, 0x9461u, 0x488Au, 0x611Fu, 0x1BA0u, 0x3235u, + 0x275Au, 0x0ECFu, 0x7470u, 0x5DE5u, 0x810Eu, 0xA89Bu, 0xD224u, 0xFBB1u, + 0xE045u, 0xC9D0u, 0xB36Fu, 0x9AFAu, 0x4611u, 0x6F84u, 0x153Bu, 0x3CAEu, + 0x22D3u, 0x0B46u, 0x71F9u, 0x586Cu, 0x8487u, 0xAD12u, 0xD7ADu, 0xFE38u, + 0xE5CCu, 0xCC59u, 0xB6E6u, 0x9F73u, 0x4398u, 0x6A0Du, 0x10B2u, 0x3927u, + 0x3A6Cu, 0x13F9u, 0x6946u, 0x40D3u, 0x9C38u, 0xB5ADu, 0xCF12u, 0xE687u, + 0xFD73u, 0xD4E6u, 0xAE59u, 0x87CCu, 0x5B27u, 0x72B2u, 0x080Du, 0x2198u, + 0x3FE5u, 0x1670u, 0x6CCFu, 0x455Au, 0x99B1u, 0xB024u, 0xCA9Bu, 
0xE30Eu, + 0xF8FAu, 0xD16Fu, 0xABD0u, 0x8245u, 0x5EAEu, 0x773Bu, 0x0D84u, 0x2411u, + 0x317Eu, 0x18EBu, 0x6254u, 0x4BC1u, 0x972Au, 0xBEBFu, 0xC400u, 0xED95u, + 0xF661u, 0xDFF4u, 0xA54Bu, 0x8CDEu, 0x5035u, 0x79A0u, 0x031Fu, 0x2A8Au, + 0x34F7u, 0x1D62u, 0x67DDu, 0x4E48u, 0x92A3u, 0xBB36u, 0xC189u, 0xE81Cu, + 0xF3E8u, 0xDA7Du, 0xA0C2u, 0x8957u, 0x55BCu, 0x7C29u, 0x0696u, 0x2F03u + }, + { + 0x0000u, 0x5890u, 0xB120u, 0xE9B0u, 0xE9F7u, 0xB167u, 0x58D7u, 0x0047u, + 0x5859u, 0x00C9u, 0xE979u, 0xB1E9u, 0xB1AEu, 0xE93Eu, 0x008Eu, 0x581Eu, + 0xB0B2u, 0xE822u, 0x0192u, 0x5902u, 0x5945u, 0x01D5u, 0xE865u, 0xB0F5u, + 0xE8EBu, 0xB07Bu, 0x59CBu, 0x015Bu, 0x011Cu, 0x598Cu, 0xB03Cu, 0xE8ACu, + 0xEAD3u, 0xB243u, 0x5BF3u, 0x0363u, 0x0324u, 0x5BB4u, 0xB204u, 0xEA94u, + 0xB28Au, 0xEA1Au, 0x03AAu, 0x5B3Au, 0x5B7Du, 0x03EDu, 0xEA5Du, 0xB2CDu, + 0x5A61u, 0x02F1u, 0xEB41u, 0xB3D1u, 0xB396u, 0xEB06u, 0x02B6u, 0x5A26u, + 0x0238u, 0x5AA8u, 0xB318u, 0xEB88u, 0xEBCFu, 0xB35Fu, 0x5AEFu, 0x027Fu, + 0x5E11u, 0x0681u, 0xEF31u, 0xB7A1u, 0xB7E6u, 0xEF76u, 0x06C6u, 0x5E56u, + 0x0648u, 0x5ED8u, 0xB768u, 0xEFF8u, 0xEFBFu, 0xB72Fu, 0x5E9Fu, 0x060Fu, + 0xEEA3u, 0xB633u, 0x5F83u, 0x0713u, 0x0754u, 0x5FC4u, 0xB674u, 0xEEE4u, + 0xB6FAu, 0xEE6Au, 0x07DAu, 0x5F4Au, 0x5F0Du, 0x079Du, 0xEE2Du, 0xB6BDu, + 0xB4C2u, 0xEC52u, 0x05E2u, 0x5D72u, 0x5D35u, 0x05A5u, 0xEC15u, 0xB485u, + 0xEC9Bu, 0xB40Bu, 0x5DBBu, 0x052Bu, 0x056Cu, 0x5DFCu, 0xB44Cu, 0xECDCu, + 0x0470u, 0x5CE0u, 0xB550u, 0xEDC0u, 0xED87u, 0xB517u, 0x5CA7u, 0x0437u, + 0x5C29u, 0x04B9u, 0xED09u, 0xB599u, 0xB5DEu, 0xED4Eu, 0x04FEu, 0x5C6Eu, + 0xBC22u, 0xE4B2u, 0x0D02u, 0x5592u, 0x55D5u, 0x0D45u, 0xE4F5u, 0xBC65u, + 0xE47Bu, 0xBCEBu, 0x555Bu, 0x0DCBu, 0x0D8Cu, 0x551Cu, 0xBCACu, 0xE43Cu, + 0x0C90u, 0x5400u, 0xBDB0u, 0xE520u, 0xE567u, 0xBDF7u, 0x5447u, 0x0CD7u, + 0x54C9u, 0x0C59u, 0xE5E9u, 0xBD79u, 0xBD3Eu, 0xE5AEu, 0x0C1Eu, 0x548Eu, + 0x56F1u, 0x0E61u, 0xE7D1u, 0xBF41u, 0xBF06u, 0xE796u, 0x0E26u, 0x56B6u, + 0x0EA8u, 0x5638u, 0xBF88u, 0xE718u, 0xE75Fu, 0xBFCFu, 0x567Fu, 0x0EEFu, + 0xE643u, 0xBED3u, 0x5763u, 0x0FF3u, 0x0FB4u, 0x5724u, 0xBE94u, 0xE604u, + 0xBE1Au, 0xE68Au, 0x0F3Au, 0x57AAu, 0x57EDu, 0x0F7Du, 0xE6CDu, 0xBE5Du, + 0xE233u, 0xBAA3u, 0x5313u, 0x0B83u, 0x0BC4u, 0x5354u, 0xBAE4u, 0xE274u, + 0xBA6Au, 0xE2FAu, 0x0B4Au, 0x53DAu, 0x539Du, 0x0B0Du, 0xE2BDu, 0xBA2Du, + 0x5281u, 0x0A11u, 0xE3A1u, 0xBB31u, 0xBB76u, 0xE3E6u, 0x0A56u, 0x52C6u, + 0x0AD8u, 0x5248u, 0xBBF8u, 0xE368u, 0xE32Fu, 0xBBBFu, 0x520Fu, 0x0A9Fu, + 0x08E0u, 0x5070u, 0xB9C0u, 0xE150u, 0xE117u, 0xB987u, 0x5037u, 0x08A7u, + 0x50B9u, 0x0829u, 0xE199u, 0xB909u, 0xB94Eu, 0xE1DEu, 0x086Eu, 0x50FEu, + 0xB852u, 0xE0C2u, 0x0972u, 0x51E2u, 0x51A5u, 0x0935u, 0xE085u, 0xB815u, + 0xE00Bu, 0xB89Bu, 0x512Bu, 0x09BBu, 0x09FCu, 0x516Cu, 0xB8DCu, 0xE04Cu + }, + { + 0x0000u, 0xF3F3u, 0x6C51u, 0x9FA2u, 0xD8A2u, 0x2B51u, 0xB4F3u, 0x4700u, + 0x3AF3u, 0xC900u, 0x56A2u, 0xA551u, 0xE251u, 0x11A2u, 0x8E00u, 0x7DF3u, + 0x75E6u, 0x8615u, 0x19B7u, 0xEA44u, 0xAD44u, 0x5EB7u, 0xC115u, 0x32E6u, + 0x4F15u, 0xBCE6u, 0x2344u, 0xD0B7u, 0x97B7u, 0x6444u, 0xFBE6u, 0x0815u, + 0xEBCCu, 0x183Fu, 0x879Du, 0x746Eu, 0x336Eu, 0xC09Du, 0x5F3Fu, 0xACCCu, + 0xD13Fu, 0x22CCu, 0xBD6Eu, 0x4E9Du, 0x099Du, 0xFA6Eu, 0x65CCu, 0x963Fu, + 0x9E2Au, 0x6DD9u, 0xF27Bu, 0x0188u, 0x4688u, 0xB57Bu, 0x2AD9u, 0xD92Au, + 0xA4D9u, 0x572Au, 0xC888u, 0x3B7Bu, 0x7C7Bu, 0x8F88u, 0x102Au, 0xE3D9u, + 0x5C2Fu, 0xAFDCu, 0x307Eu, 0xC38Du, 0x848Du, 0x777Eu, 0xE8DCu, 0x1B2Fu, + 0x66DCu, 0x952Fu, 0x0A8Du, 0xF97Eu, 0xBE7Eu, 0x4D8Du, 0xD22Fu, 0x21DCu, + 0x29C9u, 0xDA3Au, 0x4598u, 0xB66Bu, 0xF16Bu, 
0x0298u, 0x9D3Au, 0x6EC9u, + 0x133Au, 0xE0C9u, 0x7F6Bu, 0x8C98u, 0xCB98u, 0x386Bu, 0xA7C9u, 0x543Au, + 0xB7E3u, 0x4410u, 0xDBB2u, 0x2841u, 0x6F41u, 0x9CB2u, 0x0310u, 0xF0E3u, + 0x8D10u, 0x7EE3u, 0xE141u, 0x12B2u, 0x55B2u, 0xA641u, 0x39E3u, 0xCA10u, + 0xC205u, 0x31F6u, 0xAE54u, 0x5DA7u, 0x1AA7u, 0xE954u, 0x76F6u, 0x8505u, + 0xF8F6u, 0x0B05u, 0x94A7u, 0x6754u, 0x2054u, 0xD3A7u, 0x4C05u, 0xBFF6u, + 0xB85Eu, 0x4BADu, 0xD40Fu, 0x27FCu, 0x60FCu, 0x930Fu, 0x0CADu, 0xFF5Eu, + 0x82ADu, 0x715Eu, 0xEEFCu, 0x1D0Fu, 0x5A0Fu, 0xA9FCu, 0x365Eu, 0xC5ADu, + 0xCDB8u, 0x3E4Bu, 0xA1E9u, 0x521Au, 0x151Au, 0xE6E9u, 0x794Bu, 0x8AB8u, + 0xF74Bu, 0x04B8u, 0x9B1Au, 0x68E9u, 0x2FE9u, 0xDC1Au, 0x43B8u, 0xB04Bu, + 0x5392u, 0xA061u, 0x3FC3u, 0xCC30u, 0x8B30u, 0x78C3u, 0xE761u, 0x1492u, + 0x6961u, 0x9A92u, 0x0530u, 0xF6C3u, 0xB1C3u, 0x4230u, 0xDD92u, 0x2E61u, + 0x2674u, 0xD587u, 0x4A25u, 0xB9D6u, 0xFED6u, 0x0D25u, 0x9287u, 0x6174u, + 0x1C87u, 0xEF74u, 0x70D6u, 0x8325u, 0xC425u, 0x37D6u, 0xA874u, 0x5B87u, + 0xE471u, 0x1782u, 0x8820u, 0x7BD3u, 0x3CD3u, 0xCF20u, 0x5082u, 0xA371u, + 0xDE82u, 0x2D71u, 0xB2D3u, 0x4120u, 0x0620u, 0xF5D3u, 0x6A71u, 0x9982u, + 0x9197u, 0x6264u, 0xFDC6u, 0x0E35u, 0x4935u, 0xBAC6u, 0x2564u, 0xD697u, + 0xAB64u, 0x5897u, 0xC735u, 0x34C6u, 0x73C6u, 0x8035u, 0x1F97u, 0xEC64u, + 0x0FBDu, 0xFC4Eu, 0x63ECu, 0x901Fu, 0xD71Fu, 0x24ECu, 0xBB4Eu, 0x48BDu, + 0x354Eu, 0xC6BDu, 0x591Fu, 0xAAECu, 0xEDECu, 0x1E1Fu, 0x81BDu, 0x724Eu, + 0x7A5Bu, 0x89A8u, 0x160Au, 0xE5F9u, 0xA2F9u, 0x510Au, 0xCEA8u, 0x3D5Bu, + 0x40A8u, 0xB35Bu, 0x2CF9u, 0xDF0Au, 0x980Au, 0x6BF9u, 0xF45Bu, 0x07A8u + }, + { + 0x0000u, 0xFB0Bu, 0x7DA1u, 0x86AAu, 0xFB42u, 0x0049u, 0x86E3u, 0x7DE8u, + 0x7D33u, 0x8638u, 0x0092u, 0xFB99u, 0x8671u, 0x7D7Au, 0xFBD0u, 0x00DBu, + 0xFA66u, 0x016Du, 0x87C7u, 0x7CCCu, 0x0124u, 0xFA2Fu, 0x7C85u, 0x878Eu, + 0x8755u, 0x7C5Eu, 0xFAF4u, 0x01FFu, 0x7C17u, 0x871Cu, 0x01B6u, 0xFABDu, + 0x7F7Bu, 0x8470u, 0x02DAu, 0xF9D1u, 0x8439u, 0x7F32u, 0xF998u, 0x0293u, + 0x0248u, 0xF943u, 0x7FE9u, 0x84E2u, 0xF90Au, 0x0201u, 0x84ABu, 0x7FA0u, + 0x851Du, 0x7E16u, 0xF8BCu, 0x03B7u, 0x7E5Fu, 0x8554u, 0x03FEu, 0xF8F5u, + 0xF82Eu, 0x0325u, 0x858Fu, 0x7E84u, 0x036Cu, 0xF867u, 0x7ECDu, 0x85C6u, + 0xFEF6u, 0x05FDu, 0x8357u, 0x785Cu, 0x05B4u, 0xFEBFu, 0x7815u, 0x831Eu, + 0x83C5u, 0x78CEu, 0xFE64u, 0x056Fu, 0x7887u, 0x838Cu, 0x0526u, 0xFE2Du, + 0x0490u, 0xFF9Bu, 0x7931u, 0x823Au, 0xFFD2u, 0x04D9u, 0x8273u, 0x7978u, + 0x79A3u, 0x82A8u, 0x0402u, 0xFF09u, 0x82E1u, 0x79EAu, 0xFF40u, 0x044Bu, + 0x818Du, 0x7A86u, 0xFC2Cu, 0x0727u, 0x7ACFu, 0x81C4u, 0x076Eu, 0xFC65u, + 0xFCBEu, 0x07B5u, 0x811Fu, 0x7A14u, 0x07FCu, 0xFCF7u, 0x7A5Du, 0x8156u, + 0x7BEBu, 0x80E0u, 0x064Au, 0xFD41u, 0x80A9u, 0x7BA2u, 0xFD08u, 0x0603u, + 0x06D8u, 0xFDD3u, 0x7B79u, 0x8072u, 0xFD9Au, 0x0691u, 0x803Bu, 0x7B30u, + 0x765Bu, 0x8D50u, 0x0BFAu, 0xF0F1u, 0x8D19u, 0x7612u, 0xF0B8u, 0x0BB3u, + 0x0B68u, 0xF063u, 0x76C9u, 0x8DC2u, 0xF02Au, 0x0B21u, 0x8D8Bu, 0x7680u, + 0x8C3Du, 0x7736u, 0xF19Cu, 0x0A97u, 0x777Fu, 0x8C74u, 0x0ADEu, 0xF1D5u, + 0xF10Eu, 0x0A05u, 0x8CAFu, 0x77A4u, 0x0A4Cu, 0xF147u, 0x77EDu, 0x8CE6u, + 0x0920u, 0xF22Bu, 0x7481u, 0x8F8Au, 0xF262u, 0x0969u, 0x8FC3u, 0x74C8u, + 0x7413u, 0x8F18u, 0x09B2u, 0xF2B9u, 0x8F51u, 0x745Au, 0xF2F0u, 0x09FBu, + 0xF346u, 0x084Du, 0x8EE7u, 0x75ECu, 0x0804u, 0xF30Fu, 0x75A5u, 0x8EAEu, + 0x8E75u, 0x757Eu, 0xF3D4u, 0x08DFu, 0x7537u, 0x8E3Cu, 0x0896u, 0xF39Du, + 0x88ADu, 0x73A6u, 0xF50Cu, 0x0E07u, 0x73EFu, 0x88E4u, 0x0E4Eu, 0xF545u, + 0xF59Eu, 0x0E95u, 0x883Fu, 0x7334u, 0x0EDCu, 0xF5D7u, 0x737Du, 0x8876u, + 0x72CBu, 0x89C0u, 0x0F6Au, 0xF461u, 
0x8989u, 0x7282u, 0xF428u, 0x0F23u, + 0x0FF8u, 0xF4F3u, 0x7259u, 0x8952u, 0xF4BAu, 0x0FB1u, 0x891Bu, 0x7210u, + 0xF7D6u, 0x0CDDu, 0x8A77u, 0x717Cu, 0x0C94u, 0xF79Fu, 0x7135u, 0x8A3Eu, + 0x8AE5u, 0x71EEu, 0xF744u, 0x0C4Fu, 0x71A7u, 0x8AACu, 0x0C06u, 0xF70Du, + 0x0DB0u, 0xF6BBu, 0x7011u, 0x8B1Au, 0xF6F2u, 0x0DF9u, 0x8B53u, 0x7058u, + 0x7083u, 0x8B88u, 0x0D22u, 0xF629u, 0x8BC1u, 0x70CAu, 0xF660u, 0x0D6Bu + }, + { + 0x0000u, 0xECB6u, 0x52DBu, 0xBE6Du, 0xA5B6u, 0x4900u, 0xF76Du, 0x1BDBu, + 0xC0DBu, 0x2C6Du, 0x9200u, 0x7EB6u, 0x656Du, 0x89DBu, 0x37B6u, 0xDB00u, + 0x0A01u, 0xE6B7u, 0x58DAu, 0xB46Cu, 0xAFB7u, 0x4301u, 0xFD6Cu, 0x11DAu, + 0xCADAu, 0x266Cu, 0x9801u, 0x74B7u, 0x6F6Cu, 0x83DAu, 0x3DB7u, 0xD101u, + 0x1402u, 0xF8B4u, 0x46D9u, 0xAA6Fu, 0xB1B4u, 0x5D02u, 0xE36Fu, 0x0FD9u, + 0xD4D9u, 0x386Fu, 0x8602u, 0x6AB4u, 0x716Fu, 0x9DD9u, 0x23B4u, 0xCF02u, + 0x1E03u, 0xF2B5u, 0x4CD8u, 0xA06Eu, 0xBBB5u, 0x5703u, 0xE96Eu, 0x05D8u, + 0xDED8u, 0x326Eu, 0x8C03u, 0x60B5u, 0x7B6Eu, 0x97D8u, 0x29B5u, 0xC503u, + 0x2804u, 0xC4B2u, 0x7ADFu, 0x9669u, 0x8DB2u, 0x6104u, 0xDF69u, 0x33DFu, + 0xE8DFu, 0x0469u, 0xBA04u, 0x56B2u, 0x4D69u, 0xA1DFu, 0x1FB2u, 0xF304u, + 0x2205u, 0xCEB3u, 0x70DEu, 0x9C68u, 0x87B3u, 0x6B05u, 0xD568u, 0x39DEu, + 0xE2DEu, 0x0E68u, 0xB005u, 0x5CB3u, 0x4768u, 0xABDEu, 0x15B3u, 0xF905u, + 0x3C06u, 0xD0B0u, 0x6EDDu, 0x826Bu, 0x99B0u, 0x7506u, 0xCB6Bu, 0x27DDu, + 0xFCDDu, 0x106Bu, 0xAE06u, 0x42B0u, 0x596Bu, 0xB5DDu, 0x0BB0u, 0xE706u, + 0x3607u, 0xDAB1u, 0x64DCu, 0x886Au, 0x93B1u, 0x7F07u, 0xC16Au, 0x2DDCu, + 0xF6DCu, 0x1A6Au, 0xA407u, 0x48B1u, 0x536Au, 0xBFDCu, 0x01B1u, 0xED07u, + 0x5008u, 0xBCBEu, 0x02D3u, 0xEE65u, 0xF5BEu, 0x1908u, 0xA765u, 0x4BD3u, + 0x90D3u, 0x7C65u, 0xC208u, 0x2EBEu, 0x3565u, 0xD9D3u, 0x67BEu, 0x8B08u, + 0x5A09u, 0xB6BFu, 0x08D2u, 0xE464u, 0xFFBFu, 0x1309u, 0xAD64u, 0x41D2u, + 0x9AD2u, 0x7664u, 0xC809u, 0x24BFu, 0x3F64u, 0xD3D2u, 0x6DBFu, 0x8109u, + 0x440Au, 0xA8BCu, 0x16D1u, 0xFA67u, 0xE1BCu, 0x0D0Au, 0xB367u, 0x5FD1u, + 0x84D1u, 0x6867u, 0xD60Au, 0x3ABCu, 0x2167u, 0xCDD1u, 0x73BCu, 0x9F0Au, + 0x4E0Bu, 0xA2BDu, 0x1CD0u, 0xF066u, 0xEBBDu, 0x070Bu, 0xB966u, 0x55D0u, + 0x8ED0u, 0x6266u, 0xDC0Bu, 0x30BDu, 0x2B66u, 0xC7D0u, 0x79BDu, 0x950Bu, + 0x780Cu, 0x94BAu, 0x2AD7u, 0xC661u, 0xDDBAu, 0x310Cu, 0x8F61u, 0x63D7u, + 0xB8D7u, 0x5461u, 0xEA0Cu, 0x06BAu, 0x1D61u, 0xF1D7u, 0x4FBAu, 0xA30Cu, + 0x720Du, 0x9EBBu, 0x20D6u, 0xCC60u, 0xD7BBu, 0x3B0Du, 0x8560u, 0x69D6u, + 0xB2D6u, 0x5E60u, 0xE00Du, 0x0CBBu, 0x1760u, 0xFBD6u, 0x45BBu, 0xA90Du, + 0x6C0Eu, 0x80B8u, 0x3ED5u, 0xD263u, 0xC9B8u, 0x250Eu, 0x9B63u, 0x77D5u, + 0xACD5u, 0x4063u, 0xFE0Eu, 0x12B8u, 0x0963u, 0xE5D5u, 0x5BB8u, 0xB70Eu, + 0x660Fu, 0x8AB9u, 0x34D4u, 0xD862u, 0xC3B9u, 0x2F0Fu, 0x9162u, 0x7DD4u, + 0xA6D4u, 0x4A62u, 0xF40Fu, 0x18B9u, 0x0362u, 0xEFD4u, 0x51B9u, 0xBD0Fu + }, + { + 0x0000u, 0xA010u, 0xCB97u, 0x6B87u, 0x1C99u, 0xBC89u, 0xD70Eu, 0x771Eu, + 0x3932u, 0x9922u, 0xF2A5u, 0x52B5u, 0x25ABu, 0x85BBu, 0xEE3Cu, 0x4E2Cu, + 0x7264u, 0xD274u, 0xB9F3u, 0x19E3u, 0x6EFDu, 0xCEEDu, 0xA56Au, 0x057Au, + 0x4B56u, 0xEB46u, 0x80C1u, 0x20D1u, 0x57CFu, 0xF7DFu, 0x9C58u, 0x3C48u, + 0xE4C8u, 0x44D8u, 0x2F5Fu, 0x8F4Fu, 0xF851u, 0x5841u, 0x33C6u, 0x93D6u, + 0xDDFAu, 0x7DEAu, 0x166Du, 0xB67Du, 0xC163u, 0x6173u, 0x0AF4u, 0xAAE4u, + 0x96ACu, 0x36BCu, 0x5D3Bu, 0xFD2Bu, 0x8A35u, 0x2A25u, 0x41A2u, 0xE1B2u, + 0xAF9Eu, 0x0F8Eu, 0x6409u, 0xC419u, 0xB307u, 0x1317u, 0x7890u, 0xD880u, + 0x4227u, 0xE237u, 0x89B0u, 0x29A0u, 0x5EBEu, 0xFEAEu, 0x9529u, 0x3539u, + 0x7B15u, 0xDB05u, 0xB082u, 0x1092u, 0x678Cu, 0xC79Cu, 0xAC1Bu, 0x0C0Bu, + 0x3043u, 0x9053u, 
0xFBD4u, 0x5BC4u, 0x2CDAu, 0x8CCAu, 0xE74Du, 0x475Du, + 0x0971u, 0xA961u, 0xC2E6u, 0x62F6u, 0x15E8u, 0xB5F8u, 0xDE7Fu, 0x7E6Fu, + 0xA6EFu, 0x06FFu, 0x6D78u, 0xCD68u, 0xBA76u, 0x1A66u, 0x71E1u, 0xD1F1u, + 0x9FDDu, 0x3FCDu, 0x544Au, 0xF45Au, 0x8344u, 0x2354u, 0x48D3u, 0xE8C3u, + 0xD48Bu, 0x749Bu, 0x1F1Cu, 0xBF0Cu, 0xC812u, 0x6802u, 0x0385u, 0xA395u, + 0xEDB9u, 0x4DA9u, 0x262Eu, 0x863Eu, 0xF120u, 0x5130u, 0x3AB7u, 0x9AA7u, + 0x844Eu, 0x245Eu, 0x4FD9u, 0xEFC9u, 0x98D7u, 0x38C7u, 0x5340u, 0xF350u, + 0xBD7Cu, 0x1D6Cu, 0x76EBu, 0xD6FBu, 0xA1E5u, 0x01F5u, 0x6A72u, 0xCA62u, + 0xF62Au, 0x563Au, 0x3DBDu, 0x9DADu, 0xEAB3u, 0x4AA3u, 0x2124u, 0x8134u, + 0xCF18u, 0x6F08u, 0x048Fu, 0xA49Fu, 0xD381u, 0x7391u, 0x1816u, 0xB806u, + 0x6086u, 0xC096u, 0xAB11u, 0x0B01u, 0x7C1Fu, 0xDC0Fu, 0xB788u, 0x1798u, + 0x59B4u, 0xF9A4u, 0x9223u, 0x3233u, 0x452Du, 0xE53Du, 0x8EBAu, 0x2EAAu, + 0x12E2u, 0xB2F2u, 0xD975u, 0x7965u, 0x0E7Bu, 0xAE6Bu, 0xC5ECu, 0x65FCu, + 0x2BD0u, 0x8BC0u, 0xE047u, 0x4057u, 0x3749u, 0x9759u, 0xFCDEu, 0x5CCEu, + 0xC669u, 0x6679u, 0x0DFEu, 0xADEEu, 0xDAF0u, 0x7AE0u, 0x1167u, 0xB177u, + 0xFF5Bu, 0x5F4Bu, 0x34CCu, 0x94DCu, 0xE3C2u, 0x43D2u, 0x2855u, 0x8845u, + 0xB40Du, 0x141Du, 0x7F9Au, 0xDF8Au, 0xA894u, 0x0884u, 0x6303u, 0xC313u, + 0x8D3Fu, 0x2D2Fu, 0x46A8u, 0xE6B8u, 0x91A6u, 0x31B6u, 0x5A31u, 0xFA21u, + 0x22A1u, 0x82B1u, 0xE936u, 0x4926u, 0x3E38u, 0x9E28u, 0xF5AFu, 0x55BFu, + 0x1B93u, 0xBB83u, 0xD004u, 0x7014u, 0x070Au, 0xA71Au, 0xCC9Du, 0x6C8Du, + 0x50C5u, 0xF0D5u, 0x9B52u, 0x3B42u, 0x4C5Cu, 0xEC4Cu, 0x87CBu, 0x27DBu, + 0x69F7u, 0xC9E7u, 0xA260u, 0x0270u, 0x756Eu, 0xD57Eu, 0xBEF9u, 0x1EE9u + }, + { + 0x0000u, 0x832Bu, 0x8DE1u, 0x0ECAu, 0x9075u, 0x135Eu, 0x1D94u, 0x9EBFu, + 0xAB5Du, 0x2876u, 0x26BCu, 0xA597u, 0x3B28u, 0xB803u, 0xB6C9u, 0x35E2u, + 0xDD0Du, 0x5E26u, 0x50ECu, 0xD3C7u, 0x4D78u, 0xCE53u, 0xC099u, 0x43B2u, + 0x7650u, 0xF57Bu, 0xFBB1u, 0x789Au, 0xE625u, 0x650Eu, 0x6BC4u, 0xE8EFu, + 0x31ADu, 0xB286u, 0xBC4Cu, 0x3F67u, 0xA1D8u, 0x22F3u, 0x2C39u, 0xAF12u, + 0x9AF0u, 0x19DBu, 0x1711u, 0x943Au, 0x0A85u, 0x89AEu, 0x8764u, 0x044Fu, + 0xECA0u, 0x6F8Bu, 0x6141u, 0xE26Au, 0x7CD5u, 0xFFFEu, 0xF134u, 0x721Fu, + 0x47FDu, 0xC4D6u, 0xCA1Cu, 0x4937u, 0xD788u, 0x54A3u, 0x5A69u, 0xD942u, + 0x635Au, 0xE071u, 0xEEBBu, 0x6D90u, 0xF32Fu, 0x7004u, 0x7ECEu, 0xFDE5u, + 0xC807u, 0x4B2Cu, 0x45E6u, 0xC6CDu, 0x5872u, 0xDB59u, 0xD593u, 0x56B8u, + 0xBE57u, 0x3D7Cu, 0x33B6u, 0xB09Du, 0x2E22u, 0xAD09u, 0xA3C3u, 0x20E8u, + 0x150Au, 0x9621u, 0x98EBu, 0x1BC0u, 0x857Fu, 0x0654u, 0x089Eu, 0x8BB5u, + 0x52F7u, 0xD1DCu, 0xDF16u, 0x5C3Du, 0xC282u, 0x41A9u, 0x4F63u, 0xCC48u, + 0xF9AAu, 0x7A81u, 0x744Bu, 0xF760u, 0x69DFu, 0xEAF4u, 0xE43Eu, 0x6715u, + 0x8FFAu, 0x0CD1u, 0x021Bu, 0x8130u, 0x1F8Fu, 0x9CA4u, 0x926Eu, 0x1145u, + 0x24A7u, 0xA78Cu, 0xA946u, 0x2A6Du, 0xB4D2u, 0x37F9u, 0x3933u, 0xBA18u, + 0xC6B4u, 0x459Fu, 0x4B55u, 0xC87Eu, 0x56C1u, 0xD5EAu, 0xDB20u, 0x580Bu, + 0x6DE9u, 0xEEC2u, 0xE008u, 0x6323u, 0xFD9Cu, 0x7EB7u, 0x707Du, 0xF356u, + 0x1BB9u, 0x9892u, 0x9658u, 0x1573u, 0x8BCCu, 0x08E7u, 0x062Du, 0x8506u, + 0xB0E4u, 0x33CFu, 0x3D05u, 0xBE2Eu, 0x2091u, 0xA3BAu, 0xAD70u, 0x2E5Bu, + 0xF719u, 0x7432u, 0x7AF8u, 0xF9D3u, 0x676Cu, 0xE447u, 0xEA8Du, 0x69A6u, + 0x5C44u, 0xDF6Fu, 0xD1A5u, 0x528Eu, 0xCC31u, 0x4F1Au, 0x41D0u, 0xC2FBu, + 0x2A14u, 0xA93Fu, 0xA7F5u, 0x24DEu, 0xBA61u, 0x394Au, 0x3780u, 0xB4ABu, + 0x8149u, 0x0262u, 0x0CA8u, 0x8F83u, 0x113Cu, 0x9217u, 0x9CDDu, 0x1FF6u, + 0xA5EEu, 0x26C5u, 0x280Fu, 0xAB24u, 0x359Bu, 0xB6B0u, 0xB87Au, 0x3B51u, + 0x0EB3u, 0x8D98u, 0x8352u, 0x0079u, 0x9EC6u, 0x1DEDu, 0x1327u, 0x900Cu, + 0x78E3u, 
0xFBC8u, 0xF502u, 0x7629u, 0xE896u, 0x6BBDu, 0x6577u, 0xE65Cu, + 0xD3BEu, 0x5095u, 0x5E5Fu, 0xDD74u, 0x43CBu, 0xC0E0u, 0xCE2Au, 0x4D01u, + 0x9443u, 0x1768u, 0x19A2u, 0x9A89u, 0x0436u, 0x871Du, 0x89D7u, 0x0AFCu, + 0x3F1Eu, 0xBC35u, 0xB2FFu, 0x31D4u, 0xAF6Bu, 0x2C40u, 0x228Au, 0xA1A1u, + 0x494Eu, 0xCA65u, 0xC4AFu, 0x4784u, 0xD93Bu, 0x5A10u, 0x54DAu, 0xD7F1u, + 0xE213u, 0x6138u, 0x6FF2u, 0xECD9u, 0x7266u, 0xF14Du, 0xFF87u, 0x7CACu + } +}; + +static inline uint16_t +crc_update_fast(uint16_t crc, const void *data, size_t data_len) +{ + const unsigned char *d = (const unsigned char *)data; + const unsigned char *d_end = d + data_len; + const unsigned char *d_last16 = d + (data_len & ~0x0F); + + for (; d < d_last16 ; d += 16) { + crc = crc_table_fast[15][d[0] ^ (uint8_t)(crc >> 8)] ^ + crc_table_fast[14][d[1] ^ (uint8_t)(crc >> 0)] ^ + crc_table_fast[13][d[2]] ^ + crc_table_fast[12][d[3]] ^ + crc_table_fast[11][d[4]] ^ + crc_table_fast[10][d[5]] ^ + crc_table_fast[9][d[6]] ^ + crc_table_fast[8][d[7]] ^ + crc_table_fast[7][d[8]] ^ + crc_table_fast[6][d[9]] ^ + crc_table_fast[5][d[10]] ^ + crc_table_fast[4][d[11]] ^ + crc_table_fast[3][d[12]] ^ + crc_table_fast[2][d[13]] ^ + crc_table_fast[1][d[14]] ^ + crc_table_fast[0][d[15]]; + } + for (; d < d_end ; d++) { + crc = (crc << 8) ^ crc_table_fast[0][((uint8_t)(crc >> 8) ^ *d)]; + } + return crc & 0xffff; +} + +static inline uint16_t +crc16_table_t10dif(uint16_t init_crc, const void *buf, size_t len) +{ + uint16_t crc; + const uint8_t *data = (const uint8_t *)buf; + + crc = init_crc; + crc = crc_update_fast(crc, data, len); + return crc; +} + +uint16_t +spdk_crc16_t10dif(uint16_t init_crc, const void *buf, size_t len) +{ + return (crc16_table_t10dif(init_crc, buf, len)); +} + +uint16_t +spdk_crc16_t10dif_copy(uint16_t init_crc, uint8_t *dst, uint8_t *src, size_t len) +{ + memcpy(dst, src, len); + return (crc16_table_t10dif(init_crc, src, len)); +} + +#endif diff --git a/src/spdk/lib/util/crc32.c b/src/spdk/lib/util/crc32.c new file mode 100644 index 000000000..34bb60b78 --- /dev/null +++ b/src/spdk/lib/util/crc32.c @@ -0,0 +1,95 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "util_internal.h" +#include "spdk/crc32.h" + +void +crc32_table_init(struct spdk_crc32_table *table, uint32_t polynomial_reflect) +{ + int i, j; + uint32_t val; + + for (i = 0; i < 256; i++) { + val = i; + for (j = 0; j < 8; j++) { + if (val & 1) { + val = (val >> 1) ^ polynomial_reflect; + } else { + val = (val >> 1); + } + } + table->table[i] = val; + } +} + +#ifdef SPDK_HAVE_ARM_CRC + +uint32_t +crc32_update(const struct spdk_crc32_table *table, const void *buf, size_t len, uint32_t crc) +{ + size_t count; + const uint64_t *dword_buf; + + count = len & 7; + while (count--) { + crc = __crc32b(crc, *(const uint8_t *)buf); + buf++; + } + dword_buf = (const uint64_t *)buf; + + count = len / 8; + while (count--) { + crc = __crc32d(crc, *dword_buf); + dword_buf++; + } + + return crc; +} + +#else + +uint32_t +crc32_update(const struct spdk_crc32_table *table, const void *buf, size_t len, uint32_t crc) +{ + const uint8_t *buf_u8 = buf; + size_t i; + + for (i = 0; i < len; i++) { + crc = (crc >> 8) ^ table->table[(crc ^ buf_u8[i]) & 0xff]; + } + + return crc; +} + +#endif diff --git a/src/spdk/lib/util/crc32_ieee.c b/src/spdk/lib/util/crc32_ieee.c new file mode 100644 index 000000000..ddc3c9901 --- /dev/null +++ b/src/spdk/lib/util/crc32_ieee.c @@ -0,0 +1,49 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "util_internal.h" +#include "spdk/crc32.h" + +static struct spdk_crc32_table g_crc32_ieee_table; + +__attribute__((constructor)) static void +crc32_ieee_init(void) +{ + crc32_table_init(&g_crc32_ieee_table, SPDK_CRC32_POLYNOMIAL_REFLECT); +} + +uint32_t +spdk_crc32_ieee_update(const void *buf, size_t len, uint32_t crc) +{ + return crc32_update(&g_crc32_ieee_table, buf, len, crc); +} diff --git a/src/spdk/lib/util/crc32c.c b/src/spdk/lib/util/crc32c.c new file mode 100644 index 000000000..9acd8d80f --- /dev/null +++ b/src/spdk/lib/util/crc32c.c @@ -0,0 +1,133 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "util_internal.h" +#include "spdk/crc32.h" + +#ifdef SPDK_CONFIG_ISAL +#define SPDK_HAVE_ISAL +#include <isa-l/include/crc.h> +#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) +#define SPDK_HAVE_ARM_CRC +#include <arm_acle.h> +#elif defined(__x86_64__) && defined(__SSE4_2__) +#define SPDK_HAVE_SSE4_2 +#include <x86intrin.h> +#endif + +#ifdef SPDK_HAVE_ISAL + +uint32_t +spdk_crc32c_update(const void *buf, size_t len, uint32_t crc) +{ + return crc32_iscsi((unsigned char *)buf, len, crc); +} + +#elif defined(SPDK_HAVE_SSE4_2) + +uint32_t +spdk_crc32c_update(const void *buf, size_t len, uint32_t crc) +{ + uint64_t crc_tmp64; + size_t count; + + /* _mm_crc32_u64() needs a 64-bit intermediate value */ + crc_tmp64 = crc; + + /* Process as much of the buffer as possible in 64-bit blocks. */ + count = len / 8; + while (count--) { + uint64_t block; + + /* + * Use memcpy() to avoid unaligned loads, which are undefined behavior in C. + * The compiler will optimize out the memcpy() in release builds. + */ + memcpy(&block, buf, sizeof(block)); + crc_tmp64 = _mm_crc32_u64(crc_tmp64, block); + buf += sizeof(block); + } + crc = (uint32_t)crc_tmp64; + + /* Handle any trailing bytes. 
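+ * At most seven bytes remain after the 64-bit passes; each is folded in
+ * with _mm_crc32_u8().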
*/ + count = len & 7; + while (count--) { + crc = _mm_crc32_u8(crc, *(const uint8_t *)buf); + buf++; + } + + return crc; +} + +#elif defined(SPDK_HAVE_ARM_CRC) + +uint32_t +spdk_crc32c_update(const void *buf, size_t len, uint32_t crc) +{ + size_t count; + + count = len / 8; + while (count--) { + uint64_t block; + + memcpy(&block, buf, sizeof(block)); + crc = __crc32cd(crc, block); + buf += sizeof(block); + } + + count = len & 7; + while (count--) { + crc = __crc32cb(crc, *(const uint8_t *)buf); + buf++; + } + + return crc; +} + +#else /* Neither SSE 4.2 nor ARM CRC32 instructions available */ + +static struct spdk_crc32_table g_crc32c_table; + +__attribute__((constructor)) static void +crc32c_init(void) +{ + crc32_table_init(&g_crc32c_table, SPDK_CRC32C_POLYNOMIAL_REFLECT); +} + +uint32_t +spdk_crc32c_update(const void *buf, size_t len, uint32_t crc) +{ + return crc32_update(&g_crc32c_table, buf, len, crc); +} + +#endif diff --git a/src/spdk/lib/util/dif.c b/src/spdk/lib/util/dif.c new file mode 100644 index 000000000..64bce1487 --- /dev/null +++ b/src/spdk/lib/util/dif.c @@ -0,0 +1,1999 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/dif.h" +#include "spdk/crc16.h" +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/log.h" +#include "spdk/util.h" + +/* Context to iterate or create a iovec array. + * Each sgl is either iterated or created at a time. + */ +struct _dif_sgl { + /* Current iovec in the iteration or creation */ + struct iovec *iov; + + /* Remaining count of iovecs in the iteration or creation. 
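+ * Decremented by _dif_sgl_advance() as iovecs are consumed and by
+ * _dif_sgl_append() as entries are filled in.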
*/ + int iovcnt; + + /* Current offset in the iovec */ + uint32_t iov_offset; + + /* Size of the created iovec array in bytes */ + uint32_t total_size; +}; + +static inline void +_dif_sgl_init(struct _dif_sgl *s, struct iovec *iovs, int iovcnt) +{ + s->iov = iovs; + s->iovcnt = iovcnt; + s->iov_offset = 0; + s->total_size = 0; +} + +static void +_dif_sgl_advance(struct _dif_sgl *s, uint32_t step) +{ + s->iov_offset += step; + while (s->iovcnt != 0) { + if (s->iov_offset < s->iov->iov_len) { + break; + } + + s->iov_offset -= s->iov->iov_len; + s->iov++; + s->iovcnt--; + } +} + +static inline void +_dif_sgl_get_buf(struct _dif_sgl *s, void **_buf, uint32_t *_buf_len) +{ + if (_buf != NULL) { + *_buf = s->iov->iov_base + s->iov_offset; + } + if (_buf_len != NULL) { + *_buf_len = s->iov->iov_len - s->iov_offset; + } +} + +static inline bool +_dif_sgl_append(struct _dif_sgl *s, uint8_t *data, uint32_t data_len) +{ + assert(s->iovcnt > 0); + s->iov->iov_base = data; + s->iov->iov_len = data_len; + s->total_size += data_len; + s->iov++; + s->iovcnt--; + + if (s->iovcnt > 0) { + return true; + } else { + return false; + } +} + +static inline bool +_dif_sgl_append_split(struct _dif_sgl *dst, struct _dif_sgl *src, uint32_t data_len) +{ + uint8_t *buf; + uint32_t buf_len; + + while (data_len != 0) { + _dif_sgl_get_buf(src, (void *)&buf, &buf_len); + buf_len = spdk_min(buf_len, data_len); + + if (!_dif_sgl_append(dst, buf, buf_len)) { + return false; + } + + _dif_sgl_advance(src, buf_len); + data_len -= buf_len; + } + + return true; +} + +/* This function must be used before starting iteration. */ +static bool +_dif_sgl_is_bytes_multiple(struct _dif_sgl *s, uint32_t bytes) +{ + int i; + + for (i = 0; i < s->iovcnt; i++) { + if (s->iov[i].iov_len % bytes) { + return false; + } + } + + return true; +} + +/* This function must be used before starting iteration. */ +static bool +_dif_sgl_is_valid(struct _dif_sgl *s, uint32_t bytes) +{ + uint64_t total = 0; + int i; + + for (i = 0; i < s->iovcnt; i++) { + total += s->iov[i].iov_len; + } + + return total >= bytes; +} + +static void +_dif_sgl_copy(struct _dif_sgl *to, struct _dif_sgl *from) +{ + memcpy(to, from, sizeof(struct _dif_sgl)); +} + +static bool +_dif_type_is_valid(enum spdk_dif_type dif_type, uint32_t dif_flags) +{ + switch (dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + case SPDK_DIF_DISABLE: + break; + case SPDK_DIF_TYPE3: + if (dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) { + SPDK_ERRLOG("Reference Tag should not be checked for Type 3\n"); + return false; + } + break; + default: + SPDK_ERRLOG("Unknown DIF Type: %d\n", dif_type); + return false; + } + + return true; +} + +static bool +_dif_is_disabled(enum spdk_dif_type dif_type) +{ + if (dif_type == SPDK_DIF_DISABLE) { + return true; + } else { + return false; + } +} + + +static uint32_t +_get_guard_interval(uint32_t block_size, uint32_t md_size, bool dif_loc, bool md_interleave) +{ + if (!dif_loc) { + /* For metadata formats with more than 8 bytes, if the DIF is + * contained in the last 8 bytes of metadata, then the CRC + * covers all metadata up to but excluding these last 8 bytes. + */ + if (md_interleave) { + return block_size - sizeof(struct spdk_dif); + } else { + return md_size - sizeof(struct spdk_dif); + } + } else { + /* For metadata formats with more than 8 bytes, if the DIF is + * contained in the first 8 bytes of metadata, then the CRC + * does not cover any metadata. 
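+ * For example, with an interleaved 4096 + 8 format and the DIF placed in
+ * the first 8 bytes of metadata, the guard interval is 4096 bytes and the
+ * guard covers only the data portion of the block.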
+ */ + if (md_interleave) { + return block_size - md_size; + } else { + return 0; + } + } +} + +int +spdk_dif_ctx_init(struct spdk_dif_ctx *ctx, uint32_t block_size, uint32_t md_size, + bool md_interleave, bool dif_loc, enum spdk_dif_type dif_type, uint32_t dif_flags, + uint32_t init_ref_tag, uint16_t apptag_mask, uint16_t app_tag, + uint32_t data_offset, uint16_t guard_seed) +{ + uint32_t data_block_size; + + if (md_size < sizeof(struct spdk_dif)) { + SPDK_ERRLOG("Metadata size is smaller than DIF size.\n"); + return -EINVAL; + } + + if (md_interleave) { + if (block_size < md_size) { + SPDK_ERRLOG("Block size is smaller than DIF size.\n"); + return -EINVAL; + } + data_block_size = block_size - md_size; + } else { + if (block_size == 0 || (block_size % 512) != 0) { + SPDK_ERRLOG("Zero block size is not allowed\n"); + return -EINVAL; + } + data_block_size = block_size; + } + + if (!_dif_type_is_valid(dif_type, dif_flags)) { + SPDK_ERRLOG("DIF type is invalid.\n"); + return -EINVAL; + } + + ctx->block_size = block_size; + ctx->md_size = md_size; + ctx->md_interleave = md_interleave; + ctx->guard_interval = _get_guard_interval(block_size, md_size, dif_loc, md_interleave); + ctx->dif_type = dif_type; + ctx->dif_flags = dif_flags; + ctx->init_ref_tag = init_ref_tag; + ctx->apptag_mask = apptag_mask; + ctx->app_tag = app_tag; + ctx->data_offset = data_offset; + ctx->ref_tag_offset = data_offset / data_block_size; + ctx->last_guard = guard_seed; + ctx->guard_seed = guard_seed; + ctx->remapped_init_ref_tag = 0; + + return 0; +} + +void +spdk_dif_ctx_set_data_offset(struct spdk_dif_ctx *ctx, uint32_t data_offset) +{ + uint32_t data_block_size; + + if (ctx->md_interleave) { + data_block_size = ctx->block_size - ctx->md_size; + } else { + data_block_size = ctx->block_size; + } + + ctx->data_offset = data_offset; + ctx->ref_tag_offset = data_offset / data_block_size; +} + +void +spdk_dif_ctx_set_remapped_init_ref_tag(struct spdk_dif_ctx *ctx, + uint32_t remapped_init_ref_tag) +{ + ctx->remapped_init_ref_tag = remapped_init_ref_tag; +} + +static void +_dif_generate(void *_dif, uint16_t guard, uint32_t offset_blocks, + const struct spdk_dif_ctx *ctx) +{ + struct spdk_dif *dif = _dif; + uint32_t ref_tag; + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + to_be16(&dif->guard, guard); + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) { + to_be16(&dif->app_tag, ctx->app_tag); + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) { + /* For type 1 and 2, the reference tag is incremented for each + * subsequent logical block. For type 3, the reference tag + * remains the same as the initial reference tag. 
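+ * ref_tag_offset is derived from the data offset in spdk_dif_ctx_init(),
+ * presumably so that reference tags stay consistent when one request is
+ * generated in multiple pieces.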
+ */ + if (ctx->dif_type != SPDK_DIF_TYPE3) { + ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks; + } else { + ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset; + } + + to_be32(&dif->ref_tag, ref_tag); + } +} + +static void +dif_generate(struct _dif_sgl *sgl, uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks = 0; + void *buf; + uint16_t guard = 0; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(sgl, &buf, NULL); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(ctx->guard_seed, buf, ctx->guard_interval); + } + + _dif_generate(buf + ctx->guard_interval, guard, offset_blocks, ctx); + + _dif_sgl_advance(sgl, ctx->block_size); + offset_blocks++; + } +} + +static uint16_t +_dif_generate_split(struct _dif_sgl *sgl, uint32_t offset_in_block, uint32_t data_len, + uint16_t guard, uint32_t offset_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_in_dif, buf_len; + void *buf; + struct spdk_dif dif = {}; + + assert(offset_in_block < ctx->guard_interval); + assert(offset_in_block + data_len < ctx->guard_interval || + offset_in_block + data_len == ctx->block_size); + + /* Compute CRC over split logical block data. */ + while (data_len != 0 && offset_in_block < ctx->guard_interval) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, data_len); + buf_len = spdk_min(buf_len, ctx->guard_interval - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, buf, buf_len); + } + + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + data_len -= buf_len; + } + + if (offset_in_block < ctx->guard_interval) { + return guard; + } + + /* If a whole logical block data is parsed, generate DIF + * and save it to the temporary DIF area. + */ + _dif_generate(&dif, guard, offset_blocks, ctx); + + /* Copy generated DIF field to the split DIF field, and then + * skip metadata field after DIF field (if any). 
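+ * Trailing metadata bytes beyond the 8-byte DIF field are not modified
+ * here; the SGL cursor is simply advanced past them.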
+ */ + while (offset_in_block < ctx->block_size) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + + if (offset_in_block < ctx->guard_interval + sizeof(struct spdk_dif)) { + offset_in_dif = offset_in_block - ctx->guard_interval; + buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset_in_dif); + + memcpy(buf, ((uint8_t *)&dif) + offset_in_dif, buf_len); + } else { + buf_len = spdk_min(buf_len, ctx->block_size - offset_in_block); + } + + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + + return guard; +} + +static void +dif_generate_split(struct _dif_sgl *sgl, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks; + uint16_t guard = 0; + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + _dif_generate_split(sgl, 0, ctx->block_size, guard, offset_blocks, ctx); + } +} + +int +spdk_dif_generate(struct iovec *iovs, int iovcnt, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx) +{ + struct _dif_sgl sgl; + + _dif_sgl_init(&sgl, iovs, iovcnt); + + if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&sgl, ctx->block_size)) { + dif_generate(&sgl, num_blocks, ctx); + } else { + dif_generate_split(&sgl, num_blocks, ctx); + } + + return 0; +} + +static void +_dif_error_set(struct spdk_dif_error *err_blk, uint8_t err_type, + uint32_t expected, uint32_t actual, uint32_t err_offset) +{ + if (err_blk) { + err_blk->err_type = err_type; + err_blk->expected = expected; + err_blk->actual = actual; + err_blk->err_offset = err_offset; + } +} + +static int +_dif_verify(void *_dif, uint16_t guard, uint32_t offset_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + struct spdk_dif *dif = _dif; + uint16_t _guard; + uint16_t _app_tag; + uint32_t ref_tag, _ref_tag; + + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* If Type 1 or 2 is used, then all DIF checks are disabled when + * the Application Tag is 0xFFFF. + */ + if (dif->app_tag == 0xFFFF) { + return 0; + } + break; + case SPDK_DIF_TYPE3: + /* If Type 3 is used, then all DIF checks are disabled when the + * Application Tag is 0xFFFF and the Reference Tag is 0xFFFFFFFF. + */ + if (dif->app_tag == 0xFFFF && dif->ref_tag == 0xFFFFFFFF) { + return 0; + } + break; + default: + break; + } + + /* For type 1 and 2, the reference tag is incremented for each + * subsequent logical block. For type 3, the reference tag + * remains the same as the initial reference tag. + */ + if (ctx->dif_type != SPDK_DIF_TYPE3) { + ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks; + } else { + ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + /* Compare the DIF Guard field to the CRC computed over the logical + * block data. 
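+ * A mismatch is reported through err_blk as SPDK_DIF_GUARD_ERROR together
+ * with the expected and actual 16-bit guard values.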
+ */ + _guard = from_be16(&dif->guard); + if (_guard != guard) { + _dif_error_set(err_blk, SPDK_DIF_GUARD_ERROR, _guard, guard, + offset_blocks); + SPDK_ERRLOG("Failed to compare Guard: LBA=%" PRIu32 "," \ + " Expected=%x, Actual=%x\n", + ref_tag, _guard, guard); + return -1; + } + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) { + /* Compare unmasked bits in the DIF Application Tag field to the + * passed Application Tag. + */ + _app_tag = from_be16(&dif->app_tag); + if ((_app_tag & ctx->apptag_mask) != ctx->app_tag) { + _dif_error_set(err_blk, SPDK_DIF_APPTAG_ERROR, ctx->app_tag, + (_app_tag & ctx->apptag_mask), offset_blocks); + SPDK_ERRLOG("Failed to compare App Tag: LBA=%" PRIu32 "," \ + " Expected=%x, Actual=%x\n", + ref_tag, ctx->app_tag, (_app_tag & ctx->apptag_mask)); + return -1; + } + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) { + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* Compare the DIF Reference Tag field to the passed Reference Tag. + * The passed Reference Tag will be the least significant 4 bytes + * of the LBA when Type 1 is used, and application specific value + * if Type 2 is used, + */ + _ref_tag = from_be32(&dif->ref_tag); + if (_ref_tag != ref_tag) { + _dif_error_set(err_blk, SPDK_DIF_REFTAG_ERROR, ref_tag, + _ref_tag, offset_blocks); + SPDK_ERRLOG("Failed to compare Ref Tag: LBA=%" PRIu32 "," \ + " Expected=%x, Actual=%x\n", + ref_tag, ref_tag, _ref_tag); + return -1; + } + break; + case SPDK_DIF_TYPE3: + /* For Type 3, computed Reference Tag remains unchanged. + * Hence ignore the Reference Tag field. + */ + break; + default: + break; + } + } + + return 0; +} + +static int +dif_verify(struct _dif_sgl *sgl, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks = 0; + int rc; + void *buf; + uint16_t guard = 0; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(sgl, &buf, NULL); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(ctx->guard_seed, buf, ctx->guard_interval); + } + + rc = _dif_verify(buf + ctx->guard_interval, guard, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + + _dif_sgl_advance(sgl, ctx->block_size); + offset_blocks++; + } + + return 0; +} + +static int +_dif_verify_split(struct _dif_sgl *sgl, uint32_t offset_in_block, uint32_t data_len, + uint16_t *_guard, uint32_t offset_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + uint32_t offset_in_dif, buf_len; + void *buf; + uint16_t guard; + struct spdk_dif dif = {}; + int rc; + + assert(_guard != NULL); + assert(offset_in_block < ctx->guard_interval); + assert(offset_in_block + data_len < ctx->guard_interval || + offset_in_block + data_len == ctx->block_size); + + guard = *_guard; + + /* Compute CRC over split logical block data. */ + while (data_len != 0 && offset_in_block < ctx->guard_interval) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, data_len); + buf_len = spdk_min(buf_len, ctx->guard_interval - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, buf, buf_len); + } + + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + data_len -= buf_len; + } + + if (offset_in_block < ctx->guard_interval) { + *_guard = guard; + return 0; + } + + /* Copy the split DIF field to the temporary DIF buffer, and then + * skip metadata field after DIF field (if any). 
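+ * Once reassembled, the temporary DIF struct is checked with _dif_verify()
+ * exactly as in the contiguous case.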
*/ + while (offset_in_block < ctx->block_size) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + + if (offset_in_block < ctx->guard_interval + sizeof(struct spdk_dif)) { + offset_in_dif = offset_in_block - ctx->guard_interval; + buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset_in_dif); + + memcpy((uint8_t *)&dif + offset_in_dif, buf, buf_len); + } else { + buf_len = spdk_min(buf_len, ctx->block_size - offset_in_block); + } + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + } + + rc = _dif_verify(&dif, guard, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + + *_guard = guard; + return 0; +} + +static int +dif_verify_split(struct _dif_sgl *sgl, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks; + uint16_t guard = 0; + int rc; + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + rc = _dif_verify_split(sgl, 0, ctx->block_size, &guard, offset_blocks, + ctx, err_blk); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +int +spdk_dif_verify(struct iovec *iovs, int iovcnt, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + struct _dif_sgl sgl; + + _dif_sgl_init(&sgl, iovs, iovcnt); + + if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&sgl, ctx->block_size)) { + return dif_verify(&sgl, num_blocks, ctx, err_blk); + } else { + return dif_verify_split(&sgl, num_blocks, ctx, err_blk); + } +} + +static uint32_t +dif_update_crc32c(struct _dif_sgl *sgl, uint32_t num_blocks, + uint32_t crc32c, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks; + void *buf; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + _dif_sgl_get_buf(sgl, &buf, NULL); + + crc32c = spdk_crc32c_update(buf, ctx->block_size - ctx->md_size, crc32c); + + _dif_sgl_advance(sgl, ctx->block_size); + } + + return crc32c; +} + +static uint32_t +_dif_update_crc32c_split(struct _dif_sgl *sgl, uint32_t offset_in_block, uint32_t data_len, + uint32_t crc32c, const struct spdk_dif_ctx *ctx) +{ + uint32_t data_block_size, buf_len; + void *buf; + + data_block_size = ctx->block_size - ctx->md_size; + + assert(offset_in_block + data_len <= ctx->block_size); + + while (data_len != 0) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, data_len); + + if (offset_in_block < data_block_size) { + buf_len = spdk_min(buf_len, data_block_size - offset_in_block); + crc32c = spdk_crc32c_update(buf, buf_len, crc32c); + } + + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + data_len -= buf_len; + } + + return crc32c; +} + +static uint32_t +dif_update_crc32c_split(struct _dif_sgl *sgl, uint32_t num_blocks, + uint32_t crc32c, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + crc32c = _dif_update_crc32c_split(sgl, 0, ctx->block_size, crc32c, ctx); + } + + return crc32c; +} + +int +spdk_dif_update_crc32c(struct iovec *iovs, int iovcnt, uint32_t num_blocks, + uint32_t *_crc32c, const struct spdk_dif_ctx *ctx) +{ + struct _dif_sgl sgl; + + if (_crc32c == NULL) { + return -EINVAL; + } + + 
_dif_sgl_init(&sgl, iovs, iovcnt); + + if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_sgl_is_bytes_multiple(&sgl, ctx->block_size)) { + *_crc32c = dif_update_crc32c(&sgl, num_blocks, *_crc32c, ctx); + } else { + *_crc32c = dif_update_crc32c_split(&sgl, num_blocks, *_crc32c, ctx); + } + + return 0; +} + +static void +dif_generate_copy(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks = 0, data_block_size; + void *src, *dst; + uint16_t guard; + + data_block_size = ctx->block_size - ctx->md_size; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(src_sgl, &src, NULL); + _dif_sgl_get_buf(dst_sgl, &dst, NULL); + + guard = 0; + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif_copy(ctx->guard_seed, dst, src, data_block_size); + guard = spdk_crc16_t10dif(guard, dst + data_block_size, + ctx->guard_interval - data_block_size); + } else { + memcpy(dst, src, data_block_size); + } + + _dif_generate(dst + ctx->guard_interval, guard, offset_blocks, ctx); + + _dif_sgl_advance(src_sgl, data_block_size); + _dif_sgl_advance(dst_sgl, ctx->block_size); + offset_blocks++; + } +} + +static void +_dif_generate_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t offset_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_in_block, src_len, data_block_size; + uint16_t guard = 0; + void *src, *dst; + + _dif_sgl_get_buf(dst_sgl, &dst, NULL); + + data_block_size = ctx->block_size - ctx->md_size; + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + offset_in_block = 0; + + while (offset_in_block < data_block_size) { + /* Compute CRC over split logical block data and copy + * data to bounce buffer. 
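+ * When the guard check is enabled, spdk_crc16_t10dif_copy() performs the
+ * copy and the CRC update in one pass per source fragment; otherwise a
+ * plain memcpy() is used.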
+ */ + _dif_sgl_get_buf(src_sgl, &src, &src_len); + src_len = spdk_min(src_len, data_block_size - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif_copy(guard, dst + offset_in_block, + src, src_len); + } else { + memcpy(dst + offset_in_block, src, src_len); + } + + _dif_sgl_advance(src_sgl, src_len); + offset_in_block += src_len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, dst + data_block_size, + ctx->guard_interval - data_block_size); + } + + _dif_sgl_advance(dst_sgl, ctx->block_size); + + _dif_generate(dst + ctx->guard_interval, guard, offset_blocks, ctx); +} + +static void +dif_generate_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + _dif_generate_copy_split(src_sgl, dst_sgl, offset_blocks, ctx); + } +} + +int +spdk_dif_generate_copy(struct iovec *iovs, int iovcnt, struct iovec *bounce_iov, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + struct _dif_sgl src_sgl, dst_sgl; + uint32_t data_block_size; + + _dif_sgl_init(&src_sgl, iovs, iovcnt); + _dif_sgl_init(&dst_sgl, bounce_iov, 1); + + data_block_size = ctx->block_size - ctx->md_size; + + if (!_dif_sgl_is_valid(&src_sgl, data_block_size * num_blocks) || + !_dif_sgl_is_valid(&dst_sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec arrays are not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&src_sgl, data_block_size)) { + dif_generate_copy(&src_sgl, &dst_sgl, num_blocks, ctx); + } else { + dif_generate_copy_split(&src_sgl, &dst_sgl, num_blocks, ctx); + } + + return 0; +} + +static int +dif_verify_copy(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks = 0, data_block_size; + void *src, *dst; + int rc; + uint16_t guard; + + data_block_size = ctx->block_size - ctx->md_size; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(src_sgl, &src, NULL); + _dif_sgl_get_buf(dst_sgl, &dst, NULL); + + guard = 0; + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif_copy(ctx->guard_seed, dst, src, data_block_size); + guard = spdk_crc16_t10dif(guard, src + data_block_size, + ctx->guard_interval - data_block_size); + } else { + memcpy(dst, src, data_block_size); + } + + rc = _dif_verify(src + ctx->guard_interval, guard, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + + _dif_sgl_advance(src_sgl, ctx->block_size); + _dif_sgl_advance(dst_sgl, data_block_size); + offset_blocks++; + } + + return 0; +} + +static int +_dif_verify_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t offset_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_in_block, dst_len, data_block_size; + uint16_t guard = 0; + void *src, *dst; + + _dif_sgl_get_buf(src_sgl, &src, NULL); + + data_block_size = ctx->block_size - ctx->md_size; + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + offset_in_block = 0; + + while (offset_in_block < data_block_size) { + /* Compute CRC over split logical block data and copy + * data to bounce buffer. 
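+ * On the verify path the bounce buffer is the source and the application
+ * iovecs are the destination, the reverse of the generate-copy direction.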
+ */ + _dif_sgl_get_buf(dst_sgl, &dst, &dst_len); + dst_len = spdk_min(dst_len, data_block_size - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif_copy(guard, dst, + src + offset_in_block, dst_len); + } else { + memcpy(dst, src + offset_in_block, dst_len); + } + + _dif_sgl_advance(dst_sgl, dst_len); + offset_in_block += dst_len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, src + data_block_size, + ctx->guard_interval - data_block_size); + } + + _dif_sgl_advance(src_sgl, ctx->block_size); + + return _dif_verify(src + ctx->guard_interval, guard, offset_blocks, ctx, err_blk); +} + +static int +dif_verify_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks; + int rc; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + rc = _dif_verify_copy_split(src_sgl, dst_sgl, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +int +spdk_dif_verify_copy(struct iovec *iovs, int iovcnt, struct iovec *bounce_iov, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + struct _dif_sgl src_sgl, dst_sgl; + uint32_t data_block_size; + + _dif_sgl_init(&src_sgl, bounce_iov, 1); + _dif_sgl_init(&dst_sgl, iovs, iovcnt); + + data_block_size = ctx->block_size - ctx->md_size; + + if (!_dif_sgl_is_valid(&dst_sgl, data_block_size * num_blocks) || + !_dif_sgl_is_valid(&src_sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec arrays are not valid\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&dst_sgl, data_block_size)) { + return dif_verify_copy(&src_sgl, &dst_sgl, num_blocks, ctx, err_blk); + } else { + return dif_verify_copy_split(&src_sgl, &dst_sgl, num_blocks, ctx, err_blk); + } +} + +static void +_bit_flip(uint8_t *buf, uint32_t flip_bit) +{ + uint8_t byte; + + byte = *buf; + byte ^= 1 << flip_bit; + *buf = byte; +} + +static int +_dif_inject_error(struct _dif_sgl *sgl, + uint32_t block_size, uint32_t num_blocks, + uint32_t inject_offset_blocks, + uint32_t inject_offset_bytes, + uint32_t inject_offset_bits) +{ + uint32_t offset_in_block, buf_len; + void *buf; + + _dif_sgl_advance(sgl, block_size * inject_offset_blocks); + + offset_in_block = 0; + + while (offset_in_block < block_size) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, block_size - offset_in_block); + + if (inject_offset_bytes >= offset_in_block && + inject_offset_bytes < offset_in_block + buf_len) { + buf += inject_offset_bytes - offset_in_block; + _bit_flip(buf, inject_offset_bits); + return 0; + } + + _dif_sgl_advance(sgl, buf_len); + offset_in_block += buf_len; + } + + return -1; +} + +static int +dif_inject_error(struct _dif_sgl *sgl, uint32_t block_size, uint32_t num_blocks, + uint32_t start_inject_bytes, uint32_t inject_range_bytes, + uint32_t *inject_offset) +{ + uint32_t inject_offset_blocks, inject_offset_bytes, inject_offset_bits; + uint32_t offset_blocks; + int rc; + + srand(time(0)); + + inject_offset_blocks = rand() % num_blocks; + inject_offset_bytes = start_inject_bytes + (rand() % inject_range_bytes); + inject_offset_bits = rand() % 8; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + if (offset_blocks == inject_offset_blocks) { + rc = _dif_inject_error(sgl, block_size, 
num_blocks, + inject_offset_blocks, + inject_offset_bytes, + inject_offset_bits); + if (rc == 0) { + *inject_offset = inject_offset_blocks; + } + return rc; + } + } + + return -1; +} + +#define _member_size(type, member) sizeof(((type *)0)->member) + +int +spdk_dif_inject_error(struct iovec *iovs, int iovcnt, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, uint32_t inject_flags, + uint32_t *inject_offset) +{ + struct _dif_sgl sgl; + int rc; + + _dif_sgl_init(&sgl, iovs, iovcnt); + + if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (inject_flags & SPDK_DIF_REFTAG_ERROR) { + rc = dif_inject_error(&sgl, ctx->block_size, num_blocks, + ctx->guard_interval + offsetof(struct spdk_dif, ref_tag), + _member_size(struct spdk_dif, ref_tag), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Reference Tag.\n"); + return rc; + } + } + + if (inject_flags & SPDK_DIF_APPTAG_ERROR) { + rc = dif_inject_error(&sgl, ctx->block_size, num_blocks, + ctx->guard_interval + offsetof(struct spdk_dif, app_tag), + _member_size(struct spdk_dif, app_tag), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Application Tag.\n"); + return rc; + } + } + if (inject_flags & SPDK_DIF_GUARD_ERROR) { + rc = dif_inject_error(&sgl, ctx->block_size, num_blocks, + ctx->guard_interval, + _member_size(struct spdk_dif, guard), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Guard.\n"); + return rc; + } + } + + if (inject_flags & SPDK_DIF_DATA_ERROR) { + /* If the DIF information is contained within the last 8 bytes of + * metadata, then the CRC covers all metadata bytes up to but excluding + * the last 8 bytes. But error injection does not cover these metadata + * because classification is not determined yet. + * + * Note: Error injection to data block is expected to be detected as + * guard error. 
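+	 *
+	 * A typical test-side sketch (local variable names are hypothetical,
+	 * and the GUARD_CHECK flag must be enabled in the context):
+	 *
+	 *   uint32_t bad_block;
+	 *   spdk_dif_inject_error(iovs, iovcnt, num_blocks, ctx,
+	 *                         SPDK_DIF_DATA_ERROR, &bad_block);
+	 *
+	 * A subsequent spdk_dif_verify() on the same buffers is then expected
+	 * to fail and report a Guard error for the block returned in bad_block.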
+ */ + rc = dif_inject_error(&sgl, ctx->block_size, num_blocks, + 0, + ctx->block_size - ctx->md_size, + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to data block.\n"); + return rc; + } + } + + return 0; +} + +static void +dix_generate(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks = 0; + uint16_t guard; + void *data_buf, *md_buf; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(data_sgl, &data_buf, NULL); + _dif_sgl_get_buf(md_sgl, &md_buf, NULL); + + guard = 0; + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(ctx->guard_seed, data_buf, ctx->block_size); + guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval); + } + + _dif_generate(md_buf + ctx->guard_interval, guard, offset_blocks, ctx); + + _dif_sgl_advance(data_sgl, ctx->block_size); + _dif_sgl_advance(md_sgl, ctx->md_size); + offset_blocks++; + } +} + +static void +_dix_generate_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t offset_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_in_block, data_buf_len; + uint16_t guard = 0; + void *data_buf, *md_buf; + + _dif_sgl_get_buf(md_sgl, &md_buf, NULL); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + offset_in_block = 0; + + while (offset_in_block < ctx->block_size) { + _dif_sgl_get_buf(data_sgl, &data_buf, &data_buf_len); + data_buf_len = spdk_min(data_buf_len, ctx->block_size - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, data_buf, data_buf_len); + } + + _dif_sgl_advance(data_sgl, data_buf_len); + offset_in_block += data_buf_len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval); + } + + _dif_sgl_advance(md_sgl, ctx->md_size); + + _dif_generate(md_buf + ctx->guard_interval, guard, offset_blocks, ctx); +} + +static void +dix_generate_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + uint32_t offset_blocks; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + _dix_generate_split(data_sgl, md_sgl, offset_blocks, ctx); + } +} + +int +spdk_dix_generate(struct iovec *iovs, int iovcnt, struct iovec *md_iov, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx) +{ + struct _dif_sgl data_sgl, md_sgl; + + _dif_sgl_init(&data_sgl, iovs, iovcnt); + _dif_sgl_init(&md_sgl, md_iov, 1); + + if (!_dif_sgl_is_valid(&data_sgl, ctx->block_size * num_blocks) || + !_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&data_sgl, ctx->block_size)) { + dix_generate(&data_sgl, &md_sgl, num_blocks, ctx); + } else { + dix_generate_split(&data_sgl, &md_sgl, num_blocks, ctx); + } + + return 0; +} + +static int +dix_verify(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks = 0; + uint16_t guard; + void *data_buf, *md_buf; + int rc; + + while (offset_blocks < num_blocks) { + _dif_sgl_get_buf(data_sgl, &data_buf, NULL); + _dif_sgl_get_buf(md_sgl, &md_buf, NULL); + + guard = 0; + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(ctx->guard_seed, 
data_buf, ctx->block_size); + guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval); + } + + rc = _dif_verify(md_buf + ctx->guard_interval, guard, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + + _dif_sgl_advance(data_sgl, ctx->block_size); + _dif_sgl_advance(md_sgl, ctx->md_size); + offset_blocks++; + } + + return 0; +} + +static int +_dix_verify_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t offset_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_in_block, data_buf_len; + uint16_t guard = 0; + void *data_buf, *md_buf; + + _dif_sgl_get_buf(md_sgl, &md_buf, NULL); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->guard_seed; + } + offset_in_block = 0; + + while (offset_in_block < ctx->block_size) { + _dif_sgl_get_buf(data_sgl, &data_buf, &data_buf_len); + data_buf_len = spdk_min(data_buf_len, ctx->block_size - offset_in_block); + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, data_buf, data_buf_len); + } + + _dif_sgl_advance(data_sgl, data_buf_len); + offset_in_block += data_buf_len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval); + } + + _dif_sgl_advance(md_sgl, ctx->md_size); + + return _dif_verify(md_buf + ctx->guard_interval, guard, offset_blocks, ctx, err_blk); +} + +static int +dix_verify_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t offset_blocks; + int rc; + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + rc = _dix_verify_split(data_sgl, md_sgl, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +int +spdk_dix_verify(struct iovec *iovs, int iovcnt, struct iovec *md_iov, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + struct _dif_sgl data_sgl, md_sgl; + + _dif_sgl_init(&data_sgl, iovs, iovcnt); + _dif_sgl_init(&md_sgl, md_iov, 1); + + if (!_dif_sgl_is_valid(&data_sgl, ctx->block_size * num_blocks) || + !_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (_dif_sgl_is_bytes_multiple(&data_sgl, ctx->block_size)) { + return dix_verify(&data_sgl, &md_sgl, num_blocks, ctx, err_blk); + } else { + return dix_verify_split(&data_sgl, &md_sgl, num_blocks, ctx, err_blk); + } +} + +int +spdk_dix_inject_error(struct iovec *iovs, int iovcnt, struct iovec *md_iov, + uint32_t num_blocks, const struct spdk_dif_ctx *ctx, + uint32_t inject_flags, uint32_t *inject_offset) +{ + struct _dif_sgl data_sgl, md_sgl; + int rc; + + _dif_sgl_init(&data_sgl, iovs, iovcnt); + _dif_sgl_init(&md_sgl, md_iov, 1); + + if (!_dif_sgl_is_valid(&data_sgl, ctx->block_size * num_blocks) || + !_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (inject_flags & SPDK_DIF_REFTAG_ERROR) { + rc = dif_inject_error(&md_sgl, ctx->md_size, num_blocks, + ctx->guard_interval + offsetof(struct spdk_dif, ref_tag), + _member_size(struct spdk_dif, ref_tag), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Reference Tag.\n"); + return rc; + } + } + + if (inject_flags & SPDK_DIF_APPTAG_ERROR) { + rc = dif_inject_error(&md_sgl, 
ctx->md_size, num_blocks, + ctx->guard_interval + offsetof(struct spdk_dif, app_tag), + _member_size(struct spdk_dif, app_tag), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Application Tag.\n"); + return rc; + } + } + + if (inject_flags & SPDK_DIF_GUARD_ERROR) { + rc = dif_inject_error(&md_sgl, ctx->md_size, num_blocks, + ctx->guard_interval, + _member_size(struct spdk_dif, guard), + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Guard.\n"); + return rc; + } + } + + if (inject_flags & SPDK_DIF_DATA_ERROR) { + /* Note: Error injection to data block is expected to be detected + * as guard error. + */ + rc = dif_inject_error(&data_sgl, ctx->block_size, num_blocks, + 0, + ctx->block_size, + inject_offset); + if (rc != 0) { + SPDK_ERRLOG("Failed to inject error to Guard.\n"); + return rc; + } + } + + return 0; +} + +static uint32_t +_to_next_boundary(uint32_t offset, uint32_t boundary) +{ + return boundary - (offset % boundary); +} + +static uint32_t +_to_size_with_md(uint32_t size, uint32_t data_block_size, uint32_t block_size) +{ + return (size / data_block_size) * block_size + (size % data_block_size); +} + +int +spdk_dif_set_md_interleave_iovs(struct iovec *iovs, int iovcnt, + struct iovec *buf_iovs, int buf_iovcnt, + uint32_t data_offset, uint32_t data_len, + uint32_t *_mapped_len, + const struct spdk_dif_ctx *ctx) +{ + uint32_t data_block_size, data_unalign, buf_len, buf_offset, len; + struct _dif_sgl dif_sgl; + struct _dif_sgl buf_sgl; + + if (iovs == NULL || iovcnt == 0 || buf_iovs == NULL || buf_iovcnt == 0) { + return -EINVAL; + } + + data_block_size = ctx->block_size - ctx->md_size; + + data_unalign = ctx->data_offset % data_block_size; + + buf_len = _to_size_with_md(data_unalign + data_offset + data_len, data_block_size, + ctx->block_size); + buf_len -= data_unalign; + + _dif_sgl_init(&dif_sgl, iovs, iovcnt); + _dif_sgl_init(&buf_sgl, buf_iovs, buf_iovcnt); + + if (!_dif_sgl_is_valid(&buf_sgl, buf_len)) { + SPDK_ERRLOG("Buffer overflow will occur.\n"); + return -ERANGE; + } + + buf_offset = _to_size_with_md(data_unalign + data_offset, data_block_size, ctx->block_size); + buf_offset -= data_unalign; + + _dif_sgl_advance(&buf_sgl, buf_offset); + + while (data_len != 0) { + len = spdk_min(data_len, _to_next_boundary(ctx->data_offset + data_offset, data_block_size)); + if (!_dif_sgl_append_split(&dif_sgl, &buf_sgl, len)) { + break; + } + _dif_sgl_advance(&buf_sgl, ctx->md_size); + data_offset += len; + data_len -= len; + } + + if (_mapped_len != NULL) { + *_mapped_len = dif_sgl.total_size; + } + + return iovcnt - dif_sgl.iovcnt; +} + +static int +_dif_sgl_setup_stream(struct _dif_sgl *sgl, uint32_t *_buf_offset, uint32_t *_buf_len, + uint32_t data_offset, uint32_t data_len, + const struct spdk_dif_ctx *ctx) +{ + uint32_t data_block_size, data_unalign, buf_len, buf_offset; + + data_block_size = ctx->block_size - ctx->md_size; + + data_unalign = ctx->data_offset % data_block_size; + + /* If the last data block is complete, DIF of the data block is + * inserted or verified in this turn. 
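+	 *
+	 * For example, assuming a 520-byte block with 8 bytes of metadata
+	 * (512-byte data blocks) and ctx->data_offset == 0: a call with
+	 * data_offset == 1024 and data_len == 512 maps to buf_offset == 1040
+	 * and buf_len == 520 via _to_size_with_md(), i.e. exactly the third
+	 * extended block of the payload.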
+ */ + buf_len = _to_size_with_md(data_unalign + data_offset + data_len, data_block_size, + ctx->block_size); + buf_len -= data_unalign; + + if (!_dif_sgl_is_valid(sgl, buf_len)) { + return -ERANGE; + } + + buf_offset = _to_size_with_md(data_unalign + data_offset, data_block_size, ctx->block_size); + buf_offset -= data_unalign; + + _dif_sgl_advance(sgl, buf_offset); + buf_len -= buf_offset; + + buf_offset += data_unalign; + + *_buf_offset = buf_offset; + *_buf_len = buf_len; + + return 0; +} + +int +spdk_dif_generate_stream(struct iovec *iovs, int iovcnt, + uint32_t data_offset, uint32_t data_len, + struct spdk_dif_ctx *ctx) +{ + uint32_t buf_len = 0, buf_offset = 0; + uint32_t len, offset_in_block, offset_blocks; + uint16_t guard = 0; + struct _dif_sgl sgl; + int rc; + + if (iovs == NULL || iovcnt == 0) { + return -EINVAL; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->last_guard; + } + + _dif_sgl_init(&sgl, iovs, iovcnt); + + rc = _dif_sgl_setup_stream(&sgl, &buf_offset, &buf_len, data_offset, data_len, ctx); + if (rc != 0) { + return rc; + } + + while (buf_len != 0) { + len = spdk_min(buf_len, _to_next_boundary(buf_offset, ctx->block_size)); + offset_in_block = buf_offset % ctx->block_size; + offset_blocks = buf_offset / ctx->block_size; + + guard = _dif_generate_split(&sgl, offset_in_block, len, guard, offset_blocks, ctx); + + buf_len -= len; + buf_offset += len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + ctx->last_guard = guard; + } + + return 0; +} + +int +spdk_dif_verify_stream(struct iovec *iovs, int iovcnt, + uint32_t data_offset, uint32_t data_len, + struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + uint32_t buf_len = 0, buf_offset = 0; + uint32_t len, offset_in_block, offset_blocks; + uint16_t guard = 0; + struct _dif_sgl sgl; + int rc = 0; + + if (iovs == NULL || iovcnt == 0) { + return -EINVAL; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + guard = ctx->last_guard; + } + + _dif_sgl_init(&sgl, iovs, iovcnt); + + rc = _dif_sgl_setup_stream(&sgl, &buf_offset, &buf_len, data_offset, data_len, ctx); + if (rc != 0) { + return rc; + } + + while (buf_len != 0) { + len = spdk_min(buf_len, _to_next_boundary(buf_offset, ctx->block_size)); + offset_in_block = buf_offset % ctx->block_size; + offset_blocks = buf_offset / ctx->block_size; + + rc = _dif_verify_split(&sgl, offset_in_block, len, &guard, offset_blocks, + ctx, err_blk); + if (rc != 0) { + goto error; + } + + buf_len -= len; + buf_offset += len; + } + + if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) { + ctx->last_guard = guard; + } +error: + return rc; +} + +int +spdk_dif_update_crc32c_stream(struct iovec *iovs, int iovcnt, + uint32_t data_offset, uint32_t data_len, + uint32_t *_crc32c, const struct spdk_dif_ctx *ctx) +{ + uint32_t buf_len = 0, buf_offset = 0, len, offset_in_block; + uint32_t crc32c; + struct _dif_sgl sgl; + int rc; + + if (iovs == NULL || iovcnt == 0) { + return -EINVAL; + } + + crc32c = *_crc32c; + _dif_sgl_init(&sgl, iovs, iovcnt); + + rc = _dif_sgl_setup_stream(&sgl, &buf_offset, &buf_len, data_offset, data_len, ctx); + if (rc != 0) { + return rc; + } + + while (buf_len != 0) { + len = spdk_min(buf_len, _to_next_boundary(buf_offset, ctx->block_size)); + offset_in_block = buf_offset % ctx->block_size; + + crc32c = _dif_update_crc32c_split(&sgl, offset_in_block, len, crc32c, ctx); + + buf_len -= len; + buf_offset += len; + } + + *_crc32c = crc32c; + + return 0; +} + +void +spdk_dif_get_range_with_md(uint32_t data_offset, uint32_t 
data_len, + uint32_t *_buf_offset, uint32_t *_buf_len, + const struct spdk_dif_ctx *ctx) +{ + uint32_t data_block_size, data_unalign, buf_offset, buf_len; + + if (!ctx->md_interleave) { + buf_offset = data_offset; + buf_len = data_len; + } else { + data_block_size = ctx->block_size - ctx->md_size; + + data_unalign = data_offset % data_block_size; + + buf_offset = _to_size_with_md(data_offset, data_block_size, ctx->block_size); + buf_len = _to_size_with_md(data_unalign + data_len, data_block_size, ctx->block_size) - + data_unalign; + } + + if (_buf_offset != NULL) { + *_buf_offset = buf_offset; + } + + if (_buf_len != NULL) { + *_buf_len = buf_len; + } +} + +uint32_t +spdk_dif_get_length_with_md(uint32_t data_len, const struct spdk_dif_ctx *ctx) +{ + uint32_t data_block_size; + + if (!ctx->md_interleave) { + return data_len; + } else { + data_block_size = ctx->block_size - ctx->md_size; + + return _to_size_with_md(data_len, data_block_size, ctx->block_size); + } +} + +static int +_dif_remap_ref_tag(struct _dif_sgl *sgl, uint32_t offset_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + uint32_t offset, buf_len, expected = 0, _actual, remapped; + void *buf; + struct _dif_sgl tmp_sgl; + struct spdk_dif dif; + + /* Fast forward to DIF field. */ + _dif_sgl_advance(sgl, ctx->guard_interval); + _dif_sgl_copy(&tmp_sgl, sgl); + + /* Copy the split DIF field to the temporary DIF buffer */ + offset = 0; + while (offset < sizeof(struct spdk_dif)) { + _dif_sgl_get_buf(sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset); + + memcpy((uint8_t *)&dif + offset, buf, buf_len); + + _dif_sgl_advance(sgl, buf_len); + offset += buf_len; + } + + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* If Type 1 or 2 is used, then all DIF checks are disabled when + * the Application Tag is 0xFFFF. + */ + if (dif.app_tag == 0xFFFF) { + goto end; + } + break; + case SPDK_DIF_TYPE3: + /* If Type 3 is used, then all DIF checks are disabled when the + * Application Tag is 0xFFFF and the Reference Tag is 0xFFFFFFFF. + */ + if (dif.app_tag == 0xFFFF && dif.ref_tag == 0xFFFFFFFF) { + goto end; + } + break; + default: + break; + } + + /* For type 1 and 2, the Reference Tag is incremented for each + * subsequent logical block. For type 3, the Reference Tag + * remains the same as the initial Reference Tag. + */ + if (ctx->dif_type != SPDK_DIF_TYPE3) { + expected = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks; + remapped = ctx->remapped_init_ref_tag + ctx->ref_tag_offset + offset_blocks; + } else { + remapped = ctx->remapped_init_ref_tag; + } + + /* Verify the stored Reference Tag. */ + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* Compare the DIF Reference Tag field to the computed Reference Tag. + * The computed Reference Tag will be the least significant 4 bytes + * of the LBA when Type 1 is used, and application specific value + * if Type 2 is used. + */ + _actual = from_be32(&dif.ref_tag); + if (_actual != expected) { + _dif_error_set(err_blk, SPDK_DIF_REFTAG_ERROR, expected, + _actual, offset_blocks); + SPDK_ERRLOG("Failed to compare Ref Tag: LBA=%" PRIu32 "," \ + " Expected=%x, Actual=%x\n", + expected, expected, _actual); + return -1; + } + break; + case SPDK_DIF_TYPE3: + /* For type 3, the computed Reference Tag remains unchanged. + * Hence ignore the Reference Tag field. + */ + break; + default: + break; + } + + /* Update the stored Reference Tag to the remapped one. 
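+	 *
+	 * For example (Type 1, ref_tag_offset == 0, hypothetical values): with
+	 * init_ref_tag == 100 and remapped_init_ref_tag == 0, the block at
+	 * offset_blocks == 5 must carry Reference Tag 105 and is rewritten to 5.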
*/ + to_be32(&dif.ref_tag, remapped); + + offset = 0; + while (offset < sizeof(struct spdk_dif)) { + _dif_sgl_get_buf(&tmp_sgl, &buf, &buf_len); + buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset); + + memcpy(buf, (uint8_t *)&dif + offset, buf_len); + + _dif_sgl_advance(&tmp_sgl, buf_len); + offset += buf_len; + } + +end: + _dif_sgl_advance(sgl, ctx->block_size - ctx->guard_interval - sizeof(struct spdk_dif)); + + return 0; +} + +int +spdk_dif_remap_ref_tag(struct iovec *iovs, int iovcnt, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + struct _dif_sgl sgl; + uint32_t offset_blocks; + int rc; + + _dif_sgl_init(&sgl, iovs, iovcnt); + + if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) { + SPDK_ERRLOG("Size of iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (!(ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) { + return 0; + } + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + rc = _dif_remap_ref_tag(&sgl, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static int +_dix_remap_ref_tag(struct _dif_sgl *md_sgl, uint32_t offset_blocks, + const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk) +{ + uint32_t expected = 0, _actual, remapped; + uint8_t *md_buf; + struct spdk_dif *dif; + + _dif_sgl_get_buf(md_sgl, (void *)&md_buf, NULL); + + dif = (struct spdk_dif *)(md_buf + ctx->guard_interval); + + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* If Type 1 or 2 is used, then all DIF checks are disabled when + * the Application Tag is 0xFFFF. + */ + if (dif->app_tag == 0xFFFF) { + goto end; + } + break; + case SPDK_DIF_TYPE3: + /* If Type 3 is used, then all DIF checks are disabled when the + * Application Tag is 0xFFFF and the Reference Tag is 0xFFFFFFFF. + */ + if (dif->app_tag == 0xFFFF && dif->ref_tag == 0xFFFFFFFF) { + goto end; + } + break; + default: + break; + } + + /* For type 1 and 2, the Reference Tag is incremented for each + * subsequent logical block. For type 3, the Reference Tag + * remains the same as the initialReference Tag. + */ + if (ctx->dif_type != SPDK_DIF_TYPE3) { + expected = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks; + remapped = ctx->remapped_init_ref_tag + ctx->ref_tag_offset + offset_blocks; + } else { + remapped = ctx->remapped_init_ref_tag; + } + + /* Verify the stored Reference Tag. */ + switch (ctx->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + /* Compare the DIF Reference Tag field to the computed Reference Tag. + * The computed Reference Tag will be the least significant 4 bytes + * of the LBA when Type 1 is used, and application specific value + * if Type 2 is used. + */ + _actual = from_be32(&dif->ref_tag); + if (_actual != expected) { + _dif_error_set(err_blk, SPDK_DIF_REFTAG_ERROR, expected, + _actual, offset_blocks); + SPDK_ERRLOG("Failed to compare Ref Tag: LBA=%" PRIu32 "," \ + " Expected=%x, Actual=%x\n", + expected, expected, _actual); + return -1; + } + break; + case SPDK_DIF_TYPE3: + /* For type 3, the computed Reference Tag remains unchanged. + * Hence ignore the Reference Tag field. + */ + break; + default: + break; + } + + /* Update the stored Reference Tag to the remapped one. 
*/ + to_be32(&dif->ref_tag, remapped); + +end: + _dif_sgl_advance(md_sgl, ctx->md_size); + + return 0; +} + +int +spdk_dix_remap_ref_tag(struct iovec *md_iov, uint32_t num_blocks, + const struct spdk_dif_ctx *ctx, + struct spdk_dif_error *err_blk) +{ + struct _dif_sgl md_sgl; + uint32_t offset_blocks; + int rc; + + _dif_sgl_init(&md_sgl, md_iov, 1); + + if (!_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) { + SPDK_ERRLOG("Size of metadata iovec array is not valid.\n"); + return -EINVAL; + } + + if (_dif_is_disabled(ctx->dif_type)) { + return 0; + } + + if (!(ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) { + return 0; + } + + for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) { + rc = _dix_remap_ref_tag(&md_sgl, offset_blocks, ctx, err_blk); + if (rc != 0) { + return rc; + } + } + + return 0; +} diff --git a/src/spdk/lib/util/fd.c b/src/spdk/lib/util/fd.c new file mode 100644 index 000000000..6b0d0d554 --- /dev/null +++ b/src/spdk/lib/util/fd.c @@ -0,0 +1,103 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/fd.h" + +#ifdef __linux__ +#include <linux/fs.h> +#endif + +static uint64_t +dev_get_size(int fd) +{ +#if defined(DIOCGMEDIASIZE) /* FreeBSD */ + off_t size; + + if (ioctl(fd, DIOCGMEDIASIZE, &size) == 0) { + return size; + } +#elif defined(__linux__) && defined(BLKGETSIZE64) + uint64_t size; + + if (ioctl(fd, BLKGETSIZE64, &size) == 0) { + return size; + } +#endif + + return 0; +} + +uint32_t +spdk_fd_get_blocklen(int fd) +{ +#if defined(DKIOCGETBLOCKSIZE) /* FreeBSD */ + uint32_t blocklen; + + if (ioctl(fd, DKIOCGETBLOCKSIZE, &blocklen) == 0) { + return blocklen; + } +#elif defined(__linux__) && defined(BLKSSZGET) + uint32_t blocklen; + + if (ioctl(fd, BLKSSZGET, &blocklen) == 0) { + return blocklen; + } +#endif + + return 0; +} + +uint64_t +spdk_fd_get_size(int fd) +{ + struct stat st; + + if (fstat(fd, &st) != 0) { + return 0; + } + + if (S_ISLNK(st.st_mode)) { + return 0; + } + + if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { + return dev_get_size(fd); + } else if (S_ISREG(st.st_mode)) { + return st.st_size; + } + + /* Not REG, CHR or BLK */ + return 0; +} diff --git a/src/spdk/lib/util/file.c b/src/spdk/lib/util/file.c new file mode 100644 index 000000000..2ba08547b --- /dev/null +++ b/src/spdk/lib/util/file.c @@ -0,0 +1,71 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/file.h" + +void * +spdk_posix_file_load(FILE *file, size_t *size) +{ + void *newbuf, *buf = NULL; + size_t rc, buf_size, cur_size = 0; + + *size = 0; + buf_size = 128 * 1024; + + while (buf_size <= 1024 * 1024 * 1024) { + newbuf = realloc(buf, buf_size); + if (newbuf == NULL) { + free(buf); + return NULL; + } + buf = newbuf; + + rc = fread(buf + cur_size, 1, buf_size - cur_size, file); + cur_size += rc; + + if (feof(file)) { + *size = cur_size; + return buf; + } + + if (ferror(file)) { + free(buf); + return NULL; + } + + buf_size *= 2; + } + + free(buf); + return NULL; +} diff --git a/src/spdk/lib/util/iov.c b/src/spdk/lib/util/iov.c new file mode 100644 index 000000000..e89ef9d21 --- /dev/null +++ b/src/spdk/lib/util/iov.c @@ -0,0 +1,111 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/util.h" + +size_t +spdk_iovcpy(struct iovec *siov, size_t siovcnt, struct iovec *diov, size_t diovcnt) +{ + size_t total_sz; + size_t sidx; + size_t didx; + int siov_len; + uint8_t *siov_base; + int diov_len; + uint8_t *diov_base; + + /* d prefix = destination. s prefix = source. 
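+	 *
+	 * For example (hypothetical sizes), copying siov = { 4096, 4096 } into
+	 * diov = { 2048, 6144 } performs three memcpy() calls of 2048, 2048 and
+	 * 4096 bytes and returns 8192.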
*/ + + assert(diovcnt > 0); + assert(siovcnt > 0); + + total_sz = 0; + sidx = 0; + didx = 0; + siov_len = siov[0].iov_len; + siov_base = siov[0].iov_base; + diov_len = diov[0].iov_len; + diov_base = diov[0].iov_base; + while (siov_len > 0 && diov_len > 0) { + if (siov_len == diov_len) { + memcpy(diov_base, siov_base, siov_len); + total_sz += siov_len; + + /* Advance both iovs to the next element */ + sidx++; + if (sidx == siovcnt) { + break; + } + + didx++; + if (didx == diovcnt) { + break; + } + + siov_len = siov[sidx].iov_len; + siov_base = siov[sidx].iov_base; + diov_len = diov[didx].iov_len; + diov_base = diov[didx].iov_base; + } else if (siov_len < diov_len) { + memcpy(diov_base, siov_base, siov_len); + total_sz += siov_len; + + /* Advance only the source to the next element */ + sidx++; + if (sidx == siovcnt) { + break; + } + + diov_base += siov_len; + diov_len -= siov_len; + siov_len = siov[sidx].iov_len; + siov_base = siov[sidx].iov_base; + } else { + memcpy(diov_base, siov_base, diov_len); + total_sz += diov_len; + + /* Advance only the destination to the next element */ + didx++; + if (didx == diovcnt) { + break; + } + + siov_base += diov_len; + siov_len -= diov_len; + diov_len = diov[didx].iov_len; + diov_base = diov[didx].iov_base; + } + } + + return total_sz; +} diff --git a/src/spdk/lib/util/math.c b/src/spdk/lib/util/math.c new file mode 100644 index 000000000..7d1852421 --- /dev/null +++ b/src/spdk/lib/util/math.c @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/util.h" + +/* The following will automatically generate several version of + * this function, targeted at different architectures. This + * is only supported by GCC 6 or newer. 
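+ *
+ * The function below returns floor(log2(x)); for example, spdk_u32log2(1)
+ * returns 0, spdk_u32log2(1024) returns 10, and spdk_u32log2(1500) also
+ * returns 10. spdk_u32log2(0) returns 0 because log(0) is undefined.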
*/ +#if defined(__GNUC__) && __GNUC__ >= 6 && !defined(__clang__) \ + && (defined(__i386__) || defined(__x86_64__)) +__attribute__((target_clones("bmi", "arch=core2", "arch=atom", "default"))) +#endif +uint32_t +spdk_u32log2(uint32_t x) +{ + if (x == 0) { + /* log(0) is undefined */ + return 0; + } + return 31u - __builtin_clz(x); +} + +/* The following will automatically generate several version of + * this function, targeted at different architectures. This + * is only supported by GCC 6 or newer. */ +#if defined(__GNUC__) && __GNUC__ >= 6 && !defined(__clang__) \ + && (defined(__i386__) || defined(__x86_64__)) +__attribute__((target_clones("bmi", "arch=core2", "arch=atom", "default"))) +#endif +uint64_t +spdk_u64log2(uint64_t x) +{ + if (x == 0) { + /* log(0) is undefined */ + return 0; + } + return 63u - __builtin_clzl(x); +} diff --git a/src/spdk/lib/util/pipe.c b/src/spdk/lib/util/pipe.c new file mode 100644 index 000000000..1c640dd2e --- /dev/null +++ b/src/spdk/lib/util/pipe.c @@ -0,0 +1,246 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/pipe.h" +#include "spdk/util.h" + +struct spdk_pipe { + uint8_t *buf; + uint32_t sz; + + uint32_t write; + uint32_t read; +}; + +struct spdk_pipe * +spdk_pipe_create(void *buf, uint32_t sz) +{ + struct spdk_pipe *pipe; + + pipe = calloc(1, sizeof(*pipe)); + if (pipe == NULL) { + return NULL; + } + + pipe->buf = buf; + pipe->sz = sz; + + return pipe; +} + +void +spdk_pipe_destroy(struct spdk_pipe *pipe) +{ + free(pipe); +} + +int +spdk_pipe_writer_get_buffer(struct spdk_pipe *pipe, uint32_t requested_sz, struct iovec *iovs) +{ + uint32_t sz; + uint32_t read; + uint32_t write; + + read = pipe->read; + write = pipe->write; + + if (read <= write) { + requested_sz = spdk_min(requested_sz, ((read + pipe->sz) - write - 1)); + + sz = spdk_min(requested_sz, pipe->sz - write); + + iovs[0].iov_base = (sz == 0) ? 
NULL : (pipe->buf + write); + iovs[0].iov_len = sz; + + requested_sz -= sz; + + if (requested_sz > 0) { + sz = spdk_min(requested_sz, read); + + iovs[1].iov_base = (sz == 0) ? NULL : pipe->buf; + iovs[1].iov_len = sz; + } else { + iovs[1].iov_base = NULL; + iovs[1].iov_len = 0; + } + } else { + sz = spdk_min(requested_sz, read - write - 1); + + iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + write); + iovs[0].iov_len = sz; + iovs[1].iov_base = NULL; + iovs[1].iov_len = 0; + } + + return iovs[0].iov_len + iovs[1].iov_len; +} + +int +spdk_pipe_writer_advance(struct spdk_pipe *pipe, uint32_t requested_sz) +{ + uint32_t sz; + uint32_t read; + uint32_t write; + + read = pipe->read; + write = pipe->write; + + if (requested_sz > pipe->sz - 1) { + return -EINVAL; + } + + if (read <= write) { + if (requested_sz > (read + pipe->sz) - write) { + return -EINVAL; + } + + sz = spdk_min(requested_sz, pipe->sz - write); + + write += sz; + if (write > pipe->sz - 1) { + write = 0; + } + requested_sz -= sz; + + if (requested_sz > 0) { + if (requested_sz >= read) { + return -EINVAL; + } + + write = requested_sz; + } + } else { + if (requested_sz > (read - write - 1)) { + return -EINVAL; + } + + write += requested_sz; + } + + pipe->write = write; + + return 0; +} + +uint32_t +spdk_pipe_reader_bytes_available(struct spdk_pipe *pipe) +{ + uint32_t read; + uint32_t write; + + read = pipe->read; + write = pipe->write; + + if (read <= write) { + return write - read; + } + + return (write + pipe->sz) - read; +} + +int +spdk_pipe_reader_get_buffer(struct spdk_pipe *pipe, uint32_t requested_sz, struct iovec *iovs) +{ + uint32_t sz; + uint32_t read; + uint32_t write; + + read = pipe->read; + write = pipe->write; + + if (read <= write) { + sz = spdk_min(requested_sz, write - read); + + iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + read); + iovs[0].iov_len = sz; + iovs[1].iov_base = NULL; + iovs[1].iov_len = 0; + } else { + sz = spdk_min(requested_sz, pipe->sz - read); + + iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + read); + iovs[0].iov_len = sz; + + requested_sz -= sz; + + if (requested_sz > 0) { + sz = spdk_min(requested_sz, write); + iovs[1].iov_base = (sz == 0) ? 
NULL : pipe->buf; + iovs[1].iov_len = sz; + } else { + iovs[1].iov_base = NULL; + iovs[1].iov_len = 0; + } + } + + return iovs[0].iov_len + iovs[1].iov_len; +} + +int +spdk_pipe_reader_advance(struct spdk_pipe *pipe, uint32_t requested_sz) +{ + uint32_t sz; + uint32_t read; + uint32_t write; + + read = pipe->read; + write = pipe->write; + + if (read <= write) { + if (requested_sz > (write - read)) { + return -EINVAL; + } + + read += requested_sz; + } else { + sz = spdk_min(requested_sz, pipe->sz - read); + + read += sz; + if (read > pipe->sz - 1) { + read = 0; + } + requested_sz -= sz; + + if (requested_sz > 0) { + if (requested_sz > write) { + return -EINVAL; + } + + read = requested_sz; + } + } + + pipe->read = read; + + return 0; +} diff --git a/src/spdk/lib/util/spdk_util.map b/src/spdk/lib/util/spdk_util.map new file mode 100644 index 000000000..07e067faa --- /dev/null +++ b/src/spdk/lib/util/spdk_util.map @@ -0,0 +1,128 @@ +{ + global: + + # public functions in base64.h + spdk_base64_encode; + spdk_base64_urlsafe_encode; + spdk_base64_decode; + spdk_base64_urlsafe_decode; + + # public functions in bit_array.h + spdk_bit_array_capacity; + spdk_bit_array_create; + spdk_bit_array_free; + spdk_bit_array_resize; + spdk_bit_array_get; + spdk_bit_array_set; + spdk_bit_array_clear; + spdk_bit_array_find_first_set; + spdk_bit_array_find_first_clear; + spdk_bit_array_count_set; + spdk_bit_array_count_clear; + spdk_bit_array_store_mask; + spdk_bit_array_load_mask; + spdk_bit_array_clear_mask; + + # public functions in cpuset.h + spdk_cpuset_alloc; + spdk_cpuset_free; + spdk_cpuset_equal; + spdk_cpuset_copy; + spdk_cpuset_and; + spdk_cpuset_or; + spdk_cpuset_xor; + spdk_cpuset_negate; + spdk_cpuset_zero; + spdk_cpuset_set_cpu; + spdk_cpuset_get_cpu; + spdk_cpuset_count; + spdk_cpuset_fmt; + spdk_cpuset_parse; + + # public functions in crc16.h + spdk_crc16_t10dif; + spdk_crc16_t10dif_copy; + + # public functions in crc32.h + spdk_crc32_ieee_update; + spdk_crc32c_update; + + # public functions in dif.h + spdk_dif_ctx_init; + spdk_dif_ctx_set_data_offset; + spdk_dif_ctx_set_remapped_init_ref_tag; + spdk_dif_generate; + spdk_dif_verify; + spdk_dif_update_crc32c; + spdk_dif_generate_copy; + spdk_dif_verify_copy; + spdk_dif_inject_error; + spdk_dix_generate; + spdk_dix_verify; + spdk_dix_inject_error; + spdk_dif_set_md_interleave_iovs; + spdk_dif_generate_stream; + spdk_dif_verify_stream; + spdk_dif_update_crc32c_stream; + spdk_dif_get_range_with_md; + spdk_dif_get_length_with_md; + spdk_dif_remap_ref_tag; + spdk_dix_remap_ref_tag; + + # public functions in fd.h + spdk_fd_get_size; + spdk_fd_get_blocklen; + + # public functions in file.h + spdk_posix_file_load; + + # public functions in pipe.h + spdk_pipe_create; + spdk_pipe_destroy; + spdk_pipe_writer_get_buffer; + spdk_pipe_writer_advance; + spdk_pipe_reader_bytes_available; + spdk_pipe_reader_get_buffer; + spdk_pipe_reader_advance; + + # public functions in string.h + spdk_sprintf_alloc; + spdk_vsprintf_alloc; + spdk_sprintf_append_realloc; + spdk_vsprintf_append_realloc; + spdk_strlwr; + spdk_strsepq; + spdk_str_trim; + spdk_strerror_r; + spdk_strerror; + spdk_str_chomp; + spdk_strcpy_pad; + spdk_strlen_pad; + spdk_parse_ip_addr; + spdk_parse_capacity; + spdk_mem_all_zero; + spdk_strtol; + spdk_strtoll; + + # public functions in util.h + spdk_u32log2; + spdk_u64log2; + spdk_iovcpy; + + # resolvers for functions in util.h + spdk_u32log2.resolver; + spdk_u64log2.resolver; + + # public functions in uuid.h + spdk_uuid_parse; + spdk_uuid_fmt_lower; + 
spdk_uuid_compare; + spdk_uuid_generate; + spdk_uuid_copy; + + + + + + local: *; +}; diff --git a/src/spdk/lib/util/strerror_tls.c b/src/spdk/lib/util/strerror_tls.c new file mode 100644 index 000000000..c9dc8f13f --- /dev/null +++ b/src/spdk/lib/util/strerror_tls.c @@ -0,0 +1,43 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/string.h" + +static __thread char strerror_message[64]; + +const char * +spdk_strerror(int errnum) +{ + spdk_strerror_r(errnum, strerror_message, sizeof(strerror_message)); + return strerror_message; +} diff --git a/src/spdk/lib/util/string.c b/src/spdk/lib/util/string.c new file mode 100644 index 000000000..30ac1628a --- /dev/null +++ b/src/spdk/lib/util/string.c @@ -0,0 +1,476 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/string.h" + +char * +spdk_vsprintf_append_realloc(char *buffer, const char *format, va_list args) +{ + va_list args_copy; + char *new_buffer; + int orig_size = 0, new_size; + + /* Original buffer size */ + if (buffer) { + orig_size = strlen(buffer); + } + + /* Necessary buffer size */ + va_copy(args_copy, args); + new_size = vsnprintf(NULL, 0, format, args_copy); + va_end(args_copy); + + if (new_size < 0) { + return NULL; + } + new_size += orig_size + 1; + + new_buffer = realloc(buffer, new_size); + if (new_buffer == NULL) { + return NULL; + } + + vsnprintf(new_buffer + orig_size, new_size - orig_size, format, args); + + return new_buffer; +} + +char * +spdk_sprintf_append_realloc(char *buffer, const char *format, ...) +{ + va_list args; + char *ret; + + va_start(args, format); + ret = spdk_vsprintf_append_realloc(buffer, format, args); + va_end(args); + + return ret; +} + +char * +spdk_vsprintf_alloc(const char *format, va_list args) +{ + return spdk_vsprintf_append_realloc(NULL, format, args); +} + +char * +spdk_sprintf_alloc(const char *format, ...) +{ + va_list args; + char *ret; + + va_start(args, format); + ret = spdk_vsprintf_alloc(format, args); + va_end(args); + + return ret; +} + +char * +spdk_strlwr(char *s) +{ + char *p; + + if (s == NULL) { + return NULL; + } + + p = s; + while (*p != '\0') { + *p = tolower(*p); + p++; + } + + return s; +} + +char * +spdk_strsepq(char **stringp, const char *delim) +{ + char *p, *q, *r; + int quoted = 0, bslash = 0; + + p = *stringp; + if (p == NULL) { + return NULL; + } + + r = q = p; + while (*q != '\0' && *q != '\n') { + /* eat quoted characters */ + if (bslash) { + bslash = 0; + *r++ = *q++; + continue; + } else if (quoted) { + if (quoted == '"' && *q == '\\') { + bslash = 1; + q++; + continue; + } else if (*q == quoted) { + quoted = 0; + q++; + continue; + } + *r++ = *q++; + continue; + } else if (*q == '\\') { + bslash = 1; + q++; + continue; + } else if (*q == '"' || *q == '\'') { + quoted = *q; + q++; + continue; + } + + /* separator? 
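+		 * For example, with delim " " the input
+		 *   foo "bar baz" qux
+		 * is returned over successive calls as the tokens "foo",
+		 * "bar baz" (quotes stripped) and "qux".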
*/ + if (strchr(delim, *q) == NULL) { + *r++ = *q++; + continue; + } + + /* new string */ + q++; + break; + } + *r = '\0'; + + /* skip tailer */ + while (*q != '\0' && strchr(delim, *q) != NULL) { + q++; + } + if (*q != '\0') { + *stringp = q; + } else { + *stringp = NULL; + } + + return p; +} + +char * +spdk_str_trim(char *s) +{ + char *p, *q; + + if (s == NULL) { + return NULL; + } + + /* remove header */ + p = s; + while (*p != '\0' && isspace(*p)) { + p++; + } + + /* remove tailer */ + q = p + strlen(p); + while (q - 1 >= p && isspace(*(q - 1))) { + q--; + *q = '\0'; + } + + /* if remove header, move */ + if (p != s) { + q = s; + while (*p != '\0') { + *q++ = *p++; + } + *q = '\0'; + } + + return s; +} + +void +spdk_strcpy_pad(void *dst, const char *src, size_t size, int pad) +{ + size_t len; + + len = strlen(src); + if (len < size) { + memcpy(dst, src, len); + memset((char *)dst + len, pad, size - len); + } else { + memcpy(dst, src, size); + } +} + +size_t +spdk_strlen_pad(const void *str, size_t size, int pad) +{ + const uint8_t *start; + const uint8_t *iter; + uint8_t pad_byte; + + pad_byte = (uint8_t)pad; + start = (const uint8_t *)str; + + if (size == 0) { + return 0; + } + + iter = start + size - 1; + while (1) { + if (*iter != pad_byte) { + return iter - start + 1; + } + + if (iter == start) { + /* Hit the start of the string finding only pad_byte. */ + return 0; + } + iter--; + } +} + +int +spdk_parse_ip_addr(char *ip, char **host, char **port) +{ + char *p; + + if (ip == NULL) { + return -EINVAL; + } + + *host = NULL; + *port = NULL; + + if (ip[0] == '[') { + /* IPv6 */ + p = strchr(ip, ']'); + if (p == NULL) { + return -EINVAL; + } + *host = &ip[1]; + *p = '\0'; + + p++; + if (*p == '\0') { + return 0; + } else if (*p != ':') { + return -EINVAL; + } + + p++; + if (*p == '\0') { + return 0; + } + + *port = p; + } else { + /* IPv4 */ + p = strchr(ip, ':'); + if (p == NULL) { + *host = ip; + return 0; + } + + *host = ip; + *p = '\0'; + + p++; + if (*p == '\0') { + return 0; + } + + *port = p; + } + + return 0; +} + +size_t +spdk_str_chomp(char *s) +{ + size_t len = strlen(s); + size_t removed = 0; + + while (len > 0) { + if (s[len - 1] != '\r' && s[len - 1] != '\n') { + break; + } + + s[len - 1] = '\0'; + len--; + removed++; + } + + return removed; +} + +void +spdk_strerror_r(int errnum, char *buf, size_t buflen) +{ + int rc; + +#if defined(__USE_GNU) + char *new_buffer; + new_buffer = strerror_r(errnum, buf, buflen); + if (new_buffer == buf) { + rc = 0; + } else if (new_buffer != NULL) { + snprintf(buf, buflen, "%s", new_buffer); + rc = 0; + } else { + rc = 1; + } +#else + rc = strerror_r(errnum, buf, buflen); +#endif + + if (rc != 0) { + snprintf(buf, buflen, "Unknown error %d", errnum); + } +} + +int +spdk_parse_capacity(const char *cap_str, uint64_t *cap, bool *has_prefix) +{ + int rc; + char bin_prefix; + + rc = sscanf(cap_str, "%"SCNu64"%c", cap, &bin_prefix); + if (rc == 1) { + *has_prefix = false; + return 0; + } else if (rc == 0) { + if (errno == 0) { + /* No scanf matches - the string does not start with a digit */ + return -EINVAL; + } else { + /* Parsing error */ + return -errno; + } + } + + *has_prefix = true; + switch (bin_prefix) { + case 'k': + case 'K': + *cap *= 1024; + break; + case 'm': + case 'M': + *cap *= 1024 * 1024; + break; + case 'g': + case 'G': + *cap *= 1024 * 1024 * 1024; + break; + default: + return -EINVAL; + } + + return 0; +} + +bool +spdk_mem_all_zero(const void *data, size_t size) +{ + const uint8_t *buf = data; + + while (size--) { + if 
(*buf++ != 0) { + return false; + } + } + + return true; +} + +long int +spdk_strtol(const char *nptr, int base) +{ + long val; + char *endptr; + + /* Since strtoll() can legitimately return 0, LONG_MAX, or LONG_MIN + * on both success and failure, the calling program should set errno + * to 0 before the call. + */ + errno = 0; + + val = strtol(nptr, &endptr, base); + + if (!errno && *endptr != '\0') { + /* Non integer character was found. */ + return -EINVAL; + } else if (errno == ERANGE && (val == LONG_MAX || val == LONG_MIN)) { + /* Overflow occurred. */ + return -ERANGE; + } else if (errno != 0 && val == 0) { + /* Other error occurred. */ + return -errno; + } else if (val < 0) { + /* Input string was negative number. */ + return -ERANGE; + } + + return val; +} + +long long int +spdk_strtoll(const char *nptr, int base) +{ + long long val; + char *endptr; + + /* Since strtoll() can legitimately return 0, LLONG_MAX, or LLONG_MIN + * on both success and failure, the calling program should set errno + * to 0 before the call. + */ + errno = 0; + + val = strtoll(nptr, &endptr, base); + + if (!errno && *endptr != '\0') { + /* Non integer character was found. */ + return -EINVAL; + } else if (errno == ERANGE && (val == LLONG_MAX || val == LLONG_MIN)) { + /* Overflow occurred. */ + return -ERANGE; + } else if (errno != 0 && val == 0) { + /* Other error occurred. */ + return -errno; + } else if (val < 0) { + /* Input string was negative number. */ + return -ERANGE; + } + + return val; +} diff --git a/src/spdk/lib/util/util_internal.h b/src/spdk/lib/util/util_internal.h new file mode 100644 index 000000000..655ef513d --- /dev/null +++ b/src/spdk/lib/util/util_internal.h @@ -0,0 +1,77 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_UTIL_INTERNAL_H +#define SPDK_UTIL_INTERNAL_H + +#include "spdk/stdinc.h" + +/** + * IEEE CRC-32 polynomial (bit reflected) + */ +#define SPDK_CRC32_POLYNOMIAL_REFLECT 0xedb88320UL + +/** + * CRC-32C (Castagnoli) polynomial (bit reflected) + */ +#define SPDK_CRC32C_POLYNOMIAL_REFLECT 0x82f63b78UL + +struct spdk_crc32_table { + uint32_t table[256]; +}; + +/** + * Initialize a CRC32 lookup table for a given polynomial. + * + * \param table Table to fill with precalculated CRC-32 data. + * \param polynomial_reflect Bit-reflected CRC-32 polynomial. + */ +void crc32_table_init(struct spdk_crc32_table *table, + uint32_t polynomial_reflect); + + +/** + * Calculate a partial CRC-32 checksum. + * + * \param table CRC-32 table initialized with crc32_table_init(). + * \param buf Data buffer to checksum. + * \param len Length of buf in bytes. + * \param crc Previous CRC-32 value. + * \return Updated CRC-32 value. + */ +uint32_t crc32_update(const struct spdk_crc32_table *table, + const void *buf, size_t len, + uint32_t crc); + +#endif /* SPDK_UTIL_INTERNAL_H */ diff --git a/src/spdk/lib/util/uuid.c b/src/spdk/lib/util/uuid.c new file mode 100644 index 000000000..176f65880 --- /dev/null +++ b/src/spdk/lib/util/uuid.c @@ -0,0 +1,73 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/uuid.h" + +#include <uuid/uuid.h> + +SPDK_STATIC_ASSERT(sizeof(struct spdk_uuid) == sizeof(uuid_t), "Size mismatch"); + +int +spdk_uuid_parse(struct spdk_uuid *uuid, const char *uuid_str) +{ + return uuid_parse(uuid_str, (void *)uuid) == 0 ? 
0 : -EINVAL; +} + +int +spdk_uuid_fmt_lower(char *uuid_str, size_t uuid_str_size, const struct spdk_uuid *uuid) +{ + if (uuid_str_size < SPDK_UUID_STRING_LEN) { + return -EINVAL; + } + + uuid_unparse_lower((void *)uuid, uuid_str); + return 0; +} + +int +spdk_uuid_compare(const struct spdk_uuid *u1, const struct spdk_uuid *u2) +{ + return uuid_compare((void *)u1, (void *)u2); +} + +void +spdk_uuid_generate(struct spdk_uuid *uuid) +{ + uuid_generate((void *)uuid); +} + +void +spdk_uuid_copy(struct spdk_uuid *dst, const struct spdk_uuid *src) +{ + uuid_copy((void *)dst, (void *)src); +} diff --git a/src/spdk/lib/vhost/Makefile b/src/spdk/lib/vhost/Makefile new file mode 100644 index 000000000..1fe9b6e40 --- /dev/null +++ b/src/spdk/lib/vhost/Makefile @@ -0,0 +1,54 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 4 +SO_MINOR := 0 + +CFLAGS += -I. +CFLAGS += $(ENV_CFLAGS) + +C_SRCS = vhost.c vhost_rpc.c vhost_scsi.c vhost_blk.c rte_vhost_compat.c + +ifeq ($(CONFIG_VHOST_INTERNAL_LIB),y) +C_SRCS += vhost_nvme.c +CFLAGS := -I../rte_vhost $(CFLAGS) +endif + +LIBNAME = vhost + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_vhost.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/vhost/rte_vhost_compat.c b/src/spdk/lib/vhost/rte_vhost_compat.c new file mode 100644 index 000000000..53f31bfd7 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost_compat.c @@ -0,0 +1,402 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
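[Editor's note] uuid.c above is a thin shim over libuuid; the static assert guarantees struct spdk_uuid and uuid_t have the same size, so the casts are safe. A small round-trip check built only from the wrappers shown above; this is an illustrative sketch, not part of the library (link against libuuid and the SPDK util library):

#include <stdio.h>
#include "spdk/uuid.h"

int
main(void)
{
    struct spdk_uuid u1, u2;
    char str[SPDK_UUID_STRING_LEN];

    spdk_uuid_generate(&u1);
    if (spdk_uuid_fmt_lower(str, sizeof(str), &u1) != 0) {
        return 1;
    }
    printf("generated: %s\n", str);

    /* Parsing the lower-case string back must yield an equal UUID. */
    if (spdk_uuid_parse(&u2, str) != 0 || spdk_uuid_compare(&u1, &u2) != 0) {
        return 1;
    }
    return 0;
}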
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * Set of workarounds for rte_vhost to make it work with device types + * other than vhost-net. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" + +#include "spdk_internal/vhost_user.h" + +static inline void +vhost_session_mem_region_calc(uint64_t *previous_start, uint64_t *start, uint64_t *end, + uint64_t *len, struct rte_vhost_mem_region *region) +{ + *start = FLOOR_2MB(region->mmap_addr); + *end = CEIL_2MB(region->mmap_addr + region->mmap_size); + if (*start == *previous_start) { + *start += (size_t) VALUE_2MB; + } + *previous_start = *start; + *len = *end - *start; +} + +void +vhost_session_mem_register(struct rte_vhost_memory *mem) +{ + uint64_t start, end, len; + uint32_t i; + uint64_t previous_start = UINT64_MAX; + + + for (i = 0; i < mem->nregions; i++) { + vhost_session_mem_region_calc(&previous_start, &start, &end, &len, &mem->regions[i]); + SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n", + start, len); + + if (spdk_mem_register((void *)start, len) != 0) { + SPDK_WARNLOG("Failed to register memory region %"PRIu32". 
Future vtophys translation might fail.\n", + i); + continue; + } + } +} + +void +vhost_session_mem_unregister(struct rte_vhost_memory *mem) +{ + uint64_t start, end, len; + uint32_t i; + uint64_t previous_start = UINT64_MAX; + + for (i = 0; i < mem->nregions; i++) { + vhost_session_mem_region_calc(&previous_start, &start, &end, &len, &mem->regions[i]); + if (spdk_vtophys((void *) start, NULL) == SPDK_VTOPHYS_ERROR) { + continue; /* region has not been registered */ + } + + if (spdk_mem_unregister((void *)start, len) != 0) { + assert(false); + } + } +} + +static int +new_connection(int vid) +{ + char ifname[PATH_MAX]; + + if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) { + SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid); + return -1; + } + + return vhost_new_connection_cb(vid, ifname); +} + +static int +start_device(int vid) +{ + return vhost_start_device_cb(vid); +} + +static void +stop_device(int vid) +{ + vhost_stop_device_cb(vid); +} + +static void +destroy_connection(int vid) +{ + vhost_destroy_connection_cb(vid); +} + +static const struct vhost_device_ops g_spdk_vhost_ops = { + .new_device = start_device, + .destroy_device = stop_device, + .new_connection = new_connection, + .destroy_connection = destroy_connection, +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + .get_config = vhost_get_config_cb, + .set_config = vhost_set_config_cb, + .vhost_nvme_admin_passthrough = vhost_nvme_admin_passthrough, + .vhost_nvme_set_cq_call = vhost_nvme_set_cq_call, + .vhost_nvme_get_cap = vhost_nvme_get_cap, + .vhost_nvme_set_bar_mr = vhost_nvme_set_bar_mr, +#endif +}; + +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB + +static enum rte_vhost_msg_result +extern_vhost_pre_msg_handler(int vid, void *_msg) +{ + struct vhost_user_msg *msg = _msg; + struct spdk_vhost_session *vsession; + + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid); + assert(false); + return RTE_VHOST_MSG_RESULT_ERR; + } + + switch (msg->request) { + case VHOST_USER_GET_VRING_BASE: + if (vsession->forced_polling && vsession->started) { + /* Our queue is stopped for whatever reason, but we may still + * need to poll it after it's initialized again. + */ + g_spdk_vhost_ops.destroy_device(vid); + } + break; + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ADDR: + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_KICK: + if (vsession->forced_polling && vsession->started) { + /* Additional queues are being initialized, so we either processed + * enough I/Os and are switching from SeaBIOS to the OS now, or + * we were never in SeaBIOS in the first place. Either way, we + * don't need our workaround anymore. + */ + g_spdk_vhost_ops.destroy_device(vid); + vsession->forced_polling = false; + } + break; + case VHOST_USER_SET_VRING_CALL: + /* rte_vhost will close the previous callfd and won't notify + * us about any change. This will effectively make SPDK fail + * to deliver any subsequent interrupts until a session is + * restarted. We stop the session here before closing the previous + * fd (so that all interrupts must have been delivered by the + * time the descriptor is closed) and start right after (which + * will make SPDK retrieve the latest, up-to-date callfd from + * rte_vhost. + */ + case VHOST_USER_SET_MEM_TABLE: + /* rte_vhost will unmap previous memory that SPDK may still + * have pending DMA operations on. We can't let that happen, + * so stop the device before letting rte_vhost unmap anything. 
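[Editor's note] vhost_session_mem_register()/vhost_session_mem_unregister() above round each guest memory region out to 2 MB boundaries and skip the first hugepage when the previous region already covered it. A standalone sketch of that arithmetic; the 2 MB macros below are local stand-ins mirroring the ones pulled in from spdk/memory.h, and the addresses are made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define VALUE_2MB    (2ULL * 1024 * 1024)
#define FLOOR_2MB(x) ((uint64_t)(x) & ~(VALUE_2MB - 1))
#define CEIL_2MB(x)  FLOOR_2MB((uint64_t)(x) + VALUE_2MB - 1)

int
main(void)
{
    /* Two regions whose mmap ranges begin inside the same 2 MB hugepage. */
    uint64_t mmap_addr[2] = { 0x200000100000ULL, 0x200000180000ULL };
    uint64_t mmap_size[2] = { 0x1000ULL, 0x100000ULL };
    uint64_t previous_start = UINT64_MAX;

    for (int i = 0; i < 2; i++) {
        uint64_t start = FLOOR_2MB(mmap_addr[i]);
        uint64_t end = CEIL_2MB(mmap_addr[i] + mmap_size[i]);

        if (start == previous_start) {
            /* The first page was already registered for region i - 1. */
            start += VALUE_2MB;
        }
        previous_start = start;
        printf("region %d: register 0x%jx len 0x%jx\n",
               i, (uintmax_t)start, (uintmax_t)(end - start));
    }
    return 0;
}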
+ * This will block until all pending I/Os are finished. + * We will start the device again from the post-processing + * message handler. + */ + if (vsession->started) { + g_spdk_vhost_ops.destroy_device(vid); + vsession->needs_restart = true; + } + break; + case VHOST_USER_GET_CONFIG: { + int rc = 0; + + spdk_vhost_lock(); + if (vsession->vdev->backend->vhost_get_config) { + rc = vsession->vdev->backend->vhost_get_config(vsession->vdev, + msg->payload.cfg.region, msg->payload.cfg.size); + if (rc != 0) { + msg->size = 0; + } + } + spdk_vhost_unlock(); + + return RTE_VHOST_MSG_RESULT_REPLY; + } + case VHOST_USER_SET_CONFIG: { + int rc = 0; + + spdk_vhost_lock(); + if (vsession->vdev->backend->vhost_set_config) { + rc = vsession->vdev->backend->vhost_set_config(vsession->vdev, + msg->payload.cfg.region, msg->payload.cfg.offset, + msg->payload.cfg.size, msg->payload.cfg.flags); + } + spdk_vhost_unlock(); + + return rc == 0 ? RTE_VHOST_MSG_RESULT_OK : RTE_VHOST_MSG_RESULT_ERR; + } + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +static enum rte_vhost_msg_result +extern_vhost_post_msg_handler(int vid, void *_msg) +{ + struct vhost_user_msg *msg = _msg; + struct spdk_vhost_session *vsession; + + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid); + assert(false); + return RTE_VHOST_MSG_RESULT_ERR; + } + + if (vsession->needs_restart) { + g_spdk_vhost_ops.new_device(vid); + vsession->needs_restart = false; + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; + } + + switch (msg->request) { + case VHOST_USER_SET_FEATURES: + /* rte_vhost requires all queues to be fully initialized in order + * to start I/O processing. This behavior is not compliant with the + * vhost-user specification and doesn't work with QEMU 2.12+, which + * will only initialize 1 I/O queue for the SeaBIOS boot. + * Theoretically, we should start polling each virtqueue individually + * after receiving its SET_VRING_KICK message, but rte_vhost is not + * designed to poll individual queues. So here we use a workaround + * to detect when the vhost session could be potentially at that SeaBIOS + * stage and we mark it to start polling as soon as its first virtqueue + * gets initialized. This doesn't hurt any non-QEMU vhost slaves + * and allows QEMU 2.12+ to boot correctly. SET_FEATURES could be sent + * at any time, but QEMU will send it at least once on SeaBIOS + * initialization - whenever powered-up or rebooted. + */ + vsession->forced_polling = true; + break; + case VHOST_USER_SET_VRING_KICK: + /* vhost-user spec tells us to start polling a queue after receiving + * its SET_VRING_KICK message. Let's do it! + */ + if (vsession->forced_polling && !vsession->started) { + g_spdk_vhost_ops.new_device(vid); + } + break; + default: + break; + } + + return RTE_VHOST_MSG_RESULT_NOT_HANDLED; +} + +struct rte_vhost_user_extern_ops g_spdk_extern_vhost_ops = { + .pre_msg_handle = extern_vhost_pre_msg_handler, + .post_msg_handle = extern_vhost_post_msg_handler, +}; + +void +vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession) +{ + int rc; + + rc = rte_vhost_extern_callback_register(vsession->vid, &g_spdk_extern_vhost_ops, NULL); + if (rc != 0) { + SPDK_ERRLOG("rte_vhost_extern_callback_register() failed for vid = %d\n", + vsession->vid); + return; + } +} + +#else /* SPDK_CONFIG_VHOST_INTERNAL_LIB */ + +void +vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession) +{ + /* nothing to do. 
all the changes are already incorporated into rte_vhost */ +} + +#endif + +int +vhost_register_unix_socket(const char *path, const char *ctrl_name, + uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features) +{ + struct stat file_stat; +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB + uint64_t features = 0; +#endif + + /* Register vhost driver to handle vhost messages. */ + if (stat(path, &file_stat) != -1) { + if (!S_ISSOCK(file_stat.st_mode)) { + SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " + "The file already exists and is not a socket.\n", + path); + return -EIO; + } else if (unlink(path) != 0) { + SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " + "The socket already exists and failed to unlink.\n", + path); + return -EIO; + } + } + + if (rte_vhost_driver_register(path, 0) != 0) { + SPDK_ERRLOG("Could not register controller %s with vhost library\n", ctrl_name); + SPDK_ERRLOG("Check if domain socket %s already exists\n", path); + return -EIO; + } + if (rte_vhost_driver_set_features(path, virtio_features) || + rte_vhost_driver_disable_features(path, disabled_features)) { + SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", ctrl_name); + + rte_vhost_driver_unregister(path); + return -EIO; + } + + if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) { + rte_vhost_driver_unregister(path); + SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", ctrl_name); + return -EIO; + } + +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB + rte_vhost_driver_get_protocol_features(path, &features); + features |= protocol_features; + rte_vhost_driver_set_protocol_features(path, features); +#endif + + if (rte_vhost_driver_start(path) != 0) { + SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n", + ctrl_name, errno, spdk_strerror(errno)); + rte_vhost_driver_unregister(path); + return -EIO; + } + + return 0; +} + +int +vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + return rte_vhost_get_mem_table(vid, mem); +} + +int +vhost_driver_unregister(const char *path) +{ + return rte_vhost_driver_unregister(path); +} + +int +vhost_get_negotiated_features(int vid, uint64_t *negotiated_features) +{ + return rte_vhost_get_negotiated_features(vid, negotiated_features); +} diff --git a/src/spdk/lib/vhost/spdk_vhost.map b/src/spdk/lib/vhost/spdk_vhost.map new file mode 100644 index 000000000..de38e5a5e --- /dev/null +++ b/src/spdk/lib/vhost/spdk_vhost.map @@ -0,0 +1,27 @@ +{ + global: + + # public functions + spdk_vhost_set_socket_path; + spdk_vhost_init; + spdk_vhost_fini; + spdk_vhost_config_json; + spdk_vhost_shutdown_cb; + spdk_vhost_lock; + spdk_vhost_trylock; + spdk_vhost_unlock; + spdk_vhost_dev_find; + spdk_vhost_dev_next; + spdk_vhost_dev_get_name; + spdk_vhost_dev_get_cpumask; + spdk_vhost_set_coalescing; + spdk_vhost_get_coalescing; + spdk_vhost_scsi_dev_construct; + spdk_vhost_scsi_dev_add_tgt; + spdk_vhost_scsi_dev_get_tgt; + spdk_vhost_scsi_dev_remove_tgt; + spdk_vhost_blk_construct; + spdk_vhost_dev_remove; + + local: *; +}; diff --git a/src/spdk/lib/vhost/vhost.c b/src/spdk/lib/vhost/vhost.c new file mode 100644 index 000000000..b904d8bf9 --- /dev/null +++ b/src/spdk/lib/vhost/vhost.c @@ -0,0 +1,1634 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. 
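[Editor's note] vhost_register_unix_socket() above refuses to reuse an existing path unless it is a stale socket file, which it unlinks before handing the path to rte_vhost_driver_register(). A self-contained sketch of just that stale-socket check, using plain POSIX calls (the helper name is made up for illustration):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

/* Return 0 if 'path' is free to use, negative errno otherwise. */
static int
claim_socket_path(const char *path)
{
    struct stat st;

    if (stat(path, &st) == -1) {
        /* Path does not exist (or is unreachable) - nothing to clean up. */
        return 0;
    }

    if (!S_ISSOCK(st.st_mode)) {
        fprintf(stderr, "%s exists and is not a socket\n", path);
        return -EEXIST;
    }

    if (unlink(path) != 0) {
        fprintf(stderr, "could not unlink stale socket %s: %s\n",
                path, strerror(errno));
        return -errno;
    }

    return 0;
}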
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" + +static struct spdk_cpuset g_vhost_core_mask; + +/* Path to folder where character device will be created. Can be set by user. */ +static char dev_dirname[PATH_MAX] = ""; + +/* Thread performing all vhost management operations */ +static struct spdk_thread *g_vhost_init_thread; + +static spdk_vhost_fini_cb g_fini_cpl_cb; + +/** + * DPDK calls our callbacks synchronously but the work those callbacks + * perform needs to be async. Luckily, all DPDK callbacks are called on + * a DPDK-internal pthread, so we'll just wait on a semaphore in there. + */ +static sem_t g_dpdk_sem; + +/** Return code for the current DPDK callback */ +static int g_dpdk_response; + +struct vhost_session_fn_ctx { + /** Device pointer obtained before enqueuing the event */ + struct spdk_vhost_dev *vdev; + + /** ID of the session to send event to. */ + uint32_t vsession_id; + + /** User provided function to be executed on session's thread. */ + spdk_vhost_session_fn cb_fn; + + /** + * User provided function to be called on the init thread + * after iterating through all sessions. 
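[Editor's note] The g_dpdk_sem/g_dpdk_response pair above bridges rte_vhost's synchronous callbacks and SPDK's asynchronous message passing: the DPDK-internal thread parks on a semaphore until the vhost thread finishes the real work and posts a status (the real code adds a timeout via sem_timedwait() in wait_for_semaphore()). A stripped-down sketch of that handshake using plain pthreads in place of SPDK messaging; all names are illustrative only:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

static sem_t g_done_sem;
static int g_response;

/* Stands in for the work an SPDK vhost thread would do for the callback. */
static void *
vhost_thread_work(void *arg)
{
    (void)arg;
    g_response = 0;          /* report success */
    sem_post(&g_done_sem);   /* wake the blocked "DPDK" thread */
    return NULL;
}

/* Stands in for a synchronous rte_vhost callback such as start_device(). */
static int
dpdk_callback(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, vhost_thread_work, NULL);
    sem_wait(&g_done_sem);   /* real code uses sem_timedwait() plus an error log */
    pthread_join(tid, NULL);
    return g_response;
}

int
main(void)
{
    sem_init(&g_done_sem, 0, 0);
    printf("callback returned %d\n", dpdk_callback());
    sem_destroy(&g_done_sem);
    return 0;
}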
+ */ + spdk_vhost_dev_fn cpl_fn; + + /** Custom user context */ + void *user_ctx; +}; + +static TAILQ_HEAD(, spdk_vhost_dev) g_vhost_devices = TAILQ_HEAD_INITIALIZER( + g_vhost_devices); +static pthread_mutex_t g_vhost_mutex = PTHREAD_MUTEX_INITIALIZER; + +void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len) +{ + void *vva; + uint64_t newlen; + + newlen = len; + vva = (void *)rte_vhost_va_from_guest_pa(vsession->mem, addr, &newlen); + if (newlen != len) { + return NULL; + } + + return vva; + +} + +static void +vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_id) +{ + struct vring_desc *desc, *desc_table; + uint32_t desc_table_size; + int rc; + + if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) { + return; + } + + rc = vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Can't log used ring descriptors!\n"); + return; + } + + do { + if (vhost_vring_desc_is_wr(desc)) { + /* To be honest, only pages realy touched should be logged, but + * doing so would require tracking those changes in each backed. + * Also backend most likely will touch all/most of those pages so + * for lets assume we touched all pages passed to as writeable buffers. */ + rte_vhost_log_write(vsession->vid, desc->addr, desc->len); + } + vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + } while (desc); +} + +static void +vhost_log_used_vring_elem(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t idx) +{ + uint64_t offset, len; + + if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) { + return; + } + + if (spdk_unlikely(virtqueue->packed.packed_ring)) { + offset = idx * sizeof(struct vring_packed_desc); + len = sizeof(struct vring_packed_desc); + } else { + offset = offsetof(struct vring_used, ring[idx]); + len = sizeof(virtqueue->vring.used->ring[idx]); + } + + rte_vhost_log_used_vring(vsession->vid, virtqueue->vring_idx, offset, len); +} + +static void +vhost_log_used_vring_idx(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue) +{ + uint64_t offset, len; + uint16_t vq_idx; + + if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) { + return; + } + + offset = offsetof(struct vring_used, idx); + len = sizeof(virtqueue->vring.used->idx); + vq_idx = virtqueue - vsession->virtqueue; + + rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len); +} + +/* + * Get available requests from avail ring. + */ +uint16_t +vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs, + uint16_t reqs_len) +{ + struct rte_vhost_vring *vring = &virtqueue->vring; + struct vring_avail *avail = vring->avail; + uint16_t size_mask = vring->size - 1; + uint16_t last_idx = virtqueue->last_avail_idx, avail_idx = avail->idx; + uint16_t count, i; + + count = avail_idx - last_idx; + if (spdk_likely(count == 0)) { + return 0; + } + + if (spdk_unlikely(count > vring->size)) { + /* TODO: the queue is unrecoverably broken and should be marked so. + * For now we will fail silently and report there are no new avail entries. 
+ */ + return 0; + } + + count = spdk_min(count, reqs_len); + virtqueue->last_avail_idx += count; + for (i = 0; i < count; i++) { + reqs[i] = vring->avail->ring[(last_idx + i) & size_mask]; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n", + last_idx, avail_idx, count); + + return count; +} + +static bool +vhost_vring_desc_is_indirect(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_INDIRECT); +} + +static bool +vhost_vring_packed_desc_is_indirect(struct vring_packed_desc *cur_desc) +{ + return (cur_desc->flags & VRING_DESC_F_INDIRECT) != 0; +} + +int +vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table, + uint32_t *desc_table_size) +{ + if (spdk_unlikely(req_idx >= virtqueue->vring.size)) { + return -1; + } + + *desc = &virtqueue->vring.desc[req_idx]; + + if (vhost_vring_desc_is_indirect(*desc)) { + *desc_table_size = (*desc)->len / sizeof(**desc); + *desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr, + sizeof(**desc) * *desc_table_size); + *desc = *desc_table; + if (*desc == NULL) { + return -1; + } + + return 0; + } + + *desc_table = virtqueue->vring.desc; + *desc_table_size = virtqueue->vring.size; + + return 0; +} + +int +vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_packed_desc **desc, + struct vring_packed_desc **desc_table, uint32_t *desc_table_size) +{ + *desc = &virtqueue->vring.desc_packed[req_idx]; + + /* In packed ring when the desc is non-indirect we get next desc + * by judging (desc->flag & VRING_DESC_F_NEXT) != 0. When the desc + * is indirect we get next desc by idx and desc_table_size. It's + * different from split ring. 
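[Editor's note] vhost_vq_avail_ring_get() above relies on free-running 16-bit indices: the difference avail->idx - last_avail_idx is taken in uint16_t arithmetic, so it stays correct across wraparound, and slots are addressed with a size - 1 mask because split-ring sizes are powers of two. A tiny standalone demonstration of that index arithmetic (values chosen to straddle the 16-bit wrap):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    const uint16_t ring_size = 256;          /* power of two, as for split rings */
    const uint16_t size_mask = ring_size - 1;

    /* Indices just before and after the 16-bit counter wraps. */
    uint16_t last_avail_idx = 65533;
    uint16_t avail_idx = 4;                  /* driver produced 7 new descriptors */

    uint16_t count = avail_idx - last_avail_idx;    /* wraps correctly to 7 */

    for (uint16_t i = 0; i < count; i++) {
        uint16_t slot = (uint16_t)(last_avail_idx + i) & size_mask;
        printf("fetch avail->ring[%u]\n", slot);
    }
    printf("count=%u\n", count);
    return 0;
}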
+ */ + if (vhost_vring_packed_desc_is_indirect(*desc)) { + *desc_table_size = (*desc)->len / sizeof(struct vring_packed_desc); + *desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr, + (*desc)->len); + *desc = *desc_table; + if (spdk_unlikely(*desc == NULL)) { + return -1; + } + } else { + *desc_table = NULL; + *desc_table_size = 0; + } + + return 0; +} + +int +vhost_vq_used_signal(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue) +{ + if (virtqueue->used_req_cnt == 0) { + return 0; + } + + virtqueue->req_cnt += virtqueue->used_req_cnt; + virtqueue->used_req_cnt = 0; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n", + virtqueue - vsession->virtqueue, virtqueue->last_used_idx); + + if (rte_vhost_vring_call(vsession->vid, virtqueue->vring_idx) == 0) { + /* interrupt signalled */ + return 1; + } else { + /* interrupt not signalled */ + return 0; + } +} + + +static void +check_session_io_stats(struct spdk_vhost_session *vsession, uint64_t now) +{ + struct spdk_vhost_virtqueue *virtqueue; + uint32_t irq_delay_base = vsession->coalescing_delay_time_base; + uint32_t io_threshold = vsession->coalescing_io_rate_threshold; + int32_t irq_delay; + uint32_t req_cnt; + uint16_t q_idx; + + if (now < vsession->next_stats_check_time) { + return; + } + + vsession->next_stats_check_time = now + vsession->stats_check_interval; + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + virtqueue = &vsession->virtqueue[q_idx]; + + req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt; + if (req_cnt <= io_threshold) { + continue; + } + + irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold; + virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay); + + virtqueue->req_cnt = 0; + virtqueue->next_event_time = now; + } +} + +static inline bool +vhost_vq_event_is_suppressed(struct spdk_vhost_virtqueue *vq) +{ + if (spdk_unlikely(vq->packed.packed_ring)) { + if (vq->vring.driver_event->flags & VRING_PACKED_EVENT_FLAG_DISABLE) { + return true; + } + } else { + if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { + return true; + } + } + + return false; +} + +void +vhost_session_used_signal(struct spdk_vhost_session *vsession) +{ + struct spdk_vhost_virtqueue *virtqueue; + uint64_t now; + uint16_t q_idx; + + if (vsession->coalescing_delay_time_base == 0) { + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + virtqueue = &vsession->virtqueue[q_idx]; + + if (virtqueue->vring.desc == NULL) { + continue; + } + + if (vhost_vq_event_is_suppressed(virtqueue)) { + continue; + } + + vhost_vq_used_signal(vsession, virtqueue); + } + } else { + now = spdk_get_ticks(); + check_session_io_stats(vsession, now); + + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + virtqueue = &vsession->virtqueue[q_idx]; + + /* No need for event right now */ + if (now < virtqueue->next_event_time) { + continue; + } + + if (vhost_vq_event_is_suppressed(virtqueue)) { + continue; + } + + if (!vhost_vq_used_signal(vsession, virtqueue)) { + continue; + } + + /* Syscall is quite long so update time */ + now = spdk_get_ticks(); + virtqueue->next_event_time = now + virtqueue->irq_delay_time; + } + } +} + +static int +vhost_session_set_coalescing(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *ctx) +{ + vsession->coalescing_delay_time_base = + vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL; + vsession->coalescing_io_rate_threshold = + vdev->coalescing_iops_threshold * 
SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U; + return 0; +} + +static int +vhost_dev_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us, + uint32_t iops_threshold) +{ + uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL; + uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U; + + if (delay_time_base >= UINT32_MAX) { + SPDK_ERRLOG("Delay time of %"PRIu32" is to big\n", delay_base_us); + return -EINVAL; + } else if (io_rate == 0) { + SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate, + 1000U / SPDK_VHOST_STATS_CHECK_INTERVAL_MS); + return -EINVAL; + } + + vdev->coalescing_delay_us = delay_base_us; + vdev->coalescing_iops_threshold = iops_threshold; + return 0; +} + +int +spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us, + uint32_t iops_threshold) +{ + int rc; + + rc = vhost_dev_set_coalescing(vdev, delay_base_us, iops_threshold); + if (rc != 0) { + return rc; + } + + vhost_dev_foreach_session(vdev, vhost_session_set_coalescing, NULL, NULL); + return 0; +} + +void +spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us, + uint32_t *iops_threshold) +{ + if (delay_base_us) { + *delay_base_us = vdev->coalescing_delay_us; + } + + if (iops_threshold) { + *iops_threshold = vdev->coalescing_iops_threshold; + } +} + +/* + * Enqueue id and len to used ring. + */ +void +vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t id, uint32_t len) +{ + struct rte_vhost_vring *vring = &virtqueue->vring; + struct vring_used *used = vring->used; + uint16_t last_idx = virtqueue->last_used_idx & (vring->size - 1); + uint16_t vq_idx = virtqueue->vring_idx; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n", + virtqueue - vsession->virtqueue, virtqueue->last_used_idx, id, len); + + vhost_log_req_desc(vsession, virtqueue, id); + + virtqueue->last_used_idx++; + used->ring[last_idx].id = id; + used->ring[last_idx].len = len; + + /* Ensure the used ring is updated before we log it or increment used->idx. */ + spdk_smp_wmb(); + + rte_vhost_set_last_inflight_io_split(vsession->vid, vq_idx, id); + + vhost_log_used_vring_elem(vsession, virtqueue, last_idx); + * (volatile uint16_t *) &used->idx = virtqueue->last_used_idx; + vhost_log_used_vring_idx(vsession, virtqueue); + + rte_vhost_clr_inflight_desc_split(vsession->vid, vq_idx, virtqueue->last_used_idx, id); + + virtqueue->used_req_cnt++; +} + +void +vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t num_descs, uint16_t buffer_id, + uint32_t length) +{ + struct vring_packed_desc *desc = &virtqueue->vring.desc_packed[virtqueue->last_used_idx]; + bool used, avail; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - RING: buffer_id=%"PRIu16"\n", + virtqueue - vsession->virtqueue, buffer_id); + + /* When the descriptor is used, two flags in descriptor + * avail flag and used flag are set to equal + * and used flag value == used_wrap_counter. + */ + used = !!(desc->flags & VRING_DESC_F_USED); + avail = !!(desc->flags & VRING_DESC_F_AVAIL); + if (spdk_unlikely(used == virtqueue->packed.used_phase && used == avail)) { + SPDK_ERRLOG("descriptor has been used before\n"); + return; + } + + /* In used desc addr is unused and len specifies the buffer length + * that has been written to by the device. 
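[Editor's note] vhost_vq_used_ring_enqueue() above follows the classic single-producer publish pattern: fill in the used-ring element, issue a write barrier (spdk_smp_wmb()), and only then bump used->idx, so the guest can never observe the new index before the element it guards. A reduced sketch of that ordering, with a C11 release fence standing in for spdk_smp_wmb() and a local ring type made up for the example:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct used_elem { uint32_t id; uint32_t len; };

struct used_ring {
    uint16_t idx;                 /* index published to the consumer */
    struct used_elem ring[256];
};

static void
used_ring_publish(struct used_ring *used, uint16_t *last_used_idx,
                  uint32_t id, uint32_t len)
{
    uint16_t slot = *last_used_idx & 255;

    /* 1. Fill the element while it is still invisible to the consumer. */
    used->ring[slot].id = id;
    used->ring[slot].len = len;

    /* 2. Order the element stores before the index store (spdk_smp_wmb()). */
    atomic_thread_fence(memory_order_release);

    /* 3. Publish: the consumer polls idx and only then reads ring[slot]. */
    *(volatile uint16_t *)&used->idx = ++(*last_used_idx);
}

int
main(void)
{
    struct used_ring used = { 0 };
    uint16_t last_used_idx = 0;

    used_ring_publish(&used, &last_used_idx, 7, 512);
    printf("published idx=%u id=%u len=%u\n", used.idx,
           used.ring[0].id, used.ring[0].len);
    return 0;
}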
+ */ + desc->addr = 0; + desc->len = length; + + /* This bit specifies whether any data has been written by the device */ + if (length != 0) { + desc->flags |= VRING_DESC_F_WRITE; + } + + /* Buffer ID is included in the last descriptor in the list. + * The driver needs to keep track of the size of the list corresponding + * to each buffer ID. + */ + desc->id = buffer_id; + + /* A device MUST NOT make the descriptor used before buffer_id is + * written to the descriptor. + */ + spdk_smp_wmb(); + /* To mark a desc as used, the device sets the F_USED bit in flags to match + * the internal Device ring wrap counter. It also sets the F_AVAIL bit to + * match the same value. + */ + if (virtqueue->packed.used_phase) { + desc->flags |= VRING_DESC_F_AVAIL_USED; + } else { + desc->flags &= ~VRING_DESC_F_AVAIL_USED; + } + + vhost_log_used_vring_elem(vsession, virtqueue, virtqueue->last_used_idx); + virtqueue->last_used_idx += num_descs; + if (virtqueue->last_used_idx >= virtqueue->vring.size) { + virtqueue->last_used_idx -= virtqueue->vring.size; + virtqueue->packed.used_phase = !virtqueue->packed.used_phase; + } + + virtqueue->used_req_cnt++; +} + +bool +vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue) +{ + uint16_t flags = virtqueue->vring.desc_packed[virtqueue->last_avail_idx].flags; + + /* To mark a desc as available, the driver sets the F_AVAIL bit in flags + * to match the internal avail wrap counter. It also sets the F_USED bit to + * match the inverse value but it's not mandatory. + */ + return (!!(flags & VRING_DESC_F_AVAIL) == virtqueue->packed.avail_phase); +} + +bool +vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc) +{ + return (cur_desc->flags & VRING_DESC_F_WRITE) != 0; +} + +int +vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx, + struct spdk_vhost_virtqueue *vq, + struct vring_packed_desc *desc_table, + uint32_t desc_table_size) +{ + if (desc_table != NULL) { + /* When the desc_table isn't NULL means it's indirect and we get the next + * desc by req_idx and desc_table_size. The return value is NULL means + * we reach the last desc of this request. + */ + (*req_idx)++; + if (*req_idx < desc_table_size) { + *desc = &desc_table[*req_idx]; + } else { + *desc = NULL; + } + } else { + /* When the desc_table is NULL means it's non-indirect and we get the next + * desc by req_idx and F_NEXT in flags. The return value is NULL means + * we reach the last desc of this request. When return new desc + * we update the req_idx too. 
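[Editor's note] A packed ring has no avail index at all: vhost_vq_packed_ring_is_avail() above decides whether the descriptor at last_avail_idx is new by comparing its F_AVAIL flag against the device's avail wrap counter (phase). A minimal sketch of that check; the flag bit positions below follow the virtio 1.1 packed-ring layout and are defined locally for this sketch only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DESC_F_AVAIL (1u << 7)    /* VIRTQ_DESC_F_AVAIL */
#define DESC_F_USED  (1u << 15)   /* VIRTQ_DESC_F_USED */

/* A descriptor is available when its AVAIL bit matches the device's avail
 * phase; the driver flips the phase each time it wraps around the ring. */
static bool
packed_desc_is_avail(uint16_t flags, bool avail_phase)
{
    return !!(flags & DESC_F_AVAIL) == avail_phase;
}

int
main(void)
{
    bool phase = true;    /* initial device-side avail wrap counter */

    printf("%d\n", packed_desc_is_avail(DESC_F_AVAIL, phase));               /* 1: new */
    printf("%d\n", packed_desc_is_avail(DESC_F_AVAIL | DESC_F_USED, phase)); /* 1: USED not checked here */
    printf("%d\n", packed_desc_is_avail(0, phase));                          /* 0: not yet written */
    return 0;
}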
+ */ + if (((*desc)->flags & VRING_DESC_F_NEXT) == 0) { + *desc = NULL; + return 0; + } + + *req_idx = (*req_idx + 1) % vq->vring.size; + *desc = &vq->vring.desc_packed[*req_idx]; + } + + return 0; +} + +static int +vhost_vring_desc_payload_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, uintptr_t payload, uint64_t remaining) +{ + uintptr_t vva; + uint64_t len; + + do { + if (*iov_index >= SPDK_VHOST_IOVS_MAX) { + SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX); + return -1; + } + len = remaining; + vva = (uintptr_t)rte_vhost_va_from_guest_pa(vsession->mem, payload, &len); + if (vva == 0 || len == 0) { + SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload); + return -1; + } + iov[*iov_index].iov_base = (void *)vva; + iov[*iov_index].iov_len = len; + remaining -= len; + payload += len; + (*iov_index)++; + } while (remaining); + + return 0; +} + +int +vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, const struct vring_packed_desc *desc) +{ + return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index, + desc->addr, desc->len); +} + +/* 1, Traverse the desc chain to get the buffer_id and return buffer_id as task_idx. + * 2, Update the vq->last_avail_idx to point next available desc chain. + * 3, Update the avail_wrap_counter if last_avail_idx overturn. + */ +uint16_t +vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx, + uint16_t *num_descs) +{ + struct vring_packed_desc *desc; + uint16_t desc_head = req_idx; + + *num_descs = 1; + + desc = &vq->vring.desc_packed[req_idx]; + if (!vhost_vring_packed_desc_is_indirect(desc)) { + while ((desc->flags & VRING_DESC_F_NEXT) != 0) { + req_idx = (req_idx + 1) % vq->vring.size; + desc = &vq->vring.desc_packed[req_idx]; + (*num_descs)++; + } + } + + /* Queue Size doesn't have to be a power of 2 + * Device maintains last_avail_idx so we can make sure + * the value is valid(0 ~ vring.size - 1) + */ + vq->last_avail_idx = (req_idx + 1) % vq->vring.size; + if (vq->last_avail_idx < desc_head) { + vq->packed.avail_phase = !vq->packed.avail_phase; + } + + return desc->id; +} + +int +vhost_vring_desc_get_next(struct vring_desc **desc, + struct vring_desc *desc_table, uint32_t desc_table_size) +{ + struct vring_desc *old_desc = *desc; + uint16_t next_idx; + + if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) { + *desc = NULL; + return 0; + } + + next_idx = old_desc->next; + if (spdk_unlikely(next_idx >= desc_table_size)) { + *desc = NULL; + return -1; + } + + *desc = &desc_table[next_idx]; + return 0; +} + +int +vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, const struct vring_desc *desc) +{ + return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index, + desc->addr, desc->len); +} + +static struct spdk_vhost_session * +vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id) +{ + struct spdk_vhost_session *vsession; + + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->id == id) { + return vsession; + } + } + + return NULL; +} + +struct spdk_vhost_session * +vhost_session_find_by_vid(int vid) +{ + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + + TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) { + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->vid == vid) { + return vsession; + } + } + } + + return NULL; +} + +struct spdk_vhost_dev * +spdk_vhost_dev_next(struct spdk_vhost_dev 
*vdev) +{ + if (vdev == NULL) { + return TAILQ_FIRST(&g_vhost_devices); + } + + return TAILQ_NEXT(vdev, tailq); +} + +struct spdk_vhost_dev * +spdk_vhost_dev_find(const char *ctrlr_name) +{ + struct spdk_vhost_dev *vdev; + size_t dev_dirname_len = strlen(dev_dirname); + + if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) { + ctrlr_name += dev_dirname_len; + } + + TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) { + if (strcmp(vdev->name, ctrlr_name) == 0) { + return vdev; + } + } + + return NULL; +} + +static int +vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask) +{ + int rc; + + if (cpumask == NULL) { + return -1; + } + + if (mask == NULL) { + spdk_cpuset_copy(cpumask, &g_vhost_core_mask); + return 0; + } + + rc = spdk_cpuset_parse(cpumask, mask); + if (rc < 0) { + SPDK_ERRLOG("invalid cpumask %s\n", mask); + return -1; + } + + spdk_cpuset_and(cpumask, &g_vhost_core_mask); + + if (spdk_cpuset_count(cpumask) == 0) { + SPDK_ERRLOG("no cpu is selected among core mask(=%s)\n", + spdk_cpuset_fmt(&g_vhost_core_mask)); + return -1; + } + + return 0; +} + +static void +vhost_setup_core_mask(void *ctx) +{ + struct spdk_thread *thread = spdk_get_thread(); + spdk_cpuset_or(&g_vhost_core_mask, spdk_thread_get_cpumask(thread)); +} + +static void +vhost_setup_core_mask_done(void *ctx) +{ + spdk_vhost_init_cb init_cb = ctx; + + if (spdk_cpuset_count(&g_vhost_core_mask) == 0) { + init_cb(-ECHILD); + return; + } + + init_cb(0); +} + +static void +vhost_dev_thread_exit(void *arg1) +{ + spdk_thread_exit(spdk_get_thread()); +} + +int +vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str, + const struct spdk_vhost_dev_backend *backend) +{ + char path[PATH_MAX]; + struct spdk_cpuset cpumask = {}; + int rc; + + assert(vdev); + if (name == NULL) { + SPDK_ERRLOG("Can't register controller with no name\n"); + return -EINVAL; + } + + if (vhost_parse_core_mask(mask_str, &cpumask) != 0) { + SPDK_ERRLOG("cpumask %s is invalid (core mask is 0x%s)\n", + mask_str, spdk_cpuset_fmt(&g_vhost_core_mask)); + return -EINVAL; + } + + if (spdk_vhost_dev_find(name)) { + SPDK_ERRLOG("vhost controller %s already exists.\n", name); + return -EEXIST; + } + + if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) { + SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname, + name); + return -EINVAL; + } + + vdev->name = strdup(name); + vdev->path = strdup(path); + if (vdev->name == NULL || vdev->path == NULL) { + rc = -EIO; + goto out; + } + + vdev->thread = spdk_thread_create(vdev->name, &cpumask); + if (vdev->thread == NULL) { + SPDK_ERRLOG("Failed to create thread for vhost controller %s.\n", name); + rc = -EIO; + goto out; + } + + vdev->registered = true; + vdev->backend = backend; + TAILQ_INIT(&vdev->vsessions); + + vhost_dev_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US, + SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD); + + if (vhost_register_unix_socket(path, name, vdev->virtio_features, vdev->disabled_features, + vdev->protocol_features)) { + spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL); + rc = -EIO; + goto out; + } + + TAILQ_INSERT_TAIL(&g_vhost_devices, vdev, tailq); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name); + return 0; + +out: + free(vdev->name); + free(vdev->path); + return rc; +} + +int +vhost_dev_unregister(struct spdk_vhost_dev *vdev) +{ + if (!TAILQ_EMPTY(&vdev->vsessions)) { + SPDK_ERRLOG("Controller %s has still valid 
connection.\n", vdev->name); + return -EBUSY; + } + + if (vdev->registered && vhost_driver_unregister(vdev->path) != 0) { + SPDK_ERRLOG("Could not unregister controller %s with vhost library\n" + "Check if domain socket %s still exists\n", + vdev->name, vdev->path); + return -EIO; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name); + + spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL); + + free(vdev->name); + free(vdev->path); + TAILQ_REMOVE(&g_vhost_devices, vdev, tailq); + return 0; +} + +const char * +spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev) +{ + assert(vdev != NULL); + return vdev->name; +} + +const struct spdk_cpuset * +spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev) +{ + assert(vdev != NULL); + return spdk_thread_get_cpumask(vdev->thread); +} + +static void +wait_for_semaphore(int timeout_sec, const char *errmsg) +{ + struct timespec timeout; + int rc; + + clock_gettime(CLOCK_REALTIME, &timeout); + timeout.tv_sec += timeout_sec; + rc = sem_timedwait(&g_dpdk_sem, &timeout); + if (rc != 0) { + SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg); + sem_wait(&g_dpdk_sem); + } +} + +static void +vhost_session_cb_done(int rc) +{ + g_dpdk_response = rc; + sem_post(&g_dpdk_sem); +} + +void +vhost_session_start_done(struct spdk_vhost_session *vsession, int response) +{ + if (response == 0) { + vsession->started = true; + + assert(vsession->vdev->active_session_num < UINT32_MAX); + vsession->vdev->active_session_num++; + } + + vhost_session_cb_done(response); +} + +void +vhost_session_stop_done(struct spdk_vhost_session *vsession, int response) +{ + if (response == 0) { + vsession->started = false; + + assert(vsession->vdev->active_session_num > 0); + vsession->vdev->active_session_num--; + } + + vhost_session_cb_done(response); +} + +static void +vhost_event_cb(void *arg1) +{ + struct vhost_session_fn_ctx *ctx = arg1; + struct spdk_vhost_session *vsession; + + if (pthread_mutex_trylock(&g_vhost_mutex) != 0) { + spdk_thread_send_msg(spdk_get_thread(), vhost_event_cb, arg1); + return; + } + + vsession = vhost_session_find_by_id(ctx->vdev, ctx->vsession_id); + ctx->cb_fn(ctx->vdev, vsession, NULL); + pthread_mutex_unlock(&g_vhost_mutex); +} + +int +vhost_session_send_event(struct spdk_vhost_session *vsession, + spdk_vhost_session_fn cb_fn, unsigned timeout_sec, + const char *errmsg) +{ + struct vhost_session_fn_ctx ev_ctx = {0}; + struct spdk_vhost_dev *vdev = vsession->vdev; + + ev_ctx.vdev = vdev; + ev_ctx.vsession_id = vsession->id; + ev_ctx.cb_fn = cb_fn; + + spdk_thread_send_msg(vdev->thread, vhost_event_cb, &ev_ctx); + + pthread_mutex_unlock(&g_vhost_mutex); + wait_for_semaphore(timeout_sec, errmsg); + pthread_mutex_lock(&g_vhost_mutex); + + return g_dpdk_response; +} + +static void +foreach_session_finish_cb(void *arg1) +{ + struct vhost_session_fn_ctx *ev_ctx = arg1; + struct spdk_vhost_dev *vdev = ev_ctx->vdev; + + if (pthread_mutex_trylock(&g_vhost_mutex) != 0) { + spdk_thread_send_msg(spdk_get_thread(), + foreach_session_finish_cb, arg1); + return; + } + + assert(vdev->pending_async_op_num > 0); + vdev->pending_async_op_num--; + if (ev_ctx->cpl_fn != NULL) { + ev_ctx->cpl_fn(vdev, ev_ctx->user_ctx); + } + + pthread_mutex_unlock(&g_vhost_mutex); + free(ev_ctx); +} + +static void +foreach_session(void *arg1) +{ + struct vhost_session_fn_ctx *ev_ctx = arg1; + struct spdk_vhost_session *vsession; + struct spdk_vhost_dev *vdev = ev_ctx->vdev; + int rc; + + if (pthread_mutex_trylock(&g_vhost_mutex) != 0) { + 
spdk_thread_send_msg(spdk_get_thread(), foreach_session, arg1); + return; + } + + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->initialized) { + rc = ev_ctx->cb_fn(vdev, vsession, ev_ctx->user_ctx); + if (rc < 0) { + goto out; + } + } + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + + spdk_thread_send_msg(g_vhost_init_thread, foreach_session_finish_cb, arg1); +} + +void +vhost_dev_foreach_session(struct spdk_vhost_dev *vdev, + spdk_vhost_session_fn fn, + spdk_vhost_dev_fn cpl_fn, + void *arg) +{ + struct vhost_session_fn_ctx *ev_ctx; + + ev_ctx = calloc(1, sizeof(*ev_ctx)); + if (ev_ctx == NULL) { + SPDK_ERRLOG("Failed to alloc vhost event.\n"); + assert(false); + return; + } + + ev_ctx->vdev = vdev; + ev_ctx->cb_fn = fn; + ev_ctx->cpl_fn = cpl_fn; + ev_ctx->user_ctx = arg; + + assert(vdev->pending_async_op_num < UINT32_MAX); + vdev->pending_async_op_num++; + + spdk_thread_send_msg(vdev->thread, foreach_session, ev_ctx); +} + +static int +_stop_session(struct spdk_vhost_session *vsession) +{ + struct spdk_vhost_dev *vdev = vsession->vdev; + struct spdk_vhost_virtqueue *q; + int rc; + uint16_t i; + + rc = vdev->backend->stop_session(vsession); + if (rc != 0) { + SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid); + pthread_mutex_unlock(&g_vhost_mutex); + return rc; + } + + for (i = 0; i < vsession->max_queues; i++) { + q = &vsession->virtqueue[i]; + + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (q->vring.desc == NULL) { + continue; + } + + /* Packed virtqueues support up to 2^15 entries each + * so left one bit can be used as wrap counter. + */ + if (q->packed.packed_ring) { + q->last_avail_idx = q->last_avail_idx | + ((uint16_t)q->packed.avail_phase << 15); + q->last_used_idx = q->last_used_idx | + ((uint16_t)q->packed.used_phase << 15); + } + + rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx); + } + + vhost_session_mem_unregister(vsession->mem); + free(vsession->mem); + + return 0; +} + +int +vhost_stop_device_cb(int vid) +{ + struct spdk_vhost_session *vsession; + int rc; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + pthread_mutex_unlock(&g_vhost_mutex); + return -EINVAL; + } + + if (!vsession->started) { + /* already stopped, nothing to do */ + pthread_mutex_unlock(&g_vhost_mutex); + return -EALREADY; + } + + rc = _stop_session(vsession); + pthread_mutex_unlock(&g_vhost_mutex); + + return rc; +} + +int +vhost_start_device_cb(int vid) +{ + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + int rc = -1; + uint16_t i; + bool packed_ring; + + pthread_mutex_lock(&g_vhost_mutex); + + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + goto out; + } + + vdev = vsession->vdev; + if (vsession->started) { + /* already started, nothing to do */ + rc = 0; + goto out; + } + + if (vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid); + goto out; + } + + packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0); + + vsession->max_queues = 0; + memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue)); + for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) { + struct spdk_vhost_virtqueue *q = 
&vsession->virtqueue[i]; + + q->vring_idx = -1; + if (rte_vhost_get_vhost_vring(vid, i, &q->vring)) { + continue; + } + q->vring_idx = i; + rte_vhost_get_vhost_ring_inflight(vid, i, &q->vring_inflight); + + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (q->vring.desc == NULL || q->vring.size == 0) { + continue; + } + + if (rte_vhost_get_vring_base(vsession->vid, i, &q->last_avail_idx, &q->last_used_idx)) { + q->vring.desc = NULL; + continue; + } + + if (packed_ring) { + /* Packed virtqueues support up to 2^15 entries each + * so left one bit can be used as wrap counter. + */ + q->packed.avail_phase = q->last_avail_idx >> 15; + q->last_avail_idx = q->last_avail_idx & 0x7FFF; + q->packed.used_phase = q->last_used_idx >> 15; + q->last_used_idx = q->last_used_idx & 0x7FFF; + + /* Disable I/O submission notifications, we'll be polling. */ + q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE; + } else { + /* Disable I/O submission notifications, we'll be polling. */ + q->vring.used->flags = VRING_USED_F_NO_NOTIFY; + } + + q->packed.packed_ring = packed_ring; + vsession->max_queues = i + 1; + } + + if (vhost_get_mem_table(vid, &vsession->mem) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid); + goto out; + } + + /* + * Not sure right now but this look like some kind of QEMU bug and guest IO + * might be frozed without kicking all queues after live-migration. This look like + * the previous vhost instance failed to effectively deliver all interrupts before + * the GET_VRING_BASE message. This shouldn't harm guest since spurious interrupts + * should be ignored by guest virtio driver. + * + * Tested on QEMU 2.10.91 and 2.11.50. + */ + for (i = 0; i < vsession->max_queues; i++) { + struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i]; + + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. 
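[Editor's note] Because packed virtqueues hold at most 2^15 entries, _stop_session() and vhost_start_device_cb() above smuggle the wrap counters through rte_vhost_set_vring_base()/rte_vhost_get_vring_base() in bit 15 of last_avail_idx and last_used_idx. A tiny round-trip sketch of that encoding, not part of the patch:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint16_t last_avail_idx = 0x1234;   /* < 2^15: a real ring position */
    bool avail_phase = true;            /* wrap counter to preserve */

    /* Stop path: fold the phase into bit 15 before saving the vring base. */
    uint16_t saved = last_avail_idx | ((uint16_t)avail_phase << 15);

    /* Start path: split the saved value back into index and phase. */
    bool restored_phase = saved >> 15;
    uint16_t restored_idx = saved & 0x7FFF;

    assert(restored_phase == avail_phase && restored_idx == last_avail_idx);
    printf("saved=0x%04x -> idx=0x%04x phase=%d\n", saved, restored_idx, restored_phase);
    return 0;
}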
+ */ + if (q->vring.desc != NULL && q->vring.size > 0) { + rte_vhost_vring_call(vsession->vid, q->vring_idx); + } + } + + vhost_session_set_coalescing(vdev, vsession, NULL); + vhost_session_mem_register(vsession->mem); + vsession->initialized = true; + rc = vdev->backend->start_session(vsession); + if (rc != 0) { + vhost_session_mem_unregister(vsession->mem); + free(vsession->mem); + goto out; + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + return rc; +} + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB +int +vhost_get_config_cb(int vid, uint8_t *config, uint32_t len) +{ + struct spdk_vhost_session *vsession; + struct spdk_vhost_dev *vdev; + int rc = -1; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + goto out; + } + + vdev = vsession->vdev; + if (vdev->backend->vhost_get_config) { + rc = vdev->backend->vhost_get_config(vdev, config, len); + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + return rc; +} + +int +vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags) +{ + struct spdk_vhost_session *vsession; + struct spdk_vhost_dev *vdev; + int rc = -1; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + goto out; + } + + vdev = vsession->vdev; + if (vdev->backend->vhost_set_config) { + rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags); + } + +out: + pthread_mutex_unlock(&g_vhost_mutex); + return rc; +} +#endif + +int +spdk_vhost_set_socket_path(const char *basename) +{ + int ret; + + if (basename && strlen(basename) > 0) { + ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename); + if (ret <= 0) { + return -EINVAL; + } + if ((size_t)ret >= sizeof(dev_dirname) - 2) { + SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret); + return -EINVAL; + } + + if (dev_dirname[ret - 1] != '/') { + dev_dirname[ret] = '/'; + dev_dirname[ret + 1] = '\0'; + } + } + + return 0; +} + +void +vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + assert(vdev->backend->dump_info_json != NULL); + vdev->backend->dump_info_json(vdev, w); +} + +int +spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev) +{ + if (vdev->pending_async_op_num) { + return -EBUSY; + } + + return vdev->backend->remove_device(vdev); +} + +int +vhost_new_connection_cb(int vid, const char *ifname) +{ + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + + pthread_mutex_lock(&g_vhost_mutex); + + vdev = spdk_vhost_dev_find(ifname); + if (vdev == NULL) { + SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid); + pthread_mutex_unlock(&g_vhost_mutex); + return -1; + } + + /* We expect sessions inside vdev->vsessions to be sorted in ascending + * order in regard of vsession->id. For now we always set id = vsessions_cnt++ + * and append each session to the very end of the vsessions list. + * This is required for spdk_vhost_dev_foreach_session() to work. 
+ */ + if (vdev->vsessions_num == UINT_MAX) { + assert(false); + return -EINVAL; + } + + if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, sizeof(*vsession) + + vdev->backend->session_ctx_size)) { + SPDK_ERRLOG("vsession alloc failed\n"); + pthread_mutex_unlock(&g_vhost_mutex); + return -1; + } + memset(vsession, 0, sizeof(*vsession) + vdev->backend->session_ctx_size); + + vsession->vdev = vdev; + vsession->vid = vid; + vsession->id = vdev->vsessions_num++; + vsession->name = spdk_sprintf_alloc("%ss%u", vdev->name, vsession->vid); + if (vsession->name == NULL) { + SPDK_ERRLOG("vsession alloc failed\n"); + pthread_mutex_unlock(&g_vhost_mutex); + free(vsession); + return -1; + } + vsession->started = false; + vsession->initialized = false; + vsession->next_stats_check_time = 0; + vsession->stats_check_interval = SPDK_VHOST_STATS_CHECK_INTERVAL_MS * + spdk_get_ticks_hz() / 1000UL; + TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq); + + vhost_session_install_rte_compat_hooks(vsession); + pthread_mutex_unlock(&g_vhost_mutex); + return 0; +} + +int +vhost_destroy_connection_cb(int vid) +{ + struct spdk_vhost_session *vsession; + int rc = 0; + + pthread_mutex_lock(&g_vhost_mutex); + vsession = vhost_session_find_by_vid(vid); + if (vsession == NULL) { + SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid); + pthread_mutex_unlock(&g_vhost_mutex); + return -EINVAL; + } + + if (vsession->started) { + rc = _stop_session(vsession); + } + + TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq); + free(vsession->name); + free(vsession); + pthread_mutex_unlock(&g_vhost_mutex); + + return rc; +} + +void +spdk_vhost_lock(void) +{ + pthread_mutex_lock(&g_vhost_mutex); +} + +int +spdk_vhost_trylock(void) +{ + return -pthread_mutex_trylock(&g_vhost_mutex); +} + +void +spdk_vhost_unlock(void) +{ + pthread_mutex_unlock(&g_vhost_mutex); +} + +void +spdk_vhost_init(spdk_vhost_init_cb init_cb) +{ + size_t len; + int ret; + + g_vhost_init_thread = spdk_get_thread(); + assert(g_vhost_init_thread != NULL); + + if (dev_dirname[0] == '\0') { + if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) { + SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno)); + ret = -1; + goto out; + } + + len = strlen(dev_dirname); + if (dev_dirname[len - 1] != '/') { + dev_dirname[len] = '/'; + dev_dirname[len + 1] = '\0'; + } + } + + ret = sem_init(&g_dpdk_sem, 0, 0); + if (ret != 0) { + SPDK_ERRLOG("Failed to initialize semaphore for rte_vhost pthread.\n"); + ret = -1; + goto out; + } + + ret = vhost_scsi_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost controllers\n"); + goto out; + } + + ret = vhost_blk_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost block controllers\n"); + goto out; + } + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + ret = vhost_nvme_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n"); + goto out; + } +#endif + + spdk_cpuset_zero(&g_vhost_core_mask); + + /* iterate threads instead of using SPDK_ENV_FOREACH_CORE to ensure that threads are really + * created. 
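[Editor's note] vhost_new_connection_cb() above allocates each session as one cache-line-aligned block whose tail is the backend's private context (session_ctx_size). A generic sketch of that allocation pattern; the 64-byte alignment stands in for SPDK_CACHE_LINE_SIZE and the struct/function names are made up for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct session_base {
    int vid;
    unsigned id;
    /* backend-specific context follows immediately after this struct */
};

static struct session_base *
session_alloc(size_t backend_ctx_size)
{
    struct session_base *s;

    /* One aligned allocation covering the base struct plus the backend's
     * private area, mirroring posix_memalign() + memset() above. */
    if (posix_memalign((void **)&s, 64, sizeof(*s) + backend_ctx_size) != 0) {
        return NULL;
    }
    memset(s, 0, sizeof(*s) + backend_ctx_size);
    return s;
}

int
main(void)
{
    struct session_base *s = session_alloc(128);

    if (s == NULL) {
        return 1;
    }
    void *backend_ctx = s + 1;    /* the trailing private area */
    printf("session %p, backend ctx %p\n", (void *)s, backend_ctx);
    free(s);
    return 0;
}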
+ */ + spdk_for_each_thread(vhost_setup_core_mask, init_cb, vhost_setup_core_mask_done); + return; +out: + init_cb(ret); +} + +static void +vhost_fini(void *arg1) +{ + struct spdk_vhost_dev *vdev, *tmp; + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_next(NULL); + while (vdev != NULL) { + tmp = spdk_vhost_dev_next(vdev); + spdk_vhost_dev_remove(vdev); + /* don't care if it fails, there's nothing we can do for now */ + vdev = tmp; + } + spdk_vhost_unlock(); + + spdk_cpuset_zero(&g_vhost_core_mask); + + /* All devices are removed now. */ + sem_destroy(&g_dpdk_sem); + + g_fini_cpl_cb(); +} + +static void * +session_shutdown(void *arg) +{ + struct spdk_vhost_dev *vdev = NULL; + + TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) { + vhost_driver_unregister(vdev->path); + vdev->registered = false; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n"); + spdk_thread_send_msg(g_vhost_init_thread, vhost_fini, NULL); + return NULL; +} + +void +spdk_vhost_fini(spdk_vhost_fini_cb fini_cb) +{ + pthread_t tid; + int rc; + + assert(spdk_get_thread() == g_vhost_init_thread); + g_fini_cpl_cb = fini_cb; + + /* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK + * ops for stopping a device or removing a connection, we need to call it from + * a separate thread to avoid deadlock. + */ + rc = pthread_create(&tid, NULL, &session_shutdown, NULL); + if (rc < 0) { + SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc)); + abort(); + } + pthread_detach(tid); +} + +void +spdk_vhost_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_dev *vdev; + uint32_t delay_base_us; + uint32_t iops_threshold; + + spdk_json_write_array_begin(w); + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_next(NULL); + while (vdev != NULL) { + vdev->backend->write_config_json(vdev, w); + + spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold); + if (delay_base_us) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_controller_set_coalescing"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us); + spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + vdev = spdk_vhost_dev_next(vdev); + } + spdk_vhost_unlock(); + + spdk_json_write_array_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST) +SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING) diff --git a/src/spdk/lib/vhost/vhost_blk.c b/src/spdk/lib/vhost/vhost_blk.c new file mode 100644 index 000000000..d387cb27d --- /dev/null +++ b/src/spdk/lib/vhost/vhost_blk.c @@ -0,0 +1,1354 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/virtio_blk.h> + +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/bdev_module.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/vhost.h" + +#include "vhost_internal.h" +#include <rte_version.h> + +/* Minimal set of features supported by every SPDK VHOST-BLK device */ +#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \ + (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ + (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ + (1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \ + (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ + (1ULL << VIRTIO_BLK_F_MQ)) + +/* Not supported features */ +#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ + (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ + (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI)) + +/* Vhost-blk support protocol features */ +#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB +#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \ + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) +#else +#define SPDK_VHOST_BLK_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_CONFIG) +#endif + +struct spdk_vhost_blk_task { + struct spdk_bdev_io *bdev_io; + struct spdk_vhost_blk_session *bvsession; + struct spdk_vhost_virtqueue *vq; + + volatile uint8_t *status; + + uint16_t req_idx; + uint16_t num_descs; + uint16_t buffer_id; + + /* for io wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + + /* If set, the task is currently used for I/O processing. */ + bool used; + + /** Number of bytes that were written. 
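+	 * This value is reported back to the guest as the length of the used
+	 * element when the completed task is enqueued on the used ring.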
*/ + uint32_t used_len; + uint16_t iovcnt; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; +}; + +struct spdk_vhost_blk_dev { + struct spdk_vhost_dev vdev; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *bdev_desc; + /* dummy_io_channel is used to hold a bdev reference */ + struct spdk_io_channel *dummy_io_channel; + bool readonly; +}; + +struct spdk_vhost_blk_session { + /* The parent session must be the very first field in this struct */ + struct spdk_vhost_session vsession; + struct spdk_vhost_blk_dev *bvdev; + struct spdk_poller *requestq_poller; + struct spdk_io_channel *io_channel; + struct spdk_poller *stop_poller; +}; + +/* forward declaration */ +static const struct spdk_vhost_dev_backend vhost_blk_device_backend; + +static int +process_blk_request(struct spdk_vhost_blk_task *task, + struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq); + +static void +blk_task_finish(struct spdk_vhost_blk_task *task) +{ + assert(task->bvsession->vsession.task_cnt > 0); + task->bvsession->vsession.task_cnt--; + task->used = false; +} + +static void +blk_task_init(struct spdk_vhost_blk_task *task) +{ + task->used = true; + task->iovcnt = SPDK_COUNTOF(task->iovs); + task->status = NULL; + task->used_len = 0; +} + +static void +blk_task_enqueue(struct spdk_vhost_blk_task *task) +{ + if (task->vq->packed.packed_ring) { + vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq, + task->num_descs, + task->buffer_id, task->used_len); + } else { + vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq, + task->req_idx, task->used_len); + } +} + +static void +invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status) +{ + if (task->status) { + *task->status = status; + } + + blk_task_enqueue(task); + blk_task_finish(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status); +} + +/* + * Process task's descriptor chain and setup data related fields. + * Return + * total size of suplied buffers + * + * FIXME: Make this function return to rd_cnt and wr_cnt + */ +static int +blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_dev *vdev = vsession->vdev; + struct vring_desc *desc, *desc_table; + uint16_t out_cnt = 0, cnt = 0; + uint32_t desc_table_size, len = 0; + uint32_t desc_handled_cnt; + int rc; + + rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size); + if (rc != 0) { + SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); + return -1; + } + + desc_handled_cnt = 0; + while (1) { + /* + * Maximum cnt reached? + * Should not happen if request is well formatted, otherwise this is a BUG. 
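+		 * (The capacity checked here is the caller-supplied *iovs_cnt; for a
+		 * vhost-blk task it is SPDK_COUNTOF(task->iovs), i.e. SPDK_VHOST_IOVS_MAX,
+		 * set in blk_task_init(). On success, *iovs_cnt is overwritten with the
+		 * number of iovecs actually filled in.)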
+ */ + if (spdk_unlikely(cnt == *iovs_cnt)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n", + vsession->name, req_idx); + return -1; + } + + if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", + vsession->name, req_idx, cnt); + return -1; + } + + len += desc->len; + + out_cnt += vhost_vring_desc_is_wr(desc); + + rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + if (rc != 0) { + SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n", + vsession->name, req_idx); + return -1; + } else if (desc == NULL) { + break; + } + + desc_handled_cnt++; + if (spdk_unlikely(desc_handled_cnt > desc_table_size)) { + /* Break a cycle and report an error, if any. */ + SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n", + vsession->name, desc_table_size, desc_handled_cnt); + return -1; + } + } + + /* + * There must be least two descriptors. + * First contain request so it must be readable. + * Last descriptor contain buffer for response so it must be writable. + */ + if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { + return -1; + } + + *length = len; + *iovs_cnt = cnt; + return 0; +} + +static int +blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_dev *vdev = vsession->vdev; + struct vring_packed_desc *desc = NULL, *desc_table; + uint16_t out_cnt = 0, cnt = 0; + uint32_t desc_table_size, len = 0; + int rc = 0; + + rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc, + &desc_table, &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); + return rc; + } + + if (desc_table != NULL) { + req_idx = 0; + } + + while (1) { + /* + * Maximum cnt reached? + * Should not happen if request is well formatted, otherwise this is a BUG. + */ + if (spdk_unlikely(cnt == *iovs_cnt)) { + SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n", + vsession->name, req_idx); + return -EINVAL; + } + + if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) { + SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", + vsession->name, req_idx, cnt); + return -EINVAL; + } + + len += desc->len; + out_cnt += vhost_vring_packed_desc_is_wr(desc); + + /* desc is NULL means we reach the last desc of this request */ + vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size); + if (desc == NULL) { + break; + } + } + + /* + * There must be least two descriptors. + * First contain request so it must be readable. + * Last descriptor contain buffer for response so it must be writable. + */ + if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { + return -EINVAL; + } + + *length = len; + *iovs_cnt = cnt; + + return 0; +} + +static void +blk_request_finish(bool success, struct spdk_vhost_blk_task *task) +{ + *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; + + blk_task_enqueue(task); + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task, + task->req_idx, success ? 
"OK" : "FAIL"); + blk_task_finish(task); +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_blk_task *task = cb_arg; + + spdk_bdev_free_io(bdev_io); + blk_request_finish(success, task); +} + +static void +blk_request_resubmit(void *arg) +{ + struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg; + int rc = 0; + + blk_task_init(task); + + rc = process_blk_request(task, task->bvsession, task->vq); + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task); + } +} + +static inline void +blk_request_queue_io(struct spdk_vhost_blk_task *task) +{ + int rc; + struct spdk_vhost_blk_session *bvsession = task->bvsession; + struct spdk_bdev *bdev = bvsession->bvdev->bdev; + + task->bdev_io_wait.bdev = bdev; + task->bdev_io_wait.cb_fn = blk_request_resubmit; + task->bdev_io_wait.cb_arg = task; + + rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + } +} + +static int +process_blk_request(struct spdk_vhost_blk_task *task, + struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev; + const struct virtio_blk_outhdr *req; + struct virtio_blk_discard_write_zeroes *desc; + struct iovec *iov; + uint32_t type; + uint32_t payload_len; + uint64_t flush_bytes; + int rc; + + if (vq->packed.packed_ring) { + rc = blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, + &payload_len); + } else { + rc = blk_iovs_split_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, + &payload_len); + } + + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); + /* Only READ and WRITE are supported for now. */ + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + iov = &task->iovs[0]; + if (spdk_unlikely(iov->iov_len != sizeof(*req))) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, + "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n", + iov->iov_len, sizeof(*req), task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + req = iov->iov_base; + + iov = &task->iovs[task->iovcnt - 1]; + if (spdk_unlikely(iov->iov_len != 1)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, + "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n", + iov->iov_len, 1, task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + task->status = iov->iov_base; + payload_len -= sizeof(*req) + sizeof(*task->status); + task->iovcnt -= 2; + + type = req->type; +#ifdef VIRTIO_BLK_T_BARRIER + /* Don't care about barier for now (as QEMU's virtio-blk do). */ + type &= ~VIRTIO_BLK_T_BARRIER; +#endif + + switch (type) { + case VIRTIO_BLK_T_IN: + case VIRTIO_BLK_T_OUT: + if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) { + SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n", + type ? 
"WRITE" : "READ", task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + if (type == VIRTIO_BLK_T_IN) { + task->used_len = payload_len + sizeof(*task->status); + rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel, + &task->iovs[1], task->iovcnt, req->sector * 512, + payload_len, blk_request_complete_cb, task); + } else if (!bvdev->readonly) { + task->used_len = sizeof(*task->status); + rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel, + &task->iovs[1], task->iovcnt, req->sector * 512, + payload_len, blk_request_complete_cb, task); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n"); + rc = -1; + } + + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); + blk_request_queue_io(task); + } else { + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + } + break; + case VIRTIO_BLK_T_DISCARD: + desc = task->iovs[1].iov_base; + if (payload_len != sizeof(*desc)) { + SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + + rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel, + desc->sector * 512, desc->num_sectors * 512, + blk_request_complete_cb, task); + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); + blk_request_queue_io(task); + } else { + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + } + break; + case VIRTIO_BLK_T_WRITE_ZEROES: + desc = task->iovs[1].iov_base; + if (payload_len != sizeof(*desc)) { + SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + + /* Zeroed and Unmap the range, SPDK doen't support it. 
		 */
+		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
+			SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
+			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+			return -1;
+		}
+
+		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
+					    desc->sector * 512, desc->num_sectors * 512,
+					    blk_request_complete_cb, task);
+		if (rc) {
+			if (rc == -ENOMEM) {
+				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+				blk_request_queue_io(task);
+			} else {
+				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+				return -1;
+			}
+		}
+		break;
+	case VIRTIO_BLK_T_FLUSH:
+		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
+		if (req->sector != 0) {
+			SPDK_NOTICELOG("sector must be zero for flush command\n");
+			invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+			return -1;
+		}
+		rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
+				     0, flush_bytes,
+				     blk_request_complete_cb, task);
+		if (rc) {
+			if (rc == -ENOMEM) {
+				SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+				blk_request_queue_io(task);
+			} else {
+				invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+				return -1;
+			}
+		}
+		break;
+	case VIRTIO_BLK_T_GET_ID:
+		if (!task->iovcnt || !payload_len) {
+			invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+			return -1;
+		}
+		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
+		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
+				task->used_len, ' ');
+		blk_request_finish(true, task);
+		break;
+	default:
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
+		invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+		return -1;
+	}
+
+	return 0;
+}
+
+static void
+process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
+{
+	struct spdk_vhost_blk_task *task;
+	uint16_t task_idx = req_idx, num_descs;
+
+	if (vq->packed.packed_ring) {
+		/* A packed ring uses the buffer_id as the task_idx to get the task struct.
+		 * The kernel driver uses vq->free_head to set the buffer_id, so the value
+		 * must be in the range of 0 ~ vring.size, and the free_head value is unique
+		 * among the outstanding requests.
+		 * We can't use the req_idx as the task_idx because a descriptor slot can be
+		 * reused in the next phase even when the request submitted from it has not
+		 * completed in the previous phase. For example, at phase 0 last_used_idx
+		 * was 2 and desc0 was not completed. Then, after moving to phase 1,
+		 * last_avail_idx is updated to 1. In this case req_idx cannot be used as
+		 * task_idx, because at phase 1 we would find task[0]->used still set to true.
+		 * A split queue is quite different: the descriptor is inserted into the free
+		 * list when the device completes the request, and the driver takes descriptors
+		 * from the free list, which ensures the req_idx is unique among the
+		 * outstanding requests.
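+		 * As a concrete illustration of the reuse problem: suppose ring slot 0
+		 * is consumed by request A whose buffer_id (free_head) is 5, and A is
+		 * still outstanding when the ring wraps into the next phase and the
+		 * driver publishes request B starting at slot 0 again. Indexing tasks
+		 * by req_idx would map both A and B onto task[0], while indexing by
+		 * buffer_id keeps A in task[5] and B in its own slot, because
+		 * buffer_ids are unique among outstanding requests.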
+ */ + task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs); + } + + task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + task->bvsession->vsession.name, task_idx); + task->used_len = 0; + blk_task_enqueue(task); + return; + } + + if (vq->packed.packed_ring) { + task->req_idx = req_idx; + task->num_descs = num_descs; + task->buffer_id = task_idx; + } + + task->bvsession->vsession.task_cnt++; + + blk_task_init(task); + + if (process_blk_request(task, task->bvsession, vq) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task, + task_idx); + } else { + SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx); + } +} + +static void +submit_inflight_desc(struct spdk_vhost_blk_session *bvsession, + struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight; + spdk_vhost_resubmit_desc *resubmit_list; + uint16_t req_idx; + + if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) { + return; + } + + resubmit_list = resubmit->resubmit_list; + while (resubmit->resubmit_num-- > 0) { + req_idx = resubmit_list[resubmit->resubmit_num].index; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Start processing request idx %"PRIu16"======\n", + req_idx); + + if (spdk_unlikely(req_idx >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + vsession->name, req_idx, vq->vring.size); + vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); + continue; + } + + process_blk_task(vq, req_idx); + } + + free(resubmit_list); + resubmit->resubmit_list = NULL; +} + +static void +process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS]; + uint16_t reqs_cnt, i; + + submit_inflight_desc(bvsession, vq); + + reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + if (!reqs_cnt) { + return; + } + + for (i = 0; i < reqs_cnt; i++) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + vsession->name, reqs[i], vq->vring.size); + vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0); + continue; + } + + rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]); + + process_blk_task(vq, reqs[i]); + } +} + +static void +process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + uint16_t i = 0; + + while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS && + vhost_vq_packed_ring_is_avail(vq)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n", + vq->last_avail_idx); + + process_blk_task(vq, vq->last_avail_idx); + } +} + +static int +vdev_worker(void *arg) +{ + struct spdk_vhost_blk_session *bvsession = arg; + struct spdk_vhost_session *vsession = &bvsession->vsession; + + uint16_t q_idx; + bool packed_ring; + + /* In a session, every vq supports the same format */ + packed_ring = vsession->virtqueue[0].packed.packed_ring; + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + if (packed_ring) { + process_packed_vq(bvsession, 
&vsession->virtqueue[q_idx]); + } else { + process_vq(bvsession, &vsession->virtqueue[q_idx]); + } + } + + vhost_session_used_signal(vsession); + + return SPDK_POLLER_BUSY; +} + +static void +no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; + uint32_t length; + uint16_t iovcnt, req_idx; + + if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) { + return; + } + + iovcnt = SPDK_COUNTOF(iovs); + if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) { + *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx); + } + + vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); +} + +static void +no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_blk_task *task; + uint32_t length; + uint16_t req_idx = vq->last_avail_idx; + uint16_t task_idx, num_descs; + + if (!vhost_vq_packed_ring_is_avail(vq)) { + return; + } + + task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs); + task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + vsession->name, req_idx); + vhost_vq_packed_ring_enqueue(vsession, vq, num_descs, + task->buffer_id, task->used_len); + return; + } + + task->req_idx = req_idx; + task->num_descs = num_descs; + task->buffer_id = task_idx; + blk_task_init(task); + + if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt, + &length)) { + *(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx); + } + + task->used = false; + vhost_vq_packed_ring_enqueue(vsession, vq, num_descs, + task->buffer_id, task->used_len); +} + +static int +no_bdev_vdev_worker(void *arg) +{ + struct spdk_vhost_blk_session *bvsession = arg; + struct spdk_vhost_session *vsession = &bvsession->vsession; + uint16_t q_idx; + bool packed_ring; + + /* In a session, every vq supports the same format */ + packed_ring = vsession->virtqueue[0].packed.packed_ring; + for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) { + if (packed_ring) { + no_bdev_process_packed_vq(bvsession, &vsession->virtqueue[q_idx]); + } else { + no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]); + } + } + + vhost_session_used_signal(vsession); + + if (vsession->task_cnt == 0 && bvsession->io_channel) { + spdk_put_io_channel(bvsession->io_channel); + bvsession->io_channel = NULL; + } + + return SPDK_POLLER_BUSY; +} + +static struct spdk_vhost_blk_session * +to_blk_session(struct spdk_vhost_session *vsession) +{ + assert(vsession->vdev->backend == &vhost_blk_device_backend); + return (struct spdk_vhost_blk_session *)vsession; +} + +static struct spdk_vhost_blk_dev * +to_blk_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev == NULL) { + return NULL; + } + + if (vdev->backend != &vhost_blk_device_backend) { + SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev); +} + +static int +vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, + void *ctx) +{ +#if 
RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0) + SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid); + rte_vhost_slave_config_change(vsession->vid, false); +#else + SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n"); +#endif + + return 0; +} + +static void +blk_resize_cb(void *resize_ctx) +{ + struct spdk_vhost_blk_dev *bvdev = resize_ctx; + + spdk_vhost_lock(); + vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb, + NULL, NULL); + spdk_vhost_unlock(); +} + +static void +vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) +{ + + /* All sessions have been notified, time to close the bdev */ + struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); + + assert(bvdev != NULL); + spdk_put_io_channel(bvdev->dummy_io_channel); + spdk_bdev_close(bvdev->bdev_desc); + bvdev->bdev_desc = NULL; + bvdev->bdev = NULL; +} + +static int +vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, + void *ctx) +{ + struct spdk_vhost_blk_session *bvsession; + + bvsession = (struct spdk_vhost_blk_session *)vsession; + if (bvsession->requestq_poller) { + spdk_poller_unregister(&bvsession->requestq_poller); + bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0); + } + + return 0; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_blk_dev *bvdev = remove_ctx; + + SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n", + bvdev->vdev.name); + + spdk_vhost_lock(); + vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb, + vhost_dev_bdev_remove_cpl_cb, NULL); + spdk_vhost_unlock(); +} + +static void +bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, + void *event_ctx) +{ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Bdev event: type %d, name %s\n", + type, + bdev->name); + + switch (type) { + case SPDK_BDEV_EVENT_REMOVE: + SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name); + bdev_remove_cb(event_ctx); + break; + case SPDK_BDEV_EVENT_RESIZE: + SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name); + blk_resize_cb(event_ctx); + break; + default: + SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); + break; + } +} + +static void +free_task_pool(struct spdk_vhost_blk_session *bvsession) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_virtqueue *vq; + uint16_t i; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->tasks == NULL) { + continue; + } + + spdk_free(vq->tasks); + vq->tasks = NULL; + } +} + +static int +alloc_task_pool(struct spdk_vhost_blk_session *bvsession) +{ + struct spdk_vhost_session *vsession = &bvsession->vsession; + struct spdk_vhost_virtqueue *vq; + struct spdk_vhost_blk_task *task; + uint32_t task_cnt; + uint16_t i; + uint32_t j; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->vring.desc == NULL) { + continue; + } + + task_cnt = vq->vring.size; + if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { + /* sanity check */ + SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. 
(size = %"PRIu32", max = %"PRIu32")\n", + vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); + free_task_pool(bvsession); + return -1; + } + vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt, + SPDK_CACHE_LINE_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vq->tasks == NULL) { + SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", + vsession->name, task_cnt, i); + free_task_pool(bvsession); + return -1; + } + + for (j = 0; j < task_cnt; j++) { + task = &((struct spdk_vhost_blk_task *)vq->tasks)[j]; + task->bvsession = bvsession; + task->req_idx = j; + task->vq = vq; + } + } + + return 0; +} + +static int +vhost_blk_start_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); + struct spdk_vhost_blk_dev *bvdev; + int i, rc = 0; + + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + bvsession->bvdev = bvdev; + + /* validate all I/O queues are in a contiguous index range */ + for (i = 0; i < vsession->max_queues; i++) { + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (vsession->virtqueue[i].vring.desc == NULL) { + SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i); + rc = -1; + goto out; + } + } + + rc = alloc_task_pool(bvsession); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name); + goto out; + } + + if (bvdev->bdev) { + bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); + if (!bvsession->io_channel) { + free_task_pool(bvsession); + SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name); + rc = -1; + goto out; + } + } + + bvsession->requestq_poller = SPDK_POLLER_REGISTER(bvdev->bdev ? 
vdev_worker : no_bdev_vdev_worker, + bvsession, 0); + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); +out: + vhost_session_start_done(vsession, rc); + return rc; +} + +static int +vhost_blk_start(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_blk_start_cb, + 3, "start session"); +} + +static int +destroy_session_poller_cb(void *arg) +{ + struct spdk_vhost_blk_session *bvsession = arg; + struct spdk_vhost_session *vsession = &bvsession->vsession; + int i; + + if (vsession->task_cnt > 0) { + return SPDK_POLLER_BUSY; + } + + if (spdk_vhost_trylock() != 0) { + return SPDK_POLLER_BUSY; + } + + for (i = 0; i < vsession->max_queues; i++) { + vsession->virtqueue[i].next_event_time = 0; + vhost_vq_used_signal(vsession, &vsession->virtqueue[i]); + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); + + if (bvsession->io_channel) { + spdk_put_io_channel(bvsession->io_channel); + bvsession->io_channel = NULL; + } + + free_task_pool(bvsession); + spdk_poller_unregister(&bvsession->stop_poller); + vhost_session_stop_done(vsession, 0); + + spdk_vhost_unlock(); + return SPDK_POLLER_BUSY; +} + +static int +vhost_blk_stop_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); + + spdk_poller_unregister(&bvsession->requestq_poller); + bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb, + bvsession, 1000); + return 0; +} + +static int +vhost_blk_stop(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_blk_stop_cb, + 3, "stop session"); +} + +static void +vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + + spdk_json_write_named_object_begin(w, "block"); + + spdk_json_write_named_bool(w, "readonly", bvdev->readonly); + + spdk_json_write_name(w, "bdev"); + if (bvdev->bdev) { + spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev)); + } else { + spdk_json_write_null(w); + } + + spdk_json_write_object_end(w); +} + +static void +vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + + if (!bvdev->bdev) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_create_blk_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev)); + spdk_json_write_named_string(w, "cpumask", + spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); + spdk_json_write_named_bool(w, "readonly", bvdev->readonly); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static int vhost_blk_destroy(struct spdk_vhost_dev *dev); + +static int +vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config, + uint32_t len) +{ + struct virtio_blk_config blkcfg; + struct spdk_vhost_blk_dev *bvdev; + struct spdk_bdev *bdev; + uint32_t blk_size; + uint64_t blkcnt; + + memset(&blkcfg, 0, sizeof(blkcfg)); + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + bdev = bvdev->bdev; + if (bdev == NULL) { + /* We can't just return -1 here as this 
GET_CONFIG message might + * be caused by a QEMU VM reboot. Returning -1 will indicate an + * error to QEMU, who might then decide to terminate itself. + * We don't want that. A simple reboot shouldn't break the system. + * + * Presenting a block device with block size 0 and block count 0 + * doesn't cause any problems on QEMU side and the virtio-pci + * device is even still available inside the VM, but there will + * be no block device created for it - the kernel drivers will + * silently reject it. + */ + blk_size = 0; + blkcnt = 0; + } else { + blk_size = spdk_bdev_get_block_size(bdev); + blkcnt = spdk_bdev_get_num_blocks(bdev); + if (spdk_bdev_get_buf_align(bdev) > 1) { + blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE; + blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1); + } else { + blkcfg.size_max = 131072; + /* -2 for REQ and RESP and -1 for region boundary splitting */ + blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1; + } + } + + blkcfg.blk_size = blk_size; + /* minimum I/O size in blocks */ + blkcfg.min_io_size = 1; + /* expressed in 512 Bytes sectors */ + blkcfg.capacity = (blkcnt * blk_size) / 512; + /* QEMU can overwrite this value when started */ + blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES; + + if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + /* 16MiB, expressed in 512 Bytes */ + blkcfg.max_discard_sectors = 32768; + blkcfg.max_discard_seg = 1; + blkcfg.discard_sector_alignment = blk_size / 512; + } + if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { + blkcfg.max_write_zeroes_sectors = 32768; + blkcfg.max_write_zeroes_seg = 1; + } + + memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg))); + + return 0; +} + +static const struct spdk_vhost_dev_backend vhost_blk_device_backend = { + .session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session), + .start_session = vhost_blk_start, + .stop_session = vhost_blk_stop, + .vhost_get_config = vhost_blk_get_config, + .dump_info_json = vhost_blk_dump_info_json, + .write_config_json = vhost_blk_write_config_json, + .remove_device = vhost_blk_destroy, +}; + +int +vhost_blk_controller_construct(void) +{ + struct spdk_conf_section *sp; + unsigned ctrlr_num; + char *bdev_name; + char *cpumask; + char *name; + bool readonly; + bool packed_ring; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false); + packed_ring = spdk_conf_section_get_boolval(sp, "PackedRing", false); + + bdev_name = spdk_conf_section_get_val(sp, "Dev"); + if (bdev_name == NULL) { + continue; + } + + if (spdk_vhost_blk_construct(name, cpumask, bdev_name, + readonly, packed_ring) < 0) { + return -1; + } + } + + return 0; +} + +int +spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, + bool readonly, bool packed_ring) +{ + struct spdk_vhost_blk_dev *bvdev = NULL; + struct spdk_vhost_dev *vdev; + struct spdk_bdev *bdev; + int ret = 0; + + spdk_vhost_lock(); + bdev = 
spdk_bdev_get_by_name(dev_name); + if (bdev == NULL) { + SPDK_ERRLOG("%s: bdev '%s' not found\n", + name, dev_name); + ret = -ENODEV; + goto out; + } + + bvdev = calloc(1, sizeof(*bvdev)); + if (bvdev == NULL) { + ret = -ENOMEM; + goto out; + } + + vdev = &bvdev->vdev; + vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE; + vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES; + vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES; + + vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED; + + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD); + } + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); + } + if (readonly) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO); + } + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { + vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH); + } + + ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc); + if (ret != 0) { + SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n", + name, dev_name, ret); + goto out; + } + + /* + * When starting qemu with vhost-user-blk multiqueue, the vhost device will + * be started/stopped many times, related to the queues num, as the + * vhost-user backend doesn't know the exact number of queues used for this + * device. The target have to stop and start the device once got a valid + * IO queue. + * When stoping and starting the vhost device, the backend bdev io device + * will be deleted and created repeatedly. + * Hold a bdev reference so that in the struct spdk_vhost_blk_dev, so that + * the io device will not be deleted. + */ + bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); + + bvdev->bdev = bdev; + bvdev->readonly = readonly; + ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend); + if (ret != 0) { + spdk_put_io_channel(bvdev->dummy_io_channel); + spdk_bdev_close(bvdev->bdev_desc); + goto out; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name); +out: + if (ret != 0 && bvdev) { + free(bvdev); + } + spdk_vhost_unlock(); + return ret; +} + +static int +vhost_blk_destroy(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); + int rc; + + assert(bvdev != NULL); + + rc = vhost_dev_unregister(&bvdev->vdev); + if (rc != 0) { + return rc; + } + + /* if the bdev is removed, don't need call spdk_put_io_channel. */ + if (bvdev->bdev) { + spdk_put_io_channel(bvdev->dummy_io_channel); + } + + if (bvdev->bdev_desc) { + spdk_bdev_close(bvdev->bdev_desc); + bvdev->bdev_desc = NULL; + } + bvdev->bdev = NULL; + + free(bvdev); + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK) +SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA) diff --git a/src/spdk/lib/vhost/vhost_internal.h b/src/spdk/lib/vhost/vhost_internal.h new file mode 100644 index 000000000..3aa89768a --- /dev/null +++ b/src/spdk/lib/vhost/vhost_internal.h @@ -0,0 +1,496 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VHOST_INTERNAL_H +#define SPDK_VHOST_INTERNAL_H +#include <linux/virtio_config.h> + +#include "spdk/stdinc.h" + +#include <rte_vhost.h> + +#include "spdk_internal/vhost_user.h" +#include "spdk_internal/log.h" +#include "spdk/util.h" +#include "spdk/rpc.h" +#include "spdk/config.h" + +#define SPDK_VHOST_MAX_VQUEUES 256 +#define SPDK_VHOST_MAX_VQ_SIZE 1024 + +#define SPDK_VHOST_SCSI_CTRLR_MAX_DEVS 8 + +#define SPDK_VHOST_IOVS_MAX 129 + +#define SPDK_VHOST_VQ_MAX_SUBMISSIONS 32 + +/* + * Rate at which stats are checked for interrupt coalescing. + */ +#define SPDK_VHOST_STATS_CHECK_INTERVAL_MS 10 +/* + * Default threshold at which interrupts start to be coalesced. + */ +#define SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD 60000 + +/* + * Currently coalescing is not used by default. + * Setting this to value > 0 here or by RPC will enable coalescing. + */ +#define SPDK_VHOST_COALESCING_DELAY_BASE_US 0 + +#define SPDK_VHOST_FEATURES ((1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_F_RING_PACKED)) + +#define SPDK_VHOST_DISABLED_FEATURES ((1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY)) + +#define VRING_DESC_F_AVAIL (1ULL << VRING_PACKED_DESC_F_AVAIL) +#define VRING_DESC_F_USED (1ULL << VRING_PACKED_DESC_F_USED) +#define VRING_DESC_F_AVAIL_USED (VRING_DESC_F_AVAIL | VRING_DESC_F_USED) + +typedef struct rte_vhost_resubmit_desc spdk_vhost_resubmit_desc; +typedef struct rte_vhost_resubmit_info spdk_vhost_resubmit_info; + +struct spdk_vhost_virtqueue { + struct rte_vhost_vring vring; + struct rte_vhost_ring_inflight vring_inflight; + uint16_t last_avail_idx; + uint16_t last_used_idx; + + struct { + /* To mark a descriptor as available in packed ring + * Equal to avail_wrap_counter in spec. + */ + uint8_t avail_phase : 1; + /* To mark a descriptor as used in packed ring + * Equal to used_wrap_counter in spec. 
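+		 * (Per the virtio 1.1 packed ring layout, a descriptor is available
+		 * to the device when its VRING_DESC_F_AVAIL bit matches avail_phase
+		 * and its VRING_DESC_F_USED bit does not.)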
+ */ + uint8_t used_phase : 1; + uint8_t padding : 5; + bool packed_ring : 1; + } packed; + + void *tasks; + + /* Request count from last stats check */ + uint32_t req_cnt; + + /* Request count from last event */ + uint16_t used_req_cnt; + + /* How long interrupt is delayed */ + uint32_t irq_delay_time; + + /* Next time when we need to send event */ + uint64_t next_event_time; + + /* Associated vhost_virtqueue in the virtio device's virtqueue list */ + uint32_t vring_idx; +} __attribute((aligned(SPDK_CACHE_LINE_SIZE))); + +struct spdk_vhost_session { + struct spdk_vhost_dev *vdev; + + /* rte_vhost connection ID. */ + int vid; + + /* Unique session ID. */ + uint64_t id; + /* Unique session name. */ + char *name; + + bool initialized; + bool started; + bool needs_restart; + bool forced_polling; + + struct rte_vhost_memory *mem; + + int task_cnt; + + uint16_t max_queues; + + uint64_t negotiated_features; + + /* Local copy of device coalescing settings. */ + uint32_t coalescing_delay_time_base; + uint32_t coalescing_io_rate_threshold; + + /* Next time when stats for event coalescing will be checked. */ + uint64_t next_stats_check_time; + + /* Interval used for event coalescing checking. */ + uint64_t stats_check_interval; + + struct spdk_vhost_virtqueue virtqueue[SPDK_VHOST_MAX_VQUEUES]; + + TAILQ_ENTRY(spdk_vhost_session) tailq; +}; + +struct spdk_vhost_dev { + char *name; + char *path; + + struct spdk_thread *thread; + bool registered; + + uint64_t virtio_features; + uint64_t disabled_features; + uint64_t protocol_features; + + const struct spdk_vhost_dev_backend *backend; + + /* Saved orginal values used to setup coalescing to avoid integer + * rounding issues during save/load config. + */ + uint32_t coalescing_delay_us; + uint32_t coalescing_iops_threshold; + + /* Current connections to the device */ + TAILQ_HEAD(, spdk_vhost_session) vsessions; + + /* Increment-only session counter */ + uint64_t vsessions_num; + + /* Number of started and actively polled sessions */ + uint32_t active_session_num; + + /* Number of pending asynchronous operations */ + uint32_t pending_async_op_num; + + TAILQ_ENTRY(spdk_vhost_dev) tailq; +}; + +/** + * \param vdev vhost device. + * \param vsession vhost session. + * \param arg user-provided parameter. + * + * \return negative values will break the foreach call, meaning + * the function won't be called again. Return codes zero and + * positive don't have any effect. + */ +typedef int (*spdk_vhost_session_fn)(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, + void *arg); + +/** + * \param vdev vhost device. + * \param arg user-provided parameter. + */ +typedef void (*spdk_vhost_dev_fn)(struct spdk_vhost_dev *vdev, void *arg); + +struct spdk_vhost_dev_backend { + /** + * Size of additional per-session context data + * allocated whenever a new client connects. 
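+	 * For example, the vhost-blk backend sets this to
+	 * sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
+	 * so the generic session allocation also covers the blk-specific fields
+	 * that follow the embedded struct spdk_vhost_session.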
+ */ + size_t session_ctx_size; + + int (*start_session)(struct spdk_vhost_session *vsession); + int (*stop_session)(struct spdk_vhost_session *vsession); + + int (*vhost_get_config)(struct spdk_vhost_dev *vdev, uint8_t *config, uint32_t len); + int (*vhost_set_config)(struct spdk_vhost_dev *vdev, uint8_t *config, + uint32_t offset, uint32_t size, uint32_t flags); + + void (*dump_info_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + void (*write_config_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + int (*remove_device)(struct spdk_vhost_dev *vdev); +}; + +void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len); + +uint16_t vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *vq, uint16_t *reqs, + uint16_t reqs_len); + +/** + * Get a virtio split descriptor at given index in given virtqueue. + * The descriptor will provide access to the entire descriptor + * chain. The subsequent descriptors are accesible via + * \c spdk_vhost_vring_desc_get_next. + * \param vsession vhost session + * \param vq virtqueue + * \param req_idx descriptor index + * \param desc pointer to be set to the descriptor + * \param desc_table descriptor table to be used with + * \c spdk_vhost_vring_desc_get_next. This might be either + * default virtqueue descriptor table or per-chain indirect + * table. + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid. + * If -1 is returned, the content of params is undefined. + */ +int vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table, + uint32_t *desc_table_size); + +/** + * Get a virtio packed descriptor at given index in given virtqueue. + * The descriptor will provide access to the entire descriptor + * chain. The subsequent descriptors are accesible via + * \c vhost_vring_packed_desc_get_next. + * \param vsession vhost session + * \param vq virtqueue + * \param req_idx descriptor index + * \param desc pointer to be set to the descriptor + * \param desc_table descriptor table to be used with + * \c spdk_vhost_vring_desc_get_next. This might be either + * \c NULL or per-chain indirect table. + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid. + * If -1 is returned, the content of params is undefined. + */ +int vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_packed_desc **desc, + struct vring_packed_desc **desc_table, uint32_t *desc_table_size); + +/** + * Send IRQ/call client (if pending) for \c vq. + * \param vsession vhost session + * \param vq virtqueue + * \return + * 0 - if no interrupt was signalled + * 1 - if interrupt was signalled + */ +int vhost_vq_used_signal(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq); + + +/** + * Send IRQs for all queues that need to be signaled. + * \param vsession vhost session + * \param vq virtqueue + */ +void vhost_session_used_signal(struct spdk_vhost_session *vsession); + +void vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *vq, + uint16_t id, uint32_t len); + +/** + * Enqueue the entry to the used ring when device complete the request. + * \param vsession vhost session + * \param vq virtqueue + * \req_idx descriptor index. It's the first index of this descriptor chain. 
+ * \num_descs descriptor count. It's the count of the number of buffers in the chain.
+ * \buffer_id descriptor buffer ID.
+ * \length device write length. Specify the length of the buffer that has been initialized
+ * (written to) by the device.
+ */
+void vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession,
+				  struct spdk_vhost_virtqueue *virtqueue,
+				  uint16_t num_descs, uint16_t buffer_id,
+				  uint32_t length);
+
+/**
+ * Get subsequent descriptor from given table.
+ * \param desc current descriptor, will be set to the
+ * next descriptor (NULL in case this is the last
+ * descriptor in the chain or the next desc is invalid)
+ * \param desc_table descriptor table
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid
+ * The *desc* param will be set regardless of the
+ * return value.
+ */
+int vhost_vring_desc_get_next(struct vring_desc **desc,
+			      struct vring_desc *desc_table, uint32_t desc_table_size);
+static inline bool
+vhost_vring_desc_is_wr(struct vring_desc *cur_desc)
+{
+	return !!(cur_desc->flags & VRING_DESC_F_WRITE);
+}
+
+int vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+			    uint16_t *iov_index, const struct vring_desc *desc);
+
+bool vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue);
+
+/**
+ * Get subsequent descriptor from vq or desc table.
+ * \param desc current descriptor, will be set to the
+ * next descriptor (NULL in case this is the last
+ * descriptor in the chain or the next desc is invalid)
+ * \param req_idx index of the current desc, will be set to the next
+ * index. If desc_table != NULL, req_idx is an index into the desc_table;
+ * otherwise it is an index into the vring.
+ * \param desc_table descriptor table
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid
+ * The *desc* param will be set regardless of the
+ * return value.
+ */ +int vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx, + struct spdk_vhost_virtqueue *vq, + struct vring_packed_desc *desc_table, + uint32_t desc_table_size); + +bool vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc); + +int vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov, + uint16_t *iov_index, const struct vring_packed_desc *desc); + +uint16_t vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx, + uint16_t *num_descs); + +static inline bool __attribute__((always_inline)) +vhost_dev_has_feature(struct spdk_vhost_session *vsession, unsigned feature_id) +{ + return vsession->negotiated_features & (1ULL << feature_id); +} + +int vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str, + const struct spdk_vhost_dev_backend *backend); +int vhost_dev_unregister(struct spdk_vhost_dev *vdev); + +int vhost_scsi_controller_construct(void); +int vhost_blk_controller_construct(void); +void vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + +/* + * Vhost callbacks for vhost_device_ops interface + */ + +int vhost_new_connection_cb(int vid, const char *ifname); +int vhost_start_device_cb(int vid); +int vhost_stop_device_cb(int vid); +int vhost_destroy_connection_cb(int vid); + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB +int vhost_get_config_cb(int vid, uint8_t *config, uint32_t len); +int vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset, + uint32_t size, uint32_t flags); +#endif + +/* + * Memory registration functions used in start/stop device callbacks + */ +void vhost_session_mem_register(struct rte_vhost_memory *mem); +void vhost_session_mem_unregister(struct rte_vhost_memory *mem); + +/* + * Call a function for each session of the provided vhost device. + * The function will be called one-by-one on each session's thread. + * + * \param vdev vhost device + * \param fn function to call on each session's thread + * \param cpl_fn function to be called at the end of the iteration on + * the vhost management thread. + * Optional, can be NULL. + * \param arg additional argument to the both callbacks + */ +void vhost_dev_foreach_session(struct spdk_vhost_dev *dev, + spdk_vhost_session_fn fn, + spdk_vhost_dev_fn cpl_fn, + void *arg); + +/** + * Call a function on the provided lcore and block until either + * spdk_vhost_session_start_done() or spdk_vhost_session_stop_done() + * is called. + * + * This must be called under the global vhost mutex, which this function + * will unlock for the time it's waiting. It's meant to be called only + * from start/stop session callbacks. + * + * \param vsession vhost session + * \param cb_fn the function to call. The void *arg parameter in cb_fn + * is always NULL. + * \param timeout_sec timeout in seconds. This function will still + * block after the timeout expires, but will print the provided errmsg. + * \param errmsg error message to print once the timeout expires + * \return return the code passed to spdk_vhost_session_event_done(). + */ +int vhost_session_send_event(struct spdk_vhost_session *vsession, + spdk_vhost_session_fn cb_fn, unsigned timeout_sec, + const char *errmsg); + +/** + * Finish a blocking spdk_vhost_session_send_event() call and finally + * start the session. This must be called on the target lcore, which + * will now receive all session-related messages (e.g. from + * spdk_vhost_dev_foreach_session()). 
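+ *
+ * A typical backend start path (mirroring the vhost-blk code in this
+ * library; the names below are only a sketch) looks roughly like:
+ *
+ *	static int start_cb(struct spdk_vhost_dev *vdev,
+ *			    struct spdk_vhost_session *vsession, void *unused)
+ *	{
+ *		int rc = 0;
+ *		// ...allocate per-session resources, register pollers...
+ *		vhost_session_start_done(vsession, rc);
+ *		return rc;
+ *	}
+ *
+ *	static int start(struct spdk_vhost_session *vsession)
+ *	{
+ *		return vhost_session_send_event(vsession, start_cb, 3, "start session");
+ *	}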
+ * + * Must be called under the global vhost lock. + * + * \param vsession vhost session + * \param response return code + */ +void vhost_session_start_done(struct spdk_vhost_session *vsession, int response); + +/** + * Finish a blocking spdk_vhost_session_send_event() call and finally + * stop the session. This must be called on the session's lcore which + * used to receive all session-related messages (e.g. from + * spdk_vhost_dev_foreach_session()). After this call, the session- + * related messages will be once again processed by any arbitrary thread. + * + * Must be called under the global vhost lock. + * + * Must be called under the global vhost mutex. + * + * \param vsession vhost session + * \param response return code + */ +void vhost_session_stop_done(struct spdk_vhost_session *vsession, int response); + +struct spdk_vhost_session *vhost_session_find_by_vid(int vid); +void vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession); +int vhost_register_unix_socket(const char *path, const char *ctrl_name, + uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features); +int vhost_driver_unregister(const char *path); +int vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); +int vhost_get_negotiated_features(int vid, uint64_t *negotiated_features); + +int remove_vhost_controller(struct spdk_vhost_dev *vdev); + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB +int vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf); +int vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd); +int vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size); +int vhost_nvme_get_cap(int vid, uint64_t *cap); +int vhost_nvme_controller_construct(void); +int vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues); +int vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev); +int vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, + const char *bdev_name); +#endif + +#endif /* SPDK_VHOST_INTERNAL_H */ diff --git a/src/spdk/lib/vhost/vhost_nvme.c b/src/spdk/lib/vhost/vhost_nvme.c new file mode 100644 index 000000000..10f53baf9 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_nvme.c @@ -0,0 +1,1500 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "spdk/bdev.h" +#include "spdk/version.h" +#include "spdk/nvme_spec.h" +#include "spdk/likely.h" + +#include "vhost_internal.h" + +#define MAX_IO_QUEUES 31 +#define MAX_IOVS 64 +#define MAX_NAMESPACE 8 +#define MAX_QUEUE_ENTRIES_SUPPORTED 256 +#define MAX_BATCH_IO 8 + +struct spdk_vhost_nvme_sq { + uint16_t sqid; + uint16_t size; + uint16_t cqid; + bool valid; + struct spdk_nvme_cmd *sq_cmd; + uint16_t sq_head; + uint16_t sq_tail; +}; + +struct spdk_vhost_nvme_cq { + uint8_t phase; + uint16_t size; + uint16_t cqid; + bool valid; + volatile struct spdk_nvme_cpl *cq_cqe; + uint16_t cq_head; + uint16_t guest_signaled_cq_head; + uint32_t need_signaled_cnt; + STAILQ_HEAD(, spdk_vhost_nvme_task) cq_full_waited_tasks; + bool irq_enabled; + int virq; +}; + +struct spdk_vhost_nvme_ns { + struct spdk_bdev *bdev; + uint32_t block_size; + uint64_t capacity; + uint32_t nsid; + uint32_t active_ns; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *bdev_io_channel; + struct spdk_nvme_ns_data nsdata; +}; + +struct spdk_vhost_nvme_task { + struct spdk_nvme_cmd cmd; + struct spdk_vhost_nvme_dev *nvme; + uint16_t sqid; + uint16_t cqid; + + /** array of iovecs to transfer. */ + struct iovec iovs[MAX_IOVS]; + + /** Number of iovecs in iovs array. */ + int iovcnt; + + /** Current iovec position. */ + int iovpos; + + /** Offset in current iovec. */ + uint32_t iov_offset; + + /* for bdev_io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_ns *ns; + + /* parent pointer. */ + struct spdk_vhost_nvme_task *parent; + uint8_t dnr; + uint8_t sct; + uint8_t sc; + uint32_t num_children; + STAILQ_ENTRY(spdk_vhost_nvme_task) stailq; +}; + +struct spdk_vhost_nvme_dev { + struct spdk_vhost_dev vdev; + + uint32_t num_io_queues; + union spdk_nvme_cap_register cap; + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + struct spdk_nvme_ctrlr_data cdata; + + uint32_t num_sqs; + uint32_t num_cqs; + + uint32_t num_ns; + struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE]; + + volatile uint32_t *bar; + volatile uint32_t *bar_db; + uint64_t bar_size; + bool dataplane_started; + + volatile uint32_t *dbbuf_dbs; + volatile uint32_t *dbbuf_eis; + struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1]; + struct spdk_vhost_nvme_cq cq_queue[MAX_IO_QUEUES + 1]; + + /* The one and only session associated with this device */ + struct spdk_vhost_session *vsession; + + TAILQ_ENTRY(spdk_vhost_nvme_dev) tailq; + STAILQ_HEAD(, spdk_vhost_nvme_task) free_tasks; + struct spdk_poller *requestq_poller; + struct spdk_poller *stop_poller; +}; + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend; + +/* + * Report the SPDK version as the firmware revision. 
+ * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts. + */ +#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING + +static int +nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task); + +static struct spdk_vhost_nvme_dev * +to_nvme_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev->backend != &spdk_vhost_nvme_device_backend) { + SPDK_ERRLOG("%s: not a vhost-nvme device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_nvme_dev, vdev); +} + +static TAILQ_HEAD(, spdk_vhost_nvme_dev) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs); + +static inline unsigned int sq_offset(unsigned int qid, uint32_t db_stride) +{ + return qid * 2 * db_stride; +} + +static inline unsigned int cq_offset(unsigned int qid, uint32_t db_stride) +{ + return (qid * 2 + 1) * db_stride; +} + +static void +nvme_inc_cq_head(struct spdk_vhost_nvme_cq *cq) +{ + cq->cq_head++; + if (cq->cq_head >= cq->size) { + cq->cq_head = 0; + cq->phase = !cq->phase; + } +} + +static bool +nvme_cq_is_full(struct spdk_vhost_nvme_cq *cq) +{ + return ((cq->cq_head + 1) % cq->size == cq->guest_signaled_cq_head); +} + +static void +nvme_inc_sq_head(struct spdk_vhost_nvme_sq *sq) +{ + sq->sq_head = (sq->sq_head + 1) % sq->size; +} + +static struct spdk_vhost_nvme_sq * +vhost_nvme_get_sq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->sq_queue[qid]; +} + +static struct spdk_vhost_nvme_cq * +vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->cq_queue[qid]; +} + +static inline uint32_t +vhost_nvme_get_queue_head(struct spdk_vhost_nvme_dev *nvme, uint32_t offset) +{ + if (nvme->dataplane_started) { + return nvme->dbbuf_dbs[offset]; + + } else if (nvme->bar) { + return nvme->bar_db[offset]; + } + + assert(0); + + return 0; +} + +static void * +vhost_nvme_gpa_to_vva(void *priv, uint64_t addr, uint64_t len) +{ + struct spdk_vhost_session *vsession = priv; + + return vhost_gpa_to_vva(vsession, addr, len); +} + +static int +vhost_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd, + struct spdk_vhost_nvme_task *task, uint32_t len) +{ + int err; + + err = spdk_nvme_map_prps(nvme->vsession, cmd, task->iovs, len, 4096, + vhost_nvme_gpa_to_vva); + if (spdk_unlikely(err < 0)) { + return err; + } + task->iovcnt = err; + return 0; +} + +static void +nvme_cq_signal_fd(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_cq *cq; + uint32_t qid, cq_head; + + assert(nvme != NULL); + + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq || !cq->valid) { + continue; + } + + cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(qid, 1)); + if (cq->irq_enabled && cq->need_signaled_cnt && (cq->cq_head != cq_head)) { + eventfd_write(cq->virq, (eventfd_t)1); + cq->need_signaled_cnt = 0; + } + } +} + +static void +vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_dev *nvme = task->nvme; + struct spdk_nvme_cpl cqe = {0}; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq *sq; + struct spdk_nvme_cmd *cmd = &task->cmd; + uint16_t cqid = task->cqid; + uint16_t sqid = task->sqid; + + cq = vhost_nvme_get_cq_from_qid(nvme, cqid); + sq = 
vhost_nvme_get_sq_from_qid(nvme, sqid); + if (spdk_unlikely(!cq || !sq)) { + return; + } + + cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(cqid, 1)); + if (spdk_unlikely(nvme_cq_is_full(cq))) { + STAILQ_INSERT_TAIL(&cq->cq_full_waited_tasks, task, stailq); + return; + } + + cqe.sqid = sqid; + cqe.sqhd = sq->sq_head; + cqe.cid = cmd->cid; + cqe.status.dnr = task->dnr; + cqe.status.sct = task->sct; + cqe.status.sc = task->sc; + cqe.status.p = !cq->phase; + cq->cq_cqe[cq->cq_head] = cqe; + spdk_smp_wmb(); + cq->cq_cqe[cq->cq_head].status.p = cq->phase; + + nvme_inc_cq_head(cq); + cq->need_signaled_cnt++; + + /* MMIO Controll */ + if (nvme->dataplane_started) { + nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1); + } + + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *task = cb_arg; + struct spdk_nvme_cmd *cmd = &task->cmd; + int sc, sct; + uint32_t cdw0; + + assert(bdev_io != NULL); + + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); + spdk_bdev_free_io(bdev_io); + + task->dnr = !success; + task->sct = sct; + task->sc = sc; + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("I/O error, sector %u\n", cmd->cdw10); + } + + vhost_nvme_task_complete(task); +} + +static void +blk_unmap_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *child = cb_arg; + struct spdk_vhost_nvme_task *task = child->parent; + struct spdk_vhost_nvme_dev *nvme = task->nvme; + int sct, sc; + uint32_t cdw0; + + assert(bdev_io != NULL); + + task->num_children--; + if (!success) { + task->dnr = 1; + spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc); + task->sct = sct; + task->sc = sc; + } + + spdk_bdev_free_io(bdev_io); + + if (!task->num_children) { + vhost_nvme_task_complete(task); + } + + STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq); +} + +static struct spdk_vhost_nvme_ns * +vhost_nvme_get_ns_from_nsid(struct spdk_vhost_nvme_dev *dev, uint32_t nsid) +{ + if (spdk_unlikely(!nsid || nsid > dev->num_ns)) { + return NULL; + } + + return &dev->ns[nsid - 1]; +} + +static void +vhost_nvme_resubmit_task(void *arg) +{ + struct spdk_vhost_nvme_task *task = (struct spdk_vhost_nvme_task *)arg; + int rc; + + rc = nvme_process_sq(task->nvme, task->sq, task); + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "vhost_nvme: task resubmit failed, rc = %d.\n", rc); + } +} + +static int +vhost_nvme_queue_task(struct spdk_vhost_nvme_task *task) +{ + int rc; + + task->bdev_io_wait.bdev = task->ns->bdev; + task->bdev_io_wait.cb_fn = vhost_nvme_resubmit_task; + task->bdev_io_wait.cb_arg = task; + + rc = spdk_bdev_queue_io_wait(task->ns->bdev, task->ns->bdev_io_channel, &task->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vhost_nvme_queue_task, rc=%d.\n", rc); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + vhost_nvme_task_complete(task); + } + + return rc; +} + +static int +nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_task *child; + struct spdk_nvme_cmd *cmd = &task->cmd; + struct spdk_vhost_nvme_ns *ns; + int ret = -1; + uint32_t len, nlba, block_size; + uint64_t slba; + struct spdk_nvme_dsm_range *range; + uint16_t i, num_ranges = 0; + + task->nvme = nvme; + task->dnr = 0; + task->sct = 0; + task->sc = 0; 
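+	/*
+	 * Worked example of the Read/Write decoding done below: the 64-bit
+	 * starting LBA is split across CDW10 (low 32 bits) and CDW11 (high
+	 * 32 bits), and CDW12[15:0] carries a zero-based block count, e.g. a
+	 * 4 KiB read at LBA 0x1_0000_0000 on a 512-byte-block namespace
+	 * arrives as cdw10 = 0x00000000, cdw11 = 0x00000001, cdw12 = 0x0007
+	 * (8 blocks).
+	 */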
+ + ns = vhost_nvme_get_ns_from_nsid(nvme, cmd->nsid); + if (spdk_unlikely(!ns)) { + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + vhost_nvme_task_complete(task); + return -1; + } + + block_size = ns->block_size; + task->num_children = 0; + task->cqid = sq->cqid; + task->sqid = sq->sqid; + + task->ns = ns; + + if (spdk_unlikely(!ns->active_ns)) { + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + vhost_nvme_task_complete(task); + return -1; + } + + /* valid only for Read/Write commands */ + nlba = (cmd->cdw12 & 0xffff) + 1; + slba = cmd->cdw11; + slba = (slba << 32) | cmd->cdw10; + + if (cmd->opc == SPDK_NVME_OPC_READ || cmd->opc == SPDK_NVME_OPC_WRITE || + cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + if (cmd->psdt != SPDK_NVME_PSDT_PRP) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PSDT %u%ub in command\n", + cmd->psdt >> 1, cmd->psdt & 1u); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_FIELD; + vhost_nvme_task_complete(task); + return -1; + } + + if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + num_ranges = (cmd->cdw10 & 0xff) + 1; + len = num_ranges * sizeof(struct spdk_nvme_dsm_range); + } else { + len = nlba * block_size; + } + + ret = vhost_nvme_map_prps(nvme, cmd, task, len); + if (spdk_unlikely(ret != 0)) { + SPDK_ERRLOG("nvme command map prps failed\n"); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_FIELD; + vhost_nvme_task_complete(task); + return -1; + } + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + ret = spdk_bdev_readv(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_WRITE: + ret = spdk_bdev_writev(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_FLUSH: + ret = spdk_bdev_flush(ns->bdev_desc, ns->bdev_io_channel, + 0, ns->capacity, + blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + range = (struct spdk_nvme_dsm_range *)task->iovs[0].iov_base; + for (i = 0; i < num_ranges; i++) { + if (!STAILQ_EMPTY(&nvme->free_tasks)) { + child = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + SPDK_ERRLOG("No free task now\n"); + ret = -1; + break; + } + task->num_children++; + child->parent = task; + ret = spdk_bdev_unmap(ns->bdev_desc, ns->bdev_io_channel, + range[i].starting_lba * block_size, + range[i].length * block_size, + blk_unmap_complete_cb, child); + if (ret) { + STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq); + break; + } + } + break; + default: + ret = -1; + break; + } + + if (spdk_unlikely(ret)) { + if (ret == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "No memory, start to queue io.\n"); + task->sq = sq; + ret = vhost_nvme_queue_task(task); + } else { + /* post error status to cqe */ + SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + vhost_nvme_task_complete(task); + } + } + + return ret; +} + +static int +nvme_worker(void *arg) +{ + struct spdk_vhost_nvme_dev *nvme = (struct spdk_vhost_nvme_dev *)arg; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_task *task; + 
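+	/* dbbuf_sq holds the SQ tail most recently published by the guest,
+	 * read from the shadow doorbell buffer once the guest has issued
+	 * DOORBELL_BUFFER_CONFIG, or from the BAR doorbell copy before that.
+	 */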
uint32_t qid, dbbuf_sq; + int ret; + int count = -1; + + if (spdk_unlikely(!nvme->num_sqs)) { + return SPDK_POLLER_IDLE; + } + + if (spdk_unlikely(!nvme->dataplane_started && !nvme->bar)) { + return SPDK_POLLER_IDLE; + } + + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + + sq = vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq->valid) { + continue; + } + cq = vhost_nvme_get_cq_from_qid(nvme, sq->cqid); + if (spdk_unlikely(!cq)) { + return SPDK_POLLER_BUSY; + } + cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(sq->cqid, 1)); + if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) && + !nvme_cq_is_full(cq))) { + task = STAILQ_FIRST(&cq->cq_full_waited_tasks); + STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq); + vhost_nvme_task_complete(task); + } + + dbbuf_sq = vhost_nvme_get_queue_head(nvme, sq_offset(qid, 1)); + sq->sq_tail = (uint16_t)dbbuf_sq; + count = 0; + + while (sq->sq_head != sq->sq_tail) { + if (spdk_unlikely(!sq->sq_cmd)) { + break; + } + if (spdk_likely(!STAILQ_EMPTY(&nvme->free_tasks))) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + return SPDK_POLLER_BUSY; + } + + task->cmd = sq->sq_cmd[sq->sq_head]; + nvme_inc_sq_head(sq); + + /* processing IO */ + ret = nvme_process_sq(nvme, sq, task); + if (spdk_unlikely(ret)) { + SPDK_ERRLOG("QID %u CID %u, SQ HEAD %u, DBBUF SQ TAIL %u\n", qid, task->cmd.cid, sq->sq_head, + sq->sq_tail); + } + + /* MMIO Control */ + if (nvme->dataplane_started) { + nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1); + } + + /* Maximum batch I/Os to pick up at once */ + if (count++ == MAX_BATCH_IO) { + break; + } + } + } + + /* Completion Queue */ + nvme_cq_signal_fd(nvme); + + return count; +} + +static int +vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + struct spdk_vhost_session *vsession = nvme->vsession; + uint64_t dbs_dma_addr, eis_dma_addr; + + dbs_dma_addr = cmd->dptr.prp.prp1; + eis_dma_addr = cmd->dptr.prp.prp2; + + if ((dbs_dma_addr % 4096) || (eis_dma_addr % 4096)) { + return -1; + } + /* Guest Physical Address to Host Virtual Address */ + nvme->dbbuf_dbs = vhost_gpa_to_vva(vsession, dbs_dma_addr, 4096); + nvme->dbbuf_eis = vhost_gpa_to_vva(vsession, eis_dma_addr, 4096); + if (!nvme->dbbuf_dbs || !nvme->dbbuf_eis) { + return -1; + } + /* zeroed the doorbell buffer memory */ + memset((void *)nvme->dbbuf_dbs, 0, 4096); + memset((void *)nvme->dbbuf_eis, 0, 4096); + + cpl->status.sc = 0; + cpl->status.sct = 0; + + /* Data plane started */ + nvme->dataplane_started = true; + + return 0; +} + +static int +vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid, qsize, cqid; + uint64_t dma_addr; + uint64_t requested_len; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq *sq; + + /* physical contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + cqid = (cmd->cdw11 >> 16) & 0xffff; + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + sq = vhost_nvme_get_sq_from_qid(nvme, qid); + cq = vhost_nvme_get_cq_from_qid(nvme, cqid); + if (!sq || !cq) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u or CQID %u\n", + qid, cqid); + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return -1; + } + + 
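+	/*
+	 * Create I/O SQ layout recap (NVMe spec): CDW10[15:0] is the QID and
+	 * CDW10[31:16] the zero-based queue size; CDW11[0] is the physically
+	 * contiguous flag checked above and CDW11[31:16] the CQ to bind to.
+	 * For example, a 256-entry SQ 1 bound to CQ 1 arrives as
+	 * cdw10 = 0x00ff0001, cdw11 = 0x00010001.
+	 */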
sq->sqid = qid; + sq->cqid = cqid; + sq->size = qsize + 1; + sq->sq_head = sq->sq_tail = 0; + requested_len = sizeof(struct spdk_nvme_cmd) * sq->size; + sq->sq_cmd = vhost_gpa_to_vva(nvme->vsession, dma_addr, requested_len); + if (!sq->sq_cmd) { + return -1; + } + nvme->num_sqs++; + sq->valid = true; + if (nvme->bar) { + nvme->bar_db[sq_offset(qid, 1)] = 0; + } + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_sq *sq; + + qid = cmd->cdw10 & 0xffff; + sq = vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq) { + return -1; + } + + /* We didn't see scenarios when deleting submission + * queue while I/O is running against the submisson + * queue for now, otherwise, we must ensure the poller + * will not run with this submission queue. + */ + nvme->num_sqs--; + sq->valid = false; + + memset(sq, 0, sizeof(*sq)); + sq->sq_cmd = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + + return 0; +} + +static int +vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qsize, qid; + uint64_t dma_addr; + struct spdk_vhost_nvme_cq *cq; + uint64_t requested_len; + + /* physical contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid); + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return -1; + } + cq->cqid = qid; + cq->size = qsize + 1; + cq->phase = 1; + cq->irq_enabled = (cmd->cdw11 >> 1) & 0x1; + /* Setup virq through vhost messages */ + cq->virq = -1; + cq->cq_head = 0; + cq->guest_signaled_cq_head = 0; + cq->need_signaled_cnt = 0; + requested_len = sizeof(struct spdk_nvme_cpl) * cq->size; + cq->cq_cqe = vhost_gpa_to_vva(nvme->vsession, dma_addr, requested_len); + if (!cq->cq_cqe) { + return -1; + } + nvme->num_cqs++; + cq->valid = true; + if (nvme->bar) { + nvme->bar_db[cq_offset(qid, 1)] = 0; + } + STAILQ_INIT(&cq->cq_full_waited_tasks); + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_cq *cq; + + qid = cmd->cdw10 & 0xffff; + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + nvme->num_cqs--; + cq->valid = false; + + memset(cq, 0, sizeof(*cq)); + cq->cq_cqe = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static struct spdk_vhost_nvme_dev * +vhost_nvme_get_by_name(int vid) +{ + struct spdk_vhost_nvme_dev *nvme; + struct spdk_vhost_dev *vdev; + struct spdk_vhost_session *vsession; + + TAILQ_FOREACH(nvme, &g_nvme_ctrlrs, tailq) { + vdev = &nvme->vdev; + TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) { + if (vsession->vid == vid) { + return nvme; + } + } + } + + return NULL; +} + +int +vhost_nvme_get_cap(int vid, uint64_t *cap) +{ + struct spdk_vhost_nvme_dev *nvme; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + *cap = nvme->cap.raw; + return 0; +} + +int +vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf) +{ + 
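+	/*
+	 * Guest admin commands reach the target through this vhost callback
+	 * rather than through an emulated admin submission queue; they are
+	 * dispatched by opcode below. Note that NUMBER_OF_QUEUES is reported
+	 * zero-based in both halves of CDW0, e.g. num_io_queues == 4 yields
+	 * cdw0 = 0x00030003.
+	 */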
struct spdk_nvme_cmd *req = (struct spdk_nvme_cmd *)cmd; + struct spdk_nvme_cpl *cpl = (struct spdk_nvme_cpl *)cqe; + struct spdk_vhost_nvme_ns *ns; + int ret = 0; + struct spdk_vhost_nvme_dev *nvme; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Admin Command Opcode %u\n", req->opc); + switch (req->opc) { + case SPDK_NVME_OPC_IDENTIFY: + if (req->cdw10 == SPDK_NVME_IDENTIFY_CTRLR) { + memcpy(buf, &nvme->cdata, sizeof(struct spdk_nvme_ctrlr_data)); + + } else if (req->cdw10 == SPDK_NVME_IDENTIFY_NS) { + ns = vhost_nvme_get_ns_from_nsid(nvme, req->nsid); + if (!ns) { + cpl->status.sc = SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE; + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + break; + } + memcpy(buf, &ns->nsdata, sizeof(struct spdk_nvme_ns_data)); + } + /* successfully */ + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + case SPDK_NVME_OPC_CREATE_IO_CQ: + ret = vhost_nvme_create_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_CQ: + ret = vhost_nvme_delete_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_CREATE_IO_SQ: + ret = vhost_nvme_create_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_SQ: + ret = vhost_nvme_delete_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_GET_FEATURES: + case SPDK_NVME_OPC_SET_FEATURES: + if (req->cdw10 == SPDK_NVME_FEAT_NUMBER_OF_QUEUES) { + cpl->status.sc = 0; + cpl->status.sct = 0; + cpl->cdw0 = (nvme->num_io_queues - 1) | ((nvme->num_io_queues - 1) << 16); + } else { + cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD; + cpl->status.sct = SPDK_NVME_SCT_GENERIC; + } + break; + case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: + ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl); + break; + case SPDK_NVME_OPC_ABORT: + /* TODO: ABORT failed fow now */ + cpl->cdw0 = 1; + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + } + + if (ret) { + SPDK_ERRLOG("Admin Passthrough Failed with %u\n", req->opc); + } + + return 0; +} + +int +vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size) +{ + struct spdk_vhost_nvme_dev *nvme; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + nvme->bar = (volatile uint32_t *)(uintptr_t)(bar_addr); + /* BAR0 SQ/CQ doorbell registers start from offset 0x1000 */ + nvme->bar_db = (volatile uint32_t *)(uintptr_t)(bar_addr + 0x1000ull); + nvme->bar_size = bar_size; + + return 0; +} + +int +vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd) +{ + struct spdk_vhost_nvme_dev *nvme; + struct spdk_vhost_nvme_cq *cq; + + nvme = vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + cq = vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + if (cq->irq_enabled) { + cq->virq = fd; + } else { + SPDK_ERRLOG("NVMe Qid %d Disabled IRQ\n", qid); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_task *task; + + while (!STAILQ_EMPTY(&nvme->free_tasks)) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + spdk_free(task); + } +} + +static int +alloc_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + uint32_t entries, i; + struct spdk_vhost_nvme_task *task; + + entries = nvme->num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED; + + for (i = 0; i < entries; i++) { + task = spdk_zmalloc(sizeof(struct spdk_vhost_nvme_task), + SPDK_CACHE_LINE_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (task == NULL) { + SPDK_ERRLOG("Controller %s alloc task pool failed\n", + nvme->vdev.name); + 
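+			/* roll back the tasks allocated so far before failing */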
free_task_pool(nvme); + return -1; + } + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); + } + + return 0; +} + +static int +vhost_nvme_start_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + int rc = 0; + + if (nvme == NULL) { + rc = -1; + goto out; + } + + rc = alloc_task_pool(nvme); + if (rc) { + goto out; + } + + SPDK_NOTICELOG("Start Device %u, Path %s, lcore %d\n", vsession->vid, + vdev->path, spdk_env_get_current_core()); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + ns_dev->bdev_io_channel = spdk_bdev_get_io_channel(ns_dev->bdev_desc); + if (!ns_dev->bdev_io_channel) { + rc = -1; + goto out; + } + } + + nvme->vsession = vsession; + /* Start the NVMe Poller */ + nvme->requestq_poller = SPDK_POLLER_REGISTER(nvme_worker, nvme, 0); + +out: + vhost_session_start_done(vsession, rc); + return rc; +} + +static int +vhost_nvme_start(struct spdk_vhost_session *vsession) +{ + if (vsession->vdev->active_session_num > 0) { + /* We're trying to start a second session */ + SPDK_ERRLOG("Vhost-NVMe devices can support only one simultaneous connection.\n"); + return -1; + } + + return vhost_session_send_event(vsession, vhost_nvme_start_cb, + 3, "start session"); +} + +static void +vhost_nvme_deactive_ns(struct spdk_vhost_nvme_ns *ns) +{ + ns->active_ns = 0; + spdk_bdev_close(ns->bdev_desc); + ns->bdev_desc = NULL; + ns->bdev = NULL; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_nvme_ns *ns = remove_ctx; + + SPDK_NOTICELOG("Removing NS %u, Block Device %s\n", + ns->nsid, spdk_bdev_get_name(ns->bdev)); + + vhost_nvme_deactive_ns(ns); +} + +static int +destroy_device_poller_cb(void *arg) +{ + struct spdk_vhost_nvme_dev *nvme = arg; + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Destroy device poller callback\n"); + + /* FIXME wait for pending I/Os to complete */ + + if (spdk_vhost_trylock() != 0) { + return SPDK_POLLER_BUSY; + } + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (ns_dev->bdev_io_channel) { + spdk_put_io_channel(ns_dev->bdev_io_channel); + ns_dev->bdev_io_channel = NULL; + } + } + /* Clear BAR space */ + if (nvme->bar) { + memset((void *)nvme->bar, 0, nvme->bar_size); + } + nvme->num_sqs = 0; + nvme->num_cqs = 0; + nvme->dbbuf_dbs = NULL; + nvme->dbbuf_eis = NULL; + nvme->dataplane_started = false; + + spdk_poller_unregister(&nvme->stop_poller); + vhost_session_stop_done(nvme->vsession, 0); + + spdk_vhost_unlock(); + return SPDK_POLLER_BUSY; +} + +static int +vhost_nvme_stop_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + + if (nvme == NULL) { + vhost_session_stop_done(vsession, -1); + return -1; + } + + free_task_pool(nvme); + SPDK_NOTICELOG("Stopping Device %u, Path %s\n", vsession->vid, vdev->path); + + spdk_poller_unregister(&nvme->requestq_poller); + nvme->stop_poller = SPDK_POLLER_REGISTER(destroy_device_poller_cb, nvme, 1000); + + return 0; +} + +static int +vhost_nvme_stop(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_nvme_stop_cb, + 3, "start session"); +} + +static void +vhost_nvme_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme 
== NULL) { + return; + } + + spdk_json_write_named_array_begin(w, "namespaces"); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_uint32(w, "nsid", ns_dev->nsid); + spdk_json_write_named_string(w, "bdev", spdk_bdev_get_name(ns_dev->bdev)); + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +static void +vhost_nvme_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme == NULL) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_create_nvme_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name); + spdk_json_write_named_uint32(w, "io_queues", nvme->num_io_queues); + spdk_json_write_named_string(w, "cpumask", + spdk_cpuset_fmt(spdk_thread_get_cpumask(nvme->vdev.thread))); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_nvme_controller_add_ns"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name); + spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(ns_dev->bdev)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } +} + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend = { + .session_ctx_size = 0, + .start_session = vhost_nvme_start, + .stop_session = vhost_nvme_stop, + .dump_info_json = vhost_nvme_dump_info_json, + .write_config_json = vhost_nvme_write_config_json, + .remove_device = vhost_nvme_dev_remove, +}; + +static int +vhost_nvme_ns_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + struct spdk_nvme_ns_data *nsdata; + uint64_t num_blocks; + uint32_t i; + + /* Identify Namespace */ + cdata->nn = dev->num_ns; + for (i = 0; i < dev->num_ns; i++) { + nsdata = &dev->ns[i].nsdata; + if (dev->ns[i].active_ns) { + num_blocks = spdk_bdev_get_num_blocks(dev->ns[i].bdev); + nsdata->nsze = num_blocks; + /* ncap must be non-zero for active Namespace */ + nsdata->ncap = num_blocks; + nsdata->nuse = num_blocks; + nsdata->nlbaf = 0; + nsdata->flbas.format = 0; + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(dev->ns[i].bdev)); + nsdata->noiob = spdk_bdev_get_optimal_io_boundary(dev->ns[i].bdev); + dev->ns[i].block_size = spdk_bdev_get_block_size(dev->ns[i].bdev); + dev->ns[i].capacity = num_blocks * dev->ns[i].block_size; + } else { + memset(nsdata, 0, sizeof(*nsdata)); + } + } + return 0; +} + +static int +vhost_nvme_ctrlr_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + char sn[20]; + + /* Controller Capabilities */ + dev->cap.bits.cqr = 1; + dev->cap.bits.to = 1; + dev->cap.bits.dstrd = 0; + dev->cap.bits.css = SPDK_NVME_CAP_CSS_NVM; + dev->cap.bits.mpsmin = 0; + dev->cap.bits.mpsmax = 0; + /* MQES is 0 based value */ + dev->cap.bits.mqes = MAX_QUEUE_ENTRIES_SUPPORTED - 1; + + /* Controller Configuration */ + dev->cc.bits.en = 0; + + /* Controller Status */ + dev->csts.bits.rdy = 0; + + /* Identify Controller */ + 
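+	/*
+	 * Values that are easy to misread: MDTS is in units of the minimum
+	 * page size, so with CAP.MPSMIN = 0 (4 KiB) an MDTS of 5 allows
+	 * 2^5 * 4 KiB = 128 KiB per transfer, and SQES/CQES are log2 of the
+	 * entry sizes (2^6 = 64-byte SQ entries, 2^4 = 16-byte CQ entries).
+	 */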
spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' '); + cdata->vid = 0x8086; + cdata->ssvid = 0x8086; + spdk_strcpy_pad(cdata->mn, "SPDK Virtual NVMe Controller", sizeof(cdata->mn), ' '); + snprintf(sn, sizeof(sn), "NVMe_%s", dev->vdev.name); + spdk_strcpy_pad(cdata->sn, sn, sizeof(cdata->sn), ' '); + cdata->ieee[0] = 0xe4; + cdata->ieee[1] = 0xd2; + cdata->ieee[2] = 0x5c; + cdata->ver.bits.mjr = 1; + cdata->ver.bits.mnr = 0; + cdata->mdts = 5; /* 128 KiB */ + cdata->rab = 6; + cdata->sqes.min = 6; + cdata->sqes.max = 6; + cdata->cqes.min = 4; + cdata->cqes.max = 4; + cdata->oncs.dsm = 1; + /* Emulated NVMe controller */ + cdata->oacs.doorbell_buffer_config = 1; + + vhost_nvme_ns_identify_update(dev); + + return 0; +} + +int +vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t num_io_queues) +{ + struct spdk_vhost_nvme_dev *dev; + int rc; + + if (posix_memalign((void **)&dev, SPDK_CACHE_LINE_SIZE, sizeof(*dev))) { + return -ENOMEM; + } + memset(dev, 0, sizeof(*dev)); + + if (num_io_queues < 1 || num_io_queues > MAX_IO_QUEUES) { + free(dev); + return -EINVAL; + } + + spdk_vhost_lock(); + rc = vhost_dev_register(&dev->vdev, name, cpumask, + &spdk_vhost_nvme_device_backend); + + if (rc) { + free(dev); + spdk_vhost_unlock(); + return rc; + } + + dev->num_io_queues = num_io_queues; + STAILQ_INIT(&dev->free_tasks); + TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, dev, tailq); + + vhost_nvme_ctrlr_identify_update(dev); + + SPDK_NOTICELOG("Controller %s: Constructed\n", name); + spdk_vhost_unlock(); + return rc; +} + +int +vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns; + int rc; + uint32_t i; + + if (nvme == NULL) { + return -EINVAL; + } + + TAILQ_REMOVE(&g_nvme_ctrlrs, nvme, tailq); + for (i = 0; i < nvme->num_ns; i++) { + ns = &nvme->ns[i]; + if (ns->active_ns) { + vhost_nvme_deactive_ns(ns); + } + } + + rc = vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + + free(nvme); + return 0; +} + +int +vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, const char *bdev_name) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns; + struct spdk_bdev *bdev; + int rc = -1; + + if (nvme == NULL) { + return -ENODEV; + } + + if (nvme->num_ns == MAX_NAMESPACE) { + SPDK_ERRLOG("Can't support %d Namespaces\n", nvme->num_ns); + return -ENOSPC; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("could not find bdev %s\n", bdev_name); + return -ENODEV; + } + + ns = &nvme->ns[nvme->num_ns]; + rc = spdk_bdev_open(bdev, true, bdev_remove_cb, ns, &nvme->ns[nvme->num_ns].bdev_desc); + if (rc != 0) { + SPDK_ERRLOG("Could not open bdev '%s', error=%d\n", + bdev_name, rc); + return rc; + } + + nvme->ns[nvme->num_ns].bdev = bdev; + nvme->ns[nvme->num_ns].active_ns = 1; + nvme->ns[nvme->num_ns].nsid = nvme->num_ns + 1; + nvme->num_ns++; + + vhost_nvme_ns_identify_update(nvme); + + return rc; +} + +int +vhost_nvme_controller_construct(void) +{ + struct spdk_conf_section *sp; + const char *name; + const char *bdev_name; + const char *cpumask; + int rc, i = 0; + struct spdk_vhost_dev *vdev; + uint32_t ctrlr_num, io_queues; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostNvme")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostNvme%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + 
spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostNvme%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + rc = spdk_conf_section_get_intval(sp, "NumberOfQueues"); + if (rc > 0) { + io_queues = rc; + } else { + io_queues = 1; + } + + rc = vhost_nvme_dev_construct(name, cpumask, io_queues); + if (rc < 0) { + SPDK_ERRLOG("VhostNvme%u: Construct failed\n", ctrlr_num); + return -1; + } + + vdev = spdk_vhost_dev_find(name); + if (!vdev) { + return -1; + } + + for (i = 0; spdk_conf_section_get_nval(sp, "Namespace", i) != NULL; i++) { + bdev_name = spdk_conf_section_get_nmval(sp, "Namespace", i, 0); + if (!bdev_name) { + SPDK_ERRLOG("namespace configuration missing bdev name\n"); + break; + } + rc = vhost_nvme_dev_add_ns(vdev, bdev_name); + if (rc < 0) { + SPDK_WARNLOG("VhostNvme%u: Construct Namespace with %s failed\n", + ctrlr_num, bdev_name); + break; + } + } + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_nvme", SPDK_LOG_VHOST_NVME) diff --git a/src/spdk/lib/vhost/vhost_rpc.c b/src/spdk/lib/vhost/vhost_rpc.c new file mode 100644 index 000000000..196d75918 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_rpc.c @@ -0,0 +1,652 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/env.h" + +#include "spdk/scsi.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" +#include "spdk/bdev.h" + +struct rpc_vhost_scsi_ctrlr { + char *ctrlr; + char *cpumask; +}; + +static void +free_rpc_vhost_scsi_ctrlr(struct rpc_vhost_scsi_ctrlr *req) +{ + free(req->ctrlr); + free(req->cpumask); +} + +static const struct spdk_json_object_decoder rpc_vhost_create_scsi_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr, ctrlr), spdk_json_decode_string }, + {"cpumask", offsetof(struct rpc_vhost_scsi_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +rpc_vhost_create_scsi_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_scsi_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_create_scsi_ctrlr, + SPDK_COUNTOF(rpc_vhost_create_scsi_ctrlr), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_construct(req.ctrlr, req.cpumask); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_scsi_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_scsi_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_create_scsi_controller", rpc_vhost_create_scsi_controller, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_scsi_controller, construct_vhost_scsi_controller) + +struct rpc_vhost_scsi_ctrlr_add_target { + char *ctrlr; + int32_t scsi_target_num; + char *bdev_name; +}; + +static void +free_rpc_vhost_scsi_ctrlr_add_target(struct rpc_vhost_scsi_ctrlr_add_target *req) +{ + free(req->ctrlr); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder rpc_vhost_scsi_ctrlr_add_target[] = { + {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, ctrlr), spdk_json_decode_string }, + {"scsi_target_num", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, scsi_target_num), spdk_json_decode_int32}, + {"bdev_name", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, bdev_name), spdk_json_decode_string }, +}; + +static void +rpc_vhost_scsi_controller_add_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_scsi_ctrlr_add_target req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_scsi_ctrlr_add_target, + SPDK_COUNTOF(rpc_vhost_scsi_ctrlr_add_target), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_add_tgt(vdev, req.scsi_target_num, req.bdev_name); + spdk_vhost_unlock(); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_scsi_ctrlr_add_target(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_int32(w, rc); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_scsi_ctrlr_add_target(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + 
spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_scsi_controller_add_target", rpc_vhost_scsi_controller_add_target, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_scsi_controller_add_target, add_vhost_scsi_lun) + +struct rpc_remove_vhost_scsi_ctrlr_target { + char *ctrlr; + uint32_t scsi_target_num; +}; + +static void +free_rpc_remove_vhost_scsi_ctrlr_target(struct rpc_remove_vhost_scsi_ctrlr_target *req) +{ + free(req->ctrlr); +} + +static const struct spdk_json_object_decoder rpc_vhost_remove_target[] = { + {"ctrlr", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, ctrlr), spdk_json_decode_string }, + {"scsi_target_num", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, scsi_target_num), spdk_json_decode_uint32}, +}; + +static int +rpc_vhost_scsi_controller_remove_target_finish_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct spdk_jsonrpc_request *request = arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return 0; +} + +static void +rpc_vhost_scsi_controller_remove_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_vhost_scsi_ctrlr_target req = {0}; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_remove_target, + SPDK_COUNTOF(rpc_vhost_remove_target), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_remove_tgt(vdev, req.scsi_target_num, + rpc_vhost_scsi_controller_remove_target_finish_cb, + request); + spdk_vhost_unlock(); + if (rc < 0) { + goto invalid; + } + + free_rpc_remove_vhost_scsi_ctrlr_target(&req); + return; + +invalid: + free_rpc_remove_vhost_scsi_ctrlr_target(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} + +SPDK_RPC_REGISTER("vhost_scsi_controller_remove_target", + rpc_vhost_scsi_controller_remove_target, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_scsi_controller_remove_target, remove_vhost_scsi_target) + +struct rpc_vhost_blk_ctrlr { + char *ctrlr; + char *dev_name; + char *cpumask; + bool readonly; + bool packed_ring; +}; + +static const struct spdk_json_object_decoder rpc_construct_vhost_blk_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_blk_ctrlr, ctrlr), spdk_json_decode_string }, + {"dev_name", offsetof(struct rpc_vhost_blk_ctrlr, dev_name), spdk_json_decode_string }, + {"cpumask", offsetof(struct rpc_vhost_blk_ctrlr, cpumask), spdk_json_decode_string, true}, + {"readonly", offsetof(struct rpc_vhost_blk_ctrlr, readonly), spdk_json_decode_bool, true}, + {"packed_ring", offsetof(struct rpc_vhost_blk_ctrlr, packed_ring), spdk_json_decode_bool, true}, +}; + +static void +free_rpc_vhost_blk_ctrlr(struct rpc_vhost_blk_ctrlr *req) +{ + free(req->ctrlr); + free(req->dev_name); + free(req->cpumask); +} + +static void +rpc_vhost_create_blk_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_blk_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_blk_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_blk_ctrlr), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + 
goto invalid; + } + + rc = spdk_vhost_blk_construct(req.ctrlr, req.cpumask, req.dev_name, + req.readonly, req.packed_ring); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_blk_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_blk_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("vhost_create_blk_controller", rpc_vhost_create_blk_controller, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_blk_controller, construct_vhost_blk_controller) + +struct rpc_delete_vhost_ctrlr { + char *ctrlr; +}; + +static const struct spdk_json_object_decoder rpc_delete_vhost_ctrlr_decoder[] = { + {"ctrlr", offsetof(struct rpc_delete_vhost_ctrlr, ctrlr), spdk_json_decode_string }, +}; + +static void +free_rpc_delete_vhost_ctrlr(struct rpc_delete_vhost_ctrlr *req) +{ + free(req->ctrlr); +} + +static void +rpc_vhost_delete_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_vhost_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_vhost_ctrlr_decoder, + SPDK_COUNTOF(rpc_delete_vhost_ctrlr_decoder), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_dev_remove(vdev); + spdk_vhost_unlock(); + if (rc < 0) { + goto invalid; + } + + free_rpc_delete_vhost_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + + return; + +invalid: + free_rpc_delete_vhost_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("vhost_delete_controller", rpc_vhost_delete_controller, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_delete_controller, remove_vhost_controller) + +struct rpc_get_vhost_ctrlrs { + char *name; +}; + +static void +_rpc_get_vhost_controller(struct spdk_json_write_ctx *w, struct spdk_vhost_dev *vdev) +{ + uint32_t delay_base_us, iops_threshold; + + spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold); + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "ctrlr", spdk_vhost_dev_get_name(vdev)); + spdk_json_write_named_string_fmt(w, "cpumask", "0x%s", + spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); + spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us); + spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold); + spdk_json_write_named_string(w, "socket", vdev->path); + + spdk_json_write_named_object_begin(w, "backend_specific"); + vhost_dump_info_json(vdev, w); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_json_object_decoder rpc_get_vhost_ctrlrs_decoders[] = { + {"name", offsetof(struct rpc_get_vhost_ctrlrs, name), spdk_json_decode_string, true}, +}; + +static void +free_rpc_get_vhost_ctrlrs(struct rpc_get_vhost_ctrlrs *req) +{ + free(req->name); +} + +static void +rpc_vhost_get_controllers(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_vhost_ctrlrs req = {0}; + struct 
spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (params && spdk_json_decode_object(params, rpc_get_vhost_ctrlrs_decoders, + SPDK_COUNTOF(rpc_get_vhost_ctrlrs_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + if (req.name != NULL) { + vdev = spdk_vhost_dev_find(req.name); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + free_rpc_get_vhost_ctrlrs(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + _rpc_get_vhost_controller(w, vdev); + spdk_vhost_unlock(); + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + return; + } + + free_rpc_get_vhost_ctrlrs(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + vdev = spdk_vhost_dev_next(NULL); + while (vdev != NULL) { + _rpc_get_vhost_controller(w, vdev); + vdev = spdk_vhost_dev_next(vdev); + } + spdk_vhost_unlock(); + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_get_vhost_ctrlrs(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_get_controllers", rpc_vhost_get_controllers, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_get_controllers, get_vhost_controllers) + + +struct rpc_vhost_ctrlr_coalescing { + char *ctrlr; + uint32_t delay_base_us; + uint32_t iops_threshold; +}; + +static const struct spdk_json_object_decoder rpc_set_vhost_ctrlr_coalescing[] = { + {"ctrlr", offsetof(struct rpc_vhost_ctrlr_coalescing, ctrlr), spdk_json_decode_string }, + {"delay_base_us", offsetof(struct rpc_vhost_ctrlr_coalescing, delay_base_us), spdk_json_decode_uint32}, + {"iops_threshold", offsetof(struct rpc_vhost_ctrlr_coalescing, iops_threshold), spdk_json_decode_uint32}, +}; + +static void +free_rpc_set_vhost_controllers_event_coalescing(struct rpc_vhost_ctrlr_coalescing *req) +{ + free(req->ctrlr); +} + +static void +rpc_vhost_controller_set_coalescing(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_ctrlr_coalescing req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_set_vhost_ctrlr_coalescing, + SPDK_COUNTOF(rpc_set_vhost_ctrlr_coalescing), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_set_coalescing(vdev, req.delay_base_us, req.iops_threshold); + spdk_vhost_unlock(); + if (rc) { + goto invalid; + } + + free_rpc_set_vhost_controllers_event_coalescing(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + + return; + +invalid: + free_rpc_set_vhost_controllers_event_coalescing(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_controller_set_coalescing", rpc_vhost_controller_set_coalescing, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_controller_set_coalescing, set_vhost_controller_coalescing) + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + +struct rpc_vhost_nvme_ctrlr { + char *ctrlr; + uint32_t io_queues; + char *cpumask; +}; + +static const struct 
spdk_json_object_decoder rpc_construct_vhost_nvme_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr, ctrlr), spdk_json_decode_string }, + {"io_queues", offsetof(struct rpc_vhost_nvme_ctrlr, io_queues), spdk_json_decode_uint32}, + {"cpumask", offsetof(struct rpc_vhost_nvme_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +free_rpc_vhost_nvme_ctrlr(struct rpc_vhost_nvme_ctrlr *req) +{ + free(req->ctrlr); + free(req->cpumask); +} + +static void +rpc_vhost_create_nvme_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_nvme_ctrlr req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_nvme_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_nvme_ctrlr), + &req)) { + rc = -EINVAL; + goto invalid; + } + + rc = vhost_nvme_dev_construct(req.ctrlr, req.cpumask, req.io_queues); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_nvme_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_nvme_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("vhost_create_nvme_controller", rpc_vhost_create_nvme_controller, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_nvme_controller, construct_vhost_nvme_controller) + +struct rpc_vhost_nvme_ctrlr_add_ns { + char *ctrlr; + char *bdev_name; +}; + +static void +free_rpc_vhost_nvme_ctrlr_add_ns(struct rpc_vhost_nvme_ctrlr_add_ns *req) +{ + free(req->ctrlr); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder rpc_vhost_nvme_add_ns[] = { + {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr_add_ns, ctrlr), spdk_json_decode_string }, + {"bdev_name", offsetof(struct rpc_vhost_nvme_ctrlr_add_ns, bdev_name), spdk_json_decode_string }, +}; + +static void +rpc_vhost_nvme_controller_add_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_nvme_ctrlr_add_ns req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_vhost_dev *vdev; + int rc; + + if (spdk_json_decode_object(params, rpc_vhost_nvme_add_ns, + SPDK_COUNTOF(rpc_vhost_nvme_add_ns), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_lock(); + vdev = spdk_vhost_dev_find(req.ctrlr); + if (vdev == NULL) { + spdk_vhost_unlock(); + rc = -ENODEV; + goto invalid; + } + + rc = vhost_nvme_dev_add_ns(vdev, req.bdev_name); + spdk_vhost_unlock(); + if (rc < 0) { + goto invalid; + } + free_rpc_vhost_nvme_ctrlr_add_ns(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_nvme_ctrlr_add_ns(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("vhost_nvme_controller_add_ns", rpc_vhost_nvme_controller_add_ns, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_nvme_controller_add_ns, add_vhost_nvme_ns) + +#endif /* SPDK_CONFIG_VHOST_INTERNAL_LIB */ + +SPDK_LOG_REGISTER_COMPONENT("vhost_rpc", SPDK_LOG_VHOST_RPC) diff --git a/src/spdk/lib/vhost/vhost_scsi.c b/src/spdk/lib/vhost/vhost_scsi.c new file mode 100644 index 000000000..49e49dc76 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_scsi.c @@ -0,0 +1,1536 @@ +/*- + * BSD LICENSE + * + * 
Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <linux/virtio_scsi.h> + +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/scsi.h" +#include "spdk/scsi_spec.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/likely.h" + +#include "spdk/vhost.h" +#include "vhost_internal.h" + +/* Features supported by SPDK VHOST lib. */ +#define SPDK_VHOST_SCSI_FEATURES (SPDK_VHOST_FEATURES | \ + (1ULL << VIRTIO_SCSI_F_INOUT) | \ + (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \ + (1ULL << VIRTIO_SCSI_F_CHANGE ) | \ + (1ULL << VIRTIO_SCSI_F_T10_PI )) + +/* Features that are specified in VIRTIO SCSI but currently not supported: + * - Live migration not supported yet + * - T10 PI + */ +#define SPDK_VHOST_SCSI_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ + (1ULL << VIRTIO_SCSI_F_T10_PI )) + +#define MGMT_POLL_PERIOD_US (1000 * 5) + +#define VIRTIO_SCSI_CONTROLQ 0 +#define VIRTIO_SCSI_EVENTQ 1 +#define VIRTIO_SCSI_REQUESTQ 2 + +enum spdk_scsi_dev_vhost_status { + /* Target ID is empty. */ + VHOST_SCSI_DEV_EMPTY, + + /* Target is still being added. */ + VHOST_SCSI_DEV_ADDING, + + /* Target ID occupied. */ + VHOST_SCSI_DEV_PRESENT, + + /* Target ID is occupied but removal is in progress. */ + VHOST_SCSI_DEV_REMOVING, + + /* In session - device (SCSI target) seen but removed. 
*/ + VHOST_SCSI_DEV_REMOVED, +}; + +/** Context for a SCSI target in a vhost device */ +struct spdk_scsi_dev_vhost_state { + struct spdk_scsi_dev *dev; + enum spdk_scsi_dev_vhost_status status; + spdk_vhost_event_fn remove_cb; + void *remove_ctx; +}; + +struct spdk_vhost_scsi_dev { + int ref; + bool registered; + struct spdk_vhost_dev vdev; + struct spdk_scsi_dev_vhost_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; +}; + +/** Context for a SCSI target in a vhost session */ +struct spdk_scsi_dev_session_state { + struct spdk_scsi_dev *dev; + enum spdk_scsi_dev_vhost_status status; +}; + +struct spdk_vhost_scsi_session { + struct spdk_vhost_session vsession; + + struct spdk_vhost_scsi_dev *svdev; + /** Local copy of the device state */ + struct spdk_scsi_dev_session_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; + struct spdk_poller *requestq_poller; + struct spdk_poller *mgmt_poller; + struct spdk_poller *stop_poller; +}; + +struct spdk_vhost_scsi_task { + struct spdk_scsi_task scsi; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; + + union { + struct virtio_scsi_cmd_resp *resp; + struct virtio_scsi_ctrl_tmf_resp *tmf_resp; + }; + + struct spdk_vhost_scsi_session *svsession; + struct spdk_scsi_dev *scsi_dev; + + /** Number of bytes that were written. */ + uint32_t used_len; + + int req_idx; + + /* If set, the task is currently used for I/O processing. */ + bool used; + + struct spdk_vhost_virtqueue *vq; +}; + +static int vhost_scsi_start(struct spdk_vhost_session *vsession); +static int vhost_scsi_stop(struct spdk_vhost_session *vsession); +static void vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, + struct spdk_json_write_ctx *w); +static void vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, + struct spdk_json_write_ctx *w); +static int vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev); + +static const struct spdk_vhost_dev_backend spdk_vhost_scsi_device_backend = { + .session_ctx_size = sizeof(struct spdk_vhost_scsi_session) - sizeof(struct spdk_vhost_session), + .start_session = vhost_scsi_start, + .stop_session = vhost_scsi_stop, + .dump_info_json = vhost_scsi_dump_info_json, + .write_config_json = vhost_scsi_write_config_json, + .remove_device = vhost_scsi_dev_remove, +}; + +static inline void +scsi_task_init(struct spdk_vhost_scsi_task *task) +{ + memset(&task->scsi, 0, sizeof(task->scsi)); + /* Tmf_resp pointer and resp pointer are in a union. + * Here means task->tmf_resp = task->resp = NULL. 
+ */ + task->resp = NULL; + task->used = true; + task->used_len = 0; +} + +static void +vhost_scsi_task_put(struct spdk_vhost_scsi_task *task) +{ + spdk_scsi_task_put(&task->scsi); +} + +static void +vhost_scsi_task_free_cb(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + struct spdk_vhost_session *vsession = &task->svsession->vsession; + + assert(vsession->task_cnt > 0); + vsession->task_cnt--; + task->used = false; +} + +static void +remove_scsi_tgt(struct spdk_vhost_scsi_dev *svdev, + unsigned scsi_tgt_num) +{ + struct spdk_scsi_dev_vhost_state *state; + struct spdk_scsi_dev *dev; + + state = &svdev->scsi_dev_state[scsi_tgt_num]; + dev = state->dev; + state->dev = NULL; + assert(state->status == VHOST_SCSI_DEV_REMOVING); + state->status = VHOST_SCSI_DEV_EMPTY; + spdk_scsi_dev_destruct(dev, NULL, NULL); + if (state->remove_cb) { + state->remove_cb(&svdev->vdev, state->remove_ctx); + state->remove_cb = NULL; + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: removed target 'Target %u'\n", + svdev->vdev.name, scsi_tgt_num); + + if (--svdev->ref == 0 && svdev->registered == false) { + free(svdev); + } +} + +static void +vhost_scsi_dev_process_removed_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev, + struct spdk_vhost_scsi_dev, vdev); + + /* all sessions have already detached the device */ + if (svdev->scsi_dev_state[scsi_tgt_num].status != VHOST_SCSI_DEV_REMOVING) { + /* device was already removed in the meantime */ + return; + } + + remove_scsi_tgt(svdev, scsi_tgt_num); +} + +static int +vhost_scsi_session_process_removed(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession; + struct spdk_scsi_dev_session_state *state = &svsession->scsi_dev_state[scsi_tgt_num]; + + if (state->dev != NULL) { + /* there's still a session that references this device, + * so abort our foreach chain here. 
We'll be called + * again from this session's management poller after it + * is removed in there + */ + return -1; + } + + return 0; +} + +static void +process_removed_devs(struct spdk_vhost_scsi_session *svsession) +{ + struct spdk_scsi_dev *dev; + struct spdk_scsi_dev_session_state *state; + int i; + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) { + state = &svsession->scsi_dev_state[i]; + dev = state->dev; + + if (dev && state->status == VHOST_SCSI_DEV_REMOVING && + !spdk_scsi_dev_has_pending_tasks(dev, NULL)) { + /* detach the device from this session */ + spdk_scsi_dev_free_io_channels(dev); + state->dev = NULL; + state->status = VHOST_SCSI_DEV_REMOVED; + /* try to detach it globally */ + spdk_vhost_lock(); + vhost_dev_foreach_session(&svsession->svdev->vdev, + vhost_scsi_session_process_removed, + vhost_scsi_dev_process_removed_cpl_cb, + (void *)(uintptr_t)i); + spdk_vhost_unlock(); + } + } +} + +static void +eventq_enqueue(struct spdk_vhost_scsi_session *svsession, unsigned scsi_dev_num, + uint32_t event, uint32_t reason) +{ + struct spdk_vhost_session *vsession = &svsession->vsession; + struct spdk_vhost_virtqueue *vq; + struct vring_desc *desc, *desc_table; + struct virtio_scsi_event *desc_ev; + uint32_t desc_table_size, req_size = 0; + uint16_t req; + int rc; + + assert(scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + vq = &vsession->virtqueue[VIRTIO_SCSI_EVENTQ]; + + if (vq->vring.desc == NULL || vhost_vq_avail_ring_get(vq, &req, 1) != 1) { + SPDK_ERRLOG("%s: failed to send virtio event (no avail ring entries?).\n", + vsession->name); + return; + } + + rc = vhost_vq_get_desc(vsession, vq, req, &desc, &desc_table, &desc_table_size); + if (rc != 0 || desc->len < sizeof(*desc_ev)) { + SPDK_ERRLOG("%s: invalid eventq descriptor at index %"PRIu16".\n", + vsession->name, req); + goto out; + } + + desc_ev = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*desc_ev)); + if (desc_ev == NULL) { + SPDK_ERRLOG("%s: eventq descriptor at index %"PRIu16" points " + "to unmapped guest memory address %p.\n", + vsession->name, req, (void *)(uintptr_t)desc->addr); + goto out; + } + + desc_ev->event = event; + desc_ev->lun[0] = 1; + desc_ev->lun[1] = scsi_dev_num; + /* virtio LUN id 0 can refer either to the entire device + * or actual LUN 0 (the only supported by vhost for now) + */ + desc_ev->lun[2] = 0 >> 8; + desc_ev->lun[3] = 0 & 0xFF; + /* virtio doesn't specify any strict format for LUN id (bytes 2 and 3) + * current implementation relies on linux kernel sources + */ + memset(&desc_ev->lun[4], 0, 4); + desc_ev->reason = reason; + req_size = sizeof(*desc_ev); + +out: + vhost_vq_used_ring_enqueue(vsession, vq, req, req_size); +} + +static void +submit_completion(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + + vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, + task->used_len); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Finished task (%p) req_idx=%d\n", task, task->req_idx); + + vhost_scsi_task_put(task); +} + +static void +vhost_scsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + submit_completion(task); +} + +static void +vhost_scsi_task_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + /* The SCSI task has completed. Do final processing and then post + notification to the virtqueue's "used" ring. 
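Illustrative aside (not part of the diff): the LUN bytes filled in by eventq_enqueue() above, and parsed again in vhost_scsi_task_init_target() below, follow the 8-byte convention the comments describe: byte 0 is always 1, byte 1 selects the SCSI target, bytes 2-3 carry the LUN id (only LUN 0 is used here) and bytes 4-7 are zero. The standalone sketch below mirrors what the code does rather than quoting the virtio specification; both helper names are invented.

#include <stdint.h>
#include <string.h>

static void
encode_virtio_scsi_lun(uint8_t lun[8], uint8_t target, uint16_t lun_id)
{
	lun[0] = 1;			/* first byte is always 1 */
	lun[1] = target;		/* second byte is the target number */
	lun[2] = (uint8_t)(lun_id >> 8);
	lun[3] = (uint8_t)(lun_id & 0xFF);
	memset(&lun[4], 0, 4);		/* remaining bytes unused */
}

static uint16_t
decode_virtio_scsi_lun_id(const uint8_t lun[8])
{
	/* Same 14-bit masking as vhost_scsi_task_init_target(). */
	return (uint16_t)((((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF);
}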
+ */ + task->resp->status = task->scsi.status; + + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + memcpy(task->resp->sense, task->scsi.sense_data, task->scsi.sense_data_len); + task->resp->sense_len = task->scsi.sense_data_len; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Task (%p) req_idx=%d failed - status=%u\n", task, task->req_idx, + task->scsi.status); + } + assert(task->scsi.transfer_len == task->scsi.length); + task->resp->resid = task->scsi.length - task->scsi.data_transferred; + + submit_completion(task); +} + +static void +task_submit(struct spdk_vhost_scsi_task *task) +{ + task->resp->response = VIRTIO_SCSI_S_OK; + spdk_scsi_dev_queue_task(task->scsi_dev, &task->scsi); +} + +static void +mgmt_task_submit(struct spdk_vhost_scsi_task *task, enum spdk_scsi_task_func func) +{ + task->tmf_resp->response = VIRTIO_SCSI_S_OK; + task->scsi.function = func; + spdk_scsi_dev_queue_mgmt_task(task->scsi_dev, &task->scsi); +} + +static void +invalid_request(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + + vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, + task->used_len); + vhost_scsi_task_put(task); + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Invalid request (status=%" PRIu8")\n", + task->resp ? task->resp->response : -1); +} + +static int +vhost_scsi_task_init_target(struct spdk_vhost_scsi_task *task, const __u8 *lun) +{ + struct spdk_vhost_scsi_session *svsession = task->svsession; + struct spdk_scsi_dev_session_state *state; + uint16_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF; + + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN", lun, 8); + + /* First byte must be 1 and second is target */ + if (lun[0] != 1 || lun[1] >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + return -1; + } + + state = &svsession->scsi_dev_state[lun[1]]; + task->scsi_dev = state->dev; + if (state->dev == NULL || state->status != VHOST_SCSI_DEV_PRESENT) { + /* If dev has been hotdetached, return 0 to allow sending + * additional hotremove event via sense codes. + */ + return state->status != VHOST_SCSI_DEV_EMPTY ? 
0 : -1; + } + + task->scsi.target_port = spdk_scsi_dev_find_port_by_id(task->scsi_dev, 0); + task->scsi.lun = spdk_scsi_dev_get_lun(state->dev, lun_id); + return 0; +} + +static void +process_ctrl_request(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + struct vring_desc *desc, *desc_table; + struct virtio_scsi_ctrl_tmf_req *ctrl_req; + struct virtio_scsi_ctrl_an_resp *an_resp; + uint32_t desc_table_size, used_len = 0; + int rc; + + spdk_scsi_task_construct(&task->scsi, vhost_scsi_task_mgmt_cpl, vhost_scsi_task_free_cb); + rc = vhost_vq_get_desc(vsession, task->vq, task->req_idx, &desc, &desc_table, + &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("%s: invalid controlq descriptor at index %d.\n", + vsession->name, task->req_idx); + goto out; + } + + ctrl_req = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*ctrl_req)); + if (ctrl_req == NULL) { + SPDK_ERRLOG("%s: invalid task management request at index %d.\n", + vsession->name, task->req_idx); + goto out; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, + "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx %d; kickfd %d; size %d\n", + task->req_idx, desc, (void *)desc->addr, desc->len, desc->flags, task->vq->last_used_idx, + task->vq->vring.kickfd, task->vq->vring.size); + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "Request descriptor", (uint8_t *)ctrl_req, desc->len); + + vhost_scsi_task_init_target(task, ctrl_req->lun); + + vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + if (spdk_unlikely(desc == NULL)) { + SPDK_ERRLOG("%s: no response descriptor for controlq request %d.\n", + vsession->name, task->req_idx); + goto out; + } + + /* Process the TMF request */ + switch (ctrl_req->type) { + case VIRTIO_SCSI_T_TMF: + task->tmf_resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->tmf_resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_tmf_resp) || task->tmf_resp == NULL)) { + SPDK_ERRLOG("%s: TMF response descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto out; + } + + /* Check if we are processing a valid request */ + if (task->scsi_dev == NULL) { + task->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET; + break; + } + + switch (ctrl_req->subtype) { + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + /* Handle LUN reset */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: LUN reset\n", vsession->name); + + mgmt_task_submit(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); + return; + default: + task->tmf_resp->response = VIRTIO_SCSI_S_ABORTED; + /* Unsupported command */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: unsupported TMF command %x\n", + vsession->name, ctrl_req->subtype); + break; + } + break; + case VIRTIO_SCSI_T_AN_QUERY: + case VIRTIO_SCSI_T_AN_SUBSCRIBE: { + an_resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*an_resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_an_resp) || an_resp == NULL)) { + SPDK_WARNLOG("%s: asynchronous response descriptor points to invalid guest memory region\n", + vsession->name); + goto out; + } + + an_resp->response = VIRTIO_SCSI_S_ABORTED; + break; + } + default: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: Unsupported control command %x\n", + vsession->name, ctrl_req->type); + break; + } + + used_len = sizeof(struct virtio_scsi_ctrl_tmf_resp); +out: + vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, used_len); + vhost_scsi_task_put(task); +} + +/* + * Process task's 
descriptor chain and setup data related fields. + * Return + * -1 if request is invalid and must be aborted, + * 0 if all data are set. + */ +static int +task_data_setup(struct spdk_vhost_scsi_task *task, + struct virtio_scsi_cmd_req **req) +{ + struct spdk_vhost_session *vsession = &task->svsession->vsession; + struct vring_desc *desc, *desc_table; + struct iovec *iovs = task->iovs; + uint16_t iovcnt = 0; + uint32_t desc_table_len, len = 0; + int rc; + + spdk_scsi_task_construct(&task->scsi, vhost_scsi_task_cpl, vhost_scsi_task_free_cb); + + rc = vhost_vq_get_desc(vsession, task->vq, task->req_idx, &desc, &desc_table, &desc_table_len); + /* First descriptor must be readable */ + if (spdk_unlikely(rc != 0 || vhost_vring_desc_is_wr(desc) || + desc->len < sizeof(struct virtio_scsi_cmd_req))) { + SPDK_WARNLOG("%s: invalid first request descriptor at index %"PRIu16".\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + *req = vhost_gpa_to_vva(vsession, desc->addr, sizeof(**req)); + if (spdk_unlikely(*req == NULL)) { + SPDK_WARNLOG("%s: request descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + /* Each request must have at least 2 descriptors (e.g. request and response) */ + vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (desc == NULL) { + SPDK_WARNLOG("%s: descriptor chain at index %d contains neither payload nor response buffer.\n", + vsession->name, task->req_idx); + goto invalid_task; + } + task->scsi.dxfer_dir = vhost_vring_desc_is_wr(desc) ? SPDK_SCSI_DIR_FROM_DEV : + SPDK_SCSI_DIR_TO_DEV; + task->scsi.iovs = iovs; + + if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { + /* + * FROM_DEV (READ): [RD_req][WR_resp][WR_buf0]...[WR_bufN] + */ + task->resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) { + SPDK_WARNLOG("%s: response descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto invalid_task; + } + rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(rc != 0)) { + SPDK_WARNLOG("%s: invalid descriptor chain at request index %d (descriptor id overflow?).\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + if (desc == NULL) { + /* + * TEST UNIT READY command and some others might not contain any payload and this is not an error. + */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, + "No payload descriptors for FROM DEV command req_idx=%"PRIu16".\n", task->req_idx); + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_DATA, "CDB=", (*req)->cdb, VIRTIO_SCSI_CDB_SIZE); + task->used_len = sizeof(struct virtio_scsi_cmd_resp); + task->scsi.iovcnt = 1; + task->scsi.iovs[0].iov_len = 0; + task->scsi.length = 0; + task->scsi.transfer_len = 0; + return 0; + } + + /* All remaining descriptors are data. 
*/ + while (desc) { + if (spdk_unlikely(!vhost_vring_desc_is_wr(desc))) { + SPDK_WARNLOG("%s: FROM DEV cmd: descriptor nr %" PRIu16" in payload chain is read only.\n", + vsession->name, iovcnt); + goto invalid_task; + } + + if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc))) { + goto invalid_task; + } + len += desc->len; + + rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(rc != 0)) { + SPDK_WARNLOG("%s: invalid payload in descriptor chain starting at index %d.\n", + vsession->name, task->req_idx); + goto invalid_task; + } + } + + task->used_len = sizeof(struct virtio_scsi_cmd_resp) + len; + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "TO DEV"); + /* + * TO_DEV (WRITE):[RD_req][RD_buf0]...[RD_bufN][WR_resp] + * No need to check descriptor WR flag as this is done while setting scsi.dxfer_dir. + */ + + /* Process descriptors up to response. */ + while (!vhost_vring_desc_is_wr(desc)) { + if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc))) { + goto invalid_task; + } + len += desc->len; + + vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(desc == NULL)) { + SPDK_WARNLOG("%s: TO_DEV cmd: no response descriptor.\n", vsession->name); + goto invalid_task; + } + } + + task->resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) { + SPDK_WARNLOG("%s: response descriptor at index %d points to invalid guest memory region\n", + vsession->name, task->req_idx); + goto invalid_task; + } + + task->used_len = sizeof(struct virtio_scsi_cmd_resp); + } + + task->scsi.iovcnt = iovcnt; + task->scsi.length = len; + task->scsi.transfer_len = len; + return 0; + +invalid_task: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "%s: Invalid task at index %"PRIu16".\n", + vsession->name, task->req_idx); + return -1; +} + +static int +process_request(struct spdk_vhost_scsi_task *task) +{ + struct virtio_scsi_cmd_req *req; + int result; + + result = task_data_setup(task, &req); + if (result) { + return result; + } + + result = vhost_scsi_task_init_target(task, req->lun); + if (spdk_unlikely(result != 0)) { + task->resp->response = VIRTIO_SCSI_S_BAD_TARGET; + return -1; + } + + task->scsi.cdb = req->cdb; + SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_DATA, "request CDB", req->cdb, VIRTIO_SCSI_CDB_SIZE); + + if (spdk_unlikely(task->scsi.lun == NULL)) { + spdk_scsi_task_process_null_lun(&task->scsi); + task->resp->response = VIRTIO_SCSI_S_OK; + return 1; + } + + return 0; +} + +static void +process_scsi_task(struct spdk_vhost_session *vsession, + struct spdk_vhost_virtqueue *vq, + uint16_t req_idx) +{ + struct spdk_vhost_scsi_task *task; + int result; + + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[req_idx]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + vsession->name, req_idx); + vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0); + return; + } + + vsession->task_cnt++; + scsi_task_init(task); + + if (spdk_unlikely(vq->vring_idx == VIRTIO_SCSI_CONTROLQ)) { + process_ctrl_request(task); + } else { + result = process_request(task); + if (likely(result == 0)) { + task_submit(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d submitted ======\n", task, + task->req_idx); + } else if (result > 0) { + vhost_scsi_task_cpl(&task->scsi); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d finished early ======\n", task, + 
task->req_idx); + } else { + invalid_request(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d failed ======\n", task, + task->req_idx); + } + } +} + +static void +process_vq(struct spdk_vhost_scsi_session *svsession, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_session *vsession = &svsession->vsession; + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + + reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + assert(reqs_cnt <= 32); + + for (i = 0; i < reqs_cnt; i++) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + vsession->name, reqs[i], vq->vring.size); + vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0); + continue; + } + + process_scsi_task(vsession, vq, reqs[i]); + } +} + +static int +vdev_mgmt_worker(void *arg) +{ + struct spdk_vhost_scsi_session *svsession = arg; + struct spdk_vhost_session *vsession = &svsession->vsession; + + process_removed_devs(svsession); + vhost_vq_used_signal(vsession, &vsession->virtqueue[VIRTIO_SCSI_EVENTQ]); + + process_vq(svsession, &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ]); + vhost_vq_used_signal(vsession, &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ]); + + return SPDK_POLLER_BUSY; +} + +static int +vdev_worker(void *arg) +{ + struct spdk_vhost_scsi_session *svsession = arg; + struct spdk_vhost_session *vsession = &svsession->vsession; + uint32_t q_idx; + + for (q_idx = VIRTIO_SCSI_REQUESTQ; q_idx < vsession->max_queues; q_idx++) { + process_vq(svsession, &vsession->virtqueue[q_idx]); + } + + vhost_session_used_signal(vsession); + + return SPDK_POLLER_BUSY; +} + +static struct spdk_vhost_scsi_dev * +to_scsi_dev(struct spdk_vhost_dev *ctrlr) +{ + if (ctrlr == NULL) { + return NULL; + } + + if (ctrlr->backend != &spdk_vhost_scsi_device_backend) { + SPDK_ERRLOG("%s: not a vhost-scsi device.\n", ctrlr->name); + return NULL; + } + + return SPDK_CONTAINEROF(ctrlr, struct spdk_vhost_scsi_dev, vdev); +} + +static struct spdk_vhost_scsi_session * +to_scsi_session(struct spdk_vhost_session *vsession) +{ + assert(vsession->vdev->backend == &spdk_vhost_scsi_device_backend); + return (struct spdk_vhost_scsi_session *)vsession; +} + +int +spdk_vhost_scsi_dev_construct(const char *name, const char *cpumask) +{ + struct spdk_vhost_scsi_dev *svdev = calloc(1, sizeof(*svdev)); + int rc; + + if (svdev == NULL) { + return -ENOMEM; + } + + svdev->vdev.virtio_features = SPDK_VHOST_SCSI_FEATURES; + svdev->vdev.disabled_features = SPDK_VHOST_SCSI_DISABLED_FEATURES; + + spdk_vhost_lock(); + rc = vhost_dev_register(&svdev->vdev, name, cpumask, + &spdk_vhost_scsi_device_backend); + + if (rc) { + free(svdev); + spdk_vhost_unlock(); + return rc; + } + + svdev->registered = true; + + spdk_vhost_unlock(); + return rc; +} + +static int +vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_scsi_dev *svdev = to_scsi_dev(vdev); + int rc, i; + + assert(svdev != NULL); + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) { + if (svdev->scsi_dev_state[i].dev) { + if (vdev->registered) { + SPDK_ERRLOG("%s: SCSI target %d is still present.\n", vdev->name, i); + return -EBUSY; + } + + rc = spdk_vhost_scsi_dev_remove_tgt(vdev, i, NULL, NULL); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to force-remove target %d\n", vdev->name, i); + return rc; + } + } + } + + rc = vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + 
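Illustrative aside (not part of the diff): task_data_setup() earlier in this file distinguishes the two descriptor-chain layouts its comments describe - FROM_DEV chains are [readable request][writable response][writable data...], TO_DEV chains are [readable request][readable data...][writable response]. The sketch below models the same classification over a plain array instead of a real vring; struct toy_desc and both helper functions are invented for illustration.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Stand-in for struct vring_desc: only the length and the device-writable
 * flag (VRING_DESC_F_WRITE) matter for the layout decision. */
struct toy_desc {
	uint32_t len;
	bool write;
};

/* FROM_DEV: [RD_req][WR_resp][WR_buf0]...[WR_bufN]. Sums the payload length
 * the way task->scsi.length is built; a chain with no data buffers is legal
 * (e.g. TEST UNIT READY). Returns -1 for a malformed chain. */
static int
from_dev_payload_len(const struct toy_desc *chain, size_t n, uint32_t *len)
{
	size_t i;

	if (n < 2 || chain[0].write || !chain[1].write) {
		return -1;
	}

	*len = 0;
	for (i = 2; i < n; i++) {
		if (!chain[i].write) {
			return -1;	/* data buffers must be writable */
		}
		*len += chain[i].len;
	}
	return 0;
}

/* TO_DEV: [RD_req][RD_buf0]...[RD_bufN][WR_resp] - readable data buffers,
 * closed by exactly one writable response descriptor. */
static int
to_dev_payload_len(const struct toy_desc *chain, size_t n, uint32_t *len)
{
	size_t i;

	if (n < 2 || chain[0].write) {
		return -1;
	}

	*len = 0;
	for (i = 1; i < n && !chain[i].write; i++) {
		*len += chain[i].len;
	}
	return (i == n - 1 && chain[i].write) ? 0 : -1;
}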
svdev->registered = false; + + if (svdev->ref == 0) { + free(svdev); + } + + return 0; +} + +struct spdk_scsi_dev * +spdk_vhost_scsi_dev_get_tgt(struct spdk_vhost_dev *vdev, uint8_t num) +{ + struct spdk_vhost_scsi_dev *svdev; + + assert(num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + svdev = to_scsi_dev(vdev); + assert(svdev != NULL); + if (svdev->scsi_dev_state[num].status != VHOST_SCSI_DEV_PRESENT) { + return NULL; + } + + assert(svdev->scsi_dev_state[num].dev != NULL); + return svdev->scsi_dev_state[num].dev; +} + +static void +vhost_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + const struct spdk_scsi_dev *scsi_dev; + unsigned scsi_dev_num; + + assert(lun != NULL); + assert(svdev != NULL); + scsi_dev = spdk_scsi_lun_get_dev(lun); + for (scsi_dev_num = 0; scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_dev_num++) { + if (svdev->scsi_dev_state[scsi_dev_num].dev == scsi_dev) { + break; + } + } + + if (scsi_dev_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + /* The entire device has been already removed. */ + return; + } + + /* remove entire device */ + spdk_vhost_scsi_dev_remove_tgt(&svdev->vdev, scsi_dev_num, NULL, NULL); +} + +static void +vhost_scsi_dev_add_tgt_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev, + struct spdk_vhost_scsi_dev, vdev); + struct spdk_scsi_dev_vhost_state *vhost_sdev; + + vhost_sdev = &svdev->scsi_dev_state[scsi_tgt_num]; + + /* All sessions have added the target */ + assert(vhost_sdev->status == VHOST_SCSI_DEV_ADDING); + vhost_sdev->status = VHOST_SCSI_DEV_PRESENT; + svdev->ref++; +} + +static int +vhost_scsi_session_add_tgt(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *ctx) +{ + unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx; + struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession; + struct spdk_scsi_dev_session_state *session_sdev = &svsession->scsi_dev_state[scsi_tgt_num]; + struct spdk_scsi_dev_vhost_state *vhost_sdev; + int rc; + + if (!vsession->started || session_sdev->dev != NULL) { + /* Nothing to do. */ + return 0; + } + + vhost_sdev = &svsession->svdev->scsi_dev_state[scsi_tgt_num]; + session_sdev->dev = vhost_sdev->dev; + session_sdev->status = VHOST_SCSI_DEV_PRESENT; + + rc = spdk_scsi_dev_allocate_io_channels(svsession->scsi_dev_state[scsi_tgt_num].dev); + if (rc != 0) { + SPDK_ERRLOG("%s: Couldn't allocate io channnel for SCSI target %u.\n", + vsession->name, scsi_tgt_num); + + /* unset the SCSI target so that all I/O to it will be rejected */ + session_sdev->dev = NULL; + /* Set status to EMPTY so that we won't reply with SCSI hotremove + * sense codes - the device hasn't ever been added. + */ + session_sdev->status = VHOST_SCSI_DEV_EMPTY; + + /* Return with no error. We'll continue allocating io_channels for + * other sessions on this device in hopes they succeed. The sessions + * that failed to allocate io_channels simply won't be able to + * detect the SCSI target, nor do any I/O to it. + */ + return 0; + } + + if (vhost_dev_has_feature(vsession, VIRTIO_SCSI_F_HOTPLUG)) { + eventq_enqueue(svsession, scsi_tgt_num, + VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN); + } else { + SPDK_NOTICELOG("%s: driver does not support hotplug. 
" + "Please restart it or perform a rescan.\n", + vsession->name); + } + + return 0; +} + +int +spdk_vhost_scsi_dev_add_tgt(struct spdk_vhost_dev *vdev, int scsi_tgt_num, + const char *bdev_name) +{ + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_dev_vhost_state *state; + char target_name[SPDK_SCSI_DEV_MAX_NAME]; + int lun_id_list[1]; + const char *bdev_names_list[1]; + + svdev = to_scsi_dev(vdev); + assert(svdev != NULL); + if (scsi_tgt_num < 0) { + for (scsi_tgt_num = 0; scsi_tgt_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_tgt_num++) { + if (svdev->scsi_dev_state[scsi_tgt_num].dev == NULL) { + break; + } + } + + if (scsi_tgt_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: all SCSI target slots are already in use.\n", vdev->name); + return -ENOSPC; + } + } else { + if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: SCSI target number is too big (got %d, max %d)\n", + vdev->name, scsi_tgt_num, SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + return -EINVAL; + } + } + + if (bdev_name == NULL) { + SPDK_ERRLOG("No lun name specified\n"); + return -EINVAL; + } + + state = &svdev->scsi_dev_state[scsi_tgt_num]; + if (state->dev != NULL) { + SPDK_ERRLOG("%s: SCSI target %u already occupied\n", vdev->name, scsi_tgt_num); + return -EEXIST; + } + + /* + * At this stage only one LUN per target + */ + snprintf(target_name, sizeof(target_name), "Target %u", scsi_tgt_num); + lun_id_list[0] = 0; + bdev_names_list[0] = (char *)bdev_name; + + state->status = VHOST_SCSI_DEV_ADDING; + state->dev = spdk_scsi_dev_construct(target_name, bdev_names_list, lun_id_list, 1, + SPDK_SPC_PROTOCOL_IDENTIFIER_SAS, + vhost_scsi_lun_hotremove, svdev); + + if (state->dev == NULL) { + state->status = VHOST_SCSI_DEV_EMPTY; + SPDK_ERRLOG("%s: couldn't create SCSI target %u using bdev '%s'\n", + vdev->name, scsi_tgt_num, bdev_name); + return -EINVAL; + } + spdk_scsi_dev_add_port(state->dev, 0, "vhost"); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: added SCSI target %u using bdev '%s'\n", + vdev->name, scsi_tgt_num, bdev_name); + + vhost_dev_foreach_session(vdev, vhost_scsi_session_add_tgt, + vhost_scsi_dev_add_tgt_cpl_cb, + (void *)(uintptr_t)scsi_tgt_num); + return scsi_tgt_num; +} + +struct scsi_tgt_hotplug_ctx { + unsigned scsi_tgt_num; + bool async_fini; +}; + +static void +vhost_scsi_dev_remove_tgt_cpl_cb(struct spdk_vhost_dev *vdev, void *_ctx) +{ + struct scsi_tgt_hotplug_ctx *ctx = _ctx; + struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev, + struct spdk_vhost_scsi_dev, vdev); + + if (!ctx->async_fini) { + /* there aren't any active sessions, so remove the dev and exit */ + remove_scsi_tgt(svdev, ctx->scsi_tgt_num); + } + + free(ctx); +} + +static int +vhost_scsi_session_remove_tgt(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *_ctx) +{ + struct scsi_tgt_hotplug_ctx *ctx = _ctx; + unsigned scsi_tgt_num = ctx->scsi_tgt_num; + struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession; + struct spdk_scsi_dev_session_state *state = &svsession->scsi_dev_state[scsi_tgt_num]; + + if (!vsession->started || state->dev == NULL) { + /* Nothing to do */ + return 0; + } + + /* Mark the target for removal */ + assert(state->status == VHOST_SCSI_DEV_PRESENT); + state->status = VHOST_SCSI_DEV_REMOVING; + + /* Send a hotremove Virtio event */ + if (vhost_dev_has_feature(vsession, VIRTIO_SCSI_F_HOTPLUG)) { + eventq_enqueue(svsession, scsi_tgt_num, + VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED); + } + + /* Wait for the session's management 
poller to remove the target after + * all its pending I/O has finished. + */ + ctx->async_fini = true; + return 0; +} + +int +spdk_vhost_scsi_dev_remove_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num, + spdk_vhost_event_fn cb_fn, void *cb_arg) +{ + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_dev_vhost_state *scsi_dev_state; + struct scsi_tgt_hotplug_ctx *ctx; + + if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: invalid SCSI target number %d\n", vdev->name, scsi_tgt_num); + return -EINVAL; + } + + svdev = to_scsi_dev(vdev); + assert(svdev != NULL); + scsi_dev_state = &svdev->scsi_dev_state[scsi_tgt_num]; + + if (scsi_dev_state->status != VHOST_SCSI_DEV_PRESENT) { + return -EBUSY; + } + + if (scsi_dev_state->dev == NULL || scsi_dev_state->status == VHOST_SCSI_DEV_ADDING) { + SPDK_ERRLOG("%s: SCSI target %u is not occupied\n", vdev->name, scsi_tgt_num); + return -ENODEV; + } + + assert(scsi_dev_state->status != VHOST_SCSI_DEV_EMPTY); + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("calloc failed\n"); + return -ENOMEM; + } + + ctx->scsi_tgt_num = scsi_tgt_num; + ctx->async_fini = false; + + scsi_dev_state->remove_cb = cb_fn; + scsi_dev_state->remove_ctx = cb_arg; + scsi_dev_state->status = VHOST_SCSI_DEV_REMOVING; + + vhost_dev_foreach_session(vdev, vhost_scsi_session_remove_tgt, + vhost_scsi_dev_remove_tgt_cpl_cb, ctx); + return 0; +} + +int +vhost_scsi_controller_construct(void) +{ + struct spdk_conf_section *sp = spdk_conf_first_section(NULL); + struct spdk_vhost_dev *vdev; + int i, dev_num; + unsigned ctrlr_num = 0; + char *bdev_name, *tgt_num_str; + char *cpumask; + char *name; + char *tgt = NULL; + + while (sp != NULL) { + if (!spdk_conf_section_match_prefix(sp, "VhostScsi")) { + sp = spdk_conf_next_section(sp); + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostScsi%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + + if (spdk_vhost_scsi_dev_construct(name, cpumask) < 0) { + return -1; + } + + vdev = spdk_vhost_dev_find(name); + assert(vdev); + + for (i = 0; ; i++) { + + tgt = spdk_conf_section_get_nval(sp, "Target", i); + if (tgt == NULL) { + break; + } + + tgt_num_str = spdk_conf_section_get_nmval(sp, "Target", i, 0); + if (tgt_num_str == NULL) { + SPDK_ERRLOG("%s: invalid or missing SCSI target number\n", name); + return -1; + } + + dev_num = (int)strtol(tgt_num_str, NULL, 10); + bdev_name = spdk_conf_section_get_nmval(sp, "Target", i, 1); + if (bdev_name == NULL) { + SPDK_ERRLOG("%s: invalid or missing bdev name for SCSI target %d\n", name, dev_num); + return -1; + } else if (spdk_conf_section_get_nmval(sp, "Target", i, 2)) { + SPDK_ERRLOG("%s: only one LUN per SCSI target is supported\n", name); + return -1; + } + + if (spdk_vhost_scsi_dev_add_tgt(vdev, dev_num, bdev_name) < 0) { + return -1; + } + } + + sp = spdk_conf_next_section(sp); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_scsi_session *svsession) +{ + struct spdk_vhost_session *vsession = &svsession->vsession; + struct spdk_vhost_virtqueue *vq; + uint16_t i; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->tasks == NULL) { + continue; + } + + spdk_free(vq->tasks); + vq->tasks = NULL; + } +} + +static int +alloc_task_pool(struct spdk_vhost_scsi_session *svsession) +{ + struct spdk_vhost_session 
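Illustrative aside (not part of the diff): vhost_scsi_controller_construct() above consumes the legacy INI-style configuration - a numeric VhostScsi section suffix, Name, Cpumask, and repeated "Target <num> <bdev>" lines with a single LUN each. Under that parsing, a section would look roughly like the following; the controller and bdev names are made up.

[VhostScsi0]
  Name vhost.0
  Cpumask 0x1
  Target 0 Malloc0
  Target 1 Nvme0n1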
*vsession = &svsession->vsession; + struct spdk_vhost_virtqueue *vq; + struct spdk_vhost_scsi_task *task; + uint32_t task_cnt; + uint16_t i; + uint32_t j; + + for (i = 0; i < vsession->max_queues; i++) { + vq = &vsession->virtqueue[i]; + if (vq->vring.desc == NULL) { + continue; + } + + task_cnt = vq->vring.size; + if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { + /* sanity check */ + SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", + vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); + free_task_pool(svsession); + return -1; + } + vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_scsi_task) * task_cnt, + SPDK_CACHE_LINE_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (vq->tasks == NULL) { + SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", + vsession->name, task_cnt, i); + free_task_pool(svsession); + return -1; + } + + for (j = 0; j < task_cnt; j++) { + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[j]; + task->svsession = svsession; + task->vq = vq; + task->req_idx = j; + } + } + + return 0; +} + +static int +vhost_scsi_start_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession); + struct spdk_vhost_scsi_dev *svdev = svsession->svdev; + struct spdk_scsi_dev_vhost_state *state; + uint32_t i; + int rc; + + /* validate all I/O queues are in a contiguous index range */ + for (i = VIRTIO_SCSI_REQUESTQ; i < vsession->max_queues; i++) { + if (vsession->virtqueue[i].vring.desc == NULL) { + SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i); + rc = -1; + goto out; + } + } + + rc = alloc_task_pool(svsession); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name); + goto out; + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + state = &svdev->scsi_dev_state[i]; + if (state->dev == NULL || state->status == VHOST_SCSI_DEV_REMOVING) { + continue; + } + + assert(svsession->scsi_dev_state[i].status == VHOST_SCSI_DEV_EMPTY); + svsession->scsi_dev_state[i].dev = state->dev; + svsession->scsi_dev_state[i].status = VHOST_SCSI_DEV_PRESENT; + rc = spdk_scsi_dev_allocate_io_channels(state->dev); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc io_channel for SCSI target %"PRIu32"\n", + vsession->name, i); + /* unset the SCSI target so that all I/O to it will be rejected */ + svsession->scsi_dev_state[i].dev = NULL; + /* set EMPTY state so that we won't reply with SCSI hotremove + * sense codes - the device hasn't ever been added. 
+ */ + svsession->scsi_dev_state[i].status = VHOST_SCSI_DEV_EMPTY; + continue; + } + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); + + svsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, svsession, 0); + if (vsession->virtqueue[VIRTIO_SCSI_CONTROLQ].vring.desc && + vsession->virtqueue[VIRTIO_SCSI_EVENTQ].vring.desc) { + svsession->mgmt_poller = SPDK_POLLER_REGISTER(vdev_mgmt_worker, svsession, + MGMT_POLL_PERIOD_US); + } +out: + vhost_session_start_done(vsession, rc); + return rc; +} + +static int +vhost_scsi_start(struct spdk_vhost_session *vsession) +{ + struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession); + struct spdk_vhost_scsi_dev *svdev; + + svdev = to_scsi_dev(vsession->vdev); + assert(svdev != NULL); + svsession->svdev = svdev; + + return vhost_session_send_event(vsession, vhost_scsi_start_cb, + 3, "start session"); +} + +static int +destroy_session_poller_cb(void *arg) +{ + struct spdk_vhost_scsi_session *svsession = arg; + struct spdk_vhost_session *vsession = &svsession->vsession; + struct spdk_scsi_dev_session_state *state; + uint32_t i; + + if (vsession->task_cnt > 0) { + return SPDK_POLLER_BUSY; + } + + if (spdk_vhost_trylock() != 0) { + return SPDK_POLLER_BUSY; + } + + for (i = 0; i < vsession->max_queues; i++) { + vhost_vq_used_signal(vsession, &vsession->virtqueue[i]); + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + enum spdk_scsi_dev_vhost_status prev_status; + + state = &svsession->scsi_dev_state[i]; + /* clear the REMOVED status so that we won't send hotremove events anymore */ + prev_status = state->status; + state->status = VHOST_SCSI_DEV_EMPTY; + if (state->dev == NULL) { + continue; + } + + spdk_scsi_dev_free_io_channels(state->dev); + + state->dev = NULL; + + if (prev_status == VHOST_SCSI_DEV_REMOVING) { + /* try to detach it globally */ + vhost_dev_foreach_session(vsession->vdev, + vhost_scsi_session_process_removed, + vhost_scsi_dev_process_removed_cpl_cb, + (void *)(uintptr_t)i); + } + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n", + vsession->name, spdk_env_get_current_core()); + + free_task_pool(svsession); + + spdk_poller_unregister(&svsession->stop_poller); + vhost_session_stop_done(vsession, 0); + + spdk_vhost_unlock(); + return SPDK_POLLER_BUSY; +} + +static int +vhost_scsi_stop_cb(struct spdk_vhost_dev *vdev, + struct spdk_vhost_session *vsession, void *unused) +{ + struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession); + + /* Stop receiving new I/O requests */ + spdk_poller_unregister(&svsession->requestq_poller); + + /* Stop receiving controlq requests, also stop processing the + * asynchronous hotremove events. All the remaining events + * will be finalized by the stop_poller below. + */ + spdk_poller_unregister(&svsession->mgmt_poller); + + /* Wait for all pending I/Os to complete, then process all the + * remaining hotremove events one last time. 
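Illustrative aside (not part of the diff): vhost_scsi_stop_cb() and destroy_session_poller_cb() above follow a drain-then-finalize pattern - stop producing new work first, then poll until the in-flight count reaches zero before releasing resources. A minimal model of that shape, with invented names (toy_session, toy_stop_tick) and without the locking the real code needs:

#include <stdbool.h>

struct toy_session {
	int task_cnt;		/* in-flight requests */
	bool stopped;		/* set once teardown is safe */
};

/* Called periodically once new submissions are already blocked; finalizes
 * the stop only after every outstanding request has completed. */
static void
toy_stop_tick(struct toy_session *s)
{
	if (s->task_cnt > 0) {
		return;		/* still draining */
	}
	s->stopped = true;	/* all I/O done - resources may be freed now */
}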
+ */ + svsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb, + svsession, 1000); + + return 0; +} + +static int +vhost_scsi_stop(struct spdk_vhost_session *vsession) +{ + return vhost_session_send_event(vsession, vhost_scsi_stop_cb, + 3, "stop session"); +} + +static void +vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_scsi_dev *sdev; + struct spdk_scsi_lun *lun; + uint32_t dev_idx; + uint32_t lun_idx; + + assert(vdev != NULL); + spdk_json_write_named_array_begin(w, "scsi"); + for (dev_idx = 0; dev_idx < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; dev_idx++) { + sdev = spdk_vhost_scsi_dev_get_tgt(vdev, dev_idx); + if (!sdev) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_uint32(w, "scsi_dev_num", dev_idx); + + spdk_json_write_named_uint32(w, "id", spdk_scsi_dev_get_id(sdev)); + + spdk_json_write_named_string(w, "target_name", spdk_scsi_dev_get_name(sdev)); + + spdk_json_write_named_array_begin(w, "luns"); + + for (lun_idx = 0; lun_idx < SPDK_SCSI_DEV_MAX_LUN; lun_idx++) { + lun = spdk_scsi_dev_get_lun(sdev, lun_idx); + if (!lun) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "id", spdk_scsi_lun_get_id(lun)); + + spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); + + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +static void +vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_scsi_dev *scsi_dev; + struct spdk_scsi_lun *lun; + uint32_t i; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_create_scsi_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_string(w, "cpumask", + spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread))); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + scsi_dev = spdk_vhost_scsi_dev_get_tgt(vdev, i); + if (scsi_dev == NULL) { + continue; + } + + lun = spdk_scsi_dev_get_lun(scsi_dev, 0); + assert(lun != NULL); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "vhost_scsi_controller_add_target"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_uint32(w, "scsi_target_num", i); + + spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi", SPDK_LOG_VHOST_SCSI) +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_queue", SPDK_LOG_VHOST_SCSI_QUEUE) +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_data", SPDK_LOG_VHOST_SCSI_DATA) diff --git a/src/spdk/lib/virtio/Makefile b/src/spdk/lib/virtio/Makefile new file mode 100644 index 000000000..8ea173c3b --- /dev/null +++ b/src/spdk/lib/virtio/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += $(ENV_CFLAGS) +C_SRCS = virtio.c virtio_user.c virtio_pci.c vhost_user.c +LIBNAME = virtio + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_virtio.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/virtio/spdk_virtio.map b/src/spdk/lib/virtio/spdk_virtio.map new file mode 100644 index 000000000..76e02cff8 --- /dev/null +++ b/src/spdk/lib/virtio/spdk_virtio.map @@ -0,0 +1,33 @@ +{ + global: + + # internal functions in spdk_internal/virtio.h + virtio_recv_pkts; + virtqueue_req_start; + virtqueue_req_flush; + virtqueue_req_abort; + virtqueue_req_add_iovs; + virtio_dev_construct; + virtio_dev_reset; + virtio_dev_start; + virtio_dev_stop; + virtio_dev_destruct; + virtio_dev_acquire_queue; + virtio_dev_find_and_acquire_queue; + virtio_dev_queue_get_thread; + virtio_dev_queue_is_acquired; + virtio_dev_release_queue; + virtio_dev_get_status; + virtio_dev_set_status; + virtio_dev_write_dev_config; + virtio_dev_read_dev_config; + virtio_dev_backend_ops; + virtio_dev_has_feature; + virtio_dev_dump_json_info; + virtio_pci_dev_enumerate; + virtio_pci_dev_attach; + virtio_user_dev_init; + virtio_pci_dev_init; + + local: *; +}; diff --git a/src/spdk/lib/virtio/vhost_user.c b/src/spdk/lib/virtio/vhost_user.c new file mode 100644 index 000000000..b3da9d988 --- /dev/null +++ b/src/spdk/lib/virtio/vhost_user.c @@ -0,0 +1,489 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "vhost_user.h" + +#include "spdk/string.h" +#include "spdk_internal/vhost_user.h" + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + +static int +vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num) +{ + int r; + struct msghdr msgh; + struct iovec iov; + size_t fd_size = fd_num * sizeof(int); + char control[CMSG_SPACE(fd_size)]; + struct cmsghdr *cmsg; + + memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + iov.iov_base = (uint8_t *)buf; + iov.iov_len = len; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fd_size); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + r = sendmsg(fd, &msgh, 0); + } while (r < 0 && errno == EINTR); + + if (r == -1) { + return -errno; + } + + return 0; +} + +static int +vhost_user_read(int fd, struct vhost_user_msg *msg) +{ + uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; + ssize_t ret; + size_t sz_hdr = VHOST_USER_HDR_SIZE, sz_payload; + + ret = recv(fd, (void *)msg, sz_hdr, 0); + if ((size_t)ret != sz_hdr) { + SPDK_WARNLOG("Failed to recv msg hdr: %zd instead of %zu.\n", + ret, sz_hdr); + if (ret == -1) { + return -errno; + } else { + return -EBUSY; + } + } + + /* validate msg flags */ + if (msg->flags != (valid_flags)) { + SPDK_WARNLOG("Failed to recv msg: flags %"PRIx32" instead of %"PRIx32".\n", + msg->flags, valid_flags); + return -EIO; + } + + sz_payload = msg->size; + + if (sz_payload > VHOST_USER_PAYLOAD_SIZE) { + SPDK_WARNLOG("Received oversized msg: payload size %zu > available space %zu\n", + sz_payload, VHOST_USER_PAYLOAD_SIZE); + return -EIO; + } + + if (sz_payload) { + ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0); + if ((size_t)ret != sz_payload) { + SPDK_WARNLOG("Failed to recv msg payload: %zd instead of %"PRIu32".\n", + ret, msg->size); + if (ret == -1) { + return -errno; + } else { + return -EBUSY; + } + } + } + + return 0; +} + +struct hugepage_file_info { + uint64_t addr; /**< virtual addr */ + size_t size; /**< the file size */ + char path[PATH_MAX]; /**< path to backing file */ +}; + +/* Two possible options: + * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file + * array. This is simple but cannot be used in secondary process because + * secondary process will close and munmap that file. + * 2. Match HUGEFILE_FMT to find hugepage files directly. + * + * We choose option 2. 
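Illustrative aside (not part of the diff): get_hugepage_file_info() below implements the "option 2" scan described in the comment - it walks /proc/self/maps, extracts each mapping's virtual address range, and keeps only mappings whose backing path looks like a hugepage file. A reduced sketch of the per-line parse, assuming paths without spaces and using a loose "map_" substring test in place of the stricter suffix check in the real code; struct toy_mapping and parse_maps_line are invented names.

#include <inttypes.h>
#include <stdio.h>
#include <string.h>

struct toy_mapping {
	uint64_t start;
	uint64_t end;
	char path[256];
};

/* Parse one /proc/self/maps line: "start-end perms offset dev inode path".
 * Mappings without a path, or whose path does not look like a hugepage
 * backing file, are rejected. */
static int
parse_maps_line(const char *line, struct toy_mapping *m)
{
	if (sscanf(line, "%" SCNx64 "-%" SCNx64 " %*s %*s %*s %*s %255s",
		   &m->start, &m->end, m->path) != 3) {
		return -1;
	}
	return strstr(m->path, "map_") != NULL ? 0 : -1;
}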
+ */ +static int +get_hugepage_file_info(struct hugepage_file_info huges[], int max) +{ + int idx, rc; + FILE *f; + char buf[BUFSIZ], *tmp, *tail; + char *str_underline, *str_start; + int huge_index; + uint64_t v_start, v_end; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + SPDK_ERRLOG("cannot open /proc/self/maps\n"); + rc = -errno; + assert(rc < 0); /* scan-build hack */ + return rc; + } + + idx = 0; + while (fgets(buf, sizeof(buf), f) != NULL) { + if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) { + SPDK_ERRLOG("Failed to parse address\n"); + rc = -EIO; + goto out; + } + + tmp = strchr(buf, ' ') + 1; /** skip address */ + tmp = strchr(tmp, ' ') + 1; /** skip perm */ + tmp = strchr(tmp, ' ') + 1; /** skip offset */ + tmp = strchr(tmp, ' ') + 1; /** skip dev */ + tmp = strchr(tmp, ' ') + 1; /** skip inode */ + while (*tmp == ' ') { /** skip spaces */ + tmp++; + } + tail = strrchr(tmp, '\n'); /** remove newline if exists */ + if (tail) { + *tail = '\0'; + } + + /* Match HUGEFILE_FMT, aka "%s/%smap_%d", + * which is defined in eal_filesystem.h + */ + str_underline = strrchr(tmp, '_'); + if (!str_underline) { + continue; + } + + str_start = str_underline - strlen("map"); + if (str_start < tmp) { + continue; + } + + if (sscanf(str_start, "map_%d", &huge_index) != 1) { + continue; + } + + if (idx >= max) { + SPDK_ERRLOG("Exceed maximum of %d\n", max); + rc = -ENOSPC; + goto out; + } + + if (idx > 0 && + strncmp(tmp, huges[idx - 1].path, PATH_MAX) == 0 && + v_start == huges[idx - 1].addr + huges[idx - 1].size) { + huges[idx - 1].size += (v_end - v_start); + continue; + } + + huges[idx].addr = v_start; + huges[idx].size = v_end - v_start; + snprintf(huges[idx].path, PATH_MAX, "%s", tmp); + idx++; + } + + rc = idx; +out: + fclose(f); + return rc; +} + +static int +prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[]) +{ + int i, num; + struct hugepage_file_info huges[VHOST_USER_MEMORY_MAX_NREGIONS]; + + num = get_hugepage_file_info(huges, VHOST_USER_MEMORY_MAX_NREGIONS); + if (num < 0) { + SPDK_ERRLOG("Failed to prepare memory for vhost-user\n"); + return num; + } + + for (i = 0; i < num; ++i) { + /* the memory regions are unaligned */ + msg->payload.memory.regions[i].guest_phys_addr = huges[i].addr; /* use vaddr! 
*/ + msg->payload.memory.regions[i].userspace_addr = huges[i].addr; + msg->payload.memory.regions[i].memory_size = huges[i].size; + msg->payload.memory.regions[i].flags_padding = 0; + fds[i] = open(huges[i].path, O_RDWR); + } + + msg->payload.memory.nregions = num; + msg->payload.memory.padding = 0; + + return 0; +} + +static const char *const vhost_msg_strings[VHOST_USER_MAX] = { + [VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER", + [VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES", + [VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES", + [VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM", + [VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG", + [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG", +}; + +static int +vhost_user_sock(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg) +{ + struct vhost_user_msg msg; + struct vhost_vring_file *file = 0; + int need_reply = 0; + int fds[VHOST_USER_MEMORY_MAX_NREGIONS]; + int fd_num = 0; + int i, len, rc; + int vhostfd = dev->vhostfd; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_USER, "sent message %d = %s\n", req, vhost_msg_strings[req]); + + msg.request = req; + msg.flags = VHOST_USER_VERSION; + msg.size = 0; + + switch (req) { + case VHOST_USER_GET_FEATURES: + case VHOST_USER_GET_PROTOCOL_FEATURES: + case VHOST_USER_GET_QUEUE_NUM: + need_reply = 1; + break; + + case VHOST_USER_SET_FEATURES: + case VHOST_USER_SET_LOG_BASE: + case VHOST_USER_SET_PROTOCOL_FEATURES: + msg.payload.u64 = *((__u64 *)arg); + msg.size = sizeof(msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + case VHOST_USER_RESET_OWNER: + break; + + case VHOST_USER_SET_MEM_TABLE: + rc = prepare_vhost_memory_user(&msg, fds); + if (rc < 0) { + return rc; + } + fd_num = msg.payload.memory.nregions; + msg.size = sizeof(msg.payload.memory.nregions); + msg.size += sizeof(msg.payload.memory.padding); + msg.size += fd_num * sizeof(struct vhost_memory_region); + break; + + case VHOST_USER_SET_LOG_FD: + fds[fd_num++] = *((int *)arg); + break; + + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(msg.payload.state); + break; + + case VHOST_USER_GET_VRING_BASE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(msg.payload.state); + need_reply = 1; + break; + + case VHOST_USER_SET_VRING_ADDR: + memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr)); + msg.size = sizeof(msg.payload.addr); + break; + + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + file = arg; + msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK; + msg.size = sizeof(msg.payload.u64); + if (file->fd > 0) { + fds[fd_num++] = file->fd; + } else { + msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK; + } + break; + + case VHOST_USER_GET_CONFIG: + 
memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg)); + msg.size = sizeof(msg.payload.cfg); + need_reply = 1; + break; + + case VHOST_USER_SET_CONFIG: + memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg)); + msg.size = sizeof(msg.payload.cfg); + break; + + default: + SPDK_ERRLOG("trying to send unknown msg\n"); + return -EINVAL; + } + + len = VHOST_USER_HDR_SIZE + msg.size; + rc = vhost_user_write(vhostfd, &msg, len, fds, fd_num); + if (rc < 0) { + SPDK_ERRLOG("%s failed: %s\n", + vhost_msg_strings[req], spdk_strerror(-rc)); + return rc; + } + + if (req == VHOST_USER_SET_MEM_TABLE) + for (i = 0; i < fd_num; ++i) { + close(fds[i]); + } + + if (need_reply) { + rc = vhost_user_read(vhostfd, &msg); + if (rc < 0) { + SPDK_WARNLOG("Received msg failed: %s\n", spdk_strerror(-rc)); + return rc; + } + + if (req != msg.request) { + SPDK_WARNLOG("Received unexpected msg type\n"); + return -EIO; + } + + switch (req) { + case VHOST_USER_GET_FEATURES: + case VHOST_USER_GET_PROTOCOL_FEATURES: + case VHOST_USER_GET_QUEUE_NUM: + if (msg.size != sizeof(msg.payload.u64)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + *((__u64 *)arg) = msg.payload.u64; + break; + case VHOST_USER_GET_VRING_BASE: + if (msg.size != sizeof(msg.payload.state)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + memcpy(arg, &msg.payload.state, + sizeof(struct vhost_vring_state)); + break; + case VHOST_USER_GET_CONFIG: + if (msg.size != sizeof(msg.payload.cfg)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + memcpy(arg, &msg.payload.cfg, sizeof(msg.payload.cfg)); + break; + default: + SPDK_WARNLOG("Received unexpected msg type\n"); + return -EBADMSG; + } + } + + return 0; +} + +/** + * Set up environment to talk with a vhost user backend. + * + * @return + * - (-1) if fail; + * - (0) if succeed. + */ +static int +vhost_user_setup(struct virtio_user_dev *dev) +{ + int fd; + int flag; + struct sockaddr_un un; + ssize_t rc; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + SPDK_ERRLOG("socket() error, %s\n", spdk_strerror(errno)); + return -errno; + } + + flag = fcntl(fd, F_GETFD); + if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) { + SPDK_ERRLOG("fcntl failed, %s\n", spdk_strerror(errno)); + } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path); + if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) { + SPDK_ERRLOG("socket path too long\n"); + close(fd); + if (rc < 0) { + return -errno; + } else { + return -EINVAL; + } + } + if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) { + SPDK_ERRLOG("connect error, %s\n", spdk_strerror(errno)); + close(fd); + return -errno; + } + + dev->vhostfd = fd; + return 0; +} + +struct virtio_user_backend_ops ops_user = { + .setup = vhost_user_setup, + .send_request = vhost_user_sock, +}; + +SPDK_LOG_REGISTER_COMPONENT("virtio_user", SPDK_LOG_VIRTIO_USER) diff --git a/src/spdk/lib/virtio/vhost_user.h b/src/spdk/lib/virtio/vhost_user.h new file mode 100644 index 000000000..0caf51ebc --- /dev/null +++ b/src/spdk/lib/virtio/vhost_user.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_H +#define _VHOST_H + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" +#include "spdk_internal/virtio.h" +#include "spdk_internal/vhost_user.h" + +struct virtio_user_backend_ops; + +struct virtio_user_dev { + int vhostfd; + + int callfds[SPDK_VIRTIO_MAX_VIRTQUEUES]; + int kickfds[SPDK_VIRTIO_MAX_VIRTQUEUES]; + uint32_t queue_size; + + uint8_t status; + char path[PATH_MAX]; + uint64_t protocol_features; + struct vring vrings[SPDK_VIRTIO_MAX_VIRTQUEUES]; + struct virtio_user_backend_ops *ops; + struct spdk_mem_map *mem_map; +}; + +struct virtio_user_backend_ops { + int (*setup)(struct virtio_user_dev *dev); + int (*send_request)(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg); +}; + +extern struct virtio_user_backend_ops ops_user; + +#endif diff --git a/src/spdk/lib/virtio/virtio.c b/src/spdk/lib/virtio/virtio.c new file mode 100644 index 000000000..03866040a --- /dev/null +++ b/src/spdk/lib/virtio/virtio.c @@ -0,0 +1,717 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk/barrier.h" + +#include "spdk_internal/virtio.h" + +/* We use SMP memory barrier variants as all virtio_pci devices + * are purely virtual. All MMIO is executed on a CPU core, so + * there's no need to do full MMIO synchronization. + */ +#define virtio_mb() spdk_smp_mb() +#define virtio_rmb() spdk_smp_rmb() +#define virtio_wmb() spdk_smp_wmb() + +/* Chain all the descriptors in the ring with an END */ +static inline void +vring_desc_init(struct vring_desc *dp, uint16_t n) +{ + uint16_t i; + + for (i = 0; i < n - 1; i++) { + dp[i].next = (uint16_t)(i + 1); + } + dp[i].next = VQ_RING_DESC_CHAIN_END; +} + +static void +virtio_init_vring(struct virtqueue *vq) +{ + int size = vq->vq_nentries; + struct vring *vr = &vq->vq_ring; + uint8_t *ring_mem = vq->vq_ring_virt_mem; + + /* + * Reinitialise since virtio port might have been stopped and restarted + */ + memset(ring_mem, 0, vq->vq_ring_size); + vring_init(vr, size, ring_mem, VIRTIO_PCI_VRING_ALIGN); + vq->vq_used_cons_idx = 0; + vq->vq_desc_head_idx = 0; + vq->vq_avail_idx = 0; + vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1); + vq->vq_free_cnt = vq->vq_nentries; + vq->req_start = VQ_RING_DESC_CHAIN_END; + vq->req_end = VQ_RING_DESC_CHAIN_END; + vq->reqs_finished = 0; + memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries); + + vring_desc_init(vr->desc, size); + + /* Tell the backend not to interrupt us. + * If F_EVENT_IDX is negotiated, we will always set incredibly high + * used event idx, so that we will practically never receive an + * interrupt. 
See virtqueue_req_flush() + */ + if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + vring_used_event(&vq->vq_ring) = UINT16_MAX; + } else { + vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + } +} + +static int +virtio_init_queue(struct virtio_dev *dev, uint16_t vtpci_queue_idx) +{ + unsigned int vq_size, size; + struct virtqueue *vq; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "setting up queue: %"PRIu16"\n", vtpci_queue_idx); + + /* + * Read the virtqueue size from the Queue Size field + * Always power of 2 and if 0 virtqueue does not exist + */ + vq_size = virtio_dev_backend_ops(dev)->get_queue_size(dev, vtpci_queue_idx); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq_size: %u\n", vq_size); + if (vq_size == 0) { + SPDK_ERRLOG("virtqueue %"PRIu16" does not exist\n", vtpci_queue_idx); + return -EINVAL; + } + + if (!spdk_u32_is_pow2(vq_size)) { + SPDK_ERRLOG("virtqueue %"PRIu16" size (%u) is not powerof 2\n", + vtpci_queue_idx, vq_size); + return -EINVAL; + } + + size = sizeof(*vq) + vq_size * sizeof(struct vq_desc_extra); + + if (posix_memalign((void **)&vq, SPDK_CACHE_LINE_SIZE, size)) { + SPDK_ERRLOG("can not allocate vq\n"); + return -ENOMEM; + } + memset(vq, 0, size); + dev->vqs[vtpci_queue_idx] = vq; + + vq->vdev = dev; + vq->vq_queue_index = vtpci_queue_idx; + vq->vq_nentries = vq_size; + + /* + * Reserve a memzone for vring elements + */ + size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN); + vq->vq_ring_size = SPDK_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vring_size: %u, rounded_vring_size: %u\n", + size, vq->vq_ring_size); + + vq->owner_thread = NULL; + + rc = virtio_dev_backend_ops(dev)->setup_queue(dev, vq); + if (rc < 0) { + SPDK_ERRLOG("setup_queue failed\n"); + free(vq); + dev->vqs[vtpci_queue_idx] = NULL; + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_mem: 0x%" PRIx64 "\n", + vq->vq_ring_mem); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_virt_mem: 0x%" PRIx64 "\n", + (uint64_t)(uintptr_t)vq->vq_ring_virt_mem); + + virtio_init_vring(vq); + return 0; +} + +static void +virtio_free_queues(struct virtio_dev *dev) +{ + uint16_t nr_vq = dev->max_queues; + struct virtqueue *vq; + uint16_t i; + + if (dev->vqs == NULL) { + return; + } + + for (i = 0; i < nr_vq; i++) { + vq = dev->vqs[i]; + if (!vq) { + continue; + } + + virtio_dev_backend_ops(dev)->del_queue(dev, vq); + + free(vq); + dev->vqs[i] = NULL; + } + + free(dev->vqs); + dev->vqs = NULL; +} + +static int +virtio_alloc_queues(struct virtio_dev *dev, uint16_t request_vq_num, uint16_t fixed_vq_num) +{ + uint16_t nr_vq; + uint16_t i; + int ret; + + nr_vq = request_vq_num + fixed_vq_num; + if (nr_vq == 0) { + /* perfectly fine to have a device with no virtqueues. */ + return 0; + } + + assert(dev->vqs == NULL); + dev->vqs = calloc(1, sizeof(struct virtqueue *) * nr_vq); + if (!dev->vqs) { + SPDK_ERRLOG("failed to allocate %"PRIu16" vqs\n", nr_vq); + return -ENOMEM; + } + + for (i = 0; i < nr_vq; i++) { + ret = virtio_init_queue(dev, i); + if (ret < 0) { + virtio_free_queues(dev); + return ret; + } + } + + dev->max_queues = nr_vq; + dev->fixed_queues_num = fixed_vq_num; + return 0; +} + +/** + * Negotiate virtio features. For virtio_user this will also set + * dev->modern flag if VIRTIO_F_VERSION_1 flag is negotiated. 
+ */ +static int +virtio_negotiate_features(struct virtio_dev *dev, uint64_t req_features) +{ + uint64_t host_features = virtio_dev_backend_ops(dev)->get_features(dev); + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "guest features = %" PRIx64 "\n", req_features); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "device features = %" PRIx64 "\n", host_features); + + rc = virtio_dev_backend_ops(dev)->set_features(dev, req_features & host_features); + if (rc != 0) { + SPDK_ERRLOG("failed to negotiate device features.\n"); + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "negotiated features = %" PRIx64 "\n", + dev->negotiated_features); + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_FEATURES_OK)) { + SPDK_ERRLOG("failed to set FEATURES_OK status!\n"); + /* either the device failed, or we offered some features that + * depend on other, not offered features. + */ + return -EINVAL; + } + + return 0; +} + +int +virtio_dev_construct(struct virtio_dev *vdev, const char *name, + const struct virtio_dev_ops *ops, void *ctx) +{ + int rc; + + vdev->name = strdup(name); + if (vdev->name == NULL) { + return -ENOMEM; + } + + rc = pthread_mutex_init(&vdev->mutex, NULL); + if (rc != 0) { + free(vdev->name); + return -rc; + } + + vdev->backend_ops = ops; + vdev->ctx = ctx; + + return 0; +} + +int +virtio_dev_reset(struct virtio_dev *dev, uint64_t req_features) +{ + req_features |= (1ULL << VIRTIO_F_VERSION_1); + + virtio_dev_stop(dev); + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_ACKNOWLEDGE)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_ACKNOWLEDGE status.\n"); + return -EIO; + } + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_DRIVER); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_DRIVER)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER status.\n"); + return -EIO; + } + + return virtio_negotiate_features(dev, req_features); +} + +int +virtio_dev_start(struct virtio_dev *vdev, uint16_t max_queues, uint16_t fixed_queue_num) +{ + int ret; + + ret = virtio_alloc_queues(vdev, max_queues, fixed_queue_num); + if (ret < 0) { + return ret; + } + + virtio_dev_set_status(vdev, VIRTIO_CONFIG_S_DRIVER_OK); + if (!(virtio_dev_get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER_OK status.\n"); + return -1; + } + + return 0; +} + +void +virtio_dev_destruct(struct virtio_dev *dev) +{ + virtio_dev_backend_ops(dev)->destruct_dev(dev); + pthread_mutex_destroy(&dev->mutex); + free(dev->name); +} + +static void +vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx) +{ + struct vring_desc *dp, *dp_tail; + struct vq_desc_extra *dxp; + uint16_t desc_idx_last = desc_idx; + + dp = &vq->vq_ring.desc[desc_idx]; + dxp = &vq->vq_descx[desc_idx]; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs); + if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) { + while (dp->flags & VRING_DESC_F_NEXT) { + desc_idx_last = dp->next; + dp = &vq->vq_ring.desc[dp->next]; + } + } + dxp->ndescs = 0; + + /* + * We must append the existing free chain, if any, to the end of + * newly freed chain. If the virtqueue was completely used, then + * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above). 
+ */ + if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) { + vq->vq_desc_head_idx = desc_idx; + } else { + dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx]; + dp_tail->next = desc_idx; + } + + vq->vq_desc_tail_idx = desc_idx_last; + dp->next = VQ_RING_DESC_CHAIN_END; +} + +static uint16_t +virtqueue_dequeue_burst_rx(struct virtqueue *vq, void **rx_pkts, + uint32_t *len, uint16_t num) +{ + struct vring_used_elem *uep; + void *cookie; + uint16_t used_idx, desc_idx; + uint16_t i; + + /* Caller does the check */ + for (i = 0; i < num ; i++) { + used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1)); + uep = &vq->vq_ring.used->ring[used_idx]; + desc_idx = (uint16_t) uep->id; + len[i] = uep->len; + cookie = vq->vq_descx[desc_idx].cookie; + + if (spdk_unlikely(cookie == NULL)) { + SPDK_WARNLOG("vring descriptor with no mbuf cookie at %"PRIu16"\n", + vq->vq_used_cons_idx); + break; + } + + __builtin_prefetch(cookie); + + rx_pkts[i] = cookie; + vq->vq_used_cons_idx++; + vq_ring_free_chain(vq, desc_idx); + vq->vq_descx[desc_idx].cookie = NULL; + } + + return i; +} + +static void +finish_req(struct virtqueue *vq) +{ + struct vring_desc *desc; + uint16_t avail_idx; + + desc = &vq->vq_ring.desc[vq->req_end]; + desc->flags &= ~VRING_DESC_F_NEXT; + + /* + * Place the head of the descriptor chain into the next slot and make + * it usable to the host. The chain is made available now rather than + * deferring to virtqueue_req_flush() in the hopes that if the host is + * currently running on another CPU, we can keep it processing the new + * descriptor. + */ + avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1)); + vq->vq_ring.avail->ring[avail_idx] = vq->req_start; + vq->vq_avail_idx++; + vq->req_end = VQ_RING_DESC_CHAIN_END; + virtio_wmb(); + vq->vq_ring.avail->idx = vq->vq_avail_idx; + vq->reqs_finished++; +} + +int +virtqueue_req_start(struct virtqueue *vq, void *cookie, int iovcnt) +{ + struct vq_desc_extra *dxp; + + if (iovcnt > vq->vq_free_cnt) { + return iovcnt > vq->vq_nentries ? -EINVAL : -ENOMEM; + } + + if (vq->req_end != VQ_RING_DESC_CHAIN_END) { + finish_req(vq); + } + + vq->req_start = vq->vq_desc_head_idx; + dxp = &vq->vq_descx[vq->req_start]; + dxp->cookie = cookie; + dxp->ndescs = 0; + + return 0; +} + +void +virtqueue_req_flush(struct virtqueue *vq) +{ + uint16_t reqs_finished; + + if (vq->req_end == VQ_RING_DESC_CHAIN_END) { + /* no non-empty requests have been started */ + return; + } + + finish_req(vq); + virtio_mb(); + + reqs_finished = vq->reqs_finished; + vq->reqs_finished = 0; + + if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + /* Set used event idx to a value the device will never reach. + * This effectively disables interrupts. 
+ */ + vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx - vq->vq_nentries - 1; + + if (!vring_need_event(vring_avail_event(&vq->vq_ring), + vq->vq_avail_idx, + vq->vq_avail_idx - reqs_finished)) { + return; + } + } else if (vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) { + return; + } + + virtio_dev_backend_ops(vq->vdev)->notify_queue(vq->vdev, vq); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "Notified backend after xmit\n"); +} + +void +virtqueue_req_abort(struct virtqueue *vq) +{ + struct vring_desc *desc; + + if (vq->req_start == VQ_RING_DESC_CHAIN_END) { + /* no requests have been started */ + return; + } + + desc = &vq->vq_ring.desc[vq->req_end]; + desc->flags &= ~VRING_DESC_F_NEXT; + + vq_ring_free_chain(vq, vq->req_start); + vq->req_start = VQ_RING_DESC_CHAIN_END; +} + +void +virtqueue_req_add_iovs(struct virtqueue *vq, struct iovec *iovs, uint16_t iovcnt, + enum spdk_virtio_desc_type desc_type) +{ + struct vring_desc *desc; + struct vq_desc_extra *dxp; + uint16_t i, prev_head, new_head; + + assert(vq->req_start != VQ_RING_DESC_CHAIN_END); + assert(iovcnt <= vq->vq_free_cnt); + + /* TODO use indirect descriptors if iovcnt is high enough + * or the caller specifies SPDK_VIRTIO_DESC_F_INDIRECT + */ + + prev_head = vq->req_end; + new_head = vq->vq_desc_head_idx; + for (i = 0; i < iovcnt; ++i) { + desc = &vq->vq_ring.desc[new_head]; + + if (!vq->vdev->is_hw) { + desc->addr = (uintptr_t)iovs[i].iov_base; + } else { + desc->addr = spdk_vtophys(iovs[i].iov_base, NULL); + } + + desc->len = iovs[i].iov_len; + /* always set NEXT flag. unset it on the last descriptor + * in the request-ending function. + */ + desc->flags = desc_type | VRING_DESC_F_NEXT; + + prev_head = new_head; + new_head = desc->next; + } + + dxp = &vq->vq_descx[vq->req_start]; + dxp->ndescs += iovcnt; + + vq->req_end = prev_head; + vq->vq_desc_head_idx = new_head; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - iovcnt); + if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END) { + assert(vq->vq_free_cnt == 0); + vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END; + } +} + +#define DESC_PER_CACHELINE (SPDK_CACHE_LINE_SIZE / sizeof(struct vring_desc)) +uint16_t +virtio_recv_pkts(struct virtqueue *vq, void **io, uint32_t *len, uint16_t nb_pkts) +{ + uint16_t nb_used, num; + + nb_used = vq->vq_ring.used->idx - vq->vq_used_cons_idx; + virtio_rmb(); + + num = (uint16_t)(spdk_likely(nb_used <= nb_pkts) ? 
nb_used : nb_pkts); + if (spdk_likely(num > DESC_PER_CACHELINE)) { + num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE); + } + + return virtqueue_dequeue_burst_rx(vq, io, len, num); +} + +int +virtio_dev_acquire_queue(struct virtio_dev *vdev, uint16_t index) +{ + struct virtqueue *vq = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("requested vq index %"PRIu16" exceeds max queue count %"PRIu16".\n", + index, vdev->max_queues); + return -1; + } + + pthread_mutex_lock(&vdev->mutex); + vq = vdev->vqs[index]; + if (vq == NULL || vq->owner_thread != NULL) { + pthread_mutex_unlock(&vdev->mutex); + return -1; + } + + vq->owner_thread = spdk_get_thread(); + pthread_mutex_unlock(&vdev->mutex); + return 0; +} + +int32_t +virtio_dev_find_and_acquire_queue(struct virtio_dev *vdev, uint16_t start_index) +{ + struct virtqueue *vq = NULL; + uint16_t i; + + pthread_mutex_lock(&vdev->mutex); + for (i = start_index; i < vdev->max_queues; ++i) { + vq = vdev->vqs[i]; + if (vq != NULL && vq->owner_thread == NULL) { + break; + } + } + + if (vq == NULL || i == vdev->max_queues) { + SPDK_ERRLOG("no more unused virtio queues with idx >= %"PRIu16".\n", start_index); + pthread_mutex_unlock(&vdev->mutex); + return -1; + } + + vq->owner_thread = spdk_get_thread(); + pthread_mutex_unlock(&vdev->mutex); + return i; +} + +struct spdk_thread * +virtio_dev_queue_get_thread(struct virtio_dev *vdev, uint16_t index) +{ + struct spdk_thread *thread = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16"\n", + index, vdev->max_queues); + abort(); /* This is not recoverable */ + } + + pthread_mutex_lock(&vdev->mutex); + thread = vdev->vqs[index]->owner_thread; + pthread_mutex_unlock(&vdev->mutex); + + return thread; +} + +bool +virtio_dev_queue_is_acquired(struct virtio_dev *vdev, uint16_t index) +{ + return virtio_dev_queue_get_thread(vdev, index) != NULL; +} + +void +virtio_dev_release_queue(struct virtio_dev *vdev, uint16_t index) +{ + struct virtqueue *vq = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16".\n", + index, vdev->max_queues); + return; + } + + pthread_mutex_lock(&vdev->mutex); + vq = vdev->vqs[index]; + if (vq == NULL) { + SPDK_ERRLOG("virtqueue at index %"PRIu16" is not initialized.\n", index); + pthread_mutex_unlock(&vdev->mutex); + return; + } + + assert(vq->owner_thread == spdk_get_thread()); + vq->owner_thread = NULL; + pthread_mutex_unlock(&vdev->mutex); +} + +int +virtio_dev_read_dev_config(struct virtio_dev *dev, size_t offset, + void *dst, int length) +{ + return virtio_dev_backend_ops(dev)->read_dev_cfg(dev, offset, dst, length); +} + +int +virtio_dev_write_dev_config(struct virtio_dev *dev, size_t offset, + const void *src, int length) +{ + return virtio_dev_backend_ops(dev)->write_dev_cfg(dev, offset, src, length); +} + +void +virtio_dev_stop(struct virtio_dev *dev) +{ + virtio_dev_backend_ops(dev)->set_status(dev, VIRTIO_CONFIG_S_RESET); + /* flush status write */ + virtio_dev_backend_ops(dev)->get_status(dev); + virtio_free_queues(dev); +} + +void +virtio_dev_set_status(struct virtio_dev *dev, uint8_t status) +{ + if (status != VIRTIO_CONFIG_S_RESET) { + status |= virtio_dev_backend_ops(dev)->get_status(dev); + } + + virtio_dev_backend_ops(dev)->set_status(dev, status); +} + +uint8_t +virtio_dev_get_status(struct virtio_dev *dev) +{ + return virtio_dev_backend_ops(dev)->get_status(dev); +} + +const struct virtio_dev_ops * 
+virtio_dev_backend_ops(struct virtio_dev *dev) +{ + return dev->backend_ops; +} + +void +virtio_dev_dump_json_info(struct virtio_dev *hw, struct spdk_json_write_ctx *w) +{ + spdk_json_write_named_object_begin(w, "virtio"); + + spdk_json_write_named_uint32(w, "vq_count", hw->max_queues); + + spdk_json_write_named_uint32(w, "vq_size", + virtio_dev_backend_ops(hw)->get_queue_size(hw, 0)); + + virtio_dev_backend_ops(hw)->dump_json_info(hw, w); + + spdk_json_write_object_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("virtio_dev", SPDK_LOG_VIRTIO_DEV) diff --git a/src/spdk/lib/virtio/virtio_pci.c b/src/spdk/lib/virtio/virtio_pci.c new file mode 100644 index 000000000..646f77c1a --- /dev/null +++ b/src/spdk/lib/virtio/virtio_pci.c @@ -0,0 +1,599 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/memory.h" +#include "spdk/mmio.h" +#include "spdk/string.h" +#include "spdk/env.h" + +#include "spdk_internal/virtio.h" + +struct virtio_hw { + uint8_t use_msix; + uint32_t notify_off_multiplier; + uint8_t *isr; + uint16_t *notify_base; + + struct { + /** Mem-mapped resources from given PCI BAR */ + void *vaddr; + + /** Length of the address space */ + uint32_t len; + } pci_bar[6]; + + struct virtio_pci_common_cfg *common_cfg; + struct spdk_pci_device *pci_dev; + + /** Device-specific PCI config space */ + void *dev_cfg; +}; + +struct virtio_pci_probe_ctx { + virtio_pci_create_cb enum_cb; + void *enum_ctx; + uint16_t device_id; +}; + +/* + * Following macros are derived from linux/pci_regs.h, however, + * we can't simply include that header here, as there is no such + * file for non-Linux platform. + */ +#define PCI_CAPABILITY_LIST 0x34 +#define PCI_CAP_ID_VNDR 0x09 +#define PCI_CAP_ID_MSIX 0x11 + +static inline int +check_vq_phys_addr_ok(struct virtqueue *vq) +{ + /* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit, + * and only accepts 32 bit page frame number. 
+ * Check if the allocated physical memory exceeds 16TB. + */ + if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >> + (VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) { + SPDK_ERRLOG("vring address shouldn't be above 16TB!\n"); + return 0; + } + + return 1; +} + +static void +free_virtio_hw(struct virtio_hw *hw) +{ + unsigned i; + + for (i = 0; i < 6; ++i) { + if (hw->pci_bar[i].vaddr == NULL) { + continue; + } + + spdk_pci_device_unmap_bar(hw->pci_dev, i, hw->pci_bar[i].vaddr); + } + + free(hw); +} + +static void +pci_dump_json_info(struct virtio_dev *dev, struct spdk_json_write_ctx *w) +{ + struct virtio_hw *hw = dev->ctx; + struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr((struct spdk_pci_device *)hw->pci_dev); + char addr[32]; + + spdk_json_write_name(w, "type"); + if (dev->modern) { + spdk_json_write_string(w, "pci-modern"); + } else { + spdk_json_write_string(w, "pci-legacy"); + } + + spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr); + spdk_json_write_named_string(w, "pci_address", addr); +} + +static void +pci_write_json_config(struct virtio_dev *dev, struct spdk_json_write_ctx *w) +{ + struct virtio_hw *hw = dev->ctx; + struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr(hw->pci_dev); + char addr[32]; + + spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr); + + spdk_json_write_named_string(w, "trtype", "pci"); + spdk_json_write_named_string(w, "traddr", addr); +} + +static inline void +io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi) +{ + spdk_mmio_write_4(lo, val & ((1ULL << 32) - 1)); + spdk_mmio_write_4(hi, val >> 32); +} + +static int +modern_read_dev_config(struct virtio_dev *dev, size_t offset, + void *dst, int length) +{ + struct virtio_hw *hw = dev->ctx; + int i; + uint8_t *p; + uint8_t old_gen, new_gen; + + do { + old_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation); + + p = dst; + for (i = 0; i < length; i++) { + *p++ = spdk_mmio_read_1((uint8_t *)hw->dev_cfg + offset + i); + } + + new_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation); + } while (old_gen != new_gen); + + return 0; +} + +static int +modern_write_dev_config(struct virtio_dev *dev, size_t offset, + const void *src, int length) +{ + struct virtio_hw *hw = dev->ctx; + int i; + const uint8_t *p = src; + + for (i = 0; i < length; i++) { + spdk_mmio_write_1(((uint8_t *)hw->dev_cfg) + offset + i, *p++); + } + + return 0; +} + +static uint64_t +modern_get_features(struct virtio_dev *dev) +{ + struct virtio_hw *hw = dev->ctx; + uint32_t features_lo, features_hi; + + spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 0); + features_lo = spdk_mmio_read_4(&hw->common_cfg->device_feature); + + spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 1); + features_hi = spdk_mmio_read_4(&hw->common_cfg->device_feature); + + return ((uint64_t)features_hi << 32) | features_lo; +} + +static int +modern_set_features(struct virtio_dev *dev, uint64_t features) +{ + struct virtio_hw *hw = dev->ctx; + + if ((features & (1ULL << VIRTIO_F_VERSION_1)) == 0) { + SPDK_ERRLOG("VIRTIO_F_VERSION_1 feature is not enabled.\n"); + return -EINVAL; + } + + spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 0); + spdk_mmio_write_4(&hw->common_cfg->guest_feature, features & ((1ULL << 32) - 1)); + + spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 1); + spdk_mmio_write_4(&hw->common_cfg->guest_feature, features >> 32); + + dev->negotiated_features = features; + + return 0; +} + +static void +modern_destruct_dev(struct virtio_dev *vdev) +{ + struct virtio_hw *hw = vdev->ctx; + struct spdk_pci_device 
*pci_dev = hw->pci_dev; + + free_virtio_hw(hw); + spdk_pci_device_detach(pci_dev); +} + +static uint8_t +modern_get_status(struct virtio_dev *dev) +{ + struct virtio_hw *hw = dev->ctx; + + return spdk_mmio_read_1(&hw->common_cfg->device_status); +} + +static void +modern_set_status(struct virtio_dev *dev, uint8_t status) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_1(&hw->common_cfg->device_status, status); +} + +static uint16_t +modern_get_queue_size(struct virtio_dev *dev, uint16_t queue_id) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_2(&hw->common_cfg->queue_select, queue_id); + return spdk_mmio_read_2(&hw->common_cfg->queue_size); +} + +static int +modern_setup_queue(struct virtio_dev *dev, struct virtqueue *vq) +{ + struct virtio_hw *hw = dev->ctx; + uint64_t desc_addr, avail_addr, used_addr; + uint16_t notify_off; + void *queue_mem; + uint64_t queue_mem_phys_addr; + + /* To ensure physical address contiguity we make the queue occupy + * only a single hugepage (2MB). As of Virtio 1.0, the queue size + * always falls within this limit. + */ + if (vq->vq_ring_size > VALUE_2MB) { + return -ENOMEM; + } + + queue_mem = spdk_zmalloc(vq->vq_ring_size, VALUE_2MB, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (queue_mem == NULL) { + return -ENOMEM; + } + + queue_mem_phys_addr = spdk_vtophys(queue_mem, NULL); + if (queue_mem_phys_addr == SPDK_VTOPHYS_ERROR) { + spdk_free(queue_mem); + return -EFAULT; + } + + vq->vq_ring_mem = queue_mem_phys_addr; + vq->vq_ring_virt_mem = queue_mem; + + if (!check_vq_phys_addr_ok(vq)) { + spdk_free(queue_mem); + return -ENOMEM; + } + + desc_addr = vq->vq_ring_mem; + avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc); + used_addr = (avail_addr + offsetof(struct vring_avail, ring[vq->vq_nentries]) + + VIRTIO_PCI_VRING_ALIGN - 1) & ~(VIRTIO_PCI_VRING_ALIGN - 1); + + spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index); + + io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo, + &hw->common_cfg->queue_desc_hi); + io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo, + &hw->common_cfg->queue_avail_hi); + io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo, + &hw->common_cfg->queue_used_hi); + + notify_off = spdk_mmio_read_2(&hw->common_cfg->queue_notify_off); + vq->notify_addr = (void *)((uint8_t *)hw->notify_base + + notify_off * hw->notify_off_multiplier); + + spdk_mmio_write_2(&hw->common_cfg->queue_enable, 1); + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "queue %"PRIu16" addresses:\n", vq->vq_queue_index); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t desc_addr: %" PRIx64 "\n", desc_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t aval_addr: %" PRIx64 "\n", avail_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t used_addr: %" PRIx64 "\n", used_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t notify addr: %p (notify offset: %"PRIu16")\n", + vq->notify_addr, notify_off); + + return 0; +} + +static void +modern_del_queue(struct virtio_dev *dev, struct virtqueue *vq) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index); + + io_write64_twopart(0, &hw->common_cfg->queue_desc_lo, + &hw->common_cfg->queue_desc_hi); + io_write64_twopart(0, &hw->common_cfg->queue_avail_lo, + &hw->common_cfg->queue_avail_hi); + io_write64_twopart(0, &hw->common_cfg->queue_used_lo, + &hw->common_cfg->queue_used_hi); + + spdk_mmio_write_2(&hw->common_cfg->queue_enable, 0); + + spdk_free(vq->vq_ring_virt_mem); +} + +static void +modern_notify_queue(struct 
virtio_dev *dev, struct virtqueue *vq) +{ + spdk_mmio_write_2(vq->notify_addr, vq->vq_queue_index); +} + +static const struct virtio_dev_ops modern_ops = { + .read_dev_cfg = modern_read_dev_config, + .write_dev_cfg = modern_write_dev_config, + .get_status = modern_get_status, + .set_status = modern_set_status, + .get_features = modern_get_features, + .set_features = modern_set_features, + .destruct_dev = modern_destruct_dev, + .get_queue_size = modern_get_queue_size, + .setup_queue = modern_setup_queue, + .del_queue = modern_del_queue, + .notify_queue = modern_notify_queue, + .dump_json_info = pci_dump_json_info, + .write_json_config = pci_write_json_config, +}; + +static void * +get_cfg_addr(struct virtio_hw *hw, struct virtio_pci_cap *cap) +{ + uint8_t bar = cap->bar; + uint32_t length = cap->length; + uint32_t offset = cap->offset; + + if (bar > 5) { + SPDK_ERRLOG("invalid bar: %"PRIu8"\n", bar); + return NULL; + } + + if (offset + length < offset) { + SPDK_ERRLOG("offset(%"PRIu32") + length(%"PRIu32") overflows\n", + offset, length); + return NULL; + } + + if (offset + length > hw->pci_bar[bar].len) { + SPDK_ERRLOG("invalid cap: overflows bar space: %"PRIu32" > %"PRIu32"\n", + offset + length, hw->pci_bar[bar].len); + return NULL; + } + + if (hw->pci_bar[bar].vaddr == NULL) { + SPDK_ERRLOG("bar %"PRIu8" base addr is NULL\n", bar); + return NULL; + } + + return hw->pci_bar[bar].vaddr + offset; +} + +static int +virtio_read_caps(struct virtio_hw *hw) +{ + uint8_t pos; + struct virtio_pci_cap cap; + int ret; + + ret = spdk_pci_device_cfg_read(hw->pci_dev, &pos, 1, PCI_CAPABILITY_LIST); + if (ret < 0) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "failed to read pci capability list\n"); + return ret; + } + + while (pos) { + ret = spdk_pci_device_cfg_read(hw->pci_dev, &cap, sizeof(cap), pos); + if (ret < 0) { + SPDK_ERRLOG("failed to read pci cap at pos: %"PRIx8"\n", pos); + break; + } + + if (cap.cap_vndr == PCI_CAP_ID_MSIX) { + hw->use_msix = 1; + } + + if (cap.cap_vndr != PCI_CAP_ID_VNDR) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, + "[%2"PRIx8"] skipping non VNDR cap id: %02"PRIx8"\n", + pos, cap.cap_vndr); + goto next; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, + "[%2"PRIx8"] cfg type: %"PRIu8", bar: %"PRIu8", offset: %04"PRIx32", len: %"PRIu32"\n", + pos, cap.cfg_type, cap.bar, cap.offset, cap.length); + + switch (cap.cfg_type) { + case VIRTIO_PCI_CAP_COMMON_CFG: + hw->common_cfg = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_NOTIFY_CFG: + spdk_pci_device_cfg_read(hw->pci_dev, &hw->notify_off_multiplier, + 4, pos + sizeof(cap)); + hw->notify_base = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_DEVICE_CFG: + hw->dev_cfg = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_ISR_CFG: + hw->isr = get_cfg_addr(hw, &cap); + break; + } + +next: + pos = cap.cap_next; + } + + if (hw->common_cfg == NULL || hw->notify_base == NULL || + hw->dev_cfg == NULL || hw->isr == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "no modern virtio pci device found.\n"); + if (ret < 0) { + return ret; + } else { + return -EINVAL; + } + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "found modern virtio pci device.\n"); + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "common cfg mapped at: %p\n", hw->common_cfg); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "device cfg mapped at: %p\n", hw->dev_cfg); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "isr cfg mapped at: %p\n", hw->isr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "notify base: %p, notify off multiplier: %u\n", + hw->notify_base, hw->notify_off_multiplier); + + return 0; +} + 
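virtio_read_caps() above only discovers the modern-virtio vendor capabilities; it is the probe path that follows (virtio_pci_dev_probe(), driven by virtio_pci_dev_enumerate() or virtio_pci_dev_attach()) that maps the PCI BARs and hands the resulting virtio_pci_ctx to the caller's create callback. The sketch below shows one way such a callback could be wired up; it is illustrative only and not part of this diff. It assumes the callback shape int (*virtio_pci_create_cb)(struct virtio_pci_ctx *, void *) implied by the call in virtio_pci_dev_probe(), that the probe API is declared in spdk_internal/virtio.h, and it uses 0x1048 as an example device ID standing in for a modern virtio-scsi device.

/* Illustrative consumer of the virtio_pci probe API; not part of this diff.
 * Assumed: the virtio_pci_create_cb shape used by virtio_pci_dev_probe()
 * above, declarations coming from spdk_internal/virtio.h, and 0x1048 as an
 * example PCI device ID. For simplicity the sketch binds at most one device.
 */
#include "spdk/stdinc.h"

#include "spdk_internal/virtio.h"

static struct virtio_dev g_vdev;

static int
example_virtio_pci_create_cb(struct virtio_pci_ctx *pci_ctx, void *enum_ctx)
{
	struct virtio_dev *vdev = enum_ctx;
	int rc;

	/* Bind the mapped PCI device to a generic virtio_dev backed by modern_ops. */
	rc = virtio_pci_dev_init(vdev, "VirtioPci0", pci_ctx);
	if (rc != 0) {
		/* A non-zero return makes virtio_pci_dev_probe() free the PCI context. */
		return rc;
	}

	/* Reset the device and negotiate features; virtio_dev_reset() adds
	 * VIRTIO_F_VERSION_1 to the requested feature set itself.
	 */
	rc = virtio_dev_reset(vdev, 1ULL << VIRTIO_RING_F_EVENT_IDX);
	if (rc != 0) {
		return rc;
	}

	/* Allocate the virtqueues and set DRIVER_OK: 4 request queues, no fixed queues. */
	return virtio_dev_start(vdev, 4, 0);
}

static int
example_enumerate(void)
{
	/* Probe attached virtio PCI devices matching the example device ID. */
	return virtio_pci_dev_enumerate(example_virtio_pci_create_cb, &g_vdev, 0x1048);
}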
+static int +virtio_pci_dev_probe(struct spdk_pci_device *pci_dev, struct virtio_pci_probe_ctx *ctx) +{ + struct virtio_hw *hw; + uint8_t *bar_vaddr; + uint64_t bar_paddr, bar_len; + int rc; + unsigned i; + char bdf[32]; + struct spdk_pci_addr addr; + + addr = spdk_pci_device_get_addr(pci_dev); + rc = spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr); + if (rc != 0) { + SPDK_ERRLOG("Ignoring a device with non-parseable PCI address\n"); + return -1; + } + + hw = calloc(1, sizeof(*hw)); + if (hw == NULL) { + SPDK_ERRLOG("%s: calloc failed\n", bdf); + return -1; + } + + hw->pci_dev = pci_dev; + + for (i = 0; i < 6; ++i) { + rc = spdk_pci_device_map_bar(pci_dev, i, (void *) &bar_vaddr, &bar_paddr, + &bar_len); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to memmap PCI BAR %u\n", bdf, i); + free_virtio_hw(hw); + return -1; + } + + hw->pci_bar[i].vaddr = bar_vaddr; + hw->pci_bar[i].len = bar_len; + } + + /* Virtio PCI caps exist only on modern PCI devices. + * Legacy devices are not supported. + */ + if (virtio_read_caps(hw) != 0) { + SPDK_NOTICELOG("Ignoring legacy PCI device at %s\n", bdf); + free_virtio_hw(hw); + return -1; + } + + rc = ctx->enum_cb((struct virtio_pci_ctx *)hw, ctx->enum_ctx); + if (rc != 0) { + free_virtio_hw(hw); + } + + return rc; +} + +static int +virtio_pci_dev_probe_cb(void *probe_ctx, struct spdk_pci_device *pci_dev) +{ + struct virtio_pci_probe_ctx *ctx = probe_ctx; + uint16_t pci_device_id = spdk_pci_device_get_device_id(pci_dev); + + if (pci_device_id != ctx->device_id) { + return 1; + } + + return virtio_pci_dev_probe(pci_dev, ctx); +} + +int +virtio_pci_dev_enumerate(virtio_pci_create_cb enum_cb, void *enum_ctx, + uint16_t pci_device_id) +{ + struct virtio_pci_probe_ctx ctx; + + if (!spdk_process_is_primary()) { + SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n"); + return 0; + } + + ctx.enum_cb = enum_cb; + ctx.enum_ctx = enum_ctx; + ctx.device_id = pci_device_id; + + return spdk_pci_enumerate(spdk_pci_virtio_get_driver(), + virtio_pci_dev_probe_cb, &ctx); +} + +int +virtio_pci_dev_attach(virtio_pci_create_cb enum_cb, void *enum_ctx, + uint16_t pci_device_id, struct spdk_pci_addr *pci_address) +{ + struct virtio_pci_probe_ctx ctx; + + if (!spdk_process_is_primary()) { + SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n"); + return 0; + } + + ctx.enum_cb = enum_cb; + ctx.enum_ctx = enum_ctx; + ctx.device_id = pci_device_id; + + return spdk_pci_device_attach(spdk_pci_virtio_get_driver(), + virtio_pci_dev_probe_cb, &ctx, pci_address); +} + +int +virtio_pci_dev_init(struct virtio_dev *vdev, const char *name, + struct virtio_pci_ctx *pci_ctx) +{ + int rc; + + rc = virtio_dev_construct(vdev, name, &modern_ops, pci_ctx); + if (rc != 0) { + return rc; + } + + vdev->is_hw = 1; + vdev->modern = 1; + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("virtio_pci", SPDK_LOG_VIRTIO_PCI) diff --git a/src/spdk/lib/virtio/virtio_user.c b/src/spdk/lib/virtio/virtio_user.c new file mode 100644 index 000000000..4f4932db9 --- /dev/null +++ b/src/spdk/lib/virtio/virtio_user.c @@ -0,0 +1,628 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <sys/eventfd.h> + +#include "vhost_user.h" +#include "spdk/string.h" +#include "spdk/config.h" + +#include "spdk_internal/virtio.h" + +#define VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES \ + ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) + +static int +virtio_user_create_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + + /* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_CALL come + * firstly because vhost depends on this msg to allocate virtqueue + * pair. + */ + struct vhost_vring_file file; + + file.index = queue_sel; + file.fd = dev->callfds[queue_sel]; + return dev->ops->send_request(dev, VHOST_USER_SET_VRING_CALL, &file); +} + +static int +virtio_user_set_vring_addr(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vring *vring = &dev->vrings[queue_sel]; + struct vhost_vring_addr addr = { + .index = queue_sel, + .desc_user_addr = (uint64_t)(uintptr_t)vring->desc, + .avail_user_addr = (uint64_t)(uintptr_t)vring->avail, + .used_user_addr = (uint64_t)(uintptr_t)vring->used, + .log_guest_addr = 0, + .flags = 0, /* disable log */ + }; + + return dev->ops->send_request(dev, VHOST_USER_SET_VRING_ADDR, &addr); +} + +static int +virtio_user_kick_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_vring_file file; + struct vhost_vring_state state; + struct vring *vring = &dev->vrings[queue_sel]; + int rc; + + state.index = queue_sel; + state.num = vring->num; + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_NUM, &state); + if (rc < 0) { + return rc; + } + + state.index = queue_sel; + state.num = 0; /* no reservation */ + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_BASE, &state); + if (rc < 0) { + return rc; + } + + virtio_user_set_vring_addr(vdev, queue_sel); + + /* Of all per virtqueue MSGs, make sure VHOST_USER_SET_VRING_KICK comes + * lastly because vhost depends on this msg to judge if + * virtio is ready. 
+ */ + file.index = queue_sel; + file.fd = dev->kickfds[queue_sel]; + return dev->ops->send_request(dev, VHOST_USER_SET_VRING_KICK, &file); +} + +static int +virtio_user_stop_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_vring_state state; + + state.index = queue_sel; + state.num = 0; + + return dev->ops->send_request(dev, VHOST_USER_GET_VRING_BASE, &state); +} + +static int +virtio_user_queue_setup(struct virtio_dev *vdev, + int (*fn)(struct virtio_dev *, uint32_t)) +{ + uint32_t i; + int rc; + + for (i = 0; i < vdev->max_queues; ++i) { + rc = fn(vdev, i); + if (rc < 0) { + SPDK_ERRLOG("setup tx vq fails: %"PRIu32".\n", i); + return rc; + } + } + + return 0; +} + +static int +virtio_user_map_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct virtio_dev *vdev = cb_ctx; + struct virtio_user_dev *dev = vdev->ctx; + uint64_t features; + int ret; + + /* We have to resend all mappings anyway, so don't bother with any + * page tracking. + */ + ret = dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL); + if (ret < 0) { + return ret; + } + +#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB + /* Our internal rte_vhost lib requires SET_VRING_ADDR to flush a pending + * SET_MEM_TABLE. On the other hand, the upstream rte_vhost will invalidate + * the entire queue upon receiving SET_VRING_ADDR message, so we mustn't + * send it here. Both behaviors are strictly implementation specific, but + * this message isn't needed from the point of the spec, so send it only + * if vhost is compiled with our internal lib. + */ + ret = virtio_user_queue_setup(vdev, virtio_user_set_vring_addr); + if (ret < 0) { + return ret; + } +#endif + + /* Since we might want to use that mapping straight away, we have to + * make sure the guest has already processed our SET_MEM_TABLE message. + * F_REPLY_ACK is just a feature and the host is not obliged to + * support it, so we send a simple message that always has a response + * and we wait for that response. Messages are always processed in order. + */ + return dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features); +} + +static int +virtio_user_register_mem(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + const struct spdk_mem_map_ops virtio_user_map_ops = { + .notify_cb = virtio_user_map_notify, + .are_contiguous = NULL + }; + + dev->mem_map = spdk_mem_map_alloc(0, &virtio_user_map_ops, vdev); + if (dev->mem_map == NULL) { + SPDK_ERRLOG("spdk_mem_map_alloc() failed\n"); + return -1; + } + + return 0; +} + +static void +virtio_user_unregister_mem(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + + spdk_mem_map_free(&dev->mem_map); +} + +static int +virtio_user_start_device(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint64_t host_max_queues; + int ret; + + if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0 && + vdev->max_queues > 1 + vdev->fixed_queues_num) { + SPDK_WARNLOG("%s: requested %"PRIu16" request queues, but the " + "host doesn't support VHOST_USER_PROTOCOL_F_MQ. " + "Only one request queue will be used.\n", + vdev->name, vdev->max_queues - vdev->fixed_queues_num); + vdev->max_queues = 1 + vdev->fixed_queues_num; + } + + /* negotiate the number of I/O queues. 
*/ + ret = dev->ops->send_request(dev, VHOST_USER_GET_QUEUE_NUM, &host_max_queues); + if (ret < 0) { + return ret; + } + + if (vdev->max_queues > host_max_queues + vdev->fixed_queues_num) { + SPDK_WARNLOG("%s: requested %"PRIu16" request queues" + "but only %"PRIu64" available\n", + vdev->name, vdev->max_queues - vdev->fixed_queues_num, + host_max_queues); + vdev->max_queues = host_max_queues; + } + + /* tell vhost to create queues */ + ret = virtio_user_queue_setup(vdev, virtio_user_create_queue); + if (ret < 0) { + return ret; + } + + ret = virtio_user_register_mem(vdev); + if (ret < 0) { + return ret; + } + + return virtio_user_queue_setup(vdev, virtio_user_kick_queue); +} + +static int +virtio_user_stop_device(struct virtio_dev *vdev) +{ + int ret; + + ret = virtio_user_queue_setup(vdev, virtio_user_stop_queue); + /* a queue might fail to stop for various reasons, e.g. socket + * connection going down, but this mustn't prevent us from freeing + * the mem map. + */ + virtio_user_unregister_mem(vdev); + return ret; +} + +static int +virtio_user_dev_setup(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint16_t i; + + dev->vhostfd = -1; + + for (i = 0; i < SPDK_VIRTIO_MAX_VIRTQUEUES; ++i) { + dev->callfds[i] = -1; + dev->kickfds[i] = -1; + } + + dev->ops = &ops_user; + + return dev->ops->setup(dev); +} + +static int +virtio_user_read_dev_config(struct virtio_dev *vdev, size_t offset, + void *dst, int length) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_user_config cfg = {0}; + int rc; + + if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) { + return -ENOTSUP; + } + + cfg.offset = 0; + cfg.size = VHOST_USER_MAX_CONFIG_SIZE; + + rc = dev->ops->send_request(dev, VHOST_USER_GET_CONFIG, &cfg); + if (rc < 0) { + SPDK_ERRLOG("get_config failed: %s\n", spdk_strerror(-rc)); + return rc; + } + + memcpy(dst, cfg.region + offset, length); + return 0; +} + +static int +virtio_user_write_dev_config(struct virtio_dev *vdev, size_t offset, + const void *src, int length) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_user_config cfg = {0}; + int rc; + + if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) { + return -ENOTSUP; + } + + cfg.offset = offset; + cfg.size = length; + memcpy(cfg.region, src, length); + + rc = dev->ops->send_request(dev, VHOST_USER_SET_CONFIG, &cfg); + if (rc < 0) { + SPDK_ERRLOG("set_config failed: %s\n", spdk_strerror(-rc)); + return rc; + } + + return 0; +} + +static void +virtio_user_set_status(struct virtio_dev *vdev, uint8_t status) +{ + struct virtio_user_dev *dev = vdev->ctx; + int rc = 0; + + if ((dev->status & VIRTIO_CONFIG_S_NEEDS_RESET) && + status != VIRTIO_CONFIG_S_RESET) { + rc = -1; + } else if (status & VIRTIO_CONFIG_S_DRIVER_OK) { + rc = virtio_user_start_device(vdev); + } else if (status == VIRTIO_CONFIG_S_RESET && + (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { + rc = virtio_user_stop_device(vdev); + } + + if (rc != 0) { + dev->status |= VIRTIO_CONFIG_S_NEEDS_RESET; + } else { + dev->status = status; + } +} + +static uint8_t +virtio_user_get_status(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + + return dev->status; +} + +static uint64_t +virtio_user_get_features(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint64_t features; + int rc; + + rc = dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features); + if (rc < 0) { + SPDK_ERRLOG("get_features failed: %s\n", spdk_strerror(-rc)); + return 0; + } + + 
return features; +} + +static int +virtio_user_set_features(struct virtio_dev *vdev, uint64_t features) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint64_t protocol_features; + int ret; + + ret = dev->ops->send_request(dev, VHOST_USER_SET_FEATURES, &features); + if (ret < 0) { + return ret; + } + + vdev->negotiated_features = features; + vdev->modern = virtio_dev_has_feature(vdev, VIRTIO_F_VERSION_1); + + if (!virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) { + /* nothing else to do */ + return 0; + } + + ret = dev->ops->send_request(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &protocol_features); + if (ret < 0) { + return ret; + } + + protocol_features &= VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES; + ret = dev->ops->send_request(dev, VHOST_USER_SET_PROTOCOL_FEATURES, &protocol_features); + if (ret < 0) { + return ret; + } + + dev->protocol_features = protocol_features; + return 0; +} + +static uint16_t +virtio_user_get_queue_size(struct virtio_dev *vdev, uint16_t queue_id) +{ + struct virtio_user_dev *dev = vdev->ctx; + + /* Currently each queue has same queue size */ + return dev->queue_size; +} + +static int +virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_vring_state state; + uint16_t queue_idx = vq->vq_queue_index; + void *queue_mem; + uint64_t desc_addr, avail_addr, used_addr; + int callfd, kickfd, rc; + + if (dev->callfds[queue_idx] != -1 || dev->kickfds[queue_idx] != -1) { + SPDK_ERRLOG("queue %"PRIu16" already exists\n", queue_idx); + return -EEXIST; + } + + /* May use invalid flag, but some backend uses kickfd and + * callfd as criteria to judge if dev is alive. so finally we + * use real event_fd. + */ + callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (callfd < 0) { + SPDK_ERRLOG("callfd error, %s\n", spdk_strerror(errno)); + return -errno; + } + + kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (kickfd < 0) { + SPDK_ERRLOG("kickfd error, %s\n", spdk_strerror(errno)); + close(callfd); + return -errno; + } + + queue_mem = spdk_zmalloc(vq->vq_ring_size, VIRTIO_PCI_VRING_ALIGN, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (queue_mem == NULL) { + close(kickfd); + close(callfd); + return -ENOMEM; + } + + vq->vq_ring_mem = SPDK_VTOPHYS_ERROR; + vq->vq_ring_virt_mem = queue_mem; + + state.index = vq->vq_queue_index; + state.num = 0; + + if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) { + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_ENABLE, &state); + if (rc < 0) { + SPDK_ERRLOG("failed to send VHOST_USER_SET_VRING_ENABLE: %s\n", + spdk_strerror(-rc)); + close(kickfd); + close(callfd); + spdk_free(queue_mem); + return -rc; + } + } + + dev->callfds[queue_idx] = callfd; + dev->kickfds[queue_idx] = kickfd; + + desc_addr = (uintptr_t)vq->vq_ring_virt_mem; + avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc); + used_addr = SPDK_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail, + ring[vq->vq_nentries]), + VIRTIO_PCI_VRING_ALIGN); + + dev->vrings[queue_idx].num = vq->vq_nentries; + dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr; + dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr; + dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr; + + return 0; +} + +static void +virtio_user_del_queue(struct virtio_dev *vdev, struct virtqueue *vq) +{ + /* For legacy devices, write 0 to VIRTIO_PCI_QUEUE_PFN port, QEMU + * correspondingly stops the ioeventfds, and reset the status of + * the device. 
+ * For modern devices, set queue desc, avail, used in PCI bar to 0, + * not see any more behavior in QEMU. + * + * Here we just care about what information to deliver to vhost-user. + * So we just close ioeventfd for now. + */ + struct virtio_user_dev *dev = vdev->ctx; + + close(dev->callfds[vq->vq_queue_index]); + close(dev->kickfds[vq->vq_queue_index]); + dev->callfds[vq->vq_queue_index] = -1; + dev->kickfds[vq->vq_queue_index] = -1; + + spdk_free(vq->vq_ring_virt_mem); +} + +static void +virtio_user_notify_queue(struct virtio_dev *vdev, struct virtqueue *vq) +{ + uint64_t buf = 1; + struct virtio_user_dev *dev = vdev->ctx; + + if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) { + SPDK_ERRLOG("failed to kick backend: %s.\n", spdk_strerror(errno)); + } +} + +static void +virtio_user_destroy(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + + close(dev->vhostfd); + free(dev); +} + +static void +virtio_user_dump_json_info(struct virtio_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct virtio_user_dev *dev = vdev->ctx; + + spdk_json_write_named_string(w, "type", "user"); + spdk_json_write_named_string(w, "socket", dev->path); +} + +static void +virtio_user_write_json_config(struct virtio_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct virtio_user_dev *dev = vdev->ctx; + + spdk_json_write_named_string(w, "trtype", "user"); + spdk_json_write_named_string(w, "traddr", dev->path); + spdk_json_write_named_uint32(w, "vq_count", vdev->max_queues - vdev->fixed_queues_num); + spdk_json_write_named_uint32(w, "vq_size", virtio_dev_backend_ops(vdev)->get_queue_size(vdev, 0)); +} + +static const struct virtio_dev_ops virtio_user_ops = { + .read_dev_cfg = virtio_user_read_dev_config, + .write_dev_cfg = virtio_user_write_dev_config, + .get_status = virtio_user_get_status, + .set_status = virtio_user_set_status, + .get_features = virtio_user_get_features, + .set_features = virtio_user_set_features, + .destruct_dev = virtio_user_destroy, + .get_queue_size = virtio_user_get_queue_size, + .setup_queue = virtio_user_setup_queue, + .del_queue = virtio_user_del_queue, + .notify_queue = virtio_user_notify_queue, + .dump_json_info = virtio_user_dump_json_info, + .write_json_config = virtio_user_write_json_config, +}; + +int +virtio_user_dev_init(struct virtio_dev *vdev, const char *name, const char *path, + uint32_t queue_size) +{ + struct virtio_user_dev *dev; + int rc; + + if (name == NULL) { + SPDK_ERRLOG("No name gived for controller: %s\n", path); + return -EINVAL; + } + + dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + return -ENOMEM; + } + + rc = virtio_dev_construct(vdev, name, &virtio_user_ops, dev); + if (rc != 0) { + SPDK_ERRLOG("Failed to init device: %s\n", path); + free(dev); + return rc; + } + + vdev->is_hw = 0; + + snprintf(dev->path, PATH_MAX, "%s", path); + dev->queue_size = queue_size; + + rc = virtio_user_dev_setup(vdev); + if (rc < 0) { + SPDK_ERRLOG("backend set up fails\n"); + goto err; + } + + rc = dev->ops->send_request(dev, VHOST_USER_SET_OWNER, NULL); + if (rc < 0) { + SPDK_ERRLOG("set_owner fails: %s\n", spdk_strerror(-rc)); + goto err; + } + + return 0; + +err: + virtio_dev_destruct(vdev); + return rc; +} diff --git a/src/spdk/lib/vmd/Makefile b/src/spdk/lib/vmd/Makefile new file mode 100644 index 000000000..13813c559 --- /dev/null +++ b/src/spdk/lib/vmd/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = vmd.c led.c +LIBNAME = vmd + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_vmd.map) + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/vmd/led.c b/src/spdk/lib/vmd/led.c new file mode 100644 index 000000000..878983aab --- /dev/null +++ b/src/spdk/lib/vmd/led.c @@ -0,0 +1,166 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/likely.h" +#include "spdk/log.h" +#include "vmd.h" + +struct vmd_led_indicator_config { + uint8_t attention_indicator : 2; + uint8_t power_indicator : 2; + uint8_t reserved : 4; +}; + +/* + * VMD LED Attn Power LED Amber + * State Indicator Indicator + * Control Control + * ------------------------------------------------ + * Off 11b 11b Off + * Ident 11b 01b Blink 4Hz + * Fault 01b 11b On + * Rebuild 01b 01b Blink 1Hz + */ +static const struct vmd_led_indicator_config g_led_config[] = { + [SPDK_VMD_LED_STATE_OFF] = { .attention_indicator = 3, .power_indicator = 3 }, + [SPDK_VMD_LED_STATE_IDENTIFY] = { .attention_indicator = 3, .power_indicator = 1 }, + [SPDK_VMD_LED_STATE_FAULT] = { .attention_indicator = 1, .power_indicator = 3 }, + [SPDK_VMD_LED_STATE_REBUILD] = { .attention_indicator = 1, .power_indicator = 1 }, +}; + +static void +vmd_led_set_indicator_control(struct vmd_pci_device *vmd_device, enum spdk_vmd_led_state state) +{ + const struct vmd_led_indicator_config *config; + union express_slot_control_register slot_control; + + assert(state >= SPDK_VMD_LED_STATE_OFF && state <= SPDK_VMD_LED_STATE_REBUILD); + config = &g_led_config[state]; + + slot_control = vmd_device->pcie_cap->slot_control; + slot_control.bit_field.attention_indicator_control = config->attention_indicator; + slot_control.bit_field.power_indicator_control = config->power_indicator; + + /* + * Due to the fact that writes to the PCI config space are posted writes, we need to issue + * a read to the register we've just written to ensure it reached its destination. + * TODO: wrap all register writes with a function taking care of that. + */ + vmd_device->pcie_cap->slot_control = slot_control; + vmd_device->cached_slot_control = vmd_device->pcie_cap->slot_control; +} + +static unsigned int +vmd_led_get_state(struct vmd_pci_device *vmd_device) +{ + const struct vmd_led_indicator_config *config; + union express_slot_control_register slot_control; + unsigned int state; + + slot_control = vmd_device->cached_slot_control; + for (state = SPDK_VMD_LED_STATE_OFF; state <= SPDK_VMD_LED_STATE_REBUILD; ++state) { + config = &g_led_config[state]; + + if (slot_control.bit_field.attention_indicator_control == config->attention_indicator && + slot_control.bit_field.power_indicator_control == config->power_indicator) { + return state; + } + } + + return SPDK_VMD_LED_STATE_UNKNOWN; +} + +/* + * The identifying device under VMD is located in the global list of VMD controllers. If the BDF + * identifies an endpoint, then the LED is attached to the endpoint's parent. If the BDF identifies + * a type 1 header, then this device has the corresponding LED. This may arise when a user wants to + * identify a given empty slot under VMD. 
+ */ +static struct vmd_pci_device * +vmd_get_led_device(const struct spdk_pci_device *pci_device) +{ + struct vmd_pci_device *vmd_device; + + assert(strcmp(spdk_pci_device_get_type(pci_device), "vmd") == 0); + + vmd_device = vmd_find_device(&pci_device->addr); + if (spdk_unlikely(vmd_device == NULL)) { + return NULL; + } + + if (vmd_device->header_type == PCI_HEADER_TYPE_NORMAL) { + if (spdk_unlikely(vmd_device->parent == NULL)) { + return NULL; + } + + return vmd_device->parent->self; + } + + return vmd_device; +} + +int +spdk_vmd_set_led_state(struct spdk_pci_device *pci_device, enum spdk_vmd_led_state state) +{ + struct vmd_pci_device *vmd_device; + + if (state < SPDK_VMD_LED_STATE_OFF || state > SPDK_VMD_LED_STATE_REBUILD) { + SPDK_ERRLOG("Invalid LED state\n"); + return -EINVAL; + } + + vmd_device = vmd_get_led_device(pci_device); + if (spdk_unlikely(vmd_device == NULL)) { + SPDK_ERRLOG("The PCI device is not behind the VMD\n"); + return -ENODEV; + } + + vmd_led_set_indicator_control(vmd_device, state); + return 0; +} + +int +spdk_vmd_get_led_state(struct spdk_pci_device *pci_device, enum spdk_vmd_led_state *state) +{ + struct vmd_pci_device *vmd_device; + + vmd_device = vmd_get_led_device(pci_device); + if (spdk_unlikely(vmd_device == NULL)) { + SPDK_ERRLOG("The PCI device is not behind the VMD\n"); + return -ENODEV; + } + + *state = (enum spdk_vmd_led_state)vmd_led_get_state(vmd_device); + return 0; +} diff --git a/src/spdk/lib/vmd/spdk_vmd.map b/src/spdk/lib/vmd/spdk_vmd.map new file mode 100644 index 000000000..036d079b5 --- /dev/null +++ b/src/spdk/lib/vmd/spdk_vmd.map @@ -0,0 +1,13 @@ +{ + global: + + # public functions + spdk_vmd_init; + spdk_vmd_fini; + spdk_vmd_pci_device_list; + spdk_vmd_set_led_state; + spdk_vmd_get_led_state; + spdk_vmd_hotplug_monitor; + + local: *; +}; diff --git a/src/spdk/lib/vmd/vmd.c b/src/spdk/lib/vmd/vmd.c new file mode 100644 index 000000000..14d9558c2 --- /dev/null +++ b/src/spdk/lib/vmd/vmd.c @@ -0,0 +1,1376 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vmd.h" + +#include "spdk/stdinc.h" +#include "spdk/likely.h" + +static unsigned char *device_type[] = { + "PCI Express Endpoint", + "Legacy PCI Express Endpoint", + "Reserved 1", + "Reserved 2", + "Root Port of PCI Express Root Complex", + "Upstream Port of PCI Express Switch", + "Downstream Port of PCI Express Switch", + "PCI Express to PCI/PCI-X Bridge", + "PCI/PCI-X to PCI Express Bridge", + "Root Complex Integrated Endpoint", + "Root Complex Event Collector", + "Reserved Capability" +}; + +/* + * Container for all VMD adapter probed in the system. + */ +struct vmd_container { + uint32_t count; + struct vmd_adapter vmd[MAX_VMD_SUPPORTED]; +}; + +static struct vmd_container g_vmd_container; +static uint8_t g_end_device_count; + +static bool +vmd_is_valid_cfg_addr(struct vmd_pci_bus *bus, uint64_t addr) +{ + return addr >= (uint64_t)bus->vmd->cfg_vaddr && + addr < bus->vmd->cfgbar_size + (uint64_t)bus->vmd->cfg_vaddr; +} + +static void +vmd_align_base_addrs(struct vmd_adapter *vmd, uint32_t alignment) +{ + uint32_t pad; + + /* + * Device is not in hot plug path, align the base address remaining from membar 1. + */ + if (vmd->physical_addr & (alignment - 1)) { + pad = alignment - (vmd->physical_addr & (alignment - 1)); + vmd->physical_addr += pad; + vmd->current_addr_size -= pad; + } +} + +static bool +vmd_device_is_enumerated(const struct vmd_pci_device *vmd_device) +{ + return vmd_device->header->one.prefetch_base_upper == VMD_UPPER_BASE_SIGNATURE && + vmd_device->header->one.prefetch_limit_upper == VMD_UPPER_LIMIT_SIGNATURE; +} + +static bool +vmd_device_is_root_port(const struct vmd_pci_device *vmd_device) +{ + return vmd_device->header->common.vendor_id == 0x8086 && + (vmd_device->header->common.device_id == 0x2030 || + vmd_device->header->common.device_id == 0x2031 || + vmd_device->header->common.device_id == 0x2032 || + vmd_device->header->common.device_id == 0x2033); +} + +static void +vmd_hotplug_coalesce_regions(struct vmd_hot_plug *hp) +{ + struct pci_mem_mgr *region, *prev; + + do { + prev = NULL; + TAILQ_FOREACH(region, &hp->free_mem_queue, tailq) { + if (prev != NULL && (prev->addr + prev->size == region->addr)) { + break; + } + + prev = region; + } + + if (region != NULL) { + prev->size += region->size; + TAILQ_REMOVE(&hp->free_mem_queue, region, tailq); + TAILQ_INSERT_TAIL(&hp->unused_mem_queue, region, tailq); + } + } while (region != NULL); +} + +static void +vmd_hotplug_free_region(struct vmd_hot_plug *hp, struct pci_mem_mgr *region) +{ + struct pci_mem_mgr *current, *prev = NULL; + + assert(region->addr >= hp->bar.start && region->addr < hp->bar.start + hp->bar.size); + + TAILQ_FOREACH(current, &hp->free_mem_queue, tailq) { + if (current->addr > region->addr) { + break; + } + + prev = current; + } + + if (prev != NULL) { + assert(prev->addr + prev->size <= region->addr); + assert(current == NULL || (region->addr + region->size <= current->addr)); + TAILQ_INSERT_AFTER(&hp->free_mem_queue, prev, region, tailq); + } else { 
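+		/* No free region starts at a lower address, so the freed range becomes the new list head */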
+ TAILQ_INSERT_HEAD(&hp->free_mem_queue, region, tailq); + } + + vmd_hotplug_coalesce_regions(hp); +} + +static void +vmd_hotplug_free_addr(struct vmd_hot_plug *hp, uint64_t addr) +{ + struct pci_mem_mgr *region; + + TAILQ_FOREACH(region, &hp->alloc_mem_queue, tailq) { + if (region->addr == addr) { + break; + } + } + + assert(region != NULL); + TAILQ_REMOVE(&hp->alloc_mem_queue, region, tailq); + + vmd_hotplug_free_region(hp, region); +} + +static uint64_t +vmd_hotplug_allocate_base_addr(struct vmd_hot_plug *hp, uint32_t size) +{ + struct pci_mem_mgr *region = NULL, *free_region; + + TAILQ_FOREACH(region, &hp->free_mem_queue, tailq) { + if (region->size >= size) { + break; + } + } + + if (region == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Unable to find free hotplug memory region of size:" + "%"PRIx32"\n", size); + return 0; + } + + TAILQ_REMOVE(&hp->free_mem_queue, region, tailq); + if (size < region->size) { + free_region = TAILQ_FIRST(&hp->unused_mem_queue); + if (free_region == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Unable to find unused descriptor to store the " + "free region of size: %"PRIu32"\n", region->size - size); + } else { + TAILQ_REMOVE(&hp->unused_mem_queue, free_region, tailq); + free_region->size = region->size - size; + free_region->addr = region->addr + size; + region->size = size; + vmd_hotplug_free_region(hp, free_region); + } + } + + TAILQ_INSERT_TAIL(&hp->alloc_mem_queue, region, tailq); + + return region->addr; +} + +/* + * Allocates an address from vmd membar for the input memory size + * vmdAdapter - vmd adapter object + * dev - vmd_pci_device to allocate a base address for. + * size - size of the memory window requested. + * Size must be an integral multiple of 2. Addresses are returned on the size boundary. + * Returns physical address within the VMD membar window, or 0x0 if cannot allocate window. + * Consider increasing the size of vmd membar if 0x0 is returned. + */ +static uint64_t +vmd_allocate_base_addr(struct vmd_adapter *vmd, struct vmd_pci_device *dev, uint32_t size) +{ + uint64_t base_address = 0, padding = 0; + struct vmd_pci_bus *hp_bus; + + if (size && ((size & (~size + 1)) != size)) { + return base_address; + } + + /* + * If device is downstream of a hot plug port, allocate address from the + * range dedicated for the hot plug slot. Search the list of addresses allocated to determine + * if a free range exists that satisfy the input request. If a free range cannot be found, + * get a buffer from the unused chunk. First fit algorithm, is used. 
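+ * For example, a 256 KB request takes the first free region of at least 256 KB; if that region
+ * is larger, the remainder is split off and returned to the free list.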
+ */ + if (dev) { + hp_bus = dev->parent; + if (hp_bus && hp_bus->self && hp_bus->self->hotplug_capable) { + return vmd_hotplug_allocate_base_addr(&hp_bus->self->hp, size); + } + } + + /* Ensure physical membar allocated is size aligned */ + if (vmd->physical_addr & (size - 1)) { + padding = size - (vmd->physical_addr & (size - 1)); + } + + /* Allocate from membar if enough memory is left */ + if (vmd->current_addr_size >= size + padding) { + base_address = vmd->physical_addr + padding; + vmd->physical_addr += size + padding; + vmd->current_addr_size -= size + padding; + } + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "allocated(size) %lx (%x)\n", base_address, size); + + return base_address; +} + +static bool +vmd_is_end_device(struct vmd_pci_device *dev) +{ + return (dev && dev->header) && + ((dev->header->common.header_type & ~PCI_MULTI_FUNCTION) == PCI_HEADER_TYPE_NORMAL); +} + +static void +vmd_update_base_limit_register(struct vmd_pci_device *dev, uint16_t base, uint16_t limit) +{ + struct vmd_pci_bus *bus; + struct vmd_pci_device *bridge; + + if (base == 0 || limit == 0) { + return; + } + + if (dev->header->common.header_type == PCI_HEADER_TYPE_BRIDGE) { + bus = dev->bus_object; + } else { + bus = dev->parent; + } + + bridge = bus->self; + SPDK_DEBUGLOG(SPDK_LOG_VMD, "base:limit = %x:%x\n", bridge->header->one.mem_base, + bridge->header->one.mem_limit); + + if (dev->bus->vmd->scan_completed) { + return; + } + + while (bus && bus->self != NULL) { + bridge = bus->self; + + /* This is only for 32-bit memory space, need to revisit to support 64-bit */ + if (bridge->header->one.mem_base > base) { + bridge->header->one.mem_base = base; + base = bridge->header->one.mem_base; + } + + if (bridge->header->one.mem_limit < limit) { + bridge->header->one.mem_limit = limit; + limit = bridge->header->one.mem_limit; + } + + bus = bus->parent; + } +} + +static uint64_t +vmd_get_base_addr(struct vmd_pci_device *dev, uint32_t index, uint32_t size) +{ + struct vmd_pci_bus *bus = dev->parent; + + if (dev->header_type == PCI_HEADER_TYPE_BRIDGE) { + return dev->header->zero.BAR[index] & ~0xf; + } else { + if (bus->self->hotplug_capable) { + return vmd_hotplug_allocate_base_addr(&bus->self->hp, size); + } else { + return (uint64_t)bus->self->header->one.mem_base << 16; + } + } +} + +static bool +vmd_assign_base_addrs(struct vmd_pci_device *dev) +{ + uint16_t mem_base = 0, mem_limit = 0; + unsigned char mem_attr = 0; + int last; + struct vmd_adapter *vmd = NULL; + bool ret_val = false; + uint32_t bar_value; + uint32_t table_offset; + + if (dev && dev->bus) { + vmd = dev->bus->vmd; + } + + if (!vmd) { + return 0; + } + + vmd_align_base_addrs(vmd, ONE_MB); + + last = dev->header_type ? 
2 : 6; + for (int i = 0; i < last; i++) { + bar_value = dev->header->zero.BAR[i]; + dev->header->zero.BAR[i] = ~(0U); + dev->bar[i].size = dev->header->zero.BAR[i]; + dev->header->zero.BAR[i] = bar_value; + + if (dev->bar[i].size == ~(0U) || dev->bar[i].size == 0 || + dev->header->zero.BAR[i] & 1) { + dev->bar[i].size = 0; + continue; + } + mem_attr = dev->bar[i].size & PCI_BASE_ADDR_MASK; + dev->bar[i].size = TWOS_COMPLEMENT(dev->bar[i].size & PCI_BASE_ADDR_MASK); + + if (vmd->scan_completed) { + dev->bar[i].start = vmd_get_base_addr(dev, i, dev->bar[i].size); + } else { + dev->bar[i].start = vmd_allocate_base_addr(vmd, dev, dev->bar[i].size); + } + + dev->header->zero.BAR[i] = (uint32_t)dev->bar[i].start; + + if (!dev->bar[i].start) { + if (mem_attr == (PCI_BAR_MEMORY_PREFETCH | PCI_BAR_MEMORY_TYPE_64)) { + i++; + } + continue; + } + + dev->bar[i].vaddr = ((uint64_t)vmd->mem_vaddr + (dev->bar[i].start - vmd->membar)); + mem_limit = BRIDGE_BASEREG(dev->header->zero.BAR[i]) + + BRIDGE_BASEREG(dev->bar[i].size - 1); + if (!mem_base) { + mem_base = BRIDGE_BASEREG(dev->header->zero.BAR[i]); + } + + ret_val = true; + + if (mem_attr == (PCI_BAR_MEMORY_PREFETCH | PCI_BAR_MEMORY_TYPE_64)) { + i++; + if (i < last) { + dev->header->zero.BAR[i] = (uint32_t)(dev->bar[i].start >> PCI_DWORD_SHIFT); + } + } + } + + /* Enable device MEM and bus mastering */ + dev->header->zero.command |= (PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + uint16_t cmd = dev->header->zero.command; + cmd++; + + if (dev->msix_cap && ret_val) { + table_offset = ((volatile struct pci_msix_cap *)dev->msix_cap)->msix_table_offset; + if (dev->bar[table_offset & 0x3].vaddr) { + dev->msix_table = (volatile struct pci_msix_table_entry *) + (dev->bar[table_offset & 0x3].vaddr + (table_offset & 0xfff8)); + } + } + + if (ret_val && vmd_is_end_device(dev)) { + vmd_update_base_limit_register(dev, mem_base, mem_limit); + } + + return ret_val; +} + +static void +vmd_get_device_capabilities(struct vmd_pci_device *dev) + +{ + volatile uint8_t *config_space; + uint8_t capabilities_offset; + struct pci_capabilities_header *capabilities_hdr; + + config_space = (volatile uint8_t *)dev->header; + if ((dev->header->common.status & PCI_CAPABILITIES_LIST) == 0) { + return; + } + + capabilities_offset = dev->header->zero.cap_pointer; + if (dev->header->common.header_type & PCI_HEADER_TYPE_BRIDGE) { + capabilities_offset = dev->header->one.cap_pointer; + } + + while (capabilities_offset > 0) { + capabilities_hdr = (struct pci_capabilities_header *) + &config_space[capabilities_offset]; + switch (capabilities_hdr->capability_id) { + case CAPABILITY_ID_PCI_EXPRESS: + dev->pcie_cap = (volatile struct pci_express_cap *)(capabilities_hdr); + break; + + case CAPABILITY_ID_MSI: + dev->msi_cap = (volatile struct pci_msi_cap *)capabilities_hdr; + break; + + case CAPABILITY_ID_MSIX: + dev->msix_cap = (volatile struct pci_msix_capability *)capabilities_hdr; + dev->msix_table_size = dev->msix_cap->message_control.bit.table_size + 1; + break; + + default: + break; + } + capabilities_offset = capabilities_hdr->next; + } +} + +static volatile struct pci_enhanced_capability_header * +vmd_get_enhanced_capabilities(struct vmd_pci_device *dev, uint16_t capability_id) +{ + uint8_t *data; + uint16_t cap_offset = EXTENDED_CAPABILITY_OFFSET; + volatile struct pci_enhanced_capability_header *cap_hdr = NULL; + + data = (uint8_t *)dev->header; + while (cap_offset >= EXTENDED_CAPABILITY_OFFSET) { + cap_hdr = (volatile struct pci_enhanced_capability_header *) &data[cap_offset]; + if 
(cap_hdr->capability_id == capability_id) {
+ return cap_hdr;
+ }
+ cap_offset = cap_hdr->next;
+ if (cap_offset == 0 || cap_offset < EXTENDED_CAPABILITY_OFFSET) {
+ break;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+vmd_read_config_space(struct vmd_pci_device *dev)
+{
+ /*
+ * Writes to the pci config space are posted writes. To ensure the transaction reaches its destination
+ * before another write is posted, an immediate read of the written value should be performed.
+ */
+ dev->header->common.command |= (BUS_MASTER_ENABLE | MEMORY_SPACE_ENABLE);
+ { uint16_t cmd = dev->header->common.command; (void)cmd; }
+
+ vmd_get_device_capabilities(dev);
+ dev->sn_cap = (struct serial_number_capability *)vmd_get_enhanced_capabilities(dev,
+ DEVICE_SERIAL_NUMBER_CAP_ID);
+}
+
+static void
+vmd_update_scan_info(struct vmd_pci_device *dev)
+{
+ struct vmd_adapter *vmd_adapter = dev->bus->vmd;
+
+ if (vmd_adapter->root_port_updated) {
+ return;
+ }
+
+ if (dev->header_type == PCI_HEADER_TYPE_NORMAL) {
+ return;
+ }
+
+ if (vmd_device_is_root_port(dev)) {
+ vmd_adapter->root_port_updated = 1;
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "root_port_updated = %d\n",
+ vmd_adapter->root_port_updated);
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "upper:limit = %x : %x\n",
+ dev->header->one.prefetch_base_upper,
+ dev->header->one.prefetch_limit_upper);
+ if (vmd_device_is_enumerated(dev)) {
+ vmd_adapter->scan_completed = 1;
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "scan_completed = %d\n",
+ vmd_adapter->scan_completed);
+ }
+ }
+}
+
+static void
+vmd_reset_base_limit_registers(struct vmd_pci_device *dev)
+{
+ uint32_t reg __attribute__((unused));
+
+ assert(dev->header_type != PCI_HEADER_TYPE_NORMAL);
+ /*
+ * Writes to the pci config space are posted writes.
+ * To ensure the transaction reaches its destination
+ * before another write is posted, an immediate read
+ * of the written value should be performed.
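+ * The assignments to the otherwise-unused 'reg' variable below force that read-back after each write.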
+ */ + dev->header->one.mem_base = 0xfff0; + reg = dev->header->one.mem_base; + dev->header->one.mem_limit = 0x0; + reg = dev->header->one.mem_limit; + dev->header->one.prefetch_base = 0x0; + reg = dev->header->one.prefetch_base; + dev->header->one.prefetch_limit = 0x0; + reg = dev->header->one.prefetch_limit; + dev->header->one.prefetch_base_upper = 0x0; + reg = dev->header->one.prefetch_base_upper; + dev->header->one.prefetch_limit_upper = 0x0; + reg = dev->header->one.prefetch_limit_upper; + dev->header->one.io_base_upper = 0x0; + reg = dev->header->one.io_base_upper; + dev->header->one.io_limit_upper = 0x0; + reg = dev->header->one.io_limit_upper; + dev->header->one.primary = 0; + reg = dev->header->one.primary; + dev->header->one.secondary = 0; + reg = dev->header->one.secondary; + dev->header->one.subordinate = 0; + reg = dev->header->one.subordinate; +} + +static void +vmd_init_hotplug(struct vmd_pci_device *dev, struct vmd_pci_bus *bus) +{ + struct vmd_adapter *vmd = bus->vmd; + struct vmd_hot_plug *hp = &dev->hp; + size_t mem_id; + + dev->hotplug_capable = true; + hp->bar.size = 1 << 20; + + if (!vmd->scan_completed) { + hp->bar.start = vmd_allocate_base_addr(vmd, NULL, hp->bar.size); + bus->self->header->one.mem_base = BRIDGE_BASEREG(hp->bar.start); + bus->self->header->one.mem_limit = + bus->self->header->one.mem_base + BRIDGE_BASEREG(hp->bar.size - 1); + } else { + hp->bar.start = (uint64_t)bus->self->header->one.mem_base << 16; + } + + hp->bar.vaddr = (uint64_t)vmd->mem_vaddr + (hp->bar.start - vmd->membar); + + TAILQ_INIT(&hp->free_mem_queue); + TAILQ_INIT(&hp->unused_mem_queue); + TAILQ_INIT(&hp->alloc_mem_queue); + + hp->mem[0].size = hp->bar.size; + hp->mem[0].addr = hp->bar.start; + + TAILQ_INSERT_TAIL(&hp->free_mem_queue, &hp->mem[0], tailq); + + for (mem_id = 1; mem_id < ADDR_ELEM_COUNT; ++mem_id) { + TAILQ_INSERT_TAIL(&hp->unused_mem_queue, &hp->mem[mem_id], tailq); + } + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "%s: mem_base:mem_limit = %x : %x\n", __func__, + bus->self->header->one.mem_base, bus->self->header->one.mem_limit); +} + +static bool +vmd_bus_device_present(struct vmd_pci_bus *bus, uint32_t devfn) +{ + volatile struct pci_header *header; + + header = (volatile struct pci_header *)(bus->vmd->cfg_vaddr + + CONFIG_OFFSET_ADDR(bus->bus_number, devfn, 0, 0)); + if (!vmd_is_valid_cfg_addr(bus, (uint64_t)header)) { + return false; + } + + if (header->common.vendor_id == PCI_INVALID_VENDORID || header->common.vendor_id == 0x0) { + return false; + } + + return true; +} + +static struct vmd_pci_device * +vmd_alloc_dev(struct vmd_pci_bus *bus, uint32_t devfn) +{ + struct vmd_pci_device *dev = NULL; + struct pci_header volatile *header; + uint8_t header_type; + uint32_t rev_class; + + /* Make sure we're not creating two devices on the same dev/fn */ + TAILQ_FOREACH(dev, &bus->dev_list, tailq) { + if (dev->devfn == devfn) { + return NULL; + } + } + + if (!vmd_bus_device_present(bus, devfn)) { + return NULL; + } + + header = (struct pci_header * volatile)(bus->vmd->cfg_vaddr + + CONFIG_OFFSET_ADDR(bus->bus_number, devfn, 0, 0)); + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "PCI device found: %04x:%04x ***\n", + header->common.vendor_id, header->common.device_id); + + dev = calloc(1, sizeof(*dev)); + if (!dev) { + return NULL; + } + + dev->header = header; + dev->vid = dev->header->common.vendor_id; + dev->did = dev->header->common.device_id; + dev->bus = bus; + dev->parent = bus; + dev->devfn = devfn; + header_type = dev->header->common.header_type; + rev_class = dev->header->common.rev_class; 
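+ /* rev_class packs the 24-bit class code in its upper three bytes and the revision ID in the lowest byte */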
+ dev->class = rev_class >> 8; + dev->header_type = header_type & 0x7; + + if (header_type == PCI_HEADER_TYPE_BRIDGE) { + vmd_update_scan_info(dev); + if (!dev->bus->vmd->scan_completed) { + vmd_reset_base_limit_registers(dev); + } + } + + vmd_read_config_space(dev); + + return dev; +} + +static struct vmd_pci_bus * +vmd_create_new_bus(struct vmd_pci_bus *parent, struct vmd_pci_device *bridge, uint8_t bus_number) +{ + struct vmd_pci_bus *new_bus; + + new_bus = calloc(1, sizeof(*new_bus)); + if (!new_bus) { + return NULL; + } + + new_bus->parent = parent; + new_bus->domain = parent->domain; + new_bus->bus_number = bus_number; + new_bus->secondary_bus = new_bus->subordinate_bus = bus_number; + new_bus->self = bridge; + new_bus->vmd = parent->vmd; + TAILQ_INIT(&new_bus->dev_list); + + bridge->subordinate = new_bus; + + bridge->pci.addr.bus = new_bus->bus_number; + bridge->pci.addr.dev = bridge->devfn; + bridge->pci.addr.func = 0; + bridge->pci.addr.domain = parent->vmd->pci->addr.domain; + + return new_bus; +} + +/* + * Assigns a bus number from the list of available + * bus numbers. If the device is downstream of a hot plug port, + * assign the bus number from thiose assigned to the HP port. Otherwise, + * assign the next bus number from the vmd bus number list. + */ +static uint8_t +vmd_get_next_bus_number(struct vmd_pci_device *dev, struct vmd_adapter *vmd) +{ + uint8_t bus = 0xff; + struct vmd_pci_bus *hp_bus; + + if (dev) { + hp_bus = vmd_is_dev_in_hotplug_path(dev); + if (hp_bus && hp_bus->self && hp_bus->self->hotplug_capable) { + return vmd_hp_get_next_bus_number(&hp_bus->self->hp); + } + } + + /* Device is not under a hot plug path. Return next global bus number */ + if ((vmd->next_bus_number + 1) < vmd->max_pci_bus) { + bus = vmd->next_bus_number; + vmd->next_bus_number++; + } + return bus; +} + +static uint8_t +vmd_get_hotplug_bus_numbers(struct vmd_pci_device *dev) +{ + uint8_t bus_number = 0xff; + + if (dev && dev->bus && dev->bus->vmd && + ((dev->bus->vmd->next_bus_number + RESERVED_HOTPLUG_BUSES) < dev->bus->vmd->max_pci_bus)) { + bus_number = RESERVED_HOTPLUG_BUSES; + dev->bus->vmd->next_bus_number += RESERVED_HOTPLUG_BUSES; + } + + return bus_number; +} + +static void +vmd_enable_msix(struct vmd_pci_device *dev) +{ + volatile uint16_t control; + + control = dev->msix_cap->message_control.as_uint16_t | (1 << 14); + dev->msix_cap->message_control.as_uint16_t = control; + control = dev->msix_cap->message_control.as_uint16_t; + dev->msix_cap->message_control.as_uint16_t = (control | (1 << 15)); + control = dev->msix_cap->message_control.as_uint16_t; + control = control & ~(1 << 14); + dev->msix_cap->message_control.as_uint16_t = control; + control = dev->msix_cap->message_control.as_uint16_t; +} + +static void +vmd_disable_msix(struct vmd_pci_device *dev) +{ + volatile uint16_t control; + + control = dev->msix_cap->message_control.as_uint16_t | (1 << 14); + dev->msix_cap->message_control.as_uint16_t = control; + control = dev->msix_cap->message_control.as_uint16_t & ~(1 << 15); + dev->msix_cap->message_control.as_uint16_t = control; + control = dev->msix_cap->message_control.as_uint16_t; +} + +/* + * Set up MSI-X table entries for the port. Vmd MSIX vector 0 is used for + * port interrupt, so vector 0 is mapped to all MSIX entries for the port. 
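+ * vmd_setup_msix() below masks every table entry (vector_control = 1) before re-enabling MSI-X.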
+ */ +static void +vmd_setup_msix(struct vmd_pci_device *dev, volatile struct pci_msix_table_entry *vmdEntry) +{ + int entry; + + if (!dev || !vmdEntry || !dev->msix_cap) { + return; + } + + vmd_disable_msix(dev); + if (dev->msix_table == NULL || dev->msix_table_size > MAX_MSIX_TABLE_SIZE) { + return; + } + + for (entry = 0; entry < dev->msix_table_size; ++entry) { + dev->msix_table[entry].vector_control = 1; + } + vmd_enable_msix(dev); +} + +static void +vmd_bus_update_bridge_info(struct vmd_pci_device *bridge) +{ + /* Update the subordinate bus of all bridges above this bridge */ + volatile struct vmd_pci_device *dev = bridge; + uint8_t subordinate_bus; + + if (!dev) { + return; + } + subordinate_bus = bridge->header->one.subordinate; + while (dev->parent_bridge != NULL) { + dev = dev->parent_bridge; + if (dev->header->one.subordinate < subordinate_bus) { + dev->header->one.subordinate = subordinate_bus; + subordinate_bus = dev->header->one.subordinate; + } + } +} + +static bool +vmd_is_supported_device(struct vmd_pci_device *dev) +{ + return dev->class == PCI_CLASS_STORAGE_EXPRESS; +} + +static int +vmd_dev_map_bar(struct spdk_pci_device *pci_dev, uint32_t bar, + void **mapped_addr, uint64_t *phys_addr, uint64_t *size) +{ + struct vmd_pci_device *dev = SPDK_CONTAINEROF(pci_dev, struct vmd_pci_device, pci); + + *size = dev->bar[bar].size; + *phys_addr = dev->bar[bar].start; + *mapped_addr = (void *)dev->bar[bar].vaddr; + + return 0; +} + +static int +vmd_dev_unmap_bar(struct spdk_pci_device *_dev, uint32_t bar, void *addr) +{ + return 0; +} + +static int +vmd_dev_cfg_read(struct spdk_pci_device *_dev, void *value, uint32_t len, + uint32_t offset) +{ + struct vmd_pci_device *dev = SPDK_CONTAINEROF(_dev, struct vmd_pci_device, pci); + volatile uint8_t *src = (volatile uint8_t *)dev->header; + uint8_t *dst = value; + size_t i; + + if (len + offset > PCI_MAX_CFG_SIZE) { + return -1; + } + + for (i = 0; i < len; ++i) { + dst[i] = src[offset + i]; + } + + return 0; +} + +static int +vmd_dev_cfg_write(struct spdk_pci_device *_dev, void *value, + uint32_t len, uint32_t offset) +{ + struct vmd_pci_device *dev = SPDK_CONTAINEROF(_dev, struct vmd_pci_device, pci); + volatile uint8_t *dst = (volatile uint8_t *)dev->header; + uint8_t *src = value; + size_t i; + + if ((len + offset) > PCI_MAX_CFG_SIZE) { + return -1; + } + + for (i = 0; i < len; ++i) { + dst[offset + i] = src[i]; + } + + return 0; +} + +static void +vmd_dev_detach(struct spdk_pci_device *dev) +{ + struct vmd_pci_device *vmd_device = (struct vmd_pci_device *)dev; + struct vmd_pci_device *bus_device = vmd_device->bus->self; + struct vmd_pci_bus *bus = vmd_device->bus; + size_t i, num_bars = vmd_device->header_type ? 
2 : 6; + + spdk_pci_unhook_device(dev); + TAILQ_REMOVE(&bus->dev_list, vmd_device, tailq); + + /* Release the hotplug region if the device is under hotplug-capable bus */ + if (bus_device && bus_device->hotplug_capable) { + for (i = 0; i < num_bars; ++i) { + if (vmd_device->bar[i].start != 0) { + vmd_hotplug_free_addr(&bus_device->hp, vmd_device->bar[i].start); + } + } + } + + free(dev); +} + +static void +vmd_dev_init(struct vmd_pci_device *dev) +{ + uint8_t bdf[32]; + + dev->pci.addr.domain = dev->bus->vmd->domain; + dev->pci.addr.bus = dev->bus->bus_number; + dev->pci.addr.dev = dev->devfn; + dev->pci.addr.func = 0; + dev->pci.id.vendor_id = dev->header->common.vendor_id; + dev->pci.id.device_id = dev->header->common.device_id; + dev->pci.type = "vmd"; + dev->pci.map_bar = vmd_dev_map_bar; + dev->pci.unmap_bar = vmd_dev_unmap_bar; + dev->pci.cfg_read = vmd_dev_cfg_read; + dev->pci.cfg_write = vmd_dev_cfg_write; + dev->hotplug_capable = false; + if (dev->pcie_cap != NULL) { + dev->cached_slot_control = dev->pcie_cap->slot_control; + } + + if (vmd_is_supported_device(dev)) { + spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->pci.addr); + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Initalizing NVMe device at %s\n", bdf); + dev->pci.parent = dev->bus->vmd->pci; + spdk_pci_hook_device(spdk_pci_nvme_get_driver(), &dev->pci); + } +} + +/* + * Scans a single bus for all devices attached and return a count of + * how many devices found. In the VMD topology, it is assume there are no multi- + * function devices. Hence a bus(bridge) will not have multi function with both type + * 0 and 1 header. + * + * The other option for implementing this function is the bus is an int and + * create a new device PciBridge. PciBridge would inherit from PciDevice with extra fields, + * sub/pri/sec bus. The input becomes PciPort, bus number and parent_bridge. + * + * The bus number is scanned and if a device is found, based on the header_type, create + * either PciBridge(1) or PciDevice(0). + * + * If a PciBridge, assign bus numbers and rescan new bus. The currenty PciBridge being + * scanned becomes the passed in parent_bridge with the new bus number. + * + * The linked list becomes list of pciBridges with PciDevices attached. + * + * Return count of how many devices found(type1 + type 0 header devices) + */ +static uint8_t +vmd_scan_single_bus(struct vmd_pci_bus *bus, struct vmd_pci_device *parent_bridge) +{ + /* assuming only single function devices are on the bus */ + struct vmd_pci_device *new_dev; + struct vmd_adapter *vmd; + union express_slot_capabilities_register slot_cap; + struct vmd_pci_bus *new_bus; + uint8_t device_number, dev_cnt = 0; + uint8_t new_bus_num; + + for (device_number = 0; device_number < 32; device_number++) { + new_dev = vmd_alloc_dev(bus, device_number); + if (new_dev == NULL) { + continue; + } + + dev_cnt++; + if (new_dev->header->common.header_type & PCI_HEADER_TYPE_BRIDGE) { + slot_cap.as_uint32_t = 0; + if (new_dev->pcie_cap != NULL) { + slot_cap.as_uint32_t = new_dev->pcie_cap->slot_cap.as_uint32_t; + } + + new_bus_num = vmd_get_next_bus_number(bus->vmd->is_hotplug_scan ? 
new_dev : NULL, bus->vmd); + if (new_bus_num == 0xff) { + free(new_dev); + return dev_cnt; + } + new_bus = vmd_create_new_bus(bus, new_dev, new_bus_num); + if (!new_bus) { + free(new_dev); + return dev_cnt; + } + new_bus->primary_bus = bus->secondary_bus; + new_bus->self = new_dev; + new_dev->bus_object = new_bus; + + if (slot_cap.bit_field.hotplug_capable && new_dev->pcie_cap != NULL && + new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented) { + new_bus->hotplug_buses = vmd_get_hotplug_bus_numbers(new_dev); + new_bus->subordinate_bus += new_bus->hotplug_buses; + + /* Attach hot plug instance if HP is supported */ + /* Hot inserted SSDs can be assigned port bus of sub-ordinate + 1 */ + SPDK_DEBUGLOG(SPDK_LOG_VMD, "hotplug_capable/slot_implemented = " + "%x:%x\n", slot_cap.bit_field.hotplug_capable, + new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented); + } + + new_dev->parent_bridge = parent_bridge; + new_dev->header->one.primary = new_bus->primary_bus; + new_dev->header->one.secondary = new_bus->secondary_bus; + new_dev->header->one.subordinate = new_bus->subordinate_bus; + + vmd_bus_update_bridge_info(new_dev); + TAILQ_INSERT_TAIL(&bus->vmd->bus_list, new_bus, tailq); + + vmd_dev_init(new_dev); + + if (slot_cap.bit_field.hotplug_capable && new_dev->pcie_cap != NULL && + new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented) { + vmd_init_hotplug(new_dev, new_bus); + } + + dev_cnt += vmd_scan_single_bus(new_bus, new_dev); + if (new_dev->pcie_cap != NULL) { + if (new_dev->pcie_cap->express_cap_register.bit_field.device_type == SwitchUpstreamPort) { + return dev_cnt; + } + } + } else { + /* Attach the device to the current bus and assign base addresses */ + TAILQ_INSERT_TAIL(&bus->dev_list, new_dev, tailq); + g_end_device_count++; + if (vmd_assign_base_addrs(new_dev)) { + vmd_setup_msix(new_dev, &bus->vmd->msix_table[0]); + vmd_dev_init(new_dev); + if (vmd_is_supported_device(new_dev)) { + vmd = bus->vmd; + vmd->target[vmd->nvme_count] = new_dev; + vmd->nvme_count++; + } + } else { + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Removing failed device:%p\n", new_dev); + TAILQ_REMOVE(&bus->dev_list, new_dev, tailq); + free(new_dev); + if (dev_cnt) { + dev_cnt--; + } + } + } + } + + return dev_cnt; +} + +static void +vmd_print_pci_info(struct vmd_pci_device *dev) +{ + if (!dev) { + return; + } + + if (dev->pcie_cap != NULL) { + SPDK_INFOLOG(SPDK_LOG_VMD, "PCI DEVICE: [%04X:%04X] type(%x) : %s\n", + dev->header->common.vendor_id, dev->header->common.device_id, + dev->pcie_cap->express_cap_register.bit_field.device_type, + device_type[dev->pcie_cap->express_cap_register.bit_field.device_type]); + } else { + SPDK_INFOLOG(SPDK_LOG_VMD, "PCI DEVICE: [%04X:%04X]\n", + dev->header->common.vendor_id, dev->header->common.device_id); + } + + SPDK_INFOLOG(SPDK_LOG_VMD, "\tDOMAIN:BDF: %04x:%02x:%02x:%x\n", dev->pci.addr.domain, + dev->pci.addr.bus, dev->pci.addr.dev, dev->pci.addr.func); + + if (!(dev->header_type & PCI_HEADER_TYPE_BRIDGE) && dev->bus) { + SPDK_INFOLOG(SPDK_LOG_VMD, "\tbase addr: %x : %p\n", + dev->header->zero.BAR[0], (void *)dev->bar[0].vaddr); + } + + if ((dev->header_type & PCI_HEADER_TYPE_BRIDGE)) { + SPDK_INFOLOG(SPDK_LOG_VMD, "\tPrimary = %d, Secondary = %d, Subordinate = %d\n", + dev->header->one.primary, dev->header->one.secondary, dev->header->one.subordinate); + if (dev->pcie_cap && dev->pcie_cap->express_cap_register.bit_field.slot_implemented) { + SPDK_INFOLOG(SPDK_LOG_VMD, "\tSlot implemented on this device.\n"); + if 
(dev->pcie_cap->slot_cap.bit_field.hotplug_capable) { + SPDK_INFOLOG(SPDK_LOG_VMD, "Device has HOT-PLUG capable slot.\n"); + } + } + } + + if (dev->sn_cap != NULL) { + uint8_t *snLow = (uint8_t *)&dev->sn_cap->sn_low; + uint8_t *snHi = (uint8_t *)&dev->sn_cap->sn_hi; + + SPDK_INFOLOG(SPDK_LOG_VMD, "\tSN: %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x\n", + snHi[3], snHi[2], snHi[1], snHi[0], snLow[3], snLow[2], snLow[1], snLow[0]); + } +} + +static void +vmd_cache_scan_info(struct vmd_pci_device *dev) +{ + uint32_t reg __attribute__((unused)); + + if (dev->header_type == PCI_HEADER_TYPE_NORMAL) { + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "vendor/device id:%x:%x\n", dev->header->common.vendor_id, + dev->header->common.device_id); + + if (vmd_device_is_root_port(dev)) { + dev->header->one.prefetch_base_upper = VMD_UPPER_BASE_SIGNATURE; + reg = dev->header->one.prefetch_base_upper; + dev->header->one.prefetch_limit_upper = VMD_UPPER_LIMIT_SIGNATURE; + reg = dev->header->one.prefetch_limit_upper; + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "prefetch: %x:%x\n", + dev->header->one.prefetch_base_upper, + dev->header->one.prefetch_limit_upper); + } +} + +static uint8_t +vmd_scan_pcibus(struct vmd_pci_bus *bus) +{ + struct vmd_pci_bus *bus_entry; + struct vmd_pci_device *dev; + uint8_t dev_cnt; + + g_end_device_count = 0; + TAILQ_INSERT_TAIL(&bus->vmd->bus_list, bus, tailq); + bus->vmd->next_bus_number = bus->bus_number + 1; + dev_cnt = vmd_scan_single_bus(bus, NULL); + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "VMD scan found %u devices\n", dev_cnt); + SPDK_DEBUGLOG(SPDK_LOG_VMD, "VMD scan found %u END DEVICES\n", g_end_device_count); + + SPDK_INFOLOG(SPDK_LOG_VMD, "PCIe devices attached to VMD %04x:%02x:%02x:%x...\n", + bus->vmd->pci->addr.domain, bus->vmd->pci->addr.bus, + bus->vmd->pci->addr.dev, bus->vmd->pci->addr.func); + + TAILQ_FOREACH(bus_entry, &bus->vmd->bus_list, tailq) { + if (bus_entry->self != NULL) { + vmd_print_pci_info(bus_entry->self); + vmd_cache_scan_info(bus_entry->self); + } + + TAILQ_FOREACH(dev, &bus_entry->dev_list, tailq) { + vmd_print_pci_info(dev); + } + } + + return dev_cnt; +} + +static int +vmd_map_bars(struct vmd_adapter *vmd, struct spdk_pci_device *dev) +{ + int rc; + + rc = spdk_pci_device_map_bar(dev, 0, (void **)&vmd->cfg_vaddr, + &vmd->cfgbar, &vmd->cfgbar_size); + if (rc == 0) { + rc = spdk_pci_device_map_bar(dev, 2, (void **)&vmd->mem_vaddr, + &vmd->membar, &vmd->membar_size); + } + + if (rc == 0) { + rc = spdk_pci_device_map_bar(dev, 4, (void **)&vmd->msix_vaddr, + &vmd->msixbar, &vmd->msixbar_size); + } + + if (rc == 0) { + vmd->physical_addr = vmd->membar; + vmd->current_addr_size = vmd->membar_size; + } + return rc; +} + +static int +vmd_enumerate_devices(struct vmd_adapter *vmd) +{ + vmd->vmd_bus.vmd = vmd; + vmd->vmd_bus.secondary_bus = vmd->vmd_bus.subordinate_bus = 0; + vmd->vmd_bus.primary_bus = vmd->vmd_bus.bus_number = 0; + vmd->vmd_bus.domain = vmd->pci->addr.domain; + + return vmd_scan_pcibus(&vmd->vmd_bus); +} + +struct vmd_pci_device * +vmd_find_device(const struct spdk_pci_addr *addr) +{ + struct vmd_pci_bus *bus; + struct vmd_pci_device *dev; + int i; + + for (i = 0; i < MAX_VMD_TARGET; ++i) { + TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) { + if (bus->self) { + if (spdk_pci_addr_compare(&bus->self->pci.addr, addr) == 0) { + return bus->self; + } + } + + TAILQ_FOREACH(dev, &bus->dev_list, tailq) { + if (spdk_pci_addr_compare(&dev->pci.addr, addr) == 0) { + return dev; + } + } + } + } + + return NULL; +} + +static int +vmd_enum_cb(void *ctx, struct 
spdk_pci_device *pci_dev) +{ + uint32_t cmd_reg = 0; + char bdf[32] = {0}; + struct vmd_container *vmd_c = ctx; + size_t i; + + spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4); + cmd_reg |= 0x6; /* PCI bus master/memory enable. */ + spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4); + + spdk_pci_addr_fmt(bdf, sizeof(bdf), &pci_dev->addr); + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Found a VMD[ %d ] at %s\n", vmd_c->count, bdf); + + /* map vmd bars */ + i = vmd_c->count; + vmd_c->vmd[i].pci = pci_dev; + vmd_c->vmd[i].vmd_index = i; + vmd_c->vmd[i].domain = + (pci_dev->addr.bus << 16) | (pci_dev->addr.dev << 8) | pci_dev->addr.func; + vmd_c->vmd[i].max_pci_bus = PCI_MAX_BUS_NUMBER; + TAILQ_INIT(&vmd_c->vmd[i].bus_list); + + if (vmd_map_bars(&vmd_c->vmd[i], pci_dev) == -1) { + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_VMD, "vmd config bar(%p) vaddr(%p) size(%x)\n", + (void *)vmd_c->vmd[i].cfgbar, (void *)vmd_c->vmd[i].cfg_vaddr, + (uint32_t)vmd_c->vmd[i].cfgbar_size); + SPDK_DEBUGLOG(SPDK_LOG_VMD, "vmd mem bar(%p) vaddr(%p) size(%x)\n", + (void *)vmd_c->vmd[i].membar, (void *)vmd_c->vmd[i].mem_vaddr, + (uint32_t)vmd_c->vmd[i].membar_size); + SPDK_DEBUGLOG(SPDK_LOG_VMD, "vmd msix bar(%p) vaddr(%p) size(%x)\n\n", + (void *)vmd_c->vmd[i].msixbar, (void *)vmd_c->vmd[i].msix_vaddr, + (uint32_t)vmd_c->vmd[i].msixbar_size); + + vmd_c->count = i + 1; + + vmd_enumerate_devices(&vmd_c->vmd[i]); + + return 0; +} + +int +spdk_vmd_pci_device_list(struct spdk_pci_addr vmd_addr, struct spdk_pci_device *nvme_list) +{ + int cnt = 0; + struct vmd_pci_bus *bus; + struct vmd_pci_device *dev; + + if (!nvme_list) { + return -1; + } + + for (int i = 0; i < MAX_VMD_TARGET; ++i) { + if (spdk_pci_addr_compare(&vmd_addr, &g_vmd_container.vmd[i].pci->addr) == 0) { + TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) { + TAILQ_FOREACH(dev, &bus->dev_list, tailq) { + nvme_list[cnt++] = dev->pci; + if (!dev->is_hooked) { + vmd_dev_init(dev); + dev->is_hooked = 1; + } + } + } + } + } + + return cnt; +} + +static void +vmd_clear_hotplug_status(struct vmd_pci_bus *bus) +{ + struct vmd_pci_device *device = bus->self; + uint16_t status __attribute__((unused)); + + status = device->pcie_cap->slot_status.as_uint16_t; + device->pcie_cap->slot_status.as_uint16_t = status; + status = device->pcie_cap->slot_status.as_uint16_t; + + status = device->pcie_cap->link_status.as_uint16_t; + device->pcie_cap->link_status.as_uint16_t = status; + status = device->pcie_cap->link_status.as_uint16_t; +} + +static void +vmd_bus_handle_hotplug(struct vmd_pci_bus *bus) +{ + uint8_t num_devices, sleep_count; + + for (sleep_count = 0; sleep_count < 20; ++sleep_count) { + /* Scan until a new device is found */ + num_devices = vmd_scan_single_bus(bus, bus->self); + if (num_devices > 0) { + break; + } + + spdk_delay_us(200000); + } + + if (num_devices == 0) { + SPDK_ERRLOG("Timed out while scanning for hotplugged devices\n"); + } +} + +static void +vmd_bus_handle_hotremove(struct vmd_pci_bus *bus) +{ + struct vmd_pci_device *device, *tmpdev; + + TAILQ_FOREACH_SAFE(device, &bus->dev_list, tailq, tmpdev) { + if (!vmd_bus_device_present(bus, device->devfn)) { + device->pci.internal.pending_removal = true; + + /* If the device isn't attached, remove it immediately */ + if (!device->pci.internal.attached) { + vmd_dev_detach(&device->pci); + } + } + } +} + +int +spdk_vmd_hotplug_monitor(void) +{ + struct vmd_pci_bus *bus; + struct vmd_pci_device *device; + int num_hotplugs = 0; + uint32_t i; + + for (i = 0; i < g_vmd_container.count; ++i) { + TAILQ_FOREACH(bus, 
&g_vmd_container.vmd[i].bus_list, tailq) { + device = bus->self; + if (device == NULL || !device->hotplug_capable) { + continue; + } + + if (device->pcie_cap->slot_status.bit_field.datalink_state_changed != 1) { + continue; + } + + if (device->pcie_cap->link_status.bit_field.datalink_layer_active == 1) { + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Device hotplug detected on bus " + "%"PRIu32"\n", bus->bus_number); + vmd_bus_handle_hotplug(bus); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VMD, "Device hotremove detected on bus " + "%"PRIu32"\n", bus->bus_number); + vmd_bus_handle_hotremove(bus); + } + + vmd_clear_hotplug_status(bus); + num_hotplugs++; + } + } + + return num_hotplugs; +} + +int +spdk_vmd_init(void) +{ + return spdk_pci_enumerate(spdk_pci_vmd_get_driver(), vmd_enum_cb, &g_vmd_container); +} + +void +spdk_vmd_fini(void) +{ + uint32_t i; + + for (i = 0; i < g_vmd_container.count; ++i) { + spdk_pci_device_detach(g_vmd_container.vmd[i].pci); + } +} + +SPDK_LOG_REGISTER_COMPONENT("vmd", SPDK_LOG_VMD) diff --git a/src/spdk/lib/vmd/vmd.h b/src/spdk/lib/vmd/vmd.h new file mode 100644 index 000000000..46490a6f7 --- /dev/null +++ b/src/spdk/lib/vmd/vmd.h @@ -0,0 +1,201 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VMD_H +#define VMD_H + +#include "spdk/stdinc.h" +#include "spdk/vmd.h" +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" +#include "vmd_spec.h" + +struct vmd_hot_plug; +struct vmd_adapter; +struct vmd_pci_device; + +struct pci_bars { + uint64_t vaddr; + uint64_t start; + uint32_t size; +}; + +struct vmd_pci_bus { + struct vmd_adapter *vmd; + struct vmd_pci_bus *parent; /* parent bus that this bus is attached to(primary bus. 
*/ + struct vmd_pci_device *self; /* Pci device that describes this bus(bar, bus numbers, etc */ + + uint32_t domain : 8; + uint32_t hotplug_buses : 10; + uint32_t is_added : 1; + uint32_t hp_event_queued : 1; + uint32_t rsv : 12; + + uint32_t bus_number : 8; + uint32_t primary_bus : 8; + uint32_t secondary_bus : 8; + uint32_t subordinate_bus : 8; + + TAILQ_HEAD(, vmd_pci_device) dev_list; /* list of pci end device attached to this bus */ + TAILQ_ENTRY(vmd_pci_bus) tailq; /* link for all buses found during scan */ +}; + +/* + * memory element for base address assignment and reuse + */ +struct pci_mem_mgr { + uint32_t size : 30; /* size of memory element */ + uint32_t in_use : 1; + uint32_t rsv : 1; + uint64_t addr; + TAILQ_ENTRY(pci_mem_mgr) tailq; +}; + +struct vmd_hot_plug { + uint32_t count : 12; + uint32_t reserved_bus_count : 4; + uint32_t max_hotplug_bus_number : 8; + uint32_t next_bus_number : 8; + struct pci_bars bar; + union express_slot_status_register slot_status; + struct pci_mem_mgr mem[ADDR_ELEM_COUNT]; + uint8_t bus_numbers[RESERVED_HOTPLUG_BUSES]; + struct vmd_pci_bus *bus; + TAILQ_HEAD(, pci_mem_mgr) free_mem_queue; + TAILQ_HEAD(, pci_mem_mgr) alloc_mem_queue; + TAILQ_HEAD(, pci_mem_mgr) unused_mem_queue; +}; + +struct vmd_pci_device { + struct spdk_pci_device pci; + struct pci_bars bar[6]; + + struct vmd_pci_device *parent_bridge; + struct vmd_pci_bus *bus, *parent; + struct vmd_pci_bus *bus_object; /* bus tracks pci bus associated with this dev if type 1 dev. */ + struct vmd_pci_bus *subordinate; + volatile struct pci_header *header; + volatile struct pci_express_cap *pcie_cap; + volatile struct pci_msix_capability *msix_cap; + volatile struct pci_msi_cap *msi_cap; + volatile struct serial_number_capability *sn_cap; + volatile struct pci_msix_table_entry *msix_table; + + TAILQ_ENTRY(vmd_pci_device) tailq; + + uint32_t class; + uint16_t vid; + uint16_t did; + uint16_t pcie_flags, msix_table_size; + uint32_t devfn; + bool hotplug_capable; + + uint32_t header_type : 1; + uint32_t multifunction : 1; + uint32_t hotplug_bridge : 1; + uint32_t is_added : 1; + uint32_t is_hooked : 1; + uint32_t rsv1 : 12; + uint32_t target : 16; + + struct vmd_hot_plug hp; + /* Cached version of the slot_control register */ + union express_slot_control_register cached_slot_control; +}; + +/* + * The VMD adapter + */ +struct vmd_adapter { + struct spdk_pci_device *pci; + uint32_t domain; + /* physical and virtual VMD bars */ + uint64_t cfgbar, cfgbar_size; + uint64_t membar, membar_size; + uint64_t msixbar, msixbar_size; + volatile uint8_t *cfg_vaddr; + volatile uint8_t *mem_vaddr; + volatile uint8_t *msix_vaddr; + volatile struct pci_msix_table_entry *msix_table; + uint32_t bar_sizes[6]; + + uint64_t physical_addr; + uint32_t current_addr_size; + + uint32_t next_bus_number : 10; + uint32_t max_pci_bus : 10; + uint32_t is_hotplug_scan : 1; + uint32_t is_ready : 1; + uint32_t processing_hp : 1; + uint32_t max_payload_size: 3; + uint32_t root_port_updated : 1; + uint32_t scan_completed : 1; + uint32_t rsv : 4; + + /* end devices attached to vmd adapters */ + struct vmd_pci_device *target[MAX_VMD_TARGET]; + uint32_t dev_count : 16; + uint32_t nvme_count : 8; + uint32_t vmd_index : 8; + + struct vmd_pci_bus vmd_bus; + + TAILQ_HEAD(, vmd_pci_bus) bus_list; + + struct event_fifo *hp_queue; +}; + +/* TODO: Temporary stubs for Hot Plug interface */ +static inline struct vmd_pci_bus * +vmd_is_dev_in_hotplug_path(struct vmd_pci_device *dev) +{ + return NULL; +} + +static inline void 
+vmd_hp_enable_hotplug(struct vmd_hot_plug *hp) +{ + +} + +static inline uint8_t +vmd_hp_get_next_bus_number(struct vmd_hot_plug *hp) +{ + assert(false); + return 0; +} + +struct vmd_pci_device *vmd_find_device(const struct spdk_pci_addr *addr); + +#endif /* VMD_H */ diff --git a/src/spdk/lib/vmd/vmd_spec.h b/src/spdk/lib/vmd/vmd_spec.h new file mode 100644 index 000000000..07a4a113d --- /dev/null +++ b/src/spdk/lib/vmd/vmd_spec.h @@ -0,0 +1,473 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#ifndef VMD_SPEC_H +#define VMD_SPEC_H + +#define MAX_VMD_SUPPORTED 48 /* max number of vmd controllers in a system - */ +#define VMD_DOMAIN_START 0x201D + +#define PCI_INVALID_VENDORID 0xFFFF +#define ONE_MB (1<<20) +#define PCI_OFFSET_OF(object, member) ((uint32_t)&((object*)0)->member) +#define TWOS_COMPLEMENT(value) (~(value) + 1) + +#define VMD_UPPER_BASE_SIGNATURE 0xFFFFFFEF +#define VMD_UPPER_LIMIT_SIGNATURE 0xFFFFFFED + +/* + * BAR assignment constants + */ +#define PCI_DWORD_SHIFT 32 +#define PCI_BASE_ADDR_MASK 0xFFFFFFF0 +#define PCI_BAR_MEMORY_MASK 0x0000000F +#define PCI_BAR_MEMORY_MEM_IND 0x1 +#define PCI_BAR_MEMORY_TYPE 0x6 +#define PCI_BAR_MEMORY_PREFETCH 0x8 +#define PCI_BAR_MEMORY_TYPE_32 0x0 +#define PCI_BAR_MEMORY_TYPE_64 0x4 +#define PCI_BAR_MB_MASK 0xFFFFF +#define PCI_PCI_BRIDGE_ADDR_DEF 0xFFF0 +#define PCI_BRIDGE_MEMORY_MASK 0xFFF0 +#define PCI_BRIDGE_PREFETCH_64 0x0001 +#define PCI_BRIDGE_MEMORY_SHIFT 16 +#define PCI_CONFIG_ACCESS_DELAY 500 + +#define PCI_MAX_CFG_SIZE 0x1000 + +#define PCI_HEADER_TYPE 0x0e +#define PCI_HEADER_TYPE_NORMAL 0 +#define PCI_HEADER_TYPE_BRIDGE 1 +#define PCI_MULTI_FUNCTION 0x80 + +#define PCI_COMMAND_MEMORY 0x2 +#define PCI_COMMAND_MASTER 0x4 + +#define PCIE_TYPE_FLAGS 0xf0 +#define PCIE_TYPE_SHIFT 4 +#define PCIE_TYPE_ROOT_PORT 0x4 +#define PCIE_TYPE_DOWNSTREAM 0x6 + +#define PCI_CLASS_STORAGE_EXPRESS 0x010802 +#define ADDR_ELEM_COUNT 32 +#define PCI_MAX_BUS_NUMBER 0x7F +#define RESERVED_HOTPLUG_BUSES 1 +#define isHotPlugCapable(slotCap) ((slotCap) & (1<<6)) +#define CONFIG_OFFSET_ADDR(bus, device, function, reg) (((bus)<<20) | (device)<<15 | (function<<12) | (reg)) +#define BRIDGE_BASEREG(reg) (0xFFF0 & ((reg)>>16)) + +#define MISCCTRLSTS_0_OFFSET 0x188 +#define ENABLE_ACPI_MODE_FOR_HOTPLUG (1 << 3) + +/* Bit encodings for Command Register */ +#define IO_SPACE_ENABLE 0x0001 +#define MEMORY_SPACE_ENABLE 0x0002 +#define BUS_MASTER_ENABLE 0x0004 + +/* Bit encodings for Status Register */ +#define PCI_CAPABILITIES_LIST 0x0010 +#define PCI_RECEIVED_TARGET_ABORT 0x1000 +#define PCI_RECEIVED_MASTER_ABORT 0x2000 +#define PCI_SIGNALED_SYSTEM_ERROR 0x4000 +#define PCI_DETECTED_PARITY_ERROR 0x8000 + +/* Capability IDs */ +#define CAPABILITY_ID_POWER_MANAGEMENT 0x01 +#define CAPABILITY_ID_MSI 0x05 +#define CAPABILITY_ID_PCI_EXPRESS 0x10 +#define CAPABILITY_ID_MSIX 0x11 + +#define PCI_MSIX_ENABLE (1 << 15) /* bit 15 of MSIX Message Control */ +#define PCI_MSIX_FUNCTION_MASK (1 << 14) /* bit 14 of MSIX Message Control */ + +/* extended capability */ +#define EXTENDED_CAPABILITY_OFFSET 0x100 +#define DEVICE_SERIAL_NUMBER_CAP_ID 0x3 + +#define BAR_SIZE (1 << 20) + +struct pci_enhanced_capability_header { + uint16_t capability_id; + uint16_t version: 4; + uint16_t next: 12; +}; + +struct serial_number_capability { + struct pci_enhanced_capability_header hdr; + uint32_t sn_low; + uint32_t sn_hi; +}; + +struct pci_header_common { + uint16_t vendor_id; + uint16_t device_id; + uint16_t command; + uint16_t status; + uint32_t rev_class; + uint8_t cache_line_size; + uint8_t master_lat_timer; + uint8_t header_type; + uint8_t BIST; + uint8_t rsvd12[36]; + uint8_t cap_pointer; + uint8_t rsvd53[7]; + uint8_t int_line; + uint8_t int_pin; + uint8_t rsvd62[2]; +}; + +struct pci_header_zero { + uint16_t vendor_id; + uint16_t device_id; + uint16_t command; + uint16_t status; + uint32_t rev_class; + uint8_t cache_line_size; + uint8_t master_lat_timer; + uint8_t header_type; + uint8_t BIST; + uint32_t BAR[6]; + uint32_t carbus_cis_pointer; + uint16_t ssvid; + 
uint16_t ssid; + uint32_t exp_rom_base_addr; + uint8_t cap_pointer; + uint8_t rsvd53[7]; + uint8_t intLine; + uint8_t int_pin; + uint8_t min_gnt; + uint8_t max_lat; +}; + +struct pci_header_one { + uint16_t vendor_id; + uint16_t device_id; + uint16_t command; + uint16_t status; + uint32_t rev_class; + uint8_t cache_line_size; + uint8_t master_lat_timer; + uint8_t header_type; + uint8_t BIST; + uint32_t BAR[2]; + uint8_t primary; + uint8_t secondary; + uint8_t subordinate; + uint8_t secondary_lat_timer; + uint8_t io_base; + uint8_t io_limit; + uint16_t secondary_status; + uint16_t mem_base; + uint16_t mem_limit; + uint16_t prefetch_base; + uint16_t prefetch_limit; + uint32_t prefetch_base_upper; + uint32_t prefetch_limit_upper; + uint16_t io_base_upper; + uint16_t io_limit_upper; + uint8_t cap_pointer; + uint8_t rsvd53[3]; + uint32_t exp_romBase_addr; + uint8_t int_line; + uint8_t int_pin; + uint16_t bridge_control; +}; + +struct pci_capabilities_header { + uint8_t capability_id; + uint8_t next; +}; + +/* + * MSI capability structure for msi interrupt vectors + */ +#define MAX_MSIX_TABLE_SIZE 0x800 +#define MSIX_ENTRY_VECTOR_CTRL_MASKBIT 1 +#define PORT_INT_VECTOR 0; +#define CLEAR_MSIX_DESTINATION_ID 0xfff00fff +struct pci_msi_cap { + struct pci_capabilities_header header; + union _MsiControl { + uint16_t as_uint16_t; + struct _PCI_MSI_MESSAGE_CONTROL { + uint16_t msi_enable : 1; + uint16_t multiple_message_capable : 3; + uint16_t multiple_message_enable : 3; + uint16_t capable_of_64bits : 1; + uint16_t per_vector_mask_capable : 1; + uint16_t reserved : 7; + } bit; + } message_control; + union { + struct _PCI_MSI_MESSAGE_ADDRESS { + uint32_t reserved : 2; + uint32_t address : 30; + } reg; + uint32_t raw; + } message_address_lower; + union { + struct _Option32_bit { + uint16_t message_data; + } option32_bit; + struct _Option64_bit { + uint32_t message_address_upper; + uint16_t message_data; + uint16_t reserved; + uint32_t mask_bits; + uint32_t pending_bits; + } option64_bit; + }; +}; + +struct pcix_table_pointer { + union { + struct { + uint32_t BaseIndexRegister : 3; + uint32_t Reserved : 29; + } TableBIR; + uint32_t TableOffset; + }; +}; + +struct pci_msix_capability { + struct pci_capabilities_header header; + union _MsixControl { + uint16_t as_uint16_t; + struct msg_ctrl { + uint16_t table_size : 11; + uint16_t reserved : 3; + uint16_t function_mask : 1; + uint16_t msix_enable : 1; + } bit; + } message_control; + + struct pcix_table_pointer message_table; + struct pcix_table_pointer pba_table; +}; + +struct pci_msix_table_entry { + volatile uint32_t message_addr_lo; + volatile uint32_t message_addr_hi; + volatile uint32_t message_data; + volatile uint32_t vector_control; +}; + +/* + * Pci express capability + */ +enum PciExpressCapabilities { + /* 0001b Legacy PCI Express Endpoint */ + LegacyEndpoint = 0x1, + /* 0000b PCI Express Endpoint */ + ExpressEndpoint = 0x0, + /* 0100b Root Port of PCI Express Root Complex* */ + RootComplexRootPort = 0x4, + /* 0101b Upstream Port of PCI Express Switch* */ + SwitchUpstreamPort = 0x5, + /* 0110b Downstream Port of PCI Express Switch* */ + SwitchDownStreamPort = 0x6, + /* 0111b PCI Express to PCI/PCI-X Bridge* */ + ExpressToPciBridge = 0x7, + /* 1000b PCI/PCI-X to PCI Express Bridge* */ + PciToExpressBridge = 0x8, + /* 1001b Root Complex Integrated Endpoint */ + RCIntegratedEndpoint = 0x9, + /* 1010b Root Complex Event Collector */ + RootComplexEventCollector = 0xa, + InvalidCapability = 0xff +}; + +union express_capability_register { + struct { + 
+union express_capability_register {
+	struct {
+		uint16_t capability_version : 4;
+		uint16_t device_type : 4;
+		uint16_t slot_implemented : 1;
+		uint16_t interrupt_message_number : 5;
+		uint16_t rsv : 2;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
+union express_slot_capabilities_register {
+	struct {
+		uint32_t attention_button_present : 1;
+		uint32_t power_controller_present : 1;
+		uint32_t MRL_sensor_present : 1;
+		uint32_t attention_indicator_present : 1;
+		uint32_t power_indicator_present : 1;
+		uint32_t hotplug_surprise : 1;
+		uint32_t hotplug_capable : 1;
+		uint32_t slot_power_limit : 8;
+		uint32_t slotPower_limit_scale : 2;
+		uint32_t electromechanical_lock_present : 1;
+		uint32_t no_command_completed_support : 1;
+		uint32_t physical_slot_number : 13;
+	} bit_field;
+	uint32_t as_uint32_t;
+};
+
+union express_slot_control_register {
+	struct {
+		uint16_t attention_button_enable : 1;
+		uint16_t power_fault_detect_enable : 1;
+		uint16_t MRLsensor_enable : 1;
+		uint16_t presence_detect_enable : 1;
+		uint16_t command_completed_enable : 1;
+		uint16_t hotplug_interrupt_enable : 1;
+		uint16_t attention_indicator_control : 2;
+		uint16_t power_indicator_control : 2;
+		uint16_t power_controller_control : 1;
+		uint16_t electromechanical_lockcontrol : 1;
+		uint16_t datalink_state_change_enable : 1;
+		uint16_t Rsvd : 3;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
+union express_slot_status_register {
+	struct {
+		uint16_t attention_button_pressed : 1;
+		uint16_t power_fault_detected : 1;
+		uint16_t MRL_sensor_changed : 1;
+		uint16_t presence_detect_changed : 1;
+		uint16_t command_completed : 1;
+		uint16_t MRL_sensor_state : 1;
+		uint16_t presence_detect_state : 1;
+		uint16_t electromechanical_lock_engaged : 1;
+		uint16_t datalink_state_changed : 1;
+		uint16_t rsvd : 7;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
+union express_root_control_register {
+	struct {
+		uint16_t CorrectableSerrEnable : 1;
+		uint16_t NonFatalSerrEnable : 1;
+		uint16_t FatalSerrEnable : 1;
+		uint16_t PMEInterruptEnable : 1;
+		uint16_t CRSSoftwareVisibilityEnable : 1;
+		uint16_t Rsvd : 11;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
+union express_link_capability_register {
+	struct {
+		uint32_t maximum_link_speed : 4;
+		uint32_t maximum_link_width : 6;
+		uint32_t active_state_pms_support : 2;
+		uint32_t l0_exit_latency : 3;
+		uint32_t l1_exit_latency : 3;
+		uint32_t clock_power_management : 1;
+		uint32_t surprise_down_error_reporting_capable : 1;
+		uint32_t datalink_layer_active_reporting_capable : 1;
+		uint32_t link_bandwidth_notification_capability : 1;
+		uint32_t aspm_optionality_compliance : 1;
+		uint32_t rsvd : 1;
+		uint32_t port_number : 8;
+	} bit_field;
+	uint32_t as_uint32_t;
+};
+
+union express_link_control_register {
+	struct {
+		uint16_t active_state_pm_control : 2;
+		uint16_t rsvd1 : 1;
+		uint16_t read_completion_boundary : 1;
+		uint16_t link_disable : 1;
+		uint16_t retrain_link : 1;
+		uint16_t common_clock_config : 1;
+		uint16_t extended_synch : 1;
+		uint16_t enable_clock_power_management : 1;
+		uint16_t rsvd2 : 7;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
+union express_link_status_register {
+	struct {
+		uint16_t link_speed : 4;
+		uint16_t link_width : 6;
+		uint16_t undefined : 1;
+		uint16_t link_training : 1;
+		uint16_t slot_clock_config : 1;
+		uint16_t datalink_layer_active : 1;
+		uint16_t asvd : 2;
+	} bit_field;
+	uint16_t as_uint16_t;
+};
+
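The register unions above give both a raw 16- or 32-bit view and a decoded bitfield view of the same PCI Express register. A small sketch of decoding a Link Status value, not part of the diff; the raw value is made up for the example and the code assumes these definitions are available via vmd_spec.h:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		union express_link_status_register ls;

		ls.as_uint16_t = 0x1042;	/* hypothetical register read */
		printf("link speed gen%d, x%d lanes\n",
		       ls.bit_field.link_speed, ls.bit_field.link_width);
		return 0;
	}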
+struct pci_express_cap {
+	uint8_t capid;
+	uint8_t next_cap;
+	union express_capability_register express_cap_register;
+	uint32_t device_cap;
+	uint16_t device_control;
+	uint16_t device_status;
+	union express_link_capability_register link_cap;
+	union express_link_control_register link_control;
+	union express_link_status_register link_status;
+	union express_slot_capabilities_register slot_cap;
+	union express_slot_control_register slot_control;
+	union express_slot_status_register slot_status;
+	uint32_t root_status;
+	uint32_t deviceCap2;
+	uint16_t deviceControl2;
+	uint16_t deviceStatus2;
+	uint32_t linkCap2;
+	uint16_t linkControl2;
+	uint16_t linkStatus2;
+	uint32_t slotCap2;
+	uint16_t slotControl2;
+	uint16_t slotStatus2;
+};
+
+struct pci_msix_cap {
+	uint8_t cap_idd;
+	uint8_t next_cap;
+	uint16_t msg_control_reg;
+	uint32_t msix_table_offset;
+	uint32_t pba_offset;
+};
+
+struct pci_header {
+	union {
+		struct pci_header_common common;
+		struct pci_header_zero zero;
+		struct pci_header_one one;
+	};
+};
+
+#endif /* VMD_SPEC_H */
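The pci_header union at the end of the file overlays the common, type 0, and type 1 layouts on the same 256-byte config space. A usage sketch, not part of the diff; the helper name is hypothetical and it assumes vmd_spec.h is included:

	#include <stdint.h>

	/* Illustrative only: the low 7 bits of header_type select the layout
	 * (0 = type 0 endpoint, 1 = type 1 bridge); bit 7 is the multi-function
	 * flag. header_type sits at the same offset in every layout, so it can
	 * be read through any member of the union. */
	static const char *
	example_header_kind(const struct pci_header *hdr)
	{
		switch (hdr->one.header_type & 0x7f) {
		case 0:
			return "type 0 (endpoint)";
		case 1:
			return "type 1 (PCI-to-PCI bridge)";
		default:
			return "other";
		}
	}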