diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000 |
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch) | |
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/examples/nvme | |
parent | Initial commit. (diff) | |
download | ceph-upstream.tar.xz ceph-upstream.zip |
Adding upstream version 16.2.11+ds.upstream/16.2.11+dsupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/examples/nvme')
36 files changed, 12615 insertions, 0 deletions
diff --git a/src/spdk/examples/nvme/Makefile b/src/spdk/examples/nvme/Makefile new file mode 100644 index 000000000..14eeb9be7 --- /dev/null +++ b/src/spdk/examples/nvme/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +DIRS-y += hello_world identify perf reconnect nvme_manage arbitration \ + hotplug cmb_copy abort + +DIRS-$(CONFIG_FIO_PLUGIN) += fio_plugin + +.PHONY: all clean $(DIRS-y) + +all: $(DIRS-y) +clean: $(DIRS-y) + +include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk diff --git a/src/spdk/examples/nvme/abort/.gitignore b/src/spdk/examples/nvme/abort/.gitignore new file mode 100644 index 000000000..f7d13fd04 --- /dev/null +++ b/src/spdk/examples/nvme/abort/.gitignore @@ -0,0 +1 @@ +abort diff --git a/src/spdk/examples/nvme/abort/Makefile b/src/spdk/examples/nvme/abort/Makefile new file mode 100644 index 000000000..5073a842d --- /dev/null +++ b/src/spdk/examples/nvme/abort/Makefile @@ -0,0 +1,38 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) + +APP = abort + +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk diff --git a/src/spdk/examples/nvme/abort/abort.c b/src/spdk/examples/nvme/abort/abort.c new file mode 100644 index 000000000..728790513 --- /dev/null +++ b/src/spdk/examples/nvme/abort/abort.c @@ -0,0 +1,1144 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/nvme.h" +#include "spdk/queue.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/likely.h" + +struct ctrlr_entry { + struct spdk_nvme_ctrlr *ctrlr; + enum spdk_nvme_transport_type trtype; + + struct ctrlr_entry *next; + char name[1024]; +}; + +struct ns_entry { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ns *ns; + + struct ns_entry *next; + uint32_t io_size_blocks; + uint32_t num_io_requests; + uint64_t size_in_ios; + uint32_t block_size; + char name[1024]; +}; + +struct ctrlr_worker_ctx { + pthread_mutex_t mutex; + struct ctrlr_entry *entry; + uint64_t abort_submitted; + uint64_t abort_submit_failed; + uint64_t successful_abort; + uint64_t unsuccessful_abort; + uint64_t abort_failed; + uint64_t current_queue_depth; + struct spdk_nvme_ctrlr *ctrlr; + struct ctrlr_worker_ctx *next; +}; + +struct ns_worker_ctx { + struct ns_entry *entry; + uint64_t io_submitted; + uint64_t io_completed; + uint64_t io_aborted; + uint64_t io_failed; + uint64_t current_queue_depth; + uint64_t offset_in_ios; + bool is_draining; + struct spdk_nvme_qpair *qpair; + struct ctrlr_worker_ctx *ctrlr_ctx; + struct ns_worker_ctx *next; +}; + +struct perf_task { + struct ns_worker_ctx *ns_ctx; + void *buf; +}; + +struct worker_thread { + struct ns_worker_ctx *ns_ctx; + struct ctrlr_worker_ctx *ctrlr_ctx; + struct worker_thread *next; + unsigned 
lcore; +}; + +static const char *g_workload_type = "read"; +static struct ctrlr_entry *g_controllers; +static struct ns_entry *g_namespaces; +static int g_num_namespaces; +static struct worker_thread *g_workers; +static int g_num_workers; +static uint32_t g_master_core; + +static int g_abort_interval = 1; + +static uint64_t g_tsc_rate; + +static uint32_t g_io_size_bytes = 131072; +static uint32_t g_max_io_size_blocks; +static int g_rw_percentage = -1; +static int g_is_random; +static int g_queue_depth = 128; +static int g_time_in_sec = 3; +static int g_dpdk_mem; +static int g_shm_id = -1; +static bool g_no_pci; +static bool g_warn; +static bool g_mix_specified; + +static const char *g_core_mask; + +struct trid_entry { + struct spdk_nvme_transport_id trid; + uint16_t nsid; + TAILQ_ENTRY(trid_entry) tailq; +}; + +static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list); + +static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl); + +static int +build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport_id *trid; + int res = 0; + + trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); + + switch (trid->trtype) { + case SPDK_NVME_TRANSPORT_PCIE: + res = snprintf(name, length, "PCIE (%s)", trid->traddr); + break; + case SPDK_NVME_TRANSPORT_RDMA: + res = snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + break; + case SPDK_NVME_TRANSPORT_TCP: + res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + break; + + default: + fprintf(stderr, "Unknown transport type %d\n", trid->trtype); + break; + } + return res; +} + +static void +build_nvme_ns_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + int res = 0; + + res = build_nvme_name(name, length, ctrlr); + if (res > 0) { + snprintf(name + res, length - res, " NSID %u", nsid); + } + +} + +static void +register_ns(struct spdk_nvme_ctrlr *ctrlr, struct 
spdk_nvme_ns *ns) +{ + struct ns_entry *entry; + const struct spdk_nvme_ctrlr_data *cdata; + uint32_t max_xfer_size, entries, sector_size; + uint64_t ns_size; + struct spdk_nvme_io_qpair_opts opts; + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + if (!spdk_nvme_ns_is_active(ns)) { + printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", + cdata->mn, cdata->sn, + spdk_nvme_ns_get_id(ns)); + g_warn = true; + return; + } + + ns_size = spdk_nvme_ns_get_size(ns); + sector_size = spdk_nvme_ns_get_sector_size(ns); + + if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) { + printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " + "ns size %" PRIu64 " / block size %u for I/O size %u\n", + cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), + ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes); + g_warn = true; + return; + } + + max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); + spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); + /* NVMe driver may add additional entries based on + * stripe size and maximum transfer size, we assume + * 1 more entry be used for stripe. + */ + entries = (g_io_size_bytes - 1) / max_xfer_size + 2; + if ((g_queue_depth * entries) > opts.io_queue_size) { + printf("controller IO queue size %u less than required\n", + opts.io_queue_size); + printf("Consider using lower queue depth or small IO size because " + "IO requests may be queued at the NVMe driver.\n"); + } + /* For requests which have children requests, parent request itself + * will also occupy 1 entry. 
+ */ + entries += 1; + + entry = calloc(1, sizeof(struct ns_entry)); + if (entry == NULL) { + perror("ns_entry malloc"); + exit(1); + } + + entry->ctrlr = ctrlr; + entry->ns = ns; + entry->num_io_requests = g_queue_depth * entries; + + entry->size_in_ios = ns_size / g_io_size_bytes; + entry->io_size_blocks = g_io_size_bytes / sector_size; + + entry->block_size = spdk_nvme_ns_get_sector_size(ns); + + if (g_max_io_size_blocks < entry->io_size_blocks) { + g_max_io_size_blocks = entry->io_size_blocks; + } + + build_nvme_ns_name(entry->name, sizeof(entry->name), ctrlr, spdk_nvme_ns_get_id(ns)); + + g_num_namespaces++; + entry->next = g_namespaces; + g_namespaces = entry; +} + +static void +unregister_namespaces(void) +{ + struct ns_entry *entry = g_namespaces; + + while (entry) { + struct ns_entry *next = entry->next; + free(entry); + entry = next; + } +} + +static void +register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry) +{ + struct spdk_nvme_ns *ns; + struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry)); + uint32_t nsid; + + if (entry == NULL) { + perror("ctrlr_entry malloc"); + exit(1); + } + + build_nvme_name(entry->name, sizeof(entry->name), ctrlr); + + entry->ctrlr = ctrlr; + entry->trtype = trid_entry->trid.trtype; + entry->next = g_controllers; + g_controllers = entry; + + if (trid_entry->nsid == 0) { + for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + continue; + } + register_ns(ctrlr, ns); + } + } else { + ns = spdk_nvme_ctrlr_get_ns(ctrlr, trid_entry->nsid); + if (!ns) { + perror("Namespace does not exist."); + exit(1); + } + + register_ns(ctrlr, ns); + } +} + +static void +abort_complete(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct ctrlr_worker_ctx *ctrlr_ctx = ctx; + + ctrlr_ctx->current_queue_depth--; + if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { + 
ctrlr_ctx->abort_failed++; + } else if ((cpl->cdw0 & 0x1) == 0) { + ctrlr_ctx->successful_abort++; + } else { + ctrlr_ctx->unsuccessful_abort++; + } +} + +static void +abort_task(struct perf_task *task) +{ + struct ns_worker_ctx *ns_ctx = task->ns_ctx; + struct ctrlr_worker_ctx *ctrlr_ctx = ns_ctx->ctrlr_ctx; + int rc; + + /* Hold mutex to guard ctrlr_ctx->current_queue_depth. */ + pthread_mutex_lock(&ctrlr_ctx->mutex); + + rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ctx->ctrlr, ns_ctx->qpair, task, abort_complete, + ctrlr_ctx); + + if (spdk_unlikely(rc != 0)) { + ctrlr_ctx->abort_submit_failed++; + } else { + ctrlr_ctx->current_queue_depth++; + ctrlr_ctx->abort_submitted++; + } + + pthread_mutex_unlock(&ctrlr_ctx->mutex); +} + +static __thread unsigned int seed = 0; + +static inline void +submit_single_io(struct perf_task *task) +{ + uint64_t offset_in_ios, lba; + int rc; + struct ns_worker_ctx *ns_ctx = task->ns_ctx; + struct ns_entry *entry = ns_ctx->entry; + + if (g_is_random) { + offset_in_ios = rand_r(&seed) % entry->size_in_ios; + } else { + offset_in_ios = ns_ctx->offset_in_ios++; + if (ns_ctx->offset_in_ios == entry->size_in_ios) { + ns_ctx->offset_in_ios = 0; + } + } + + lba = offset_in_ios * entry->io_size_blocks; + + if ((g_rw_percentage == 100) || + (g_rw_percentage != 0 && (rand_r(&seed) % 100) < g_rw_percentage)) { + rc = spdk_nvme_ns_cmd_read(entry->ns, ns_ctx->qpair, task->buf, + lba, entry->io_size_blocks, io_complete, task, 0); + } else { + rc = spdk_nvme_ns_cmd_write(entry->ns, ns_ctx->qpair, task->buf, + lba, entry->io_size_blocks, io_complete, task, 0); + } + + if (spdk_unlikely(rc != 0)) { + fprintf(stderr, "I/O submission failed\n"); + } else { + ns_ctx->current_queue_depth++; + ns_ctx->io_submitted++; + + if ((ns_ctx->io_submitted % g_abort_interval) == 0) { + abort_task(task); + } + } + +} + +static void +io_complete(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct perf_task *task = ctx; + struct ns_worker_ctx *ns_ctx = task->ns_ctx; + 
+ ns_ctx->current_queue_depth--; + if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { + ns_ctx->io_failed++; + } else { + ns_ctx->io_completed++; + } + + /* is_draining indicates when time has expired for the test run and we are + * just waiting for the previously submitted I/O to complete. In this case, + * do not submit a new I/O to replace the one just completed. + */ + if (spdk_unlikely(ns_ctx->is_draining)) { + spdk_dma_free(task->buf); + free(task); + } else { + submit_single_io(task); + } +} + +static struct perf_task * +allocate_task(struct ns_worker_ctx *ns_ctx) +{ + struct perf_task *task; + + task = calloc(1, sizeof(*task)); + if (task == NULL) { + fprintf(stderr, "Failed to allocate task\n"); + exit(1); + } + + task->buf = spdk_dma_zmalloc(g_io_size_bytes, 0x200, NULL); + if (task->buf == NULL) { + free(task); + fprintf(stderr, "Failed to allocate task->buf\n"); + exit(1); + } + + task->ns_ctx = ns_ctx; + + return task; +} + +static void +submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth) +{ + struct perf_task *task; + + while (queue_depth-- > 0) { + task = allocate_task(ns_ctx); + submit_single_io(task); + } +} + +static int +work_fn(void *arg) +{ + struct worker_thread *worker = (struct worker_thread *)arg; + struct ns_worker_ctx *ns_ctx; + struct ctrlr_worker_ctx *ctrlr_ctx; + struct ns_entry *ns_entry; + struct spdk_nvme_io_qpair_opts opts; + uint64_t tsc_end; + uint32_t unfinished_ctx; + + /* Allocate queue pair for each namespace. 
*/ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + ns_entry = ns_ctx->entry; + + spdk_nvme_ctrlr_get_default_io_qpair_opts(ns_entry->ctrlr, &opts, sizeof(opts)); + if (opts.io_queue_requests < ns_entry->num_io_requests) { + opts.io_queue_requests = ns_entry->num_io_requests; + } + + ns_ctx->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ns_entry->ctrlr, &opts, sizeof(opts)); + if (ns_ctx->qpair == NULL) { + fprintf(stderr, "spdk_nvme_ctrlr_alloc_io_qpair failed\n"); + return 1; + } + + ns_ctx = ns_ctx->next; + } + + tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate; + + /* Submit initial I/O for each namespace. */ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + submit_io(ns_ctx, g_queue_depth); + ns_ctx = ns_ctx->next; + } + + while (1) { + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + spdk_nvme_qpair_process_completions(ns_ctx->qpair, 0); + ns_ctx = ns_ctx->next; + } + + if (worker->lcore == g_master_core) { + ctrlr_ctx = worker->ctrlr_ctx; + while (ctrlr_ctx) { + /* Hold mutex to guard ctrlr_ctx->current_queue_depth. 
*/ + pthread_mutex_lock(&ctrlr_ctx->mutex); + spdk_nvme_ctrlr_process_admin_completions(ctrlr_ctx->ctrlr); + pthread_mutex_unlock(&ctrlr_ctx->mutex); + ctrlr_ctx = ctrlr_ctx->next; + } + } + + if (spdk_get_ticks() > tsc_end) { + break; + } + } + + do { + unfinished_ctx = 0; + + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + if (!ns_ctx->is_draining) { + ns_ctx->is_draining = true; + } + if (ns_ctx->current_queue_depth > 0) { + spdk_nvme_qpair_process_completions(ns_ctx->qpair, 0); + if (ns_ctx->current_queue_depth == 0) { + spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair); + } else { + unfinished_ctx++; + } + } + ns_ctx = ns_ctx->next; + } + } while (unfinished_ctx > 0); + + if (worker->lcore == g_master_core) { + do { + unfinished_ctx = 0; + + ctrlr_ctx = worker->ctrlr_ctx; + while (ctrlr_ctx != NULL) { + pthread_mutex_lock(&ctrlr_ctx->mutex); + if (ctrlr_ctx->current_queue_depth > 0) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr_ctx->ctrlr); + if (ctrlr_ctx->current_queue_depth > 0) { + unfinished_ctx++; + } + } + pthread_mutex_unlock(&ctrlr_ctx->mutex); + ctrlr_ctx = ctrlr_ctx->next; + } + } while (unfinished_ctx > 0); + } + + return 0; +} + +static void +usage(char *program_name) +{ + printf("%s options", program_name); + + printf("\n"); + printf("\t[-q io depth]\n"); + printf("\t[-o io size in bytes]\n"); + printf("\t[-w io pattern type, must be one of\n"); + printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n"); + printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n"); + printf("\t[-t time in seconds]\n"); + printf("\t[-c core mask for I/O submission/completion.]\n"); + printf("\t\t(default: 1)\n"); + printf("\t[-r Transport ID for local PCIe NVMe or NVMeoF]\n"); + printf("\t Format: 'key:value [key:value] ...'\n"); + printf("\t Keys:\n"); + printf("\t trtype Transport type (e.g. PCIe, RDMA)\n"); + printf("\t adrfam Address family (e.g. IPv4, IPv6)\n"); + printf("\t traddr Transport address (e.g. 
0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n"); + printf("\t trsvcid Transport service identifier (e.g. 4420)\n"); + printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN); + printf("\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n"); + printf("\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n"); + printf("\t[-s DPDK huge memory size in MB.]\n"); + printf("\t[-i shared memory group ID]\n"); + printf("\t[-a abort interval.]\n"); + printf("\t"); + spdk_log_usage(stdout, "-T"); +#ifdef DEBUG + printf("\t[-G enable debug logging]\n"); +#else + printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)\n"); +#endif +} + +static void +unregister_trids(void) +{ + struct trid_entry *trid_entry, *tmp; + + TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) { + TAILQ_REMOVE(&g_trid_list, trid_entry, tailq); + free(trid_entry); + } +} + +static int +add_trid(const char *trid_str) +{ + struct trid_entry *trid_entry; + struct spdk_nvme_transport_id *trid; + char *ns; + + trid_entry = calloc(1, sizeof(*trid_entry)); + if (trid_entry == NULL) { + return -1; + } + + trid = &trid_entry->trid; + trid->trtype = SPDK_NVME_TRANSPORT_PCIE; + snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); + + if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) { + fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str); + free(trid_entry); + return 1; + } + + spdk_nvme_transport_id_populate_trstring(trid, + spdk_nvme_transport_id_trtype_str(trid->trtype)); + + ns = strcasestr(trid_str, "ns:"); + if (ns) { + char nsid_str[6]; /* 5 digits maximum in an nsid */ + int len; + int nsid; + + ns += 3; + + len = strcspn(ns, " \t\n"); + if (len > 5) { + fprintf(stderr, "NVMe namespace IDs must be 5 digits or less\n"); + free(trid_entry); + return 1; + } + + memcpy(nsid_str, ns, len); + nsid_str[len] = '\0'; + + nsid = spdk_strtol(nsid_str, 10); + if (nsid <= 0 || nsid > 
65535) { + fprintf(stderr, "NVMe namespace IDs must be less than 65536 and greater than 0\n"); + free(trid_entry); + return 1; + } + + trid_entry->nsid = (uint16_t)nsid; + } + + TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq); + return 0; +} + +static int +parse_args(int argc, char **argv) +{ + int op; + long int val; + int rc; + + while ((op = getopt(argc, argv, "a:c:i:o:q:r:s:t:w:M:")) != -1) { + switch (op) { + case 'a': + case 'i': + case 'o': + case 'q': + case 's': + case 't': + case 'M': + val = spdk_strtol(optarg, 10); + if (val < 0) { + fprintf(stderr, "Converting a string to integer failed\n"); + return val; + } + switch (op) { + case 'a': + g_abort_interval = val; + break; + case 'i': + g_shm_id = val; + break; + case 'o': + g_io_size_bytes = val; + break; + case 'q': + g_queue_depth = val; + break; + case 's': + g_dpdk_mem = val; + break; + case 't': + g_time_in_sec = val; + break; + case 'M': + g_rw_percentage = val; + g_mix_specified = true; + break; + } + break; + case 'c': + g_core_mask = optarg; + break; + case 'r': + if (add_trid(optarg)) { + usage(argv[0]); + return 1; + } + break; + case 'w': + g_workload_type = optarg; + break; + case 'G': +#ifndef DEBUG + fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n", + argv[0]); + usage(argv[0]); + return 1; +#else + spdk_log_set_flag("nvme"); + spdk_log_set_print_level(SPDK_LOG_DEBUG); + break; +#endif + case 'T': + rc = spdk_log_set_flag(optarg); + if (rc < 0) { + fprintf(stderr, "unknown flag\n"); + usage(argv[0]); + exit(EXIT_FAILURE); + } + spdk_log_set_print_level(SPDK_LOG_DEBUG); +#ifndef DEBUG + fprintf(stderr, "%s must be rebuilt with CONFIG_DEBUG=y for -T flag.\n", + argv[0]); + usage(argv[0]); + return 0; +#endif + break; + default: + usage(argv[0]); + return 1; + } + } + + if (!g_queue_depth) { + fprintf(stderr, "missing -q (queue size) operand\n"); + usage(argv[0]); + return 1; + } + if (!g_io_size_bytes) { + fprintf(stderr, "missing -o (block size) operand\n"); + 
usage(argv[0]); + return 1; + } + if (!g_workload_type) { + fprintf(stderr, "missing -t (test time in seconds) operand\n"); + usage(argv[0]); + return 1; + } + + if (!g_time_in_sec) { + usage(argv[0]); + return 1; + } + + if (strncmp(g_workload_type, "rand", 4) == 0) { + g_is_random = 1; + g_workload_type = &g_workload_type[4]; + } + + if (strcmp(g_workload_type, "read") == 0 || strcmp(g_workload_type, "write") == 0) { + g_rw_percentage = strcmp(g_workload_type, "read") == 0 ? 100 : 0; + if (g_mix_specified) { + fprintf(stderr, "Ignoring -M option... Please use -M option" + " only when using rw or randrw.\n"); + } + } else if (strcmp(g_workload_type, "rw") == 0) { + if (g_rw_percentage < 0 || g_rw_percentage > 100) { + fprintf(stderr, + "-M must be specified to value from 0 to 100 " + "for rw or randrw.\n"); + return 1; + } + } else { + fprintf(stderr, + "io pattern type must be one of\n" + "(read, write, randread, randwrite, rw, randrw)\n"); + return 1; + } + + if (TAILQ_EMPTY(&g_trid_list)) { + /* If no transport IDs specified, default to enumerating all local PCIe devices */ + add_trid("trtype:PCIe"); + } else { + struct trid_entry *trid_entry, *trid_entry_tmp; + + g_no_pci = true; + /* check whether there is local PCIe type */ + TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) { + if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + g_no_pci = false; + break; + } + } + } + + return 0; +} + +static int +register_workers(void) +{ + uint32_t i; + struct worker_thread *worker; + + g_workers = NULL; + g_num_workers = 0; + + SPDK_ENV_FOREACH_CORE(i) { + worker = calloc(1, sizeof(*worker)); + if (worker == NULL) { + fprintf(stderr, "Unable to allocate worker\n"); + return -1; + } + + worker->lcore = i; + worker->next = g_workers; + g_workers = worker; + g_num_workers++; + } + + return 0; +} + +static void +unregister_workers(void) +{ + struct worker_thread *worker = g_workers; + + /* Free namespace context and worker thread */ + while 
(worker) { + struct worker_thread *next_worker = worker->next; + struct ns_worker_ctx *ns_ctx = worker->ns_ctx; + + while (ns_ctx) { + struct ns_worker_ctx *next_ns_ctx = ns_ctx->next; + + printf("NS: %s I/O completed: %lu, failed: %lu\n", + ns_ctx->entry->name, ns_ctx->io_completed, ns_ctx->io_failed); + free(ns_ctx); + ns_ctx = next_ns_ctx; + } + + struct ctrlr_worker_ctx *ctrlr_ctx = worker->ctrlr_ctx; + + while (ctrlr_ctx) { + struct ctrlr_worker_ctx *next_ctrlr_ctx = ctrlr_ctx->next; + + printf("CTRLR: %s abort submitted %lu, failed to submit %lu\n", + ctrlr_ctx->entry->name, ctrlr_ctx->abort_submitted, + ctrlr_ctx->abort_submit_failed); + printf("\t success %lu, unsuccess %lu, failed %lu\n", + ctrlr_ctx->successful_abort, ctrlr_ctx->unsuccessful_abort, + ctrlr_ctx->abort_failed); + free(ctrlr_ctx); + ctrlr_ctx = next_ctrlr_ctx; + } + + free(worker); + worker = next_worker; + } +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + return true; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + struct trid_entry *trid_entry = cb_ctx; + struct spdk_pci_addr pci_addr; + struct spdk_pci_device *pci_dev; + struct spdk_pci_id pci_id; + + if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) { + printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n", + trid->traddr, trid->trsvcid, + trid->subnqn); + } else { + if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) { + return; + } + + pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr); + if (!pci_dev) { + return; + } + + pci_id = spdk_pci_device_get_id(pci_dev); + + printf("Attached to NVMe Controller at %s [%04x:%04x]\n", + trid->traddr, + pci_id.vendor_id, pci_id.device_id); + } + + register_ctrlr(ctrlr, trid_entry); +} + +static int +register_controllers(void) +{ + struct trid_entry *trid_entry; + + printf("Initializing NVMe 
Controllers\n"); + + TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) { + if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n", + trid_entry->trid.traddr); + return -1; + } + } + + return 0; +} + +static void +unregister_controllers(void) +{ + struct ctrlr_entry *entry = g_controllers; + + while (entry) { + struct ctrlr_entry *next = entry->next; + spdk_nvme_detach(entry->ctrlr); + free(entry); + entry = next; + } +} + +static int +associate_master_worker_with_ctrlr(void) +{ + struct ctrlr_entry *entry = g_controllers; + struct worker_thread *worker = g_workers; + struct ctrlr_worker_ctx *ctrlr_ctx; + + while (worker) { + if (worker->lcore == g_master_core) { + break; + } + worker = worker->next; + } + + if (!worker) { + return -1; + } + + while (entry) { + ctrlr_ctx = calloc(1, sizeof(struct ctrlr_worker_ctx)); + if (!ctrlr_ctx) { + return -1; + } + + pthread_mutex_init(&ctrlr_ctx->mutex, NULL); + ctrlr_ctx->entry = entry; + ctrlr_ctx->ctrlr = entry->ctrlr; + ctrlr_ctx->next = worker->ctrlr_ctx; + worker->ctrlr_ctx = ctrlr_ctx; + + entry = entry->next; + } + + return 0; +} + +static struct ctrlr_worker_ctx * +get_ctrlr_worker_ctx(struct spdk_nvme_ctrlr *ctrlr) +{ + struct worker_thread *worker = g_workers; + struct ctrlr_worker_ctx *ctrlr_ctx; + + while (worker != NULL) { + if (worker->lcore == g_master_core) { + break; + } + worker = worker->next; + } + + if (!worker) { + return NULL; + } + + ctrlr_ctx = worker->ctrlr_ctx; + + while (ctrlr_ctx != NULL) { + if (ctrlr_ctx->ctrlr == ctrlr) { + return ctrlr_ctx; + } + ctrlr_ctx = ctrlr_ctx->next; + } + + return NULL; +} + +static int +associate_workers_with_ns(void) +{ + struct ns_entry *entry = g_namespaces; + struct worker_thread *worker = g_workers; + struct ns_worker_ctx *ns_ctx; + int i, count; + + count = g_num_namespaces > g_num_workers ? 
g_num_namespaces : g_num_workers; + + for (i = 0; i < count; i++) { + if (entry == NULL) { + break; + } + + ns_ctx = calloc(1, sizeof(struct ns_worker_ctx)); + if (!ns_ctx) { + return -1; + } + + printf("Associating %s with lcore %d\n", entry->name, worker->lcore); + ns_ctx->entry = entry; + ns_ctx->ctrlr_ctx = get_ctrlr_worker_ctx(entry->ctrlr); + if (!ns_ctx->ctrlr_ctx) { + free(ns_ctx); + return -1; + } + + ns_ctx->next = worker->ns_ctx; + worker->ns_ctx = ns_ctx; + + worker = worker->next; + if (worker == NULL) { + worker = g_workers; + } + + entry = entry->next; + if (entry == NULL) { + entry = g_namespaces; + } + } + + return 0; +} + +int main(int argc, char **argv) +{ + int rc; + struct worker_thread *worker, *master_worker; + struct spdk_env_opts opts; + + rc = parse_args(argc, argv); + if (rc != 0) { + return rc; + } + + spdk_env_opts_init(&opts); + opts.name = "abort"; + opts.shm_id = g_shm_id; + if (g_core_mask) { + opts.core_mask = g_core_mask; + } + + if (g_dpdk_mem) { + opts.mem_size = g_dpdk_mem; + } + if (g_no_pci) { + opts.no_pci = g_no_pci; + } + if (spdk_env_init(&opts) < 0) { + fprintf(stderr, "Unable to initialize SPDK env\n"); + rc = -1; + goto cleanup; + } + + g_tsc_rate = spdk_get_ticks_hz(); + + if (register_workers() != 0) { + rc = -1; + goto cleanup; + } + + if (register_controllers() != 0) { + rc = -1; + goto cleanup; + } + + if (g_warn) { + printf("WARNING: Some requested NVMe devices were skipped\n"); + } + + if (g_num_namespaces == 0) { + fprintf(stderr, "No valid NVMe controllers found\n"); + goto cleanup; + } + + if (associate_master_worker_with_ctrlr() != 0) { + rc = -1; + goto cleanup; + } + + if (associate_workers_with_ns() != 0) { + rc = -1; + goto cleanup; + } + + printf("Initialization complete. 
Launching workers.\n"); + + /* Launch all of the slave workers */ + g_master_core = spdk_env_get_current_core(); + master_worker = NULL; + worker = g_workers; + while (worker != NULL) { + if (worker->lcore != g_master_core) { + spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker); + } else { + assert(master_worker == NULL); + master_worker = worker; + } + worker = worker->next; + } + + assert(master_worker != NULL); + rc = work_fn(master_worker); + + spdk_env_thread_wait_all(); + +cleanup: + unregister_trids(); + unregister_workers(); + unregister_namespaces(); + unregister_controllers(); + + if (rc != 0) { + fprintf(stderr, "%s: errors occured\n", argv[0]); + } + + return rc; +} diff --git a/src/spdk/examples/nvme/arbitration/.gitignore b/src/spdk/examples/nvme/arbitration/.gitignore new file mode 100644 index 000000000..f1d6e38dd --- /dev/null +++ b/src/spdk/examples/nvme/arbitration/.gitignore @@ -0,0 +1 @@ +arbitration diff --git a/src/spdk/examples/nvme/arbitration/Makefile b/src/spdk/examples/nvme/arbitration/Makefile new file mode 100644 index 000000000..71cff76e6 --- /dev/null +++ b/src/spdk/examples/nvme/arbitration/Makefile @@ -0,0 +1,38 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) + +APP = arbitration + +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk diff --git a/src/spdk/examples/nvme/arbitration/arbitration.c b/src/spdk/examples/nvme/arbitration/arbitration.c new file mode 100644 index 000000000..444076041 --- /dev/null +++ b/src/spdk/examples/nvme/arbitration/arbitration.c @@ -0,0 +1,1158 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/nvme_intel.h" + +struct ctrlr_entry { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_intel_rw_latency_page latency_page; + struct ctrlr_entry *next; + char name[1024]; +}; + +struct ns_entry { + struct { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ns *ns; + } nvme; + + struct ns_entry *next; + uint32_t io_size_blocks; + uint64_t size_in_ios; + char name[1024]; +}; + +struct ns_worker_ctx { + struct ns_entry *entry; + uint64_t io_completed; + uint64_t current_queue_depth; + uint64_t offset_in_ios; + bool is_draining; + struct spdk_nvme_qpair *qpair; + struct ns_worker_ctx *next; +}; + +struct arb_task { + struct ns_worker_ctx *ns_ctx; + void *buf; +}; + +struct worker_thread { + struct ns_worker_ctx *ns_ctx; + struct worker_thread *next; + unsigned lcore; + enum spdk_nvme_qprio qprio; +}; + +struct arb_context { + int shm_id; + int outstanding_commands; + int num_namespaces; + int num_workers; + int rw_percentage; + int is_random; + int queue_depth; + int time_in_sec; + int io_count; + uint8_t latency_tracking_enable; + uint8_t 
arbitration_mechanism; + uint8_t arbitration_config; + uint32_t io_size_bytes; + uint32_t max_completions; + uint64_t tsc_rate; + const char *core_mask; + const char *workload_type; +}; + +struct feature { + uint32_t result; + bool valid; +}; + +static struct spdk_mempool *task_pool = NULL; + +static struct ctrlr_entry *g_controllers = NULL; +static struct ns_entry *g_namespaces = NULL; +static struct worker_thread *g_workers = NULL; + +static struct feature features[SPDK_NVME_FEAT_ARBITRATION + 1] = {}; + +static struct arb_context g_arbitration = { + .shm_id = -1, + .outstanding_commands = 0, + .num_workers = 0, + .num_namespaces = 0, + .rw_percentage = 50, + .queue_depth = 64, + .time_in_sec = 60, + .io_count = 100000, + .latency_tracking_enable = 0, + .arbitration_mechanism = SPDK_NVME_CC_AMS_RR, + .arbitration_config = 0, + .io_size_bytes = 131072, + .max_completions = 0, + /* Default 4 cores for urgent/high/medium/low */ + .core_mask = "0xf", + .workload_type = "randrw", +}; + +/* + * For weighted round robin arbitration mechanism, the smaller value between + * weight and burst will be picked to execute the commands in one queue. 
+ */ +#define USER_SPECIFIED_HIGH_PRIORITY_WEIGHT 32 +#define USER_SPECIFIED_MEDIUM_PRIORITY_WEIGHT 16 +#define USER_SPECIFIED_LOW_PRIORITY_WEIGHT 8 + +static void task_complete(struct arb_task *task); + +static void io_complete(void *ctx, const struct spdk_nvme_cpl *completion); + +static void get_arb_feature(struct spdk_nvme_ctrlr *ctrlr); + +static int set_arb_feature(struct spdk_nvme_ctrlr *ctrlr); + +static const char *print_qprio(enum spdk_nvme_qprio); + + +static void +register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) +{ + struct ns_entry *entry; + const struct spdk_nvme_ctrlr_data *cdata; + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + if (spdk_nvme_ns_get_size(ns) < g_arbitration.io_size_bytes || + spdk_nvme_ns_get_extended_sector_size(ns) > g_arbitration.io_size_bytes || + g_arbitration.io_size_bytes % spdk_nvme_ns_get_extended_sector_size(ns)) { + printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " + "ns size %" PRIu64 " / block size %u for I/O size %u\n", + cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), + spdk_nvme_ns_get_size(ns), spdk_nvme_ns_get_extended_sector_size(ns), + g_arbitration.io_size_bytes); + return; + } + + entry = malloc(sizeof(struct ns_entry)); + if (entry == NULL) { + perror("ns_entry malloc"); + exit(1); + } + + entry->nvme.ctrlr = ctrlr; + entry->nvme.ns = ns; + + entry->size_in_ios = spdk_nvme_ns_get_size(ns) / g_arbitration.io_size_bytes; + entry->io_size_blocks = g_arbitration.io_size_bytes / spdk_nvme_ns_get_sector_size(ns); + + snprintf(entry->name, 44, "%-20.20s (%-20.20s)", cdata->mn, cdata->sn); + + g_arbitration.num_namespaces++; + entry->next = g_namespaces; + g_namespaces = entry; +} + +static void +enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl) +{ + if (spdk_nvme_cpl_is_error(cpl)) { + printf("enable_latency_tracking_complete failed\n"); + } + g_arbitration.outstanding_commands--; +} + +static void +set_latency_tracking_feature(struct spdk_nvme_ctrlr 
*ctrlr, bool enable) +{ + int res; + union spdk_nvme_intel_feat_latency_tracking latency_tracking; + + if (enable) { + latency_tracking.bits.enable = 0x01; + } else { + latency_tracking.bits.enable = 0x00; + } + + res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING, + latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL); + if (res) { + printf("fail to allocate nvme request.\n"); + return; + } + g_arbitration.outstanding_commands++; + + while (g_arbitration.outstanding_commands) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr); + } +} + +static void +register_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t nsid; + struct spdk_nvme_ns *ns; + struct ctrlr_entry *entry = calloc(1, sizeof(struct ctrlr_entry)); + union spdk_nvme_cap_register cap = spdk_nvme_ctrlr_get_regs_cap(ctrlr); + const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + if (entry == NULL) { + perror("ctrlr_entry malloc"); + exit(1); + } + + snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn); + + entry->ctrlr = ctrlr; + entry->next = g_controllers; + g_controllers = entry; + + if ((g_arbitration.latency_tracking_enable != 0) && + spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) { + set_latency_tracking_feature(ctrlr, true); + } + + for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); nsid != 0; + nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + continue; + } + register_ns(ctrlr, ns); + } + + if (g_arbitration.arbitration_mechanism == SPDK_NVME_CAP_AMS_WRR && + (cap.bits.ams & SPDK_NVME_CAP_AMS_WRR)) { + get_arb_feature(ctrlr); + + if (g_arbitration.arbitration_config != 0) { + set_arb_feature(ctrlr); + get_arb_feature(ctrlr); + } + } +} + +static __thread unsigned int seed = 0; + +static void +submit_single_io(struct ns_worker_ctx *ns_ctx) +{ + struct arb_task *task = 
NULL; + uint64_t offset_in_ios; + int rc; + struct ns_entry *entry = ns_ctx->entry; + + task = spdk_mempool_get(task_pool); + if (!task) { + fprintf(stderr, "Failed to get task from task_pool\n"); + exit(1); + } + + task->buf = spdk_dma_zmalloc(g_arbitration.io_size_bytes, 0x200, NULL); + if (!task->buf) { + spdk_mempool_put(task_pool, task); + fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n"); + exit(1); + } + + task->ns_ctx = ns_ctx; + + if (g_arbitration.is_random) { + offset_in_ios = rand_r(&seed) % entry->size_in_ios; + } else { + offset_in_ios = ns_ctx->offset_in_ios++; + if (ns_ctx->offset_in_ios == entry->size_in_ios) { + ns_ctx->offset_in_ios = 0; + } + } + + if ((g_arbitration.rw_percentage == 100) || + (g_arbitration.rw_percentage != 0 && + ((rand_r(&seed) % 100) < g_arbitration.rw_percentage))) { + rc = spdk_nvme_ns_cmd_read(entry->nvme.ns, ns_ctx->qpair, task->buf, + offset_in_ios * entry->io_size_blocks, + entry->io_size_blocks, io_complete, task, 0); + } else { + rc = spdk_nvme_ns_cmd_write(entry->nvme.ns, ns_ctx->qpair, task->buf, + offset_in_ios * entry->io_size_blocks, + entry->io_size_blocks, io_complete, task, 0); + } + + if (rc != 0) { + fprintf(stderr, "starting I/O failed\n"); + } else { + ns_ctx->current_queue_depth++; + } +} + +static void +task_complete(struct arb_task *task) +{ + struct ns_worker_ctx *ns_ctx; + + ns_ctx = task->ns_ctx; + ns_ctx->current_queue_depth--; + ns_ctx->io_completed++; + + spdk_dma_free(task->buf); + spdk_mempool_put(task_pool, task); + + /* + * is_draining indicates when time has expired for the test run + * and we are just waiting for the previously submitted I/O + * to complete. In this case, do not submit a new I/O to replace + * the one just completed. 
+ */ + if (!ns_ctx->is_draining) { + submit_single_io(ns_ctx); + } +} + +static void +io_complete(void *ctx, const struct spdk_nvme_cpl *completion) +{ + task_complete((struct arb_task *)ctx); +} + +static void +check_io(struct ns_worker_ctx *ns_ctx) +{ + spdk_nvme_qpair_process_completions(ns_ctx->qpair, g_arbitration.max_completions); +} + +static void +submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth) +{ + while (queue_depth-- > 0) { + submit_single_io(ns_ctx); + } +} + +static void +drain_io(struct ns_worker_ctx *ns_ctx) +{ + ns_ctx->is_draining = true; + while (ns_ctx->current_queue_depth > 0) { + check_io(ns_ctx); + } +} + +static int +init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx, enum spdk_nvme_qprio qprio) +{ + struct spdk_nvme_ctrlr *ctrlr = ns_ctx->entry->nvme.ctrlr; + struct spdk_nvme_io_qpair_opts opts; + + spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); + opts.qprio = qprio; + + ns_ctx->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts)); + if (!ns_ctx->qpair) { + printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n"); + return 1; + } + + return 0; +} + +static void +cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair); +} + +static void +cleanup(uint32_t task_count) +{ + struct ns_entry *entry = g_namespaces; + struct ns_entry *next_entry = NULL; + struct worker_thread *worker = g_workers; + struct worker_thread *next_worker = NULL; + + while (entry) { + next_entry = entry->next; + free(entry); + entry = next_entry; + }; + + while (worker) { + struct ns_worker_ctx *ns_ctx = worker->ns_ctx; + + /* ns_worker_ctx is a list in the worker */ + while (ns_ctx) { + struct ns_worker_ctx *next_ns_ctx = ns_ctx->next; + free(ns_ctx); + ns_ctx = next_ns_ctx; + } + + next_worker = worker->next; + free(worker); + worker = next_worker; + }; + + if (spdk_mempool_count(task_pool) != (size_t)task_count) { + fprintf(stderr, "task_pool count is %zu but should be %u\n", + 
spdk_mempool_count(task_pool), task_count); + } + spdk_mempool_free(task_pool); +} + +static int +work_fn(void *arg) +{ + uint64_t tsc_end; + struct worker_thread *worker = (struct worker_thread *)arg; + struct ns_worker_ctx *ns_ctx = NULL; + + printf("Starting thread on core %u with %s\n", worker->lcore, print_qprio(worker->qprio)); + + /* Allocate a queue pair for each namespace. */ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + if (init_ns_worker_ctx(ns_ctx, worker->qprio) != 0) { + printf("ERROR: init_ns_worker_ctx() failed\n"); + return 1; + } + ns_ctx = ns_ctx->next; + } + + tsc_end = spdk_get_ticks() + g_arbitration.time_in_sec * g_arbitration.tsc_rate; + + /* Submit initial I/O for each namespace. */ + ns_ctx = worker->ns_ctx; + + while (ns_ctx != NULL) { + submit_io(ns_ctx, g_arbitration.queue_depth); + ns_ctx = ns_ctx->next; + } + + while (1) { + /* + * Check for completed I/O for each controller. A new + * I/O will be submitted in the io_complete callback + * to replace each I/O that is completed. 
+ */ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + check_io(ns_ctx); + ns_ctx = ns_ctx->next; + } + + if (spdk_get_ticks() > tsc_end) { + break; + } + } + + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + drain_io(ns_ctx); + cleanup_ns_worker_ctx(ns_ctx); + ns_ctx = ns_ctx->next; + } + + return 0; +} + +static void +usage(char *program_name) +{ + printf("%s options", program_name); + printf("\n"); + printf("\t[-q io depth]\n"); + printf("\t[-s io size in bytes]\n"); + printf("\t[-w io pattern type, must be one of\n"); + printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n"); + printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n"); + printf("\t[-l enable latency tracking, default: disabled]\n"); + printf("\t\t(0 - disabled; 1 - enabled)\n"); + printf("\t[-t time in seconds]\n"); + printf("\t[-c core mask for I/O submission/completion.]\n"); + printf("\t\t(default: 0xf - 4 cores)]\n"); + printf("\t[-m max completions per poll]\n"); + printf("\t\t(default: 0 - unlimited)\n"); + printf("\t[-a arbitration mechanism, must be one of below]\n"); + printf("\t\t(0, 1, 2)]\n"); + printf("\t\t(0: default round robin mechanism)]\n"); + printf("\t\t(1: weighted round robin mechanism)]\n"); + printf("\t\t(2: vendor specific mechanism)]\n"); + printf("\t[-b enable arbitration user configuration, default: disabled]\n"); + printf("\t\t(0 - disabled; 1 - enabled)\n"); + printf("\t[-n subjected IOs for performance comparison]\n"); + printf("\t[-i shared memory group ID]\n"); +} + +static const char * +print_qprio(enum spdk_nvme_qprio qprio) +{ + switch (qprio) { + case SPDK_NVME_QPRIO_URGENT: + return "urgent priority queue"; + case SPDK_NVME_QPRIO_HIGH: + return "high priority queue"; + case SPDK_NVME_QPRIO_MEDIUM: + return "medium priority queue"; + case SPDK_NVME_QPRIO_LOW: + return "low priority queue"; + default: + return "invalid priority queue"; + } +} + + +static void +print_configuration(char *program_name) +{ + printf("%s run with 
configuration:\n", program_name); + printf("%s -q %d -s %d -w %s -M %d -l %d -t %d -c %s -m %d -a %d -b %d -n %d -i %d\n", + program_name, + g_arbitration.queue_depth, + g_arbitration.io_size_bytes, + g_arbitration.workload_type, + g_arbitration.rw_percentage, + g_arbitration.latency_tracking_enable, + g_arbitration.time_in_sec, + g_arbitration.core_mask, + g_arbitration.max_completions, + g_arbitration.arbitration_mechanism, + g_arbitration.arbitration_config, + g_arbitration.io_count, + g_arbitration.shm_id); +} + + +static void +print_performance(void) +{ + float io_per_second, sent_all_io_in_secs; + struct worker_thread *worker; + struct ns_worker_ctx *ns_ctx; + + worker = g_workers; + while (worker) { + ns_ctx = worker->ns_ctx; + while (ns_ctx) { + io_per_second = (float)ns_ctx->io_completed / g_arbitration.time_in_sec; + sent_all_io_in_secs = g_arbitration.io_count / io_per_second; + printf("%-43.43s core %u: %8.2f IO/s %8.2f secs/%d ios\n", + ns_ctx->entry->name, worker->lcore, + io_per_second, sent_all_io_in_secs, g_arbitration.io_count); + ns_ctx = ns_ctx->next; + } + worker = worker->next; + } + printf("========================================================\n"); + + printf("\n"); +} + +static void +print_latency_page(struct ctrlr_entry *entry) +{ + int i; + + printf("\n"); + printf("%s\n", entry->name); + printf("--------------------------------------------------------\n"); + + for (i = 0; i < 32; i++) { + if (entry->latency_page.buckets_32us[i]) + printf("Bucket %dus - %dus: %d\n", i * 32, (i + 1) * 32, + entry->latency_page.buckets_32us[i]); + } + for (i = 0; i < 31; i++) { + if (entry->latency_page.buckets_1ms[i]) + printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, + entry->latency_page.buckets_1ms[i]); + } + for (i = 0; i < 31; i++) { + if (entry->latency_page.buckets_32ms[i]) + printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32, + entry->latency_page.buckets_32ms[i]); + } +} + +static void +print_latency_statistics(const char *op_name, 
enum spdk_nvme_intel_log_page log_page) +{ + struct ctrlr_entry *ctrlr; + + printf("%s Latency Statistics:\n", op_name); + printf("========================================================\n"); + ctrlr = g_controllers; + while (ctrlr) { + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) { + if (spdk_nvme_ctrlr_cmd_get_log_page( + ctrlr->ctrlr, log_page, + SPDK_NVME_GLOBAL_NS_TAG, + &ctrlr->latency_page, + sizeof(struct spdk_nvme_intel_rw_latency_page), + 0, + enable_latency_tracking_complete, + NULL)) { + printf("nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + + g_arbitration.outstanding_commands++; + } else { + printf("Controller %s: %s latency statistics not supported\n", + ctrlr->name, op_name); + } + ctrlr = ctrlr->next; + } + + while (g_arbitration.outstanding_commands) { + ctrlr = g_controllers; + while (ctrlr) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr); + ctrlr = ctrlr->next; + } + } + + ctrlr = g_controllers; + while (ctrlr) { + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) { + print_latency_page(ctrlr); + } + ctrlr = ctrlr->next; + } + printf("\n"); +} + +static void +print_stats(void) +{ + print_performance(); + if (g_arbitration.latency_tracking_enable) { + if (g_arbitration.rw_percentage != 0) { + print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY); + } + if (g_arbitration.rw_percentage != 100) { + print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY); + } + } +} + +static int +parse_args(int argc, char **argv) +{ + const char *workload_type = NULL; + int op = 0; + bool mix_specified = false; + long int val; + + while ((op = getopt(argc, argv, "c:l:i:m:q:s:t:w:M:a:b:n:h")) != -1) { + switch (op) { + case 'c': + g_arbitration.core_mask = optarg; + break; + case 'w': + g_arbitration.workload_type = optarg; + break; + case 'h': + case '?': + usage(argv[0]); + return 1; + default: + val = spdk_strtol(optarg, 10); + if (val < 0) { + fprintf(stderr, 
"Converting a string to integer failed\n"); + return val; + } + switch (op) { + case 'i': + g_arbitration.shm_id = val; + break; + case 'l': + g_arbitration.latency_tracking_enable = val; + break; + case 'm': + g_arbitration.max_completions = val; + break; + case 'q': + g_arbitration.queue_depth = val; + break; + case 's': + g_arbitration.io_size_bytes = val; + break; + case 't': + g_arbitration.time_in_sec = val; + break; + case 'M': + g_arbitration.rw_percentage = val; + mix_specified = true; + break; + case 'a': + g_arbitration.arbitration_mechanism = val; + break; + case 'b': + g_arbitration.arbitration_config = val; + break; + case 'n': + g_arbitration.io_count = val; + break; + default: + usage(argv[0]); + return -EINVAL; + } + } + } + + workload_type = g_arbitration.workload_type; + + if (strcmp(workload_type, "read") && + strcmp(workload_type, "write") && + strcmp(workload_type, "randread") && + strcmp(workload_type, "randwrite") && + strcmp(workload_type, "rw") && + strcmp(workload_type, "randrw")) { + fprintf(stderr, + "io pattern type must be one of\n" + "(read, write, randread, randwrite, rw, randrw)\n"); + return 1; + } + + if (!strcmp(workload_type, "read") || + !strcmp(workload_type, "randread")) { + g_arbitration.rw_percentage = 100; + } + + if (!strcmp(workload_type, "write") || + !strcmp(workload_type, "randwrite")) { + g_arbitration.rw_percentage = 0; + } + + if (!strcmp(workload_type, "read") || + !strcmp(workload_type, "randread") || + !strcmp(workload_type, "write") || + !strcmp(workload_type, "randwrite")) { + if (mix_specified) { + fprintf(stderr, "Ignoring -M option... 
Please use -M option" + " only when using rw or randrw.\n"); + } + } + + if (!strcmp(workload_type, "rw") || + !strcmp(workload_type, "randrw")) { + if (g_arbitration.rw_percentage < 0 || g_arbitration.rw_percentage > 100) { + fprintf(stderr, + "-M must be specified to value from 0 to 100 " + "for rw or randrw.\n"); + return 1; + } + } + + if (!strcmp(workload_type, "read") || + !strcmp(workload_type, "write") || + !strcmp(workload_type, "rw")) { + g_arbitration.is_random = 0; + } else { + g_arbitration.is_random = 1; + } + + if (g_arbitration.latency_tracking_enable != 0 && + g_arbitration.latency_tracking_enable != 1) { + fprintf(stderr, + "-l must be specified to value 0 or 1.\n"); + return 1; + } + + switch (g_arbitration.arbitration_mechanism) { + case SPDK_NVME_CC_AMS_RR: + case SPDK_NVME_CC_AMS_WRR: + case SPDK_NVME_CC_AMS_VS: + break; + default: + fprintf(stderr, + "-a must be specified to value 0, 1, or 7.\n"); + return 1; + } + + if (g_arbitration.arbitration_config != 0 && + g_arbitration.arbitration_config != 1) { + fprintf(stderr, + "-b must be specified to value 0 or 1.\n"); + return 1; + } else if (g_arbitration.arbitration_config == 1 && + g_arbitration.arbitration_mechanism != SPDK_NVME_CC_AMS_WRR) { + fprintf(stderr, + "-a must be specified to 1 (WRR) together.\n"); + return 1; + } + + return 0; +} + +static int +register_workers(void) +{ + uint32_t i; + struct worker_thread *worker; + enum spdk_nvme_qprio qprio = SPDK_NVME_QPRIO_URGENT; + + g_workers = NULL; + g_arbitration.num_workers = 0; + + SPDK_ENV_FOREACH_CORE(i) { + worker = calloc(1, sizeof(*worker)); + if (worker == NULL) { + fprintf(stderr, "Unable to allocate worker\n"); + return -1; + } + + worker->lcore = i; + worker->next = g_workers; + g_workers = worker; + g_arbitration.num_workers++; + + if (g_arbitration.arbitration_mechanism == SPDK_NVME_CAP_AMS_WRR) { + qprio++; + } + + worker->qprio = qprio & SPDK_NVME_CREATE_IO_SQ_QPRIO_MASK; + } + + return 0; +} + +static bool 
+probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + /* Update with user specified arbitration configuration */ + opts->arb_mechanism = g_arbitration.arbitration_mechanism; + + printf("Attaching to %s\n", trid->traddr); + + return true; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + printf("Attached to %s\n", trid->traddr); + + /* Update with actual arbitration configuration in use */ + g_arbitration.arbitration_mechanism = opts->arb_mechanism; + + register_ctrlr(ctrlr); +} + +static int +register_controllers(void) +{ + printf("Initializing NVMe Controllers\n"); + + if (spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, NULL) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed\n"); + return 1; + } + + if (g_arbitration.num_namespaces == 0) { + fprintf(stderr, "No valid namespaces to continue IO testing\n"); + return 1; + } + + return 0; +} + +static void +unregister_controllers(void) +{ + struct ctrlr_entry *entry = g_controllers; + + while (entry) { + struct ctrlr_entry *next = entry->next; + if (g_arbitration.latency_tracking_enable && + spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) { + set_latency_tracking_feature(entry->ctrlr, false); + } + spdk_nvme_detach(entry->ctrlr); + free(entry); + entry = next; + } +} + +static int +associate_workers_with_ns(void) +{ + struct ns_entry *entry = g_namespaces; + struct worker_thread *worker = g_workers; + struct ns_worker_ctx *ns_ctx; + int i, count; + + count = g_arbitration.num_namespaces > g_arbitration.num_workers ? 
+ g_arbitration.num_namespaces : g_arbitration.num_workers; + + for (i = 0; i < count; i++) { + if (entry == NULL) { + break; + } + + ns_ctx = malloc(sizeof(struct ns_worker_ctx)); + if (!ns_ctx) { + return 1; + } + memset(ns_ctx, 0, sizeof(*ns_ctx)); + + printf("Associating %s with lcore %d\n", entry->name, worker->lcore); + ns_ctx->entry = entry; + ns_ctx->next = worker->ns_ctx; + worker->ns_ctx = ns_ctx; + + worker = worker->next; + if (worker == NULL) { + worker = g_workers; + } + + entry = entry->next; + if (entry == NULL) { + entry = g_namespaces; + } + + } + + return 0; +} + +static void +get_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl) +{ + struct feature *feature = cb_arg; + int fid = feature - features; + + if (spdk_nvme_cpl_is_error(cpl)) { + printf("get_feature(0x%02X) failed\n", fid); + } else { + feature->result = cpl->cdw0; + feature->valid = true; + } + + g_arbitration.outstanding_commands--; +} + +static int +get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t fid) +{ + struct spdk_nvme_cmd cmd = {}; + struct feature *feature = &features[fid]; + + feature->valid = false; + + cmd.opc = SPDK_NVME_OPC_GET_FEATURES; + cmd.cdw10_bits.get_features.fid = fid; + + return spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &cmd, NULL, 0, get_feature_completion, feature); +} + +static void +get_arb_feature(struct spdk_nvme_ctrlr *ctrlr) +{ + get_feature(ctrlr, SPDK_NVME_FEAT_ARBITRATION); + + g_arbitration.outstanding_commands++; + + while (g_arbitration.outstanding_commands) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr); + } + + if (features[SPDK_NVME_FEAT_ARBITRATION].valid) { + union spdk_nvme_cmd_cdw11 arb; + arb.feat_arbitration.raw = features[SPDK_NVME_FEAT_ARBITRATION].result; + + printf("Current Arbitration Configuration\n"); + printf("===========\n"); + printf("Arbitration Burst: "); + if (arb.feat_arbitration.bits.ab == SPDK_NVME_ARBITRATION_BURST_UNLIMITED) { + printf("no limit\n"); + } else { + printf("%u\n", 1u << 
arb.feat_arbitration.bits.ab); + } + + printf("Low Priority Weight: %u\n", arb.feat_arbitration.bits.lpw + 1); + printf("Medium Priority Weight: %u\n", arb.feat_arbitration.bits.mpw + 1); + printf("High Priority Weight: %u\n", arb.feat_arbitration.bits.hpw + 1); + printf("\n"); + } +} + +static void +set_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl) +{ + struct feature *feature = cb_arg; + int fid = feature - features; + + if (spdk_nvme_cpl_is_error(cpl)) { + printf("set_feature(0x%02X) failed\n", fid); + feature->valid = false; + } else { + printf("Set Arbitration Feature Successfully\n"); + } + + g_arbitration.outstanding_commands--; +} + +static int +set_arb_feature(struct spdk_nvme_ctrlr *ctrlr) +{ + int ret; + struct spdk_nvme_cmd cmd = {}; + + cmd.opc = SPDK_NVME_OPC_SET_FEATURES; + cmd.cdw10_bits.set_features.fid = SPDK_NVME_FEAT_ARBITRATION; + + g_arbitration.outstanding_commands = 0; + + if (features[SPDK_NVME_FEAT_ARBITRATION].valid) { + cmd.cdw11_bits.feat_arbitration.bits.ab = SPDK_NVME_ARBITRATION_BURST_UNLIMITED; + cmd.cdw11_bits.feat_arbitration.bits.lpw = USER_SPECIFIED_LOW_PRIORITY_WEIGHT; + cmd.cdw11_bits.feat_arbitration.bits.mpw = USER_SPECIFIED_MEDIUM_PRIORITY_WEIGHT; + cmd.cdw11_bits.feat_arbitration.bits.hpw = USER_SPECIFIED_HIGH_PRIORITY_WEIGHT; + } + + ret = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &cmd, NULL, 0, + set_feature_completion, &features[SPDK_NVME_FEAT_ARBITRATION]); + if (ret) { + printf("Set Arbitration Feature: Failed 0x%x\n", ret); + return 1; + } + + g_arbitration.outstanding_commands++; + + while (g_arbitration.outstanding_commands) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr); + } + + if (!features[SPDK_NVME_FEAT_ARBITRATION].valid) { + printf("Set Arbitration Feature failed and use default configuration\n"); + } + + return 0; +} + +int +main(int argc, char **argv) +{ + int rc; + struct worker_thread *worker, *master_worker; + unsigned master_core; + char task_pool_name[30]; + uint32_t task_count; + 
struct spdk_env_opts opts; + + rc = parse_args(argc, argv); + if (rc != 0) { + return rc; + } + + spdk_env_opts_init(&opts); + opts.name = "arb"; + opts.core_mask = g_arbitration.core_mask; + opts.shm_id = g_arbitration.shm_id; + if (spdk_env_init(&opts) < 0) { + return 1; + } + + g_arbitration.tsc_rate = spdk_get_ticks_hz(); + + if (register_workers() != 0) { + return 1; + } + + if (register_controllers() != 0) { + return 1; + } + + if (associate_workers_with_ns() != 0) { + return 1; + } + + snprintf(task_pool_name, sizeof(task_pool_name), "task_pool_%d", getpid()); + + /* + * The task_count will be dynamically calculated based on the + * number of attached active namespaces, queue depth and number + * of cores (workers) involved in the IO perations. + */ + task_count = g_arbitration.num_namespaces > g_arbitration.num_workers ? + g_arbitration.num_namespaces : g_arbitration.num_workers; + task_count *= g_arbitration.queue_depth; + + task_pool = spdk_mempool_create(task_pool_name, task_count, + sizeof(struct arb_task), 0, SPDK_ENV_SOCKET_ID_ANY); + if (task_pool == NULL) { + fprintf(stderr, "could not initialize task pool\n"); + return 1; + } + + print_configuration(argv[0]); + + printf("Initialization complete. 
Launching workers.\n"); + + /* Launch all of the slave workers */ + master_core = spdk_env_get_current_core(); + master_worker = NULL; + worker = g_workers; + while (worker != NULL) { + if (worker->lcore != master_core) { + spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker); + } else { + assert(master_worker == NULL); + master_worker = worker; + } + worker = worker->next; + } + + assert(master_worker != NULL); + rc = work_fn(master_worker); + + spdk_env_thread_wait_all(); + + print_stats(); + + unregister_controllers(); + + cleanup(task_count); + + if (rc != 0) { + fprintf(stderr, "%s: errors occured\n", argv[0]); + } + + return rc; +} diff --git a/src/spdk/examples/nvme/cmb_copy/.gitignore b/src/spdk/examples/nvme/cmb_copy/.gitignore new file mode 100644 index 000000000..fce738032 --- /dev/null +++ b/src/spdk/examples/nvme/cmb_copy/.gitignore @@ -0,0 +1 @@ +cmb_copy diff --git a/src/spdk/examples/nvme/cmb_copy/Makefile b/src/spdk/examples/nvme/cmb_copy/Makefile new file mode 100644 index 000000000..77a143abb --- /dev/null +++ b/src/spdk/examples/nvme/cmb_copy/Makefile @@ -0,0 +1,38 @@ +# +# BSD LICENSE +# +# Copyright (c) Eideticom Inc +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Eideticom Inc nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) + +APP = cmb_copy + +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk diff --git a/src/spdk/examples/nvme/cmb_copy/cmb_copy.c b/src/spdk/examples/nvme/cmb_copy/cmb_copy.c new file mode 100644 index 000000000..50eedcbba --- /dev/null +++ b/src/spdk/examples/nvme/cmb_copy/cmb_copy.c @@ -0,0 +1,412 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Eideticom Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Eideticom Inc, nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/nvme.h" +#include "spdk/string.h" + +#define CMB_COPY_DELIM "-" +#define CMB_COPY_READ 0 +#define CMB_COPY_WRITE 1 + +struct nvme_io { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_transport_id trid; + struct spdk_nvme_qpair *qpair; + struct spdk_nvme_ns *ns; + unsigned nsid; + unsigned slba; + unsigned nlbas; + uint32_t lba_size; + unsigned done; +}; + +struct cmb_t { + struct spdk_nvme_transport_id trid; + struct spdk_nvme_ctrlr *ctrlr; +}; + +struct config { + struct nvme_io read; + struct nvme_io write; + struct cmb_t cmb; + size_t copy_size; +}; + +static struct config g_config; + +/* Namespaces index from 1. 
Return 0 to invoke an error */ +static unsigned +get_nsid(const struct spdk_nvme_transport_id *trid) +{ + if (!strcmp(trid->traddr, g_config.read.trid.traddr)) { + return g_config.read.nsid; + } + if (!strcmp(trid->traddr, g_config.write.trid.traddr)) { + return g_config.write.nsid; + } + return 0; +} + +static int +get_rw(const struct spdk_nvme_transport_id *trid) +{ + if (!strcmp(trid->traddr, g_config.read.trid.traddr)) { + return CMB_COPY_READ; + } + if (!strcmp(trid->traddr, g_config.write.trid.traddr)) { + return CMB_COPY_WRITE; + } + return -1; +} + +static void +check_io(void *arg, const struct spdk_nvme_cpl *completion) +{ + int *rw = (unsigned *)arg; + + if (*rw == CMB_COPY_READ) { + g_config.read.done = 1; + } else { + g_config.write.done = 1; + } +} + +static int +cmb_copy(void) +{ + int rc = 0, rw; + void *buf; + size_t sz; + + /* Allocate QPs for the read and write controllers */ + g_config.read.qpair = spdk_nvme_ctrlr_alloc_io_qpair(g_config.read.ctrlr, NULL, 0); + g_config.write.qpair = spdk_nvme_ctrlr_alloc_io_qpair(g_config.write.ctrlr, NULL, 0); + if (g_config.read.qpair == NULL || g_config.read.qpair == NULL) { + printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair() failed\n"); + return -ENOMEM; + } + + /* Allocate a buffer from our CMB */ + buf = spdk_nvme_ctrlr_map_cmb(g_config.cmb.ctrlr, &sz); + if (buf == NULL || sz < g_config.copy_size) { + printf("ERROR: buffer allocation failed\n"); + printf("Are you sure %s has a valid CMB?\n", + g_config.cmb.trid.traddr); + return -ENOMEM; + } + + /* Clear the done flags */ + g_config.read.done = 0; + g_config.write.done = 0; + + rw = CMB_COPY_READ; + /* Do the read to the CMB IO buffer */ + rc = spdk_nvme_ns_cmd_read(g_config.read.ns, g_config.read.qpair, buf, + g_config.read.slba, g_config.read.nlbas, + check_io, &rw, 0); + if (rc != 0) { + fprintf(stderr, "starting read I/O failed\n"); + return -EIO; + } + while (!g_config.read.done) { + spdk_nvme_qpair_process_completions(g_config.read.qpair, 0); + } + + 
/* Do the write from the CMB IO buffer */ + rw = CMB_COPY_WRITE; + rc = spdk_nvme_ns_cmd_write(g_config.write.ns, g_config.write.qpair, buf, + g_config.write.slba, g_config.write.nlbas, + check_io, &rw, 0); + if (rc != 0) { + fprintf(stderr, "starting write I/O failed\n"); + return -EIO; + } + while (!g_config.write.done) { + spdk_nvme_qpair_process_completions(g_config.write.qpair, 0); + } + + /* Clear the done flags */ + g_config.read.done = 0; + g_config.write.done = 0; + + /* Free CMB buffer */ + spdk_nvme_ctrlr_unmap_cmb(g_config.cmb.ctrlr); + + /* Free the queues */ + spdk_nvme_ctrlr_free_io_qpair(g_config.read.qpair); + spdk_nvme_ctrlr_free_io_qpair(g_config.write.qpair); + + return rc; +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + /* We will only attach to the read or write controller */ + if (strcmp(trid->traddr, g_config.read.trid.traddr) && + strcmp(trid->traddr, g_config.write.trid.traddr)) { + printf("%s - not probed %s!\n", __func__, trid->traddr); + return 0; + } + + opts->use_cmb_sqs = false; + + printf("%s - probed %s!\n", __func__, trid->traddr); + return 1; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + struct spdk_nvme_ns *ns; + + ns = spdk_nvme_ctrlr_get_ns(ctrlr, get_nsid(trid)); + if (ns == NULL) { + fprintf(stderr, "Could not locate namespace %d on controller %s.\n", + get_nsid(trid), trid->traddr); + exit(-1); + } + if (get_rw(trid) == CMB_COPY_READ) { + g_config.read.ctrlr = ctrlr; + g_config.read.ns = ns; + g_config.read.lba_size = spdk_nvme_ns_get_sector_size(ns); + } else { + g_config.write.ctrlr = ctrlr; + g_config.write.ns = ns; + g_config.write.lba_size = spdk_nvme_ns_get_sector_size(ns); + } + printf("%s - attached %s!\n", __func__, trid->traddr); + + return; +} + +static void +usage(char *program_name) +{ + printf("%s options (all 
mandatory)", program_name); + printf("\n"); + printf("\t[-r NVMe read parameters]\n"); + printf("\t[-w NVMe write parameters]\n"); + printf("\t[-c CMB to use for data buffers]\n"); + printf("\n"); + printf("Read/Write params:\n"); + printf(" <pci id>-<namespace>-<start LBA>-<number of LBAs>\n"); +} + +static void +parse(char *in, struct nvme_io *io) +{ + char *tok = NULL; + long int val; + + tok = strtok(in, CMB_COPY_DELIM); + if (tok == NULL) { + goto err; + } + snprintf(&io->trid.traddr[0], SPDK_NVMF_TRADDR_MAX_LEN + 1, + "%s", tok); + + tok = strtok(NULL, CMB_COPY_DELIM); + if (tok == NULL) { + goto err; + } + val = spdk_strtol(tok, 10); + if (val < 0) { + goto err; + } + io->nsid = (unsigned)val; + + tok = strtok(NULL, CMB_COPY_DELIM); + if (tok == NULL) { + goto err; + } + val = spdk_strtol(tok, 10); + if (val < 0) { + goto err; + } + io->slba = (unsigned)val; + + tok = strtok(NULL, CMB_COPY_DELIM); + if (tok == NULL) { + goto err; + } + val = spdk_strtol(tok, 10); + if (val < 0) { + goto err; + } + io->nlbas = (unsigned)val; + + tok = strtok(NULL, CMB_COPY_DELIM); + if (tok != NULL) { + goto err; + } + return; + +err: + fprintf(stderr, "%s: error parsing %s\n", __func__, in); + exit(-1); + +} + +static int +parse_args(int argc, char **argv) +{ + int op; + unsigned read = 0, write = 0, cmb = 0; + + while ((op = getopt(argc, argv, "r:w:c:")) != -1) { + switch (op) { + case 'r': + parse(optarg, &g_config.read); + read = 1; + break; + case 'w': + parse(optarg, &g_config.write); + write = 1; + break; + case 'c': + snprintf(g_config.cmb.trid.traddr, SPDK_NVMF_TRADDR_MAX_LEN + 1, + "%s", optarg); + cmb = 1; + break; + default: + usage(argv[0]); + return 1; + } + } + + if ((!read || !write || !cmb)) { + usage(argv[0]); + return 1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + int rc = 0; + struct spdk_env_opts opts; + + /* + * Parse the input arguments. 
For now we use the following + * format list: + * + * <pci id>-<namespace>-<start LBA>-<number of LBAs> + * + */ + rc = parse_args(argc, argv); + if (rc) { + fprintf(stderr, "Error in parse_args(): %d\n", + rc); + return -1; + } + + /* + * SPDK relies on an abstraction around the local environment + * named env that handles memory allocation and PCI device operations. + * This library must be initialized first. + * + */ + spdk_env_opts_init(&opts); + opts.name = "cmb_copy"; + opts.shm_id = 0; + if (spdk_env_init(&opts) < 0) { + fprintf(stderr, "Unable to initialize SPDK env\n"); + return 1; + } + + /* + * CMBs only apply to PCIe attached NVMe controllers so we + * only probe the PCIe bus. This is the default when we pass + * in NULL for the first argument. + */ + + rc = spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, NULL); + if (rc) { + fprintf(stderr, "Error in spdk_nvme_probe(): %d\n", + rc); + return -1; + } + + /* + * For now enforce that the read and write controller are not + * the same. This avoids an internal only DMA. + */ + if (!strcmp(g_config.write.trid.traddr, g_config.read.trid.traddr)) { + fprintf(stderr, "Read and Write controllers must differ!\n"); + return -1; + } + + /* + * Perform a few sanity checks and set the buffer size for the + * CMB. + */ + if (g_config.read.nlbas * g_config.read.lba_size != + g_config.write.nlbas * g_config.write.lba_size) { + fprintf(stderr, "Read and write sizes do not match!\n"); + return -1; + } + g_config.copy_size = g_config.read.nlbas * g_config.read.lba_size; + + /* + * Get the ctrlr pointer for the CMB. For now we assume this + * is either the read or write NVMe controller though in + * theory that is not a necessary condition. 
+ */ + + if (!strcmp(g_config.cmb.trid.traddr, g_config.read.trid.traddr)) { + g_config.cmb.ctrlr = g_config.read.ctrlr; + } + if (!strcmp(g_config.cmb.trid.traddr, g_config.write.trid.traddr)) { + g_config.cmb.ctrlr = g_config.write.ctrlr; + } + + /* + * Call the cmb_copy() function which performs the CMB + * based copy or returns an error code if it fails. + */ + rc = cmb_copy(); + if (rc) { + fprintf(stderr, "Error in spdk_cmb_copy(): %d\n", + rc); + return -1; + } + + return rc; +} diff --git a/src/spdk/examples/nvme/fio_plugin/.gitignore b/src/spdk/examples/nvme/fio_plugin/.gitignore new file mode 100644 index 000000000..1b0b36ac4 --- /dev/null +++ b/src/spdk/examples/nvme/fio_plugin/.gitignore @@ -0,0 +1 @@ +fio_plugin diff --git a/src/spdk/examples/nvme/fio_plugin/Makefile b/src/spdk/examples/nvme/fio_plugin/Makefile new file mode 100644 index 000000000..1f71802df --- /dev/null +++ b/src/spdk/examples/nvme/fio_plugin/Makefile @@ -0,0 +1,51 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# Copyright (c) 2015-2016, Micron Technology, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk + +FIO_PLUGIN := spdk_nvme + +C_SRCS = fio_plugin.c + +# Unable to combine the FIO plugin and the VPP socket abstraction (license incompatibility) +SPDK_LIB_LIST = $(filter-out sock_vpp,$(SOCK_MODULES_LIST)) +SPDK_LIB_LIST += nvme thread util log sock vmd jsonrpc json rpc + +ifeq ($(CONFIG_RDMA),y) +SPDK_LIB_LIST += rdma +endif + +include $(SPDK_ROOT_DIR)/mk/spdk.fio.mk diff --git a/src/spdk/examples/nvme/fio_plugin/README.md b/src/spdk/examples/nvme/fio_plugin/README.md new file mode 100644 index 000000000..e7a8b7c01 --- /dev/null +++ b/src/spdk/examples/nvme/fio_plugin/README.md @@ -0,0 +1,107 @@ +# Compiling fio + +First, clone the fio source repository from https://github.com/axboe/fio + + git clone https://github.com/axboe/fio + +Then check out the latest fio version and compile the code: + + make + +# Compiling SPDK + +First, clone the SPDK source repository from https://github.com/spdk/spdk + + git clone https://github.com/spdk/spdk + git submodule update --init + +Then, run the SPDK configure script to enable fio (point it to the root of the 
fio repository): + + cd spdk + ./configure --with-fio=/path/to/fio/repo <other configuration options> + +Finally, build SPDK: + + make + +**Note to advanced users**: These steps assume you're using the DPDK submodule. If you are using your +own version of DPDK, the fio plugin requires that DPDK be compiled with -fPIC. You can compile DPDK +with -fPIC by modifying your DPDK configuration file and adding the line: + + EXTRA_CFLAGS=-fPIC + +# Usage + +To use the SPDK fio plugin with fio, specify the plugin binary using LD_PRELOAD when running +fio and set ioengine=spdk in the fio configuration file (see example_config.fio in the same +directory as this README). + + LD_PRELOAD=<path to spdk repo>/build/fio/spdk_nvme fio + +To select NVMe devices, you pass an SPDK Transport Identifier string as the filename. These are in the +form: + + filename=key=value [key=value] ... ns=value + +Specifically, for local PCIe NVMe devices it will look like this: + + filename=trtype=PCIe traddr=0000.04.00.0 ns=1 + +And remote devices accessed via NVMe over Fabrics will look like this: + + filename=trtype=RDMA adrfam=IPv4 traddr=192.168.100.8 trsvcid=4420 ns=1 + +**Note**: The specification of the PCIe address should not use the normal ':' +and instead only use '.'. This is a limitation in fio - it splits filenames on +':'. Also, the NVMe namespaces start at 1, not 0, and the namespace must be +specified at the end of the string. + +Currently the SPDK fio plugin is limited to the thread usage model, so fio jobs must also specify thread=1 +when using the SPDK fio plugin. + +fio also currently has a race condition on shutdown if dynamically loading the ioengine by specifying the +engine's full path via the ioengine parameter - LD_PRELOAD is recommended to avoid this race condition. + +When testing random workloads, it is recommended to set norandommap=1. 
fio's random map +processing consumes extra CPU cycles which will degrade performance over time with +the fio_plugin since all I/O are submitted and completed on a single CPU core. + +When testing FIO on multiple NVMe SSDs with SPDK plugin, it is recommended to use multiple jobs in the FIO configuration. +It has been observed that there is a performance gap between FIO (with SPDK plugin enabled) and SPDK perf +(examples/nvme/perf/perf) when testing multiple NVMe SSDs. If you use one job (i.e., use one CPU core) configured for +FIO test, the performance is worse than SPDK perf (also using one CPU core) against many NVMe SSDs. But if you use +multiple jobs for FIO test, the performance of FIO is similar to SPDK perf. After analyzing this phenomenon, we +think that it is caused by the FIO architecture. Mainly FIO can scale with multiple threads (i.e., using CPU cores), +but it is not good to use one thread against many I/O devices. + +# End-to-end Data Protection (Optional) + +To run with PI enabled, the following setup steps are required. +First, format the device namespace with the proper PI setting. For example: + + nvme format /dev/nvme0n1 -l 1 -i 1 -p 0 -m 1 + +In the fio configuration file, add PRACT and set PRCHK by flags (GUARD|REFTAG|APPTAG) properly. For example: + + pi_act=0 + pi_chk=GUARD + +Blocksize should be set as the sum of data and metadata. For example, if data blocksize is 512 Byte, host generated +PI metadata is 8 Byte, then blocksize in the fio configuration file should be 520 Byte: + + bs=520 + +The storage device may use a block format that requires separate metadata (DIX). In this scenario, the fio_plugin +will automatically allocate an extra 4KiB buffer per I/O to hold this metadata. For some cases, such as 512 byte +blocks with 32 metadata bytes per block and a 128KiB I/O size, 4KiB isn't large enough. In this case, the +`md_per_io_size` option may be specified to increase the size of the metadata buffer. 
+ +Expose two options 'apptag' and 'apptag_mask', users can change them in the configuration file when using +application tag and application tag mask in end-to-end data protection. Application tag and application +tag mask are set to 0x1234 and 0xFFFF by default. + +# VMD (Optional) + +To enable VMD enumeration add enable_vmd flag in fio configuration file: + + enable_vmd=1 diff --git a/src/spdk/examples/nvme/fio_plugin/example_config.fio b/src/spdk/examples/nvme/fio_plugin/example_config.fio new file mode 100644 index 000000000..a8e62ccb9 --- /dev/null +++ b/src/spdk/examples/nvme/fio_plugin/example_config.fio @@ -0,0 +1,15 @@ +[global] +ioengine=spdk +thread=1 +group_reporting=1 +direct=1 +verify=0 +time_based=1 +ramp_time=0 +runtime=2 +iodepth=128 +rw=randrw +bs=4k + +[test] +numjobs=1 diff --git a/src/spdk/examples/nvme/fio_plugin/fio_plugin.c b/src/spdk/examples/nvme/fio_plugin/fio_plugin.c new file mode 100644 index 000000000..7aabeb8cb --- /dev/null +++ b/src/spdk/examples/nvme/fio_plugin/fio_plugin.c @@ -0,0 +1,1267 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/vmd.h" +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/log.h" +#include "spdk/endian.h" +#include "spdk/dif.h" +#include "spdk/util.h" + +#include "config-host.h" +#include "fio.h" +#include "optgroup.h" + +/* FreeBSD is missing CLOCK_MONOTONIC_RAW, + * so alternative is provided. 
*/ +#ifndef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC +#endif + +#define NVME_IO_ALIGN 4096 + +static bool g_spdk_env_initialized; +static int g_spdk_enable_sgl = 0; +static uint32_t g_spdk_sge_size = 4096; +static uint32_t g_spdk_bit_bucket_data_len = 0; +static uint32_t g_spdk_pract_flag; +static uint32_t g_spdk_prchk_flags; +static uint32_t g_spdk_md_per_io_size = 4096; +static uint16_t g_spdk_apptag; +static uint16_t g_spdk_apptag_mask; + +struct spdk_fio_options { + void *pad; /* off1 used in option descriptions may not be 0 */ + int enable_wrr; + int arbitration_burst; + int low_weight; + int medium_weight; + int high_weight; + int wrr_priority; + int mem_size; + int shm_id; + int enable_sgl; + int sge_size; + int bit_bucket_data_len; + char *hostnqn; + int pi_act; + char *pi_chk; + int md_per_io_size; + int apptag; + int apptag_mask; + char *digest_enable; + int enable_vmd; +}; + +struct spdk_fio_request { + struct io_u *io; + /** Offset in current iovec, fio only uses 1 vector */ + uint32_t iov_offset; + + /** Amount of data used for Bit Bucket SGL */ + uint32_t bit_bucket_data_len; + + /** Context for NVMe PI */ + struct spdk_dif_ctx dif_ctx; + /** Separate metadata buffer pointer */ + void *md_buf; + + struct spdk_fio_thread *fio_thread; + struct spdk_fio_qpair *fio_qpair; +}; + +struct spdk_fio_ctrlr { + struct spdk_nvme_transport_id tr_id; + struct spdk_nvme_ctrlr_opts opts; + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_fio_ctrlr *next; +}; + +static struct spdk_fio_ctrlr *g_ctrlr; +static int g_td_count; +static pthread_t g_ctrlr_thread_id = 0; +static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER; +static bool g_error; + +struct spdk_fio_qpair { + struct fio_file *f; + struct spdk_nvme_qpair *qpair; + struct spdk_nvme_ns *ns; + uint32_t io_flags; + bool nvme_pi_enabled; + /* True for DIF and false for DIX, and this is valid only if nvme_pi_enabled is true. 
*/ + bool extended_lba; + /* True for protection info transferred at start of metadata, + * false for protection info transferred at end of metadata, and + * this is valid only if nvme_pi_enabled is true. + */ + bool md_start; + struct spdk_fio_qpair *next; + struct spdk_fio_ctrlr *fio_ctrlr; +}; + +struct spdk_fio_thread { + struct thread_data *td; + + struct spdk_fio_qpair *fio_qpair; + struct spdk_fio_qpair *fio_qpair_current; /* the current fio_qpair to be handled. */ + + struct io_u **iocq; /* io completion queue */ + unsigned int iocq_count; /* number of iocq entries filled by last getevents */ + unsigned int iocq_size; /* number of iocq entries allocated */ + struct fio_file *current_f; /* fio_file given by user */ + +}; + +static void * +spdk_fio_poll_ctrlrs(void *arg) +{ + struct spdk_fio_ctrlr *fio_ctrlr; + int oldstate; + int rc; + + /* Loop until the thread is cancelled */ + while (true) { + rc = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate); + if (rc != 0) { + SPDK_ERRLOG("Unable to set cancel state disabled on g_init_thread (%d): %s\n", + rc, spdk_strerror(rc)); + } + + pthread_mutex_lock(&g_mutex); + fio_ctrlr = g_ctrlr; + + while (fio_ctrlr) { + spdk_nvme_ctrlr_process_admin_completions(fio_ctrlr->ctrlr); + fio_ctrlr = fio_ctrlr->next; + } + + pthread_mutex_unlock(&g_mutex); + + rc = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate); + if (rc != 0) { + SPDK_ERRLOG("Unable to set cancel state enabled on g_init_thread (%d): %s\n", + rc, spdk_strerror(rc)); + } + + /* This is a pthread cancellation point and cannot be removed. 
*/ + sleep(1); + } + + return NULL; +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + struct thread_data *td = cb_ctx; + struct spdk_fio_options *fio_options = td->eo; + + if (fio_options->hostnqn) { + snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", fio_options->hostnqn); + } + + if (fio_options->enable_wrr) { + opts->arb_mechanism = SPDK_NVME_CC_AMS_WRR; + opts->arbitration_burst = fio_options->arbitration_burst; + opts->low_priority_weight = fio_options->low_weight; + opts->medium_priority_weight = fio_options->medium_weight; + opts->high_priority_weight = fio_options->high_weight; + } + + if (fio_options->digest_enable) { + if (strcasecmp(fio_options->digest_enable, "HEADER") == 0) { + opts->header_digest = true; + } else if (strcasecmp(fio_options->digest_enable, "DATA") == 0) { + opts->data_digest = true; + } else if (strcasecmp(fio_options->digest_enable, "BOTH") == 0) { + opts->header_digest = true; + opts->data_digest = true; + } + } + + return true; +} + +static struct spdk_fio_ctrlr * +get_fio_ctrlr(const struct spdk_nvme_transport_id *trid) +{ + struct spdk_fio_ctrlr *fio_ctrlr = g_ctrlr; + while (fio_ctrlr) { + if (spdk_nvme_transport_id_compare(trid, &fio_ctrlr->tr_id) == 0) { + return fio_ctrlr; + } + + fio_ctrlr = fio_ctrlr->next; + } + + return NULL; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + struct thread_data *td = cb_ctx; + struct spdk_fio_thread *fio_thread = td->io_ops_data; + struct spdk_nvme_io_qpair_opts qpopts; + struct spdk_fio_ctrlr *fio_ctrlr; + struct spdk_fio_qpair *fio_qpair; + struct spdk_nvme_ns *ns; + const struct spdk_nvme_ns_data *nsdata; + struct fio_file *f = fio_thread->current_f; + uint32_t ns_id; + char *p; + long int tmp; + struct spdk_fio_options *fio_options = td->eo; + + p = strstr(f->file_name, "ns="); + if (p != NULL) 
{ + tmp = spdk_strtol(p + 3, 10); + if (tmp <= 0) { + SPDK_ERRLOG("namespace id should be >=1, but was invalid: %ld\n", tmp); + g_error = true; + return; + } + ns_id = (uint32_t)tmp; + } else { + ns_id = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + if (ns_id == 0) { + /* The ctrlr has no active namespaces and we didn't specify any so nothing to do. */ + return; + } + } + + pthread_mutex_lock(&g_mutex); + fio_ctrlr = get_fio_ctrlr(trid); + /* it is a new ctrlr and needs to be added */ + if (!fio_ctrlr) { + /* Create an fio_ctrlr and add it to the list */ + fio_ctrlr = calloc(1, sizeof(*fio_ctrlr)); + if (!fio_ctrlr) { + SPDK_ERRLOG("Cannot allocate space for fio_ctrlr\n"); + g_error = true; + pthread_mutex_unlock(&g_mutex); + return; + } + fio_ctrlr->opts = *opts; + fio_ctrlr->ctrlr = ctrlr; + fio_ctrlr->tr_id = *trid; + fio_ctrlr->next = g_ctrlr; + g_ctrlr = fio_ctrlr; + } + pthread_mutex_unlock(&g_mutex); + + ns = spdk_nvme_ctrlr_get_ns(fio_ctrlr->ctrlr, ns_id); + if (ns == NULL) { + SPDK_ERRLOG("Cannot get namespace by ns_id=%d\n", ns_id); + g_error = true; + return; + } + + if (!spdk_nvme_ns_is_active(ns)) { + SPDK_ERRLOG("Inactive namespace by ns_id=%d\n", ns_id); + g_error = true; + return; + } + nsdata = spdk_nvme_ns_get_data(ns); + + fio_qpair = fio_thread->fio_qpair; + while (fio_qpair != NULL) { + if ((fio_qpair->f == f) || + ((spdk_nvme_transport_id_compare(trid, &fio_qpair->fio_ctrlr->tr_id) == 0) && + (spdk_nvme_ns_get_id(fio_qpair->ns) == ns_id))) { + /* Not the error case. 
Avoid duplicated connection */ + return; + } + fio_qpair = fio_qpair->next; + } + + /* create a new qpair */ + fio_qpair = calloc(1, sizeof(*fio_qpair)); + if (!fio_qpair) { + g_error = true; + SPDK_ERRLOG("Cannot allocate space for fio_qpair\n"); + return; + } + + spdk_nvme_ctrlr_get_default_io_qpair_opts(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts)); + qpopts.delay_cmd_submit = true; + if (fio_options->enable_wrr) { + qpopts.qprio = fio_options->wrr_priority; + } + + fio_qpair->qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts)); + if (!fio_qpair->qpair) { + SPDK_ERRLOG("Cannot allocate nvme io_qpair any more\n"); + g_error = true; + free(fio_qpair); + return; + } + + fio_qpair->ns = ns; + fio_qpair->f = f; + fio_qpair->fio_ctrlr = fio_ctrlr; + fio_qpair->next = fio_thread->fio_qpair; + fio_thread->fio_qpair = fio_qpair; + + if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) { + assert(spdk_nvme_ns_get_pi_type(ns) != SPDK_NVME_FMT_NVM_PROTECTION_DISABLE); + fio_qpair->io_flags = g_spdk_pract_flag | g_spdk_prchk_flags; + fio_qpair->nvme_pi_enabled = true; + fio_qpair->md_start = nsdata->dps.md_start; + fio_qpair->extended_lba = spdk_nvme_ns_supports_extended_lba(ns); + fprintf(stdout, "PI type%u enabled with %s\n", spdk_nvme_ns_get_pi_type(ns), + fio_qpair->extended_lba ? 
"extended lba" : "separate metadata"); + } + + f->real_file_size = spdk_nvme_ns_get_size(fio_qpair->ns); + if (f->real_file_size <= 0) { + g_error = true; + SPDK_ERRLOG("Cannot get namespace size by ns=%p\n", ns); + return; + } + + f->filetype = FIO_TYPE_BLOCK; + fio_file_set_size_known(f); +} + +static void parse_prchk_flags(const char *prchk_str) +{ + if (!prchk_str) { + return; + } + + if (strstr(prchk_str, "GUARD") != NULL) { + g_spdk_prchk_flags = SPDK_NVME_IO_FLAGS_PRCHK_GUARD; + } + if (strstr(prchk_str, "REFTAG") != NULL) { + g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG; + } + if (strstr(prchk_str, "APPTAG") != NULL) { + g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG; + } +} + +static void parse_pract_flag(int pract) +{ + if (pract == 1) { + g_spdk_pract_flag = SPDK_NVME_IO_FLAGS_PRACT; + } else { + g_spdk_pract_flag = 0; + } +} + +/* Called once at initialization. This is responsible for gathering the size of + * each "file", which in our case are in the form + * 'key=value [key=value] ... ns=value' + * For example, For local PCIe NVMe device - 'trtype=PCIe traddr=0000.04.00.0 ns=1' + * For remote exported by NVMe-oF target, 'trtype=RDMA adrfam=IPv4 traddr=192.168.100.8 trsvcid=4420 ns=1' */ +static int spdk_fio_setup(struct thread_data *td) +{ + struct spdk_fio_thread *fio_thread; + struct spdk_fio_options *fio_options = td->eo; + struct spdk_env_opts opts; + struct fio_file *f; + char *p; + int rc = 0; + struct spdk_nvme_transport_id trid; + struct spdk_fio_ctrlr *fio_ctrlr; + char *trid_info; + unsigned int i; + + /* we might be running in a daemonized FIO instance where standard + * input and output were closed and fds 0, 1, and 2 are reused + * for something important by FIO. We can't ensure we won't print + * anything (and so will our dependencies, e.g. DPDK), so abort early. 
+ * (is_backend is an fio global variable) + */ + if (is_backend) { + char buf[1024]; + snprintf(buf, sizeof(buf), + "SPDK FIO plugin won't work with daemonized FIO server."); + fio_server_text_output(FIO_LOG_ERR, buf, sizeof(buf)); + return -1; + } + + if (!td->o.use_thread) { + log_err("spdk: must set thread=1 when using spdk plugin\n"); + return 1; + } + + pthread_mutex_lock(&g_mutex); + + fio_thread = calloc(1, sizeof(*fio_thread)); + assert(fio_thread != NULL); + + td->io_ops_data = fio_thread; + fio_thread->td = td; + + fio_thread->iocq_size = td->o.iodepth; + fio_thread->iocq = calloc(fio_thread->iocq_size, sizeof(struct io_u *)); + assert(fio_thread->iocq != NULL); + + if (!g_spdk_env_initialized) { + spdk_env_opts_init(&opts); + opts.name = "fio"; + opts.mem_size = fio_options->mem_size; + opts.shm_id = fio_options->shm_id; + g_spdk_enable_sgl = fio_options->enable_sgl; + g_spdk_sge_size = fio_options->sge_size; + g_spdk_bit_bucket_data_len = fio_options->bit_bucket_data_len; + parse_pract_flag(fio_options->pi_act); + g_spdk_md_per_io_size = spdk_max(fio_options->md_per_io_size, 4096); + g_spdk_apptag = (uint16_t)fio_options->apptag; + g_spdk_apptag_mask = (uint16_t)fio_options->apptag_mask; + parse_prchk_flags(fio_options->pi_chk); + if (spdk_env_init(&opts) < 0) { + SPDK_ERRLOG("Unable to initialize SPDK env\n"); + free(fio_thread->iocq); + free(fio_thread); + fio_thread = NULL; + pthread_mutex_unlock(&g_mutex); + return 1; + } + g_spdk_env_initialized = true; + spdk_unaffinitize_thread(); + + /* Spawn a thread to continue polling the controllers */ + rc = pthread_create(&g_ctrlr_thread_id, NULL, &spdk_fio_poll_ctrlrs, NULL); + if (rc != 0) { + SPDK_ERRLOG("Unable to spawn a thread to poll admin queues. They won't be polled.\n"); + } + + if (fio_options->enable_vmd && spdk_vmd_init()) { + SPDK_ERRLOG("Failed to initialize VMD. 
Some NVMe devices can be unavailable.\n"); + } + } + pthread_mutex_unlock(&g_mutex); + + for_each_file(td, f, i) { + memset(&trid, 0, sizeof(trid)); + + trid.trtype = SPDK_NVME_TRANSPORT_PCIE; + + p = strstr(f->file_name, " ns="); + if (p != NULL) { + trid_info = strndup(f->file_name, p - f->file_name); + } else { + trid_info = strndup(f->file_name, strlen(f->file_name)); + } + + if (!trid_info) { + SPDK_ERRLOG("Failed to allocate space for trid_info\n"); + continue; + } + + rc = spdk_nvme_transport_id_parse(&trid, trid_info); + if (rc < 0) { + SPDK_ERRLOG("Failed to parse given str: %s\n", trid_info); + free(trid_info); + continue; + } + free(trid_info); + + if (trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + struct spdk_pci_addr pci_addr; + if (spdk_pci_addr_parse(&pci_addr, trid.traddr) < 0) { + SPDK_ERRLOG("Invalid traddr=%s\n", trid.traddr); + continue; + } + spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr); + } else { + if (trid.subnqn[0] == '\0') { + snprintf(trid.subnqn, sizeof(trid.subnqn), "%s", + SPDK_NVMF_DISCOVERY_NQN); + } + } + + fio_thread->current_f = f; + + pthread_mutex_lock(&g_mutex); + fio_ctrlr = get_fio_ctrlr(&trid); + pthread_mutex_unlock(&g_mutex); + if (fio_ctrlr) { + attach_cb(td, &trid, fio_ctrlr->ctrlr, &fio_ctrlr->opts); + } else { + /* Enumerate all of the controllers */ + if (spdk_nvme_probe(&trid, td, probe_cb, attach_cb, NULL) != 0) { + SPDK_ERRLOG("spdk_nvme_probe() failed\n"); + continue; + } + } + + if (g_error) { + log_err("Failed to initialize spdk fio plugin\n"); + rc = 1; + break; + } + } + + pthread_mutex_lock(&g_mutex); + g_td_count++; + pthread_mutex_unlock(&g_mutex); + + return rc; +} + +static int spdk_fio_open(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static int spdk_fio_close(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static int spdk_fio_iomem_alloc(struct thread_data *td, size_t total_mem) +{ + td->orig_buffer = spdk_dma_zmalloc(total_mem, NVME_IO_ALIGN, NULL); 
+ return td->orig_buffer == NULL; +} + +static void spdk_fio_iomem_free(struct thread_data *td) +{ + spdk_dma_free(td->orig_buffer); +} + +static int spdk_fio_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct spdk_fio_thread *fio_thread = td->io_ops_data; + struct spdk_fio_request *fio_req; + + io_u->engine_data = NULL; + + fio_req = calloc(1, sizeof(*fio_req)); + if (fio_req == NULL) { + return 1; + } + + fio_req->md_buf = spdk_dma_zmalloc(g_spdk_md_per_io_size, NVME_IO_ALIGN, NULL); + if (fio_req->md_buf == NULL) { + fprintf(stderr, "Allocate %u metadata failed\n", g_spdk_md_per_io_size); + free(fio_req); + return 1; + } + + fio_req->io = io_u; + fio_req->fio_thread = fio_thread; + + io_u->engine_data = fio_req; + + return 0; +} + +static void spdk_fio_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct spdk_fio_request *fio_req = io_u->engine_data; + + if (fio_req) { + assert(fio_req->io == io_u); + spdk_dma_free(fio_req->md_buf); + free(fio_req); + io_u->engine_data = NULL; + } +} + +static int +fio_extended_lba_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u) +{ + struct spdk_nvme_ns *ns = fio_qpair->ns; + struct spdk_fio_request *fio_req = io_u->engine_data; + uint32_t md_size, extended_lba_size, lba_count; + uint64_t lba; + struct iovec iov; + int rc; + + /* Set appmask and apptag when PRACT is enabled */ + if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) { + fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask; + fio_req->dif_ctx.app_tag = g_spdk_apptag; + return 0; + } + + extended_lba_size = spdk_nvme_ns_get_extended_sector_size(ns); + md_size = spdk_nvme_ns_get_md_size(ns); + lba = io_u->offset / extended_lba_size; + lba_count = io_u->xfer_buflen / extended_lba_size; + + rc = spdk_dif_ctx_init(&fio_req->dif_ctx, extended_lba_size, md_size, + true, fio_qpair->md_start, + (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns), + fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag, 0, 0); + if (rc != 0) { + 
fprintf(stderr, "Initialization of DIF context failed\n"); + return rc; + } + + if (io_u->ddir != DDIR_WRITE) { + return 0; + } + + iov.iov_base = io_u->buf; + iov.iov_len = io_u->xfer_buflen; + rc = spdk_dif_generate(&iov, 1, lba_count, &fio_req->dif_ctx); + if (rc != 0) { + fprintf(stderr, "Generation of DIF failed\n"); + } + + return rc; +} + +static int +fio_separate_md_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u) +{ + struct spdk_nvme_ns *ns = fio_qpair->ns; + struct spdk_fio_request *fio_req = io_u->engine_data; + uint32_t md_size, block_size, lba_count; + uint64_t lba; + struct iovec iov, md_iov; + int rc; + + /* Set appmask and apptag when PRACT is enabled */ + if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) { + fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask; + fio_req->dif_ctx.app_tag = g_spdk_apptag; + return 0; + } + + block_size = spdk_nvme_ns_get_sector_size(ns); + md_size = spdk_nvme_ns_get_md_size(ns); + lba = io_u->offset / block_size; + lba_count = io_u->xfer_buflen / block_size; + + rc = spdk_dif_ctx_init(&fio_req->dif_ctx, block_size, md_size, + false, fio_qpair->md_start, + (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns), + fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag, 0, 0); + if (rc != 0) { + fprintf(stderr, "Initialization of DIF context failed\n"); + return rc; + } + + if (io_u->ddir != DDIR_WRITE) { + return 0; + } + + iov.iov_base = io_u->buf; + iov.iov_len = io_u->xfer_buflen; + md_iov.iov_base = fio_req->md_buf; + md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size); + rc = spdk_dix_generate(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx); + if (rc < 0) { + fprintf(stderr, "Generation of DIX failed\n"); + } + + return rc; +} + +static int +fio_extended_lba_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u) +{ + struct spdk_nvme_ns *ns = fio_qpair->ns; + struct spdk_fio_request *fio_req = io_u->engine_data; + uint32_t lba_count; + struct iovec iov; + struct 
spdk_dif_error err_blk = {}; + int rc; + + /* Do nothing when PRACT is enabled */ + if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) { + return 0; + } + + iov.iov_base = io_u->buf; + iov.iov_len = io_u->xfer_buflen; + lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_extended_sector_size(ns); + + rc = spdk_dif_verify(&iov, 1, lba_count, &fio_req->dif_ctx, &err_blk); + if (rc != 0) { + fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + } + + return rc; +} + +static int +fio_separate_md_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u) +{ + struct spdk_nvme_ns *ns = fio_qpair->ns; + struct spdk_fio_request *fio_req = io_u->engine_data; + uint32_t md_size, lba_count; + struct iovec iov, md_iov; + struct spdk_dif_error err_blk = {}; + int rc; + + /* Do nothing when PRACT is enabled */ + if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) { + return 0; + } + + iov.iov_base = io_u->buf; + iov.iov_len = io_u->xfer_buflen; + lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_sector_size(ns); + md_size = spdk_nvme_ns_get_md_size(ns); + md_iov.iov_base = fio_req->md_buf; + md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size); + + rc = spdk_dix_verify(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx, &err_blk); + if (rc != 0) { + fprintf(stderr, "DIX error detected. 
type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + } + + return rc; +} + +static void spdk_fio_completion_cb(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_fio_request *fio_req = ctx; + struct spdk_fio_thread *fio_thread = fio_req->fio_thread; + struct spdk_fio_qpair *fio_qpair = fio_req->fio_qpair; + int rc; + + if (fio_qpair->nvme_pi_enabled && fio_req->io->ddir == DDIR_READ) { + if (fio_qpair->extended_lba) { + rc = fio_extended_lba_verify_pi(fio_qpair, fio_req->io); + } else { + rc = fio_separate_md_verify_pi(fio_qpair, fio_req->io); + } + if (rc != 0) { + fio_req->io->error = abs(rc); + } + } + + assert(fio_thread->iocq_count < fio_thread->iocq_size); + fio_thread->iocq[fio_thread->iocq_count++] = fio_req->io; +} + +static void +spdk_nvme_io_reset_sgl(void *ref, uint32_t sgl_offset) +{ + struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref; + + fio_req->iov_offset = sgl_offset; + fio_req->bit_bucket_data_len = 0; +} + +static int +spdk_nvme_io_next_sge(void *ref, void **address, uint32_t *length) +{ + struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref; + struct io_u *io_u = fio_req->io; + uint32_t iov_len; + uint32_t bit_bucket_len; + + *address = io_u->buf; + + if (fio_req->iov_offset) { + assert(fio_req->iov_offset <= io_u->xfer_buflen); + *address += fio_req->iov_offset; + } + + iov_len = io_u->xfer_buflen - fio_req->iov_offset; + if (iov_len > g_spdk_sge_size) { + iov_len = g_spdk_sge_size; + } + + if ((fio_req->bit_bucket_data_len < g_spdk_bit_bucket_data_len) && (io_u->ddir == DDIR_READ)) { + assert(g_spdk_bit_bucket_data_len < io_u->xfer_buflen); + *address = (void *)UINT64_MAX; + bit_bucket_len = g_spdk_bit_bucket_data_len - fio_req->bit_bucket_data_len; + if (iov_len > bit_bucket_len) { + iov_len = bit_bucket_len; + } + fio_req->bit_bucket_data_len += iov_len; + } + + fio_req->iov_offset += iov_len; + *length = iov_len; + + return 0; +} + +#if FIO_IOOPS_VERSION >= 24 +typedef enum 
fio_q_status fio_q_status_t; +#else +typedef int fio_q_status_t; +#endif + +static fio_q_status_t +spdk_fio_queue(struct thread_data *td, struct io_u *io_u) +{ + int rc = 1; + struct spdk_fio_thread *fio_thread = td->io_ops_data; + struct spdk_fio_request *fio_req = io_u->engine_data; + struct spdk_fio_qpair *fio_qpair; + struct spdk_nvme_ns *ns = NULL; + void *md_buf = NULL; + struct spdk_dif_ctx *dif_ctx = &fio_req->dif_ctx; + uint32_t block_size; + uint64_t lba; + uint32_t lba_count; + + /* Find the namespace that corresponds to the file in the io_u */ + fio_qpair = fio_thread->fio_qpair; + while (fio_qpair != NULL) { + if (fio_qpair->f == io_u->file) { + ns = fio_qpair->ns; + break; + } + fio_qpair = fio_qpair->next; + } + if (fio_qpair == NULL || ns == NULL) { + return -ENXIO; + } + if (fio_qpair->nvme_pi_enabled && !fio_qpair->extended_lba) { + md_buf = fio_req->md_buf; + } + fio_req->fio_qpair = fio_qpair; + + block_size = spdk_nvme_ns_get_extended_sector_size(ns); + if ((fio_qpair->io_flags & g_spdk_pract_flag) && (spdk_nvme_ns_get_md_size(ns) == 8)) { + /* If metadata size = 8 bytes, PI is stripped (read) or inserted (write), and + * so reduce metadata size from block size. (If metadata size > 8 bytes, PI + * is passed (read) or replaced (write). So block size is not necessary to + * change.) 
+ */ + block_size = spdk_nvme_ns_get_sector_size(ns); + } + + lba = io_u->offset / block_size; + lba_count = io_u->xfer_buflen / block_size; + + /* TODO: considering situations that fio will randomize and verify io_u */ + if (fio_qpair->nvme_pi_enabled) { + if (fio_qpair->extended_lba) { + rc = fio_extended_lba_setup_pi(fio_qpair, io_u); + } else { + rc = fio_separate_md_setup_pi(fio_qpair, io_u); + } + if (rc < 0) { + io_u->error = -rc; + return FIO_Q_COMPLETED; + } + } + + switch (io_u->ddir) { + case DDIR_READ: + if (!g_spdk_enable_sgl) { + rc = spdk_nvme_ns_cmd_read_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba, lba_count, + spdk_fio_completion_cb, fio_req, + fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag); + } else { + rc = spdk_nvme_ns_cmd_readv_with_md(ns, fio_qpair->qpair, lba, + lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags, + spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf, + dif_ctx->apptag_mask, dif_ctx->app_tag); + } + break; + case DDIR_WRITE: + if (!g_spdk_enable_sgl) { + rc = spdk_nvme_ns_cmd_write_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba, + lba_count, + spdk_fio_completion_cb, fio_req, + fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag); + } else { + rc = spdk_nvme_ns_cmd_writev_with_md(ns, fio_qpair->qpair, lba, + lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags, + spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf, + dif_ctx->apptag_mask, dif_ctx->app_tag); + } + break; + default: + assert(false); + break; + } + + /* NVMe read/write functions return -ENOMEM if there are no free requests. 
*/ + if (rc == -ENOMEM) { + return FIO_Q_BUSY; + } + + if (rc != 0) { + io_u->error = abs(rc); + return FIO_Q_COMPLETED; + } + + return FIO_Q_QUEUED; +} + +static struct io_u *spdk_fio_event(struct thread_data *td, int event) +{ + struct spdk_fio_thread *fio_thread = td->io_ops_data; + + assert(event >= 0); + assert((unsigned)event < fio_thread->iocq_count); + return fio_thread->iocq[event]; +} + +static int spdk_fio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct spdk_fio_thread *fio_thread = td->io_ops_data; + struct spdk_fio_qpair *fio_qpair = NULL; + struct timespec t0, t1; + uint64_t timeout = 0; + + if (t) { + timeout = t->tv_sec * 1000000000L + t->tv_nsec; + clock_gettime(CLOCK_MONOTONIC_RAW, &t0); + } + + fio_thread->iocq_count = 0; + + /* fetch the next qpair */ + if (fio_thread->fio_qpair_current) { + fio_qpair = fio_thread->fio_qpair_current->next; + } + + for (;;) { + if (fio_qpair == NULL) { + fio_qpair = fio_thread->fio_qpair; + } + + while (fio_qpair != NULL) { + spdk_nvme_qpair_process_completions(fio_qpair->qpair, max - fio_thread->iocq_count); + + if (fio_thread->iocq_count >= min) { + /* reset the currrent handling qpair */ + fio_thread->fio_qpair_current = fio_qpair; + return fio_thread->iocq_count; + } + + fio_qpair = fio_qpair->next; + } + + if (t) { + uint64_t elapse; + + clock_gettime(CLOCK_MONOTONIC_RAW, &t1); + elapse = ((t1.tv_sec - t0.tv_sec) * 1000000000L) + + t1.tv_nsec - t0.tv_nsec; + if (elapse > timeout) { + break; + } + } + } + + /* reset the currrent handling qpair */ + fio_thread->fio_qpair_current = fio_qpair; + return fio_thread->iocq_count; +} + +static int spdk_fio_invalidate(struct thread_data *td, struct fio_file *f) +{ + /* TODO: This should probably send a flush to the device, but for now just return successful. 
*/ + return 0; +} + +static void spdk_fio_cleanup(struct thread_data *td) +{ + struct spdk_fio_thread *fio_thread = td->io_ops_data; + struct spdk_fio_qpair *fio_qpair, *fio_qpair_tmp; + struct spdk_fio_options *fio_options = td->eo; + + fio_qpair = fio_thread->fio_qpair; + while (fio_qpair != NULL) { + spdk_nvme_ctrlr_free_io_qpair(fio_qpair->qpair); + fio_qpair_tmp = fio_qpair->next; + free(fio_qpair); + fio_qpair = fio_qpair_tmp; + } + + free(fio_thread->iocq); + free(fio_thread); + + pthread_mutex_lock(&g_mutex); + g_td_count--; + if (g_td_count == 0) { + struct spdk_fio_ctrlr *fio_ctrlr, *fio_ctrlr_tmp; + + fio_ctrlr = g_ctrlr; + while (fio_ctrlr != NULL) { + spdk_nvme_detach(fio_ctrlr->ctrlr); + fio_ctrlr_tmp = fio_ctrlr->next; + free(fio_ctrlr); + fio_ctrlr = fio_ctrlr_tmp; + } + g_ctrlr = NULL; + + if (fio_options->enable_vmd) { + spdk_vmd_fini(); + } + } + pthread_mutex_unlock(&g_mutex); + if (!g_ctrlr) { + if (pthread_cancel(g_ctrlr_thread_id) == 0) { + pthread_join(g_ctrlr_thread_id, NULL); + } + } +} + +/* This function enables addition of SPDK parameters to the fio config + * Adding new parameters by defining them here and defining a callback + * function to read the parameter value. 
*/ +static struct fio_option options[] = { + { + .name = "enable_wrr", + .lname = "Enable weighted round robin (WRR) for IO submission queues", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, enable_wrr), + .def = "0", + .help = "Enable weighted round robin (WRR) for IO submission queues", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "arbitration_burst", + .lname = "Arbitration Burst", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, arbitration_burst), + .def = "0", + .help = "Arbitration Burst used for WRR (valid range from 0 - 7)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "low_weight", + .lname = "low_weight for WRR", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, low_weight), + .def = "0", + .help = "low_weight used for WRR (valid range from 0 - 255)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "medium_weight", + .lname = "medium_weight for WRR", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, medium_weight), + .def = "0", + .help = "medium weight used for WRR (valid range from 0 - 255)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "high_weight", + .lname = "high_weight for WRR", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, high_weight), + .def = "0", + .help = "high weight used for WRR (valid range from 0 - 255)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "wrr_priority", + .lname = "priority used for WRR", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, wrr_priority), + .def = "0", + .help = "priority used for WRR (valid range from 0-3)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "mem_size_mb", + .lname = "Memory size in MB", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, mem_size), + .def = "0", + 
.help = "Memory Size for SPDK (MB)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "shm_id", + .lname = "shared memory ID", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, shm_id), + .def = "-1", + .help = "Shared Memory ID", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "enable_sgl", + .lname = "SGL used for I/O commands", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, enable_sgl), + .def = "0", + .help = "SGL Used for I/O Commands (enable_sgl=1 or enable_sgl=0)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "sge_size", + .lname = "SGL size used for I/O commands", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, sge_size), + .def = "4096", + .help = "SGL size in bytes for I/O Commands (default 4096)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "bit_bucket_data_len", + .lname = "Amount of data used for Bit Bucket", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, bit_bucket_data_len), + .def = "0", + .help = "Bit Bucket Data Length for READ commands (disabled by default)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "hostnqn", + .lname = "Host NQN to use when connecting to controllers.", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct spdk_fio_options, hostnqn), + .help = "Host NQN", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "pi_act", + .lname = "Protection Information Action", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, pi_act), + .def = "1", + .help = "Protection Information Action bit (pi_act=1 or pi_act=0)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "pi_chk", + .lname = "Protection Information Check(GUARD|REFTAG|APPTAG)", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct spdk_fio_options, pi_chk), + 
.def = NULL, + .help = "Control of Protection Information Checking (pi_chk=GUARD|REFTAG|APPTAG)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "md_per_io_size", + .lname = "Separate Metadata Buffer Size per I/O", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, md_per_io_size), + .def = "4096", + .help = "Size of separate metadata buffer per I/O (Default: 4096)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "apptag", + .lname = "Application Tag used in Protection Information", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, apptag), + .def = "0x1234", + .help = "Application Tag used in Protection Information field (Default: 0x1234)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "apptag_mask", + .lname = "Application Tag Mask", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, apptag_mask), + .def = "0xffff", + .help = "Application Tag Mask used with Application Tag (Default: 0xffff)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "digest_enable", + .lname = "PDU digest choice for NVMe/TCP Transport(NONE|HEADER|DATA|BOTH)", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct spdk_fio_options, digest_enable), + .def = NULL, + .help = "Control the NVMe/TCP control(digest_enable=NONE|HEADER|DATA|BOTH)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "enable_vmd", + .lname = "Enable VMD enumeration", + .type = FIO_OPT_INT, + .off1 = offsetof(struct spdk_fio_options, enable_vmd), + .def = "0", + .help = "Enable VMD enumeration (enable_vmd=1 or enable_vmd=0)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_INVALID, + }, + { + .name = NULL, + }, +}; + +/* FIO imports this structure using dlsym */ +struct ioengine_ops ioengine = { + .name = "spdk", + .version = FIO_IOOPS_VERSION, + .queue = spdk_fio_queue, + .getevents = 
spdk_fio_getevents, + .event = spdk_fio_event, + .cleanup = spdk_fio_cleanup, + .open_file = spdk_fio_open, + .close_file = spdk_fio_close, + .invalidate = spdk_fio_invalidate, + .iomem_alloc = spdk_fio_iomem_alloc, + .iomem_free = spdk_fio_iomem_free, + .setup = spdk_fio_setup, + .io_u_init = spdk_fio_io_u_init, + .io_u_free = spdk_fio_io_u_free, + .flags = FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN, + .options = options, + .option_struct_size = sizeof(struct spdk_fio_options), +}; + +static void fio_init fio_spdk_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_spdk_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/src/spdk/examples/nvme/fio_plugin/full_bench.fio b/src/spdk/examples/nvme/fio_plugin/full_bench.fio new file mode 100644 index 000000000..4dea21d13 --- /dev/null +++ b/src/spdk/examples/nvme/fio_plugin/full_bench.fio @@ -0,0 +1,40 @@ +[global] +thread=1 +group_reporting=1 +direct=1 +verify=0 +norandommap=1 +cpumask=1 +disable_slat=1 +disable_bw=1 +lat_percentiles=1 +clat_percentiles=0 +percentile_list=50:99:99.999 + +[precondition-sequential] +stonewall +description="Sequentially write to the device twice" +rw=write +iodepth=128 +bs=128k +loops=2 + +[4k_randwrite_qd1] +stonewall +description="4KiB Random Write QD=1" +bs=4k +rw=randwrite +iodepth=1 +time_based=1 +ramp_time=60 +runtime=240 + +[4k_randread_qd1] +stonewall +description="4KiB Random Read QD=1" +bs=4k +rw=randread +iodepth=1 +time_based=1 +ramp_time=60 +runtime=240 diff --git a/src/spdk/examples/nvme/fio_plugin/mock_sgl_config.fio b/src/spdk/examples/nvme/fio_plugin/mock_sgl_config.fio new file mode 100644 index 000000000..713fce0a2 --- /dev/null +++ b/src/spdk/examples/nvme/fio_plugin/mock_sgl_config.fio @@ -0,0 +1,17 @@ +[global] +ioengine=spdk +thread=1 +group_reporting=1 +direct=1 +enable_sgl=1 +time_based=1 +ramp_time=0 +runtime=2 +iodepth=128 +rw=randrw +bs=16k +verify=md5 +verify_backlog=32 + +[test] +numjobs=1 diff 
--git a/src/spdk/examples/nvme/hello_world/.gitignore b/src/spdk/examples/nvme/hello_world/.gitignore new file mode 100644 index 000000000..242c034c1 --- /dev/null +++ b/src/spdk/examples/nvme/hello_world/.gitignore @@ -0,0 +1 @@ +hello_world diff --git a/src/spdk/examples/nvme/hello_world/Makefile b/src/spdk/examples/nvme/hello_world/Makefile new file mode 100644 index 000000000..bbb3527cb --- /dev/null +++ b/src/spdk/examples/nvme/hello_world/Makefile @@ -0,0 +1,38 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) + +APP = hello_world + +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk diff --git a/src/spdk/examples/nvme/hello_world/hello_world.c b/src/spdk/examples/nvme/hello_world/hello_world.c new file mode 100644 index 000000000..6e1d9d62a --- /dev/null +++ b/src/spdk/examples/nvme/hello_world/hello_world.c @@ -0,0 +1,435 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/vmd.h" +#include "spdk/env.h" + +struct ctrlr_entry { + struct spdk_nvme_ctrlr *ctrlr; + struct ctrlr_entry *next; + char name[1024]; +}; + +struct ns_entry { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ns *ns; + struct ns_entry *next; + struct spdk_nvme_qpair *qpair; +}; + +static struct ctrlr_entry *g_controllers = NULL; +static struct ns_entry *g_namespaces = NULL; + +static bool g_vmd = false; + +static void +register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) +{ + struct ns_entry *entry; + + if (!spdk_nvme_ns_is_active(ns)) { + return; + } + + entry = malloc(sizeof(struct ns_entry)); + if (entry == NULL) { + perror("ns_entry malloc"); + exit(1); + } + + entry->ctrlr = ctrlr; + entry->ns = ns; + entry->next = g_namespaces; + g_namespaces = entry; + + printf(" Namespace ID: %d size: %juGB\n", spdk_nvme_ns_get_id(ns), + spdk_nvme_ns_get_size(ns) / 1000000000); +} + +struct hello_world_sequence { + struct ns_entry *ns_entry; + char *buf; + unsigned using_cmb_io; + int is_completed; +}; + +static void +read_complete(void *arg, const struct spdk_nvme_cpl *completion) +{ + struct hello_world_sequence *sequence = arg; + + /* Assume the I/O was successful */ + sequence->is_completed = 1; + /* See if an error occurred. If so, display information + * about it, and set completion value so that I/O + * caller is aware that an error occurred. 
+ */ + if (spdk_nvme_cpl_is_error(completion)) { + spdk_nvme_qpair_print_completion(sequence->ns_entry->qpair, (struct spdk_nvme_cpl *)completion); + fprintf(stderr, "I/O error status: %s\n", spdk_nvme_cpl_get_status_string(&completion->status)); + fprintf(stderr, "Read I/O failed, aborting run\n"); + sequence->is_completed = 2; + } + + /* + * The read I/O has completed. Print the contents of the + * buffer, free the buffer, then mark the sequence as + * completed. This will trigger the hello_world() function + * to exit its polling loop. + */ + printf("%s", sequence->buf); + spdk_free(sequence->buf); +} + +static void +write_complete(void *arg, const struct spdk_nvme_cpl *completion) +{ + struct hello_world_sequence *sequence = arg; + struct ns_entry *ns_entry = sequence->ns_entry; + int rc; + + /* See if an error occurred. If so, display information + * about it, and set completion value so that I/O + * caller is aware that an error occurred. + */ + if (spdk_nvme_cpl_is_error(completion)) { + spdk_nvme_qpair_print_completion(sequence->ns_entry->qpair, (struct spdk_nvme_cpl *)completion); + fprintf(stderr, "I/O error status: %s\n", spdk_nvme_cpl_get_status_string(&completion->status)); + fprintf(stderr, "Write I/O failed, aborting run\n"); + sequence->is_completed = 2; + exit(1); + } + /* + * The write I/O has completed. Free the buffer associated with + * the write I/O and allocate a new zeroed buffer for reading + * the data back from the NVMe namespace. 
+ */ + if (sequence->using_cmb_io) { + spdk_nvme_ctrlr_unmap_cmb(ns_entry->ctrlr); + } else { + spdk_free(sequence->buf); + } + sequence->buf = spdk_zmalloc(0x1000, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + + rc = spdk_nvme_ns_cmd_read(ns_entry->ns, ns_entry->qpair, sequence->buf, + 0, /* LBA start */ + 1, /* number of LBAs */ + read_complete, (void *)sequence, 0); + if (rc != 0) { + fprintf(stderr, "starting read I/O failed\n"); + exit(1); + } +} + +static void +hello_world(void) +{ + struct ns_entry *ns_entry; + struct hello_world_sequence sequence; + int rc; + size_t sz; + + ns_entry = g_namespaces; + while (ns_entry != NULL) { + /* + * Allocate an I/O qpair that we can use to submit read/write requests + * to namespaces on the controller. NVMe controllers typically support + * many qpairs per controller. Any I/O qpair allocated for a controller + * can submit I/O to any namespace on that controller. + * + * The SPDK NVMe driver provides no synchronization for qpair accesses - + * the application must ensure only a single thread submits I/O to a + * qpair, and that same thread must also check for completions on that + * qpair. This enables extremely efficient I/O processing by making all + * I/O operations completely lockless. + */ + ns_entry->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ns_entry->ctrlr, NULL, 0); + if (ns_entry->qpair == NULL) { + printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair() failed\n"); + return; + } + + /* + * Use spdk_dma_zmalloc to allocate a 4KB zeroed buffer. This memory + * will be pinned, which is required for data buffers used for SPDK NVMe + * I/O operations. 
+ */ + sequence.using_cmb_io = 1; + sequence.buf = spdk_nvme_ctrlr_map_cmb(ns_entry->ctrlr, &sz); + if (sequence.buf == NULL || sz < 0x1000) { + sequence.using_cmb_io = 0; + sequence.buf = spdk_zmalloc(0x1000, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + } + if (sequence.buf == NULL) { + printf("ERROR: write buffer allocation failed\n"); + return; + } + if (sequence.using_cmb_io) { + printf("INFO: using controller memory buffer for IO\n"); + } else { + printf("INFO: using host memory buffer for IO\n"); + } + sequence.is_completed = 0; + sequence.ns_entry = ns_entry; + + /* + * Print "Hello world!" to sequence.buf. We will write this data to LBA + * 0 on the namespace, and then later read it back into a separate buffer + * to demonstrate the full I/O path. + */ + snprintf(sequence.buf, 0x1000, "%s", "Hello world!\n"); + + /* + * Write the data buffer to LBA 0 of this namespace. "write_complete" and + * "&sequence" are specified as the completion callback function and + * argument respectively. write_complete() will be called with the + * value of &sequence as a parameter when the write I/O is completed. + * This allows users to potentially specify different completion + * callback routines for each I/O, as well as pass a unique handle + * as an argument so the application knows which I/O has completed. + * + * Note that the SPDK NVMe driver will only check for completions + * when the application calls spdk_nvme_qpair_process_completions(). + * It is the responsibility of the application to trigger the polling + * process. + */ + rc = spdk_nvme_ns_cmd_write(ns_entry->ns, ns_entry->qpair, sequence.buf, + 0, /* LBA start */ + 1, /* number of LBAs */ + write_complete, &sequence, 0); + if (rc != 0) { + fprintf(stderr, "starting write I/O failed\n"); + exit(1); + } + + /* + * Poll for completions. 0 here means process all available completions. 
+ * In certain usage models, the caller may specify a positive integer + * instead of 0 to signify the maximum number of completions it should + * process. This function will never block - if there are no + * completions pending on the specified qpair, it will return immediately. + * + * When the write I/O completes, write_complete() will submit a new I/O + * to read LBA 0 into a separate buffer, specifying read_complete() as its + * completion routine. When the read I/O completes, read_complete() will + * print the buffer contents and set sequence.is_completed = 1. That will + * break this loop and then exit the program. + */ + while (!sequence.is_completed) { + spdk_nvme_qpair_process_completions(ns_entry->qpair, 0); + } + + /* + * Free the I/O qpair. This typically is done when an application exits. + * But SPDK does support freeing and then reallocating qpairs during + * operation. It is the responsibility of the caller to ensure all + * pending I/O are completed before trying to free the qpair. + */ + spdk_nvme_ctrlr_free_io_qpair(ns_entry->qpair); + ns_entry = ns_entry->next; + } +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + printf("Attaching to %s\n", trid->traddr); + + return true; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + int nsid, num_ns; + struct ctrlr_entry *entry; + struct spdk_nvme_ns *ns; + const struct spdk_nvme_ctrlr_data *cdata; + + entry = malloc(sizeof(struct ctrlr_entry)); + if (entry == NULL) { + perror("ctrlr_entry malloc"); + exit(1); + } + + printf("Attached to %s\n", trid->traddr); + + /* + * spdk_nvme_ctrlr is the logical abstraction in SPDK for an NVMe + * controller. 
During initialization, the IDENTIFY data for the + * controller is read using an NVMe admin command, and that data + * can be retrieved using spdk_nvme_ctrlr_get_data() to get + * detailed information on the controller. Refer to the NVMe + * specification for more details on IDENTIFY for NVMe controllers. + */ + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + snprintf(entry->name, sizeof(entry->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn); + + entry->ctrlr = ctrlr; + entry->next = g_controllers; + g_controllers = entry; + + /* + * Each controller has one or more namespaces. An NVMe namespace is basically + * equivalent to a SCSI LUN. The controller's IDENTIFY data tells us how + * many namespaces exist on the controller. For Intel(R) P3X00 controllers, + * it will just be one namespace. + * + * Note that in NVMe, namespace IDs start at 1, not 0. + */ + num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); + printf("Using controller %s with %d namespaces.\n", entry->name, num_ns); + for (nsid = 1; nsid <= num_ns; nsid++) { + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + continue; + } + register_ns(ctrlr, ns); + } +} + +static void +cleanup(void) +{ + struct ns_entry *ns_entry = g_namespaces; + struct ctrlr_entry *ctrlr_entry = g_controllers; + + while (ns_entry) { + struct ns_entry *next = ns_entry->next; + free(ns_entry); + ns_entry = next; + } + + while (ctrlr_entry) { + struct ctrlr_entry *next = ctrlr_entry->next; + + spdk_nvme_detach(ctrlr_entry->ctrlr); + free(ctrlr_entry); + ctrlr_entry = next; + } +} + +static void +usage(const char *program_name) +{ + printf("%s [options]", program_name); + printf("\n"); + printf("options:\n"); + printf(" -V enumerate VMD\n"); +} + +static int +parse_args(int argc, char **argv) +{ + int op; + + while ((op = getopt(argc, argv, "V")) != -1) { + switch (op) { + case 'V': + g_vmd = true; + break; + default: + usage(argv[0]); + return 1; + } + } + + return 0; +} + +int main(int argc, char **argv) +{ + int rc; + 
struct spdk_env_opts opts; + + rc = parse_args(argc, argv); + if (rc != 0) { + return rc; + } + + /* + * SPDK relies on an abstraction around the local environment + * named env that handles memory allocation and PCI device operations. + * This library must be initialized first. + * + */ + spdk_env_opts_init(&opts); + opts.name = "hello_world"; + opts.shm_id = 0; + if (spdk_env_init(&opts) < 0) { + fprintf(stderr, "Unable to initialize SPDK env\n"); + return 1; + } + + printf("Initializing NVMe Controllers\n"); + + if (g_vmd && spdk_vmd_init()) { + fprintf(stderr, "Failed to initialize VMD." + " Some NVMe devices can be unavailable.\n"); + } + + /* + * Start the SPDK NVMe enumeration process. probe_cb will be called + * for each NVMe controller found, giving our application a choice on + * whether to attach to each controller. attach_cb will then be + * called for each controller after the SPDK NVMe driver has completed + * initializing the controller we chose to attach. + */ + rc = spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, NULL); + if (rc != 0) { + fprintf(stderr, "spdk_nvme_probe() failed\n"); + cleanup(); + return 1; + } + + if (g_controllers == NULL) { + fprintf(stderr, "no NVMe controllers found\n"); + cleanup(); + return 1; + } + + printf("Initialization complete.\n"); + hello_world(); + cleanup(); + if (g_vmd) { + spdk_vmd_fini(); + } + + return 0; +} diff --git a/src/spdk/examples/nvme/hotplug/.gitignore b/src/spdk/examples/nvme/hotplug/.gitignore new file mode 100644 index 000000000..e6ff53805 --- /dev/null +++ b/src/spdk/examples/nvme/hotplug/.gitignore @@ -0,0 +1 @@ +hotplug diff --git a/src/spdk/examples/nvme/hotplug/Makefile b/src/spdk/examples/nvme/hotplug/Makefile new file mode 100644 index 000000000..c77c61227 --- /dev/null +++ b/src/spdk/examples/nvme/hotplug/Makefile @@ -0,0 +1,38 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) + +APP = hotplug + +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk diff --git a/src/spdk/examples/nvme/hotplug/hotplug.c b/src/spdk/examples/nvme/hotplug/hotplug.c new file mode 100644 index 000000000..ff821c7ed --- /dev/null +++ b/src/spdk/examples/nvme/hotplug/hotplug.c @@ -0,0 +1,525 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/queue.h" +#include "spdk/string.h" +#include "spdk/util.h" + +struct dev_ctx { + TAILQ_ENTRY(dev_ctx) tailq; + bool is_new; + bool is_removed; + bool is_draining; + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ns *ns; + struct spdk_nvme_qpair *qpair; + uint32_t io_size_blocks; + uint64_t size_in_ios; + uint64_t io_completed; + uint64_t prev_io_completed; + uint64_t current_queue_depth; + uint64_t offset_in_ios; + char name[1024]; +}; + +struct perf_task { + struct dev_ctx *dev; + void *buf; +}; + +static TAILQ_HEAD(, dev_ctx) g_devs = TAILQ_HEAD_INITIALIZER(g_devs); + +static uint64_t g_tsc_rate; + +static uint32_t g_io_size_bytes = 4096; +static int g_queue_depth = 4; +static int g_time_in_sec; +static int g_expected_insert_times = -1; +static int g_expected_removal_times = -1; +static int g_insert_times; +static int g_removal_times; +static int g_shm_id = -1; +static uint64_t g_timeout_in_us = SPDK_SEC_TO_USEC; + +static void +task_complete(struct perf_task *task); + +static void +timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, uint16_t cid); + +static void +register_dev(struct spdk_nvme_ctrlr *ctrlr) +{ + struct dev_ctx *dev; + const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + perror("dev_ctx malloc"); + exit(1); + } + + snprintf(dev->name, sizeof(dev->name), "%-20.20s (%-20.20s)", cdata->mn, cdata->sn); + + dev->ctrlr = ctrlr; + dev->is_new = true; + dev->is_removed = false; + dev->is_draining = false; + + spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_timeout_in_us, timeout_cb, NULL); + + dev->ns = spdk_nvme_ctrlr_get_ns(ctrlr, 1); + + if (!dev->ns || !spdk_nvme_ns_is_active(dev->ns)) { + fprintf(stderr, "Controller %s: No active namespace; skipping\n", dev->name); + goto skip; + } + + if (spdk_nvme_ns_get_size(dev->ns) < g_io_size_bytes || + 
spdk_nvme_ns_get_sector_size(dev->ns) > g_io_size_bytes) { + fprintf(stderr, "Controller %s: Invalid " + "ns size %" PRIu64 " / block size %u for I/O size %u\n", + dev->name, + spdk_nvme_ns_get_size(dev->ns), + spdk_nvme_ns_get_sector_size(dev->ns), + g_io_size_bytes); + goto skip; + } + + dev->size_in_ios = spdk_nvme_ns_get_size(dev->ns) / g_io_size_bytes; + dev->io_size_blocks = g_io_size_bytes / spdk_nvme_ns_get_sector_size(dev->ns); + + dev->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0); + if (!dev->qpair) { + fprintf(stderr, "ERROR: spdk_nvme_ctrlr_alloc_io_qpair() failed\n"); + goto skip; + } + g_insert_times++; + TAILQ_INSERT_TAIL(&g_devs, dev, tailq); + return; + +skip: + free(dev); +} + +static void +unregister_dev(struct dev_ctx *dev) +{ + fprintf(stderr, "unregister_dev: %s\n", dev->name); + + spdk_nvme_ctrlr_free_io_qpair(dev->qpair); + spdk_nvme_detach(dev->ctrlr); + + TAILQ_REMOVE(&g_devs, dev, tailq); + free(dev); +} + +static struct perf_task * +alloc_task(struct dev_ctx *dev) +{ + struct perf_task *task; + + task = calloc(1, sizeof(*task)); + if (task == NULL) { + return NULL; + } + + task->buf = spdk_dma_zmalloc(g_io_size_bytes, 0x200, NULL); + if (task->buf == NULL) { + free(task); + return NULL; + } + + task->dev = dev; + + return task; +} + +static void +free_task(struct perf_task *task) +{ + spdk_dma_free(task->buf); + free(task); +} + +static void io_complete(void *ctx, const struct spdk_nvme_cpl *completion); + +static void +submit_single_io(struct perf_task *task) +{ + struct dev_ctx *dev = task->dev; + uint64_t offset_in_ios; + int rc; + + offset_in_ios = dev->offset_in_ios++; + if (dev->offset_in_ios == dev->size_in_ios) { + dev->offset_in_ios = 0; + } + + rc = spdk_nvme_ns_cmd_read(dev->ns, dev->qpair, task->buf, + offset_in_ios * dev->io_size_blocks, + dev->io_size_blocks, io_complete, task, 0); + + if (rc != 0) { + fprintf(stderr, "starting I/O failed\n"); + free_task(task); + } else { + dev->current_queue_depth++; + } +} + 
+static void +task_complete(struct perf_task *task) +{ + struct dev_ctx *dev; + + dev = task->dev; + dev->current_queue_depth--; + dev->io_completed++; + + /* + * is_draining indicates when time has expired for the test run + * and we are just waiting for the previously submitted I/O + * to complete. In this case, do not submit a new I/O to replace + * the one just completed. + */ + if (!dev->is_draining && !dev->is_removed) { + submit_single_io(task); + } else { + free_task(task); + } +} + +static void +io_complete(void *ctx, const struct spdk_nvme_cpl *completion) +{ + task_complete((struct perf_task *)ctx); +} + +static void +check_io(struct dev_ctx *dev) +{ + spdk_nvme_qpair_process_completions(dev->qpair, 0); +} + +static void +submit_io(struct dev_ctx *dev, int queue_depth) +{ + struct perf_task *task; + + while (queue_depth-- > 0) { + task = alloc_task(dev); + if (task == NULL) { + fprintf(stderr, "task allocation failed\n"); + exit(1); + } + + submit_single_io(task); + } +} + +static void +drain_io(struct dev_ctx *dev) +{ + dev->is_draining = true; + while (dev->current_queue_depth > 0) { + check_io(dev); + } +} + +static void +print_stats(void) +{ + struct dev_ctx *dev; + + TAILQ_FOREACH(dev, &g_devs, tailq) { + fprintf(stderr, "%-43.43s: %10" PRIu64 " I/Os completed (+%" PRIu64 ")\n", + dev->name, + dev->io_completed, + dev->io_completed - dev->prev_io_completed); + dev->prev_io_completed = dev->io_completed; + } + + fprintf(stderr, "\n"); +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + fprintf(stderr, "Attaching to %s\n", trid->traddr); + + return true; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + fprintf(stderr, "Attached to %s\n", trid->traddr); + + register_dev(ctrlr); +} + +static void +remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) +{ + struct 
dev_ctx *dev; + + TAILQ_FOREACH(dev, &g_devs, tailq) { + if (dev->ctrlr == ctrlr) { + /* + * Mark the device as removed, but don't detach yet. + * + * The I/O handling code will detach once it sees that + * is_removed is true and all outstanding I/O have been completed. + */ + dev->is_removed = true; + fprintf(stderr, "Controller removed: %s\n", dev->name); + return; + } + } + + /* + * If we get here, this remove_cb is for a controller that we are not tracking + * in g_devs (for example, because we skipped it during register_dev), + * so immediately detach it. + */ + spdk_nvme_detach(ctrlr); +} + +static void +timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, uint16_t cid) +{ + /* leave hotplug monitor loop, use the timeout_cb to monitor the hotplug */ + if (spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, remove_cb) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed\n"); + } +} + +static void +io_loop(void) +{ + struct dev_ctx *dev, *dev_tmp; + uint64_t tsc_end; + uint64_t next_stats_tsc; + + tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate; + next_stats_tsc = spdk_get_ticks(); + + while (1) { + uint64_t now; + + /* + * Check for completed I/O for each controller. A new + * I/O will be submitted in the io_complete callback + * to replace each I/O that is completed. + */ + TAILQ_FOREACH(dev, &g_devs, tailq) { + if (dev->is_new) { + /* Submit initial I/O for this controller. */ + submit_io(dev, g_queue_depth); + dev->is_new = false; + } + + check_io(dev); + } + + /* + * Check for hotplug events. + */ + if (spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, remove_cb) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed\n"); + break; + } + + /* + * Check for devices which were hot-removed and have finished + * processing outstanding I/Os. + * + * unregister_dev() may remove devs from the list, so use the + * removal-safe iterator. 
+ */ + TAILQ_FOREACH_SAFE(dev, &g_devs, tailq, dev_tmp) { + if (dev->is_removed && dev->current_queue_depth == 0) { + g_removal_times++; + unregister_dev(dev); + } + } + + now = spdk_get_ticks(); + if (now > tsc_end) { + break; + } + if (now > next_stats_tsc) { + print_stats(); + next_stats_tsc += g_tsc_rate; + } + + if (g_insert_times == g_expected_insert_times && g_removal_times == g_expected_removal_times) { + break; + } + } + + TAILQ_FOREACH_SAFE(dev, &g_devs, tailq, dev_tmp) { + drain_io(dev); + unregister_dev(dev); + } +} + +static void usage(char *program_name) +{ + printf("%s options", program_name); + printf("\n"); + printf("\t[-c timeout for each command in second(default:1s)]\n"); + printf("\t[-i shm id (optional)]\n"); + printf("\t[-n expected hot insert times]\n"); + printf("\t[-r expected hot removal times]\n"); + printf("\t[-t time in seconds]\n"); +} + +static int +parse_args(int argc, char **argv) +{ + int op; + long int val; + + /* default value */ + g_time_in_sec = 0; + + while ((op = getopt(argc, argv, "c:i:n:r:t:")) != -1) { + if (op == '?') { + usage(argv[0]); + return 1; + } + + val = spdk_strtol(optarg, 10); + if (val < 0) { + fprintf(stderr, "Converting a string to integer failed\n"); + return val; + } + switch (op) { + case 'c': + g_timeout_in_us = val * SPDK_SEC_TO_USEC; + break; + case 'i': + g_shm_id = val; + break; + case 'n': + g_expected_insert_times = val; + break; + case 'r': + g_expected_removal_times = val; + break; + case 't': + g_time_in_sec = val; + break; + default: + usage(argv[0]); + return 1; + } + } + + if (!g_time_in_sec) { + usage(argv[0]); + return 1; + } + + return 0; +} + + +static int +register_controllers(void) +{ + fprintf(stderr, "Initializing NVMe Controllers\n"); + + if (spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, remove_cb) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed\n"); + return 1; + } + /* Reset g_insert_times to 0 so that we do not count controllers attached at start as hotplug events. 
*/ + g_insert_times = 0; + return 0; +} + +int main(int argc, char **argv) +{ + int rc; + struct spdk_env_opts opts; + + rc = parse_args(argc, argv); + if (rc != 0) { + return rc; + } + + spdk_env_opts_init(&opts); + opts.name = "hotplug"; + opts.core_mask = "0x1"; + if (g_shm_id > -1) { + opts.shm_id = g_shm_id; + } + if (spdk_env_init(&opts) < 0) { + fprintf(stderr, "Unable to initialize SPDK env\n"); + return 1; + } + + g_tsc_rate = spdk_get_ticks_hz(); + + /* Detect the controllers that are plugged in at startup. */ + if (register_controllers() != 0) { + return 1; + } + + fprintf(stderr, "Initialization complete. Starting I/O...\n"); + io_loop(); + + if (g_expected_insert_times != -1 && g_insert_times != g_expected_insert_times) { + fprintf(stderr, "Expected inserts %d != actual inserts %d\n", + g_expected_insert_times, g_insert_times); + return 1; + } + + if (g_expected_removal_times != -1 && g_removal_times != g_expected_removal_times) { + fprintf(stderr, "Expected removals %d != actual removals %d\n", + g_expected_removal_times, g_removal_times); + return 1; + } + + return 0; +} diff --git a/src/spdk/examples/nvme/identify/.gitignore b/src/spdk/examples/nvme/identify/.gitignore new file mode 100644 index 000000000..5c5444c1e --- /dev/null +++ b/src/spdk/examples/nvme/identify/.gitignore @@ -0,0 +1 @@ +identify diff --git a/src/spdk/examples/nvme/identify/Makefile b/src/spdk/examples/nvme/identify/Makefile new file mode 100644 index 000000000..ed7aa60a8 --- /dev/null +++ b/src/spdk/examples/nvme/identify/Makefile @@ -0,0 +1,44 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) + +APP = identify + +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk + +install: $(APP) + $(INSTALL_EXAMPLE) + +uninstall: + $(UNINSTALL_EXAMPLE) diff --git a/src/spdk/examples/nvme/identify/identify.c b/src/spdk/examples/nvme/identify/identify.c new file mode 100644 index 000000000..722f8d3ee --- /dev/null +++ b/src/spdk/examples/nvme/identify/identify.c @@ -0,0 +1,1827 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/endian.h" +#include "spdk/log.h" +#include "spdk/nvme.h" +#include "spdk/vmd.h" +#include "spdk/nvme_ocssd.h" +#include "spdk/env.h" +#include "spdk/nvme_intel.h" +#include "spdk/nvmf_spec.h" +#include "spdk/pci_ids.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/uuid.h" + +#define MAX_DISCOVERY_LOG_ENTRIES ((uint64_t)1000) + +#define NUM_CHUNK_INFO_ENTRIES 8 + +static int outstanding_commands; + +struct feature { + uint32_t result; + bool valid; +}; + +static struct feature features[256] = {}; + +static struct spdk_nvme_error_information_entry error_page[256]; + +static struct spdk_nvme_health_information_page health_page; + +static struct spdk_nvme_firmware_page firmware_page; + +static struct spdk_nvme_cmds_and_effect_log_page cmd_effects_log_page; + +static struct spdk_nvme_intel_smart_information_page intel_smart_page; + +static struct spdk_nvme_intel_temperature_page intel_temperature_page; + +static struct spdk_nvme_intel_marketing_description_page intel_md_page; + +static struct spdk_nvmf_discovery_log_page *g_discovery_page; +static size_t g_discovery_page_size; +static uint64_t g_discovery_page_numrec; + +static struct spdk_ocssd_geometry_data geometry_data; + +static struct spdk_ocssd_chunk_information_entry g_ocssd_chunk_info_page[NUM_CHUNK_INFO_ENTRIES ]; + +static bool g_hex_dump = false; + +static int g_shm_id = -1; + +static int g_dpdk_mem = 0; + +static int g_master_core = 0; + +static char g_core_mask[16] = "0x1"; + +static struct spdk_nvme_transport_id g_trid; + +static int g_controllers_found = 0; + +static bool g_vmd = false; + +static void +hex_dump(const void *data, size_t size) +{ + size_t offset = 0, i; + const uint8_t *bytes = data; + + while (size) { + printf("%08zX:", offset); + + for (i = 0; i < 16; i++) { + if (i == 8) { + printf("-"); + } else { + printf(" "); + } + + if (i < size) { + printf("%02X", bytes[offset + i]); + } else { + printf(" "); + } + } + + 
printf(" "); + + for (i = 0; i < 16; i++) { + if (i < size) { + if (bytes[offset + i] > 0x20 && bytes[offset + i] < 0x7F) { + printf("%c", bytes[offset + i]); + } else { + printf("."); + } + } + } + + printf("\n"); + + offset += 16; + if (size > 16) { + size -= 16; + } else { + break; + } + } +} + +static void +get_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl) +{ + struct feature *feature = cb_arg; + int fid = feature - features; + + if (spdk_nvme_cpl_is_error(cpl)) { + printf("get_feature(0x%02X) failed\n", fid); + } else { + feature->result = cpl->cdw0; + feature->valid = true; + } + outstanding_commands--; +} + +static void +get_log_page_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl) +{ + if (spdk_nvme_cpl_is_error(cpl)) { + printf("get log page failed\n"); + } + outstanding_commands--; +} + +static void +get_ocssd_geometry_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl) +{ + if (spdk_nvme_cpl_is_error(cpl)) { + printf("get ocssd geometry failed\n"); + } + outstanding_commands--; +} + +static int +get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t fid) +{ + struct spdk_nvme_cmd cmd = {}; + struct feature *feature = &features[fid]; + + feature->valid = false; + + cmd.opc = SPDK_NVME_OPC_GET_FEATURES; + cmd.cdw10_bits.get_features.fid = fid; + + return spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &cmd, NULL, 0, get_feature_completion, feature); +} + +static void +get_features(struct spdk_nvme_ctrlr *ctrlr) +{ + size_t i; + + uint8_t features_to_get[] = { + SPDK_NVME_FEAT_ARBITRATION, + SPDK_NVME_FEAT_POWER_MANAGEMENT, + SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD, + SPDK_NVME_FEAT_ERROR_RECOVERY, + SPDK_NVME_FEAT_NUMBER_OF_QUEUES, + SPDK_OCSSD_FEAT_MEDIA_FEEDBACK, + }; + + /* Submit several GET FEATURES commands and wait for them to complete */ + outstanding_commands = 0; + for (i = 0; i < SPDK_COUNTOF(features_to_get); i++) { + if (!spdk_nvme_ctrlr_is_ocssd_supported(ctrlr) && + features_to_get[i] == SPDK_OCSSD_FEAT_MEDIA_FEEDBACK) { + 
continue; + } + if (get_feature(ctrlr, features_to_get[i]) == 0) { + outstanding_commands++; + } else { + printf("get_feature(0x%02X) failed to submit command\n", features_to_get[i]); + } + } + + while (outstanding_commands) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr); + } +} + +static int +get_error_log_page(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_ctrlr_data *cdata; + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_ERROR, + SPDK_NVME_GLOBAL_NS_TAG, error_page, + sizeof(*error_page) * (cdata->elpe + 1), + 0, + get_log_page_completion, NULL)) { + printf("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + + return 0; +} + +static int +get_health_log_page(struct spdk_nvme_ctrlr *ctrlr) +{ + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_HEALTH_INFORMATION, + SPDK_NVME_GLOBAL_NS_TAG, &health_page, sizeof(health_page), 0, get_log_page_completion, NULL)) { + printf("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + + return 0; +} + +static int +get_firmware_log_page(struct spdk_nvme_ctrlr *ctrlr) +{ + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_FIRMWARE_SLOT, + SPDK_NVME_GLOBAL_NS_TAG, &firmware_page, sizeof(firmware_page), 0, get_log_page_completion, NULL)) { + printf("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + + return 0; +} + +static int +get_cmd_effects_log_page(struct spdk_nvme_ctrlr *ctrlr) +{ + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_COMMAND_EFFECTS_LOG, + SPDK_NVME_GLOBAL_NS_TAG, &cmd_effects_log_page, sizeof(cmd_effects_log_page), 0, + get_log_page_completion, NULL)) { + printf("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + + return 0; +} + +static int +get_intel_smart_log_page(struct spdk_nvme_ctrlr *ctrlr) +{ + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_INTEL_LOG_SMART, SPDK_NVME_GLOBAL_NS_TAG, + &intel_smart_page, sizeof(intel_smart_page), 0, 
get_log_page_completion, NULL)) { + printf("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + + return 0; +} + +static int +get_intel_temperature_log_page(struct spdk_nvme_ctrlr *ctrlr) +{ + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_INTEL_LOG_TEMPERATURE, + SPDK_NVME_GLOBAL_NS_TAG, &intel_temperature_page, sizeof(intel_temperature_page), 0, + get_log_page_completion, NULL)) { + printf("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + return 0; +} + +static int +get_intel_md_log_page(struct spdk_nvme_ctrlr *ctrlr) +{ + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_INTEL_MARKETING_DESCRIPTION, + SPDK_NVME_GLOBAL_NS_TAG, &intel_md_page, sizeof(intel_md_page), 0, + get_log_page_completion, NULL)) { + printf("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + return 0; +} + +static void +get_discovery_log_page_header_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvmf_discovery_log_page *new_discovery_page; + struct spdk_nvme_ctrlr *ctrlr = cb_arg; + uint16_t recfmt; + uint64_t remaining; + uint64_t offset; + + outstanding_commands--; + if (spdk_nvme_cpl_is_error(cpl)) { + /* Return without printing anything - this may not be a discovery controller */ + free(g_discovery_page); + g_discovery_page = NULL; + return; + } + + /* Got the first 4K of the discovery log page */ + recfmt = from_le16(&g_discovery_page->recfmt); + if (recfmt != 0) { + printf("Unrecognized discovery log record format %" PRIu16 "\n", recfmt); + return; + } + + g_discovery_page_numrec = from_le64(&g_discovery_page->numrec); + + /* Pick an arbitrary limit to avoid ridiculously large buffer size. 
*/ + if (g_discovery_page_numrec > MAX_DISCOVERY_LOG_ENTRIES) { + printf("Discovery log has %" PRIu64 " entries - limiting to %" PRIu64 ".\n", + g_discovery_page_numrec, MAX_DISCOVERY_LOG_ENTRIES); + g_discovery_page_numrec = MAX_DISCOVERY_LOG_ENTRIES; + } + + /* + * Now that we now how many entries should be in the log page, we can allocate + * the full log page buffer. + */ + g_discovery_page_size += g_discovery_page_numrec * sizeof(struct + spdk_nvmf_discovery_log_page_entry); + new_discovery_page = realloc(g_discovery_page, g_discovery_page_size); + if (new_discovery_page == NULL) { + free(g_discovery_page); + printf("Discovery page allocation failed!\n"); + return; + } + + g_discovery_page = new_discovery_page; + + /* Retrieve the rest of the discovery log page */ + offset = offsetof(struct spdk_nvmf_discovery_log_page, entries); + remaining = g_discovery_page_size - offset; + while (remaining) { + uint32_t size; + + /* Retrieve up to 4 KB at a time */ + size = spdk_min(remaining, 4096); + + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_DISCOVERY, + 0, (char *)g_discovery_page + offset, size, offset, + get_log_page_completion, NULL)) { + printf("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + + offset += size; + remaining -= size; + outstanding_commands++; + } +} + +static int +get_discovery_log_page(struct spdk_nvme_ctrlr *ctrlr) +{ + /* Allocate the initial discovery log page buffer - this will be resized later. 
*/ + g_discovery_page_size = sizeof(*g_discovery_page); + g_discovery_page = calloc(1, g_discovery_page_size); + if (g_discovery_page == NULL) { + printf("Discovery log page allocation failed!\n"); + exit(1); + } + + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_DISCOVERY, + 0, g_discovery_page, g_discovery_page_size, 0, + get_discovery_log_page_header_completion, ctrlr)) { + printf("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + + return 0; +} + +static void +get_log_pages(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_ctrlr_data *cdata; + outstanding_commands = 0; + bool is_discovery = spdk_nvme_ctrlr_is_discovery(ctrlr); + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + if (!is_discovery) { + /* + * Only attempt to retrieve the following log pages + * when the NVM subsystem that's being targeted is + * NOT the Discovery Controller which only fields + * a Discovery Log Page. + */ + if (get_error_log_page(ctrlr) == 0) { + outstanding_commands++; + } else { + printf("Get Error Log Page failed\n"); + } + + if (get_health_log_page(ctrlr) == 0) { + outstanding_commands++; + } else { + printf("Get Log Page (SMART/health) failed\n"); + } + + if (get_firmware_log_page(ctrlr) == 0) { + outstanding_commands++; + } else { + printf("Get Log Page (Firmware Slot Information) failed\n"); + } + } + + if (cdata->lpa.celp) { + if (get_cmd_effects_log_page(ctrlr) == 0) { + outstanding_commands++; + } else { + printf("Get Log Page (Commands Supported and Effects) failed\n"); + } + } + + if (cdata->vid == SPDK_PCI_VID_INTEL) { + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr, SPDK_NVME_INTEL_LOG_SMART)) { + if (get_intel_smart_log_page(ctrlr) == 0) { + outstanding_commands++; + } else { + printf("Get Log Page (Intel SMART/health) failed\n"); + } + } + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr, SPDK_NVME_INTEL_LOG_TEMPERATURE)) { + if (get_intel_temperature_log_page(ctrlr) == 0) { + outstanding_commands++; + } else { + printf("Get Log 
Page (Intel temperature) failed\n"); + } + } + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr, SPDK_NVME_INTEL_MARKETING_DESCRIPTION)) { + if (get_intel_md_log_page(ctrlr) == 0) { + outstanding_commands++; + } else { + printf("Get Log Page (Intel Marketing Description) failed\n"); + } + } + + } + + if (is_discovery && (get_discovery_log_page(ctrlr) == 0)) { + outstanding_commands++; + } + + while (outstanding_commands) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr); + } +} + +static int +get_ocssd_chunk_info_log_page(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); + int nsid = spdk_nvme_ns_get_id(ns); + outstanding_commands = 0; + + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_OCSSD_LOG_CHUNK_INFO, + nsid, &g_ocssd_chunk_info_page, sizeof(g_ocssd_chunk_info_page), 0, + get_log_page_completion, NULL) == 0) { + outstanding_commands++; + } else { + printf("get_ocssd_chunk_info_log_page() failed\n"); + return -1; + } + + while (outstanding_commands) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr); + } + + return 0; +} + +static void +get_ocssd_geometry(struct spdk_nvme_ns *ns, struct spdk_ocssd_geometry_data *geometry_data) +{ + struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); + int nsid = spdk_nvme_ns_get_id(ns); + outstanding_commands = 0; + + if (spdk_nvme_ocssd_ctrlr_cmd_geometry(ctrlr, nsid, geometry_data, + sizeof(*geometry_data), get_ocssd_geometry_completion, NULL)) { + printf("Get OpenChannel SSD geometry failed\n"); + exit(1); + } else { + outstanding_commands++; + } + + while (outstanding_commands) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr); + } +} + +static void +print_hex_be(const void *v, size_t size) +{ + const uint8_t *buf = v; + + while (size--) { + printf("%02X", *buf++); + } +} + +static void +print_uint128_hex(uint64_t *v) +{ + unsigned long long lo = v[0], hi = v[1]; + if (hi) { + printf("0x%llX%016llX", hi, lo); + } else { + printf("0x%llX", lo); + } +} + +static void 
+print_uint128_dec(uint64_t *v) +{ + unsigned long long lo = v[0], hi = v[1]; + if (hi) { + /* can't handle large (>64-bit) decimal values for now, so fall back to hex */ + print_uint128_hex(v); + } else { + printf("%llu", (unsigned long long)lo); + } +} + +/* The len should be <= 8. */ +static void +print_uint_var_dec(uint8_t *array, unsigned int len) +{ + uint64_t result = 0; + int i = len; + + while (i > 0) { + result += (uint64_t)array[i - 1] << (8 * (i - 1)); + i--; + } + printf("%lu", result); +} + +/* Print ASCII string as defined by the NVMe spec */ +static void +print_ascii_string(const void *buf, size_t size) +{ + const uint8_t *str = buf; + + /* Trim trailing spaces */ + while (size > 0 && str[size - 1] == ' ') { + size--; + } + + while (size--) { + if (*str >= 0x20 && *str <= 0x7E) { + printf("%c", *str); + } else { + printf("."); + } + str++; + } +} + +static void +print_ocssd_chunk_info(struct spdk_ocssd_chunk_information_entry *chk_info, int chk_num) +{ + int i; + char *cs_str, *ct_str; + + printf("OCSSD Chunk Info Glance\n"); + printf("======================\n"); + + for (i = 0; i < chk_num; i++) { + cs_str = chk_info[i].cs.free ? "Free" : + chk_info[i].cs.closed ? "Closed" : + chk_info[i].cs.open ? "Open" : + chk_info[i].cs.offline ? "Offline" : "Unknown"; + ct_str = chk_info[i].ct.seq_write ? "Sequential Write" : + chk_info[i].ct.rnd_write ? "Random Write" : "Unknown"; + + printf("------------\n"); + printf("Chunk index: %d\n", i); + printf("Chunk state: %s(0x%x)\n", cs_str, *(uint8_t *) & (chk_info[i].cs)); + printf("Chunk type (write mode): %s\n", ct_str); + printf("Chunk type (size_deviate): %s\n", chk_info[i].ct.size_deviate ? 
"Yes" : "No"); + printf("Wear-level Index: %d\n", chk_info[i].wli); + printf("Starting LBA: %ld\n", chk_info[i].slba); + printf("Number of blocks in chunk: %ld\n", chk_info[i].cnlb); + printf("Write Pointer: %ld\n", chk_info[i].wp); + } +} + +static void +print_ocssd_geometry(struct spdk_ocssd_geometry_data *geometry_data) +{ + printf("Namespace OCSSD Geometry\n"); + printf("=======================\n"); + + if (geometry_data->mjr < 2) { + printf("Open-Channel Spec version is less than 2.0\n"); + printf("OC version: maj:%d\n", geometry_data->mjr); + return; + } + + printf("OC version: maj:%d min:%d\n", geometry_data->mjr, geometry_data->mnr); + printf("LBA format:\n"); + printf(" Group bits: %d\n", geometry_data->lbaf.grp_len); + printf(" PU bits: %d\n", geometry_data->lbaf.pu_len); + printf(" Chunk bits: %d\n", geometry_data->lbaf.chk_len); + printf(" Logical block bits: %d\n", geometry_data->lbaf.lbk_len); + + printf("Media and Controller Capabilities:\n"); + printf(" Namespace supports Vector Chunk Copy: %s\n", + geometry_data->mccap.vec_chk_cpy ? "Supported" : "Not Supported"); + printf(" Namespace supports multiple resets a free chunk: %s\n", + geometry_data->mccap.multi_reset ? 
"Supported" : "Not Supported"); + + printf("Wear-level Index Delta Threshold: %d\n", geometry_data->wit); + printf("Groups (channels): %d\n", geometry_data->num_grp); + printf("PUs (LUNs) per group: %d\n", geometry_data->num_pu); + printf("Chunks per LUN: %d\n", geometry_data->num_chk); + printf("Logical blks per chunk: %d\n", geometry_data->clba); + printf("MIN write size: %d\n", geometry_data->ws_min); + printf("OPT write size: %d\n", geometry_data->ws_opt); + printf("Cache min write size: %d\n", geometry_data->mw_cunits); + printf("Max open chunks: %d\n", geometry_data->maxoc); + printf("Max open chunks per PU: %d\n", geometry_data->maxocpu); + printf("\n"); +} + +static void +print_namespace(struct spdk_nvme_ns *ns) +{ + const struct spdk_nvme_ns_data *nsdata; + const struct spdk_uuid *uuid; + uint32_t i; + uint32_t flags; + char uuid_str[SPDK_UUID_STRING_LEN]; + uint32_t blocksize; + + nsdata = spdk_nvme_ns_get_data(ns); + flags = spdk_nvme_ns_get_flags(ns); + + printf("Namespace ID:%d\n", spdk_nvme_ns_get_id(ns)); + + if (g_hex_dump) { + hex_dump(nsdata, sizeof(*nsdata)); + printf("\n"); + } + + /* This function is only called for active namespaces. */ + assert(spdk_nvme_ns_is_active(ns)); + + printf("Deallocate: %s\n", + (flags & SPDK_NVME_NS_DEALLOCATE_SUPPORTED) ? "Supported" : "Not Supported"); + printf("Deallocated/Unwritten Error: %s\n", + nsdata->nsfeat.dealloc_or_unwritten_error ? "Supported" : "Not Supported"); + printf("Deallocated Read Value: %s\n", + nsdata->dlfeat.bits.read_value == SPDK_NVME_DEALLOC_READ_00 ? "All 0x00" : + nsdata->dlfeat.bits.read_value == SPDK_NVME_DEALLOC_READ_FF ? "All 0xFF" : + "Unknown"); + printf("Deallocate in Write Zeroes: %s\n", + nsdata->dlfeat.bits.write_zero_deallocate ? "Supported" : "Not Supported"); + printf("Deallocated Guard Field: %s\n", + nsdata->dlfeat.bits.guard_value ? "CRC for Read Value" : "0xFFFF"); + printf("Flush: %s\n", + (flags & SPDK_NVME_NS_FLUSH_SUPPORTED) ? 
"Supported" : "Not Supported"); + printf("Reservation: %s\n", + (flags & SPDK_NVME_NS_RESERVATION_SUPPORTED) ? "Supported" : "Not Supported"); + if (flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) { + printf("End-to-End Data Protection: Supported\n"); + printf("Protection Type: Type%d\n", nsdata->dps.pit); + printf("Protection Information Transferred as: %s\n", + nsdata->dps.md_start ? "First 8 Bytes" : "Last 8 Bytes"); + } + if (nsdata->lbaf[nsdata->flbas.format].ms > 0) { + printf("Metadata Transferred as: %s\n", + nsdata->flbas.extended ? "Extended Data LBA" : "Separate Metadata Buffer"); + } + printf("Namespace Sharing Capabilities: %s\n", + nsdata->nmic.can_share ? "Multiple Controllers" : "Private"); + blocksize = 1 << nsdata->lbaf[nsdata->flbas.format].lbads; + printf("Size (in LBAs): %lld (%lldGiB)\n", + (long long)nsdata->nsze, + (long long)nsdata->nsze * blocksize / 1024 / 1024 / 1024); + printf("Capacity (in LBAs): %lld (%lldGiB)\n", + (long long)nsdata->ncap, + (long long)nsdata->ncap * blocksize / 1024 / 1024 / 1024); + printf("Utilization (in LBAs): %lld (%lldGiB)\n", + (long long)nsdata->nuse, + (long long)nsdata->nuse * blocksize / 1024 / 1024 / 1024); + if (nsdata->noiob) { + printf("Optimal I/O Boundary: %u blocks\n", nsdata->noiob); + } + if (!spdk_mem_all_zero(nsdata->nguid, sizeof(nsdata->nguid))) { + printf("NGUID: "); + print_hex_be(nsdata->nguid, sizeof(nsdata->nguid)); + printf("\n"); + } + if (!spdk_mem_all_zero(&nsdata->eui64, sizeof(nsdata->eui64))) { + printf("EUI64: "); + print_hex_be(&nsdata->eui64, sizeof(nsdata->eui64)); + printf("\n"); + } + uuid = spdk_nvme_ns_get_uuid(ns); + if (uuid) { + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), uuid); + printf("UUID: %s\n", uuid_str); + } + printf("Thin Provisioning: %s\n", + nsdata->nsfeat.thin_prov ? "Supported" : "Not Supported"); + printf("Per-NS Atomic Units: %s\n", + nsdata->nsfeat.ns_atomic_write_unit ? 
"Yes" : "No"); + if (nsdata->nsfeat.ns_atomic_write_unit) { + if (nsdata->nawun) { + printf(" Atomic Write Unit (Normal): %d\n", nsdata->nawun + 1); + } + + if (nsdata->nawupf) { + printf(" Atomic Write Unit (PFail): %d\n", nsdata->nawupf + 1); + } + + if (nsdata->nacwu) { + printf(" Atomic Compare & Write Unit: %d\n", nsdata->nacwu + 1); + } + + printf(" Atomic Boundary Size (Normal): %d\n", nsdata->nabsn); + printf(" Atomic Boundary Size (PFail): %d\n", nsdata->nabspf); + printf(" Atomic Boundary Offset: %d\n", nsdata->nabo); + } + + printf("NGUID/EUI64 Never Reused: %s\n", + nsdata->nsfeat.guid_never_reused ? "Yes" : "No"); + printf("Number of LBA Formats: %d\n", nsdata->nlbaf + 1); + printf("Current LBA Format: LBA Format #%02d\n", + nsdata->flbas.format); + for (i = 0; i <= nsdata->nlbaf; i++) + printf("LBA Format #%02d: Data Size: %5d Metadata Size: %5d\n", + i, 1 << nsdata->lbaf[i].lbads, nsdata->lbaf[i].ms); + printf("\n"); + + if (spdk_nvme_ctrlr_is_ocssd_supported(spdk_nvme_ns_get_ctrlr(ns))) { + get_ocssd_geometry(ns, &geometry_data); + print_ocssd_geometry(&geometry_data); + get_ocssd_chunk_info_log_page(ns); + print_ocssd_chunk_info(g_ocssd_chunk_info_page, NUM_CHUNK_INFO_ENTRIES); + } +} + +static const char * +admin_opc_name(uint8_t opc) +{ + switch (opc) { + case SPDK_NVME_OPC_DELETE_IO_SQ: + return "Delete I/O Submission Queue"; + case SPDK_NVME_OPC_CREATE_IO_SQ: + return "Create I/O Submission Queue"; + case SPDK_NVME_OPC_GET_LOG_PAGE: + return "Get Log Page"; + case SPDK_NVME_OPC_DELETE_IO_CQ: + return "Delete I/O Completion Queue"; + case SPDK_NVME_OPC_CREATE_IO_CQ: + return "Create I/O Completion Queue"; + case SPDK_NVME_OPC_IDENTIFY: + return "Identify"; + case SPDK_NVME_OPC_ABORT: + return "Abort"; + case SPDK_NVME_OPC_SET_FEATURES: + return "Set Features"; + case SPDK_NVME_OPC_GET_FEATURES: + return "Get Features"; + case SPDK_NVME_OPC_ASYNC_EVENT_REQUEST: + return "Asynchronous Event Request"; + case SPDK_NVME_OPC_NS_MANAGEMENT: + return 
"Namespace Management"; + case SPDK_NVME_OPC_FIRMWARE_COMMIT: + return "Firmware Commit"; + case SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD: + return "Firmware Image Download"; + case SPDK_NVME_OPC_DEVICE_SELF_TEST: + return "Device Self-test"; + case SPDK_NVME_OPC_NS_ATTACHMENT: + return "Namespace Attachment"; + case SPDK_NVME_OPC_KEEP_ALIVE: + return "Keep Alive"; + case SPDK_NVME_OPC_DIRECTIVE_SEND: + return "Directive Send"; + case SPDK_NVME_OPC_DIRECTIVE_RECEIVE: + return "Directive Receive"; + case SPDK_NVME_OPC_VIRTUALIZATION_MANAGEMENT: + return "Virtualization Management"; + case SPDK_NVME_OPC_NVME_MI_SEND: + return "NVMe-MI Send"; + case SPDK_NVME_OPC_NVME_MI_RECEIVE: + return "NVMe-MI Receive"; + case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: + return "Doorbell Buffer Config"; + case SPDK_NVME_OPC_FORMAT_NVM: + return "Format NVM"; + case SPDK_NVME_OPC_SECURITY_SEND: + return "Security Send"; + case SPDK_NVME_OPC_SECURITY_RECEIVE: + return "Security Receive"; + case SPDK_NVME_OPC_SANITIZE: + return "Sanitize"; + default: + if (opc >= 0xC0) { + return "Vendor specific"; + } + return "Unknown"; + } +} + +static const char * +io_opc_name(uint8_t opc) +{ + switch (opc) { + case SPDK_NVME_OPC_FLUSH: + return "Flush"; + case SPDK_NVME_OPC_WRITE: + return "Write"; + case SPDK_NVME_OPC_READ: + return "Read"; + case SPDK_NVME_OPC_WRITE_UNCORRECTABLE: + return "Write Uncorrectable"; + case SPDK_NVME_OPC_COMPARE: + return "Compare"; + case SPDK_NVME_OPC_WRITE_ZEROES: + return "Write Zeroes"; + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + return "Dataset Management"; + case SPDK_NVME_OPC_RESERVATION_REGISTER: + return "Reservation Register"; + case SPDK_NVME_OPC_RESERVATION_REPORT: + return "Reservation Report"; + case SPDK_NVME_OPC_RESERVATION_ACQUIRE: + return "Reservation Acquire"; + case SPDK_NVME_OPC_RESERVATION_RELEASE: + return "Reservation Release"; + default: + if (opc >= 0x80) { + return "Vendor specific"; + } + return "Unknown"; + } +} + +static void 
+print_controller(struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_transport_id *trid) +{ + const struct spdk_nvme_ctrlr_data *cdata; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + union spdk_nvme_cmbsz_register cmbsz; + uint8_t str[512]; + uint32_t i; + struct spdk_nvme_error_information_entry *error_entry; + struct spdk_pci_addr pci_addr; + struct spdk_pci_device *pci_dev; + struct spdk_pci_id pci_id; + uint32_t nsid; + + cap = spdk_nvme_ctrlr_get_regs_cap(ctrlr); + vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); + cmbsz = spdk_nvme_ctrlr_get_regs_cmbsz(ctrlr); + + if (!spdk_nvme_ctrlr_is_discovery(ctrlr)) { + /* + * Discovery Controller only supports the + * IDENTIFY and GET_LOG_PAGE cmd set, so only + * attempt GET_FEATURES when NOT targeting a + * Discovery Controller. + */ + get_features(ctrlr); + } + get_log_pages(ctrlr); + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + printf("=====================================================\n"); + if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) { + printf("NVMe over Fabrics controller at %s:%s: %s\n", + trid->traddr, trid->trsvcid, trid->subnqn); + } else { + if (spdk_pci_addr_parse(&pci_addr, trid->traddr) != 0) { + return; + } + + pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr); + if (!pci_dev) { + return; + } + + pci_id = spdk_pci_device_get_id(pci_dev); + + printf("NVMe Controller at %04x:%02x:%02x.%x [%04x:%04x]\n", + pci_addr.domain, pci_addr.bus, + pci_addr.dev, pci_addr.func, + pci_id.vendor_id, pci_id.device_id); + } + printf("=====================================================\n"); + + if (g_hex_dump) { + hex_dump(cdata, sizeof(*cdata)); + printf("\n"); + } + + printf("Controller Capabilities/Features\n"); + printf("================================\n"); + printf("Vendor ID: %04x\n", cdata->vid); + printf("Subsystem Vendor ID: %04x\n", cdata->ssvid); + printf("Serial Number: "); + print_ascii_string(cdata->sn, sizeof(cdata->sn)); + printf("\n"); + printf("Model Number: "); + 
print_ascii_string(cdata->mn, sizeof(cdata->mn)); + printf("\n"); + printf("Firmware Version: "); + print_ascii_string(cdata->fr, sizeof(cdata->fr)); + printf("\n"); + printf("Recommended Arb Burst: %d\n", cdata->rab); + printf("IEEE OUI Identifier: %02x %02x %02x\n", + cdata->ieee[0], cdata->ieee[1], cdata->ieee[2]); + printf("Multi-path I/O\n"); + printf(" May have multiple subsystem ports: %s\n", cdata->cmic.multi_port ? "Yes" : "No"); + printf(" May be connected to multiple hosts: %s\n", cdata->cmic.multi_host ? "Yes" : "No"); + printf(" Associated with SR-IOV VF: %s\n", cdata->cmic.sr_iov ? "Yes" : "No"); + printf("Max Data Transfer Size: "); + if (cdata->mdts == 0) { + printf("Unlimited\n"); + } else { + printf("%" PRIu64 "\n", (uint64_t)1 << (12 + cap.bits.mpsmin + cdata->mdts)); + } + printf("Max Number of Namespaces: %d\n", cdata->nn); + if (features[SPDK_NVME_FEAT_ERROR_RECOVERY].valid) { + unsigned tler = features[SPDK_NVME_FEAT_ERROR_RECOVERY].result & 0xFFFF; + printf("Error Recovery Timeout: "); + if (tler == 0) { + printf("Unlimited\n"); + } else { + printf("%u milliseconds\n", tler * 100); + } + } + printf("NVMe Specification Version (VS): %u.%u", vs.bits.mjr, vs.bits.mnr); + if (vs.bits.ter) { + printf(".%u", vs.bits.ter); + } + printf("\n"); + if (cdata->ver.raw != 0) { + printf("NVMe Specification Version (Identify): %u.%u", cdata->ver.bits.mjr, cdata->ver.bits.mnr); + if (cdata->ver.bits.ter) { + printf(".%u", cdata->ver.bits.ter); + } + printf("\n"); + } + + printf("Maximum Queue Entries: %u\n", cap.bits.mqes + 1); + printf("Contiguous Queues Required: %s\n", cap.bits.cqr ? "Yes" : "No"); + printf("Arbitration Mechanisms Supported\n"); + printf(" Weighted Round Robin: %s\n", + cap.bits.ams & SPDK_NVME_CAP_AMS_WRR ? "Supported" : "Not Supported"); + printf(" Vendor Specific: %s\n", + cap.bits.ams & SPDK_NVME_CAP_AMS_VS ? 
"Supported" : "Not Supported"); + printf("Reset Timeout: %" PRIu64 " ms\n", (uint64_t)500 * cap.bits.to); + printf("Doorbell Stride: %" PRIu64 " bytes\n", + (uint64_t)1 << (2 + cap.bits.dstrd)); + printf("NVM Subsystem Reset: %s\n", + cap.bits.nssrs ? "Supported" : "Not Supported"); + printf("Command Sets Supported\n"); + printf(" NVM Command Set: %s\n", + cap.bits.css & SPDK_NVME_CAP_CSS_NVM ? "Supported" : "Not Supported"); + printf("Boot Partition: %s\n", + cap.bits.bps ? "Supported" : "Not Supported"); + printf("Memory Page Size Minimum: %" PRIu64 " bytes\n", + (uint64_t)1 << (12 + cap.bits.mpsmin)); + printf("Memory Page Size Maximum: %" PRIu64 " bytes\n", + (uint64_t)1 << (12 + cap.bits.mpsmax)); + printf("Optional Asynchronous Events Supported\n"); + printf(" Namespace Attribute Notices: %s\n", + cdata->oaes.ns_attribute_notices ? "Supported" : "Not Supported"); + printf(" Firmware Activation Notices: %s\n", + cdata->oaes.fw_activation_notices ? "Supported" : "Not Supported"); + + printf("128-bit Host Identifier: %s\n", + cdata->ctratt.host_id_exhid_supported ? "Supported" : "Not Supported"); + printf("\n"); + + printf("Controller Memory Buffer Support\n"); + printf("================================\n"); + if (cmbsz.raw != 0) { + uint64_t size = cmbsz.bits.sz; + + /* Convert the size to bytes by multiplying by the granularity. + By spec, szu is at most 6 and sz is 20 bits, so size requires + at most 56 bits. */ + size *= (0x1000 << (cmbsz.bits.szu * 4)); + + printf("Supported: Yes\n"); + printf("Total Size: %lu bytes\n", size); + printf("Submission Queues in CMB: %s\n", + cmbsz.bits.sqs ? "Supported" : "Not Supported"); + printf("Completion Queues in CMB: %s\n", + cmbsz.bits.cqs ? "Supported" : "Not Supported"); + printf("Read data and metadata in CMB %s\n", + cmbsz.bits.rds ? "Supported" : "Not Supported"); + printf("Write data and metadata in CMB: %s\n", + cmbsz.bits.wds ? 
"Supported" : "Not Supported"); + } else { + printf("Supported: No\n"); + } + printf("\n"); + + printf("Admin Command Set Attributes\n"); + printf("============================\n"); + printf("Security Send/Receive: %s\n", + cdata->oacs.security ? "Supported" : "Not Supported"); + printf("Format NVM: %s\n", + cdata->oacs.format ? "Supported" : "Not Supported"); + printf("Firmware Activate/Download: %s\n", + cdata->oacs.firmware ? "Supported" : "Not Supported"); + printf("Namespace Management: %s\n", + cdata->oacs.ns_manage ? "Supported" : "Not Supported"); + printf("Device Self-Test: %s\n", + cdata->oacs.device_self_test ? "Supported" : "Not Supported"); + printf("Directives: %s\n", + cdata->oacs.directives ? "Supported" : "Not Supported"); + printf("NVMe-MI: %s\n", + cdata->oacs.nvme_mi ? "Supported" : "Not Supported"); + printf("Virtualization Management: %s\n", + cdata->oacs.virtualization_management ? "Supported" : "Not Supported"); + printf("Doorbell Buffer Config: %s\n", + cdata->oacs.doorbell_buffer_config ? "Supported" : "Not Supported"); + printf("Abort Command Limit: %d\n", cdata->acl + 1); + printf("Async Event Request Limit: %d\n", cdata->aerl + 1); + printf("Number of Firmware Slots: "); + if (cdata->oacs.firmware != 0) { + printf("%d\n", cdata->frmw.num_slots); + } else { + printf("N/A\n"); + } + printf("Firmware Slot 1 Read-Only: "); + if (cdata->oacs.firmware != 0) { + printf("%s\n", cdata->frmw.slot1_ro ? "Yes" : "No"); + } else { + printf("N/A\n"); + } + if (cdata->fwug == 0x00) { + printf("Firmware Update Granularity: No Information Provided\n"); + } else if (cdata->fwug == 0xFF) { + printf("Firmware Update Granularity: No Restriction\n"); + } else { + printf("Firmware Update Granularity: %u KiB\n", + cdata->fwug * 4); + } + printf("Per-Namespace SMART Log: %s\n", + cdata->lpa.ns_smart ? "Yes" : "No"); + printf("Command Effects Log Page: %s\n", + cdata->lpa.celp ? 
"Supported" : "Not Supported"); + printf("Get Log Page Extended Data: %s\n", + cdata->lpa.edlp ? "Supported" : "Not Supported"); + printf("Telemetry Log Pages: %s\n", + cdata->lpa.telemetry ? "Supported" : "Not Supported"); + printf("Error Log Page Entries Supported: %d\n", cdata->elpe + 1); + if (cdata->kas == 0) { + printf("Keep Alive: Not Supported\n"); + } else { + printf("Keep Alive: Supported\n"); + printf("Keep Alive Granularity: %u ms\n", + cdata->kas * 100); + } + printf("\n"); + + printf("NVM Command Set Attributes\n"); + printf("==========================\n"); + printf("Submission Queue Entry Size\n"); + printf(" Max: %d\n", 1 << cdata->sqes.max); + printf(" Min: %d\n", 1 << cdata->sqes.min); + printf("Completion Queue Entry Size\n"); + printf(" Max: %d\n", 1 << cdata->cqes.max); + printf(" Min: %d\n", 1 << cdata->cqes.min); + printf("Number of Namespaces: %d\n", cdata->nn); + printf("Compare Command: %s\n", + cdata->oncs.compare ? "Supported" : "Not Supported"); + printf("Write Uncorrectable Command: %s\n", + cdata->oncs.write_unc ? "Supported" : "Not Supported"); + printf("Dataset Management Command: %s\n", + cdata->oncs.dsm ? "Supported" : "Not Supported"); + printf("Write Zeroes Command: %s\n", + cdata->oncs.write_zeroes ? "Supported" : "Not Supported"); + printf("Set Features Save Field: %s\n", + cdata->oncs.set_features_save ? "Supported" : "Not Supported"); + printf("Reservations: %s\n", + cdata->oncs.reservations ? "Supported" : "Not Supported"); + printf("Timestamp: %s\n", + cdata->oncs.timestamp ? "Supported" : "Not Supported"); + printf("Volatile Write Cache: %s\n", + cdata->vwc.present ? "Present" : "Not Present"); + printf("Atomic Write Unit (Normal): %d\n", cdata->awun + 1); + printf("Atomic Write Unit (PFail): %d\n", cdata->awupf + 1); + printf("Atomic Compare & Write Unit: %d\n", cdata->acwu + 1); + printf("Fused Compare & Write: %s\n", + cdata->fuses.compare_and_write ? 
"Supported" : "Not Supported"); + printf("Scatter-Gather List\n"); + printf(" SGL Command Set: %s\n", + cdata->sgls.supported == SPDK_NVME_SGLS_SUPPORTED ? "Supported" : + cdata->sgls.supported == SPDK_NVME_SGLS_SUPPORTED_DWORD_ALIGNED ? "Supported (Dword aligned)" : + "Not Supported"); + printf(" SGL Keyed: %s\n", + cdata->sgls.keyed_sgl ? "Supported" : "Not Supported"); + printf(" SGL Bit Bucket Descriptor: %s\n", + cdata->sgls.bit_bucket_descriptor ? "Supported" : "Not Supported"); + printf(" SGL Metadata Pointer: %s\n", + cdata->sgls.metadata_pointer ? "Supported" : "Not Supported"); + printf(" Oversized SGL: %s\n", + cdata->sgls.oversized_sgl ? "Supported" : "Not Supported"); + printf(" SGL Metadata Address: %s\n", + cdata->sgls.metadata_address ? "Supported" : "Not Supported"); + printf(" SGL Offset: %s\n", + cdata->sgls.sgl_offset ? "Supported" : "Not Supported"); + printf(" Transport SGL Data Block: %s\n", + cdata->sgls.transport_sgl ? "Supported" : "Not Supported"); + printf("Replay Protected Memory Block:"); + if (cdata->rpmbs.num_rpmb_units > 0) { + printf(" Supported\n"); + printf(" Number of RPMB Units: %d\n", cdata->rpmbs.num_rpmb_units); + printf(" Authentication Method: %s\n", cdata->rpmbs.auth_method == 0 ? 
"HMAC SHA-256" : "Unknown"); + printf(" Total Size (in 128KB units) = %d\n", cdata->rpmbs.total_size + 1); + printf(" Access Size (in 512B units) = %d\n", cdata->rpmbs.access_size + 1); + } else { + printf(" Not Supported\n"); + } + printf("\n"); + + printf("Firmware Slot Information\n"); + printf("=========================\n"); + if (g_hex_dump) { + hex_dump(&firmware_page, sizeof(firmware_page)); + printf("\n"); + } + printf("Active slot: %u\n", firmware_page.afi.active_slot); + if (firmware_page.afi.next_reset_slot) { + printf("Next controller reset slot: %u\n", firmware_page.afi.next_reset_slot); + } + for (i = 0; i < 7; i++) { + if (!spdk_mem_all_zero(firmware_page.revision[i], sizeof(firmware_page.revision[i]))) { + printf("Slot %u Firmware Revision: ", i + 1); + print_ascii_string(firmware_page.revision[i], sizeof(firmware_page.revision[i])); + printf("\n"); + } + } + printf("\n"); + + if (cdata->lpa.celp) { + printf("Commands Supported and Effects\n"); + printf("==============================\n"); + + if (g_hex_dump) { + hex_dump(&cmd_effects_log_page, sizeof(cmd_effects_log_page)); + printf("\n"); + } + + printf("Admin Commands\n"); + printf("--------------\n"); + for (i = 0; i < SPDK_COUNTOF(cmd_effects_log_page.admin_cmds_supported); i++) { + struct spdk_nvme_cmds_and_effect_entry *cmd = &cmd_effects_log_page.admin_cmds_supported[i]; + if (cmd->csupp) { + printf("%30s (%02Xh): Supported %s%s%s%s%s\n", + admin_opc_name(i), i, + cmd->lbcc ? "LBA-Change " : "", + cmd->ncc ? "NS-Cap-Change " : "", + cmd->nic ? "NS-Inventory-Change " : "", + cmd->ccc ? "Ctrlr-Cap-Change " : "", + cmd->cse == 0 ? "" : cmd->cse == 1 ? "Per-NS-Exclusive" : cmd->cse == 2 ? 
"All-NS-Exclusive" : ""); + } + } + + printf("I/O Commands\n"); + printf("------------\n"); + for (i = 0; i < SPDK_COUNTOF(cmd_effects_log_page.io_cmds_supported); i++) { + struct spdk_nvme_cmds_and_effect_entry *cmd = &cmd_effects_log_page.io_cmds_supported[i]; + if (cmd->csupp) { + printf("%30s (%02Xh): Supported %s%s%s%s%s\n", + io_opc_name(i), i, + cmd->lbcc ? "LBA-Change " : "", + cmd->ncc ? "NS-Cap-Change " : "", + cmd->nic ? "NS-Inventory-Change " : "", + cmd->ccc ? "Ctrlr-Cap-Change " : "", + cmd->cse == 0 ? "" : cmd->cse == 1 ? "Per-NS-Exclusive" : cmd->cse == 2 ? "All-NS-Exclusive" : ""); + } + } + printf("\n"); + } + + printf("Error Log\n"); + printf("=========\n"); + for (i = 0; i <= cdata->elpe; i++) { + error_entry = &error_page[i]; + if (error_entry->error_count == 0) { + continue; + } + if (i != 0) { + printf("-----------\n"); + } + + printf("Entry: %u\n", i); + printf("Error Count: 0x%"PRIx64"\n", error_entry->error_count); + printf("Submission Queue Id: 0x%x\n", error_entry->sqid); + printf("Command Id: 0x%x\n", error_entry->cid); + printf("Phase Bit: %x\n", error_entry->status.p); + printf("Status Code: 0x%x\n", error_entry->status.sc); + printf("Status Code Type: 0x%x\n", error_entry->status.sct); + printf("Do Not Retry: %x\n", error_entry->status.dnr); + printf("Error Location: 0x%x\n", error_entry->error_location); + printf("LBA: 0x%"PRIx64"\n", error_entry->lba); + printf("Namespace: 0x%x\n", error_entry->nsid); + printf("Vendor Log Page: 0x%x\n", error_entry->vendor_specific); + + } + printf("\n"); + + if (features[SPDK_NVME_FEAT_ARBITRATION].valid) { + uint32_t arb = features[SPDK_NVME_FEAT_ARBITRATION].result; + unsigned ab, lpw, mpw, hpw; + + ab = arb & 0x7; + lpw = ((arb >> 8) & 0xFF) + 1; + mpw = ((arb >> 16) & 0xFF) + 1; + hpw = ((arb >> 24) & 0xFF) + 1; + + printf("Arbitration\n"); + printf("===========\n"); + printf("Arbitration Burst: "); + if (ab == 0x7) { + printf("no limit\n"); + } else { + printf("%u\n", 1u << ab); + } + + if 
(cap.bits.ams & SPDK_NVME_CAP_AMS_WRR) { + printf("Low Priority Weight: %u\n", lpw); + printf("Medium Priority Weight: %u\n", mpw); + printf("High Priority Weight: %u\n", hpw); + } + printf("\n"); + } + + if (features[SPDK_NVME_FEAT_POWER_MANAGEMENT].valid) { + unsigned ps = features[SPDK_NVME_FEAT_POWER_MANAGEMENT].result & 0x1F; + printf("Power Management\n"); + printf("================\n"); + printf("Number of Power States: %u\n", cdata->npss + 1); + printf("Current Power State: Power State #%u\n", ps); + for (i = 0; i <= cdata->npss; i++) { + const struct spdk_nvme_power_state *psd = &cdata->psd[i]; + printf("Power State #%u: ", i); + if (psd->mps) { + /* MP scale is 0.0001 W */ + printf("Max Power: %u.%04u W\n", + psd->mp / 10000, + psd->mp % 10000); + } else { + /* MP scale is 0.01 W */ + printf("Max Power: %3u.%02u W\n", + psd->mp / 100, + psd->mp % 100); + } + /* TODO: print other power state descriptor fields */ + } + printf("Non-Operational Permissive Mode: %s\n", + cdata->ctratt.non_operational_power_state_permissive_mode ? "Supported" : "Not Supported"); + printf("\n"); + } + + if (features[SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD].valid) { + printf("Health Information\n"); + printf("==================\n"); + + if (g_hex_dump) { + hex_dump(&health_page, sizeof(health_page)); + printf("\n"); + } + + printf("Critical Warnings:\n"); + printf(" Available Spare Space: %s\n", + health_page.critical_warning.bits.available_spare ? "WARNING" : "OK"); + printf(" Temperature: %s\n", + health_page.critical_warning.bits.temperature ? "WARNING" : "OK"); + printf(" Device Reliability: %s\n", + health_page.critical_warning.bits.device_reliability ? "WARNING" : "OK"); + printf(" Read Only: %s\n", + health_page.critical_warning.bits.read_only ? "Yes" : "No"); + printf(" Volatile Memory Backup: %s\n", + health_page.critical_warning.bits.volatile_memory_backup ? 
"WARNING" : "OK"); + printf("Current Temperature: %u Kelvin (%d Celsius)\n", + health_page.temperature, + (int)health_page.temperature - 273); + printf("Temperature Threshold: %u Kelvin (%d Celsius)\n", + features[SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD].result, + (int)features[SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD].result - 273); + printf("Available Spare: %u%%\n", health_page.available_spare); + printf("Available Spare Threshold: %u%%\n", health_page.available_spare_threshold); + printf("Life Percentage Used: %u%%\n", health_page.percentage_used); + printf("Data Units Read: "); + print_uint128_dec(health_page.data_units_read); + printf("\n"); + printf("Data Units Written: "); + print_uint128_dec(health_page.data_units_written); + printf("\n"); + printf("Host Read Commands: "); + print_uint128_dec(health_page.host_read_commands); + printf("\n"); + printf("Host Write Commands: "); + print_uint128_dec(health_page.host_write_commands); + printf("\n"); + printf("Controller Busy Time: "); + print_uint128_dec(health_page.controller_busy_time); + printf(" minutes\n"); + printf("Power Cycles: "); + print_uint128_dec(health_page.power_cycles); + printf("\n"); + printf("Power On Hours: "); + print_uint128_dec(health_page.power_on_hours); + printf(" hours\n"); + printf("Unsafe Shutdowns: "); + print_uint128_dec(health_page.unsafe_shutdowns); + printf("\n"); + printf("Unrecoverable Media Errors: "); + print_uint128_dec(health_page.media_errors); + printf("\n"); + printf("Lifetime Error Log Entries: "); + print_uint128_dec(health_page.num_error_info_log_entries); + printf("\n"); + printf("Warning Temperature Time: %u minutes\n", health_page.warning_temp_time); + printf("Critical Temperature Time: %u minutes\n", health_page.critical_temp_time); + for (i = 0; i < 8; i++) { + if (health_page.temp_sensor[i] != 0) { + printf("Temperature Sensor %d: %u Kelvin (%d Celsius)\n", + i + 1, health_page.temp_sensor[i], + (int)health_page.temp_sensor[i] - 273); + } + } + printf("\n"); + } + + 
if (features[SPDK_NVME_FEAT_NUMBER_OF_QUEUES].valid) { + uint32_t result = features[SPDK_NVME_FEAT_NUMBER_OF_QUEUES].result; + + printf("Number of Queues\n"); + printf("================\n"); + printf("Number of I/O Submission Queues: %u\n", (result & 0xFFFF) + 1); + printf("Number of I/O Completion Queues: %u\n", (result & 0xFFFF0000 >> 16) + 1); + printf("\n"); + } + + if (features[SPDK_OCSSD_FEAT_MEDIA_FEEDBACK].valid) { + uint32_t result = features[SPDK_OCSSD_FEAT_MEDIA_FEEDBACK].result; + + printf("OCSSD Media Feedback\n"); + printf("=======================\n"); + printf("High ECC status: %u\n", (result & 0x1)); + printf("Vector High ECC status: %u\n", (result & 0x2 >> 1)); + printf("\n"); + } + + if (cdata->hctma.bits.supported) { + printf("Host Controlled Thermal Management\n"); + printf("==================================\n"); + printf("Minimum Thermal Management Temperature: "); + if (cdata->mntmt) { + printf("%u Kelvin (%d Celsius)\n", cdata->mntmt, (int)cdata->mntmt - 273); + } else { + printf("Not Reported\n"); + } + printf("Maximum Thermal Managment Temperature: "); + if (cdata->mxtmt) { + printf("%u Kelvin (%d Celsius)\n", cdata->mxtmt, (int)cdata->mxtmt - 273); + } else { + printf("Not Reported\n"); + } + printf("\n"); + } + + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr, SPDK_NVME_INTEL_LOG_SMART)) { + size_t i = 0; + + printf("Intel Health Information\n"); + printf("==================\n"); + for (i = 0; + i < SPDK_COUNTOF(intel_smart_page.attributes); i++) { + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_PROGRAM_FAIL_COUNT) { + printf("Program Fail Count:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_ERASE_FAIL_COUNT) { + printf("Erase Fail Count:\n"); + printf(" Normalized Value : %d\n", + 
intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_WEAR_LEVELING_COUNT) { + printf("Wear Leveling Count:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value:\n"); + printf(" Min: "); + print_uint_var_dec(&intel_smart_page.attributes[i].raw_value[0], 2); + printf("\n"); + printf(" Max: "); + print_uint_var_dec(&intel_smart_page.attributes[i].raw_value[2], 2); + printf("\n"); + printf(" Avg: "); + print_uint_var_dec(&intel_smart_page.attributes[i].raw_value[4], 2); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_E2E_ERROR_COUNT) { + printf("End to End Error Detection Count:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_CRC_ERROR_COUNT) { + printf("CRC Error Count:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_MEDIA_WEAR) { + printf("Timed Workload, Media Wear:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_HOST_READ_PERCENTAGE) { + printf("Timed Workload, Host Read/Write Ratio:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" 
Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("%%"); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_TIMER) { + printf("Timed Workload, Timer:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_THERMAL_THROTTLE_STATUS) { + printf("Thermal Throttle Status:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value:\n"); + printf(" Percentage: %d%%\n", intel_smart_page.attributes[i].raw_value[0]); + printf(" Throttling Event Count: "); + print_uint_var_dec(&intel_smart_page.attributes[i].raw_value[1], 4); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_RETRY_BUFFER_OVERFLOW_COUNTER) { + printf("Retry Buffer Overflow Counter:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_PLL_LOCK_LOSS_COUNT) { + printf("PLL Lock Loss Count:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == SPDK_NVME_INTEL_SMART_NAND_BYTES_WRITTEN) { + printf("NAND Bytes Written:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("\n"); + } + if (intel_smart_page.attributes[i].code == 
SPDK_NVME_INTEL_SMART_HOST_BYTES_WRITTEN) { + printf("Host Bytes Written:\n"); + printf(" Normalized Value : %d\n", + intel_smart_page.attributes[i].normalized_value); + printf(" Current Raw Value: "); + print_uint_var_dec(intel_smart_page.attributes[i].raw_value, 6); + printf("\n"); + } + } + printf("\n"); + } + + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr, SPDK_NVME_INTEL_LOG_TEMPERATURE)) { + printf("Intel Temperature Information\n"); + printf("==================\n"); + printf("Current Temperature: %lu\n", intel_temperature_page.current_temperature); + printf("Overtemp shutdown Flag for last critical component temperature: %lu\n", + intel_temperature_page.shutdown_flag_last); + printf("Overtemp shutdown Flag for life critical component temperature: %lu\n", + intel_temperature_page.shutdown_flag_life); + printf("Highest temperature: %lu\n", intel_temperature_page.highest_temperature); + printf("Lowest temperature: %lu\n", intel_temperature_page.lowest_temperature); + printf("Specified Maximum Operating Temperature: %lu\n", + intel_temperature_page.specified_max_op_temperature); + printf("Specified Minimum Operating Temperature: %lu\n", + intel_temperature_page.specified_min_op_temperature); + printf("Estimated offset: %ld\n", intel_temperature_page.estimated_offset); + printf("\n"); + printf("\n"); + + } + + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr, SPDK_NVME_INTEL_MARKETING_DESCRIPTION)) { + printf("Intel Marketing Information\n"); + printf("==================\n"); + snprintf(str, sizeof(intel_md_page.marketing_product), "%s", intel_md_page.marketing_product); + printf("Marketing Product Information: %s\n", str); + printf("\n"); + printf("\n"); + } + + printf("Active Namespaces\n"); + printf("=================\n"); + for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { + print_namespace(spdk_nvme_ctrlr_get_ns(ctrlr, nsid)); + } + + if (g_discovery_page) { + printf("Discovery 
Log Page\n"); + printf("==================\n"); + + if (g_hex_dump) { + hex_dump(g_discovery_page, g_discovery_page_size); + printf("\n"); + } + + printf("Generation Counter: %" PRIu64 "\n", + from_le64(&g_discovery_page->genctr)); + printf("Number of Records: %" PRIu64 "\n", + from_le64(&g_discovery_page->numrec)); + printf("Record Format: %" PRIu16 "\n", + from_le16(&g_discovery_page->recfmt)); + printf("\n"); + + for (i = 0; i < g_discovery_page_numrec; i++) { + struct spdk_nvmf_discovery_log_page_entry *entry = &g_discovery_page->entries[i]; + + printf("Discovery Log Entry %u\n", i); + printf("----------------------\n"); + printf("Transport Type: %u (%s)\n", + entry->trtype, spdk_nvme_transport_id_trtype_str(entry->trtype)); + printf("Address Family: %u (%s)\n", + entry->adrfam, spdk_nvme_transport_id_adrfam_str(entry->adrfam)); + printf("Subsystem Type: %u (%s)\n", + entry->subtype, + entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY ? "Discovery Service" : + entry->subtype == SPDK_NVMF_SUBTYPE_NVME ? "NVM Subsystem" : + "Unknown"); + printf("Transport Requirements:\n"); + printf(" Secure Channel: %s\n", + entry->treq.secure_channel == SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED ? "Not Specified" : + entry->treq.secure_channel == SPDK_NVMF_TREQ_SECURE_CHANNEL_REQUIRED ? "Required" : + entry->treq.secure_channel == SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED ? 
"Not Required" : + "Reserved"); + printf("Port ID: %" PRIu16 " (0x%04" PRIx16 ")\n", + from_le16(&entry->portid), from_le16(&entry->portid)); + printf("Controller ID: %" PRIu16 " (0x%04" PRIx16 ")\n", + from_le16(&entry->cntlid), from_le16(&entry->cntlid)); + printf("Admin Max SQ Size: %" PRIu16 "\n", + from_le16(&entry->asqsz)); + snprintf(str, sizeof(entry->trsvcid) + 1, "%s", entry->trsvcid); + printf("Transport Service Identifier: %s\n", str); + snprintf(str, sizeof(entry->subnqn) + 1, "%s", entry->subnqn); + printf("NVM Subsystem Qualified Name: %s\n", str); + snprintf(str, sizeof(entry->traddr) + 1, "%s", entry->traddr); + printf("Transport Address: %s\n", str); + + if (entry->trtype == SPDK_NVMF_TRTYPE_RDMA) { + printf("Transport Specific Address Subtype - RDMA\n"); + printf(" RDMA QP Service Type: %u (%s)\n", + entry->tsas.rdma.rdma_qptype, + entry->tsas.rdma.rdma_qptype == SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED ? "Reliable Connected" : + entry->tsas.rdma.rdma_qptype == SPDK_NVMF_RDMA_QPTYPE_RELIABLE_DATAGRAM ? "Reliable Datagram" : + "Unknown"); + printf(" RDMA Provider Type: %u (%s)\n", + entry->tsas.rdma.rdma_prtype, + entry->tsas.rdma.rdma_prtype == SPDK_NVMF_RDMA_PRTYPE_NONE ? "No provider specified" : + entry->tsas.rdma.rdma_prtype == SPDK_NVMF_RDMA_PRTYPE_IB ? "InfiniBand" : + entry->tsas.rdma.rdma_prtype == SPDK_NVMF_RDMA_PRTYPE_ROCE ? "InfiniBand RoCE" : + entry->tsas.rdma.rdma_prtype == SPDK_NVMF_RDMA_PRTYPE_ROCE2 ? "InfiniBand RoCE v2" : + entry->tsas.rdma.rdma_prtype == SPDK_NVMF_RDMA_PRTYPE_IWARP ? "iWARP" : + "Unknown"); + printf(" RDMA CM Service: %u (%s)\n", + entry->tsas.rdma.rdma_cms, + entry->tsas.rdma.rdma_cms == SPDK_NVMF_RDMA_CMS_RDMA_CM ? 
"RDMA_CM" : + "Unknown"); + if (entry->adrfam == SPDK_NVMF_ADRFAM_IB) { + printf(" RDMA Partition Key: %" PRIu32 "\n", + from_le32(&entry->tsas.rdma.rdma_pkey)); + } + } + } + free(g_discovery_page); + g_discovery_page = NULL; + } +} + +static void +usage(const char *program_name) +{ + printf("%s [options]", program_name); + printf("\n"); + printf("options:\n"); + printf(" -r trid remote NVMe over Fabrics target address\n"); + printf(" Format: 'key:value [key:value] ...'\n"); + printf(" Keys:\n"); + printf(" trtype Transport type (e.g. RDMA)\n"); + printf(" adrfam Address family (e.g. IPv4, IPv6)\n"); + printf(" traddr Transport address (e.g. 192.168.100.8)\n"); + printf(" trsvcid Transport service identifier (e.g. 4420)\n"); + printf(" subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN); + printf(" Example: -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420'\n"); + + spdk_log_usage(stdout, "-L"); + + printf(" -i shared memory group ID\n"); + printf(" -p core number in decimal to run this application which started from 0\n"); + printf(" -d DPDK huge memory size in MB\n"); + printf(" -x print hex dump of raw data\n"); + printf(" -v verbose (enable warnings)\n"); + printf(" -V enumerate VMD\n"); + printf(" -H show this usage\n"); +} + +static int +parse_args(int argc, char **argv) +{ + int op, rc; + + spdk_nvme_trid_populate_transport(&g_trid, SPDK_NVME_TRANSPORT_PCIE); + snprintf(g_trid.subnqn, sizeof(g_trid.subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); + + while ((op = getopt(argc, argv, "d:i:p:r:xHL:V")) != -1) { + switch (op) { + case 'd': + g_dpdk_mem = spdk_strtol(optarg, 10); + if (g_dpdk_mem < 0) { + fprintf(stderr, "Invalid DPDK memory size\n"); + return g_dpdk_mem; + } + break; + case 'i': + g_shm_id = spdk_strtol(optarg, 10); + if (g_shm_id < 0) { + fprintf(stderr, "Invalid shared memory ID\n"); + return g_shm_id; + } + break; + case 'p': + g_master_core = spdk_strtol(optarg, 10); + if (g_master_core < 0) { + fprintf(stderr, "Invalid core 
number\n"); + return g_master_core; + } + snprintf(g_core_mask, sizeof(g_core_mask), "0x%llx", 1ULL << g_master_core); + break; + case 'r': + if (spdk_nvme_transport_id_parse(&g_trid, optarg) != 0) { + fprintf(stderr, "Error parsing transport address\n"); + return 1; + } + break; + case 'x': + g_hex_dump = true; + break; + case 'L': + rc = spdk_log_set_flag(optarg); + if (rc < 0) { + fprintf(stderr, "unknown flag\n"); + usage(argv[0]); + exit(EXIT_FAILURE); + } + spdk_log_set_print_level(SPDK_LOG_DEBUG); +#ifndef DEBUG + fprintf(stderr, "%s must be rebuilt with CONFIG_DEBUG=y for -L flag.\n", + argv[0]); + usage(argv[0]); + return 0; +#endif + break; + case 'H': + usage(argv[0]); + break; + case 'V': + g_vmd = true; + break; + default: + usage(argv[0]); + return 1; + } + } + + return 0; +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + return true; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + g_controllers_found++; + print_controller(ctrlr, trid); + spdk_nvme_detach(ctrlr); +} + +int main(int argc, char **argv) +{ + int rc; + struct spdk_env_opts opts; + struct spdk_nvme_ctrlr *ctrlr; + + rc = parse_args(argc, argv); + if (rc != 0) { + return rc; + } + + spdk_env_opts_init(&opts); + opts.name = "identify"; + opts.shm_id = g_shm_id; + opts.mem_size = g_dpdk_mem; + opts.mem_channel = 1; + opts.master_core = g_master_core; + opts.core_mask = g_core_mask; + if (g_trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + opts.no_pci = true; + } + if (spdk_env_init(&opts) < 0) { + fprintf(stderr, "Unable to initialize SPDK env\n"); + return 1; + } + + if (g_vmd && spdk_vmd_init()) { + fprintf(stderr, "Failed to initialize VMD." + " Some NVMe devices can be unavailable.\n"); + } + + /* A specific trid is required. 
*/ + if (strlen(g_trid.traddr) != 0) { + ctrlr = spdk_nvme_connect(&g_trid, NULL, 0); + if (!ctrlr) { + fprintf(stderr, "spdk_nvme_connect() failed\n"); + return 1; + } + + g_controllers_found++; + print_controller(ctrlr, &g_trid); + spdk_nvme_detach(ctrlr); + } else if (spdk_nvme_probe(&g_trid, NULL, probe_cb, attach_cb, NULL) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed\n"); + return 1; + } + + if (g_controllers_found == 0) { + fprintf(stderr, "No NVMe controllers found.\n"); + } + + if (g_vmd) { + spdk_vmd_fini(); + } + + return 0; +} diff --git a/src/spdk/examples/nvme/nvme_manage/.gitignore b/src/spdk/examples/nvme/nvme_manage/.gitignore new file mode 100644 index 000000000..cdc78a1a1 --- /dev/null +++ b/src/spdk/examples/nvme/nvme_manage/.gitignore @@ -0,0 +1 @@ +nvme_manage diff --git a/src/spdk/examples/nvme/nvme_manage/Makefile b/src/spdk/examples/nvme/nvme_manage/Makefile new file mode 100644 index 000000000..ed467b884 --- /dev/null +++ b/src/spdk/examples/nvme/nvme_manage/Makefile @@ -0,0 +1,38 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) + +APP = nvme_manage + +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk diff --git a/src/spdk/examples/nvme/nvme_manage/nvme_manage.c b/src/spdk/examples/nvme/nvme_manage/nvme_manage.c new file mode 100644 index 000000000..c202dab42 --- /dev/null +++ b/src/spdk/examples/nvme/nvme_manage/nvme_manage.c @@ -0,0 +1,1703 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/opal.h" + +#define MAX_DEVS 64 + +struct dev { + struct spdk_pci_addr pci_addr; + struct spdk_nvme_ctrlr *ctrlr; + const struct spdk_nvme_ctrlr_data *cdata; + struct spdk_nvme_ns_data *common_ns_data; + int outstanding_admin_cmds; + struct spdk_opal_dev *opal_dev; +}; + +static struct dev devs[MAX_DEVS]; +static int num_devs = 0; +static int g_shm_id = -1; + +#define foreach_dev(iter) \ + for (iter = devs; iter - devs < num_devs; iter++) + +enum controller_display_model { + CONTROLLER_DISPLAY_ALL = 0x0, + CONTROLLER_DISPLAY_SIMPLISTIC = 0x1, +}; + +static int +cmp_devs(const void *ap, const void *bp) +{ + const struct dev *a = ap, *b = bp; + + return spdk_pci_addr_compare(&a->pci_addr, &b->pci_addr); +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + return true; +} + +static void +identify_common_ns_cb(void *cb_arg, const struct spdk_nvme_cpl *cpl) +{ + struct dev *dev = cb_arg; + + if (cpl->status.sc != SPDK_NVME_SC_SUCCESS) { + /* Identify Namespace for 
NSID = FFFFFFFFh is optional, so failure is not fatal. */ + spdk_dma_free(dev->common_ns_data); + dev->common_ns_data = NULL; + } + + dev->outstanding_admin_cmds--; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + struct dev *dev; + struct spdk_nvme_cmd cmd; + + /* add to dev list */ + dev = &devs[num_devs++]; + spdk_pci_addr_parse(&dev->pci_addr, trid->traddr); + dev->ctrlr = ctrlr; + + /* Retrieve controller data */ + dev->cdata = spdk_nvme_ctrlr_get_data(dev->ctrlr); + + dev->common_ns_data = spdk_dma_zmalloc(sizeof(struct spdk_nvme_ns_data), 4096, NULL); + if (dev->common_ns_data == NULL) { + fprintf(stderr, "common_ns_data allocation failure\n"); + return; + } + + /* Identify Namespace with NSID set to FFFFFFFFh to get common namespace capabilities. */ + memset(&cmd, 0, sizeof(cmd)); + cmd.opc = SPDK_NVME_OPC_IDENTIFY; + cmd.cdw10_bits.identify.cns = 0; /* CNS = 0 (Identify Namespace) */ + cmd.nsid = SPDK_NVME_GLOBAL_NS_TAG; + + dev->outstanding_admin_cmds++; + if (spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &cmd, dev->common_ns_data, + sizeof(struct spdk_nvme_ns_data), identify_common_ns_cb, dev) != 0) { + dev->outstanding_admin_cmds--; + spdk_dma_free(dev->common_ns_data); + dev->common_ns_data = NULL; + } + + while (dev->outstanding_admin_cmds) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr); + } +} + +static void usage(void) +{ + printf("NVMe Management Options"); + printf("\n"); + printf("\t[1: list controllers]\n"); + printf("\t[2: create namespace]\n"); + printf("\t[3: delete namespace]\n"); + printf("\t[4: attach namespace to controller]\n"); + printf("\t[5: detach namespace from controller]\n"); + printf("\t[6: format namespace or controller]\n"); + printf("\t[7: firmware update]\n"); + printf("\t[8: opal]\n"); + printf("\t[9: quit]\n"); +} + +static void +display_namespace_dpc(const struct spdk_nvme_ns_data *nsdata) +{ + if 
(nsdata->dpc.pit1 || nsdata->dpc.pit2 || nsdata->dpc.pit3) { + if (nsdata->dpc.pit1) { + printf("PIT1 "); + } + + if (nsdata->dpc.pit2) { + printf("PIT2 "); + } + + if (nsdata->dpc.pit3) { + printf("PIT3 "); + } + } else { + printf("Not Supported\n"); + return; + } + + if (nsdata->dpc.md_start && nsdata->dpc.md_end) { + printf("Location: Head or Tail\n"); + } else if (nsdata->dpc.md_start) { + printf("Location: Head\n"); + } else if (nsdata->dpc.md_end) { + printf("Location: Tail\n"); + } else { + printf("Not Supported\n"); + } +} + +static void +display_namespace(struct spdk_nvme_ns *ns) +{ + const struct spdk_nvme_ns_data *nsdata; + uint32_t i; + + nsdata = spdk_nvme_ns_get_data(ns); + + printf("Namespace ID:%d\n", spdk_nvme_ns_get_id(ns)); + + printf("Size (in LBAs): %lld (%lldM)\n", + (long long)nsdata->nsze, + (long long)nsdata->nsze / 1024 / 1024); + printf("Capacity (in LBAs): %lld (%lldM)\n", + (long long)nsdata->ncap, + (long long)nsdata->ncap / 1024 / 1024); + printf("Utilization (in LBAs): %lld (%lldM)\n", + (long long)nsdata->nuse, + (long long)nsdata->nuse / 1024 / 1024); + printf("Format Progress Indicator: %s\n", + nsdata->fpi.fpi_supported ? "Supported" : "Not Supported"); + if (nsdata->fpi.fpi_supported && nsdata->fpi.percentage_remaining) { + printf("Formatted Percentage: %d%%\n", 100 - nsdata->fpi.percentage_remaining); + } + printf("Number of LBA Formats: %d\n", nsdata->nlbaf + 1); + printf("Current LBA Format: LBA Format #%02d\n", + nsdata->flbas.format); + for (i = 0; i <= nsdata->nlbaf; i++) + printf("LBA Format #%02d: Data Size: %5d Metadata Size: %5d\n", + i, 1 << nsdata->lbaf[i].lbads, nsdata->lbaf[i].ms); + printf("Data Protection Capabilities:"); + display_namespace_dpc(nsdata); + if (SPDK_NVME_FMT_NVM_PROTECTION_DISABLE == nsdata->dps.pit) { + printf("Data Protection Setting: N/A\n"); + } else { + printf("Data Protection Setting: PIT%d Location: %s\n", + nsdata->dps.pit, nsdata->dps.md_start ? 
"Head" : "Tail"); + } + printf("Multipath IO and Sharing: %s\n", + nsdata->nmic.can_share ? "Supported" : "Not Supported"); + printf("\n"); +} + +static void +display_controller(struct dev *dev, int model) +{ + struct spdk_nvme_ns *ns; + const struct spdk_nvme_ctrlr_data *cdata; + uint8_t str[128]; + uint32_t nsid; + + cdata = spdk_nvme_ctrlr_get_data(dev->ctrlr); + + if (model == CONTROLLER_DISPLAY_SIMPLISTIC) { + printf("%04x:%02x:%02x.%02x ", + dev->pci_addr.domain, dev->pci_addr.bus, dev->pci_addr.dev, dev->pci_addr.func); + printf("%-40.40s %-20.20s ", + cdata->mn, cdata->sn); + printf("%5d ", cdata->cntlid); + printf("\n"); + return; + } + + printf("=====================================================\n"); + printf("NVMe Controller: %04x:%02x:%02x.%02x\n", + dev->pci_addr.domain, dev->pci_addr.bus, dev->pci_addr.dev, dev->pci_addr.func); + printf("============================\n"); + printf("Controller Capabilities/Features\n"); + printf("Controller ID: %d\n", cdata->cntlid); + snprintf(str, sizeof(cdata->sn) + 1, "%s", cdata->sn); + printf("Serial Number: %s\n", str); + printf("\n"); + + printf("Admin Command Set Attributes\n"); + printf("============================\n"); + printf("Namespace Manage And Attach: %s\n", + cdata->oacs.ns_manage ? "Supported" : "Not Supported"); + printf("Namespace Format: %s\n", + cdata->oacs.format ? 
"Supported" : "Not Supported"); + printf("\n"); + printf("NVM Command Set Attributes\n"); + printf("============================\n"); + if (cdata->fna.format_all_ns) { + printf("Namespace format operation applies to all namespaces\n"); + } else { + printf("Namespace format operation applies to per namespace\n"); + } + printf("\n"); + printf("Namespace Attributes\n"); + printf("============================\n"); + for (nsid = spdk_nvme_ctrlr_get_first_active_ns(dev->ctrlr); + nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(dev->ctrlr, nsid)) { + ns = spdk_nvme_ctrlr_get_ns(dev->ctrlr, nsid); + assert(ns != NULL); + display_namespace(ns); + } +} + +static void +display_controller_list(void) +{ + struct dev *iter; + + foreach_dev(iter) { + display_controller(iter, CONTROLLER_DISPLAY_ALL); + } +} + +static char * +get_line(char *buf, int buf_size, FILE *f, bool secret) +{ + char *ch; + size_t len; + struct termios default_attr = {}, new_attr = {}; + int ret; + + if (secret) { + ret = tcgetattr(STDIN_FILENO, &default_attr); + if (ret) { + return NULL; + } + + new_attr = default_attr; + new_attr.c_lflag &= ~ECHO; /* disable echo */ + ret = tcsetattr(STDIN_FILENO, TCSAFLUSH, &new_attr); + if (ret) { + return NULL; + } + } + + ch = fgets(buf, buf_size, f); + if (ch == NULL) { + return NULL; + } + + if (secret) { + ret = tcsetattr(STDIN_FILENO, TCSAFLUSH, &default_attr); /* restore default confing */ + if (ret) { + return NULL; + } + } + + len = strlen(buf); + if (len > 0 && buf[len - 1] == '\n') { + buf[len - 1] = '\0'; + } + return buf; +} + +static struct dev * +get_controller(void) +{ + struct spdk_pci_addr pci_addr; + char address[64]; + char *p; + int ch; + struct dev *iter; + + memset(address, 0, sizeof(address)); + + foreach_dev(iter) { + display_controller(iter, CONTROLLER_DISPLAY_SIMPLISTIC); + } + + printf("Please Input PCI Address(domain:bus:dev.func):\n"); + + while ((ch = getchar()) != '\n' && ch != EOF); + p = get_line(address, 64, stdin, false); + if (p 
== NULL) { + return NULL; + } + + while (isspace(*p)) { + p++; + } + + if (spdk_pci_addr_parse(&pci_addr, p) < 0) { + return NULL; + } + + foreach_dev(iter) { + if (spdk_pci_addr_compare(&pci_addr, &iter->pci_addr) == 0) { + return iter; + } + } + return NULL; +} + +static int +get_lba_format(const struct spdk_nvme_ns_data *ns_data) +{ + int lbaf, i; + + printf("\nSupported LBA formats:\n"); + for (i = 0; i <= ns_data->nlbaf; i++) { + printf("%2d: %d data bytes", i, 1 << ns_data->lbaf[i].lbads); + if (ns_data->lbaf[i].ms) { + printf(" + %d metadata bytes", ns_data->lbaf[i].ms); + } + printf("\n"); + } + + printf("Please input LBA format index (0 - %d):\n", ns_data->nlbaf); + if (scanf("%d", &lbaf) != 1 || lbaf > ns_data->nlbaf) { + return -1; + } + + return lbaf; +} + +static void +identify_allocated_ns_cb(void *cb_arg, const struct spdk_nvme_cpl *cpl) +{ + struct dev *dev = cb_arg; + + dev->outstanding_admin_cmds--; +} + +static uint32_t +get_allocated_nsid(struct dev *dev) +{ + uint32_t nsid; + size_t i; + struct spdk_nvme_ns_list *ns_list; + struct spdk_nvme_cmd cmd = {0}; + + ns_list = spdk_dma_zmalloc(sizeof(*ns_list), 4096, NULL); + if (ns_list == NULL) { + printf("Allocation error\n"); + return 0; + } + + cmd.opc = SPDK_NVME_OPC_IDENTIFY; + cmd.cdw10_bits.identify.cns = SPDK_NVME_IDENTIFY_ALLOCATED_NS_LIST; + cmd.nsid = 0; + + dev->outstanding_admin_cmds++; + if (spdk_nvme_ctrlr_cmd_admin_raw(dev->ctrlr, &cmd, ns_list, sizeof(*ns_list), + identify_allocated_ns_cb, dev)) { + printf("Identify command failed\n"); + spdk_dma_free(ns_list); + return 0; + } + + while (dev->outstanding_admin_cmds) { + spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr); + } + + printf("Allocated Namespace IDs:\n"); + for (i = 0; i < SPDK_COUNTOF(ns_list->ns_list); i++) { + if (ns_list->ns_list[i] == 0) { + break; + } + printf("%u\n", ns_list->ns_list[i]); + } + + spdk_dma_free(ns_list); + + printf("Please Input Namespace ID:\n"); + if (!scanf("%u", &nsid)) { + printf("Invalid 
Namespace ID\n"); + nsid = 0; + } + + return nsid; +} + +static void +ns_attach(struct dev *device, int attachment_op, int ctrlr_id, int ns_id) +{ + int ret = 0; + struct spdk_nvme_ctrlr_list *ctrlr_list; + + ctrlr_list = spdk_dma_zmalloc(sizeof(struct spdk_nvme_ctrlr_list), + 4096, NULL); + if (ctrlr_list == NULL) { + printf("Allocation error (controller list)\n"); + exit(1); + } + + ctrlr_list->ctrlr_count = 1; + ctrlr_list->ctrlr_list[0] = ctrlr_id; + + if (attachment_op == SPDK_NVME_NS_CTRLR_ATTACH) { + ret = spdk_nvme_ctrlr_attach_ns(device->ctrlr, ns_id, ctrlr_list); + } else if (attachment_op == SPDK_NVME_NS_CTRLR_DETACH) { + ret = spdk_nvme_ctrlr_detach_ns(device->ctrlr, ns_id, ctrlr_list); + } + + if (ret) { + fprintf(stdout, "ns attach: Failed\n"); + } + + spdk_dma_free(ctrlr_list); +} + +static void +ns_manage_add(struct dev *device, uint64_t ns_size, uint64_t ns_capacity, int ns_lbasize, + uint8_t ns_dps_type, uint8_t ns_dps_location, uint8_t ns_nmic) +{ + uint32_t nsid; + struct spdk_nvme_ns_data *ndata; + + ndata = spdk_dma_zmalloc(sizeof(struct spdk_nvme_ns_data), 4096, NULL); + if (ndata == NULL) { + printf("Allocation error (namespace data)\n"); + exit(1); + } + + ndata->nsze = ns_size; + ndata->ncap = ns_capacity; + ndata->flbas.format = ns_lbasize; + if (SPDK_NVME_FMT_NVM_PROTECTION_DISABLE != ns_dps_type) { + ndata->dps.pit = ns_dps_type; + ndata->dps.md_start = ns_dps_location; + } + ndata->nmic.can_share = ns_nmic; + nsid = spdk_nvme_ctrlr_create_ns(device->ctrlr, ndata); + if (nsid == 0) { + fprintf(stdout, "ns manage: Failed\n"); + } else { + printf("Created namespace ID %u\n", nsid); + } + + spdk_dma_free(ndata); +} + +static void +ns_manage_delete(struct dev *device, int ns_id) +{ + int ret = 0; + + ret = spdk_nvme_ctrlr_delete_ns(device->ctrlr, ns_id); + if (ret) { + fprintf(stdout, "ns manage: Failed\n"); + return; + } +} + +static void +nvme_manage_format(struct dev *device, int ns_id, int ses, int pi, int pil, int ms, int lbaf) +{ + 
int ret = 0; + struct spdk_nvme_format format = {}; + + format.lbaf = lbaf; + format.ms = ms; + format.pi = pi; + format.pil = pil; + format.ses = ses; + ret = spdk_nvme_ctrlr_format(device->ctrlr, ns_id, &format); + if (ret) { + fprintf(stdout, "nvme format: Failed\n"); + return; + } +} + +static void +attach_and_detach_ns(int attachment_op) +{ + uint32_t nsid; + struct dev *ctrlr; + + ctrlr = get_controller(); + if (ctrlr == NULL) { + printf("Invalid controller PCI Address.\n"); + return; + } + + if (!ctrlr->cdata->oacs.ns_manage) { + printf("Controller does not support ns management\n"); + return; + } + + nsid = get_allocated_nsid(ctrlr); + if (nsid == 0) { + printf("Invalid Namespace ID\n"); + return; + } + + ns_attach(ctrlr, attachment_op, ctrlr->cdata->cntlid, nsid); +} + +static void +add_ns(void) +{ + uint64_t ns_size = 0; + uint64_t ns_capacity = 0; + int ns_lbasize; + int ns_dps_type = 0; + int ns_dps_location = 0; + int ns_nmic = 0; + struct dev *ctrlr = NULL; + + ctrlr = get_controller(); + if (ctrlr == NULL) { + printf("Invalid controller PCI Address.\n"); + return; + } + + if (!ctrlr->cdata->oacs.ns_manage) { + printf("Controller does not support ns management\n"); + return; + } + + if (!ctrlr->common_ns_data) { + printf("Controller did not return common namespace capabilities\n"); + return; + } + + ns_lbasize = get_lba_format(ctrlr->common_ns_data); + if (ns_lbasize < 0) { + printf("Invalid LBA format number\n"); + return; + } + + printf("Please Input Namespace Size (in LBAs):\n"); + if (!scanf("%" SCNu64, &ns_size)) { + printf("Invalid Namespace Size\n"); + while (getchar() != '\n'); + return; + } + + printf("Please Input Namespace Capacity (in LBAs):\n"); + if (!scanf("%" SCNu64, &ns_capacity)) { + printf("Invalid Namespace Capacity\n"); + while (getchar() != '\n'); + return; + } + + printf("Please Input Data Protection Type (0 - 3):\n"); + if (!scanf("%d", &ns_dps_type)) { + printf("Invalid Data Protection Type\n"); + while (getchar() != '\n'); + 
return; + } + + if (SPDK_NVME_FMT_NVM_PROTECTION_DISABLE != ns_dps_type) { + printf("Please Input Data Protection Location (1: Head; 0: Tail):\n"); + if (!scanf("%d", &ns_dps_location)) { + printf("Invalid Data Protection Location\n"); + while (getchar() != '\n'); + return; + } + } + + printf("Please Input Multi-path IO and Sharing Capabilities (1: Share; 0: Private):\n"); + if (!scanf("%d", &ns_nmic)) { + printf("Invalid Multi-path IO and Sharing Capabilities\n"); + while (getchar() != '\n'); + return; + } + + ns_manage_add(ctrlr, ns_size, ns_capacity, ns_lbasize, + ns_dps_type, ns_dps_location, ns_nmic); +} + +static void +delete_ns(void) +{ + int ns_id; + struct dev *ctrlr; + + ctrlr = get_controller(); + if (ctrlr == NULL) { + printf("Invalid controller PCI Address.\n"); + return; + } + + if (!ctrlr->cdata->oacs.ns_manage) { + printf("Controller does not support ns management\n"); + return; + } + + printf("Please Input Namespace ID:\n"); + if (!scanf("%d", &ns_id)) { + printf("Invalid Namespace ID\n"); + while (getchar() != '\n'); + return; + } + + ns_manage_delete(ctrlr, ns_id); +} + +static void +format_nvm(void) +{ + int ns_id; + int ses; + int pil; + int pi; + int ms; + int lbaf; + char option; + struct dev *ctrlr; + const struct spdk_nvme_ctrlr_data *cdata; + struct spdk_nvme_ns *ns; + const struct spdk_nvme_ns_data *nsdata; + + ctrlr = get_controller(); + if (ctrlr == NULL) { + printf("Invalid controller PCI BDF.\n"); + return; + } + + cdata = ctrlr->cdata; + + if (!cdata->oacs.format) { + printf("Controller does not support Format NVM command\n"); + return; + } + + if (cdata->fna.format_all_ns) { + ns_id = SPDK_NVME_GLOBAL_NS_TAG; + ns = spdk_nvme_ctrlr_get_ns(ctrlr->ctrlr, 1); + } else { + printf("Please Input Namespace ID (1 - %d):\n", cdata->nn); + if (!scanf("%d", &ns_id)) { + printf("Invalid Namespace ID\n"); + while (getchar() != '\n'); + return; + } + ns = spdk_nvme_ctrlr_get_ns(ctrlr->ctrlr, ns_id); + } + + if (ns == NULL) { + printf("Namespace 
ID %d not found\n", ns_id); + while (getchar() != '\n'); + return; + } + + nsdata = spdk_nvme_ns_get_data(ns); + + printf("Please Input Secure Erase Setting:\n"); + printf(" 0: No secure erase operation requested\n"); + printf(" 1: User data erase\n"); + if (cdata->fna.crypto_erase_supported) { + printf(" 2: Cryptographic erase\n"); + } + if (!scanf("%d", &ses)) { + printf("Invalid Secure Erase Setting\n"); + while (getchar() != '\n'); + return; + } + + lbaf = get_lba_format(nsdata); + if (lbaf < 0) { + printf("Invalid LBA format number\n"); + return; + } + + if (nsdata->lbaf[lbaf].ms) { + printf("Please Input Protection Information:\n"); + printf(" 0: Protection information is not enabled\n"); + printf(" 1: Protection information is enabled, Type 1\n"); + printf(" 2: Protection information is enabled, Type 2\n"); + printf(" 3: Protection information is enabled, Type 3\n"); + if (!scanf("%d", &pi)) { + printf("Invalid protection information\n"); + while (getchar() != '\n'); + return; + } + + if (pi) { + printf("Please Input Protection Information Location:\n"); + printf(" 0: Protection information transferred as the last eight bytes of metadata\n"); + printf(" 1: Protection information transferred as the first eight bytes of metadata\n"); + if (!scanf("%d", &pil)) { + printf("Invalid protection information location\n"); + while (getchar() != '\n'); + return; + } + } else { + pil = 0; + } + + printf("Please Input Metadata Setting:\n"); + printf(" 0: Metadata is transferred as part of a separate buffer\n"); + printf(" 1: Metadata is transferred as part of an extended data LBA\n"); + if (!scanf("%d", &ms)) { + printf("Invalid metadata setting\n"); + while (getchar() != '\n'); + return; + } + } else { + ms = 0; + pi = 0; + pil = 0; + } + + printf("Warning: use this utility at your own risk.\n" + "This command will format your namespace and all data will be lost.\n" + "This command may take several minutes to complete,\n" + "so do not interrupt the utility until it 
completes.\n" + "Press 'Y' to continue with the format operation.\n"); + + while (getchar() != '\n'); + if (!scanf("%c", &option)) { + printf("Invalid option\n"); + while (getchar() != '\n'); + return; + } + + if (option == 'y' || option == 'Y') { + nvme_manage_format(ctrlr, ns_id, ses, pi, pil, ms, lbaf); + } else { + printf("NVMe format abort\n"); + } +} + +static void +update_firmware_image(void) +{ + int rc; + int fd = -1; + int slot; + unsigned int size; + struct stat fw_stat; + char path[256]; + void *fw_image; + struct dev *ctrlr; + const struct spdk_nvme_ctrlr_data *cdata; + enum spdk_nvme_fw_commit_action commit_action; + struct spdk_nvme_status status; + + ctrlr = get_controller(); + if (ctrlr == NULL) { + printf("Invalid controller PCI BDF.\n"); + return; + } + + cdata = ctrlr->cdata; + + if (!cdata->oacs.firmware) { + printf("Controller does not support firmware download and commit command\n"); + return; + } + + printf("Please Input The Path Of Firmware Image\n"); + + if (get_line(path, sizeof(path), stdin, false) == NULL) { + printf("Invalid path setting\n"); + while (getchar() != '\n'); + return; + } + + fd = open(path, O_RDONLY); + if (fd < 0) { + perror("Open file failed"); + return; + } + rc = fstat(fd, &fw_stat); + if (rc < 0) { + printf("Fstat failed\n"); + close(fd); + return; + } + + if (fw_stat.st_size % 4) { + printf("Firmware image size is not multiple of 4\n"); + close(fd); + return; + } + + size = fw_stat.st_size; + + fw_image = spdk_dma_zmalloc(size, 4096, NULL); + if (fw_image == NULL) { + printf("Allocation error\n"); + close(fd); + return; + } + + if (read(fd, fw_image, size) != ((ssize_t)(size))) { + printf("Read firmware image failed\n"); + close(fd); + spdk_dma_free(fw_image); + return; + } + close(fd); + + printf("Please Input Slot(0 - 7):\n"); + if (!scanf("%d", &slot)) { + printf("Invalid Slot\n"); + spdk_dma_free(fw_image); + while (getchar() != '\n'); + return; + } + + commit_action = SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG; 
+ rc = spdk_nvme_ctrlr_update_firmware(ctrlr->ctrlr, fw_image, size, slot, commit_action, &status); + if (rc == -ENXIO && status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC && + status.sc == SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET) { + printf("conventional reset is needed to enable firmware !\n"); + } else if (rc) { + printf("spdk_nvme_ctrlr_update_firmware failed\n"); + } else { + printf("spdk_nvme_ctrlr_update_firmware success\n"); + } + spdk_dma_free(fw_image); +} + +static void +opal_dump_info(struct spdk_opal_d0_features_info *feat) +{ + if (feat->tper.hdr.code) { + printf("\nOpal TPer feature:\n"); + printf("ACKNACK = %s", (feat->tper.acknack ? "Y, " : "N, ")); + printf("ASYNC = %s", (feat->tper.async ? "Y, " : "N, ")); + printf("BufferManagement = %s\n", (feat->tper.buffer_management ? "Y, " : "N, ")); + printf("ComIDManagement = %s", (feat->tper.comid_management ? "Y, " : "N, ")); + printf("Streaming = %s", (feat->tper.streaming ? "Y, " : "N, ")); + printf("Sync = %s\n", (feat->tper.sync ? "Y" : "N")); + printf("\n"); + } + + if (feat->locking.hdr.code) { + printf("Opal Locking feature:\n"); + printf("Locked = %s", (feat->locking.locked ? "Y, " : "N, ")); + printf("Locking Enabled = %s", (feat->locking.locking_enabled ? "Y, " : "N, ")); + printf("Locking supported = %s\n", (feat->locking.locking_supported ? "Y" : "N")); + + printf("MBR done = %s", (feat->locking.mbr_done ? "Y, " : "N, ")); + printf("MBR enabled = %s", (feat->locking.mbr_enabled ? "Y, " : "N, ")); + printf("Media encrypt = %s\n", (feat->locking.media_encryption ? "Y" : "N")); + printf("\n"); + } + + if (feat->geo.hdr.code) { + printf("Opal Geometry feature:\n"); + printf("Align = %s", (feat->geo.alignment_granularity ? 
"Y, " : "N, ")); + printf("Logical block size = %d, ", from_be32(&feat->geo.logical_block_size)); + printf("Lowest aligned LBA = %ld\n", from_be64(&feat->geo.lowest_aligned_lba)); + printf("\n"); + } + + if (feat->single_user.hdr.code) { + printf("Opal Single User Mode feature:\n"); + printf("Any in SUM = %s", (feat->single_user.any ? "Y, " : "N, ")); + printf("All in SUM = %s", (feat->single_user.all ? "Y, " : "N, ")); + printf("Policy: %s Authority,\n", (feat->single_user.policy ? "Admin" : "Users")); + printf("Number of locking objects = %d\n ", from_be32(&feat->single_user.num_locking_objects)); + printf("\n"); + } + + if (feat->datastore.hdr.code) { + printf("Opal DataStore feature:\n"); + printf("Table alignment = %d, ", from_be32(&feat->datastore.alignment)); + printf("Max number of tables = %d, ", from_be16(&feat->datastore.max_tables)); + printf("Max size of tables = %d\n", from_be32(&feat->datastore.max_table_size)); + printf("\n"); + } + + if (feat->v100.hdr.code) { + printf("Opal V100 feature:\n"); + printf("Base comID = %d, ", from_be16(&feat->v100.base_comid)); + printf("Number of comIDs = %d, ", from_be16(&feat->v100.number_comids)); + printf("Range crossing = %s\n", (feat->v100.range_crossing ? 
"N" : "Y")); + printf("\n"); + } + + if (feat->v200.hdr.code) { + printf("Opal V200 feature:\n"); + printf("Base comID = %d, ", from_be16(&feat->v200.base_comid)); + printf("Number of comIDs = %d, ", from_be16(&feat->v200.num_comids)); + printf("Initial PIN = %d,\n", feat->v200.initial_pin); + printf("Reverted PIN = %d, ", feat->v200.reverted_pin); + printf("Number of admins = %d, ", from_be16(&feat->v200.num_locking_admin_auth)); + printf("Number of users = %d\n", from_be16(&feat->v200.num_locking_user_auth)); + printf("\n"); + } +} + +static void +opal_usage(void) +{ + printf("Opal General Usage:\n"); + printf("\n"); + printf("\t[1: scan device]\n"); + printf("\t[2: init - take ownership and activate locking]\n"); + printf("\t[3: revert tper]\n"); + printf("\t[4: setup locking range]\n"); + printf("\t[5: list locking ranges]\n"); + printf("\t[6: enable user]\n"); + printf("\t[7: set new password]\n"); + printf("\t[8: add user to locking range]\n"); + printf("\t[9: lock/unlock range]\n"); + printf("\t[10: erase locking range]\n"); + printf("\t[0: quit]\n"); +} + +static void +opal_scan(struct dev *iter) +{ + while (getchar() != '\n'); + if (spdk_nvme_ctrlr_get_flags(iter->ctrlr) & SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + iter->opal_dev = spdk_opal_dev_construct(iter->ctrlr); + if (iter->opal_dev == NULL) { + return; + } + + printf("\n\nOpal Supported:\n"); + display_controller(iter, CONTROLLER_DISPLAY_SIMPLISTIC); + opal_dump_info(spdk_opal_get_d0_features_info(iter->opal_dev)); + spdk_opal_dev_destruct(iter->opal_dev); + } else { + printf("%04x:%02x:%02x.%02x: NVMe Security Support/Receive Not supported.\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + printf("%04x:%02x:%02x.%02x: Opal Not Supported\n\n\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + } +} + +static void +opal_init(struct dev *iter) +{ + char new_passwd[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char 
*passwd_p; + int ret; + int ch; + + if (spdk_nvme_ctrlr_get_flags(iter->ctrlr) & SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + iter->opal_dev = spdk_opal_dev_construct(iter->ctrlr); + if (iter->opal_dev == NULL) { + return; + } + printf("Please input the new password for ownership:"); + while ((ch = getchar()) != '\n' && ch != EOF); + passwd_p = get_line(new_passwd, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + printf("\n...\n"); + if (passwd_p) { + ret = spdk_opal_cmd_take_ownership(iter->opal_dev, passwd_p); + if (ret) { + printf("Take ownership failure: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + ret = spdk_opal_cmd_activate_locking_sp(iter->opal_dev, passwd_p); + if (ret) { + printf("Locking SP activate failure: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + printf("...\nOpal Init Success\n"); + } else { + printf("Input password invalid. Opal Init failure\n"); + } + spdk_opal_dev_destruct(iter->opal_dev); + } else { + printf("%04x:%02x:%02x.%02x: NVMe Security Support/Receive Not supported.\nOpal Not Supported\n\n\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + } +} + +static void +opal_locking_usage(void) +{ + printf("Choose Opal locking state:\n"); + printf("\n"); + printf("\t[1: read write lock]\n"); + printf("\t[2: read only]\n"); + printf("\t[3: read write unlock]\n"); +} + +static void +opal_setup_lockingrange(struct dev *iter) +{ + char passwd[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char *passwd_p; + int ret; + int ch; + uint64_t range_start; + uint64_t range_length; + int locking_range_id; + struct spdk_opal_locking_range_info *info; + + if (spdk_nvme_ctrlr_get_flags(iter->ctrlr) & SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + iter->opal_dev = spdk_opal_dev_construct(iter->ctrlr); + if (iter->opal_dev == NULL) { + return; + } + printf("Please input the password for setting up locking range:"); + while ((ch = getchar()) != '\n' && ch != EOF); + passwd_p 
= get_line(passwd, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + printf("\n"); + if (passwd_p) { + printf("Specify locking range id:\n"); + if (!scanf("%d", &locking_range_id)) { + printf("Invalid locking range id\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + printf("range length:\n"); + if (!scanf("%" SCNu64, &range_length)) { + printf("Invalid range length\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + printf("range start:\n"); + if (!scanf("%" SCNu64, &range_start)) { + printf("Invalid range start address\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + while (getchar() != '\n'); + + ret = spdk_opal_cmd_setup_locking_range(iter->opal_dev, + OPAL_ADMIN1, locking_range_id, range_start, range_length, passwd_p); + if (ret) { + printf("Setup locking range failure: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + ret = spdk_opal_cmd_get_locking_range_info(iter->opal_dev, + passwd_p, OPAL_ADMIN1, locking_range_id); + if (ret) { + printf("Get locking range info failure: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + info = spdk_opal_get_locking_range_info(iter->opal_dev, locking_range_id); + + printf("\nlocking range ID: %d\n", info->locking_range_id); + printf("range start: %ld\n", info->range_start); + printf("range length: %ld\n", info->range_length); + printf("read lock enabled: %d\n", info->read_lock_enabled); + printf("write lock enabled: %d\n", info->write_lock_enabled); + printf("read locked: %d\n", info->read_locked); + printf("write locked: %d\n", info->write_locked); + + printf("...\n...\nOpal setup locking range success\n"); + } else { + printf("Input password invalid. 
Opal setup locking range failure\n"); + } + spdk_opal_dev_destruct(iter->opal_dev); + } else { + printf("%04x:%02x:%02x.%02x: NVMe Security Support/Receive Not supported.\nOpal Not Supported\n\n\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + } +} + +static void +opal_list_locking_ranges(struct dev *iter) +{ + char passwd[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char *passwd_p; + int ret; + int ch; + int max_ranges; + int i; + struct spdk_opal_locking_range_info *info; + + if (spdk_nvme_ctrlr_get_flags(iter->ctrlr) & SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + iter->opal_dev = spdk_opal_dev_construct(iter->ctrlr); + if (iter->opal_dev == NULL) { + return; + } + printf("Please input password:"); + while ((ch = getchar()) != '\n' && ch != EOF); + passwd_p = get_line(passwd, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + printf("\n"); + if (passwd_p) { + ret = spdk_opal_cmd_get_max_ranges(iter->opal_dev, passwd_p); + if (ret <= 0) { + printf("get max ranges failure: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + max_ranges = ret; + for (i = 0; i < max_ranges; i++) { + ret = spdk_opal_cmd_get_locking_range_info(iter->opal_dev, + passwd_p, OPAL_ADMIN1, i); + if (ret) { + printf("Get locking range info failure: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + info = spdk_opal_get_locking_range_info(iter->opal_dev, i); + if (info == NULL) { + continue; + } + + printf("===============================================\n"); + printf("locking range ID: %d\t", info->locking_range_id); + if (i == 0) { printf("(Global Range)"); } + printf("\n===============================================\n"); + printf("range start: %ld\t", info->range_start); + printf("range length: %ld\n", info->range_length); + printf("read lock enabled: %d\t", info->read_lock_enabled); + printf("write lock enabled: %d\t", info->write_lock_enabled); + printf("read locked: %d\t", info->read_locked); + printf("write 
locked: %d\n", info->write_locked); + printf("\n"); + } + } else { + printf("Input password invalid. List locking ranges failure\n"); + } + spdk_opal_dev_destruct(iter->opal_dev); + } else { + printf("%04x:%02x:%02x.%02x: NVMe Security Support/Receive Not supported.\nOpal Not Supported\n\n\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + } +} + +static void +opal_new_user_enable(struct dev *iter) +{ + int user_id; + char passwd[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char *passwd_p; + char user_pw[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char *user_pw_p; + int ret; + int ch; + + if (spdk_nvme_ctrlr_get_flags(iter->ctrlr) & SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + iter->opal_dev = spdk_opal_dev_construct(iter->ctrlr); + if (iter->opal_dev == NULL) { + return; + } + printf("Please input admin password:"); + while ((ch = getchar()) != '\n' && ch != EOF); + passwd_p = get_line(passwd, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + printf("\n"); + if (passwd_p) { + printf("which user to enable: "); + if (!scanf("%d", &user_id)) { + printf("Invalid user id\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + ret = spdk_opal_cmd_enable_user(iter->opal_dev, user_id, passwd_p); + if (ret) { + printf("Enable user failure error code: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + printf("Please set a new password for this user:"); + while ((ch = getchar()) != '\n' && ch != EOF); + user_pw_p = get_line(user_pw, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + if (user_pw_p == NULL) { + printf("Input password invalid. 
Enable user failure\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + ret = spdk_opal_cmd_set_new_passwd(iter->opal_dev, user_id, user_pw_p, passwd_p, true); + if (ret) { + printf("Set new password failure error code: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + printf("\n...\n...\nEnable User Success\n"); + } else { + printf("Input password invalid. Enable user failure\n"); + } + spdk_opal_dev_destruct(iter->opal_dev); + } else { + printf("%04x:%02x:%02x.%02x: NVMe Security Support/Receive Not supported.\nOpal Not Supported\n\n\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + } +} + +static void +opal_change_password(struct dev *iter) +{ + int user_id; + char old_passwd[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char *old_passwd_p; + char new_passwd[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char *new_passwd_p; + int ret; + int ch; + + if (spdk_nvme_ctrlr_get_flags(iter->ctrlr) & SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + iter->opal_dev = spdk_opal_dev_construct(iter->ctrlr); + if (iter->opal_dev == NULL) { + return; + } + printf("user id: "); + if (!scanf("%d", &user_id)) { + printf("Invalid user id\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + printf("Password:"); + while ((ch = getchar()) != '\n' && ch != EOF); + old_passwd_p = get_line(old_passwd, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + printf("\n"); + if (old_passwd_p) { + printf("Please input new password:\n"); + new_passwd_p = get_line(new_passwd, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + printf("\n"); + if (new_passwd_p == NULL) { + printf("Input password invalid. 
Change password failure\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + ret = spdk_opal_cmd_set_new_passwd(iter->opal_dev, user_id, new_passwd_p, old_passwd_p, false); + if (ret) { + printf("Set new password failure error code: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + printf("...\n...\nChange password Success\n"); + } else { + printf("Input password invalid. Change password failure\n"); + } + spdk_opal_dev_destruct(iter->opal_dev); + } else { + printf("%04x:%02x:%02x.%02x: NVMe Security Support/Receive Not supported.\nOpal Not Supported\n\n\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + } +} + +static void +opal_add_user_to_locking_range(struct dev *iter) +{ + int locking_range_id, user_id; + char passwd[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char *passwd_p; + int ret; + int ch; + + if (spdk_nvme_ctrlr_get_flags(iter->ctrlr) & SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + iter->opal_dev = spdk_opal_dev_construct(iter->ctrlr); + if (iter->opal_dev == NULL) { + return; + } + printf("Please input admin password:"); + while ((ch = getchar()) != '\n' && ch != EOF); + passwd_p = get_line(passwd, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + printf("\n"); + if (passwd_p) { + printf("Specify locking range id:\n"); + if (!scanf("%d", &locking_range_id)) { + printf("Invalid locking range id\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + printf("which user to enable:\n"); + if (!scanf("%d", &user_id)) { + printf("Invalid user id\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + while (getchar() != '\n'); + + ret = spdk_opal_cmd_add_user_to_locking_range(iter->opal_dev, user_id, locking_range_id, + OPAL_READONLY, passwd_p); + ret += spdk_opal_cmd_add_user_to_locking_range(iter->opal_dev, user_id, locking_range_id, + OPAL_READWRITE, passwd_p); + if (ret) { + printf("Add user to locking range error: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); 
+ return; + } + + printf("...\n...\nAdd user to locking range Success\n"); + } else { + printf("Input password invalid. Add user to locking range failure\n"); + } + spdk_opal_dev_destruct(iter->opal_dev); + } else { + printf("%04x:%02x:%02x.%02x: NVMe Security Support/Receive Not supported.\nOpal Not Supported\n\n\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + } +} + +static void +opal_user_lock_unlock_range(struct dev *iter) +{ + char passwd[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char *passwd_p; + int ch; + int ret; + int user_id; + int locking_range_id; + int state; + enum spdk_opal_lock_state state_flag; + + if (spdk_nvme_ctrlr_get_flags(iter->ctrlr) & SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + iter->opal_dev = spdk_opal_dev_construct(iter->ctrlr); + if (iter->opal_dev == NULL) { + return; + } + printf("User id: "); + if (!scanf("%d", &user_id)) { + printf("Invalid user id\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + printf("Please input password:"); + while ((ch = getchar()) != '\n' && ch != EOF); + passwd_p = get_line(passwd, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + printf("\n"); + if (passwd_p) { + printf("Specify locking range id:\n"); + if (!scanf("%d", &locking_range_id)) { + printf("Invalid locking range id\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + + opal_locking_usage(); + if (!scanf("%d", &state)) { + printf("Invalid option\n"); + } + switch (state) { + case 1: + state_flag = OPAL_RWLOCK; + break; + case 2: + state_flag = OPAL_READONLY; + break; + case 3: + state_flag = OPAL_READWRITE; + break; + default: + printf("Invalid options\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + while (getchar() != '\n'); + + ret = spdk_opal_cmd_lock_unlock(iter->opal_dev, user_id, state_flag, + locking_range_id, passwd_p); + if (ret) { + printf("lock/unlock range failure: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + 
printf("...\n...\nLock/unlock range Success\n"); + } else { + printf("Input password invalid. lock/unlock range failure\n"); + } + spdk_opal_dev_destruct(iter->opal_dev); + } else { + printf("%04x:%02x:%02x.%02x: NVMe Security Support/Receive Not supported.\nOpal Not Supported\n\n\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + } +} + +static void +opal_revert_tper(struct dev *iter) +{ + char passwd[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char *passwd_p; + int ret; + int ch; + + if (spdk_nvme_ctrlr_get_flags(iter->ctrlr) & SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + iter->opal_dev = spdk_opal_dev_construct(iter->ctrlr); + if (iter->opal_dev == NULL) { + return; + } + printf("Please be noted this operation will erase ALL DATA on this drive\n"); + printf("Please don't ternminate this excecution. Otherwise undefined error may occur\n"); + printf("Please input password for revert TPer:"); + while ((ch = getchar()) != '\n' && ch != EOF); + passwd_p = get_line(passwd, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + printf("\n...\n"); + if (passwd_p) { + ret = spdk_opal_cmd_revert_tper(iter->opal_dev, passwd_p); + if (ret) { + printf("Revert TPer failure: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + printf("...\nRevert TPer Success\n"); + } else { + printf("Input password invalid. 
Revert TPer failure\n"); + } + spdk_opal_dev_destruct(iter->opal_dev); + } else { + printf("%04x:%02x:%02x.%02x: NVMe Security Support/Receive Not supported.\nOpal Not Supported\n\n\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + } +} + +static void +opal_erase_locking_range(struct dev *iter) +{ + char passwd[SPDK_OPAL_MAX_PASSWORD_SIZE] = {0}; + char *passwd_p; + int ret; + int ch; + int locking_range_id; + + if (spdk_nvme_ctrlr_get_flags(iter->ctrlr) & SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + iter->opal_dev = spdk_opal_dev_construct(iter->ctrlr); + if (iter->opal_dev == NULL) { + return; + } + printf("Please be noted this operation will erase ALL DATA on this range\n"); + printf("Please input password for erase locking range:"); + while ((ch = getchar()) != '\n' && ch != EOF); + passwd_p = get_line(passwd, SPDK_OPAL_MAX_PASSWORD_SIZE, stdin, true); + if (passwd_p) { + printf("\nSpecify locking range id:\n"); + if (!scanf("%d", &locking_range_id)) { + printf("Invalid locking range id\n"); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + printf("\n...\n"); + ret = spdk_opal_cmd_secure_erase_locking_range(iter->opal_dev, OPAL_ADMIN1, locking_range_id, + passwd_p); + if (ret) { + printf("Erase locking range failure: %d\n", ret); + spdk_opal_dev_destruct(iter->opal_dev); + return; + } + printf("...\nErase locking range Success\n"); + } else { + printf("Input password invalid. 
Erase locking range failure\n"); + } + spdk_opal_dev_destruct(iter->opal_dev); + } else { + printf("%04x:%02x:%02x.%02x: NVMe Security Support/Receive Not supported.\nOpal Not Supported\n\n\n", + iter->pci_addr.domain, iter->pci_addr.bus, iter->pci_addr.dev, iter->pci_addr.func); + } +} + +static void +test_opal(void) +{ + int exit_flag = false; + struct dev *ctrlr; + + ctrlr = get_controller(); + if (ctrlr == NULL) { + printf("Invalid controller PCI Address.\n"); + return; + } + + opal_usage(); + while (!exit_flag) { + int cmd; + if (!scanf("%d", &cmd)) { + printf("Invalid Command: command must be number 0-9\n"); + while (getchar() != '\n'); + opal_usage(); + continue; + } + + switch (cmd) { + case 0: + exit_flag = true; + continue; + case 1: + opal_scan(ctrlr); + break; + case 2: + opal_init(ctrlr); /* Take ownership, Activate Locking SP */ + break; + case 3: + opal_revert_tper(ctrlr); + break; + case 4: + opal_setup_lockingrange(ctrlr); + break; + case 5: + opal_list_locking_ranges(ctrlr); + break; + case 6: + opal_new_user_enable(ctrlr); + break; + case 7: + opal_change_password(ctrlr); + break; + case 8: + opal_add_user_to_locking_range(ctrlr); + break; + case 9: + opal_user_lock_unlock_range(ctrlr); + break; + case 10: + opal_erase_locking_range(ctrlr); + break; + + default: + printf("Invalid option\n"); + } + + printf("\npress Enter to display Opal cmd menu ...\n"); + while (getchar() != '\n'); + opal_usage(); + } +} + +static void +args_usage(const char *program_name) +{ + printf("%s [options]", program_name); + printf("\n"); + printf("options:\n"); + printf(" -i shared memory group ID\n"); +} + +static int +parse_args(int argc, char **argv) +{ + int op; + + while ((op = getopt(argc, argv, "i:")) != -1) { + switch (op) { + case 'i': + g_shm_id = spdk_strtol(optarg, 10); + if (g_shm_id < 0) { + fprintf(stderr, "Invalid shared memory ID\n"); + return g_shm_id; + } + break; + default: + args_usage(argv[0]); + return 1; + } + } + + return 0; +} + +int main(int 
argc, char **argv) +{ + int i, rc; + struct spdk_env_opts opts; + + rc = parse_args(argc, argv); + if (rc != 0) { + return rc; + } + + spdk_env_opts_init(&opts); + opts.name = "nvme_manage"; + opts.core_mask = "0x1"; + opts.shm_id = g_shm_id; + if (spdk_env_init(&opts) < 0) { + fprintf(stderr, "Unable to initialize SPDK env\n"); + return 1; + } + + if (spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, NULL) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed\n"); + return 1; + } + + qsort(devs, num_devs, sizeof(devs[0]), cmp_devs); + + usage(); + + while (1) { + int cmd; + bool exit_flag = false; + + if (!scanf("%d", &cmd)) { + printf("Invalid Command: command must be number 1-8\n"); + while (getchar() != '\n'); + usage(); + continue; + } + switch (cmd) { + case 1: + display_controller_list(); + break; + case 2: + add_ns(); + break; + case 3: + delete_ns(); + break; + case 4: + attach_and_detach_ns(SPDK_NVME_NS_CTRLR_ATTACH); + break; + case 5: + attach_and_detach_ns(SPDK_NVME_NS_CTRLR_DETACH); + break; + case 6: + format_nvm(); + break; + case 7: + update_firmware_image(); + break; + case 8: + test_opal(); + break; + case 9: + exit_flag = true; + break; + default: + printf("Invalid Command\n"); + break; + } + + if (exit_flag) { + break; + } + + while (getchar() != '\n'); + printf("press Enter to display cmd menu ...\n"); + while (getchar() != '\n'); + usage(); + } + + printf("Cleaning up...\n"); + + for (i = 0; i < num_devs; i++) { + struct dev *dev = &devs[i]; + spdk_nvme_detach(dev->ctrlr); + } + + return 0; +} diff --git a/src/spdk/examples/nvme/perf/.gitignore b/src/spdk/examples/nvme/perf/.gitignore new file mode 100644 index 000000000..bd14107d8 --- /dev/null +++ b/src/spdk/examples/nvme/perf/.gitignore @@ -0,0 +1 @@ +perf diff --git a/src/spdk/examples/nvme/perf/Makefile b/src/spdk/examples/nvme/perf/Makefile new file mode 100644 index 000000000..0742f1842 --- /dev/null +++ b/src/spdk/examples/nvme/perf/Makefile @@ -0,0 +1,49 @@ +# +# BSD LICENSE +# +# 
Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) 
+ +APP = perf + +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk + +ifeq ($(OS),Linux) +SYS_LIBS += -laio +CFLAGS += -DHAVE_LIBAIO +endif + +install: $(APP) + $(INSTALL_EXAMPLE) + +uninstall: + $(UNINSTALL_EXAMPLE) diff --git a/src/spdk/examples/nvme/perf/README.md b/src/spdk/examples/nvme/perf/README.md new file mode 100644 index 000000000..e5ec38d12 --- /dev/null +++ b/src/spdk/examples/nvme/perf/README.md @@ -0,0 +1,5 @@ +# Compiling perf on FreeBSD + +To use perf test on FreeBSD over NVMe-oF, explicitly link userspace library of HBA. For example, on a setup with Mellanox HBA, + + LIBS += -lmlx5 diff --git a/src/spdk/examples/nvme/perf/perf.c b/src/spdk/examples/nvme/perf/perf.c new file mode 100644 index 000000000..9e8cf6793 --- /dev/null +++ b/src/spdk/examples/nvme/perf/perf.c @@ -0,0 +1,2308 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/fd.h" +#include "spdk/nvme.h" +#include "spdk/vmd.h" +#include "spdk/queue.h" +#include "spdk/string.h" +#include "spdk/nvme_intel.h" +#include "spdk/histogram_data.h" +#include "spdk/endian.h" +#include "spdk/dif.h" +#include "spdk/util.h" +#include "spdk/log.h" +#include "spdk/likely.h" + +#ifdef SPDK_CONFIG_URING +#include <liburing.h> +#endif + +#if HAVE_LIBAIO +#include <libaio.h> +#endif + +struct ctrlr_entry { + struct spdk_nvme_ctrlr *ctrlr; + enum spdk_nvme_transport_type trtype; + struct spdk_nvme_intel_rw_latency_page *latency_page; + + struct spdk_nvme_qpair **unused_qpairs; + + struct ctrlr_entry *next; + char name[1024]; +}; + +enum entry_type { + ENTRY_TYPE_NVME_NS, + ENTRY_TYPE_AIO_FILE, + ENTRY_TYPE_URING_FILE, +}; + +struct ns_fn_table; + +struct ns_entry { + enum entry_type type; + const struct ns_fn_table *fn_table; + + union { + struct { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ns *ns; + } nvme; +#ifdef SPDK_CONFIG_URING + struct { + int fd; + } uring; +#endif +#if HAVE_LIBAIO + struct { + int fd; + } aio; +#endif + } u; + + struct ns_entry *next; + uint32_t io_size_blocks; + uint32_t num_io_requests; + uint64_t size_in_ios; + uint32_t block_size; + uint32_t md_size; + bool md_interleave; + bool pi_loc; + enum spdk_nvme_pi_type pi_type; + uint32_t io_flags; + char name[1024]; +}; + +static const double g_latency_cutoffs[] = { + 
0.01, + 0.10, + 0.25, + 0.50, + 0.75, + 0.90, + 0.95, + 0.98, + 0.99, + 0.995, + 0.999, + 0.9999, + 0.99999, + 0.999999, + 0.9999999, + -1, +}; + +struct ns_worker_ctx { + struct ns_entry *entry; + uint64_t io_completed; + uint64_t last_io_completed; + uint64_t total_tsc; + uint64_t min_tsc; + uint64_t max_tsc; + uint64_t current_queue_depth; + uint64_t offset_in_ios; + bool is_draining; + + union { + struct { + int num_active_qpairs; + int num_all_qpairs; + struct spdk_nvme_qpair **qpair; + struct spdk_nvme_poll_group *group; + int last_qpair; + } nvme; + +#ifdef SPDK_CONFIG_URING + struct { + struct io_uring ring; + uint64_t io_inflight; + uint64_t io_pending; + struct io_uring_cqe **cqes; + + } uring; +#endif +#if HAVE_LIBAIO + struct { + struct io_event *events; + io_context_t ctx; + } aio; +#endif + } u; + + struct ns_worker_ctx *next; + + struct spdk_histogram_data *histogram; +}; + +struct perf_task { + struct ns_worker_ctx *ns_ctx; + struct iovec iov; + struct iovec md_iov; + uint64_t submit_tsc; + bool is_read; + struct spdk_dif_ctx dif_ctx; +#if HAVE_LIBAIO + struct iocb iocb; +#endif +}; + +struct worker_thread { + struct ns_worker_ctx *ns_ctx; + struct worker_thread *next; + unsigned lcore; +}; + +struct ns_fn_table { + void (*setup_payload)(struct perf_task *task, uint8_t pattern); + + int (*submit_io)(struct perf_task *task, struct ns_worker_ctx *ns_ctx, + struct ns_entry *entry, uint64_t offset_in_ios); + + void (*check_io)(struct ns_worker_ctx *ns_ctx); + + void (*verify_io)(struct perf_task *task, struct ns_entry *entry); + + int (*init_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx); + + void (*cleanup_ns_worker_ctx)(struct ns_worker_ctx *ns_ctx); +}; + +static int g_outstanding_commands; + +static bool g_latency_ssd_tracking_enable; +static int g_latency_sw_tracking_level; + +static bool g_vmd; +static const char *g_workload_type; +static struct ctrlr_entry *g_controllers; +static struct ns_entry *g_namespaces; +static int g_num_namespaces; +static 
struct worker_thread *g_workers; +static int g_num_workers; +static uint32_t g_master_core; + +static uint64_t g_tsc_rate; + +static uint32_t g_io_align = 0x200; +static uint32_t g_io_size_bytes; +static uint32_t g_max_io_md_size; +static uint32_t g_max_io_size_blocks; +static uint32_t g_metacfg_pract_flag; +static uint32_t g_metacfg_prchk_flags; +static int g_rw_percentage = -1; +static int g_is_random; +static int g_queue_depth; +static int g_nr_io_queues_per_ns = 1; +static int g_nr_unused_io_queues; +static int g_time_in_sec; +static uint32_t g_max_completions; +static int g_dpdk_mem; +static int g_shm_id = -1; +static uint32_t g_disable_sq_cmb; +static bool g_use_uring; +static bool g_no_pci; +static bool g_warn; +static bool g_header_digest; +static bool g_data_digest; +static bool g_no_shn_notification; +static bool g_mix_specified; +/* Default to 10 seconds for the keep alive value. This value is arbitrary. */ +static uint32_t g_keep_alive_timeout_in_ms = 10000; + +static const char *g_core_mask; + +struct trid_entry { + struct spdk_nvme_transport_id trid; + uint16_t nsid; + TAILQ_ENTRY(trid_entry) tailq; +}; + +static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list); + +static int g_file_optind; /* Index of first filename in argv */ + +static inline void +task_complete(struct perf_task *task); + +#ifdef SPDK_CONFIG_URING + +static void +uring_setup_payload(struct perf_task *task, uint8_t pattern) +{ + task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); + task->iov.iov_len = g_io_size_bytes; + if (task->iov.iov_base == NULL) { + fprintf(stderr, "spdk_dma_zmalloc() for task->iov.iov_base failed\n"); + exit(1); + } + memset(task->iov.iov_base, pattern, task->iov.iov_len); +} + +static int +uring_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, + struct ns_entry *entry, uint64_t offset_in_ios) +{ + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(&ns_ctx->u.uring.ring); + if (!sqe) { + 
fprintf(stderr, "Cannot get sqe\n"); + return -1; + } + + if (task->is_read) { + io_uring_prep_readv(sqe, entry->u.uring.fd, &task->iov, 1, offset_in_ios * task->iov.iov_len); + } else { + io_uring_prep_writev(sqe, entry->u.uring.fd, &task->iov, 1, offset_in_ios * task->iov.iov_len); + } + + io_uring_sqe_set_data(sqe, task); + ns_ctx->u.uring.io_pending++; + + return 0; +} + +static void +uring_check_io(struct ns_worker_ctx *ns_ctx) +{ + int i, count, to_complete, to_submit, ret = 0; + struct perf_task *task; + + to_submit = ns_ctx->u.uring.io_pending; + + if (to_submit > 0) { + /* If there are I/O to submit, use io_uring_submit here. + * It will automatically call spdk_io_uring_enter appropriately. */ + ret = io_uring_submit(&ns_ctx->u.uring.ring); + if (ret < 0) { + return; + } + ns_ctx->u.uring.io_pending = 0; + ns_ctx->u.uring.io_inflight += to_submit; + } + + to_complete = ns_ctx->u.uring.io_inflight; + if (to_complete > 0) { + count = io_uring_peek_batch_cqe(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes, to_complete); + ns_ctx->u.uring.io_inflight -= count; + for (i = 0; i < count; i++) { + assert(ns_ctx->u.uring.cqes[i] != NULL); + task = (struct perf_task *)ns_ctx->u.uring.cqes[i]->user_data; + if (ns_ctx->u.uring.cqes[i]->res != (int)task->iov.iov_len) { + fprintf(stderr, "cqe[i]->status=%d\n", ns_ctx->u.uring.cqes[i]->res); + exit(0); + } + io_uring_cqe_seen(&ns_ctx->u.uring.ring, ns_ctx->u.uring.cqes[i]); + task_complete(task); + } + } +} + +static void +uring_verify_io(struct perf_task *task, struct ns_entry *entry) +{ +} + +static int +uring_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + if (io_uring_queue_init(g_queue_depth, &ns_ctx->u.uring.ring, 0) < 0) { + SPDK_ERRLOG("uring I/O context setup failure\n"); + return -1; + } + + ns_ctx->u.uring.cqes = calloc(g_queue_depth, sizeof(struct io_uring_cqe *)); + if (!ns_ctx->u.uring.cqes) { + io_uring_queue_exit(&ns_ctx->u.uring.ring); + return -1; + } + + return 0; +} + +static void 
+uring_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + io_uring_queue_exit(&ns_ctx->u.uring.ring); + free(ns_ctx->u.uring.cqes); +} + +static const struct ns_fn_table uring_fn_table = { + .setup_payload = uring_setup_payload, + .submit_io = uring_submit_io, + .check_io = uring_check_io, + .verify_io = uring_verify_io, + .init_ns_worker_ctx = uring_init_ns_worker_ctx, + .cleanup_ns_worker_ctx = uring_cleanup_ns_worker_ctx, +}; + +#endif + +#ifdef HAVE_LIBAIO +static void +aio_setup_payload(struct perf_task *task, uint8_t pattern) +{ + task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); + task->iov.iov_len = g_io_size_bytes; + if (task->iov.iov_base == NULL) { + fprintf(stderr, "spdk_dma_zmalloc() for task->buf failed\n"); + exit(1); + } + memset(task->iov.iov_base, pattern, task->iov.iov_len); +} + +static int +aio_submit(io_context_t aio_ctx, struct iocb *iocb, int fd, enum io_iocb_cmd cmd, + struct iovec *iov, uint64_t offset, void *cb_ctx) +{ + iocb->aio_fildes = fd; + iocb->aio_reqprio = 0; + iocb->aio_lio_opcode = cmd; + iocb->u.c.buf = iov->iov_base; + iocb->u.c.nbytes = iov->iov_len; + iocb->u.c.offset = offset * iov->iov_len; + iocb->data = cb_ctx; + + if (io_submit(aio_ctx, 1, &iocb) < 0) { + printf("io_submit"); + return -1; + } + + return 0; +} + +static int +aio_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, + struct ns_entry *entry, uint64_t offset_in_ios) +{ + if (task->is_read) { + return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PREAD, + &task->iov, offset_in_ios, task); + } else { + return aio_submit(ns_ctx->u.aio.ctx, &task->iocb, entry->u.aio.fd, IO_CMD_PWRITE, + &task->iov, offset_in_ios, task); + } +} + +static void +aio_check_io(struct ns_worker_ctx *ns_ctx) +{ + int count, i; + struct timespec timeout; + + timeout.tv_sec = 0; + timeout.tv_nsec = 0; + + count = io_getevents(ns_ctx->u.aio.ctx, 1, g_queue_depth, ns_ctx->u.aio.events, &timeout); + if (count < 0) { + 
fprintf(stderr, "io_getevents error\n"); + exit(1); + } + + for (i = 0; i < count; i++) { + task_complete(ns_ctx->u.aio.events[i].data); + } +} + +static void +aio_verify_io(struct perf_task *task, struct ns_entry *entry) +{ +} + +static int +aio_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + ns_ctx->u.aio.events = calloc(g_queue_depth, sizeof(struct io_event)); + if (!ns_ctx->u.aio.events) { + return -1; + } + ns_ctx->u.aio.ctx = 0; + if (io_setup(g_queue_depth, &ns_ctx->u.aio.ctx) < 0) { + free(ns_ctx->u.aio.events); + perror("io_setup"); + return -1; + } + return 0; +} + +static void +aio_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + io_destroy(ns_ctx->u.aio.ctx); + free(ns_ctx->u.aio.events); +} + +static const struct ns_fn_table aio_fn_table = { + .setup_payload = aio_setup_payload, + .submit_io = aio_submit_io, + .check_io = aio_check_io, + .verify_io = aio_verify_io, + .init_ns_worker_ctx = aio_init_ns_worker_ctx, + .cleanup_ns_worker_ctx = aio_cleanup_ns_worker_ctx, +}; + +#endif /* HAVE_LIBAIO */ + +#if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING) + +static int +register_file(const char *path) +{ + struct ns_entry *entry; + + int flags, fd; + uint64_t size; + uint32_t blklen; + + if (g_rw_percentage == 100) { + flags = O_RDONLY; + } else if (g_rw_percentage == 0) { + flags = O_WRONLY; + } else { + flags = O_RDWR; + } + + flags |= O_DIRECT; + + fd = open(path, flags); + if (fd < 0) { + fprintf(stderr, "Could not open device %s: %s\n", path, strerror(errno)); + return -1; + } + + size = spdk_fd_get_size(fd); + if (size == 0) { + fprintf(stderr, "Could not determine size of device %s\n", path); + close(fd); + return -1; + } + + blklen = spdk_fd_get_blocklen(fd); + if (blklen == 0) { + fprintf(stderr, "Could not determine block size of device %s\n", path); + close(fd); + return -1; + } + + /* + * TODO: This should really calculate the LCM of the current g_io_align and blklen. 
+ * For now, it's fairly safe to just assume all block sizes are powers of 2. + */ + if (g_io_align < blklen) { + g_io_align = blklen; + } + + entry = malloc(sizeof(struct ns_entry)); + if (entry == NULL) { + close(fd); + perror("ns_entry malloc"); + return -1; + } + + if (g_use_uring) { +#ifdef SPDK_CONFIG_URING + entry->type = ENTRY_TYPE_URING_FILE; + entry->fn_table = &uring_fn_table; + entry->u.uring.fd = fd; +#endif + } else { +#if HAVE_LIBAIO + entry->type = ENTRY_TYPE_AIO_FILE; + entry->fn_table = &aio_fn_table; + entry->u.aio.fd = fd; +#endif + } + entry->size_in_ios = size / g_io_size_bytes; + entry->io_size_blocks = g_io_size_bytes / blklen; + + snprintf(entry->name, sizeof(entry->name), "%s", path); + + g_num_namespaces++; + entry->next = g_namespaces; + g_namespaces = entry; + + return 0; +} + +static int +register_files(int argc, char **argv) +{ + int i; + + /* Treat everything after the options as files for AIO/URING */ + for (i = g_file_optind; i < argc; i++) { + if (register_file(argv[i]) != 0) { + return 1; + } + } + + return 0; +} +#endif + +static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl); + +static void +nvme_setup_payload(struct perf_task *task, uint8_t pattern) +{ + uint32_t max_io_size_bytes, max_io_md_size; + + /* maximum extended lba format size from all active namespace, + * it's same with g_io_size_bytes for namespace without metadata. 
+ */ + max_io_size_bytes = g_io_size_bytes + g_max_io_md_size * g_max_io_size_blocks; + task->iov.iov_base = spdk_dma_zmalloc(max_io_size_bytes, g_io_align, NULL); + task->iov.iov_len = max_io_size_bytes; + if (task->iov.iov_base == NULL) { + fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n"); + exit(1); + } + memset(task->iov.iov_base, pattern, task->iov.iov_len); + + max_io_md_size = g_max_io_md_size * g_max_io_size_blocks; + if (max_io_md_size != 0) { + task->md_iov.iov_base = spdk_dma_zmalloc(max_io_md_size, g_io_align, NULL); + task->md_iov.iov_len = max_io_md_size; + if (task->md_iov.iov_base == NULL) { + fprintf(stderr, "task->md_buf spdk_dma_zmalloc failed\n"); + spdk_dma_free(task->iov.iov_base); + exit(1); + } + } +} + +static int +nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, + struct ns_entry *entry, uint64_t offset_in_ios) +{ + uint64_t lba; + int rc; + int qp_num; + + enum dif_mode { + DIF_MODE_NONE = 0, + DIF_MODE_DIF = 1, + DIF_MODE_DIX = 2, + } mode = DIF_MODE_NONE; + + lba = offset_in_ios * entry->io_size_blocks; + + if (entry->md_size != 0 && !(entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) { + if (entry->md_interleave) { + mode = DIF_MODE_DIF; + } else { + mode = DIF_MODE_DIX; + } + } + + qp_num = ns_ctx->u.nvme.last_qpair; + ns_ctx->u.nvme.last_qpair++; + if (ns_ctx->u.nvme.last_qpair == ns_ctx->u.nvme.num_active_qpairs) { + ns_ctx->u.nvme.last_qpair = 0; + } + + if (mode != DIF_MODE_NONE) { + rc = spdk_dif_ctx_init(&task->dif_ctx, entry->block_size, entry->md_size, + entry->md_interleave, entry->pi_loc, + (enum spdk_dif_type)entry->pi_type, entry->io_flags, + lba, 0xFFFF, (uint16_t)entry->io_size_blocks, 0, 0); + if (rc != 0) { + fprintf(stderr, "Initialization of DIF context failed\n"); + exit(1); + } + } + + if (task->is_read) { + return spdk_nvme_ns_cmd_read_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], + task->iov.iov_base, task->md_iov.iov_base, + lba, + entry->io_size_blocks, io_complete, + task, 
entry->io_flags, + task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); + } else { + switch (mode) { + case DIF_MODE_DIF: + rc = spdk_dif_generate(&task->iov, 1, entry->io_size_blocks, &task->dif_ctx); + if (rc != 0) { + fprintf(stderr, "Generation of DIF failed\n"); + return rc; + } + break; + case DIF_MODE_DIX: + rc = spdk_dix_generate(&task->iov, 1, &task->md_iov, entry->io_size_blocks, + &task->dif_ctx); + if (rc != 0) { + fprintf(stderr, "Generation of DIX failed\n"); + return rc; + } + break; + default: + break; + } + + return spdk_nvme_ns_cmd_write_with_md(entry->u.nvme.ns, ns_ctx->u.nvme.qpair[qp_num], + task->iov.iov_base, task->md_iov.iov_base, + lba, + entry->io_size_blocks, io_complete, + task, entry->io_flags, + task->dif_ctx.apptag_mask, task->dif_ctx.app_tag); + } +} + +static void +perf_disconnect_cb(struct spdk_nvme_qpair *qpair, void *ctx) +{ + +} + +static void +nvme_check_io(struct ns_worker_ctx *ns_ctx) +{ + int64_t rc; + + rc = spdk_nvme_poll_group_process_completions(ns_ctx->u.nvme.group, 0, perf_disconnect_cb); + if (rc < 0) { + fprintf(stderr, "NVMe io qpair process completion error\n"); + exit(1); + } +} + +static void +nvme_verify_io(struct perf_task *task, struct ns_entry *entry) +{ + struct spdk_dif_error err_blk = {}; + int rc; + + if (!task->is_read || (entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT)) { + return; + } + + if (entry->md_interleave) { + rc = spdk_dif_verify(&task->iov, 1, entry->io_size_blocks, &task->dif_ctx, + &err_blk); + if (rc != 0) { + fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + } + } else { + rc = spdk_dix_verify(&task->iov, 1, &task->md_iov, entry->io_size_blocks, + &task->dif_ctx, &err_blk); + if (rc != 0) { + fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + } + } +} + +/* + * TODO: If a controller has multiple namespaces, they could all use the same queue. 
+ * For now, give each namespace/thread combination its own queue. + */ +static int +nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + struct spdk_nvme_io_qpair_opts opts; + struct ns_entry *entry = ns_ctx->entry; + struct spdk_nvme_poll_group *group; + struct spdk_nvme_qpair *qpair; + int i; + + ns_ctx->u.nvme.num_active_qpairs = g_nr_io_queues_per_ns; + ns_ctx->u.nvme.num_all_qpairs = g_nr_io_queues_per_ns + g_nr_unused_io_queues; + ns_ctx->u.nvme.qpair = calloc(ns_ctx->u.nvme.num_all_qpairs, sizeof(struct spdk_nvme_qpair *)); + if (!ns_ctx->u.nvme.qpair) { + return -1; + } + + spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->u.nvme.ctrlr, &opts, sizeof(opts)); + if (opts.io_queue_requests < entry->num_io_requests) { + opts.io_queue_requests = entry->num_io_requests; + } + opts.delay_cmd_submit = true; + opts.create_only = true; + + ns_ctx->u.nvme.group = spdk_nvme_poll_group_create(NULL); + if (ns_ctx->u.nvme.group == NULL) { + goto poll_group_failed; + } + + group = ns_ctx->u.nvme.group; + for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) { + ns_ctx->u.nvme.qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->u.nvme.ctrlr, &opts, + sizeof(opts)); + qpair = ns_ctx->u.nvme.qpair[i]; + if (!qpair) { + printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n"); + goto qpair_failed; + } + + if (spdk_nvme_poll_group_add(group, qpair)) { + printf("ERROR: unable to add I/O qpair to poll group.\n"); + spdk_nvme_ctrlr_free_io_qpair(qpair); + goto qpair_failed; + } + + if (spdk_nvme_ctrlr_connect_io_qpair(entry->u.nvme.ctrlr, qpair)) { + printf("ERROR: unable to connect I/O qpair.\n"); + spdk_nvme_poll_group_remove(group, qpair); + spdk_nvme_ctrlr_free_io_qpair(qpair); + goto qpair_failed; + } + } + + return 0; + +qpair_failed: + for (; i > 0; --i) { + spdk_nvme_poll_group_remove(ns_ctx->u.nvme.group, ns_ctx->u.nvme.qpair[i - 1]); + spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i - 1]); + } + + spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group); 
+poll_group_failed: + free(ns_ctx->u.nvme.qpair); + return -1; +} + +static void +nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + int i; + + for (i = 0; i < ns_ctx->u.nvme.num_all_qpairs; i++) { + spdk_nvme_poll_group_remove(ns_ctx->u.nvme.group, ns_ctx->u.nvme.qpair[i]); + spdk_nvme_ctrlr_free_io_qpair(ns_ctx->u.nvme.qpair[i]); + } + + spdk_nvme_poll_group_destroy(ns_ctx->u.nvme.group); + free(ns_ctx->u.nvme.qpair); +} + +static const struct ns_fn_table nvme_fn_table = { + .setup_payload = nvme_setup_payload, + .submit_io = nvme_submit_io, + .check_io = nvme_check_io, + .verify_io = nvme_verify_io, + .init_ns_worker_ctx = nvme_init_ns_worker_ctx, + .cleanup_ns_worker_ctx = nvme_cleanup_ns_worker_ctx, +}; + +static int +build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport_id *trid; + int res = 0; + + trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); + + switch (trid->trtype) { + case SPDK_NVME_TRANSPORT_PCIE: + res = snprintf(name, length, "PCIE (%s)", trid->traddr); + break; + case SPDK_NVME_TRANSPORT_RDMA: + res = snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + break; + case SPDK_NVME_TRANSPORT_TCP: + res = snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + break; + + default: + fprintf(stderr, "Unknown transport type %d\n", trid->trtype); + break; + } + return res; +} + +static void +build_nvme_ns_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + int res = 0; + + res = build_nvme_name(name, length, ctrlr); + if (res > 0) { + snprintf(name + res, length - res, " NSID %u", nsid); + } + +} + +static void +register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) +{ + struct ns_entry *entry; + const struct spdk_nvme_ctrlr_data *cdata; + uint32_t max_xfer_size, entries, sector_size; + uint64_t ns_size; + struct spdk_nvme_io_qpair_opts opts; + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + if 
(!spdk_nvme_ns_is_active(ns)) { + printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", + cdata->mn, cdata->sn, + spdk_nvme_ns_get_id(ns)); + g_warn = true; + return; + } + + ns_size = spdk_nvme_ns_get_size(ns); + sector_size = spdk_nvme_ns_get_sector_size(ns); + + if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) { + printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " + "ns size %" PRIu64 " / block size %u for I/O size %u\n", + cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), + ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes); + g_warn = true; + return; + } + + max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); + spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); + /* NVMe driver may add additional entries based on + * stripe size and maximum transfer size, we assume + * 1 more entry be used for stripe. + */ + entries = (g_io_size_bytes - 1) / max_xfer_size + 2; + if ((g_queue_depth * entries) > opts.io_queue_size) { + printf("controller IO queue size %u less than required\n", + opts.io_queue_size); + printf("Consider using lower queue depth or small IO size because " + "IO requests may be queued at the NVMe driver.\n"); + } + /* For requests which have children requests, parent request itself + * will also occupy 1 entry. 
+ */ + entries += 1; + + entry = calloc(1, sizeof(struct ns_entry)); + if (entry == NULL) { + perror("ns_entry malloc"); + exit(1); + } + + entry->type = ENTRY_TYPE_NVME_NS; + entry->fn_table = &nvme_fn_table; + entry->u.nvme.ctrlr = ctrlr; + entry->u.nvme.ns = ns; + entry->num_io_requests = g_queue_depth * entries; + + entry->size_in_ios = ns_size / g_io_size_bytes; + entry->io_size_blocks = g_io_size_bytes / sector_size; + + entry->block_size = spdk_nvme_ns_get_extended_sector_size(ns); + entry->md_size = spdk_nvme_ns_get_md_size(ns); + entry->md_interleave = spdk_nvme_ns_supports_extended_lba(ns); + entry->pi_loc = spdk_nvme_ns_get_data(ns)->dps.md_start; + entry->pi_type = spdk_nvme_ns_get_pi_type(ns); + + if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) { + entry->io_flags = g_metacfg_pract_flag | g_metacfg_prchk_flags; + } + + /* If metadata size = 8 bytes, PI is stripped (read) or inserted (write), + * and so reduce metadata size from block size. (If metadata size > 8 bytes, + * PI is passed (read) or replaced (write). So block size is not necessary + * to change.) 
+ */ + if ((entry->io_flags & SPDK_NVME_IO_FLAGS_PRACT) && (entry->md_size == 8)) { + entry->block_size = spdk_nvme_ns_get_sector_size(ns); + } + + if (g_max_io_md_size < entry->md_size) { + g_max_io_md_size = entry->md_size; + } + + if (g_max_io_size_blocks < entry->io_size_blocks) { + g_max_io_size_blocks = entry->io_size_blocks; + } + + build_nvme_ns_name(entry->name, sizeof(entry->name), ctrlr, spdk_nvme_ns_get_id(ns)); + + g_num_namespaces++; + entry->next = g_namespaces; + g_namespaces = entry; +} + +static void +unregister_namespaces(void) +{ + struct ns_entry *entry = g_namespaces; + + while (entry) { + struct ns_entry *next = entry->next; + free(entry); + entry = next; + } +} + +static void +enable_latency_tracking_complete(void *cb_arg, const struct spdk_nvme_cpl *cpl) +{ + if (spdk_nvme_cpl_is_error(cpl)) { + printf("enable_latency_tracking_complete failed\n"); + } + g_outstanding_commands--; +} + +static void +set_latency_tracking_feature(struct spdk_nvme_ctrlr *ctrlr, bool enable) +{ + int res; + union spdk_nvme_intel_feat_latency_tracking latency_tracking; + + if (enable) { + latency_tracking.bits.enable = 0x01; + } else { + latency_tracking.bits.enable = 0x00; + } + + res = spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING, + latency_tracking.raw, 0, NULL, 0, enable_latency_tracking_complete, NULL); + if (res) { + printf("fail to allocate nvme request.\n"); + return; + } + g_outstanding_commands++; + + while (g_outstanding_commands) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr); + } +} + +static void +register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry) +{ + struct spdk_nvme_ns *ns; + struct ctrlr_entry *entry = malloc(sizeof(struct ctrlr_entry)); + uint32_t nsid; + + if (entry == NULL) { + perror("ctrlr_entry malloc"); + exit(1); + } + + entry->latency_page = spdk_dma_zmalloc(sizeof(struct spdk_nvme_intel_rw_latency_page), + 4096, NULL); + if (entry->latency_page == NULL) { + 
printf("Allocation error (latency page)\n"); + exit(1); + } + + build_nvme_name(entry->name, sizeof(entry->name), ctrlr); + + entry->ctrlr = ctrlr; + entry->trtype = trid_entry->trid.trtype; + entry->next = g_controllers; + g_controllers = entry; + + if (g_latency_ssd_tracking_enable && + spdk_nvme_ctrlr_is_feature_supported(ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) { + set_latency_tracking_feature(ctrlr, true); + } + + if (trid_entry->nsid == 0) { + for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + continue; + } + register_ns(ctrlr, ns); + } + } else { + ns = spdk_nvme_ctrlr_get_ns(ctrlr, trid_entry->nsid); + if (!ns) { + perror("Namespace does not exist."); + exit(1); + } + + register_ns(ctrlr, ns); + } +} + +static __thread unsigned int seed = 0; + +static inline void +submit_single_io(struct perf_task *task) +{ + uint64_t offset_in_ios; + int rc; + struct ns_worker_ctx *ns_ctx = task->ns_ctx; + struct ns_entry *entry = ns_ctx->entry; + + if (g_is_random) { + offset_in_ios = rand_r(&seed) % entry->size_in_ios; + } else { + offset_in_ios = ns_ctx->offset_in_ios++; + if (ns_ctx->offset_in_ios == entry->size_in_ios) { + ns_ctx->offset_in_ios = 0; + } + } + + task->submit_tsc = spdk_get_ticks(); + + if ((g_rw_percentage == 100) || + (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) { + task->is_read = true; + } else { + task->is_read = false; + } + + rc = entry->fn_table->submit_io(task, ns_ctx, entry, offset_in_ios); + + if (spdk_unlikely(rc != 0)) { + fprintf(stderr, "starting I/O failed\n"); + } else { + ns_ctx->current_queue_depth++; + } +} + +static inline void +task_complete(struct perf_task *task) +{ + struct ns_worker_ctx *ns_ctx; + uint64_t tsc_diff; + struct ns_entry *entry; + + ns_ctx = task->ns_ctx; + entry = ns_ctx->entry; + ns_ctx->current_queue_depth--; + ns_ctx->io_completed++; + 
tsc_diff = spdk_get_ticks() - task->submit_tsc; + ns_ctx->total_tsc += tsc_diff; + if (spdk_unlikely(ns_ctx->min_tsc > tsc_diff)) { + ns_ctx->min_tsc = tsc_diff; + } + if (spdk_unlikely(ns_ctx->max_tsc < tsc_diff)) { + ns_ctx->max_tsc = tsc_diff; + } + if (spdk_unlikely(g_latency_sw_tracking_level > 0)) { + spdk_histogram_data_tally(ns_ctx->histogram, tsc_diff); + } + + if (spdk_unlikely(entry->md_size > 0)) { + /* add application level verification for end-to-end data protection */ + entry->fn_table->verify_io(task, entry); + } + + /* + * is_draining indicates when time has expired for the test run + * and we are just waiting for the previously submitted I/O + * to complete. In this case, do not submit a new I/O to replace + * the one just completed. + */ + if (spdk_unlikely(ns_ctx->is_draining)) { + spdk_dma_free(task->iov.iov_base); + spdk_dma_free(task->md_iov.iov_base); + free(task); + } else { + submit_single_io(task); + } +}

/*
 * NVMe I/O completion callback: logs an error completion and then runs
 * the common per-task completion path.
 */
static void
io_complete(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct perf_task *task = ctx;

	if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
		fprintf(stderr, "%s completed with error (sct=%d, sc=%d)\n",
			task->is_read ? "Read" : "Write",
			cpl->status.sct, cpl->status.sc);
	}

	task_complete(task);
}

/*
 * Allocate a zeroed task, let the entry's setup_payload hook prepare its
 * buffers (the second argument is a pattern value in 1..8 derived from
 * queue_depth) and bind it to ns_ctx. Exits the process on allocation
 * failure.
 */
static struct perf_task *
allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth)
{
	struct perf_task *task;

	task = calloc(1, sizeof(*task));
	if (task == NULL) {
		fprintf(stderr, "Out of memory allocating tasks\n");
		exit(1);
	}

	ns_ctx->entry->fn_table->setup_payload(task, queue_depth % 8 + 1);

	task->ns_ctx = ns_ctx;

	return task;
}

/* Allocate and submit queue_depth initial tasks for ns_ctx. */
static void
submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth)
{
	struct perf_task *task;

	while (queue_depth-- > 0) {
		task = allocate_task(ns_ctx, queue_depth);
		submit_single_io(task);
	}
}

/* Dispatch to the entry-type specific worker context initializer. */
static int
init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	return ns_ctx->entry->fn_table->init_ns_worker_ctx(ns_ctx);
}

/* Dispatch to the entry-type specific worker context cleanup. */
static void
cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
{
	ns_ctx->entry->fn_table->cleanup_ns_worker_ctx(ns_ctx);
}

/*
 * Print the number of I/Os completed across all workers since the previous
 * call, and the corresponding MiB/s, on a single rewritten terminal line.
 * Does nothing when stdout is not a terminal.
 */
static void
print_periodic_performance(void)
{
	uint64_t io_this_second;
	double mb_this_second;
	struct worker_thread *worker;
	struct ns_worker_ctx *ns_ctx;

	if (!isatty(STDOUT_FILENO)) {
		/* Don't print periodic stats if output is not going
		 * to a terminal.
		 */
		return;
	}

	io_this_second = 0;
	worker = g_workers;
	while (worker) {
		ns_ctx = worker->ns_ctx;
		while (ns_ctx) {
			io_this_second += ns_ctx->io_completed - ns_ctx->last_io_completed;
			ns_ctx->last_io_completed = ns_ctx->io_completed;
			ns_ctx = ns_ctx->next;
		}
		worker = worker->next;
	}

	mb_this_second = (double)io_this_second * g_io_size_bytes / (1024 * 1024);
	/* %ju requires a uintmax_t argument; uint64_t is not guaranteed to match. */
	printf("%9ju IOPS, %8.2f MiB/s\r", (uintmax_t)io_this_second, mb_this_second);
	fflush(stdout);
}

+static int +work_fn(void *arg) +{ + uint64_t tsc_end, tsc_current, tsc_next_print; + struct worker_thread *worker = (struct worker_thread *)arg; + struct ns_worker_ctx *ns_ctx = NULL; + uint32_t unfinished_ns_ctx; + + /* Allocate queue pairs for each namespace.
*/ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + if (init_ns_worker_ctx(ns_ctx) != 0) { + printf("ERROR: init_ns_worker_ctx() failed\n"); + return 1; + } + ns_ctx = ns_ctx->next; + } + + tsc_current = spdk_get_ticks(); + tsc_end = tsc_current + g_time_in_sec * g_tsc_rate; + tsc_next_print = tsc_current + g_tsc_rate; + + /* Submit initial I/O for each namespace. */ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + submit_io(ns_ctx, g_queue_depth); + ns_ctx = ns_ctx->next; + } + + while (1) { + /* + * Check for completed I/O for each controller. A new + * I/O will be submitted in the io_complete callback + * to replace each I/O that is completed. + */ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + ns_ctx->entry->fn_table->check_io(ns_ctx); + ns_ctx = ns_ctx->next; + } + + tsc_current = spdk_get_ticks(); + + if (worker->lcore == g_master_core && tsc_current > tsc_next_print) { + tsc_next_print += g_tsc_rate; + print_periodic_performance(); + } + + if (tsc_current > tsc_end) { + break; + } + } + + /* drain the io of each ns_ctx in round robin to make the fairness */ + do { + unfinished_ns_ctx = 0; + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + /* first time will enter into this if case */ + if (!ns_ctx->is_draining) { + ns_ctx->is_draining = true; + } + + if (ns_ctx->current_queue_depth > 0) { + ns_ctx->entry->fn_table->check_io(ns_ctx); + if (ns_ctx->current_queue_depth == 0) { + cleanup_ns_worker_ctx(ns_ctx); + } else { + unfinished_ns_ctx++; + } + } + ns_ctx = ns_ctx->next; + } + } while (unfinished_ns_ctx > 0); + + return 0; +} + +static void usage(char *program_name) +{ + printf("%s options", program_name); +#if defined(SPDK_CONFIG_URING) || defined(HAVE_LIBAIO) + printf(" [Kernel device(s)]..."); +#endif + printf("\n"); + printf("\t[-q io depth]\n"); + printf("\t[-o io size in bytes]\n"); + printf("\t[-P number of io queues per namespace. default: 1]\n"); + printf("\t[-U number of unused io queues per controller. 
default: 0]\n"); + printf("\t[-w io pattern type, must be one of\n"); + printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n"); + printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n"); + printf("\t[-L enable latency tracking via sw, default: disabled]\n"); + printf("\t\t-L for latency summary, -LL for detailed histogram\n"); + printf("\t[-l enable latency tracking via ssd (if supported), default: disabled]\n"); + printf("\t[-t time in seconds]\n"); + printf("\t[-c core mask for I/O submission/completion.]\n"); + printf("\t\t(default: 1)\n"); + printf("\t[-D disable submission queue in controller memory buffer, default: enabled]\n"); + printf("\t[-H enable header digest for TCP transport, default: disabled]\n"); + printf("\t[-I enable data digest for TCP transport, default: disabled]\n"); + printf("\t[-N no shutdown notification process for controllers, default: disabled]\n"); + printf("\t[-r Transport ID for local PCIe NVMe or NVMeoF]\n"); + printf("\t Format: 'key:value [key:value] ...'\n"); + printf("\t Keys:\n"); + printf("\t trtype Transport type (e.g. PCIe, RDMA)\n"); + printf("\t adrfam Address family (e.g. IPv4, IPv6)\n"); + printf("\t traddr Transport address (e.g. 0000:04:00.0 for PCIe or 192.168.100.8 for RDMA)\n"); + printf("\t trsvcid Transport service identifier (e.g. 
4420)\n"); + printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN); + printf("\t Example: -r 'trtype:PCIe traddr:0000:04:00.0' for PCIe or\n"); + printf("\t -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n"); + printf("\t[-e metadata configuration]\n"); + printf("\t Keys:\n"); + printf("\t PRACT Protection Information Action bit (PRACT=1 or PRACT=0)\n"); + printf("\t PRCHK Control of Protection Information Checking (PRCHK=GUARD|REFTAG|APPTAG)\n"); + printf("\t Example: -e 'PRACT=0,PRCHK=GUARD|REFTAG|APPTAG'\n"); + printf("\t -e 'PRACT=1,PRCHK=GUARD'\n"); + printf("\t[-k keep alive timeout period in millisecond]\n"); + printf("\t[-s DPDK huge memory size in MB.]\n"); + printf("\t[-C max completions per poll]\n"); + printf("\t\t(default: 0 - unlimited)\n"); + printf("\t[-i shared memory group ID]\n"); + printf("\t"); + spdk_log_usage(stdout, "-T"); +#ifdef SPDK_CONFIG_URING + printf("\t[-R enable using liburing to drive kernel devices (Default: libaio)]\n"); +#endif +#ifdef DEBUG + printf("\t[-G enable debug logging]\n"); +#else + printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)\n"); +#endif +} + +static void +check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count, + uint64_t total, uint64_t so_far) +{ + double so_far_pct; + double **cutoff = ctx; + + if (count == 0) { + return; + } + + so_far_pct = (double)so_far / total; + while (so_far_pct >= **cutoff && **cutoff > 0) { + printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * 1000 * 1000 / g_tsc_rate); + (*cutoff)++; + } +} + +static void +print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count, + uint64_t total, uint64_t so_far) +{ + double so_far_pct; + + if (count == 0) { + return; + } + + so_far_pct = (double)so_far * 100 / total; + printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n", + (double)start * 1000 * 1000 / g_tsc_rate, + (double)end * 1000 * 1000 / g_tsc_rate, + so_far_pct, count); +} + +static 
void +print_performance(void) +{ + uint64_t total_io_completed, total_io_tsc; + double io_per_second, mb_per_second, average_latency, min_latency, max_latency; + double sum_ave_latency, min_latency_so_far, max_latency_so_far; + double total_io_per_second, total_mb_per_second; + int ns_count; + struct worker_thread *worker; + struct ns_worker_ctx *ns_ctx; + uint32_t max_strlen; + + total_io_per_second = 0; + total_mb_per_second = 0; + total_io_completed = 0; + total_io_tsc = 0; + min_latency_so_far = (double)UINT64_MAX; + max_latency_so_far = 0; + ns_count = 0; + + max_strlen = 0; + worker = g_workers; + while (worker) { + ns_ctx = worker->ns_ctx; + while (ns_ctx) { + max_strlen = spdk_max(strlen(ns_ctx->entry->name), max_strlen); + ns_ctx = ns_ctx->next; + } + worker = worker->next; + } + + printf("========================================================\n"); + printf("%*s\n", max_strlen + 60, "Latency(us)"); + printf("%-*s: %10s %10s %10s %10s %10s\n", + max_strlen + 13, "Device Information", "IOPS", "MiB/s", "Average", "min", "max"); + + worker = g_workers; + while (worker) { + ns_ctx = worker->ns_ctx; + while (ns_ctx) { + if (ns_ctx->io_completed != 0) { + io_per_second = (double)ns_ctx->io_completed / g_time_in_sec; + mb_per_second = io_per_second * g_io_size_bytes / (1024 * 1024); + average_latency = ((double)ns_ctx->total_tsc / ns_ctx->io_completed) * 1000 * 1000 / g_tsc_rate; + min_latency = (double)ns_ctx->min_tsc * 1000 * 1000 / g_tsc_rate; + if (min_latency < min_latency_so_far) { + min_latency_so_far = min_latency; + } + + max_latency = (double)ns_ctx->max_tsc * 1000 * 1000 / g_tsc_rate; + if (max_latency > max_latency_so_far) { + max_latency_so_far = max_latency; + } + + printf("%-*.*s from core %2u: %10.2f %10.2f %10.2f %10.2f %10.2f\n", + max_strlen, max_strlen, ns_ctx->entry->name, worker->lcore, + io_per_second, mb_per_second, + average_latency, min_latency, max_latency); + total_io_per_second += io_per_second; + total_mb_per_second += 
mb_per_second; + total_io_completed += ns_ctx->io_completed; + total_io_tsc += ns_ctx->total_tsc; + ns_count++; + } + ns_ctx = ns_ctx->next; + } + worker = worker->next; + } + + if (ns_count != 0 && total_io_completed) { + sum_ave_latency = ((double)total_io_tsc / total_io_completed) * 1000 * 1000 / g_tsc_rate; + printf("========================================================\n"); + printf("%-*s: %10.2f %10.2f %10.2f %10.2f %10.2f\n", + max_strlen + 13, "Total", total_io_per_second, total_mb_per_second, + sum_ave_latency, min_latency_so_far, max_latency_so_far); + printf("\n"); + } + + if (g_latency_sw_tracking_level == 0 || total_io_completed == 0) { + return; + } + + worker = g_workers; + while (worker) { + ns_ctx = worker->ns_ctx; + while (ns_ctx) { + const double *cutoff = g_latency_cutoffs; + + printf("Summary latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore); + printf("=================================================================================\n"); + + spdk_histogram_data_iterate(ns_ctx->histogram, check_cutoff, &cutoff); + + printf("\n"); + ns_ctx = ns_ctx->next; + } + worker = worker->next; + } + + if (g_latency_sw_tracking_level == 1) { + return; + } + + worker = g_workers; + while (worker) { + ns_ctx = worker->ns_ctx; + while (ns_ctx) { + printf("Latency histogram for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore); + printf("==============================================================================\n"); + printf(" Range in us Cumulative IO count\n"); + + spdk_histogram_data_iterate(ns_ctx->histogram, print_bucket, NULL); + printf("\n"); + ns_ctx = ns_ctx->next; + } + worker = worker->next; + } + +} + +static void +print_latency_page(struct ctrlr_entry *entry) +{ + int i; + + printf("\n"); + printf("%s\n", entry->name); + printf("--------------------------------------------------------\n"); + + for (i = 0; i < 32; i++) { + if (entry->latency_page->buckets_32us[i]) { + printf("Bucket %dus - %dus: 
%d\n", i * 32, (i + 1) * 32, entry->latency_page->buckets_32us[i]); + } + } + for (i = 0; i < 31; i++) { + if (entry->latency_page->buckets_1ms[i]) { + printf("Bucket %dms - %dms: %d\n", i + 1, i + 2, entry->latency_page->buckets_1ms[i]); + } + } + for (i = 0; i < 31; i++) { + if (entry->latency_page->buckets_32ms[i]) + printf("Bucket %dms - %dms: %d\n", (i + 1) * 32, (i + 2) * 32, + entry->latency_page->buckets_32ms[i]); + } +} + +static void +print_latency_statistics(const char *op_name, enum spdk_nvme_intel_log_page log_page) +{ + struct ctrlr_entry *ctrlr; + + printf("%s Latency Statistics:\n", op_name); + printf("========================================================\n"); + ctrlr = g_controllers; + while (ctrlr) { + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) { + if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr->ctrlr, log_page, SPDK_NVME_GLOBAL_NS_TAG, + ctrlr->latency_page, sizeof(struct spdk_nvme_intel_rw_latency_page), 0, + enable_latency_tracking_complete, + NULL)) { + printf("nvme_ctrlr_cmd_get_log_page() failed\n"); + exit(1); + } + + g_outstanding_commands++; + } else { + printf("Controller %s: %s latency statistics not supported\n", ctrlr->name, op_name); + } + ctrlr = ctrlr->next; + } + + while (g_outstanding_commands) { + ctrlr = g_controllers; + while (ctrlr) { + spdk_nvme_ctrlr_process_admin_completions(ctrlr->ctrlr); + ctrlr = ctrlr->next; + } + } + + ctrlr = g_controllers; + while (ctrlr) { + if (spdk_nvme_ctrlr_is_log_page_supported(ctrlr->ctrlr, log_page)) { + print_latency_page(ctrlr); + } + ctrlr = ctrlr->next; + } + printf("\n"); +} + +static void +print_stats(void) +{ + print_performance(); + if (g_latency_ssd_tracking_enable) { + if (g_rw_percentage != 0) { + print_latency_statistics("Read", SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY); + } + if (g_rw_percentage != 100) { + print_latency_statistics("Write", SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY); + } + } +} + +static void +unregister_trids(void) +{ + struct trid_entry 
*trid_entry, *tmp; + + TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) { + TAILQ_REMOVE(&g_trid_list, trid_entry, tailq); + free(trid_entry); + } +} + +static int +add_trid(const char *trid_str) +{ + struct trid_entry *trid_entry; + struct spdk_nvme_transport_id *trid; + char *ns; + + trid_entry = calloc(1, sizeof(*trid_entry)); + if (trid_entry == NULL) { + return -1; + } + + trid = &trid_entry->trid; + trid->trtype = SPDK_NVME_TRANSPORT_PCIE; + snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); + + if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) { + fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str); + free(trid_entry); + return 1; + } + + spdk_nvme_transport_id_populate_trstring(trid, + spdk_nvme_transport_id_trtype_str(trid->trtype)); + + ns = strcasestr(trid_str, "ns:"); + if (ns) { + char nsid_str[6]; /* 5 digits maximum in an nsid */ + int len; + int nsid; + + ns += 3; + + len = strcspn(ns, " \t\n"); + if (len > 5) { + fprintf(stderr, "NVMe namespace IDs must be 5 digits or less\n"); + free(trid_entry); + return 1; + } + + memcpy(nsid_str, ns, len); + nsid_str[len] = '\0'; + + nsid = spdk_strtol(nsid_str, 10); + if (nsid <= 0 || nsid > 65535) { + fprintf(stderr, "NVMe namespace IDs must be less than 65536 and greater than 0\n"); + free(trid_entry); + return 1; + } + + trid_entry->nsid = (uint16_t)nsid; + } + + TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq); + return 0; +} + +static size_t +parse_next_key(const char **str, char *key, char *val, size_t key_buf_size, + size_t val_buf_size) +{ + const char *sep; + const char *separator = ", \t\n"; + size_t key_len, val_len; + + *str += strspn(*str, separator); + + sep = strchr(*str, '='); + if (!sep) { + fprintf(stderr, "Key without '=' separator\n"); + return 0; + } + + key_len = sep - *str; + if (key_len >= key_buf_size) { + fprintf(stderr, "Key length %zu is greater than maximum allowed %zu\n", + key_len, key_buf_size - 1); + return 0; + } + + 
memcpy(key, *str, key_len); + key[key_len] = '\0'; + + *str += key_len + 1; /* Skip key */ + val_len = strcspn(*str, separator); + if (val_len == 0) { + fprintf(stderr, "Key without value\n"); + return 0; + } + + if (val_len >= val_buf_size) { + fprintf(stderr, "Value length %zu is greater than maximum allowed %zu\n", + val_len, val_buf_size - 1); + return 0; + } + + memcpy(val, *str, val_len); + val[val_len] = '\0'; + + *str += val_len; + + return val_len; +}

/*
 * Parse the -e metadata configuration string, a separator-delimited list
 * of KEY=VALUE pairs.
 *
 * PRACT=1 sets g_metacfg_pract_flag; PRCHK accumulates the GUARD, REFTAG
 * and APPTAG check flags found anywhere in its value. Unknown keys are
 * reported on stderr and otherwise ignored.
 *
 * Returns 0 on success, -EINVAL when metacfg_str is NULL or malformed.
 */
static int
parse_metadata(const char *metacfg_str)
{
	const char *str;
	size_t val_len;
	char key[32];
	char val[1024];

	if (metacfg_str == NULL) {
		return -EINVAL;
	}

	str = metacfg_str;

	while (*str != '\0') {
		val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val));
		if (val_len == 0) {
			fprintf(stderr, "Failed to parse metadata\n");
			return -EINVAL;
		}

		if (strcmp(key, "PRACT") == 0) {
			if (*val == '1') {
				g_metacfg_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
			}
		} else if (strcmp(key, "PRCHK") == 0) {
			if (strstr(val, "GUARD") != NULL) {
				g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
			}
			if (strstr(val, "REFTAG") != NULL) {
				g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
			}
			if (strstr(val, "APPTAG") != NULL) {
				g_metacfg_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
			}
		} else {
			fprintf(stderr, "Unknown key '%s'\n", key);
		}

		/* Skip separators after the pair so that a trailing one
		 * (e.g. "PRACT=1,") is not mistaken for a key without '='.
		 */
		str += strspn(str, ", \t\n");
	}

	return 0;
}

/*
 * Parse the command line into the g_* configuration globals and validate
 * the mandatory options (-q, -o, -w, -t) and the workload type.
 *
 * Returns 0 on success and a non-zero value when the arguments are
 * invalid or request a feature this build does not provide; main() exits
 * with that value.
 */
static int
parse_args(int argc, char **argv)
{
	int op;
	long int val;
	int rc;

	while ((op = getopt(argc, argv, "c:e:i:lo:q:r:k:s:t:w:C:DGHILM:NP:RT:U:V")) != -1) {
		switch (op) {
		case 'i':
		case 'C':
		case 'P':
		case 'o':
		case 'q':
		case 'k':
		case 's':
		case 't':
		case 'M':
		case 'U':
			/* All of these take a non-negative integer argument. */
			val = spdk_strtol(optarg, 10);
			if (val < 0) {
				fprintf(stderr, "Converting a string to integer failed\n");
				return val;
			}
			switch (op) {
			case 'i':
				g_shm_id = val;
				break;
			case 'C':
				g_max_completions = val;
				break;
			case 'P':
				g_nr_io_queues_per_ns = val;
				break;
			case 'o':
				g_io_size_bytes = val;
				break;
			case 'q':
				g_queue_depth = val;
				break;
			case 'k':
				g_keep_alive_timeout_in_ms = val;
				break;
			case 's':
				g_dpdk_mem = val;
				break;
			case 't':
				g_time_in_sec = val;
				break;
			case 'M':
				g_rw_percentage = val;
				g_mix_specified = true;
				break;
			case 'U':
				g_nr_unused_io_queues = val;
				break;
			}
			break;
		case 'c':
			g_core_mask = optarg;
			break;
		case 'e':
			if (parse_metadata(optarg)) {
				usage(argv[0]);
				return 1;
			}
			break;
		case 'l':
			g_latency_ssd_tracking_enable = true;
			break;
		case 'r':
			if (add_trid(optarg)) {
				usage(argv[0]);
				return 1;
			}
			break;
		case 'w':
			g_workload_type = optarg;
			break;
		case 'D':
			g_disable_sq_cmb = 1;
			break;
		case 'G':
#ifndef DEBUG
			fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n",
				argv[0]);
			usage(argv[0]);
			return 1;
#else
			spdk_log_set_flag("nvme");
			spdk_log_set_print_level(SPDK_LOG_DEBUG);
			break;
#endif
		case 'H':
			g_header_digest = 1;
			break;
		case 'I':
			g_data_digest = 1;
			break;
		case 'L':
			g_latency_sw_tracking_level++;
			break;
		case 'N':
			g_no_shn_notification = true;
			break;
		case 'R':
#ifndef SPDK_CONFIG_URING
			fprintf(stderr, "%s must be rebuilt with CONFIG_URING=y for -R flag.\n",
				argv[0]);
			usage(argv[0]);
			/* Must be non-zero: main() only stops on a non-zero
			 * result and would otherwise run with the remaining
			 * options unparsed.
			 */
			return 1;
#else
			g_use_uring = true;
			break;
#endif
		case 'T':
			rc = spdk_log_set_flag(optarg);
			if (rc < 0) {
				fprintf(stderr, "unknown flag\n");
				usage(argv[0]);
				exit(EXIT_FAILURE);
			}
			spdk_log_set_print_level(SPDK_LOG_DEBUG);
#ifndef DEBUG
			fprintf(stderr, "%s must be rebuilt with CONFIG_DEBUG=y for -T flag.\n",
				argv[0]);
			usage(argv[0]);
			/* Non-zero for the same reason as -R above. */
			return 1;
#endif
			break;
		case 'V':
			g_vmd = true;
			break;
		default:
			usage(argv[0]);
			return 1;
		}
	}

	if (!g_nr_io_queues_per_ns) {
		usage(argv[0]);
		return 1;
	}

	if (!g_queue_depth) {
		fprintf(stderr, "missing -q (queue size) operand\n");
		usage(argv[0]);
		return 1;
	}
	if (!g_io_size_bytes) {
		fprintf(stderr, "missing -o (block size) operand\n");
		usage(argv[0]);
		return 1;
	}
	if (!g_workload_type) {
		fprintf(stderr, "missing -w (io pattern type) operand\n");
		usage(argv[0]);
		return 1;
	}
	if (!g_time_in_sec) {
		fprintf(stderr, "missing -t (test time in seconds) operand\n");
		usage(argv[0]);
		return 1;
	}

	/* A "rand" prefix selects random offsets; the rest names the mix. */
	if (strncmp(g_workload_type, "rand", 4) == 0) {
		g_is_random = 1;
		g_workload_type = &g_workload_type[4];
	}

	if (strcmp(g_workload_type, "read") == 0 || strcmp(g_workload_type, "write") == 0) {
		g_rw_percentage = strcmp(g_workload_type, "read") == 0 ? 100 : 0;
		if (g_mix_specified) {
			fprintf(stderr, "Ignoring -M option... Please use -M option"
				" only when using rw or randrw.\n");
		}
	} else if (strcmp(g_workload_type, "rw") == 0) {
		if (g_rw_percentage < 0 || g_rw_percentage > 100) {
			fprintf(stderr,
				"-M must be specified to value from 0 to 100 "
				"for rw or randrw.\n");
			return 1;
		}
	} else {
		fprintf(stderr,
			"io pattern type must be one of\n"
			"(read, write, randread, randwrite, rw, randrw)\n");
		return 1;
	}

	if (TAILQ_EMPTY(&g_trid_list)) {
		/* If no transport IDs specified, default to enumerating all local PCIe devices */
		if (add_trid("trtype:PCIe") != 0) {
			fprintf(stderr, "Unable to add default PCIe transport ID\n");
			return 1;
		}
	} else {
		struct trid_entry *trid_entry, *trid_entry_tmp;

		g_no_pci = true;
		/* check whether there is local PCIe type */
		TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) {
			if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
				g_no_pci = false;
				break;
			}
		}
	}

	g_file_optind = optind;

	return 0;
}

+static int +register_workers(void) +{ + uint32_t i; + struct worker_thread *worker; + + g_workers = NULL; + g_num_workers = 0; + + SPDK_ENV_FOREACH_CORE(i) { + worker = calloc(1, sizeof(*worker)); + if (worker == NULL) { + fprintf(stderr, "Unable to allocate worker\n"); + return -1; + } + + worker->lcore = i; + worker->next = g_workers; + g_workers = worker; + g_num_workers++; + } + +
return 0; +} + +static void +unregister_workers(void) +{ + struct worker_thread *worker = g_workers; + + /* Free namespace context and worker thread */ + while (worker) { + struct worker_thread *next_worker = worker->next; + struct ns_worker_ctx *ns_ctx = worker->ns_ctx; + + while (ns_ctx) { + struct ns_worker_ctx *next_ns_ctx = ns_ctx->next; + spdk_histogram_data_free(ns_ctx->histogram); + free(ns_ctx); + ns_ctx = next_ns_ctx; + } + + free(worker); + worker = next_worker; + } +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { + if (g_disable_sq_cmb) { + opts->use_cmb_sqs = false; + } + if (g_no_shn_notification) { + opts->no_shn_notification = true; + } + } + + /* Set io_queue_size to UINT16_MAX, NVMe driver + * will then reduce this to MQES to maximize + * the io_queue_size as much as possible. + */ + opts->io_queue_size = UINT16_MAX; + + /* Set the header and data_digest */ + opts->header_digest = g_header_digest; + opts->data_digest = g_data_digest; + opts->keep_alive_timeout_ms = g_keep_alive_timeout_in_ms; + + return true; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + struct trid_entry *trid_entry = cb_ctx; + struct spdk_pci_addr pci_addr; + struct spdk_pci_device *pci_dev; + struct spdk_pci_id pci_id; + + if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) { + printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n", + trid->traddr, trid->trsvcid, + trid->subnqn); + } else { + if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) { + return; + } + + pci_dev = spdk_nvme_ctrlr_get_pci_device(ctrlr); + if (!pci_dev) { + return; + } + + pci_id = spdk_pci_device_get_id(pci_dev); + + printf("Attached to NVMe Controller at %s [%04x:%04x]\n", + trid->traddr, + pci_id.vendor_id, pci_id.device_id); + } + + register_ctrlr(ctrlr, 
trid_entry); +} + +static int +register_controllers(void) +{ + struct trid_entry *trid_entry; + + printf("Initializing NVMe Controllers\n"); + + if (g_vmd && spdk_vmd_init()) { + fprintf(stderr, "Failed to initialize VMD." + " Some NVMe devices can be unavailable.\n"); + } + + TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) { + if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n", + trid_entry->trid.traddr); + return -1; + } + } + + return 0; +} + +static void +unregister_controllers(void) +{ + struct ctrlr_entry *entry = g_controllers; + + while (entry) { + struct ctrlr_entry *next = entry->next; + spdk_dma_free(entry->latency_page); + if (g_latency_ssd_tracking_enable && + spdk_nvme_ctrlr_is_feature_supported(entry->ctrlr, SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING)) { + set_latency_tracking_feature(entry->ctrlr, false); + } + + if (g_nr_unused_io_queues) { + int i; + + for (i = 0; i < g_nr_unused_io_queues; i++) { + spdk_nvme_ctrlr_free_io_qpair(entry->unused_qpairs[i]); + } + + free(entry->unused_qpairs); + } + + spdk_nvme_detach(entry->ctrlr); + free(entry); + entry = next; + } + + if (g_vmd) { + spdk_vmd_fini(); + } +} + +static int +associate_workers_with_ns(void) +{ + struct ns_entry *entry = g_namespaces; + struct worker_thread *worker = g_workers; + struct ns_worker_ctx *ns_ctx; + int i, count; + + count = g_num_namespaces > g_num_workers ? 
g_num_namespaces : g_num_workers; + + for (i = 0; i < count; i++) { + if (entry == NULL) { + break; + } + + ns_ctx = calloc(1, sizeof(struct ns_worker_ctx)); + if (!ns_ctx) { + return -1; + } + + printf("Associating %s with lcore %d\n", entry->name, worker->lcore); + ns_ctx->min_tsc = UINT64_MAX; + ns_ctx->entry = entry; + ns_ctx->next = worker->ns_ctx; + ns_ctx->histogram = spdk_histogram_data_alloc(); + worker->ns_ctx = ns_ctx; + + worker = worker->next; + if (worker == NULL) { + worker = g_workers; + } + + entry = entry->next; + if (entry == NULL) { + entry = g_namespaces; + } + + } + + return 0; +} + +static void * +nvme_poll_ctrlrs(void *arg) +{ + struct ctrlr_entry *entry; + int oldstate; + + spdk_unaffinitize_thread(); + + while (true) { + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate); + + entry = g_controllers; + while (entry) { + if (entry->trtype != SPDK_NVME_TRANSPORT_PCIE) { + spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr); + } + entry = entry->next; + } + + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate); + + /* This is a pthread cancellation point and cannot be removed. 
*/ + sleep(1); + } + + return NULL; +} + +int main(int argc, char **argv) +{ + int rc; + struct worker_thread *worker, *master_worker; + struct spdk_env_opts opts; + pthread_t thread_id = 0; + + rc = parse_args(argc, argv); + if (rc != 0) { + return rc; + } + + spdk_env_opts_init(&opts); + opts.name = "perf"; + opts.shm_id = g_shm_id; + if (g_core_mask) { + opts.core_mask = g_core_mask; + } + + if (g_dpdk_mem) { + opts.mem_size = g_dpdk_mem; + } + if (g_no_pci) { + opts.no_pci = g_no_pci; + } + if (spdk_env_init(&opts) < 0) { + fprintf(stderr, "Unable to initialize SPDK env\n"); + rc = -1; + goto cleanup; + } + + g_tsc_rate = spdk_get_ticks_hz(); + + if (register_workers() != 0) { + rc = -1; + goto cleanup; + } + +#if defined(HAVE_LIBAIO) || defined(SPDK_CONFIG_URING) + if (register_files(argc, argv) != 0) { + rc = -1; + goto cleanup; + } +#endif + + if (register_controllers() != 0) { + rc = -1; + goto cleanup; + } + + if (g_warn) { + printf("WARNING: Some requested NVMe devices were skipped\n"); + } + + if (g_num_namespaces == 0) { + fprintf(stderr, "No valid NVMe controllers or AIO or URING devices found\n"); + goto cleanup; + } + + rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL); + if (rc != 0) { + fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n"); + goto cleanup; + } + + if (associate_workers_with_ns() != 0) { + rc = -1; + goto cleanup; + } + + printf("Initialization complete. 
Launching workers.\n"); + + /* Launch all of the slave workers */ + g_master_core = spdk_env_get_current_core(); + master_worker = NULL; + worker = g_workers; + while (worker != NULL) { + if (worker->lcore != g_master_core) { + spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker); + } else { + assert(master_worker == NULL); + master_worker = worker; + } + worker = worker->next; + } + + assert(master_worker != NULL); + rc = work_fn(master_worker); + + spdk_env_thread_wait_all(); + + print_stats(); + +cleanup: + if (thread_id && pthread_cancel(thread_id) == 0) { + pthread_join(thread_id, NULL); + } + unregister_trids(); + unregister_namespaces(); + unregister_controllers(); + unregister_workers(); + + if (rc != 0) { + fprintf(stderr, "%s: errors occured\n", argv[0]); + } + + return rc; +} diff --git a/src/spdk/examples/nvme/reconnect/.gitignore b/src/spdk/examples/nvme/reconnect/.gitignore new file mode 100644 index 000000000..efe3eada4 --- /dev/null +++ b/src/spdk/examples/nvme/reconnect/.gitignore @@ -0,0 +1 @@ +reconnect diff --git a/src/spdk/examples/nvme/reconnect/Makefile b/src/spdk/examples/nvme/reconnect/Makefile new file mode 100644 index 000000000..880ae76c0 --- /dev/null +++ b/src/spdk/examples/nvme/reconnect/Makefile @@ -0,0 +1,38 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) + +APP = reconnect + +include $(SPDK_ROOT_DIR)/mk/nvme.libtest.mk diff --git a/src/spdk/examples/nvme/reconnect/reconnect.c b/src/spdk/examples/nvme/reconnect/reconnect.c new file mode 100644 index 000000000..74c5f3657 --- /dev/null +++ b/src/spdk/examples/nvme/reconnect/reconnect.c @@ -0,0 +1,1185 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/nvme.h" +#include "spdk/queue.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/log.h" +#include "spdk/likely.h" + +struct ctrlr_entry { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_transport_id failover_trid; + enum spdk_nvme_transport_type trtype; + struct ctrlr_entry *next; + char name[1024]; + int num_resets; +}; + +struct ns_entry { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ns *ns; + + struct ns_entry *next; + uint32_t io_size_blocks; + uint32_t num_io_requests; + uint64_t size_in_ios; + uint32_t block_size; + uint32_t io_flags; + char name[1024]; +}; + +struct ns_worker_ctx { + struct ns_entry *entry; + uint64_t io_completed; + uint64_t current_queue_depth; + uint64_t offset_in_ios; + bool is_draining; + + int num_qpairs; + struct spdk_nvme_qpair **qpair; + int last_qpair; + + struct ns_worker_ctx *next; +}; + +struct perf_task { + struct ns_worker_ctx *ns_ctx; + struct iovec iov; + bool is_read; +}; + +struct worker_thread { + struct ns_worker_ctx *ns_ctx; + struct worker_thread *next; + unsigned lcore; +}; + +/* For basic reset handling. 
*/ +static int g_max_ctrlr_resets = 15; + +static struct ctrlr_entry *g_controllers = NULL; +static struct ns_entry *g_namespaces = NULL; +static int g_num_namespaces = 0; +static struct worker_thread *g_workers = NULL; +static int g_num_workers = 0; + +static uint64_t g_tsc_rate; + +static uint32_t g_io_align = 0x200; +static uint32_t g_io_size_bytes; +static uint32_t g_max_io_size_blocks; +static int g_rw_percentage; +static int g_is_random; +static int g_queue_depth; +static int g_time_in_sec; +static uint32_t g_max_completions; +static int g_dpdk_mem; +static bool g_warn; +static uint32_t g_keep_alive_timeout_in_ms = 0; +static uint8_t g_transport_retry_count = 4; +static uint8_t g_transport_ack_timeout = 0; /* disabled */ + +static const char *g_core_mask; + +struct trid_entry { + struct spdk_nvme_transport_id trid; + struct spdk_nvme_transport_id failover_trid; + TAILQ_ENTRY(trid_entry) tailq; +}; + +static TAILQ_HEAD(, trid_entry) g_trid_list = TAILQ_HEAD_INITIALIZER(g_trid_list); + +static inline void +task_complete(struct perf_task *task); +static void submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth); + +static void io_complete(void *ctx, const struct spdk_nvme_cpl *cpl); + +static void +nvme_setup_payload(struct perf_task *task) +{ + /* maximum extended lba format size from all active namespace, + * it's same with g_io_size_bytes for namespace without metadata. 
+ */ + task->iov.iov_base = spdk_dma_zmalloc(g_io_size_bytes, g_io_align, NULL); + task->iov.iov_len = g_io_size_bytes; + if (task->iov.iov_base == NULL) { + fprintf(stderr, "task->buf spdk_dma_zmalloc failed\n"); + exit(1); + } +} + +static int +nvme_submit_io(struct perf_task *task, struct ns_worker_ctx *ns_ctx, + struct ns_entry *entry, uint64_t offset_in_ios) +{ + uint64_t lba; + int qp_num; + + lba = offset_in_ios * entry->io_size_blocks; + + qp_num = ns_ctx->last_qpair; + ns_ctx->last_qpair++; + if (ns_ctx->last_qpair == ns_ctx->num_qpairs) { + ns_ctx->last_qpair = 0; + } + + if (task->is_read) { + return spdk_nvme_ns_cmd_read(entry->ns, ns_ctx->qpair[qp_num], + task->iov.iov_base, lba, + entry->io_size_blocks, io_complete, + task, entry->io_flags); + } + + return spdk_nvme_ns_cmd_write(entry->ns, ns_ctx->qpair[qp_num], + task->iov.iov_base, lba, + entry->io_size_blocks, io_complete, + task, entry->io_flags); +} + +static void +nvme_check_io(struct ns_worker_ctx *ns_ctx) +{ + int i, rc; + + for (i = 0; i < ns_ctx->num_qpairs; i++) { + rc = spdk_nvme_qpair_process_completions(ns_ctx->qpair[i], g_max_completions); + /* The transport level qpair is failed and we need to reconnect it. */ + if (spdk_unlikely(rc == -ENXIO)) { + rc = spdk_nvme_ctrlr_reconnect_io_qpair(ns_ctx->qpair[i]); + /* successful reconnect */ + if (rc == 0) { + continue; + } else if (rc == -ENXIO) { + /* This means the controller is failed. Defer to it to restore the qpair. */ + continue; + } else { + /* + * We were unable to restore the qpair on this attempt. We don't + * really know why. For naive handling, just keep trying. + * TODO: add a retry limit, and destroy the qpair after x iterations. 
+ */ + fprintf(stderr, "qpair failed and we were unable to recover it.\n"); + } + } else if (spdk_unlikely(rc < 0)) { + fprintf(stderr, "Received an unknown error processing completions.\n"); + exit(1); + } + } +} + +/* + * TODO: If a controller has multiple namespaces, they could all use the same queue. + * For now, give each namespace/thread combination its own queue. + */ +static int +nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + struct spdk_nvme_io_qpair_opts opts; + struct ns_entry *entry = ns_ctx->entry; + int i; + + ns_ctx->num_qpairs = 1; + ns_ctx->qpair = calloc(ns_ctx->num_qpairs, sizeof(struct spdk_nvme_qpair *)); + if (!ns_ctx->qpair) { + return -1; + } + + spdk_nvme_ctrlr_get_default_io_qpair_opts(entry->ctrlr, &opts, sizeof(opts)); + if (opts.io_queue_requests < entry->num_io_requests) { + opts.io_queue_requests = entry->num_io_requests; + } + + for (i = 0; i < ns_ctx->num_qpairs; i++) { + ns_ctx->qpair[i] = spdk_nvme_ctrlr_alloc_io_qpair(entry->ctrlr, &opts, + sizeof(opts)); + if (!ns_ctx->qpair[i]) { + printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair failed\n"); + return -1; + } + } + + return 0; +} + +static void +nvme_cleanup_ns_worker_ctx(struct ns_worker_ctx *ns_ctx) +{ + int i; + + for (i = 0; i < ns_ctx->num_qpairs; i++) { + spdk_nvme_ctrlr_free_io_qpair(ns_ctx->qpair[i]); + } + + free(ns_ctx->qpair); +} + +static void +build_nvme_name(char *name, size_t length, struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport_id *trid; + + trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); + + switch (trid->trtype) { + case SPDK_NVME_TRANSPORT_RDMA: + snprintf(name, length, "RDMA (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + break; + case SPDK_NVME_TRANSPORT_TCP: + snprintf(name, length, "TCP (addr:%s subnqn:%s)", trid->traddr, trid->subnqn); + break; + default: + fprintf(stderr, "Unknown transport type %d\n", trid->trtype); + break; + } +} + +static void +register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns) +{ 
+ struct ns_entry *entry; + const struct spdk_nvme_ctrlr_data *cdata; + uint32_t max_xfer_size, entries, sector_size; + uint64_t ns_size; + struct spdk_nvme_io_qpair_opts opts; + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + if (!spdk_nvme_ns_is_active(ns)) { + printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", + cdata->mn, cdata->sn, + spdk_nvme_ns_get_id(ns)); + g_warn = true; + return; + } + + ns_size = spdk_nvme_ns_get_size(ns); + sector_size = spdk_nvme_ns_get_sector_size(ns); + + if (ns_size < g_io_size_bytes || sector_size > g_io_size_bytes) { + printf("WARNING: controller %-20.20s (%-20.20s) ns %u has invalid " + "ns size %" PRIu64 " / block size %u for I/O size %u\n", + cdata->mn, cdata->sn, spdk_nvme_ns_get_id(ns), + ns_size, spdk_nvme_ns_get_sector_size(ns), g_io_size_bytes); + g_warn = true; + return; + } + + max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); + spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); + /* NVMe driver may add additional entries based on + * stripe size and maximum transfer size, we assume + * 1 more entry be used for stripe. + */ + entries = (g_io_size_bytes - 1) / max_xfer_size + 2; + if ((g_queue_depth * entries) > opts.io_queue_size) { + printf("controller IO queue size %u less than required\n", + opts.io_queue_size); + printf("Consider using lower queue depth or small IO size because " + "IO requests may be queued at the NVMe driver.\n"); + g_warn = true; + } + /* For requests which have children requests, parent request itself + * will also occupy 1 entry. 
+ */ + entries += 1; + + entry = calloc(1, sizeof(struct ns_entry)); + if (entry == NULL) { + perror("ns_entry malloc"); + exit(1); + } + + entry->ctrlr = ctrlr; + entry->ns = ns; + entry->num_io_requests = g_queue_depth * entries; + + entry->size_in_ios = ns_size / g_io_size_bytes; + entry->io_size_blocks = g_io_size_bytes / sector_size; + + entry->block_size = spdk_nvme_ns_get_sector_size(ns); + + + if (g_max_io_size_blocks < entry->io_size_blocks) { + g_max_io_size_blocks = entry->io_size_blocks; + } + + build_nvme_name(entry->name, sizeof(entry->name), ctrlr); + + g_num_namespaces++; + entry->next = g_namespaces; + g_namespaces = entry; +} + +static void +unregister_namespaces(void) +{ + struct ns_entry *entry = g_namespaces; + + while (entry) { + struct ns_entry *next = entry->next; + free(entry); + entry = next; + } +} + +static void +register_ctrlr(struct spdk_nvme_ctrlr *ctrlr, struct trid_entry *trid_entry) +{ + struct spdk_nvme_ns *ns; + struct ctrlr_entry *entry = calloc(1, sizeof(struct ctrlr_entry)); + const struct spdk_nvme_transport_id *ctrlr_trid; + uint32_t nsid; + + if (entry == NULL) { + perror("ctrlr_entry malloc"); + exit(1); + } + + ctrlr_trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); + assert(ctrlr_trid != NULL); + + /* each controller needs a unique failover trid. */ + entry->failover_trid = trid_entry->failover_trid; + + /* + * Users are allowed to leave the trid subnqn blank or specify a discovery controller subnqn. + * In those cases, the controller subnqn will not equal the trid_entry subnqn and, by association, + * the failover_trid subnqn. + * When we do failover, we want to reconnect to the same nqn so explicitly set the failover nqn to + * the ctrlr nqn here. 
+ */ + snprintf(entry->failover_trid.subnqn, SPDK_NVMF_NQN_MAX_LEN + 1, "%s", ctrlr_trid->subnqn); + + + build_nvme_name(entry->name, sizeof(entry->name), ctrlr); + + entry->ctrlr = ctrlr; + entry->trtype = trid_entry->trid.trtype; + entry->next = g_controllers; + g_controllers = entry; + + for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + continue; + } + register_ns(ctrlr, ns); + } +} + +static __thread unsigned int seed = 0; + +static inline void +submit_single_io(struct perf_task *task) +{ + uint64_t offset_in_ios; + int rc; + struct ns_worker_ctx *ns_ctx = task->ns_ctx; + struct ns_entry *entry = ns_ctx->entry; + + if (g_is_random) { + offset_in_ios = rand_r(&seed) % entry->size_in_ios; + } else { + offset_in_ios = ns_ctx->offset_in_ios++; + if (ns_ctx->offset_in_ios == entry->size_in_ios) { + ns_ctx->offset_in_ios = 0; + } + } + + if ((g_rw_percentage == 100) || + (g_rw_percentage != 0 && ((rand_r(&seed) % 100) < g_rw_percentage))) { + task->is_read = true; + } else { + task->is_read = false; + } + + rc = nvme_submit_io(task, ns_ctx, entry, offset_in_ios); + + if (spdk_unlikely(rc != 0)) { + fprintf(stderr, "starting I/O failed\n"); + } else { + ns_ctx->current_queue_depth++; + } +} + +static inline void +task_complete(struct perf_task *task) +{ + struct ns_worker_ctx *ns_ctx; + + ns_ctx = task->ns_ctx; + ns_ctx->current_queue_depth--; + ns_ctx->io_completed++; + + /* + * is_draining indicates when time has expired for the test run + * and we are just waiting for the previously submitted I/O + * to complete. In this case, do not submit a new I/O to replace + * the one just completed. 
+ */ + if (spdk_unlikely(ns_ctx->is_draining)) { + spdk_dma_free(task->iov.iov_base); + free(task); + } else { + submit_single_io(task); + } +} + +static void +io_complete(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct perf_task *task = ctx; + + if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { + fprintf(stderr, "%s completed with error (sct=%d, sc=%d)\n", + task->is_read ? "Read" : "Write", + cpl->status.sct, cpl->status.sc); + } + + task_complete(task); +} + +static void +check_io(struct ns_worker_ctx *ns_ctx) +{ + nvme_check_io(ns_ctx); +} + +static struct perf_task * +allocate_task(struct ns_worker_ctx *ns_ctx, int queue_depth) +{ + struct perf_task *task; + + task = calloc(1, sizeof(*task)); + if (task == NULL) { + fprintf(stderr, "Out of memory allocating tasks\n"); + exit(1); + } + + nvme_setup_payload(task); + + task->ns_ctx = ns_ctx; + + return task; +} + +static void +submit_io(struct ns_worker_ctx *ns_ctx, int queue_depth) +{ + struct perf_task *task; + + while (queue_depth-- > 0) { + task = allocate_task(ns_ctx, queue_depth); + submit_single_io(task); + } +} + +static int +work_fn(void *arg) +{ + uint64_t tsc_end; + struct worker_thread *worker = (struct worker_thread *)arg; + struct ns_worker_ctx *ns_ctx = NULL; + uint32_t unfinished_ns_ctx; + + printf("Starting thread on core %u\n", worker->lcore); + + /* Allocate queue pairs for each namespace. */ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + if (nvme_init_ns_worker_ctx(ns_ctx) != 0) { + printf("ERROR: init_ns_worker_ctx() failed\n"); + return 1; + } + ns_ctx = ns_ctx->next; + } + + tsc_end = spdk_get_ticks() + g_time_in_sec * g_tsc_rate; + + /* Submit initial I/O for each namespace. */ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + submit_io(ns_ctx, g_queue_depth); + ns_ctx = ns_ctx->next; + } + + while (1) { + /* + * Check for completed I/O for each controller. A new + * I/O will be submitted in the io_complete callback + * to replace each I/O that is completed. 
+ */ + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + check_io(ns_ctx); + ns_ctx = ns_ctx->next; + } + + if (spdk_get_ticks() > tsc_end) { + break; + } + } + + /* drain the io of each ns_ctx in round robin to make the fairness */ + do { + unfinished_ns_ctx = 0; + ns_ctx = worker->ns_ctx; + while (ns_ctx != NULL) { + /* first time will enter into this if case */ + if (!ns_ctx->is_draining) { + ns_ctx->is_draining = true; + } + + if (ns_ctx->current_queue_depth > 0) { + check_io(ns_ctx); + if (ns_ctx->current_queue_depth == 0) { + nvme_cleanup_ns_worker_ctx(ns_ctx); + } else { + unfinished_ns_ctx++; + } + } + ns_ctx = ns_ctx->next; + } + } while (unfinished_ns_ctx > 0); + + return 0; +} + +static void usage(char *program_name) +{ + printf("%s options", program_name); + printf("\n"); + printf("\t[-q io depth]\n"); + printf("\t[-o io size in bytes]\n"); + printf("\t[-w io pattern type, must be one of\n"); + printf("\t\t(read, write, randread, randwrite, rw, randrw)]\n"); + printf("\t[-M rwmixread (100 for reads, 0 for writes)]\n"); + printf("\t[-t time in seconds]\n"); + printf("\t[-c core mask for I/O submission/completion.]\n"); + printf("\t\t(default: 1)\n"); + printf("\t[-r Transport ID for NVMeoF]\n"); + printf("\t Format: 'key:value [key:value] ...'\n"); + printf("\t Keys:\n"); + printf("\t trtype Transport type (e.g. RDMA)\n"); + printf("\t adrfam Address family (e.g. IPv4, IPv6)\n"); + printf("\t traddr Transport address (e.g. 192.168.100.8 for RDMA)\n"); + printf("\t trsvcid Transport service identifier (e.g. 
4420)\n"); + printf("\t subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN); + printf("\t alt_traddr (Optional) Alternative Transport address for failover.\n"); + printf("\t Example: -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420' for NVMeoF\n"); + printf("\t[-k keep alive timeout period in millisecond]\n"); + printf("\t[-s DPDK huge memory size in MB.]\n"); + printf("\t[-m max completions per poll]\n"); + printf("\t\t(default: 0 - unlimited)\n"); + printf("\t[-i shared memory group ID]\n"); + printf("\t[-A transport ACK timeout]\n"); + printf("\t[-R transport retry count]\n"); + printf("\t"); + spdk_log_usage(stdout, "-T"); +#ifdef DEBUG + printf("\t[-G enable debug logging]\n"); +#else + printf("\t[-G enable debug logging (flag disabled, must reconfigure with --enable-debug)\n"); +#endif +} + +static void +unregister_trids(void) +{ + struct trid_entry *trid_entry, *tmp; + + TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, tmp) { + TAILQ_REMOVE(&g_trid_list, trid_entry, tailq); + free(trid_entry); + } +} + +static int +add_trid(const char *trid_str) +{ + struct trid_entry *trid_entry; + struct spdk_nvme_transport_id *trid; + char *alt_traddr; + int len; + + trid_entry = calloc(1, sizeof(*trid_entry)); + if (trid_entry == NULL) { + return -1; + } + + trid = &trid_entry->trid; + snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); + + if (spdk_nvme_transport_id_parse(trid, trid_str) != 0) { + fprintf(stderr, "Invalid transport ID format '%s'\n", trid_str); + free(trid_entry); + return 1; + } + + trid_entry->failover_trid = trid_entry->trid; + + alt_traddr = strcasestr(trid_str, "alt_traddr:"); + if (alt_traddr) { + alt_traddr += strlen("alt_traddr:"); + len = strcspn(alt_traddr, " \t\n"); + if (len > SPDK_NVMF_TRADDR_MAX_LEN) { + fprintf(stderr, "The failover traddr %s is too long.\n", alt_traddr); + free(trid_entry); + return -1; + } + snprintf(trid_entry->failover_trid.traddr, SPDK_NVMF_TRADDR_MAX_LEN + 1, 
"%s", alt_traddr); + } + + TAILQ_INSERT_TAIL(&g_trid_list, trid_entry, tailq); + return 0; +} + +static int +parse_args(int argc, char **argv) +{ + struct trid_entry *trid_entry, *trid_entry_tmp; + const char *workload_type; + int op; + bool mix_specified = false; + long int val; + int rc; + + /* default value */ + g_queue_depth = 0; + g_io_size_bytes = 0; + workload_type = NULL; + g_time_in_sec = 0; + g_rw_percentage = -1; + g_core_mask = NULL; + g_max_completions = 0; + + while ((op = getopt(argc, argv, "c:m:o:q:r:k:s:t:w:A:GM:R:T:")) != -1) { + switch (op) { + case 'm': + case 'o': + case 'q': + case 'k': + case 's': + case 't': + case 'A': + case 'M': + case 'R': + val = spdk_strtol(optarg, 10); + if (val < 0) { + fprintf(stderr, "Converting a string to integer failed\n"); + return val; + } + switch (op) { + case 'm': + g_max_completions = val; + break; + case 'o': + g_io_size_bytes = val; + break; + case 'q': + g_queue_depth = val; + break; + case 'k': + g_keep_alive_timeout_in_ms = val; + break; + case 's': + g_dpdk_mem = val; + break; + case 't': + g_time_in_sec = val; + break; + case 'A': + g_transport_ack_timeout = val; + break; + case 'M': + g_rw_percentage = val; + mix_specified = true; + break; + case 'R': + g_transport_retry_count = val; + break; + } + break; + case 'c': + g_core_mask = optarg; + break; + case 'r': + if (add_trid(optarg)) { + usage(argv[0]); + return 1; + } + break; + case 'w': + workload_type = optarg; + break; + case 'G': +#ifndef DEBUG + fprintf(stderr, "%s must be configured with --enable-debug for -G flag\n", + argv[0]); + usage(argv[0]); + return 1; +#else + spdk_log_set_flag("nvme"); + spdk_log_set_print_level(SPDK_LOG_DEBUG); + break; +#endif + case 'T': + rc = spdk_log_set_flag(optarg); + if (rc < 0) { + fprintf(stderr, "unknown flag\n"); + usage(argv[0]); + exit(EXIT_FAILURE); + } + spdk_log_set_print_level(SPDK_LOG_DEBUG); +#ifndef DEBUG + fprintf(stderr, "%s must be rebuilt with CONFIG_DEBUG=y for -T flag.\n", + argv[0]); + 
usage(argv[0]); + return 0; +#endif + break; + default: + usage(argv[0]); + return 1; + } + } + + if (!g_queue_depth) { + usage(argv[0]); + return 1; + } + if (!g_io_size_bytes) { + usage(argv[0]); + return 1; + } + if (!workload_type) { + usage(argv[0]); + return 1; + } + if (!g_time_in_sec) { + usage(argv[0]); + return 1; + } + + if (strcmp(workload_type, "read") && + strcmp(workload_type, "write") && + strcmp(workload_type, "randread") && + strcmp(workload_type, "randwrite") && + strcmp(workload_type, "rw") && + strcmp(workload_type, "randrw")) { + fprintf(stderr, + "io pattern type must be one of\n" + "(read, write, randread, randwrite, rw, randrw)\n"); + return 1; + } + + if (!strcmp(workload_type, "read") || + !strcmp(workload_type, "randread")) { + g_rw_percentage = 100; + } + + if (!strcmp(workload_type, "write") || + !strcmp(workload_type, "randwrite")) { + g_rw_percentage = 0; + } + + if (!strcmp(workload_type, "read") || + !strcmp(workload_type, "randread") || + !strcmp(workload_type, "write") || + !strcmp(workload_type, "randwrite")) { + if (mix_specified) { + fprintf(stderr, "Ignoring -M option... Please use -M option" + " only when using rw or randrw.\n"); + } + } + + if (!strcmp(workload_type, "rw") || + !strcmp(workload_type, "randrw")) { + if (g_rw_percentage < 0 || g_rw_percentage > 100) { + fprintf(stderr, + "-M must be specified to value from 0 to 100 " + "for rw or randrw.\n"); + return 1; + } + } + + if (!strcmp(workload_type, "read") || + !strcmp(workload_type, "write") || + !strcmp(workload_type, "rw")) { + g_is_random = 0; + } else { + g_is_random = 1; + } + + if (TAILQ_EMPTY(&g_trid_list)) { + fprintf(stderr, "You must specify at least one fabrics TRID.\n"); + return -1; + } + + /* check whether there is local PCIe type and fail. 
*/ + TAILQ_FOREACH_SAFE(trid_entry, &g_trid_list, tailq, trid_entry_tmp) { + if (trid_entry->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + fprintf(stderr, "This application was not intended to be run on PCIe controllers.\n"); + return 1; + } + } + + return 0; +} + +static int +register_workers(void) +{ + uint32_t i; + struct worker_thread *worker; + + g_workers = NULL; + g_num_workers = 0; + + SPDK_ENV_FOREACH_CORE(i) { + worker = calloc(1, sizeof(*worker)); + if (worker == NULL) { + fprintf(stderr, "Unable to allocate worker\n"); + return -1; + } + + worker->lcore = i; + worker->next = g_workers; + g_workers = worker; + g_num_workers++; + } + + return 0; +} + +static void +unregister_workers(void) +{ + struct worker_thread *worker = g_workers; + + /* Free namespace context and worker thread */ + while (worker) { + struct worker_thread *next_worker = worker->next; + struct ns_worker_ctx *ns_ctx = worker->ns_ctx; + + while (ns_ctx) { + struct ns_worker_ctx *next_ns_ctx = ns_ctx->next; + free(ns_ctx); + ns_ctx = next_ns_ctx; + } + + free(worker); + worker = next_worker; + } +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + /* These should have been weeded out earlier. */ + assert(trid->trtype != SPDK_NVME_TRANSPORT_PCIE); + + printf("Attaching to NVMe over Fabrics controller at %s:%s: %s\n", + trid->traddr, trid->trsvcid, + trid->subnqn); + + /* Set io_queue_size to UINT16_MAX, NVMe driver + * will then reduce this to MQES to maximize + * the io_queue_size as much as possible. 
+ */ + opts->io_queue_size = UINT16_MAX; + + opts->keep_alive_timeout_ms = spdk_max(opts->keep_alive_timeout_ms, + g_keep_alive_timeout_in_ms); + + opts->transport_retry_count = g_transport_retry_count; + opts->transport_ack_timeout = g_transport_ack_timeout; + + return true; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + struct trid_entry *trid_entry = cb_ctx; + + printf("Attached to NVMe over Fabrics controller at %s:%s: %s\n", + trid->traddr, trid->trsvcid, + trid->subnqn); + + register_ctrlr(ctrlr, trid_entry); +} + +static int +register_controllers(void) +{ + struct trid_entry *trid_entry; + + printf("Initializing NVMe Controllers\n"); + + TAILQ_FOREACH(trid_entry, &g_trid_list, tailq) { + if (spdk_nvme_probe(&trid_entry->trid, trid_entry, probe_cb, attach_cb, NULL) != 0) { + fprintf(stderr, "spdk_nvme_probe() failed for transport address '%s'\n", + trid_entry->trid.traddr); + return -1; + } + } + + return 0; +} + +static void +unregister_controllers(void) +{ + struct ctrlr_entry *entry = g_controllers; + + while (entry) { + struct ctrlr_entry *next = entry->next; + + spdk_nvme_detach(entry->ctrlr); + free(entry); + entry = next; + } +} + +static int +associate_workers_with_ns(void) +{ + struct ns_entry *entry = g_namespaces; + struct worker_thread *worker = g_workers; + struct ns_worker_ctx *ns_ctx; + int i, count; + + count = g_num_namespaces > g_num_workers ? 
g_num_namespaces : g_num_workers; + + for (i = 0; i < count; i++) { + if (entry == NULL) { + break; + } + + ns_ctx = calloc(1, sizeof(struct ns_worker_ctx)); + if (!ns_ctx) { + return -1; + } + + printf("Associating %s with lcore %d\n", entry->name, worker->lcore); + ns_ctx->entry = entry; + ns_ctx->next = worker->ns_ctx; + worker->ns_ctx = ns_ctx; + + worker = worker->next; + if (worker == NULL) { + worker = g_workers; + } + + entry = entry->next; + if (entry == NULL) { + entry = g_namespaces; + } + + } + + return 0; +} + +static void * +nvme_poll_ctrlrs(void *arg) +{ + struct ctrlr_entry *entry; + const struct spdk_nvme_transport_id *old_trid; + int oldstate; + int rc; + + + spdk_unaffinitize_thread(); + + while (true) { + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate); + + entry = g_controllers; + while (entry) { + rc = spdk_nvme_ctrlr_process_admin_completions(entry->ctrlr); + /* This controller has encountered a failure at the transport level. reset it. */ + if (rc == -ENXIO) { + if (entry->num_resets == 0) { + old_trid = spdk_nvme_ctrlr_get_transport_id(entry->ctrlr); + fprintf(stderr, "A controller has encountered a failure and is being reset.\n"); + if (spdk_nvme_transport_id_compare(old_trid, &entry->failover_trid)) { + fprintf(stderr, "Resorting to new failover address %s\n", entry->failover_trid.traddr); + spdk_nvme_ctrlr_fail(entry->ctrlr); + rc = spdk_nvme_ctrlr_set_trid(entry->ctrlr, &entry->failover_trid); + if (rc != 0) { + fprintf(stderr, "Unable to fail over to back up trid.\n"); + } + } + } + + rc = spdk_nvme_ctrlr_reset(entry->ctrlr); + if (rc != 0) { + entry->num_resets++; + fprintf(stderr, "Unable to reset the controller.\n"); + + if (entry->num_resets > g_max_ctrlr_resets) { + fprintf(stderr, "Controller cannot be recovered. 
Exiting.\n"); + exit(1); + } + } else { + fprintf(stderr, "Controller properly reset.\n"); + } + } + entry = entry->next; + } + + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate); + + /* This is a pthread cancellation point and cannot be removed. */ + sleep(1); + } + + return NULL; +} + +int main(int argc, char **argv) +{ + int rc; + struct worker_thread *worker, *master_worker; + unsigned master_core; + struct spdk_env_opts opts; + pthread_t thread_id = 0; + + rc = parse_args(argc, argv); + if (rc != 0) { + return rc; + } + + spdk_env_opts_init(&opts); + opts.name = "reconnect"; + if (g_core_mask) { + opts.core_mask = g_core_mask; + } + + if (g_dpdk_mem) { + opts.mem_size = g_dpdk_mem; + } + if (spdk_env_init(&opts) < 0) { + fprintf(stderr, "Unable to initialize SPDK env\n"); + rc = 1; + goto cleanup; + } + + g_tsc_rate = spdk_get_ticks_hz(); + + if (register_workers() != 0) { + rc = 1; + goto cleanup; + } + + if (register_controllers() != 0) { + rc = 1; + goto cleanup; + } + + if (g_warn) { + printf("WARNING: Some requested NVMe devices were skipped\n"); + } + + if (g_num_namespaces == 0) { + fprintf(stderr, "No valid NVMe controllers found\n"); + goto cleanup; + } + + rc = pthread_create(&thread_id, NULL, &nvme_poll_ctrlrs, NULL); + if (rc != 0) { + fprintf(stderr, "Unable to spawn a thread to poll admin queues.\n"); + goto cleanup; + } + + if (associate_workers_with_ns() != 0) { + rc = 1; + goto cleanup; + } + + printf("Initialization complete. 
Launching workers.\n"); + + /* Launch all of the slave workers */ + master_core = spdk_env_get_current_core(); + master_worker = NULL; + worker = g_workers; + while (worker != NULL) { + if (worker->lcore != master_core) { + spdk_env_thread_launch_pinned(worker->lcore, work_fn, worker); + } else { + assert(master_worker == NULL); + master_worker = worker; + } + worker = worker->next; + } + + assert(master_worker != NULL); + rc = work_fn(master_worker); + + spdk_env_thread_wait_all(); + +cleanup: + if (thread_id && pthread_cancel(thread_id) == 0) { + pthread_join(thread_id, NULL); + } + unregister_trids(); + unregister_namespaces(); + unregister_controllers(); + unregister_workers(); + + if (rc != 0) { + fprintf(stderr, "%s: errors occured\n", argv[0]); + /* + * return a generic error to the caller. This allows us to + * distinguish between a failure in the script and something + * like a segfault or an invalid access which causes the program + * to crash. + */ + rc = 1; + } + + return rc; +} |