summaryrefslogtreecommitdiffstats
path: root/src/spdk/examples/nvme/fio_plugin
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/examples/nvme/fio_plugin
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/examples/nvme/fio_plugin')
-rw-r--r--src/spdk/examples/nvme/fio_plugin/.gitignore1
-rw-r--r--src/spdk/examples/nvme/fio_plugin/Makefile51
-rw-r--r--src/spdk/examples/nvme/fio_plugin/README.md107
-rw-r--r--src/spdk/examples/nvme/fio_plugin/example_config.fio15
-rw-r--r--src/spdk/examples/nvme/fio_plugin/fio_plugin.c1267
-rw-r--r--src/spdk/examples/nvme/fio_plugin/full_bench.fio40
-rw-r--r--src/spdk/examples/nvme/fio_plugin/mock_sgl_config.fio17
7 files changed, 1498 insertions, 0 deletions
diff --git a/src/spdk/examples/nvme/fio_plugin/.gitignore b/src/spdk/examples/nvme/fio_plugin/.gitignore
new file mode 100644
index 000000000..1b0b36ac4
--- /dev/null
+++ b/src/spdk/examples/nvme/fio_plugin/.gitignore
@@ -0,0 +1 @@
+fio_plugin
diff --git a/src/spdk/examples/nvme/fio_plugin/Makefile b/src/spdk/examples/nvme/fio_plugin/Makefile
new file mode 100644
index 000000000..1f71802df
--- /dev/null
+++ b/src/spdk/examples/nvme/fio_plugin/Makefile
@@ -0,0 +1,51 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# Copyright (c) 2015-2016, Micron Technology, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk
+
+FIO_PLUGIN := spdk_nvme
+
+C_SRCS = fio_plugin.c
+
+# Unable to combine the FIO plugin and the VPP socket abstraction (license incompatibility)
+SPDK_LIB_LIST = $(filter-out sock_vpp,$(SOCK_MODULES_LIST))
+SPDK_LIB_LIST += nvme thread util log sock vmd jsonrpc json rpc
+
+ifeq ($(CONFIG_RDMA),y)
+SPDK_LIB_LIST += rdma
+endif
+
+include $(SPDK_ROOT_DIR)/mk/spdk.fio.mk
diff --git a/src/spdk/examples/nvme/fio_plugin/README.md b/src/spdk/examples/nvme/fio_plugin/README.md
new file mode 100644
index 000000000..e7a8b7c01
--- /dev/null
+++ b/src/spdk/examples/nvme/fio_plugin/README.md
@@ -0,0 +1,107 @@
+# Compiling fio
+
+First, clone the fio source repository from https://github.com/axboe/fio
+
+ git clone https://github.com/axboe/fio
+
+Then check out the latest fio version and compile the code:
+
+ make
+
+# Compiling SPDK
+
+First, clone the SPDK source repository from https://github.com/spdk/spdk
+
+ git clone https://github.com/spdk/spdk
+ git submodule update --init
+
+Then, run the SPDK configure script to enable fio (point it to the root of the fio repository):
+
+ cd spdk
+ ./configure --with-fio=/path/to/fio/repo <other configuration options>
+
+Finally, build SPDK:
+
+ make
+
+**Note to advanced users**: These steps assume you're using the DPDK submodule. If you are using your
+own version of DPDK, the fio plugin requires that DPDK be compiled with -fPIC. You can compile DPDK
+with -fPIC by modifying your DPDK configuration file and adding the line:
+
+ EXTRA_CFLAGS=-fPIC
+
+# Usage
+
+To use the SPDK fio plugin with fio, specify the plugin binary using LD_PRELOAD when running
+fio and set ioengine=spdk in the fio configuration file (see example_config.fio in the same
+directory as this README).
+
+ LD_PRELOAD=<path to spdk repo>/build/fio/spdk_nvme fio
+
+To select NVMe devices, you pass an SPDK Transport Identifier string as the filename. These are in the
+form:
+
+ filename=key=value [key=value] ... ns=value
+
+Specifically, for local PCIe NVMe devices it will look like this:
+
+ filename=trtype=PCIe traddr=0000.04.00.0 ns=1
+
+And remote devices accessed via NVMe over Fabrics will look like this:
+
+ filename=trtype=RDMA adrfam=IPv4 traddr=192.168.100.8 trsvcid=4420 ns=1
+
+**Note**: The specification of the PCIe address should not use the normal ':'
+and instead only use '.'. This is a limitation in fio - it splits filenames on
+':'. Also, the NVMe namespaces start at 1, not 0, and the namespace must be
+specified at the end of the string.
+
+Currently the SPDK fio plugin is limited to the thread usage model, so fio jobs must also specify thread=1
+when using the SPDK fio plugin.
+
+fio also currently has a race condition on shutdown if dynamically loading the ioengine by specifying the
+engine's full path via the ioengine parameter - LD_PRELOAD is recommended to avoid this race condition.
+
+When testing random workloads, it is recommended to set norandommap=1. fio's random map
+processing consumes extra CPU cycles which will degrade performance over time with
+the fio_plugin since all I/O are submitted and completed on a single CPU core.
+
+When testing FIO on multiple NVMe SSDs with SPDK plugin, it is recommended to use multiple jobs in FIO configurion.
+It has been observed that there are some performance gap between FIO(with SPDK plugin enabled) and SPDK perf
+(examples/nvme/perf/perf) on testing multiple NVMe SSDs. If you use one job(i.e., use one CPU core) configured for
+FIO test, the performance is worse than SPDK perf (also using one CPU core) against many NVMe SSDs. But if you use
+multiple jobs for FIO test, the performance of FIO is similiar with SPDK perf. After analyzing this phenomenon, we
+think that is caused by the FIO architecture. Mainly FIO can scale with multiple threads (i.e., using CPU cores),
+but it is not good to use one thread against many I/O devices.
+
+# End-to-end Data Protection (Optional)
+
+Running with PI setting, following settings steps are required.
+First, format device namespace with proper PI setting. For example:
+
+ nvme format /dev/nvme0n1 -l 1 -i 1 -p 0 -m 1
+
+In fio configure file, add PRACT and set PRCHK by flags(GUARD|REFTAG|APPTAG) properly. For example:
+
+ pi_act=0
+ pi_chk=GUARD
+
+Blocksize should be set as the sum of data and metadata. For example, if data blocksize is 512 Byte, host generated
+PI metadata is 8 Byte, then blocksize in fio configure file should be 520 Byte:
+
+ bs=520
+
+The storage device may use a block format that requires separate metadata (DIX). In this scenario, the fio_plugin
+will automatically allocate an extra 4KiB buffer per I/O to hold this metadata. For some cases, such as 512 byte
+blocks with 32 metadata bytes per block and a 128KiB I/O size, 4KiB isn't large enough. In this case, the
+`md_per_io_size` option may be specified to increase the size of the metadata buffer.
+
+Expose two options 'apptag' and 'apptag_mask', users can change them in the configuration file when using
+application tag and application tag mask in end-to-end data protection. Application tag and application
+tag mask are set to 0x1234 and 0xFFFF by default.
+
+# VMD (Optional)
+
+To enable VMD enumeration add enable_vmd flag in fio configuration file:
+
+ enable_vmd=1
diff --git a/src/spdk/examples/nvme/fio_plugin/example_config.fio b/src/spdk/examples/nvme/fio_plugin/example_config.fio
new file mode 100644
index 000000000..a8e62ccb9
--- /dev/null
+++ b/src/spdk/examples/nvme/fio_plugin/example_config.fio
@@ -0,0 +1,15 @@
+[global]
+ioengine=spdk
+thread=1
+group_reporting=1
+direct=1
+verify=0
+time_based=1
+ramp_time=0
+runtime=2
+iodepth=128
+rw=randrw
+bs=4k
+
+[test]
+numjobs=1
diff --git a/src/spdk/examples/nvme/fio_plugin/fio_plugin.c b/src/spdk/examples/nvme/fio_plugin/fio_plugin.c
new file mode 100644
index 000000000..7aabeb8cb
--- /dev/null
+++ b/src/spdk/examples/nvme/fio_plugin/fio_plugin.c
@@ -0,0 +1,1267 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/nvme.h"
+#include "spdk/vmd.h"
+#include "spdk/env.h"
+#include "spdk/string.h"
+#include "spdk/log.h"
+#include "spdk/endian.h"
+#include "spdk/dif.h"
+#include "spdk/util.h"
+
+#include "config-host.h"
+#include "fio.h"
+#include "optgroup.h"
+
+/* FreeBSD is missing CLOCK_MONOTONIC_RAW,
+ * so alternative is provided. */
+#ifndef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
+#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
+#endif
+
+#define NVME_IO_ALIGN 4096
+
+static bool g_spdk_env_initialized;
+static int g_spdk_enable_sgl = 0;
+static uint32_t g_spdk_sge_size = 4096;
+static uint32_t g_spdk_bit_bucket_data_len = 0;
+static uint32_t g_spdk_pract_flag;
+static uint32_t g_spdk_prchk_flags;
+static uint32_t g_spdk_md_per_io_size = 4096;
+static uint16_t g_spdk_apptag;
+static uint16_t g_spdk_apptag_mask;
+
+struct spdk_fio_options {
+ void *pad; /* off1 used in option descriptions may not be 0 */
+ int enable_wrr;
+ int arbitration_burst;
+ int low_weight;
+ int medium_weight;
+ int high_weight;
+ int wrr_priority;
+ int mem_size;
+ int shm_id;
+ int enable_sgl;
+ int sge_size;
+ int bit_bucket_data_len;
+ char *hostnqn;
+ int pi_act;
+ char *pi_chk;
+ int md_per_io_size;
+ int apptag;
+ int apptag_mask;
+ char *digest_enable;
+ int enable_vmd;
+};
+
+struct spdk_fio_request {
+ struct io_u *io;
+ /** Offset in current iovec, fio only uses 1 vector */
+ uint32_t iov_offset;
+
+ /** Amount of data used for Bit Bucket SGL */
+ uint32_t bit_bucket_data_len;
+
+ /** Context for NVMe PI */
+ struct spdk_dif_ctx dif_ctx;
+ /** Separate metadata buffer pointer */
+ void *md_buf;
+
+ struct spdk_fio_thread *fio_thread;
+ struct spdk_fio_qpair *fio_qpair;
+};
+
+struct spdk_fio_ctrlr {
+ struct spdk_nvme_transport_id tr_id;
+ struct spdk_nvme_ctrlr_opts opts;
+ struct spdk_nvme_ctrlr *ctrlr;
+ struct spdk_fio_ctrlr *next;
+};
+
+static struct spdk_fio_ctrlr *g_ctrlr;
+static int g_td_count;
+static pthread_t g_ctrlr_thread_id = 0;
+static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
+static bool g_error;
+
+struct spdk_fio_qpair {
+ struct fio_file *f;
+ struct spdk_nvme_qpair *qpair;
+ struct spdk_nvme_ns *ns;
+ uint32_t io_flags;
+ bool nvme_pi_enabled;
+ /* True for DIF and false for DIX, and this is valid only if nvme_pi_enabled is true. */
+ bool extended_lba;
+ /* True for protection info transferred at start of metadata,
+ * false for protection info transferred at end of metadata, and
+ * this is valid only if nvme_pi_enabled is true.
+ */
+ bool md_start;
+ struct spdk_fio_qpair *next;
+ struct spdk_fio_ctrlr *fio_ctrlr;
+};
+
+struct spdk_fio_thread {
+ struct thread_data *td;
+
+ struct spdk_fio_qpair *fio_qpair;
+ struct spdk_fio_qpair *fio_qpair_current; /* the current fio_qpair to be handled. */
+
+ struct io_u **iocq; /* io completion queue */
+ unsigned int iocq_count; /* number of iocq entries filled by last getevents */
+ unsigned int iocq_size; /* number of iocq entries allocated */
+ struct fio_file *current_f; /* fio_file given by user */
+
+};
+
+static void *
+spdk_fio_poll_ctrlrs(void *arg)
+{
+ struct spdk_fio_ctrlr *fio_ctrlr;
+ int oldstate;
+ int rc;
+
+ /* Loop until the thread is cancelled */
+ while (true) {
+ rc = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to set cancel state disabled on g_init_thread (%d): %s\n",
+ rc, spdk_strerror(rc));
+ }
+
+ pthread_mutex_lock(&g_mutex);
+ fio_ctrlr = g_ctrlr;
+
+ while (fio_ctrlr) {
+ spdk_nvme_ctrlr_process_admin_completions(fio_ctrlr->ctrlr);
+ fio_ctrlr = fio_ctrlr->next;
+ }
+
+ pthread_mutex_unlock(&g_mutex);
+
+ rc = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to set cancel state enabled on g_init_thread (%d): %s\n",
+ rc, spdk_strerror(rc));
+ }
+
+ /* This is a pthread cancellation point and cannot be removed. */
+ sleep(1);
+ }
+
+ return NULL;
+}
+
+static bool
+probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_ctrlr_opts *opts)
+{
+ struct thread_data *td = cb_ctx;
+ struct spdk_fio_options *fio_options = td->eo;
+
+ if (fio_options->hostnqn) {
+ snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", fio_options->hostnqn);
+ }
+
+ if (fio_options->enable_wrr) {
+ opts->arb_mechanism = SPDK_NVME_CC_AMS_WRR;
+ opts->arbitration_burst = fio_options->arbitration_burst;
+ opts->low_priority_weight = fio_options->low_weight;
+ opts->medium_priority_weight = fio_options->medium_weight;
+ opts->high_priority_weight = fio_options->high_weight;
+ }
+
+ if (fio_options->digest_enable) {
+ if (strcasecmp(fio_options->digest_enable, "HEADER") == 0) {
+ opts->header_digest = true;
+ } else if (strcasecmp(fio_options->digest_enable, "DATA") == 0) {
+ opts->data_digest = true;
+ } else if (strcasecmp(fio_options->digest_enable, "BOTH") == 0) {
+ opts->header_digest = true;
+ opts->data_digest = true;
+ }
+ }
+
+ return true;
+}
+
+static struct spdk_fio_ctrlr *
+get_fio_ctrlr(const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_fio_ctrlr *fio_ctrlr = g_ctrlr;
+ while (fio_ctrlr) {
+ if (spdk_nvme_transport_id_compare(trid, &fio_ctrlr->tr_id) == 0) {
+ return fio_ctrlr;
+ }
+
+ fio_ctrlr = fio_ctrlr->next;
+ }
+
+ return NULL;
+}
+
+static void
+attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
+{
+ struct thread_data *td = cb_ctx;
+ struct spdk_fio_thread *fio_thread = td->io_ops_data;
+ struct spdk_nvme_io_qpair_opts qpopts;
+ struct spdk_fio_ctrlr *fio_ctrlr;
+ struct spdk_fio_qpair *fio_qpair;
+ struct spdk_nvme_ns *ns;
+ const struct spdk_nvme_ns_data *nsdata;
+ struct fio_file *f = fio_thread->current_f;
+ uint32_t ns_id;
+ char *p;
+ long int tmp;
+ struct spdk_fio_options *fio_options = td->eo;
+
+ p = strstr(f->file_name, "ns=");
+ if (p != NULL) {
+ tmp = spdk_strtol(p + 3, 10);
+ if (tmp <= 0) {
+ SPDK_ERRLOG("namespace id should be >=1, but was invalid: %ld\n", tmp);
+ g_error = true;
+ return;
+ }
+ ns_id = (uint32_t)tmp;
+ } else {
+ ns_id = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
+ if (ns_id == 0) {
+ /* The ctrlr has no active namespaces and we didn't specify any so nothing to do. */
+ return;
+ }
+ }
+
+ pthread_mutex_lock(&g_mutex);
+ fio_ctrlr = get_fio_ctrlr(trid);
+ /* it is a new ctrlr and needs to be added */
+ if (!fio_ctrlr) {
+ /* Create an fio_ctrlr and add it to the list */
+ fio_ctrlr = calloc(1, sizeof(*fio_ctrlr));
+ if (!fio_ctrlr) {
+ SPDK_ERRLOG("Cannot allocate space for fio_ctrlr\n");
+ g_error = true;
+ pthread_mutex_unlock(&g_mutex);
+ return;
+ }
+ fio_ctrlr->opts = *opts;
+ fio_ctrlr->ctrlr = ctrlr;
+ fio_ctrlr->tr_id = *trid;
+ fio_ctrlr->next = g_ctrlr;
+ g_ctrlr = fio_ctrlr;
+ }
+ pthread_mutex_unlock(&g_mutex);
+
+ ns = spdk_nvme_ctrlr_get_ns(fio_ctrlr->ctrlr, ns_id);
+ if (ns == NULL) {
+ SPDK_ERRLOG("Cannot get namespace by ns_id=%d\n", ns_id);
+ g_error = true;
+ return;
+ }
+
+ if (!spdk_nvme_ns_is_active(ns)) {
+ SPDK_ERRLOG("Inactive namespace by ns_id=%d\n", ns_id);
+ g_error = true;
+ return;
+ }
+ nsdata = spdk_nvme_ns_get_data(ns);
+
+ fio_qpair = fio_thread->fio_qpair;
+ while (fio_qpair != NULL) {
+ if ((fio_qpair->f == f) ||
+ ((spdk_nvme_transport_id_compare(trid, &fio_qpair->fio_ctrlr->tr_id) == 0) &&
+ (spdk_nvme_ns_get_id(fio_qpair->ns) == ns_id))) {
+ /* Not the error case. Avoid duplicated connection */
+ return;
+ }
+ fio_qpair = fio_qpair->next;
+ }
+
+ /* create a new qpair */
+ fio_qpair = calloc(1, sizeof(*fio_qpair));
+ if (!fio_qpair) {
+ g_error = true;
+ SPDK_ERRLOG("Cannot allocate space for fio_qpair\n");
+ return;
+ }
+
+ spdk_nvme_ctrlr_get_default_io_qpair_opts(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
+ qpopts.delay_cmd_submit = true;
+ if (fio_options->enable_wrr) {
+ qpopts.qprio = fio_options->wrr_priority;
+ }
+
+ fio_qpair->qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_ctrlr->ctrlr, &qpopts, sizeof(qpopts));
+ if (!fio_qpair->qpair) {
+ SPDK_ERRLOG("Cannot allocate nvme io_qpair any more\n");
+ g_error = true;
+ free(fio_qpair);
+ return;
+ }
+
+ fio_qpair->ns = ns;
+ fio_qpair->f = f;
+ fio_qpair->fio_ctrlr = fio_ctrlr;
+ fio_qpair->next = fio_thread->fio_qpair;
+ fio_thread->fio_qpair = fio_qpair;
+
+ if (spdk_nvme_ns_get_flags(ns) & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
+ assert(spdk_nvme_ns_get_pi_type(ns) != SPDK_NVME_FMT_NVM_PROTECTION_DISABLE);
+ fio_qpair->io_flags = g_spdk_pract_flag | g_spdk_prchk_flags;
+ fio_qpair->nvme_pi_enabled = true;
+ fio_qpair->md_start = nsdata->dps.md_start;
+ fio_qpair->extended_lba = spdk_nvme_ns_supports_extended_lba(ns);
+ fprintf(stdout, "PI type%u enabled with %s\n", spdk_nvme_ns_get_pi_type(ns),
+ fio_qpair->extended_lba ? "extended lba" : "separate metadata");
+ }
+
+ f->real_file_size = spdk_nvme_ns_get_size(fio_qpair->ns);
+ if (f->real_file_size <= 0) {
+ g_error = true;
+ SPDK_ERRLOG("Cannot get namespace size by ns=%p\n", ns);
+ return;
+ }
+
+ f->filetype = FIO_TYPE_BLOCK;
+ fio_file_set_size_known(f);
+}
+
+static void parse_prchk_flags(const char *prchk_str)
+{
+ if (!prchk_str) {
+ return;
+ }
+
+ if (strstr(prchk_str, "GUARD") != NULL) {
+ g_spdk_prchk_flags = SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
+ }
+ if (strstr(prchk_str, "REFTAG") != NULL) {
+ g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
+ }
+ if (strstr(prchk_str, "APPTAG") != NULL) {
+ g_spdk_prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_APPTAG;
+ }
+}
+
+static void parse_pract_flag(int pract)
+{
+ if (pract == 1) {
+ g_spdk_pract_flag = SPDK_NVME_IO_FLAGS_PRACT;
+ } else {
+ g_spdk_pract_flag = 0;
+ }
+}
+
+/* Called once at initialization. This is responsible for gathering the size of
+ * each "file", which in our case are in the form
+ * 'key=value [key=value] ... ns=value'
+ * For example, For local PCIe NVMe device - 'trtype=PCIe traddr=0000.04.00.0 ns=1'
+ * For remote exported by NVMe-oF target, 'trtype=RDMA adrfam=IPv4 traddr=192.168.100.8 trsvcid=4420 ns=1' */
+static int spdk_fio_setup(struct thread_data *td)
+{
+ struct spdk_fio_thread *fio_thread;
+ struct spdk_fio_options *fio_options = td->eo;
+ struct spdk_env_opts opts;
+ struct fio_file *f;
+ char *p;
+ int rc = 0;
+ struct spdk_nvme_transport_id trid;
+ struct spdk_fio_ctrlr *fio_ctrlr;
+ char *trid_info;
+ unsigned int i;
+
+ /* we might be running in a daemonized FIO instance where standard
+ * input and output were closed and fds 0, 1, and 2 are reused
+ * for something important by FIO. We can't ensure we won't print
+ * anything (and so will our dependencies, e.g. DPDK), so abort early.
+ * (is_backend is an fio global variable)
+ */
+ if (is_backend) {
+ char buf[1024];
+ snprintf(buf, sizeof(buf),
+ "SPDK FIO plugin won't work with daemonized FIO server.");
+ fio_server_text_output(FIO_LOG_ERR, buf, sizeof(buf));
+ return -1;
+ }
+
+ if (!td->o.use_thread) {
+ log_err("spdk: must set thread=1 when using spdk plugin\n");
+ return 1;
+ }
+
+ pthread_mutex_lock(&g_mutex);
+
+ fio_thread = calloc(1, sizeof(*fio_thread));
+ assert(fio_thread != NULL);
+
+ td->io_ops_data = fio_thread;
+ fio_thread->td = td;
+
+ fio_thread->iocq_size = td->o.iodepth;
+ fio_thread->iocq = calloc(fio_thread->iocq_size, sizeof(struct io_u *));
+ assert(fio_thread->iocq != NULL);
+
+ if (!g_spdk_env_initialized) {
+ spdk_env_opts_init(&opts);
+ opts.name = "fio";
+ opts.mem_size = fio_options->mem_size;
+ opts.shm_id = fio_options->shm_id;
+ g_spdk_enable_sgl = fio_options->enable_sgl;
+ g_spdk_sge_size = fio_options->sge_size;
+ g_spdk_bit_bucket_data_len = fio_options->bit_bucket_data_len;
+ parse_pract_flag(fio_options->pi_act);
+ g_spdk_md_per_io_size = spdk_max(fio_options->md_per_io_size, 4096);
+ g_spdk_apptag = (uint16_t)fio_options->apptag;
+ g_spdk_apptag_mask = (uint16_t)fio_options->apptag_mask;
+ parse_prchk_flags(fio_options->pi_chk);
+ if (spdk_env_init(&opts) < 0) {
+ SPDK_ERRLOG("Unable to initialize SPDK env\n");
+ free(fio_thread->iocq);
+ free(fio_thread);
+ fio_thread = NULL;
+ pthread_mutex_unlock(&g_mutex);
+ return 1;
+ }
+ g_spdk_env_initialized = true;
+ spdk_unaffinitize_thread();
+
+ /* Spawn a thread to continue polling the controllers */
+ rc = pthread_create(&g_ctrlr_thread_id, NULL, &spdk_fio_poll_ctrlrs, NULL);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to spawn a thread to poll admin queues. They won't be polled.\n");
+ }
+
+ if (fio_options->enable_vmd && spdk_vmd_init()) {
+ SPDK_ERRLOG("Failed to initialize VMD. Some NVMe devices can be unavailable.\n");
+ }
+ }
+ pthread_mutex_unlock(&g_mutex);
+
+ for_each_file(td, f, i) {
+ memset(&trid, 0, sizeof(trid));
+
+ trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
+
+ p = strstr(f->file_name, " ns=");
+ if (p != NULL) {
+ trid_info = strndup(f->file_name, p - f->file_name);
+ } else {
+ trid_info = strndup(f->file_name, strlen(f->file_name));
+ }
+
+ if (!trid_info) {
+ SPDK_ERRLOG("Failed to allocate space for trid_info\n");
+ continue;
+ }
+
+ rc = spdk_nvme_transport_id_parse(&trid, trid_info);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to parse given str: %s\n", trid_info);
+ free(trid_info);
+ continue;
+ }
+ free(trid_info);
+
+ if (trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ struct spdk_pci_addr pci_addr;
+ if (spdk_pci_addr_parse(&pci_addr, trid.traddr) < 0) {
+ SPDK_ERRLOG("Invalid traddr=%s\n", trid.traddr);
+ continue;
+ }
+ spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
+ } else {
+ if (trid.subnqn[0] == '\0') {
+ snprintf(trid.subnqn, sizeof(trid.subnqn), "%s",
+ SPDK_NVMF_DISCOVERY_NQN);
+ }
+ }
+
+ fio_thread->current_f = f;
+
+ pthread_mutex_lock(&g_mutex);
+ fio_ctrlr = get_fio_ctrlr(&trid);
+ pthread_mutex_unlock(&g_mutex);
+ if (fio_ctrlr) {
+ attach_cb(td, &trid, fio_ctrlr->ctrlr, &fio_ctrlr->opts);
+ } else {
+ /* Enumerate all of the controllers */
+ if (spdk_nvme_probe(&trid, td, probe_cb, attach_cb, NULL) != 0) {
+ SPDK_ERRLOG("spdk_nvme_probe() failed\n");
+ continue;
+ }
+ }
+
+ if (g_error) {
+ log_err("Failed to initialize spdk fio plugin\n");
+ rc = 1;
+ break;
+ }
+ }
+
+ pthread_mutex_lock(&g_mutex);
+ g_td_count++;
+ pthread_mutex_unlock(&g_mutex);
+
+ return rc;
+}
+
+static int spdk_fio_open(struct thread_data *td, struct fio_file *f)
+{
+ return 0;
+}
+
+static int spdk_fio_close(struct thread_data *td, struct fio_file *f)
+{
+ return 0;
+}
+
+static int spdk_fio_iomem_alloc(struct thread_data *td, size_t total_mem)
+{
+ td->orig_buffer = spdk_dma_zmalloc(total_mem, NVME_IO_ALIGN, NULL);
+ return td->orig_buffer == NULL;
+}
+
+static void spdk_fio_iomem_free(struct thread_data *td)
+{
+ spdk_dma_free(td->orig_buffer);
+}
+
+static int spdk_fio_io_u_init(struct thread_data *td, struct io_u *io_u)
+{
+ struct spdk_fio_thread *fio_thread = td->io_ops_data;
+ struct spdk_fio_request *fio_req;
+
+ io_u->engine_data = NULL;
+
+ fio_req = calloc(1, sizeof(*fio_req));
+ if (fio_req == NULL) {
+ return 1;
+ }
+
+ fio_req->md_buf = spdk_dma_zmalloc(g_spdk_md_per_io_size, NVME_IO_ALIGN, NULL);
+ if (fio_req->md_buf == NULL) {
+ fprintf(stderr, "Allocate %u metadata failed\n", g_spdk_md_per_io_size);
+ free(fio_req);
+ return 1;
+ }
+
+ fio_req->io = io_u;
+ fio_req->fio_thread = fio_thread;
+
+ io_u->engine_data = fio_req;
+
+ return 0;
+}
+
+static void spdk_fio_io_u_free(struct thread_data *td, struct io_u *io_u)
+{
+ struct spdk_fio_request *fio_req = io_u->engine_data;
+
+ if (fio_req) {
+ assert(fio_req->io == io_u);
+ spdk_dma_free(fio_req->md_buf);
+ free(fio_req);
+ io_u->engine_data = NULL;
+ }
+}
+
+static int
+fio_extended_lba_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
+{
+ struct spdk_nvme_ns *ns = fio_qpair->ns;
+ struct spdk_fio_request *fio_req = io_u->engine_data;
+ uint32_t md_size, extended_lba_size, lba_count;
+ uint64_t lba;
+ struct iovec iov;
+ int rc;
+
+ /* Set appmask and apptag when PRACT is enabled */
+ if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
+ fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
+ fio_req->dif_ctx.app_tag = g_spdk_apptag;
+ return 0;
+ }
+
+ extended_lba_size = spdk_nvme_ns_get_extended_sector_size(ns);
+ md_size = spdk_nvme_ns_get_md_size(ns);
+ lba = io_u->offset / extended_lba_size;
+ lba_count = io_u->xfer_buflen / extended_lba_size;
+
+ rc = spdk_dif_ctx_init(&fio_req->dif_ctx, extended_lba_size, md_size,
+ true, fio_qpair->md_start,
+ (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
+ fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag, 0, 0);
+ if (rc != 0) {
+ fprintf(stderr, "Initialization of DIF context failed\n");
+ return rc;
+ }
+
+ if (io_u->ddir != DDIR_WRITE) {
+ return 0;
+ }
+
+ iov.iov_base = io_u->buf;
+ iov.iov_len = io_u->xfer_buflen;
+ rc = spdk_dif_generate(&iov, 1, lba_count, &fio_req->dif_ctx);
+ if (rc != 0) {
+ fprintf(stderr, "Generation of DIF failed\n");
+ }
+
+ return rc;
+}
+
+static int
+fio_separate_md_setup_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
+{
+ struct spdk_nvme_ns *ns = fio_qpair->ns;
+ struct spdk_fio_request *fio_req = io_u->engine_data;
+ uint32_t md_size, block_size, lba_count;
+ uint64_t lba;
+ struct iovec iov, md_iov;
+ int rc;
+
+ /* Set appmask and apptag when PRACT is enabled */
+ if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
+ fio_req->dif_ctx.apptag_mask = g_spdk_apptag_mask;
+ fio_req->dif_ctx.app_tag = g_spdk_apptag;
+ return 0;
+ }
+
+ block_size = spdk_nvme_ns_get_sector_size(ns);
+ md_size = spdk_nvme_ns_get_md_size(ns);
+ lba = io_u->offset / block_size;
+ lba_count = io_u->xfer_buflen / block_size;
+
+ rc = spdk_dif_ctx_init(&fio_req->dif_ctx, block_size, md_size,
+ false, fio_qpair->md_start,
+ (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns),
+ fio_qpair->io_flags, lba, g_spdk_apptag_mask, g_spdk_apptag, 0, 0);
+ if (rc != 0) {
+ fprintf(stderr, "Initialization of DIF context failed\n");
+ return rc;
+ }
+
+ if (io_u->ddir != DDIR_WRITE) {
+ return 0;
+ }
+
+ iov.iov_base = io_u->buf;
+ iov.iov_len = io_u->xfer_buflen;
+ md_iov.iov_base = fio_req->md_buf;
+ md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);
+ rc = spdk_dix_generate(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx);
+ if (rc < 0) {
+ fprintf(stderr, "Generation of DIX failed\n");
+ }
+
+ return rc;
+}
+
+static int
+fio_extended_lba_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
+{
+ struct spdk_nvme_ns *ns = fio_qpair->ns;
+ struct spdk_fio_request *fio_req = io_u->engine_data;
+ uint32_t lba_count;
+ struct iovec iov;
+ struct spdk_dif_error err_blk = {};
+ int rc;
+
+ /* Do nothing when PRACT is enabled */
+ if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
+ return 0;
+ }
+
+ iov.iov_base = io_u->buf;
+ iov.iov_len = io_u->xfer_buflen;
+ lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_extended_sector_size(ns);
+
+ rc = spdk_dif_verify(&iov, 1, lba_count, &fio_req->dif_ctx, &err_blk);
+ if (rc != 0) {
+ fprintf(stderr, "DIF error detected. type=%d, offset=%" PRIu32 "\n",
+ err_blk.err_type, err_blk.err_offset);
+ }
+
+ return rc;
+}
+
+static int
+fio_separate_md_verify_pi(struct spdk_fio_qpair *fio_qpair, struct io_u *io_u)
+{
+ struct spdk_nvme_ns *ns = fio_qpair->ns;
+ struct spdk_fio_request *fio_req = io_u->engine_data;
+ uint32_t md_size, lba_count;
+ struct iovec iov, md_iov;
+ struct spdk_dif_error err_blk = {};
+ int rc;
+
+ /* Do nothing when PRACT is enabled */
+ if (fio_qpair->io_flags & SPDK_NVME_IO_FLAGS_PRACT) {
+ return 0;
+ }
+
+ iov.iov_base = io_u->buf;
+ iov.iov_len = io_u->xfer_buflen;
+ lba_count = io_u->xfer_buflen / spdk_nvme_ns_get_sector_size(ns);
+ md_size = spdk_nvme_ns_get_md_size(ns);
+ md_iov.iov_base = fio_req->md_buf;
+ md_iov.iov_len = spdk_min(md_size * lba_count, g_spdk_md_per_io_size);
+
+ rc = spdk_dix_verify(&iov, 1, &md_iov, lba_count, &fio_req->dif_ctx, &err_blk);
+ if (rc != 0) {
+ fprintf(stderr, "DIX error detected. type=%d, offset=%" PRIu32 "\n",
+ err_blk.err_type, err_blk.err_offset);
+ }
+
+ return rc;
+}
+
+static void spdk_fio_completion_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_fio_request *fio_req = ctx;
+ struct spdk_fio_thread *fio_thread = fio_req->fio_thread;
+ struct spdk_fio_qpair *fio_qpair = fio_req->fio_qpair;
+ int rc;
+
+ if (fio_qpair->nvme_pi_enabled && fio_req->io->ddir == DDIR_READ) {
+ if (fio_qpair->extended_lba) {
+ rc = fio_extended_lba_verify_pi(fio_qpair, fio_req->io);
+ } else {
+ rc = fio_separate_md_verify_pi(fio_qpair, fio_req->io);
+ }
+ if (rc != 0) {
+ fio_req->io->error = abs(rc);
+ }
+ }
+
+ assert(fio_thread->iocq_count < fio_thread->iocq_size);
+ fio_thread->iocq[fio_thread->iocq_count++] = fio_req->io;
+}
+
+static void
+spdk_nvme_io_reset_sgl(void *ref, uint32_t sgl_offset)
+{
+ struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;
+
+ fio_req->iov_offset = sgl_offset;
+ fio_req->bit_bucket_data_len = 0;
+}
+
+static int
+spdk_nvme_io_next_sge(void *ref, void **address, uint32_t *length)
+{
+ struct spdk_fio_request *fio_req = (struct spdk_fio_request *)ref;
+ struct io_u *io_u = fio_req->io;
+ uint32_t iov_len;
+ uint32_t bit_bucket_len;
+
+ *address = io_u->buf;
+
+ if (fio_req->iov_offset) {
+ assert(fio_req->iov_offset <= io_u->xfer_buflen);
+ *address += fio_req->iov_offset;
+ }
+
+ iov_len = io_u->xfer_buflen - fio_req->iov_offset;
+ if (iov_len > g_spdk_sge_size) {
+ iov_len = g_spdk_sge_size;
+ }
+
+ if ((fio_req->bit_bucket_data_len < g_spdk_bit_bucket_data_len) && (io_u->ddir == DDIR_READ)) {
+ assert(g_spdk_bit_bucket_data_len < io_u->xfer_buflen);
+ *address = (void *)UINT64_MAX;
+ bit_bucket_len = g_spdk_bit_bucket_data_len - fio_req->bit_bucket_data_len;
+ if (iov_len > bit_bucket_len) {
+ iov_len = bit_bucket_len;
+ }
+ fio_req->bit_bucket_data_len += iov_len;
+ }
+
+ fio_req->iov_offset += iov_len;
+ *length = iov_len;
+
+ return 0;
+}
+
+#if FIO_IOOPS_VERSION >= 24
+typedef enum fio_q_status fio_q_status_t;
+#else
+typedef int fio_q_status_t;
+#endif
+
+static fio_q_status_t
+spdk_fio_queue(struct thread_data *td, struct io_u *io_u)
+{
+ int rc = 1;
+ struct spdk_fio_thread *fio_thread = td->io_ops_data;
+ struct spdk_fio_request *fio_req = io_u->engine_data;
+ struct spdk_fio_qpair *fio_qpair;
+ struct spdk_nvme_ns *ns = NULL;
+ void *md_buf = NULL;
+ struct spdk_dif_ctx *dif_ctx = &fio_req->dif_ctx;
+ uint32_t block_size;
+ uint64_t lba;
+ uint32_t lba_count;
+
+ /* Find the namespace that corresponds to the file in the io_u */
+ fio_qpair = fio_thread->fio_qpair;
+ while (fio_qpair != NULL) {
+ if (fio_qpair->f == io_u->file) {
+ ns = fio_qpair->ns;
+ break;
+ }
+ fio_qpair = fio_qpair->next;
+ }
+ if (fio_qpair == NULL || ns == NULL) {
+ return -ENXIO;
+ }
+ if (fio_qpair->nvme_pi_enabled && !fio_qpair->extended_lba) {
+ md_buf = fio_req->md_buf;
+ }
+ fio_req->fio_qpair = fio_qpair;
+
+ block_size = spdk_nvme_ns_get_extended_sector_size(ns);
+ if ((fio_qpair->io_flags & g_spdk_pract_flag) && (spdk_nvme_ns_get_md_size(ns) == 8)) {
+ /* If metadata size = 8 bytes, PI is stripped (read) or inserted (write), and
+ * so reduce metadata size from block size. (If metadata size > 8 bytes, PI
+ * is passed (read) or replaced (write). So block size is not necessary to
+ * change.)
+ */
+ block_size = spdk_nvme_ns_get_sector_size(ns);
+ }
+
+ lba = io_u->offset / block_size;
+ lba_count = io_u->xfer_buflen / block_size;
+
+ /* TODO: considering situations that fio will randomize and verify io_u */
+ if (fio_qpair->nvme_pi_enabled) {
+ if (fio_qpair->extended_lba) {
+ rc = fio_extended_lba_setup_pi(fio_qpair, io_u);
+ } else {
+ rc = fio_separate_md_setup_pi(fio_qpair, io_u);
+ }
+ if (rc < 0) {
+ io_u->error = -rc;
+ return FIO_Q_COMPLETED;
+ }
+ }
+
+ switch (io_u->ddir) {
+ case DDIR_READ:
+ if (!g_spdk_enable_sgl) {
+ rc = spdk_nvme_ns_cmd_read_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba, lba_count,
+ spdk_fio_completion_cb, fio_req,
+ fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
+ } else {
+ rc = spdk_nvme_ns_cmd_readv_with_md(ns, fio_qpair->qpair, lba,
+ lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
+ spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
+ dif_ctx->apptag_mask, dif_ctx->app_tag);
+ }
+ break;
+ case DDIR_WRITE:
+ if (!g_spdk_enable_sgl) {
+ rc = spdk_nvme_ns_cmd_write_with_md(ns, fio_qpair->qpair, io_u->buf, md_buf, lba,
+ lba_count,
+ spdk_fio_completion_cb, fio_req,
+ fio_qpair->io_flags, dif_ctx->apptag_mask, dif_ctx->app_tag);
+ } else {
+ rc = spdk_nvme_ns_cmd_writev_with_md(ns, fio_qpair->qpair, lba,
+ lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
+ spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
+ dif_ctx->apptag_mask, dif_ctx->app_tag);
+ }
+ break;
+ default:
+ assert(false);
+ break;
+ }
+
+ /* NVMe read/write functions return -ENOMEM if there are no free requests. */
+ if (rc == -ENOMEM) {
+ return FIO_Q_BUSY;
+ }
+
+ if (rc != 0) {
+ io_u->error = abs(rc);
+ return FIO_Q_COMPLETED;
+ }
+
+ return FIO_Q_QUEUED;
+}
+
+static struct io_u *spdk_fio_event(struct thread_data *td, int event)
+{
+ struct spdk_fio_thread *fio_thread = td->io_ops_data;
+
+ assert(event >= 0);
+ assert((unsigned)event < fio_thread->iocq_count);
+ return fio_thread->iocq[event];
+}
+
+static int spdk_fio_getevents(struct thread_data *td, unsigned int min,
+ unsigned int max, const struct timespec *t)
+{
+ struct spdk_fio_thread *fio_thread = td->io_ops_data;
+ struct spdk_fio_qpair *fio_qpair = NULL;
+ struct timespec t0, t1;
+ uint64_t timeout = 0;
+
+ if (t) {
+ timeout = t->tv_sec * 1000000000L + t->tv_nsec;
+ clock_gettime(CLOCK_MONOTONIC_RAW, &t0);
+ }
+
+ fio_thread->iocq_count = 0;
+
+ /* fetch the next qpair */
+ if (fio_thread->fio_qpair_current) {
+ fio_qpair = fio_thread->fio_qpair_current->next;
+ }
+
+ for (;;) {
+ if (fio_qpair == NULL) {
+ fio_qpair = fio_thread->fio_qpair;
+ }
+
+ while (fio_qpair != NULL) {
+ spdk_nvme_qpair_process_completions(fio_qpair->qpair, max - fio_thread->iocq_count);
+
+ if (fio_thread->iocq_count >= min) {
+ /* reset the currrent handling qpair */
+ fio_thread->fio_qpair_current = fio_qpair;
+ return fio_thread->iocq_count;
+ }
+
+ fio_qpair = fio_qpair->next;
+ }
+
+ if (t) {
+ uint64_t elapse;
+
+ clock_gettime(CLOCK_MONOTONIC_RAW, &t1);
+ elapse = ((t1.tv_sec - t0.tv_sec) * 1000000000L)
+ + t1.tv_nsec - t0.tv_nsec;
+ if (elapse > timeout) {
+ break;
+ }
+ }
+ }
+
+ /* reset the currrent handling qpair */
+ fio_thread->fio_qpair_current = fio_qpair;
+ return fio_thread->iocq_count;
+}
+
+static int spdk_fio_invalidate(struct thread_data *td, struct fio_file *f)
+{
+ /* TODO: This should probably send a flush to the device, but for now just return successful. */
+ return 0;
+}
+
+static void spdk_fio_cleanup(struct thread_data *td)
+{
+ struct spdk_fio_thread *fio_thread = td->io_ops_data;
+ struct spdk_fio_qpair *fio_qpair, *fio_qpair_tmp;
+ struct spdk_fio_options *fio_options = td->eo;
+
+ fio_qpair = fio_thread->fio_qpair;
+ while (fio_qpair != NULL) {
+ spdk_nvme_ctrlr_free_io_qpair(fio_qpair->qpair);
+ fio_qpair_tmp = fio_qpair->next;
+ free(fio_qpair);
+ fio_qpair = fio_qpair_tmp;
+ }
+
+ free(fio_thread->iocq);
+ free(fio_thread);
+
+ pthread_mutex_lock(&g_mutex);
+ g_td_count--;
+ if (g_td_count == 0) {
+ struct spdk_fio_ctrlr *fio_ctrlr, *fio_ctrlr_tmp;
+
+ fio_ctrlr = g_ctrlr;
+ while (fio_ctrlr != NULL) {
+ spdk_nvme_detach(fio_ctrlr->ctrlr);
+ fio_ctrlr_tmp = fio_ctrlr->next;
+ free(fio_ctrlr);
+ fio_ctrlr = fio_ctrlr_tmp;
+ }
+ g_ctrlr = NULL;
+
+ if (fio_options->enable_vmd) {
+ spdk_vmd_fini();
+ }
+ }
+ pthread_mutex_unlock(&g_mutex);
+ if (!g_ctrlr) {
+ if (pthread_cancel(g_ctrlr_thread_id) == 0) {
+ pthread_join(g_ctrlr_thread_id, NULL);
+ }
+ }
+}
+
+/* This function enables addition of SPDK parameters to the fio config
+ * Adding new parameters by defining them here and defining a callback
+ * function to read the parameter value. */
+static struct fio_option options[] = {
+ {
+ .name = "enable_wrr",
+ .lname = "Enable weighted round robin (WRR) for IO submission queues",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, enable_wrr),
+ .def = "0",
+ .help = "Enable weighted round robin (WRR) for IO submission queues",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "arbitration_burst",
+ .lname = "Arbitration Burst",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, arbitration_burst),
+ .def = "0",
+ .help = "Arbitration Burst used for WRR (valid range from 0 - 7)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "low_weight",
+ .lname = "low_weight for WRR",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, low_weight),
+ .def = "0",
+ .help = "low_weight used for WRR (valid range from 0 - 255)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "medium_weight",
+ .lname = "medium_weight for WRR",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, medium_weight),
+ .def = "0",
+ .help = "medium weight used for WRR (valid range from 0 - 255)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "high_weight",
+ .lname = "high_weight for WRR",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, high_weight),
+ .def = "0",
+ .help = "high weight used for WRR (valid range from 0 - 255)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "wrr_priority",
+ .lname = "priority used for WRR",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, wrr_priority),
+ .def = "0",
+ .help = "priority used for WRR (valid range from 0-3)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "mem_size_mb",
+ .lname = "Memory size in MB",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, mem_size),
+ .def = "0",
+ .help = "Memory Size for SPDK (MB)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "shm_id",
+ .lname = "shared memory ID",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, shm_id),
+ .def = "-1",
+ .help = "Shared Memory ID",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "enable_sgl",
+ .lname = "SGL used for I/O commands",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, enable_sgl),
+ .def = "0",
+ .help = "SGL Used for I/O Commands (enable_sgl=1 or enable_sgl=0)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "sge_size",
+ .lname = "SGL size used for I/O commands",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, sge_size),
+ .def = "4096",
+ .help = "SGL size in bytes for I/O Commands (default 4096)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "bit_bucket_data_len",
+ .lname = "Amount of data used for Bit Bucket",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, bit_bucket_data_len),
+ .def = "0",
+ .help = "Bit Bucket Data Length for READ commands (disabled by default)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "hostnqn",
+ .lname = "Host NQN to use when connecting to controllers.",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct spdk_fio_options, hostnqn),
+ .help = "Host NQN",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "pi_act",
+ .lname = "Protection Information Action",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, pi_act),
+ .def = "1",
+ .help = "Protection Information Action bit (pi_act=1 or pi_act=0)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "pi_chk",
+ .lname = "Protection Information Check(GUARD|REFTAG|APPTAG)",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct spdk_fio_options, pi_chk),
+ .def = NULL,
+ .help = "Control of Protection Information Checking (pi_chk=GUARD|REFTAG|APPTAG)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "md_per_io_size",
+ .lname = "Separate Metadata Buffer Size per I/O",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, md_per_io_size),
+ .def = "4096",
+ .help = "Size of separate metadata buffer per I/O (Default: 4096)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "apptag",
+ .lname = "Application Tag used in Protection Information",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, apptag),
+ .def = "0x1234",
+ .help = "Application Tag used in Protection Information field (Default: 0x1234)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "apptag_mask",
+ .lname = "Application Tag Mask",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, apptag_mask),
+ .def = "0xffff",
+ .help = "Application Tag Mask used with Application Tag (Default: 0xffff)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "digest_enable",
+ .lname = "PDU digest choice for NVMe/TCP Transport(NONE|HEADER|DATA|BOTH)",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct spdk_fio_options, digest_enable),
+ .def = NULL,
+ .help = "Control the NVMe/TCP control(digest_enable=NONE|HEADER|DATA|BOTH)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "enable_vmd",
+ .lname = "Enable VMD enumeration",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct spdk_fio_options, enable_vmd),
+ .def = "0",
+ .help = "Enable VMD enumeration (enable_vmd=1 or enable_vmd=0)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+/* FIO imports this structure using dlsym */
+struct ioengine_ops ioengine = {
+ .name = "spdk",
+ .version = FIO_IOOPS_VERSION,
+ .queue = spdk_fio_queue,
+ .getevents = spdk_fio_getevents,
+ .event = spdk_fio_event,
+ .cleanup = spdk_fio_cleanup,
+ .open_file = spdk_fio_open,
+ .close_file = spdk_fio_close,
+ .invalidate = spdk_fio_invalidate,
+ .iomem_alloc = spdk_fio_iomem_alloc,
+ .iomem_free = spdk_fio_iomem_free,
+ .setup = spdk_fio_setup,
+ .io_u_init = spdk_fio_io_u_init,
+ .io_u_free = spdk_fio_io_u_free,
+ .flags = FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN,
+ .options = options,
+ .option_struct_size = sizeof(struct spdk_fio_options),
+};
+
+static void fio_init fio_spdk_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_spdk_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/src/spdk/examples/nvme/fio_plugin/full_bench.fio b/src/spdk/examples/nvme/fio_plugin/full_bench.fio
new file mode 100644
index 000000000..4dea21d13
--- /dev/null
+++ b/src/spdk/examples/nvme/fio_plugin/full_bench.fio
@@ -0,0 +1,40 @@
+[global]
+thread=1
+group_reporting=1
+direct=1
+verify=0
+norandommap=1
+cpumask=1
+disable_slat=1
+disable_bw=1
+lat_percentiles=1
+clat_percentiles=0
+percentile_list=50:99:99.999
+
+[precondition-sequential]
+stonewall
+description="Sequentially write to the device twice"
+rw=write
+iodepth=128
+bs=128k
+loops=2
+
+[4k_randwrite_qd1]
+stonewall
+description="4KiB Random Write QD=1"
+bs=4k
+rw=randwrite
+iodepth=1
+time_based=1
+ramp_time=60
+runtime=240
+
+[4k_randread_qd1]
+stonewall
+description="4KiB Random Read QD=1"
+bs=4k
+rw=randread
+iodepth=1
+time_based=1
+ramp_time=60
+runtime=240
diff --git a/src/spdk/examples/nvme/fio_plugin/mock_sgl_config.fio b/src/spdk/examples/nvme/fio_plugin/mock_sgl_config.fio
new file mode 100644
index 000000000..713fce0a2
--- /dev/null
+++ b/src/spdk/examples/nvme/fio_plugin/mock_sgl_config.fio
@@ -0,0 +1,17 @@
+[global]
+ioengine=spdk
+thread=1
+group_reporting=1
+direct=1
+enable_sgl=1
+time_based=1
+ramp_time=0
+runtime=2
+iodepth=128
+rw=randrw
+bs=16k
+verify=md5
+verify_backlog=32
+
+[test]
+numjobs=1