summaryrefslogtreecommitdiffstats
path: root/src/spdk/lib/vhost/vhost_nvme.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/spdk/lib/vhost/vhost_nvme.c')
-rw-r--r--src/spdk/lib/vhost/vhost_nvme.c1465
1 files changed, 1465 insertions, 0 deletions
diff --git a/src/spdk/lib/vhost/vhost_nvme.c b/src/spdk/lib/vhost/vhost_nvme.c
new file mode 100644
index 00000000..35015d93
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_nvme.c
@@ -0,0 +1,1465 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/nvme.h"
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/barrier.h"
+#include "spdk/vhost.h"
+#include "spdk/bdev.h"
+#include "spdk/version.h"
+#include "spdk/nvme_spec.h"
+#include "spdk/likely.h"
+
+#include "vhost_internal.h"
+
+/* Highest I/O queue ID accepted; the per-controller queue arrays hold MAX_IO_QUEUES + 1 slots. */
+#define MAX_IO_QUEUES 31
+/* Number of iovec slots in each task's iovs[] array. */
+#define MAX_IOVS 64
+/* Number of namespace slots in each controller's ns[] array. */
+#define MAX_NAMESPACE 8
+/* Queue depth advertised through CAP.MQES and used to size the task pool. */
+#define MAX_QUEUE_ENTRIES_SUPPORTED 256
+/* Batch threshold checked by the poller while draining one submission queue. */
+#define MAX_BATCH_IO 8
+
+/* State of one emulated NVMe I/O submission queue. */
+struct spdk_vhost_nvme_sq {
+ /* Queue ID taken from the Create I/O SQ command. */
+ uint16_t sqid;
+ /* Number of entries in the ring (command's 0-based queue size + 1). */
+ uint16_t size;
+ /* ID of the completion queue that receives this queue's completions. */
+ uint16_t cqid;
+ /* True once the queue has been created; cleared when it is deleted. */
+ bool valid;
+ /* Host-virtual mapping of the guest's submission ring. */
+ struct spdk_nvme_cmd *sq_cmd;
+ /* Index of the next command the poller will fetch. */
+ uint16_t sq_head;
+ /* Tail value most recently read from the shadow doorbell buffer. */
+ uint16_t sq_tail;
+};
+
+/* State of one emulated NVMe I/O completion queue. */
+struct spdk_vhost_nvme_cq {
+ /* Phase tag written into posted entries; flipped each time cq_head wraps. */
+ uint8_t phase;
+ /* Number of entries in the ring (command's 0-based queue size + 1). */
+ uint16_t size;
+ /* Queue ID taken from the Create I/O CQ command. */
+ uint16_t cqid;
+ /* True once the queue has been created; cleared when it is deleted. */
+ bool valid;
+ /* Host-virtual mapping of the guest's completion ring. */
+ volatile struct spdk_nvme_cpl *cq_cqe;
+ /* Index of the next ring slot a completion will be written to. */
+ uint16_t cq_head;
+ /* Guest's consumer head, as last read from the shadow doorbell buffer. */
+ uint16_t guest_signaled_cq_head;
+ /* Completions posted since the last eventfd notification. */
+ uint32_t need_signaled_cnt;
+ /* Finished tasks waiting for a free ring slot because the queue was full. */
+ STAILQ_HEAD(, spdk_vhost_nvme_task) cq_full_waited_tasks;
+ /* Interrupt-enable bit taken from the Create I/O CQ command. */
+ bool irq_enabled;
+ /* eventfd used to notify the guest; -1 until supplied via spdk_vhost_nvme_set_cq_call(). */
+ int virq;
+};
+
+/* One namespace of the emulated controller, backed by an SPDK bdev. */
+struct spdk_vhost_nvme_ns {
+ /* Backing block device. */
+ struct spdk_bdev *bdev;
+ /* Block size of the bdev in bytes, cached by the identify update. */
+ uint32_t block_size;
+ /* Total size in bytes (num_blocks * block_size). */
+ uint64_t capacity;
+ /* Namespace ID reported in logs and JSON output. */
+ uint32_t nsid;
+ /* Non-zero while the namespace is usable; cleared when the bdev is removed. */
+ uint32_t active_ns;
+ /* Open descriptor on the bdev. */
+ struct spdk_bdev_desc *bdev_desc;
+ /* I/O channel obtained when the device starts. */
+ struct spdk_io_channel *bdev_io_channel;
+ /* Identify Namespace data returned to the guest. */
+ struct spdk_nvme_ns_data nsdata;
+};
+
+/* Per-command context; taken from and returned to the controller's free_tasks list. */
+struct spdk_vhost_nvme_task {
+ /* Copy of the command fetched from the guest submission ring. */
+ struct spdk_nvme_cmd cmd;
+ /* Controller that owns this task. */
+ struct spdk_vhost_nvme_dev *nvme;
+ /* Submission queue ID reported in the completion entry. */
+ uint16_t sqid;
+ /* Completion queue ID the completion entry is posted to. */
+ uint16_t cqid;
+
+ /** array of iovecs to transfer. */
+ struct iovec iovs[MAX_IOVS];
+
+ /** Number of iovecs in iovs array. */
+ int iovcnt;
+
+ /** Current iovec position. */
+ int iovpos;
+
+ /** Offset in current iovec. */
+ uint32_t iov_offset;
+
+ /* for bdev_io_wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+ /* Submission queue remembered so the command can be resubmitted after an ENOMEM wait. */
+ struct spdk_vhost_nvme_sq *sq;
+ /* Namespace addressed by the command. */
+ struct spdk_vhost_nvme_ns *ns;
+
+ /* parent pointer. */
+ struct spdk_vhost_nvme_task *parent;
+ /* Status fields (do-not-retry, status code type, status code) copied into the completion entry. */
+ uint8_t dnr;
+ uint8_t sct;
+ uint8_t sc;
+ /* Number of outstanding child unmap requests of a Dataset Management command. */
+ uint32_t num_children;
+ /* Linkage for the free list or a completion queue's full-wait list. */
+ STAILQ_ENTRY(spdk_vhost_nvme_task) stailq;
+};
+
+/* Emulated vhost NVMe controller. */
+struct spdk_vhost_nvme_dev {
+ /* Embedded generic vhost device; to_nvme_dev() recovers the container from it. */
+ struct spdk_vhost_dev vdev;
+
+ /* Number of I/O queues configured for the controller; reported by Number of Queues. */
+ uint32_t num_io_queues;
+ /* Emulated CAP, CC and CSTS registers. */
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_cc_register cc;
+ union spdk_nvme_csts_register csts;
+ /* Identify Controller data returned to the guest. */
+ struct spdk_nvme_ctrlr_data cdata;
+
+ /* Counts of currently created submission / completion queues. */
+ uint32_t num_sqs;
+ uint32_t num_cqs;
+
+ /* Number of namespace slots in use, followed by the slots themselves. */
+ uint32_t num_ns;
+ struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE];
+
+ /* Shadow doorbell and event-index buffers mapped by Doorbell Buffer Config; NULL until then. */
+ volatile uint32_t *dbbuf_dbs;
+ volatile uint32_t *dbbuf_eis;
+ /* Queue state indexed by queue ID; the I/O lookup helpers reject ID 0. */
+ struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1];
+ struct spdk_vhost_nvme_cq cq_queue[MAX_IO_QUEUES + 1];
+
+ /* Linkage in the global g_nvme_ctrlrs list. */
+ TAILQ_ENTRY(spdk_vhost_nvme_dev) tailq;
+ /* Pool of idle task contexts. */
+ STAILQ_HEAD(, spdk_vhost_nvme_task) free_tasks;
+ /* Poller running nvme_worker() while the device is started. */
+ struct spdk_poller *requestq_poller;
+ /* Poller and event context used while the device is being stopped. */
+ struct spdk_vhost_dev_destroy_ctx destroy_ctx;
+};
+
+static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend;
+
+/*
+ * Report the SPDK version as the firmware revision.
+ * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts.
+ */
+#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING
+
+static int
+spdk_nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq,
+ struct spdk_vhost_nvme_task *task);
+
+/*
+ * Downcast a generic vhost device to the vhost-nvme controller embedding it.
+ * Logs an error and returns NULL when the device uses a different backend.
+ */
+static struct spdk_vhost_nvme_dev *
+to_nvme_dev(struct spdk_vhost_dev *vdev)
+{
+	if (vdev->backend == &spdk_vhost_nvme_device_backend) {
+		return SPDK_CONTAINEROF(vdev, struct spdk_vhost_nvme_dev, vdev);
+	}
+
+	SPDK_ERRLOG("%s: not a vhost-nvme device\n", vdev->name);
+	return NULL;
+}
+
+/* Global list of vhost-nvme controllers; searched by vid in spdk_vhost_nvme_get_by_name(). */
+static TAILQ_HEAD(, spdk_vhost_nvme_dev) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs);
+
+/* Index of submission queue `qid`'s entry in a doorbell array: even slot 2 * qid, scaled by the stride. */
+static inline unsigned int sq_offset(unsigned int qid, uint32_t db_stride)
+{
+	const unsigned int slot = qid * 2;
+
+	return slot * db_stride;
+}
+
+/* Index of completion queue `qid`'s entry in a doorbell array: odd slot 2 * qid + 1, scaled by the stride. */
+static inline unsigned int cq_offset(unsigned int qid, uint32_t db_stride)
+{
+	const unsigned int slot = qid * 2 + 1;
+
+	return slot * db_stride;
+}
+
+/* Advance the completion queue head by one slot, flipping the phase tag when the ring wraps. */
+static void
+nvme_inc_cq_head(struct spdk_vhost_nvme_cq *cq)
+{
+	const uint16_t next = (uint16_t)(cq->cq_head + 1);
+
+	if (next < cq->size) {
+		cq->cq_head = next;
+		return;
+	}
+
+	/* Wrapped past the last slot: restart at 0 with the opposite phase. */
+	cq->cq_head = 0;
+	cq->phase = !cq->phase;
+}
+
+/* Return true when posting one more entry would make the head catch up with the guest's head. */
+static bool
+nvme_cq_is_full(struct spdk_vhost_nvme_cq *cq)
+{
+	const int next_head = (cq->cq_head + 1) % cq->size;
+
+	return next_head == cq->guest_signaled_cq_head;
+}
+
+/* Advance the submission queue head by one entry, wrapping at the ring size. */
+static void
+nvme_inc_sq_head(struct spdk_vhost_nvme_sq *sq)
+{
+	const int advanced = sq->sq_head + 1;
+
+	sq->sq_head = advanced % sq->size;
+}
+
+/* Look up the I/O submission queue slot for `qid`; NULL when qid is 0 or above MAX_IO_QUEUES. */
+static struct spdk_vhost_nvme_sq *
+spdk_vhost_nvme_get_sq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid)
+{
+	const bool in_range = (qid != 0 && qid <= MAX_IO_QUEUES);
+
+	if (spdk_unlikely(!in_range)) {
+		return NULL;
+	}
+
+	return dev->sq_queue + qid;
+}
+
+/* Look up the I/O completion queue slot for `qid`; NULL when qid is 0 or above MAX_IO_QUEUES. */
+static struct spdk_vhost_nvme_cq *
+spdk_vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid)
+{
+	const bool in_range = (qid != 0 && qid <= MAX_IO_QUEUES);
+
+	if (spdk_unlikely(!in_range)) {
+		return NULL;
+	}
+
+	return dev->cq_queue + qid;
+}
+
+/*
+ * Translate the PRP entries of `cmd` into host iovecs stored in `task`.
+ *
+ * PRP1 covers the data up to the end of its 4 KiB page. Whatever remains is
+ * described either directly by PRP2 (at most one more page) or by a PRP list
+ * that PRP2 points to. Only a single, contiguous PRP list is read; chained
+ * lists are not followed.
+ *
+ * \param nvme controller whose guest memory map is used for translation.
+ * \param cmd command carrying the PRP entries.
+ * \param task task whose iovs[] and iovcnt are filled in.
+ * \param len transfer length in bytes.
+ * \return 0 on success, -1 when a guest address cannot be translated, PRP2 is
+ * missing, or the transfer needs more than MAX_IOVS segments.
+ */
+static int
+spdk_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd,
+		   struct spdk_vhost_nvme_task *task, uint32_t len)
+{
+	uint64_t prp1, prp2;
+	void *vva;
+	uint32_t i;
+	uint32_t residue_len, nents, mps = 4096;
+	uint64_t *prp_list;
+
+	prp1 = cmd->dptr.prp.prp1;
+	prp2 = cmd->dptr.prp.prp2;
+
+	/* PRP1 may start at an unaligned offset within its page */
+	residue_len = mps - (prp1 % mps);
+	residue_len = spdk_min(len, residue_len);
+
+	vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp1, residue_len);
+	if (spdk_unlikely(vva == NULL)) {
+		SPDK_ERRLOG("GPA to VVA failed\n");
+		return -1;
+	}
+	task->iovs[0].iov_base = vva;
+	task->iovs[0].iov_len = residue_len;
+	len -= residue_len;
+
+	if (len == 0) {
+		/* 1 PRP used */
+		task->iovcnt = 1;
+		return 0;
+	}
+
+	if (spdk_unlikely(prp2 == 0)) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PRP2=0 in command\n");
+		return -1;
+	}
+
+	if (len <= mps) {
+		/* 2 PRP used */
+		task->iovcnt = 2;
+		vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp2, len);
+		if (spdk_unlikely(vva == NULL)) {
+			return -1;
+		}
+		task->iovs[1].iov_base = vva;
+		task->iovs[1].iov_len = len;
+		return 0;
+	}
+
+	/* PRP list used. Round up without the overflow-prone "len + mps - 1". */
+	nents = len / mps + ((len % mps) != 0);
+
+	/*
+	 * iovs[0] already holds the PRP1 segment, so only MAX_IOVS - 1 list
+	 * entries fit. The length is guest-controlled; reject anything larger
+	 * instead of writing past the end of task->iovs.
+	 */
+	if (spdk_unlikely(nents > MAX_IOVS - 1)) {
+		SPDK_ERRLOG("PRP list needs %u entries, at most %d supported\n",
+			    nents, MAX_IOVS - 1);
+		return -1;
+	}
+
+	vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp2, nents * sizeof(*prp_list));
+	if (spdk_unlikely(vva == NULL)) {
+		return -1;
+	}
+	prp_list = vva;
+
+	for (i = 0; i < nents; i++) {
+		residue_len = spdk_min(len, mps);
+		vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp_list[i], residue_len);
+		if (spdk_unlikely(vva == NULL)) {
+			return -1;
+		}
+		task->iovs[i + 1].iov_base = vva;
+		task->iovs[i + 1].iov_len = residue_len;
+		len -= residue_len;
+	}
+	task->iovcnt = nents + 1;
+
+	return 0;
+}
+
+/*
+ * Notify the guest about pending completions.
+ *
+ * For every valid completion queue, write to its eventfd when interrupts are
+ * enabled, completions have been posted since the last notification, and the
+ * guest's head in the shadow doorbell differs from our head.
+ */
+static void
+spdk_nvme_cq_signal_fd(struct spdk_vhost_nvme_dev *nvme)
+{
+	struct spdk_vhost_nvme_cq *cq;
+	uint32_t qid, guest_head;
+
+	assert(nvme != NULL);
+
+	for (qid = 1; qid <= MAX_IO_QUEUES; qid++) {
+		cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid);
+		if (cq == NULL || !cq->valid) {
+			continue;
+		}
+
+		guest_head = nvme->dbbuf_dbs[cq_offset(qid, 1)];
+		if (!cq->irq_enabled || !cq->need_signaled_cnt || cq->cq_head == guest_head) {
+			continue;
+		}
+
+		eventfd_write(cq->virq, (eventfd_t)1);
+		cq->need_signaled_cnt = 0;
+	}
+}
+
+/*
+ * Post the completion entry for `task` and return the task to the free pool.
+ *
+ * The status is taken from task->dnr/sct/sc and the entry is written to the
+ * completion queue named by task->cqid. If that queue is currently full, the
+ * task is parked on the queue's cq_full_waited_tasks list instead and the
+ * poller retries later. If the queue IDs do not name a usable queue there is
+ * nowhere to report the result, so the task is simply recycled.
+ */
+static void
+spdk_vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task)
+{
+	struct spdk_vhost_nvme_dev *nvme = task->nvme;
+	struct spdk_nvme_cpl cqe = {0};
+	struct spdk_vhost_nvme_cq *cq;
+	struct spdk_vhost_nvme_sq *sq;
+	struct spdk_nvme_cmd *cmd = &task->cmd;
+	uint16_t cqid = task->cqid;
+	uint16_t sqid = task->sqid;
+
+	cq = spdk_vhost_nvme_get_cq_from_qid(nvme, cqid);
+	sq = spdk_vhost_nvme_get_sq_from_qid(nvme, sqid);
+	if (spdk_unlikely(!cq || !sq || !cq->valid)) {
+		/*
+		 * A queue that was never created or has been deleted has a zero
+		 * size and no ring mapping, so it must not be touched. Recycle
+		 * the task rather than leaking it from the pool.
+		 */
+		STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+		return;
+	}
+
+	cq->guest_signaled_cq_head = nvme->dbbuf_dbs[cq_offset(cqid, 1)];
+	if (spdk_unlikely(nvme_cq_is_full(cq))) {
+		STAILQ_INSERT_TAIL(&cq->cq_full_waited_tasks, task, stailq);
+		return;
+	}
+
+	cqe.sqid = sqid;
+	cqe.sqhd = sq->sq_head;
+	cqe.cid = cmd->cid;
+	cqe.status.dnr = task->dnr;
+	cqe.status.sct = task->sct;
+	cqe.status.sc = task->sc;
+	/* Write the entry with the stale phase first ... */
+	cqe.status.p = !cq->phase;
+	cq->cq_cqe[cq->cq_head] = cqe;
+	spdk_smp_wmb();
+	/* ... and flip the phase bit only after the barrier above. */
+	cq->cq_cqe[cq->cq_head].status.p = cq->phase;
+
+	nvme_inc_cq_head(cq);
+	cq->need_signaled_cnt++;
+
+	/* MMIO Control */
+	nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1);
+
+	STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+}
+
+/*
+ * bdev completion callback for read, write and flush requests.
+ * Copies the NVMe status of the finished I/O into the task, releases the
+ * bdev_io and posts the completion to the guest.
+ */
+static void
+blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct spdk_vhost_nvme_task *task = cb_arg;
+	int status_type, status_code;
+
+	assert(bdev_io != NULL);
+
+	spdk_bdev_io_get_nvme_status(bdev_io, &status_type, &status_code);
+	spdk_bdev_free_io(bdev_io);
+
+	task->dnr = success ? 0 : 1;
+	task->sct = status_type;
+	task->sc = status_code;
+
+	if (spdk_unlikely(!success)) {
+		SPDK_ERRLOG("I/O error, sector %u\n", task->cmd.cdw10);
+	}
+
+	spdk_vhost_nvme_task_complete(task);
+}
+
+/*
+ * bdev completion callback for one unmap child of a Dataset Management command.
+ * A failed child records its status on the parent; the parent is completed
+ * once its last child has finished. The child task always returns to the pool.
+ */
+static void
+blk_unmap_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct spdk_vhost_nvme_task *child = cb_arg;
+	struct spdk_vhost_nvme_task *parent = child->parent;
+	struct spdk_vhost_nvme_dev *nvme = parent->nvme;
+	int status_type, status_code;
+
+	assert(bdev_io != NULL);
+
+	parent->num_children--;
+	if (!success) {
+		spdk_bdev_io_get_nvme_status(bdev_io, &status_type, &status_code);
+		parent->dnr = 1;
+		parent->sct = status_type;
+		parent->sc = status_code;
+	}
+
+	spdk_bdev_free_io(bdev_io);
+
+	if (parent->num_children == 0) {
+		spdk_vhost_nvme_task_complete(parent);
+	}
+
+	STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq);
+}
+
+/* Map a 1-based namespace ID to its slot; NULL when nsid is 0 or above the number of namespaces. */
+static struct spdk_vhost_nvme_ns *
+spdk_vhost_nvme_get_ns_from_nsid(struct spdk_vhost_nvme_dev *dev, uint32_t nsid)
+{
+	const bool in_range = (nsid != 0 && nsid <= dev->num_ns);
+
+	if (spdk_unlikely(!in_range)) {
+		return NULL;
+	}
+
+	return dev->ns + (nsid - 1);
+}
+
+/* bdev io_wait callback: retry a command that previously failed with -ENOMEM. */
+static void
+vhost_nvme_resubmit_task(void *arg)
+{
+	struct spdk_vhost_nvme_task *task = arg;
+	int rc = spdk_nvme_process_sq(task->nvme, task->sq, task);
+
+	if (rc != 0) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "vhost_nvme: task resubmit failed, rc = %d.\n", rc);
+	}
+}
+
+/*
+ * Park `task` on the bdev's io_wait queue so it is resubmitted once the bdev
+ * layer can accept I/O again. If queueing itself fails, the command is
+ * completed with an internal device error.
+ *
+ * \return 0 when the task was queued, otherwise the spdk_bdev_queue_io_wait() error.
+ */
+static int
+vhost_nvme_queue_task(struct spdk_vhost_nvme_task *task)
+{
+	struct spdk_vhost_nvme_ns *ns = task->ns;
+	struct spdk_bdev_io_wait_entry *entry = &task->bdev_io_wait;
+	int rc;
+
+	entry->bdev = ns->bdev;
+	entry->cb_fn = vhost_nvme_resubmit_task;
+	entry->cb_arg = task;
+
+	rc = spdk_bdev_queue_io_wait(task->ns->bdev, task->ns->bdev_io_channel, entry);
+	if (rc == 0) {
+		return 0;
+	}
+
+	SPDK_ERRLOG("Queue io failed in vhost_nvme_queue_task, rc=%d.\n", rc);
+	task->dnr = 1;
+	task->sct = SPDK_NVME_SCT_GENERIC;
+	task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+	spdk_vhost_nvme_task_complete(task);
+
+	return rc;
+}
+
+/*
+ * Decode one NVMe I/O command held in `task` and submit it to the backing bdev.
+ *
+ * Read, Write, Flush and Dataset Management (unmap) are supported. Commands
+ * that cannot be submitted are completed with an error status, except for
+ * -ENOMEM submissions with nothing in flight, which are parked on the bdev
+ * io_wait queue and retried via vhost_nvme_resubmit_task().
+ *
+ * \param nvme controller the command was received on.
+ * \param sq submission queue the command came from.
+ * \param task task whose cmd field holds the command; the rest is initialised here.
+ * \return 0 when the command was submitted or queued for retry, non-zero otherwise.
+ */
+static int
+spdk_nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq,
+		     struct spdk_vhost_nvme_task *task)
+{
+	struct spdk_vhost_nvme_task *child;
+	struct spdk_nvme_cmd *cmd = &task->cmd;
+	struct spdk_vhost_nvme_ns *ns;
+	int ret = -1;
+	uint32_t len, nlba, block_size;
+	uint64_t slba;
+	struct spdk_nvme_dsm_range *range;
+	uint16_t i, num_ranges = 0;
+
+	task->nvme = nvme;
+	task->dnr = 0;
+	task->sct = 0;
+	task->sc = 0;
+	task->num_children = 0;
+	/*
+	 * The queue IDs route the completion entry, so they must be in place
+	 * before the first error path below can complete the task.
+	 */
+	task->cqid = sq->cqid;
+	task->sqid = sq->sqid;
+
+	ns = spdk_vhost_nvme_get_ns_from_nsid(nvme, cmd->nsid);
+	if (spdk_unlikely(!ns)) {
+		task->dnr = 1;
+		task->sct = SPDK_NVME_SCT_GENERIC;
+		task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+		spdk_vhost_nvme_task_complete(task);
+		return -1;
+	}
+
+	block_size = ns->block_size;
+	task->ns = ns;
+
+	if (spdk_unlikely(!ns->active_ns)) {
+		task->dnr = 1;
+		task->sct = SPDK_NVME_SCT_GENERIC;
+		task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+		spdk_vhost_nvme_task_complete(task);
+		return -1;
+	}
+
+	/* valid only for Read/Write commands */
+	nlba = (cmd->cdw12 & 0xffff) + 1;
+	slba = cmd->cdw11;
+	slba = (slba << 32) | cmd->cdw10;
+
+	if (cmd->opc == SPDK_NVME_OPC_READ || cmd->opc == SPDK_NVME_OPC_WRITE ||
+	    cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
+		if (cmd->psdt != SPDK_NVME_PSDT_PRP) {
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PSDT %u%ub in command\n",
+				      cmd->psdt >> 1, cmd->psdt & 1u);
+			task->dnr = 1;
+			task->sct = SPDK_NVME_SCT_GENERIC;
+			task->sc = SPDK_NVME_SC_INVALID_FIELD;
+			spdk_vhost_nvme_task_complete(task);
+			return -1;
+		}
+
+		if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
+			num_ranges = (cmd->cdw10 & 0xff) + 1;
+			len = num_ranges * sizeof(struct spdk_nvme_dsm_range);
+		} else {
+			len = nlba * block_size;
+		}
+
+		ret = spdk_nvme_map_prps(nvme, cmd, task, len);
+		if (spdk_unlikely(ret != 0)) {
+			SPDK_ERRLOG("nvme command map prps failed\n");
+			task->dnr = 1;
+			task->sct = SPDK_NVME_SCT_GENERIC;
+			task->sc = SPDK_NVME_SC_INVALID_FIELD;
+			spdk_vhost_nvme_task_complete(task);
+			return -1;
+		}
+	}
+
+	switch (cmd->opc) {
+	case SPDK_NVME_OPC_READ:
+		ret = spdk_bdev_readv(ns->bdev_desc, ns->bdev_io_channel,
+				      task->iovs, task->iovcnt, slba * block_size,
+				      nlba * block_size, blk_request_complete_cb, task);
+		break;
+	case SPDK_NVME_OPC_WRITE:
+		ret = spdk_bdev_writev(ns->bdev_desc, ns->bdev_io_channel,
+				       task->iovs, task->iovcnt, slba * block_size,
+				       nlba * block_size, blk_request_complete_cb, task);
+		break;
+	case SPDK_NVME_OPC_FLUSH:
+		ret = spdk_bdev_flush(ns->bdev_desc, ns->bdev_io_channel,
+				      0, ns->capacity,
+				      blk_request_complete_cb, task);
+		break;
+	case SPDK_NVME_OPC_DATASET_MANAGEMENT:
+		/*
+		 * NOTE(review): the ranges are read from the first iovec only; a
+		 * range list that spills past the PRP1 page is not handled - confirm
+		 * guests never submit one.
+		 */
+		range = (struct spdk_nvme_dsm_range *)task->iovs[0].iov_base;
+		for (i = 0; i < num_ranges; i++) {
+			if (!STAILQ_EMPTY(&nvme->free_tasks)) {
+				child = STAILQ_FIRST(&nvme->free_tasks);
+				STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+			} else {
+				SPDK_ERRLOG("No free task now\n");
+				ret = -1;
+				break;
+			}
+			task->num_children++;
+			child->parent = task;
+			ret = spdk_bdev_unmap(ns->bdev_desc, ns->bdev_io_channel,
+					      range[i].starting_lba * block_size,
+					      range[i].length * block_size,
+					      blk_unmap_complete_cb, child);
+			if (ret) {
+				/* This child was never submitted, so it must not be counted. */
+				task->num_children--;
+				STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq);
+				break;
+			}
+		}
+		if (ret && task->num_children) {
+			/*
+			 * Earlier ranges are already in flight and their callbacks
+			 * still reference this task. Record the failure and let the
+			 * last child post the completion; completing or requeueing
+			 * here would recycle the task while it is still in use.
+			 */
+			SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret);
+			task->dnr = 1;
+			task->sct = SPDK_NVME_SCT_GENERIC;
+			task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+			return ret;
+		}
+		break;
+	default:
+		ret = -1;
+		break;
+	}
+
+	if (spdk_unlikely(ret)) {
+		if (ret == -ENOMEM) {
+			SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "No memory, start to queue io.\n");
+			task->sq = sq;
+			ret = vhost_nvme_queue_task(task);
+		} else {
+			/* post error status to cqe */
+			SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret);
+			task->dnr = 1;
+			task->sct = SPDK_NVME_SCT_GENERIC;
+			task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+			spdk_vhost_nvme_task_complete(task);
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Poller body: fetch and submit new commands from every valid I/O submission queue.
+ *
+ * For each valid SQ it refreshes the guest's CQ head from the shadow doorbell,
+ * retries at most one completion that was parked because the CQ was full, then
+ * drains commands between sq_head and the tail read from the shadow doorbell.
+ * Finally it signals the guest for queues with pending completions.
+ *
+ * Returns -1 when nothing could be polled (no SQs, no doorbell buffer, an
+ * invalid CQ ID, or an empty task pool); otherwise the command counter of the
+ * last valid SQ visited.
+ */
+static int
+nvme_worker(void *arg)
+{
+ struct spdk_vhost_nvme_dev *nvme = (struct spdk_vhost_nvme_dev *)arg;
+ struct spdk_vhost_nvme_sq *sq;
+ struct spdk_vhost_nvme_cq *cq;
+ struct spdk_vhost_nvme_task *task;
+ uint32_t qid, dbbuf_sq;
+ int ret;
+ int count = -1;
+
+ if (spdk_unlikely(!nvme->num_sqs)) {
+ return -1;
+ }
+
+ /* worker thread can't start before the admin doorbell
+ * buffer config command
+ */
+ if (spdk_unlikely(!nvme->dbbuf_dbs)) {
+ return -1;
+ }
+
+ for (qid = 1; qid <= MAX_IO_QUEUES; qid++) {
+
+ /* qid is always within 1..MAX_IO_QUEUES here, so the lookup cannot return NULL. */
+ sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid);
+ if (!sq->valid) {
+ continue;
+ }
+ cq = spdk_vhost_nvme_get_cq_from_qid(nvme, sq->cqid);
+ if (spdk_unlikely(!cq)) {
+ return -1;
+ }
+ /* Refresh the guest's consumer head, then retry one parked completion if a slot is free. */
+ cq->guest_signaled_cq_head = nvme->dbbuf_dbs[cq_offset(sq->cqid, 1)];
+ if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) &&
+ !nvme_cq_is_full(cq))) {
+ task = STAILQ_FIRST(&cq->cq_full_waited_tasks);
+ STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq);
+ spdk_vhost_nvme_task_complete(task);
+ }
+
+ /* The guest publishes its SQ tail through the shadow doorbell buffer. */
+ dbbuf_sq = nvme->dbbuf_dbs[sq_offset(qid, 1)];
+ sq->sq_tail = (uint16_t)dbbuf_sq;
+ /* The counter restarts for every valid queue. */
+ count = 0;
+
+ while (sq->sq_head != sq->sq_tail) {
+ if (spdk_unlikely(!sq->sq_cmd)) {
+ break;
+ }
+ if (spdk_likely(!STAILQ_EMPTY(&nvme->free_tasks))) {
+ task = STAILQ_FIRST(&nvme->free_tasks);
+ STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+ } else {
+ /* Task pool exhausted: give up for this iteration without signalling the guest. */
+ return -1;
+ }
+
+ /* Copy the command out of guest memory before advancing the head. */
+ task->cmd = sq->sq_cmd[sq->sq_head];
+ nvme_inc_sq_head(sq);
+
+ /* processing IO */
+ ret = spdk_nvme_process_sq(nvme, sq, task);
+ if (spdk_unlikely(ret)) {
+ SPDK_ERRLOG("QID %u CID %u, SQ HEAD %u, DBBUF SQ TAIL %u\n", qid, task->cmd.cid, sq->sq_head,
+ sq->sq_tail);
+ }
+
+ /* MMIO Control */
+ nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1);
+
+ /* Maximum batch I/Os to pick up at once */
+ /* The post-increment compare lets MAX_BATCH_IO + 1 commands through before breaking. */
+ if (count++ == MAX_BATCH_IO) {
+ break;
+ }
+ }
+ }
+
+ /* Completion Queue */
+ spdk_nvme_cq_signal_fd(nvme);
+
+ return count;
+}
+
+/*
+ * Handle the Doorbell Buffer Config admin command.
+ *
+ * PRP1 and PRP2 carry the guest addresses of the shadow doorbell and event
+ * index pages. Both must be 4 KiB aligned and translatable. The controller's
+ * pointers are updated only after both mappings succeed, so a failed command
+ * never leaves a half-configured (or clobbered) doorbell buffer behind for
+ * the poller, which only checks dbbuf_dbs before using both.
+ *
+ * \return 0 on success (cpl status cleared), -1 on failure.
+ */
+static int
+vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme,
+				  struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+	uint64_t dbs_dma_addr, eis_dma_addr;
+	volatile uint32_t *dbs, *eis;
+
+	dbs_dma_addr = cmd->dptr.prp.prp1;
+	eis_dma_addr = cmd->dptr.prp.prp2;
+
+	if ((dbs_dma_addr % 4096) || (eis_dma_addr % 4096)) {
+		return -1;
+	}
+	/* Guest Physical Address to Host Virtual Address */
+	dbs = spdk_vhost_gpa_to_vva(&nvme->vdev, dbs_dma_addr, 4096);
+	eis = spdk_vhost_gpa_to_vva(&nvme->vdev, eis_dma_addr, 4096);
+	if (!dbs || !eis) {
+		return -1;
+	}
+	/* zero the doorbell buffer memory before publishing it */
+	memset((void *)dbs, 0, 4096);
+	memset((void *)eis, 0, 4096);
+
+	nvme->dbbuf_eis = eis;
+	nvme->dbbuf_dbs = dbs;
+
+	cpl->status.sc = 0;
+	cpl->status.sct = 0;
+	return 0;
+}
+
+/*
+ * Handle the Create I/O Submission Queue admin command.
+ *
+ * The queue must be physically contiguous, its ring 4 KiB aligned, its size
+ * within the advertised maximum, and its completion queue already created.
+ * The queue slot is modified only after every check and the ring mapping
+ * have succeeded.
+ *
+ * \return 0 on success (cpl status cleared), -1 on failure.
+ */
+static int
+vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme,
+			struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+	uint16_t qid, qsize, cqid;
+	uint32_t nentries;
+	uint64_t dma_addr;
+	uint64_t requested_len;
+	struct spdk_vhost_nvme_cq *cq;
+	struct spdk_vhost_nvme_sq *sq;
+	struct spdk_nvme_cmd *ring;
+
+	/* physical contiguous */
+	if (!(cmd->cdw11 & 0x1)) {
+		return -1;
+	}
+
+	cqid = (cmd->cdw11 >> 16) & 0xffff;
+	qid = cmd->cdw10 & 0xffff;
+	qsize = (cmd->cdw10 >> 16) & 0xffff;
+	dma_addr = cmd->dptr.prp.prp1;
+	if (!dma_addr || dma_addr % 4096) {
+		return -1;
+	}
+
+	sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid);
+	cq = spdk_vhost_nvme_get_cq_from_qid(nvme, cqid);
+	if (!sq || !cq) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u or CQID %u\n",
+			      qid, cqid);
+		cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+		return -1;
+	}
+
+	/* Completions for this queue would be written into an unmapped, zero-sized ring. */
+	if (!cq->valid) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "CQID %u has not been created\n", cqid);
+		cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+		cpl->status.sc = SPDK_NVME_SC_COMPLETION_QUEUE_INVALID;
+		return -1;
+	}
+
+	/*
+	 * qsize is 0-based. Computing the entry count in 32 bits keeps 0xffff
+	 * from wrapping to a zero-sized queue, and the bound matches CAP.MQES.
+	 */
+	nentries = (uint32_t)qsize + 1;
+	if (nentries > MAX_QUEUE_ENTRIES_SUPPORTED) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Unsupported queue size %u\n", nentries);
+		cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+		return -1;
+	}
+
+	requested_len = sizeof(struct spdk_nvme_cmd) * nentries;
+	ring = spdk_vhost_gpa_to_vva(&nvme->vdev, dma_addr, requested_len);
+	if (!ring) {
+		return -1;
+	}
+
+	sq->sqid = qid;
+	sq->cqid = cqid;
+	sq->size = (uint16_t)nentries;
+	sq->sq_head = sq->sq_tail = 0;
+	sq->sq_cmd = ring;
+	nvme->num_sqs++;
+	sq->valid = true;
+
+	cpl->status.sc = 0;
+	cpl->status.sct = 0;
+	return 0;
+}
+
+/*
+ * Handle the Delete I/O Submission Queue admin command.
+ *
+ * \return 0 on success (cpl status cleared), -1 when the queue ID is out of
+ * range or names a queue that does not exist.
+ */
+static int
+vhost_nvme_delete_io_sq(struct spdk_vhost_nvme_dev *nvme,
+			struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+	uint16_t qid;
+	struct spdk_vhost_nvme_sq *sq;
+
+	qid = cmd->cdw10 & 0xffff;
+	sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid);
+	if (!sq || !sq->valid) {
+		/* Deleting a queue that was never created must not touch the counter. */
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid);
+		cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+		return -1;
+	}
+
+	/* We didn't see scenarios when deleting submission
+	 * queue while I/O is running against the submission
+	 * queue for now, otherwise, we must ensure the poller
+	 * will not run with this submission queue.
+	 */
+	if (nvme->num_sqs > 0) {
+		nvme->num_sqs--;
+	}
+
+	/* Clears every field, including valid and the ring pointer. */
+	memset(sq, 0, sizeof(*sq));
+	sq->valid = false;
+	sq->sq_cmd = NULL;
+
+	cpl->status.sc = 0;
+	cpl->status.sct = 0;
+
+	return 0;
+}
+
+/*
+ * Handle the Create I/O Completion Queue admin command.
+ *
+ * The queue must be physically contiguous, its ring 4 KiB aligned and its
+ * size within the advertised maximum. The queue slot is modified only after
+ * every check and the ring mapping have succeeded. The interrupt eventfd is
+ * supplied later through spdk_vhost_nvme_set_cq_call().
+ *
+ * \return 0 on success (cpl status cleared), -1 on failure.
+ */
+static int
+vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme,
+			struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+	uint16_t qsize, qid;
+	uint32_t nentries;
+	uint64_t dma_addr;
+	struct spdk_vhost_nvme_cq *cq;
+	volatile struct spdk_nvme_cpl *ring;
+	uint64_t requested_len;
+
+	/* physical contiguous */
+	if (!(cmd->cdw11 & 0x1)) {
+		return -1;
+	}
+
+	qid = cmd->cdw10 & 0xffff;
+	qsize = (cmd->cdw10 >> 16) & 0xffff;
+	dma_addr = cmd->dptr.prp.prp1;
+	if (!dma_addr || dma_addr % 4096) {
+		return -1;
+	}
+
+	cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid);
+	if (!cq) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid);
+		cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+		return -1;
+	}
+
+	/*
+	 * qsize is 0-based. Computing the entry count in 32 bits keeps 0xffff
+	 * from wrapping to a zero-sized queue (a later modulo by zero), and the
+	 * bound matches CAP.MQES.
+	 */
+	nentries = (uint32_t)qsize + 1;
+	if (nentries > MAX_QUEUE_ENTRIES_SUPPORTED) {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Unsupported queue size %u\n", nentries);
+		cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+		return -1;
+	}
+
+	requested_len = sizeof(struct spdk_nvme_cpl) * nentries;
+	ring = spdk_vhost_gpa_to_vva(&nvme->vdev, dma_addr, requested_len);
+	if (!ring) {
+		return -1;
+	}
+
+	cq->cqid = qid;
+	cq->size = (uint16_t)nentries;
+	cq->phase = 1;
+	cq->irq_enabled = (cmd->cdw11 >> 1) & 0x1;
+	/* Setup virq through vhost messages */
+	cq->virq = -1;
+	cq->cq_head = 0;
+	cq->guest_signaled_cq_head = 0;
+	cq->need_signaled_cnt = 0;
+	cq->cq_cqe = ring;
+	nvme->num_cqs++;
+	cq->valid = true;
+	STAILQ_INIT(&cq->cq_full_waited_tasks);
+
+	cpl->status.sc = 0;
+	cpl->status.sct = 0;
+	return 0;
+}
+
+/*
+ * Handle the Delete I/O Completion Queue admin command.
+ *
+ * Tasks still parked on the queue's full-wait list can no longer be reported
+ * to the guest, so they are returned to the free pool before the slot is wiped.
+ *
+ * \return 0 on success (cpl status cleared), -1 when the queue ID is out of
+ * range or names a queue that does not exist.
+ */
+static int
+vhost_nvme_delete_io_cq(struct spdk_vhost_nvme_dev *nvme,
+			struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+	uint16_t qid;
+	struct spdk_vhost_nvme_cq *cq;
+	struct spdk_vhost_nvme_task *task;
+
+	qid = cmd->cdw10 & 0xffff;
+	cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid);
+	if (!cq || !cq->valid) {
+		/* Deleting a queue that was never created must not touch the counter. */
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid);
+		cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+		cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+		return -1;
+	}
+
+	/* A valid queue had its wait list initialised at creation, so it is safe to walk. */
+	while (!STAILQ_EMPTY(&cq->cq_full_waited_tasks)) {
+		task = STAILQ_FIRST(&cq->cq_full_waited_tasks);
+		STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq);
+		STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+	}
+
+	if (nvme->num_cqs > 0) {
+		nvme->num_cqs--;
+	}
+
+	/* Clears every field, including valid and the ring pointer. */
+	memset(cq, 0, sizeof(*cq));
+	cq->valid = false;
+	cq->cq_cqe = NULL;
+
+	cpl->status.sc = 0;
+	cpl->status.sct = 0;
+	return 0;
+}
+
+/* Find the controller whose vhost connection ID equals `vid`; NULL when none matches. */
+static struct spdk_vhost_nvme_dev *
+spdk_vhost_nvme_get_by_name(int vid)
+{
+	struct spdk_vhost_nvme_dev *found = NULL;
+	struct spdk_vhost_nvme_dev *it;
+
+	TAILQ_FOREACH(it, &g_nvme_ctrlrs, tailq) {
+		if (it->vdev.vid == vid) {
+			found = it;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * Report the raw CAP register of the controller attached to `vid`.
+ *
+ * \return 0 with *cap filled in, or -1 when no controller matches `vid`.
+ */
+int
+spdk_vhost_nvme_get_cap(int vid, uint64_t *cap)
+{
+	struct spdk_vhost_nvme_dev *nvme = spdk_vhost_nvme_get_by_name(vid);
+
+	if (nvme == NULL) {
+		return -1;
+	}
+
+	*cap = nvme->cap.raw;
+	return 0;
+}
+
+/*
+ * Emulate one admin command forwarded for the controller attached to `vid`.
+ *
+ * \param vid vhost connection ID used to find the controller.
+ * \param cmd the admin command (struct spdk_nvme_cmd).
+ * \param cqe completion entry to fill in (struct spdk_nvme_cpl).
+ * \param buf data buffer that receives Identify payloads.
+ * \return -1 when no controller matches `vid`, otherwise 0. Handler failures
+ * are only logged; opcodes that are not emulated leave `cqe` untouched.
+ */
+int
+spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf)
+{
+	struct spdk_nvme_cmd *req = (struct spdk_nvme_cmd *)cmd;
+	struct spdk_nvme_cpl *cpl = (struct spdk_nvme_cpl *)cqe;
+	struct spdk_vhost_nvme_ns *ns;
+	int ret = 0;
+	struct spdk_vhost_nvme_dev *nvme;
+	uint32_t cq_head, sq_tail;
+
+	nvme = spdk_vhost_nvme_get_by_name(vid);
+	if (!nvme) {
+		return -1;
+	}
+
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Admin Command Opcode %u\n", req->opc);
+	switch (req->opc) {
+	case SPDK_NVME_OPC_IDENTIFY:
+		if (req->cdw10 == SPDK_NVME_IDENTIFY_CTRLR) {
+			memcpy(buf, &nvme->cdata, sizeof(struct spdk_nvme_ctrlr_data));
+
+		} else if (req->cdw10 == SPDK_NVME_IDENTIFY_NS) {
+			ns = spdk_vhost_nvme_get_ns_from_nsid(nvme, req->nsid);
+			if (!ns) {
+				cpl->status.sc = SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE;
+				cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+				break;
+			}
+			memcpy(buf, &ns->nsdata, sizeof(struct spdk_nvme_ns_data));
+		}
+		/* successfully */
+		cpl->status.sc = 0;
+		cpl->status.sct = 0;
+		break;
+	case SPDK_NVME_OPC_CREATE_IO_CQ:
+		ret = vhost_nvme_create_io_cq(nvme, req, cpl);
+		break;
+	case SPDK_NVME_OPC_DELETE_IO_CQ:
+		ret = vhost_nvme_delete_io_cq(nvme, req, cpl);
+		break;
+	case SPDK_NVME_OPC_CREATE_IO_SQ:
+		ret = vhost_nvme_create_io_sq(nvme, req, cpl);
+		break;
+	case SPDK_NVME_OPC_DELETE_IO_SQ:
+		ret = vhost_nvme_delete_io_sq(nvme, req, cpl);
+		break;
+	case SPDK_NVME_OPC_GET_FEATURES:
+	case SPDK_NVME_OPC_SET_FEATURES:
+		if (req->cdw10 == SPDK_NVME_FEAT_NUMBER_OF_QUEUES) {
+			cpl->status.sc = 0;
+			cpl->status.sct = 0;
+			/* 0-based queue counts: SQs in the low half, CQs in the high half */
+			cpl->cdw0 = (nvme->num_io_queues - 1) | ((nvme->num_io_queues - 1) << 16);
+		} else {
+			cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+			cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+		}
+		break;
+	case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG:
+		ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl);
+		break;
+	case SPDK_NVME_OPC_ABORT:
+		/*
+		 * The shadow doorbells exist only after Doorbell Buffer Config;
+		 * an earlier Abort must not dereference the NULL buffer.
+		 */
+		if (nvme->dbbuf_dbs != NULL) {
+			sq_tail = nvme->dbbuf_dbs[sq_offset(1, 1)] & 0xffffu;
+			cq_head = nvme->dbbuf_dbs[cq_offset(1, 1)] & 0xffffu;
+			SPDK_NOTICELOG("ABORT: CID %u, SQ_TAIL %u, CQ_HEAD %u\n",
+				       (req->cdw10 >> 16) & 0xffffu, sq_tail, cq_head);
+		} else {
+			SPDK_NOTICELOG("ABORT: CID %u, doorbell buffer not configured\n",
+				       (req->cdw10 >> 16) & 0xffffu);
+		}
+		/* TODO: ABORT is always reported as not performed for now */
+		cpl->cdw0 = 1;
+		cpl->status.sc = 0;
+		cpl->status.sct = 0;
+		break;
+	default:
+		/* Not emulated: the completion entry is left as the caller provided it. */
+		break;
+	}
+
+	if (ret) {
+		SPDK_ERRLOG("Admin Passthrough Faild with %u\n", req->opc);
+	}
+
+	return 0;
+}
+
+/*
+ * Attach the interrupt eventfd `fd` to completion queue `qid` of the
+ * controller attached to `vid`. The fd is stored only when the queue was
+ * created with interrupts enabled; otherwise an error is logged.
+ *
+ * \return 0 on success (including the interrupts-disabled case), -1 when the
+ * controller or queue cannot be found.
+ */
+int
+spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd)
+{
+	struct spdk_vhost_nvme_dev *nvme = spdk_vhost_nvme_get_by_name(vid);
+	struct spdk_vhost_nvme_cq *cq;
+
+	if (nvme == NULL) {
+		return -1;
+	}
+
+	cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid);
+	if (cq == NULL) {
+		return -1;
+	}
+
+	if (!cq->irq_enabled) {
+		SPDK_ERRLOG("NVMe Qid %d Disabled IRQ\n", qid);
+		return 0;
+	}
+
+	cq->virq = fd;
+	return 0;
+}
+
+/* Release every task currently sitting on the controller's free list. */
+static void
+free_task_pool(struct spdk_vhost_nvme_dev *nvme)
+{
+	struct spdk_vhost_nvme_task *task;
+
+	while ((task = STAILQ_FIRST(&nvme->free_tasks)) != NULL) {
+		STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+		spdk_dma_free(task);
+	}
+}
+
+/*
+ * Fill the controller's free list with one zeroed task per possible
+ * outstanding command (num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED).
+ *
+ * \return 0 on success; -1 after releasing the partial pool when an allocation fails.
+ */
+static int
+alloc_task_pool(struct spdk_vhost_nvme_dev *nvme)
+{
+	const uint32_t wanted = nvme->num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED;
+	struct spdk_vhost_nvme_task *task;
+	uint32_t made;
+
+	for (made = 0; made < wanted; made++) {
+		task = spdk_dma_zmalloc(sizeof(struct spdk_vhost_nvme_task),
+					SPDK_CACHE_LINE_SIZE, NULL);
+		if (task != NULL) {
+			STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+			continue;
+		}
+
+		SPDK_ERRLOG("Controller %s alloc task pool failed\n",
+			    nvme->vdev.name);
+		free_task_pool(nvme);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* new device means enable the
+ * virtual NVMe controller
+ *
+ * Allocates the task pool, acquires an I/O channel for every namespace and
+ * registers the request poller. On failure everything acquired so far is
+ * released again so a later start attempt begins from a clean state.
+ *
+ * \return 0 on success, -1 on failure.
+ */
+static int
+spdk_vhost_nvme_start_device(struct spdk_vhost_dev *vdev, void *event_ctx)
+{
+	struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+	struct spdk_vhost_nvme_ns *ns_dev;
+	uint32_t i;
+
+	if (nvme == NULL) {
+		return -1;
+	}
+
+	if (alloc_task_pool(nvme)) {
+		return -1;
+	}
+
+	SPDK_NOTICELOG("Start Device %u, Path %s, lcore %d\n", vdev->vid,
+		       vdev->path, vdev->lcore);
+
+	for (i = 0; i < nvme->num_ns; i++) {
+		ns_dev = &nvme->ns[i];
+		ns_dev->bdev_io_channel = spdk_bdev_get_io_channel(ns_dev->bdev_desc);
+		if (!ns_dev->bdev_io_channel) {
+			goto err_channels;
+		}
+	}
+
+	/* Start the NVMe Poller */
+	nvme->requestq_poller = spdk_poller_register(nvme_worker, nvme, 0);
+
+	spdk_vhost_dev_backend_event_done(event_ctx, 0);
+	return 0;
+
+err_channels:
+	/* Namespaces [0, i) hold a channel; the failing one was left NULL. */
+	while (i-- > 0) {
+		ns_dev = &nvme->ns[i];
+		spdk_put_io_channel(ns_dev->bdev_io_channel);
+		ns_dev->bdev_io_channel = NULL;
+	}
+	free_task_pool(nvme);
+	return -1;
+}
+
+/* Mark a namespace inactive, close its bdev descriptor and drop its bdev references. */
+static void
+spdk_vhost_nvme_deactive_ns(struct spdk_vhost_nvme_ns *ns)
+{
+	struct spdk_bdev_desc *desc = ns->bdev_desc;
+
+	ns->active_ns = 0;
+	spdk_bdev_close(desc);
+
+	ns->bdev_desc = NULL;
+	ns->bdev = NULL;
+}
+
+/* bdev removal callback: log the event and deactivate the namespace passed as context. */
+static void
+bdev_remove_cb(void *remove_ctx)
+{
+	struct spdk_vhost_nvme_ns *removed_ns = remove_ctx;
+	const char *bdev_name = spdk_bdev_get_name(removed_ns->bdev);
+
+	SPDK_NOTICELOG("Removing NS %u, Block Device %s\n",
+		       removed_ns->nsid, bdev_name);
+
+	spdk_vhost_nvme_deactive_ns(removed_ns);
+}
+
+/*
+ * Poller callback that finishes stopping a device.
+ *
+ * If the controller is still on the global list, its namespace I/O channels
+ * are released and its queue counters and doorbell pointers are reset. The
+ * poller then unregisters itself and reports completion of the stop event.
+ */
+static int
+destroy_device_poller_cb(void *arg)
+{
+	struct spdk_vhost_nvme_dev *nvme = arg;
+	struct spdk_vhost_nvme_dev *cur, *next;
+	struct spdk_vhost_nvme_ns *ns;
+	uint32_t idx;
+
+	SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Destroy device poller callback\n");
+
+	TAILQ_FOREACH_SAFE(cur, &g_nvme_ctrlrs, tailq, next) {
+		if (cur != nvme) {
+			continue;
+		}
+
+		for (idx = 0; idx < nvme->num_ns; idx++) {
+			ns = &nvme->ns[idx];
+			if (ns->bdev_io_channel == NULL) {
+				continue;
+			}
+			spdk_put_io_channel(ns->bdev_io_channel);
+			ns->bdev_io_channel = NULL;
+		}
+
+		nvme->num_sqs = 0;
+		nvme->num_cqs = 0;
+		nvme->dbbuf_dbs = NULL;
+		nvme->dbbuf_eis = NULL;
+	}
+
+	spdk_poller_unregister(&nvme->destroy_ctx.poller);
+	spdk_vhost_dev_backend_event_done(nvme->destroy_ctx.event_ctx, 0);
+
+	return -1;
+}
+
+/*
+ * Disable the NVMe controller: release the task pool, stop the request
+ * poller and hand the rest of the teardown to destroy_device_poller_cb().
+ *
+ * \return 0 when teardown was started, -1 when vdev is not a vhost-nvme device.
+ */
+static int
+spdk_vhost_nvme_stop_device(struct spdk_vhost_dev *vdev, void *event_ctx)
+{
+	struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+	struct spdk_vhost_dev_destroy_ctx *ctx;
+
+	if (!nvme) {
+		return -1;
+	}
+
+	free_task_pool(nvme);
+	SPDK_NOTICELOG("Stopping Device %u, Path %s\n", vdev->vid, vdev->path);
+
+	ctx = &nvme->destroy_ctx;
+	ctx->event_ctx = event_ctx;
+	spdk_poller_unregister(&nvme->requestq_poller);
+	ctx->poller = spdk_poller_register(destroy_device_poller_cb, nvme, 1000);
+
+	return 0;
+}
+
+/* Write a "namespaces" array with the nsid and bdev name of every active namespace. */
+static void
+spdk_vhost_nvme_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+	struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+	struct spdk_vhost_nvme_ns *ns;
+	uint32_t idx;
+
+	if (!nvme) {
+		return;
+	}
+
+	spdk_json_write_named_array_begin(w, "namespaces");
+
+	for (idx = 0; idx < nvme->num_ns; idx++) {
+		ns = nvme->ns + idx;
+		if (ns->active_ns) {
+			spdk_json_write_object_begin(w);
+			spdk_json_write_named_uint32(w, "nsid", ns->nsid);
+			spdk_json_write_named_string(w, "bdev", spdk_bdev_get_name(ns->bdev));
+			spdk_json_write_object_end(w);
+		}
+	}
+
+	spdk_json_write_array_end(w);
+}
+
+/*
+ * Emit the RPC calls that recreate this controller: one
+ * "construct_vhost_nvme_controller" object followed by one
+ * "add_vhost_nvme_ns" object per active namespace.
+ */
+static void
+spdk_vhost_nvme_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+	struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+	struct spdk_vhost_nvme_ns *ns;
+	uint32_t idx;
+
+	if (!nvme) {
+		return;
+	}
+
+	/* Controller */
+	spdk_json_write_object_begin(w);
+	spdk_json_write_named_string(w, "method", "construct_vhost_nvme_controller");
+
+	spdk_json_write_named_object_begin(w, "params");
+	spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name);
+	spdk_json_write_named_uint32(w, "io_queues", nvme->num_io_queues);
+	spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(nvme->vdev.cpumask));
+	spdk_json_write_object_end(w);
+
+	spdk_json_write_object_end(w);
+
+	/* Namespaces */
+	for (idx = 0; idx < nvme->num_ns; idx++) {
+		ns = nvme->ns + idx;
+		if (ns->active_ns) {
+			spdk_json_write_object_begin(w);
+			spdk_json_write_named_string(w, "method", "add_vhost_nvme_ns");
+
+			spdk_json_write_named_object_begin(w, "params");
+			spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name);
+			spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(ns->bdev));
+			spdk_json_write_object_end(w);
+
+			spdk_json_write_object_end(w);
+		}
+	}
+}
+
+/*
+ * Backend callback table for vhost-nvme devices. to_nvme_dev() compares a
+ * device's backend pointer against this object to recognise vhost-nvme devices.
+ * NOTE(review): spdk_vhost_nvme_dev_remove is not declared in the visible part
+ * of this file - presumably it comes from a header; confirm.
+ */
+static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend = {
+ .start_device = spdk_vhost_nvme_start_device,
+ .stop_device = spdk_vhost_nvme_stop_device,
+ .dump_info_json = spdk_vhost_nvme_dump_info_json,
+ .write_config_json = spdk_vhost_nvme_write_config_json,
+ .remove_device = spdk_vhost_nvme_dev_remove,
+};
+
+/*
+ * Refresh the Identify Namespace data of every namespace slot from its bdev.
+ * Active namespaces get size, capacity and LBA format filled in together with
+ * the cached block size and byte capacity; inactive slots are zeroed.
+ *
+ * \return always 0.
+ */
+static int
+spdk_vhost_nvme_ns_identify_update(struct spdk_vhost_nvme_dev *dev)
+{
+	struct spdk_vhost_nvme_ns *ns;
+	struct spdk_nvme_ns_data *nsdata;
+	uint64_t num_blocks;
+	uint32_t idx;
+
+	/* Identify Namespace */
+	dev->cdata.nn = dev->num_ns;
+
+	for (idx = 0; idx < dev->num_ns; idx++) {
+		ns = &dev->ns[idx];
+		nsdata = &ns->nsdata;
+
+		if (!ns->active_ns) {
+			memset(nsdata, 0, sizeof(*nsdata));
+			continue;
+		}
+
+		num_blocks = spdk_bdev_get_num_blocks(ns->bdev);
+		nsdata->nsze = num_blocks;
+		/* ncap must be non-zero for active Namespace */
+		nsdata->ncap = num_blocks;
+		nsdata->nuse = num_blocks;
+		nsdata->nlbaf = 0;
+		nsdata->flbas.format = 0;
+		nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(ns->bdev));
+		nsdata->noiob = spdk_bdev_get_optimal_io_boundary(ns->bdev);
+
+		ns->block_size = spdk_bdev_get_block_size(ns->bdev);
+		ns->capacity = num_blocks * ns->block_size;
+	}
+
+	return 0;
+}
+
+ /*
+  * Populate the emulated controller's register state and Identify Controller
+  * data with their fixed defaults, then refresh the per-namespace Identify
+  * data through spdk_vhost_nvme_ns_identify_update().
+  *
+  * The controller is left disabled (CC.EN = 0) and not ready (CSTS.RDY = 0).
+  * Always returns 0.
+  */
+ static int
+ spdk_vhost_nvme_ctrlr_identify_update(struct spdk_vhost_nvme_dev *dev)
+ {
+     struct spdk_nvme_ctrlr_data *id = &dev->cdata;
+     char serial[20];
+
+     /* Controller Capabilities (CAP) */
+     /* MQES is 0 based value */
+     dev->cap.bits.mqes = MAX_QUEUE_ENTRIES_SUPPORTED - 1;
+     dev->cap.bits.cqr = 1;
+     dev->cap.bits.to = 1;
+     dev->cap.bits.dstrd = 0;
+     dev->cap.bits.css = SPDK_NVME_CAP_CSS_NVM;
+     dev->cap.bits.mpsmin = 0;
+     dev->cap.bits.mpsmax = 0;
+
+     /* Controller Configuration / Controller Status: disabled, not ready */
+     dev->cc.bits.en = 0;
+     dev->csts.bits.rdy = 0;
+
+     /* Identify Controller: vendor identity */
+     id->vid = 0x8086;
+     id->ssvid = 0x8086;
+     id->ieee[0] = 0xe4;
+     id->ieee[1] = 0xd2;
+     id->ieee[2] = 0x5c;
+
+     /* Identify Controller: space-padded text fields */
+     spdk_strcpy_pad(id->fr, FW_VERSION, sizeof(id->fr), ' ');
+     spdk_strcpy_pad(id->mn, "SPDK Virtual NVMe Controller", sizeof(id->mn), ' ');
+     /* Serial number is "NVMe_" plus the device name, cut to fit the buffer. */
+     snprintf(serial, sizeof(serial), "NVMe_%s", dev->vdev.name);
+     spdk_strcpy_pad(id->sn, serial, sizeof(id->sn), ' ');
+
+     /* Identify Controller: version and limits */
+     id->ver.bits.mjr = 1;
+     id->ver.bits.mnr = 0;
+     id->mdts = 5; /* 128 KiB */
+     id->rab = 6;
+
+     /* Queue entry sizes, given as powers of two (min == max) */
+     id->sqes.min = 6;
+     id->sqes.max = 6;
+     id->cqes.min = 4;
+     id->cqes.max = 4;
+
+     /* Optional command support */
+     id->oncs.dsm = 1;
+     /* Emulated NVMe controller */
+     id->oacs.doorbell_buffer_config = 1;
+
+     spdk_vhost_nvme_ns_identify_update(dev);
+
+     return 0;
+ }
+
+ /*
+  * Create a vhost-NVMe controller and register it with the vhost layer.
+  *
+  * name          - controller name, forwarded to spdk_vhost_dev_register().
+  * cpumask       - CPU mask string, forwarded to spdk_vhost_dev_register().
+  * num_io_queues - number of I/O queues; must be in [1, MAX_IO_QUEUES].
+  *
+  * Returns 0 on success, -EINVAL for an out-of-range queue count, -ENOMEM if
+  * the controller cannot be allocated, or the error reported by
+  * spdk_vhost_dev_register().
+  */
+ int
+ spdk_vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t num_io_queues)
+ {
+     struct spdk_vhost_nvme_dev *dev;
+     int rc;
+
+     /* Reject bad arguments before touching DMA memory at all. */
+     if (num_io_queues < 1 || num_io_queues > MAX_IO_QUEUES) {
+         return -EINVAL;
+     }
+
+     dev = spdk_dma_zmalloc(sizeof(*dev), SPDK_CACHE_LINE_SIZE, NULL);
+     if (dev == NULL) {
+         return -ENOMEM;
+     }
+
+     spdk_vhost_lock();
+
+     rc = spdk_vhost_dev_register(&dev->vdev, name, cpumask,
+                                  &spdk_vhost_nvme_device_backend);
+     if (rc != 0) {
+         spdk_dma_free(dev);
+         spdk_vhost_unlock();
+         return rc;
+     }
+
+     /* Registration succeeded: finish initialization and publish the controller. */
+     dev->num_io_queues = num_io_queues;
+     STAILQ_INIT(&dev->free_tasks);
+     TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, dev, tailq);
+
+     spdk_vhost_nvme_ctrlr_identify_update(dev);
+
+     SPDK_NOTICELOG("Controller %s: Constructed\n", name);
+     spdk_vhost_unlock();
+     return rc;
+ }
+
+ /*
+  * Unregister and destroy a vhost-NVMe controller.
+  *
+  * The device is unregistered from the vhost layer first. Only when that
+  * succeeds is the controller unlinked from g_nvme_ctrlrs, its active
+  * namespaces deactivated and its memory freed. If unregistration fails, the
+  * controller is left untouched and still usable.
+  *
+  * Returns 0 on success, -EINVAL if vdev is not a vhost-NVMe device, or the
+  * error reported by spdk_vhost_dev_unregister().
+  */
+ int
+ spdk_vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev)
+ {
+     struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+     struct spdk_vhost_nvme_dev *dev;
+     struct spdk_vhost_nvme_ns *ns;
+     int rc;
+     uint32_t i;
+
+     if (nvme == NULL) {
+         return -EINVAL;
+     }
+
+     /*
+      * Unregister before tearing anything down, so a failure here does not
+      * leave a registered controller without namespaces or a list entry.
+      * NOTE(review): assumes spdk_vhost_nvme_deactive_ns() does not need the
+      * device to still be registered - confirm against its definition.
+      */
+     rc = spdk_vhost_dev_unregister(vdev);
+     if (rc != 0) {
+         return rc;
+     }
+
+     /* Unlink from the global controller list; stop at the first match. */
+     TAILQ_FOREACH(dev, &g_nvme_ctrlrs, tailq) {
+         if (dev == nvme) {
+             TAILQ_REMOVE(&g_nvme_ctrlrs, dev, tailq);
+             break;
+         }
+     }
+
+     /* Deactivate every namespace that is still active. */
+     for (i = 0; i < nvme->num_ns; i++) {
+         ns = &nvme->ns[i];
+         if (ns->active_ns) {
+             spdk_vhost_nvme_deactive_ns(ns);
+         }
+     }
+
+     spdk_dma_free(nvme);
+     return 0;
+ }
+
+ /*
+  * Attach the bdev called bdev_name to the controller as its next namespace.
+  *
+  * The bdev is looked up by name and opened with the write flag set, with
+  * bdev_remove_cb registered as its removal callback. The next free slot is
+  * then marked active with NSID = slot index + 1, and the Identify Namespace
+  * data is refreshed.
+  *
+  * Returns 0 on success, -ENODEV if vdev is not a vhost-NVMe device or the
+  * bdev does not exist, -ENOSPC if MAX_NAMESPACE namespaces are already
+  * attached, or the error reported by spdk_bdev_open().
+  */
+ int
+ spdk_vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, const char *bdev_name)
+ {
+     struct spdk_vhost_nvme_dev *ctrlr = to_nvme_dev(vdev);
+     struct spdk_vhost_nvme_ns *slot;
+     struct spdk_bdev *target;
+     int status;
+
+     if (ctrlr == NULL) {
+         return -ENODEV;
+     }
+
+     if (ctrlr->num_ns == MAX_NAMESPACE) {
+         SPDK_ERRLOG("Can't support %d Namespaces\n", ctrlr->num_ns);
+         return -ENOSPC;
+     }
+
+     target = spdk_bdev_get_by_name(bdev_name);
+     if (target == NULL) {
+         SPDK_ERRLOG("could not find bdev %s\n", bdev_name);
+         return -ENODEV;
+     }
+
+     /* The next free slot doubles as the removal callback's context. */
+     slot = &ctrlr->ns[ctrlr->num_ns];
+     status = spdk_bdev_open(target, true, bdev_remove_cb, slot, &slot->bdev_desc);
+     if (status != 0) {
+         SPDK_ERRLOG("Could not open bdev '%s', error=%d\n",
+                     bdev_name, status);
+         return status;
+     }
+
+     slot->bdev = target;
+     slot->active_ns = 1;
+     /* NSIDs are 1-based, slot indices are 0-based. */
+     slot->nsid = ctrlr->num_ns + 1;
+     ctrlr->num_ns++;
+
+     spdk_vhost_nvme_ns_identify_update(ctrlr);
+
+     return status;
+ }
+
+ /*
+  * Build vhost-NVMe controllers from the legacy configuration file.
+  *
+  * Every section whose name starts with "VhostNvme" must carry a numeric
+  * suffix and a "Name" value. "Cpumask" is passed through as-is, and
+  * "NumberOfQueues" falls back to 1 when it is missing or not positive. Each
+  * "Namespace" entry names a bdev to attach. A failing namespace stops
+  * namespace processing for that section only, without failing the call.
+  *
+  * Returns 0 on success, -1 on a malformed section or a controller that
+  * cannot be constructed or found.
+  */
+ int
+ spdk_vhost_nvme_controller_construct(void)
+ {
+     struct spdk_conf_section *section;
+     struct spdk_vhost_dev *vdev;
+     const char *section_name;
+     const char *name;
+     const char *cpumask;
+     const char *bdev_name;
+     uint32_t ctrlr_num, io_queues;
+     int rc, i = 0;
+
+     for (section = spdk_conf_first_section(NULL); section != NULL;
+          section = spdk_conf_next_section(section)) {
+         if (!spdk_conf_section_match_prefix(section, "VhostNvme")) {
+             continue;
+         }
+
+         section_name = spdk_conf_section_get_name(section);
+         if (sscanf(section_name, "VhostNvme%u", &ctrlr_num) != 1) {
+             SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+                         section_name);
+             return -1;
+         }
+
+         name = spdk_conf_section_get_val(section, "Name");
+         if (name == NULL) {
+             SPDK_ERRLOG("VhostNvme%u: missing Name\n", ctrlr_num);
+             return -1;
+         }
+
+         cpumask = spdk_conf_section_get_val(section, "Cpumask");
+
+         /* A missing or non-positive queue count falls back to one I/O queue. */
+         rc = spdk_conf_section_get_intval(section, "NumberOfQueues");
+         io_queues = (rc > 0) ? rc : 1;
+
+         rc = spdk_vhost_nvme_dev_construct(name, cpumask, io_queues);
+         if (rc < 0) {
+             SPDK_ERRLOG("VhostNvme%u: Construct failed\n", ctrlr_num);
+             return -1;
+         }
+
+         vdev = spdk_vhost_dev_find(name);
+         if (vdev == NULL) {
+             return -1;
+         }
+
+         /* Attach the listed namespaces in order; stop this section at the first failure. */
+         i = 0;
+         while (spdk_conf_section_get_nval(section, "Namespace", i) != NULL) {
+             bdev_name = spdk_conf_section_get_nmval(section, "Namespace", i, 0);
+             if (bdev_name == NULL) {
+                 SPDK_ERRLOG("namespace configuration missing bdev name\n");
+                 break;
+             }
+
+             rc = spdk_vhost_nvme_dev_add_ns(vdev, bdev_name);
+             if (rc < 0) {
+                 SPDK_WARNLOG("VhostNvme%u: Construct Namespace with %s failed\n",
+                              ctrlr_num, bdev_name);
+                 break;
+             }
+
+             i++;
+         }
+     }
+
+     return 0;
+ }
+
+ SPDK_LOG_REGISTER_COMPONENT("vhost_nvme", SPDK_LOG_VHOST_NVME) /* NOTE(review): presumably registers this file's log flag under the name "vhost_nvme" - confirm against the SPDK log header */