author		Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-21 11:54:28 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-21 11:54:28 +0000
commit		e6918187568dbd01842d8d1d2c808ce16a894239
tree		64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/module/bdev
parent		Initial commit.
download	ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
		ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/module/bdev')
110 files changed, 40342 insertions, 0 deletions
diff --git a/src/spdk/module/bdev/Makefile b/src/spdk/module/bdev/Makefile
new file mode 100644
index 000000000..2e30470ec
--- /dev/null
+++ b/src/spdk/module/bdev/Makefile
@@ -0,0 +1,61 @@
+#
+#  BSD LICENSE
+#
+#  Copyright (c) Intel Corporation.
+#  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Intel Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+DIRS-y += delay error gpt lvol malloc null nvme passthru raid rpc split zone_block
+
+DIRS-$(CONFIG_CRYPTO) += crypto
+
+DIRS-$(CONFIG_OCF) += ocf
+
+DIRS-$(CONFIG_REDUCE) += compress
+
+DIRS-$(CONFIG_URING) += uring
+
+ifeq ($(OS),Linux)
+DIRS-y += aio ftl
+DIRS-$(CONFIG_ISCSI_INITIATOR) += iscsi
+DIRS-$(CONFIG_VIRTIO) += virtio
+DIRS-$(CONFIG_PMDK) += pmem
+endif
+
+DIRS-$(CONFIG_RBD) += rbd
+
+.PHONY: all clean $(DIRS-y)
+
+all: $(DIRS-y)
+clean: $(DIRS-y)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk
diff --git a/src/spdk/module/bdev/aio/Makefile b/src/spdk/module/bdev/aio/Makefile
new file mode 100644
index 000000000..9f0e3a582
--- /dev/null
+++ b/src/spdk/module/bdev/aio/Makefile
@@ -0,0 +1,46 @@
+#
+#  BSD LICENSE
+#
+#  Copyright (c) Intel Corporation.
+#  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Intel Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = bdev_aio.c bdev_aio_rpc.c
+LIBNAME = bdev_aio
+LOCAL_SYS_LIBS = -laio
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/aio/bdev_aio.c b/src/spdk/module/bdev/aio/bdev_aio.c
new file mode 100644
index 000000000..4b49fb2c3
--- /dev/null
+++ b/src/spdk/module/bdev/aio/bdev_aio.c
@@ -0,0 +1,827 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright (c) Intel Corporation.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_aio.h"
+
+#include "spdk/stdinc.h"
+
+#include "spdk/barrier.h"
+#include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
+#include "spdk/conf.h"
+#include "spdk/env.h"
+#include "spdk/fd.h"
+#include "spdk/likely.h"
+#include "spdk/thread.h"
+#include "spdk/json.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+
+#include <libaio.h>
+
+struct bdev_aio_io_channel {
+	uint64_t			io_inflight;
+	struct bdev_aio_group_channel	*group_ch;
+};
+
+struct bdev_aio_group_channel {
+	struct spdk_poller	*poller;
+	io_context_t		io_ctx;
+};
+
+struct bdev_aio_task {
+	struct iocb			iocb;
+	uint64_t			len;
+	struct bdev_aio_io_channel	*ch;
+	TAILQ_ENTRY(bdev_aio_task)	link;
+};
+
+struct file_disk {
+	struct bdev_aio_task	*reset_task;
+	struct spdk_poller	*reset_retry_timer;
+	struct spdk_bdev	disk;
+	char			*filename;
+	int			fd;
+	TAILQ_ENTRY(file_disk)	link;
+	bool			block_size_override;
+};
+
+/* For user space reaping of completions */
+struct spdk_aio_ring {
+	uint32_t id;
+	uint32_t size;
+	uint32_t head;
+	uint32_t tail;
+
+	uint32_t version;
+	uint32_t compat_features;
+	uint32_t incompat_features;
+	uint32_t header_length;
+};
+
+#define SPDK_AIO_RING_VERSION	0xa10a10a1
+
+static int bdev_aio_initialize(void);
+static void bdev_aio_fini(void);
+static void aio_free_disk(struct file_disk *fdisk);
+static void bdev_aio_get_spdk_running_config(FILE *fp);
+static TAILQ_HEAD(, file_disk) g_aio_disk_head;
+
+#define SPDK_AIO_QUEUE_DEPTH 128
+#define MAX_EVENTS_PER_POLL 32
+
+static int
+bdev_aio_get_ctx_size(void)
+{
+	return sizeof(struct bdev_aio_task);
+}
+
+static struct spdk_bdev_module aio_if = {
+	.name		= "aio",
+	.module_init	= bdev_aio_initialize,
+	.module_fini	= bdev_aio_fini,
+	.config_text	= bdev_aio_get_spdk_running_config,
+	.get_ctx_size	= bdev_aio_get_ctx_size,
+};
+
+SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)
+
+static int
+bdev_aio_open(struct file_disk *disk)
+{
+	int fd;
+
+	fd = open(disk->filename, O_RDWR | O_DIRECT);
+	if (fd < 0) {
+		/* Try without O_DIRECT for non-disk files */
+		fd = open(disk->filename, O_RDWR);
+		if (fd < 0) {
+			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
+				    disk->filename, errno, spdk_strerror(errno));
+			disk->fd = -1;
+			return -1;
+		}
+	}
+
+	disk->fd = fd;
+
+	return 0;
+}
+
+static int
+bdev_aio_close(struct file_disk *disk)
+{
+	int rc;
+
+	if (disk->fd == -1) {
+		return 0;
+	}
+
+	rc = close(disk->fd);
+	if (rc < 0) {
+		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
+			    disk->fd, errno, spdk_strerror(errno));
+		return -1;
+	}
+
+	disk->fd = -1;
+
+	return 0;
+}
+
+static int64_t
+bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,
+	       struct bdev_aio_task *aio_task,
+	       struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
+{
+	struct iocb *iocb = &aio_task->iocb;
+	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
+	int rc;
+
+	io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
+	iocb->data = aio_task;
+	aio_task->len = nbytes;
+	aio_task->ch = aio_ch;
+
+	SPDK_DEBUGLOG(SPDK_LOG_AIO, "read %d iovs size %lu to off: %#lx\n",
+		      iovcnt, nbytes, offset);
+
+	rc = io_submit(aio_ch->group_ch->io_ctx, 1, &iocb);
+	if (rc < 0) {
+		if (rc == -EAGAIN) {
+			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
+		} else {
+			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
+		}
+		return -1;
+	}
+	aio_ch->io_inflight++;
+	return nbytes;
+}
+
+static int64_t
+bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,
+		struct bdev_aio_task *aio_task,
+		struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
+{
+	struct iocb *iocb = &aio_task->iocb;
+	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
+	int rc;
+
+	io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
+	iocb->data = aio_task;
+	aio_task->len = len;
+	aio_task->ch = aio_ch;
+
+	SPDK_DEBUGLOG(SPDK_LOG_AIO, "write %d iovs size %lu from off: %#lx\n",
+		      iovcnt, len, offset);
+
+	rc = io_submit(aio_ch->group_ch->io_ctx, 1, &iocb);
+	if (rc < 0) {
+		if (rc == -EAGAIN) {
+			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
+		} else {
+			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+			SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
+		}
+		return -1;
+	}
+	aio_ch->io_inflight++;
+	return len;
+}
+
+static void
+bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
+{
+	int rc = fsync(fdisk->fd);
+
+	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task),
+			      rc == 0 ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
+}
+
+static int
+bdev_aio_destruct(void *ctx)
+{
+	struct file_disk *fdisk = ctx;
+	int rc = 0;
+
+	TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
+	rc = bdev_aio_close(fdisk);
+	if (rc < 0) {
+		SPDK_ERRLOG("bdev_aio_close() failed\n");
+	}
+	spdk_io_device_unregister(fdisk, NULL);
+	aio_free_disk(fdisk);
+	return rc;
+}
+
+static int
+bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
+{
+	uint32_t head, tail, count;
+	struct spdk_aio_ring *ring;
+	struct timespec timeout;
+	struct io_event *kevents;
+
+	ring = (struct spdk_aio_ring *)io_ctx;
+
+	if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
+		timeout.tv_sec = 0;
+		timeout.tv_nsec = 0;
+
+		return io_getevents(io_ctx, 0, max, uevents, &timeout);
+	}
+
+	/* Read the current state out of the ring */
+	head = ring->head;
+	tail = ring->tail;
+
+	/* This memory barrier is required to prevent the loads above
+	 * from being re-ordered with stores to the events array
+	 * potentially occurring on other threads. */
+	spdk_smp_rmb();
+
+	/* Calculate how many items are in the circular ring */
+	count = tail - head;
+	if (tail < head) {
+		count += ring->size;
+	}
+
+	/* Reduce the count to the limit provided by the user */
+	count = spdk_min(max, count);
+
+	/* Grab the memory location of the event array */
+	kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);
+
+	/* Copy the events out of the ring. */
+	if ((head + count) <= ring->size) {
+		/* Only one copy is required */
+		memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
+	} else {
+		uint32_t first_part = ring->size - head;
+		/* Two copies are required */
+		memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
+		memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
+	}
+
+	/* Update the head pointer. On x86, stores will not be reordered with older loads,
+	 * so the copies out of the event array will always be complete prior to this
+	 * update becoming visible. On other architectures this is not guaranteed, so
+	 * add a barrier.
+	 */
+#if defined(__i386__) || defined(__x86_64__)
+	spdk_compiler_barrier();
+#else
+	spdk_smp_mb();
+#endif
+	ring->head = (head + count) % ring->size;
+
+	return count;
+}
+
+static int
+bdev_aio_group_poll(void *arg)
+{
+	struct bdev_aio_group_channel *group_ch = arg;
+	int nr, i = 0;
+	enum spdk_bdev_io_status status;
+	struct bdev_aio_task *aio_task;
+	struct io_event events[SPDK_AIO_QUEUE_DEPTH];
+
+	nr = bdev_user_io_getevents(group_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);
+
+	if (nr < 0) {
+		return SPDK_POLLER_IDLE;
+	}
+
+	for (i = 0; i < nr; i++) {
+		aio_task = events[i].data;
+		if (events[i].res != aio_task->len) {
+			status = SPDK_BDEV_IO_STATUS_FAILED;
+		} else {
+			status = SPDK_BDEV_IO_STATUS_SUCCESS;
+		}
+
+		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), status);
+		aio_task->ch->io_inflight--;
+	}
+
+	return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
+}
+
+static void
+_bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
+{
+	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+	struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
+
+	if (aio_ch->io_inflight) {
+		spdk_for_each_channel_continue(i, -1);
+		return;
+	}
+
+	spdk_for_each_channel_continue(i, 0);
+}
+
+static int bdev_aio_reset_retry_timer(void *arg);
+
+static void
+_bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
+{
+	struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);
+
+	if (status == -1) {
+		fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
+		return;
+	}
+
+	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
+}
+
+static int
+bdev_aio_reset_retry_timer(void *arg)
+{
+	struct file_disk *fdisk = arg;
+
+	if (fdisk->reset_retry_timer) {
+		spdk_poller_unregister(&fdisk->reset_retry_timer);
+	}
+
+	spdk_for_each_channel(fdisk,
+			      _bdev_aio_get_io_inflight,
+			      fdisk,
+			      _bdev_aio_get_io_inflight_done);
+
+	return SPDK_POLLER_BUSY;
+}
+
+static void
+bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
+{
+	fdisk->reset_task = aio_task;
+
+	bdev_aio_reset_retry_timer(fdisk);
+}
+
+static void
+bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+		    bool success)
+{
+	if (!success) {
+		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+		return;
+	}
+
+	switch (bdev_io->type) {
+	case SPDK_BDEV_IO_TYPE_READ:
+		bdev_aio_readv((struct file_disk *)bdev_io->bdev->ctxt,
+			       ch,
+			       (struct bdev_aio_task *)bdev_io->driver_ctx,
+			       bdev_io->u.bdev.iovs,
+			       bdev_io->u.bdev.iovcnt,
+			       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+			       bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
+		break;
+	case SPDK_BDEV_IO_TYPE_WRITE:
+		bdev_aio_writev((struct file_disk *)bdev_io->bdev->ctxt,
+				ch,
+				(struct bdev_aio_task *)bdev_io->driver_ctx,
+				bdev_io->u.bdev.iovs,
+				bdev_io->u.bdev.iovcnt,
+				bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+				bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
+		break;
+	default:
+		SPDK_ERRLOG("Wrong io type\n");
+		break;
+	}
+}
+
+static int _bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+	switch (bdev_io->type) {
+	/* Read and write operations must be performed on buffers aligned to
+	 * bdev->required_alignment. If user specified unaligned buffers,
+	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
+	case SPDK_BDEV_IO_TYPE_READ:
+	case SPDK_BDEV_IO_TYPE_WRITE:
+		spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
+				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+		return 0;
+	case SPDK_BDEV_IO_TYPE_FLUSH:
+		bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
+			       (struct bdev_aio_task *)bdev_io->driver_ctx);
+		return 0;
+
+	case SPDK_BDEV_IO_TYPE_RESET:
+		bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
+			       (struct bdev_aio_task *)bdev_io->driver_ctx);
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+static void bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+	if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
+		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+	}
+}
+
+static bool
+bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+	switch (io_type) {
+	case SPDK_BDEV_IO_TYPE_READ:
+	case SPDK_BDEV_IO_TYPE_WRITE:
+	case SPDK_BDEV_IO_TYPE_FLUSH:
+	case SPDK_BDEV_IO_TYPE_RESET:
+		return true;
+
+	default:
+		return false;
+	}
+}
+
+static int
+bdev_aio_create_cb(void *io_device, void *ctx_buf)
+{
+	struct bdev_aio_io_channel *ch = ctx_buf;
+
+	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
+
+	return 0;
+}
+
+static void
+bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
+{
+	struct bdev_aio_io_channel *ch = ctx_buf;
+
+	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
+}
+
+static struct spdk_io_channel *
+bdev_aio_get_io_channel(void *ctx)
+{
+	struct file_disk *fdisk = ctx;
+
+	return spdk_get_io_channel(fdisk);
+}
+
+
+static int
+bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+	struct file_disk *fdisk = ctx;
+
+	spdk_json_write_named_object_begin(w, "aio");
+
+	spdk_json_write_named_string(w, "filename", fdisk->filename);
+
+	spdk_json_write_object_end(w);
+
+	return 0;
+}
+
+static void
+bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+	struct file_disk *fdisk = bdev->ctxt;
+
+	spdk_json_write_object_begin(w);
+
+	spdk_json_write_named_string(w, "method", "bdev_aio_create");
+
+	spdk_json_write_named_object_begin(w, "params");
+	spdk_json_write_named_string(w, "name", bdev->name);
+	if (fdisk->block_size_override) {
+		spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
+	}
+	spdk_json_write_named_string(w, "filename", fdisk->filename);
+	spdk_json_write_object_end(w);
+
+	spdk_json_write_object_end(w);
+}
+
+static const struct spdk_bdev_fn_table aio_fn_table = {
+	.destruct		= bdev_aio_destruct,
+	.submit_request		= bdev_aio_submit_request,
+	.io_type_supported	= bdev_aio_io_type_supported,
+	.get_io_channel		= bdev_aio_get_io_channel,
+	.dump_info_json		= bdev_aio_dump_info_json,
+	.write_config_json	= bdev_aio_write_json_config,
+};
+
+static void aio_free_disk(struct file_disk *fdisk)
+{
+	if (fdisk == NULL) {
+		return;
+	}
+	free(fdisk->filename);
+	free(fdisk->disk.name);
+	free(fdisk);
+}
+
+static int
+bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
+{
+	struct bdev_aio_group_channel *ch = ctx_buf;
+
+	if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
+		SPDK_ERRLOG("async I/O context setup failure\n");
+		return -1;
+	}
+
+	ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
+	return 0;
+}
+
+static void
+bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
+{
+	struct bdev_aio_group_channel *ch = ctx_buf;
+
+	io_destroy(ch->io_ctx);
+
+	spdk_poller_unregister(&ch->poller);
+}
+
+int
+create_aio_bdev(const char *name, const char *filename, uint32_t block_size)
+{
+	struct file_disk *fdisk;
+	uint32_t detected_block_size;
+	uint64_t disk_size;
+	int rc;
+
+	fdisk = calloc(1, sizeof(*fdisk));
+	if (!fdisk) {
+		SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
+		return -ENOMEM;
+	}
+
+	fdisk->filename = strdup(filename);
+	if (!fdisk->filename) {
+		rc = -ENOMEM;
+		goto error_return;
+	}
+
+	if (bdev_aio_open(fdisk)) {
+		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
+		rc = -errno;
+		goto error_return;
+	}
+
+	disk_size = spdk_fd_get_size(fdisk->fd);
+
+	fdisk->disk.name = strdup(name);
+	if (!fdisk->disk.name) {
+		rc = -ENOMEM;
+		goto error_return;
+	}
+	fdisk->disk.product_name = "AIO disk";
+	fdisk->disk.module = &aio_if;
+
+	fdisk->disk.write_cache = 1;
+
+	detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
+	if (block_size == 0) {
+		/* User did not specify block size - use autodetected block size. */
+		if (detected_block_size == 0) {
+			SPDK_ERRLOG("Block size could not be auto-detected\n");
+			rc = -EINVAL;
+			goto error_return;
+		}
+		fdisk->block_size_override = false;
+		block_size = detected_block_size;
+	} else {
+		if (block_size < detected_block_size) {
+			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
+				    "auto-detected block size %" PRIu32 "\n",
+				    block_size, detected_block_size);
+			rc = -EINVAL;
+			goto error_return;
+		} else if (detected_block_size != 0 && block_size != detected_block_size) {
+			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
+				     "auto-detected block size %" PRIu32 "\n",
+				     block_size, detected_block_size);
+		}
+		fdisk->block_size_override = true;
+	}
+
+	if (block_size < 512) {
+		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
+		rc = -EINVAL;
+		goto error_return;
+	}
+
+	if (!spdk_u32_is_pow2(block_size)) {
+		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
+		rc = -EINVAL;
+		goto error_return;
+	}
+
+	fdisk->disk.blocklen = block_size;
+	if (fdisk->block_size_override && detected_block_size) {
+		fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
+	} else {
+		fdisk->disk.required_alignment = spdk_u32log2(block_size);
+	}
+
+	if (disk_size % fdisk->disk.blocklen != 0) {
+		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
+			    disk_size, fdisk->disk.blocklen);
+		rc = -EINVAL;
+		goto error_return;
+	}
+
+	fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
+	fdisk->disk.ctxt = fdisk;
+
+	fdisk->disk.fn_table = &aio_fn_table;
+
+	spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
+				sizeof(struct bdev_aio_io_channel),
+				fdisk->disk.name);
+	rc = spdk_bdev_register(&fdisk->disk);
+	if (rc) {
+		spdk_io_device_unregister(fdisk, NULL);
+		goto error_return;
+	}
+
+	TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
+	return 0;
+
+error_return:
+	bdev_aio_close(fdisk);
+	aio_free_disk(fdisk);
+	return rc;
+}
+
+struct delete_aio_bdev_ctx {
+	delete_aio_bdev_complete	cb_fn;
+	void				*cb_arg;
+};
+
+static void
+aio_bdev_unregister_cb(void *arg, int bdeverrno)
+{
+	struct delete_aio_bdev_ctx *ctx = arg;
+
+	ctx->cb_fn(ctx->cb_arg, bdeverrno);
+	free(ctx);
+}
+
+void
+bdev_aio_delete(struct spdk_bdev *bdev, delete_aio_bdev_complete cb_fn, void *cb_arg)
+{
+	struct delete_aio_bdev_ctx *ctx;
+
+	if (!bdev || bdev->module != &aio_if) {
+		cb_fn(cb_arg, -ENODEV);
+		return;
+	}
+
+	ctx = calloc(1, sizeof(*ctx));
+	if (ctx == NULL) {
+		cb_fn(cb_arg, -ENOMEM);
+		return;
+	}
+
+	ctx->cb_fn = cb_fn;
+	ctx->cb_arg = cb_arg;
+	spdk_bdev_unregister(bdev, aio_bdev_unregister_cb, ctx);
+}
+
+static int
+bdev_aio_initialize(void)
+{
+	size_t i;
+	struct spdk_conf_section *sp;
+	int rc = 0;
+
+	TAILQ_INIT(&g_aio_disk_head);
+	spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
+				sizeof(struct bdev_aio_group_channel),
+				"aio_module");
+
+	sp = spdk_conf_find_section(NULL, "AIO");
+	if (!sp) {
+		return 0;
+	}
+
+	i = 0;
+	while (true) {
+		const char *file;
+		const char *name;
+		const char *block_size_str;
+		uint32_t block_size = 0;
+		long int tmp;
+
+		file = spdk_conf_section_get_nmval(sp, "AIO", i, 0);
+		if (!file) {
+			break;
+		}
+
+		name = spdk_conf_section_get_nmval(sp, "AIO", i, 1);
+		if (!name) {
+			SPDK_ERRLOG("No name provided for AIO disk with file %s\n", file);
+			i++;
+			continue;
+		}
+
+		block_size_str = spdk_conf_section_get_nmval(sp, "AIO", i, 2);
+		if (block_size_str) {
+			tmp = spdk_strtol(block_size_str, 10);
+			if (tmp < 0) {
+				SPDK_ERRLOG("Invalid block size for AIO disk with file %s\n", file);
+				i++;
+				continue;
+			}
+			block_size = (uint32_t)tmp;
+		}
+
+		rc = create_aio_bdev(name, file, block_size);
+		if (rc) {
+			SPDK_ERRLOG("Unable to create AIO bdev from file %s, err is %s\n", file, spdk_strerror(-rc));
+		}
+
+		i++;
+	}
+
+	return 0;
+}
+
+static void
+bdev_aio_fini(void)
+{
+	spdk_io_device_unregister(&aio_if, NULL);
+}
+
+static void
+bdev_aio_get_spdk_running_config(FILE *fp)
+{
+	char			*file;
+	char			*name;
+	uint32_t		block_size;
+	struct file_disk	*fdisk;
+
+	fprintf(fp,
+		"\n"
+		"# Users must change this section to match the /dev/sdX devices to be\n"
+		"# exported as iSCSI LUNs. The devices are accessed using Linux AIO.\n"
+		"# The format is:\n"
+		"# AIO <file name> <bdev name> [<block size>]\n"
+		"# The file name is the backing device\n"
+		"# The bdev name can be referenced from elsewhere in the configuration file.\n"
+		"# Block size may be omitted to automatically detect the block size of a disk.\n"
+		"[AIO]\n");
+
+	TAILQ_FOREACH(fdisk, &g_aio_disk_head, link) {
+		file = fdisk->filename;
+		name = fdisk->disk.name;
+		block_size = fdisk->disk.blocklen;
+		fprintf(fp, "  AIO %s %s ", file, name);
+		if (fdisk->block_size_override) {
+			fprintf(fp, "%d", block_size);
+		}
+		fprintf(fp, "\n");
+	}
+	fprintf(fp, "\n");
+}
+
+SPDK_LOG_REGISTER_COMPONENT("aio", SPDK_LOG_AIO)
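Editor's note: bdev_aio.c above drives the kernel's native asynchronous I/O through libaio (io_setup(), io_prep_preadv()/io_submit(), then completion reaping and io_destroy()). For readers unfamiliar with that API, here is a minimal, self-contained sketch of the same lifecycle outside SPDK. This is not part of the patch; the backing file path, buffer size, and queue depth are illustrative only, and the sketch blocks in io_getevents() where the module instead polls with min_nr = 0.

#define _GNU_SOURCE		/* for O_DIRECT */
#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb iocb, *iocbs[1] = { &iocb };
	struct io_event event;
	/* O_DIRECT requires an aligned buffer, mirroring required_alignment above. */
	char buf[4096] __attribute__((aligned(4096)));
	int fd, rc;

	/* Hypothetical backing file; bdev_aio_open() falls back to a
	 * non-O_DIRECT open for regular files. */
	fd = open("/tmp/test.img", O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Create the AIO context; 128 matches SPDK_AIO_QUEUE_DEPTH. */
	if (io_setup(128, &ctx) < 0) {
		perror("io_setup");
		return 1;
	}

	io_prep_pread(&iocb, fd, buf, sizeof(buf), 0);
	iocb.data = &iocb;	/* per-I/O cookie, like bdev_aio_task */

	rc = io_submit(ctx, 1, iocbs);
	if (rc != 1) {
		fprintf(stderr, "io_submit: %d\n", rc);
		return 1;
	}

	/* Wait for the completion (SPDK's poller would pass min_nr = 0). */
	rc = io_getevents(ctx, 1, 1, &event, NULL);
	if (rc == 1) {
		printf("read completed, res=%ld\n", (long)event.res);
	}

	io_destroy(ctx);
	close(fd);
	return 0;
}

Note how bdev_aio_group_poll() applies the same success test as this sketch would: a completion whose event.res differs from the submitted length is treated as a failure.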
diff --git a/src/spdk/module/bdev/aio/bdev_aio.h b/src/spdk/module/bdev/aio/bdev_aio.h
new file mode 100644
index 000000000..9ba425946
--- /dev/null
+++ b/src/spdk/module/bdev/aio/bdev_aio.h
@@ -0,0 +1,46 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright (c) Intel Corporation.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_AIO_H
+#define SPDK_BDEV_AIO_H
+
+#include "spdk/stdinc.h"
+#include "spdk/bdev.h"
+
+typedef void (*delete_aio_bdev_complete)(void *cb_arg, int bdeverrno);
+
+int create_aio_bdev(const char *name, const char *filename, uint32_t block_size);
+
+void bdev_aio_delete(struct spdk_bdev *bdev, delete_aio_bdev_complete cb_fn, void *cb_arg);
+
+#endif /* SPDK_BDEV_AIO_H */
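Editor's note: the header keeps the module's public surface to two calls plus a completion callback type. A hedged sketch (not part of the patch) of how a caller such as the RPC layer below drives them; the names "my_aio0" and "/dev/sdb" are hypothetical, and error handling is abbreviated:

#include "bdev_aio.h"	/* pulls in spdk/bdev.h for spdk_bdev_get_by_name() */
#include <stdio.h>

static void
example_delete_done(void *cb_arg, int bdeverrno)
{
	/* bdeverrno is 0 on success, a negative errno otherwise. */
	printf("delete finished: %d\n", bdeverrno);
}

static void
example_create_then_delete(void)
{
	struct spdk_bdev *bdev;
	int rc;

	/* block_size 0 asks create_aio_bdev() to auto-detect it. */
	rc = create_aio_bdev("my_aio0", "/dev/sdb", 0);
	if (rc != 0) {
		printf("create_aio_bdev failed: %d\n", rc);
		return;
	}

	/* Tear-down is asynchronous; completion arrives via the callback. */
	bdev = spdk_bdev_get_by_name("my_aio0");
	if (bdev != NULL) {
		bdev_aio_delete(bdev, example_delete_done, NULL);
	}
}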
diff --git a/src/spdk/module/bdev/aio/bdev_aio_rpc.c b/src/spdk/module/bdev/aio/bdev_aio_rpc.c
new file mode 100644
index 000000000..0968b8d76
--- /dev/null
+++ b/src/spdk/module/bdev/aio/bdev_aio_rpc.c
@@ -0,0 +1,148 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright (c) Intel Corporation.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_aio.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+struct rpc_construct_aio {
+	char *name;
+	char *filename;
+	uint32_t block_size;
+};
+
+static void
+free_rpc_construct_aio(struct rpc_construct_aio *req)
+{
+	free(req->name);
+	free(req->filename);
+}
+
+static const struct spdk_json_object_decoder rpc_construct_aio_decoders[] = {
+	{"name", offsetof(struct rpc_construct_aio, name), spdk_json_decode_string},
+	{"filename", offsetof(struct rpc_construct_aio, filename), spdk_json_decode_string},
+	{"block_size", offsetof(struct rpc_construct_aio, block_size), spdk_json_decode_uint32, true},
+};
+
+static void
+rpc_bdev_aio_create(struct spdk_jsonrpc_request *request,
+		    const struct spdk_json_val *params)
+{
+	struct rpc_construct_aio req = {};
+	struct spdk_json_write_ctx *w;
+	int rc = 0;
+
+	if (spdk_json_decode_object(params, rpc_construct_aio_decoders,
+				    SPDK_COUNTOF(rpc_construct_aio_decoders),
+				    &req)) {
+		SPDK_ERRLOG("spdk_json_decode_object failed\n");
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "spdk_json_decode_object failed");
+		goto cleanup;
+	}
+
+	rc = create_aio_bdev(req.name, req.filename, req.block_size);
+	if (rc) {
+		spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+		goto cleanup;
+	}
+
+	w = spdk_jsonrpc_begin_result(request);
+	spdk_json_write_string(w, req.name);
+	spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+	free_rpc_construct_aio(&req);
+}
+SPDK_RPC_REGISTER("bdev_aio_create", rpc_bdev_aio_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_aio_create, construct_aio_bdev)
+
+struct rpc_delete_aio {
+	char *name;
+};
+
+static void
+free_rpc_delete_aio(struct rpc_delete_aio *r)
+{
+	free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_aio_decoders[] = {
+	{"name", offsetof(struct rpc_delete_aio, name), spdk_json_decode_string},
+};
+
+static void
+_rpc_bdev_aio_delete_cb(void *cb_arg, int bdeverrno)
+{
+	struct spdk_jsonrpc_request *request = cb_arg;
+	struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+	spdk_json_write_bool(w, bdeverrno == 0);
+	spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_aio_delete(struct spdk_jsonrpc_request *request,
+		    const struct spdk_json_val *params)
+{
+	struct rpc_delete_aio req = {NULL};
+	struct spdk_bdev *bdev;
+
+	if (spdk_json_decode_object(params, rpc_delete_aio_decoders,
+				    SPDK_COUNTOF(rpc_delete_aio_decoders),
+				    &req)) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "spdk_json_decode_object failed");
+		goto cleanup;
+	}
+
+	bdev = spdk_bdev_get_by_name(req.name);
+	if (bdev == NULL) {
+		spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+		goto cleanup;
+	}
+
+	bdev_aio_delete(bdev, _rpc_bdev_aio_delete_cb, request);
+
+	free_rpc_delete_aio(&req);
+
+	return;
+
+cleanup:
+	free_rpc_delete_aio(&req);
+}
SPDK_RPC_REGISTER("bdev_aio_delete", rpc_bdev_aio_delete, SPDK_RPC_RUNTIME)
SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_aio_delete, delete_aio_bdev)
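Editor's note: per the decoders above, `name` and `filename` are required strings and `block_size` is an optional uint32. A hedged example of the JSON-RPC 2.0 exchange these handlers implement (ids and values are illustrative, not from the patch); the create handler answers with the new bdev's name, and the delete callback answers with a boolean:

{"jsonrpc": "2.0", "id": 1, "method": "bdev_aio_create",
 "params": {"name": "aio0", "filename": "/dev/sdb", "block_size": 512}}

{"jsonrpc": "2.0", "id": 1, "result": "aio0"}

{"jsonrpc": "2.0", "id": 2, "method": "bdev_aio_delete",
 "params": {"name": "aio0"}}

{"jsonrpc": "2.0", "id": 2, "result": true}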
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ + +C_SRCS = vbdev_compress.c vbdev_compress_rpc.c +LIBNAME = bdev_compress +CFLAGS += $(ENV_CFLAGS) + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/compress/vbdev_compress.c b/src/spdk/module/bdev/compress/vbdev_compress.c new file mode 100644 index 000000000..a83c97c64 --- /dev/null +++ b/src/spdk/module/bdev/compress/vbdev_compress.c @@ -0,0 +1,1865 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vbdev_compress.h" + +#include "spdk/reduce.h" +#include "spdk/stdinc.h" +#include "spdk/rpc.h" +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/util.h" +#include "spdk/bdev_module.h" + +#include "spdk_internal/log.h" + +#include <rte_config.h> +#include <rte_bus_vdev.h> +#include <rte_compressdev.h> +#include <rte_comp.h> + +#define NUM_MAX_XFORMS 2 +#define NUM_MAX_INFLIGHT_OPS 128 +#define DEFAULT_WINDOW_SIZE 15 +/* We need extra mbufs per operation to accommodate host buffers that + * span a 2MB boundary. + */ +#define MAX_MBUFS_PER_OP (REDUCE_MAX_IOVECS * 2) +#define CHUNK_SIZE (1024 * 16) +#define COMP_BDEV_NAME "compress" +#define BACKING_IO_SZ (4 * 1024) + +#define ISAL_PMD "compress_isal" +#define QAT_PMD "compress_qat" +#define NUM_MBUFS 8192 +#define POOL_CACHE_SIZE 256 + +static enum compress_pmd g_opts; + +/* Global list of available compression devices. */ +struct compress_dev { + struct rte_compressdev_info cdev_info; /* includes device friendly name */ + uint8_t cdev_id; /* identifier for the device */ + void *comp_xform; /* shared private xform for comp on this PMD */ + void *decomp_xform; /* shared private xform for decomp on this PMD */ + TAILQ_ENTRY(compress_dev) link; +}; +static TAILQ_HEAD(, compress_dev) g_compress_devs = TAILQ_HEAD_INITIALIZER(g_compress_devs); + +/* Although ISAL PMD reports 'unlimited' qpairs, it has an unplanned limit of 99 due to + * the length of the internal ring name that it creates, it breaks a limit in the generic + * ring code and fails the qp initialization. + */ +#define MAX_NUM_QP 99 +/* Global list and lock for unique device/queue pair combos */ +struct comp_device_qp { + struct compress_dev *device; /* ptr to compression device */ + uint8_t qp; /* queue pair for this node */ + struct spdk_thread *thread; /* thead that this qp is assigned to */ + TAILQ_ENTRY(comp_device_qp) link; +}; +static TAILQ_HEAD(, comp_device_qp) g_comp_device_qp = TAILQ_HEAD_INITIALIZER(g_comp_device_qp); +static pthread_mutex_t g_comp_device_qp_lock = PTHREAD_MUTEX_INITIALIZER; + +/* For queueing up compression operations that we can't submit for some reason */ +struct vbdev_comp_op { + struct spdk_reduce_backing_dev *backing_dev; + struct iovec *src_iovs; + int src_iovcnt; + struct iovec *dst_iovs; + int dst_iovcnt; + bool compress; + void *cb_arg; + TAILQ_ENTRY(vbdev_comp_op) link; +}; + +struct vbdev_comp_delete_ctx { + spdk_delete_compress_complete cb_fn; + void *cb_arg; + int cb_rc; + struct spdk_thread *orig_thread; +}; + +/* List of virtual bdevs and associated info for each. 
*/ +struct vbdev_compress { + struct spdk_bdev *base_bdev; /* the thing we're attaching to */ + struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */ + struct spdk_io_channel *base_ch; /* IO channel of base device */ + struct spdk_bdev comp_bdev; /* the compression virtual bdev */ + struct comp_io_channel *comp_ch; /* channel associated with this bdev */ + char *drv_name; /* name of the compression device driver */ + struct comp_device_qp *device_qp; + struct spdk_thread *reduce_thread; + pthread_mutex_t reduce_lock; + uint32_t ch_count; + TAILQ_HEAD(, spdk_bdev_io) pending_comp_ios; /* outstanding operations to a comp library */ + struct spdk_poller *poller; /* completion poller */ + struct spdk_reduce_vol_params params; /* params for the reduce volume */ + struct spdk_reduce_backing_dev backing_dev; /* backing device info for the reduce volume */ + struct spdk_reduce_vol *vol; /* the reduce volume */ + struct vbdev_comp_delete_ctx *delete_ctx; + bool orphaned; /* base bdev claimed but comp_bdev not registered */ + int reduce_errno; + TAILQ_HEAD(, vbdev_comp_op) queued_comp_ops; + TAILQ_ENTRY(vbdev_compress) link; + struct spdk_thread *thread; /* thread where base device is opened */ +}; +static TAILQ_HEAD(, vbdev_compress) g_vbdev_comp = TAILQ_HEAD_INITIALIZER(g_vbdev_comp); + +/* The comp vbdev channel struct. It is allocated and freed on my behalf by the io channel code. + */ +struct comp_io_channel { + struct spdk_io_channel_iter *iter; /* used with for_each_channel in reset */ +}; + +/* Per I/O context for the compression vbdev. */ +struct comp_bdev_io { + struct comp_io_channel *comp_ch; /* used in completion handling */ + struct vbdev_compress *comp_bdev; /* vbdev associated with this IO */ + struct spdk_bdev_io_wait_entry bdev_io_wait; /* for bdev_io_wait */ + struct spdk_bdev_io *orig_io; /* the original IO */ + struct spdk_io_channel *ch; /* for resubmission */ + int status; /* save for completion on orig thread */ +}; + +/* Shared mempools between all devices on this system */ +static struct rte_mempool *g_mbuf_mp = NULL; /* mbuf mempool */ +static struct rte_mempool *g_comp_op_mp = NULL; /* comp operations, must be rte* mempool */ +static struct rte_mbuf_ext_shared_info g_shinfo = {}; /* used by DPDK mbuf macros */ +static bool g_qat_available = false; +static bool g_isal_available = false; + +/* Create shared (between all ops per PMD) compress xforms. */ +static struct rte_comp_xform g_comp_xform = { + .type = RTE_COMP_COMPRESS, + .compress = { + .algo = RTE_COMP_ALGO_DEFLATE, + .deflate.huffman = RTE_COMP_HUFFMAN_DEFAULT, + .level = RTE_COMP_LEVEL_MAX, + .window_size = DEFAULT_WINDOW_SIZE, + .chksum = RTE_COMP_CHECKSUM_NONE, + .hash_algo = RTE_COMP_HASH_ALGO_NONE + } +}; +/* Create shared (between all ops per PMD) decompress xforms. 
*/ +static struct rte_comp_xform g_decomp_xform = { + .type = RTE_COMP_DECOMPRESS, + .decompress = { + .algo = RTE_COMP_ALGO_DEFLATE, + .chksum = RTE_COMP_CHECKSUM_NONE, + .window_size = DEFAULT_WINDOW_SIZE, + .hash_algo = RTE_COMP_HASH_ALGO_NONE + } +}; + +static void vbdev_compress_examine(struct spdk_bdev *bdev); +static void vbdev_compress_claim(struct vbdev_compress *comp_bdev); +static void vbdev_compress_queue_io(struct spdk_bdev_io *bdev_io); +struct vbdev_compress *_prepare_for_load_init(struct spdk_bdev *bdev, uint32_t lb_size); +static void vbdev_compress_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io); +static void comp_bdev_ch_destroy_cb(void *io_device, void *ctx_buf); +static void vbdev_compress_delete_done(void *cb_arg, int bdeverrno); + +/* Dummy function used by DPDK to free ext attached buffers + * to mbufs, we free them ourselves but this callback has to + * be here. + */ +static void +shinfo_free_cb(void *arg1, void *arg2) +{ +} + +/* Called by vbdev_init_compress_drivers() to init each discovered compression device */ +static int +create_compress_dev(uint8_t index) +{ + struct compress_dev *device; + uint16_t q_pairs; + uint8_t cdev_id; + int rc, i; + struct comp_device_qp *dev_qp; + struct comp_device_qp *tmp_qp; + + device = calloc(1, sizeof(struct compress_dev)); + if (!device) { + return -ENOMEM; + } + + /* Get details about this device. */ + rte_compressdev_info_get(index, &device->cdev_info); + + cdev_id = device->cdev_id = index; + + /* Zero means no limit so choose number of lcores. */ + if (device->cdev_info.max_nb_queue_pairs == 0) { + q_pairs = MAX_NUM_QP; + } else { + q_pairs = spdk_min(device->cdev_info.max_nb_queue_pairs, MAX_NUM_QP); + } + + /* Configure the compression device. */ + struct rte_compressdev_config config = { + .socket_id = rte_socket_id(), + .nb_queue_pairs = q_pairs, + .max_nb_priv_xforms = NUM_MAX_XFORMS, + .max_nb_streams = 0 + }; + rc = rte_compressdev_configure(cdev_id, &config); + if (rc < 0) { + SPDK_ERRLOG("Failed to configure compressdev %u\n", cdev_id); + goto err; + } + + /* Pre-setup all potential qpairs now and assign them in the channel + * callback. 
+ */ + for (i = 0; i < q_pairs; i++) { + rc = rte_compressdev_queue_pair_setup(cdev_id, i, + NUM_MAX_INFLIGHT_OPS, + rte_socket_id()); + if (rc) { + if (i > 0) { + q_pairs = i; + SPDK_NOTICELOG("FYI failed to setup a queue pair on " + "compressdev %u with error %u " + "so limiting to %u qpairs\n", + cdev_id, rc, q_pairs); + break; + } else { + SPDK_ERRLOG("Failed to setup queue pair on " + "compressdev %u with error %u\n", cdev_id, rc); + rc = -EINVAL; + goto err; + } + } + } + + rc = rte_compressdev_start(cdev_id); + if (rc < 0) { + SPDK_ERRLOG("Failed to start device %u: error %d\n", + cdev_id, rc); + goto err; + } + + if (device->cdev_info.capabilities->comp_feature_flags & RTE_COMP_FF_SHAREABLE_PRIV_XFORM) { + rc = rte_compressdev_private_xform_create(cdev_id, &g_comp_xform, + &device->comp_xform); + if (rc < 0) { + SPDK_ERRLOG("Failed to create private comp xform device %u: error %d\n", + cdev_id, rc); + goto err; + } + + rc = rte_compressdev_private_xform_create(cdev_id, &g_decomp_xform, + &device->decomp_xform); + if (rc) { + SPDK_ERRLOG("Failed to create private decomp xform device %u: error %d\n", + cdev_id, rc); + goto err; + } + } else { + SPDK_ERRLOG("PMD does not support shared transforms\n"); + goto err; + } + + /* Build up list of device/qp combinations */ + for (i = 0; i < q_pairs; i++) { + dev_qp = calloc(1, sizeof(struct comp_device_qp)); + if (!dev_qp) { + rc = -ENOMEM; + goto err; + } + dev_qp->device = device; + dev_qp->qp = i; + dev_qp->thread = NULL; + TAILQ_INSERT_TAIL(&g_comp_device_qp, dev_qp, link); + } + + TAILQ_INSERT_TAIL(&g_compress_devs, device, link); + + if (strcmp(device->cdev_info.driver_name, QAT_PMD) == 0) { + g_qat_available = true; + } + if (strcmp(device->cdev_info.driver_name, ISAL_PMD) == 0) { + g_isal_available = true; + } + + return 0; + +err: + TAILQ_FOREACH_SAFE(dev_qp, &g_comp_device_qp, link, tmp_qp) { + TAILQ_REMOVE(&g_comp_device_qp, dev_qp, link); + free(dev_qp); + } + free(device); + return rc; +} + +/* Called from driver init entry point, vbdev_compress_init() */ +static int +vbdev_init_compress_drivers(void) +{ + uint8_t cdev_count, i; + struct compress_dev *tmp_dev; + struct compress_dev *device; + int rc; + + /* We always init the compress_isal PMD */ + rc = rte_vdev_init(ISAL_PMD, NULL); + if (rc == 0) { + SPDK_NOTICELOG("created virtual PMD %s\n", ISAL_PMD); + } else if (rc == -EEXIST) { + SPDK_NOTICELOG("virtual PMD %s already exists.\n", ISAL_PMD); + } else { + SPDK_ERRLOG("creating virtual PMD %s\n", ISAL_PMD); + return -EINVAL; + } + + /* If we have no compression devices, there's no reason to continue. 
*/ + cdev_count = rte_compressdev_count(); + if (cdev_count == 0) { + return 0; + } + if (cdev_count > RTE_COMPRESS_MAX_DEVS) { + SPDK_ERRLOG("invalid device count from rte_compressdev_count()\n"); + return -EINVAL; + } + + g_mbuf_mp = rte_pktmbuf_pool_create("comp_mbuf_mp", NUM_MBUFS, POOL_CACHE_SIZE, + sizeof(struct rte_mbuf), 0, rte_socket_id()); + if (g_mbuf_mp == NULL) { + SPDK_ERRLOG("Cannot create mbuf pool\n"); + rc = -ENOMEM; + goto error_create_mbuf; + } + + g_comp_op_mp = rte_comp_op_pool_create("comp_op_pool", NUM_MBUFS, POOL_CACHE_SIZE, + 0, rte_socket_id()); + if (g_comp_op_mp == NULL) { + SPDK_ERRLOG("Cannot create comp op pool\n"); + rc = -ENOMEM; + goto error_create_op; + } + + /* Init all devices */ + for (i = 0; i < cdev_count; i++) { + rc = create_compress_dev(i); + if (rc != 0) { + goto error_create_compress_devs; + } + } + + if (g_qat_available == true) { + SPDK_NOTICELOG("initialized QAT PMD\n"); + } + + g_shinfo.free_cb = shinfo_free_cb; + + return 0; + + /* Error cleanup paths. */ +error_create_compress_devs: + TAILQ_FOREACH_SAFE(device, &g_compress_devs, link, tmp_dev) { + TAILQ_REMOVE(&g_compress_devs, device, link); + free(device); + } +error_create_op: +error_create_mbuf: + rte_mempool_free(g_mbuf_mp); + + return rc; +} + +/* for completing rw requests on the orig IO thread. */ +static void +_reduce_rw_blocks_cb(void *arg) +{ + struct comp_bdev_io *io_ctx = arg; + + if (io_ctx->status == 0) { + spdk_bdev_io_complete(io_ctx->orig_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + SPDK_ERRLOG("status %d on operation from reduce API\n", io_ctx->status); + spdk_bdev_io_complete(io_ctx->orig_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +/* Completion callback for r/w that were issued via reducelib. */ +static void +reduce_rw_blocks_cb(void *arg, int reduce_errno) +{ + struct spdk_bdev_io *bdev_io = arg; + struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)bdev_io->driver_ctx; + struct spdk_io_channel *ch = spdk_io_channel_from_ctx(io_ctx->comp_ch); + struct spdk_thread *orig_thread; + + /* TODO: need to decide which error codes are bdev_io success vs failure; + * example examine calls reading metadata */ + + io_ctx->status = reduce_errno; + + /* Send this request to the orig IO thread. 
*/ + orig_thread = spdk_io_channel_get_thread(ch); + if (orig_thread != spdk_get_thread()) { + spdk_thread_send_msg(orig_thread, _reduce_rw_blocks_cb, io_ctx); + } else { + _reduce_rw_blocks_cb(io_ctx); + } +} + +static uint64_t +_setup_compress_mbuf(struct rte_mbuf **mbufs, int *mbuf_total, uint64_t *total_length, + struct iovec *iovs, int iovcnt, void *reduce_cb_arg) +{ + uint64_t updated_length, remainder, phys_addr; + uint8_t *current_base = NULL; + int iov_index, mbuf_index; + int rc = 0; + + /* Setup mbufs */ + iov_index = mbuf_index = 0; + while (iov_index < iovcnt) { + + current_base = iovs[iov_index].iov_base; + if (total_length) { + *total_length += iovs[iov_index].iov_len; + } + assert(mbufs[mbuf_index] != NULL); + mbufs[mbuf_index]->userdata = reduce_cb_arg; + updated_length = iovs[iov_index].iov_len; + phys_addr = spdk_vtophys((void *)current_base, &updated_length); + + rte_pktmbuf_attach_extbuf(mbufs[mbuf_index], + current_base, + phys_addr, + updated_length, + &g_shinfo); + rte_pktmbuf_append(mbufs[mbuf_index], updated_length); + remainder = iovs[iov_index].iov_len - updated_length; + + if (mbuf_index > 0) { + rte_pktmbuf_chain(mbufs[0], mbufs[mbuf_index]); + } + + /* If we crossed 2 2MB boundary we need another mbuf for the remainder */ + if (remainder > 0) { + /* allocate an mbuf at the end of the array */ + rc = rte_pktmbuf_alloc_bulk(g_mbuf_mp, + (struct rte_mbuf **)&mbufs[*mbuf_total], 1); + if (rc) { + SPDK_ERRLOG("ERROR trying to get an extra mbuf!\n"); + return -1; + } + (*mbuf_total)++; + mbuf_index++; + mbufs[mbuf_index]->userdata = reduce_cb_arg; + current_base += updated_length; + phys_addr = spdk_vtophys((void *)current_base, &remainder); + /* assert we don't cross another */ + assert(remainder == iovs[iov_index].iov_len - updated_length); + + rte_pktmbuf_attach_extbuf(mbufs[mbuf_index], + current_base, + phys_addr, + remainder, + &g_shinfo); + rte_pktmbuf_append(mbufs[mbuf_index], remainder); + rte_pktmbuf_chain(mbufs[0], mbufs[mbuf_index]); + } + iov_index++; + mbuf_index++; + } + + return 0; +} + +static int +_compress_operation(struct spdk_reduce_backing_dev *backing_dev, struct iovec *src_iovs, + int src_iovcnt, struct iovec *dst_iovs, + int dst_iovcnt, bool compress, void *cb_arg) +{ + void *reduce_cb_arg = cb_arg; + struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(backing_dev, struct vbdev_compress, + backing_dev); + struct rte_comp_op *comp_op; + struct rte_mbuf *src_mbufs[MAX_MBUFS_PER_OP]; + struct rte_mbuf *dst_mbufs[MAX_MBUFS_PER_OP]; + uint8_t cdev_id = comp_bdev->device_qp->device->cdev_id; + uint64_t total_length = 0; + int rc = 0; + struct vbdev_comp_op *op_to_queue; + int i; + int src_mbuf_total = src_iovcnt; + int dst_mbuf_total = dst_iovcnt; + bool device_error = false; + + assert(src_iovcnt < MAX_MBUFS_PER_OP); + +#ifdef DEBUG + memset(src_mbufs, 0, sizeof(src_mbufs)); + memset(dst_mbufs, 0, sizeof(dst_mbufs)); +#endif + + comp_op = rte_comp_op_alloc(g_comp_op_mp); + if (!comp_op) { + SPDK_ERRLOG("trying to get a comp op!\n"); + goto error_get_op; + } + + /* get an mbuf per iov, src and dst */ + rc = rte_pktmbuf_alloc_bulk(g_mbuf_mp, (struct rte_mbuf **)&src_mbufs[0], src_iovcnt); + if (rc) { + SPDK_ERRLOG("ERROR trying to get src_mbufs!\n"); + goto error_get_src; + } + + rc = rte_pktmbuf_alloc_bulk(g_mbuf_mp, (struct rte_mbuf **)&dst_mbufs[0], dst_iovcnt); + if (rc) { + SPDK_ERRLOG("ERROR trying to get dst_mbufs!\n"); + goto error_get_dst; + } + + /* There is a 1:1 mapping between a bdev_io and a compression operation, but + * all 
compression PMDs that SPDK uses support chaining so build our mbuf chain + * and associate with our single comp_op. + */ + + rc = _setup_compress_mbuf(&src_mbufs[0], &src_mbuf_total, &total_length, + src_iovs, src_iovcnt, reduce_cb_arg); + if (rc < 0) { + goto error_src_dst; + } + + comp_op->m_src = src_mbufs[0]; + comp_op->src.offset = 0; + comp_op->src.length = total_length; + + /* setup dst mbufs, for the current test being used with this code there's only one vector */ + rc = _setup_compress_mbuf(&dst_mbufs[0], &dst_mbuf_total, NULL, + dst_iovs, dst_iovcnt, reduce_cb_arg); + if (rc < 0) { + goto error_src_dst; + } + + comp_op->m_dst = dst_mbufs[0]; + comp_op->dst.offset = 0; + + if (compress == true) { + comp_op->private_xform = comp_bdev->device_qp->device->comp_xform; + } else { + comp_op->private_xform = comp_bdev->device_qp->device->decomp_xform; + } + + comp_op->op_type = RTE_COMP_OP_STATELESS; + comp_op->flush_flag = RTE_COMP_FLUSH_FINAL; + + rc = rte_compressdev_enqueue_burst(cdev_id, comp_bdev->device_qp->qp, &comp_op, 1); + assert(rc <= 1); + + /* We always expect 1 got queued, if 0 then we need to queue it up. */ + if (rc == 1) { + return 0; + } else if (comp_op->status == RTE_COMP_OP_STATUS_NOT_PROCESSED) { + /* we free mbufs differently depending on whether they were chained or not */ + rte_pktmbuf_free(comp_op->m_src); + rte_pktmbuf_free(comp_op->m_dst); + goto error_enqueue; + } else { + device_error = true; + goto error_src_dst; + } + + /* Error cleanup paths. */ +error_src_dst: + for (i = 0; i < dst_mbuf_total; i++) { + rte_pktmbuf_free((struct rte_mbuf *)&dst_mbufs[i]); + } +error_get_dst: + for (i = 0; i < src_mbuf_total; i++) { + rte_pktmbuf_free((struct rte_mbuf *)&src_mbufs[i]); + } +error_get_src: +error_enqueue: + rte_comp_op_free(comp_op); +error_get_op: + + if (device_error == true) { + /* There was an error sending the op to the device, most + * likely with the parameters. + */ + SPDK_ERRLOG("Compression API returned 0x%x\n", comp_op->status); + return -EINVAL; + } + + op_to_queue = calloc(1, sizeof(struct vbdev_comp_op)); + if (op_to_queue == NULL) { + SPDK_ERRLOG("unable to allocate operation for queueing.\n"); + return -ENOMEM; + } + op_to_queue->backing_dev = backing_dev; + op_to_queue->src_iovs = src_iovs; + op_to_queue->src_iovcnt = src_iovcnt; + op_to_queue->dst_iovs = dst_iovs; + op_to_queue->dst_iovcnt = dst_iovcnt; + op_to_queue->compress = compress; + op_to_queue->cb_arg = cb_arg; + TAILQ_INSERT_TAIL(&comp_bdev->queued_comp_ops, + op_to_queue, + link); + return 0; +} + +/* Poller for the DPDK compression driver. */ +static int +comp_dev_poller(void *args) +{ + struct vbdev_compress *comp_bdev = args; + uint8_t cdev_id = comp_bdev->device_qp->device->cdev_id; + struct rte_comp_op *deq_ops[NUM_MAX_INFLIGHT_OPS]; + uint16_t num_deq; + struct spdk_reduce_vol_cb_args *reduce_args; + struct vbdev_comp_op *op_to_resubmit; + int rc, i; + + num_deq = rte_compressdev_dequeue_burst(cdev_id, comp_bdev->device_qp->qp, deq_ops, + NUM_MAX_INFLIGHT_OPS); + for (i = 0; i < num_deq; i++) { + reduce_args = (struct spdk_reduce_vol_cb_args *)deq_ops[i]->m_src->userdata; + + if (deq_ops[i]->status == RTE_COMP_OP_STATUS_SUCCESS) { + + /* tell reduce this is done and what the bytecount was */ + reduce_args->cb_fn(reduce_args->cb_arg, deq_ops[i]->produced); + } else { + SPDK_NOTICELOG("FYI storing data uncompressed due to deque status %u\n", + deq_ops[i]->status); + + /* Reduce will simply store uncompressed on neg errno value. 
 */
+			reduce_args->cb_fn(reduce_args->cb_arg, -EINVAL);
+		}
+
+		/* Now free both mbufs and the compress operation. The rte_pktmbuf_free()
+		 * call takes care of freeing all of the mbufs in the chain back to their
+		 * original pool.
+		 */
+		rte_pktmbuf_free(deq_ops[i]->m_src);
+		rte_pktmbuf_free(deq_ops[i]->m_dst);
+
+		/* There is no bulk free for comp ops so we have to free them one at a time
+		 * here, however it would be rare that we'd ever have more than 1 at a time
+		 * anyway.
+		 */
+		rte_comp_op_free(deq_ops[i]);
+
+		/* Check if there are any pending comp ops to process, only pull one
+		 * at a time off as _compress_operation() may re-queue the op.
+		 */
+		if (!TAILQ_EMPTY(&comp_bdev->queued_comp_ops)) {
+			op_to_resubmit = TAILQ_FIRST(&comp_bdev->queued_comp_ops);
+			rc = _compress_operation(op_to_resubmit->backing_dev,
+						 op_to_resubmit->src_iovs,
+						 op_to_resubmit->src_iovcnt,
+						 op_to_resubmit->dst_iovs,
+						 op_to_resubmit->dst_iovcnt,
+						 op_to_resubmit->compress,
+						 op_to_resubmit->cb_arg);
+			if (rc == 0) {
+				TAILQ_REMOVE(&comp_bdev->queued_comp_ops, op_to_resubmit, link);
+				free(op_to_resubmit);
+			}
+		}
+	}
+	return num_deq == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
+}
+
+/* Entry point for reduce lib to issue a compress operation. */
+static void
+_comp_reduce_compress(struct spdk_reduce_backing_dev *dev,
+		      struct iovec *src_iovs, int src_iovcnt,
+		      struct iovec *dst_iovs, int dst_iovcnt,
+		      struct spdk_reduce_vol_cb_args *cb_arg)
+{
+	int rc;
+
+	rc = _compress_operation(dev, src_iovs, src_iovcnt, dst_iovs, dst_iovcnt, true, cb_arg);
+	if (rc) {
+		SPDK_ERRLOG("error with compress operation: %d (%s)\n", rc, spdk_strerror(-rc));
+		cb_arg->cb_fn(cb_arg->cb_arg, rc);
+	}
+}
+
+/* Entry point for reduce lib to issue a decompress operation. */
+static void
+_comp_reduce_decompress(struct spdk_reduce_backing_dev *dev,
+			struct iovec *src_iovs, int src_iovcnt,
+			struct iovec *dst_iovs, int dst_iovcnt,
+			struct spdk_reduce_vol_cb_args *cb_arg)
+{
+	int rc;
+
+	rc = _compress_operation(dev, src_iovs, src_iovcnt, dst_iovs, dst_iovcnt, false, cb_arg);
+	if (rc) {
+		SPDK_ERRLOG("error with decompress operation: %d (%s)\n", rc, spdk_strerror(-rc));
+		cb_arg->cb_fn(cb_arg->cb_arg, rc);
+	}
+}
+
+/* Callback for getting a buf from the bdev pool in the event that the caller passed
+ * in NULL; we need to own the buffer so it doesn't get freed by another vbdev module
+ * beneath us before we're done with it.
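+ *
+ * Editor's sketch of the idiom in general form (my_get_buf_cb is a
+ * hypothetical name; a robust implementation would also check the success
+ * flag, which this module's callback below does not):
+ *
+ *	static void my_get_buf_cb(struct spdk_io_channel *ch,
+ *				  struct spdk_bdev_io *bdev_io, bool success)
+ *	{
+ *		if (!success) {
+ *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ *			return;
+ *		}
+ *		// bdev_io->u.bdev.iovs now points at buffers this module owns
+ *	}
+ *
+ *	spdk_bdev_io_get_buf(bdev_io, my_get_buf_cb,
+ *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);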
+ */ +static void +comp_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_compress, + comp_bdev); + + spdk_reduce_vol_readv(comp_bdev->vol, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, + reduce_rw_blocks_cb, bdev_io); +} + +/* scheduled for completion on IO thread */ +static void +_complete_other_io(void *arg) +{ + struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)arg; + if (io_ctx->status == 0) { + spdk_bdev_io_complete(io_ctx->orig_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + spdk_bdev_io_complete(io_ctx->orig_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +/* scheduled for submission on reduce thread */ +static void +_comp_bdev_io_submit(void *arg) +{ + struct spdk_bdev_io *bdev_io = arg; + struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)bdev_io->driver_ctx; + struct spdk_io_channel *ch = spdk_io_channel_from_ctx(io_ctx->comp_ch); + struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_compress, + comp_bdev); + struct spdk_thread *orig_thread; + int rc = 0; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, comp_read_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return; + case SPDK_BDEV_IO_TYPE_WRITE: + spdk_reduce_vol_writev(comp_bdev->vol, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, + reduce_rw_blocks_cb, bdev_io); + return; + /* TODO in future patch in the series */ + case SPDK_BDEV_IO_TYPE_RESET: + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_FLUSH: + default: + SPDK_ERRLOG("Unknown I/O type %d\n", bdev_io->type); + rc = -EINVAL; + } + + if (rc) { + if (rc == -ENOMEM) { + SPDK_ERRLOG("No memory, start to queue io for compress.\n"); + io_ctx->ch = ch; + vbdev_compress_queue_io(bdev_io); + return; + } else { + SPDK_ERRLOG("on bdev_io submission!\n"); + io_ctx->status = rc; + } + } + + /* Complete this on the orig IO thread. */ + orig_thread = spdk_io_channel_get_thread(ch); + if (orig_thread != spdk_get_thread()) { + spdk_thread_send_msg(orig_thread, _complete_other_io, io_ctx); + } else { + _complete_other_io(io_ctx); + } +} + +/* Called when someone above submits IO to this vbdev. */ +static void +vbdev_compress_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)bdev_io->driver_ctx; + struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_compress, + comp_bdev); + struct comp_io_channel *comp_ch = spdk_io_channel_get_ctx(ch); + + memset(io_ctx, 0, sizeof(struct comp_bdev_io)); + io_ctx->comp_bdev = comp_bdev; + io_ctx->comp_ch = comp_ch; + io_ctx->orig_io = bdev_io; + + /* Send this request to the reduce_thread if that's not what we're on. 
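+	 *
+	 * Editor's note: libreduce volume operations are funneled through the
+	 * single reduce_thread, so submissions hop there first. The hop is the
+	 * standard SPDK message-passing idiom, in general form (sketch only,
+	 * fn/ctx are placeholders):
+	 *
+	 *	if (spdk_get_thread() != target_thread) {
+	 *		spdk_thread_send_msg(target_thread, fn, ctx);	// fn(ctx) runs on target
+	 *	} else {
+	 *		fn(ctx);					// already there
+	 *	}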
*/ + if (spdk_get_thread() != comp_bdev->reduce_thread) { + spdk_thread_send_msg(comp_bdev->reduce_thread, _comp_bdev_io_submit, bdev_io); + } else { + _comp_bdev_io_submit(bdev_io); + } +} + +static bool +vbdev_compress_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct vbdev_compress *comp_bdev = (struct vbdev_compress *)ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + return spdk_bdev_io_type_supported(comp_bdev->base_bdev, io_type); + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + default: + return false; + } +} + +/* Resubmission function used by the bdev layer when a queued IO is ready to be + * submitted. + */ +static void +vbdev_compress_resubmit_io(void *arg) +{ + struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg; + struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)bdev_io->driver_ctx; + + vbdev_compress_submit_request(io_ctx->ch, bdev_io); +} + +/* Used to queue an IO in the event of resource issues. */ +static void +vbdev_compress_queue_io(struct spdk_bdev_io *bdev_io) +{ + struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)bdev_io->driver_ctx; + int rc; + + io_ctx->bdev_io_wait.bdev = bdev_io->bdev; + io_ctx->bdev_io_wait.cb_fn = vbdev_compress_resubmit_io; + io_ctx->bdev_io_wait.cb_arg = bdev_io; + + rc = spdk_bdev_queue_io_wait(bdev_io->bdev, io_ctx->comp_bdev->base_ch, &io_ctx->bdev_io_wait); + if (rc) { + SPDK_ERRLOG("Queue io failed in vbdev_compress_queue_io, rc=%d.\n", rc); + assert(false); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +/* Callback for unregistering the IO device. */ +static void +_device_unregister_cb(void *io_device) +{ + struct vbdev_compress *comp_bdev = io_device; + + /* Done with this comp_bdev. */ + pthread_mutex_destroy(&comp_bdev->reduce_lock); + free(comp_bdev->comp_bdev.name); + free(comp_bdev); +} + +static void +_vbdev_compress_destruct_cb(void *ctx) +{ + struct vbdev_compress *comp_bdev = ctx; + + TAILQ_REMOVE(&g_vbdev_comp, comp_bdev, link); + spdk_bdev_module_release_bdev(comp_bdev->base_bdev); + /* Close the underlying bdev on its same opened thread. 
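+	 *
+	 * Editor's note: a descriptor must be closed on the thread that opened it,
+	 * which is why every open path in this module saves spdk_get_thread() into
+	 * a ->thread field. A sketch of the pairing (close_fn is a stand-in for
+	 * the real callback):
+	 *
+	 *	rc = spdk_bdev_open(bdev, true, hotremove_cb, bdev, &desc);
+	 *	comp_bdev->thread = spdk_get_thread();	// remember the opening thread
+	 *	...
+	 *	if (comp_bdev->thread != spdk_get_thread()) {
+	 *		spdk_thread_send_msg(comp_bdev->thread, close_fn, comp_bdev);	// hop back
+	 *	}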
 */
+	spdk_bdev_close(comp_bdev->base_desc);
+	comp_bdev->vol = NULL;
+	if (comp_bdev->orphaned == false) {
+		spdk_io_device_unregister(comp_bdev, _device_unregister_cb);
+	} else {
+		vbdev_compress_delete_done(comp_bdev->delete_ctx, 0);
+		_device_unregister_cb(comp_bdev);
+	}
+}
+
+static void
+vbdev_compress_destruct_cb(void *cb_arg, int reduce_errno)
+{
+	struct vbdev_compress *comp_bdev = (struct vbdev_compress *)cb_arg;
+
+	if (reduce_errno) {
+		SPDK_ERRLOG("reduce error %d\n", reduce_errno);
+	} else {
+		if (comp_bdev->thread && comp_bdev->thread != spdk_get_thread()) {
+			spdk_thread_send_msg(comp_bdev->thread,
+					     _vbdev_compress_destruct_cb, comp_bdev);
+		} else {
+			_vbdev_compress_destruct_cb(comp_bdev);
+		}
+	}
+}
+
+static void
+_reduce_destroy_cb(void *ctx, int reduce_errno)
+{
+	struct vbdev_compress *comp_bdev = (struct vbdev_compress *)ctx;
+
+	if (reduce_errno) {
+		SPDK_ERRLOG("reduce error %d\n", reduce_errno);
+	}
+
+	comp_bdev->vol = NULL;
+	spdk_put_io_channel(comp_bdev->base_ch);
+	if (comp_bdev->orphaned == false) {
+		spdk_bdev_unregister(&comp_bdev->comp_bdev, vbdev_compress_delete_done,
+				     comp_bdev->delete_ctx);
+	} else {
+		vbdev_compress_destruct_cb((void *)comp_bdev, 0);
+	}
+}
+
+static void
+_delete_vol_unload_cb(void *ctx)
+{
+	struct vbdev_compress *comp_bdev = ctx;
+
+	/* FIXME: Assert if these conditions are not satisfied for now. */
+	assert(!comp_bdev->reduce_thread ||
+	       comp_bdev->reduce_thread == spdk_get_thread());
+
+	/* reducelib needs a channel to communicate with the backing device */
+	comp_bdev->base_ch = spdk_bdev_get_io_channel(comp_bdev->base_desc);
+
+	/* Clean the device before we free our resources. */
+	spdk_reduce_vol_destroy(&comp_bdev->backing_dev, _reduce_destroy_cb, comp_bdev);
+}
+
+/* Called by reduceLib after performing unload vol actions */
+static void
+delete_vol_unload_cb(void *cb_arg, int reduce_errno)
+{
+	struct vbdev_compress *comp_bdev = (struct vbdev_compress *)cb_arg;
+
+	if (reduce_errno) {
+		SPDK_ERRLOG("reduce error %d\n", reduce_errno);
+		/* FIXME: callback should be executed. */
+		return;
+	}
+
+	pthread_mutex_lock(&comp_bdev->reduce_lock);
+	if (comp_bdev->reduce_thread && comp_bdev->reduce_thread != spdk_get_thread()) {
+		spdk_thread_send_msg(comp_bdev->reduce_thread,
+				     _delete_vol_unload_cb, comp_bdev);
+		pthread_mutex_unlock(&comp_bdev->reduce_lock);
+	} else {
+		pthread_mutex_unlock(&comp_bdev->reduce_lock);
+
+		_delete_vol_unload_cb(comp_bdev);
+	}
+}
+
+const char *
+compress_get_name(const struct vbdev_compress *comp_bdev)
+{
+	return comp_bdev->comp_bdev.name;
+}
+
+struct vbdev_compress *
+compress_bdev_first(void)
+{
+	struct vbdev_compress *comp_bdev;
+
+	comp_bdev = TAILQ_FIRST(&g_vbdev_comp);
+
+	return comp_bdev;
+}
+
+struct vbdev_compress *
+compress_bdev_next(struct vbdev_compress *prev)
+{
+	struct vbdev_compress *comp_bdev;
+
+	comp_bdev = TAILQ_NEXT(prev, link);
+
+	return comp_bdev;
+}
+
+bool
+compress_has_orphan(const char *name)
+{
+	struct vbdev_compress *comp_bdev;
+
+	TAILQ_FOREACH(comp_bdev, &g_vbdev_comp, link) {
+		if (comp_bdev->orphaned && strcmp(name, comp_bdev->comp_bdev.name) == 0) {
+			return true;
+		}
+	}
+	return false;
+}
+
+/* Called after we've unregistered following a hot remove callback.
+ * Our finish entry point will be called next.
+ */
+static int
+vbdev_compress_destruct(void *ctx)
+{
+	struct vbdev_compress *comp_bdev = (struct vbdev_compress *)ctx;
+
+	if (comp_bdev->vol != NULL) {
+		/* Tell reducelib that we're done with this volume.
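+		 *
+		 * Editor's sketch of the teardown ordering this module follows once
+		 * the unload callback fires (see _vbdev_compress_destruct_cb() above):
+		 *
+		 *	spdk_reduce_vol_unload(vol, cb, ctx);		// 1. quiesce libreduce
+		 *	spdk_bdev_module_release_bdev(base_bdev);	// 2. drop the claim
+		 *	spdk_bdev_close(base_desc);			// 3. on the opening thread
+		 *	spdk_io_device_unregister(comp_bdev, cb);	// 4. free channels last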
*/ + spdk_reduce_vol_unload(comp_bdev->vol, vbdev_compress_destruct_cb, comp_bdev); + } else { + vbdev_compress_destruct_cb(comp_bdev, 0); + } + + return 0; +} + +/* We supplied this as an entry point for upper layers who want to communicate to this + * bdev. This is how they get a channel. + */ +static struct spdk_io_channel * +vbdev_compress_get_io_channel(void *ctx) +{ + struct vbdev_compress *comp_bdev = (struct vbdev_compress *)ctx; + + /* The IO channel code will allocate a channel for us which consists of + * the SPDK channel structure plus the size of our comp_io_channel struct + * that we passed in when we registered our IO device. It will then call + * our channel create callback to populate any elements that we need to + * update. + */ + return spdk_get_io_channel(comp_bdev); +} + +/* This is the output for bdev_get_bdevs() for this vbdev */ +static int +vbdev_compress_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct vbdev_compress *comp_bdev = (struct vbdev_compress *)ctx; + + spdk_json_write_name(w, "compress"); + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&comp_bdev->comp_bdev)); + spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(comp_bdev->base_bdev)); + spdk_json_write_named_string(w, "compression_pmd", comp_bdev->drv_name); + spdk_json_write_object_end(w); + + return 0; +} + +/* This is used to generate JSON that can configure this module to its current state. */ +static int +vbdev_compress_config_json(struct spdk_json_write_ctx *w) +{ + struct vbdev_compress *comp_bdev; + + TAILQ_FOREACH(comp_bdev, &g_vbdev_comp, link) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_compress_create"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(comp_bdev->base_bdev)); + spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&comp_bdev->comp_bdev)); + spdk_json_write_named_string(w, "compression_pmd", comp_bdev->drv_name); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + } + return 0; +} + +static void +_vbdev_reduce_init_cb(void *ctx) +{ + struct vbdev_compress *meta_ctx = ctx; + + /* We're done with metadata operations */ + spdk_put_io_channel(meta_ctx->base_ch); + /* Close the underlying bdev on its same opened thread. */ + spdk_bdev_close(meta_ctx->base_desc); + meta_ctx->base_desc = NULL; + + if (meta_ctx->vol) { + vbdev_compress_claim(meta_ctx); + } else { + free(meta_ctx); + } +} + +/* Callback from reduce for when init is complete. We'll pass the vbdev_comp struct + * used for initial metadata operations to claim where it will be further filled out + * and added to the global list. + */ +static void +vbdev_reduce_init_cb(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno) +{ + struct vbdev_compress *meta_ctx = cb_arg; + + if (reduce_errno == 0) { + meta_ctx->vol = vol; + } else { + SPDK_ERRLOG("for vol %s, error %u\n", + spdk_bdev_get_name(meta_ctx->base_bdev), reduce_errno); + } + + if (meta_ctx->thread && meta_ctx->thread != spdk_get_thread()) { + spdk_thread_send_msg(meta_ctx->thread, _vbdev_reduce_init_cb, meta_ctx); + } else { + _vbdev_reduce_init_cb(meta_ctx); + } +} + +/* Callback for the function used by reduceLib to perform IO to/from the backing device. We just + * call the callback provided by reduceLib when it called the read/write/unmap function and + * free the bdev_io. 
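+ *
+ * Editor's note: these thin adapters exist because libreduce drives the backing
+ * bdev through function pointers on struct spdk_reduce_backing_dev; a sketch of
+ * the wiring, done for real in _prepare_for_load_init() further down:
+ *
+ *	backing_dev.readv      = _comp_reduce_readv;
+ *	backing_dev.writev     = _comp_reduce_writev;
+ *	backing_dev.unmap      = _comp_reduce_unmap;
+ *	backing_dev.compress   = _comp_reduce_compress;
+ *	backing_dev.decompress = _comp_reduce_decompress;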
+ */ +static void +comp_reduce_io_cb(struct spdk_bdev_io *bdev_io, bool success, void *arg) +{ + struct spdk_reduce_vol_cb_args *cb_args = arg; + int reduce_errno; + + if (success) { + reduce_errno = 0; + } else { + reduce_errno = -EIO; + } + spdk_bdev_free_io(bdev_io); + cb_args->cb_fn(cb_args->cb_arg, reduce_errno); +} + +/* This is the function provided to the reduceLib for sending reads directly to + * the backing device. + */ +static void +_comp_reduce_readv(struct spdk_reduce_backing_dev *dev, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, struct spdk_reduce_vol_cb_args *args) +{ + struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(dev, struct vbdev_compress, + backing_dev); + int rc; + + rc = spdk_bdev_readv_blocks(comp_bdev->base_desc, comp_bdev->base_ch, + iov, iovcnt, lba, lba_count, + comp_reduce_io_cb, + args); + if (rc) { + if (rc == -ENOMEM) { + SPDK_ERRLOG("No memory, start to queue io.\n"); + /* TODO: there's no bdev_io to queue */ + } else { + SPDK_ERRLOG("submitting readv request\n"); + } + args->cb_fn(args->cb_arg, rc); + } +} + +/* This is the function provided to the reduceLib for sending writes directly to + * the backing device. + */ +static void +_comp_reduce_writev(struct spdk_reduce_backing_dev *dev, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, struct spdk_reduce_vol_cb_args *args) +{ + struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(dev, struct vbdev_compress, + backing_dev); + int rc; + + rc = spdk_bdev_writev_blocks(comp_bdev->base_desc, comp_bdev->base_ch, + iov, iovcnt, lba, lba_count, + comp_reduce_io_cb, + args); + if (rc) { + if (rc == -ENOMEM) { + SPDK_ERRLOG("No memory, start to queue io.\n"); + /* TODO: there's no bdev_io to queue */ + } else { + SPDK_ERRLOG("error submitting writev request\n"); + } + args->cb_fn(args->cb_arg, rc); + } +} + +/* This is the function provided to the reduceLib for sending unmaps directly to + * the backing device. + */ +static void +_comp_reduce_unmap(struct spdk_reduce_backing_dev *dev, + uint64_t lba, uint32_t lba_count, struct spdk_reduce_vol_cb_args *args) +{ + struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(dev, struct vbdev_compress, + backing_dev); + int rc; + + rc = spdk_bdev_unmap_blocks(comp_bdev->base_desc, comp_bdev->base_ch, + lba, lba_count, + comp_reduce_io_cb, + args); + + if (rc) { + if (rc == -ENOMEM) { + SPDK_ERRLOG("No memory, start to queue io.\n"); + /* TODO: there's no bdev_io to queue */ + } else { + SPDK_ERRLOG("submitting unmap request\n"); + } + args->cb_fn(args->cb_arg, rc); + } +} + +/* Called by reduceLib after performing unload vol actions following base bdev hotremove */ +static void +bdev_hotremove_vol_unload_cb(void *cb_arg, int reduce_errno) +{ + struct vbdev_compress *comp_bdev = (struct vbdev_compress *)cb_arg; + + if (reduce_errno) { + SPDK_ERRLOG("number %d\n", reduce_errno); + } + + comp_bdev->vol = NULL; + spdk_bdev_unregister(&comp_bdev->comp_bdev, NULL, NULL); +} + +/* Called when the underlying base bdev goes away. */ +static void +vbdev_compress_base_bdev_hotremove_cb(void *ctx) +{ + struct vbdev_compress *comp_bdev, *tmp; + struct spdk_bdev *bdev_find = ctx; + + TAILQ_FOREACH_SAFE(comp_bdev, &g_vbdev_comp, link, tmp) { + if (bdev_find == comp_bdev->base_bdev) { + /* Tell reduceLib that we're done with this volume. 
*/ + spdk_reduce_vol_unload(comp_bdev->vol, bdev_hotremove_vol_unload_cb, comp_bdev); + } + } +} + +/* TODO: determine which parms we want user configurable, HC for now + * params.vol_size + * params.chunk_size + * compression PMD, algorithm, window size, comp level, etc. + * DEV_MD_PATH + */ + +/* Common function for init and load to allocate and populate the minimal + * information for reducelib to init or load. + */ +struct vbdev_compress * +_prepare_for_load_init(struct spdk_bdev *bdev, uint32_t lb_size) +{ + struct vbdev_compress *meta_ctx; + + meta_ctx = calloc(1, sizeof(struct vbdev_compress)); + if (meta_ctx == NULL) { + SPDK_ERRLOG("failed to alloc init contexts\n"); + return NULL; + } + + meta_ctx->drv_name = "None"; + meta_ctx->base_bdev = bdev; + meta_ctx->backing_dev.unmap = _comp_reduce_unmap; + meta_ctx->backing_dev.readv = _comp_reduce_readv; + meta_ctx->backing_dev.writev = _comp_reduce_writev; + meta_ctx->backing_dev.compress = _comp_reduce_compress; + meta_ctx->backing_dev.decompress = _comp_reduce_decompress; + + meta_ctx->backing_dev.blocklen = bdev->blocklen; + meta_ctx->backing_dev.blockcnt = bdev->blockcnt; + + meta_ctx->params.chunk_size = CHUNK_SIZE; + if (lb_size == 0) { + meta_ctx->params.logical_block_size = bdev->blocklen; + } else { + meta_ctx->params.logical_block_size = lb_size; + } + + meta_ctx->params.backing_io_unit_size = BACKING_IO_SZ; + return meta_ctx; +} + +static bool +_set_pmd(struct vbdev_compress *comp_dev) +{ + if (g_opts == COMPRESS_PMD_AUTO) { + if (g_qat_available) { + comp_dev->drv_name = QAT_PMD; + } else { + comp_dev->drv_name = ISAL_PMD; + } + } else if (g_opts == COMPRESS_PMD_QAT_ONLY && g_qat_available) { + comp_dev->drv_name = QAT_PMD; + } else if (g_opts == COMPRESS_PMD_ISAL_ONLY && g_isal_available) { + comp_dev->drv_name = ISAL_PMD; + } else { + SPDK_ERRLOG("Requested PMD is not available.\n"); + return false; + } + SPDK_NOTICELOG("PMD being used: %s\n", comp_dev->drv_name); + return true; +} + +/* Call reducelib to initialize a new volume */ +static int +vbdev_init_reduce(struct spdk_bdev *bdev, const char *pm_path, uint32_t lb_size) +{ + struct vbdev_compress *meta_ctx; + int rc; + + meta_ctx = _prepare_for_load_init(bdev, lb_size); + if (meta_ctx == NULL) { + return -EINVAL; + } + + if (_set_pmd(meta_ctx) == false) { + SPDK_ERRLOG("could not find required pmd\n"); + free(meta_ctx); + return -EINVAL; + } + + rc = spdk_bdev_open(meta_ctx->base_bdev, true, vbdev_compress_base_bdev_hotremove_cb, + meta_ctx->base_bdev, &meta_ctx->base_desc); + if (rc) { + SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(meta_ctx->base_bdev)); + free(meta_ctx); + return -EINVAL; + } + + /* Save the thread where the base device is opened */ + meta_ctx->thread = spdk_get_thread(); + + meta_ctx->base_ch = spdk_bdev_get_io_channel(meta_ctx->base_desc); + + spdk_reduce_vol_init(&meta_ctx->params, &meta_ctx->backing_dev, + pm_path, + vbdev_reduce_init_cb, + meta_ctx); + return 0; +} + +/* We provide this callback for the SPDK channel code to create a channel using + * the channel struct we provided in our module get_io_channel() entry point. Here + * we get and save off an underlying base channel of the device below us so that + * we can communicate with the base bdev on a per channel basis. If we needed + * our own poller for this vbdev, we'd register it here. 
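+ *
+ * Editor's sketch of the io_device/channel contract assumed here (create_cb,
+ * destroy_cb and ctx are placeholders):
+ *
+ *	spdk_io_device_register(dev, create_cb, destroy_cb, sizeof(ctx), name);
+ *	...
+ *	ch = spdk_get_io_channel(dev);	// first call on a thread invokes create_cb
+ *	...
+ *	spdk_put_io_channel(ch);	// last put on that thread invokes destroy_cb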
+ */ +static int +comp_bdev_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct vbdev_compress *comp_bdev = io_device; + struct comp_device_qp *device_qp; + + /* Now set the reduce channel if it's not already set. */ + pthread_mutex_lock(&comp_bdev->reduce_lock); + if (comp_bdev->ch_count == 0) { + /* We use this queue to track outstanding IO in our layer. */ + TAILQ_INIT(&comp_bdev->pending_comp_ios); + + /* We use this to queue up compression operations as needed. */ + TAILQ_INIT(&comp_bdev->queued_comp_ops); + + comp_bdev->base_ch = spdk_bdev_get_io_channel(comp_bdev->base_desc); + comp_bdev->reduce_thread = spdk_get_thread(); + comp_bdev->poller = SPDK_POLLER_REGISTER(comp_dev_poller, comp_bdev, 0); + /* Now assign a q pair */ + pthread_mutex_lock(&g_comp_device_qp_lock); + TAILQ_FOREACH(device_qp, &g_comp_device_qp, link) { + if (strcmp(device_qp->device->cdev_info.driver_name, comp_bdev->drv_name) == 0) { + if (device_qp->thread == spdk_get_thread()) { + comp_bdev->device_qp = device_qp; + break; + } + if (device_qp->thread == NULL) { + comp_bdev->device_qp = device_qp; + device_qp->thread = spdk_get_thread(); + break; + } + } + } + pthread_mutex_unlock(&g_comp_device_qp_lock); + } + comp_bdev->ch_count++; + pthread_mutex_unlock(&comp_bdev->reduce_lock); + + if (comp_bdev->device_qp != NULL) { + return 0; + } else { + SPDK_ERRLOG("out of qpairs, cannot assign one to comp_bdev %p\n", comp_bdev); + assert(false); + return -ENOMEM; + } +} + +static void +_channel_cleanup(struct vbdev_compress *comp_bdev) +{ + /* Note: comp_bdevs can share a device_qp if they are + * on the same thread so we leave the device_qp element + * alone for this comp_bdev and just clear the reduce thread. + */ + spdk_put_io_channel(comp_bdev->base_ch); + comp_bdev->reduce_thread = NULL; + spdk_poller_unregister(&comp_bdev->poller); +} + +/* Used to reroute destroy_ch to the correct thread */ +static void +_comp_bdev_ch_destroy_cb(void *arg) +{ + struct vbdev_compress *comp_bdev = arg; + + pthread_mutex_lock(&comp_bdev->reduce_lock); + _channel_cleanup(comp_bdev); + pthread_mutex_unlock(&comp_bdev->reduce_lock); +} + +/* We provide this callback for the SPDK channel code to destroy a channel + * created with our create callback. We just need to undo anything we did + * when we created. If this bdev used its own poller, we'd unregister it here. + */ +static void +comp_bdev_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct vbdev_compress *comp_bdev = io_device; + + pthread_mutex_lock(&comp_bdev->reduce_lock); + comp_bdev->ch_count--; + if (comp_bdev->ch_count == 0) { + /* Send this request to the thread where the channel was created. */ + if (comp_bdev->reduce_thread != spdk_get_thread()) { + spdk_thread_send_msg(comp_bdev->reduce_thread, + _comp_bdev_ch_destroy_cb, comp_bdev); + } else { + _channel_cleanup(comp_bdev); + } + } + pthread_mutex_unlock(&comp_bdev->reduce_lock); +} + +/* RPC entry point for compression vbdev creation. */ +int +create_compress_bdev(const char *bdev_name, const char *pm_path, uint32_t lb_size) +{ + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + return -ENODEV; + } + + if ((lb_size != 0) && (lb_size != LB_SIZE_4K) && (lb_size != LB_SIZE_512B)) { + SPDK_ERRLOG("Logical block size must be 512 or 4096\n"); + return -EINVAL; + } + + return vbdev_init_reduce(bdev, pm_path, lb_size); +} + +/* On init, just init the compress drivers. All metadata is stored on disk. 
*/ +static int +vbdev_compress_init(void) +{ + if (vbdev_init_compress_drivers()) { + SPDK_ERRLOG("Error setting up compression devices\n"); + return -EINVAL; + } + + return 0; +} + +/* Called when the entire module is being torn down. */ +static void +vbdev_compress_finish(void) +{ + struct comp_device_qp *dev_qp; + /* TODO: unload vol in a future patch */ + + while ((dev_qp = TAILQ_FIRST(&g_comp_device_qp))) { + TAILQ_REMOVE(&g_comp_device_qp, dev_qp, link); + free(dev_qp); + } + pthread_mutex_destroy(&g_comp_device_qp_lock); + + rte_mempool_free(g_comp_op_mp); + rte_mempool_free(g_mbuf_mp); +} + +/* During init we'll be asked how much memory we'd like passed to us + * in bev_io structures as context. Here's where we specify how + * much context we want per IO. + */ +static int +vbdev_compress_get_ctx_size(void) +{ + return sizeof(struct comp_bdev_io); +} + +/* When we register our bdev this is how we specify our entry points. */ +static const struct spdk_bdev_fn_table vbdev_compress_fn_table = { + .destruct = vbdev_compress_destruct, + .submit_request = vbdev_compress_submit_request, + .io_type_supported = vbdev_compress_io_type_supported, + .get_io_channel = vbdev_compress_get_io_channel, + .dump_info_json = vbdev_compress_dump_info_json, + .write_config_json = NULL, +}; + +static struct spdk_bdev_module compress_if = { + .name = "compress", + .module_init = vbdev_compress_init, + .config_text = NULL, + .get_ctx_size = vbdev_compress_get_ctx_size, + .examine_disk = vbdev_compress_examine, + .module_fini = vbdev_compress_finish, + .config_json = vbdev_compress_config_json +}; + +SPDK_BDEV_MODULE_REGISTER(compress, &compress_if) + +static int _set_compbdev_name(struct vbdev_compress *comp_bdev) +{ + struct spdk_bdev_alias *aliases; + + if (!TAILQ_EMPTY(spdk_bdev_get_aliases(comp_bdev->base_bdev))) { + aliases = TAILQ_FIRST(spdk_bdev_get_aliases(comp_bdev->base_bdev)); + comp_bdev->comp_bdev.name = spdk_sprintf_alloc("COMP_%s", aliases->alias); + if (!comp_bdev->comp_bdev.name) { + SPDK_ERRLOG("could not allocate comp_bdev name for alias\n"); + return -ENOMEM; + } + } else { + comp_bdev->comp_bdev.name = spdk_sprintf_alloc("COMP_%s", comp_bdev->base_bdev->name); + if (!comp_bdev->comp_bdev.name) { + SPDK_ERRLOG("could not allocate comp_bdev name for unique name\n"); + return -ENOMEM; + } + } + return 0; +} + +static void +vbdev_compress_claim(struct vbdev_compress *comp_bdev) +{ + int rc; + + if (_set_compbdev_name(comp_bdev)) { + goto error_bdev_name; + } + + /* Note: some of the fields below will change in the future - for example, + * blockcnt specifically will not match (the compressed volume size will + * be slightly less than the base bdev size) + */ + comp_bdev->comp_bdev.product_name = COMP_BDEV_NAME; + comp_bdev->comp_bdev.write_cache = comp_bdev->base_bdev->write_cache; + + if (strcmp(comp_bdev->drv_name, QAT_PMD) == 0) { + comp_bdev->comp_bdev.required_alignment = + spdk_max(spdk_u32log2(comp_bdev->base_bdev->blocklen), + comp_bdev->base_bdev->required_alignment); + SPDK_NOTICELOG("QAT in use: Required alignment set to %u\n", + comp_bdev->comp_bdev.required_alignment); + } else { + comp_bdev->comp_bdev.required_alignment = comp_bdev->base_bdev->required_alignment; + } + comp_bdev->comp_bdev.optimal_io_boundary = + comp_bdev->params.chunk_size / comp_bdev->params.logical_block_size; + + comp_bdev->comp_bdev.split_on_optimal_io_boundary = true; + + comp_bdev->comp_bdev.blocklen = comp_bdev->params.logical_block_size; + comp_bdev->comp_bdev.blockcnt = comp_bdev->params.vol_size / 
comp_bdev->comp_bdev.blocklen; + assert(comp_bdev->comp_bdev.blockcnt > 0); + + /* This is the context that is passed to us when the bdev + * layer calls in so we'll save our comp_bdev node here. + */ + comp_bdev->comp_bdev.ctxt = comp_bdev; + comp_bdev->comp_bdev.fn_table = &vbdev_compress_fn_table; + comp_bdev->comp_bdev.module = &compress_if; + + pthread_mutex_init(&comp_bdev->reduce_lock, NULL); + + rc = spdk_bdev_open(comp_bdev->base_bdev, true, vbdev_compress_base_bdev_hotremove_cb, + comp_bdev->base_bdev, &comp_bdev->base_desc); + if (rc) { + SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(comp_bdev->base_bdev)); + goto error_open; + } + + /* Save the thread where the base device is opened */ + comp_bdev->thread = spdk_get_thread(); + + spdk_io_device_register(comp_bdev, comp_bdev_ch_create_cb, comp_bdev_ch_destroy_cb, + sizeof(struct comp_io_channel), + comp_bdev->comp_bdev.name); + + rc = spdk_bdev_module_claim_bdev(comp_bdev->base_bdev, comp_bdev->base_desc, + comp_bdev->comp_bdev.module); + if (rc) { + SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(comp_bdev->base_bdev)); + goto error_claim; + } + + rc = spdk_bdev_register(&comp_bdev->comp_bdev); + if (rc < 0) { + SPDK_ERRLOG("trying to register bdev\n"); + goto error_bdev_register; + } + + TAILQ_INSERT_TAIL(&g_vbdev_comp, comp_bdev, link); + + SPDK_NOTICELOG("registered io_device and virtual bdev for: %s\n", comp_bdev->comp_bdev.name); + + return; + /* Error cleanup paths. */ +error_bdev_register: + spdk_bdev_module_release_bdev(comp_bdev->base_bdev); +error_claim: + spdk_io_device_unregister(comp_bdev, NULL); + spdk_bdev_close(comp_bdev->base_desc); +error_open: + free(comp_bdev->comp_bdev.name); +error_bdev_name: + free(comp_bdev); +} + +static void +_vbdev_compress_delete_done(void *_ctx) +{ + struct vbdev_comp_delete_ctx *ctx = _ctx; + + ctx->cb_fn(ctx->cb_arg, ctx->cb_rc); + + free(ctx); +} + +static void +vbdev_compress_delete_done(void *cb_arg, int bdeverrno) +{ + struct vbdev_comp_delete_ctx *ctx = cb_arg; + + ctx->cb_rc = bdeverrno; + + if (ctx->orig_thread != spdk_get_thread()) { + spdk_thread_send_msg(ctx->orig_thread, _vbdev_compress_delete_done, ctx); + } else { + _vbdev_compress_delete_done(ctx); + } +} + +void +bdev_compress_delete(const char *name, spdk_delete_compress_complete cb_fn, void *cb_arg) +{ + struct vbdev_compress *comp_bdev = NULL; + struct vbdev_comp_delete_ctx *ctx; + + TAILQ_FOREACH(comp_bdev, &g_vbdev_comp, link) { + if (strcmp(name, comp_bdev->comp_bdev.name) == 0) { + break; + } + } + + if (comp_bdev == NULL) { + cb_fn(cb_arg, -ENODEV); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("Failed to allocate delete context\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Save these for after the vol is destroyed. */ + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->orig_thread = spdk_get_thread(); + + comp_bdev->delete_ctx = ctx; + + /* Tell reducelib that we're done with this volume. */ + if (comp_bdev->orphaned == false) { + spdk_reduce_vol_unload(comp_bdev->vol, delete_vol_unload_cb, comp_bdev); + } else { + delete_vol_unload_cb(comp_bdev, 0); + } +} + +static void +_vbdev_reduce_load_cb(void *ctx) +{ + struct vbdev_compress *meta_ctx = ctx; + int rc; + + /* Done with metadata operations */ + spdk_put_io_channel(meta_ctx->base_ch); + /* Close the underlying bdev on its same opened thread. 
 */
+	spdk_bdev_close(meta_ctx->base_desc);
+	meta_ctx->base_desc = NULL;
+
+	if (meta_ctx->reduce_errno == 0) {
+		if (_set_pmd(meta_ctx) == false) {
+			SPDK_ERRLOG("could not find required pmd\n");
+			goto err;
+		}
+
+		vbdev_compress_claim(meta_ctx);
+	} else if (meta_ctx->reduce_errno == -ENOENT) {
+		if (_set_compbdev_name(meta_ctx)) {
+			goto err;
+		}
+
+		/* We still want to open and claim the backing device to protect the data until
+		 * either the pm metadata file is recovered or the comp bdev is deleted.
+		 */
+		rc = spdk_bdev_open(meta_ctx->base_bdev, true, vbdev_compress_base_bdev_hotremove_cb,
+				    meta_ctx->base_bdev, &meta_ctx->base_desc);
+		if (rc) {
+			SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(meta_ctx->base_bdev));
+			free(meta_ctx->comp_bdev.name);
+			goto err;
+		}
+
+		/* Save the thread where the base device is opened */
+		meta_ctx->thread = spdk_get_thread();
+
+		meta_ctx->comp_bdev.module = &compress_if;
+		pthread_mutex_init(&meta_ctx->reduce_lock, NULL);
+		rc = spdk_bdev_module_claim_bdev(meta_ctx->base_bdev, meta_ctx->base_desc,
+						 meta_ctx->comp_bdev.module);
+		if (rc) {
+			SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(meta_ctx->base_bdev));
+			spdk_bdev_close(meta_ctx->base_desc);
+			free(meta_ctx->comp_bdev.name);
+			goto err;
+		}
+
+		meta_ctx->orphaned = true;
+		TAILQ_INSERT_TAIL(&g_vbdev_comp, meta_ctx, link);
+	} else {
+		if (meta_ctx->reduce_errno != -EILSEQ) {
+			SPDK_ERRLOG("for vol %s, error %d\n",
+				    spdk_bdev_get_name(meta_ctx->base_bdev), meta_ctx->reduce_errno);
+		}
+		goto err;
+	}
+
+	spdk_bdev_module_examine_done(&compress_if);
+	return;
+
+err:
+	free(meta_ctx);
+	spdk_bdev_module_examine_done(&compress_if);
+}
+
+/* Callback from reduce for when the load is complete. We'll pass the vbdev_comp struct
+ * used for initial metadata operations to claim where it will be further filled out
+ * and added to the global list.
+ */
+static void
+vbdev_reduce_load_cb(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno)
+{
+	struct vbdev_compress *meta_ctx = cb_arg;
+
+	if (reduce_errno == 0) {
+		/* Update information following volume load. */
+		meta_ctx->vol = vol;
+		memcpy(&meta_ctx->params, spdk_reduce_vol_get_params(vol),
+		       sizeof(struct spdk_reduce_vol_params));
+	}
+
+	meta_ctx->reduce_errno = reduce_errno;
+
+	if (meta_ctx->thread && meta_ctx->thread != spdk_get_thread()) {
+		spdk_thread_send_msg(meta_ctx->thread, _vbdev_reduce_load_cb, meta_ctx);
+	} else {
+		_vbdev_reduce_load_cb(meta_ctx);
+	}
+}
+
+/* Examine_disk entry point: will do a metadata load to see if this is ours,
+ * and if so will go ahead and claim it.
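+ *
+ * Editor's note: the bdev layer waits on the module to report back, so every
+ * path through examine must eventually call
+ * spdk_bdev_module_examine_done(&compress_if), as the code below does. A
+ * sketch of the skeleton, with not_ours() and start_async_probe() as
+ * hypothetical stand-ins:
+ *
+ *	static void my_examine(struct spdk_bdev *bdev)
+ *	{
+ *		if (not_ours(bdev)) {
+ *			spdk_bdev_module_examine_done(&my_if);
+ *			return;
+ *		}
+ *		start_async_probe(bdev);	// its callback calls examine_done()
+ *	}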
+ */ +static void +vbdev_compress_examine(struct spdk_bdev *bdev) +{ + struct vbdev_compress *meta_ctx; + int rc; + + if (strcmp(bdev->product_name, COMP_BDEV_NAME) == 0) { + spdk_bdev_module_examine_done(&compress_if); + return; + } + + meta_ctx = _prepare_for_load_init(bdev, 0); + if (meta_ctx == NULL) { + spdk_bdev_module_examine_done(&compress_if); + return; + } + + rc = spdk_bdev_open(meta_ctx->base_bdev, false, vbdev_compress_base_bdev_hotremove_cb, + meta_ctx->base_bdev, &meta_ctx->base_desc); + if (rc) { + SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(meta_ctx->base_bdev)); + free(meta_ctx); + spdk_bdev_module_examine_done(&compress_if); + return; + } + + /* Save the thread where the base device is opened */ + meta_ctx->thread = spdk_get_thread(); + + meta_ctx->base_ch = spdk_bdev_get_io_channel(meta_ctx->base_desc); + spdk_reduce_vol_load(&meta_ctx->backing_dev, vbdev_reduce_load_cb, meta_ctx); +} + +int +compress_set_pmd(enum compress_pmd *opts) +{ + g_opts = *opts; + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_compress", SPDK_LOG_VBDEV_COMPRESS) diff --git a/src/spdk/module/bdev/compress/vbdev_compress.h b/src/spdk/module/bdev/compress/vbdev_compress.h new file mode 100644 index 000000000..4dcd78f60 --- /dev/null +++ b/src/spdk/module/bdev/compress/vbdev_compress.h @@ -0,0 +1,106 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VBDEV_COMPRESS_H +#define SPDK_VBDEV_COMPRESS_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" + +#define LB_SIZE_4K 0x1000UL +#define LB_SIZE_512B 0x200UL + +/** + * Get the first compression bdev. + * + * \return the first compression bdev. + */ +struct vbdev_compress *compress_bdev_first(void); + +/** + * Get the next compression bdev. + * + * \param prev previous compression bdev. + * \return the next compression bdev. + */ +struct vbdev_compress *compress_bdev_next(struct vbdev_compress *prev); + +/** + * Test to see if a compression bdev orphan exists. 
+ * + * \param name The name of the compression bdev. + * \return true if found, false if not. + */ +bool compress_has_orphan(const char *name); + +/** + * Get the name of a compression bdev. + * + * \param comp_bdev The compression bdev. + * \return the name of the compression bdev. + */ +const char *compress_get_name(const struct vbdev_compress *comp_bdev); + +enum compress_pmd { + COMPRESS_PMD_AUTO = 0, + COMPRESS_PMD_QAT_ONLY, + COMPRESS_PMD_ISAL_ONLY, + COMPRESS_PMD_MAX +}; + +int compress_set_pmd(enum compress_pmd *opts); + +typedef void (*spdk_delete_compress_complete)(void *cb_arg, int bdeverrno); + +/** + * Create new compression bdev. + * + * \param bdev_name Bdev on which compression bdev will be created. + * \param pm_path Path to persistent memory. + * \param lb_size Logical block size for the compressed volume in bytes. Must be 4K or 512. + * \return 0 on success, other on failure. + */ +int create_compress_bdev(const char *bdev_name, const char *pm_path, uint32_t lb_size); + +/** + * Delete compress bdev. + * + * \param bdev_name Bdev on which compression bdev will be deleted. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn. + */ +void bdev_compress_delete(const char *bdev_name, spdk_delete_compress_complete cb_fn, + void *cb_arg); + +#endif /* SPDK_VBDEV_COMPRESS_H */ diff --git a/src/spdk/module/bdev/compress/vbdev_compress_rpc.c b/src/spdk/module/bdev/compress/vbdev_compress_rpc.c new file mode 100644 index 000000000..9eedae066 --- /dev/null +++ b/src/spdk/module/bdev/compress/vbdev_compress_rpc.c @@ -0,0 +1,252 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "vbdev_compress.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +struct rpc_bdev_compress_get_orphans { + char *name; +}; + +static void +free_rpc_bdev_compress_get_orphans(struct rpc_bdev_compress_get_orphans *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_compress_get_orphans_decoders[] = { + {"name", offsetof(struct rpc_bdev_compress_get_orphans, name), spdk_json_decode_string, true}, +}; + +static void +rpc_bdev_compress_get_orphans(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_compress_get_orphans req = {}; + struct spdk_json_write_ctx *w; + struct vbdev_compress *comp_bdev; + bool found = false; + + + if (params && spdk_json_decode_object(params, rpc_bdev_compress_get_orphans_decoders, + SPDK_COUNTOF(rpc_bdev_compress_get_orphans_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + free_rpc_bdev_compress_get_orphans(&req); + return; + } + + if (req.name) { + if (compress_has_orphan(req.name) == false) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + free_rpc_bdev_compress_get_orphans(&req); + return; + } + found = true; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + if (found) { + spdk_json_write_string(w, req.name); + } else { + for (comp_bdev = compress_bdev_first(); comp_bdev != NULL; + comp_bdev = compress_bdev_next(comp_bdev)) { + if (compress_has_orphan(compress_get_name(comp_bdev))) { + spdk_json_write_string(w, compress_get_name(comp_bdev)); + } + } + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + free_rpc_bdev_compress_get_orphans(&req); +} +SPDK_RPC_REGISTER("bdev_compress_get_orphans", rpc_bdev_compress_get_orphans, SPDK_RPC_RUNTIME) + +struct rpc_compress_set_pmd { + enum compress_pmd pmd; +}; + +static const struct spdk_json_object_decoder rpc_compress_pmd_decoder[] = { + {"pmd", offsetof(struct rpc_compress_set_pmd, pmd), spdk_json_decode_int32}, +}; + +static void +rpc_compress_set_pmd(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_compress_set_pmd req; + struct spdk_json_write_ctx *w; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_compress_pmd_decoder, + SPDK_COUNTOF(rpc_compress_pmd_decoder), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + return; + } + + if (req.pmd >= COMPRESS_PMD_MAX) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, + "PMD value %d should be less than %d", req.pmd, COMPRESS_PMD_MAX); + return; + } + + rc = compress_set_pmd(&req.pmd); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w != NULL) { + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } +} +SPDK_RPC_REGISTER("compress_set_pmd", rpc_compress_set_pmd, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(compress_set_pmd, set_compress_pmd) + +/* Structure to hold the parameters for this RPC method. */ +struct rpc_construct_compress { + char *base_bdev_name; + char *pm_path; + uint32_t lb_size; +}; + +/* Free the allocated memory resource after the RPC handling. 
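+ *
+ * Editor's note: each string field filled in by spdk_json_decode_string is
+ * heap-allocated, so every RPC here pairs its decoder table with a free_*()
+ * helper that releases exactly the fields the table fills. The general
+ * decode/free shape (sketch):
+ *
+ *	if (spdk_json_decode_object(params, decoders, SPDK_COUNTOF(decoders), &req)) {
+ *		// send an error response and bail out
+ *	}
+ *	// ... use req ...
+ *	free(req.base_bdev_name);	// one free per decoded string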
*/ +static void +free_rpc_construct_compress(struct rpc_construct_compress *r) +{ + free(r->base_bdev_name); + free(r->pm_path); +} + +/* Structure to decode the input parameters for this RPC method. */ +static const struct spdk_json_object_decoder rpc_construct_compress_decoders[] = { + {"base_bdev_name", offsetof(struct rpc_construct_compress, base_bdev_name), spdk_json_decode_string}, + {"pm_path", offsetof(struct rpc_construct_compress, pm_path), spdk_json_decode_string}, + {"lb_size", offsetof(struct rpc_construct_compress, lb_size), spdk_json_decode_uint32}, +}; + +/* Decode the parameters for this RPC method and properly construct the compress + * device. Error status returned in the failed cases. + */ +static void +rpc_bdev_compress_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_compress req = {NULL}; + struct spdk_json_write_ctx *w; + char *name; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_compress_decoders, + SPDK_COUNTOF(rpc_construct_compress_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_COMPRESS, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = create_compress_bdev(req.base_bdev_name, req.pm_path, req.lb_size); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + name = spdk_sprintf_alloc("COMP_%s", req.base_bdev_name); + spdk_json_write_string(w, name); + spdk_jsonrpc_end_result(request, w); + free(name); + +cleanup: + free_rpc_construct_compress(&req); +} +SPDK_RPC_REGISTER("bdev_compress_create", rpc_bdev_compress_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_compress_create, construct_compress_bdev) + +struct rpc_delete_compress { + char *name; +}; + +static void +free_rpc_delete_compress(struct rpc_delete_compress *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_compress_decoders[] = { + {"name", offsetof(struct rpc_delete_compress, name), spdk_json_decode_string}, +}; + +static void +_rpc_bdev_compress_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_compress_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_compress req = {NULL}; + + if (spdk_json_decode_object(params, rpc_delete_compress_decoders, + SPDK_COUNTOF(rpc_delete_compress_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + } else { + bdev_compress_delete(req.name, _rpc_bdev_compress_delete_cb, request); + } + + free_rpc_delete_compress(&req); +} +SPDK_RPC_REGISTER("bdev_compress_delete", rpc_bdev_compress_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_compress_delete, delete_compress_bdev) diff --git a/src/spdk/module/bdev/crypto/Makefile b/src/spdk/module/bdev/crypto/Makefile new file mode 100644 index 000000000..dbf96952d --- /dev/null +++ b/src/spdk/module/bdev/crypto/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += $(ENV_CFLAGS) + +C_SRCS = vbdev_crypto.c vbdev_crypto_rpc.c +LIBNAME = bdev_crypto + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/crypto/vbdev_crypto.c b/src/spdk/module/bdev/crypto/vbdev_crypto.c new file mode 100644 index 000000000..f5dd0f814 --- /dev/null +++ b/src/spdk/module/bdev/crypto/vbdev_crypto.c @@ -0,0 +1,2040 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "vbdev_crypto.h"
+
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/thread.h"
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+#include <rte_config.h>
+#include <rte_version.h>
+#include <rte_bus_vdev.h>
+#include <rte_crypto.h>
+#include <rte_cryptodev.h>
+#include <rte_cryptodev_pmd.h>
+
+/* To add support for new device types, follow the examples of the following...
+ * Note that the string names are defined by the DPDK PMD in question so be
+ * sure to use the exact names.
+ */
+#define MAX_NUM_DRV_TYPES 2
+
+/* The VF spread is the number of queue pairs between virtual functions; we use this to
+ * load balance the QAT device.
+ */
+#define QAT_VF_SPREAD 32
+static uint8_t g_qat_total_qp = 0;
+static uint8_t g_next_qat_index;
+
+const char *g_driver_names[MAX_NUM_DRV_TYPES] = { AESNI_MB, QAT };
+
+/* Global list of available crypto devices. */
+struct vbdev_dev {
+	struct rte_cryptodev_info	cdev_info;	/* includes device friendly name */
+	uint8_t				cdev_id;	/* identifier for the device */
+	TAILQ_ENTRY(vbdev_dev)		link;
+};
+static TAILQ_HEAD(, vbdev_dev) g_vbdev_devs = TAILQ_HEAD_INITIALIZER(g_vbdev_devs);
+
+/* Global list and lock for unique device/queue pair combos. We keep 1 list per supported PMD
+ * so that we can optimize per PMD where it makes sense. For example, with QAT there is an
+ * optimal pattern for assigning queue pairs, whereas with AESNI there is not.
+ */
+struct device_qp {
+	struct vbdev_dev		*device;	/* ptr to crypto device */
+	uint8_t				qp;		/* queue pair for this node */
+	bool				in_use;		/* whether this node is in use or not */
+	uint8_t				index;		/* used by QAT to load balance placement of qpairs */
+	TAILQ_ENTRY(device_qp)		link;
+};
+static TAILQ_HEAD(, device_qp) g_device_qp_qat = TAILQ_HEAD_INITIALIZER(g_device_qp_qat);
+static TAILQ_HEAD(, device_qp) g_device_qp_aesni_mb = TAILQ_HEAD_INITIALIZER(g_device_qp_aesni_mb);
+static pthread_mutex_t g_device_qp_lock = PTHREAD_MUTEX_INITIALIZER;
+
+
+/* In order to limit the number of resources we need to do one crypto
+ * operation per LBA (we use LBA as IV), we tell the bdev layer that
+ * our max IO size is something reasonable. Units here are in bytes.
+ */
+#define CRYPTO_MAX_IO		(64 * 1024)
+
+/* This controls how many ops will be dequeued from the crypto driver in one run
+ * of the poller. It is mainly a performance knob as it effectively determines how
+ * much work the poller has to do. However even that can vary between crypto drivers
+ * as the AESNI_MB driver for example does all the crypto work on dequeue whereas the
+ * QAT driver just dequeues what has been completed already.
+ */
+#define MAX_DEQUEUE_BURST_SIZE	64
+
+/* When enqueueing, we need to supply the crypto driver with an array of pointers to
+ * operation structs. As each of these can be max 512B, we can adjust the CRYPTO_MAX_IO
+ * value in conjunction with the other defines to make sure we're not using crazy amounts
+ * of memory.
All of these numbers can and probably should be adjusted based on the
+ * workload. By default we'll use the worst case (smallest) block size for the
+ * minimum number of array entries. As an example, a CRYPTO_MAX_IO size of 64K with 512B
+ * blocks would give us an enqueue array size of 128.
+ */
+#define MAX_ENQUEUE_ARRAY_SIZE (CRYPTO_MAX_IO / 512)
+
+/* The number of MBUFS we need must be a power of two and to support other small IOs
+ * in addition to the limits mentioned above, we go to the next power of two. It is a
+ * big number because it is one mempool for both source and destination mbufs. It may
+ * need to be bigger to support multiple crypto drivers at once.
+ */
+#define NUM_MBUFS		32768
+#define POOL_CACHE_SIZE		256
+#define MAX_CRYPTO_VOLUMES	128
+#define NUM_SESSIONS		(2 * MAX_CRYPTO_VOLUMES)
+#define SESS_MEMPOOL_CACHE_SIZE 0
+uint8_t g_number_of_claimed_volumes = 0;
+
+/* This is the max number of IOs we can supply to any crypto device QP at one time.
+ * It can vary between drivers.
+ */
+#define CRYPTO_QP_DESCRIPTORS	2048
+
+/* Specific to AES_CBC. */
+#define AES_CBC_IV_LENGTH	16
+#define AES_CBC_KEY_LENGTH	16
+#define AES_XTS_KEY_LENGTH	16	/* XTS uses 2 keys, each of this size. */
+#define AESNI_MB_NUM_QP		64
+
+/* Common for supported devices. */
+#define IV_OFFSET	(sizeof(struct rte_crypto_op) + \
+			 sizeof(struct rte_crypto_sym_op))
+#define QUEUED_OP_OFFSET (IV_OFFSET + AES_CBC_IV_LENGTH)
+
+static void _complete_internal_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+static void _complete_internal_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+static void _complete_internal_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+static void vbdev_crypto_examine(struct spdk_bdev *bdev);
+static int vbdev_crypto_claim(struct spdk_bdev *bdev);
+static void vbdev_crypto_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);
+
+/* List of crypto_bdev names and their base bdevs via configuration file. */
+struct bdev_names {
+	char			*vbdev_name;	/* name of the vbdev to create */
+	char			*bdev_name;	/* base bdev name */
+
+	/* Note, for dev/test we allow use of key in the config file, for production
+	 * use, you must use an RPC to specify the key for security reasons.
+	 */
+	uint8_t			*key;		/* key per bdev */
+	char			*drv_name;	/* name of the crypto device driver */
+	char			*cipher;	/* AES_CBC or AES_XTS */
+	uint8_t			*key2;		/* key #2 for AES_XTS, per bdev */
+	TAILQ_ENTRY(bdev_names)	link;
+};
+static TAILQ_HEAD(, bdev_names) g_bdev_names = TAILQ_HEAD_INITIALIZER(g_bdev_names);
+
+/* List of virtual bdevs and associated info for each. We keep the device friendly name here even
+ * though it's also in the device struct because we use it early on.
+ */ +struct vbdev_crypto { + struct spdk_bdev *base_bdev; /* the thing we're attaching to */ + struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */ + struct spdk_bdev crypto_bdev; /* the crypto virtual bdev */ + uint8_t *key; /* key per bdev */ + uint8_t *key2; /* for XTS */ + uint8_t *xts_key; /* key + key 2 */ + char *drv_name; /* name of the crypto device driver */ + char *cipher; /* cipher used */ + struct rte_cryptodev_sym_session *session_encrypt; /* encryption session for this bdev */ + struct rte_cryptodev_sym_session *session_decrypt; /* decryption session for this bdev */ + struct rte_crypto_sym_xform cipher_xform; /* crypto control struct for this bdev */ + TAILQ_ENTRY(vbdev_crypto) link; + struct spdk_thread *thread; /* thread where base device is opened */ +}; +static TAILQ_HEAD(, vbdev_crypto) g_vbdev_crypto = TAILQ_HEAD_INITIALIZER(g_vbdev_crypto); + +/* Shared mempools between all devices on this system */ +static struct rte_mempool *g_session_mp = NULL; +static struct rte_mempool *g_session_mp_priv = NULL; +static struct spdk_mempool *g_mbuf_mp = NULL; /* mbuf mempool */ +static struct rte_mempool *g_crypto_op_mp = NULL; /* crypto operations, must be rte* mempool */ + +/* For queueing up crypto operations that we can't submit for some reason */ +struct vbdev_crypto_op { + uint8_t cdev_id; + uint8_t qp; + struct rte_crypto_op *crypto_op; + struct spdk_bdev_io *bdev_io; + TAILQ_ENTRY(vbdev_crypto_op) link; +}; +#define QUEUED_OP_LENGTH (sizeof(struct vbdev_crypto_op)) + +/* The crypto vbdev channel struct. It is allocated and freed on my behalf by the io channel code. + * We store things in here that are needed on per thread basis like the base_channel for this thread, + * and the poller for this thread. + */ +struct crypto_io_channel { + struct spdk_io_channel *base_ch; /* IO channel of base device */ + struct spdk_poller *poller; /* completion poller */ + struct device_qp *device_qp; /* unique device/qp combination for this channel */ + TAILQ_HEAD(, spdk_bdev_io) pending_cry_ios; /* outstanding operations to the crypto device */ + struct spdk_io_channel_iter *iter; /* used with for_each_channel in reset */ + TAILQ_HEAD(, vbdev_crypto_op) queued_cry_ops; /* queued for re-submission to CryptoDev */ +}; + +/* This is the crypto per IO context that the bdev layer allocates for us opaquely and attaches to + * each IO for us. 
+ */
+struct crypto_bdev_io {
+ int cryop_cnt_remaining; /* counter used when completing crypto ops */
+ struct crypto_io_channel *crypto_ch; /* need to store for crypto completion handling */
+ struct vbdev_crypto *crypto_bdev; /* the crypto node struct associated with this IO */
+ struct spdk_bdev_io *orig_io; /* the original IO */
+ struct spdk_bdev_io *read_io; /* the read IO we issued */
+ int8_t bdev_io_status; /* the status we'll report back on the bdev IO */
+ bool on_pending_list;
+ /* Used for the single contiguous buffer that serves as the crypto destination target for writes */
+ uint64_t aux_num_blocks; /* num of blocks for the contiguous buffer */
+ uint64_t aux_offset_blocks; /* block offset on media */
+ void *aux_buf_raw; /* raw buffer that the bdev layer gave us for write buffer */
+ struct iovec aux_buf_iov; /* iov representing aligned contig write buffer */
+
+ /* for bdev_io_wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+ struct spdk_io_channel *ch;
+};
+
+/* Called by vbdev_crypto_init_crypto_drivers() to init each discovered crypto device */
+static int
+create_vbdev_dev(uint8_t index, uint16_t num_lcores)
+{
+ struct vbdev_dev *device;
+ uint8_t j, cdev_id, cdrv_id;
+ struct device_qp *dev_qp;
+ struct device_qp *tmp_qp;
+ int rc;
+ TAILQ_HEAD(device_qps, device_qp) *dev_qp_head;
+
+ device = calloc(1, sizeof(struct vbdev_dev));
+ if (!device) {
+ return -ENOMEM;
+ }
+
+ /* Get details about this device. */
+ rte_cryptodev_info_get(index, &device->cdev_info);
+ cdrv_id = device->cdev_info.driver_id;
+ cdev_id = device->cdev_id = index;
+
+ /* Before going any further, make sure we have enough resources for this
+ * device type to function. We need a unique queue pair per core across each
+ * device type to remain lockless.
+ */
+ if ((rte_cryptodev_device_count_by_driver(cdrv_id) *
+ device->cdev_info.max_nb_queue_pairs) < num_lcores) {
+ SPDK_ERRLOG("Insufficient unique queue pairs available for %s\n",
+ device->cdev_info.driver_name);
+ SPDK_ERRLOG("Either add more crypto devices or decrease core count\n");
+ rc = -EINVAL;
+ goto err;
+ }
+
+ /* Setup queue pairs. */
+ struct rte_cryptodev_config conf = {
+ .nb_queue_pairs = device->cdev_info.max_nb_queue_pairs,
+ .socket_id = SPDK_ENV_SOCKET_ID_ANY
+ };
+
+ rc = rte_cryptodev_configure(cdev_id, &conf);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to configure cryptodev %u\n", cdev_id);
+ rc = -EINVAL;
+ goto err;
+ }
+
+ struct rte_cryptodev_qp_conf qp_conf = {
+ .nb_descriptors = CRYPTO_QP_DESCRIPTORS,
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
+ .mp_session = g_session_mp,
+ .mp_session_private = g_session_mp_priv,
+#endif
+ };
+
+ /* Pre-setup all potential qpairs now and assign them in the channel
+ * callback. If we were to create them there, we'd have to stop the
+ * entire device, affecting all other threads that might be using it
+ * even on other queue pairs.
+ */ + for (j = 0; j < device->cdev_info.max_nb_queue_pairs; j++) { +#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0) + rc = rte_cryptodev_queue_pair_setup(cdev_id, j, &qp_conf, SOCKET_ID_ANY); +#else + rc = rte_cryptodev_queue_pair_setup(cdev_id, j, &qp_conf, SOCKET_ID_ANY, + g_session_mp); +#endif + + if (rc < 0) { + SPDK_ERRLOG("Failed to setup queue pair %u on " + "cryptodev %u\n", j, cdev_id); + rc = -EINVAL; + goto err; + } + } + + rc = rte_cryptodev_start(cdev_id); + if (rc < 0) { + SPDK_ERRLOG("Failed to start device %u: error %d\n", + cdev_id, rc); + rc = -EINVAL; + goto err; + } + + /* Select the right device/qp list based on driver name + * or error if it does not exist. + */ + if (strcmp(device->cdev_info.driver_name, QAT) == 0) { + dev_qp_head = (struct device_qps *)&g_device_qp_qat; + } else if (strcmp(device->cdev_info.driver_name, AESNI_MB) == 0) { + dev_qp_head = (struct device_qps *)&g_device_qp_aesni_mb; + } else { + rc = -EINVAL; + goto err; + } + + /* Build up lists of device/qp combinations per PMD */ + for (j = 0; j < device->cdev_info.max_nb_queue_pairs; j++) { + dev_qp = calloc(1, sizeof(struct device_qp)); + if (!dev_qp) { + rc = -ENOMEM; + goto err_qp_alloc; + } + dev_qp->device = device; + dev_qp->qp = j; + dev_qp->in_use = false; + if (strcmp(device->cdev_info.driver_name, QAT) == 0) { + g_qat_total_qp++; + } + TAILQ_INSERT_TAIL(dev_qp_head, dev_qp, link); + } + + /* Add to our list of available crypto devices. */ + TAILQ_INSERT_TAIL(&g_vbdev_devs, device, link); + + return 0; +err_qp_alloc: + TAILQ_FOREACH_SAFE(dev_qp, dev_qp_head, link, tmp_qp) { + TAILQ_REMOVE(dev_qp_head, dev_qp, link); + free(dev_qp); + } +err: + free(device); + + return rc; +} + +/* This is called from the module's init function. We setup all crypto devices early on as we are unable + * to easily dynamically configure queue pairs after the drivers are up and running. So, here, we + * configure the max capabilities of each device and assign threads to queue pairs as channels are + * requested. + */ +static int +vbdev_crypto_init_crypto_drivers(void) +{ + uint8_t cdev_count; + uint8_t cdev_id; + int i, rc = 0; + struct vbdev_dev *device; + struct vbdev_dev *tmp_dev; + struct device_qp *dev_qp; + unsigned int max_sess_size = 0, sess_size; + uint16_t num_lcores = rte_lcore_count(); + char aesni_args[32]; + + /* Only the first call, via RPC or module init should init the crypto drivers. */ + if (g_session_mp != NULL) { + return 0; + } + + /* We always init AESNI_MB */ + snprintf(aesni_args, sizeof(aesni_args), "max_nb_queue_pairs=%d", AESNI_MB_NUM_QP); + rc = rte_vdev_init(AESNI_MB, aesni_args); + if (rc) { + SPDK_ERRLOG("error creating virtual PMD %s\n", AESNI_MB); + return -EINVAL; + } + + /* If we have no crypto devices, there's no reason to continue. */ + cdev_count = rte_cryptodev_count(); + if (cdev_count == 0) { + return 0; + } + + /* + * Create global mempools, shared by all devices regardless of type. + */ + + /* First determine max session size, most pools are shared by all the devices, + * so we need to find the global max sessions size. 
+ */
+ for (cdev_id = 0; cdev_id < cdev_count; cdev_id++) {
+ sess_size = rte_cryptodev_sym_get_private_session_size(cdev_id);
+ if (sess_size > max_sess_size) {
+ max_sess_size = sess_size;
+ }
+ }
+
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
+ g_session_mp_priv = rte_mempool_create("session_mp_priv", NUM_SESSIONS, max_sess_size,
+ SESS_MEMPOOL_CACHE_SIZE, 0, NULL, NULL, NULL,
+ NULL, SOCKET_ID_ANY, 0);
+ if (g_session_mp_priv == NULL) {
+ SPDK_ERRLOG("Cannot create private session pool max size 0x%x\n", max_sess_size);
+ return -ENOMEM;
+ }
+
+ g_session_mp = rte_cryptodev_sym_session_pool_create(
+ "session_mp",
+ NUM_SESSIONS, 0, SESS_MEMPOOL_CACHE_SIZE, 0,
+ SOCKET_ID_ANY);
+#else
+ g_session_mp = rte_mempool_create("session_mp", NUM_SESSIONS, max_sess_size,
+ SESS_MEMPOOL_CACHE_SIZE,
+ 0, NULL, NULL, NULL, NULL, SOCKET_ID_ANY, 0);
+#endif
+ if (g_session_mp == NULL) {
+ SPDK_ERRLOG("Cannot create session pool max size 0x%x\n", max_sess_size);
+ rc = -ENOMEM;
+ goto error_create_session_mp;
+ }
+
+ g_mbuf_mp = spdk_mempool_create("mbuf_mp", NUM_MBUFS, sizeof(struct rte_mbuf),
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (g_mbuf_mp == NULL) {
+ SPDK_ERRLOG("Cannot create mbuf pool\n");
+ rc = -ENOMEM;
+ goto error_create_mbuf;
+ }
+
+ /* We use per op private data to store the IV and our own struct
+ * for queueing ops.
+ */
+ g_crypto_op_mp = rte_crypto_op_pool_create("op_mp",
+ RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ NUM_MBUFS,
+ POOL_CACHE_SIZE,
+ AES_CBC_IV_LENGTH + QUEUED_OP_LENGTH,
+ rte_socket_id());
+
+ if (g_crypto_op_mp == NULL) {
+ SPDK_ERRLOG("Cannot create op pool\n");
+ rc = -ENOMEM;
+ goto error_create_op;
+ }
+
+ /* Init all devices */
+ for (i = 0; i < cdev_count; i++) {
+ rc = create_vbdev_dev(i, num_lcores);
+ if (rc) {
+ goto err;
+ }
+ }
+
+ /* Assign index values to the QAT device qp nodes so that we can
+ * assign them for optimal performance.
+ */
+ i = 0;
+ TAILQ_FOREACH(dev_qp, &g_device_qp_qat, link) {
+ dev_qp->index = i++;
+ }
+
+ return 0;
+
+ /* Error cleanup paths. */
+err:
+ TAILQ_FOREACH_SAFE(device, &g_vbdev_devs, link, tmp_dev) {
+ TAILQ_REMOVE(&g_vbdev_devs, device, link);
+ free(device);
+ }
+ rte_mempool_free(g_crypto_op_mp);
+ g_crypto_op_mp = NULL;
+error_create_op:
+ spdk_mempool_free(g_mbuf_mp);
+ g_mbuf_mp = NULL;
+error_create_mbuf:
+ rte_mempool_free(g_session_mp);
+ g_session_mp = NULL;
+error_create_session_mp:
+ if (g_session_mp_priv != NULL) {
+ rte_mempool_free(g_session_mp_priv);
+ g_session_mp_priv = NULL;
+ }
+ return rc;
+}
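
[Editor's aside: the op pool just created reserves AES_CBC_IV_LENGTH + QUEUED_OP_LENGTH bytes of private data behind each rte_crypto_op. As a rough sketch of how that area is addressed (the helper names here are invented for illustration; the file itself makes the equivalent rte_crypto_op_ctod_offset() calls inline):]

```c
/* Per-op private data layout, as implied by IV_OFFSET and QUEUED_OP_OFFSET:
 *
 *   [rte_crypto_op][rte_crypto_sym_op][IV, 16 bytes][struct vbdev_crypto_op]
 *                                     ^ IV_OFFSET   ^ QUEUED_OP_OFFSET
 */
static inline uint8_t *
example_op_iv(struct rte_crypto_op *op)
{
	return rte_crypto_op_ctod_offset(op, uint8_t *, IV_OFFSET);
}

static inline struct vbdev_crypto_op *
example_op_queue_ctx(struct rte_crypto_op *op)
{
	return (struct vbdev_crypto_op *)rte_crypto_op_ctod_offset(op, uint8_t *, QUEUED_OP_OFFSET);
}
```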
+
+/* Following an encrypt or decrypt, we then need to either write the encrypted data or finish
+ * the read on decrypted data. Do that here.
+ */
+static void
+_crypto_operation_complete(struct spdk_bdev_io *bdev_io)
+{
+ struct vbdev_crypto *crypto_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_crypto,
+ crypto_bdev);
+ struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+ struct crypto_io_channel *crypto_ch = io_ctx->crypto_ch;
+ struct spdk_bdev_io *free_me = io_ctx->read_io;
+ int rc = 0;
+
+ TAILQ_REMOVE(&crypto_ch->pending_cry_ios, bdev_io, module_link);
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+
+ /* Complete the original IO and then free the one that we created
+ * as a result of issuing an IO via submit_request.
+ */
+ if (io_ctx->bdev_io_status != SPDK_BDEV_IO_STATUS_FAILED) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ SPDK_ERRLOG("Issue with decryption on bdev_io %p\n", bdev_io);
+ rc = -EINVAL;
+ }
+ spdk_bdev_free_io(free_me);
+
+ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+
+ if (io_ctx->bdev_io_status != SPDK_BDEV_IO_STATUS_FAILED) {
+ /* Write the encrypted data. */
+ rc = spdk_bdev_writev_blocks(crypto_bdev->base_desc, crypto_ch->base_ch,
+ &io_ctx->aux_buf_iov, 1, io_ctx->aux_offset_blocks,
+ io_ctx->aux_num_blocks, _complete_internal_write,
+ bdev_io);
+ } else {
+ SPDK_ERRLOG("Issue with encryption on bdev_io %p\n", bdev_io);
+ rc = -EINVAL;
+ }
+
+ } else {
+ SPDK_ERRLOG("Unknown bdev type %u on crypto operation completion\n",
+ bdev_io->type);
+ rc = -EINVAL;
+ }
+
+ if (rc) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static int _crypto_operation(struct spdk_bdev_io *bdev_io,
+ enum rte_crypto_cipher_operation crypto_op,
+ void *aux_buf);
+
+/* This is the poller for the crypto device. It uses a single API to dequeue whatever is ready at
+ * the device. Then we need to decide if what we've got so far (including previous poller
+ * runs) totals up to one or more complete bdev_ios and if so continue with the bdev_io
+ * accordingly. This means either completing a read or issuing a new write.
+ */
+static int
+crypto_dev_poller(void *args)
+{
+ struct crypto_io_channel *crypto_ch = args;
+ uint8_t cdev_id = crypto_ch->device_qp->device->cdev_id;
+ int i, num_dequeued_ops, num_enqueued_ops;
+ struct spdk_bdev_io *bdev_io = NULL;
+ struct crypto_bdev_io *io_ctx = NULL;
+ struct rte_crypto_op *dequeued_ops[MAX_DEQUEUE_BURST_SIZE];
+ struct rte_crypto_op *mbufs_to_free[2 * MAX_DEQUEUE_BURST_SIZE];
+ int num_mbufs = 0;
+ struct vbdev_crypto_op *op_to_resubmit;
+
+ /* Each run of the poller will get just what the device has available
+ * at the moment we call it; we don't check again after draining the
+ * first batch.
+ */
+ num_dequeued_ops = rte_cryptodev_dequeue_burst(cdev_id, crypto_ch->device_qp->qp,
+ dequeued_ops, MAX_DEQUEUE_BURST_SIZE);
+
+ /* Check if operation was processed successfully */
+ for (i = 0; i < num_dequeued_ops; i++) {
+
+ /* We don't know the order or association of the crypto ops with respect to any
+ * particular bdev_io, so we need to look at each and determine if it's
+ * the last one for its bdev_io or not.
+ */
+ bdev_io = (struct spdk_bdev_io *)dequeued_ops[i]->sym->m_src->userdata;
+ assert(bdev_io != NULL);
+ io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+
+ if (dequeued_ops[i]->status != RTE_CRYPTO_OP_STATUS_SUCCESS) {
+ SPDK_ERRLOG("error with op %d status %u\n", i,
+ dequeued_ops[i]->status);
+ /* Update the bdev status to error; we'll still process the
+ * rest of the crypto ops for this bdev_io though so they
+ * aren't left hanging.
+ */
+ io_ctx->bdev_io_status = SPDK_BDEV_IO_STATUS_FAILED;
+ }
+
+ assert(io_ctx->cryop_cnt_remaining > 0);
+
+ /* Return the associated src and dst mbufs by collecting them into
+ * an array that we can use the bulk API to free after the loop.
+ */
+ dequeued_ops[i]->sym->m_src->userdata = NULL;
+ mbufs_to_free[num_mbufs++] = (void *)dequeued_ops[i]->sym->m_src;
+ if (dequeued_ops[i]->sym->m_dst) {
+ mbufs_to_free[num_mbufs++] = (void *)dequeued_ops[i]->sym->m_dst;
+ }
+
+ /* If this was the last crypto op for its bdev_io, complete the bdev_io. */
+ if (--io_ctx->cryop_cnt_remaining == 0) {
+
+ /* If we're completing this with an outstanding reset we need
+ * to fail it.
+ */ + if (crypto_ch->iter) { + io_ctx->bdev_io_status = SPDK_BDEV_IO_STATUS_FAILED; + } + + /* Complete the IO */ + _crypto_operation_complete(bdev_io); + } + } + + /* Now bulk free both mbufs and crypto operations. */ + if (num_dequeued_ops > 0) { + rte_mempool_put_bulk(g_crypto_op_mp, + (void **)dequeued_ops, + num_dequeued_ops); + assert(num_mbufs > 0); + spdk_mempool_put_bulk(g_mbuf_mp, + (void **)mbufs_to_free, + num_mbufs); + } + + /* Check if there are any pending crypto ops to process */ + while (!TAILQ_EMPTY(&crypto_ch->queued_cry_ops)) { + op_to_resubmit = TAILQ_FIRST(&crypto_ch->queued_cry_ops); + io_ctx = (struct crypto_bdev_io *)op_to_resubmit->bdev_io->driver_ctx; + num_enqueued_ops = rte_cryptodev_enqueue_burst(op_to_resubmit->cdev_id, + op_to_resubmit->qp, + &op_to_resubmit->crypto_op, + 1); + if (num_enqueued_ops == 1) { + /* Make sure we don't put this on twice as one bdev_io is made up + * of many crypto ops. + */ + if (io_ctx->on_pending_list == false) { + TAILQ_INSERT_TAIL(&crypto_ch->pending_cry_ios, op_to_resubmit->bdev_io, module_link); + io_ctx->on_pending_list = true; + } + TAILQ_REMOVE(&crypto_ch->queued_cry_ops, op_to_resubmit, link); + } else { + /* if we couldn't get one, just break and try again later. */ + break; + } + } + + /* If the channel iter is not NULL, we need to continue to poll + * until the pending list is empty, then we can move on to the + * next channel. + */ + if (crypto_ch->iter && TAILQ_EMPTY(&crypto_ch->pending_cry_ios)) { + SPDK_NOTICELOG("Channel %p has been quiesced.\n", crypto_ch); + spdk_for_each_channel_continue(crypto_ch->iter, 0); + crypto_ch->iter = NULL; + } + + return num_dequeued_ops; +} + +/* We're either encrypting on the way down or decrypting on the way back. */ +static int +_crypto_operation(struct spdk_bdev_io *bdev_io, enum rte_crypto_cipher_operation crypto_op, + void *aux_buf) +{ + uint16_t num_enqueued_ops = 0; + uint32_t cryop_cnt = bdev_io->u.bdev.num_blocks; + struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx; + struct crypto_io_channel *crypto_ch = io_ctx->crypto_ch; + uint8_t cdev_id = crypto_ch->device_qp->device->cdev_id; + uint32_t crypto_len = io_ctx->crypto_bdev->crypto_bdev.blocklen; + uint64_t total_length = bdev_io->u.bdev.num_blocks * crypto_len; + int rc; + uint32_t iov_index = 0; + uint32_t allocated = 0; + uint8_t *current_iov = NULL; + uint64_t total_remaining = 0; + uint64_t updated_length, current_iov_remaining = 0; + uint32_t crypto_index = 0; + uint32_t en_offset = 0; + struct rte_crypto_op *crypto_ops[MAX_ENQUEUE_ARRAY_SIZE]; + struct rte_mbuf *src_mbufs[MAX_ENQUEUE_ARRAY_SIZE]; + struct rte_mbuf *dst_mbufs[MAX_ENQUEUE_ARRAY_SIZE]; + int burst; + struct vbdev_crypto_op *op_to_queue; + uint64_t alignment = spdk_bdev_get_buf_align(&io_ctx->crypto_bdev->crypto_bdev); + + assert((bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen) <= CRYPTO_MAX_IO); + + /* Get the number of source mbufs that we need. These will always be 1:1 because we + * don't support chaining. The reason we don't is because of our decision to use + * LBA as IV, there can be no case where we'd need >1 mbuf per crypto op or the + * op would be > 1 LBA. + */ + rc = spdk_mempool_get_bulk(g_mbuf_mp, (void **)&src_mbufs[0], cryop_cnt); + if (rc) { + SPDK_ERRLOG("ERROR trying to get src_mbufs!\n"); + return -ENOMEM; + } + + /* Get the same amount but these buffers to describe the encrypted data location (dst). 
*/
+ if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) {
+ rc = spdk_mempool_get_bulk(g_mbuf_mp, (void **)&dst_mbufs[0], cryop_cnt);
+ if (rc) {
+ SPDK_ERRLOG("ERROR trying to get dst_mbufs!\n");
+ rc = -ENOMEM;
+ goto error_get_dst;
+ }
+ }
+
+#ifdef __clang_analyzer__
+ /* silence scan-build false positive */
+ SPDK_CLANG_ANALYZER_PREINIT_PTR_ARRAY(crypto_ops, MAX_ENQUEUE_ARRAY_SIZE, 0x1000);
+#endif
+ /* Allocate crypto operations. */
+ allocated = rte_crypto_op_bulk_alloc(g_crypto_op_mp,
+ RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ crypto_ops, cryop_cnt);
+ if (allocated < cryop_cnt) {
+ SPDK_ERRLOG("ERROR trying to get crypto ops!\n");
+ rc = -ENOMEM;
+ goto error_get_ops;
+ }
+
+ /* For encryption, we need to prepare a single contiguous buffer as the encryption
+ * destination; we'll then pass that along for the write after encryption is done.
+ * This is done to avoid encrypting the provided write buffer, which may be
+ * undesirable in some use cases.
+ */
+ if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) {
+ io_ctx->aux_buf_iov.iov_len = total_length;
+ io_ctx->aux_buf_raw = aux_buf;
+ io_ctx->aux_buf_iov.iov_base = (void *)(((uintptr_t)aux_buf + (alignment - 1)) & ~(alignment - 1));
+ io_ctx->aux_offset_blocks = bdev_io->u.bdev.offset_blocks;
+ io_ctx->aux_num_blocks = bdev_io->u.bdev.num_blocks;
+ }
+
+ /* This value is used in the completion callback to determine when the bdev_io is
+ * complete.
+ */
+ io_ctx->cryop_cnt_remaining = cryop_cnt;
+
+ /* As we don't support chaining because of a decision to use LBA as IV, construction
+ * of crypto operations is straightforward. We build the op, the mbuf and the
+ * dst_mbuf in our local arrays by looping through the length of the bdev IO and
+ * picking off LBA sized blocks of memory from the IOVs as we walk through them. Each
+ * LBA sized chunk of memory will correspond 1:1 to a crypto operation and a single
+ * mbuf per crypto operation.
+ */
+ total_remaining = total_length;
+ current_iov = bdev_io->u.bdev.iovs[iov_index].iov_base;
+ current_iov_remaining = bdev_io->u.bdev.iovs[iov_index].iov_len;
+ do {
+ uint8_t *iv_ptr;
+ uint64_t op_block_offset;
+
+ /* Set the mbuf's address and length fields. Null out the next pointer. */
+ src_mbufs[crypto_index]->buf_addr = current_iov;
+ src_mbufs[crypto_index]->data_len = updated_length = crypto_len;
+ /* TODO: Make this assignment conditional on QAT usage and add an assert. */
+ src_mbufs[crypto_index]->buf_iova = spdk_vtophys((void *)current_iov, &updated_length);
+ src_mbufs[crypto_index]->next = NULL;
+ /* Store context in every mbuf as we don't know anything about completion order */
+ src_mbufs[crypto_index]->userdata = bdev_io;
+
+ /* Set the IV - we use the LBA of the crypto_op */
+ iv_ptr = rte_crypto_op_ctod_offset(crypto_ops[crypto_index], uint8_t *,
+ IV_OFFSET);
+ memset(iv_ptr, 0, AES_CBC_IV_LENGTH);
+ op_block_offset = bdev_io->u.bdev.offset_blocks + crypto_index;
+ rte_memcpy(iv_ptr, &op_block_offset, sizeof(uint64_t));
+
+ /* Set the data to encrypt/decrypt length */
+ crypto_ops[crypto_index]->sym->cipher.data.length = crypto_len;
+ crypto_ops[crypto_index]->sym->cipher.data.offset = 0;
+
+ /* Link the mbuf to the crypto op.
*/ + crypto_ops[crypto_index]->sym->m_src = src_mbufs[crypto_index]; + if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) { + crypto_ops[crypto_index]->sym->m_dst = src_mbufs[crypto_index]; + } else { + crypto_ops[crypto_index]->sym->m_dst = NULL; + } + + /* For encrypt, point the destination to a buffer we allocate and redirect the bdev_io + * that will be used to process the write on completion to the same buffer. Setting + * up the en_buffer is a little simpler as we know the destination buffer is single IOV. + */ + if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) { + + /* Set the relevant destination en_mbuf elements. */ + dst_mbufs[crypto_index]->buf_addr = io_ctx->aux_buf_iov.iov_base + en_offset; + dst_mbufs[crypto_index]->data_len = updated_length = crypto_len; + /* TODO: Make this assignment conditional on QAT usage and add an assert. */ + dst_mbufs[crypto_index]->buf_iova = spdk_vtophys(dst_mbufs[crypto_index]->buf_addr, + &updated_length); + crypto_ops[crypto_index]->sym->m_dst = dst_mbufs[crypto_index]; + en_offset += crypto_len; + dst_mbufs[crypto_index]->next = NULL; + + /* Attach the crypto session to the operation */ + rc = rte_crypto_op_attach_sym_session(crypto_ops[crypto_index], + io_ctx->crypto_bdev->session_encrypt); + if (rc) { + rc = -EINVAL; + goto error_attach_session; + } + + } else { + /* Attach the crypto session to the operation */ + rc = rte_crypto_op_attach_sym_session(crypto_ops[crypto_index], + io_ctx->crypto_bdev->session_decrypt); + if (rc) { + rc = -EINVAL; + goto error_attach_session; + } + + + } + + /* Subtract our running totals for the op in progress and the overall bdev io */ + total_remaining -= crypto_len; + current_iov_remaining -= crypto_len; + + /* move our current IOV pointer accordingly. */ + current_iov += crypto_len; + + /* move on to the next crypto operation */ + crypto_index++; + + /* If we're done with this IOV, move to the next one. */ + if (current_iov_remaining == 0 && total_remaining > 0) { + iov_index++; + current_iov = bdev_io->u.bdev.iovs[iov_index].iov_base; + current_iov_remaining = bdev_io->u.bdev.iovs[iov_index].iov_len; + } + } while (total_remaining > 0); + + /* Enqueue everything we've got but limit by the max number of descriptors we + * configured the crypto device for. + */ + burst = spdk_min(cryop_cnt, CRYPTO_QP_DESCRIPTORS); + num_enqueued_ops = rte_cryptodev_enqueue_burst(cdev_id, crypto_ch->device_qp->qp, + &crypto_ops[0], + burst); + + /* Add this bdev_io to our outstanding list if any of its crypto ops made it. */ + if (num_enqueued_ops > 0) { + TAILQ_INSERT_TAIL(&crypto_ch->pending_cry_ios, bdev_io, module_link); + io_ctx->on_pending_list = true; + } + /* We were unable to enqueue everything but did get some, so need to decide what + * to do based on the status of the last op. + */ + if (num_enqueued_ops < cryop_cnt) { + switch (crypto_ops[num_enqueued_ops]->status) { + case RTE_CRYPTO_OP_STATUS_NOT_PROCESSED: + /* Queue them up on a linked list to be resubmitted via the poller. 
*/ + for (crypto_index = num_enqueued_ops; crypto_index < cryop_cnt; crypto_index++) { + op_to_queue = (struct vbdev_crypto_op *)rte_crypto_op_ctod_offset(crypto_ops[crypto_index], + uint8_t *, QUEUED_OP_OFFSET); + op_to_queue->cdev_id = cdev_id; + op_to_queue->qp = crypto_ch->device_qp->qp; + op_to_queue->crypto_op = crypto_ops[crypto_index]; + op_to_queue->bdev_io = bdev_io; + TAILQ_INSERT_TAIL(&crypto_ch->queued_cry_ops, + op_to_queue, + link); + } + break; + default: + /* For all other statuses, set the io_ctx bdev_io status so that + * the poller will pick the failure up for the overall bdev status. + */ + io_ctx->bdev_io_status = SPDK_BDEV_IO_STATUS_FAILED; + if (num_enqueued_ops == 0) { + /* If nothing was enqueued, but the last one wasn't because of + * busy, fail it now as the poller won't know anything about it. + */ + _crypto_operation_complete(bdev_io); + rc = -EINVAL; + goto error_attach_session; + } + break; + } + } + + return rc; + + /* Error cleanup paths. */ +error_attach_session: +error_get_ops: + if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) { + spdk_mempool_put_bulk(g_mbuf_mp, (void **)&dst_mbufs[0], + cryop_cnt); + } + if (allocated > 0) { + rte_mempool_put_bulk(g_crypto_op_mp, (void **)crypto_ops, + allocated); + } +error_get_dst: + spdk_mempool_put_bulk(g_mbuf_mp, (void **)&src_mbufs[0], + cryop_cnt); + return rc; +} + +/* This function is called after all channels have been quiesced following + * a bdev reset. + */ +static void +_ch_quiesce_done(struct spdk_io_channel_iter *i, int status) +{ + struct crypto_bdev_io *io_ctx = spdk_io_channel_iter_get_ctx(i); + + assert(TAILQ_EMPTY(&io_ctx->crypto_ch->pending_cry_ios)); + assert(io_ctx->orig_io != NULL); + + spdk_bdev_io_complete(io_ctx->orig_io, SPDK_BDEV_IO_STATUS_SUCCESS); +} + +/* This function is called per channel to quiesce IOs before completing a + * bdev reset that we received. + */ +static void +_ch_quiesce(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct crypto_io_channel *crypto_ch = spdk_io_channel_get_ctx(ch); + + crypto_ch->iter = i; + /* When the poller runs, it will see the non-NULL iter and handle + * the quiesce. + */ +} + +/* Completion callback for IO that were issued from this bdev other than read/write. + * They have their own for readability. + */ +static void +_complete_internal_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) { + struct crypto_bdev_io *orig_ctx = (struct crypto_bdev_io *)orig_io->driver_ctx; + + assert(orig_io == orig_ctx->orig_io); + + spdk_bdev_free_io(bdev_io); + + spdk_for_each_channel(orig_ctx->crypto_bdev, + _ch_quiesce, + orig_ctx, + _ch_quiesce_done); + return; + } + + spdk_bdev_io_complete(orig_io, status); + spdk_bdev_free_io(bdev_io); +} + +/* Completion callback for writes that were issued from this bdev. */ +static void +_complete_internal_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + struct crypto_bdev_io *orig_ctx = (struct crypto_bdev_io *)orig_io->driver_ctx; + + spdk_bdev_io_put_aux_buf(orig_io, orig_ctx->aux_buf_raw); + + spdk_bdev_io_complete(orig_io, status); + spdk_bdev_free_io(bdev_io); +} + +/* Completion callback for reads that were issued from this bdev. 
*/ +static void +_complete_internal_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + struct crypto_bdev_io *orig_ctx = (struct crypto_bdev_io *)orig_io->driver_ctx; + + if (success) { + + /* Save off this bdev_io so it can be freed after decryption. */ + orig_ctx->read_io = bdev_io; + + if (!_crypto_operation(orig_io, RTE_CRYPTO_CIPHER_OP_DECRYPT, NULL)) { + return; + } else { + SPDK_ERRLOG("ERROR decrypting\n"); + } + } else { + SPDK_ERRLOG("ERROR on read prior to decrypting\n"); + } + + spdk_bdev_io_complete(orig_io, SPDK_BDEV_IO_STATUS_FAILED); + spdk_bdev_free_io(bdev_io); +} + +static void +vbdev_crypto_resubmit_io(void *arg) +{ + struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg; + struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx; + + vbdev_crypto_submit_request(io_ctx->ch, bdev_io); +} + +static void +vbdev_crypto_queue_io(struct spdk_bdev_io *bdev_io) +{ + struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx; + int rc; + + io_ctx->bdev_io_wait.bdev = bdev_io->bdev; + io_ctx->bdev_io_wait.cb_fn = vbdev_crypto_resubmit_io; + io_ctx->bdev_io_wait.cb_arg = bdev_io; + + rc = spdk_bdev_queue_io_wait(bdev_io->bdev, io_ctx->crypto_ch->base_ch, &io_ctx->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vbdev_crypto_queue_io, rc=%d.\n", rc); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +/* Callback for getting a buf from the bdev pool in the event that the caller passed + * in NULL, we need to own the buffer so it doesn't get freed by another vbdev module + * beneath us before we're done with it. + */ +static void +crypto_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + struct vbdev_crypto *crypto_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_crypto, + crypto_bdev); + struct crypto_io_channel *crypto_ch = spdk_io_channel_get_ctx(ch); + struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx; + int rc; + + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + rc = spdk_bdev_readv_blocks(crypto_bdev->base_desc, crypto_ch->base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, _complete_internal_read, + bdev_io); + if (rc != 0) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "No memory, queue the IO.\n"); + io_ctx->ch = ch; + vbdev_crypto_queue_io(bdev_io); + } else { + SPDK_ERRLOG("ERROR on bdev_io submission!\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +/* For encryption we don't want to encrypt the data in place as the host isn't + * expecting us to mangle its data buffers so we need to encrypt into the bdev + * aux buffer, then we can use that as the source for the disk data transfer. 
+ */
+static void
+crypto_write_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ void *aux_buf)
+{
+ struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+ int rc = 0;
+
+ rc = _crypto_operation(bdev_io, RTE_CRYPTO_CIPHER_OP_ENCRYPT, aux_buf);
+ if (rc != 0) {
+ spdk_bdev_io_put_aux_buf(bdev_io, aux_buf);
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "No memory, queue the IO.\n");
+ io_ctx->ch = ch;
+ vbdev_crypto_queue_io(bdev_io);
+ } else {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+/* Called when someone submits IO to this crypto vbdev. For IOs not relevant to crypto,
+ * we simply pass it on here via SPDK IO calls, which in turn allocate another bdev IO
+ * and call our cpl callback provided below along with the original bdev_io so that we can
+ * complete it once this IO completes. For crypto operations, we'll either encrypt it first
+ * (writes) and then call back into bdev to submit it, or we'll submit a read and then catch it
+ * on the way back for decryption.
+ */
+static void
+vbdev_crypto_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct vbdev_crypto *crypto_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_crypto,
+ crypto_bdev);
+ struct crypto_io_channel *crypto_ch = spdk_io_channel_get_ctx(ch);
+ struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+ int rc = 0;
+
+ memset(io_ctx, 0, sizeof(struct crypto_bdev_io));
+ io_ctx->crypto_bdev = crypto_bdev;
+ io_ctx->crypto_ch = crypto_ch;
+ io_ctx->orig_io = bdev_io;
+ io_ctx->bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, crypto_read_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ /* Tell the bdev layer that we need an aux buf in addition to the data
+ * buf already associated with the bdev.
+ */
+ spdk_bdev_io_get_aux_buf(bdev_io, crypto_write_get_buf_cb);
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ rc = spdk_bdev_unmap_blocks(crypto_bdev->base_desc, crypto_ch->base_ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _complete_internal_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ rc = spdk_bdev_flush_blocks(crypto_bdev->base_desc, crypto_ch->base_ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _complete_internal_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ rc = spdk_bdev_reset(crypto_bdev->base_desc, crypto_ch->base_ch,
+ _complete_internal_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ default:
+ SPDK_ERRLOG("crypto: unknown I/O type %d\n", bdev_io->type);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ if (rc != 0) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "No memory, queue the IO.\n");
+ io_ctx->ch = ch;
+ vbdev_crypto_queue_io(bdev_io);
+ } else {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+/* We'll just call the base bdev and let it answer, except for the WZ (write_zeroes) command,
+ * which we always say we don't support so that the bdev layer will actually send us
+ * real writes that we can encrypt.
+ */ +static bool +vbdev_crypto_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_FLUSH: + return spdk_bdev_io_type_supported(crypto_bdev->base_bdev, io_type); + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + /* Force the bdev layer to issue actual writes of zeroes so we can + * encrypt them as regular writes. + */ + default: + return false; + } +} + +/* Callback for unregistering the IO device. */ +static void +_device_unregister_cb(void *io_device) +{ + struct vbdev_crypto *crypto_bdev = io_device; + + /* Done with this crypto_bdev. */ + rte_cryptodev_sym_session_free(crypto_bdev->session_decrypt); + rte_cryptodev_sym_session_free(crypto_bdev->session_encrypt); + free(crypto_bdev->drv_name); + if (crypto_bdev->key) { + memset(crypto_bdev->key, 0, strnlen(crypto_bdev->key, (AES_CBC_KEY_LENGTH + 1))); + free(crypto_bdev->key); + } + if (crypto_bdev->key2) { + memset(crypto_bdev->key2, 0, strnlen(crypto_bdev->key2, (AES_XTS_KEY_LENGTH + 1))); + free(crypto_bdev->key2); + } + if (crypto_bdev->xts_key) { + memset(crypto_bdev->xts_key, 0, strnlen(crypto_bdev->xts_key, (AES_XTS_KEY_LENGTH * 2) + 1)); + free(crypto_bdev->xts_key); + } + free(crypto_bdev->crypto_bdev.name); + free(crypto_bdev); +} + +/* Wrapper for the bdev close operation. */ +static void +_vbdev_crypto_destruct(void *ctx) +{ + struct spdk_bdev_desc *desc = ctx; + + spdk_bdev_close(desc); +} + +/* Called after we've unregistered following a hot remove callback. + * Our finish entry point will be called next. + */ +static int +vbdev_crypto_destruct(void *ctx) +{ + struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx; + + /* Remove this device from the internal list */ + TAILQ_REMOVE(&g_vbdev_crypto, crypto_bdev, link); + + /* Unclaim the underlying bdev. */ + spdk_bdev_module_release_bdev(crypto_bdev->base_bdev); + + /* Close the underlying bdev on its same opened thread. */ + if (crypto_bdev->thread && crypto_bdev->thread != spdk_get_thread()) { + spdk_thread_send_msg(crypto_bdev->thread, _vbdev_crypto_destruct, crypto_bdev->base_desc); + } else { + spdk_bdev_close(crypto_bdev->base_desc); + } + + /* Unregister the io_device. */ + spdk_io_device_unregister(crypto_bdev, _device_unregister_cb); + + g_number_of_claimed_volumes--; + + return 0; +} + +/* We supplied this as an entry point for upper layers who want to communicate to this + * bdev. This is how they get a channel. We are passed the same context we provided when + * we created our crypto vbdev in examine() which, for this bdev, is the address of one of + * our context nodes. From here we'll ask the SPDK channel code to fill out our channel + * struct and we'll keep it in our crypto node. + */ +static struct spdk_io_channel * +vbdev_crypto_get_io_channel(void *ctx) +{ + struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx; + + /* The IO channel code will allocate a channel for us which consists of + * the SPDK channel structure plus the size of our crypto_io_channel struct + * that we passed in when we registered our IO device. It will then call + * our channel create callback to populate any elements that we need to + * update. 
+ */
+ return spdk_get_io_channel(crypto_bdev);
+}
+
+/* This is the output for bdev_get_bdevs() for this vbdev */
+static int
+vbdev_crypto_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx;
+
+ spdk_json_write_name(w, "crypto");
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(crypto_bdev->base_bdev));
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&crypto_bdev->crypto_bdev));
+ spdk_json_write_named_string(w, "crypto_pmd", crypto_bdev->drv_name);
+ spdk_json_write_named_string(w, "key", crypto_bdev->key);
+ if (strcmp(crypto_bdev->cipher, AES_XTS) == 0) {
+ spdk_json_write_named_string(w, "key2", crypto_bdev->key2);
+ }
+ spdk_json_write_named_string(w, "cipher", crypto_bdev->cipher);
+ spdk_json_write_object_end(w);
+ return 0;
+}
+
+static int
+vbdev_crypto_config_json(struct spdk_json_write_ctx *w)
+{
+ struct vbdev_crypto *crypto_bdev;
+
+ TAILQ_FOREACH(crypto_bdev, &g_vbdev_crypto, link) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "bdev_crypto_create");
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(crypto_bdev->base_bdev));
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&crypto_bdev->crypto_bdev));
+ spdk_json_write_named_string(w, "crypto_pmd", crypto_bdev->drv_name);
+ spdk_json_write_named_string(w, "key", crypto_bdev->key);
+ if (strcmp(crypto_bdev->cipher, AES_XTS) == 0) {
+ spdk_json_write_named_string(w, "key2", crypto_bdev->key2);
+ }
+ spdk_json_write_named_string(w, "cipher", crypto_bdev->cipher);
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+ }
+ return 0;
+}
+
+/* Helper function for the channel creation callback. */
+static void
+_assign_device_qp(struct vbdev_crypto *crypto_bdev, struct device_qp *device_qp,
+ struct crypto_io_channel *crypto_ch)
+{
+ pthread_mutex_lock(&g_device_qp_lock);
+ if (strcmp(crypto_bdev->drv_name, QAT) == 0) {
+ /* For some QAT devices, the optimal qp to use is every 32nd as this spreads the
+ * workload out over the multiple virtual functions in the device. For the devices
+ * where this isn't the case, it doesn't hurt.
+ */
+ TAILQ_FOREACH(device_qp, &g_device_qp_qat, link) {
+ if (device_qp->index != g_next_qat_index) {
+ continue;
+ }
+ if (device_qp->in_use == false) {
+ crypto_ch->device_qp = device_qp;
+ device_qp->in_use = true;
+ g_next_qat_index = (g_next_qat_index + QAT_VF_SPREAD) % g_qat_total_qp;
+ break;
+ } else {
+ /* If the preferred index is used, skip to the next one in this set. */
+ g_next_qat_index = (g_next_qat_index + 1) % g_qat_total_qp;
+ }
+ }
+ } else if (strcmp(crypto_bdev->drv_name, AESNI_MB) == 0) {
+ TAILQ_FOREACH(device_qp, &g_device_qp_aesni_mb, link) {
+ if (device_qp->in_use == false) {
+ crypto_ch->device_qp = device_qp;
+ device_qp->in_use = true;
+ break;
+ }
+ }
+ }
+ pthread_mutex_unlock(&g_device_qp_lock);
+}
+
+/* We provide this callback for the SPDK channel code to create a channel using
+ * the channel struct we provided in our module get_io_channel() entry point. Here
+ * we get and save off an underlying base channel of the device below us so that
+ * we can communicate with the base bdev on a per channel basis. We also register the
+ * poller used to complete crypto operations from the device.
+ */ +static int +crypto_bdev_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct crypto_io_channel *crypto_ch = ctx_buf; + struct vbdev_crypto *crypto_bdev = io_device; + struct device_qp *device_qp = NULL; + + crypto_ch->base_ch = spdk_bdev_get_io_channel(crypto_bdev->base_desc); + crypto_ch->poller = SPDK_POLLER_REGISTER(crypto_dev_poller, crypto_ch, 0); + crypto_ch->device_qp = NULL; + + /* Assign a device/qp combination that is unique per channel per PMD. */ + _assign_device_qp(crypto_bdev, device_qp, crypto_ch); + assert(crypto_ch->device_qp); + + /* We use this queue to track outstanding IO in our layer. */ + TAILQ_INIT(&crypto_ch->pending_cry_ios); + + /* We use this to queue up crypto ops when the device is busy. */ + TAILQ_INIT(&crypto_ch->queued_cry_ops); + + return 0; +} + +/* We provide this callback for the SPDK channel code to destroy a channel + * created with our create callback. We just need to undo anything we did + * when we created. + */ +static void +crypto_bdev_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct crypto_io_channel *crypto_ch = ctx_buf; + + pthread_mutex_lock(&g_device_qp_lock); + crypto_ch->device_qp->in_use = false; + pthread_mutex_unlock(&g_device_qp_lock); + + spdk_poller_unregister(&crypto_ch->poller); + spdk_put_io_channel(crypto_ch->base_ch); +} + +/* Create the association from the bdev and vbdev name and insert + * on the global list. */ +static int +vbdev_crypto_insert_name(const char *bdev_name, const char *vbdev_name, + const char *crypto_pmd, const char *key, + const char *cipher, const char *key2) +{ + struct bdev_names *name; + int rc, j; + bool found = false; + + TAILQ_FOREACH(name, &g_bdev_names, link) { + if (strcmp(vbdev_name, name->vbdev_name) == 0) { + SPDK_ERRLOG("crypto bdev %s already exists\n", vbdev_name); + return -EEXIST; + } + } + + name = calloc(1, sizeof(struct bdev_names)); + if (!name) { + SPDK_ERRLOG("could not allocate bdev_names\n"); + return -ENOMEM; + } + + name->bdev_name = strdup(bdev_name); + if (!name->bdev_name) { + SPDK_ERRLOG("could not allocate name->bdev_name\n"); + rc = -ENOMEM; + goto error_alloc_bname; + } + + name->vbdev_name = strdup(vbdev_name); + if (!name->vbdev_name) { + SPDK_ERRLOG("could not allocate name->vbdev_name\n"); + rc = -ENOMEM; + goto error_alloc_vname; + } + + name->drv_name = strdup(crypto_pmd); + if (!name->drv_name) { + SPDK_ERRLOG("could not allocate name->drv_name\n"); + rc = -ENOMEM; + goto error_alloc_dname; + } + for (j = 0; j < MAX_NUM_DRV_TYPES ; j++) { + if (strcmp(crypto_pmd, g_driver_names[j]) == 0) { + found = true; + break; + } + } + if (!found) { + SPDK_ERRLOG("invalid crypto PMD type %s\n", crypto_pmd); + rc = -EINVAL; + goto error_invalid_pmd; + } + + name->key = strdup(key); + if (!name->key) { + SPDK_ERRLOG("could not allocate name->key\n"); + rc = -ENOMEM; + goto error_alloc_key; + } + if (strnlen(name->key, (AES_CBC_KEY_LENGTH + 1)) != AES_CBC_KEY_LENGTH) { + SPDK_ERRLOG("invalid AES_CBC key length\n"); + rc = -EINVAL; + goto error_invalid_key; + } + + if (strncmp(cipher, AES_XTS, sizeof(AES_XTS)) == 0) { + /* To please scan-build, input validation makes sure we can't + * have this cipher without providing a key2. 
+ */ + name->cipher = AES_XTS; + assert(key2); + if (strnlen(key2, (AES_XTS_KEY_LENGTH + 1)) != AES_XTS_KEY_LENGTH) { + SPDK_ERRLOG("invalid AES_XTS key length\n"); + rc = -EINVAL; + goto error_invalid_key2; + } + + name->key2 = strdup(key2); + if (!name->key2) { + SPDK_ERRLOG("could not allocate name->key2\n"); + rc = -ENOMEM; + goto error_alloc_key2; + } + } else if (strncmp(cipher, AES_CBC, sizeof(AES_CBC)) == 0) { + name->cipher = AES_CBC; + } else { + SPDK_ERRLOG("Invalid cipher: %s\n", cipher); + rc = -EINVAL; + goto error_cipher; + } + + TAILQ_INSERT_TAIL(&g_bdev_names, name, link); + + return 0; + + /* Error cleanup paths. */ +error_cipher: + free(name->key2); +error_alloc_key2: +error_invalid_key2: +error_invalid_key: + free(name->key); +error_alloc_key: +error_invalid_pmd: + free(name->drv_name); +error_alloc_dname: + free(name->vbdev_name); +error_alloc_vname: + free(name->bdev_name); +error_alloc_bname: + free(name); + return rc; +} + +/* RPC entry point for crypto creation. */ +int +create_crypto_disk(const char *bdev_name, const char *vbdev_name, + const char *crypto_pmd, const char *key, + const char *cipher, const char *key2) +{ + struct spdk_bdev *bdev = NULL; + int rc = 0; + + bdev = spdk_bdev_get_by_name(bdev_name); + + rc = vbdev_crypto_insert_name(bdev_name, vbdev_name, crypto_pmd, key, cipher, key2); + if (rc) { + return rc; + } + + if (!bdev) { + SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n"); + return 0; + } + + rc = vbdev_crypto_claim(bdev); + if (rc) { + return rc; + } + + return rc; +} + +/* Called at driver init time, parses config file to prepare for examine calls, + * also fully initializes the crypto drivers. + */ +static int +vbdev_crypto_init(void) +{ + struct spdk_conf_section *sp = NULL; + const char *conf_bdev_name = NULL; + const char *conf_vbdev_name = NULL; + const char *crypto_pmd = NULL; + int i; + int rc = 0; + const char *key = NULL; + const char *cipher = NULL; + const char *key2 = NULL; + + /* Fully configure both SW and HW drivers. */ + rc = vbdev_crypto_init_crypto_drivers(); + if (rc) { + SPDK_ERRLOG("Error setting up crypto devices\n"); + return rc; + } + + sp = spdk_conf_find_section(NULL, "crypto"); + if (sp == NULL) { + return 0; + } + + for (i = 0; ; i++) { + + if (!spdk_conf_section_get_nval(sp, "CRY", i)) { + break; + } + + conf_bdev_name = spdk_conf_section_get_nmval(sp, "CRY", i, 0); + if (!conf_bdev_name) { + SPDK_ERRLOG("crypto configuration missing bdev name\n"); + return -EINVAL; + } + + conf_vbdev_name = spdk_conf_section_get_nmval(sp, "CRY", i, 1); + if (!conf_vbdev_name) { + SPDK_ERRLOG("crypto configuration missing crypto_bdev name\n"); + return -EINVAL; + } + + key = spdk_conf_section_get_nmval(sp, "CRY", i, 2); + if (!key) { + SPDK_ERRLOG("crypto configuration missing crypto_bdev key\n"); + return -EINVAL; + } + SPDK_NOTICELOG("WARNING: You are storing your key in a plain text file!!\n"); + + crypto_pmd = spdk_conf_section_get_nmval(sp, "CRY", i, 3); + if (!crypto_pmd) { + SPDK_ERRLOG("crypto configuration missing driver type\n"); + return -EINVAL; + } + + /* These are optional. 
*/
+ cipher = spdk_conf_section_get_nmval(sp, "CRY", i, 4);
+ if (cipher == NULL) {
+ cipher = AES_CBC;
+ }
+ key2 = spdk_conf_section_get_nmval(sp, "CRY", i, 5);
+
+ /* Note: config file options do not support QAT AES_XTS; use RPC */
+ rc = vbdev_crypto_insert_name(conf_bdev_name, conf_vbdev_name,
+ crypto_pmd, key, cipher, key2);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return rc;
+}
+
+/* Called when the entire module is being torn down. */
+static void
+vbdev_crypto_finish(void)
+{
+ struct bdev_names *name;
+ struct vbdev_dev *device;
+ struct device_qp *dev_qp;
+ unsigned i;
+ int rc;
+
+ while ((name = TAILQ_FIRST(&g_bdev_names))) {
+ TAILQ_REMOVE(&g_bdev_names, name, link);
+ free(name->drv_name);
+ free(name->key);
+ free(name->bdev_name);
+ free(name->vbdev_name);
+ free(name->key2);
+ free(name);
+ }
+
+ while ((device = TAILQ_FIRST(&g_vbdev_devs))) {
+ struct rte_cryptodev *rte_dev;
+
+ TAILQ_REMOVE(&g_vbdev_devs, device, link);
+ rte_cryptodev_stop(device->cdev_id);
+
+ assert(device->cdev_id < RTE_CRYPTO_MAX_DEVS);
+ rte_dev = &rte_cryptodevs[device->cdev_id];
+
+ if (rte_dev->dev_ops->queue_pair_release != NULL) {
+ for (i = 0; i < device->cdev_info.max_nb_queue_pairs; i++) {
+ rte_dev->dev_ops->queue_pair_release(rte_dev, i);
+ }
+ }
+ free(device);
+ }
+ rc = rte_vdev_uninit(AESNI_MB);
+ if (rc) {
+ SPDK_ERRLOG("%d from rte_vdev_uninit\n", rc);
+ }
+
+ while ((dev_qp = TAILQ_FIRST(&g_device_qp_qat))) {
+ TAILQ_REMOVE(&g_device_qp_qat, dev_qp, link);
+ free(dev_qp);
+ }
+
+ while ((dev_qp = TAILQ_FIRST(&g_device_qp_aesni_mb))) {
+ TAILQ_REMOVE(&g_device_qp_aesni_mb, dev_qp, link);
+ free(dev_qp);
+ }
+
+ rte_mempool_free(g_crypto_op_mp);
+ spdk_mempool_free(g_mbuf_mp);
+ rte_mempool_free(g_session_mp);
+ if (g_session_mp_priv != NULL) {
+ rte_mempool_free(g_session_mp_priv);
+ }
+}
+
+/* During init we'll be asked how much memory we'd like passed to us
+ * in bdev_io structures as context. Here's where we specify how
+ * much context we want per IO.
+ */
+static int
+vbdev_crypto_get_ctx_size(void)
+{
+ return sizeof(struct crypto_bdev_io);
+}
+
+/* Called when SPDK wants to save the current config of this vbdev module to
+ * a file.
+ */
+static void
+vbdev_crypto_get_spdk_running_config(FILE *fp)
+{
+ struct bdev_names *names = NULL;
+
+ fprintf(fp, "\n[crypto]\n");
+ TAILQ_FOREACH(names, &g_bdev_names, link) {
+ fprintf(fp, " crypto %s %s ", names->bdev_name, names->vbdev_name);
+ fprintf(fp, "\n");
+ }
+
+ fprintf(fp, "\n");
+}
+
+/* Called when the underlying base bdev goes away. */
+static void
+vbdev_crypto_examine_hotremove_cb(void *ctx)
+{
+ struct vbdev_crypto *crypto_bdev, *tmp;
+ struct spdk_bdev *bdev_find = ctx;
+
+ TAILQ_FOREACH_SAFE(crypto_bdev, &g_vbdev_crypto, link, tmp) {
+ if (bdev_find == crypto_bdev->base_bdev) {
+ spdk_bdev_unregister(&crypto_bdev->crypto_bdev, NULL, NULL);
+ }
+ }
+}
+
+static void
+vbdev_crypto_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ /* No config per bdev needed */
+}
+
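[Editor's aside: for reference, a legacy config stanza that the "[crypto]" parser above would accept might look like the following; the bdev name, vbdev name, and key are placeholders, and the key must be exactly 16 characters for AES_CBC:]

```
[crypto]
  # CRY <base bdev> <crypto vbdev> <key> <PMD> [cipher] [key2]
  CRY Malloc0 crypto_malloc0 0123456789123456 crypto_aesni_mb AES_CBC
```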
+/* When we register our bdev, this is how we specify our entry points. */
+static const struct spdk_bdev_fn_table vbdev_crypto_fn_table = {
+ .destruct = vbdev_crypto_destruct,
+ .submit_request = vbdev_crypto_submit_request,
+ .io_type_supported = vbdev_crypto_io_type_supported,
+ .get_io_channel = vbdev_crypto_get_io_channel,
+ .dump_info_json = vbdev_crypto_dump_info_json,
+ .write_config_json = vbdev_crypto_write_config_json
+};
+
+static struct spdk_bdev_module crypto_if = {
+ .name = "crypto",
+ .module_init = vbdev_crypto_init,
+ .config_text = vbdev_crypto_get_spdk_running_config,
+ .get_ctx_size = vbdev_crypto_get_ctx_size,
+ .examine_config = vbdev_crypto_examine,
+ .module_fini = vbdev_crypto_finish,
+ .config_json = vbdev_crypto_config_json
+};
+
+SPDK_BDEV_MODULE_REGISTER(crypto, &crypto_if)
+
+static int
+vbdev_crypto_claim(struct spdk_bdev *bdev)
+{
+ struct bdev_names *name;
+ struct vbdev_crypto *vbdev;
+ struct vbdev_dev *device;
+ bool found = false;
+ int rc = 0;
+
+ if (g_number_of_claimed_volumes >= MAX_CRYPTO_VOLUMES) {
+ SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "Reached max number of claimed volumes\n");
+ rc = -EINVAL;
+ goto error_vbdev_alloc;
+ }
+ g_number_of_claimed_volumes++;
+
+ /* Check our list of names from config versus this bdev and if
+ * there's a match, create the crypto_bdev & bdev accordingly.
+ */
+ TAILQ_FOREACH(name, &g_bdev_names, link) {
+ if (strcmp(name->bdev_name, bdev->name) != 0) {
+ continue;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "Match on %s\n", bdev->name);
+
+ vbdev = calloc(1, sizeof(struct vbdev_crypto));
+ if (!vbdev) {
+ SPDK_ERRLOG("could not allocate crypto_bdev\n");
+ rc = -ENOMEM;
+ goto error_vbdev_alloc;
+ }
+
+ /* The base bdev that we're attaching to. */
+ vbdev->base_bdev = bdev;
+ vbdev->crypto_bdev.name = strdup(name->vbdev_name);
+ if (!vbdev->crypto_bdev.name) {
+ SPDK_ERRLOG("could not allocate crypto_bdev name\n");
+ rc = -ENOMEM;
+ goto error_bdev_name;
+ }
+
+ vbdev->key = strdup(name->key);
+ if (!vbdev->key) {
+ SPDK_ERRLOG("could not allocate crypto_bdev key\n");
+ rc = -ENOMEM;
+ goto error_alloc_key;
+ }
+
+ if (name->key2) {
+ vbdev->key2 = strdup(name->key2);
+ if (!vbdev->key2) {
+ SPDK_ERRLOG("could not allocate crypto_bdev key2\n");
+ rc = -ENOMEM;
+ goto error_alloc_key2;
+ }
+ }
+
+ vbdev->drv_name = strdup(name->drv_name);
+ if (!vbdev->drv_name) {
+ SPDK_ERRLOG("could not allocate crypto_bdev drv_name\n");
+ rc = -ENOMEM;
+ goto error_drv_name;
+ }
+
+ vbdev->crypto_bdev.product_name = "crypto";
+ vbdev->crypto_bdev.write_cache = bdev->write_cache;
+ vbdev->cipher = AES_CBC;
+ if (strcmp(vbdev->drv_name, QAT) == 0) {
+ vbdev->crypto_bdev.required_alignment =
+ spdk_max(spdk_u32log2(bdev->blocklen), bdev->required_alignment);
+ SPDK_NOTICELOG("QAT in use: Required alignment set to %u\n",
+ vbdev->crypto_bdev.required_alignment);
+ if (strcmp(name->cipher, AES_CBC) == 0) {
+ SPDK_NOTICELOG("QAT using cipher: AES_CBC\n");
+ } else {
+ SPDK_NOTICELOG("QAT using cipher: AES_XTS\n");
+ vbdev->cipher = AES_XTS;
+ /* DPDK expects the keys to be concatenated together.
*/ + vbdev->xts_key = calloc(1, (AES_XTS_KEY_LENGTH * 2) + 1); + if (vbdev->xts_key == NULL) { + SPDK_ERRLOG("could not allocate memory for XTS key\n"); + rc = -ENOMEM; + goto error_xts_key; + } + memcpy(vbdev->xts_key, vbdev->key, AES_XTS_KEY_LENGTH); + assert(name->key2); + memcpy(vbdev->xts_key + AES_XTS_KEY_LENGTH, name->key2, AES_XTS_KEY_LENGTH + 1); + } + } else { + vbdev->crypto_bdev.required_alignment = bdev->required_alignment; + } + /* Note: CRYPTO_MAX_IO is in units of bytes, optimal_io_boundary is + * in units of blocks. + */ + if (bdev->optimal_io_boundary > 0) { + vbdev->crypto_bdev.optimal_io_boundary = + spdk_min((CRYPTO_MAX_IO / bdev->blocklen), bdev->optimal_io_boundary); + } else { + vbdev->crypto_bdev.optimal_io_boundary = (CRYPTO_MAX_IO / bdev->blocklen); + } + vbdev->crypto_bdev.split_on_optimal_io_boundary = true; + vbdev->crypto_bdev.blocklen = bdev->blocklen; + vbdev->crypto_bdev.blockcnt = bdev->blockcnt; + + /* This is the context that is passed to us when the bdev + * layer calls in so we'll save our crypto_bdev node here. + */ + vbdev->crypto_bdev.ctxt = vbdev; + vbdev->crypto_bdev.fn_table = &vbdev_crypto_fn_table; + vbdev->crypto_bdev.module = &crypto_if; + TAILQ_INSERT_TAIL(&g_vbdev_crypto, vbdev, link); + + spdk_io_device_register(vbdev, crypto_bdev_ch_create_cb, crypto_bdev_ch_destroy_cb, + sizeof(struct crypto_io_channel), vbdev->crypto_bdev.name); + + rc = spdk_bdev_open(bdev, true, vbdev_crypto_examine_hotremove_cb, + bdev, &vbdev->base_desc); + if (rc) { + SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(bdev)); + goto error_open; + } + + /* Save the thread where the base device is opened */ + vbdev->thread = spdk_get_thread(); + + rc = spdk_bdev_module_claim_bdev(bdev, vbdev->base_desc, vbdev->crypto_bdev.module); + if (rc) { + SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(bdev)); + goto error_claim; + } + + /* To init the session we have to get the cryptoDev device ID for this vbdev */ + TAILQ_FOREACH(device, &g_vbdev_devs, link) { + if (strcmp(device->cdev_info.driver_name, vbdev->drv_name) == 0) { + found = true; + break; + } + } + if (found == false) { + SPDK_ERRLOG("ERROR can't match crypto device driver to crypto vbdev!\n"); + rc = -EINVAL; + goto error_cant_find_devid; + } + + /* Get sessions. */ + vbdev->session_encrypt = rte_cryptodev_sym_session_create(g_session_mp); + if (NULL == vbdev->session_encrypt) { + SPDK_ERRLOG("ERROR trying to create crypto session!\n"); + rc = -EINVAL; + goto error_session_en_create; + } + + vbdev->session_decrypt = rte_cryptodev_sym_session_create(g_session_mp); + if (NULL == vbdev->session_decrypt) { + SPDK_ERRLOG("ERROR trying to create crypto session!\n"); + rc = -EINVAL; + goto error_session_de_create; + } + + /* Init our per vbdev xform with the desired cipher options. 
*/ + vbdev->cipher_xform.type = RTE_CRYPTO_SYM_XFORM_CIPHER; + vbdev->cipher_xform.cipher.iv.offset = IV_OFFSET; + if (strcmp(name->cipher, AES_CBC) == 0) { + vbdev->cipher_xform.cipher.key.data = vbdev->key; + vbdev->cipher_xform.cipher.algo = RTE_CRYPTO_CIPHER_AES_CBC; + vbdev->cipher_xform.cipher.key.length = AES_CBC_KEY_LENGTH; + } else { + vbdev->cipher_xform.cipher.key.data = vbdev->xts_key; + vbdev->cipher_xform.cipher.algo = RTE_CRYPTO_CIPHER_AES_XTS; + vbdev->cipher_xform.cipher.key.length = AES_XTS_KEY_LENGTH * 2; + } + vbdev->cipher_xform.cipher.iv.length = AES_CBC_IV_LENGTH; + + vbdev->cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT; + rc = rte_cryptodev_sym_session_init(device->cdev_id, vbdev->session_encrypt, + &vbdev->cipher_xform, + g_session_mp_priv ? g_session_mp_priv : g_session_mp); + if (rc < 0) { + SPDK_ERRLOG("ERROR trying to init encrypt session!\n"); + rc = -EINVAL; + goto error_session_init; + } + + vbdev->cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT; + rc = rte_cryptodev_sym_session_init(device->cdev_id, vbdev->session_decrypt, + &vbdev->cipher_xform, + g_session_mp_priv ? g_session_mp_priv : g_session_mp); + if (rc < 0) { + SPDK_ERRLOG("ERROR trying to init decrypt session!\n"); + rc = -EINVAL; + goto error_session_init; + } + + rc = spdk_bdev_register(&vbdev->crypto_bdev); + if (rc < 0) { + SPDK_ERRLOG("ERROR trying to register bdev\n"); + rc = -EINVAL; + goto error_bdev_register; + } + SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "registered io_device and virtual bdev for: %s\n", + name->vbdev_name); + break; + } + + return rc; + + /* Error cleanup paths. */ +error_bdev_register: +error_session_init: + rte_cryptodev_sym_session_free(vbdev->session_decrypt); +error_session_de_create: + rte_cryptodev_sym_session_free(vbdev->session_encrypt); +error_session_en_create: +error_cant_find_devid: +error_claim: + spdk_bdev_close(vbdev->base_desc); +error_open: + TAILQ_REMOVE(&g_vbdev_crypto, vbdev, link); + spdk_io_device_unregister(vbdev, NULL); + free(vbdev->xts_key); +error_xts_key: + free(vbdev->drv_name); +error_drv_name: + free(vbdev->key2); +error_alloc_key2: + free(vbdev->key); +error_alloc_key: + free(vbdev->crypto_bdev.name); +error_bdev_name: + free(vbdev); +error_vbdev_alloc: + g_number_of_claimed_volumes--; + return rc; +} + +/* RPC entry for deleting a crypto vbdev. */ +void +delete_crypto_disk(struct spdk_bdev *bdev, spdk_delete_crypto_complete cb_fn, + void *cb_arg) +{ + struct bdev_names *name; + + if (!bdev || bdev->module != &crypto_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + /* Remove the association (vbdev, bdev) from g_bdev_names. This is required so that the + * vbdev does not get re-created if the same bdev is constructed at some other time, + * unless the underlying bdev was hot-removed. + */ + TAILQ_FOREACH(name, &g_bdev_names, link) { + if (strcmp(name->vbdev_name, bdev->name) == 0) { + TAILQ_REMOVE(&g_bdev_names, name, link); + free(name->bdev_name); + free(name->vbdev_name); + free(name->drv_name); + free(name->key); + free(name->key2); + free(name); + break; + } + } + + /* Additional cleanup happens in the destruct callback. */ + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +/* Because we specified this function in our crypto bdev function table when we + * registered our crypto bdev, we'll get this call anytime a new bdev shows up. + * Here we need to decide if we care about it and if so what to do. 
We + * parsed the config file at init so we check the new bdev against the list + * we built up at that time, and if the user configured us to attach to this + * bdev, here's where we do it. + */ +static void +vbdev_crypto_examine(struct spdk_bdev *bdev) +{ + vbdev_crypto_claim(bdev); + spdk_bdev_module_examine_done(&crypto_if); +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_crypto", SPDK_LOG_CRYPTO) diff --git a/src/spdk/module/bdev/crypto/vbdev_crypto.h b/src/spdk/module/bdev/crypto/vbdev_crypto.h new file mode 100644 index 000000000..458b29c6b --- /dev/null +++ b/src/spdk/module/bdev/crypto/vbdev_crypto.h @@ -0,0 +1,78 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VBDEV_CRYPTO_H +#define SPDK_VBDEV_CRYPTO_H + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +#include "spdk/bdev.h" + +#define AESNI_MB "crypto_aesni_mb" +#define QAT "crypto_qat" + +/* Supported ciphers */ +#define AES_CBC "AES_CBC" /* QAT and AESNI_MB */ +#define AES_XTS "AES_XTS" /* QAT only */ + +typedef void (*spdk_delete_crypto_complete)(void *cb_arg, int bdeverrno); + +/** + * Create new crypto bdev. + * + * \param bdev_name Name of the bdev on which the crypto vbdev will be created. + * \param vbdev_name Name of the new crypto vbdev. + * \param crypto_pmd Name of the polled mode driver to use for this vbdev. + * \param key The key to use for this vbdev. + * \param cipher The cipher to use for this vbdev. + * \param key2 The 2nd key to use for the AES_XTS cipher. + * \return 0 on success, other on failure. + */ +int create_crypto_disk(const char *bdev_name, const char *vbdev_name, + const char *crypto_pmd, const char *key, + const char *cipher, const char *key2); + +/** + * Delete crypto bdev. + * + * \param bdev Pointer to crypto bdev. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn.
+ */ +void delete_crypto_disk(struct spdk_bdev *bdev, spdk_delete_crypto_complete cb_fn, + void *cb_arg); + +#endif /* SPDK_VBDEV_CRYPTO_H */ diff --git a/src/spdk/module/bdev/crypto/vbdev_crypto_rpc.c b/src/spdk/module/bdev/crypto/vbdev_crypto_rpc.c new file mode 100644 index 000000000..46c1e210d --- /dev/null +++ b/src/spdk/module/bdev/crypto/vbdev_crypto_rpc.c @@ -0,0 +1,195 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vbdev_crypto.h" + +/* Structure to hold the parameters for this RPC method. */ +struct rpc_construct_crypto { + char *base_bdev_name; + char *name; + char *crypto_pmd; + char *key; + char *cipher; + char *key2; +}; + +/* Free the allocated memory resource after the RPC handling. */ +static void +free_rpc_construct_crypto(struct rpc_construct_crypto *r) +{ + free(r->base_bdev_name); + free(r->name); + free(r->crypto_pmd); + free(r->key); + free(r->cipher); + free(r->key2); +} + +/* Structure to decode the input parameters for this RPC method. */ +static const struct spdk_json_object_decoder rpc_construct_crypto_decoders[] = { + {"base_bdev_name", offsetof(struct rpc_construct_crypto, base_bdev_name), spdk_json_decode_string}, + {"name", offsetof(struct rpc_construct_crypto, name), spdk_json_decode_string}, + {"crypto_pmd", offsetof(struct rpc_construct_crypto, crypto_pmd), spdk_json_decode_string}, + {"key", offsetof(struct rpc_construct_crypto, key), spdk_json_decode_string}, + {"cipher", offsetof(struct rpc_construct_crypto, cipher), spdk_json_decode_string, true}, + {"key2", offsetof(struct rpc_construct_crypto, key2), spdk_json_decode_string, true}, +}; + +/* Decode the parameters for this RPC method and properly construct the crypto + * device. Error status returned in the failed cases. 
+ */ +static void +rpc_bdev_crypto_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_crypto req = {NULL}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_crypto_decoders, + SPDK_COUNTOF(rpc_construct_crypto_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto cleanup; + } + + if (req.cipher == NULL) { + req.cipher = strdup(AES_CBC); + if (req.cipher == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to allocate memory for req.cipher"); + goto cleanup; + } + } + + if (strcmp(req.cipher, AES_XTS) != 0 && strcmp(req.cipher, AES_CBC) != 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid cipher: %s", + req.cipher); + goto cleanup; + } + + if (strcmp(req.crypto_pmd, AESNI_MB) == 0 && strcmp(req.cipher, AES_XTS) == 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid cipher. AES_XTS is only available on QAT."); + goto cleanup; + } + + if (strcmp(req.cipher, AES_XTS) == 0 && req.key2 == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid key. A 2nd key is needed for AES_XTS."); + goto cleanup; + } + + if (strcmp(req.cipher, AES_CBC) == 0 && req.key2 != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid key. A 2nd key is needed only for AES_XTS."); + goto cleanup; + } + + rc = create_crypto_disk(req.base_bdev_name, req.name, + req.crypto_pmd, req.key, req.cipher, req.key2); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, req.name); + spdk_jsonrpc_end_result(request, w); + free_rpc_construct_crypto(&req); + return; + +cleanup: + free_rpc_construct_crypto(&req); +} +SPDK_RPC_REGISTER("bdev_crypto_create", rpc_bdev_crypto_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_crypto_create, construct_crypto_bdev) + +struct rpc_delete_crypto { + char *name; +}; + +static void +free_rpc_delete_crypto(struct rpc_delete_crypto *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_crypto_decoders[] = { + {"name", offsetof(struct rpc_delete_crypto, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_crypto_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_crypto_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_crypto req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_delete_crypto_decoders, + SPDK_COUNTOF(rpc_delete_crypto_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + delete_crypto_disk(bdev, rpc_bdev_crypto_delete_cb, request); + + free_rpc_delete_crypto(&req); + + return; + +cleanup: + free_rpc_delete_crypto(&req); +} 
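+ +/* A minimal usage sketch, with a hypothetical vbdev name: once a crypto vbdev + * named "crypto_ram0" is registered, the handler above accepts a JSON-RPC + * request such as + * + * { "jsonrpc": "2.0", "id": 1, "method": "bdev_crypto_delete", + * "params": { "name": "crypto_ram0" } } + * + * and replies with a boolean result from rpc_bdev_crypto_delete_cb() once + * spdk_bdev_unregister() has finished tearing the vbdev down. + */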
+SPDK_RPC_REGISTER("bdev_crypto_delete", rpc_bdev_crypto_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_crypto_delete, delete_crypto_bdev) diff --git a/src/spdk/module/bdev/delay/Makefile b/src/spdk/module/bdev/delay/Makefile new file mode 100644 index 000000000..f043ca5a8 --- /dev/null +++ b/src/spdk/module/bdev/delay/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ + +C_SRCS = vbdev_delay.c vbdev_delay_rpc.c +LIBNAME = bdev_delay + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/delay/vbdev_delay.c b/src/spdk/module/bdev/delay/vbdev_delay.c new file mode 100644 index 000000000..b4ea1b413 --- /dev/null +++ b/src/spdk/module/bdev/delay/vbdev_delay.c @@ -0,0 +1,851 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "vbdev_delay.h" +#include "spdk/rpc.h" +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + + +static int vbdev_delay_init(void); +static int vbdev_delay_get_ctx_size(void); +static void vbdev_delay_examine(struct spdk_bdev *bdev); +static void vbdev_delay_finish(void); +static int vbdev_delay_config_json(struct spdk_json_write_ctx *w); + +static struct spdk_bdev_module delay_if = { + .name = "delay", + .module_init = vbdev_delay_init, + .config_text = NULL, + .get_ctx_size = vbdev_delay_get_ctx_size, + .examine_config = vbdev_delay_examine, + .module_fini = vbdev_delay_finish, + .config_json = vbdev_delay_config_json +}; + +SPDK_BDEV_MODULE_REGISTER(delay, &delay_if) + +/* Associative list to be used in examine */ +struct bdev_association { + char *vbdev_name; + char *bdev_name; + uint64_t avg_read_latency; + uint64_t p99_read_latency; + uint64_t avg_write_latency; + uint64_t p99_write_latency; + TAILQ_ENTRY(bdev_association) link; +}; +static TAILQ_HEAD(, bdev_association) g_bdev_associations = TAILQ_HEAD_INITIALIZER( + g_bdev_associations); + +/* List of virtual bdevs and associated info for each. */ +struct vbdev_delay { + struct spdk_bdev *base_bdev; /* the thing we're attaching to */ + struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */ + struct spdk_bdev delay_bdev; /* the delay virtual bdev */ + uint64_t average_read_latency_ticks; /* the average read delay */ + uint64_t p99_read_latency_ticks; /* the p99 read delay */ + uint64_t average_write_latency_ticks; /* the average write delay */ + uint64_t p99_write_latency_ticks; /* the p99 write delay */ + TAILQ_ENTRY(vbdev_delay) link; + struct spdk_thread *thread; /* thread where base device is opened */ +}; +static TAILQ_HEAD(, vbdev_delay) g_delay_nodes = TAILQ_HEAD_INITIALIZER(g_delay_nodes); + +struct delay_bdev_io { + int status; + + uint64_t completion_tick; + + enum delay_io_type type; + + struct spdk_io_channel *ch; + + struct spdk_bdev_io_wait_entry bdev_io_wait; + + STAILQ_ENTRY(delay_bdev_io) link; +}; + +struct delay_io_channel { + struct spdk_io_channel *base_ch; /* IO channel of base device */ + STAILQ_HEAD(, delay_bdev_io) avg_read_io; + STAILQ_HEAD(, delay_bdev_io) p99_read_io; + STAILQ_HEAD(, delay_bdev_io) avg_write_io; + STAILQ_HEAD(, delay_bdev_io) p99_write_io; + struct spdk_poller *io_poller; + unsigned int rand_seed; +}; + +static void +vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io); + + +/* Callback for unregistering the IO device. 
*/ +static void +_device_unregister_cb(void *io_device) +{ + struct vbdev_delay *delay_node = io_device; + + /* Done with this delay_node. */ + free(delay_node->delay_bdev.name); + free(delay_node); +} + +static void +_vbdev_delay_destruct(void *ctx) +{ + struct spdk_bdev_desc *desc = ctx; + + spdk_bdev_close(desc); +} + +static int +vbdev_delay_destruct(void *ctx) +{ + struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx; + + /* It is important to follow this exact sequence of steps for destroying + * a vbdev... + */ + + TAILQ_REMOVE(&g_delay_nodes, delay_node, link); + + /* Unclaim the underlying bdev. */ + spdk_bdev_module_release_bdev(delay_node->base_bdev); + + /* Close the underlying bdev on its same opened thread. */ + if (delay_node->thread && delay_node->thread != spdk_get_thread()) { + spdk_thread_send_msg(delay_node->thread, _vbdev_delay_destruct, delay_node->base_desc); + } else { + spdk_bdev_close(delay_node->base_desc); + } + + /* Unregister the io_device. */ + spdk_io_device_unregister(delay_node, _device_unregister_cb); + + return 0; +} + +static int +_process_io_stailq(void *arg, uint64_t ticks) +{ + STAILQ_HEAD(, delay_bdev_io) *head = arg; + struct delay_bdev_io *io_ctx, *tmp; + int completions = 0; + + STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) { + if (io_ctx->completion_tick <= ticks) { + STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link); + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), io_ctx->status); + completions++; + } else { + /* In the general case, I/O will become ready in FIFO order. When timeouts are dynamically + * changed, this is not necessarily the case. However, the normal behavior will be restored + * after the I/O that was outstanding at the time of the change has completed. + * This essentially means that moving from a high to a low latency creates a dam for the new I/O + * submitted after the latency change. This is considered desirable behavior for the use case where + * we are trying to trigger a pre-defined timeout on an initiator. + */ + break; + } + } + + return completions; +} + +static int +_delay_finish_io(void *arg) +{ + struct delay_io_channel *delay_ch = arg; + uint64_t ticks = spdk_get_ticks(); + int completions = 0; + + completions += _process_io_stailq(&delay_ch->avg_read_io, ticks); + completions += _process_io_stailq(&delay_ch->avg_write_io, ticks); + completions += _process_io_stailq(&delay_ch->p99_read_io, ticks); + completions += _process_io_stailq(&delay_ch->p99_write_io, ticks); + + return completions == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; +} + +/* Completion callback for I/O that was issued from this bdev. The original bdev_io + * is passed in as an arg so we'll complete that one with the appropriate status + * and then free the one that this module issued. + */ +static void +_delay_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + struct vbdev_delay *delay_node = SPDK_CONTAINEROF(orig_io->bdev, struct vbdev_delay, delay_bdev); + struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)orig_io->driver_ctx; + struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch); + + io_ctx->status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + spdk_bdev_free_io(bdev_io); + + /* Put the I/O into the proper list for processing by the channel poller.
*/ + switch (io_ctx->type) { + case DELAY_AVG_READ: + io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_read_latency_ticks; + STAILQ_INSERT_TAIL(&delay_ch->avg_read_io, io_ctx, link); + break; + case DELAY_AVG_WRITE: + io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_write_latency_ticks; + STAILQ_INSERT_TAIL(&delay_ch->avg_write_io, io_ctx, link); + break; + case DELAY_P99_READ: + io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_read_latency_ticks; + STAILQ_INSERT_TAIL(&delay_ch->p99_read_io, io_ctx, link); + break; + case DELAY_P99_WRITE: + io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_write_latency_ticks; + STAILQ_INSERT_TAIL(&delay_ch->p99_write_io, io_ctx, link); + break; + case DELAY_NONE: + default: + spdk_bdev_io_complete(orig_io, io_ctx->status); + break; + } +} + +static void +vbdev_delay_resubmit_io(void *arg) +{ + struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg; + struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx; + + vbdev_delay_submit_request(io_ctx->ch, bdev_io); +} + +static void +vbdev_delay_queue_io(struct spdk_bdev_io *bdev_io) +{ + struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx; + struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch); + int rc; + + io_ctx->bdev_io_wait.bdev = bdev_io->bdev; + io_ctx->bdev_io_wait.cb_fn = vbdev_delay_resubmit_io; + io_ctx->bdev_io_wait.cb_arg = bdev_io; + + rc = spdk_bdev_queue_io_wait(bdev_io->bdev, delay_ch->base_ch, &io_ctx->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vbdev_delay_queue_io, rc=%d.\n", rc); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +delay_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay, + delay_bdev); + struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch); + int rc; + + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + rc = spdk_bdev_readv_blocks(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, _delay_complete_io, + bdev_io); + + if (rc == -ENOMEM) { + SPDK_ERRLOG("No memory, start to queue io for delay.\n"); + vbdev_delay_queue_io(bdev_io); + } else if (rc != 0) { + SPDK_ERRLOG("ERROR on bdev_io submission!\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +vbdev_delay_reset_dev(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch); + struct vbdev_delay *delay_node = spdk_io_channel_iter_get_io_device(i); + int rc; + + rc = spdk_bdev_reset(delay_node->base_desc, delay_ch->base_ch, + _delay_complete_io, bdev_io); + + if (rc == -ENOMEM) { + SPDK_ERRLOG("No memory, start to queue io for delay.\n"); + vbdev_delay_queue_io(bdev_io); + } else if (rc != 0) { + SPDK_ERRLOG("ERROR on bdev_io submission!\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +_abort_all_delayed_io(void *arg) +{ + STAILQ_HEAD(, delay_bdev_io) *head = arg; + struct delay_bdev_io *io_ctx, *tmp; + + STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) { + STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link); + 
spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), SPDK_BDEV_IO_STATUS_ABORTED); + } +} + +static void +vbdev_delay_reset_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch); + + _abort_all_delayed_io(&delay_ch->avg_read_io); + _abort_all_delayed_io(&delay_ch->avg_write_io); + _abort_all_delayed_io(&delay_ch->p99_read_io); + _abort_all_delayed_io(&delay_ch->p99_write_io); + + spdk_for_each_channel_continue(i, 0); +} + +static bool +abort_delayed_io(void *_head, struct spdk_bdev_io *bio_to_abort) +{ + STAILQ_HEAD(, delay_bdev_io) *head = _head; + struct delay_bdev_io *io_ctx_to_abort = (struct delay_bdev_io *)bio_to_abort->driver_ctx; + struct delay_bdev_io *io_ctx; + + STAILQ_FOREACH(io_ctx, head, link) { + if (io_ctx == io_ctx_to_abort) { + STAILQ_REMOVE(head, io_ctx_to_abort, delay_bdev_io, link); + spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); + return true; + } + } + + return false; +} + +static int +vbdev_delay_abort(struct vbdev_delay *delay_node, struct delay_io_channel *delay_ch, + struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; + + if (abort_delayed_io(&delay_ch->avg_read_io, bio_to_abort) || + abort_delayed_io(&delay_ch->avg_write_io, bio_to_abort) || + abort_delayed_io(&delay_ch->p99_read_io, bio_to_abort) || + abort_delayed_io(&delay_ch->p99_write_io, bio_to_abort)) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; + } + + return spdk_bdev_abort(delay_node->base_desc, delay_ch->base_ch, bio_to_abort, + _delay_complete_io, bdev_io); +} + +static void +vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay, delay_bdev); + struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch); + struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx; + int rc = 0; + bool is_p99; + + is_p99 = rand_r(&delay_ch->rand_seed) % 100 == 0 ? true : false; + + io_ctx->ch = ch; + io_ctx->type = DELAY_NONE; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ; + spdk_bdev_io_get_buf(bdev_io, delay_read_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE; + rc = spdk_bdev_writev_blocks(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, _delay_complete_io, + bdev_io); + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + rc = spdk_bdev_write_zeroes_blocks(delay_node->base_desc, delay_ch->base_ch, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + _delay_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + rc = spdk_bdev_unmap_blocks(delay_node->base_desc, delay_ch->base_ch, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + _delay_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + rc = spdk_bdev_flush_blocks(delay_node->base_desc, delay_ch->base_ch, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + _delay_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_RESET: + /* During reset, the generic bdev layer aborts all new I/Os and queues all new resets. 
+ * Hence we can simply abort all I/Os delayed to complete. + */ + spdk_for_each_channel(delay_node, vbdev_delay_reset_channel, bdev_io, + vbdev_delay_reset_dev); + break; + case SPDK_BDEV_IO_TYPE_ABORT: + rc = vbdev_delay_abort(delay_node, delay_ch, bdev_io); + break; + default: + SPDK_ERRLOG("delay: unknown I/O type %d\n", bdev_io->type); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + if (rc == -ENOMEM) { + SPDK_ERRLOG("No memory, start to queue io for delay.\n"); + vbdev_delay_queue_io(bdev_io); + } else if (rc != 0) { + SPDK_ERRLOG("ERROR on bdev_io submission!\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +vbdev_delay_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx; + + if (io_type == SPDK_BDEV_IO_TYPE_ZCOPY) { + return false; + } else { + return spdk_bdev_io_type_supported(delay_node->base_bdev, io_type); + } +} + +static struct spdk_io_channel * +vbdev_delay_get_io_channel(void *ctx) +{ + struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx; + struct spdk_io_channel *delay_ch = NULL; + + delay_ch = spdk_get_io_channel(delay_node); + + return delay_ch; +} + +static void +_delay_write_conf_values(struct vbdev_delay *delay_node, struct spdk_json_write_ctx *w) +{ + spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&delay_node->delay_bdev)); + spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(delay_node->base_bdev)); + spdk_json_write_named_int64(w, "avg_read_latency", + delay_node->average_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz()); + spdk_json_write_named_int64(w, "p99_read_latency", + delay_node->p99_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz()); + spdk_json_write_named_int64(w, "avg_write_latency", + delay_node->average_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz()); + spdk_json_write_named_int64(w, "p99_write_latency", + delay_node->p99_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz()); +} + +static int +vbdev_delay_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx; + + spdk_json_write_name(w, "delay"); + spdk_json_write_object_begin(w); + _delay_write_conf_values(delay_node, w); + spdk_json_write_object_end(w); + + return 0; +} + +/* This is used to generate JSON that can configure this module to its current state. */ +static int +vbdev_delay_config_json(struct spdk_json_write_ctx *w) +{ + struct vbdev_delay *delay_node; + + TAILQ_FOREACH(delay_node, &g_delay_nodes, link) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_delay_create"); + spdk_json_write_named_object_begin(w, "params"); + _delay_write_conf_values(delay_node, w); + spdk_json_write_object_end(w); + } + return 0; +} + +/* We provide this callback for the SPDK channel code to create a channel using + * the channel struct we provided in our module get_io_channel() entry point. Here + * we get and save off an underlying base channel of the device below us so that + * we can communicate with the base bdev on a per channel basis. If we needed + * our own poller for this vbdev, we'd register it here. 
+ */ +static int +delay_bdev_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct delay_io_channel *delay_ch = ctx_buf; + struct vbdev_delay *delay_node = io_device; + + STAILQ_INIT(&delay_ch->avg_read_io); + STAILQ_INIT(&delay_ch->p99_read_io); + STAILQ_INIT(&delay_ch->avg_write_io); + STAILQ_INIT(&delay_ch->p99_write_io); + + delay_ch->io_poller = SPDK_POLLER_REGISTER(_delay_finish_io, delay_ch, 0); + delay_ch->base_ch = spdk_bdev_get_io_channel(delay_node->base_desc); + delay_ch->rand_seed = time(NULL); + + return 0; +} + +/* We provide this callback for the SPDK channel code to destroy a channel + * created with our create callback. We just need to undo anything we did + * when we created it. If this bdev used its own poller, we'd unregister it here. + */ +static void +delay_bdev_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct delay_io_channel *delay_ch = ctx_buf; + + spdk_poller_unregister(&delay_ch->io_poller); + spdk_put_io_channel(delay_ch->base_ch); +} + +/* Create the delay association from the bdev and vbdev name and insert + * on the global list. */ +static int +vbdev_delay_insert_association(const char *bdev_name, const char *vbdev_name, + uint64_t avg_read_latency, uint64_t p99_read_latency, + uint64_t avg_write_latency, uint64_t p99_write_latency) +{ + struct bdev_association *assoc; + + TAILQ_FOREACH(assoc, &g_bdev_associations, link) { + if (strcmp(vbdev_name, assoc->vbdev_name) == 0) { + SPDK_ERRLOG("delay bdev %s already exists\n", vbdev_name); + return -EEXIST; + } + } + + assoc = calloc(1, sizeof(struct bdev_association)); + if (!assoc) { + SPDK_ERRLOG("could not allocate bdev_association\n"); + return -ENOMEM; + } + + assoc->bdev_name = strdup(bdev_name); + if (!assoc->bdev_name) { + SPDK_ERRLOG("could not allocate assoc->bdev_name\n"); + free(assoc); + return -ENOMEM; + } + + assoc->vbdev_name = strdup(vbdev_name); + if (!assoc->vbdev_name) { + SPDK_ERRLOG("could not allocate assoc->vbdev_name\n"); + free(assoc->bdev_name); + free(assoc); + return -ENOMEM; + } + + assoc->avg_read_latency = avg_read_latency; + assoc->p99_read_latency = p99_read_latency; + assoc->avg_write_latency = avg_write_latency; + assoc->p99_write_latency = p99_write_latency; + + TAILQ_INSERT_TAIL(&g_bdev_associations, assoc, link); + + return 0; +} + +int +vbdev_delay_update_latency_value(char *delay_name, uint64_t latency_us, enum delay_io_type type) +{ + struct spdk_bdev *delay_bdev; + struct vbdev_delay *delay_node; + uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; + + delay_bdev = spdk_bdev_get_by_name(delay_name); + if (delay_bdev == NULL) { + return -ENODEV; + } else if (delay_bdev->module != &delay_if) { + return -EINVAL; + } + + delay_node = SPDK_CONTAINEROF(delay_bdev, struct vbdev_delay, delay_bdev); + + switch (type) { + case DELAY_AVG_READ: + delay_node->average_read_latency_ticks = ticks_mhz * latency_us; + break; + case DELAY_AVG_WRITE: + delay_node->average_write_latency_ticks = ticks_mhz * latency_us; + break; + case DELAY_P99_READ: + delay_node->p99_read_latency_ticks = ticks_mhz * latency_us; + break; + case DELAY_P99_WRITE: + delay_node->p99_write_latency_ticks = ticks_mhz * latency_us; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +vbdev_delay_init(void) +{ + /* Not allowing for .ini style configuration.
*/ + return 0; +} + +static void +vbdev_delay_finish(void) +{ + struct bdev_association *assoc; + + while ((assoc = TAILQ_FIRST(&g_bdev_associations))) { + TAILQ_REMOVE(&g_bdev_associations, assoc, link); + free(assoc->bdev_name); + free(assoc->vbdev_name); + free(assoc); + } +} + +static int +vbdev_delay_get_ctx_size(void) +{ + return sizeof(struct delay_bdev_io); +} + +static void +vbdev_delay_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* No config per bdev needed */ +} + +/* When we register our bdev this is how we specify our entry points. */ +static const struct spdk_bdev_fn_table vbdev_delay_fn_table = { + .destruct = vbdev_delay_destruct, + .submit_request = vbdev_delay_submit_request, + .io_type_supported = vbdev_delay_io_type_supported, + .get_io_channel = vbdev_delay_get_io_channel, + .dump_info_json = vbdev_delay_dump_info_json, + .write_config_json = vbdev_delay_write_config_json, +}; + +/* Called when the underlying base bdev goes away. */ +static void +vbdev_delay_base_bdev_hotremove_cb(void *ctx) +{ + struct vbdev_delay *delay_node, *tmp; + struct spdk_bdev *bdev_find = ctx; + + TAILQ_FOREACH_SAFE(delay_node, &g_delay_nodes, link, tmp) { + if (bdev_find == delay_node->base_bdev) { + spdk_bdev_unregister(&delay_node->delay_bdev, NULL, NULL); + } + } +} + +/* Create and register the delay vbdev if we find it in our list of bdev names. + * This can be called either by the examine path or RPC method. + */ +static int +vbdev_delay_register(struct spdk_bdev *bdev) +{ + struct bdev_association *assoc; + struct vbdev_delay *delay_node; + uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; + int rc = 0; + + /* Check our list of names from config versus this bdev and if + * there's a match, create the delay_node & bdev accordingly. + */ + TAILQ_FOREACH(assoc, &g_bdev_associations, link) { + if (strcmp(assoc->bdev_name, bdev->name) != 0) { + continue; + } + + delay_node = calloc(1, sizeof(struct vbdev_delay)); + if (!delay_node) { + rc = -ENOMEM; + SPDK_ERRLOG("could not allocate delay_node\n"); + break; + } + + /* The base bdev that we're attaching to. */ + delay_node->base_bdev = bdev; + delay_node->delay_bdev.name = strdup(assoc->vbdev_name); + if (!delay_node->delay_bdev.name) { + rc = -ENOMEM; + SPDK_ERRLOG("could not allocate delay_bdev name\n"); + free(delay_node); + break; + } + delay_node->delay_bdev.product_name = "delay"; + + delay_node->delay_bdev.write_cache = bdev->write_cache; + delay_node->delay_bdev.required_alignment = bdev->required_alignment; + delay_node->delay_bdev.optimal_io_boundary = bdev->optimal_io_boundary; + delay_node->delay_bdev.blocklen = bdev->blocklen; + delay_node->delay_bdev.blockcnt = bdev->blockcnt; + + delay_node->delay_bdev.ctxt = delay_node; + delay_node->delay_bdev.fn_table = &vbdev_delay_fn_table; + delay_node->delay_bdev.module = &delay_if; + + /* Store the number of ticks you need to add to get the I/O expiration time. 
*/ + delay_node->average_read_latency_ticks = ticks_mhz * assoc->avg_read_latency; + delay_node->p99_read_latency_ticks = ticks_mhz * assoc->p99_read_latency; + delay_node->average_write_latency_ticks = ticks_mhz * assoc->avg_write_latency; + delay_node->p99_write_latency_ticks = ticks_mhz * assoc->p99_write_latency; + + spdk_io_device_register(delay_node, delay_bdev_ch_create_cb, delay_bdev_ch_destroy_cb, + sizeof(struct delay_io_channel), + assoc->vbdev_name); + + rc = spdk_bdev_open(bdev, true, vbdev_delay_base_bdev_hotremove_cb, + bdev, &delay_node->base_desc); + if (rc) { + SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(bdev)); + goto error_unregister; + } + + /* Save the thread where the base device is opened */ + delay_node->thread = spdk_get_thread(); + + rc = spdk_bdev_module_claim_bdev(bdev, delay_node->base_desc, delay_node->delay_bdev.module); + if (rc) { + SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(bdev)); + goto error_close; + } + + rc = spdk_bdev_register(&delay_node->delay_bdev); + if (rc) { + SPDK_ERRLOG("could not register delay_bdev\n"); + spdk_bdev_module_release_bdev(delay_node->base_bdev); + goto error_close; + } + + TAILQ_INSERT_TAIL(&g_delay_nodes, delay_node, link); + } + + return rc; + +error_close: + spdk_bdev_close(delay_node->base_desc); +error_unregister: + spdk_io_device_unregister(delay_node, NULL); + free(delay_node->delay_bdev.name); + free(delay_node); + return rc; +} + +int +create_delay_disk(const char *bdev_name, const char *vbdev_name, uint64_t avg_read_latency, + uint64_t p99_read_latency, uint64_t avg_write_latency, uint64_t p99_write_latency) +{ + struct spdk_bdev *bdev = NULL; + int rc = 0; + + if (p99_read_latency < avg_read_latency || p99_write_latency < avg_write_latency) { + SPDK_ERRLOG("Unable to create a delay bdev where p99 latency is less than average latency.\n"); + return -EINVAL; + } + + rc = vbdev_delay_insert_association(bdev_name, vbdev_name, avg_read_latency, p99_read_latency, + avg_write_latency, p99_write_latency); + if (rc) { + return rc; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + return 0; + } + + return vbdev_delay_register(bdev); +} + +void +delete_delay_disk(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) +{ + struct bdev_association *assoc; + + if (!bdev || bdev->module != &delay_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + TAILQ_FOREACH(assoc, &g_bdev_associations, link) { + if (strcmp(assoc->vbdev_name, bdev->name) == 0) { + TAILQ_REMOVE(&g_bdev_associations, assoc, link); + free(assoc->bdev_name); + free(assoc->vbdev_name); + free(assoc); + break; + } + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static void +vbdev_delay_examine(struct spdk_bdev *bdev) +{ + vbdev_delay_register(bdev); + + spdk_bdev_module_examine_done(&delay_if); +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_delay", SPDK_LOG_VBDEV_DELAY) diff --git a/src/spdk/module/bdev/delay/vbdev_delay.h b/src/spdk/module/bdev/delay/vbdev_delay.h new file mode 100644 index 000000000..4f88a5e2f --- /dev/null +++ b/src/spdk/module/bdev/delay/vbdev_delay.h @@ -0,0 +1,85 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VBDEV_DELAY_H +#define SPDK_VBDEV_DELAY_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/bdev_module.h" + +enum delay_io_type { + DELAY_AVG_READ, + DELAY_P99_READ, + DELAY_AVG_WRITE, + DELAY_P99_WRITE, + DELAY_NONE +}; + +/** + * Create new delay bdev. + * + * \param bdev_name Bdev on which delay vbdev will be created. + * \param vbdev_name Name of the delay bdev. + * \param avg_read_latency Desired typical read latency. + * \param p99_read_latency Desired p99 read latency + * \param avg_write_latency Desired typical write latency. + * \param p99_write_latency Desired p99 write latency + * \return 0 on success, other on failure. + */ +int create_delay_disk(const char *bdev_name, const char *vbdev_name, uint64_t avg_read_latency, + uint64_t p99_read_latency, uint64_t avg_write_latency, uint64_t p99_write_latency); + +/** + * Delete delay bdev. + * + * \param bdev Pointer to delay bdev. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn. + */ +void delete_delay_disk(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, + void *cb_arg); + +/** + * Update one of the latency values for a given delay bdev. + * + * \param delay_name The name of the delay bdev + * \param latency_us The new latency value, in microseconds + * \param type a valid value from the delay_io_type enum + * \return 0 on success, -ENODEV if the bdev cannot be found, and -EINVAL if the bdev is not a delay device. + */ +int vbdev_delay_update_latency_value(char *delay_name, uint64_t latency_us, + enum delay_io_type type); + +#endif /* SPDK_VBDEV_DELAY_H */ diff --git a/src/spdk/module/bdev/delay/vbdev_delay_rpc.c b/src/spdk/module/bdev/delay/vbdev_delay_rpc.c new file mode 100644 index 000000000..aabbadd69 --- /dev/null +++ b/src/spdk/module/bdev/delay/vbdev_delay_rpc.c @@ -0,0 +1,225 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vbdev_delay.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" +#include "spdk_internal/assert.h" + +struct rpc_update_latency { + char *delay_bdev_name; + char *latency_type; + uint64_t latency_us; +}; + +static const struct spdk_json_object_decoder rpc_update_latency_decoders[] = { + {"delay_bdev_name", offsetof(struct rpc_update_latency, delay_bdev_name), spdk_json_decode_string}, + {"latency_type", offsetof(struct rpc_update_latency, latency_type), spdk_json_decode_string}, + {"latency_us", offsetof(struct rpc_update_latency, latency_us), spdk_json_decode_uint64} +}; + +static void +free_rpc_update_latency(struct rpc_update_latency *req) +{ + free(req->delay_bdev_name); + free(req->latency_type); +} + +static void +rpc_bdev_delay_update_latency(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_update_latency req = {NULL}; + struct spdk_json_write_ctx *w; + enum delay_io_type latency_type; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_update_latency_decoders, + SPDK_COUNTOF(rpc_update_latency_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_DELAY, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + if (!strncmp(req.latency_type, "avg_read", 9)) { + latency_type = DELAY_AVG_READ; + } else if (!strncmp(req.latency_type, "p99_read", 9)) { + latency_type = DELAY_P99_READ; + } else if (!strncmp(req.latency_type, "avg_write", 10)) { + latency_type = DELAY_AVG_WRITE; + } else if (!strncmp(req.latency_type, "p99_write", 10)) { + latency_type = DELAY_P99_WRITE; + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Please specify a valid latency type."); + goto cleanup; + } + + rc = vbdev_delay_update_latency_value(req.delay_bdev_name, req.latency_us, latency_type); + + if (rc == -ENODEV) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "The requested bdev does not exist."); + goto cleanup; + } else if (rc == -EINVAL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST, + "The requested bdev is not a delay bdev."); + goto cleanup; + } else if 
(rc) { + SPDK_UNREACHABLE(); + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_update_latency(&req); +} +SPDK_RPC_REGISTER("bdev_delay_update_latency", rpc_bdev_delay_update_latency, SPDK_RPC_RUNTIME) + +struct rpc_construct_delay { + char *base_bdev_name; + char *name; + uint64_t avg_read_latency; + uint64_t p99_read_latency; + uint64_t avg_write_latency; + uint64_t p99_write_latency; +}; + +static void +free_rpc_construct_delay(struct rpc_construct_delay *r) +{ + free(r->base_bdev_name); + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_construct_delay_decoders[] = { + {"base_bdev_name", offsetof(struct rpc_construct_delay, base_bdev_name), spdk_json_decode_string}, + {"name", offsetof(struct rpc_construct_delay, name), spdk_json_decode_string}, + {"avg_read_latency", offsetof(struct rpc_construct_delay, avg_read_latency), spdk_json_decode_uint64}, + {"p99_read_latency", offsetof(struct rpc_construct_delay, p99_read_latency), spdk_json_decode_uint64}, + {"avg_write_latency", offsetof(struct rpc_construct_delay, avg_write_latency), spdk_json_decode_uint64}, + {"p99_write_latency", offsetof(struct rpc_construct_delay, p99_write_latency), spdk_json_decode_uint64}, +}; + +static void +rpc_bdev_delay_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_delay req = {NULL}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_delay_decoders, + SPDK_COUNTOF(rpc_construct_delay_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_DELAY, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = create_delay_disk(req.base_bdev_name, req.name, req.avg_read_latency, req.p99_read_latency, + req.avg_write_latency, req.p99_write_latency); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, req.name); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_construct_delay(&req); +} +SPDK_RPC_REGISTER("bdev_delay_create", rpc_bdev_delay_create, SPDK_RPC_RUNTIME) + +struct rpc_delete_delay { + char *name; +}; + +static void +free_rpc_delete_delay(struct rpc_delete_delay *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_delay_decoders[] = { + {"name", offsetof(struct rpc_delete_delay, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_delay_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_delay_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_delay req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_delete_delay_decoders, + SPDK_COUNTOF(rpc_delete_delay_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + 
delete_delay_disk(bdev, rpc_bdev_delay_delete_cb, request); + +cleanup: + free_rpc_delete_delay(&req); +} +SPDK_RPC_REGISTER("bdev_delay_delete", rpc_bdev_delay_delete, SPDK_RPC_RUNTIME) diff --git a/src/spdk/module/bdev/error/Makefile b/src/spdk/module/bdev/error/Makefile new file mode 100644 index 000000000..e67a18530 --- /dev/null +++ b/src/spdk/module/bdev/error/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = vbdev_error.c vbdev_error_rpc.c +LIBNAME = bdev_error + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/error/vbdev_error.c b/src/spdk/module/bdev/error/vbdev_error.c new file mode 100644 index 000000000..643d0d8a1 --- /dev/null +++ b/src/spdk/module/bdev/error/vbdev_error.c @@ -0,0 +1,508 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This is a module for test purposes that simulates error cases for a bdev. + */ + +#include "spdk/stdinc.h" +#include "spdk/rpc.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/endian.h" +#include "spdk/nvme_spec.h" +#include "spdk/string.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#include "vbdev_error.h" + +struct spdk_vbdev_error_config { + char *base_bdev; + TAILQ_ENTRY(spdk_vbdev_error_config) tailq; +}; + +static TAILQ_HEAD(, spdk_vbdev_error_config) g_error_config + = TAILQ_HEAD_INITIALIZER(g_error_config); + +struct vbdev_error_info { + uint32_t error_type; + uint32_t error_num; +}; + +/* Context for each error bdev */ +struct error_disk { + struct spdk_bdev_part part; + struct vbdev_error_info error_vector[SPDK_BDEV_IO_TYPE_RESET]; + TAILQ_HEAD(, spdk_bdev_io) pending_ios; +}; + +struct error_channel { + struct spdk_bdev_part_channel part_ch; +}; + +static pthread_mutex_t g_vbdev_error_mutex = PTHREAD_MUTEX_INITIALIZER; +static SPDK_BDEV_PART_TAILQ g_error_disks = TAILQ_HEAD_INITIALIZER(g_error_disks); + +static int vbdev_error_init(void); +static void vbdev_error_fini(void); + +static void vbdev_error_examine(struct spdk_bdev *bdev); +static int vbdev_error_config_json(struct spdk_json_write_ctx *w); + +static int vbdev_error_config_add(const char *base_bdev_name); +static int vbdev_error_config_remove(const char *base_bdev_name); + +static struct spdk_bdev_module error_if = { + .name = "error", + .module_init = vbdev_error_init, + .module_fini = vbdev_error_fini, + .examine_config = vbdev_error_examine, + .config_json = vbdev_error_config_json, + +}; + +SPDK_BDEV_MODULE_REGISTER(error, &error_if) + +int +vbdev_error_inject_error(char *name, uint32_t io_type, uint32_t error_type, uint32_t error_num) +{ + struct spdk_bdev *bdev; + struct spdk_bdev_part *part; + struct error_disk *error_disk = NULL; + uint32_t i; + + pthread_mutex_lock(&g_vbdev_error_mutex); + bdev = spdk_bdev_get_by_name(name); + if (!bdev) { + SPDK_ERRLOG("Could not find ErrorInjection bdev %s\n", name); + pthread_mutex_unlock(&g_vbdev_error_mutex); + return -ENODEV; + } + + TAILQ_FOREACH(part, &g_error_disks, tailq) { + if (bdev == spdk_bdev_part_get_bdev(part)) { + error_disk = (struct error_disk *)part; + break; + } + } + + if (error_disk == NULL) { + SPDK_ERRLOG("Could not find ErrorInjection bdev %s\n", name); + pthread_mutex_unlock(&g_vbdev_error_mutex); + return -ENODEV; + } + + if (0xffffffff == io_type) { + for (i = 0; i < SPDK_COUNTOF(error_disk->error_vector); i++) { + error_disk->error_vector[i].error_type = error_type; + error_disk->error_vector[i].error_num = error_num; + } + } else if (0 == io_type) { + for (i = 0; i < SPDK_COUNTOF(error_disk->error_vector); i++) { + error_disk->error_vector[i].error_num = 0; + } + } else { + error_disk->error_vector[io_type].error_type = error_type; + error_disk->error_vector[io_type].error_num = error_num; + } + pthread_mutex_unlock(&g_vbdev_error_mutex); + return 0; +} +
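+/* A minimal usage sketch, with a hypothetical bdev name: assuming an + * ErrorInjection bdev named "EE_Malloc0" exists (this module names its vbdevs + * by prefixing the base bdev name with "EE_", see _vbdev_error_create()), the + * call + * + * vbdev_error_inject_error("EE_Malloc0", SPDK_BDEV_IO_TYPE_READ, + * VBDEV_IO_FAILURE, 5); + * + * fails the next five reads outright, decrementing error_num once per failed + * I/O. An io_type of 0xffffffff applies error_type/error_num to every I/O + * type in the error_vector, while an io_type of 0 clears all pending error + * counts. + */ +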
+static void +vbdev_error_reset(struct error_disk *error_disk, struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev_io *pending_io, *tmp; + + TAILQ_FOREACH_SAFE(pending_io, &error_disk->pending_ios, module_link, tmp) { + TAILQ_REMOVE(&error_disk->pending_ios, pending_io, module_link); + spdk_bdev_io_complete(pending_io, SPDK_BDEV_IO_STATUS_FAILED); + } + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +} + +static uint32_t +vbdev_error_get_error_type(struct error_disk *error_disk, uint32_t io_type) +{ + if (error_disk->error_vector[io_type].error_num) { + return error_disk->error_vector[io_type].error_type; + } + return 0; +} + +static void +vbdev_error_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct error_channel *ch = spdk_io_channel_get_ctx(_ch); + struct error_disk *error_disk = bdev_io->bdev->ctxt; + uint32_t error_type; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_FLUSH: + break; + case SPDK_BDEV_IO_TYPE_RESET: + vbdev_error_reset(error_disk, bdev_io); + return; + default: + SPDK_ERRLOG("Error Injection: unknown I/O type %d\n", bdev_io->type); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + error_type = vbdev_error_get_error_type(error_disk, bdev_io->type); + if (error_type == 0) { + int rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); + + if (rc) { + SPDK_ERRLOG("bdev_error: submit request failed, rc=%d\n", rc); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + return; + } else if (error_type == VBDEV_IO_FAILURE) { + error_disk->error_vector[bdev_io->type].error_num--; + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } else if (error_type == VBDEV_IO_PENDING) { + TAILQ_INSERT_TAIL(&error_disk->pending_ios, bdev_io, module_link); + error_disk->error_vector[bdev_io->type].error_num--; + } +} + +static int +vbdev_error_destruct(void *ctx) +{ + struct error_disk *error_disk = ctx; + struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(&error_disk->part); + int rc; + + rc = vbdev_error_config_remove(base_bdev->name); + if (rc != 0) { + SPDK_ERRLOG("vbdev_error_config_remove() failed\n"); + } + + return spdk_bdev_part_free(&error_disk->part); +} + +static int +vbdev_error_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct error_disk *error_disk = ctx; + struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(&error_disk->part); + + spdk_json_write_named_object_begin(w, "error_disk"); + + spdk_json_write_named_string(w, "base_bdev", base_bdev->name); + + spdk_json_write_object_end(w); + + return 0; +} + +static void +vbdev_error_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* No config per bdev. 
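+	 * Error bdevs are instead recreated from the module-level config:
+	 * vbdev_error_config_json() below emits one bdev_error_create entry
+	 * per tracked base bdev.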
*/ +} + + +static struct spdk_bdev_fn_table vbdev_error_fn_table = { + .destruct = vbdev_error_destruct, + .submit_request = vbdev_error_submit_request, + .dump_info_json = vbdev_error_dump_info_json, + .write_config_json = vbdev_error_write_config_json +}; + +static void +vbdev_error_base_bdev_hotremove_cb(void *_part_base) +{ + struct spdk_bdev_part_base *part_base = _part_base; + + spdk_bdev_part_base_hotremove(part_base, &g_error_disks); +} + +static int +_vbdev_error_create(struct spdk_bdev *base_bdev) +{ + struct spdk_bdev_part_base *base = NULL; + struct error_disk *disk = NULL; + char *name; + int rc; + + base = spdk_bdev_part_base_construct(base_bdev, + vbdev_error_base_bdev_hotremove_cb, + &error_if, &vbdev_error_fn_table, &g_error_disks, + NULL, NULL, sizeof(struct error_channel), + NULL, NULL); + if (!base) { + SPDK_ERRLOG("could not construct part base for bdev %s\n", spdk_bdev_get_name(base_bdev)); + return -ENOMEM; + } + + disk = calloc(1, sizeof(*disk)); + if (!disk) { + SPDK_ERRLOG("Memory allocation failure\n"); + spdk_bdev_part_base_free(base); + return -ENOMEM; + } + + name = spdk_sprintf_alloc("EE_%s", spdk_bdev_get_name(base_bdev)); + if (!name) { + SPDK_ERRLOG("name allocation failure\n"); + spdk_bdev_part_base_free(base); + free(disk); + return -ENOMEM; + } + + rc = spdk_bdev_part_construct(&disk->part, base, name, 0, base_bdev->blockcnt, + "Error Injection Disk"); + free(name); + if (rc) { + SPDK_ERRLOG("could not construct part for bdev %s\n", spdk_bdev_get_name(base_bdev)); + /* spdk_bdev_part_construct will free name on failure */ + spdk_bdev_part_base_free(base); + free(disk); + return rc; + } + + TAILQ_INIT(&disk->pending_ios); + + return 0; +} + +int +vbdev_error_create(const char *base_bdev_name) +{ + int rc; + struct spdk_bdev *base_bdev; + + rc = vbdev_error_config_add(base_bdev_name); + if (rc != 0) { + SPDK_ERRLOG("Adding config for ErrorInjection bdev %s failed (rc=%d)\n", + base_bdev_name, rc); + return rc; + } + + base_bdev = spdk_bdev_get_by_name(base_bdev_name); + if (!base_bdev) { + return 0; + } + + rc = _vbdev_error_create(base_bdev); + if (rc != 0) { + vbdev_error_config_remove(base_bdev_name); + SPDK_ERRLOG("Could not create ErrorInjection bdev %s (rc=%d)\n", + base_bdev_name, rc); + } + + return rc; +} + +void +vbdev_error_delete(struct spdk_bdev *vbdev, spdk_delete_error_complete cb_fn, void *cb_arg) +{ + if (!vbdev || vbdev->module != &error_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(vbdev, cb_fn, cb_arg); +} + +static void +vbdev_error_clear_config(void) +{ + struct spdk_vbdev_error_config *cfg; + + while ((cfg = TAILQ_FIRST(&g_error_config))) { + TAILQ_REMOVE(&g_error_config, cfg, tailq); + free(cfg->base_bdev); + free(cfg); + } +} + +static struct spdk_vbdev_error_config * +vbdev_error_config_find_by_base_name(const char *base_bdev_name) +{ + struct spdk_vbdev_error_config *cfg; + + TAILQ_FOREACH(cfg, &g_error_config, tailq) { + if (strcmp(cfg->base_bdev, base_bdev_name) == 0) { + return cfg; + } + } + + return NULL; +} + +static int +vbdev_error_config_add(const char *base_bdev_name) +{ + struct spdk_vbdev_error_config *cfg; + + cfg = vbdev_error_config_find_by_base_name(base_bdev_name); + if (cfg) { + SPDK_ERRLOG("vbdev_error_config for bdev %s already exists\n", + base_bdev_name); + return -EEXIST; + } + + cfg = calloc(1, sizeof(*cfg)); + if (!cfg) { + SPDK_ERRLOG("calloc() failed for vbdev_error_config\n"); + return -ENOMEM; + } + + cfg->base_bdev = strdup(base_bdev_name); + if (!cfg->base_bdev) { + 
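		/* strdup() failed: release the partially-built config entry. */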
free(cfg); + SPDK_ERRLOG("strdup() failed for base_bdev_name\n"); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&g_error_config, cfg, tailq); + + return 0; +} + +static int +vbdev_error_config_remove(const char *base_bdev_name) +{ + struct spdk_vbdev_error_config *cfg; + + cfg = vbdev_error_config_find_by_base_name(base_bdev_name); + if (!cfg) { + return -ENOENT; + } + + TAILQ_REMOVE(&g_error_config, cfg, tailq); + free(cfg->base_bdev); + free(cfg); + return 0; +} + +static int +vbdev_error_init(void) +{ + struct spdk_conf_section *sp; + struct spdk_vbdev_error_config *cfg; + const char *base_bdev_name; + int i, rc; + + sp = spdk_conf_find_section(NULL, "BdevError"); + if (sp == NULL) { + return 0; + } + + for (i = 0; ; i++) { + if (!spdk_conf_section_get_nval(sp, "BdevError", i)) { + break; + } + + base_bdev_name = spdk_conf_section_get_nmval(sp, "BdevError", i, 0); + if (!base_bdev_name) { + SPDK_ERRLOG("ErrorInjection configuration missing bdev name\n"); + rc = -EINVAL; + goto error; + } + + cfg = calloc(1, sizeof(*cfg)); + if (!cfg) { + SPDK_ERRLOG("calloc() failed for vbdev_error_config\n"); + rc = -ENOMEM; + goto error; + } + + cfg->base_bdev = strdup(base_bdev_name); + if (!cfg->base_bdev) { + free(cfg); + SPDK_ERRLOG("strdup() failed for bdev name\n"); + rc = -ENOMEM; + goto error; + } + + TAILQ_INSERT_TAIL(&g_error_config, cfg, tailq); + } + + return 0; + +error: + vbdev_error_clear_config(); + return rc; +} + +static void +vbdev_error_fini(void) +{ + vbdev_error_clear_config(); +} + +static void +vbdev_error_examine(struct spdk_bdev *bdev) +{ + struct spdk_vbdev_error_config *cfg; + int rc; + + cfg = vbdev_error_config_find_by_base_name(bdev->name); + if (cfg != NULL) { + rc = _vbdev_error_create(bdev); + if (rc != 0) { + SPDK_ERRLOG("could not create error vbdev for bdev %s at examine\n", + bdev->name); + } + } + + spdk_bdev_module_examine_done(&error_if); +} + +static int +vbdev_error_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_vbdev_error_config *cfg; + + TAILQ_FOREACH(cfg, &g_error_config, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_error_create"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "base_name", cfg->base_bdev); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + return 0; +} diff --git a/src/spdk/module/bdev/error/vbdev_error.h b/src/spdk/module/bdev/error/vbdev_error.h new file mode 100644 index 000000000..8c0daaeac --- /dev/null +++ b/src/spdk/module/bdev/error/vbdev_error.h @@ -0,0 +1,76 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_ERROR_H
+#define SPDK_VBDEV_ERROR_H
+
+#include "spdk/stdinc.h"
+#include "spdk/bdev.h"
+
+enum vbdev_error_type {
+	VBDEV_IO_FAILURE = 1,
+	VBDEV_IO_PENDING,
+};
+
+typedef void (*spdk_delete_error_complete)(void *cb_arg, int bdeverrno);
+
+/**
+ * Create a vbdev on top of the base bdev to inject errors into it.
+ *
+ * \param base_bdev_name Name of the base bdev.
+ * \return 0 on success or negative errno on failure.
+ */
+int vbdev_error_create(const char *base_bdev_name);
+
+/**
+ * Delete the vbdev used to inject errors.
+ *
+ * \param vbdev Pointer to the error vbdev.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Argument to pass to cb_fn.
+ */
+void vbdev_error_delete(struct spdk_bdev *vbdev, spdk_delete_error_complete cb_fn,
+			void *cb_arg);
+
+/**
+ * Inject errors into an ErrorInjection bdev. Callers specify which I/O type is
+ * affected, what kind of error is injected, and how many errors to inject.
+ *
+ * \param name Name of the ErrorInjection bdev into which errors are injected.
+ * \param io_type I/O type to inject errors into (0xffffffff selects all types,
+ * 0 clears all pending error counts).
+ * \param error_type Type of error to inject (one of enum vbdev_error_type).
+ * \param error_num Count of errors to inject.
+ * \return 0 on success or negative errno on failure.
+ */
+int vbdev_error_inject_error(char *name, uint32_t io_type, uint32_t error_type,
+			     uint32_t error_num);
+
+#endif /* SPDK_VBDEV_ERROR_H */
diff --git a/src/spdk/module/bdev/error/vbdev_error_rpc.c b/src/spdk/module/bdev/error/vbdev_error_rpc.c
new file mode 100644
index 000000000..2dcbfd33e
--- /dev/null
+++ b/src/spdk/module/bdev/error/vbdev_error_rpc.c
@@ -0,0 +1,245 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" +#include "vbdev_error.h" + +#define ERROR_BDEV_IO_TYPE_INVALID (SPDK_BDEV_IO_TYPE_RESET + 1) +#define ERROR_BDEV_ERROR_TYPE_INVALID (VBDEV_IO_PENDING + 1) + +static uint32_t +rpc_error_bdev_io_type_parse(char *name) +{ + if (strcmp(name, "read") == 0) { + return SPDK_BDEV_IO_TYPE_READ; + } else if (strcmp(name, "write") == 0) { + return SPDK_BDEV_IO_TYPE_WRITE; + } else if (strcmp(name, "flush") == 0) { + return SPDK_BDEV_IO_TYPE_FLUSH; + } else if (strcmp(name, "unmap") == 0) { + return SPDK_BDEV_IO_TYPE_UNMAP; + } else if (strcmp(name, "all") == 0) { + return 0xffffffff; + } else if (strcmp(name, "clear") == 0) { + return 0; + } + return ERROR_BDEV_IO_TYPE_INVALID; +} + +static uint32_t +rpc_error_bdev_error_type_parse(char *name) +{ + if (strcmp(name, "failure") == 0) { + return VBDEV_IO_FAILURE; + } else if (strcmp(name, "pending") == 0) { + return VBDEV_IO_PENDING; + } + return ERROR_BDEV_ERROR_TYPE_INVALID; +} + +struct rpc_bdev_error_create { + char *base_name; +}; + +static void +free_rpc_bdev_error_create(struct rpc_bdev_error_create *req) +{ + free(req->base_name); +} + +static const struct spdk_json_object_decoder rpc_bdev_error_create_decoders[] = { + {"base_name", offsetof(struct rpc_bdev_error_create, base_name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_error_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_error_create req = {}; + struct spdk_json_write_ctx *w; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_bdev_error_create_decoders, + SPDK_COUNTOF(rpc_bdev_error_create_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = vbdev_error_create(req.base_name); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_error_create(&req); +} +SPDK_RPC_REGISTER("bdev_error_create", rpc_bdev_error_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_error_create, construct_error_bdev) + +struct rpc_delete_error { + char *name; +}; + +static void +free_rpc_delete_error(struct rpc_delete_error *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_delete_error_decoders[] = { + {"name", offsetof(struct rpc_delete_error, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_error_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, 
w); +} + +static void +rpc_bdev_error_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_error req = {NULL}; + struct spdk_bdev *vbdev; + + if (spdk_json_decode_object(params, rpc_delete_error_decoders, + SPDK_COUNTOF(rpc_delete_error_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + vbdev = spdk_bdev_get_by_name(req.name); + if (vbdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + vbdev_error_delete(vbdev, rpc_bdev_error_delete_cb, request); + +cleanup: + free_rpc_delete_error(&req); +} +SPDK_RPC_REGISTER("bdev_error_delete", rpc_bdev_error_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_error_delete, delete_error_bdev) + +struct rpc_error_information { + char *name; + char *io_type; + char *error_type; + uint32_t num; +}; + +static const struct spdk_json_object_decoder rpc_error_information_decoders[] = { + {"name", offsetof(struct rpc_error_information, name), spdk_json_decode_string}, + {"io_type", offsetof(struct rpc_error_information, io_type), spdk_json_decode_string}, + {"error_type", offsetof(struct rpc_error_information, error_type), spdk_json_decode_string}, + {"num", offsetof(struct rpc_error_information, num), spdk_json_decode_uint32, true}, +}; + +static void +free_rpc_error_information(struct rpc_error_information *p) +{ + free(p->name); + free(p->io_type); + free(p->error_type); +} + +static void +rpc_bdev_error_inject_error(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_error_information req = {}; + struct spdk_json_write_ctx *w; + uint32_t io_type; + uint32_t error_type; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_error_information_decoders, + SPDK_COUNTOF(rpc_error_information_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + io_type = rpc_error_bdev_io_type_parse(req.io_type); + if (io_type == ERROR_BDEV_IO_TYPE_INVALID) { + spdk_jsonrpc_send_error_response(request, -EINVAL, + "Unexpected io_type value"); + goto cleanup; + } + + error_type = rpc_error_bdev_error_type_parse(req.error_type); + if (error_type == ERROR_BDEV_ERROR_TYPE_INVALID) { + spdk_jsonrpc_send_error_response(request, -EINVAL, + "Unexpected error_type value"); + goto cleanup; + } + + rc = vbdev_error_inject_error(req.name, io_type, error_type, req.num); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_error_information(&req); +} +SPDK_RPC_REGISTER("bdev_error_inject_error", rpc_bdev_error_inject_error, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_error_inject_error, bdev_inject_error) diff --git a/src/spdk/module/bdev/ftl/Makefile b/src/spdk/module/bdev/ftl/Makefile new file mode 100644 index 000000000..d0bfe1078 --- /dev/null +++ b/src/spdk/module/bdev/ftl/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS += bdev_ftl.c bdev_ftl_rpc.c +LIBNAME = bdev_ftl + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/ftl/bdev_ftl.c b/src/spdk/module/bdev/ftl/bdev_ftl.c new file mode 100644 index 000000000..e959c8677 --- /dev/null +++ b/src/spdk/module/bdev/ftl/bdev_ftl.c @@ -0,0 +1,517 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/string.h" +#include "spdk/likely.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/ftl.h" +#include "spdk_internal/log.h" + +#include "bdev_ftl.h" + +struct ftl_bdev { + struct spdk_bdev bdev; + + struct spdk_ftl_dev *dev; + + ftl_bdev_init_fn init_cb; + + void *init_arg; +}; + +struct ftl_deferred_init { + struct ftl_bdev_init_opts opts; + + LIST_ENTRY(ftl_deferred_init) entry; +}; + +static LIST_HEAD(, ftl_deferred_init) g_deferred_init = LIST_HEAD_INITIALIZER(g_deferred_init); + +static int bdev_ftl_initialize(void); +static void bdev_ftl_finish(void); +static void bdev_ftl_examine(struct spdk_bdev *bdev); + +static struct spdk_bdev_module g_ftl_if = { + .name = "ftl", + .module_init = bdev_ftl_initialize, + .module_fini = bdev_ftl_finish, + .examine_disk = bdev_ftl_examine, +}; + +SPDK_BDEV_MODULE_REGISTER(ftl, &g_ftl_if) + +static void +bdev_ftl_free_cb(struct spdk_ftl_dev *dev, void *ctx, int status) +{ + struct ftl_bdev *ftl_bdev = ctx; + + spdk_bdev_destruct_done(&ftl_bdev->bdev, status); + free(ftl_bdev->bdev.name); + free(ftl_bdev); +} + +static int +bdev_ftl_destruct(void *ctx) +{ + struct ftl_bdev *ftl_bdev = ctx; + spdk_ftl_dev_free(ftl_bdev->dev, bdev_ftl_free_cb, ftl_bdev); + + /* return 1 to indicate that the destruction is asynchronous */ + return 1; +} + +static void +bdev_ftl_cb(void *arg, int rc) +{ + struct spdk_bdev_io *bdev_io = arg; + enum spdk_bdev_io_status status; + + switch (rc) { + case 0: + status = SPDK_BDEV_IO_STATUS_SUCCESS; + break; + case -ENOMEM: + status = SPDK_BDEV_IO_STATUS_NOMEM; + break; + default: + status = SPDK_BDEV_IO_STATUS_FAILED; + break; + } + + spdk_bdev_io_complete(bdev_io, status); +} + +static void +bdev_ftl_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + struct ftl_bdev *ftl_bdev; + int rc; + + ftl_bdev = bdev_io->bdev->ctxt; + + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + rc = spdk_ftl_read(ftl_bdev->dev, + ch, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, bdev_ftl_cb, bdev_io); + + if (spdk_unlikely(rc != 0)) { + spdk_bdev_io_complete(bdev_io, rc); + } +} + +static int +_bdev_ftl_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct ftl_bdev *ftl_bdev = (struct ftl_bdev *)bdev_io->bdev->ctxt; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_ftl_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + + case SPDK_BDEV_IO_TYPE_WRITE: + return spdk_ftl_write(ftl_bdev->dev, ch, bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, bdev_ftl_cb, bdev_io); + + case SPDK_BDEV_IO_TYPE_FLUSH: + return 
spdk_ftl_flush(ftl_bdev->dev, bdev_ftl_cb, bdev_io); + + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_UNMAP: + default: + return -ENOTSUP; + break; + } +} + +static void +bdev_ftl_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + int rc = _bdev_ftl_submit_request(ch, bdev_io); + + if (spdk_unlikely(rc != 0)) { + spdk_bdev_io_complete(bdev_io, rc); + } +} + +static bool +bdev_ftl_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + return true; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_UNMAP: + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_ftl_get_io_channel(void *ctx) +{ + struct ftl_bdev *ftl_bdev = ctx; + + return spdk_get_io_channel(ftl_bdev->dev); +} + +static void +_bdev_ftl_write_config_info(struct ftl_bdev *ftl_bdev, struct spdk_json_write_ctx *w) +{ + struct spdk_ftl_attrs attrs = {}; + + spdk_ftl_dev_get_attrs(ftl_bdev->dev, &attrs); + + spdk_json_write_named_string(w, "base_bdev", attrs.base_bdev); + + if (attrs.cache_bdev) { + spdk_json_write_named_string(w, "cache", attrs.cache_bdev); + } +} + +static void +bdev_ftl_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct ftl_bdev *ftl_bdev = bdev->ctxt; + struct spdk_ftl_attrs attrs; + struct spdk_ftl_conf *conf = &attrs.conf; + char uuid[SPDK_UUID_STRING_LEN]; + + spdk_ftl_dev_get_attrs(ftl_bdev->dev, &attrs); + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_ftl_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", ftl_bdev->bdev.name); + + spdk_json_write_named_bool(w, "allow_open_bands", conf->allow_open_bands); + spdk_json_write_named_uint64(w, "overprovisioning", conf->lba_rsvd); + spdk_json_write_named_uint64(w, "limit_crit", conf->limits[SPDK_FTL_LIMIT_CRIT].limit); + spdk_json_write_named_uint64(w, "limit_crit_threshold", conf->limits[SPDK_FTL_LIMIT_CRIT].thld); + spdk_json_write_named_uint64(w, "limit_high", conf->limits[SPDK_FTL_LIMIT_HIGH].limit); + spdk_json_write_named_uint64(w, "limit_high_threshold", conf->limits[SPDK_FTL_LIMIT_HIGH].thld); + spdk_json_write_named_uint64(w, "limit_low", conf->limits[SPDK_FTL_LIMIT_LOW].limit); + spdk_json_write_named_uint64(w, "limit_low_threshold", conf->limits[SPDK_FTL_LIMIT_LOW].thld); + spdk_json_write_named_uint64(w, "limit_start", conf->limits[SPDK_FTL_LIMIT_START].limit); + spdk_json_write_named_uint64(w, "limit_start_threshold", conf->limits[SPDK_FTL_LIMIT_START].thld); + if (conf->l2p_path) { + spdk_json_write_named_string(w, "l2p_path", conf->l2p_path); + } + + spdk_uuid_fmt_lower(uuid, sizeof(uuid), &attrs.uuid); + spdk_json_write_named_string(w, "uuid", uuid); + + _bdev_ftl_write_config_info(ftl_bdev, w); + + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); +} + +static int +bdev_ftl_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct ftl_bdev *ftl_bdev = ctx; + struct spdk_ftl_attrs attrs; + + spdk_ftl_dev_get_attrs(ftl_bdev->dev, &attrs); + + spdk_json_write_named_object_begin(w, "ftl"); + + _bdev_ftl_write_config_info(ftl_bdev, w); + spdk_json_write_named_string_fmt(w, "num_zones", "%zu", attrs.num_zones); + spdk_json_write_named_string_fmt(w, "zone_size", "%zu", attrs.zone_size); + + /* ftl */ + spdk_json_write_object_end(w); + + 
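	/* The "ftl" object written above is reported under "driver_specific"
+	 * in bdev_get_bdevs RPC output (generic bdev-layer behavior, assumed
+	 * here rather than shown). */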
return 0; +} + +static const struct spdk_bdev_fn_table ftl_fn_table = { + .destruct = bdev_ftl_destruct, + .submit_request = bdev_ftl_submit_request, + .io_type_supported = bdev_ftl_io_type_supported, + .get_io_channel = bdev_ftl_get_io_channel, + .write_config_json = bdev_ftl_write_config_json, + .dump_info_json = bdev_ftl_dump_info_json, +}; + +static void +bdev_ftl_create_cb(struct spdk_ftl_dev *dev, void *ctx, int status) +{ + struct ftl_bdev *ftl_bdev = ctx; + struct ftl_bdev_info info = {}; + struct spdk_ftl_attrs attrs; + ftl_bdev_init_fn init_cb = ftl_bdev->init_cb; + void *init_arg = ftl_bdev->init_arg; + int rc = -ENODEV; + + if (status) { + SPDK_ERRLOG("Failed to create FTL device (%d)\n", status); + rc = status; + goto error; + } + + spdk_ftl_dev_get_attrs(dev, &attrs); + + ftl_bdev->dev = dev; + ftl_bdev->bdev.product_name = "FTL disk"; + ftl_bdev->bdev.write_cache = 0; + ftl_bdev->bdev.blocklen = attrs.block_size; + ftl_bdev->bdev.blockcnt = attrs.num_blocks; + ftl_bdev->bdev.uuid = attrs.uuid; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_FTL, "Creating bdev %s:\n", ftl_bdev->bdev.name); + SPDK_DEBUGLOG(SPDK_LOG_BDEV_FTL, "\tblock_len:\t%zu\n", attrs.block_size); + SPDK_DEBUGLOG(SPDK_LOG_BDEV_FTL, "\tnum_blocks:\t%"PRIu64"\n", attrs.num_blocks); + + ftl_bdev->bdev.ctxt = ftl_bdev; + ftl_bdev->bdev.fn_table = &ftl_fn_table; + ftl_bdev->bdev.module = &g_ftl_if; + + if (spdk_bdev_register(&ftl_bdev->bdev)) { + goto error; + } + + info.name = ftl_bdev->bdev.name; + info.uuid = ftl_bdev->bdev.uuid; + + init_cb(&info, init_arg, 0); + return; + +error: + free(ftl_bdev->bdev.name); + free(ftl_bdev); + + init_cb(NULL, init_arg, rc); +} + +static void +bdev_ftl_defer_free(struct ftl_deferred_init *init) +{ + free((char *)init->opts.name); + free((char *)init->opts.base_bdev); + free((char *)init->opts.cache_bdev); + free(init); +} + +static int +bdev_ftl_defer_init(const struct ftl_bdev_init_opts *opts) +{ + struct ftl_deferred_init *init; + + init = calloc(1, sizeof(*init)); + if (!init) { + return -ENOMEM; + } + + init->opts.mode = opts->mode; + init->opts.uuid = opts->uuid; + init->opts.ftl_conf = opts->ftl_conf; + + init->opts.name = strdup(opts->name); + if (!init->opts.name) { + SPDK_ERRLOG("Could not allocate bdev name\n"); + goto error; + } + + init->opts.base_bdev = strdup(opts->base_bdev); + if (!init->opts.base_bdev) { + SPDK_ERRLOG("Could not allocate base bdev name\n"); + goto error; + } + + if (opts->cache_bdev) { + init->opts.cache_bdev = strdup(opts->cache_bdev); + if (!init->opts.cache_bdev) { + SPDK_ERRLOG("Could not allocate cache bdev name\n"); + goto error; + } + } + + LIST_INSERT_HEAD(&g_deferred_init, init, entry); + + return 0; + +error: + bdev_ftl_defer_free(init); + return -ENOMEM; +} + +int +bdev_ftl_create_bdev(const struct ftl_bdev_init_opts *bdev_opts, + ftl_bdev_init_fn cb, void *cb_arg) +{ + struct ftl_bdev *ftl_bdev = NULL; + struct spdk_ftl_dev_init_opts opts = {}; + int rc; + + ftl_bdev = calloc(1, sizeof(*ftl_bdev)); + if (!ftl_bdev) { + SPDK_ERRLOG("Could not allocate ftl_bdev\n"); + return -ENOMEM; + } + + ftl_bdev->bdev.name = strdup(bdev_opts->name); + if (!ftl_bdev->bdev.name) { + rc = -ENOMEM; + goto error_bdev; + } + + if (spdk_bdev_get_by_name(bdev_opts->base_bdev) == NULL || + (bdev_opts->cache_bdev && spdk_bdev_get_by_name(bdev_opts->cache_bdev) == NULL)) { + rc = bdev_ftl_defer_init(bdev_opts); + if (rc == 0) { + rc = -ENODEV; + } + goto error_name; + } + + ftl_bdev->init_cb = cb; + ftl_bdev->init_arg = cb_arg; + + opts.mode = bdev_opts->mode; + 
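	/* The option strings assigned below are borrowed from bdev_opts; the
+	 * deferred path (bdev_ftl_defer_init() above) duplicates them instead,
+	 * presumably because the caller's copies may not outlive a deferred
+	 * examine. */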
opts.uuid = bdev_opts->uuid;
+	opts.name = ftl_bdev->bdev.name;
+	opts.base_bdev = bdev_opts->base_bdev;
+	opts.cache_bdev = bdev_opts->cache_bdev;
+	opts.conf = &bdev_opts->ftl_conf;
+
+	/* TODO: set threads based on config */
+	opts.core_thread = spdk_get_thread();
+
+	rc = spdk_ftl_dev_init(&opts, bdev_ftl_create_cb, ftl_bdev);
+	if (rc) {
+		SPDK_ERRLOG("Could not create FTL device\n");
+		goto error_name;
+	}
+
+	return 0;
+
+error_name:
+	free(ftl_bdev->bdev.name);
+error_bdev:
+	free(ftl_bdev);
+	return rc;
+}
+
+static int
+bdev_ftl_initialize(void)
+{
+	return 0;
+}
+
+void
+bdev_ftl_delete_bdev(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
+{
+	struct spdk_bdev *bdev;
+
+	bdev = spdk_bdev_get_by_name(name);
+	if (bdev) {
+		spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+		return;
+	}
+
+	cb_fn(cb_arg, -ENODEV);
+}
+
+static void
+bdev_ftl_finish(void)
+{
+}
+
+static void
+bdev_ftl_create_deferred_cb(const struct ftl_bdev_info *info, void *ctx, int status)
+{
+	struct ftl_deferred_init *opts = ctx;
+
+	if (status) {
+		SPDK_ERRLOG("Failed to initialize FTL bdev '%s'\n", opts->opts.name);
+	}
+
+	bdev_ftl_defer_free(opts);
+
+	spdk_bdev_module_examine_done(&g_ftl_if);
+}
+
+static void
+bdev_ftl_examine(struct spdk_bdev *bdev)
+{
+	struct ftl_deferred_init *opts;
+
+	LIST_FOREACH(opts, &g_deferred_init, entry) {
+		if (spdk_bdev_get_by_name(opts->opts.base_bdev) == NULL) {
+			continue;
+		}
+
+		if (opts->opts.cache_bdev && spdk_bdev_get_by_name(opts->opts.cache_bdev) == NULL) {
+			continue;
+		}
+
+		LIST_REMOVE(opts, entry);
+
+		/* spdk_bdev_module_examine_done will be called by bdev_ftl_create_deferred_cb */
+		if (bdev_ftl_create_bdev(&opts->opts, bdev_ftl_create_deferred_cb, opts)) {
+			SPDK_ERRLOG("Failed to initialize FTL bdev '%s'\n", opts->opts.name);
+			bdev_ftl_defer_free(opts);
+			break;
+		}
+		return;
+	}
+
+	spdk_bdev_module_examine_done(&g_ftl_if);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("bdev_ftl", SPDK_LOG_BDEV_FTL)
diff --git a/src/spdk/module/bdev/ftl/bdev_ftl.h b/src/spdk/module/bdev/ftl/bdev_ftl.h
new file mode 100644
index 000000000..019a3b8f3
--- /dev/null
+++ b/src/spdk/module/bdev/ftl/bdev_ftl.h
@@ -0,0 +1,70 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_FTL_H +#define SPDK_BDEV_FTL_H + +#include "spdk/stdinc.h" +#include "spdk/bdev_module.h" +#include "spdk/ftl.h" + +struct spdk_bdev; +struct spdk_uuid; + +struct ftl_bdev_info { + const char *name; + struct spdk_uuid uuid; +}; + +struct ftl_bdev_init_opts { + /* Bdev's name */ + const char *name; + /* Base bdev's name */ + const char *base_bdev; + /* Write buffer bdev's name */ + const char *cache_bdev; + /* Bdev's mode */ + uint32_t mode; + /* UUID if device is restored from SSD */ + struct spdk_uuid uuid; + /* FTL library configuration */ + struct spdk_ftl_conf ftl_conf; +}; + +typedef void (*ftl_bdev_init_fn)(const struct ftl_bdev_info *, void *, int); + +int bdev_ftl_create_bdev(const struct ftl_bdev_init_opts *bdev_opts, + ftl_bdev_init_fn cb, void *cb_arg); +void bdev_ftl_delete_bdev(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg); + +#endif /* SPDK_BDEV_FTL_H */ diff --git a/src/spdk/module/bdev/ftl/bdev_ftl_rpc.c b/src/spdk/module/bdev/ftl/bdev_ftl_rpc.c new file mode 100644 index 000000000..045619342 --- /dev/null +++ b/src/spdk/module/bdev/ftl/bdev_ftl_rpc.c @@ -0,0 +1,258 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/bdev_module.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +#include "bdev_ftl.h" + +struct rpc_bdev_ftl_create { + char *name; + char *base_bdev; + char *uuid; + char *cache_bdev; + struct spdk_ftl_conf ftl_conf; +}; + +static void +free_rpc_bdev_ftl_create(struct rpc_bdev_ftl_create *req) +{ + free(req->name); + free(req->base_bdev); + free(req->uuid); + free(req->cache_bdev); + free((char *)req->ftl_conf.l2p_path); +} + +static const struct spdk_json_object_decoder rpc_bdev_ftl_create_decoders[] = { + {"name", offsetof(struct rpc_bdev_ftl_create, name), spdk_json_decode_string}, + {"base_bdev", offsetof(struct rpc_bdev_ftl_create, base_bdev), spdk_json_decode_string}, + {"uuid", offsetof(struct rpc_bdev_ftl_create, uuid), spdk_json_decode_string, true}, + {"cache", offsetof(struct rpc_bdev_ftl_create, cache_bdev), spdk_json_decode_string, true}, + { + "allow_open_bands", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, allow_open_bands), spdk_json_decode_bool, true + }, + { + "overprovisioning", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, lba_rsvd), spdk_json_decode_uint64, true + }, + { + "use_append", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, use_append), spdk_json_decode_bool, true + }, + { + "l2p_path", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, l2p_path), + spdk_json_decode_string, true + }, + { + "limit_crit", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_CRIT]) + + offsetof(struct spdk_ftl_limit, limit), + spdk_json_decode_uint64, true + }, + { + "limit_crit_threshold", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_CRIT]) + + offsetof(struct spdk_ftl_limit, thld), + spdk_json_decode_uint64, true + }, + { + "limit_high", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_HIGH]) + + offsetof(struct spdk_ftl_limit, limit), + spdk_json_decode_uint64, true + }, + { + "limit_high_threshold", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_HIGH]) + + offsetof(struct spdk_ftl_limit, thld), + spdk_json_decode_uint64, true + }, + { + "limit_low", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_LOW]) + + offsetof(struct spdk_ftl_limit, limit), + spdk_json_decode_uint64, true + }, + { + "limit_low_threshold", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_LOW]) + + offsetof(struct spdk_ftl_limit, thld), + spdk_json_decode_uint64, true + }, + { + "limit_start", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_START]) + + offsetof(struct spdk_ftl_limit, limit), + spdk_json_decode_uint64, true + }, + { + "limit_start_threshold", offsetof(struct rpc_bdev_ftl_create, ftl_conf) + + offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_START]) + + offsetof(struct spdk_ftl_limit, thld), + spdk_json_decode_uint64, true + }, +}; + +static void +rpc_bdev_ftl_create_cb(const struct ftl_bdev_info *bdev_info, void *ctx, int status) +{ + struct spdk_jsonrpc_request *request = ctx; + char bdev_uuid[SPDK_UUID_STRING_LEN]; + struct spdk_json_write_ctx *w; + + if (status) { + 
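		/* Device init failed; forward the negative errno to the RPC
+		 * client as a JSON-RPC error string. */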
spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to create FTL bdev: %s", + spdk_strerror(-status)); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_uuid_fmt_lower(bdev_uuid, sizeof(bdev_uuid), &bdev_info->uuid); + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "name", bdev_info->name); + spdk_json_write_named_string(w, "uuid", bdev_uuid); + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_ftl_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_ftl_create req = {}; + struct ftl_bdev_init_opts opts = {}; + struct spdk_json_write_ctx *w; + int rc; + + spdk_ftl_conf_init_defaults(&req.ftl_conf); + + if (spdk_json_decode_object(params, rpc_bdev_ftl_create_decoders, + SPDK_COUNTOF(rpc_bdev_ftl_create_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto invalid; + } + + if (req.cache_bdev && !spdk_bdev_get_by_name(req.cache_bdev)) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "No such bdev: %s", req.cache_bdev); + goto invalid; + } + + opts.name = req.name; + opts.mode = SPDK_FTL_MODE_CREATE; + opts.base_bdev = req.base_bdev; + opts.cache_bdev = req.cache_bdev; + opts.ftl_conf = req.ftl_conf; + + if (req.uuid) { + if (spdk_uuid_parse(&opts.uuid, req.uuid) < 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Failed to parse uuid: %s", + req.uuid); + goto invalid; + } + + if (!spdk_mem_all_zero(&opts.uuid, sizeof(opts.uuid))) { + opts.mode &= ~SPDK_FTL_MODE_CREATE; + } + } + + rc = bdev_ftl_create_bdev(&opts, rpc_bdev_ftl_create_cb, request); + if (rc) { + if (rc == -ENODEV) { + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string_fmt(w, "FTL bdev: %s creation deferred", req.name); + spdk_jsonrpc_end_result(request, w); + } else { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to create FTL bdev: %s", + spdk_strerror(-rc)); + } + goto invalid; + } + +invalid: + free_rpc_bdev_ftl_create(&req); +} + +SPDK_RPC_REGISTER("bdev_ftl_create", rpc_bdev_ftl_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ftl_create, construct_ftl_bdev) + +struct rpc_delete_ftl { + char *name; +}; + +static const struct spdk_json_object_decoder rpc_delete_ftl_decoders[] = { + {"name", offsetof(struct rpc_bdev_ftl_create, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_ftl_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_ftl_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_ftl attrs = {}; + + if (spdk_json_decode_object(params, rpc_delete_ftl_decoders, + SPDK_COUNTOF(rpc_delete_ftl_decoders), + &attrs)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto invalid; + } + + bdev_ftl_delete_bdev(attrs.name, rpc_bdev_ftl_delete_cb, request); +invalid: + free(attrs.name); +} + +SPDK_RPC_REGISTER("bdev_ftl_delete", rpc_bdev_ftl_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ftl_delete, delete_ftl_bdev) diff --git a/src/spdk/module/bdev/gpt/Makefile 
b/src/spdk/module/bdev/gpt/Makefile new file mode 100644 index 000000000..db27dbc38 --- /dev/null +++ b/src/spdk/module/bdev/gpt/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = gpt.c vbdev_gpt.c +LIBNAME = bdev_gpt + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/gpt/gpt.c b/src/spdk/module/bdev/gpt/gpt.c new file mode 100644 index 000000000..d31168b0b --- /dev/null +++ b/src/spdk/module/bdev/gpt/gpt.c @@ -0,0 +1,320 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "gpt.h" + +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/event.h" + +#include "spdk_internal/log.h" + +#define GPT_PRIMARY_PARTITION_TABLE_LBA 0x1 +#define PRIMARY_PARTITION_NUMBER 4 +#define GPT_PROTECTIVE_MBR 1 +#define SPDK_MAX_NUM_PARTITION_ENTRIES 128 + +static uint64_t +gpt_get_expected_head_lba(struct spdk_gpt *gpt) +{ + switch (gpt->parse_phase) { + case SPDK_GPT_PARSE_PHASE_PRIMARY: + return GPT_PRIMARY_PARTITION_TABLE_LBA; + case SPDK_GPT_PARSE_PHASE_SECONDARY: + return gpt->lba_end; + default: + assert(false); + } + return 0; +} + +static struct spdk_gpt_header * +gpt_get_header_buf(struct spdk_gpt *gpt) +{ + switch (gpt->parse_phase) { + case SPDK_GPT_PARSE_PHASE_PRIMARY: + return (struct spdk_gpt_header *) + (gpt->buf + GPT_PRIMARY_PARTITION_TABLE_LBA * gpt->sector_size); + case SPDK_GPT_PARSE_PHASE_SECONDARY: + return (struct spdk_gpt_header *) + (gpt->buf + (gpt->buf_size - gpt->sector_size)); + default: + assert(false); + } + return NULL; +} + +static struct spdk_gpt_partition_entry * +gpt_get_partitions_buf(struct spdk_gpt *gpt, uint64_t total_partition_size, + uint64_t partition_start_lba) +{ + uint64_t secondary_total_size; + + switch (gpt->parse_phase) { + case SPDK_GPT_PARSE_PHASE_PRIMARY: + if ((total_partition_size + partition_start_lba * gpt->sector_size) > + gpt->buf_size) { + SPDK_ERRLOG("Buffer size is not enough\n"); + return NULL; + } + return (struct spdk_gpt_partition_entry *) + (gpt->buf + partition_start_lba * gpt->sector_size); + case SPDK_GPT_PARSE_PHASE_SECONDARY: + secondary_total_size = (gpt->lba_end - partition_start_lba + 1) * gpt->sector_size; + if (secondary_total_size > gpt->buf_size) { + SPDK_ERRLOG("Buffer size is not enough\n"); + return NULL; + } + return (struct spdk_gpt_partition_entry *) + (gpt->buf + (gpt->buf_size - secondary_total_size)); + default: + assert(false); + } + return NULL; +} + +static int +gpt_read_partitions(struct spdk_gpt *gpt) +{ + uint32_t total_partition_size, num_partition_entries, partition_entry_size; + uint64_t partition_start_lba; + struct spdk_gpt_header *head = gpt->header; + uint32_t crc32; + + num_partition_entries = from_le32(&head->num_partition_entries); + if (num_partition_entries > SPDK_MAX_NUM_PARTITION_ENTRIES) { + SPDK_ERRLOG("Num_partition_entries=%u which exceeds max=%u\n", + num_partition_entries, SPDK_MAX_NUM_PARTITION_ENTRIES); + return -1; + } + + partition_entry_size = from_le32(&head->size_of_partition_entry); + if (partition_entry_size != sizeof(struct spdk_gpt_partition_entry)) { + SPDK_ERRLOG("Partition_entry_size(%x) != expected(%lx)\n", + partition_entry_size, sizeof(struct spdk_gpt_partition_entry)); + return -1; + } + + total_partition_size = num_partition_entries * partition_entry_size; + partition_start_lba = from_le64(&head->partition_entry_lba); + gpt->partitions = gpt_get_partitions_buf(gpt, total_partition_size, + partition_start_lba); + if (!gpt->partitions) { + SPDK_ERRLOG("Failed to get gpt partitions buf\n"); 
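+		/* gpt_get_partitions_buf() returns NULL when the entry array
+		 * would not fit in the buffer read from the device, or if the
+		 * parse phase is invalid. */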
+		return -1;
+	}
+
+	crc32 = spdk_crc32_ieee_update(gpt->partitions, total_partition_size, ~0);
+	crc32 ^= ~0;
+
+	if (crc32 != from_le32(&head->partition_entry_array_crc32)) {
+		SPDK_ERRLOG("GPT partition entry array crc32 did not match\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+gpt_lba_range_check(struct spdk_gpt_header *head, uint64_t lba_end)
+{
+	uint64_t usable_lba_start, usable_lba_end;
+
+	usable_lba_start = from_le64(&head->first_usable_lba);
+	usable_lba_end = from_le64(&head->last_usable_lba);
+
+	if (usable_lba_end < usable_lba_start) {
+		SPDK_ERRLOG("Head's usable_lba_end(%" PRIu64 ") < usable_lba_start(%" PRIu64 ")\n",
+			    usable_lba_end, usable_lba_start);
+		return -1;
+	}
+
+	if (usable_lba_end > lba_end) {
+		SPDK_ERRLOG("Head's usable_lba_end(%" PRIu64 ") > lba_end(%" PRIu64 ")\n",
+			    usable_lba_end, lba_end);
+		return -1;
+	}
+
+	if ((usable_lba_start < GPT_PRIMARY_PARTITION_TABLE_LBA) &&
+	    (GPT_PRIMARY_PARTITION_TABLE_LBA < usable_lba_end)) {
+		SPDK_ERRLOG("Head lba is not in the usable range\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+gpt_read_header(struct spdk_gpt *gpt)
+{
+	uint32_t head_size;
+	uint32_t new_crc, original_crc;
+	uint64_t my_lba, head_lba;
+	struct spdk_gpt_header *head;
+
+	head = gpt_get_header_buf(gpt);
+	if (!head) {
+		SPDK_ERRLOG("Failed to get gpt header buf\n");
+		return -1;
+	}
+
+	head_size = from_le32(&head->header_size);
+	if (head_size < sizeof(*head) || head_size > gpt->sector_size) {
+		SPDK_ERRLOG("head_size=%u\n", head_size);
+		return -1;
+	}
+
+	original_crc = from_le32(&head->header_crc32);
+	head->header_crc32 = 0;
+	new_crc = spdk_crc32_ieee_update(head, from_le32(&head->header_size), ~0);
+	new_crc ^= ~0;
+	/* restore header crc32 */
+	to_le32(&head->header_crc32, original_crc);
+
+	if (new_crc != original_crc) {
+		SPDK_ERRLOG("head crc32 does not match, provided=%u, calculated=%u\n",
+			    original_crc, new_crc);
+		return -1;
+	}
+
+	if (memcmp(SPDK_GPT_SIGNATURE, head->gpt_signature,
+		   sizeof(head->gpt_signature))) {
+		SPDK_ERRLOG("signature did not match\n");
+		return -1;
+	}
+
+	head_lba = gpt_get_expected_head_lba(gpt);
+	my_lba = from_le64(&head->my_lba);
+	if (my_lba != head_lba) {
+		SPDK_ERRLOG("head my_lba(%" PRIu64 ") != expected(%" PRIu64 ")\n",
+			    my_lba, head_lba);
+		return -1;
+	}
+
+	if (gpt_lba_range_check(head, gpt->lba_end)) {
+		SPDK_ERRLOG("lba range check error\n");
+		return -1;
+	}
+
+	gpt->header = head;
+	return 0;
+}
+
+static int
+gpt_check_mbr(struct spdk_gpt *gpt)
+{
+	int i, primary_partition = 0;
+	uint32_t total_lba_size = 0, ret = 0, expected_start_lba;
+	struct spdk_mbr *mbr;
+
+	mbr = (struct spdk_mbr *)gpt->buf;
+	if (from_le16(&mbr->mbr_signature) != SPDK_MBR_SIGNATURE) {
+		SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "Signature mismatch, provided=%x, "
+			      "expected=%x\n", from_le16(&mbr->mbr_signature),
+			      SPDK_MBR_SIGNATURE);
+		return -1;
+	}
+
+	for (i = 0; i < PRIMARY_PARTITION_NUMBER; i++) {
+		if (mbr->partitions[i].os_type == SPDK_MBR_OS_TYPE_GPT_PROTECTIVE) {
+			primary_partition = i;
+			ret = GPT_PROTECTIVE_MBR;
+			break;
+		}
+	}
+
+	if (ret == GPT_PROTECTIVE_MBR) {
+		expected_start_lba = GPT_PRIMARY_PARTITION_TABLE_LBA;
+		if (from_le32(&mbr->partitions[primary_partition].start_lba) != expected_start_lba) {
+			SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "start lba mismatch, provided=%u, expected=%u\n",
+				      from_le32(&mbr->partitions[primary_partition].start_lba),
+				      expected_start_lba);
+			return -1;
+		}
+
+		total_lba_size = from_le32(&mbr->partitions[primary_partition].size_lba);
+		if ((total_lba_size !=
((uint32_t) gpt->total_sectors - 1)) && + (total_lba_size != 0xFFFFFFFF)) { + SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, + "GPT protective MBR size mismatch (record_size %u != actual_size %u)\n", + total_lba_size, (uint32_t) gpt->total_sectors - 1); + return -1; + } + } else { + SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "Currently only the GPT Protective MBR format is supported\n"); + return -1; + } + + return 0; +} + +int +gpt_parse_mbr(struct spdk_gpt *gpt) +{ + int rc; + + if (!gpt || !gpt->buf) { + SPDK_ERRLOG("Gpt and the related buffer should not be NULL\n"); + return -1; + } + + rc = gpt_check_mbr(gpt); + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "Failed to detect gpt in MBR\n"); + return rc; + } + + return 0; +} + +int +gpt_parse_partition_table(struct spdk_gpt *gpt) +{ + int rc; + + rc = gpt_read_header(gpt); + if (rc) { + SPDK_ERRLOG("Failed to read gpt header\n"); + return rc; + } + + rc = gpt_read_partitions(gpt); + if (rc) { + SPDK_ERRLOG("Failed to read gpt partitions\n"); + return rc; + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("gpt_parse", SPDK_LOG_GPT_PARSE) diff --git a/src/spdk/module/bdev/gpt/gpt.h b/src/spdk/module/bdev/gpt/gpt.h new file mode 100644 index 000000000..9fa870843 --- /dev/null +++ b/src/spdk/module/bdev/gpt/gpt.h @@ -0,0 +1,70 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +/** \file + * GPT internal Interface + */ + +#ifndef SPDK_INTERNAL_GPT_H +#define SPDK_INTERNAL_GPT_H + +#include "spdk/stdinc.h" + +#include "spdk/gpt_spec.h" + +#define SPDK_GPT_PART_TYPE_GUID SPDK_GPT_GUID(0x7c5222bd, 0x8f5d, 0x4087, 0x9c00, 0xbf9843c7b58c) +#define SPDK_GPT_BUFFER_SIZE 32768 /* 32KB */ +#define SPDK_GPT_GUID_EQUAL(x,y) (memcmp(x, y, sizeof(struct spdk_gpt_guid)) == 0) + +enum spdk_gpt_parse_phase { + SPDK_GPT_PARSE_PHASE_INVALID = 0, + SPDK_GPT_PARSE_PHASE_PRIMARY, + SPDK_GPT_PARSE_PHASE_SECONDARY, +}; + +struct spdk_gpt { + uint8_t parse_phase; + unsigned char *buf; + uint64_t buf_size; + uint64_t lba_start; + uint64_t lba_end; + uint64_t total_sectors; + uint32_t sector_size; + struct spdk_gpt_header *header; + struct spdk_gpt_partition_entry *partitions; +}; + +int gpt_parse_mbr(struct spdk_gpt *gpt); +int gpt_parse_partition_table(struct spdk_gpt *gpt); + +#endif /* SPDK_INTERNAL_GPT_H */ diff --git a/src/spdk/module/bdev/gpt/vbdev_gpt.c b/src/spdk/module/bdev/gpt/vbdev_gpt.c new file mode 100644 index 000000000..5232444fb --- /dev/null +++ b/src/spdk/module/bdev/gpt/vbdev_gpt.c @@ -0,0 +1,565 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This driver reads a GPT partition table from a bdev and exposes a virtual block device for + * each partition. 
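+ * + * Only partition entries whose type GUID matches SPDK_GPT_PART_TYPE_GUID are + * exposed. Each exposed partition bdev is named "<base>p<N>" with N starting + * at 1, so a hypothetical base bdev "Nvme0n1" would, for example, yield + * "Nvme0n1p1" for its first SPDK partition.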
+ */ + +#include "gpt.h" + +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/rpc.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +static int vbdev_gpt_init(void); +static void vbdev_gpt_examine(struct spdk_bdev *bdev); +static int vbdev_gpt_get_ctx_size(void); + +static struct spdk_bdev_module gpt_if = { + .name = "gpt", + .module_init = vbdev_gpt_init, + .get_ctx_size = vbdev_gpt_get_ctx_size, + .examine_disk = vbdev_gpt_examine, + +}; +SPDK_BDEV_MODULE_REGISTER(gpt, &gpt_if) + +/* Base block device gpt context */ +struct gpt_base { + struct spdk_gpt gpt; + struct spdk_bdev_part_base *part_base; + SPDK_BDEV_PART_TAILQ parts; + + /* This channel is only used for reading the partition table. */ + struct spdk_io_channel *ch; +}; + +/* Context for each gpt virtual bdev */ +struct gpt_disk { + struct spdk_bdev_part part; + uint32_t partition_index; +}; + +struct gpt_channel { + struct spdk_bdev_part_channel part_ch; +}; + +struct gpt_io { + struct spdk_io_channel *ch; + struct spdk_bdev_io *bdev_io; + + /* for bdev_io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; +}; + +static bool g_gpt_disabled; + +static void +gpt_base_free(void *ctx) +{ + struct gpt_base *gpt_base = ctx; + + spdk_free(gpt_base->gpt.buf); + free(gpt_base); +} + +static void +gpt_base_bdev_hotremove_cb(void *_part_base) +{ + struct spdk_bdev_part_base *part_base = _part_base; + struct gpt_base *gpt_base = spdk_bdev_part_base_get_ctx(part_base); + + spdk_bdev_part_base_hotremove(part_base, &gpt_base->parts); +} + +static int vbdev_gpt_destruct(void *ctx); +static void vbdev_gpt_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io); +static int vbdev_gpt_dump_info_json(void *ctx, struct spdk_json_write_ctx *w); + +static struct spdk_bdev_fn_table vbdev_gpt_fn_table = { + .destruct = vbdev_gpt_destruct, + .submit_request = vbdev_gpt_submit_request, + .dump_info_json = vbdev_gpt_dump_info_json, +}; + +static struct gpt_base * +gpt_base_bdev_init(struct spdk_bdev *bdev) +{ + struct gpt_base *gpt_base; + struct spdk_gpt *gpt; + + gpt_base = calloc(1, sizeof(*gpt_base)); + if (!gpt_base) { + SPDK_ERRLOG("Cannot alloc memory for gpt_base pointer\n"); + return NULL; + } + + TAILQ_INIT(&gpt_base->parts); + gpt_base->part_base = spdk_bdev_part_base_construct(bdev, + gpt_base_bdev_hotremove_cb, + &gpt_if, &vbdev_gpt_fn_table, + &gpt_base->parts, gpt_base_free, gpt_base, + sizeof(struct gpt_channel), NULL, NULL); + if (!gpt_base->part_base) { + free(gpt_base); + SPDK_ERRLOG("cannot construct gpt_base"); + return NULL; + } + + gpt = &gpt_base->gpt; + gpt->parse_phase = SPDK_GPT_PARSE_PHASE_PRIMARY; + gpt->buf_size = spdk_max(SPDK_GPT_BUFFER_SIZE, bdev->blocklen); + gpt->buf = spdk_zmalloc(gpt->buf_size, spdk_bdev_get_buf_align(bdev), NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!gpt->buf) { + SPDK_ERRLOG("Cannot alloc buf\n"); + spdk_bdev_part_base_free(gpt_base->part_base); + return NULL; + } + + gpt->sector_size = bdev->blocklen; + gpt->total_sectors = bdev->blockcnt; + gpt->lba_start = 0; + gpt->lba_end = gpt->total_sectors - 1; + + return gpt_base; +} + +static int +vbdev_gpt_destruct(void *ctx) +{ + struct gpt_disk *gpt_disk = ctx; + + return spdk_bdev_part_free(&gpt_disk->part); +} + +static void +_vbdev_gpt_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io); + +static void +vbdev_gpt_resubmit_request(void *arg) +{ + struct gpt_io *io = 
(struct gpt_io *)arg; + + _vbdev_gpt_submit_request(io->ch, io->bdev_io); +} + +static void +vbdev_gpt_queue_io(struct gpt_io *io) +{ + struct gpt_channel *ch = spdk_io_channel_get_ctx(io->ch); + int rc; + + io->bdev_io_wait.bdev = io->bdev_io->bdev; + io->bdev_io_wait.cb_fn = vbdev_gpt_resubmit_request; + io->bdev_io_wait.cb_arg = io; + + rc = spdk_bdev_queue_io_wait(io->bdev_io->bdev, + ch->part_ch.base_ch, &io->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vbdev_gpt_queue_io, rc=%d.\n", rc); + spdk_bdev_io_complete(io->bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +vbdev_gpt_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + _vbdev_gpt_submit_request(ch, bdev_io); +} + +static void +_vbdev_gpt_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct gpt_channel *ch = spdk_io_channel_get_ctx(_ch); + struct gpt_io *io = (struct gpt_io *)bdev_io->driver_ctx; + int rc; + + rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "gpt: no memory, queue io\n"); + io->ch = _ch; + io->bdev_io = bdev_io; + vbdev_gpt_queue_io(io); + } else { + SPDK_ERRLOG("gpt: error on bdev_io submission, rc=%d.\n", rc); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +static void +vbdev_gpt_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, vbdev_gpt_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + default: + _vbdev_gpt_submit_request(_ch, bdev_io); + break; + } +} + +static void +write_guid(struct spdk_json_write_ctx *w, const struct spdk_gpt_guid *guid) +{ + spdk_json_write_string_fmt(w, "%08x-%04x-%04x-%04x-%04x%08x", + from_le32(&guid->raw[0]), + from_le16(&guid->raw[4]), + from_le16(&guid->raw[6]), + from_be16(&guid->raw[8]), + from_be16(&guid->raw[10]), + from_be32(&guid->raw[12])); +} + +static void +write_string_utf16le(struct spdk_json_write_ctx *w, const uint16_t *str, size_t max_len) +{ + size_t len; + const uint16_t *p; + + for (len = 0, p = str; len < max_len && *p; p++) { + len++; + } + + spdk_json_write_string_utf16le_raw(w, str, len); +} + +static int +vbdev_gpt_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct gpt_disk *gpt_disk = SPDK_CONTAINEROF(ctx, struct gpt_disk, part); + struct spdk_bdev_part_base *base_bdev = spdk_bdev_part_get_base(&gpt_disk->part); + struct gpt_base *gpt_base = spdk_bdev_part_base_get_ctx(base_bdev); + struct spdk_bdev *part_base_bdev = spdk_bdev_part_base_get_bdev(base_bdev); + struct spdk_gpt *gpt = &gpt_base->gpt; + struct spdk_gpt_partition_entry *gpt_entry = &gpt->partitions[gpt_disk->partition_index]; + uint64_t offset_blocks = spdk_bdev_part_get_offset_blocks(&gpt_disk->part); + + spdk_json_write_named_object_begin(w, "gpt"); + + spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(part_base_bdev)); + + spdk_json_write_named_uint64(w, "offset_blocks", offset_blocks); + + spdk_json_write_name(w, "partition_type_guid"); + write_guid(w, &gpt_entry->part_type_guid); + + spdk_json_write_name(w, "unique_partition_guid"); + write_guid(w, &gpt_entry->unique_partition_guid); + + spdk_json_write_name(w, "partition_name"); + write_string_utf16le(w, gpt_entry->partition_name, 
SPDK_COUNTOF(gpt_entry->partition_name)); + + spdk_json_write_object_end(w); + + return 0; +} + +static int +vbdev_gpt_create_bdevs(struct gpt_base *gpt_base) +{ + uint32_t num_partition_entries; + uint64_t i, head_lba_start, head_lba_end; + uint32_t num_partitions; + struct spdk_gpt_partition_entry *p; + struct gpt_disk *d; + struct spdk_gpt *gpt; + char *name; + struct spdk_bdev *base_bdev; + int rc; + + gpt = &gpt_base->gpt; + num_partition_entries = from_le32(&gpt->header->num_partition_entries); + head_lba_start = from_le64(&gpt->header->first_usable_lba); + head_lba_end = from_le64(&gpt->header->last_usable_lba); + num_partitions = 0; + + for (i = 0; i < num_partition_entries; i++) { + p = &gpt->partitions[i]; + uint64_t lba_start = from_le64(&p->starting_lba); + uint64_t lba_end = from_le64(&p->ending_lba); + + if (!SPDK_GPT_GUID_EQUAL(&gpt->partitions[i].part_type_guid, + &SPDK_GPT_PART_TYPE_GUID) || + lba_start == 0) { + continue; + } + if (lba_start < head_lba_start || lba_end > head_lba_end) { + continue; + } + + d = calloc(1, sizeof(*d)); + if (!d) { + SPDK_ERRLOG("Memory allocation failure\n"); + return -1; + } + + /* partition numbering starts at 1 instead of 0 to match the existing style */ + base_bdev = spdk_bdev_part_base_get_bdev(gpt_base->part_base); + name = spdk_sprintf_alloc("%sp%" PRIu64, spdk_bdev_get_name(base_bdev), i + 1); + if (!name) { + SPDK_ERRLOG("name allocation failure\n"); + free(d); + return -1; + } + + rc = spdk_bdev_part_construct(&d->part, gpt_base->part_base, name, + lba_start, lba_end - lba_start, "GPT Disk"); + free(name); + if (rc) { + SPDK_ERRLOG("could not construct bdev part\n"); + /* 'name' was already freed above; only 'd' needs to be released here */ + free(d); + return -1; + } + num_partitions++; + d->partition_index = i; + } + + return num_partitions; +} + +static void +gpt_read_secondary_table_complete(struct spdk_bdev_io *bdev_io, bool status, void *arg) +{ + struct gpt_base *gpt_base = (struct gpt_base *)arg; + struct spdk_bdev *bdev = spdk_bdev_part_base_get_bdev(gpt_base->part_base); + int rc, num_partitions = 0; + + spdk_bdev_free_io(bdev_io); + spdk_put_io_channel(gpt_base->ch); + gpt_base->ch = NULL; + + if (status != SPDK_BDEV_IO_STATUS_SUCCESS) { + SPDK_ERRLOG("Gpt: bdev=%s io error status=%d\n", + spdk_bdev_get_name(bdev), status); + goto end; + } + + rc = gpt_parse_partition_table(&gpt_base->gpt); + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to parse secondary partition table\n"); + goto end; + } + + SPDK_WARNLOG("Gpt: bdev=%s primary partition table broken, using the secondary\n", + spdk_bdev_get_name(bdev)); + + num_partitions = vbdev_gpt_create_bdevs(gpt_base); + if (num_partitions < 0) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to split dev=%s by gpt table\n", + spdk_bdev_get_name(bdev)); + } + +end: + spdk_bdev_module_examine_done(&gpt_if); + if (num_partitions <= 0) { + /* If no gpt_disk instances were created, free the base context */ + spdk_bdev_part_base_free(gpt_base->part_base); + } +} + +static int +vbdev_gpt_read_secondary_table(struct gpt_base *gpt_base) +{ + struct spdk_gpt *gpt; + struct spdk_bdev_desc *part_base_desc; + uint64_t secondary_offset; + + gpt = &gpt_base->gpt; + gpt->parse_phase = SPDK_GPT_PARSE_PHASE_SECONDARY; + gpt->header = NULL; + gpt->partitions = NULL; + + part_base_desc = spdk_bdev_part_base_get_desc(gpt_base->part_base); + + secondary_offset = gpt->total_sectors * gpt->sector_size - gpt->buf_size; + return spdk_bdev_read(part_base_desc, gpt_base->ch, gpt_base->gpt.buf, secondary_offset, +
gpt_base->gpt.buf_size, gpt_read_secondary_table_complete, + gpt_base); +} + +static void +gpt_bdev_complete(struct spdk_bdev_io *bdev_io, bool status, void *arg) +{ + struct gpt_base *gpt_base = (struct gpt_base *)arg; + struct spdk_bdev *bdev = spdk_bdev_part_base_get_bdev(gpt_base->part_base); + int rc, num_partitions = 0; + + spdk_bdev_free_io(bdev_io); + + if (status != SPDK_BDEV_IO_STATUS_SUCCESS) { + SPDK_ERRLOG("Gpt: bdev=%s io error status=%d\n", + spdk_bdev_get_name(bdev), status); + goto end; + } + + rc = gpt_parse_mbr(&gpt_base->gpt); + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to parse mbr\n"); + goto end; + } + + rc = gpt_parse_partition_table(&gpt_base->gpt); + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to parse primary partition table\n"); + rc = vbdev_gpt_read_secondary_table(gpt_base); + if (rc) { + SPDK_ERRLOG("Failed to read secondary table\n"); + goto end; + } + return; + } + + num_partitions = vbdev_gpt_create_bdevs(gpt_base); + if (num_partitions < 0) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to split dev=%s by gpt table\n", + spdk_bdev_get_name(bdev)); + } + +end: + spdk_put_io_channel(gpt_base->ch); + gpt_base->ch = NULL; + /* + * Notify the generic bdev layer that the actions related to the original examine + * callback are now completed. + */ + spdk_bdev_module_examine_done(&gpt_if); + + /* + * vbdev_gpt_create_bdevs returns the number of bdevs created upon success. + * We can branch on this value. + */ + if (num_partitions <= 0) { + /* If no gpt_disk instances were created, free the base context */ + spdk_bdev_part_base_free(gpt_base->part_base); + } +} + +static int +vbdev_gpt_read_gpt(struct spdk_bdev *bdev) +{ + struct gpt_base *gpt_base; + struct spdk_bdev_desc *part_base_desc; + int rc; + + gpt_base = gpt_base_bdev_init(bdev); + if (!gpt_base) { + SPDK_ERRLOG("Cannot allocate gpt_base\n"); + return -1; + } + + part_base_desc = spdk_bdev_part_base_get_desc(gpt_base->part_base); + gpt_base->ch = spdk_bdev_get_io_channel(part_base_desc); + if (gpt_base->ch == NULL) { + SPDK_ERRLOG("Failed to get an io_channel.\n"); + spdk_bdev_part_base_free(gpt_base->part_base); + return -1; + } + + rc = spdk_bdev_read(part_base_desc, gpt_base->ch, gpt_base->gpt.buf, 0, + gpt_base->gpt.buf_size, gpt_bdev_complete, gpt_base); + if (rc < 0) { + spdk_put_io_channel(gpt_base->ch); + spdk_bdev_part_base_free(gpt_base->part_base); + SPDK_ERRLOG("Failed to send bdev_io command\n"); + return -1; + } + + return 0; +} + +static int +vbdev_gpt_init(void) +{ + struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Gpt"); + + if (sp && spdk_conf_section_get_boolval(sp, "Disable", false)) { + /* Disable Gpt probe */ + g_gpt_disabled = true; + } + + return 0; +} + +static int +vbdev_gpt_get_ctx_size(void) +{ + return sizeof(struct gpt_io); +} + +static void +vbdev_gpt_examine(struct spdk_bdev *bdev) +{ + int rc; + + /* A bdev with fewer than 2 blocks cannot have a GPT. Block 0 has + * the MBR and block 1 has the GPT header.
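+ * The checks below likewise reject block sizes that are not a multiple + * of 512, which the GPT parsing in this module does not support.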
+ */ + if (g_gpt_disabled || spdk_bdev_get_num_blocks(bdev) < 2) { + spdk_bdev_module_examine_done(&gpt_if); + return; + } + + if (spdk_bdev_get_block_size(bdev) % 512 != 0) { + SPDK_ERRLOG("GPT module does not support block size %" PRIu32 " for bdev %s\n", + spdk_bdev_get_block_size(bdev), spdk_bdev_get_name(bdev)); + spdk_bdev_module_examine_done(&gpt_if); + return; + } + + rc = vbdev_gpt_read_gpt(bdev); + if (rc) { + spdk_bdev_module_examine_done(&gpt_if); + SPDK_ERRLOG("Failed to read info from bdev %s\n", spdk_bdev_get_name(bdev)); + } +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_gpt", SPDK_LOG_VBDEV_GPT) diff --git a/src/spdk/module/bdev/iscsi/Makefile b/src/spdk/module/bdev/iscsi/Makefile new file mode 100644 index 000000000..38ba8b709 --- /dev/null +++ b/src/spdk/module/bdev/iscsi/Makefile @@ -0,0 +1,51 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ +# CentOS 7 libiscsi package has functions declared inline but not +# defined in the header file. Not aware of any way to disable +# this warning so just make sure the warning isn't treated as +# an error. +CFLAGS += -Wno-error +C_SRCS = bdev_iscsi.c bdev_iscsi_rpc.c +LIBNAME = bdev_iscsi + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/iscsi/bdev_iscsi.c b/src/spdk/module/bdev/iscsi/bdev_iscsi.c new file mode 100644 index 000000000..18e8e0090 --- /dev/null +++ b/src/spdk/module/bdev/iscsi/bdev_iscsi.c @@ -0,0 +1,936 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/fd.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/util.h" +#include "spdk/rpc.h" +#include "spdk/string.h" +#include "spdk/iscsi_spec.h" + +#include "spdk_internal/log.h" +#include "spdk/bdev_module.h" + +#include "iscsi/iscsi.h" +#include "iscsi/scsi-lowlevel.h" + +#include "bdev_iscsi.h" + +struct bdev_iscsi_lun; + +#define BDEV_ISCSI_CONNECTION_POLL_US 500 /* 0.5 ms */ +#define BDEV_ISCSI_NO_MASTER_CH_POLL_US 10000 /* 10ms */ + +#define DEFAULT_INITIATOR_NAME "iqn.2016-06.io.spdk:init" + +static int bdev_iscsi_initialize(void); +static TAILQ_HEAD(, bdev_iscsi_conn_req) g_iscsi_conn_req = TAILQ_HEAD_INITIALIZER( + g_iscsi_conn_req); +static struct spdk_poller *g_conn_poller = NULL; + +struct bdev_iscsi_io { + struct spdk_thread *submit_td; + enum spdk_bdev_io_status status; + int scsi_status; + enum spdk_scsi_sense sk; + uint8_t asc; + uint8_t ascq; +}; + +struct bdev_iscsi_lun { + struct spdk_bdev bdev; + struct iscsi_context *context; + char *initiator_iqn; + int lun_id; + char *url; + pthread_mutex_t mutex; + uint32_t ch_count; + struct spdk_thread *master_td; + struct spdk_poller *no_master_ch_poller; + struct spdk_thread *no_master_ch_poller_td; + bool unmap_supported; + struct spdk_poller *poller; +}; + +struct bdev_iscsi_io_channel { + struct bdev_iscsi_lun *lun; +}; + +struct bdev_iscsi_conn_req { + char *url; + char *bdev_name; + char *initiator_iqn; + struct iscsi_context *context; + spdk_bdev_iscsi_create_cb create_cb; + void *create_cb_arg; + bool unmap_supported; + int lun; + int status; + TAILQ_ENTRY(bdev_iscsi_conn_req) link; +}; + +static void +complete_conn_req(struct bdev_iscsi_conn_req *req, struct spdk_bdev *bdev, + int status) +{ + TAILQ_REMOVE(&g_iscsi_conn_req, req, link); + req->create_cb(req->create_cb_arg, bdev, status); + + /* + * we are still running in the context of iscsi_service() + * so do not tear down its data structures 
here + */ + req->status = status; +} + +static int +bdev_iscsi_get_ctx_size(void) +{ + return sizeof(struct bdev_iscsi_io); +} + +static void +_iscsi_free_lun(void *arg) +{ + struct bdev_iscsi_lun *lun = arg; + + assert(lun != NULL); + iscsi_destroy_context(lun->context); + pthread_mutex_destroy(&lun->mutex); + free(lun->bdev.name); + free(lun->url); + free(lun->initiator_iqn); + + spdk_bdev_destruct_done(&lun->bdev, 0); + free(lun); +} + +static void +_bdev_iscsi_conn_req_free(struct bdev_iscsi_conn_req *req) +{ + free(req->initiator_iqn); + free(req->bdev_name); + free(req->url); + /* destroy will call iscsi_disconnect() implicitly if connected */ + iscsi_destroy_context(req->context); + free(req); +} + +static void +bdev_iscsi_finish(void) +{ + struct bdev_iscsi_conn_req *req, *tmp; + + /* clear out pending connection requests here. We cannot + * simply set the state to a non-SCSI_STATUS_GOOD state as + * the connection poller won't run anymore + */ + TAILQ_FOREACH_SAFE(req, &g_iscsi_conn_req, link, tmp) { + _bdev_iscsi_conn_req_free(req); + } + + if (g_conn_poller) { + spdk_poller_unregister(&g_conn_poller); + } +} + +static struct spdk_bdev_module g_iscsi_bdev_module = { + .name = "iscsi", + .module_init = bdev_iscsi_initialize, + .module_fini = bdev_iscsi_finish, + .get_ctx_size = bdev_iscsi_get_ctx_size, + .async_init = true, +}; + +SPDK_BDEV_MODULE_REGISTER(iscsi, &g_iscsi_bdev_module); + +static void +_bdev_iscsi_io_complete(void *_iscsi_io) +{ + struct bdev_iscsi_io *iscsi_io = _iscsi_io; + + if (iscsi_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) { + spdk_bdev_io_complete_scsi_status(spdk_bdev_io_from_ctx(iscsi_io), iscsi_io->scsi_status, + iscsi_io->sk, iscsi_io->asc, iscsi_io->ascq); + } else { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(iscsi_io), iscsi_io->status); + } +} + +static void +bdev_iscsi_io_complete(struct bdev_iscsi_io *iscsi_io, enum spdk_bdev_io_status status) +{ + iscsi_io->status = status; + if (iscsi_io->submit_td != NULL) { + spdk_thread_send_msg(iscsi_io->submit_td, _bdev_iscsi_io_complete, iscsi_io); + } else { + _bdev_iscsi_io_complete(iscsi_io); + } +} + +/* Common callback function for read/write/flush commands */ +static void +bdev_iscsi_command_cb(struct iscsi_context *context, int status, void *_task, void *_iscsi_io) +{ + struct scsi_task *task = _task; + struct bdev_iscsi_io *iscsi_io = _iscsi_io; + + iscsi_io->scsi_status = status; + iscsi_io->sk = (uint8_t)task->sense.key; + iscsi_io->asc = (task->sense.ascq >> 8) & 0xFF; + iscsi_io->ascq = task->sense.ascq & 0xFF; + + scsi_free_scsi_task(task); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_SUCCESS); +} + +static void +bdev_iscsi_readv(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io, + struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t lba) +{ + struct scsi_task *task; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI_INIT, "read %d iovs size %lu from lba: %#lx\n", + iovcnt, nbytes, lba); + + task = iscsi_read16_task(lun->context, lun->lun_id, lba, nbytes, lun->bdev.blocklen, 0, 0, 0, 0, 0, + bdev_iscsi_command_cb, iscsi_io); + if (task == NULL) { + SPDK_ERRLOG("failed to get read16_task\n"); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + +#if defined(LIBISCSI_FEATURE_IOVECTOR) + scsi_task_set_iov_in(task, (struct scsi_iovec *)iov, iovcnt); +#else + int i; + for (i = 0; i < iovcnt; i++) { + scsi_task_add_data_in_buffer(task, iov[i].iov_len, iov[i].iov_base); + } +#endif +} + +static void +bdev_iscsi_writev(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io
*iscsi_io, + struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t lba) +{ + struct scsi_task *task; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI_INIT, "write %d iovs size %lu to lba: %#lx\n", + iovcnt, nbytes, lba); + + task = iscsi_write16_task(lun->context, lun->lun_id, lba, NULL, nbytes, lun->bdev.blocklen, 0, 0, 0, + 0, 0, + bdev_iscsi_command_cb, iscsi_io); + if (task == NULL) { + SPDK_ERRLOG("failed to get write16_task\n"); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + +#if defined(LIBISCSI_FEATURE_IOVECTOR) + scsi_task_set_iov_out(task, (struct scsi_iovec *)iov, iovcnt); +#else + int i; + for (i = 0; i < iovcnt; i++) { + scsi_task_add_data_out_buffer(task, iov[i].iov_len, iov[i].iov_base); + } +#endif +} + +static void +bdev_iscsi_destruct_cb(void *ctx) +{ + struct bdev_iscsi_lun *lun = ctx; + + spdk_poller_unregister(&lun->no_master_ch_poller); + spdk_io_device_unregister(lun, _iscsi_free_lun); +} + +static int +bdev_iscsi_destruct(void *ctx) +{ + struct bdev_iscsi_lun *lun = ctx; + + assert(lun->no_master_ch_poller_td); + spdk_thread_send_msg(lun->no_master_ch_poller_td, bdev_iscsi_destruct_cb, lun); + return 1; +} + +static void +bdev_iscsi_flush(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io, uint32_t num_blocks, + int immed, uint64_t lba) +{ + struct scsi_task *task; + + task = iscsi_synchronizecache16_task(lun->context, lun->lun_id, lba, + num_blocks, 0, immed, bdev_iscsi_command_cb, iscsi_io); + if (task == NULL) { + SPDK_ERRLOG("failed to get sync16_task\n"); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } +} + +static void +bdev_iscsi_unmap(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io, + uint64_t lba, uint64_t num_blocks) +{ + struct scsi_task *task; + struct unmap_list list[1]; + + list[0].lba = lba; + list[0].num = num_blocks; + task = iscsi_unmap_task(lun->context, lun->lun_id, 0, 0, list, 1, + bdev_iscsi_command_cb, iscsi_io); + if (task == NULL) { + SPDK_ERRLOG("failed to get unmap_task\n"); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } +} + +static void +bdev_iscsi_reset_cb(struct iscsi_context *context __attribute__((unused)), int status, + void *command_data, void *private_data) +{ + uint32_t tmf_response; + struct bdev_iscsi_io *iscsi_io = private_data; + + tmf_response = *(uint32_t *)command_data; + if (tmf_response == ISCSI_TASK_FUNC_RESP_COMPLETE) { + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +_bdev_iscsi_reset(void *_bdev_io) +{ + int rc; + struct spdk_bdev_io *bdev_io = _bdev_io; + struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt; + struct bdev_iscsi_io *iscsi_io = (struct bdev_iscsi_io *)bdev_io->driver_ctx; + struct iscsi_context *context = lun->context; + + rc = iscsi_task_mgmt_lun_reset_async(context, lun->lun_id, + bdev_iscsi_reset_cb, iscsi_io); + if (rc != 0) { + SPDK_ERRLOG("failed to do iscsi reset\n"); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } +} + +static void +bdev_iscsi_reset(struct spdk_bdev_io *bdev_io) +{ + struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt; + spdk_thread_send_msg(lun->master_td, _bdev_iscsi_reset, bdev_io); +} + +static int +bdev_iscsi_poll_lun(void *_lun) +{ + struct bdev_iscsi_lun *lun = _lun; + struct pollfd pfd = {}; + + pfd.fd = iscsi_get_fd(lun->context); + pfd.events = iscsi_which_events(lun->context); + + if (poll(&pfd,
1, 0) < 0) { + SPDK_ERRLOG("poll failed\n"); + return SPDK_POLLER_IDLE; + } + + if (pfd.revents != 0) { + if (iscsi_service(lun->context, pfd.revents) < 0) { + SPDK_ERRLOG("iscsi_service failed: %s\n", iscsi_get_error(lun->context)); + } + + return SPDK_POLLER_BUSY; + } + + return SPDK_POLLER_IDLE; +} + +static int +bdev_iscsi_no_master_ch_poll(void *arg) +{ + struct bdev_iscsi_lun *lun = arg; + enum spdk_thread_poller_rc rc = SPDK_POLLER_IDLE; + + if (pthread_mutex_trylock(&lun->mutex)) { + /* Don't care about the error code here. */ + return SPDK_POLLER_IDLE; + } + + if (lun->ch_count == 0) { + rc = bdev_iscsi_poll_lun(arg); + } + + pthread_mutex_unlock(&lun->mutex); + return rc; +} + +static void +bdev_iscsi_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + bdev_iscsi_readv((struct bdev_iscsi_lun *)bdev_io->bdev->ctxt, + (struct bdev_iscsi_io *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks); +} + +static void _bdev_iscsi_submit_request(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + struct bdev_iscsi_io *iscsi_io = (struct bdev_iscsi_io *)bdev_io->driver_ctx; + struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_iscsi_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_iscsi_writev(lun, iscsi_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + bdev_iscsi_flush(lun, iscsi_io, + bdev_io->u.bdev.num_blocks, + ISCSI_IMMEDIATE_DATA_NO, + bdev_io->u.bdev.offset_blocks); + break; + case SPDK_BDEV_IO_TYPE_RESET: + bdev_iscsi_reset(bdev_io); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + bdev_iscsi_unmap(lun, iscsi_io, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks); + break; + default: + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + break; + } +} + +static void bdev_iscsi_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct spdk_thread *submit_td = spdk_io_channel_get_thread(_ch); + struct bdev_iscsi_io *iscsi_io = (struct bdev_iscsi_io *)bdev_io->driver_ctx; + struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt; + + if (lun->master_td != submit_td) { + iscsi_io->submit_td = submit_td; + spdk_thread_send_msg(lun->master_td, _bdev_iscsi_submit_request, bdev_io); + return; + } else { + iscsi_io->submit_td = NULL; + } + + _bdev_iscsi_submit_request(bdev_io); +} + +static bool +bdev_iscsi_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct bdev_iscsi_lun *lun = ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + + case SPDK_BDEV_IO_TYPE_UNMAP: + return lun->unmap_supported; + default: + return false; + } +} + +static int +bdev_iscsi_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_iscsi_io_channel *ch = ctx_buf; + struct bdev_iscsi_lun *lun = io_device; + + pthread_mutex_lock(&lun->mutex); + if (lun->ch_count == 0) { + assert(lun->master_td == NULL); + lun->master_td = spdk_get_thread(); + 
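+ /* The thread that creates the first channel becomes the master thread: + * the libiscsi context is only ever driven from this thread, and + * bdev_iscsi_submit_request() forwards I/O submitted on other threads + * here via spdk_thread_send_msg(). */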
lun->poller = SPDK_POLLER_REGISTER(bdev_iscsi_poll_lun, lun, 0); + ch->lun = lun; + } + lun->ch_count++; + pthread_mutex_unlock(&lun->mutex); + + return 0; +} + +static void +_iscsi_destroy_cb(void *ctx) +{ + struct bdev_iscsi_lun *lun = ctx; + + pthread_mutex_lock(&lun->mutex); + + assert(lun->master_td == spdk_get_thread()); + assert(lun->ch_count > 0); + + lun->ch_count--; + if (lun->ch_count > 0) { + pthread_mutex_unlock(&lun->mutex); + return; + } + + lun->master_td = NULL; + spdk_poller_unregister(&lun->poller); + + pthread_mutex_unlock(&lun->mutex); +} + +static void +bdev_iscsi_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_iscsi_lun *lun = io_device; + struct spdk_thread *thread; + + pthread_mutex_lock(&lun->mutex); + lun->ch_count--; + if (lun->ch_count == 0) { + assert(lun->master_td != NULL); + + if (lun->master_td != spdk_get_thread()) { + /* The final channel was destroyed on a different thread + * than where the first channel was created. Pass a message + * to the master thread to unregister the poller. */ + lun->ch_count++; + thread = lun->master_td; + pthread_mutex_unlock(&lun->mutex); + spdk_thread_send_msg(thread, _iscsi_destroy_cb, lun); + return; + } + + lun->master_td = NULL; + spdk_poller_unregister(&lun->poller); + } + pthread_mutex_unlock(&lun->mutex); +} + +static struct spdk_io_channel * +bdev_iscsi_get_io_channel(void *ctx) +{ + struct bdev_iscsi_lun *lun = ctx; + + return spdk_get_io_channel(lun); +} + +static int +bdev_iscsi_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct bdev_iscsi_lun *lun = ctx; + + spdk_json_write_named_object_begin(w, "iscsi"); + spdk_json_write_named_string(w, "initiator_name", lun->initiator_iqn); + spdk_json_write_named_string(w, "url", lun->url); + spdk_json_write_object_end(w); + + return 0; +} + +static void +bdev_iscsi_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct bdev_iscsi_lun *lun = bdev->ctxt; + + pthread_mutex_lock(&lun->mutex); + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_iscsi_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_string(w, "initiator_iqn", lun->initiator_iqn); + spdk_json_write_named_string(w, "url", lun->url); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + pthread_mutex_unlock(&lun->mutex); +} + +static const struct spdk_bdev_fn_table iscsi_fn_table = { + .destruct = bdev_iscsi_destruct, + .submit_request = bdev_iscsi_submit_request, + .io_type_supported = bdev_iscsi_io_type_supported, + .get_io_channel = bdev_iscsi_get_io_channel, + .dump_info_json = bdev_iscsi_dump_info_json, + .write_config_json = bdev_iscsi_write_config_json, +}; + +static int +create_iscsi_lun(struct iscsi_context *context, int lun_id, char *url, char *initiator_iqn, + char *name, + uint64_t num_blocks, uint32_t block_size, struct spdk_bdev **bdev, bool unmap_supported) +{ + struct bdev_iscsi_lun *lun; + int rc; + + lun = calloc(1, sizeof(*lun)); + if (!lun) { + SPDK_ERRLOG("Unable to allocate enough memory for iscsi backend\n"); + return -ENOMEM; + } + + lun->context = context; + lun->lun_id = lun_id; + lun->url = url; + lun->initiator_iqn = initiator_iqn; + + pthread_mutex_init(&lun->mutex, NULL); + + lun->bdev.name = name; + lun->bdev.product_name = "iSCSI LUN"; + lun->bdev.module = &g_iscsi_bdev_module; + lun->bdev.blocklen = block_size; + lun->bdev.blockcnt = num_blocks; + lun->bdev.ctxt = lun; +
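+ /* On success the lun takes ownership of 'context', 'name', 'url' and + * 'initiator_iqn'; they are released in _iscsi_free_lun() when the + * bdev is destructed. */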
lun->unmap_supported = unmap_supported; + + lun->bdev.fn_table = &iscsi_fn_table; + + spdk_io_device_register(lun, bdev_iscsi_create_cb, bdev_iscsi_destroy_cb, + sizeof(struct bdev_iscsi_io_channel), + name); + rc = spdk_bdev_register(&lun->bdev); + if (rc) { + spdk_io_device_unregister(lun, NULL); + pthread_mutex_destroy(&lun->mutex); + free(lun); + return rc; + } + + lun->no_master_ch_poller_td = spdk_get_thread(); + lun->no_master_ch_poller = SPDK_POLLER_REGISTER(bdev_iscsi_no_master_ch_poll, lun, + BDEV_ISCSI_NO_MASTER_CH_POLL_US); + + *bdev = &lun->bdev; + return 0; +} + +static void +iscsi_readcapacity16_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *private_data) +{ + struct bdev_iscsi_conn_req *req = private_data; + struct scsi_readcapacity16 *readcap16; + struct spdk_bdev *bdev = NULL; + struct scsi_task *task = command_data; + + if (status != SPDK_SCSI_STATUS_GOOD) { + SPDK_ERRLOG("iSCSI error: %s\n", iscsi_get_error(iscsi)); + goto ret; + } + + readcap16 = scsi_datain_unmarshall(task); + if (!readcap16) { + status = -ENOMEM; + goto ret; + } + + status = create_iscsi_lun(req->context, req->lun, req->url, req->initiator_iqn, req->bdev_name, + readcap16->returned_lba + 1, readcap16->block_length, &bdev, req->unmap_supported); + if (status) { + SPDK_ERRLOG("Unable to create iscsi bdev: %s (%d)\n", spdk_strerror(-status), status); + } + +ret: + scsi_free_scsi_task(task); + complete_conn_req(req, bdev, status); +} + +static void +bdev_iscsi_inquiry_cb(struct iscsi_context *context, int status, void *_task, void *private_data) +{ + struct scsi_task *task = _task; + struct scsi_inquiry_logical_block_provisioning *lbp_inq = NULL; + struct bdev_iscsi_conn_req *req = private_data; + + if (status == SPDK_SCSI_STATUS_GOOD) { + lbp_inq = scsi_datain_unmarshall(task); + if (lbp_inq != NULL && lbp_inq->lbpu) { + req->unmap_supported = true; + } + } + + task = iscsi_readcapacity16_task(context, req->lun, iscsi_readcapacity16_cb, req); + if (task) { + return; + } + + SPDK_ERRLOG("iSCSI error: %s\n", iscsi_get_error(req->context)); + complete_conn_req(req, NULL, status); +} + +static void +iscsi_connect_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *private_data) +{ + struct bdev_iscsi_conn_req *req = private_data; + struct scsi_task *task; + + if (status != SPDK_SCSI_STATUS_GOOD) { + goto ret; + } + + task = iscsi_inquiry_task(iscsi, req->lun, 1, + SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING, + 255, bdev_iscsi_inquiry_cb, req); + if (task) { + return; + } + +ret: + SPDK_ERRLOG("iSCSI error: %s\n", iscsi_get_error(req->context)); + complete_conn_req(req, NULL, status); +} + +static int +iscsi_bdev_conn_poll(void *arg) +{ + struct bdev_iscsi_conn_req *req, *tmp; + struct pollfd pfd; + struct iscsi_context *context; + + if (TAILQ_EMPTY(&g_iscsi_conn_req)) { + return SPDK_POLLER_IDLE; + } + + TAILQ_FOREACH_SAFE(req, &g_iscsi_conn_req, link, tmp) { + context = req->context; + pfd.fd = iscsi_get_fd(context); + pfd.events = iscsi_which_events(context); + pfd.revents = 0; + if (poll(&pfd, 1, 0) < 0) { + SPDK_ERRLOG("poll failed\n"); + return SPDK_POLLER_BUSY; + } + + if (pfd.revents != 0) { + if (iscsi_service(context, pfd.revents) < 0) { + SPDK_ERRLOG("iscsi_service failed: %s\n", iscsi_get_error(context)); + } + } + + if (req->status == 0) { + /* + * The request completed successfully. + */ + free(req); + } else if (req->status > 0) { + /* + * An error has occurred during connecting. 
This req has already + * been removed from the g_iscsi_conn_req list, but we needed to + * wait until iscsi_service unwound before we could free the req. + */ + _bdev_iscsi_conn_req_free(req); + } + } + return SPDK_POLLER_BUSY; +} + +int +create_iscsi_disk(const char *bdev_name, const char *url, const char *initiator_iqn, + spdk_bdev_iscsi_create_cb cb_fn, void *cb_arg) +{ + struct bdev_iscsi_conn_req *req; + struct iscsi_url *iscsi_url = NULL; + int rc; + + if (!bdev_name || !url || !initiator_iqn || strlen(initiator_iqn) == 0 || !cb_fn) { + return -EINVAL; + } + + req = calloc(1, sizeof(struct bdev_iscsi_conn_req)); + if (!req) { + SPDK_ERRLOG("Cannot allocate pointer of struct bdev_iscsi_conn_req\n"); + return -ENOMEM; + } + + req->status = SCSI_STATUS_GOOD; + req->bdev_name = strdup(bdev_name); + req->url = strdup(url); + req->initiator_iqn = strdup(initiator_iqn); + req->context = iscsi_create_context(initiator_iqn); + if (!req->bdev_name || !req->url || !req->initiator_iqn || !req->context) { + SPDK_ERRLOG("Out of memory\n"); + rc = -ENOMEM; + goto err; + } + + req->create_cb = cb_fn; + req->create_cb_arg = cb_arg; + + iscsi_url = iscsi_parse_full_url(req->context, url); + if (iscsi_url == NULL) { + SPDK_ERRLOG("could not parse URL: %s\n", iscsi_get_error(req->context)); + rc = -EINVAL; + goto err; + } + + req->lun = iscsi_url->lun; + rc = iscsi_set_session_type(req->context, ISCSI_SESSION_NORMAL); + rc = rc ? rc : iscsi_set_header_digest(req->context, ISCSI_HEADER_DIGEST_NONE); + rc = rc ? rc : iscsi_set_targetname(req->context, iscsi_url->target); + rc = rc ? rc : iscsi_full_connect_async(req->context, iscsi_url->portal, iscsi_url->lun, + iscsi_connect_cb, req); + if (rc == 0 && iscsi_url->user[0] != '\0') { + rc = iscsi_set_initiator_username_pwd(req->context, iscsi_url->user, iscsi_url->passwd); + } + + if (rc < 0) { + SPDK_ERRLOG("Failed to connect provided URL=%s: %s\n", url, iscsi_get_error(req->context)); + goto err; + } + + iscsi_destroy_url(iscsi_url); + req->status = -1; + TAILQ_INSERT_TAIL(&g_iscsi_conn_req, req, link); + if (!g_conn_poller) { + g_conn_poller = SPDK_POLLER_REGISTER(iscsi_bdev_conn_poll, NULL, BDEV_ISCSI_CONNECTION_POLL_US); + } + + return 0; + +err: + /* iscsi_destroy_url() is not NULL-proof */ + if (iscsi_url) { + iscsi_destroy_url(iscsi_url); + } + + if (req->context) { + iscsi_destroy_context(req->context); + } + + free(req->initiator_iqn); + free(req->bdev_name); + free(req->url); + free(req); + return rc; +} + +void +delete_iscsi_disk(struct spdk_bdev *bdev, spdk_delete_iscsi_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &g_iscsi_bdev_module) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static void +bdev_iscsi_initialize_cb(void *cb_arg, struct spdk_bdev *bdev, int status) +{ + if (TAILQ_EMPTY(&g_iscsi_conn_req)) { + spdk_bdev_module_init_done(&g_iscsi_bdev_module); + } +} + +static int +bdev_iscsi_initialize(void) +{ + struct spdk_conf_section *sp; + + const char *url, *bdev_name, *initiator_iqn; + int i, rc; + + sp = spdk_conf_find_section(NULL, "iSCSI_Initiator"); + if (sp == NULL) { + spdk_bdev_module_init_done(&g_iscsi_bdev_module); + return 0; + } + + initiator_iqn = spdk_conf_section_get_val(sp, "initiator_name"); + if (!initiator_iqn) { + initiator_iqn = DEFAULT_INITIATOR_NAME; + } + + rc = 0; + for (i = 0; (url = spdk_conf_section_get_nmval(sp, "URL", i, 0)) != NULL; i++) { + bdev_name = spdk_conf_section_get_nmval(sp, "URL", i, 1); + if (bdev_name == NULL) { + 
SPDK_ERRLOG("no bdev name specified for URL %s\n", url); + rc = -EINVAL; + break; + } + + rc = create_iscsi_disk(bdev_name, url, initiator_iqn, bdev_iscsi_initialize_cb, NULL); + if (rc) { + break; + } + } + + if (i == 0) { + spdk_bdev_module_init_done(&g_iscsi_bdev_module); + } + + return rc; +} + +SPDK_LOG_REGISTER_COMPONENT("iscsi_init", SPDK_LOG_ISCSI_INIT) diff --git a/src/spdk/module/bdev/iscsi/bdev_iscsi.h b/src/spdk/module/bdev/iscsi/bdev_iscsi.h new file mode 100644 index 000000000..6a343123b --- /dev/null +++ b/src/spdk/module/bdev/iscsi/bdev_iscsi.h @@ -0,0 +1,75 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_ISCSI_H +#define SPDK_BDEV_ISCSI_H + +#include "spdk/bdev.h" + +typedef void (*spdk_delete_iscsi_complete)(void *cb_arg, int bdeverrno); + +/** + * SPDK bdev iSCSI callback type. + * + * \param cb_arg Completion callback custom arguments + * \param bdev created bdev + * \param status operation status. Zero on success. + */ +typedef void (*spdk_bdev_iscsi_create_cb)(void *cb_arg, struct spdk_bdev *bdev, int status); + +/** + * Create new iSCSI bdev. + * + * \warning iSCSI URL allow providing login and password. Be careful because + * they will show up in configuration dump. + * + * \param name name for new bdev. + * \param url iSCSI URL string. + * \param initiator_iqn connection iqn name we identify to target as + * \param cb_fn Completion callback + * \param cb_arg Completion callback custom arguments + * \return 0 on success or negative error code. If success bdev with provided name was created. + */ +int create_iscsi_disk(const char *bdev_name, const char *url, const char *initiator_iqn, + spdk_bdev_iscsi_create_cb cb_fn, void *cb_arg); + +/** + * Delete iSCSI bdev. + * + * \param bdev Pointer to iSCSI bdev. 
+ * \param cb_fn Completion callback + * \param cb_arg Completion callback custom arguments + */ +void delete_iscsi_disk(struct spdk_bdev *bdev, spdk_delete_iscsi_complete cb_fn, void *cb_arg); + +#endif /* SPDK_BDEV_ISCSI_H */ diff --git a/src/spdk/module/bdev/iscsi/bdev_iscsi_rpc.c b/src/spdk/module/bdev/iscsi/bdev_iscsi_rpc.c new file mode 100644 index 000000000..5c3bdf551 --- /dev/null +++ b/src/spdk/module/bdev/iscsi/bdev_iscsi_rpc.c @@ -0,0 +1,158 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "bdev_iscsi.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +struct rpc_bdev_iscsi_create { + char *name; + char *initiator_iqn; + char *url; +}; + +static const struct spdk_json_object_decoder rpc_bdev_iscsi_create_decoders[] = { + {"name", offsetof(struct rpc_bdev_iscsi_create, name), spdk_json_decode_string}, + {"initiator_iqn", offsetof(struct rpc_bdev_iscsi_create, initiator_iqn), spdk_json_decode_string}, + {"url", offsetof(struct rpc_bdev_iscsi_create, url), spdk_json_decode_string}, +}; + +static void +free_rpc_bdev_iscsi_create(struct rpc_bdev_iscsi_create *req) +{ + free(req->name); + free(req->initiator_iqn); + free(req->url); +} + +static void +bdev_iscsi_create_cb(void *cb_arg, struct spdk_bdev *bdev, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + if (status > 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iSCSI error (%d).", status); + } else if (status < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-status)); + } else { + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + } +} + +static void +rpc_bdev_iscsi_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_iscsi_create req = {}; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_bdev_iscsi_create_decoders, + SPDK_COUNTOF(rpc_bdev_iscsi_create_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = create_iscsi_disk(req.name, req.url, req.initiator_iqn, bdev_iscsi_create_cb, request); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + } + +cleanup: + free_rpc_bdev_iscsi_create(&req); +} +SPDK_RPC_REGISTER("bdev_iscsi_create", rpc_bdev_iscsi_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_iscsi_create, construct_iscsi_bdev) + +struct rpc_delete_iscsi { + char *name; +}; + +static void +free_rpc_delete_iscsi(struct rpc_delete_iscsi *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_delete_iscsi_decoders[] = { + {"name", offsetof(struct rpc_delete_iscsi, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_iscsi_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_iscsi_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_iscsi req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_delete_iscsi_decoders, + SPDK_COUNTOF(rpc_delete_iscsi_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + delete_iscsi_disk(bdev, rpc_bdev_iscsi_delete_cb, request); + +cleanup: + free_rpc_delete_iscsi(&req); +} +SPDK_RPC_REGISTER("bdev_iscsi_delete", 
rpc_bdev_iscsi_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_iscsi_delete, delete_iscsi_bdev) diff --git a/src/spdk/module/bdev/lvol/Makefile b/src/spdk/module/bdev/lvol/Makefile new file mode 100644 index 000000000..37034593c --- /dev/null +++ b/src/spdk/module/bdev/lvol/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = vbdev_lvol.c vbdev_lvol_rpc.c +LIBNAME = bdev_lvol +LOCAL_SYS_LIBS = -luuid + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/lvol/vbdev_lvol.c b/src/spdk/module/bdev/lvol/vbdev_lvol.c new file mode 100644 index 000000000..275d68e6a --- /dev/null +++ b/src/spdk/module/bdev/lvol/vbdev_lvol.c @@ -0,0 +1,1354 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/blob_bdev.h" +#include "spdk/rpc.h" +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" +#include "spdk/uuid.h" + +#include "vbdev_lvol.h" + +static TAILQ_HEAD(, lvol_store_bdev) g_spdk_lvol_pairs = TAILQ_HEAD_INITIALIZER( + g_spdk_lvol_pairs); + +static int vbdev_lvs_init(void); +static int vbdev_lvs_get_ctx_size(void); +static void vbdev_lvs_examine(struct spdk_bdev *bdev); + +static struct spdk_bdev_module g_lvol_if = { + .name = "lvol", + .module_init = vbdev_lvs_init, + .examine_disk = vbdev_lvs_examine, + .get_ctx_size = vbdev_lvs_get_ctx_size, + +}; + +SPDK_BDEV_MODULE_REGISTER(lvol, &g_lvol_if) + +struct lvol_store_bdev * +vbdev_get_lvs_bdev_by_lvs(struct spdk_lvol_store *lvs_orig) +{ + struct spdk_lvol_store *lvs = NULL; + struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first(); + + while (lvs_bdev != NULL) { + lvs = lvs_bdev->lvs; + if (lvs == lvs_orig) { + if (lvs_bdev->req != NULL) { + /* We do not allow access to lvs that are being destroyed */ + return NULL; + } else { + return lvs_bdev; + } + } + lvs_bdev = vbdev_lvol_store_next(lvs_bdev); + } + + return NULL; +} + +static int +_vbdev_lvol_change_bdev_alias(struct spdk_lvol *lvol, const char *new_lvol_name) +{ + struct spdk_bdev_alias *tmp; + char *old_alias; + char *alias; + int rc; + int alias_number = 0; + + /* bdev representing lvols have only one alias, + * while we changed lvs name earlier, we have to iterate alias list to get one, + * and check if there is only one alias */ + + TAILQ_FOREACH(tmp, &lvol->bdev->aliases, tailq) { + if (++alias_number > 1) { + SPDK_ERRLOG("There is more than 1 alias in bdev %s\n", lvol->bdev->name); + return -EINVAL; + } + + old_alias = tmp->alias; + } + + if (alias_number == 0) { + SPDK_ERRLOG("There are no aliases in bdev %s\n", lvol->bdev->name); + return -EINVAL; + } + + alias = spdk_sprintf_alloc("%s/%s", lvol->lvol_store->name, new_lvol_name); + if (alias == NULL) { + SPDK_ERRLOG("Cannot alloc memory for alias\n"); + return -ENOMEM; + } + + rc = spdk_bdev_alias_add(lvol->bdev, alias); + if (rc != 0) { + SPDK_ERRLOG("cannot add alias '%s'\n", alias); + free(alias); + return rc; + } + free(alias); + + rc = spdk_bdev_alias_del(lvol->bdev, old_alias); + if (rc != 0) { + SPDK_ERRLOG("cannot remove alias '%s'\n", old_alias); + return rc; + } + + return 0; +} + +static struct lvol_store_bdev * +vbdev_get_lvs_bdev_by_bdev(struct spdk_bdev *bdev_orig) +{ + struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first(); + + while (lvs_bdev != NULL) { + if (lvs_bdev->bdev == bdev_orig) { + if (lvs_bdev->req != NULL) { + /* We do not allow access to lvs that are being destroyed */ + return NULL; + } else { + return lvs_bdev; + } + } + lvs_bdev = vbdev_lvol_store_next(lvs_bdev); + } + + return NULL; +} + +static void +vbdev_lvs_hotremove_cb(void *ctx) +{ + struct spdk_bdev *bdev = ctx; + struct lvol_store_bdev *lvs_bdev; + + lvs_bdev = vbdev_get_lvs_bdev_by_bdev(bdev); + if (lvs_bdev != NULL) { + 
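+		/* Base bdev was hot-removed: unload the lvolstore rather than destroy it,
+		 * since the underlying device is gone and no on-disk state can be modified. */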
vbdev_lvs_unload(lvs_bdev->lvs, NULL, NULL); + } +} + +static void +_vbdev_lvs_create_cb(void *cb_arg, struct spdk_lvol_store *lvs, int lvserrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct lvol_store_bdev *lvs_bdev; + struct spdk_bdev *bdev = req->base_bdev; + struct spdk_bs_dev *bs_dev = req->bs_dev; + + if (lvserrno != 0) { + assert(lvs == NULL); + SPDK_ERRLOG("Cannot create lvol store bdev\n"); + goto end; + } + + lvserrno = spdk_bs_bdev_claim(bs_dev, &g_lvol_if); + if (lvserrno != 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store base bdev already claimed by another bdev\n"); + req->bs_dev->destroy(req->bs_dev); + goto end; + } + + assert(lvs != NULL); + + lvs_bdev = calloc(1, sizeof(*lvs_bdev)); + if (!lvs_bdev) { + lvserrno = -ENOMEM; + goto end; + } + lvs_bdev->lvs = lvs; + lvs_bdev->bdev = bdev; + lvs_bdev->req = NULL; + + TAILQ_INSERT_TAIL(&g_spdk_lvol_pairs, lvs_bdev, lvol_stores); + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store bdev inserted\n"); + +end: + req->cb_fn(req->cb_arg, lvs, lvserrno); + free(req); + + return; +} + +int +vbdev_lvs_create(struct spdk_bdev *base_bdev, const char *name, uint32_t cluster_sz, + enum lvs_clear_method clear_method, spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_dev *bs_dev; + struct spdk_lvs_with_handle_req *lvs_req; + struct spdk_lvs_opts opts; + int rc; + int len; + + if (base_bdev == NULL) { + SPDK_ERRLOG("Bdev does not exist\n"); + return -ENODEV; + } + + spdk_lvs_opts_init(&opts); + if (cluster_sz != 0) { + opts.cluster_sz = cluster_sz; + } + + if (clear_method != 0) { + opts.clear_method = clear_method; + } + + if (name == NULL) { + SPDK_ERRLOG("missing name param\n"); + return -EINVAL; + } + + len = strnlen(name, SPDK_LVS_NAME_MAX); + + if (len == 0 || len == SPDK_LVS_NAME_MAX) { + SPDK_ERRLOG("name must be between 1 and %d characters\n", SPDK_LVS_NAME_MAX - 1); + return -EINVAL; + } + snprintf(opts.name, sizeof(opts.name), "%s", name); + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n"); + return -ENOMEM; + } + + bs_dev = spdk_bdev_create_bs_dev(base_bdev, vbdev_lvs_hotremove_cb, base_bdev); + if (!bs_dev) { + SPDK_ERRLOG("Cannot create blobstore device\n"); + free(lvs_req); + return -ENODEV; + } + + lvs_req->bs_dev = bs_dev; + lvs_req->base_bdev = base_bdev; + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + + rc = spdk_lvs_init(bs_dev, &opts, _vbdev_lvs_create_cb, lvs_req); + if (rc < 0) { + free(lvs_req); + bs_dev->destroy(bs_dev); + return rc; + } + + return 0; +} + +static void +_vbdev_lvs_rename_cb(void *cb_arg, int lvserrno) +{ + struct spdk_lvs_req *req = cb_arg; + struct spdk_lvol *tmp; + + if (lvserrno != 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store rename failed\n"); + } else { + TAILQ_FOREACH(tmp, &req->lvol_store->lvols, link) { + /* We have to pass current lvol name, since only lvs name changed */ + _vbdev_lvol_change_bdev_alias(tmp, tmp->name); + } + } + + req->cb_fn(req->cb_arg, lvserrno); + free(req); +} + +void +vbdev_lvs_rename(struct spdk_lvol_store *lvs, const char *new_lvs_name, + spdk_lvs_op_complete cb_fn, void *cb_arg) +{ + struct lvol_store_bdev *lvs_bdev; + + struct spdk_lvs_req *req; + + lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvs); + if (!lvs_bdev) { + SPDK_ERRLOG("No such lvol store found\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n"); + 
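+		/* Nothing has been modified yet; just report the allocation failure to the caller. */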
cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol_store = lvs; + + spdk_lvs_rename(lvs, new_lvs_name, _vbdev_lvs_rename_cb, req); +} + +static void +_vbdev_lvs_remove_cb(void *cb_arg, int lvserrno) +{ + struct lvol_store_bdev *lvs_bdev = cb_arg; + struct spdk_lvs_req *req = lvs_bdev->req; + + if (lvserrno != 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store removed with error: %d.\n", lvserrno); + } + + TAILQ_REMOVE(&g_spdk_lvol_pairs, lvs_bdev, lvol_stores); + free(lvs_bdev); + + if (req->cb_fn != NULL) { + req->cb_fn(req->cb_arg, lvserrno); + } + free(req); +} + +static void +_vbdev_lvs_remove_lvol_cb(void *cb_arg, int lvolerrno) +{ + struct lvol_store_bdev *lvs_bdev = cb_arg; + struct spdk_lvol_store *lvs = lvs_bdev->lvs; + struct spdk_lvol *lvol; + + if (lvolerrno != 0) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_LVOL, "Lvol removed with errno %d\n", lvolerrno); + } + + if (TAILQ_EMPTY(&lvs->lvols)) { + spdk_lvs_destroy(lvs, _vbdev_lvs_remove_cb, lvs_bdev); + return; + } + + lvol = TAILQ_FIRST(&lvs->lvols); + while (lvol != NULL) { + if (spdk_lvol_deletable(lvol)) { + vbdev_lvol_destroy(lvol, _vbdev_lvs_remove_lvol_cb, lvs_bdev); + return; + } + lvol = TAILQ_NEXT(lvol, link); + } + + /* If no lvol is deletable, that means there is circular dependency. */ + SPDK_ERRLOG("Lvols left in lvs, but unable to delete.\n"); + assert(false); +} + +static void +_vbdev_lvs_remove_bdev_unregistered_cb(void *cb_arg, int bdeverrno) +{ + struct lvol_store_bdev *lvs_bdev = cb_arg; + struct spdk_lvol_store *lvs = lvs_bdev->lvs; + struct spdk_lvol *lvol, *tmp; + + if (bdeverrno != 0) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_LVOL, "Lvol unregistered with errno %d\n", bdeverrno); + } + + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + if (lvol->ref_count != 0) { + /* An lvol is still open, don't unload whole lvol store. 
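+			 * This callback runs once for each unregistered lvol bdev, so the
+			 * lvolstore is unloaded only after the last lvol has been closed.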
*/ + return; + } + } + spdk_lvs_unload(lvs, _vbdev_lvs_remove_cb, lvs_bdev); +} + +static void +_vbdev_lvs_remove(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg, + bool destroy) +{ + struct spdk_lvs_req *req; + struct lvol_store_bdev *lvs_bdev; + struct spdk_lvol *lvol, *tmp; + bool all_lvols_closed = true; + + lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvs); + if (!lvs_bdev) { + SPDK_ERRLOG("No such lvol store found\n"); + if (cb_fn != NULL) { + cb_fn(cb_arg, -ENODEV); + } + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n"); + if (cb_fn != NULL) { + cb_fn(cb_arg, -ENOMEM); + } + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + lvs_bdev->req = req; + + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + if (lvol->ref_count != 0) { + all_lvols_closed = false; + } + } + + if (all_lvols_closed == true) { + if (destroy) { + spdk_lvs_destroy(lvs, _vbdev_lvs_remove_cb, lvs_bdev); + } else { + spdk_lvs_unload(lvs, _vbdev_lvs_remove_cb, lvs_bdev); + } + } else { + lvs->destruct = destroy; + if (destroy) { + _vbdev_lvs_remove_lvol_cb(lvs_bdev, 0); + } else { + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + spdk_bdev_unregister(lvol->bdev, _vbdev_lvs_remove_bdev_unregistered_cb, lvs_bdev); + } + } + } +} + +void +vbdev_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg) +{ + _vbdev_lvs_remove(lvs, cb_fn, cb_arg, false); +} + +void +vbdev_lvs_destruct(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg) +{ + _vbdev_lvs_remove(lvs, cb_fn, cb_arg, true); +} + +struct lvol_store_bdev * +vbdev_lvol_store_first(void) +{ + struct lvol_store_bdev *lvs_bdev; + + lvs_bdev = TAILQ_FIRST(&g_spdk_lvol_pairs); + if (lvs_bdev) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Starting lvolstore iteration at %p\n", lvs_bdev->lvs); + } + + return lvs_bdev; +} + +struct lvol_store_bdev * +vbdev_lvol_store_next(struct lvol_store_bdev *prev) +{ + struct lvol_store_bdev *lvs_bdev; + + if (prev == NULL) { + SPDK_ERRLOG("prev argument cannot be NULL\n"); + return NULL; + } + + lvs_bdev = TAILQ_NEXT(prev, lvol_stores); + if (lvs_bdev) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Continuing lvolstore iteration at %p\n", lvs_bdev->lvs); + } + + return lvs_bdev; +} + +static struct spdk_lvol_store * +_vbdev_get_lvol_store_by_uuid(const struct spdk_uuid *uuid) +{ + struct spdk_lvol_store *lvs = NULL; + struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first(); + + while (lvs_bdev != NULL) { + lvs = lvs_bdev->lvs; + if (spdk_uuid_compare(&lvs->uuid, uuid) == 0) { + return lvs; + } + lvs_bdev = vbdev_lvol_store_next(lvs_bdev); + } + return NULL; +} + +struct spdk_lvol_store * +vbdev_get_lvol_store_by_uuid(const char *uuid_str) +{ + struct spdk_uuid uuid; + + if (spdk_uuid_parse(&uuid, uuid_str)) { + return NULL; + } + + return _vbdev_get_lvol_store_by_uuid(&uuid); +} + +struct spdk_lvol_store * +vbdev_get_lvol_store_by_name(const char *name) +{ + struct spdk_lvol_store *lvs = NULL; + struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first(); + + while (lvs_bdev != NULL) { + lvs = lvs_bdev->lvs; + if (strncmp(lvs->name, name, sizeof(lvs->name)) == 0) { + return lvs; + } + lvs_bdev = vbdev_lvol_store_next(lvs_bdev); + } + return NULL; +} + +struct vbdev_lvol_destroy_ctx { + struct spdk_lvol *lvol; + spdk_lvol_op_complete cb_fn; + void *cb_arg; +}; + +static void +_vbdev_lvol_unregister_cb(void *ctx, int lvolerrno) +{ + struct spdk_bdev *bdev = ctx; + + 
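+	/* The lvol is now closed: complete the asynchronous destruct started in
+	 * vbdev_lvol_unregister() and free the bdev allocated in _create_lvol_disk(). */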
spdk_bdev_destruct_done(bdev, lvolerrno); + free(bdev); +} + +static int +vbdev_lvol_unregister(void *ctx) +{ + struct spdk_lvol *lvol = ctx; + + assert(lvol != NULL); + + spdk_bdev_alias_del_all(lvol->bdev); + spdk_lvol_close(lvol, _vbdev_lvol_unregister_cb, lvol->bdev); + + /* return 1 to indicate we have an operation that must finish asynchronously before the + * lvol is closed + */ + return 1; +} + +static void +_vbdev_lvol_destroy_cb(void *cb_arg, int bdeverrno) +{ + struct vbdev_lvol_destroy_ctx *ctx = cb_arg; + struct spdk_lvol *lvol = ctx->lvol; + + if (bdeverrno < 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Could not unregister bdev during lvol (%s) destroy\n", + lvol->unique_id); + ctx->cb_fn(ctx->cb_arg, bdeverrno); + free(ctx); + return; + } + + spdk_lvol_destroy(lvol, ctx->cb_fn, ctx->cb_arg); + free(ctx); +} + +void +vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct vbdev_lvol_destroy_ctx *ctx; + size_t count; + + assert(lvol != NULL); + assert(cb_fn != NULL); + + /* Check if it is possible to delete lvol */ + spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count); + if (count > 1) { + /* throw an error */ + SPDK_ERRLOG("Cannot delete lvol\n"); + cb_fn(cb_arg, -EPERM); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->lvol = lvol; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_bdev_unregister(lvol->bdev, _vbdev_lvol_destroy_cb, ctx); +} + +static char * +vbdev_lvol_find_name(struct spdk_lvol *lvol, spdk_blob_id blob_id) +{ + struct spdk_lvol_store *lvs; + struct spdk_lvol *_lvol; + + assert(lvol != NULL); + + lvs = lvol->lvol_store; + + assert(lvs); + + TAILQ_FOREACH(_lvol, &lvs->lvols, link) { + if (_lvol->blob_id == blob_id) { + return _lvol->name; + } + } + + return NULL; +} + +static int +vbdev_lvol_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct spdk_lvol *lvol = ctx; + struct lvol_store_bdev *lvs_bdev; + struct spdk_bdev *bdev; + struct spdk_blob *blob; + char lvol_store_uuid[SPDK_UUID_STRING_LEN]; + spdk_blob_id *ids = NULL; + size_t count, i; + char *name; + int rc = 0; + + spdk_json_write_named_object_begin(w, "lvol"); + + lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvol->lvol_store); + if (!lvs_bdev) { + SPDK_ERRLOG("No such lvol store found\n"); + rc = -ENODEV; + goto end; + } + + bdev = lvs_bdev->bdev; + + spdk_uuid_fmt_lower(lvol_store_uuid, sizeof(lvol_store_uuid), &lvol->lvol_store->uuid); + spdk_json_write_named_string(w, "lvol_store_uuid", lvol_store_uuid); + + spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(bdev)); + + blob = lvol->blob; + + spdk_json_write_named_bool(w, "thin_provision", spdk_blob_is_thin_provisioned(blob)); + + spdk_json_write_named_bool(w, "snapshot", spdk_blob_is_snapshot(blob)); + + spdk_json_write_named_bool(w, "clone", spdk_blob_is_clone(blob)); + + if (spdk_blob_is_clone(blob)) { + spdk_blob_id snapshotid = spdk_blob_get_parent_snapshot(lvol->lvol_store->blobstore, lvol->blob_id); + if (snapshotid != SPDK_BLOBID_INVALID) { + name = vbdev_lvol_find_name(lvol, snapshotid); + if (name != NULL) { + spdk_json_write_named_string(w, "base_snapshot", name); + } else { + SPDK_ERRLOG("Cannot obtain snapshots name\n"); + } + } + } + + if (spdk_blob_is_snapshot(blob)) { + /* Take a number of clones */ + rc = spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count); + if (rc == -ENOMEM && count > 0) { + ids = malloc(sizeof(spdk_blob_id) * count); + if (ids 
== NULL) { + SPDK_ERRLOG("Cannot allocate memory\n"); + rc = -ENOMEM; + goto end; + } + + rc = spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, ids, &count); + if (rc == 0) { + spdk_json_write_named_array_begin(w, "clones"); + for (i = 0; i < count; i++) { + name = vbdev_lvol_find_name(lvol, ids[i]); + if (name != NULL) { + spdk_json_write_string(w, name); + } else { + SPDK_ERRLOG("Cannot obtain clone name\n"); + } + + } + spdk_json_write_array_end(w); + } + free(ids); + } + + } + +end: + spdk_json_write_object_end(w); + + return rc; +} + +static void +vbdev_lvol_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* Nothing to dump as lvol configuration is saved on physical device. */ +} + +static struct spdk_io_channel * +vbdev_lvol_get_io_channel(void *ctx) +{ + struct spdk_lvol *lvol = ctx; + + return spdk_lvol_get_io_channel(lvol); +} + +static bool +vbdev_lvol_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct spdk_lvol *lvol = ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + return !spdk_blob_is_read_only(lvol->blob); + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_READ: + return true; + default: + return false; + } +} + +static void +lvol_op_comp(void *cb_arg, int bserrno) +{ + struct spdk_bdev_io *bdev_io = cb_arg; + enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; + + if (bserrno != 0) { + if (bserrno == -ENOMEM) { + status = SPDK_BDEV_IO_STATUS_NOMEM; + } else { + status = SPDK_BDEV_IO_STATUS_FAILED; + } + } + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Vbdev processing callback on device %s with type %d\n", + bdev_io->bdev->name, bdev_io->type); + spdk_bdev_io_complete(bdev_io, status); +} + +static void +lvol_unmap(struct spdk_lvol *lvol, struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + uint64_t start_page, num_pages; + struct spdk_blob *blob = lvol->blob; + + start_page = bdev_io->u.bdev.offset_blocks; + num_pages = bdev_io->u.bdev.num_blocks; + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, + "Vbdev doing unmap at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page, + num_pages, bdev_io->bdev->name); + spdk_blob_io_unmap(blob, ch, start_page, num_pages, lvol_op_comp, bdev_io); +} + +static void +lvol_write_zeroes(struct spdk_lvol *lvol, struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + uint64_t start_page, num_pages; + struct spdk_blob *blob = lvol->blob; + + start_page = bdev_io->u.bdev.offset_blocks; + num_pages = bdev_io->u.bdev.num_blocks; + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, + "Vbdev doing write zeros at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page, + num_pages, bdev_io->bdev->name); + spdk_blob_io_write_zeroes(blob, ch, start_page, num_pages, lvol_op_comp, bdev_io); +} + +static void +lvol_read(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + uint64_t start_page, num_pages; + struct spdk_lvol *lvol = bdev_io->bdev->ctxt; + struct spdk_blob *blob = lvol->blob; + + start_page = bdev_io->u.bdev.offset_blocks; + num_pages = bdev_io->u.bdev.num_blocks; + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, + "Vbdev doing read at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page, + num_pages, bdev_io->bdev->name); + spdk_blob_io_readv(blob, ch, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, start_page, + num_pages, lvol_op_comp, bdev_io); +} + +static void +lvol_write(struct spdk_lvol *lvol, struct spdk_io_channel *ch, struct 
spdk_bdev_io *bdev_io) +{ + uint64_t start_page, num_pages; + struct spdk_blob *blob = lvol->blob; + + start_page = bdev_io->u.bdev.offset_blocks; + num_pages = bdev_io->u.bdev.num_blocks; + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, + "Vbdev doing write at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page, + num_pages, bdev_io->bdev->name); + spdk_blob_io_writev(blob, ch, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, start_page, + num_pages, lvol_op_comp, bdev_io); +} + +static int +lvol_reset(struct spdk_bdev_io *bdev_io) +{ + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + + return 0; +} + +static void +lvol_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + lvol_read(ch, bdev_io); +} + +static void +vbdev_lvol_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct spdk_lvol *lvol = bdev_io->bdev->ctxt; + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Vbdev request type %d submitted\n", bdev_io->type); + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, lvol_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + lvol_write(lvol, ch, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_RESET: + lvol_reset(bdev_io); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + lvol_unmap(lvol, ch, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + lvol_write_zeroes(lvol, ch, bdev_io); + break; + default: + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "lvol: unsupported I/O type %d\n", bdev_io->type); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + return; +} + +static struct spdk_bdev_fn_table vbdev_lvol_fn_table = { + .destruct = vbdev_lvol_unregister, + .io_type_supported = vbdev_lvol_io_type_supported, + .submit_request = vbdev_lvol_submit_request, + .get_io_channel = vbdev_lvol_get_io_channel, + .dump_info_json = vbdev_lvol_dump_info_json, + .write_config_json = vbdev_lvol_write_config_json, +}; + +static void +lvol_destroy_cb(void *cb_arg, int bdeverrno) +{ +} + +static void +_create_lvol_disk_destroy_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_lvol *lvol = cb_arg; + + if (bdeverrno < 0) { + SPDK_ERRLOG("Could not unregister bdev for lvol %s\n", + lvol->unique_id); + return; + } + + spdk_lvol_destroy(lvol, lvol_destroy_cb, NULL); +} + +static void +_create_lvol_disk_unload_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_lvol *lvol = cb_arg; + + if (bdeverrno < 0) { + SPDK_ERRLOG("Could not unregister bdev for lvol %s\n", + lvol->unique_id); + return; + } + + TAILQ_REMOVE(&lvol->lvol_store->lvols, lvol, link); + free(lvol); +} + +static int +_create_lvol_disk(struct spdk_lvol *lvol, bool destroy) +{ + struct spdk_bdev *bdev; + struct lvol_store_bdev *lvs_bdev; + uint64_t total_size; + unsigned char *alias; + int rc; + + lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvol->lvol_store); + if (lvs_bdev == NULL) { + SPDK_ERRLOG("No spdk lvs-bdev pair found for lvol %s\n", lvol->unique_id); + return -ENODEV; + } + + bdev = calloc(1, sizeof(struct spdk_bdev)); + if (!bdev) { + SPDK_ERRLOG("Cannot alloc memory for lvol bdev\n"); + return -ENOMEM; + } + + bdev->name = lvol->unique_id; + bdev->product_name = "Logical Volume"; + bdev->blocklen = spdk_bs_get_io_unit_size(lvol->lvol_store->blobstore); + total_size = spdk_blob_get_num_clusters(lvol->blob) * + spdk_bs_get_cluster_size(lvol->lvol_store->blobstore); + 
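+	/* total_size is a whole number of clusters; the assert below checks that it
+	 * is also a multiple of the block (io unit) size, so the division is exact. */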
assert((total_size % bdev->blocklen) == 0); + bdev->blockcnt = total_size / bdev->blocklen; + bdev->uuid = lvol->uuid; + bdev->required_alignment = lvs_bdev->bdev->required_alignment; + bdev->split_on_optimal_io_boundary = true; + bdev->optimal_io_boundary = spdk_bs_get_cluster_size(lvol->lvol_store->blobstore) / bdev->blocklen; + + bdev->ctxt = lvol; + bdev->fn_table = &vbdev_lvol_fn_table; + bdev->module = &g_lvol_if; + + rc = spdk_bdev_register(bdev); + if (rc) { + free(bdev); + return rc; + } + lvol->bdev = bdev; + + alias = spdk_sprintf_alloc("%s/%s", lvs_bdev->lvs->name, lvol->name); + if (alias == NULL) { + SPDK_ERRLOG("Cannot alloc memory for alias\n"); + spdk_bdev_unregister(lvol->bdev, (destroy ? _create_lvol_disk_destroy_cb : + _create_lvol_disk_unload_cb), lvol); + return -ENOMEM; + } + + rc = spdk_bdev_alias_add(bdev, alias); + if (rc != 0) { + SPDK_ERRLOG("Cannot add alias to lvol bdev\n"); + spdk_bdev_unregister(lvol->bdev, (destroy ? _create_lvol_disk_destroy_cb : + _create_lvol_disk_unload_cb), lvol); + } + free(alias); + + return rc; +} + +static void +_vbdev_lvol_create_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + + if (lvolerrno < 0) { + goto end; + } + + lvolerrno = _create_lvol_disk(lvol, true); + +end: + req->cb_fn(req->cb_arg, lvol, lvolerrno); + free(req); +} + +int +vbdev_lvol_create(struct spdk_lvol_store *lvs, const char *name, uint64_t sz, + bool thin_provision, enum lvol_clear_method clear_method, spdk_lvol_op_with_handle_complete cb_fn, + void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + return -ENOMEM; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + rc = spdk_lvol_create(lvs, name, sz, thin_provision, clear_method, + _vbdev_lvol_create_cb, req); + if (rc != 0) { + free(req); + } + + return rc; +} + +void +vbdev_lvol_create_snapshot(struct spdk_lvol *lvol, const char *snapshot_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_lvol_create_snapshot(lvol, snapshot_name, _vbdev_lvol_create_cb, req); +} + +void +vbdev_lvol_create_clone(struct spdk_lvol *lvol, const char *clone_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_lvol_create_clone(lvol, clone_name, _vbdev_lvol_create_cb, req); +} + +static void +_vbdev_lvol_rename_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + if (lvolerrno != 0) { + SPDK_ERRLOG("Renaming lvol failed\n"); + } + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +vbdev_lvol_rename(struct spdk_lvol *lvol, const char *new_lvol_name, + spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + int rc; + + rc = _vbdev_lvol_change_bdev_alias(lvol, new_lvol_name); + if (rc != 0) { + SPDK_ERRLOG("renaming lvol to '%s' does not succeed\n", new_lvol_name); + cb_fn(cb_arg, rc); + return; + } + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_lvol_rename(lvol, new_lvol_name, _vbdev_lvol_rename_cb, req); +} + +static void 
+_vbdev_lvol_resize_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + uint64_t total_size; + + /* change bdev size */ + if (lvolerrno != 0) { + SPDK_ERRLOG("CB function for bdev lvol %s receive error no: %d.\n", lvol->name, lvolerrno); + goto finish; + } + + total_size = spdk_blob_get_num_clusters(lvol->blob) * + spdk_bs_get_cluster_size(lvol->lvol_store->blobstore); + assert((total_size % lvol->bdev->blocklen) == 0); + + lvolerrno = spdk_bdev_notify_blockcnt_change(lvol->bdev, total_size / lvol->bdev->blocklen); + if (lvolerrno != 0) { + SPDK_ERRLOG("Could not change num blocks for bdev lvol %s with error no: %d.\n", + lvol->name, lvolerrno); + } + +finish: + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +vbdev_lvol_resize(struct spdk_lvol *lvol, uint64_t sz, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, -EINVAL); + return; + } + + assert(lvol->bdev != NULL); + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->sz = sz; + req->lvol = lvol; + + spdk_lvol_resize(req->lvol, req->sz, _vbdev_lvol_resize_cb, req); +} + +static void +_vbdev_lvol_set_read_only_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (lvolerrno != 0) { + SPDK_ERRLOG("Could not set bdev lvol %s as read only due to error: %d.\n", lvol->name, lvolerrno); + } + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +vbdev_lvol_set_read_only(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, -EINVAL); + return; + } + + assert(lvol->bdev != NULL); + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_lvol_set_read_only(lvol, _vbdev_lvol_set_read_only_cb, req); +} + +static int +vbdev_lvs_init(void) +{ + return 0; +} + +static int +vbdev_lvs_get_ctx_size(void) +{ + return 0; +} + +static void +_vbdev_lvs_examine_failed(void *cb_arg, int lvserrno) +{ + spdk_bdev_module_examine_done(&g_lvol_if); +} + +static void +_vbdev_lvol_examine_close_cb(struct spdk_lvol_store *lvs) +{ + if (lvs->lvols_opened >= lvs->lvol_count) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvols finished\n"); + spdk_bdev_module_examine_done(&g_lvol_if); + } +} + +static void +_vbdev_lvs_examine_finish(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno) +{ + struct spdk_lvol_store *lvs = cb_arg; + + if (lvolerrno != 0) { + SPDK_ERRLOG("Error opening lvol %s\n", lvol->unique_id); + TAILQ_REMOVE(&lvs->lvols, lvol, link); + lvs->lvol_count--; + free(lvol); + goto end; + } + + if (_create_lvol_disk(lvol, false)) { + SPDK_ERRLOG("Cannot create bdev for lvol %s\n", lvol->unique_id); + lvs->lvol_count--; + _vbdev_lvol_examine_close_cb(lvs); + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvol %s failed\n", lvol->unique_id); + return; + } + + lvs->lvols_opened++; + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvol %s succeeded\n", lvol->unique_id); + +end: + + if (lvs->lvols_opened >= lvs->lvol_count) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvols finished\n"); + spdk_bdev_module_examine_done(&g_lvol_if); + } +} + +static void +_vbdev_lvs_examine_cb(void *arg, struct 
spdk_lvol_store *lvol_store, int lvserrno) +{ + struct lvol_store_bdev *lvs_bdev; + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)arg; + struct spdk_lvol *lvol, *tmp; + + if (lvserrno == -EEXIST) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, + "Name for lvolstore on device %s conflicts with name for already loaded lvs\n", + req->base_bdev->name); + /* On error blobstore destroys bs_dev itself */ + spdk_bdev_module_examine_done(&g_lvol_if); + goto end; + } else if (lvserrno != 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store not found on %s\n", req->base_bdev->name); + /* On error blobstore destroys bs_dev itself */ + spdk_bdev_module_examine_done(&g_lvol_if); + goto end; + } + + lvserrno = spdk_bs_bdev_claim(lvol_store->bs_dev, &g_lvol_if); + if (lvserrno != 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store base bdev already claimed by another bdev\n"); + spdk_lvs_unload(lvol_store, _vbdev_lvs_examine_failed, NULL); + goto end; + } + + lvs_bdev = calloc(1, sizeof(*lvs_bdev)); + if (!lvs_bdev) { + SPDK_ERRLOG("Cannot alloc memory for lvs_bdev\n"); + spdk_lvs_unload(lvol_store, _vbdev_lvs_examine_failed, NULL); + goto end; + } + + lvs_bdev->lvs = lvol_store; + lvs_bdev->bdev = req->base_bdev; + + TAILQ_INSERT_TAIL(&g_spdk_lvol_pairs, lvs_bdev, lvol_stores); + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store found on %s - begin parsing\n", + req->base_bdev->name); + + lvol_store->lvols_opened = 0; + + if (TAILQ_EMPTY(&lvol_store->lvols)) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store examination done\n"); + spdk_bdev_module_examine_done(&g_lvol_if); + } else { + /* Open all lvols */ + TAILQ_FOREACH_SAFE(lvol, &lvol_store->lvols, link, tmp) { + spdk_lvol_open(lvol, _vbdev_lvs_examine_finish, lvol_store); + } + } + +end: + free(req); +} + +static void +vbdev_lvs_examine(struct spdk_bdev *bdev) +{ + struct spdk_bs_dev *bs_dev; + struct spdk_lvs_with_handle_req *req; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + spdk_bdev_module_examine_done(&g_lvol_if); + SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n"); + return; + } + + bs_dev = spdk_bdev_create_bs_dev(bdev, vbdev_lvs_hotremove_cb, bdev); + if (!bs_dev) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Cannot create bs dev on %s\n", bdev->name); + spdk_bdev_module_examine_done(&g_lvol_if); + free(req); + return; + } + + req->base_bdev = bdev; + + spdk_lvs_load(bs_dev, _vbdev_lvs_examine_cb, req); +} + +struct spdk_lvol * +vbdev_lvol_get_from_bdev(struct spdk_bdev *bdev) +{ + if (!bdev || bdev->module != &g_lvol_if) { + return NULL; + } + + if (bdev->ctxt == NULL) { + SPDK_ERRLOG("No lvol ctx assigned to bdev %s\n", bdev->name); + return NULL; + } + + return (struct spdk_lvol *)bdev->ctxt; +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_lvol", SPDK_LOG_VBDEV_LVOL); diff --git a/src/spdk/module/bdev/lvol/vbdev_lvol.h b/src/spdk/module/bdev/lvol/vbdev_lvol.h new file mode 100644 index 000000000..ed3eb1c8e --- /dev/null +++ b/src/spdk/module/bdev/lvol/vbdev_lvol.h @@ -0,0 +1,130 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VBDEV_LVOL_H +#define SPDK_VBDEV_LVOL_H + +#include "spdk/lvol.h" +#include "spdk/bdev_module.h" + +#include "spdk_internal/lvolstore.h" + +struct lvol_store_bdev { + struct spdk_lvol_store *lvs; + struct spdk_bdev *bdev; + struct spdk_lvs_req *req; + + TAILQ_ENTRY(lvol_store_bdev) lvol_stores; +}; + +int vbdev_lvs_create(struct spdk_bdev *base_bdev, const char *name, uint32_t cluster_sz, + enum lvs_clear_method clear_method, spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg); +void vbdev_lvs_destruct(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg); +void vbdev_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg); + +int vbdev_lvol_create(struct spdk_lvol_store *lvs, const char *name, uint64_t sz, + bool thin_provisioned, enum lvol_clear_method clear_method, + spdk_lvol_op_with_handle_complete cb_fn, + void *cb_arg); + +void vbdev_lvol_create_snapshot(struct spdk_lvol *lvol, const char *snapshot_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg); + +void vbdev_lvol_create_clone(struct spdk_lvol *lvol, const char *clone_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg); + +/** + * \brief Change size of lvol + * \param lvol Handle to lvol + * \param sz Size of lvol to change + * \param cb_fn Completion callback + * \param cb_arg Completion callback custom arguments + * \return error + */ +void vbdev_lvol_resize(struct spdk_lvol *lvol, uint64_t sz, spdk_lvol_op_complete cb_fn, + void *cb_arg); + +/** + * \brief Mark lvol as read only + * \param lvol Handle to lvol + * \param cb_fn Completion callback + * \param cb_arg Completion callback custom arguments + */ +void vbdev_lvol_set_read_only(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg); + +void vbdev_lvol_rename(struct spdk_lvol *lvol, const char *new_lvol_name, + spdk_lvol_op_complete cb_fn, void *cb_arg); + +/** + * Destroy a logical volume + * \param lvol Handle to lvol + * \param cb_fn Completion callback + * \param cb_arg Completion callback custom arguments + */ +void vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg); + +/** + * \brief Renames given lvolstore. 
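+ * Bdev aliases of all lvols in the store are updated to use the new lvolstore name.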
+ * + * \param lvs Pointer to lvolstore + * \param new_name New name of lvs + * \param cb_fn Completion callback + * \param cb_arg Completion callback custom arguments + */ +void vbdev_lvs_rename(struct spdk_lvol_store *lvs, const char *new_lvs_name, + spdk_lvs_op_complete cb_fn, void *cb_arg); + +/** + * \brief Search for handle lvolstore + * \param uuid_str UUID of lvolstore + * \return Handle to spdk_lvol_store or NULL if not found. + */ +struct spdk_lvol_store *vbdev_get_lvol_store_by_uuid(const char *uuid_str); + +/** + * \brief Search for handle to lvolstore + * \param name name of lvolstore + * \return Handle to spdk_lvol_store or NULL if not found. + */ +struct spdk_lvol_store *vbdev_get_lvol_store_by_name(const char *name); + +/** + * \brief Search for handle to lvol_store_bdev + * \param lvs handle to lvolstore + * \return Handle to lvol_store_bdev or NULL if not found. + */ +struct lvol_store_bdev *vbdev_get_lvs_bdev_by_lvs(struct spdk_lvol_store *lvs); + +struct spdk_lvol *vbdev_lvol_get_from_bdev(struct spdk_bdev *bdev); + +#endif /* SPDK_VBDEV_LVOL_H */ diff --git a/src/spdk/module/bdev/lvol/vbdev_lvol_rpc.c b/src/spdk/module/bdev/lvol/vbdev_lvol_rpc.c new file mode 100644 index 000000000..79e74f6a5 --- /dev/null +++ b/src/spdk/module/bdev/lvol/vbdev_lvol_rpc.c @@ -0,0 +1,1098 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/rpc.h" +#include "spdk/bdev.h" +#include "spdk/util.h" +#include "vbdev_lvol.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +SPDK_LOG_REGISTER_COMPONENT("lvolrpc", SPDK_LOG_LVOL_RPC) + +struct rpc_bdev_lvol_create_lvstore { + char *lvs_name; + char *bdev_name; + uint32_t cluster_sz; + char *clear_method; +}; + +static int +vbdev_get_lvol_store_by_uuid_xor_name(const char *uuid, const char *lvs_name, + struct spdk_lvol_store **lvs) +{ + if ((uuid == NULL && lvs_name == NULL)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "lvs UUID nor lvs name specified\n"); + return -EINVAL; + } else if ((uuid && lvs_name)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "both lvs UUID '%s' and lvs name '%s' specified\n", uuid, + lvs_name); + return -EINVAL; + } else if (uuid) { + *lvs = vbdev_get_lvol_store_by_uuid(uuid); + + if (*lvs == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "blobstore with UUID '%s' not found\n", uuid); + return -ENODEV; + } + } else if (lvs_name) { + + *lvs = vbdev_get_lvol_store_by_name(lvs_name); + + if (*lvs == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "blobstore with name '%s' not found\n", lvs_name); + return -ENODEV; + } + } + return 0; +} + +static void +free_rpc_bdev_lvol_create_lvstore(struct rpc_bdev_lvol_create_lvstore *req) +{ + free(req->bdev_name); + free(req->lvs_name); + free(req->clear_method); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_create_lvstore_decoders[] = { + {"bdev_name", offsetof(struct rpc_bdev_lvol_create_lvstore, bdev_name), spdk_json_decode_string}, + {"cluster_sz", offsetof(struct rpc_bdev_lvol_create_lvstore, cluster_sz), spdk_json_decode_uint32, true}, + {"lvs_name", offsetof(struct rpc_bdev_lvol_create_lvstore, lvs_name), spdk_json_decode_string}, + {"clear_method", offsetof(struct rpc_bdev_lvol_create_lvstore, clear_method), spdk_json_decode_string, true}, +}; + +static void +rpc_lvol_store_construct_cb(void *cb_arg, struct spdk_lvol_store *lvol_store, int lvserrno) +{ + struct spdk_json_write_ctx *w; + char lvol_store_uuid[SPDK_UUID_STRING_LEN]; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvserrno != 0) { + goto invalid; + } + + spdk_uuid_fmt_lower(lvol_store_uuid, sizeof(lvol_store_uuid), &lvol_store->uuid); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, lvol_store_uuid); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvserrno)); +} + +static void +rpc_bdev_lvol_create_lvstore(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_create_lvstore req = {}; + struct spdk_bdev *bdev; + int rc = 0; + enum lvs_clear_method clear_method; + + if (spdk_json_decode_object(params, rpc_bdev_lvol_create_lvstore_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_create_lvstore_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.bdev_name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.bdev_name); + spdk_jsonrpc_send_error_response_fmt(request, -ENODEV, "Bdev %s not found", req.bdev_name); + goto cleanup; + } + + if (req.clear_method != NULL) { + if (!strcasecmp(req.clear_method, "none")) { + clear_method = LVS_CLEAR_WITH_NONE; + } else if (!strcasecmp(req.clear_method, "unmap")) { + clear_method = 
LVS_CLEAR_WITH_UNMAP; + } else if (!strcasecmp(req.clear_method, "write_zeroes")) { + clear_method = LVS_CLEAR_WITH_WRITE_ZEROES; + } else { + spdk_jsonrpc_send_error_response(request, -EINVAL, "Invalid clear_method parameter"); + goto cleanup; + } + } else { + clear_method = LVS_CLEAR_WITH_UNMAP; + } + + rc = vbdev_lvs_create(bdev, req.lvs_name, req.cluster_sz, clear_method, + rpc_lvol_store_construct_cb, request); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, -rc, spdk_strerror(rc)); + goto cleanup; + } + free_rpc_bdev_lvol_create_lvstore(&req); + + return; + +cleanup: + free_rpc_bdev_lvol_create_lvstore(&req); +} +SPDK_RPC_REGISTER("bdev_lvol_create_lvstore", rpc_bdev_lvol_create_lvstore, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_create_lvstore, construct_lvol_store) + +struct rpc_bdev_lvol_rename_lvstore { + char *old_name; + char *new_name; +}; + +static void +free_rpc_bdev_lvol_rename_lvstore(struct rpc_bdev_lvol_rename_lvstore *req) +{ + free(req->old_name); + free(req->new_name); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_rename_lvstore_decoders[] = { + {"old_name", offsetof(struct rpc_bdev_lvol_rename_lvstore, old_name), spdk_json_decode_string}, + {"new_name", offsetof(struct rpc_bdev_lvol_rename_lvstore, new_name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_lvol_rename_lvstore_cb(void *cb_arg, int lvserrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvserrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvserrno)); +} + +static void +rpc_bdev_lvol_rename_lvstore(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_rename_lvstore req = {}; + struct spdk_lvol_store *lvs; + + if (spdk_json_decode_object(params, rpc_bdev_lvol_rename_lvstore_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_rename_lvstore_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + lvs = vbdev_get_lvol_store_by_name(req.old_name); + if (lvs == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "no lvs existing for given name\n"); + spdk_jsonrpc_send_error_response_fmt(request, -ENOENT, "Lvol store %s not found", req.old_name); + goto cleanup; + } + + vbdev_lvs_rename(lvs, req.new_name, rpc_bdev_lvol_rename_lvstore_cb, request); + +cleanup: + free_rpc_bdev_lvol_rename_lvstore(&req); +} +SPDK_RPC_REGISTER("bdev_lvol_rename_lvstore", rpc_bdev_lvol_rename_lvstore, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_rename_lvstore, rename_lvol_store) + +struct rpc_bdev_lvol_delete_lvstore { + char *uuid; + char *lvs_name; +}; + +static void +free_rpc_bdev_lvol_delete_lvstore(struct rpc_bdev_lvol_delete_lvstore *req) +{ + free(req->uuid); + free(req->lvs_name); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_delete_lvstore_decoders[] = { + {"uuid", offsetof(struct rpc_bdev_lvol_delete_lvstore, uuid), spdk_json_decode_string, true}, + {"lvs_name", offsetof(struct rpc_bdev_lvol_delete_lvstore, lvs_name), spdk_json_decode_string, true}, +}; + +static void +rpc_lvol_store_destroy_cb(void *cb_arg, int lvserrno) +{ + struct spdk_json_write_ctx *w; + struct 
spdk_jsonrpc_request *request = cb_arg; + + if (lvserrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvserrno)); +} + +static void +rpc_bdev_lvol_delete_lvstore(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_delete_lvstore req = {}; + struct spdk_lvol_store *lvs = NULL; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_lvol_delete_lvstore_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_delete_lvstore_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = vbdev_get_lvol_store_by_uuid_xor_name(req.uuid, req.lvs_name, &lvs); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + vbdev_lvs_destruct(lvs, rpc_lvol_store_destroy_cb, request); + +cleanup: + free_rpc_bdev_lvol_delete_lvstore(&req); +} +SPDK_RPC_REGISTER("bdev_lvol_delete_lvstore", rpc_bdev_lvol_delete_lvstore, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_delete_lvstore, destroy_lvol_store) + +struct rpc_bdev_lvol_create { + char *uuid; + char *lvs_name; + char *lvol_name; + uint64_t size; + bool thin_provision; + char *clear_method; +}; + +static void +free_rpc_bdev_lvol_create(struct rpc_bdev_lvol_create *req) +{ + free(req->uuid); + free(req->lvs_name); + free(req->lvol_name); + free(req->clear_method); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_create_decoders[] = { + {"uuid", offsetof(struct rpc_bdev_lvol_create, uuid), spdk_json_decode_string, true}, + {"lvs_name", offsetof(struct rpc_bdev_lvol_create, lvs_name), spdk_json_decode_string, true}, + {"lvol_name", offsetof(struct rpc_bdev_lvol_create, lvol_name), spdk_json_decode_string}, + {"size", offsetof(struct rpc_bdev_lvol_create, size), spdk_json_decode_uint64}, + {"thin_provision", offsetof(struct rpc_bdev_lvol_create, thin_provision), spdk_json_decode_bool, true}, + {"clear_method", offsetof(struct rpc_bdev_lvol_create, clear_method), spdk_json_decode_string, true}, +}; + +static void +rpc_bdev_lvol_create_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, lvol->unique_id); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +rpc_bdev_lvol_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_create req = {}; + enum lvol_clear_method clear_method; + int rc = 0; + struct spdk_lvol_store *lvs = NULL; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Creating blob\n"); + + if (spdk_json_decode_object(params, rpc_bdev_lvol_create_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_create_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = vbdev_get_lvol_store_by_uuid_xor_name(req.uuid, 
req.lvs_name, &lvs); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + if (req.clear_method != NULL) { + if (!strcasecmp(req.clear_method, "none")) { + clear_method = LVOL_CLEAR_WITH_NONE; + } else if (!strcasecmp(req.clear_method, "unmap")) { + clear_method = LVOL_CLEAR_WITH_UNMAP; + } else if (!strcasecmp(req.clear_method, "write_zeroes")) { + clear_method = LVOL_CLEAR_WITH_WRITE_ZEROES; + } else { + spdk_jsonrpc_send_error_response(request, -EINVAL, "Invalid clean_method option"); + goto cleanup; + } + } else { + clear_method = LVOL_CLEAR_WITH_DEFAULT; + } + + rc = vbdev_lvol_create(lvs, req.lvol_name, req.size, req.thin_provision, + clear_method, rpc_bdev_lvol_create_cb, request); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + +cleanup: + free_rpc_bdev_lvol_create(&req); +} + +SPDK_RPC_REGISTER("bdev_lvol_create", rpc_bdev_lvol_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_create, construct_lvol_bdev) + +struct rpc_bdev_lvol_snapshot { + char *lvol_name; + char *snapshot_name; +}; + +static void +free_rpc_bdev_lvol_snapshot(struct rpc_bdev_lvol_snapshot *req) +{ + free(req->lvol_name); + free(req->snapshot_name); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_snapshot_decoders[] = { + {"lvol_name", offsetof(struct rpc_bdev_lvol_snapshot, lvol_name), spdk_json_decode_string}, + {"snapshot_name", offsetof(struct rpc_bdev_lvol_snapshot, snapshot_name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_lvol_snapshot_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, lvol->unique_id); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +rpc_bdev_lvol_snapshot(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_snapshot req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Snapshotting blob\n"); + + if (spdk_json_decode_object(params, rpc_bdev_lvol_snapshot_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_snapshot_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.lvol_name); + if (bdev == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "bdev '%s' does not exist\n", req.lvol_name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + vbdev_lvol_create_snapshot(lvol, req.snapshot_name, rpc_bdev_lvol_snapshot_cb, request); + +cleanup: + free_rpc_bdev_lvol_snapshot(&req); +} + +SPDK_RPC_REGISTER("bdev_lvol_snapshot", rpc_bdev_lvol_snapshot, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_snapshot, snapshot_lvol_bdev) + +struct rpc_bdev_lvol_clone { + char *snapshot_name; + char *clone_name; +}; + +static void +free_rpc_bdev_lvol_clone(struct 
rpc_bdev_lvol_clone *req) +{ + free(req->snapshot_name); + free(req->clone_name); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_clone_decoders[] = { + {"snapshot_name", offsetof(struct rpc_bdev_lvol_clone, snapshot_name), spdk_json_decode_string}, + {"clone_name", offsetof(struct rpc_bdev_lvol_clone, clone_name), spdk_json_decode_string, true}, +}; + +static void +rpc_bdev_lvol_clone_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, lvol->unique_id); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +rpc_bdev_lvol_clone(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_clone req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Cloning blob\n"); + + if (spdk_json_decode_object(params, rpc_bdev_lvol_clone_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_clone_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.snapshot_name); + if (bdev == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "bdev '%s' does not exist\n", req.snapshot_name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + vbdev_lvol_create_clone(lvol, req.clone_name, rpc_bdev_lvol_clone_cb, request); + +cleanup: + free_rpc_bdev_lvol_clone(&req); +} + +SPDK_RPC_REGISTER("bdev_lvol_clone", rpc_bdev_lvol_clone, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_clone, clone_lvol_bdev) + +struct rpc_bdev_lvol_rename { + char *old_name; + char *new_name; +}; + +static void +free_rpc_bdev_lvol_rename(struct rpc_bdev_lvol_rename *req) +{ + free(req->old_name); + free(req->new_name); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_rename_decoders[] = { + {"old_name", offsetof(struct rpc_bdev_lvol_rename, old_name), spdk_json_decode_string}, + {"new_name", offsetof(struct rpc_bdev_lvol_rename, new_name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_lvol_rename_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +rpc_bdev_lvol_rename(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_rename req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Renaming lvol\n"); + + if (spdk_json_decode_object(params, rpc_bdev_lvol_rename_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_rename_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, 
"spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.old_name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.old_name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + vbdev_lvol_rename(lvol, req.new_name, rpc_bdev_lvol_rename_cb, request); + +cleanup: + free_rpc_bdev_lvol_rename(&req); +} + +SPDK_RPC_REGISTER("bdev_lvol_rename", rpc_bdev_lvol_rename, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_rename, rename_lvol_bdev) + +struct rpc_bdev_lvol_inflate { + char *name; +}; + +static void +free_rpc_bdev_lvol_inflate(struct rpc_bdev_lvol_inflate *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_inflate_decoders[] = { + {"name", offsetof(struct rpc_bdev_lvol_inflate, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_lvol_inflate_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +rpc_bdev_lvol_inflate(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_inflate req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Inflating lvol\n"); + + if (spdk_json_decode_object(params, rpc_bdev_lvol_inflate_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_inflate_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + spdk_lvol_inflate(lvol, rpc_bdev_lvol_inflate_cb, request); + +cleanup: + free_rpc_bdev_lvol_inflate(&req); +} + +SPDK_RPC_REGISTER("bdev_lvol_inflate", rpc_bdev_lvol_inflate, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_inflate, inflate_lvol_bdev) + +static void +rpc_bdev_lvol_decouple_parent(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_inflate req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Decoupling parent of lvol\n"); + + if (spdk_json_decode_object(params, rpc_bdev_lvol_inflate_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_inflate_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object 
failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + spdk_lvol_decouple_parent(lvol, rpc_bdev_lvol_inflate_cb, request); + +cleanup: + free_rpc_bdev_lvol_inflate(&req); +} + +SPDK_RPC_REGISTER("bdev_lvol_decouple_parent", rpc_bdev_lvol_decouple_parent, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_decouple_parent, decouple_parent_lvol_bdev) + +struct rpc_bdev_lvol_resize { + char *name; + uint64_t size; +}; + +static void +free_rpc_bdev_lvol_resize(struct rpc_bdev_lvol_resize *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_resize_decoders[] = { + {"name", offsetof(struct rpc_bdev_lvol_resize, name), spdk_json_decode_string}, + {"size", offsetof(struct rpc_bdev_lvol_resize, size), spdk_json_decode_uint64}, +}; + +static void +rpc_bdev_lvol_resize_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +rpc_bdev_lvol_resize(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_resize req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Resizing lvol\n"); + + if (spdk_json_decode_object(params, rpc_bdev_lvol_resize_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_resize_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("no bdev for provided name %s\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + vbdev_lvol_resize(lvol, req.size, rpc_bdev_lvol_resize_cb, request); + +cleanup: + free_rpc_bdev_lvol_resize(&req); +} + +SPDK_RPC_REGISTER("bdev_lvol_resize", rpc_bdev_lvol_resize, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_resize, resize_lvol_bdev) + +struct rpc_set_ro_lvol_bdev { + char *name; +}; + +static void +free_rpc_set_ro_lvol_bdev(struct rpc_set_ro_lvol_bdev *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_set_ro_lvol_bdev_decoders[] = { + {"name", offsetof(struct rpc_set_ro_lvol_bdev, name), spdk_json_decode_string}, +}; + +static void +rpc_set_ro_lvol_bdev_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + 
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +rpc_bdev_lvol_set_read_only(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_set_ro_lvol_bdev req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Setting lvol as read only\n"); + + if (spdk_json_decode_object(params, rpc_set_ro_lvol_bdev_decoders, + SPDK_COUNTOF(rpc_set_ro_lvol_bdev_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + if (req.name == NULL) { + SPDK_ERRLOG("missing name param\n"); + spdk_jsonrpc_send_error_response(request, -EINVAL, "Missing name parameter"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("no bdev for provided name %s\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + vbdev_lvol_set_read_only(lvol, rpc_set_ro_lvol_bdev_cb, request); + +cleanup: + free_rpc_set_ro_lvol_bdev(&req); +} + +SPDK_RPC_REGISTER("bdev_lvol_set_read_only", rpc_bdev_lvol_set_read_only, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_set_read_only, set_read_only_lvol_bdev) + +struct rpc_bdev_lvol_delete { + char *name; +}; + +static void +free_rpc_bdev_lvol_delete(struct rpc_bdev_lvol_delete *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_delete_decoders[] = { + {"name", offsetof(struct rpc_bdev_lvol_delete, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_lvol_delete_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(-lvolerrno)); +} + +static void +rpc_bdev_lvol_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_delete req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + + if (spdk_json_decode_object(params, rpc_bdev_lvol_delete_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_delete_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("no bdev for provided name %s\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + vbdev_lvol_destroy(lvol, rpc_bdev_lvol_delete_cb, request); + +cleanup: + free_rpc_bdev_lvol_delete(&req); +} + +SPDK_RPC_REGISTER("bdev_lvol_delete", rpc_bdev_lvol_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_delete, destroy_lvol_bdev) + 
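+/* Every lvol RPC above follows the same decode -> lookup -> async-callback shape: + * a spdk_json_object_decoder table fills the request struct, the bdev is resolved + * with spdk_bdev_get_by_name(), a vbdev_lvol_*() call is started, and the registered + * callback writes the JSON result or error. As a minimal sketch (a hypothetical + * client request, not part of this module), bdev_lvol_delete is driven with: + * + * { "jsonrpc": "2.0", "id": 1, "method": "bdev_lvol_delete", + * "params": { "name": "lvs0/lvol0" } } + * + * where "lvs0/lvol0" is an assumed lvol bdev name and the reply is the boolean + * emitted by rpc_bdev_lvol_delete_cb(). + */ +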
+struct rpc_bdev_lvol_get_lvstores { + char *uuid; + char *lvs_name; +}; + +static void +free_rpc_bdev_lvol_get_lvstores(struct rpc_bdev_lvol_get_lvstores *req) +{ + free(req->uuid); + free(req->lvs_name); +} + +static const struct spdk_json_object_decoder rpc_bdev_lvol_get_lvstores_decoders[] = { + {"uuid", offsetof(struct rpc_bdev_lvol_get_lvstores, uuid), spdk_json_decode_string, true}, + {"lvs_name", offsetof(struct rpc_bdev_lvol_get_lvstores, lvs_name), spdk_json_decode_string, true}, +}; + +static void +rpc_dump_lvol_store_info(struct spdk_json_write_ctx *w, struct lvol_store_bdev *lvs_bdev) +{ + struct spdk_blob_store *bs; + uint64_t cluster_size; + char uuid[SPDK_UUID_STRING_LEN]; + + bs = lvs_bdev->lvs->blobstore; + cluster_size = spdk_bs_get_cluster_size(bs); + + spdk_json_write_object_begin(w); + + spdk_uuid_fmt_lower(uuid, sizeof(uuid), &lvs_bdev->lvs->uuid); + spdk_json_write_named_string(w, "uuid", uuid); + + spdk_json_write_named_string(w, "name", lvs_bdev->lvs->name); + + spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(lvs_bdev->bdev)); + + spdk_json_write_named_uint64(w, "total_data_clusters", spdk_bs_total_data_cluster_count(bs)); + + spdk_json_write_named_uint64(w, "free_clusters", spdk_bs_free_cluster_count(bs)); + + spdk_json_write_named_uint64(w, "block_size", spdk_bs_get_io_unit_size(bs)); + + spdk_json_write_named_uint64(w, "cluster_size", cluster_size); + + spdk_json_write_object_end(w); +} + +static void +rpc_bdev_lvol_get_lvstores(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_lvol_get_lvstores req = {}; + struct spdk_json_write_ctx *w; + struct lvol_store_bdev *lvs_bdev = NULL; + struct spdk_lvol_store *lvs = NULL; + int rc; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_bdev_lvol_get_lvstores_decoders, + SPDK_COUNTOF(rpc_bdev_lvol_get_lvstores_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = vbdev_get_lvol_store_by_uuid_xor_name(req.uuid, req.lvs_name, &lvs); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvs); + if (lvs_bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + if (lvs_bdev != NULL) { + rpc_dump_lvol_store_info(w, lvs_bdev); + } else { + for (lvs_bdev = vbdev_lvol_store_first(); lvs_bdev != NULL; + lvs_bdev = vbdev_lvol_store_next(lvs_bdev)) { + rpc_dump_lvol_store_info(w, lvs_bdev); + } + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_lvol_get_lvstores(&req); +} + +SPDK_RPC_REGISTER("bdev_lvol_get_lvstores", rpc_bdev_lvol_get_lvstores, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_get_lvstores, get_lvol_stores) diff --git a/src/spdk/module/bdev/malloc/Makefile b/src/spdk/module/bdev/malloc/Makefile new file mode 100644 index 000000000..c55db23ce --- /dev/null +++ b/src/spdk/module/bdev/malloc/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = bdev_malloc.c bdev_malloc_rpc.c +LIBNAME = bdev_malloc +LOCAL_SYS_LIBS = -luuid + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/malloc/bdev_malloc.c b/src/spdk/module/bdev/malloc/bdev_malloc.c new file mode 100644 index 000000000..ce0403153 --- /dev/null +++ b/src/spdk/module/bdev/malloc/bdev_malloc.c @@ -0,0 +1,532 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "bdev_malloc.h" +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/accel_engine.h" +#include "spdk/json.h" +#include "spdk/thread.h" +#include "spdk/queue.h" +#include "spdk/string.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +struct malloc_disk { + struct spdk_bdev disk; + void *malloc_buf; + TAILQ_ENTRY(malloc_disk) link; +}; + +struct malloc_task { + int num_outstanding; + enum spdk_bdev_io_status status; +}; + +static void +malloc_done(void *ref, int status) +{ + struct malloc_task *task = (struct malloc_task *)ref; + + if (status != 0) { + if (status == -ENOMEM) { + task->status = SPDK_BDEV_IO_STATUS_NOMEM; + } else { + task->status = SPDK_BDEV_IO_STATUS_FAILED; + } + } + + if (--task->num_outstanding == 0) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); + } +} + +static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks); + +int malloc_disk_count = 0; + +static int bdev_malloc_initialize(void); +static void bdev_malloc_get_spdk_running_config(FILE *fp); + +static int +bdev_malloc_get_ctx_size(void) +{ + return sizeof(struct malloc_task); +} + +static struct spdk_bdev_module malloc_if = { + .name = "malloc", + .module_init = bdev_malloc_initialize, + .config_text = bdev_malloc_get_spdk_running_config, + .get_ctx_size = bdev_malloc_get_ctx_size, + +}; + +SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if) + +static void +malloc_disk_free(struct malloc_disk *malloc_disk) +{ + if (!malloc_disk) { + return; + } + + free(malloc_disk->disk.name); + spdk_free(malloc_disk->malloc_buf); + free(malloc_disk); +} + +static int +bdev_malloc_destruct(void *ctx) +{ + struct malloc_disk *malloc_disk = ctx; + + TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link); + malloc_disk_free(malloc_disk); + return 0; +} + +static int +bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes) +{ + int i; + + for (i = 0; i < iovcnt; i++) { + if (nbytes < iovs[i].iov_len) { + return 0; + } + + nbytes -= iovs[i].iov_len; + } + + return nbytes != 0; +} + +static void +bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch, + struct malloc_task *task, + struct iovec *iov, int iovcnt, size_t len, uint64_t offset) +{ + int64_t res = 0; + void *src = mdisk->malloc_buf + offset; + int i; + + if (bdev_malloc_check_iov_len(iov, iovcnt, len)) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), + SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "read %lu bytes from offset %#lx\n", + len, offset); + + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + task->num_outstanding = iovcnt; + + for (i = 0; i < iovcnt; i++) { + res = spdk_accel_submit_copy(ch, iov[i].iov_base, + src, iov[i].iov_len, malloc_done, task); + + if (res != 0) { + malloc_done(task, res); + } + + src += iov[i].iov_len; + len -= iov[i].iov_len; + } +} + +static void +bdev_malloc_writev(struct 
malloc_disk *mdisk, struct spdk_io_channel *ch, + struct malloc_task *task, + struct iovec *iov, int iovcnt, size_t len, uint64_t offset) +{ + int64_t res = 0; + void *dst = mdisk->malloc_buf + offset; + int i; + + if (bdev_malloc_check_iov_len(iov, iovcnt, len)) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), + SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "wrote %lu bytes to offset %#lx\n", + len, offset); + + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + task->num_outstanding = iovcnt; + + for (i = 0; i < iovcnt; i++) { + res = spdk_accel_submit_copy(ch, dst, iov[i].iov_base, + iov[i].iov_len, malloc_done, task); + + if (res != 0) { + malloc_done(task, res); + } + + dst += iov[i].iov_len; + } +} + +static int +bdev_malloc_unmap(struct malloc_disk *mdisk, + struct spdk_io_channel *ch, + struct malloc_task *task, + uint64_t offset, + uint64_t byte_count) +{ + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + task->num_outstanding = 1; + + return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0, + byte_count, malloc_done, task); +} + +static int64_t +bdev_malloc_flush(struct malloc_disk *mdisk, struct malloc_task *task, + uint64_t offset, uint64_t nbytes) +{ + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), SPDK_BDEV_IO_STATUS_SUCCESS); + + return 0; +} + +static int +bdev_malloc_reset(struct malloc_disk *mdisk, struct malloc_task *task) +{ + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), SPDK_BDEV_IO_STATUS_SUCCESS); + + return 0; +} + +static int _bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + uint32_t block_size = bdev_io->bdev->blocklen; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + if (bdev_io->u.bdev.iovs[0].iov_base == NULL) { + assert(bdev_io->u.bdev.iovcnt == 1); + bdev_io->u.bdev.iovs[0].iov_base = + ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf + + bdev_io->u.bdev.offset_blocks * block_size; + bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size; + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; + } + + bdev_malloc_readv((struct malloc_disk *)bdev_io->bdev->ctxt, + ch, + (struct malloc_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * block_size, + bdev_io->u.bdev.offset_blocks * block_size); + return 0; + + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_malloc_writev((struct malloc_disk *)bdev_io->bdev->ctxt, + ch, + (struct malloc_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * block_size, + bdev_io->u.bdev.offset_blocks * block_size); + return 0; + + case SPDK_BDEV_IO_TYPE_RESET: + return bdev_malloc_reset((struct malloc_disk *)bdev_io->bdev->ctxt, + (struct malloc_task *)bdev_io->driver_ctx); + + case SPDK_BDEV_IO_TYPE_FLUSH: + return bdev_malloc_flush((struct malloc_disk *)bdev_io->bdev->ctxt, + (struct malloc_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.offset_blocks * block_size, + bdev_io->u.bdev.num_blocks * block_size); + + case SPDK_BDEV_IO_TYPE_UNMAP: + return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt, + ch, + (struct malloc_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.offset_blocks * block_size, + bdev_io->u.bdev.num_blocks * block_size); + + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + /* bdev_malloc_unmap is implemented with a call to mem_cpy_fill which zeroes out all of the requested bytes. 
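For a RAM-backed disk, zeroing a range and unmapping it are equivalent, which is why both I/O types share bdev_malloc_unmap() above; the fill is submitted through spdk_accel_submit_fill() with a fill value of 0. 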
*/ + return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt, + ch, + (struct malloc_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.offset_blocks * block_size, + bdev_io->u.bdev.num_blocks * block_size); + + case SPDK_BDEV_IO_TYPE_ZCOPY: + if (bdev_io->u.bdev.zcopy.start) { + void *buf; + size_t len; + + buf = ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf + + bdev_io->u.bdev.offset_blocks * block_size; + len = bdev_io->u.bdev.num_blocks * block_size; + spdk_bdev_io_set_buf(bdev_io, buf, len); + + } + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; + case SPDK_BDEV_IO_TYPE_ABORT: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return 0; + default: + return -1; + } + return 0; +} + +static void bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_malloc_submit_request(ch, bdev_io) != 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + case SPDK_BDEV_IO_TYPE_ZCOPY: + case SPDK_BDEV_IO_TYPE_ABORT: + return true; + + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_malloc_get_io_channel(void *ctx) +{ + return spdk_accel_engine_get_io_channel(); +} + +static void +bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + char uuid_str[SPDK_UUID_STRING_LEN]; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_malloc_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt); + spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table malloc_fn_table = { + .destruct = bdev_malloc_destruct, + .submit_request = bdev_malloc_submit_request, + .io_type_supported = bdev_malloc_io_type_supported, + .get_io_channel = bdev_malloc_get_io_channel, + .write_config_json = bdev_malloc_write_json_config, +}; + +int +create_malloc_disk(struct spdk_bdev **bdev, const char *name, const struct spdk_uuid *uuid, + uint64_t num_blocks, uint32_t block_size) +{ + struct malloc_disk *mdisk; + int rc; + + if (num_blocks == 0) { + SPDK_ERRLOG("Disk num_blocks must be greater than 0"); + return -EINVAL; + } + + mdisk = calloc(1, sizeof(*mdisk)); + if (!mdisk) { + SPDK_ERRLOG("mdisk calloc() failed\n"); + return -ENOMEM; + } + + /* + * Allocate the large backend memory buffer from pinned memory. + * + * TODO: need to pass a hint so we know which socket to allocate + * from on multi-socket systems. 
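+ * Until such a hint exists, the buffer below is placed with + * SPDK_ENV_LCORE_ID_ANY (no NUMA preference), 2 MiB aligned and + * pinned for DMA (SPDK_MALLOC_DMA). 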
+ */ + mdisk->malloc_buf = spdk_zmalloc(num_blocks * block_size, 2 * 1024 * 1024, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!mdisk->malloc_buf) { + SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n"); + malloc_disk_free(mdisk); + return -ENOMEM; + } + + if (name) { + mdisk->disk.name = strdup(name); + } else { + /* Auto-generate a name */ + mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count); + malloc_disk_count++; + } + if (!mdisk->disk.name) { + malloc_disk_free(mdisk); + return -ENOMEM; + } + mdisk->disk.product_name = "Malloc disk"; + + mdisk->disk.write_cache = 1; + mdisk->disk.blocklen = block_size; + mdisk->disk.blockcnt = num_blocks; + if (uuid) { + mdisk->disk.uuid = *uuid; + } else { + spdk_uuid_generate(&mdisk->disk.uuid); + } + + mdisk->disk.ctxt = mdisk; + mdisk->disk.fn_table = &malloc_fn_table; + mdisk->disk.module = &malloc_if; + + rc = spdk_bdev_register(&mdisk->disk); + if (rc) { + malloc_disk_free(mdisk); + return rc; + } + + *bdev = &(mdisk->disk); + + TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link); + + return rc; +} + +void +delete_malloc_disk(struct spdk_bdev *bdev, spdk_delete_malloc_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &malloc_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static int bdev_malloc_initialize(void) +{ + struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Malloc"); + int NumberOfLuns, LunSizeInMB, BlockSize, i, rc = 0; + uint64_t size; + struct spdk_bdev *bdev; + + malloc_disk_count = 0; + + if (sp != NULL) { + NumberOfLuns = spdk_conf_section_get_intval(sp, "NumberOfLuns"); + LunSizeInMB = spdk_conf_section_get_intval(sp, "LunSizeInMB"); + BlockSize = spdk_conf_section_get_intval(sp, "BlockSize"); + if ((NumberOfLuns < 1) || (LunSizeInMB < 1)) { + SPDK_ERRLOG("Malloc section present, but no devices specified\n"); + goto end; + } + if (BlockSize < 1) { + /* Default is 512 bytes */ + BlockSize = 512; + } + size = (uint64_t)LunSizeInMB * 1024 * 1024; + for (i = 0; i < NumberOfLuns; i++) { + rc = create_malloc_disk(&bdev, NULL, NULL, size / BlockSize, BlockSize); + if (rc) { + SPDK_ERRLOG("Could not create malloc disk\n"); + goto end; + } + } + } + +end: + return rc; +} + +static void +bdev_malloc_get_spdk_running_config(FILE *fp) +{ + int num_malloc_luns = 0; + uint64_t malloc_lun_size = 0; + struct malloc_disk *mdisk; + + /* count number of malloc LUNs, get LUN size */ + TAILQ_FOREACH(mdisk, &g_malloc_disks, link) { + if (0 == malloc_lun_size) { + /* assume all malloc luns the same size */ + malloc_lun_size = mdisk->disk.blocklen * mdisk->disk.blockcnt; + malloc_lun_size /= (1024 * 1024); + } + num_malloc_luns++; + } + + if (num_malloc_luns > 0) { + fprintf(fp, + "\n" + "# Users may change this section to create a different number or size of\n" + "# malloc LUNs.\n" + "# This will generate %d LUNs with a malloc-allocated backend. 
Each LUN\n" + "# will be %" PRIu64 "MB in size and these will be named Malloc0 through Malloc%d.\n" + "# Not all LUNs defined here are necessarily used below.\n" + "[Malloc]\n" + " NumberOfLuns %d\n" + " LunSizeInMB %" PRIu64 "\n", + num_malloc_luns, malloc_lun_size, + num_malloc_luns - 1, num_malloc_luns, + malloc_lun_size); + } +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_malloc", SPDK_LOG_BDEV_MALLOC) diff --git a/src/spdk/module/bdev/malloc/bdev_malloc.h b/src/spdk/module/bdev/malloc/bdev_malloc.h new file mode 100644 index 000000000..b683b1062 --- /dev/null +++ b/src/spdk/module/bdev/malloc/bdev_malloc.h @@ -0,0 +1,48 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_MALLOC_H +#define SPDK_BDEV_MALLOC_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" + +typedef void (*spdk_delete_malloc_complete)(void *cb_arg, int bdeverrno); + +int create_malloc_disk(struct spdk_bdev **bdev, const char *name, const struct spdk_uuid *uuid, + uint64_t num_blocks, uint32_t block_size); + +void delete_malloc_disk(struct spdk_bdev *bdev, spdk_delete_malloc_complete cb_fn, void *cb_arg); + +#endif /* SPDK_BDEV_MALLOC_H */ diff --git a/src/spdk/module/bdev/malloc/bdev_malloc_rpc.c b/src/spdk/module/bdev/malloc/bdev_malloc_rpc.c new file mode 100644 index 000000000..f151e8b1f --- /dev/null +++ b/src/spdk/module/bdev/malloc/bdev_malloc_rpc.c @@ -0,0 +1,173 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bdev_malloc.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/uuid.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +struct rpc_construct_malloc { + char *name; + char *uuid; + uint64_t num_blocks; + uint32_t block_size; +}; + +static void +free_rpc_construct_malloc(struct rpc_construct_malloc *r) +{ + free(r->name); + free(r->uuid); +} + +static const struct spdk_json_object_decoder rpc_construct_malloc_decoders[] = { + {"name", offsetof(struct rpc_construct_malloc, name), spdk_json_decode_string, true}, + {"uuid", offsetof(struct rpc_construct_malloc, uuid), spdk_json_decode_string, true}, + {"num_blocks", offsetof(struct rpc_construct_malloc, num_blocks), spdk_json_decode_uint64}, + {"block_size", offsetof(struct rpc_construct_malloc, block_size), spdk_json_decode_uint32}, +}; + +static void +rpc_bdev_malloc_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_malloc req = {NULL}; + struct spdk_json_write_ctx *w; + struct spdk_uuid *uuid = NULL; + struct spdk_uuid decoded_uuid; + struct spdk_bdev *bdev; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_construct_malloc_decoders, + SPDK_COUNTOF(rpc_construct_malloc_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + if (req.num_blocks == 0) { + spdk_jsonrpc_send_error_response(request, -EINVAL, + "Disk num_blocks must be greater than 0"); + goto cleanup; + } + + if (req.uuid) { + if (spdk_uuid_parse(&decoded_uuid, req.uuid)) { + spdk_jsonrpc_send_error_response(request, -EINVAL, + "Failed to parse bdev UUID"); + goto cleanup; + } + uuid = &decoded_uuid; + } + + rc = create_malloc_disk(&bdev, req.name, uuid, req.num_blocks, req.block_size); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + free_rpc_construct_malloc(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + return; + +cleanup: + free_rpc_construct_malloc(&req); +} +SPDK_RPC_REGISTER("bdev_malloc_create", rpc_bdev_malloc_create, 
SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_malloc_create, construct_malloc_bdev) + +struct rpc_delete_malloc { + char *name; +}; + +static void +free_rpc_delete_malloc(struct rpc_delete_malloc *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_delete_malloc_decoders[] = { + {"name", offsetof(struct rpc_delete_malloc, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_malloc_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_malloc_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_malloc req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_delete_malloc_decoders, + SPDK_COUNTOF(rpc_delete_malloc_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_INFOLOG(SPDK_LOG_BDEV_MALLOC, "bdev '%s' does not exist\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + delete_malloc_disk(bdev, rpc_bdev_malloc_delete_cb, request); + + free_rpc_delete_malloc(&req); + + return; + +cleanup: + free_rpc_delete_malloc(&req); +} +SPDK_RPC_REGISTER("bdev_malloc_delete", rpc_bdev_malloc_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_malloc_delete, delete_malloc_bdev) diff --git a/src/spdk/module/bdev/null/Makefile b/src/spdk/module/bdev/null/Makefile new file mode 100644 index 000000000..e179b01ed --- /dev/null +++ b/src/spdk/module/bdev/null/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = bdev_null.c bdev_null_rpc.c +LIBNAME = bdev_null + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/null/bdev_null.c b/src/spdk/module/bdev/null/bdev_null.c new file mode 100644 index 000000000..97aa8b03f --- /dev/null +++ b/src/spdk/module/bdev/null/bdev_null.c @@ -0,0 +1,550 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/string.h" +#include "spdk/likely.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#include "bdev_null.h" + +struct null_bdev { + struct spdk_bdev bdev; + TAILQ_ENTRY(null_bdev) tailq; +}; + +struct null_io_channel { + struct spdk_poller *poller; + TAILQ_HEAD(, spdk_bdev_io) io; +}; + +static TAILQ_HEAD(, null_bdev) g_null_bdev_head; +static void *g_null_read_buf; + +static int bdev_null_initialize(void); +static void bdev_null_finish(void); +static void bdev_null_get_spdk_running_config(FILE *fp); + +static struct spdk_bdev_module null_if = { + .name = "null", + .module_init = bdev_null_initialize, + .module_fini = bdev_null_finish, + .config_text = bdev_null_get_spdk_running_config, + .async_fini = true, +}; + +SPDK_BDEV_MODULE_REGISTER(null, &null_if) + +static int +bdev_null_destruct(void *ctx) +{ + struct null_bdev *bdev = ctx; + + TAILQ_REMOVE(&g_null_bdev_head, bdev, tailq); + free(bdev->bdev.name); + free(bdev); + + return 0; +} + +static bool +bdev_null_abort_io(struct null_io_channel *ch, struct spdk_bdev_io *bio_to_abort) +{ + struct spdk_bdev_io *bdev_io; + + TAILQ_FOREACH(bdev_io, &ch->io, module_link) { + if (bdev_io == bio_to_abort) { + TAILQ_REMOVE(&ch->io, bio_to_abort, module_link); + spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); + return true; + } + } + + return false; +} + +static void +bdev_null_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct null_io_channel *ch = spdk_io_channel_get_ctx(_ch); + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_dif_ctx dif_ctx; + struct spdk_dif_error err_blk; + int rc; + + if (SPDK_DIF_DISABLE != bdev->dif_type && + (SPDK_BDEV_IO_TYPE_READ == bdev_io->type || + SPDK_BDEV_IO_TYPE_WRITE == bdev_io->type)) { + rc = spdk_dif_ctx_init(&dif_ctx, + bdev->blocklen, + bdev->md_len, + bdev->md_interleave, + bdev->dif_is_head_of_md, + bdev->dif_type, + bdev->dif_check_flags, + bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF, + 0xFFFF, 0, 0, 0); + if (0 != rc) { + SPDK_ERRLOG("Failed to initialize DIF context, error %d\n", rc); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + } + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + if (bdev_io->u.bdev.iovs[0].iov_base == NULL) { + assert(bdev_io->u.bdev.iovcnt == 1); + if (spdk_likely(bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen <= + SPDK_BDEV_LARGE_BUF_MAX_SIZE)) { + bdev_io->u.bdev.iovs[0].iov_base = g_null_read_buf; + bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + } else { + SPDK_ERRLOG("Overflow occurred. 
Read I/O size %" PRIu64 " was larger than permitted %d\n", + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + SPDK_BDEV_LARGE_BUF_MAX_SIZE); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + } + if (SPDK_DIF_DISABLE != bdev->dif_type) { + rc = spdk_dif_generate(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks, &dif_ctx); + if (0 != rc) { + SPDK_ERRLOG("IO DIF generation failed: lba %lu, num_block %lu\n", + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + } + TAILQ_INSERT_TAIL(&ch->io, bdev_io, module_link); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + if (SPDK_DIF_DISABLE != bdev->dif_type) { + rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); + if (0 != rc) { + SPDK_ERRLOG("IO DIF verification failed: lba %lu, num_blocks %lu, " + "err_type %u, expected %u, actual %u, err_offset %u\n", + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + err_blk.err_type, + err_blk.expected, + err_blk.actual, + err_blk.err_offset); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + } + TAILQ_INSERT_TAIL(&ch->io, bdev_io, module_link); + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + case SPDK_BDEV_IO_TYPE_RESET: + TAILQ_INSERT_TAIL(&ch->io, bdev_io, module_link); + break; + case SPDK_BDEV_IO_TYPE_ABORT: + if (bdev_null_abort_io(ch, bdev_io->u.abort.bio_to_abort)) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_UNMAP: + default: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + break; + } +} + +static bool +bdev_null_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_ABORT: + return true; + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_UNMAP: + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_null_get_io_channel(void *ctx) +{ + return spdk_get_io_channel(&g_null_bdev_head); +} + +static void +bdev_null_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + char uuid_str[SPDK_UUID_STRING_LEN]; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_null_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt); + spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); + spdk_json_write_named_uint32(w, "md_size", bdev->md_len); + spdk_json_write_named_uint32(w, "dif_type", bdev->dif_type); + spdk_json_write_named_bool(w, "dif_is_head_of_md", bdev->dif_is_head_of_md); + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table null_fn_table = { + .destruct = bdev_null_destruct, + .submit_request = bdev_null_submit_request, + .io_type_supported = bdev_null_io_type_supported, + .get_io_channel = bdev_null_get_io_channel, + .write_config_json = bdev_null_write_config_json, +}; + +int +bdev_null_create(struct spdk_bdev 
**bdev, const struct spdk_null_bdev_opts *opts) +{ + struct null_bdev *null_disk; + uint32_t data_block_size; + int rc; + + if (!opts) { + SPDK_ERRLOG("No options provided for Null bdev.\n"); + return -EINVAL; + } + + if (opts->md_interleave) { + if (opts->block_size < opts->md_size) { + SPDK_ERRLOG("Interleaved metadata size can not be greater than block size.\n"); + return -EINVAL; + } + data_block_size = opts->block_size - opts->md_size; + } else { + if (opts->md_size != 0) { + SPDK_ERRLOG("Metadata in separate buffer is not supported\n"); + return -ENOTSUP; + } + data_block_size = opts->block_size; + } + + if (data_block_size % 512 != 0) { + SPDK_ERRLOG("Data block size %u is not a multiple of 512.\n", opts->block_size); + return -EINVAL; + } + + if (opts->num_blocks == 0) { + SPDK_ERRLOG("Disk must be more than 0 blocks\n"); + return -EINVAL; + } + + null_disk = calloc(1, sizeof(*null_disk)); + if (!null_disk) { + SPDK_ERRLOG("could not allocate null_bdev\n"); + return -ENOMEM; + } + + null_disk->bdev.name = strdup(opts->name); + if (!null_disk->bdev.name) { + free(null_disk); + return -ENOMEM; + } + null_disk->bdev.product_name = "Null disk"; + + null_disk->bdev.write_cache = 0; + null_disk->bdev.blocklen = opts->block_size; + null_disk->bdev.blockcnt = opts->num_blocks; + null_disk->bdev.md_len = opts->md_size; + null_disk->bdev.md_interleave = opts->md_interleave; + null_disk->bdev.dif_type = opts->dif_type; + null_disk->bdev.dif_is_head_of_md = opts->dif_is_head_of_md; + /* Current block device layer API does not propagate + * any DIF related information from user. So, we can + * not generate or verify Application Tag. + */ + switch (opts->dif_type) { + case SPDK_DIF_TYPE1: + case SPDK_DIF_TYPE2: + null_disk->bdev.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK | + SPDK_DIF_FLAGS_REFTAG_CHECK; + break; + case SPDK_DIF_TYPE3: + null_disk->bdev.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK; + break; + case SPDK_DIF_DISABLE: + break; + } + if (opts->uuid) { + null_disk->bdev.uuid = *opts->uuid; + } else { + spdk_uuid_generate(&null_disk->bdev.uuid); + } + + null_disk->bdev.ctxt = null_disk; + null_disk->bdev.fn_table = &null_fn_table; + null_disk->bdev.module = &null_if; + + rc = spdk_bdev_register(&null_disk->bdev); + if (rc) { + free(null_disk->bdev.name); + free(null_disk); + return rc; + } + + *bdev = &(null_disk->bdev); + + TAILQ_INSERT_TAIL(&g_null_bdev_head, null_disk, tailq); + + return rc; +} + +void +bdev_null_delete(struct spdk_bdev *bdev, spdk_delete_null_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &null_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static int +null_io_poll(void *arg) +{ + struct null_io_channel *ch = arg; + TAILQ_HEAD(, spdk_bdev_io) io; + struct spdk_bdev_io *bdev_io; + + TAILQ_INIT(&io); + TAILQ_SWAP(&ch->io, &io, spdk_bdev_io, module_link); + + if (TAILQ_EMPTY(&io)) { + return SPDK_POLLER_IDLE; + } + + while (!TAILQ_EMPTY(&io)) { + bdev_io = TAILQ_FIRST(&io); + TAILQ_REMOVE(&io, bdev_io, module_link); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } + + return SPDK_POLLER_BUSY; +} + +static int +null_bdev_create_cb(void *io_device, void *ctx_buf) +{ + struct null_io_channel *ch = ctx_buf; + + TAILQ_INIT(&ch->io); + ch->poller = SPDK_POLLER_REGISTER(null_io_poll, ch, 0); + + return 0; +} + +static void +null_bdev_destroy_cb(void *io_device, void *ctx_buf) +{ + struct null_io_channel *ch = ctx_buf; + + spdk_poller_unregister(&ch->poller); +} + +static void 
+_bdev_null_cleanup_cb(void *arg) +{ + spdk_free(g_null_read_buf); +} + +static int +bdev_null_initialize(void) +{ + struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Null"); + uint64_t size_in_mb, num_blocks; + int block_size, i, rc = 0; + int md_size, dif_type; + struct spdk_bdev *bdev; + const char *name, *val; + struct spdk_null_bdev_opts opts = {}; + + TAILQ_INIT(&g_null_bdev_head); + + /* + * This will be used if upper layer expects us to allocate the read buffer. + * Instead of using a real rbuf from the bdev pool, just always point to + * this same zeroed buffer. + */ + g_null_read_buf = spdk_zmalloc(SPDK_BDEV_LARGE_BUF_MAX_SIZE, 0, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + + /* + * We need to pick some unique address as our "io device" - so just use the + * address of the global tailq. + */ + spdk_io_device_register(&g_null_bdev_head, null_bdev_create_cb, null_bdev_destroy_cb, + sizeof(struct null_io_channel), + "null_bdev"); + + if (sp == NULL) { + goto end; + } + + for (i = 0; ; ++i) { + val = spdk_conf_section_get_nval(sp, "Dev", i); + if (val == NULL) { + break; + } + + name = spdk_conf_section_get_nmval(sp, "Dev", i, 0); + if (name == NULL) { + SPDK_ERRLOG("Null entry %d: Name must be provided\n", i); + continue; + } + + val = spdk_conf_section_get_nmval(sp, "Dev", i, 1); + if (val == NULL) { + SPDK_ERRLOG("Null entry %d: Size in MB must be provided\n", i); + continue; + } + + errno = 0; + size_in_mb = strtoull(val, NULL, 10); + if (errno) { + SPDK_ERRLOG("Null entry %d: Invalid size in MB %s\n", i, val); + continue; + } + + val = spdk_conf_section_get_nmval(sp, "Dev", i, 2); + if (val == NULL) { + block_size = 512; + } else { + block_size = (int)spdk_strtol(val, 10); + if (block_size <= 0) { + SPDK_ERRLOG("Null entry %d: Invalid block size %s\n", i, val); + continue; + } + } + + val = spdk_conf_section_get_nmval(sp, "Dev", i, 3); + if (val == NULL) { + md_size = 0; + } else { + md_size = (int)spdk_strtol(val, 10); + if (md_size < 0) { + SPDK_ERRLOG("Null entry %d: Invalid metadata size %s\n", i, val); + continue; + } + } + + val = spdk_conf_section_get_nmval(sp, "Dev", i, 4); + if (val == NULL) { + dif_type = SPDK_DIF_DISABLE; + } else { + dif_type = (int)spdk_strtol(val, 10); + if (dif_type < SPDK_DIF_DISABLE || dif_type > SPDK_DIF_TYPE3) { + SPDK_ERRLOG("Null entry %d: Invalid data protection type %s\n", i, val); + continue; + } + } + num_blocks = size_in_mb * (1024 * 1024) / block_size; + + opts.name = name; + opts.num_blocks = num_blocks; + opts.block_size = block_size; + opts.md_size = md_size; + opts.md_interleave = true; + opts.dif_type = dif_type; + opts.dif_is_head_of_md = false; + rc = bdev_null_create(&bdev, &opts); + if (rc) { + SPDK_ERRLOG("Could not create null bdev\n"); + goto end; + } + } +end: + if (rc) { + spdk_io_device_unregister(&g_null_bdev_head, _bdev_null_cleanup_cb); + } + return rc; +} + +static void +_bdev_null_finish_cb(void *arg) +{ + spdk_free(g_null_read_buf); + spdk_bdev_module_finish_done(); +} + +static void +bdev_null_finish(void) +{ + spdk_io_device_unregister(&g_null_bdev_head, _bdev_null_finish_cb); +} + +static void +bdev_null_get_spdk_running_config(FILE *fp) +{ + struct null_bdev *bdev; + uint64_t null_bdev_size; + + fprintf(fp, "\n[Null]\n"); + + TAILQ_FOREACH(bdev, &g_null_bdev_head, tailq) { + null_bdev_size = bdev->bdev.blocklen * bdev->bdev.blockcnt; + null_bdev_size /= (1024 * 1024); + fprintf(fp, " Dev %s %" PRIu64 " %d %d %d\n", + bdev->bdev.name, null_bdev_size, bdev->bdev.blocklen, bdev->bdev.md_len, + 
bdev->bdev.dif_type); + } +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_null", SPDK_LOG_BDEV_NULL) diff --git a/src/spdk/module/bdev/null/bdev_null.h b/src/spdk/module/bdev/null/bdev_null.h new file mode 100644 index 000000000..07db54e48 --- /dev/null +++ b/src/spdk/module/bdev/null/bdev_null.h @@ -0,0 +1,67 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_NULL_H +#define SPDK_BDEV_NULL_H + +#include "spdk/stdinc.h" + +typedef void (*spdk_delete_null_complete)(void *cb_arg, int bdeverrno); + +struct spdk_bdev; +struct spdk_uuid; + +struct spdk_null_bdev_opts { + const char *name; + const struct spdk_uuid *uuid; + uint64_t num_blocks; + uint32_t block_size; + uint32_t md_size; + bool md_interleave; + enum spdk_dif_type dif_type; + bool dif_is_head_of_md; +}; + +int bdev_null_create(struct spdk_bdev **bdev, const struct spdk_null_bdev_opts *opts); + +/** + * Delete null bdev. + * + * \param bdev Pointer to null bdev. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn. + */ +void bdev_null_delete(struct spdk_bdev *bdev, spdk_delete_null_complete cb_fn, + void *cb_arg); + +#endif /* SPDK_BDEV_NULL_H */ diff --git a/src/spdk/module/bdev/null/bdev_null_rpc.c b/src/spdk/module/bdev/null/bdev_null_rpc.c new file mode 100644 index 000000000..f3a433e75 --- /dev/null +++ b/src/spdk/module/bdev/null/bdev_null_rpc.c @@ -0,0 +1,204 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#include "bdev_null.h" + +struct rpc_construct_null { + char *name; + char *uuid; + uint64_t num_blocks; + uint32_t block_size; + uint32_t md_size; + int32_t dif_type; + bool dif_is_head_of_md; +}; + +static void +free_rpc_construct_null(struct rpc_construct_null *req) +{ + free(req->name); + free(req->uuid); +} + +static const struct spdk_json_object_decoder rpc_construct_null_decoders[] = { + {"name", offsetof(struct rpc_construct_null, name), spdk_json_decode_string}, + {"uuid", offsetof(struct rpc_construct_null, uuid), spdk_json_decode_string, true}, + {"num_blocks", offsetof(struct rpc_construct_null, num_blocks), spdk_json_decode_uint64}, + {"block_size", offsetof(struct rpc_construct_null, block_size), spdk_json_decode_uint32}, + {"md_size", offsetof(struct rpc_construct_null, md_size), spdk_json_decode_uint32, true}, + {"dif_type", offsetof(struct rpc_construct_null, dif_type), spdk_json_decode_int32, true}, + {"dif_is_head_of_md", offsetof(struct rpc_construct_null, dif_is_head_of_md), spdk_json_decode_bool, true}, +}; + +static void +rpc_bdev_null_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_null req = {}; + struct spdk_json_write_ctx *w; + struct spdk_uuid *uuid = NULL; + struct spdk_uuid decoded_uuid; + struct spdk_bdev *bdev; + struct spdk_null_bdev_opts opts = {}; + uint32_t data_block_size; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_construct_null_decoders, + SPDK_COUNTOF(rpc_construct_null_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NULL, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + if (req.block_size < req.md_size) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, + "Interleaved metadata size cannot be greater than block size"); + goto cleanup; + } + data_block_size = req.block_size - req.md_size; + if (data_block_size % 512 != 0) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, + "Data block size %u is not a multiple of 512", data_block_size); + goto cleanup; + } + + if (req.num_blocks == 0) { +
spdk_jsonrpc_send_error_response(request, -EINVAL, + "Disk num_blocks must be greater than 0"); + goto cleanup; + } + + if (req.uuid) { + if (spdk_uuid_parse(&decoded_uuid, req.uuid)) { + spdk_jsonrpc_send_error_response(request, -EINVAL, + "Failed to parse bdev UUID"); + goto cleanup; + } + uuid = &decoded_uuid; + } + + if (req.dif_type < SPDK_DIF_DISABLE || req.dif_type > SPDK_DIF_TYPE3) { + spdk_jsonrpc_send_error_response(request, -EINVAL, "Invalid protection information type"); + goto cleanup; + } + + opts.name = req.name; + opts.uuid = uuid; + opts.num_blocks = req.num_blocks; + opts.block_size = req.block_size; + opts.md_size = req.md_size; + opts.md_interleave = true; + opts.dif_type = req.dif_type; + opts.dif_is_head_of_md = req.dif_is_head_of_md; + rc = bdev_null_create(&bdev, &opts); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, bdev->name); + spdk_jsonrpc_end_result(request, w); + free_rpc_construct_null(&req); + return; + +cleanup: + free_rpc_construct_null(&req); +} +SPDK_RPC_REGISTER("bdev_null_create", rpc_bdev_null_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_null_create, construct_null_bdev) + +struct rpc_delete_null { + char *name; +}; + +static void +free_rpc_delete_null(struct rpc_delete_null *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_null_decoders[] = { + {"name", offsetof(struct rpc_delete_null, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_null_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_null_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_null req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_delete_null_decoders, + SPDK_COUNTOF(rpc_delete_null_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + bdev_null_delete(bdev, rpc_bdev_null_delete_cb, request); + + free_rpc_delete_null(&req); + + return; + +cleanup: + free_rpc_delete_null(&req); +} +SPDK_RPC_REGISTER("bdev_null_delete", rpc_bdev_null_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_null_delete, delete_null_bdev) diff --git a/src/spdk/module/bdev/nvme/Makefile b/src/spdk/module/bdev/nvme/Makefile new file mode 100644 index 000000000..f9ddb2389 --- /dev/null +++ b/src/spdk/module/bdev/nvme/Makefile @@ -0,0 +1,50 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = bdev_nvme.c bdev_nvme_rpc.c nvme_rpc.c common.c bdev_ocssd.c bdev_ocssd_rpc.c +C_SRCS-$(CONFIG_NVME_CUSE) += bdev_nvme_cuse_rpc.c + +ifeq ($(OS),Linux) +C_SRCS += vbdev_opal.c vbdev_opal_rpc.c +endif +LIBNAME = bdev_nvme + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/nvme/bdev_nvme.c b/src/spdk/module/bdev/nvme/bdev_nvme.c new file mode 100644 index 000000000..4a89b8eb2 --- /dev/null +++ b/src/spdk/module/bdev/nvme/bdev_nvme.c @@ -0,0 +1,2924 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "bdev_nvme.h" +#include "bdev_ocssd.h" + +#include "spdk/config.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/bdev.h" +#include "spdk/json.h" +#include "spdk/nvme.h" +#include "spdk/nvme_ocssd.h" +#include "spdk/thread.h" +#include "spdk/string.h" +#include "spdk/likely.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true + +static void bdev_nvme_get_spdk_running_config(FILE *fp); +static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); + +struct nvme_bdev_io { + /** array of iovecs to transfer. */ + struct iovec *iovs; + + /** Number of iovecs in iovs array. */ + int iovcnt; + + /** Current iovec position. */ + int iovpos; + + /** Offset in current iovec. */ + uint32_t iov_offset; + + /** array of iovecs to transfer. */ + struct iovec *fused_iovs; + + /** Number of iovecs in iovs array. */ + int fused_iovcnt; + + /** Current iovec position. */ + int fused_iovpos; + + /** Offset in current iovec. */ + uint32_t fused_iov_offset; + + /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ + struct spdk_nvme_cpl cpl; + + /** Originating thread */ + struct spdk_thread *orig_thread; + + /** Keeps track if first of fused commands was submitted */ + bool first_fused_submitted; +}; + +struct nvme_probe_ctx { + size_t count; + struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS]; + struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS]; + const char *names[NVME_MAX_CONTROLLERS]; + uint32_t prchk_flags[NVME_MAX_CONTROLLERS]; + const char *hostnqn; +}; + +struct nvme_probe_skip_entry { + struct spdk_nvme_transport_id trid; + TAILQ_ENTRY(nvme_probe_skip_entry) tailq; +}; +/* All the controllers deleted by users via RPC are skipped by hotplug monitor */ +static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( + g_skipped_nvme_ctrlrs); + +static struct spdk_bdev_nvme_opts g_opts = { + .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, + .timeout_us = 0, + .retry_count = 4, + .arbitration_burst = 0, + .low_priority_weight = 0, + .medium_priority_weight = 0, + .high_priority_weight = 0, + .nvme_adminq_poll_period_us = 10000ULL, + .nvme_ioq_poll_period_us = 0, + .io_queue_requests = 0, + .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, +}; + +#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL +#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL + +static int g_hot_insert_nvme_controller_index = 0; +static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; +static bool g_nvme_hotplug_enabled = false; +static struct spdk_thread *g_bdev_nvme_init_thread; +static struct spdk_poller *g_hotplug_poller; +static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; +static char *g_nvme_hostnqn = NULL; + +static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, + struct nvme_async_probe_ctx *ctx); +static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx); +static int bdev_nvme_library_init(void); +static void bdev_nvme_library_fini(void); +static int bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba); +static int bdev_nvme_no_pi_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct iovec *iov, int 
iovcnt, void *md, uint64_t lba_count, uint64_t lba); +static int bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba); +static int bdev_nvme_comparev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba); +static int bdev_nvme_comparev_and_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, + int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba); +static int bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); +static int bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); +static int bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len); +static int bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio); +static int bdev_nvme_abort(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); + +typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, + struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); +static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, + struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); + +static populate_namespace_fn g_populate_namespace_fn[] = { + NULL, + nvme_ctrlr_populate_standard_namespace, + bdev_ocssd_populate_namespace, +}; + +typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *ns); +static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns); + +static depopulate_namespace_fn g_depopulate_namespace_fn[] = { + NULL, + nvme_ctrlr_depopulate_standard_namespace, + bdev_ocssd_depopulate_namespace, +}; + +typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns); +static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, + struct nvme_bdev_ns *ns); + +static config_json_namespace_fn g_config_json_namespace_fn[] = { + NULL, + nvme_ctrlr_config_json_standard_namespace, + bdev_ocssd_namespace_config_json, +}; + +struct spdk_nvme_qpair * +bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) +{ + struct nvme_io_channel *nvme_ch; + + nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); + + return nvme_ch->qpair; +} + +static int +bdev_nvme_get_ctx_size(void) +{ + return sizeof(struct nvme_bdev_io); +} + +static struct spdk_bdev_module nvme_if = { + .name = "nvme", + .async_fini = true, + .module_init = bdev_nvme_library_init, + .module_fini = bdev_nvme_library_fini, + .config_text = bdev_nvme_get_spdk_running_config, + .config_json = bdev_nvme_config_json, + .get_ctx_size = bdev_nvme_get_ctx_size, + +}; +SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) + +static void +bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "qpair %p is disconnected, attempting reconnect.\n", qpair); + /* + * Currently, just try to reconnect indefinitely.
If we are doing a reset, the reset will + * reconnect a qpair and we will stop getting a callback for this one. + */ + spdk_nvme_ctrlr_reconnect_io_qpair(qpair); +} + +static int +bdev_nvme_poll(void *arg) +{ + struct nvme_bdev_poll_group *group = arg; + int64_t num_completions; + + if (group->collect_spin_stat && group->start_ticks == 0) { + group->start_ticks = spdk_get_ticks(); + } + + num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, + bdev_nvme_disconnected_qpair_cb); + if (group->collect_spin_stat) { + if (num_completions > 0) { + if (group->end_ticks != 0) { + group->spin_ticks += (group->end_ticks - group->start_ticks); + group->end_ticks = 0; + } + group->start_ticks = 0; + } else { + group->end_ticks = spdk_get_ticks(); + } + } + + return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +} + +static int +bdev_nvme_poll_adminq(void *arg) +{ + int32_t rc; + struct spdk_nvme_ctrlr *ctrlr = arg; + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + + rc = spdk_nvme_ctrlr_process_admin_completions(ctrlr); + + if (rc < 0) { + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr)); + assert(nvme_bdev_ctrlr != NULL); + bdev_nvme_reset(nvme_bdev_ctrlr, NULL); + } + + return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; +} + +static int +bdev_nvme_destruct(void *ctx) +{ + struct nvme_bdev *nvme_disk = ctx; + + nvme_bdev_detach_bdev_from_ns(nvme_disk); + + free(nvme_disk->disk.name); + free(nvme_disk); + + return 0; +} + +static int +bdev_nvme_flush(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio, + uint64_t offset, uint64_t nbytes) +{ + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS); + + return 0; +} + +static void +_bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); + struct spdk_bdev_io *bdev_io; + enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; + + /* A NULL ctx means success. */ + if (spdk_io_channel_iter_get_ctx(i) != NULL) { + status = SPDK_BDEV_IO_STATUS_FAILED; + } + + while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) { + bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets); + TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link); + spdk_bdev_io_complete(bdev_io, status); + } + + spdk_for_each_channel_continue(i, 0); +} + +static void +_bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc) +{ + /* we are using the for_each_channel cb_arg like a return code here. */ + /* If it's zero, we succeeded, otherwise, the reset failed. */ + void *cb_arg = NULL; + + if (rc) { + cb_arg = (void *)0x1; + SPDK_ERRLOG("Resetting controller failed.\n"); + } else { + SPDK_NOTICELOG("Resetting controller successful.\n"); + } + + pthread_mutex_lock(&g_bdev_nvme_mutex); + nvme_bdev_ctrlr->resetting = false; + pthread_mutex_unlock(&g_bdev_nvme_mutex); + /* Make sure we clear any pending resets before returning. 
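Any resets that were queued on the I/O channels while this one was in flight are completed here with the same status.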
*/ + spdk_for_each_channel(nvme_bdev_ctrlr, + _bdev_nvme_complete_pending_resets, + cb_arg, NULL); +} + +static void +_bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); + void *ctx = spdk_io_channel_iter_get_ctx(i); + int rc = SPDK_BDEV_IO_STATUS_SUCCESS; + + if (status) { + rc = SPDK_BDEV_IO_STATUS_FAILED; + } + if (ctx) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(ctx), rc); + } + _bdev_nvme_reset_complete(nvme_bdev_ctrlr, status); +} + +static void +_bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); + struct spdk_nvme_io_qpair_opts opts; + + spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts)); + opts.delay_cmd_submit = g_opts.delay_cmd_submit; + opts.create_only = true; + + nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts)); + if (!nvme_ch->qpair) { + spdk_for_each_channel_continue(i, -1); + return; + } + + assert(nvme_ch->group != NULL); + if (spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair) != 0) { + SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); + spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); + spdk_for_each_channel_continue(i, -1); + return; + } + + if (spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair)) { + SPDK_ERRLOG("Unable to connect I/O qpair.\n"); + spdk_nvme_poll_group_remove(nvme_ch->group->group, nvme_ch->qpair); + spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); + spdk_for_each_channel_continue(i, -1); + return; + } + + spdk_for_each_channel_continue(i, 0); +} + +static void +_bdev_nvme_reset(struct spdk_io_channel_iter *i, int status) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); + struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); + int rc; + + if (status) { + if (bio) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED); + } + _bdev_nvme_reset_complete(nvme_bdev_ctrlr, status); + return; + } + + rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr); + if (rc != 0) { + if (bio) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED); + } + _bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc); + return; + } + + /* Recreate all of the I/O queue pairs */ + spdk_for_each_channel(nvme_bdev_ctrlr, + _bdev_nvme_reset_create_qpair, + bio, + _bdev_nvme_reset_create_qpairs_done); + + +} + +static void +_bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + int rc; + + rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); + if (!rc) { + nvme_ch->qpair = NULL; + } + + spdk_for_each_channel_continue(i, rc); +} + +static int +bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio) +{ + struct spdk_io_channel *ch; + struct nvme_io_channel *nvme_ch; + + pthread_mutex_lock(&g_bdev_nvme_mutex); + if (nvme_bdev_ctrlr->destruct) { + /* Don't bother resetting if the controller is in the process of being destructed. 
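Completing the reset I/O as failed lets the caller see that the controller is going away instead of waiting on a reset that will never run.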
*/ + if (bio) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED); + } + pthread_mutex_unlock(&g_bdev_nvme_mutex); + return 0; + } + + if (!nvme_bdev_ctrlr->resetting) { + nvme_bdev_ctrlr->resetting = true; + } else { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); + /* + * The internal reset calls won't be queued. This is on purpose so that we don't + * interfere with the app framework reset strategy. i.e. we are deferring to the + * upper level. If they are in the middle of a reset, we won't try to schedule another one. + */ + if (bio) { + ch = spdk_get_io_channel(nvme_bdev_ctrlr); + assert(ch != NULL); + nvme_ch = spdk_io_channel_get_ctx(ch); + TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, spdk_bdev_io_from_ctx(bio), module_link); + spdk_put_io_channel(ch); + } + return 0; + } + + pthread_mutex_unlock(&g_bdev_nvme_mutex); + /* First, delete all NVMe I/O queue pairs. */ + spdk_for_each_channel(nvme_bdev_ctrlr, + _bdev_nvme_reset_destroy_qpair, + bio, + _bdev_nvme_reset); + + return 0; +} + +static int +bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + uint64_t offset_blocks, + uint64_t num_blocks); + +static void +bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + int ret; + + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + ret = bdev_nvme_readv((struct nvme_bdev *)bdev_io->bdev->ctxt, + ch, + (struct nvme_bdev_io *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.md_buf, + bdev_io->u.bdev.num_blocks, + bdev_io->u.bdev.offset_blocks); + + if (spdk_likely(ret == 0)) { + return; + } else if (ret == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static int +_bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; + struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; + struct nvme_bdev_io *nbdev_io_to_abort; + + if (nvme_ch->qpair == NULL) { + /* The device is currently resetting */ + return -1; + } + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + + case SPDK_BDEV_IO_TYPE_WRITE: + return bdev_nvme_writev(nbdev, + ch, + nbdev_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.md_buf, + bdev_io->u.bdev.num_blocks, + bdev_io->u.bdev.offset_blocks); + + case SPDK_BDEV_IO_TYPE_COMPARE: + return bdev_nvme_comparev(nbdev, + ch, + nbdev_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.md_buf, + bdev_io->u.bdev.num_blocks, + bdev_io->u.bdev.offset_blocks); + + case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: + return bdev_nvme_comparev_and_writev(nbdev, + ch, + nbdev_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.fused_iovs, + bdev_io->u.bdev.fused_iovcnt, + bdev_io->u.bdev.md_buf, + bdev_io->u.bdev.num_blocks, + bdev_io->u.bdev.offset_blocks); + + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + return bdev_nvme_unmap(nbdev, + ch, + nbdev_io, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks); + + case SPDK_BDEV_IO_TYPE_UNMAP: + return 
bdev_nvme_unmap(nbdev, + ch, + nbdev_io, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks); + + case SPDK_BDEV_IO_TYPE_RESET: + return bdev_nvme_reset(nbdev->nvme_bdev_ctrlr, nbdev_io); + + case SPDK_BDEV_IO_TYPE_FLUSH: + return bdev_nvme_flush(nbdev, + nbdev_io, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks); + + case SPDK_BDEV_IO_TYPE_NVME_ADMIN: + return bdev_nvme_admin_passthru(nbdev, + ch, + nbdev_io, + &bdev_io->u.nvme_passthru.cmd, + bdev_io->u.nvme_passthru.buf, + bdev_io->u.nvme_passthru.nbytes); + + case SPDK_BDEV_IO_TYPE_NVME_IO: + return bdev_nvme_io_passthru(nbdev, + ch, + nbdev_io, + &bdev_io->u.nvme_passthru.cmd, + bdev_io->u.nvme_passthru.buf, + bdev_io->u.nvme_passthru.nbytes); + + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + return bdev_nvme_io_passthru_md(nbdev, + ch, + nbdev_io, + &bdev_io->u.nvme_passthru.cmd, + bdev_io->u.nvme_passthru.buf, + bdev_io->u.nvme_passthru.nbytes, + bdev_io->u.nvme_passthru.md_buf, + bdev_io->u.nvme_passthru.md_len); + + case SPDK_BDEV_IO_TYPE_ABORT: + nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; + return bdev_nvme_abort(nbdev, + ch, + nbdev_io, + nbdev_io_to_abort); + + default: + return -EINVAL; + } + return 0; +} + +static void +bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + int rc = _bdev_nvme_submit_request(ch, bdev_io); + + if (spdk_unlikely(rc != 0)) { + if (rc == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +static bool +bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct nvme_bdev *nbdev = ctx; + const struct spdk_nvme_ctrlr_data *cdata; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_NVME_ADMIN: + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_ABORT: + return true; + + case SPDK_BDEV_IO_TYPE_COMPARE: + return spdk_nvme_ns_supports_compare(nbdev->nvme_ns->ns); + + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + return spdk_nvme_ns_get_md_size(nbdev->nvme_ns->ns) ? true : false; + + case SPDK_BDEV_IO_TYPE_UNMAP: + cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_bdev_ctrlr->ctrlr); + return cdata->oncs.dsm; + + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_bdev_ctrlr->ctrlr); + /* + * If an NVMe controller guarantees reading unallocated blocks returns zero, + * we can implement WRITE_ZEROES as an NVMe deallocate command. + */ + if (cdata->oncs.dsm && + spdk_nvme_ns_get_dealloc_logical_block_read_value(nbdev->nvme_ns->ns) == + SPDK_NVME_DEALLOC_READ_00) { + return true; + } + /* + * The NVMe controller write_zeroes function is currently not used by our driver. + * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail. + * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read. 
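+ * In effect, WRITE_ZEROES is emulated on top of deallocate: both it and UNMAP are submitted through bdev_nvme_unmap() in _bdev_nvme_submit_request().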
+ */ + return false; + + case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: + if (spdk_nvme_ctrlr_get_flags(nbdev->nvme_bdev_ctrlr->ctrlr) & + SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { + return true; + } + return false; + + default: + return false; + } +} + +static int +bdev_nvme_create_cb(void *io_device, void *ctx_buf) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device; + struct nvme_io_channel *ch = ctx_buf; + struct spdk_nvme_io_qpair_opts opts; + struct spdk_io_channel *pg_ch = NULL; + int rc; + + spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts)); + opts.delay_cmd_submit = g_opts.delay_cmd_submit; + opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); + opts.create_only = true; + g_opts.io_queue_requests = opts.io_queue_requests; + + ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts)); + + if (ch->qpair == NULL) { + return -1; + } + + if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { + if (bdev_ocssd_create_io_channel(ch)) { + goto err; + } + } + + pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); + if (!pg_ch) { + goto err; + } + + ch->group = spdk_io_channel_get_ctx(pg_ch); + if (spdk_nvme_poll_group_add(ch->group->group, ch->qpair) != 0) { + goto err; + } + + rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, ch->qpair); + if (rc) { + spdk_nvme_poll_group_remove(ch->group->group, ch->qpair); + goto err; + } + +#ifdef SPDK_CONFIG_VTUNE + ch->group->collect_spin_stat = true; +#else + ch->group->collect_spin_stat = false; +#endif + + TAILQ_INIT(&ch->pending_resets); + return 0; + +err: + if (pg_ch) { + spdk_put_io_channel(pg_ch); + } + spdk_nvme_ctrlr_free_io_qpair(ch->qpair); + return -1; +} + +static void +bdev_nvme_destroy_cb(void *io_device, void *ctx_buf) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device; + struct nvme_io_channel *ch = ctx_buf; + struct nvme_bdev_poll_group *group; + + group = ch->group; + assert(group != NULL); + + if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { + bdev_ocssd_destroy_io_channel(ch); + } + + if (ch->qpair != NULL) { + spdk_nvme_poll_group_remove(group->group, ch->qpair); + } + spdk_put_io_channel(spdk_io_channel_from_ctx(group)); + + spdk_nvme_ctrlr_free_io_qpair(ch->qpair); +} + +static int +bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf) +{ + struct nvme_bdev_poll_group *group = ctx_buf; + + group->group = spdk_nvme_poll_group_create(group); + if (group->group == NULL) { + return -1; + } + + group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); + + if (group->poller == NULL) { + spdk_nvme_poll_group_destroy(group->group); + return -1; + } + + return 0; +} + +static void +bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf) +{ + struct nvme_bdev_poll_group *group = ctx_buf; + + spdk_poller_unregister(&group->poller); + if (spdk_nvme_poll_group_destroy(group->group)) { + SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module."); + assert(false); + } +} + +static struct spdk_io_channel * +bdev_nvme_get_io_channel(void *ctx) +{ + struct nvme_bdev *nvme_bdev = ctx; + + return spdk_get_io_channel(nvme_bdev->nvme_bdev_ctrlr); +} + +static int +bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct nvme_bdev *nvme_bdev = ctx; + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_bdev->nvme_bdev_ctrlr; + const struct spdk_nvme_ctrlr_data *cdata; + struct spdk_nvme_ns *ns; + union spdk_nvme_vs_register 
vs; + union spdk_nvme_csts_register csts; + char buf[128]; + + cdata = spdk_nvme_ctrlr_get_data(nvme_bdev->nvme_bdev_ctrlr->ctrlr); + vs = spdk_nvme_ctrlr_get_regs_vs(nvme_bdev->nvme_bdev_ctrlr->ctrlr); + csts = spdk_nvme_ctrlr_get_regs_csts(nvme_bdev->nvme_bdev_ctrlr->ctrlr); + ns = nvme_bdev->nvme_ns->ns; + + spdk_json_write_named_object_begin(w, "nvme"); + + if (nvme_bdev_ctrlr->trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { + spdk_json_write_named_string(w, "pci_address", nvme_bdev_ctrlr->trid->traddr); + } + + spdk_json_write_named_object_begin(w, "trid"); + + nvme_bdev_dump_trid_json(nvme_bdev_ctrlr->trid, w); + + spdk_json_write_object_end(w); + +#ifdef SPDK_CONFIG_NVME_CUSE + size_t cuse_name_size = 128; + char cuse_name[cuse_name_size]; + + int rc = spdk_nvme_cuse_get_ns_name(nvme_bdev->nvme_bdev_ctrlr->ctrlr, spdk_nvme_ns_get_id(ns), + cuse_name, &cuse_name_size); + if (rc == 0) { + spdk_json_write_named_string(w, "cuse_device", cuse_name); + } +#endif + + spdk_json_write_named_object_begin(w, "ctrlr_data"); + + spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); + + snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); + spdk_str_trim(buf); + spdk_json_write_named_string(w, "model_number", buf); + + snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); + spdk_str_trim(buf); + spdk_json_write_named_string(w, "serial_number", buf); + + snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); + spdk_str_trim(buf); + spdk_json_write_named_string(w, "firmware_revision", buf); + + spdk_json_write_named_object_begin(w, "oacs"); + + spdk_json_write_named_uint32(w, "security", cdata->oacs.security); + spdk_json_write_named_uint32(w, "format", cdata->oacs.format); + spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); + spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + spdk_json_write_named_object_begin(w, "vs"); + + spdk_json_write_name(w, "nvme_version"); + if (vs.bits.ter) { + spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); + } else { + spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); + } + + spdk_json_write_object_end(w); + + spdk_json_write_named_object_begin(w, "csts"); + + spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy); + spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs); + + spdk_json_write_object_end(w); + + spdk_json_write_named_object_begin(w, "ns_data"); + + spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); + + spdk_json_write_object_end(w); + + if (cdata->oacs.security) { + spdk_json_write_named_object_begin(w, "security"); + + spdk_json_write_named_bool(w, "opal", nvme_bdev_ctrlr->opal_dev ? 
true : false); + + spdk_json_write_object_end(w); + } + + spdk_json_write_object_end(w); + + return 0; +} + +static void +bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* No config per bdev needed */ +} + +static uint64_t +bdev_nvme_get_spin_time(struct spdk_io_channel *ch) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + struct nvme_bdev_poll_group *group = nvme_ch->group; + uint64_t spin_time; + + if (!group || !group->collect_spin_stat) { + return 0; + } + + if (group->end_ticks != 0) { + group->spin_ticks += (group->end_ticks - group->start_ticks); + group->end_ticks = 0; + } + + spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz(); + group->start_ticks = 0; + group->spin_ticks = 0; + + return spin_time; +} + +static const struct spdk_bdev_fn_table nvmelib_fn_table = { + .destruct = bdev_nvme_destruct, + .submit_request = bdev_nvme_submit_request, + .io_type_supported = bdev_nvme_io_type_supported, + .get_io_channel = bdev_nvme_get_io_channel, + .dump_info_json = bdev_nvme_dump_info_json, + .write_config_json = bdev_nvme_write_config_json, + .get_spin_time = bdev_nvme_get_spin_time, +}; + +static void +nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, + struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx) +{ + struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; + struct nvme_bdev *bdev; + struct spdk_nvme_ns *ns; + const struct spdk_uuid *uuid; + const struct spdk_nvme_ctrlr_data *cdata; + const struct spdk_nvme_ns_data *nsdata; + int rc; + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); + if (!ns) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Invalid NS %d\n", nvme_ns->id); + nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -EINVAL); + return; + } + + bdev = calloc(1, sizeof(*bdev)); + if (!bdev) { + SPDK_ERRLOG("bdev calloc() failed\n"); + nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM); + return; + } + + bdev->nvme_bdev_ctrlr = nvme_bdev_ctrlr; + nvme_ns->ns = ns; + bdev->nvme_ns = nvme_ns; + + bdev->disk.name = spdk_sprintf_alloc("%sn%d", nvme_bdev_ctrlr->name, spdk_nvme_ns_get_id(ns)); + if (!bdev->disk.name) { + free(bdev); + nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM); + return; + } + bdev->disk.product_name = "NVMe disk"; + + bdev->disk.write_cache = 0; + if (cdata->vwc.present) { + /* Enable if the Volatile Write Cache exists */ + bdev->disk.write_cache = 1; + } + bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns); + bdev->disk.blockcnt = spdk_nvme_ns_get_num_sectors(ns); + bdev->disk.optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); + + uuid = spdk_nvme_ns_get_uuid(ns); + if (uuid != NULL) { + bdev->disk.uuid = *uuid; + } + + nsdata = spdk_nvme_ns_get_data(ns); + + bdev->disk.md_len = spdk_nvme_ns_get_md_size(ns); + if (bdev->disk.md_len != 0) { + bdev->disk.md_interleave = nsdata->flbas.extended; + bdev->disk.dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); + if (bdev->disk.dif_type != SPDK_DIF_DISABLE) { + bdev->disk.dif_is_head_of_md = nsdata->dps.md_start; + bdev->disk.dif_check_flags = nvme_bdev_ctrlr->prchk_flags; + } + } + + if (!bdev_nvme_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { + bdev->disk.acwu = 0; + } else if (nsdata->nsfeat.ns_atomic_write_unit) { + bdev->disk.acwu = nsdata->nacwu; + } else { + bdev->disk.acwu = cdata->acwu; + } + + bdev->disk.ctxt = bdev; + bdev->disk.fn_table = &nvmelib_fn_table; + bdev->disk.module 
= &nvme_if; + rc = spdk_bdev_register(&bdev->disk); + if (rc) { + free(bdev->disk.name); + free(bdev); + nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc); + return; + } + + nvme_bdev_attach_bdev_to_ns(nvme_ns, bdev); + nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, 0); +} + +static bool +hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + struct nvme_probe_skip_entry *entry; + + TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { + return false; + } + } + + opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; + opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; + opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; + opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attaching to %s\n", trid->traddr); + + return true; +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + struct nvme_probe_ctx *ctx = cb_ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Probing device %s\n", trid->traddr); + + if (nvme_bdev_ctrlr_get(trid)) { + SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", + trid->traddr); + return false; + } + + if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { + bool claim_device = false; + size_t i; + + for (i = 0; i < ctx->count; i++) { + if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { + claim_device = true; + break; + } + } + + if (!claim_device) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Not claiming device at %s\n", trid->traddr); + return false; + } + } + + if (ctx->hostnqn) { + snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", ctx->hostnqn); + } + + opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; + opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; + opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; + opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; + + return true; +} + +static void +nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = ctx; + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_WARNLOG("Abort failed. Resetting controller.\n"); + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr)); + assert(nvme_bdev_ctrlr != NULL); + bdev_nvme_reset(nvme_bdev_ctrlr, NULL); + } +} + +static void +timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, uint16_t cid) +{ + int rc; + union spdk_nvme_csts_register csts; + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + + SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); + + csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); + if (csts.bits.cfs) { + SPDK_ERRLOG("Controller Fatal Status, reset required\n"); + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr)); + assert(nvme_bdev_ctrlr != NULL); + bdev_nvme_reset(nvme_bdev_ctrlr, NULL); + return; + } + + switch (g_opts.action_on_timeout) { + case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: + if (qpair) { + rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, + nvme_abort_cpl, ctrlr); + if (rc == 0) { + return; + } + + SPDK_ERRLOG("Unable to send abort. 
Resetting.\n"); + } + + /* FALLTHROUGH */ + case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr)); + assert(nvme_bdev_ctrlr != NULL); + bdev_nvme_reset(nvme_bdev_ctrlr, NULL); + break; + case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "No action for nvme controller timeout.\n"); + break; + default: + SPDK_ERRLOG("An invalid timeout action value is found.\n"); + break; + } +} + +void +nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) +{ + pthread_mutex_lock(&g_bdev_nvme_mutex); + nvme_bdev_ctrlr->ref--; + + if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); + return; + } + + pthread_mutex_unlock(&g_bdev_nvme_mutex); +} + +static void +nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns) +{ + struct nvme_bdev *bdev, *tmp; + + TAILQ_FOREACH_SAFE(bdev, &ns->bdevs, tailq, tmp) { + spdk_bdev_unregister(&bdev->disk, NULL, NULL); + } + + ns->populated = false; + + nvme_ctrlr_depopulate_namespace_done(ns->ctrlr); +} + +static void nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns, + struct nvme_async_probe_ctx *ctx) +{ + g_populate_namespace_fn[ns->type](ctrlr, ns, ctx); +} + +static void nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns) +{ + g_depopulate_namespace_fn[ns->type](ns); +} + +void +nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx, + struct nvme_bdev_ns *ns, int rc) +{ + if (rc == 0) { + ns->populated = true; + pthread_mutex_lock(&g_bdev_nvme_mutex); + ns->ctrlr->ref++; + pthread_mutex_unlock(&g_bdev_nvme_mutex); + } else { + memset(ns, 0, sizeof(*ns)); + } + + if (ctx) { + ctx->populates_in_progress--; + if (ctx->populates_in_progress == 0) { + nvme_ctrlr_populate_namespaces_done(ctx); + } + } +} + +static void +nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, + struct nvme_async_probe_ctx *ctx) +{ + struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; + struct nvme_bdev_ns *ns; + struct spdk_nvme_ns *nvme_ns; + struct nvme_bdev *bdev; + uint32_t i; + int rc; + uint64_t num_sectors; + bool ns_is_active; + + if (ctx) { + /* Initialize this count to 1 to handle the populate functions + * calling nvme_ctrlr_populate_namespace_done() immediately. 
+ */ + ctx->populates_in_progress = 1; + } + + for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { + uint32_t nsid = i + 1; + + ns = nvme_bdev_ctrlr->namespaces[i]; + ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid); + + if (ns->populated && ns_is_active && ns->type == NVME_BDEV_NS_STANDARD) { + /* NS is still there but attributes may have changed */ + nvme_ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + num_sectors = spdk_nvme_ns_get_num_sectors(nvme_ns); + bdev = TAILQ_FIRST(&ns->bdevs); + if (bdev->disk.blockcnt != num_sectors) { + SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %lu, new size %lu\n", + nsid, + bdev->disk.name, + bdev->disk.blockcnt, + num_sectors); + rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); + if (rc != 0) { + SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", + bdev->disk.name, rc); + } + } + } + + if (!ns->populated && ns_is_active) { + ns->id = nsid; + ns->ctrlr = nvme_bdev_ctrlr; + if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { + ns->type = NVME_BDEV_NS_OCSSD; + } else { + ns->type = NVME_BDEV_NS_STANDARD; + } + + TAILQ_INIT(&ns->bdevs); + + if (ctx) { + ctx->populates_in_progress++; + } + nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, ns, ctx); + } + + if (ns->populated && !ns_is_active) { + nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns); + } + } + + if (ctx) { + /* Decrement this count now that the loop is over to account + * for the one we started with. If the count is then 0, we + * know any populate_namespace functions completed immediately, + * so we'll kick the callback here. + */ + ctx->populates_in_progress--; + if (ctx->populates_in_progress == 0) { + nvme_ctrlr_populate_namespaces_done(ctx); + } + } + +} + +static void +aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; + union spdk_nvme_async_event_completion event; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_WARNLOG("AER request execute failed"); + return; + } + + event.raw = cpl->cdw0; + if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && + (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { + nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); + } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) && + (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) && + spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { + bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr); + } +} + +static int +create_ctrlr(struct spdk_nvme_ctrlr *ctrlr, + const char *name, + const struct spdk_nvme_transport_id *trid, + uint32_t prchk_flags) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + uint32_t i; + int rc; + + nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr)); + if (nvme_bdev_ctrlr == NULL) { + SPDK_ERRLOG("Failed to allocate device struct\n"); + return -ENOMEM; + } + + nvme_bdev_ctrlr->trid = calloc(1, sizeof(*nvme_bdev_ctrlr->trid)); + if (nvme_bdev_ctrlr->trid == NULL) { + SPDK_ERRLOG("Failed to allocate device trid struct\n"); + free(nvme_bdev_ctrlr); + return -ENOMEM; + } + + nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); + nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *)); + if (!nvme_bdev_ctrlr->namespaces) { + SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); + free(nvme_bdev_ctrlr->trid); + free(nvme_bdev_ctrlr); + return -ENOMEM; + } + + for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { + nvme_bdev_ctrlr->namespaces[i] = 
calloc(1, sizeof(struct nvme_bdev_ns)); + if (nvme_bdev_ctrlr->namespaces[i] == NULL) { + SPDK_ERRLOG("Failed to allocate block namespace struct\n"); + for (; i > 0; i--) { + free(nvme_bdev_ctrlr->namespaces[i - 1]); + } + free(nvme_bdev_ctrlr->namespaces); + free(nvme_bdev_ctrlr->trid); + free(nvme_bdev_ctrlr); + return -ENOMEM; + } + } + + nvme_bdev_ctrlr->thread = spdk_get_thread(); + nvme_bdev_ctrlr->adminq_timer_poller = NULL; + nvme_bdev_ctrlr->ctrlr = ctrlr; + nvme_bdev_ctrlr->ref = 0; + *nvme_bdev_ctrlr->trid = *trid; + nvme_bdev_ctrlr->name = strdup(name); + if (nvme_bdev_ctrlr->name == NULL) { + free(nvme_bdev_ctrlr->namespaces); + free(nvme_bdev_ctrlr->trid); + free(nvme_bdev_ctrlr); + return -ENOMEM; + } + + if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { + rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to initialize OCSSD controller\n"); + free(nvme_bdev_ctrlr->name); + free(nvme_bdev_ctrlr->namespaces); + free(nvme_bdev_ctrlr->trid); + free(nvme_bdev_ctrlr); + return rc; + } + } + + nvme_bdev_ctrlr->prchk_flags = prchk_flags; + + spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb, + sizeof(struct nvme_io_channel), + name); + + nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, ctrlr, + g_opts.nvme_adminq_poll_period_us); + + TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); + + if (g_opts.timeout_us > 0) { + spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, + timeout_cb, NULL); + } + + spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr); + + if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) & + SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { + nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr); + if (nvme_bdev_ctrlr->opal_dev == NULL) { + SPDK_ERRLOG("Failed to initialize Opal\n"); + } + } + return 0; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct nvme_probe_ctx *ctx = cb_ctx; + char *name = NULL; + uint32_t prchk_flags = 0; + size_t i; + + if (ctx) { + for (i = 0; i < ctx->count; i++) { + if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { + prchk_flags = ctx->prchk_flags[i]; + name = strdup(ctx->names[i]); + break; + } + } + } else { + name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); + } + if (!name) { + SPDK_ERRLOG("Failed to assign name to NVMe device\n"); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attached to %s (%s)\n", trid->traddr, name); + + create_ctrlr(ctrlr, name, trid, prchk_flags); + + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid); + if (!nvme_bdev_ctrlr) { + SPDK_ERRLOG("Failed to find new NVMe controller\n"); + free(name); + return; + } + + nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); + + free(name); +} + +static void +remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t i; + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct nvme_bdev_ns *ns; + + pthread_mutex_lock(&g_bdev_nvme_mutex); + TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { + if (nvme_bdev_ctrlr->ctrlr == ctrlr) { + /* The controller's destruction was already started */ + if (nvme_bdev_ctrlr->destruct) { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + return; + } + pthread_mutex_unlock(&g_bdev_nvme_mutex); + for (i = 0; i < 
nvme_bdev_ctrlr->num_ns; i++) { + uint32_t nsid = i + 1; + + ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; + if (ns->populated) { + assert(ns->id == nsid); + nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns); + } + } + + pthread_mutex_lock(&g_bdev_nvme_mutex); + nvme_bdev_ctrlr->destruct = true; + if (nvme_bdev_ctrlr->ref == 0) { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); + } else { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + } + return; + } + } + pthread_mutex_unlock(&g_bdev_nvme_mutex); +} + +static int +bdev_nvme_hotplug(void *arg) +{ + struct spdk_nvme_transport_id trid_pcie; + int done; + + if (!g_hotplug_probe_ctx) { + memset(&trid_pcie, 0, sizeof(trid_pcie)); + spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); + + g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, + hotplug_probe_cb, + attach_cb, remove_cb); + if (!g_hotplug_probe_ctx) { + return SPDK_POLLER_BUSY; + } + } + + done = spdk_nvme_probe_poll_async(g_hotplug_probe_ctx); + if (done != -EAGAIN) { + g_hotplug_probe_ctx = NULL; + } + + return SPDK_POLLER_BUSY; +} + +void +bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) +{ + *opts = g_opts; +} + +int +bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) +{ + if (g_bdev_nvme_init_thread != NULL) { + if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { + return -EPERM; + } + } + + g_opts = *opts; + + return 0; +} + +struct set_nvme_hotplug_ctx { + uint64_t period_us; + bool enabled; + spdk_msg_fn fn; + void *fn_ctx; +}; + +static void +set_nvme_hotplug_period_cb(void *_ctx) +{ + struct set_nvme_hotplug_ctx *ctx = _ctx; + + spdk_poller_unregister(&g_hotplug_poller); + if (ctx->enabled) { + g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); + } + + g_nvme_hotplug_poll_period_us = ctx->period_us; + g_nvme_hotplug_enabled = ctx->enabled; + if (ctx->fn) { + ctx->fn(ctx->fn_ctx); + } + + free(ctx); +} + +int +bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) +{ + struct set_nvme_hotplug_ctx *ctx; + + if (enabled == true && !spdk_process_is_primary()) { + return -EPERM; + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + return -ENOMEM; + } + + period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; + ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); + ctx->enabled = enabled; + ctx->fn = cb; + ctx->fn_ctx = cb_ctx; + + spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); + return 0; +} + +static void +populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) +{ + if (ctx->cb_fn) { + ctx->cb_fn(ctx->cb_ctx, count, rc); + } + + free(ctx); +} + +static void +nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct nvme_bdev_ns *ns; + struct nvme_bdev *nvme_bdev, *tmp; + uint32_t i, nsid; + size_t j; + + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid); + assert(nvme_bdev_ctrlr != NULL); + + /* + * Report the new bdevs that were created in this call. + * There can be more than one bdev per NVMe controller. 
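+ * At most ctx->count names are copied into ctx->names; if the controller exposes more bdevs than that, the callback reports -ERANGE instead.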
+ */ + j = 0; + for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { + nsid = i + 1; + ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; + if (!ns->populated) { + continue; + } + assert(ns->id == nsid); + TAILQ_FOREACH_SAFE(nvme_bdev, &ns->bdevs, tailq, tmp) { + if (j < ctx->count) { + ctx->names[j] = nvme_bdev->disk.name; + j++; + } else { + SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n", + ctx->count); + populate_namespaces_cb(ctx, 0, -ERANGE); + return; + } + } + } + + populate_namespaces_cb(ctx, j, 0); +} + +static void +connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct nvme_async_probe_ctx *ctx; + int rc; + + ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); + + spdk_poller_unregister(&ctx->poller); + + rc = create_ctrlr(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags); + if (rc) { + SPDK_ERRLOG("Failed to create new device\n"); + populate_namespaces_cb(ctx, 0, rc); + return; + } + + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid); + assert(nvme_bdev_ctrlr != NULL); + + nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx); +} + +static int +bdev_nvme_async_poll(void *arg) +{ + struct nvme_async_probe_ctx *ctx = arg; + int rc; + + rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); + if (spdk_unlikely(rc != -EAGAIN && rc != 0)) { + spdk_poller_unregister(&ctx->poller); + free(ctx); + } + + return SPDK_POLLER_BUSY; +} + +int +bdev_nvme_create(struct spdk_nvme_transport_id *trid, + struct spdk_nvme_host_id *hostid, + const char *base_name, + const char **names, + uint32_t count, + const char *hostnqn, + uint32_t prchk_flags, + spdk_bdev_create_nvme_fn cb_fn, + void *cb_ctx) +{ + struct nvme_probe_skip_entry *entry, *tmp; + struct nvme_async_probe_ctx *ctx; + + if (nvme_bdev_ctrlr_get(trid) != NULL) { + SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); + return -EEXIST; + } + + if (nvme_bdev_ctrlr_get_by_name(base_name)) { + SPDK_ERRLOG("A controller with the provided name (%s) already exists.\n", base_name); + return -EEXIST; + } + + if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { + TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { + if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { + TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); + free(entry); + break; + } + } + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + return -ENOMEM; + } + ctx->base_name = base_name; + ctx->names = names; + ctx->count = count; + ctx->cb_fn = cb_fn; + ctx->cb_ctx = cb_ctx; + ctx->prchk_flags = prchk_flags; + ctx->trid = *trid; + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); + ctx->opts.transport_retry_count = g_opts.retry_count; + + if (hostnqn) { + snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn); + } + + if (hostid->hostaddr[0] != '\0') { + snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); + } + + if (hostid->hostsvcid[0] != '\0') { + snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); + } + + ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb); + if (ctx->probe_ctx == NULL) { + SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); + free(ctx); + return -ENODEV; + }
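+ /* Drive the asynchronous connect by polling the probe context every 1000 microseconds. */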
+ ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); + + return 0; +} + +int +bdev_nvme_delete(const char *name) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL; + struct nvme_probe_skip_entry *entry; + + if (name == NULL) { + return -EINVAL; + } + + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); + if (nvme_bdev_ctrlr == NULL) { + SPDK_ERRLOG("Failed to find NVMe controller\n"); + return -ENODEV; + } + + if (nvme_bdev_ctrlr->trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { + entry = calloc(1, sizeof(*entry)); + if (!entry) { + return -ENOMEM; + } + entry->trid = *nvme_bdev_ctrlr->trid; + TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); + } + + remove_cb(NULL, nvme_bdev_ctrlr->ctrlr); + return 0; +} + +static int +bdev_nvme_library_init(void) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct spdk_conf_section *sp; + const char *val; + int rc = 0; + int64_t intval = 0; + size_t i; + struct nvme_probe_ctx *probe_ctx = NULL; + int retry_count; + uint32_t local_nvme_num = 0; + int64_t hotplug_period; + bool hotplug_enabled = g_nvme_hotplug_enabled; + + g_bdev_nvme_init_thread = spdk_get_thread(); + + spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb, + bdev_nvme_poll_group_destroy_cb, + sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups"); + + sp = spdk_conf_find_section(NULL, "Nvme"); + if (sp == NULL) { + goto end; + } + + probe_ctx = calloc(1, sizeof(*probe_ctx)); + if (probe_ctx == NULL) { + SPDK_ERRLOG("Failed to allocate probe_ctx\n"); + rc = -1; + goto end; + } + + retry_count = spdk_conf_section_get_intval(sp, "RetryCount"); + if (retry_count >= 0) { + g_opts.retry_count = retry_count; + } + + val = spdk_conf_section_get_val(sp, "TimeoutUsec"); + if (val != NULL) { + intval = spdk_strtoll(val, 10); + if (intval < 0) { + SPDK_ERRLOG("Invalid TimeoutUsec value\n"); + rc = -1; + goto end; + } + } + + g_opts.timeout_us = intval; + + if (g_opts.timeout_us > 0) { + val = spdk_conf_section_get_val(sp, "ActionOnTimeout"); + if (val != NULL) { + if (!strcasecmp(val, "Reset")) { + g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET; + } else if (!strcasecmp(val, "Abort")) { + g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT; + } + } + } + + intval = spdk_conf_section_get_intval(sp, "AdminPollRate"); + if (intval > 0) { + g_opts.nvme_adminq_poll_period_us = intval; + } + + intval = spdk_conf_section_get_intval(sp, "IOPollRate"); + if (intval > 0) { + g_opts.nvme_ioq_poll_period_us = intval; + } + + if (spdk_process_is_primary()) { + hotplug_enabled = spdk_conf_section_get_boolval(sp, "HotplugEnable", false); + } + + hotplug_period = spdk_conf_section_get_intval(sp, "HotplugPollRate"); + if (hotplug_period < 0) { + hotplug_period = 0; + } + + g_nvme_hostnqn = spdk_conf_section_get_val(sp, "HostNQN"); + probe_ctx->hostnqn = g_nvme_hostnqn; + + g_opts.delay_cmd_submit = spdk_conf_section_get_boolval(sp, "DelayCmdSubmit", + SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT); + + for (i = 0; i < NVME_MAX_CONTROLLERS; i++) { + val = spdk_conf_section_get_nmval(sp, "TransportID", i, 0); + if (val == NULL) { + break; + } + + rc = spdk_nvme_transport_id_parse(&probe_ctx->trids[i], val); + if (rc < 0) { + SPDK_ERRLOG("Unable to parse TransportID: %s\n", val); + rc = -1; + goto end; + } + + rc = spdk_nvme_host_id_parse(&probe_ctx->hostids[i], val); + if (rc < 0) { + SPDK_ERRLOG("Unable to parse HostID: %s\n", val); + rc = -1; + goto end; + } + + val = spdk_conf_section_get_nmval(sp, "TransportID", i, 1); + if (val == NULL) 
{ + SPDK_ERRLOG("No name provided for TransportID\n"); + rc = -1; + goto end; + } + + probe_ctx->names[i] = val; + + val = spdk_conf_section_get_nmval(sp, "TransportID", i, 2); + if (val != NULL) { + rc = spdk_nvme_prchk_flags_parse(&probe_ctx->prchk_flags[i], val); + if (rc < 0) { + SPDK_ERRLOG("Unable to parse prchk: %s\n", val); + rc = -1; + goto end; + } + } + + probe_ctx->count++; + + if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ctrlr_opts opts; + + if (nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) { + SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", + probe_ctx->trids[i].traddr); + rc = -1; + goto end; + } + + if (probe_ctx->trids[i].subnqn[0] == '\0') { + SPDK_ERRLOG("Need to provide subsystem nqn\n"); + rc = -1; + goto end; + } + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts)); + opts.transport_retry_count = g_opts.retry_count; + + if (probe_ctx->hostnqn != NULL) { + snprintf(opts.hostnqn, sizeof(opts.hostnqn), "%s", probe_ctx->hostnqn); + } + + if (probe_ctx->hostids[i].hostaddr[0] != '\0') { + snprintf(opts.src_addr, sizeof(opts.src_addr), "%s", probe_ctx->hostids[i].hostaddr); + } + + if (probe_ctx->hostids[i].hostsvcid[0] != '\0') { + snprintf(opts.src_svcid, sizeof(opts.src_svcid), "%s", probe_ctx->hostids[i].hostsvcid); + } + + ctrlr = spdk_nvme_connect(&probe_ctx->trids[i], &opts, sizeof(opts)); + if (ctrlr == NULL) { + SPDK_ERRLOG("Unable to connect to provided trid (traddr: %s)\n", + probe_ctx->trids[i].traddr); + rc = -1; + goto end; + } + + rc = create_ctrlr(ctrlr, probe_ctx->names[i], &probe_ctx->trids[i], 0); + if (rc) { + goto end; + } + + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&probe_ctx->trids[i]); + if (!nvme_bdev_ctrlr) { + SPDK_ERRLOG("Failed to find new NVMe controller\n"); + rc = -ENODEV; + goto end; + } + + nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); + } else { + local_nvme_num++; + } + } + + if (local_nvme_num > 0) { + /* used to probe local NVMe device */ + if (spdk_nvme_probe(NULL, probe_ctx, probe_cb, attach_cb, remove_cb)) { + rc = -1; + goto end; + } + + for (i = 0; i < probe_ctx->count; i++) { + if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) { + continue; + } + + if (!nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) { + SPDK_ERRLOG("NVMe SSD \"%s\" could not be found.\n", probe_ctx->trids[i].traddr); + SPDK_ERRLOG("Check PCIe BDF and that it is attached to UIO/VFIO driver.\n"); + } + } + } + + rc = bdev_nvme_set_hotplug(hotplug_enabled, hotplug_period, NULL, NULL); + if (rc) { + SPDK_ERRLOG("Failed to setup hotplug (%d): %s", rc, spdk_strerror(rc)); + rc = -1; + } +end: + free(probe_ctx); + return rc; +} + +static void +bdev_nvme_library_fini(void) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp; + struct nvme_probe_skip_entry *entry, *entry_tmp; + struct nvme_bdev_ns *ns; + uint32_t i; + + spdk_poller_unregister(&g_hotplug_poller); + free(g_hotplug_probe_ctx); + + TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { + TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); + free(entry); + } + + pthread_mutex_lock(&g_bdev_nvme_mutex); + TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) { + if (nvme_bdev_ctrlr->destruct) { + /* This controller's destruction was already started + * before the application started shutting down + */ + continue; + } + + pthread_mutex_unlock(&g_bdev_nvme_mutex); + + for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { + uint32_t nsid = i + 1; + + ns = 
nvme_bdev_ctrlr->namespaces[nsid - 1]; + if (ns->populated) { + assert(ns->id == nsid); + nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns); + } + } + + pthread_mutex_lock(&g_bdev_nvme_mutex); + nvme_bdev_ctrlr->destruct = true; + + if (nvme_bdev_ctrlr->ref == 0) { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); + pthread_mutex_lock(&g_bdev_nvme_mutex); + } + } + + g_bdev_nvme_module_finish = true; + if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); + spdk_bdev_module_finish_done(); + return; + } + + pthread_mutex_unlock(&g_bdev_nvme_mutex); +} + +static void +bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_dif_ctx dif_ctx; + struct spdk_dif_error err_blk = {}; + int rc; + + rc = spdk_dif_ctx_init(&dif_ctx, + bdev->blocklen, bdev->md_len, bdev->md_interleave, + bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, + bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); + if (rc != 0) { + SPDK_ERRLOG("Initialization of DIF context failed\n"); + return; + } + + if (bdev->md_interleave) { + rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); + } else { + struct iovec md_iov = { + .iov_base = bdev_io->u.bdev.md_buf, + .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, + }; + + rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); + } + + if (rc != 0) { + SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", + err_blk.err_type, err_blk.err_offset); + } else { + SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); + } +} + +static void +bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_bdev_io *bio = ref; + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + + if (spdk_nvme_cpl_is_success(cpl)) { + /* Run PI verification for read data buffer. */ + bdev_nvme_verify_pi_error(bdev_io); + } + + /* Return original completion status */ + spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, + bio->cpl.status.sc); +} + +static void +bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_bdev_io *bio = ref; + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + int ret; + + if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { + SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", + cpl->status.sct, cpl->status.sc); + + /* Save completion status to use after verifying PI error. */ + bio->cpl = *cpl; + + /* Read without PI checking to verify PI error. */ + ret = bdev_nvme_no_pi_readv((struct nvme_bdev *)bdev_io->bdev->ctxt, + spdk_bdev_io_get_io_channel(bdev_io), + bio, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.md_buf, + bdev_io->u.bdev.num_blocks, + bdev_io->u.bdev.offset_blocks); + if (ret == 0) { + return; + } + } + + spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); +} + +static void +bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); + + if (spdk_nvme_cpl_is_pi_error(cpl)) { + SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", + cpl->status.sct, cpl->status.sc); + /* Run PI verification for write data buffer if PI error is detected. 
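+	 * The helper re-runs DIF/DIX verification over the request's own buffers and logs the first failing block.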
*/ + bdev_nvme_verify_pi_error(bdev_io); + } + + spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); +} + +static void +bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); + + if (spdk_nvme_cpl_is_pi_error(cpl)) { + SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", + cpl->status.sct, cpl->status.sc); + /* Run PI verification for compare data buffer if PI error is detected. */ + bdev_nvme_verify_pi_error(bdev_io); + } + + spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); +} + +static void +bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_bdev_io *bio = ref; + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + + /* Compare operation completion */ + if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { + /* Save compare result for write callback */ + bio->cpl = *cpl; + return; + } + + /* Write operation completion */ + if (spdk_nvme_cpl_is_error(&bio->cpl)) { + /* If bio->cpl is already an error, it means the compare operation failed. In that case, + * complete the IO with the compare operation's status. + */ + if (!spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("Unexpected write success after compare failure.\n"); + } + + spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); + } else { + spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); + } +} + +static void +bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); + + spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); +} + +static void +bdev_nvme_admin_passthru_completion(void *ctx) +{ + struct nvme_bdev_io *bio = ctx; + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + + spdk_bdev_io_complete_nvme_status(bdev_io, + bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); +} + +static void +bdev_nvme_abort_completion(void *ctx) +{ + struct nvme_bdev_io *bio = ctx; + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + + if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_bdev_io *bio = ref; + + bio->cpl = *cpl; + spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); +} + +static void +bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_bdev_io *bio = ref; + + bio->cpl = *cpl; + spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); +} + +static void +bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) +{ + struct nvme_bdev_io *bio = ref; + struct iovec *iov; + + bio->iov_offset = sgl_offset; + for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { + iov = &bio->iovs[bio->iovpos]; + if (bio->iov_offset < iov->iov_len) { + break; + } + + bio->iov_offset -= iov->iov_len; + } +} + +static int +bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) +{ + struct nvme_bdev_io *bio = ref; + struct iovec *iov; + + assert(bio->iovpos < bio->iovcnt); + + iov = &bio->iovs[bio->iovpos]; + + *address = iov->iov_base; + *length = 
iov->iov_len; + + if (bio->iov_offset) { + assert(bio->iov_offset <= iov->iov_len); + *address += bio->iov_offset; + *length -= bio->iov_offset; + } + + bio->iov_offset += *length; + if (bio->iov_offset == iov->iov_len) { + bio->iovpos++; + bio->iov_offset = 0; + } + + return 0; +} + +static void +bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) +{ + struct nvme_bdev_io *bio = ref; + struct iovec *iov; + + bio->fused_iov_offset = sgl_offset; + for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { + iov = &bio->fused_iovs[bio->fused_iovpos]; + if (bio->fused_iov_offset < iov->iov_len) { + break; + } + + bio->fused_iov_offset -= iov->iov_len; + } +} + +static int +bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) +{ + struct nvme_bdev_io *bio = ref; + struct iovec *iov; + + assert(bio->fused_iovpos < bio->fused_iovcnt); + + iov = &bio->fused_iovs[bio->fused_iovpos]; + + *address = iov->iov_base; + *length = iov->iov_len; + + if (bio->fused_iov_offset) { + assert(bio->fused_iov_offset <= iov->iov_len); + *address += bio->fused_iov_offset; + *length -= bio->fused_iov_offset; + } + + bio->fused_iov_offset += *length; + if (bio->fused_iov_offset == iov->iov_len) { + bio->fused_iovpos++; + bio->fused_iov_offset = 0; + } + + return 0; +} + +static int +bdev_nvme_no_pi_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, + void *md, uint64_t lba_count, uint64_t lba) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %lu blocks with offset %#lx without PI check\n", + lba_count, lba); + + bio->iovs = iov; + bio->iovcnt = iovcnt; + bio->iovpos = 0; + bio->iov_offset = 0; + + rc = spdk_nvme_ns_cmd_readv_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count, + bdev_nvme_no_pi_readv_done, bio, 0, + bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, + md, 0, 0); + + if (rc != 0 && rc != -ENOMEM) { + SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); + } + return rc; +} + +static int +bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, + void *md, uint64_t lba_count, uint64_t lba) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %lu blocks with offset %#lx\n", + lba_count, lba); + + bio->iovs = iov; + bio->iovcnt = iovcnt; + bio->iovpos = 0; + bio->iov_offset = 0; + + rc = spdk_nvme_ns_cmd_readv_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count, + bdev_nvme_readv_done, bio, nbdev->disk.dif_check_flags, + bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, + md, 0, 0); + + if (rc != 0 && rc != -ENOMEM) { + SPDK_ERRLOG("readv failed: rc = %d\n", rc); + } + return rc; +} + +static int +bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "write %lu blocks with offset %#lx\n", + lba_count, lba); + + bio->iovs = iov; + bio->iovcnt = iovcnt; + bio->iovpos = 0; + bio->iov_offset = 0; + + rc = spdk_nvme_ns_cmd_writev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count, + bdev_nvme_writev_done, bio, nbdev->disk.dif_check_flags, + bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, + md, 0, 0); + 
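+	/* -ENOMEM is deliberately not logged: it indicates a full qpair, which the
+	 * bdev layer treats as a transient condition and retries.
+	 */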
+ if (rc != 0 && rc != -ENOMEM) { + SPDK_ERRLOG("writev failed: rc = %d\n", rc); + } + return rc; +} + +static int +bdev_nvme_comparev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "compare %lu blocks with offset %#lx\n", + lba_count, lba); + + bio->iovs = iov; + bio->iovcnt = iovcnt; + bio->iovpos = 0; + bio->iov_offset = 0; + + rc = spdk_nvme_ns_cmd_comparev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count, + bdev_nvme_comparev_done, bio, nbdev->disk.dif_check_flags, + bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, + md, 0, 0); + + if (rc != 0 && rc != -ENOMEM) { + SPDK_ERRLOG("comparev failed: rc = %d\n", rc); + } + return rc; +} + +static int +bdev_nvme_comparev_and_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, + int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + uint32_t flags = nbdev->disk.dif_check_flags; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "compare and write %lu blocks with offset %#lx\n", + lba_count, lba); + + bio->iovs = cmp_iov; + bio->iovcnt = cmp_iovcnt; + bio->iovpos = 0; + bio->iov_offset = 0; + bio->fused_iovs = write_iov; + bio->fused_iovcnt = write_iovcnt; + bio->fused_iovpos = 0; + bio->fused_iov_offset = 0; + + if (bdev_io->num_retries == 0) { + bio->first_fused_submitted = false; + } + + if (!bio->first_fused_submitted) { + flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; + memset(&bio->cpl, 0, sizeof(bio->cpl)); + + rc = spdk_nvme_ns_cmd_comparev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count, + bdev_nvme_comparev_and_writev_done, bio, flags, + bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); + if (rc == 0) { + bio->first_fused_submitted = true; + flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; + } else { + if (rc != -ENOMEM) { + SPDK_ERRLOG("compare failed: rc = %d\n", rc); + } + return rc; + } + } + + flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; + + rc = spdk_nvme_ns_cmd_writev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count, + bdev_nvme_comparev_and_writev_done, bio, flags, + bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); + if (rc != 0 && rc != -ENOMEM) { + SPDK_ERRLOG("write failed: rc = %d\n", rc); + rc = 0; + } + + return rc; +} + +static int +bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + uint64_t offset_blocks, + uint64_t num_blocks) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; + struct spdk_nvme_dsm_range *range; + uint64_t offset, remaining; + uint64_t num_ranges_u64; + uint16_t num_ranges; + int rc; + + num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; + if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { + SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); + return -EINVAL; + } + num_ranges = (uint16_t)num_ranges_u64; + + offset = offset_blocks; + remaining = num_blocks; + range = &dsm_ranges[0]; + + /* Fill max-size ranges until the remaining 
blocks fit into one range */ + while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { + range->attributes.raw = 0; + range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; + range->starting_lba = offset; + + offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; + remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; + range++; + } + + /* Final range describes the remaining blocks */ + range->attributes.raw = 0; + range->length = remaining; + range->starting_lba = offset; + + rc = spdk_nvme_ns_cmd_dataset_management(nbdev->nvme_ns->ns, nvme_ch->qpair, + SPDK_NVME_DSM_ATTR_DEALLOCATE, + dsm_ranges, num_ranges, + bdev_nvme_queued_done, bio); + + return rc; +} + +static int +bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) +{ + uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr); + + if (nbytes > max_xfer_size) { + SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); + return -EINVAL; + } + + bio->orig_thread = spdk_io_channel_get_thread(ch); + + return spdk_nvme_ctrlr_cmd_admin_raw(nbdev->nvme_bdev_ctrlr->ctrlr, cmd, buf, + (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); +} + +static int +bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr); + + if (nbytes > max_xfer_size) { + SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); + return -EINVAL; + } + + /* + * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, + * so fill it out automatically. + */ + cmd->nsid = spdk_nvme_ns_get_id(nbdev->nvme_ns->ns); + + return spdk_nvme_ctrlr_cmd_io_raw(nbdev->nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf, + (uint32_t)nbytes, bdev_nvme_queued_done, bio); +} + +static int +bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(nbdev->nvme_ns->ns); + uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr); + + if (nbytes > max_xfer_size) { + SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); + return -EINVAL; + } + + if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(nbdev->nvme_ns->ns)) { + SPDK_ERRLOG("invalid meta data buffer size\n"); + return -EINVAL; + } + + /* + * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, + * so fill it out automatically. 
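+	 * Any nsid the caller placed in the passthru command is overwritten here.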
+ */ + cmd->nsid = spdk_nvme_ns_get_id(nbdev->nvme_ns->ns); + + return spdk_nvme_ctrlr_cmd_io_raw_with_md(nbdev->nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf, + (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); +} + +static void +bdev_nvme_abort_admin_cmd(void *ctx) +{ + struct nvme_bdev_io *bio = ctx; + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + struct nvme_bdev *nbdev; + struct nvme_bdev_io *bio_to_abort; + int rc; + + nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; + bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; + + rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_bdev_ctrlr->ctrlr, + NULL, + bio_to_abort, + bdev_nvme_abort_done, bio); + if (rc == -ENOENT) { + /* If no admin command was found in admin qpair, complete the abort + * request with failure. + */ + bio->cpl.cdw0 |= 1U; + bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; + bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); + } +} + +static int +bdev_nvme_abort(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + int rc; + + bio->orig_thread = spdk_io_channel_get_thread(ch); + + rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_bdev_ctrlr->ctrlr, + nvme_ch->qpair, + bio_to_abort, + bdev_nvme_abort_done, bio); + if (rc == -ENOENT) { + /* If no command was found in I/O qpair, the target command may be + * admin command. Only a single thread tries aborting admin command + * to clean I/O flow. + */ + spdk_thread_send_msg(nbdev->nvme_bdev_ctrlr->thread, + bdev_nvme_abort_admin_cmd, bio); + rc = 0; + } + + return rc; +} + +static void +bdev_nvme_get_spdk_running_config(FILE *fp) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + + fprintf(fp, "\n[Nvme]"); + fprintf(fp, "\n" + "# NVMe Device Whitelist\n" + "# Users may specify which NVMe devices to claim by their transport id.\n" + "# See spdk_nvme_transport_id_parse() in spdk/nvme.h for the correct format.\n" + "# The second argument is the assigned name, which can be referenced from\n" + "# other sections in the configuration file. 
For NVMe devices, a namespace\n"
+		"# is automatically appended to each name in the format <YourName>nY, where\n"
+		"# Y is the NSID (starts at 1).\n");
+
+	TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
+		const char *trtype;
+		const char *prchk_flags;
+
+		trtype = spdk_nvme_transport_id_trtype_str(nvme_bdev_ctrlr->trid->trtype);
+		if (!trtype) {
+			continue;
+		}
+
+		if (nvme_bdev_ctrlr->trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
+			fprintf(fp, "TransportID \"trtype:%s traddr:%s\" %s\n",
+				trtype,
+				nvme_bdev_ctrlr->trid->traddr, nvme_bdev_ctrlr->name);
+		} else {
+			const char *adrfam;
+
+			adrfam = spdk_nvme_transport_id_adrfam_str(nvme_bdev_ctrlr->trid->adrfam);
+			prchk_flags = spdk_nvme_prchk_flags_str(nvme_bdev_ctrlr->prchk_flags);
+
+			if (adrfam) {
+				fprintf(fp, "TransportID \"trtype:%s adrfam:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
+					trtype, adrfam,
+					nvme_bdev_ctrlr->trid->traddr, nvme_bdev_ctrlr->trid->trsvcid,
+					nvme_bdev_ctrlr->trid->subnqn, nvme_bdev_ctrlr->name);
+			} else {
+				fprintf(fp, "TransportID \"trtype:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
+					trtype,
+					nvme_bdev_ctrlr->trid->traddr, nvme_bdev_ctrlr->trid->trsvcid,
+					nvme_bdev_ctrlr->trid->subnqn, nvme_bdev_ctrlr->name);
+			}
+
+			if (prchk_flags) {
+				fprintf(fp, " \"%s\"\n", prchk_flags);
+			} else {
+				fprintf(fp, "\n");
+			}
+		}
+	}
+
+	fprintf(fp, "\n"
+		"# The number of attempts per I/O when an I/O fails. Do not include\n"
+		"# this key to get the default behavior.\n");
+	fprintf(fp, "RetryCount %d\n", g_opts.retry_count);
+	fprintf(fp, "\n"
+		"# Timeout for each command, in microseconds. If 0, don't track timeouts.\n");
+	fprintf(fp, "TimeoutUsec %"PRIu64"\n", g_opts.timeout_us);
+
+	fprintf(fp, "\n"
+		"# Action to take on command time out. Only valid when Timeout is greater\n"
+		"# than 0. This may be 'Reset' to reset the controller, 'Abort' to abort\n"
+		"# the command, or 'None' to just print a message but do nothing.\n"
+		"# Admin command timeouts will always result in a reset.\n");
+	switch (g_opts.action_on_timeout) {
+	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
+		fprintf(fp, "ActionOnTimeout None\n");
+		break;
+	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
+		fprintf(fp, "ActionOnTimeout Reset\n");
+		break;
+	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
+		fprintf(fp, "ActionOnTimeout Abort\n");
+		break;
+	}
+
+	fprintf(fp, "\n"
+		"# Set how often the admin queue is polled for asynchronous events.\n"
+		"# Units in microseconds.\n");
+	fprintf(fp, "AdminPollRate %"PRIu64"\n", g_opts.nvme_adminq_poll_period_us);
+	fprintf(fp, "IOPollRate %" PRIu64"\n", g_opts.nvme_ioq_poll_period_us);
+	fprintf(fp, "\n"
+		"# Disable handling of hotplug (runtime insert and remove) events,\n"
+		"# users can set to Yes if they want to enable it.\n"
+		"# Default: No\n");
+	fprintf(fp, "HotplugEnable %s\n", g_nvme_hotplug_enabled ? "Yes" : "No");
+	fprintf(fp, "\n"
+		"# Set how often the hotplug is processed for insert and remove events.\n"
+		"# Units in microseconds.\n");
+	fprintf(fp, "HotplugPollRate %"PRIu64"\n", g_nvme_hotplug_poll_period_us);
+	if (g_nvme_hostnqn) {
+		fprintf(fp, "HostNQN %s\n", g_nvme_hostnqn);
+	}
+	fprintf(fp, "DelayCmdSubmit %s\n", g_opts.delay_cmd_submit ?
"True" : "False"); + + fprintf(fp, "\n"); +} + +static void +nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns) +{ + /* nop */ +} + +static void +nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns) +{ + g_config_json_namespace_fn[ns->type](w, ns); +} + +static int +bdev_nvme_config_json(struct spdk_json_write_ctx *w) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct spdk_nvme_transport_id *trid; + const char *action; + uint32_t nsid; + + if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { + action = "reset"; + } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { + action = "abort"; + } else { + action = "none"; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "action_on_timeout", action); + spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); + spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); + spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); + spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); + spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); + spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); + spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); + spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); + spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); + spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + pthread_mutex_lock(&g_bdev_nvme_mutex); + TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { + trid = nvme_bdev_ctrlr->trid; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); + nvme_bdev_dump_trid_json(trid, w); + spdk_json_write_named_bool(w, "prchk_reftag", + (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); + spdk_json_write_named_bool(w, "prchk_guard", + (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { + if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) { + continue; + } + + nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]); + } + } + + /* Dump as last parameter to give all NVMe bdevs chance to be constructed + * before enabling hotplug poller. 
+ */ + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); + spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + pthread_mutex_unlock(&g_bdev_nvme_mutex); + return 0; +} + +struct spdk_nvme_ctrlr * +bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) +{ + if (!bdev || bdev->module != &nvme_if) { + return NULL; + } + + return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_bdev_ctrlr->ctrlr; +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_nvme", SPDK_LOG_BDEV_NVME) diff --git a/src/spdk/module/bdev/nvme/bdev_nvme.h b/src/spdk/module/bdev/nvme/bdev_nvme.h new file mode 100644 index 000000000..417c21cad --- /dev/null +++ b/src/spdk/module/bdev/nvme/bdev_nvme.h @@ -0,0 +1,90 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_BDEV_NVME_H +#define SPDK_BDEV_NVME_H + +#include "spdk/stdinc.h" + +#include "spdk/queue.h" +#include "spdk/nvme.h" +#include "spdk/bdev_module.h" + +#include "common.h" + +enum spdk_bdev_timeout_action { + SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE = 0, + SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET, + SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT, +}; + +struct spdk_bdev_nvme_opts { + enum spdk_bdev_timeout_action action_on_timeout; + uint64_t timeout_us; + uint32_t retry_count; + uint32_t arbitration_burst; + uint32_t low_priority_weight; + uint32_t medium_priority_weight; + uint32_t high_priority_weight; + uint64_t nvme_adminq_poll_period_us; + uint64_t nvme_ioq_poll_period_us; + uint32_t io_queue_requests; + bool delay_cmd_submit; +}; + +struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); +void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts); +int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts); +int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx); + +int bdev_nvme_create(struct spdk_nvme_transport_id *trid, + struct spdk_nvme_host_id *hostid, + const char *base_name, + const char **names, + uint32_t count, + const char *hostnqn, + uint32_t prchk_flags, + spdk_bdev_create_nvme_fn cb_fn, + void *cb_ctx); +struct spdk_nvme_ctrlr *bdev_nvme_get_ctrlr(struct spdk_bdev *bdev); + +/** + * Delete NVMe controller with all bdevs on top of it. + * Requires to pass name of NVMe controller. + * + * \param name NVMe controller name + * \return zero on success, -EINVAL on wrong parameters or -ENODEV if controller is not found + */ +int bdev_nvme_delete(const char *name); + +#endif /* SPDK_BDEV_NVME_H */ diff --git a/src/spdk/module/bdev/nvme/bdev_nvme_cuse_rpc.c b/src/spdk/module/bdev/nvme/bdev_nvme_cuse_rpc.c new file mode 100644 index 000000000..c116c510d --- /dev/null +++ b/src/spdk/module/bdev/nvme/bdev_nvme_cuse_rpc.c @@ -0,0 +1,152 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "bdev_nvme.h" + +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/nvme.h" + +#include "spdk_internal/log.h" + +struct rpc_nvme_cuse_register { + char *name; +}; + +static void +free_rpc_nvme_cuse_register(struct rpc_nvme_cuse_register *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_nvme_cuse_register_decoders[] = { + {"name", offsetof(struct rpc_nvme_cuse_register, name), spdk_json_decode_string}, +}; + +static void +rpc_nvme_cuse_register(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_nvme_cuse_register req = {}; + struct spdk_json_write_ctx *w; + struct nvme_bdev_ctrlr *bdev_ctrlr = NULL; + int rc; + + if (spdk_json_decode_object(params, rpc_nvme_cuse_register_decoders, + SPDK_COUNTOF(rpc_nvme_cuse_register_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(req.name); + if (!bdev_ctrlr) { + SPDK_ERRLOG("No such controller\n"); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + rc = spdk_nvme_cuse_register(bdev_ctrlr->ctrlr); + if (rc) { + SPDK_ERRLOG("Failed to register CUSE devices: %s\n", spdk_strerror(-rc)); + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_nvme_cuse_register(&req); +} +SPDK_RPC_REGISTER("bdev_nvme_cuse_register", rpc_nvme_cuse_register, SPDK_RPC_RUNTIME) + +struct rpc_nvme_cuse_unregister { + char *name; +}; + +static void +free_rpc_nvme_cuse_unregister(struct rpc_nvme_cuse_unregister *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_nvme_cuse_unregister_decoders[] = { + {"name", offsetof(struct rpc_nvme_cuse_unregister, name), spdk_json_decode_string, true}, +}; + +static void +rpc_nvme_cuse_unregister(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_nvme_cuse_unregister req = {}; + struct spdk_json_write_ctx *w; + struct nvme_bdev_ctrlr *bdev_ctrlr = NULL; + int rc; + + if (spdk_json_decode_object(params, rpc_nvme_cuse_unregister_decoders, + SPDK_COUNTOF(rpc_nvme_cuse_unregister_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(req.name); + if (!bdev_ctrlr) { + SPDK_ERRLOG("No such controller\n"); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + rc = spdk_nvme_cuse_unregister(bdev_ctrlr->ctrlr); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_nvme_cuse_unregister(&req); +} +SPDK_RPC_REGISTER("bdev_nvme_cuse_unregister", rpc_nvme_cuse_unregister, SPDK_RPC_RUNTIME) diff --git a/src/spdk/module/bdev/nvme/bdev_nvme_rpc.c b/src/spdk/module/bdev/nvme/bdev_nvme_rpc.c new file mode 100644 index 000000000..299da4023 --- /dev/null +++ b/src/spdk/module/bdev/nvme/bdev_nvme_rpc.c 
@@ -0,0 +1,842 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "bdev_nvme.h" +#include "common.h" + +#include "spdk/config.h" + +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" +#include "spdk/bdev_module.h" + +struct open_descriptors { + void *desc; + struct spdk_bdev *bdev; + TAILQ_ENTRY(open_descriptors) tqlst; + struct spdk_thread *thread; +}; +typedef TAILQ_HEAD(, open_descriptors) open_descriptors_t; + +static int +rpc_decode_action_on_timeout(const struct spdk_json_val *val, void *out) +{ + enum spdk_bdev_timeout_action *action = out; + + if (spdk_json_strequal(val, "none") == true) { + *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE; + } else if (spdk_json_strequal(val, "abort") == true) { + *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT; + } else if (spdk_json_strequal(val, "reset") == true) { + *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET; + } else { + SPDK_NOTICELOG("Invalid parameter value: action_on_timeout\n"); + return -EINVAL; + } + + return 0; +} + +static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] = { + {"action_on_timeout", offsetof(struct spdk_bdev_nvme_opts, action_on_timeout), rpc_decode_action_on_timeout, true}, + {"timeout_us", offsetof(struct spdk_bdev_nvme_opts, timeout_us), spdk_json_decode_uint64, true}, + {"retry_count", offsetof(struct spdk_bdev_nvme_opts, retry_count), spdk_json_decode_uint32, true}, + {"arbitration_burst", offsetof(struct spdk_bdev_nvme_opts, arbitration_burst), spdk_json_decode_uint32, true}, + {"low_priority_weight", offsetof(struct spdk_bdev_nvme_opts, low_priority_weight), spdk_json_decode_uint32, true}, + {"medium_priority_weight", offsetof(struct spdk_bdev_nvme_opts, medium_priority_weight), spdk_json_decode_uint32, true}, + {"high_priority_weight", offsetof(struct spdk_bdev_nvme_opts, high_priority_weight), 
spdk_json_decode_uint32, true}, + {"nvme_adminq_poll_period_us", offsetof(struct spdk_bdev_nvme_opts, nvme_adminq_poll_period_us), spdk_json_decode_uint64, true}, + {"nvme_ioq_poll_period_us", offsetof(struct spdk_bdev_nvme_opts, nvme_ioq_poll_period_us), spdk_json_decode_uint64, true}, + {"io_queue_requests", offsetof(struct spdk_bdev_nvme_opts, io_queue_requests), spdk_json_decode_uint32, true}, + {"delay_cmd_submit", offsetof(struct spdk_bdev_nvme_opts, delay_cmd_submit), spdk_json_decode_bool, true}, +}; + +static void +rpc_bdev_nvme_set_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_bdev_nvme_opts opts; + struct spdk_json_write_ctx *w; + int rc; + + bdev_nvme_get_opts(&opts); + if (params && spdk_json_decode_object(params, rpc_bdev_nvme_options_decoders, + SPDK_COUNTOF(rpc_bdev_nvme_options_decoders), + &opts)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + return; + } + + rc = bdev_nvme_set_opts(&opts); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + + return; +} +SPDK_RPC_REGISTER("bdev_nvme_set_options", rpc_bdev_nvme_set_options, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_set_options, set_bdev_nvme_options) + +struct rpc_bdev_nvme_hotplug { + bool enabled; + uint64_t period_us; +}; + +static const struct spdk_json_object_decoder rpc_bdev_nvme_hotplug_decoders[] = { + {"enable", offsetof(struct rpc_bdev_nvme_hotplug, enabled), spdk_json_decode_bool, false}, + {"period_us", offsetof(struct rpc_bdev_nvme_hotplug, period_us), spdk_json_decode_uint64, true}, +}; + +static void +rpc_bdev_nvme_set_hotplug_done(void *ctx) +{ + struct spdk_jsonrpc_request *request = ctx; + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_nvme_set_hotplug(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_nvme_hotplug req = {false, 0}; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_nvme_hotplug_decoders, + SPDK_COUNTOF(rpc_bdev_nvme_hotplug_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = bdev_nvme_set_hotplug(req.enabled, req.period_us, rpc_bdev_nvme_set_hotplug_done, + request); + if (rc) { + goto invalid; + } + + return; +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("bdev_nvme_set_hotplug", rpc_bdev_nvme_set_hotplug, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_set_hotplug, set_bdev_nvme_hotplug) + +struct rpc_bdev_nvme_attach_controller { + char *name; + char *trtype; + char *adrfam; + char *traddr; + char *trsvcid; + char *priority; + char *subnqn; + char *hostnqn; + char *hostaddr; + char *hostsvcid; + bool prchk_reftag; + bool prchk_guard; +}; + +static void +free_rpc_bdev_nvme_attach_controller(struct rpc_bdev_nvme_attach_controller *req) +{ + free(req->name); + free(req->trtype); + free(req->adrfam); + free(req->traddr); + free(req->trsvcid); + free(req->priority); + free(req->subnqn); + free(req->hostnqn); + free(req->hostaddr); + free(req->hostsvcid); +} + +static const 
struct spdk_json_object_decoder rpc_bdev_nvme_attach_controller_decoders[] = { + {"name", offsetof(struct rpc_bdev_nvme_attach_controller, name), spdk_json_decode_string}, + {"trtype", offsetof(struct rpc_bdev_nvme_attach_controller, trtype), spdk_json_decode_string}, + {"traddr", offsetof(struct rpc_bdev_nvme_attach_controller, traddr), spdk_json_decode_string}, + + {"adrfam", offsetof(struct rpc_bdev_nvme_attach_controller, adrfam), spdk_json_decode_string, true}, + {"trsvcid", offsetof(struct rpc_bdev_nvme_attach_controller, trsvcid), spdk_json_decode_string, true}, + {"priority", offsetof(struct rpc_bdev_nvme_attach_controller, priority), spdk_json_decode_string, true}, + {"subnqn", offsetof(struct rpc_bdev_nvme_attach_controller, subnqn), spdk_json_decode_string, true}, + {"hostnqn", offsetof(struct rpc_bdev_nvme_attach_controller, hostnqn), spdk_json_decode_string, true}, + {"hostaddr", offsetof(struct rpc_bdev_nvme_attach_controller, hostaddr), spdk_json_decode_string, true}, + {"hostsvcid", offsetof(struct rpc_bdev_nvme_attach_controller, hostsvcid), spdk_json_decode_string, true}, + + {"prchk_reftag", offsetof(struct rpc_bdev_nvme_attach_controller, prchk_reftag), spdk_json_decode_bool, true}, + {"prchk_guard", offsetof(struct rpc_bdev_nvme_attach_controller, prchk_guard), spdk_json_decode_bool, true} +}; + +#define NVME_MAX_BDEVS_PER_RPC 128 + +struct rpc_bdev_nvme_attach_controller_ctx { + struct rpc_bdev_nvme_attach_controller req; + uint32_t count; + const char *names[NVME_MAX_BDEVS_PER_RPC]; + struct spdk_jsonrpc_request *request; +}; + +static void +rpc_bdev_nvme_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) +{ + struct rpc_bdev_nvme_attach_controller_ctx *ctx = cb_ctx; + struct spdk_jsonrpc_request *request = ctx->request; + struct spdk_json_write_ctx *w; + size_t i; + + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto exit; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + for (i = 0; i < bdev_count; i++) { + spdk_json_write_string(w, ctx->names[i]); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + +exit: + free_rpc_bdev_nvme_attach_controller(&ctx->req); + free(ctx); +} + +static void +rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_nvme_attach_controller_ctx *ctx; + struct spdk_nvme_transport_id trid = {}; + struct spdk_nvme_host_id hostid = {}; + uint32_t prchk_flags = 0; + int rc; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); + return; + } + + if (spdk_json_decode_object(params, rpc_bdev_nvme_attach_controller_decoders, + SPDK_COUNTOF(rpc_bdev_nvme_attach_controller_decoders), + &ctx->req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + /* Parse trstring */ + rc = spdk_nvme_transport_id_populate_trstring(&trid, ctx->req.trtype); + if (rc < 0) { + SPDK_ERRLOG("Failed to parse trtype: %s\n", ctx->req.trtype); + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s", + ctx->req.trtype); + goto cleanup; + } + + /* Parse trtype */ + rc = spdk_nvme_transport_id_parse_trtype(&trid.trtype, ctx->req.trtype); + assert(rc == 0); + + /* Parse traddr */ + snprintf(trid.traddr, sizeof(trid.traddr), 
"%s", ctx->req.traddr); + + /* Parse adrfam */ + if (ctx->req.adrfam) { + rc = spdk_nvme_transport_id_parse_adrfam(&trid.adrfam, ctx->req.adrfam); + if (rc < 0) { + SPDK_ERRLOG("Failed to parse adrfam: %s\n", ctx->req.adrfam); + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse adrfam: %s", + ctx->req.adrfam); + goto cleanup; + } + } + + /* Parse trsvcid */ + if (ctx->req.trsvcid) { + snprintf(trid.trsvcid, sizeof(trid.trsvcid), "%s", ctx->req.trsvcid); + } + + /* Parse priority for the NVMe-oF transport connection */ + if (ctx->req.priority) { + trid.priority = spdk_strtol(ctx->req.priority, 10); + } + + /* Parse subnqn */ + if (ctx->req.subnqn) { + snprintf(trid.subnqn, sizeof(trid.subnqn), "%s", ctx->req.subnqn); + } + + if (ctx->req.hostaddr) { + snprintf(hostid.hostaddr, sizeof(hostid.hostaddr), "%s", ctx->req.hostaddr); + } + + if (ctx->req.hostsvcid) { + snprintf(hostid.hostsvcid, sizeof(hostid.hostsvcid), "%s", ctx->req.hostsvcid); + } + + if (ctx->req.prchk_reftag) { + prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG; + } + + if (ctx->req.prchk_guard) { + prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD; + } + + ctx->request = request; + ctx->count = NVME_MAX_BDEVS_PER_RPC; + rc = bdev_nvme_create(&trid, &hostid, ctx->req.name, ctx->names, ctx->count, ctx->req.hostnqn, + prchk_flags, rpc_bdev_nvme_attach_controller_done, ctx); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + return; + +cleanup: + free_rpc_bdev_nvme_attach_controller(&ctx->req); + free(ctx); +} +SPDK_RPC_REGISTER("bdev_nvme_attach_controller", rpc_bdev_nvme_attach_controller, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_attach_controller, construct_nvme_bdev) + +static void +rpc_dump_nvme_controller_info(struct spdk_json_write_ctx *w, + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) +{ + struct spdk_nvme_transport_id *trid; + + trid = nvme_bdev_ctrlr->trid; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); + +#ifdef SPDK_CONFIG_NVME_CUSE + size_t cuse_name_size = 128; + char cuse_name[cuse_name_size]; + + int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_bdev_ctrlr->ctrlr, cuse_name, &cuse_name_size); + if (rc == 0) { + spdk_json_write_named_string(w, "cuse_device", cuse_name); + } +#endif + + spdk_json_write_named_object_begin(w, "trid"); + nvme_bdev_dump_trid_json(trid, w); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +struct rpc_bdev_nvme_get_controllers { + char *name; +}; + +static void +free_rpc_bdev_nvme_get_controllers(struct rpc_bdev_nvme_get_controllers *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_nvme_get_controllers_decoders[] = { + {"name", offsetof(struct rpc_bdev_nvme_get_controllers, name), spdk_json_decode_string, true}, +}; + +static void +rpc_bdev_nvme_get_controllers(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_nvme_get_controllers req = {}; + struct spdk_json_write_ctx *w; + struct nvme_bdev_ctrlr *ctrlr = NULL; + + if (params && spdk_json_decode_object(params, rpc_bdev_nvme_get_controllers_decoders, + SPDK_COUNTOF(rpc_bdev_nvme_get_controllers_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + if (req.name) { + ctrlr = nvme_bdev_ctrlr_get_by_name(req.name); + if (ctrlr == NULL) { + 
SPDK_ERRLOG("ctrlr '%s' does not exist\n", req.name); + spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Controller %s does not exist", req.name); + goto cleanup; + } + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + if (ctrlr != NULL) { + rpc_dump_nvme_controller_info(w, ctrlr); + } else { + for (ctrlr = nvme_bdev_first_ctrlr(); ctrlr; ctrlr = nvme_bdev_next_ctrlr(ctrlr)) { + rpc_dump_nvme_controller_info(w, ctrlr); + } + } + + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_nvme_get_controllers(&req); +} +SPDK_RPC_REGISTER("bdev_nvme_get_controllers", rpc_bdev_nvme_get_controllers, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_get_controllers, get_nvme_controllers) + +struct rpc_bdev_nvme_detach_controller { + char *name; +}; + +static void +free_rpc_bdev_nvme_detach_controller(struct rpc_bdev_nvme_detach_controller *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_nvme_detach_controller_decoders[] = { + {"name", offsetof(struct rpc_bdev_nvme_detach_controller, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_nvme_detach_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_nvme_detach_controller req = {NULL}; + struct spdk_json_write_ctx *w; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_bdev_nvme_detach_controller_decoders, + SPDK_COUNTOF(rpc_bdev_nvme_detach_controller_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = bdev_nvme_delete(req.name); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_nvme_detach_controller(&req); +} +SPDK_RPC_REGISTER("bdev_nvme_detach_controller", rpc_bdev_nvme_detach_controller, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_detach_controller, delete_nvme_controller) + +struct rpc_apply_firmware { + char *filename; + char *bdev_name; +}; + +static void +free_rpc_apply_firmware(struct rpc_apply_firmware *req) +{ + free(req->filename); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder rpc_apply_firmware_decoders[] = { + {"filename", offsetof(struct rpc_apply_firmware, filename), spdk_json_decode_string}, + {"bdev_name", offsetof(struct rpc_apply_firmware, bdev_name), spdk_json_decode_string}, +}; + +struct firmware_update_info { + void *fw_image; + void *p; + unsigned int size; + unsigned int size_remaining; + unsigned int offset; + unsigned int transfer; + + void *desc; + struct spdk_io_channel *ch; + struct spdk_jsonrpc_request *request; + struct spdk_nvme_ctrlr *ctrlr; + open_descriptors_t desc_head; + struct rpc_apply_firmware *req; +}; + +static void +_apply_firmware_cleanup(void *ctx) +{ + struct spdk_bdev_desc *desc = ctx; + + spdk_bdev_close(desc); +} + +static void +apply_firmware_cleanup(void *cb_arg) +{ + struct open_descriptors *opt, *tmp; + struct firmware_update_info *firm_ctx = cb_arg; + + if (!firm_ctx) { + return; + } + + if (firm_ctx->fw_image) { + spdk_free(firm_ctx->fw_image); + } + + if (firm_ctx->req) { + free_rpc_apply_firmware(firm_ctx->req); + free(firm_ctx->req); + } + + if (firm_ctx->ch) { + spdk_put_io_channel(firm_ctx->ch); + } + + 
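+	/* Close every descriptor opened for this firmware update, hopping back to
+	 * the thread that opened each one when required, then free the context.
+	 */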
	TAILQ_FOREACH_SAFE(opt, &firm_ctx->desc_head, tqlst, tmp) {
+		TAILQ_REMOVE(&firm_ctx->desc_head, opt, tqlst);
+		/* Close the underlying bdev on its same opened thread. */
+		if (opt->thread && opt->thread != spdk_get_thread()) {
+			spdk_thread_send_msg(opt->thread, _apply_firmware_cleanup, opt->desc);
+		} else {
+			spdk_bdev_close(opt->desc);
+		}
+		free(opt);
+	}
+	free(firm_ctx);
+}
+
+static void
+apply_firmware_complete_reset(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	int rc;
+	struct spdk_json_write_ctx *w;
+	struct firmware_update_info *firm_ctx = cb_arg;
+
+	spdk_bdev_free_io(bdev_io);
+
+	if (!success) {
+		spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "firmware commit failed.");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+
+	if ((rc = spdk_nvme_ctrlr_reset(firm_ctx->ctrlr)) != 0) {
+		spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Controller reset failed.");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+
+	w = spdk_jsonrpc_begin_result(firm_ctx->request);
+	spdk_json_write_string(w, "firmware commit succeeded. Controller reset in progress.");
+	spdk_jsonrpc_end_result(firm_ctx->request, w);
+	apply_firmware_cleanup(firm_ctx);
+}
+
+static void
+apply_firmware_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct spdk_nvme_cmd cmd = {};
+	struct spdk_nvme_fw_commit fw_commit;
+	int slot = 0;
+	int rc;
+	struct firmware_update_info *firm_ctx = cb_arg;
+	enum spdk_nvme_fw_commit_action commit_action = SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG;
+
+	if (!success) {
+		spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "firmware download failed.");
+		spdk_bdev_free_io(bdev_io);
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+
+	firm_ctx->p += firm_ctx->transfer;
+	firm_ctx->offset += firm_ctx->transfer;
+	firm_ctx->size_remaining -= firm_ctx->transfer;
+
+	switch (firm_ctx->size_remaining) {
+	case 0:
+		/* firmware download completed. Commit firmware */
+		memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit));
+		fw_commit.fs = slot;
+		fw_commit.ca = commit_action;
+
+		cmd.opc = SPDK_NVME_OPC_FIRMWARE_COMMIT;
+		memcpy(&cmd.cdw10, &fw_commit, sizeof(uint32_t));
+		rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, NULL, 0,
+						   apply_firmware_complete_reset, firm_ctx);
+		if (rc) {
+			spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+							 "firmware commit failed.");
+			spdk_bdev_free_io(bdev_io);
+			apply_firmware_cleanup(firm_ctx);
+			return;
+		}
+		break;
+	default:
+		firm_ctx->transfer = spdk_min(firm_ctx->size_remaining, 4096);
+		cmd.opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD;
+
+		cmd.cdw10 = (firm_ctx->transfer >> 2) - 1;
+		cmd.cdw11 = firm_ctx->offset >> 2;
+		rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, firm_ctx->p,
+						   firm_ctx->transfer, apply_firmware_complete, firm_ctx);
+		if (rc) {
+			spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+							 "firmware download failed.");
+			spdk_bdev_free_io(bdev_io);
+			apply_firmware_cleanup(firm_ctx);
+			return;
+		}
+		break;
+	}
+}
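+/* The NVMe Firmware Image Download command (opcode 0x11) encodes a zero-based
+ * dword count in cdw10 (NUMD) and a dword offset in cdw11 (OFST), which is
+ * what the (transfer >> 2) - 1 and offset >> 2 conversions above compute. As
+ * a worked example, the second 4096-byte piece of an image is sent with
+ * cdw10 = 1023 and cdw11 = 1024.
+ */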
+static void
+rpc_bdev_nvme_apply_firmware(struct spdk_jsonrpc_request *request,
+			     const struct spdk_json_val *params)
+{
+	int rc;
+	int fd = -1;
+	struct stat fw_stat;
+	struct spdk_nvme_ctrlr *ctrlr;
+	char msg[1024];
+	struct spdk_bdev *bdev;
+	struct spdk_bdev *bdev2;
+	struct open_descriptors *opt;
+	struct spdk_bdev_desc *desc;
+	struct spdk_nvme_cmd *cmd;
+	struct firmware_update_info *firm_ctx;
+
+	firm_ctx = calloc(1, sizeof(struct firmware_update_info));
+	if (!firm_ctx) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Memory allocation error.");
+		return;
+	}
+	firm_ctx->fw_image = NULL;
+	TAILQ_INIT(&firm_ctx->desc_head);
+	firm_ctx->request = request;
+
+	firm_ctx->req = calloc(1, sizeof(struct rpc_apply_firmware));
+	if (!firm_ctx->req) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Memory allocation error.");
+		free(firm_ctx);
+		return;
+	}
+
+	if (spdk_json_decode_object(params, rpc_apply_firmware_decoders,
+				    SPDK_COUNTOF(rpc_apply_firmware_decoders), firm_ctx->req)) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "spdk_json_decode_object failed.");
+		free(firm_ctx->req);
+		free(firm_ctx);
+		return;
+	}
+
+	if ((bdev = spdk_bdev_get_by_name(firm_ctx->req->bdev_name)) == NULL) {
+		snprintf(msg, sizeof(msg), "bdev %s was not found", firm_ctx->req->bdev_name);
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg);
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+
+	if ((ctrlr = bdev_nvme_get_ctrlr(bdev)) == NULL) {
+		snprintf(msg, sizeof(msg), "Controller information for %s was not found.",
+			 firm_ctx->req->bdev_name);
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg);
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+	firm_ctx->ctrlr = ctrlr;
+
+	for (bdev2 = spdk_bdev_first(); bdev2; bdev2 = spdk_bdev_next(bdev2)) {
+
+		if (bdev_nvme_get_ctrlr(bdev2) != ctrlr) {
+			continue;
+		}
+
+		if (!(opt = malloc(sizeof(struct open_descriptors)))) {
+			snprintf(msg, sizeof(msg), "Memory allocation error.");
+			spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg);
+			apply_firmware_cleanup(firm_ctx);
+			return;
+		}
+
+		if ((rc = spdk_bdev_open(bdev2, true, NULL, NULL, &desc)) != 0) {
+			snprintf(msg, sizeof(msg), "Device %s is in use.", spdk_bdev_get_name(bdev2));
+			spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg);
+			free(opt);
+			apply_firmware_cleanup(firm_ctx);
+			return;
+		}
+
+		/* Save the thread where the base device is opened */
+		opt->thread = spdk_get_thread();
+
+		opt->desc = desc;
+		opt->bdev = bdev2;
+		TAILQ_INSERT_TAIL(&firm_ctx->desc_head, opt, tqlst);
+	}
+
+	/*
+	 * find a descriptor associated with our bdev
+	 */
+	firm_ctx->desc = NULL;
+	TAILQ_FOREACH(opt, &firm_ctx->desc_head, tqlst) {
+		if (opt->bdev == bdev) {
+			firm_ctx->desc = opt->desc;
+			break;
+		}
+	}
+
+	if (!firm_ctx->desc) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "No descriptor was found.");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+
+	firm_ctx->ch = spdk_bdev_get_io_channel(firm_ctx->desc);
+	if (!firm_ctx->ch) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "No channels were found.");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+
+	fd = open(firm_ctx->req->filename, O_RDONLY);
+	if (fd < 0) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "open file failed.");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+
+	rc = fstat(fd, &fw_stat);
+	if (rc < 0) {
+		close(fd);
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "fstat failed.");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+
+	firm_ctx->size = fw_stat.st_size;
+	if (fw_stat.st_size % 4) {
+		close(fd);
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Firmware image size is not a multiple of 4.");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+
+	firm_ctx->fw_image = spdk_zmalloc(firm_ctx->size, 4096, NULL,
+					  SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+	if (!firm_ctx->fw_image) {
+		close(fd);
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Memory allocation error.");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+	firm_ctx->p = firm_ctx->fw_image;
+
+	if (read(fd, firm_ctx->p, firm_ctx->size) != ((ssize_t)(firm_ctx->size))) {
+		close(fd);
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Read firmware image failed!");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+	close(fd);
+
+	firm_ctx->offset = 0;
+	firm_ctx->size_remaining = firm_ctx->size;
+	firm_ctx->transfer = spdk_min(firm_ctx->size_remaining, 4096);
+
+	cmd = malloc(sizeof(struct spdk_nvme_cmd));
+	if (!cmd) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Memory allocation error.");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+	memset(cmd, 0, sizeof(struct spdk_nvme_cmd));
+	cmd->opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD;
+
+	cmd->cdw10 = (firm_ctx->transfer >> 2) - 1;
+	cmd->cdw11 = firm_ctx->offset >> 2;
+
+	rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, cmd, firm_ctx->p,
+					   firm_ctx->transfer, apply_firmware_complete, firm_ctx);
+	if (rc) {
+		free(cmd);
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Firmware image download failed.");
+		apply_firmware_cleanup(firm_ctx);
+		return;
+	}
+}
+SPDK_RPC_REGISTER("bdev_nvme_apply_firmware", rpc_bdev_nvme_apply_firmware, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_apply_firmware, apply_nvme_firmware)
diff --git a/src/spdk/module/bdev/nvme/bdev_ocssd.c b/src/spdk/module/bdev/nvme/bdev_ocssd.c
new file mode 100644
index 000000000..35f665f40
--- /dev/null
+++
b/src/spdk/module/bdev/nvme/bdev_ocssd.c @@ -0,0 +1,1498 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/bdev_module.h" +#include "spdk/bdev_zone.h" +#include "spdk/likely.h" +#include "spdk/log.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/nvme_ocssd.h" +#include "spdk/nvme_ocssd_spec.h" +#include "spdk_internal/log.h" +#include "spdk/nvme.h" +#include "common.h" +#include "bdev_ocssd.h" + +struct bdev_ocssd_lba_offsets { + uint32_t grp; + uint32_t pu; + uint32_t chk; + uint32_t lbk; +}; + +struct bdev_ocssd_zone { + uint64_t slba; + uint64_t write_pointer; + uint64_t capacity; + bool busy; +}; + +struct bdev_ocssd_io { + union { + struct { + struct bdev_ocssd_zone *zone; + size_t iov_pos; + size_t iov_off; + uint64_t lba[SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES]; + } io; + struct { + size_t chunk_offset; + struct spdk_ocssd_chunk_information_entry chunk_info; + } zone_info; + }; +}; + +struct ocssd_io_channel { + struct spdk_poller *pending_poller; + TAILQ_HEAD(, spdk_bdev_io) pending_requests; +}; + +struct ocssd_bdev { + struct nvme_bdev nvme_bdev; + struct bdev_ocssd_zone *zones; + struct bdev_ocssd_range range; +}; + +struct bdev_ocssd_ns { + struct spdk_ocssd_geometry_data geometry; + struct bdev_ocssd_lba_offsets lba_offsets; + bool chunk_notify_pending; + uint64_t chunk_notify_count; + uint64_t num_outstanding; +#define CHUNK_NOTIFICATION_ENTRY_COUNT 64 + struct spdk_ocssd_chunk_notification_entry chunk[CHUNK_NOTIFICATION_ENTRY_COUNT]; +}; + +struct ocssd_bdev_ctrlr { + struct spdk_poller *mm_poller; +}; + +static struct bdev_ocssd_ns * +bdev_ocssd_get_ns_from_nvme(struct nvme_bdev_ns *nvme_ns) +{ + return nvme_ns->type_ctx; +} + +static struct bdev_ocssd_ns * +bdev_ocssd_get_ns_from_bdev(struct ocssd_bdev *ocssd_bdev) +{ + return bdev_ocssd_get_ns_from_nvme(ocssd_bdev->nvme_bdev.nvme_ns); +} + +static uint64_t +bdev_ocssd_num_parallel_units(const struct ocssd_bdev *ocssd_bdev) +{ + 
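+	/* The parallel unit range is inclusive on both ends, so e.g. a "0-7"
+	 * range spans eight parallel units.
+	 */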
return ocssd_bdev->range.end - ocssd_bdev->range.begin + 1; +} + +static uint64_t +bdev_ocssd_num_zones(const struct ocssd_bdev *ocssd_bdev) +{ + return ocssd_bdev->nvme_bdev.disk.blockcnt / ocssd_bdev->nvme_bdev.disk.zone_size; +} + +static int +bdev_ocssd_library_init(void) +{ + return 0; +} + +static void +bdev_ocssd_library_fini(void) +{ +} + +static int +bdev_ocssd_config_json(struct spdk_json_write_ctx *w) +{ + return 0; +} + +void +bdev_ocssd_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct nvme_bdev *nvme_bdev; + struct ocssd_bdev *ocssd_bdev; + char range_buf[128]; + int rc; + + TAILQ_FOREACH(nvme_bdev, &ns->bdevs, tailq) { + nvme_bdev_ctrlr = nvme_bdev->nvme_bdev_ctrlr; + ocssd_bdev = SPDK_CONTAINEROF(nvme_bdev, struct ocssd_bdev, nvme_bdev); + + rc = snprintf(range_buf, sizeof(range_buf), "%"PRIu64"-%"PRIu64, + ocssd_bdev->range.begin, ocssd_bdev->range.end); + if (rc < 0 || rc >= (int)sizeof(range_buf)) { + SPDK_ERRLOG("Failed to convert parallel unit range\n"); + continue; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_ocssd_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr_name", nvme_bdev_ctrlr->name); + spdk_json_write_named_string(w, "bdev_name", nvme_bdev->disk.name); + spdk_json_write_named_uint32(w, "nsid", nvme_bdev->nvme_ns->id); + spdk_json_write_named_string(w, "range", range_buf); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } +} + +static int +bdev_ocssd_get_ctx_size(void) +{ + return sizeof(struct bdev_ocssd_io); +} + +static struct spdk_bdev_module ocssd_if = { + .name = "ocssd", + .module_init = bdev_ocssd_library_init, + .module_fini = bdev_ocssd_library_fini, + .config_json = bdev_ocssd_config_json, + .get_ctx_size = bdev_ocssd_get_ctx_size, +}; + +SPDK_BDEV_MODULE_REGISTER(ocssd, &ocssd_if); + +static struct bdev_ocssd_zone * +bdev_ocssd_get_zone_by_lba(struct ocssd_bdev *ocssd_bdev, uint64_t lba) +{ + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + size_t zone_size = nvme_bdev->disk.zone_size; + + if (lba >= nvme_bdev->disk.blockcnt) { + return NULL; + } + + return &ocssd_bdev->zones[lba / zone_size]; +} + +static struct bdev_ocssd_zone * +bdev_ocssd_get_zone_by_slba(struct ocssd_bdev *ocssd_bdev, uint64_t slba) +{ + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + + if (slba % nvme_bdev->disk.zone_size != 0) { + return NULL; + } + + return bdev_ocssd_get_zone_by_lba(ocssd_bdev, slba); +} + +static void +bdev_ocssd_free_bdev(struct ocssd_bdev *ocssd_bdev) +{ + if (!ocssd_bdev) { + return; + } + + free(ocssd_bdev->zones); + free(ocssd_bdev->nvme_bdev.disk.name); + free(ocssd_bdev); +} + +static int +bdev_ocssd_destruct(void *ctx) +{ + struct ocssd_bdev *ocssd_bdev = ctx; + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + + nvme_bdev_detach_bdev_from_ns(nvme_bdev); + bdev_ocssd_free_bdev(ocssd_bdev); + + return 0; +} + +static void +bdev_ocssd_translate_lba(struct ocssd_bdev *ocssd_bdev, uint64_t lba, uint64_t *grp, + uint64_t *pu, uint64_t *chk, uint64_t *lbk) +{ + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev); + const struct spdk_ocssd_geometry_data *geo = &ocssd_ns->geometry; + const struct bdev_ocssd_range *range = &ocssd_bdev->range; + uint64_t addr_shift, punit; + + /* To achieve best performance, we need to make sure that adjacent zones can be accessed + * in parallel. 
We accomplish this by having the following addressing scheme: + * + * [ zone id ][ zone offset ] User's LBA + * [ chunk ][ group ][ parallel unit ][ logical block ] Open Channel's LBA + * + * which means that neighbouring zones are placed in a different group and parallel unit. + */ + *lbk = lba % geo->clba; + addr_shift = geo->clba; + + punit = range->begin + (lba / addr_shift) % bdev_ocssd_num_parallel_units(ocssd_bdev); + + *pu = punit % geo->num_pu; + *grp = punit / geo->num_pu; + + addr_shift *= bdev_ocssd_num_parallel_units(ocssd_bdev); + + *chk = (lba / addr_shift) % geo->num_chk; +} + +static uint64_t +bdev_ocssd_from_disk_lba(struct ocssd_bdev *ocssd_bdev, uint64_t lba) +{ + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev); + const struct spdk_ocssd_geometry_data *geometry = &ocssd_ns->geometry; + const struct bdev_ocssd_lba_offsets *offsets = &ocssd_ns->lba_offsets; + const struct bdev_ocssd_range *range = &ocssd_bdev->range; + uint64_t lbk, chk, pu, grp, punit; + + lbk = (lba >> offsets->lbk) & ((1 << geometry->lbaf.lbk_len) - 1); + chk = (lba >> offsets->chk) & ((1 << geometry->lbaf.chk_len) - 1); + pu = (lba >> offsets->pu) & ((1 << geometry->lbaf.pu_len) - 1); + grp = (lba >> offsets->grp) & ((1 << geometry->lbaf.grp_len) - 1); + + punit = grp * geometry->num_pu + pu - range->begin; + + return lbk + punit * geometry->clba + chk * geometry->clba * + bdev_ocssd_num_parallel_units(ocssd_bdev); +} + +static uint64_t +bdev_ocssd_to_disk_lba(struct ocssd_bdev *ocssd_bdev, uint64_t lba) +{ + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev); + const struct bdev_ocssd_lba_offsets *offsets = &ocssd_ns->lba_offsets; + uint64_t lbk, chk, pu, grp; + + bdev_ocssd_translate_lba(ocssd_bdev, lba, &grp, &pu, &chk, &lbk); + + return (lbk << offsets->lbk) | + (chk << offsets->chk) | + (pu << offsets->pu) | + (grp << offsets->grp); +} + +static bool +bdev_ocssd_lba_in_range(struct ocssd_bdev *ocssd_bdev, uint64_t lba) +{ + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev); + const struct spdk_ocssd_geometry_data *geometry = &ocssd_ns->geometry; + const struct bdev_ocssd_lba_offsets *offsets = &ocssd_ns->lba_offsets; + const struct bdev_ocssd_range *range = &ocssd_bdev->range; + uint64_t pu, grp, punit; + + pu = (lba >> offsets->pu) & ((1 << geometry->lbaf.pu_len) - 1); + grp = (lba >> offsets->grp) & ((1 << geometry->lbaf.grp_len) - 1); + punit = grp * geometry->num_pu + pu; + + return punit >= range->begin && punit <= range->end; +} + +static void +bdev_ocssd_reset_sgl(void *cb_arg, uint32_t offset) +{ + struct spdk_bdev_io *bdev_io = cb_arg; + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + struct iovec *iov; + + ocdev_io->io.iov_pos = 0; + ocdev_io->io.iov_off = 0; + + for (; ocdev_io->io.iov_pos < (size_t)bdev_io->u.bdev.iovcnt; ++ocdev_io->io.iov_pos) { + iov = &bdev_io->u.bdev.iovs[ocdev_io->io.iov_pos]; + if (offset < iov->iov_len) { + ocdev_io->io.iov_off = offset; + return; + } + + offset -= iov->iov_len; + } + + assert(false && "Invalid offset length"); +} + +static int +bdev_ocssd_next_sge(void *cb_arg, void **address, uint32_t *length) +{ + struct spdk_bdev_io *bdev_io = cb_arg; + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + struct iovec *iov; + + assert(ocdev_io->io.iov_pos < (size_t)bdev_io->u.bdev.iovcnt); + iov = &bdev_io->u.bdev.iovs[ocdev_io->io.iov_pos]; + + *address = iov->iov_base; + *length = iov->iov_len; + + if (ocdev_io->io.iov_off != 
0) { + assert(ocdev_io->io.iov_off < iov->iov_len); + *address = (char *)*address + ocdev_io->io.iov_off; + *length -= ocdev_io->io.iov_off; + } + + assert(ocdev_io->io.iov_off + *length == iov->iov_len); + ocdev_io->io.iov_off = 0; + ocdev_io->io.iov_pos++; + + return 0; +} + +static void +bdev_ocssd_read_cb(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_bdev_io *bdev_io = ctx; + + spdk_bdev_io_complete_nvme_status(bdev_io, 0, cpl->status.sct, cpl->status.sc); +} + +static int +bdev_ocssd_read(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io) +{ + struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt; + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch); + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + const size_t zone_size = nvme_bdev->disk.zone_size; + uint64_t lba; + + if ((bdev_io->u.bdev.offset_blocks % zone_size) + bdev_io->u.bdev.num_blocks > zone_size) { + SPDK_ERRLOG("Tried to cross zone boundary during read command\n"); + return -EINVAL; + } + + ocdev_io->io.iov_pos = 0; + ocdev_io->io.iov_off = 0; + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev_io->u.bdev.offset_blocks); + + return spdk_nvme_ns_cmd_readv_with_md(nvme_bdev->nvme_ns->ns, nvme_ioch->qpair, lba, + bdev_io->u.bdev.num_blocks, bdev_ocssd_read_cb, + bdev_io, 0, bdev_ocssd_reset_sgl, + bdev_ocssd_next_sge, bdev_io->u.bdev.md_buf, 0, 0); +} + +static void +bdev_ocssd_write_cb(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_bdev_io *bdev_io = ctx; + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) { + bdev_io->u.bdev.offset_blocks = ocdev_io->io.zone->write_pointer; + } + + ocdev_io->io.zone->write_pointer = bdev_io->u.bdev.offset_blocks + + bdev_io->u.bdev.num_blocks; + assert(ocdev_io->io.zone->write_pointer <= ocdev_io->io.zone->slba + + ocdev_io->io.zone->capacity); + + __atomic_store_n(&ocdev_io->io.zone->busy, false, __ATOMIC_SEQ_CST); + spdk_bdev_io_complete_nvme_status(bdev_io, 0, cpl->status.sct, cpl->status.sc); +} + +static int +bdev_ocssd_write(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io) +{ + struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt; + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch); + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + const size_t zone_size = nvme_bdev->disk.zone_size; + uint64_t lba; + int rc; + + if ((bdev_io->u.bdev.offset_blocks % zone_size) + bdev_io->u.bdev.num_blocks > zone_size) { + SPDK_ERRLOG("Tried to cross zone boundary during write command\n"); + return -EINVAL; + } + + ocdev_io->io.zone = bdev_ocssd_get_zone_by_lba(ocssd_bdev, bdev_io->u.bdev.offset_blocks); + if (__atomic_exchange_n(&ocdev_io->io.zone->busy, true, __ATOMIC_SEQ_CST)) { + return -EINVAL; + } + + ocdev_io->io.iov_pos = 0; + ocdev_io->io.iov_off = 0; + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev_io->u.bdev.offset_blocks); + rc = spdk_nvme_ns_cmd_writev_with_md(nvme_bdev->nvme_ns->ns, nvme_ioch->qpair, lba, + bdev_io->u.bdev.num_blocks, bdev_ocssd_write_cb, + bdev_io, 0, bdev_ocssd_reset_sgl, + bdev_ocssd_next_sge, bdev_io->u.bdev.md_buf, 0, 0); + if (spdk_unlikely(rc != 0)) { + __atomic_store_n(&ocdev_io->io.zone->busy, false, __ATOMIC_SEQ_CST); + } + + return rc; +} + +static int +bdev_ocssd_zone_append(struct spdk_io_channel *ioch, struct spdk_bdev_io 
*bdev_io) +{ + struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt; + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch); + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + struct bdev_ocssd_zone *zone; + uint64_t lba; + int rc = 0; + + zone = bdev_ocssd_get_zone_by_slba(ocssd_bdev, bdev_io->u.bdev.offset_blocks); + if (!zone) { + SPDK_ERRLOG("Invalid zone SLBA: %"PRIu64"\n", bdev_io->u.bdev.offset_blocks); + return -EINVAL; + } + + if (__atomic_exchange_n(&zone->busy, true, __ATOMIC_SEQ_CST)) { + return -EAGAIN; + } + + if (zone->slba + zone->capacity - zone->write_pointer < bdev_io->u.bdev.num_blocks) { + SPDK_ERRLOG("Insufficient number of blocks remaining\n"); + rc = -ENOSPC; + goto out; + } + + ocdev_io->io.zone = zone; + ocdev_io->io.iov_pos = 0; + ocdev_io->io.iov_off = 0; + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, zone->write_pointer); + rc = spdk_nvme_ns_cmd_writev_with_md(nvme_bdev->nvme_ns->ns, nvme_ioch->qpair, lba, + bdev_io->u.bdev.num_blocks, bdev_ocssd_write_cb, + bdev_io, 0, bdev_ocssd_reset_sgl, + bdev_ocssd_next_sge, bdev_io->u.bdev.md_buf, 0, 0); +out: + if (spdk_unlikely(rc != 0)) { + __atomic_store_n(&zone->busy, false, __ATOMIC_SEQ_CST); + } + + return rc; +} + +static void +bdev_ocssd_io_get_buf_cb(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io, bool success) +{ + int rc; + + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + return; + } + + rc = bdev_ocssd_read(ioch, bdev_io); + if (spdk_likely(rc != 0)) { + if (rc == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +static void +bdev_ocssd_reset_zone_cb(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_bdev_io *bdev_io = ctx; + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + + ocdev_io->io.zone->write_pointer = ocdev_io->io.zone->slba; + __atomic_store_n(&ocdev_io->io.zone->busy, false, __ATOMIC_SEQ_CST); + spdk_bdev_io_complete_nvme_status(bdev_io, 0, cpl->status.sct, cpl->status.sc); +} + +static int +bdev_ocssd_reset_zone(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io, + uint64_t slba, size_t num_zones) +{ + struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt; + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch); + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + uint64_t offset, zone_size = nvme_bdev->disk.zone_size; + int rc; + + if (num_zones > 1) { + SPDK_ERRLOG("Exceeded maximum number of zones per single reset: 1\n"); + return -EINVAL; + } + + ocdev_io->io.zone = bdev_ocssd_get_zone_by_slba(ocssd_bdev, slba); + if (__atomic_exchange_n(&ocdev_io->io.zone->busy, true, __ATOMIC_SEQ_CST)) { + return -EINVAL; + } + + for (offset = 0; offset < num_zones; ++offset) { + ocdev_io->io.lba[offset] = bdev_ocssd_to_disk_lba(ocssd_bdev, + slba + offset * zone_size); + } + + rc = spdk_nvme_ocssd_ns_cmd_vector_reset(nvme_bdev->nvme_ns->ns, nvme_ioch->qpair, + ocdev_io->io.lba, num_zones, NULL, + bdev_ocssd_reset_zone_cb, bdev_io); + if (spdk_unlikely(rc != 0)) { + __atomic_store_n(&ocdev_io->io.zone->busy, false, __ATOMIC_SEQ_CST); + } + + return rc; +} + +static int _bdev_ocssd_get_zone_info(struct spdk_bdev_io *bdev_io); + +static void +bdev_ocssd_fill_zone_info(struct ocssd_bdev *ocssd_bdev, struct 
spdk_bdev_zone_info *zone_info, + const struct spdk_ocssd_chunk_information_entry *chunk_info) +{ + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + + zone_info->zone_id = bdev_ocssd_from_disk_lba(ocssd_bdev, chunk_info->slba); + zone_info->write_pointer = zone_info->zone_id; + + if (chunk_info->cs.free) { + zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY; + } else if (chunk_info->cs.closed) { + zone_info->state = SPDK_BDEV_ZONE_STATE_FULL; + } else if (chunk_info->cs.open) { + zone_info->state = SPDK_BDEV_ZONE_STATE_OPEN; + zone_info->write_pointer += chunk_info->wp % nvme_bdev->disk.zone_size; + } else if (chunk_info->cs.offline) { + zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; + } else { + SPDK_ERRLOG("Unknown chunk state, assuming offline\n"); + zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; + } + + if (chunk_info->ct.size_deviate) { + zone_info->capacity = chunk_info->cnlb; + } else { + zone_info->capacity = nvme_bdev->disk.zone_size; + } +} + +static void +bdev_ocssd_zone_info_cb(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_bdev_io *bdev_io = ctx; + struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt; + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + struct spdk_ocssd_chunk_information_entry *chunk_info = &ocdev_io->zone_info.chunk_info; + struct spdk_bdev_zone_info *zone_info; + int rc; + + if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { + spdk_bdev_io_complete_nvme_status(bdev_io, 0, cpl->status.sct, cpl->status.sc); + return; + } + + zone_info = ((struct spdk_bdev_zone_info *)bdev_io->u.zone_mgmt.buf) + + ocdev_io->zone_info.chunk_offset; + bdev_ocssd_fill_zone_info(ocssd_bdev, zone_info, chunk_info); + + if (++ocdev_io->zone_info.chunk_offset == bdev_io->u.zone_mgmt.num_zones) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + rc = _bdev_ocssd_get_zone_info(bdev_io); + if (spdk_unlikely(rc != 0)) { + if (rc == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } + } +} + +static int +_bdev_ocssd_get_zone_info(struct spdk_bdev_io *bdev_io) +{ + struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt; + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev); + const struct spdk_ocssd_geometry_data *geo = &ocssd_ns->geometry; + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + uint64_t lba, grp, pu, chk, lbk, offset; + + lba = bdev_io->u.zone_mgmt.zone_id + ocdev_io->zone_info.chunk_offset * + nvme_bdev->disk.zone_size; + bdev_ocssd_translate_lba(ocssd_bdev, lba, &grp, &pu, &chk, &lbk); + offset = grp * geo->num_pu * geo->num_chk + pu * geo->num_chk + chk; + + return spdk_nvme_ctrlr_cmd_get_log_page(nvme_bdev->nvme_bdev_ctrlr->ctrlr, + SPDK_OCSSD_LOG_CHUNK_INFO, + spdk_nvme_ns_get_id(nvme_bdev->nvme_ns->ns), + &ocdev_io->zone_info.chunk_info, + sizeof(ocdev_io->zone_info.chunk_info), + offset * sizeof(ocdev_io->zone_info.chunk_info), + bdev_ocssd_zone_info_cb, (void *)bdev_io); +} + +static int +bdev_ocssd_get_zone_info(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io) +{ + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + + if (bdev_io->u.zone_mgmt.num_zones < 1) { + SPDK_ERRLOG("Invalid number of zones: %"PRIu32"\n", bdev_io->u.zone_mgmt.num_zones); + return -EINVAL; + } + + if (bdev_io->u.zone_mgmt.zone_id % bdev_io->bdev->zone_size != 0) { + 
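+		/* Zone IDs are the starting LBA of a zone and must therefore be
+		 * a multiple of the zone size (0, zone_size, 2 * zone_size, ...).
+		 */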
SPDK_ERRLOG("Unaligned zone LBA: %"PRIu64"\n", bdev_io->u.zone_mgmt.zone_id); + return -EINVAL; + } + + ocdev_io->zone_info.chunk_offset = 0; + + return _bdev_ocssd_get_zone_info(bdev_io); +} + +static int +bdev_ocssd_zone_management(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->u.zone_mgmt.zone_action) { + case SPDK_BDEV_ZONE_RESET: + return bdev_ocssd_reset_zone(ioch, bdev_io, bdev_io->u.zone_mgmt.zone_id, + bdev_io->u.zone_mgmt.num_zones); + default: + return -EINVAL; + } +} + +static void bdev_ocssd_submit_request(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io); + +static int +bdev_ocssd_poll_pending(void *ctx) +{ + struct spdk_io_channel *ioch = ctx; + struct nvme_io_channel *nvme_ioch; + struct ocssd_io_channel *ocssd_ioch; + struct spdk_bdev_io *bdev_io; + TAILQ_HEAD(, spdk_bdev_io) pending_requests; + int num_requests = 0; + + nvme_ioch = spdk_io_channel_get_ctx(ioch); + ocssd_ioch = nvme_ioch->ocssd_ioch; + + TAILQ_INIT(&pending_requests); + TAILQ_SWAP(&ocssd_ioch->pending_requests, &pending_requests, spdk_bdev_io, module_link); + + while ((bdev_io = TAILQ_FIRST(&pending_requests))) { + TAILQ_REMOVE(&pending_requests, bdev_io, module_link); + bdev_ocssd_submit_request(ioch, bdev_io); + num_requests++; + } + + if (TAILQ_EMPTY(&ocssd_ioch->pending_requests)) { + spdk_poller_pause(ocssd_ioch->pending_poller); + } + + return num_requests; +} + +static void +bdev_ocssd_delay_request(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io) +{ + struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch); + struct ocssd_io_channel *ocssd_ioch = nvme_ioch->ocssd_ioch; + + TAILQ_INSERT_TAIL(&ocssd_ioch->pending_requests, bdev_io, module_link); + spdk_poller_resume(ocssd_ioch->pending_poller); +} + +static void +bdev_ocssd_submit_request(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io) +{ + int rc = 0; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_ocssd_io_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + + case SPDK_BDEV_IO_TYPE_WRITE: + rc = bdev_ocssd_write(ioch, bdev_io); + break; + + case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: + rc = bdev_ocssd_zone_management(ioch, bdev_io); + break; + + case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: + rc = bdev_ocssd_get_zone_info(ioch, bdev_io); + break; + + case SPDK_BDEV_IO_TYPE_ZONE_APPEND: + rc = bdev_ocssd_zone_append(ioch, bdev_io); + break; + + default: + rc = -EINVAL; + break; + } + + if (spdk_unlikely(rc != 0)) { + switch (rc) { + case -ENOMEM: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + break; + case -EAGAIN: + bdev_ocssd_delay_request(ioch, bdev_io); + break; + default: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + break; + } + } +} + +static bool +bdev_ocssd_io_type_supported(void *ctx, enum spdk_bdev_io_type type) +{ + switch (type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: + case SPDK_BDEV_IO_TYPE_ZONE_APPEND: + return true; + + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_ocssd_get_io_channel(void *ctx) +{ + struct ocssd_bdev *ocssd_bdev = ctx; + + return spdk_get_io_channel(ocssd_bdev->nvme_bdev.nvme_bdev_ctrlr); +} + +static void +bdev_ocssd_free_namespace(struct nvme_bdev_ns *nvme_ns) +{ + struct nvme_bdev *bdev, *tmp; + + TAILQ_FOREACH_SAFE(bdev, &nvme_ns->bdevs, tailq, tmp) { + spdk_bdev_unregister(&bdev->disk, NULL, NULL); + } + + free(nvme_ns->type_ctx); + 
nvme_ns->type_ctx = NULL; + + nvme_ctrlr_depopulate_namespace_done(nvme_ns->ctrlr); +} + +static void +bdev_ocssd_chunk_notification_cb(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_bdev_ns *nvme_ns = ctx; + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns); + struct spdk_bdev_media_event event; + struct spdk_ocssd_chunk_notification_entry *chunk_entry; + struct nvme_bdev *nvme_bdev; + struct ocssd_bdev *ocssd_bdev; + size_t chunk_id, num_blocks, lba; + int rc; + + ocssd_ns->num_outstanding--; + + /* The namespace could have been depopulated in the meantime */ + if (!nvme_ns->populated) { + if (ocssd_ns->num_outstanding == 0) { + bdev_ocssd_free_namespace(nvme_ns); + } + + return; + } + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("Failed to retrieve chunk notification log\n"); + return; + } + + for (chunk_id = 0; chunk_id < CHUNK_NOTIFICATION_ENTRY_COUNT; ++chunk_id) { + chunk_entry = &ocssd_ns->chunk[chunk_id]; + if (chunk_entry->nc <= ocssd_ns->chunk_notify_count) { + break; + } + + ocssd_ns->chunk_notify_count = chunk_entry->nc; + if (chunk_entry->mask.lblk) { + num_blocks = chunk_entry->nlb; + } else if (chunk_entry->mask.chunk) { + num_blocks = ocssd_ns->geometry.clba; + } else if (chunk_entry->mask.pu) { + num_blocks = ocssd_ns->geometry.clba * ocssd_ns->geometry.num_chk; + } else { + SPDK_WARNLOG("Invalid chunk notification mask\n"); + continue; + } + + TAILQ_FOREACH(nvme_bdev, &nvme_ns->bdevs, tailq) { + ocssd_bdev = SPDK_CONTAINEROF(nvme_bdev, struct ocssd_bdev, nvme_bdev); + if (bdev_ocssd_lba_in_range(ocssd_bdev, chunk_entry->lba)) { + break; + } + } + + if (nvme_bdev == NULL) { + SPDK_INFOLOG(SPDK_LOG_BDEV_OCSSD, "Dropping media management event\n"); + continue; + } + + lba = bdev_ocssd_from_disk_lba(ocssd_bdev, chunk_entry->lba); + while (num_blocks > 0 && lba < nvme_bdev->disk.blockcnt) { + event.offset = lba; + event.num_blocks = spdk_min(num_blocks, ocssd_ns->geometry.clba); + + rc = spdk_bdev_push_media_events(&nvme_bdev->disk, &event, 1); + if (spdk_unlikely(rc < 0)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_OCSSD, "Failed to push media event: %s\n", + spdk_strerror(-rc)); + break; + } + + /* Jump to the next chunk on the same parallel unit */ + lba += ocssd_ns->geometry.clba * bdev_ocssd_num_parallel_units(ocssd_bdev); + num_blocks -= event.num_blocks; + } + } + + /* If at least one notification has been processed send out media event */ + if (chunk_id > 0) { + TAILQ_FOREACH(nvme_bdev, &nvme_ns->bdevs, tailq) { + spdk_bdev_notify_media_management(&nvme_bdev->disk); + } + } + + /* If we filled the full array of events, there may be more still pending. Set the pending + * flag back to true so that we try to get more events again next time the poller runs. 
+ */ + if (chunk_id == CHUNK_NOTIFICATION_ENTRY_COUNT) { + ocssd_ns->chunk_notify_pending = true; + } +} + +static int +bdev_ocssd_poll_mm(void *ctx) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; + struct nvme_bdev_ns *nvme_ns; + struct bdev_ocssd_ns *ocssd_ns; + uint32_t nsid; + int rc; + + for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { + nvme_ns = nvme_bdev_ctrlr->namespaces[nsid]; + if (nvme_ns == NULL || !nvme_ns->populated) { + continue; + } + + ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns); + if (ocssd_ns->chunk_notify_pending) { + ocssd_ns->chunk_notify_pending = false; + ocssd_ns->num_outstanding++; + + rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_bdev_ctrlr->ctrlr, + SPDK_OCSSD_LOG_CHUNK_NOTIFICATION, + nsid + 1, ocssd_ns->chunk, + sizeof(ocssd_ns->chunk[0]) * + CHUNK_NOTIFICATION_ENTRY_COUNT, + 0, bdev_ocssd_chunk_notification_cb, + nvme_ns); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Failed to get chunk notification log page: %s\n", + spdk_strerror(-rc)); + ocssd_ns->num_outstanding--; + } + } + } + + return SPDK_POLLER_BUSY; +} + +void +bdev_ocssd_handle_chunk_notification(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) +{ + struct bdev_ocssd_ns *ocssd_ns; + struct nvme_bdev_ns *nvme_ns; + uint32_t nsid; + + for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { + nvme_ns = nvme_bdev_ctrlr->namespaces[nsid]; + if (nvme_ns == NULL || !nvme_ns->populated) { + continue; + } + + ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns); + ocssd_ns->chunk_notify_pending = true; + } +} + +static struct spdk_bdev_fn_table ocssdlib_fn_table = { + .destruct = bdev_ocssd_destruct, + .submit_request = bdev_ocssd_submit_request, + .io_type_supported = bdev_ocssd_io_type_supported, + .get_io_channel = bdev_ocssd_get_io_channel, +}; + +struct bdev_ocssd_create_ctx { + struct ocssd_bdev *ocssd_bdev; + bdev_ocssd_create_cb cb_fn; + void *cb_arg; + const struct bdev_ocssd_range *range; + uint64_t chunk_offset; + uint64_t end_chunk_offset; + uint64_t num_chunks; +#define OCSSD_BDEV_CHUNK_INFO_COUNT 128 + struct spdk_ocssd_chunk_information_entry chunk_info[OCSSD_BDEV_CHUNK_INFO_COUNT]; +}; + +static void +bdev_ocssd_create_complete(struct bdev_ocssd_create_ctx *create_ctx, int status) +{ + const char *bdev_name = create_ctx->ocssd_bdev->nvme_bdev.disk.name; + + if (spdk_unlikely(status != 0)) { + bdev_ocssd_free_bdev(create_ctx->ocssd_bdev); + } + + create_ctx->cb_fn(bdev_name, status, create_ctx->cb_arg); + free(create_ctx); +} + +static int bdev_ocssd_init_zone(struct bdev_ocssd_create_ctx *create_ctx); + +static void +bdev_ocssd_register_bdev(void *ctx) +{ + struct bdev_ocssd_create_ctx *create_ctx = ctx; + struct ocssd_bdev *ocssd_bdev = create_ctx->ocssd_bdev; + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + int rc; + + rc = spdk_bdev_register(&nvme_bdev->disk); + if (spdk_likely(rc == 0)) { + nvme_bdev_attach_bdev_to_ns(nvme_bdev->nvme_ns, nvme_bdev); + } else { + SPDK_ERRLOG("Failed to register bdev %s\n", nvme_bdev->disk.name); + } + + bdev_ocssd_create_complete(create_ctx, rc); +} + +static void +bdev_occsd_init_zone_cb(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct bdev_ocssd_create_ctx *create_ctx = ctx; + struct bdev_ocssd_zone *ocssd_zone; + struct ocssd_bdev *ocssd_bdev = create_ctx->ocssd_bdev; + struct spdk_bdev_zone_info zone_info = {}; + uint64_t offset; + int rc = 0; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("Chunk information log page failed\n"); + bdev_ocssd_create_complete(create_ctx, -EIO); + return; + } + + for (offset = 0; offset < 
create_ctx->num_chunks; ++offset) { + bdev_ocssd_fill_zone_info(ocssd_bdev, &zone_info, &create_ctx->chunk_info[offset]); + + ocssd_zone = bdev_ocssd_get_zone_by_slba(ocssd_bdev, zone_info.zone_id); + if (!ocssd_zone) { + SPDK_ERRLOG("Received invalid zone starting LBA: %"PRIu64"\n", + zone_info.zone_id); + bdev_ocssd_create_complete(create_ctx, -EINVAL); + return; + } + + /* Make sure we're not filling the same zone twice */ + assert(ocssd_zone->busy); + + ocssd_zone->busy = false; + ocssd_zone->slba = zone_info.zone_id; + ocssd_zone->capacity = zone_info.capacity; + ocssd_zone->write_pointer = zone_info.write_pointer; + } + + create_ctx->chunk_offset += create_ctx->num_chunks; + if (create_ctx->chunk_offset < create_ctx->end_chunk_offset) { + rc = bdev_ocssd_init_zone(create_ctx); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Failed to send chunk info log page\n"); + bdev_ocssd_create_complete(create_ctx, rc); + } + } else { + /* Make sure all zones have been processed */ + for (offset = 0; offset < bdev_ocssd_num_zones(ocssd_bdev); ++offset) { + assert(!ocssd_bdev->zones[offset].busy); + } + + /* Schedule the last bit of work (io_device, bdev registration) to be done in a + * context that is not tied to admin command's completion callback. + */ + spdk_thread_send_msg(spdk_get_thread(), bdev_ocssd_register_bdev, create_ctx); + } +} + +static int +bdev_ocssd_init_zone(struct bdev_ocssd_create_ctx *create_ctx) +{ + struct ocssd_bdev *ocssd_bdev = create_ctx->ocssd_bdev; + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + + create_ctx->num_chunks = spdk_min(create_ctx->end_chunk_offset - create_ctx->chunk_offset, + OCSSD_BDEV_CHUNK_INFO_COUNT); + assert(create_ctx->num_chunks > 0); + + return spdk_nvme_ctrlr_cmd_get_log_page(nvme_bdev->nvme_bdev_ctrlr->ctrlr, + SPDK_OCSSD_LOG_CHUNK_INFO, + spdk_nvme_ns_get_id(nvme_bdev->nvme_ns->ns), + &create_ctx->chunk_info, + sizeof(create_ctx->chunk_info[0]) * + create_ctx->num_chunks, + sizeof(create_ctx->chunk_info[0]) * + create_ctx->chunk_offset, + bdev_occsd_init_zone_cb, create_ctx); +} + +static int +bdev_ocssd_init_zones(struct bdev_ocssd_create_ctx *create_ctx) +{ + struct ocssd_bdev *ocssd_bdev = create_ctx->ocssd_bdev; + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev); + struct spdk_bdev *bdev = &ocssd_bdev->nvme_bdev.disk; + uint64_t offset; + + ocssd_bdev->zones = calloc(bdev_ocssd_num_zones(ocssd_bdev), sizeof(*ocssd_bdev->zones)); + if (!ocssd_bdev->zones) { + return -ENOMEM; + } + + create_ctx->chunk_offset = ocssd_bdev->range.begin * ocssd_ns->geometry.num_chk; + create_ctx->end_chunk_offset = create_ctx->chunk_offset + bdev->blockcnt / bdev->zone_size; + + /* Mark all zones as busy and clear it as their info is filled */ + for (offset = 0; offset < bdev_ocssd_num_zones(ocssd_bdev); ++offset) { + ocssd_bdev->zones[offset].busy = true; + } + + return bdev_ocssd_init_zone(create_ctx); +} + +static bool +bdev_ocssd_verify_range(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, uint32_t nsid, + const struct bdev_ocssd_range *range) +{ + struct nvme_bdev_ns *nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns); + const struct spdk_ocssd_geometry_data *geometry = &ocssd_ns->geometry; + struct ocssd_bdev *ocssd_bdev; + struct nvme_bdev *nvme_bdev; + size_t num_punits = geometry->num_pu * geometry->num_grp; + + /* First verify the range is within the geometry */ + if (range != NULL && (range->begin > range->end || range->end >= num_punits)) { + return 
false; + } + + TAILQ_FOREACH(nvme_bdev, &nvme_ns->bdevs, tailq) { + ocssd_bdev = SPDK_CONTAINEROF(nvme_bdev, struct ocssd_bdev, nvme_bdev); + + /* Only verify bdevs created on the same namespace */ + if (spdk_nvme_ns_get_id(nvme_bdev->nvme_ns->ns) != nsid) { + continue; + } + + /* Empty range means whole namespace should be used */ + if (range == NULL) { + return false; + } + + /* Make sure the range doesn't overlap with any other range */ + if (range->begin <= ocssd_bdev->range.end && + range->end >= ocssd_bdev->range.begin) { + return false; + } + } + + return true; +} + +void +bdev_ocssd_create_bdev(const char *ctrlr_name, const char *bdev_name, uint32_t nsid, + const struct bdev_ocssd_range *range, bdev_ocssd_create_cb cb_fn, + void *cb_arg) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct bdev_ocssd_create_ctx *create_ctx = NULL; + struct nvme_bdev *nvme_bdev = NULL; + struct ocssd_bdev *ocssd_bdev = NULL; + struct spdk_nvme_ns *ns; + struct nvme_bdev_ns *nvme_ns; + struct bdev_ocssd_ns *ocssd_ns; + struct spdk_ocssd_geometry_data *geometry; + int rc = 0; + + nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctrlr_name); + if (!nvme_bdev_ctrlr) { + SPDK_ERRLOG("Unable to find controller %s\n", ctrlr_name); + rc = -ENODEV; + goto error; + } + + ns = spdk_nvme_ctrlr_get_ns(nvme_bdev_ctrlr->ctrlr, nsid); + if (!ns) { + SPDK_ERRLOG("Unable to retrieve namespace %"PRIu32"\n", nsid); + rc = -ENODEV; + goto error; + } + + if (!spdk_nvme_ns_is_active(ns)) { + SPDK_ERRLOG("Namespace %"PRIu32" is inactive\n", nsid); + rc = -EACCES; + goto error; + } + + assert(nsid <= nvme_bdev_ctrlr->num_ns); + nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; + if (nvme_ns == NULL) { + SPDK_ERRLOG("Namespace %"PRIu32" is not initialized\n", nsid); + rc = -EINVAL; + goto error; + } + + ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns); + if (ocssd_ns == NULL) { + SPDK_ERRLOG("Namespace %"PRIu32" is not an OCSSD namespace\n", nsid); + rc = -EINVAL; + goto error; + } + + if (spdk_bdev_get_by_name(bdev_name) != NULL) { + SPDK_ERRLOG("Device with provided name (%s) already exists\n", bdev_name); + rc = -EEXIST; + goto error; + } + + if (!bdev_ocssd_verify_range(nvme_bdev_ctrlr, nsid, range)) { + SPDK_ERRLOG("Invalid parallel unit range\n"); + rc = -EINVAL; + goto error; + } + + ocssd_bdev = calloc(1, sizeof(*ocssd_bdev)); + if (!ocssd_bdev) { + rc = -ENOMEM; + goto error; + } + + create_ctx = calloc(1, sizeof(*create_ctx)); + if (!create_ctx) { + rc = -ENOMEM; + goto error; + } + + create_ctx->ocssd_bdev = ocssd_bdev; + create_ctx->cb_fn = cb_fn; + create_ctx->cb_arg = cb_arg; + create_ctx->range = range; + + nvme_bdev = &ocssd_bdev->nvme_bdev; + nvme_bdev->nvme_ns = nvme_ns; + nvme_bdev->nvme_bdev_ctrlr = nvme_bdev_ctrlr; + geometry = &ocssd_ns->geometry; + + if (range != NULL) { + ocssd_bdev->range = *range; + } else { + ocssd_bdev->range.begin = 0; + ocssd_bdev->range.end = geometry->num_grp * geometry->num_pu - 1; + } + + nvme_bdev->disk.name = strdup(bdev_name); + if (!nvme_bdev->disk.name) { + rc = -ENOMEM; + goto error; + } + + nvme_bdev->disk.product_name = "Open Channel SSD"; + nvme_bdev->disk.ctxt = ocssd_bdev; + nvme_bdev->disk.fn_table = &ocssdlib_fn_table; + nvme_bdev->disk.module = &ocssd_if; + nvme_bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns); + nvme_bdev->disk.zoned = true; + nvme_bdev->disk.blockcnt = bdev_ocssd_num_parallel_units(ocssd_bdev) * + geometry->num_chk * geometry->clba; + nvme_bdev->disk.zone_size = geometry->clba; + nvme_bdev->disk.max_open_zones = geometry->maxoc; + 
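+	/* Consecutive zones land on different parallel units, so one open zone
+	 * per parallel unit is enough to keep the whole device busy.
+	 */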
nvme_bdev->disk.optimal_open_zones = bdev_ocssd_num_parallel_units(ocssd_bdev); + nvme_bdev->disk.write_unit_size = geometry->ws_opt; + nvme_bdev->disk.media_events = true; + + if (geometry->maxocpu != 0 && geometry->maxocpu != geometry->maxoc) { + SPDK_WARNLOG("Maximum open chunks per PU is not zero. Reducing the maximum " + "number of open zones: %"PRIu32" -> %"PRIu32"\n", + geometry->maxoc, geometry->maxocpu); + nvme_bdev->disk.max_open_zones = geometry->maxocpu; + } + + rc = bdev_ocssd_init_zones(create_ctx); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Failed to initialize zones on bdev %s\n", nvme_bdev->disk.name); + goto error; + } + + return; +error: + bdev_ocssd_free_bdev(ocssd_bdev); + cb_fn(NULL, rc, cb_arg); + free(create_ctx); +} + +struct bdev_ocssd_delete_ctx { + bdev_ocssd_delete_cb cb_fn; + void *cb_arg; +}; + +static void +bdev_ocssd_unregister_cb(void *cb_arg, int status) +{ + struct bdev_ocssd_delete_ctx *delete_ctx = cb_arg; + + delete_ctx->cb_fn(status, delete_ctx->cb_arg); + free(delete_ctx); +} + +void +bdev_ocssd_delete_bdev(const char *bdev_name, bdev_ocssd_delete_cb cb_fn, void *cb_arg) +{ + struct spdk_bdev *bdev; + struct bdev_ocssd_delete_ctx *delete_ctx; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("Unable to find bdev %s\n", bdev_name); + cb_fn(-ENODEV, cb_arg); + return; + } + + if (bdev->module != &ocssd_if) { + SPDK_ERRLOG("Specified bdev %s is not an OCSSD bdev\n", bdev_name); + cb_fn(-EINVAL, cb_arg); + return; + } + + delete_ctx = calloc(1, sizeof(*delete_ctx)); + if (!delete_ctx) { + SPDK_ERRLOG("Unable to allocate deletion context\n"); + cb_fn(-ENOMEM, cb_arg); + return; + } + + delete_ctx->cb_fn = cb_fn; + delete_ctx->cb_arg = cb_arg; + + spdk_bdev_unregister(bdev, bdev_ocssd_unregister_cb, delete_ctx); +} + +struct bdev_ocssd_populate_ns_ctx { + struct nvme_async_probe_ctx *nvme_ctx; + struct nvme_bdev_ns *nvme_ns; +}; + +static void +bdev_ocssd_geometry_cb(void *_ctx, const struct spdk_nvme_cpl *cpl) +{ + struct bdev_ocssd_populate_ns_ctx *ctx = _ctx; + struct nvme_bdev_ns *nvme_ns = ctx->nvme_ns; + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns); + int rc = 0; + + if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { + SPDK_ERRLOG("Failed to retrieve geometry for namespace %"PRIu32"\n", nvme_ns->id); + free(nvme_ns->type_ctx); + nvme_ns->type_ctx = NULL; + rc = -EIO; + } else { + ocssd_ns->lba_offsets.lbk = 0; + ocssd_ns->lba_offsets.chk = ocssd_ns->lba_offsets.lbk + + ocssd_ns->geometry.lbaf.lbk_len; + ocssd_ns->lba_offsets.pu = ocssd_ns->lba_offsets.chk + + ocssd_ns->geometry.lbaf.chk_len; + ocssd_ns->lba_offsets.grp = ocssd_ns->lba_offsets.pu + + ocssd_ns->geometry.lbaf.pu_len; + ocssd_ns->chunk_notify_pending = true; + } + + nvme_ctrlr_populate_namespace_done(ctx->nvme_ctx, nvme_ns, rc); + free(ctx); +} + +void +bdev_ocssd_populate_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, + struct nvme_bdev_ns *nvme_ns, + struct nvme_async_probe_ctx *nvme_ctx) +{ + struct bdev_ocssd_ns *ocssd_ns; + struct bdev_ocssd_populate_ns_ctx *ctx; + struct spdk_nvme_ns *ns; + int rc; + + ns = spdk_nvme_ctrlr_get_ns(nvme_bdev_ctrlr->ctrlr, nvme_ns->id); + if (ns == NULL) { + nvme_ctrlr_populate_namespace_done(nvme_ctx, nvme_ns, -EINVAL); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + nvme_ctrlr_populate_namespace_done(nvme_ctx, nvme_ns, -ENOMEM); + return; + } + + ocssd_ns = calloc(1, sizeof(*ocssd_ns)); + if (ocssd_ns == NULL) { + nvme_ctrlr_populate_namespace_done(nvme_ctx, nvme_ns, 
-ENOMEM); + free(ctx); + return; + } + + nvme_ns->type_ctx = ocssd_ns; + nvme_ns->ns = ns; + ctx->nvme_ctx = nvme_ctx; + ctx->nvme_ns = nvme_ns; + + rc = spdk_nvme_ocssd_ctrlr_cmd_geometry(nvme_bdev_ctrlr->ctrlr, nvme_ns->id, + &ocssd_ns->geometry, + sizeof(ocssd_ns->geometry), + bdev_ocssd_geometry_cb, ctx); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Failed to retrieve OC geometry: %s\n", spdk_strerror(-rc)); + nvme_ns->type_ctx = NULL; + nvme_ctrlr_populate_namespace_done(nvme_ctx, nvme_ns, rc); + free(ocssd_ns); + free(ctx); + } +} + +void +bdev_ocssd_depopulate_namespace(struct nvme_bdev_ns *ns) +{ + struct bdev_ocssd_ns *ocssd_ns; + + ocssd_ns = bdev_ocssd_get_ns_from_nvme(ns); + + /* If there are outstanding admin requests, we cannot free the context + * here, as they'd write over deallocated memory. Clear the populated + * flag, so that the completion callback knows that the namespace is + * being depopulated and finish its deallocation once all requests are + * completed. + */ + ns->populated = false; + if (ocssd_ns->num_outstanding == 0) { + bdev_ocssd_free_namespace(ns); + } +} + +int +bdev_ocssd_create_io_channel(struct nvme_io_channel *ioch) +{ + struct ocssd_io_channel *ocssd_ioch; + + ocssd_ioch = calloc(1, sizeof(*ocssd_ioch)); + if (ocssd_ioch == NULL) { + return -ENOMEM; + } + + ocssd_ioch->pending_poller = SPDK_POLLER_REGISTER(bdev_ocssd_poll_pending, + spdk_io_channel_from_ctx(ioch), 0); + if (ocssd_ioch->pending_poller == NULL) { + SPDK_ERRLOG("Failed to register pending requests poller\n"); + free(ocssd_ioch); + return -ENOMEM; + } + + /* Start the poller paused and only resume it once there are pending requests */ + spdk_poller_pause(ocssd_ioch->pending_poller); + + TAILQ_INIT(&ocssd_ioch->pending_requests); + ioch->ocssd_ioch = ocssd_ioch; + + return 0; +} + +void +bdev_ocssd_destroy_io_channel(struct nvme_io_channel *ioch) +{ + spdk_poller_unregister(&ioch->ocssd_ioch->pending_poller); + free(ioch->ocssd_ioch); +} + +int +bdev_ocssd_init_ctrlr(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) +{ + struct ocssd_bdev_ctrlr *ocssd_ctrlr; + + ocssd_ctrlr = calloc(1, sizeof(*ocssd_ctrlr)); + if (!ocssd_ctrlr) { + return -ENOMEM; + } + + ocssd_ctrlr->mm_poller = SPDK_POLLER_REGISTER(bdev_ocssd_poll_mm, nvme_bdev_ctrlr, + 10000ULL); + if (!ocssd_ctrlr->mm_poller) { + free(ocssd_ctrlr); + return -ENOMEM; + } + + nvme_bdev_ctrlr->ocssd_ctrlr = ocssd_ctrlr; + + return 0; +} + +void +bdev_ocssd_fini_ctrlr(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) +{ + spdk_poller_unregister(&nvme_bdev_ctrlr->ocssd_ctrlr->mm_poller); + free(nvme_bdev_ctrlr->ocssd_ctrlr); + nvme_bdev_ctrlr->ocssd_ctrlr = NULL; +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_ocssd", SPDK_LOG_BDEV_OCSSD) diff --git a/src/spdk/module/bdev/nvme/bdev_ocssd.h b/src/spdk/module/bdev/nvme/bdev_ocssd.h new file mode 100644 index 000000000..89e5a3058 --- /dev/null +++ b/src/spdk/module/bdev/nvme/bdev_ocssd.h @@ -0,0 +1,67 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_OCSSD_H +#define SPDK_BDEV_OCSSD_H + +#include "spdk/stdinc.h" +#include "common.h" + +struct bdev_ocssd_range { + uint64_t begin; + uint64_t end; +}; + +typedef void (*bdev_ocssd_create_cb)(const char *bdev_name, int status, void *ctx); +typedef void (*bdev_ocssd_delete_cb)(int status, void *ctx); + +void bdev_ocssd_create_bdev(const char *ctrlr_name, const char *bdev_name, uint32_t nsid, + const struct bdev_ocssd_range *range, + bdev_ocssd_create_cb cb_fn, void *cb_arg); +void bdev_ocssd_delete_bdev(const char *bdev_name, bdev_ocssd_delete_cb cb_fn, void *cb_arg); + +void bdev_ocssd_populate_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, + struct nvme_bdev_ns *nvme_ns, + struct nvme_async_probe_ctx *ctx); +void bdev_ocssd_depopulate_namespace(struct nvme_bdev_ns *ns); +void bdev_ocssd_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns); + +int bdev_ocssd_create_io_channel(struct nvme_io_channel *ioch); +void bdev_ocssd_destroy_io_channel(struct nvme_io_channel *ioch); + +int bdev_ocssd_init_ctrlr(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr); +void bdev_ocssd_fini_ctrlr(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr); + +void bdev_ocssd_handle_chunk_notification(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr); + +#endif /* SPDK_BDEV_OCSSD_H */ diff --git a/src/spdk/module/bdev/nvme/bdev_ocssd_rpc.c b/src/spdk/module/bdev/nvme/bdev_ocssd_rpc.c new file mode 100644 index 000000000..47c5acdb3 --- /dev/null +++ b/src/spdk/module/bdev/nvme/bdev_ocssd_rpc.c @@ -0,0 +1,197 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/rpc.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/log.h" +#include "spdk/likely.h" +#include "bdev_ocssd.h" + +#define BDEV_OCSSD_DEFAULT_NSID 1 + +struct rpc_create_ocssd_bdev { + char *ctrlr_name; + char *bdev_name; + uint32_t nsid; + char *range; +}; + +static const struct spdk_json_object_decoder rpc_create_ocssd_bdev_decoders[] = { + {"ctrlr_name", offsetof(struct rpc_create_ocssd_bdev, ctrlr_name), spdk_json_decode_string}, + {"bdev_name", offsetof(struct rpc_create_ocssd_bdev, bdev_name), spdk_json_decode_string}, + {"nsid", offsetof(struct rpc_create_ocssd_bdev, nsid), spdk_json_decode_uint32, true}, + {"range", offsetof(struct rpc_create_ocssd_bdev, range), spdk_json_decode_string, true}, +}; + +static void +free_rpc_create_ocssd_bdev(struct rpc_create_ocssd_bdev *rpc) +{ + free(rpc->ctrlr_name); + free(rpc->bdev_name); + free(rpc->range); +} + +struct rpc_bdev_ocssd_create_ctx { + struct spdk_jsonrpc_request *request; + struct rpc_create_ocssd_bdev rpc; + struct bdev_ocssd_range range; +}; + +static void +rpc_bdev_ocssd_create_done(const char *bdev_name, int status, void *_ctx) +{ + struct rpc_bdev_ocssd_create_ctx *ctx = _ctx; + struct spdk_json_write_ctx *w; + + if (status != 0) { + spdk_jsonrpc_send_error_response(ctx->request, status, spdk_strerror(-status)); + goto out; + } + + w = spdk_jsonrpc_begin_result(ctx->request); + spdk_json_write_string(w, bdev_name); + spdk_jsonrpc_end_result(ctx->request, w); +out: + free_rpc_create_ocssd_bdev(&ctx->rpc); + free(ctx); +} + +static void +rpc_bdev_ocssd_create(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct rpc_bdev_ocssd_create_ctx *ctx; + struct bdev_ocssd_range *range = NULL; + int rc; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); + return; + } + + ctx->rpc.nsid = BDEV_OCSSD_DEFAULT_NSID; + ctx->request = request; + + if (spdk_json_decode_object(params, rpc_create_ocssd_bdev_decoders, + SPDK_COUNTOF(rpc_create_ocssd_bdev_decoders), + &ctx->rpc)) { + spdk_jsonrpc_send_error_response(request, -EINVAL, "Failed to parse the request"); + goto out; + } + + if (ctx->rpc.range != NULL) { + rc = sscanf(ctx->rpc.range, "%"PRIu64"-%"PRIu64, + &ctx->range.begin, &ctx->range.end); + if (rc != 2) { + spdk_jsonrpc_send_error_response(request, -EINVAL, "Failed to parse range"); + goto out; + } + + range = &ctx->range; + } + + bdev_ocssd_create_bdev(ctx->rpc.ctrlr_name, ctx->rpc.bdev_name, ctx->rpc.nsid, + range, rpc_bdev_ocssd_create_done, ctx); + return; +out: + free_rpc_create_ocssd_bdev(&ctx->rpc); + free(ctx); +} + +SPDK_RPC_REGISTER("bdev_ocssd_create", 
rpc_bdev_ocssd_create, SPDK_RPC_RUNTIME) + +struct rpc_delete_ocssd_bdev { + char *name; +}; + +static const struct spdk_json_object_decoder rpc_delete_ocssd_bdev_decoders[] = { + {"name", offsetof(struct rpc_delete_ocssd_bdev, name), spdk_json_decode_string}, +}; + +static void +free_rpc_delete_ocssd_bdev(struct rpc_delete_ocssd_bdev *rpc) +{ + free(rpc->name); +} + +struct rpc_bdev_ocssd_delete_ctx { + struct spdk_jsonrpc_request *request; + struct rpc_delete_ocssd_bdev rpc; +}; + +static void +rpc_bdev_ocssd_delete_done(int status, void *_ctx) +{ + struct rpc_bdev_ocssd_delete_ctx *ctx = _ctx; + struct spdk_json_write_ctx *w; + + if (status != 0) { + spdk_jsonrpc_send_error_response(ctx->request, status, spdk_strerror(-status)); + goto out; + } + + w = spdk_jsonrpc_begin_result(ctx->request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(ctx->request, w); +out: + free_rpc_delete_ocssd_bdev(&ctx->rpc); + free(ctx); +} + +static void +rpc_bdev_ocssd_delete(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct rpc_bdev_ocssd_delete_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); + return; + } + + ctx->request = request; + if (spdk_json_decode_object(params, rpc_delete_ocssd_bdev_decoders, + SPDK_COUNTOF(rpc_delete_ocssd_bdev_decoders), + &ctx->rpc)) { + spdk_jsonrpc_send_error_response(request, -EINVAL, "Failed to parse the request"); + free_rpc_delete_ocssd_bdev(&ctx->rpc); + free(ctx); + return; + } + + bdev_ocssd_delete_bdev(ctx->rpc.name, rpc_bdev_ocssd_delete_done, ctx); +} + +SPDK_RPC_REGISTER("bdev_ocssd_delete", rpc_bdev_ocssd_delete, SPDK_RPC_RUNTIME) diff --git a/src/spdk/module/bdev/nvme/common.c b/src/spdk/module/bdev/nvme/common.c new file mode 100644 index 000000000..c895f1102 --- /dev/null +++ b/src/spdk/module/bdev/nvme/common.c @@ -0,0 +1,204 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/env.h" +#include "bdev_ocssd.h" +#include "common.h" + +struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); +pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; +bool g_bdev_nvme_module_finish; + +struct nvme_bdev_ctrlr * +nvme_bdev_ctrlr_get(const struct spdk_nvme_transport_id *trid) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + + TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->trid) == 0) { + return nvme_bdev_ctrlr; + } + } + + return NULL; +} + +struct nvme_bdev_ctrlr * +nvme_bdev_ctrlr_get_by_name(const char *name) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + + if (name == NULL) { + return NULL; + } + + TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { + if (strcmp(name, nvme_bdev_ctrlr->name) == 0) { + return nvme_bdev_ctrlr; + } + } + + return NULL; +} + +struct nvme_bdev_ctrlr * +nvme_bdev_first_ctrlr(void) +{ + return TAILQ_FIRST(&g_nvme_bdev_ctrlrs); +} + +struct nvme_bdev_ctrlr * +nvme_bdev_next_ctrlr(struct nvme_bdev_ctrlr *prev) +{ + return TAILQ_NEXT(prev, tailq); +} + +void +nvme_bdev_dump_trid_json(struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) +{ + const char *trtype_str; + const char *adrfam_str; + + trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); + if (trtype_str) { + spdk_json_write_named_string(w, "trtype", trtype_str); + } + + adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); + if (adrfam_str) { + spdk_json_write_named_string(w, "adrfam", adrfam_str); + } + + if (trid->traddr[0] != '\0') { + spdk_json_write_named_string(w, "traddr", trid->traddr); + } + + if (trid->trsvcid[0] != '\0') { + spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); + } + + if (trid->subnqn[0] != '\0') { + spdk_json_write_named_string(w, "subnqn", trid->subnqn); + } +} + +static void +nvme_bdev_unregister_cb(void *io_device) +{ + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device; + uint32_t i; + + pthread_mutex_lock(&g_bdev_nvme_mutex); + TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); + pthread_mutex_unlock(&g_bdev_nvme_mutex); + spdk_nvme_detach(nvme_bdev_ctrlr->ctrlr); + spdk_poller_unregister(&nvme_bdev_ctrlr->adminq_timer_poller); + free(nvme_bdev_ctrlr->name); + for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { + free(nvme_bdev_ctrlr->namespaces[i]); + } + free(nvme_bdev_ctrlr->namespaces); + free(nvme_bdev_ctrlr->trid); + free(nvme_bdev_ctrlr); + + pthread_mutex_lock(&g_bdev_nvme_mutex); + if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); + spdk_bdev_module_finish_done(); + return; + } + + pthread_mutex_unlock(&g_bdev_nvme_mutex); +} + +int +nvme_bdev_ctrlr_destruct(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) +{ + assert(nvme_bdev_ctrlr->destruct); + pthread_mutex_lock(&g_bdev_nvme_mutex); + + /* If we have already registered a poller, let that one take care of it. 
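+	 * The poller registered below re-invokes nvme_bdev_ctrlr_destruct() every
+	 * 1000 microseconds until the in-progress controller reset has completed.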
*/ + if (nvme_bdev_ctrlr->destruct_poller != NULL) { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + return SPDK_POLLER_IDLE; + } + + if (nvme_bdev_ctrlr->resetting) { + nvme_bdev_ctrlr->destruct_poller = + SPDK_POLLER_REGISTER((spdk_poller_fn)nvme_bdev_ctrlr_destruct, nvme_bdev_ctrlr, 1000); + pthread_mutex_unlock(&g_bdev_nvme_mutex); + return SPDK_POLLER_BUSY; + } + pthread_mutex_unlock(&g_bdev_nvme_mutex); + + spdk_poller_unregister(&nvme_bdev_ctrlr->destruct_poller); + if (nvme_bdev_ctrlr->opal_dev) { + spdk_opal_dev_destruct(nvme_bdev_ctrlr->opal_dev); + nvme_bdev_ctrlr->opal_dev = NULL; + } + + if (nvme_bdev_ctrlr->ocssd_ctrlr) { + bdev_ocssd_fini_ctrlr(nvme_bdev_ctrlr); + } + + spdk_io_device_unregister(nvme_bdev_ctrlr, nvme_bdev_unregister_cb); + return SPDK_POLLER_BUSY; +} + +void +nvme_bdev_attach_bdev_to_ns(struct nvme_bdev_ns *nvme_ns, struct nvme_bdev *nvme_disk) +{ + nvme_ns->ctrlr->ref++; + + TAILQ_INSERT_TAIL(&nvme_ns->bdevs, nvme_disk, tailq); +} + +void +nvme_bdev_detach_bdev_from_ns(struct nvme_bdev *nvme_disk) +{ + struct nvme_bdev_ctrlr *ctrlr = nvme_disk->nvme_ns->ctrlr; + + pthread_mutex_lock(&g_bdev_nvme_mutex); + ctrlr->ref--; + + TAILQ_REMOVE(&nvme_disk->nvme_ns->bdevs, nvme_disk, tailq); + + if (ctrlr->ref == 0 && ctrlr->destruct) { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + nvme_bdev_ctrlr_destruct(ctrlr); + return; + } + + pthread_mutex_unlock(&g_bdev_nvme_mutex); +} diff --git a/src/spdk/module/bdev/nvme/common.h b/src/spdk/module/bdev/nvme/common.h new file mode 100644 index 000000000..c710507a1 --- /dev/null +++ b/src/spdk/module/bdev/nvme/common.h @@ -0,0 +1,163 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef SPDK_COMMON_BDEV_NVME_H
+#define SPDK_COMMON_BDEV_NVME_H
+
+#include "spdk/nvme.h"
+#include "spdk/bdev_module.h"
+#include "spdk/opal.h"
+
+TAILQ_HEAD(nvme_bdev_ctrlrs, nvme_bdev_ctrlr);
+extern struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs;
+extern pthread_mutex_t g_bdev_nvme_mutex;
+extern bool g_bdev_nvme_module_finish;
+
+#define NVME_MAX_CONTROLLERS 1024
+
+enum nvme_bdev_ns_type {
+	NVME_BDEV_NS_UNKNOWN = 0,
+	NVME_BDEV_NS_STANDARD = 1,
+	NVME_BDEV_NS_OCSSD = 2,
+};
+
+struct nvme_bdev_ns {
+	uint32_t id;
+	enum nvme_bdev_ns_type type;
+	/** Marks whether this data structure has its bdevs
+	 * populated for the associated namespace. It is used
+	 * to keep track of whether we need to manage the populated
+	 * resources when a newly active namespace is found,
+	 * or when a namespace becomes inactive.
+	 */
+	bool populated;
+	struct spdk_nvme_ns *ns;
+	struct nvme_bdev_ctrlr *ctrlr;
+	TAILQ_HEAD(, nvme_bdev) bdevs;
+	void *type_ctx;
+};
+
+struct ocssd_bdev_ctrlr;
+
+struct nvme_bdev_ctrlr {
+	/**
+	 * Points to a pinned, physically contiguous memory region;
+	 * it contains the 4KB IDENTIFY structure for the controller that is
+	 * the target of the CONTROLLER IDENTIFY command during initialization.
+	 */
+	struct spdk_nvme_ctrlr *ctrlr;
+	struct spdk_nvme_transport_id *trid;
+	char *name;
+	int ref;
+	bool resetting;
+	bool destruct;
+	/**
+	 * PI check flags. These flags are set only for NVMe controllers created
+	 * through the bdev_nvme_attach_controller RPC or the .INI config file.
+	 * Hot-added NVMe controllers are not included.
+	 */
+	uint32_t prchk_flags;
+	uint32_t num_ns;
+	/** Array of pointers to namespaces indexed by nsid - 1 */
+	struct nvme_bdev_ns **namespaces;
+
+	struct spdk_opal_dev *opal_dev;
+
+	struct spdk_poller *adminq_timer_poller;
+	struct spdk_poller *destruct_poller;
+	struct spdk_thread *thread;
+
+	struct ocssd_bdev_ctrlr *ocssd_ctrlr;
+
+	/** linked list pointer for device list */
+	TAILQ_ENTRY(nvme_bdev_ctrlr) tailq;
+};
+
+struct nvme_bdev {
+	struct spdk_bdev disk;
+	struct nvme_bdev_ns *nvme_ns;
+	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+	TAILQ_ENTRY(nvme_bdev) tailq;
+};
+
+struct nvme_bdev_poll_group {
+	struct spdk_nvme_poll_group *group;
+	struct spdk_poller *poller;
+	bool collect_spin_stat;
+	uint64_t spin_ticks;
+	uint64_t start_ticks;
+	uint64_t end_ticks;
+};
+
+typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc);
+
+struct nvme_async_probe_ctx {
+	struct spdk_nvme_probe_ctx *probe_ctx;
+	const char *base_name;
+	const char **names;
+	uint32_t count;
+	uint32_t prchk_flags;
+	struct spdk_poller *poller;
+	struct spdk_nvme_transport_id trid;
+	struct spdk_nvme_ctrlr_opts opts;
+	spdk_bdev_create_nvme_fn cb_fn;
+	void *cb_ctx;
+	uint32_t populates_in_progress;
+};
+
+struct ocssd_io_channel;
+
+struct nvme_io_channel {
+	struct spdk_nvme_qpair *qpair;
+	struct nvme_bdev_poll_group *group;
+	TAILQ_HEAD(, spdk_bdev_io) pending_resets;
+	struct ocssd_io_channel *ocssd_ioch;
+};
+
+void nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
+					struct nvme_bdev_ns *ns, int rc);
+void nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr);
+
+struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get(const struct spdk_nvme_transport_id *trid);
+struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get_by_name(const char *name);
+struct nvme_bdev_ctrlr *nvme_bdev_first_ctrlr(void);
+struct nvme_bdev_ctrlr *nvme_bdev_next_ctrlr(struct nvme_bdev_ctrlr *prev);
+
+void nvme_bdev_dump_trid_json(struct spdk_nvme_transport_id *trid,
+			      struct
spdk_json_write_ctx *w); + +int nvme_bdev_ctrlr_destruct(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr); +void nvme_bdev_attach_bdev_to_ns(struct nvme_bdev_ns *nvme_ns, struct nvme_bdev *nvme_disk); +void nvme_bdev_detach_bdev_from_ns(struct nvme_bdev *nvme_disk); + +#endif /* SPDK_COMMON_BDEV_NVME_H */ diff --git a/src/spdk/module/bdev/nvme/nvme_rpc.c b/src/spdk/module/bdev/nvme/nvme_rpc.c new file mode 100644 index 000000000..e6a938384 --- /dev/null +++ b/src/spdk/module/bdev/nvme/nvme_rpc.c @@ -0,0 +1,492 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#include "bdev_nvme.h" +#include "common.h" +#include "spdk/base64.h" + +enum spdk_nvme_rpc_type { + NVME_ADMIN_CMD = 1, + NVME_IO_CMD, +}; + +struct rpc_bdev_nvme_send_cmd_req { + char *name; + int cmd_type; + int data_direction; + uint32_t timeout_ms; + uint32_t data_len; + uint32_t md_len; + + struct spdk_nvme_cmd *cmdbuf; + char *data; + char *md; +}; + +struct rpc_bdev_nvme_send_cmd_resp { + char *cpl_text; + char *data_text; + char *md_text; +}; + +struct rpc_bdev_nvme_send_cmd_ctx { + struct spdk_jsonrpc_request *jsonrpc_request; + struct rpc_bdev_nvme_send_cmd_req req; + struct rpc_bdev_nvme_send_cmd_resp resp; + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct spdk_io_channel *ctrlr_io_ch; +}; + +static void +free_rpc_bdev_nvme_send_cmd_ctx(struct rpc_bdev_nvme_send_cmd_ctx *ctx) +{ + assert(ctx != NULL); + + free(ctx->req.name); + free(ctx->req.cmdbuf); + spdk_free(ctx->req.data); + spdk_free(ctx->req.md); + free(ctx->resp.cpl_text); + free(ctx->resp.data_text); + free(ctx->resp.md_text); + free(ctx); +} + +static int +rpc_bdev_nvme_send_cmd_resp_construct(struct rpc_bdev_nvme_send_cmd_resp *resp, + struct rpc_bdev_nvme_send_cmd_req *req, + const struct spdk_nvme_cpl *cpl) +{ + resp->cpl_text = malloc(spdk_base64_get_encoded_strlen(sizeof(*cpl)) + 1); + if (!resp->cpl_text) { + return -ENOMEM; + } + spdk_base64_urlsafe_encode(resp->cpl_text, cpl, sizeof(*cpl)); + + if (req->data_direction == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + if (req->data_len) { + resp->data_text = malloc(spdk_base64_get_encoded_strlen(req->data_len) + 1); + if (!resp->data_text) { + return -ENOMEM; + } + spdk_base64_urlsafe_encode(resp->data_text, req->data, req->data_len); + } + if (req->md_len) { + resp->md_text = malloc(spdk_base64_get_encoded_strlen(req->md_len) + 1); + if (!resp->md_text) { + return -ENOMEM; + } + spdk_base64_urlsafe_encode(resp->md_text, req->md, req->md_len); + } + } + + return 0; +} + +static void +rpc_bdev_nvme_send_cmd_complete(struct rpc_bdev_nvme_send_cmd_ctx *ctx, + const struct spdk_nvme_cpl *cpl) +{ + struct spdk_jsonrpc_request *request = ctx->jsonrpc_request; + struct spdk_json_write_ctx *w; + int ret; + + ret = rpc_bdev_nvme_send_cmd_resp_construct(&ctx->resp, &ctx->req, cpl); + if (ret) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(-ret)); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "cpl", ctx->resp.cpl_text); + + if (ctx->resp.data_text) { + spdk_json_write_named_string(w, "data", ctx->resp.data_text); + } + + if (ctx->resp.md_text) { + spdk_json_write_named_string(w, "metadata", ctx->resp.md_text); + } + + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); + +out: + free_rpc_bdev_nvme_send_cmd_ctx(ctx); + return; +} + +static void +nvme_rpc_bdev_nvme_cb(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct rpc_bdev_nvme_send_cmd_ctx *ctx = (struct rpc_bdev_nvme_send_cmd_ctx *)ref; + + if (ctx->ctrlr_io_ch) { + spdk_put_io_channel(ctx->ctrlr_io_ch); + ctx->ctrlr_io_ch = NULL; + } + + rpc_bdev_nvme_send_cmd_complete(ctx, cpl); +} + +static int +nvme_rpc_admin_cmd_bdev_nvme(struct rpc_bdev_nvme_send_cmd_ctx *ctx, struct spdk_nvme_cmd *cmd, + void *buf, uint32_t nbytes, uint32_t timeout_ms) +{ + struct nvme_bdev_ctrlr *_nvme_ctrlr = 
ctx->nvme_bdev_ctrlr; + int ret; + + ret = spdk_nvme_ctrlr_cmd_admin_raw(_nvme_ctrlr->ctrlr, cmd, buf, + nbytes, nvme_rpc_bdev_nvme_cb, ctx); + + return ret; +} + +static int +nvme_rpc_io_cmd_bdev_nvme(struct rpc_bdev_nvme_send_cmd_ctx *ctx, struct spdk_nvme_cmd *cmd, + void *buf, uint32_t nbytes, void *md_buf, uint32_t md_len, + uint32_t timeout_ms) +{ + struct nvme_bdev_ctrlr *_nvme_ctrlr = ctx->nvme_bdev_ctrlr; + struct spdk_nvme_qpair *io_qpair; + int ret; + + ctx->ctrlr_io_ch = spdk_get_io_channel(_nvme_ctrlr->ctrlr); + io_qpair = bdev_nvme_get_io_qpair(ctx->ctrlr_io_ch); + + ret = spdk_nvme_ctrlr_cmd_io_raw_with_md(_nvme_ctrlr->ctrlr, io_qpair, + cmd, buf, nbytes, md_buf, nvme_rpc_bdev_nvme_cb, ctx); + if (ret) { + spdk_put_io_channel(ctx->ctrlr_io_ch); + } + + return ret; + +} + +static int +rpc_bdev_nvme_send_cmd_exec(struct rpc_bdev_nvme_send_cmd_ctx *ctx) +{ + struct rpc_bdev_nvme_send_cmd_req *req = &ctx->req; + int ret = -EINVAL; + + switch (req->cmd_type) { + case NVME_ADMIN_CMD: + ret = nvme_rpc_admin_cmd_bdev_nvme(ctx, req->cmdbuf, req->data, + req->data_len, req->timeout_ms); + break; + case NVME_IO_CMD: + ret = nvme_rpc_io_cmd_bdev_nvme(ctx, req->cmdbuf, req->data, + req->data_len, req->md, req->md_len, req->timeout_ms); + break; + } + + return ret; +} + +static int +rpc_decode_cmd_type(const struct spdk_json_val *val, void *out) +{ + int *cmd_type = out; + + if (spdk_json_strequal(val, "admin") == true) { + *cmd_type = NVME_ADMIN_CMD; + } else if (spdk_json_strequal(val, "io") == true) { + *cmd_type = NVME_IO_CMD; + } else { + SPDK_NOTICELOG("Invalid parameter value: cmd_type\n"); + return -EINVAL; + } + + return 0; +} + +static int +rpc_decode_data_direction(const struct spdk_json_val *val, void *out) +{ + int *data_direction = out; + + if (spdk_json_strequal(val, "h2c") == true) { + *data_direction = SPDK_NVME_DATA_HOST_TO_CONTROLLER; + } else if (spdk_json_strequal(val, "c2h") == true) { + *data_direction = SPDK_NVME_DATA_CONTROLLER_TO_HOST; + } else { + SPDK_NOTICELOG("Invalid parameter value: data_direction\n"); + return -EINVAL; + } + + return 0; +} + +static int +rpc_decode_cmdbuf(const struct spdk_json_val *val, void *out) +{ + char *text = NULL; + size_t text_strlen, raw_len; + struct spdk_nvme_cmd *cmdbuf, **_cmdbuf = out; + int rc; + + rc = spdk_json_decode_string(val, &text); + if (rc) { + return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL; + } + + text_strlen = strlen(text); + raw_len = spdk_base64_get_decoded_len(text_strlen); + cmdbuf = malloc(raw_len); + if (!cmdbuf) { + rc = -ENOMEM; + goto out; + } + + rc = spdk_base64_urlsafe_decode(cmdbuf, &raw_len, text); + if (rc) { + free(cmdbuf); + goto out; + } + if (raw_len != sizeof(*cmdbuf)) { + rc = -EINVAL; + free(cmdbuf); + goto out; + } + + *_cmdbuf = cmdbuf; + +out: + free(text); + return rc; +} + +static int +rpc_decode_data(const struct spdk_json_val *val, void *out) +{ + struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out; + char *text = NULL; + size_t text_strlen; + int rc; + + rc = spdk_json_decode_string(val, &text); + if (rc) { + return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL; + } + text_strlen = strlen(text); + + if (req->data_len) { + /* data_len is decoded by param "data_len" */ + if (req->data_len != spdk_base64_get_decoded_len(text_strlen)) { + rc = -EINVAL; + goto out; + } + } else { + req->data_len = spdk_base64_get_decoded_len(text_strlen); + req->data = spdk_malloc(req->data_len > 0x1000 ? 
req->data_len : 0x1000, 0x1000,
+					NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+		if (!req->data) {
+			rc = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* spdk_base64_urlsafe_decode() reports the decoded length through a size_t
+	 * pointer; decode into a size_t temporary instead of casting the uint32_t
+	 * data_len field, which the wider store could overrun.
+	 */
+	{
+		size_t decoded_len = req->data_len;
+
+		rc = spdk_base64_urlsafe_decode(req->data, &decoded_len, text);
+		req->data_len = (uint32_t)decoded_len;
+	}
+
+out:
+	free(text);
+	return rc;
+}
+
+static int
+rpc_decode_data_len(const struct spdk_json_val *val, void *out)
+{
+	struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out;
+	uint32_t data_len;
+	int rc;
+
+	rc = spdk_json_decode_uint32(val, &data_len);
+	if (rc) {
+		return rc;
+	}
+
+	if (req->data_len) {
+		/* data_len is decoded by param "data" */
+		if (req->data_len != data_len) {
+			rc = -EINVAL;
+		}
+	} else {
+		req->data_len = data_len;
+		req->data = spdk_malloc(req->data_len > 0x1000 ? req->data_len : 0x1000, 0x1000,
+					NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+		if (!req->data) {
+			rc = -ENOMEM;
+		}
+	}
+
+	return rc;
+}
+
+static int
+rpc_decode_metadata(const struct spdk_json_val *val, void *out)
+{
+	struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out;
+	char *text = NULL;
+	size_t text_strlen;
+	int rc;
+
+	rc = spdk_json_decode_string(val, &text);
+	if (rc) {
+		return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL;
+	}
+	text_strlen = strlen(text);
+
+	if (req->md_len) {
+		/* md_len is decoded by param "metadata_len" */
+		if (req->md_len != spdk_base64_get_decoded_len(text_strlen)) {
+			rc = -EINVAL;
+			goto out;
+		}
+	} else {
+		req->md_len = spdk_base64_get_decoded_len(text_strlen);
+		req->md = spdk_malloc(req->md_len, 0x1000, NULL,
+				      SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+		if (!req->md) {
+			rc = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* As above: avoid casting the uint32_t md_len field to size_t *. */
+	{
+		size_t decoded_len = req->md_len;
+
+		rc = spdk_base64_urlsafe_decode(req->md, &decoded_len, text);
+		req->md_len = (uint32_t)decoded_len;
+	}
+
+out:
+	free(text);
+	return rc;
+}
+
+static int
+rpc_decode_metadata_len(const struct spdk_json_val *val, void *out)
+{
+	struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out;
+	uint32_t md_len;
+	int rc;
+
+	rc = spdk_json_decode_uint32(val, &md_len);
+	if (rc) {
+		return rc;
+	}
+
+	if (req->md_len) {
+		/* md_len is decoded by param "metadata" */
+		if (req->md_len != md_len) {
+			rc = -EINVAL;
+		}
+	} else {
+		req->md_len = md_len;
+		req->md = spdk_malloc(req->md_len, 0x1000, NULL,
+				      SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+		if (!req->md) {
+			rc = -ENOMEM;
+		}
+	}
+
+	return rc;
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_nvme_send_cmd_req_decoders[] = {
+	{"name", offsetof(struct rpc_bdev_nvme_send_cmd_req, name), spdk_json_decode_string},
+	{"cmd_type", offsetof(struct rpc_bdev_nvme_send_cmd_req, cmd_type), rpc_decode_cmd_type},
+	{"data_direction", offsetof(struct rpc_bdev_nvme_send_cmd_req, data_direction), rpc_decode_data_direction},
+	{"cmdbuf", offsetof(struct rpc_bdev_nvme_send_cmd_req, cmdbuf), rpc_decode_cmdbuf},
+	{"timeout_ms", offsetof(struct rpc_bdev_nvme_send_cmd_req, timeout_ms), spdk_json_decode_uint32, true},
+	{"data_len", 0, rpc_decode_data_len, true},
+	{"metadata_len", 0, rpc_decode_metadata_len, true},
+	{"data", 0, rpc_decode_data, true},
+	{"metadata", 0, rpc_decode_metadata, true},
+};
+
+static void
+rpc_bdev_nvme_send_cmd(struct spdk_jsonrpc_request *request,
+		       const struct spdk_json_val *params)
+{
+	struct rpc_bdev_nvme_send_cmd_ctx *ctx;
+	int ret, error_code;
+
+	ctx = calloc(1, sizeof(*ctx));
+	if (!ctx) {
+		/* Respond directly instead of jumping to 'invalid':
+		 * free_rpc_bdev_nvme_send_cmd_ctx() asserts ctx != NULL. */
+		SPDK_ERRLOG("Failed to allocate send_cmd context\n");
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 spdk_strerror(ENOMEM));
+		return;
+	}
+
+	if (spdk_json_decode_object(params, rpc_bdev_nvme_send_cmd_req_decoders,
+ SPDK_COUNTOF(rpc_bdev_nvme_send_cmd_req_decoders), + &ctx->req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + error_code = SPDK_JSONRPC_ERROR_INVALID_PARAMS; + ret = -EINVAL; + goto invalid; + } + + ctx->nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->req.name); + if (ctx->nvme_bdev_ctrlr == NULL) { + SPDK_ERRLOG("Failed at device lookup\n"); + error_code = SPDK_JSONRPC_ERROR_INVALID_PARAMS; + ret = -EINVAL; + goto invalid; + } + + ctx->jsonrpc_request = request; + + ret = rpc_bdev_nvme_send_cmd_exec(ctx); + if (ret < 0) { + SPDK_NOTICELOG("Failed at rpc_bdev_nvme_send_cmd_exec\n"); + error_code = SPDK_JSONRPC_ERROR_INTERNAL_ERROR; + goto invalid; + } + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, error_code, spdk_strerror(-ret)); + free_rpc_bdev_nvme_send_cmd_ctx(ctx); + return; +} +SPDK_RPC_REGISTER("bdev_nvme_send_cmd", rpc_bdev_nvme_send_cmd, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_send_cmd, send_nvme_cmd) diff --git a/src/spdk/module/bdev/nvme/vbdev_opal.c b/src/spdk/module/bdev/nvme/vbdev_opal.c new file mode 100644 index 000000000..68281c92b --- /dev/null +++ b/src/spdk/module/bdev/nvme/vbdev_opal.c @@ -0,0 +1,630 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/opal.h" +#include "spdk/bdev_module.h" +#include "vbdev_opal.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" + +/* OPAL locking range only supports operations on nsid=1 for now */ +#define NSID_SUPPORTED 1 + +struct opal_vbdev { + char *name; + struct nvme_bdev_ctrlr *nvme_ctrlr; + struct spdk_opal_dev *opal_dev; + struct spdk_bdev_part *bdev_part; + + uint8_t locking_range_id; + uint64_t range_start; + uint64_t range_length; + struct vbdev_opal_part_base *opal_base; + + TAILQ_ENTRY(opal_vbdev) tailq; +}; + +static TAILQ_HEAD(, opal_vbdev) g_opal_vbdev = + TAILQ_HEAD_INITIALIZER(g_opal_vbdev); + +struct vbdev_opal_bdev_io { + struct spdk_io_channel *ch; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_io_wait_entry bdev_io_wait; +}; + +struct vbdev_opal_channel { + struct spdk_bdev_part_channel part_ch; +}; + +struct vbdev_opal_part_base { + char *nvme_ctrlr_name; + struct spdk_bdev_part_base *part_base; + SPDK_BDEV_PART_TAILQ part_tailq; + TAILQ_ENTRY(vbdev_opal_part_base) tailq; +}; + +static TAILQ_HEAD(, vbdev_opal_part_base) g_opal_base = TAILQ_HEAD_INITIALIZER(g_opal_base); + +static void _vbdev_opal_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io); + +static void vbdev_opal_examine(struct spdk_bdev *bdev); + +static void +vbdev_opal_delete(struct opal_vbdev *opal_bdev) +{ + TAILQ_REMOVE(&g_opal_vbdev, opal_bdev, tailq); + free(opal_bdev->name); + free(opal_bdev); + opal_bdev = NULL; +} + +static void +vbdev_opal_clear(void) +{ + struct opal_vbdev *opal_bdev, *tmp; + + TAILQ_FOREACH_SAFE(opal_bdev, &g_opal_vbdev, tailq, tmp) { + vbdev_opal_delete(opal_bdev); + } +} + +static int +vbdev_opal_init(void) +{ + /* TODO */ + return 0; +} + +static void +vbdev_opal_fini(void) +{ + vbdev_opal_clear(); +} + +static int +vbdev_opal_get_ctx_size(void) +{ + return sizeof(struct vbdev_opal_bdev_io); +} + +/* delete all the config of the same base bdev */ +static void +vbdev_opal_delete_all_base_config(struct vbdev_opal_part_base *base) +{ + char *nvme_ctrlr_name = base->nvme_ctrlr_name; + struct opal_vbdev *bdev, *tmp_bdev; + + TAILQ_FOREACH_SAFE(bdev, &g_opal_vbdev, tailq, tmp_bdev) { + if (!strcmp(nvme_ctrlr_name, bdev->nvme_ctrlr->name)) { + vbdev_opal_delete(bdev); + } + } +} + +static int +_vbdev_opal_destruct(void *ctx) +{ + struct spdk_bdev_part *part = ctx; + + return spdk_bdev_part_free(part); +} + +static void +vbdev_opal_base_free(void *ctx) +{ + struct vbdev_opal_part_base *base = ctx; + + TAILQ_REMOVE(&g_opal_base, base, tailq); + + free(base->nvme_ctrlr_name); + free(base); +} + +static void +vbdev_opal_resubmit_io(void *arg) +{ + struct vbdev_opal_bdev_io *io_ctx = (struct vbdev_opal_bdev_io *)arg; + + _vbdev_opal_submit_request(io_ctx->ch, io_ctx->bdev_io); +} + +static void +vbdev_opal_queue_io(struct vbdev_opal_bdev_io *io_ctx) +{ + struct vbdev_opal_channel *ch = spdk_io_channel_get_ctx(io_ctx->ch); + int rc; + + io_ctx->bdev_io_wait.bdev = io_ctx->bdev_io->bdev; + io_ctx->bdev_io_wait.cb_fn = vbdev_opal_resubmit_io; + io_ctx->bdev_io_wait.cb_arg = io_ctx; + + rc = spdk_bdev_queue_io_wait(io_ctx->bdev_io->bdev, ch->part_ch.base_ch, &io_ctx->bdev_io_wait); + + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vbdev_opal_queue_io: %d\n", rc); + spdk_bdev_io_complete(io_ctx->bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +_vbdev_opal_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct vbdev_opal_channel *ch = spdk_io_channel_get_ctx(_ch); + struct vbdev_opal_bdev_io *io_ctx 
= (struct vbdev_opal_bdev_io *)bdev_io->driver_ctx; + int rc; + + rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_OPAL, "opal: no memory, queue io.\n"); + io_ctx->ch = _ch; + io_ctx->bdev_io = bdev_io; + vbdev_opal_queue_io(io_ctx); + } else { + SPDK_ERRLOG("opal: error on io submission, rc=%d.\n", rc); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +static void +vbdev_opal_io_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + _vbdev_opal_submit_request(ch, bdev_io); +} + +static void +vbdev_opal_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, vbdev_opal_io_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + default: + _vbdev_opal_submit_request(ch, bdev_io); + break; + } +} + +struct spdk_opal_locking_range_info * +vbdev_opal_get_info_from_bdev(const char *opal_bdev_name, const char *password) +{ + struct opal_vbdev *vbdev; + struct nvme_bdev_ctrlr *nvme_ctrlr; + int locking_range_id; + int rc; + + TAILQ_FOREACH(vbdev, &g_opal_vbdev, tailq) { + if (strcmp(vbdev->name, opal_bdev_name) == 0) { + break; + } + } + + if (vbdev == NULL) { + SPDK_ERRLOG("%s not found\n", opal_bdev_name); + return NULL; + } + + nvme_ctrlr = vbdev->nvme_ctrlr; + if (nvme_ctrlr == NULL) { + SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", vbdev->name); + return NULL; + } + + locking_range_id = vbdev->locking_range_id; + rc = spdk_opal_cmd_get_locking_range_info(nvme_ctrlr->opal_dev, password, + OPAL_ADMIN1, locking_range_id); + if (rc) { + SPDK_ERRLOG("Get locking range info error: %d\n", rc); + return NULL; + } + + return spdk_opal_get_locking_range_info(nvme_ctrlr->opal_dev, locking_range_id); +} + +static int +vbdev_opal_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct spdk_bdev_part *part = ctx; + struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(part); + uint64_t offset = spdk_bdev_part_get_offset_blocks(part); + + spdk_json_write_named_object_begin(w, "opal"); + + spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev)); + spdk_json_write_named_uint64(w, "offset_blocks", offset); + + spdk_json_write_object_end(w); + + return 0; +} + +static void +vbdev_opal_base_bdev_hotremove_cb(void *_part_base) +{ + struct spdk_bdev_part_base *part_base = _part_base; + struct vbdev_opal_part_base *base = spdk_bdev_part_base_get_ctx(part_base); + + spdk_bdev_part_base_hotremove(part_base, spdk_bdev_part_base_get_tailq(part_base)); + vbdev_opal_delete_all_base_config(base); +} + +static bool +vbdev_opal_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct spdk_bdev_part *part = ctx; + struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(part); + + return spdk_bdev_io_type_supported(base_bdev, io_type); +} + +static struct spdk_bdev_fn_table opal_vbdev_fn_table = { + .destruct = _vbdev_opal_destruct, + .submit_request = vbdev_opal_submit_request, + .io_type_supported = vbdev_opal_io_type_supported, + .dump_info_json = vbdev_opal_dump_info_json, + .write_config_json = NULL, +}; + +static struct spdk_bdev_module opal_if = { + .name = "opal", + .module_init = vbdev_opal_init, + .module_fini = vbdev_opal_fini, + .get_ctx_size = vbdev_opal_get_ctx_size, + .examine_config = 
vbdev_opal_examine,
+	.config_json = NULL,
+};
+
+SPDK_BDEV_MODULE_REGISTER(opal, &opal_if)
+
+int
+vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_range_id,
+		  uint64_t range_start, uint64_t range_length, const char *password)
+{
+	int rc;
+	char *opal_vbdev_name;
+	char *base_bdev_name;
+	struct nvme_bdev_ctrlr *nvme_ctrlr;
+	struct opal_vbdev *opal_bdev;
+	struct vbdev_opal_part_base *opal_part_base = NULL;
+	struct spdk_bdev_part *part_bdev;
+	struct nvme_bdev *nvme_bdev;
+
+	if (nsid != NSID_SUPPORTED) {
+		SPDK_ERRLOG("nsid %u not supported\n", nsid);
+		return -EINVAL;
+	}
+
+	nvme_ctrlr = nvme_bdev_ctrlr_get_by_name(nvme_ctrlr_name);
+	if (!nvme_ctrlr) {
+		SPDK_ERRLOG("get nvme ctrlr failed\n");
+		return -ENODEV;
+	}
+
+	if (!nvme_ctrlr->opal_dev) {
+		SPDK_ERRLOG("Opal not supported\n");
+		return -ENOTSUP;
+	}
+
+	opal_bdev = calloc(1, sizeof(struct opal_vbdev));
+	if (!opal_bdev) {
+		SPDK_ERRLOG("allocation for opal_bdev failed\n");
+		return -ENOMEM;
+	}
+
+	opal_bdev->locking_range_id = locking_range_id;
+	opal_bdev->range_start = range_start;
+	opal_bdev->range_length = range_length;
+
+	opal_bdev->nvme_ctrlr = nvme_ctrlr;
+	opal_bdev->opal_dev = nvme_ctrlr->opal_dev;
+
+	nvme_bdev = TAILQ_FIRST(&nvme_ctrlr->namespaces[nsid - 1]->bdevs);
+	assert(nvme_bdev != NULL);
+	base_bdev_name = nvme_bdev->disk.name;
+
+	/* traverse base list to see if part_base is already created for this base bdev */
+	TAILQ_FOREACH(opal_part_base, &g_opal_base, tailq) {
+		if (!strcmp(spdk_bdev_part_base_get_bdev_name(opal_part_base->part_base), base_bdev_name)) {
+			break;
+		}
+	}
+
+	/* If there is no corresponding opal_part_base yet, create one. Each new
+	   part_base gets its own tailq that stores all the parts of that base */
+	if (opal_part_base == NULL) {
+		opal_part_base = calloc(1, sizeof(*opal_part_base));
+		if (opal_part_base == NULL) {
+			SPDK_ERRLOG("Could not allocate opal_part_base\n");
+			free(opal_bdev);
+			return -ENOMEM;
+		}
+		TAILQ_INIT(&opal_part_base->part_tailq);
+
+		opal_part_base->part_base = spdk_bdev_part_base_construct(spdk_bdev_get_by_name(base_bdev_name),
+				vbdev_opal_base_bdev_hotremove_cb, &opal_if,
+				&opal_vbdev_fn_table, &opal_part_base->part_tailq, vbdev_opal_base_free,
+				opal_part_base, sizeof(struct vbdev_opal_channel), NULL, NULL);
+		if (opal_part_base->part_base == NULL) {
+			SPDK_ERRLOG("Could not allocate part_base\n");
+			free(opal_bdev);
+			free(opal_part_base);
+			return -ENOMEM;
+		}
+		opal_part_base->nvme_ctrlr_name = strdup(nvme_ctrlr_name);
+		if (opal_part_base->nvme_ctrlr_name == NULL) {
+			free(opal_bdev);
+			spdk_bdev_part_base_free(opal_part_base->part_base);
+			return -ENOMEM;
+		}
+
+		TAILQ_INSERT_TAIL(&g_opal_base, opal_part_base, tailq);
+	}
+	assert(opal_part_base != NULL);
+	opal_bdev->opal_base = opal_part_base;
+
+	part_bdev = calloc(1, sizeof(struct spdk_bdev_part));
+	if (!part_bdev) {
+		SPDK_ERRLOG("Could not allocate part_bdev\n");
+		free(opal_bdev);
+		return -ENOMEM;
+	}
+
+	TAILQ_INSERT_TAIL(&g_opal_vbdev, opal_bdev, tailq);
+	opal_vbdev_name = spdk_sprintf_alloc("%sr%" PRIu8, base_bdev_name,
+					     opal_bdev->locking_range_id); /* e.g.: nvme0n1r1 */
+	if (opal_vbdev_name == NULL) {
+		SPDK_ERRLOG("Could not allocate opal_vbdev_name\n");
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	opal_bdev->name = opal_vbdev_name;
+	rc = spdk_opal_cmd_setup_locking_range(opal_bdev->opal_dev, OPAL_ADMIN1,
+					       opal_bdev->locking_range_id, opal_bdev->range_start,
+					       opal_bdev->range_length, password);
+	if (rc) {
+		SPDK_ERRLOG("Error 
constructing %s\n", opal_vbdev_name);
+		goto err;
+	}
+
+	rc = spdk_bdev_part_construct(part_bdev, opal_bdev->opal_base->part_base, opal_vbdev_name,
+				      opal_bdev->range_start, opal_bdev->range_length, "Opal locking range");
+	if (rc) {
+		SPDK_ERRLOG("Could not allocate bdev part\n");
+		goto err;
+	}
+
+	/* lock this bdev initially */
+	rc = spdk_opal_cmd_lock_unlock(opal_bdev->opal_dev, OPAL_ADMIN1, OPAL_RWLOCK, locking_range_id,
+				       password);
+	if (rc) {
+		SPDK_ERRLOG("Error locking %s\n", opal_vbdev_name);
+		goto err;
+	}
+
+	opal_bdev->bdev_part = part_bdev;
+	return 0;
+
+err:
+	vbdev_opal_delete(opal_bdev);
+	free(part_bdev);
+	return rc;
+}
+
+static void
+vbdev_opal_destruct_bdev(struct opal_vbdev *opal_bdev)
+{
+	struct spdk_bdev_part *part = opal_bdev->bdev_part;
+
+	assert(opal_bdev->opal_base != NULL);
+	assert(part != NULL);
+
+	if (opal_bdev->range_start == spdk_bdev_part_get_offset_blocks(part)) {
+		spdk_bdev_unregister(spdk_bdev_part_get_bdev(part), NULL, NULL);
+	}
+	vbdev_opal_delete(opal_bdev);
+}
+
+int
+vbdev_opal_destruct(const char *bdev_name, const char *password)
+{
+	struct nvme_bdev_ctrlr *nvme_ctrlr;
+	int locking_range_id;
+	int rc;
+	struct opal_vbdev *opal_bdev;
+
+	TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) {
+		if (strcmp(opal_bdev->name, bdev_name) == 0) {
+			break;
+		}
+	}
+
+	if (opal_bdev == NULL) {
+		SPDK_ERRLOG("%s not found\n", bdev_name);
+		rc = -ENODEV;
+		goto err;
+	}
+
+	locking_range_id = opal_bdev->locking_range_id;
+
+	nvme_ctrlr = opal_bdev->nvme_ctrlr;
+	if (nvme_ctrlr == NULL) {
+		SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", bdev_name);
+		return -ENODEV;
+	}
+
+	/* secure erase locking range */
+	rc = spdk_opal_cmd_secure_erase_locking_range(nvme_ctrlr->opal_dev, OPAL_ADMIN1, locking_range_id,
+			password);
+	if (rc) {
+		SPDK_ERRLOG("opal erase locking range failed\n");
+		goto err;
+	}
+
+	/* reset the locking range to 0 */
+	rc = spdk_opal_cmd_setup_locking_range(nvme_ctrlr->opal_dev, OPAL_ADMIN1, locking_range_id, 0,
+					       0, password);
+	if (rc) {
+		SPDK_ERRLOG("opal reset locking range failed\n");
+		goto err;
+	}
+
+	spdk_opal_free_locking_range_info(opal_bdev->opal_dev, locking_range_id);
+	vbdev_opal_destruct_bdev(opal_bdev);
+	return 0;
+
+err:
+	return rc;
+}
+
+static void
+vbdev_opal_examine(struct spdk_bdev *bdev)
+{
+	/* TODO */
+	spdk_bdev_module_examine_done(&opal_if);
+}
+
+int
+vbdev_opal_set_lock_state(const char *bdev_name, uint16_t user_id, const char *password,
+			  const char *lock_state)
+{
+	struct nvme_bdev_ctrlr *nvme_ctrlr;
+	int locking_range_id;
+	int rc;
+	enum spdk_opal_lock_state state_flag;
+	struct opal_vbdev *opal_bdev;
+
+	TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) {
+		if (strcmp(opal_bdev->name, bdev_name) == 0) {
+			break;
+		}
+	}
+
+	if (opal_bdev == NULL) {
+		SPDK_ERRLOG("%s not found\n", bdev_name);
+		return -ENODEV;
+	}
+
+	nvme_ctrlr = opal_bdev->nvme_ctrlr;
+	if (nvme_ctrlr == NULL) {
+		SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", opal_bdev->name);
+		return -ENODEV;
+	}
+
+	if (strcasecmp(lock_state, "READWRITE") == 0) {
+		state_flag = OPAL_READWRITE;
+	} else if (strcasecmp(lock_state, "READONLY") == 0) {
+		state_flag = OPAL_READONLY;
+	} else if (strcasecmp(lock_state, "RWLOCK") == 0) {
+		state_flag = OPAL_RWLOCK;
+	} else {
+		SPDK_ERRLOG("Invalid OPAL lock state input\n");
+		return -EINVAL;
+	}
+
+	locking_range_id = opal_bdev->locking_range_id;
+	rc = spdk_opal_cmd_lock_unlock(nvme_ctrlr->opal_dev, user_id, state_flag, locking_range_id,
+				       password);
+	if (rc) {
+		SPDK_ERRLOG("%s lock/unlock failure: %d\n", 
bdev_name, rc); + } + + return rc; +} + +int +vbdev_opal_enable_new_user(const char *bdev_name, const char *admin_password, uint16_t user_id, + const char *user_password) +{ + struct nvme_bdev_ctrlr *nvme_ctrlr; + int locking_range_id; + int rc; + struct opal_vbdev *opal_bdev; + + TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) { + if (strcmp(opal_bdev->name, bdev_name) == 0) { + break; + } + } + + if (opal_bdev == NULL) { + SPDK_ERRLOG("%s not found\n", bdev_name); + return -ENODEV; + } + + nvme_ctrlr = opal_bdev->nvme_ctrlr; + if (nvme_ctrlr == NULL) { + SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", opal_bdev->name); + return -ENODEV; + } + + rc = spdk_opal_cmd_enable_user(nvme_ctrlr->opal_dev, user_id, admin_password); + if (rc) { + SPDK_ERRLOG("%s enable user error: %d\n", bdev_name, rc); + return rc; + } + + rc = spdk_opal_cmd_set_new_passwd(nvme_ctrlr->opal_dev, user_id, user_password, admin_password, + true); + if (rc) { + SPDK_ERRLOG("%s set user password error: %d\n", bdev_name, rc); + return rc; + } + + locking_range_id = opal_bdev->locking_range_id; + rc = spdk_opal_cmd_add_user_to_locking_range(nvme_ctrlr->opal_dev, user_id, locking_range_id, + OPAL_READONLY, admin_password); + if (rc) { + SPDK_ERRLOG("%s add user READONLY priority error: %d\n", bdev_name, rc); + return rc; + } + + rc = spdk_opal_cmd_add_user_to_locking_range(nvme_ctrlr->opal_dev, user_id, locking_range_id, + OPAL_READWRITE, admin_password); + if (rc) { + SPDK_ERRLOG("%s add user READWRITE priority error: %d\n", bdev_name, rc); + return rc; + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_opal", SPDK_LOG_VBDEV_OPAL) diff --git a/src/spdk/module/bdev/nvme/vbdev_opal.h b/src/spdk/module/bdev/nvme/vbdev_opal.h new file mode 100644 index 000000000..0b2fd731f --- /dev/null +++ b/src/spdk/module/bdev/nvme/vbdev_opal.h @@ -0,0 +1,54 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef SPDK_VBDEV_OPAL_H +#define SPDK_VBDEV_OPAL_H + +#include "spdk/bdev_module.h" +#include "bdev_nvme.h" +#include "common.h" + +int vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_range_id, + uint64_t range_start, uint64_t range_length, const char *password); + +struct spdk_opal_locking_range_info *vbdev_opal_get_info_from_bdev(const char *opal_bdev_name, + const char *password); + +int vbdev_opal_destruct(const char *bdev_name, const char *password); + +int vbdev_opal_enable_new_user(const char *bdev_name, const char *admin_password, + uint16_t user_id, const char *user_password); + +int vbdev_opal_set_lock_state(const char *bdev_name, uint16_t user_id, const char *password, + const char *lock_state); + +#endif diff --git a/src/spdk/module/bdev/nvme/vbdev_opal_rpc.c b/src/spdk/module/bdev/nvme/vbdev_opal_rpc.c new file mode 100644 index 000000000..ee270ef35 --- /dev/null +++ b/src/spdk/module/bdev/nvme/vbdev_opal_rpc.c @@ -0,0 +1,453 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +#include "vbdev_opal.h" + +struct rpc_bdev_nvme_opal_init { + char *nvme_ctrlr_name; + char *password; +}; + +static void +free_rpc_bdev_nvme_opal_init(struct rpc_bdev_nvme_opal_init *req) +{ + free(req->nvme_ctrlr_name); + free(req->password); +} + +static const struct spdk_json_object_decoder rpc_bdev_nvme_opal_init_decoders[] = { + {"nvme_ctrlr_name", offsetof(struct rpc_bdev_nvme_opal_init, nvme_ctrlr_name), spdk_json_decode_string}, + {"password", offsetof(struct rpc_bdev_nvme_opal_init, password), spdk_json_decode_string}, +}; + +static void +rpc_bdev_nvme_opal_init(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_nvme_opal_init req = {}; + struct spdk_json_write_ctx *w; + struct nvme_bdev_ctrlr *nvme_ctrlr; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_nvme_opal_init_decoders, + SPDK_COUNTOF(rpc_bdev_nvme_opal_init_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + /* check if opal supported */ + nvme_ctrlr = nvme_bdev_ctrlr_get_by_name(req.nvme_ctrlr_name); + if (nvme_ctrlr == NULL || nvme_ctrlr->opal_dev == NULL) { + SPDK_ERRLOG("%s not support opal\n", req.nvme_ctrlr_name); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + /* take ownership */ + rc = spdk_opal_cmd_take_ownership(nvme_ctrlr->opal_dev, req.password); + if (rc) { + SPDK_ERRLOG("Take ownership failure: %d\n", rc); + switch (rc) { + case -EBUSY: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "SP Busy, try again later"); + break; + case -EACCES: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "This drive is already enabled"); + break; + default: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + goto out; + } + + /* activate locking SP */ + rc = spdk_opal_cmd_activate_locking_sp(nvme_ctrlr->opal_dev, req.password); + if (rc) { + SPDK_ERRLOG("Activate locking SP failure: %d\n", rc); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +out: + free_rpc_bdev_nvme_opal_init(&req); +} +SPDK_RPC_REGISTER("bdev_nvme_opal_init", rpc_bdev_nvme_opal_init, SPDK_RPC_RUNTIME) + +struct rpc_bdev_nvme_opal_revert { + char *nvme_ctrlr_name; + char *password; +}; + +static void +free_rpc_bdev_nvme_opal_revert(struct rpc_bdev_nvme_opal_revert *req) +{ + free(req->nvme_ctrlr_name); + free(req->password); +} + +static const struct spdk_json_object_decoder rpc_bdev_nvme_opal_revert_decoders[] = { + {"nvme_ctrlr_name", offsetof(struct rpc_bdev_nvme_opal_revert, nvme_ctrlr_name), spdk_json_decode_string}, + {"password", offsetof(struct rpc_bdev_nvme_opal_revert, password), spdk_json_decode_string}, +}; + +static void +rpc_bdev_nvme_opal_revert(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_nvme_opal_revert req = {}; + struct spdk_json_write_ctx *w; + struct nvme_bdev_ctrlr *nvme_ctrlr; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_nvme_opal_revert_decoders, + 
SPDK_COUNTOF(rpc_bdev_nvme_opal_revert_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + /* check if opal supported */ + nvme_ctrlr = nvme_bdev_ctrlr_get_by_name(req.nvme_ctrlr_name); + if (nvme_ctrlr == NULL || nvme_ctrlr->opal_dev == NULL) { + SPDK_ERRLOG("%s not support opal\n", req.nvme_ctrlr_name); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + /* TODO: delete all opal vbdev before revert TPer */ + + rc = spdk_opal_cmd_revert_tper(nvme_ctrlr->opal_dev, req.password); + if (rc) { + SPDK_ERRLOG("Revert TPer failure: %d\n", rc); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +out: + free_rpc_bdev_nvme_opal_revert(&req); +} +SPDK_RPC_REGISTER("bdev_nvme_opal_revert", rpc_bdev_nvme_opal_revert, SPDK_RPC_RUNTIME) + +struct rpc_bdev_opal_create { + char *nvme_ctrlr_name; + uint32_t nsid; + uint16_t locking_range_id; + uint64_t range_start; + uint64_t range_length; + char *password; +}; + +static void +free_rpc_bdev_opal_create(struct rpc_bdev_opal_create *req) +{ + free(req->nvme_ctrlr_name); + free(req->password); +} + +static const struct spdk_json_object_decoder rpc_bdev_opal_create_decoders[] = { + {"nvme_ctrlr_name", offsetof(struct rpc_bdev_opal_create, nvme_ctrlr_name), spdk_json_decode_string}, + {"nsid", offsetof(struct rpc_bdev_opal_create, nsid), spdk_json_decode_uint32}, + {"locking_range_id", offsetof(struct rpc_bdev_opal_create, locking_range_id), spdk_json_decode_uint16}, + {"range_start", offsetof(struct rpc_bdev_opal_create, range_start), spdk_json_decode_uint64}, + {"range_length", offsetof(struct rpc_bdev_opal_create, range_length), spdk_json_decode_uint64}, + {"password", offsetof(struct rpc_bdev_opal_create, password), spdk_json_decode_string}, +}; + +static void +rpc_bdev_opal_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_opal_create req = {}; + struct spdk_json_write_ctx *w; + char *opal_bdev_name; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_opal_create_decoders, + SPDK_COUNTOF(rpc_bdev_opal_create_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + rc = vbdev_opal_create(req.nvme_ctrlr_name, req.nsid, req.locking_range_id, req.range_start, + req.range_length, req.password); + if (rc != 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to create opal vbdev from '%s': %s", + req.nvme_ctrlr_name, spdk_strerror(-rc)); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + opal_bdev_name = spdk_sprintf_alloc("%sn%dr%d", req.nvme_ctrlr_name, req.nsid, + req.locking_range_id); + spdk_json_write_string(w, opal_bdev_name); + spdk_jsonrpc_end_result(request, w); + free(opal_bdev_name); + +out: + free_rpc_bdev_opal_create(&req); +} +SPDK_RPC_REGISTER("bdev_opal_create", rpc_bdev_opal_create, SPDK_RPC_RUNTIME) + +struct rpc_bdev_opal_get_info { + char *bdev_name; + char *password; +}; + +static void +free_rpc_bdev_opal_get_info(struct rpc_bdev_opal_get_info *req) +{ + free(req->bdev_name); + free(req->password); +} + 
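+/* Illustrative bdev_opal_get_info exchange over the SPDK JSON-RPC socket
+ * (a sketch, not part of this module): parameter names follow the decoders
+ * below and the result fields follow rpc_bdev_opal_get_info(); the bdev name
+ * and password values are hypothetical (locking-range bdevs are named like
+ * "nvme0n1r1", see vbdev_opal.c).
+ *
+ *  -> {"jsonrpc": "2.0", "id": 1, "method": "bdev_opal_get_info",
+ *      "params": {"bdev_name": "nvme0n1r1", "password": "test"}}
+ *  <- {"jsonrpc": "2.0", "id": 1,
+ *      "result": {"name": "nvme0n1r1", "range_start": 0, "range_length": 4096,
+ *                 "read_lock_enabled": true, "write_lock_enabled": true,
+ *                 "read_locked": false, "write_locked": false}}
+ */
+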
+static const struct spdk_json_object_decoder rpc_bdev_opal_get_info_decoders[] = { + {"bdev_name", offsetof(struct rpc_bdev_opal_get_info, bdev_name), spdk_json_decode_string}, + {"password", offsetof(struct rpc_bdev_opal_get_info, password), spdk_json_decode_string}, +}; + +static void +rpc_bdev_opal_get_info(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_opal_get_info req = {}; + struct spdk_json_write_ctx *w; + struct spdk_opal_locking_range_info *info; + + if (spdk_json_decode_object(params, rpc_bdev_opal_get_info_decoders, + SPDK_COUNTOF(rpc_bdev_opal_get_info_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + info = vbdev_opal_get_info_from_bdev(req.bdev_name, req.password); + if (info == NULL) { + SPDK_ERRLOG("Get opal info failure\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "name", req.bdev_name); + spdk_json_write_named_uint64(w, "range_start", info->range_start); + spdk_json_write_named_uint64(w, "range_length", info->range_length); + spdk_json_write_named_bool(w, "read_lock_enabled", info->read_lock_enabled); + spdk_json_write_named_bool(w, "write_lock_enabled", info->write_lock_enabled); + spdk_json_write_named_bool(w, "read_locked", info->read_locked); + spdk_json_write_named_bool(w, "write_locked", info->write_locked); + + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); + +out: + free_rpc_bdev_opal_get_info(&req); +} +SPDK_RPC_REGISTER("bdev_opal_get_info", rpc_bdev_opal_get_info, SPDK_RPC_RUNTIME) + +struct rpc_bdev_opal_delete { + char *bdev_name; + char *password; +}; + +static void +free_rpc_bdev_opal_delete(struct rpc_bdev_opal_delete *req) +{ + free(req->bdev_name); + free(req->password); +} + +static const struct spdk_json_object_decoder rpc_bdev_opal_delete_decoders[] = { + {"bdev_name", offsetof(struct rpc_bdev_opal_delete, bdev_name), spdk_json_decode_string}, + {"password", offsetof(struct rpc_bdev_opal_delete, password), spdk_json_decode_string}, +}; + +static void +rpc_bdev_opal_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_opal_delete req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_opal_delete_decoders, + SPDK_COUNTOF(rpc_bdev_opal_delete_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + rc = vbdev_opal_destruct(req.bdev_name, req.password); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +out: + free_rpc_bdev_opal_delete(&req); +} +SPDK_RPC_REGISTER("bdev_opal_delete", rpc_bdev_opal_delete, SPDK_RPC_RUNTIME) + +struct rpc_bdev_opal_set_lock_state { + char *bdev_name; + uint16_t user_id; + char *password; + char *lock_state; +}; + +static void +free_rpc_bdev_opal_set_lock_state(struct rpc_bdev_opal_set_lock_state *req) +{ + free(req->bdev_name); + free(req->password); + free(req->lock_state); +} + +static const 
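/* For illustration (assumed values, not from the patch): a JSON-RPC request
 * for the bdev_opal_set_lock_state method registered below would carry params
 * whose keys mirror the decoder table that follows, e.g.
 *   {"jsonrpc": "2.0", "id": 1, "method": "bdev_opal_set_lock_state",
 *    "params": {"bdev_name": "Nvme0n1r0", "user_id": 0,
 *               "password": "...", "lock_state": "READWRITE"}}
 * The bdev name and the "READWRITE" value are assumptions for the example;
 * the accepted lock_state strings are defined by vbdev_opal_set_lock_state(). */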
struct spdk_json_object_decoder rpc_bdev_opal_set_lock_state_decoders[] = { + {"bdev_name", offsetof(struct rpc_bdev_opal_set_lock_state, bdev_name), spdk_json_decode_string}, + {"user_id", offsetof(struct rpc_bdev_opal_set_lock_state, user_id), spdk_json_decode_uint16}, + {"password", offsetof(struct rpc_bdev_opal_set_lock_state, password), spdk_json_decode_string}, + {"lock_state", offsetof(struct rpc_bdev_opal_set_lock_state, lock_state), spdk_json_decode_string}, +}; + +static void +rpc_bdev_opal_set_lock_state(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_opal_set_lock_state req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_opal_set_lock_state_decoders, + SPDK_COUNTOF(rpc_bdev_opal_set_lock_state_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + rc = vbdev_opal_set_lock_state(req.bdev_name, req.user_id, req.password, req.lock_state); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +out: + free_rpc_bdev_opal_set_lock_state(&req); +} +SPDK_RPC_REGISTER("bdev_opal_set_lock_state", rpc_bdev_opal_set_lock_state, SPDK_RPC_RUNTIME) + +struct rpc_bdev_opal_new_user { + char *bdev_name; + char *admin_password; + uint16_t user_id; + char *user_password; +}; + +static void +free_rpc_bdev_opal_new_user(struct rpc_bdev_opal_new_user *req) +{ + free(req->bdev_name); + free(req->admin_password); + free(req->user_password); +} + +static const struct spdk_json_object_decoder rpc_bdev_opal_new_user_decoders[] = { + {"bdev_name", offsetof(struct rpc_bdev_opal_new_user, bdev_name), spdk_json_decode_string}, + {"admin_password", offsetof(struct rpc_bdev_opal_new_user, admin_password), spdk_json_decode_string}, + {"user_id", offsetof(struct rpc_bdev_opal_new_user, user_id), spdk_json_decode_uint16}, + {"user_password", offsetof(struct rpc_bdev_opal_new_user, user_password), spdk_json_decode_string}, +}; + +static void +rpc_bdev_opal_new_user(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_opal_new_user req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_opal_new_user_decoders, + SPDK_COUNTOF(rpc_bdev_opal_new_user_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + rc = vbdev_opal_enable_new_user(req.bdev_name, req.admin_password, req.user_id, + req.user_password); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +out: + free_rpc_bdev_opal_new_user(&req); +} +SPDK_RPC_REGISTER("bdev_opal_new_user", rpc_bdev_opal_new_user, SPDK_RPC_RUNTIME) diff --git a/src/spdk/module/bdev/ocf/Makefile b/src/spdk/module/bdev/ocf/Makefile new file mode 100644 index 000000000..b931de106 --- /dev/null +++ b/src/spdk/module/bdev/ocf/Makefile @@ -0,0 +1,52 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) + +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += $(ENV_CFLAGS) -I$(SPDK_ROOT_DIR)/lib/env_ocf -I$(SPDK_ROOT_DIR)/lib/env_ocf/include +C_SRCS = $(shell ls *.c) + +LIBNAME := bdev_ocf + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk + +OCF_ENV := $(call spdk_lib_list_to_static_libs,ocfenv) + +$(LIB) : $(OCF_ENV) diff --git a/src/spdk/module/bdev/ocf/ctx.c b/src/spdk/module/bdev/ocf/ctx.c new file mode 100644 index 000000000..5bf4c8fee --- /dev/null +++ b/src/spdk/module/bdev/ocf/ctx.c @@ -0,0 +1,565 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <ocf/ocf.h>
+#include <execinfo.h>
+
+#include "spdk/env.h"
+#include "spdk_internal/log.h"
+
+#include "ctx.h"
+#include "ocf_env.h"
+#include "data.h"
+
+ocf_ctx_t vbdev_ocf_ctx;
+
+static ctx_data_t *
+vbdev_ocf_ctx_data_alloc(uint32_t pages)
+{
+	struct bdev_ocf_data *data;
+	void *buf;
+	uint32_t sz;
+
+	data = vbdev_ocf_data_alloc(1);
+	if (data == NULL) {
+		return NULL;
+	}
+
+	sz = pages * PAGE_SIZE;
+	buf = spdk_malloc(sz, PAGE_SIZE, NULL,
+			  SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+	if (buf == NULL) {
+		vbdev_ocf_data_free(data);
+		return NULL;
+	}
+
+	vbdev_ocf_iovs_add(data, buf, sz);
+
+	data->size = sz;
+
+	return data;
+}
+
+static void
+vbdev_ocf_ctx_data_free(ctx_data_t *ctx_data)
+{
+	struct bdev_ocf_data *data = ctx_data;
+	int i;
+
+	if (!data) {
+		return;
+	}
+
+	for (i = 0; i < data->iovcnt; i++) {
+		spdk_free(data->iovs[i].iov_base);
+	}
+
+	vbdev_ocf_data_free(data);
+}
+
+static int
+vbdev_ocf_ctx_data_mlock(ctx_data_t *ctx_data)
+{
+	/* TODO [mlock]: add mlock option */
+	return 0;
+}
+
+static void
+vbdev_ocf_ctx_data_munlock(ctx_data_t *ctx_data)
+{
+	/* TODO [mlock]: add mlock option */
+}
+
+static size_t
+iovec_flatten(struct iovec *iov, size_t iovcnt, void *buf, size_t size, size_t offset)
+{
+	size_t i, len, done = 0;
+
+	for (i = 0; i < iovcnt; i++) {
+		if (offset >= iov[i].iov_len) {
+			offset -= iov[i].iov_len;
+			continue;
+		}
+
+		if (iov[i].iov_base == NULL) {
+			continue;
+		}
+
+		if (done >= size) {
+			break;
+		}
+
+		len = MIN(size - done, iov[i].iov_len - offset);
+		memcpy(buf, iov[i].iov_base + offset, len);
+		buf += len;
+		done += len;
+		offset = 0;
+	}
+
+	return done;
+}
+
+static uint32_t
+vbdev_ocf_ctx_data_rd(void *dst, ctx_data_t *src, uint32_t size)
+{
+	struct bdev_ocf_data *s = src;
+	uint32_t size_local;
+
+	size_local = iovec_flatten(s->iovs, s->iovcnt, dst, size, s->seek);
+	s->seek += size_local;
+
+	return size_local;
+}
+
+static size_t
+buf_to_iovec(const void *buf, size_t size, struct iovec *iov, size_t iovcnt, size_t offset)
+{
+	size_t i, len, done = 0;
+
+	for (i = 0; i < iovcnt; i++) {
+		if (offset >= iov[i].iov_len) {
+			offset -= iov[i].iov_len;
+			continue;
+		}
+
+		if (iov[i].iov_base == NULL) {
+			continue;
+		}
+
+		if (done >= size) {
+			break;
+		}
+
+		len = MIN(size - done, iov[i].iov_len - offset);
+		memcpy(iov[i].iov_base + offset, buf, len);
+		buf += len;
+		done += len;
+		offset = 0;
+	}
+
+	return done;
+}
+
+static uint32_t
+vbdev_ocf_ctx_data_wr(ctx_data_t *dst, const void *src, uint32_t size)
+{
+	struct bdev_ocf_data *d = dst;
+	uint32_t size_local;
+
+	size_local = buf_to_iovec(src, size, d->iovs, d->iovcnt, d->seek);
+	d->seek += size_local;
+
+	return size_local;
+}
+
+static size_t
+iovset(struct iovec *iov, size_t iovcnt, int byte, size_t size, size_t offset)
+{
+	size_t i, len, done = 0;
+
+	for (i = 0; i < iovcnt; i++) {
+		if (offset >= iov[i].iov_len) {
+			offset -= iov[i].iov_len;
+			continue;
+		}
+
+		if (iov[i].iov_base == NULL) {
+			continue;
+		}
+
+		if (done >= size) {
+			break;
+		}
+
+		len = MIN(size - done, iov[i].iov_len - offset);
+
memset(iov[i].iov_base + offset, byte, len); + done += len; + offset = 0; + } + + return done; +} + +static uint32_t +vbdev_ocf_ctx_data_zero(ctx_data_t *dst, uint32_t size) +{ + struct bdev_ocf_data *d = dst; + uint32_t size_local; + + size_local = iovset(d->iovs, d->iovcnt, 0, size, d->seek); + d->seek += size_local; + + return size_local; +} + +static uint32_t +vbdev_ocf_ctx_data_seek(ctx_data_t *dst, ctx_data_seek_t seek, uint32_t offset) +{ + struct bdev_ocf_data *d = dst; + uint32_t off = 0; + + switch (seek) { + case ctx_data_seek_begin: + off = MIN(offset, d->size); + d->seek = off; + break; + case ctx_data_seek_current: + off = MIN(offset, d->size - d->seek); + d->seek += off; + break; + } + + return off; +} + +static uint64_t +vbdev_ocf_ctx_data_cpy(ctx_data_t *dst, ctx_data_t *src, uint64_t to, + uint64_t from, uint64_t bytes) +{ + struct bdev_ocf_data *s = src; + struct bdev_ocf_data *d = dst; + uint32_t it_iov = 0; + uint32_t it_off = 0; + uint32_t n, sz; + + bytes = MIN(bytes, s->size - from); + bytes = MIN(bytes, d->size - to); + sz = bytes; + + while (from || bytes) { + if (s->iovs[it_iov].iov_len == it_off) { + it_iov++; + it_off = 0; + continue; + } + + if (from) { + n = MIN(from, s->iovs[it_iov].iov_len); + from -= n; + } else { + n = MIN(bytes, s->iovs[it_iov].iov_len); + buf_to_iovec(s->iovs[it_iov].iov_base + it_off, n, d->iovs, d->iovcnt, to); + bytes -= n; + to += n; + } + + it_off += n; + } + + return sz; +} + +static void +vbdev_ocf_ctx_data_secure_erase(ctx_data_t *ctx_data) +{ + struct bdev_ocf_data *data = ctx_data; + struct iovec *iovs = data->iovs; + int i; + + for (i = 0; i < data->iovcnt; i++) { + if (env_memset(iovs[i].iov_base, iovs[i].iov_len, 0)) { + assert(false); + } + } +} + +int vbdev_ocf_queue_create(ocf_cache_t cache, ocf_queue_t *queue, const struct ocf_queue_ops *ops) +{ + int rc; + struct vbdev_ocf_cache_ctx *ctx = ocf_cache_get_priv(cache); + + pthread_mutex_lock(&ctx->lock); + rc = ocf_queue_create(cache, queue, ops); + pthread_mutex_unlock(&ctx->lock); + return rc; +} + +void vbdev_ocf_queue_put(ocf_queue_t queue) +{ + ocf_cache_t cache = ocf_queue_get_cache(queue); + struct vbdev_ocf_cache_ctx *ctx = ocf_cache_get_priv(cache); + + pthread_mutex_lock(&ctx->lock); + ocf_queue_put(queue); + pthread_mutex_unlock(&ctx->lock); +} + +void vbdev_ocf_cache_ctx_put(struct vbdev_ocf_cache_ctx *ctx) +{ + if (env_atomic_dec_return(&ctx->refcnt) == 0) { + pthread_mutex_destroy(&ctx->lock); + free(ctx); + } +} + +void vbdev_ocf_cache_ctx_get(struct vbdev_ocf_cache_ctx *ctx) +{ + env_atomic_inc(&ctx->refcnt); +} + +struct cleaner_priv { + struct spdk_poller *poller; + ocf_queue_t queue; + uint64_t next_run; +}; + +static int +cleaner_poll(void *arg) +{ + ocf_cleaner_t cleaner = arg; + struct cleaner_priv *priv = ocf_cleaner_get_priv(cleaner); + uint32_t iono = ocf_queue_pending_io(priv->queue); + int i, max = spdk_min(32, iono); + + for (i = 0; i < max; i++) { + ocf_queue_run_single(priv->queue); + } + + if (spdk_get_ticks() >= priv->next_run) { + ocf_cleaner_run(cleaner, priv->queue); + return SPDK_POLLER_BUSY; + } + + if (iono > 0) { + return SPDK_POLLER_BUSY; + } else { + return SPDK_POLLER_IDLE; + } +} + +static void +cleaner_cmpl(ocf_cleaner_t c, uint32_t interval) +{ + struct cleaner_priv *priv = ocf_cleaner_get_priv(c); + + priv->next_run = spdk_get_ticks() + ((interval * spdk_get_ticks_hz()) / 1000); +} + +static void +cleaner_queue_kick(ocf_queue_t q) +{ +} + +static void +cleaner_queue_stop(ocf_queue_t q) +{ + struct cleaner_priv *cpriv = 
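/* Worked example (values assumed for illustration): cleaner_cmpl() above
 * converts the interval OCF requests, given in milliseconds, into ticks.
 * With a 2 GHz tick source (spdk_get_ticks_hz() == 2000000000) and a 100 ms
 * interval, next_run moves 100 * 2000000000 / 1000 = 200000000 ticks ahead,
 * and cleaner_poll() will not call ocf_cleaner_run() again until
 * spdk_get_ticks() crosses that value. */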
ocf_queue_get_priv(q);
+
+	if (cpriv) {
+		spdk_poller_unregister(&cpriv->poller);
+		free(cpriv);
+	}
+}
+
+const struct ocf_queue_ops cleaner_queue_ops = {
+	.kick_sync = cleaner_queue_kick,
+	.kick = cleaner_queue_kick,
+	.stop = cleaner_queue_stop,
+};
+
+static int
+vbdev_ocf_ctx_cleaner_init(ocf_cleaner_t c)
+{
+	int rc;
+	struct cleaner_priv *priv = calloc(1, sizeof(*priv));
+	ocf_cache_t cache = ocf_cleaner_get_cache(c);
+	struct vbdev_ocf_cache_ctx *cctx = ocf_cache_get_priv(cache);
+
+	if (priv == NULL) {
+		return -ENOMEM;
+	}
+
+	rc = vbdev_ocf_queue_create(cache, &priv->queue, &cleaner_queue_ops);
+	if (rc) {
+		free(priv);
+		return rc;
+	}
+
+	ocf_queue_set_priv(priv->queue, priv);
+
+	cctx->cleaner_queue = priv->queue;
+
+	ocf_cleaner_set_cmpl(c, cleaner_cmpl);
+	ocf_cleaner_set_priv(c, priv);
+
+	return 0;
+}
+
+static void
+vbdev_ocf_ctx_cleaner_stop(ocf_cleaner_t c)
+{
+	struct cleaner_priv *priv = ocf_cleaner_get_priv(c);
+
+	vbdev_ocf_queue_put(priv->queue);
+}
+
+static void
+vbdev_ocf_ctx_cleaner_kick(ocf_cleaner_t cleaner)
+{
+	struct cleaner_priv *priv = ocf_cleaner_get_priv(cleaner);
+
+	if (priv->poller) {
+		return;
+	}
+
+	/* We start the cleaner poller on the same thread where the cache was created.
+	 * TODO: allow user to specify core at which cleaner should run */
+	priv->poller = SPDK_POLLER_REGISTER(cleaner_poll, cleaner, 0);
+}
+
+static void
+vbdev_ocf_md_kick(void *ctx)
+{
+	ocf_metadata_updater_t mu = ctx;
+	ocf_cache_t cache = ocf_metadata_updater_get_cache(mu);
+
+	if (ocf_cache_is_running(cache)) {
+		ocf_metadata_updater_run(mu);
+	}
+}
+
+static int
+vbdev_ocf_volume_updater_init(ocf_metadata_updater_t mu)
+{
+	struct spdk_thread *md_thread = spdk_get_thread();
+
+	ocf_metadata_updater_set_priv(mu, md_thread);
+
+	return 0;
+}
+
+static void
+vbdev_ocf_volume_updater_stop(ocf_metadata_updater_t mu)
+{
+
+}
+
+static void
+vbdev_ocf_volume_updater_kick(ocf_metadata_updater_t mu)
+{
+	struct spdk_thread *md_thread = ocf_metadata_updater_get_priv(mu);
+
+	/* We need to send a message to the updater thread because
+	 * a kick can happen from any thread */
+	spdk_thread_send_msg(md_thread, vbdev_ocf_md_kick, mu);
+}
+
+/* This function is the main way OCF communicates with the user.
+ * We do not use SPDK_LOG here because the debugging information attached to
+ * every log message is not helpful in a callback that only forwards messages
+ * whose real source is somewhere in OCF code. */
+static int
+vbdev_ocf_ctx_log_printf(ocf_logger_t logger, ocf_logger_lvl_t lvl,
+			 const char *fmt, va_list args)
+{
+	int spdk_lvl;
+
+	switch (lvl) {
+	case log_emerg:
+	case log_alert:
+	case log_crit:
+	case log_err:
+		spdk_lvl = SPDK_LOG_ERROR;
+		break;
+
+	case log_warn:
+		spdk_lvl = SPDK_LOG_WARN;
+		break;
+
+	case log_notice:
+		spdk_lvl = SPDK_LOG_NOTICE;
+		break;
+
+	case log_info:
+	case log_debug:
+	default:
+		spdk_lvl = SPDK_LOG_INFO;
+	}
+
+	spdk_vlog(spdk_lvl, NULL, -1, NULL, fmt, args);
+	return 0;
+}
+
+static const struct ocf_ctx_config vbdev_ocf_ctx_cfg = {
+	.name = "OCF SPDK",
+
+	.ops = {
+		.data = {
+			.alloc = vbdev_ocf_ctx_data_alloc,
+			.free = vbdev_ocf_ctx_data_free,
+			.mlock = vbdev_ocf_ctx_data_mlock,
+			.munlock = vbdev_ocf_ctx_data_munlock,
+			.read = vbdev_ocf_ctx_data_rd,
+			.write = vbdev_ocf_ctx_data_wr,
+			.zero = vbdev_ocf_ctx_data_zero,
+			.seek = vbdev_ocf_ctx_data_seek,
+			.copy = vbdev_ocf_ctx_data_cpy,
+			.secure_erase = vbdev_ocf_ctx_data_secure_erase,
+		},
+
+		.metadata_updater = {
+			.init = vbdev_ocf_volume_updater_init,
+			.stop =
vbdev_ocf_volume_updater_stop, + .kick = vbdev_ocf_volume_updater_kick, + }, + + .cleaner = { + .init = vbdev_ocf_ctx_cleaner_init, + .stop = vbdev_ocf_ctx_cleaner_stop, + .kick = vbdev_ocf_ctx_cleaner_kick, + }, + + .logger = { + .print = vbdev_ocf_ctx_log_printf, + .dump_stack = NULL, + }, + + }, +}; + +int +vbdev_ocf_ctx_init(void) +{ + int ret; + + ret = ocf_ctx_create(&vbdev_ocf_ctx, &vbdev_ocf_ctx_cfg); + if (ret < 0) { + return ret; + } + + return 0; +} + +void +vbdev_ocf_ctx_cleanup(void) +{ + ocf_ctx_put(vbdev_ocf_ctx); + vbdev_ocf_ctx = NULL; +} + +SPDK_LOG_REGISTER_COMPONENT("ocf_ocfctx", SPDK_LOG_OCFCTX) diff --git a/src/spdk/module/bdev/ocf/ctx.h b/src/spdk/module/bdev/ocf/ctx.h new file mode 100644 index 000000000..446ac8d8f --- /dev/null +++ b/src/spdk/module/bdev/ocf/ctx.h @@ -0,0 +1,65 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VBDEV_OCF_CTX_H +#define VBDEV_OCF_CTX_H + +#include <ocf/ocf.h> +#include "spdk/thread.h" + +extern ocf_ctx_t vbdev_ocf_ctx; + +#define OCF_WRITE_FLUSH 11 + +#define SPDK_OBJECT 1 + +/* Context of cache instance */ +struct vbdev_ocf_cache_ctx { + ocf_queue_t mngt_queue; + ocf_queue_t cleaner_queue; + pthread_mutex_t lock; + env_atomic refcnt; +}; + +void vbdev_ocf_cache_ctx_put(struct vbdev_ocf_cache_ctx *ctx); +void vbdev_ocf_cache_ctx_get(struct vbdev_ocf_cache_ctx *ctx); + +int vbdev_ocf_ctx_init(void); +void vbdev_ocf_ctx_cleanup(void); + +/* Thread safe queue creation and deletion + * These are wrappers for original ocf_queue_create() and ocf_queue_put() */ +int vbdev_ocf_queue_create(ocf_cache_t cache, ocf_queue_t *queue, const struct ocf_queue_ops *ops); +void vbdev_ocf_queue_put(ocf_queue_t queue); + +#endif diff --git a/src/spdk/module/bdev/ocf/data.c b/src/spdk/module/bdev/ocf/data.c new file mode 100644 index 000000000..981c793f5 --- /dev/null +++ b/src/spdk/module/bdev/ocf/data.c @@ -0,0 +1,122 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <ocf/ocf.h> +#include "spdk/bdev.h" +#include "data.h" + +struct bdev_ocf_data * +vbdev_ocf_data_alloc(uint32_t iovcnt) +{ + struct bdev_ocf_data *data; + + data = env_malloc(sizeof(*data), ENV_MEM_NOIO); + if (!data) { + return NULL; + } + + data->seek = 0; + + if (iovcnt) { + data->iovs = env_malloc(sizeof(*data->iovs) * iovcnt, ENV_MEM_NOIO); + if (!data->iovs) { + env_free(data); + return NULL; + } + } + + data->iovcnt = 0; + data->iovalloc = iovcnt; + + return data; +} + +void +vbdev_ocf_data_free(struct bdev_ocf_data *data) +{ + if (!data) { + return; + } + + if (data->iovalloc != 0) { + env_free(data->iovs); + } + + env_free(data); +} + +void +vbdev_ocf_iovs_add(struct bdev_ocf_data *data, void *base, size_t len) +{ + assert(NULL != data); + assert(data->iovalloc != -1); + + if (data->iovcnt == data->iovalloc) { + /* TODO: Realloc iovs */ + SPDK_ERRLOG("IOV error\n"); + } + + data->iovs[data->iovcnt].iov_base = base; + data->iovs[data->iovcnt].iov_len = len; + data->iovcnt++; +} + +struct bdev_ocf_data * +vbdev_ocf_data_from_spdk_io(struct spdk_bdev_io *bdev_io) +{ + struct bdev_ocf_data *data; + + if (bdev_io == NULL) { + return NULL; + } + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_READ: + assert(bdev_io->u.bdev.iovs); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_UNMAP: + break; + default: + SPDK_ERRLOG("Unsupported IO type %d\n", bdev_io->type); + return NULL; + } + + data = (struct bdev_ocf_data *)bdev_io->driver_ctx; + data->iovs = bdev_io->u.bdev.iovs; + data->iovcnt = bdev_io->u.bdev.iovcnt; + data->size = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + + return data; +} diff --git a/src/spdk/module/bdev/ocf/data.h b/src/spdk/module/bdev/ocf/data.h new file mode 100644 index 000000000..7ed5adcef --- /dev/null +++ b/src/spdk/module/bdev/ocf/data.h @@ -0,0 +1,57 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VBDEV_OCF_DATA_H +#define VBDEV_OCF_DATA_H + +#include "spdk/bdev_module.h" + +struct bdev_ocf_data { + struct iovec *iovs; + int iovcnt; + int iovalloc; + uint32_t size; + uint32_t seek; +}; + +struct bdev_ocf_data *vbdev_ocf_data_from_spdk_io(struct spdk_bdev_io *bdev_io); + +struct bdev_ocf_data *vbdev_ocf_data_alloc(uint32_t nvecs); + +void vbdev_ocf_data_free(struct bdev_ocf_data *data); + +struct bdev_ocf_data *vbdev_ocf_data_from_iov(struct iovec *iovs); + +void vbdev_ocf_iovs_add(struct bdev_ocf_data *data, void *base, size_t len); + +#endif diff --git a/src/spdk/module/bdev/ocf/stats.c b/src/spdk/module/bdev/ocf/stats.c new file mode 100644 index 000000000..164da7d2e --- /dev/null +++ b/src/spdk/module/bdev/ocf/stats.c @@ -0,0 +1,109 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ctx.h"
+#include "stats.h"
+
+int
+vbdev_ocf_stats_get(ocf_cache_t cache, char *core_name, struct vbdev_ocf_stats *stats)
+{
+	int status;
+	ocf_core_t core;
+
+	status = ocf_core_get_by_name(cache, core_name, strlen(core_name), &core);
+	if (status) {
+		return status;
+	}
+
+	return ocf_stats_collect_core(core, &stats->usage, &stats->reqs, &stats->blocks, &stats->errors);
+}
+
+/* Percentages carry two decimal places, so a fraction remainder of 5 must
+ * print as ".05" - hence the zero-padded %02lu. */
+#define WJSON_STAT(w, stats, group, field, units) \
+	spdk_json_write_named_object_begin(w, #field); \
+	spdk_json_write_named_uint64(w, "count", stats->group.field.value); \
+	spdk_json_write_named_string_fmt(w, "percentage", "%lu.%02lu", \
+		stats->group.field.fraction / 100, stats->group.field.fraction % 100); \
+	spdk_json_write_named_string(w, "units", units); \
+	spdk_json_write_object_end(w);
+
+void
+vbdev_ocf_stats_write_json(struct spdk_json_write_ctx *w, struct vbdev_ocf_stats *stats)
+{
+	spdk_json_write_object_begin(w);
+
+	spdk_json_write_named_object_begin(w, "usage");
+	WJSON_STAT(w, stats, usage, occupancy, "4KiB blocks");
+	WJSON_STAT(w, stats, usage, free, "4KiB blocks");
+	WJSON_STAT(w, stats, usage, clean, "4KiB blocks");
+	WJSON_STAT(w, stats, usage, dirty, "4KiB blocks");
+	spdk_json_write_object_end(w);
+
+	spdk_json_write_named_object_begin(w, "requests");
+	WJSON_STAT(w, stats, reqs, rd_hits, "Requests");
+	WJSON_STAT(w, stats, reqs, rd_partial_misses, "Requests");
+	WJSON_STAT(w, stats, reqs, rd_full_misses, "Requests");
+	WJSON_STAT(w, stats, reqs, rd_total, "Requests");
+	WJSON_STAT(w, stats, reqs, wr_hits, "Requests");
+	WJSON_STAT(w, stats, reqs, wr_partial_misses, "Requests");
+	WJSON_STAT(w, stats, reqs, wr_full_misses, "Requests");
+	WJSON_STAT(w, stats, reqs, wr_total, "Requests");
+	WJSON_STAT(w, stats, reqs, rd_pt, "Requests");
+	WJSON_STAT(w, stats, reqs, wr_pt, "Requests");
+	WJSON_STAT(w, stats, reqs, serviced, "Requests");
+	WJSON_STAT(w, stats, reqs, total, "Requests");
+	spdk_json_write_object_end(w);
+
+	spdk_json_write_named_object_begin(w, "blocks");
+	WJSON_STAT(w, stats, blocks, core_volume_rd, "4KiB blocks");
+	WJSON_STAT(w, stats, blocks, core_volume_wr, "4KiB blocks");
+	WJSON_STAT(w, stats, blocks, core_volume_total, "4KiB blocks");
+	WJSON_STAT(w, stats, blocks, cache_volume_rd, "4KiB blocks");
+	WJSON_STAT(w, stats, blocks, cache_volume_wr, "4KiB blocks");
+	WJSON_STAT(w, stats, blocks, cache_volume_total, "4KiB blocks");
+	WJSON_STAT(w, stats, blocks, volume_rd, "4KiB blocks");
+	WJSON_STAT(w, stats, blocks, volume_wr, "4KiB blocks");
+	WJSON_STAT(w, stats, blocks, volume_total, "4KiB blocks");
+	spdk_json_write_object_end(w);
+
+	spdk_json_write_named_object_begin(w, "errors");
+	WJSON_STAT(w, stats, errors, core_volume_rd, "Requests");
+	WJSON_STAT(w, stats, errors, core_volume_wr, "Requests");
+	WJSON_STAT(w, stats, errors, core_volume_total, "Requests");
+	WJSON_STAT(w, stats, errors, cache_volume_rd, "Requests");
+	WJSON_STAT(w, stats, errors, cache_volume_wr, "Requests");
+	WJSON_STAT(w, stats, errors,
cache_volume_total, "Requests"); + WJSON_STAT(w, stats, errors, total, "Requests"); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} diff --git a/src/spdk/module/bdev/ocf/stats.h b/src/spdk/module/bdev/ocf/stats.h new file mode 100644 index 000000000..b377c67f5 --- /dev/null +++ b/src/spdk/module/bdev/ocf/stats.h @@ -0,0 +1,51 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VBDEV_OCF_STATS_H +#define VBDEV_OCF_STATS_H + +#include "spdk/json.h" +#include <ocf/ocf.h> + +struct vbdev_ocf_stats { + struct ocf_stats_usage usage; + struct ocf_stats_requests reqs; + struct ocf_stats_blocks blocks; + struct ocf_stats_errors errors; +}; + +int vbdev_ocf_stats_get(ocf_cache_t cache, char *core_name, struct vbdev_ocf_stats *stats); + +void vbdev_ocf_stats_write_json(struct spdk_json_write_ctx *w, struct vbdev_ocf_stats *stats); + +#endif diff --git a/src/spdk/module/bdev/ocf/utils.c b/src/spdk/module/bdev/ocf/utils.c new file mode 100644 index 000000000..3a1df3c9e --- /dev/null +++ b/src/spdk/module/bdev/ocf/utils.c @@ -0,0 +1,136 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "utils.h" +#include "vbdev_ocf.h" + +static char *cache_modes[ocf_cache_mode_max] = { + [ocf_cache_mode_wt] = "wt", + [ocf_cache_mode_wb] = "wb", + [ocf_cache_mode_wa] = "wa", + [ocf_cache_mode_pt] = "pt", + [ocf_cache_mode_wi] = "wi", + [ocf_cache_mode_wo] = "wo", +}; + +ocf_cache_mode_t +ocf_get_cache_mode(const char *cache_mode) +{ + int i; + + for (i = 0; i < ocf_cache_mode_max; i++) { + if (strcmp(cache_mode, cache_modes[i]) == 0) { + return i; + } + } + + return ocf_cache_mode_none; +} + +const char * +ocf_get_cache_modename(ocf_cache_mode_t mode) +{ + if (mode > ocf_cache_mode_none && mode < ocf_cache_mode_max) { + return cache_modes[mode]; + } else { + return NULL; + } +} + +int +vbdev_ocf_mngt_start(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *path, + vbdev_ocf_mngt_callback cb, void *cb_arg) +{ + if (vbdev->mngt_ctx.current_step) { + return -EBUSY; + } + + memset(&vbdev->mngt_ctx, 0, sizeof(vbdev->mngt_ctx)); + + vbdev->mngt_ctx.current_step = path; + vbdev->mngt_ctx.cb = cb; + vbdev->mngt_ctx.cb_arg = cb_arg; + + (*vbdev->mngt_ctx.current_step)(vbdev); + + return 0; +} + +void +vbdev_ocf_mngt_stop(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int status) +{ + if (status) { + vbdev->mngt_ctx.status = status; + } + + if (vbdev->mngt_ctx.status && rollback_path) { + vbdev->mngt_ctx.poller_fn = NULL; + vbdev->mngt_ctx.current_step = rollback_path; + (*vbdev->mngt_ctx.current_step)(vbdev); + return; + } + + if (vbdev->mngt_ctx.cb) { + vbdev->mngt_ctx.cb(vbdev->mngt_ctx.status, vbdev, vbdev->mngt_ctx.cb_arg); + } + + memset(&vbdev->mngt_ctx, 0, sizeof(vbdev->mngt_ctx)); +} + +void +vbdev_ocf_mngt_continue(struct vbdev_ocf *vbdev, int status) +{ + if (vbdev->mngt_ctx.current_step == NULL) { + return; + } + + assert((*vbdev->mngt_ctx.current_step) != NULL); + + vbdev->mngt_ctx.status = status; + + vbdev->mngt_ctx.current_step++; + if (*vbdev->mngt_ctx.current_step) { + (*vbdev->mngt_ctx.current_step)(vbdev); + return; + } + + vbdev_ocf_mngt_stop(vbdev, NULL, 0); +} + +int +vbdev_ocf_mngt_get_status(struct vbdev_ocf *vbdev) +{ + return vbdev->mngt_ctx.status; +} diff --git a/src/spdk/module/bdev/ocf/utils.h b/src/spdk/module/bdev/ocf/utils.h new file mode 100644 index 000000000..73bf6c93a --- /dev/null +++ b/src/spdk/module/bdev/ocf/utils.h @@ -0,0 +1,67 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VBDEV_OCF_UTILS_H +#define VBDEV_OCF_UTILS_H + +#include <ocf/ocf.h> +#include "vbdev_ocf.h" + +ocf_cache_mode_t ocf_get_cache_mode(const char *cache_mode); +const char *ocf_get_cache_modename(ocf_cache_mode_t mode); + +/* Initiate management operation + * Receives NULL terminated array of functions (path) + * and callback (cb) + * and callback argument (cb_arg) + * This function may fail with ENOMEM or EBUSY */ +int vbdev_ocf_mngt_start(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *path, + vbdev_ocf_mngt_callback cb, void *cb_arg); + +/* Continue execution with polling operation (fn) + * fn must invoke vbdev_ocf_mngt_continue() to stop polling + * Poller has default timeout of 5 seconds */ +void vbdev_ocf_mngt_poll(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn fn); + +/* Continue execution with next function that is on path + * If next function is NULL, finish management operation and invoke callback */ +void vbdev_ocf_mngt_continue(struct vbdev_ocf *vbdev, int status); + +/* Stop the execution, if status is non zero set it, + * if rollback function is not null invoke rollback + * else invoke callback with last status returned */ +void vbdev_ocf_mngt_stop(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int status); + +/* Get status */ +int vbdev_ocf_mngt_get_status(struct vbdev_ocf *vbdev); +#endif diff --git a/src/spdk/module/bdev/ocf/vbdev_ocf.c b/src/spdk/module/bdev/ocf/vbdev_ocf.c new file mode 100644 index 000000000..4997772cd --- /dev/null +++ b/src/spdk/module/bdev/ocf/vbdev_ocf.c @@ -0,0 +1,1775 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <ocf/ocf.h> +#include <ocf/ocf_types.h> +#include <ocf/ocf_mngt.h> + +#include "ctx.h" +#include "data.h" +#include "volume.h" +#include "utils.h" +#include "vbdev_ocf.h" + +#include "spdk/bdev_module.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" +#include "spdk/cpuset.h" + +static struct spdk_bdev_module ocf_if; + +static TAILQ_HEAD(, vbdev_ocf) g_ocf_vbdev_head + = TAILQ_HEAD_INITIALIZER(g_ocf_vbdev_head); + +static TAILQ_HEAD(, examining_bdev) g_ocf_examining_bdevs_head + = TAILQ_HEAD_INITIALIZER(g_ocf_examining_bdevs_head); + +bool g_fini_started = false; + +/* Structure for keeping list of bdevs that are claimed but not used yet */ +struct examining_bdev { + struct spdk_bdev *bdev; + TAILQ_ENTRY(examining_bdev) tailq; +}; + +/* Add bdev to list of claimed */ +static void +examine_start(struct spdk_bdev *bdev) +{ + struct examining_bdev *entry = malloc(sizeof(*entry)); + + assert(entry); + entry->bdev = bdev; + TAILQ_INSERT_TAIL(&g_ocf_examining_bdevs_head, entry, tailq); +} + +/* Find bdev on list of claimed bdevs, then remove it, + * if it was the last one on list then report examine done */ +static void +examine_done(int status, struct vbdev_ocf *vbdev, void *cb_arg) +{ + struct spdk_bdev *bdev = cb_arg; + struct examining_bdev *entry, *safe, *found = NULL; + + TAILQ_FOREACH_SAFE(entry, &g_ocf_examining_bdevs_head, tailq, safe) { + if (entry->bdev == bdev) { + if (found) { + goto remove; + } else { + found = entry; + } + } + } + + assert(found); + spdk_bdev_module_examine_done(&ocf_if); + +remove: + TAILQ_REMOVE(&g_ocf_examining_bdevs_head, found, tailq); + free(found); +} + +/* Free allocated strings and structure itself + * Used at shutdown only */ +static void +free_vbdev(struct vbdev_ocf *vbdev) +{ + if (!vbdev) { + return; + } + + free(vbdev->name); + free(vbdev->cache.name); + free(vbdev->core.name); + free(vbdev); +} + +/* Get existing cache base + * that is attached to other vbdev */ +static struct vbdev_ocf_base * +get_other_cache_base(struct vbdev_ocf_base *base) +{ + struct vbdev_ocf *vbdev; + + TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { + if (&vbdev->cache == base || !vbdev->cache.attached) { + continue; + } + if (!strcmp(vbdev->cache.name, base->name)) { + return &vbdev->cache; + } + } + + return NULL; +} + +static bool +is_ocf_cache_running(struct vbdev_ocf *vbdev) +{ + if (vbdev->cache.attached && vbdev->ocf_cache) { + return ocf_cache_is_running(vbdev->ocf_cache); + } + return false; +} + +/* Get 
existing OCF cache instance + * that is started by other vbdev */ +static ocf_cache_t +get_other_cache_instance(struct vbdev_ocf *vbdev) +{ + struct vbdev_ocf *cmp; + + TAILQ_FOREACH(cmp, &g_ocf_vbdev_head, tailq) { + if (cmp->state.doing_finish || cmp == vbdev) { + continue; + } + if (strcmp(cmp->cache.name, vbdev->cache.name)) { + continue; + } + if (is_ocf_cache_running(cmp)) { + return cmp->ocf_cache; + } + } + + return NULL; +} + +static void +_remove_base_bdev(void *ctx) +{ + struct spdk_bdev_desc *desc = ctx; + + spdk_bdev_close(desc); +} + +/* Close and unclaim base bdev */ +static void +remove_base_bdev(struct vbdev_ocf_base *base) +{ + if (base->attached) { + if (base->management_channel) { + spdk_put_io_channel(base->management_channel); + } + + spdk_bdev_module_release_bdev(base->bdev); + /* Close the underlying bdev on its same opened thread. */ + if (base->thread && base->thread != spdk_get_thread()) { + spdk_thread_send_msg(base->thread, _remove_base_bdev, base->desc); + } else { + spdk_bdev_close(base->desc); + } + base->attached = false; + } +} + +/* Finish unregister operation */ +static void +unregister_finish(struct vbdev_ocf *vbdev) +{ + spdk_bdev_destruct_done(&vbdev->exp_bdev, vbdev->state.stop_status); + ocf_mngt_cache_put(vbdev->ocf_cache); + vbdev_ocf_cache_ctx_put(vbdev->cache_ctx); + vbdev_ocf_mngt_continue(vbdev, 0); +} + +static void +close_core_bdev(struct vbdev_ocf *vbdev) +{ + remove_base_bdev(&vbdev->core); + vbdev_ocf_mngt_continue(vbdev, 0); +} + +static void +remove_core_cmpl(void *priv, int error) +{ + struct vbdev_ocf *vbdev = priv; + + ocf_mngt_cache_unlock(vbdev->ocf_cache); + vbdev_ocf_mngt_continue(vbdev, error); +} + +/* Try to lock cache, then remove core */ +static void +remove_core_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) +{ + struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; + + if (error) { + SPDK_ERRLOG("Error %d, can not lock cache instance %s\n", + error, vbdev->name); + vbdev_ocf_mngt_continue(vbdev, error); + return; + } + + ocf_mngt_cache_remove_core(vbdev->ocf_core, remove_core_cmpl, vbdev); +} + +/* Detach core base */ +static void +detach_core(struct vbdev_ocf *vbdev) +{ + if (is_ocf_cache_running(vbdev)) { + ocf_mngt_cache_lock(vbdev->ocf_cache, remove_core_cache_lock_cmpl, vbdev); + } else { + vbdev_ocf_mngt_continue(vbdev, 0); + } +} + +static void +close_cache_bdev(struct vbdev_ocf *vbdev) +{ + remove_base_bdev(&vbdev->cache); + vbdev_ocf_mngt_continue(vbdev, 0); +} + +/* Detach cache base */ +static void +detach_cache(struct vbdev_ocf *vbdev) +{ + vbdev->state.stop_status = vbdev->mngt_ctx.status; + + /* If some other vbdev references this cache bdev, + * we detach this only by changing the flag, without actual close */ + if (get_other_cache_base(&vbdev->cache)) { + vbdev->cache.attached = false; + } + + vbdev_ocf_mngt_continue(vbdev, 0); +} + +static void +stop_vbdev_cmpl(ocf_cache_t cache, void *priv, int error) +{ + struct vbdev_ocf *vbdev = priv; + + vbdev_ocf_queue_put(vbdev->cache_ctx->mngt_queue); + ocf_mngt_cache_unlock(cache); + + vbdev_ocf_mngt_continue(vbdev, error); +} + +/* Try to lock cache, then stop it */ +static void +stop_vbdev_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) +{ + struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; + + if (error) { + SPDK_ERRLOG("Error %d, can not lock cache instance %s\n", + error, vbdev->name); + vbdev_ocf_mngt_continue(vbdev, error); + return; + } + + ocf_mngt_cache_stop(vbdev->ocf_cache, stop_vbdev_cmpl, vbdev); +} + +/* Stop OCF cache object + 
* vbdev_ocf is not operational after this */ +static void +stop_vbdev(struct vbdev_ocf *vbdev) +{ + if (!is_ocf_cache_running(vbdev)) { + vbdev_ocf_mngt_continue(vbdev, 0); + return; + } + + if (!g_fini_started && get_other_cache_instance(vbdev)) { + SPDK_NOTICELOG("Not stopping cache instance '%s'" + " because it is referenced by other OCF bdev\n", + vbdev->cache.name); + vbdev_ocf_mngt_continue(vbdev, 0); + return; + } + + ocf_mngt_cache_lock(vbdev->ocf_cache, stop_vbdev_cache_lock_cmpl, vbdev); +} + +static void +flush_vbdev_cmpl(ocf_cache_t cache, void *priv, int error) +{ + struct vbdev_ocf *vbdev = priv; + + ocf_mngt_cache_unlock(cache); + vbdev_ocf_mngt_continue(vbdev, error); +} + +static void +flush_vbdev_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) +{ + struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; + + if (error) { + SPDK_ERRLOG("Error %d, can not lock cache instance %s\n", + error, vbdev->name); + vbdev_ocf_mngt_continue(vbdev, error); + return; + } + + ocf_mngt_cache_flush(vbdev->ocf_cache, flush_vbdev_cmpl, vbdev); +} + +static void +flush_vbdev(struct vbdev_ocf *vbdev) +{ + if (!is_ocf_cache_running(vbdev)) { + vbdev_ocf_mngt_continue(vbdev, -EINVAL); + return; + } + + ocf_mngt_cache_lock(vbdev->ocf_cache, flush_vbdev_cache_lock_cmpl, vbdev); +} + +/* Procedures called during dirty unregister */ +vbdev_ocf_mngt_fn unregister_path_dirty[] = { + flush_vbdev, + stop_vbdev, + detach_cache, + close_cache_bdev, + detach_core, + close_core_bdev, + unregister_finish, + NULL +}; + +/* Procedures called during clean unregister */ +vbdev_ocf_mngt_fn unregister_path_clean[] = { + flush_vbdev, + detach_core, + close_core_bdev, + stop_vbdev, + detach_cache, + close_cache_bdev, + unregister_finish, + NULL +}; + +/* Start asynchronous management operation using unregister_path */ +static void +unregister_cb(void *opaque) +{ + struct vbdev_ocf *vbdev = opaque; + vbdev_ocf_mngt_fn *unregister_path; + int rc; + + unregister_path = vbdev->state.doing_clean_delete ? 
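/* Each unregister_path_* above is a NULL-terminated array of vbdev_ocf_mngt_fn
 * steps. vbdev_ocf_mngt_start() (utils.c) invokes the first entry, every step
 * finishes by calling vbdev_ocf_mngt_continue(), and the chain advances entry
 * by entry until the NULL terminator, at which point the registered callback
 * is invoked with the final status. */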
+ unregister_path_clean : unregister_path_dirty; + + rc = vbdev_ocf_mngt_start(vbdev, unregister_path, NULL, NULL); + if (rc) { + SPDK_ERRLOG("Unable to unregister OCF bdev: %d\n", rc); + spdk_bdev_destruct_done(&vbdev->exp_bdev, rc); + } +} + +/* Clean remove case - remove core and then cache, this order + * will remove instance permanently */ +static void +_vbdev_ocf_destruct_clean(struct vbdev_ocf *vbdev) +{ + if (vbdev->core.attached) { + detach_core(vbdev); + close_core_bdev(vbdev); + } + + if (vbdev->cache.attached) { + detach_cache(vbdev); + close_cache_bdev(vbdev); + } +} + +/* Dirty shutdown/hot remove case - remove cache and then core, this order + * will allow us to recover this instance in the future */ +static void +_vbdev_ocf_destruct_dirty(struct vbdev_ocf *vbdev) +{ + if (vbdev->cache.attached) { + detach_cache(vbdev); + close_cache_bdev(vbdev); + } + + if (vbdev->core.attached) { + detach_core(vbdev); + close_core_bdev(vbdev); + } +} + +/* Unregister io device with callback to unregister_cb + * This function is called during spdk_bdev_unregister */ +static int +vbdev_ocf_destruct(void *opaque) +{ + struct vbdev_ocf *vbdev = opaque; + + if (vbdev->state.doing_finish) { + return -EALREADY; + } + + if (vbdev->state.starting && !vbdev->state.started) { + /* Prevent before detach cache/core during register path of + this bdev */ + return -EBUSY; + } + + vbdev->state.doing_finish = true; + + if (vbdev->state.started) { + spdk_io_device_unregister(vbdev, unregister_cb); + /* Return 1 because unregister is delayed */ + return 1; + } + + if (vbdev->state.doing_clean_delete) { + _vbdev_ocf_destruct_clean(vbdev); + } else { + _vbdev_ocf_destruct_dirty(vbdev); + } + + return 0; +} + +/* Stop OCF cache and unregister SPDK bdev */ +int +vbdev_ocf_delete(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg) +{ + int rc = 0; + + if (vbdev->state.started) { + spdk_bdev_unregister(&vbdev->exp_bdev, cb, cb_arg); + } else { + rc = vbdev_ocf_destruct(vbdev); + if (rc == 0 && cb) { + cb(cb_arg, 0); + } + } + + return rc; +} + +/* Remove cores permanently and then stop OCF cache and unregister SPDK bdev */ +int +vbdev_ocf_delete_clean(struct vbdev_ocf *vbdev, void (*cb)(void *, int), + void *cb_arg) +{ + vbdev->state.doing_clean_delete = true; + + return vbdev_ocf_delete(vbdev, cb, cb_arg); +} + + +/* If vbdev is online, return its object */ +struct vbdev_ocf * +vbdev_ocf_get_by_name(const char *name) +{ + struct vbdev_ocf *vbdev; + + if (name == NULL) { + assert(false); + return NULL; + } + + TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { + if (vbdev->name == NULL || vbdev->state.doing_finish) { + continue; + } + if (strcmp(vbdev->name, name) == 0) { + return vbdev; + } + } + return NULL; +} + +/* Return matching base if parent vbdev is online */ +struct vbdev_ocf_base * +vbdev_ocf_get_base_by_name(const char *name) +{ + struct vbdev_ocf *vbdev; + + if (name == NULL) { + assert(false); + return NULL; + } + + TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { + if (vbdev->state.doing_finish) { + continue; + } + + if (vbdev->cache.name && strcmp(vbdev->cache.name, name) == 0) { + return &vbdev->cache; + } + if (vbdev->core.name && strcmp(vbdev->core.name, name) == 0) { + return &vbdev->core; + } + } + return NULL; +} + +/* Execute fn for each OCF device that is online or waits for base devices */ +void +vbdev_ocf_foreach(vbdev_ocf_foreach_fn fn, void *ctx) +{ + struct vbdev_ocf *vbdev; + + assert(fn != NULL); + + TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { + if 
(!vbdev->state.doing_finish) { + fn(vbdev, ctx); + } + } +} + +/* Called from OCF when SPDK_IO is completed */ +static void +vbdev_ocf_io_submit_cb(struct ocf_io *io, int error) +{ + struct spdk_bdev_io *bdev_io = io->priv1; + + if (error == 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else if (error == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + + ocf_io_put(io); +} + +/* Configure io parameters and send it to OCF */ +static int +io_submit_to_ocf(struct spdk_bdev_io *bdev_io, struct ocf_io *io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_READ: + ocf_core_submit_io(io); + return 0; + case SPDK_BDEV_IO_TYPE_FLUSH: + ocf_core_submit_flush(io); + return 0; + case SPDK_BDEV_IO_TYPE_UNMAP: + ocf_core_submit_discard(io); + return 0; + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + default: + SPDK_ERRLOG("Unsupported IO type: %d\n", bdev_io->type); + return -EINVAL; + } +} + +/* Submit SPDK-IO to OCF */ +static void +io_handle(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct vbdev_ocf *vbdev = bdev_io->bdev->ctxt; + struct ocf_io *io = NULL; + struct bdev_ocf_data *data = NULL; + struct vbdev_ocf_qctx *qctx = spdk_io_channel_get_ctx(ch); + uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + uint64_t offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen; + int dir, flags = 0; + int err; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + dir = OCF_READ; + break; + case SPDK_BDEV_IO_TYPE_WRITE: + dir = OCF_WRITE; + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + dir = OCF_WRITE; + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + dir = OCF_WRITE; + break; + default: + err = -EINVAL; + goto fail; + } + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_FLUSH) { + flags = OCF_WRITE_FLUSH; + } + + io = ocf_core_new_io(vbdev->ocf_core, qctx->queue, offset, len, dir, 0, flags); + if (!io) { + err = -ENOMEM; + goto fail; + } + + data = vbdev_ocf_data_from_spdk_io(bdev_io); + if (!data) { + err = -ENOMEM; + goto fail; + } + + err = ocf_io_set_data(io, data, 0); + if (err) { + goto fail; + } + + ocf_io_set_cmpl(io, bdev_io, NULL, vbdev_ocf_io_submit_cb); + + err = io_submit_to_ocf(bdev_io, io); + if (err) { + goto fail; + } + + return; + +fail: + if (io) { + ocf_io_put(io); + } + + if (err == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +vbdev_ocf_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + io_handle(ch, bdev_io); +} + +/* Called from bdev layer when an io to Cache vbdev is submitted */ +static void +vbdev_ocf_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + /* User does not have to allocate io vectors for the request, + * so in case they are not allocated, we allocate them here */ + spdk_bdev_io_get_buf(bdev_io, vbdev_ocf_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_UNMAP: + io_handle(ch, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + default: + SPDK_ERRLOG("Unknown 
I/O type %d\n", bdev_io->type); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + break; + } +} + +/* Called from bdev layer */ +static bool +vbdev_ocf_io_type_supported(void *opaque, enum spdk_bdev_io_type io_type) +{ + struct vbdev_ocf *vbdev = opaque; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_UNMAP: + return spdk_bdev_io_type_supported(vbdev->core.bdev, io_type); + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + default: + return false; + } +} + +/* Called from bdev layer */ +static struct spdk_io_channel * +vbdev_ocf_get_io_channel(void *opaque) +{ + struct vbdev_ocf *bdev = opaque; + + return spdk_get_io_channel(bdev); +} + +static int +vbdev_ocf_dump_info_json(void *opaque, struct spdk_json_write_ctx *w) +{ + struct vbdev_ocf *vbdev = opaque; + + spdk_json_write_named_string(w, "cache_device", vbdev->cache.name); + spdk_json_write_named_string(w, "core_device", vbdev->core.name); + + spdk_json_write_named_string(w, "mode", + ocf_get_cache_modename(ocf_cache_get_mode(vbdev->ocf_cache))); + spdk_json_write_named_uint32(w, "cache_line_size", + ocf_cache_get_line_size(vbdev->ocf_cache)); + spdk_json_write_named_bool(w, "metadata_volatile", + vbdev->cfg.cache.metadata_volatile); + + return 0; +} + +static void +vbdev_ocf_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct vbdev_ocf *vbdev = bdev->ctxt; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_ocf_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", vbdev->name); + spdk_json_write_named_string(w, "mode", + ocf_get_cache_modename(ocf_cache_get_mode(vbdev->ocf_cache))); + spdk_json_write_named_string(w, "cache_bdev_name", vbdev->cache.name); + spdk_json_write_named_string(w, "core_bdev_name", vbdev->core.name); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +/* Cache vbdev function table + * Used by bdev layer */ +static struct spdk_bdev_fn_table cache_dev_fn_table = { + .destruct = vbdev_ocf_destruct, + .io_type_supported = vbdev_ocf_io_type_supported, + .submit_request = vbdev_ocf_submit_request, + .get_io_channel = vbdev_ocf_get_io_channel, + .write_config_json = vbdev_ocf_write_json_config, + .dump_info_json = vbdev_ocf_dump_info_json, +}; + +/* Poller function for the OCF queue + * We execute OCF requests here synchronously */ +static int +queue_poll(void *opaque) +{ + struct vbdev_ocf_qctx *qctx = opaque; + uint32_t iono = ocf_queue_pending_io(qctx->queue); + int i, max = spdk_min(32, iono); + + for (i = 0; i < max; i++) { + ocf_queue_run_single(qctx->queue); + } + + if (iono > 0) { + return SPDK_POLLER_BUSY; + } else { + return SPDK_POLLER_IDLE; + } +} + +/* Called during ocf_submit_io, ocf_purge* + * and any other requests that need to submit io */ +static void +vbdev_ocf_ctx_queue_kick(ocf_queue_t q) +{ +} + +/* OCF queue deinitialization + * Called at ocf_cache_stop */ +static void +vbdev_ocf_ctx_queue_stop(ocf_queue_t q) +{ + struct vbdev_ocf_qctx *qctx = ocf_queue_get_priv(q); + + if (qctx) { + spdk_put_io_channel(qctx->cache_ch); + spdk_put_io_channel(qctx->core_ch); + spdk_poller_unregister(&qctx->poller); + if (qctx->allocated) { + free(qctx); + } + } +} + +/* Queue ops is an interface for running queue thread + * stop() operation in called just before queue gets destroyed */ +const struct ocf_queue_ops queue_ops = { + .kick_sync = 
+/* Queue ops is an interface for running queue thread
+ * stop() operation is called just before queue gets destroyed */
+const struct ocf_queue_ops queue_ops = {
+ .kick_sync = vbdev_ocf_ctx_queue_kick,
+ .kick = vbdev_ocf_ctx_queue_kick,
+ .stop = vbdev_ocf_ctx_queue_stop,
+};
+
+/* Called on cache vbdev creation on every thread
+ * We allocate an OCF queue and an SPDK poller for it here */
+static int
+io_device_create_cb(void *io_device, void *ctx_buf)
+{
+ struct vbdev_ocf *vbdev = io_device;
+ struct vbdev_ocf_qctx *qctx = ctx_buf;
+ int rc;
+
+ rc = vbdev_ocf_queue_create(vbdev->ocf_cache, &qctx->queue, &queue_ops);
+ if (rc) {
+ return rc;
+ }
+
+ ocf_queue_set_priv(qctx->queue, qctx);
+
+ qctx->vbdev = vbdev;
+ qctx->cache_ch = spdk_bdev_get_io_channel(vbdev->cache.desc);
+ qctx->core_ch = spdk_bdev_get_io_channel(vbdev->core.desc);
+ qctx->poller = SPDK_POLLER_REGISTER(queue_poll, qctx, 0);
+
+ return rc;
+}
+
+/* Called per thread
+ * Put the OCF queue and relaunch the poller with a new context to finish pending requests */
+static void
+io_device_destroy_cb(void *io_device, void *ctx_buf)
+{
+ /* Making a copy of context to use it after the io channel is destroyed */
+ struct vbdev_ocf_qctx *copy = malloc(sizeof(*copy));
+ struct vbdev_ocf_qctx *qctx = ctx_buf;
+
+ if (copy) {
+ ocf_queue_set_priv(qctx->queue, copy);
+ memcpy(copy, qctx, sizeof(*copy));
+ spdk_poller_unregister(&qctx->poller);
+ copy->poller = SPDK_POLLER_REGISTER(queue_poll, copy, 0);
+ copy->allocated = true;
+ } else {
+ SPDK_ERRLOG("Unable to stop OCF queue properly: %s\n",
+ spdk_strerror(ENOMEM));
+ }
+
+ vbdev_ocf_queue_put(qctx->queue);
+}
+
+/* OCF management queue deinitialization */
+static void
+vbdev_ocf_ctx_mngt_queue_stop(ocf_queue_t q)
+{
+ struct spdk_poller *poller = ocf_queue_get_priv(q);
+
+ if (poller) {
+ spdk_poller_unregister(&poller);
+ }
+}
+
+static int
+mngt_queue_poll(void *opaque)
+{
+ ocf_queue_t q = opaque;
+ uint32_t iono = ocf_queue_pending_io(q);
+ int i, max = spdk_min(32, iono);
+
+ for (i = 0; i < max; i++) {
+ ocf_queue_run_single(q);
+ }
+
+ if (iono > 0) {
+ return SPDK_POLLER_BUSY;
+ } else {
+ return SPDK_POLLER_IDLE;
+ }
+}
+
+static void
+vbdev_ocf_ctx_mngt_queue_kick(ocf_queue_t q)
+{
+}
+
+/* Queue ops is an interface for running queue thread
+ * stop() operation is called just before queue gets destroyed */
+const struct ocf_queue_ops mngt_queue_ops = {
+ .kick_sync = NULL,
+ .kick = vbdev_ocf_ctx_mngt_queue_kick,
+ .stop = vbdev_ocf_ctx_mngt_queue_stop,
+};
+
+static void
+vbdev_ocf_mngt_exit(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int rc)
+{
+ vbdev->state.starting = false;
+ vbdev_ocf_mngt_stop(vbdev, rollback_path, rc);
+}
+
+/* Create exported spdk object */
+static void
+finish_register(struct vbdev_ocf *vbdev)
+{
+ int result;
+
+ /* Copy properties of the base bdev */
+ vbdev->exp_bdev.blocklen = vbdev->core.bdev->blocklen;
+ vbdev->exp_bdev.write_cache = vbdev->core.bdev->write_cache;
+ vbdev->exp_bdev.required_alignment = vbdev->core.bdev->required_alignment;
+
+ vbdev->exp_bdev.name = vbdev->name;
+ vbdev->exp_bdev.product_name = "SPDK OCF";
+
+ vbdev->exp_bdev.blockcnt = vbdev->core.bdev->blockcnt;
+ vbdev->exp_bdev.ctxt = vbdev;
+ vbdev->exp_bdev.fn_table = &cache_dev_fn_table;
+ vbdev->exp_bdev.module = &ocf_if;
+
+ /* Finally register vbdev in SPDK */
+ spdk_io_device_register(vbdev, io_device_create_cb, io_device_destroy_cb,
+ sizeof(struct vbdev_ocf_qctx), vbdev->name);
+ result = spdk_bdev_register(&vbdev->exp_bdev);
+ if (result) {
+ SPDK_ERRLOG("Could not register exposed bdev %s\n",
+ vbdev->name);
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, result);
+ return;
+ } else {
+ vbdev->state.started = true;
+ }
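+ /* From this point the exposed bdev is visible to upper layers; every
+ * SPDK thread that opens an I/O channel on it gets its own OCF queue
+ * and poller through io_device_create_cb() registered above. */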
+
+ vbdev_ocf_mngt_continue(vbdev, result);
+}
+
+static void
+add_core_cmpl(ocf_cache_t cache, ocf_core_t core, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = priv;
+
+ ocf_mngt_cache_unlock(cache);
+
+ if (error) {
+ SPDK_ERRLOG("Error %d, failed to add core device to cache instance %s, "
+ "starting rollback\n", error, vbdev->name);
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error);
+ return;
+ } else {
+ vbdev->ocf_core = core;
+ }
+
+ vbdev_ocf_mngt_continue(vbdev, error);
+}
+
+/* Try to lock cache, then add core */
+static void
+add_core_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv;
+
+ if (error) {
+ SPDK_ERRLOG("Error %d, cannot lock cache instance %s, "
+ "starting rollback\n", error, vbdev->name);
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error);
+ return;
+ }
+ ocf_mngt_cache_add_core(vbdev->ocf_cache, &vbdev->cfg.core, add_core_cmpl, vbdev);
+}
+
+/* Add core for existing OCF cache instance */
+static void
+add_core(struct vbdev_ocf *vbdev)
+{
+ ocf_mngt_cache_lock(vbdev->ocf_cache, add_core_cache_lock_cmpl, vbdev);
+}
+
+static void
+start_cache_cmpl(ocf_cache_t cache, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = priv;
+
+ ocf_mngt_cache_unlock(cache);
+
+ if (error) {
+ SPDK_ERRLOG("Error %d during start of cache %s, starting rollback\n",
+ error, vbdev->name);
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error);
+ return;
+ }
+
+ vbdev_ocf_mngt_continue(vbdev, error);
+}
+
+static int
+create_management_queue(struct vbdev_ocf *vbdev)
+{
+ struct spdk_poller *mngt_poller;
+ int rc;
+
+ rc = vbdev_ocf_queue_create(vbdev->ocf_cache, &vbdev->cache_ctx->mngt_queue, &mngt_queue_ops);
+ if (rc) {
+ SPDK_ERRLOG("Unable to create mngt_queue: %d\n", rc);
+ return rc;
+ }
+
+ mngt_poller = SPDK_POLLER_REGISTER(mngt_queue_poll, vbdev->cache_ctx->mngt_queue, 100);
+ if (mngt_poller == NULL) {
+ SPDK_ERRLOG("Unable to initiate mngt request: %s", spdk_strerror(ENOMEM));
+ return -ENOMEM;
+ }
+
+ ocf_queue_set_priv(vbdev->cache_ctx->mngt_queue, mngt_poller);
+ ocf_mngt_cache_set_mngt_queue(vbdev->ocf_cache, vbdev->cache_ctx->mngt_queue);
+
+ return 0;
+}
+
+/* Start OCF cache, attach caching device */
+static void
+start_cache(struct vbdev_ocf *vbdev)
+{
+ ocf_cache_t existing;
+ int rc;
+
+ if (is_ocf_cache_running(vbdev)) {
+ vbdev_ocf_mngt_stop(vbdev, NULL, -EALREADY);
+ return;
+ }
+
+ existing = get_other_cache_instance(vbdev);
+ if (existing) {
+ SPDK_NOTICELOG("OCF bdev %s connects to existing cache device %s\n",
+ vbdev->name, vbdev->cache.name);
+ vbdev->ocf_cache = existing;
+ ocf_mngt_cache_get(vbdev->ocf_cache);
+ vbdev->cache_ctx = ocf_cache_get_priv(existing);
+ vbdev_ocf_cache_ctx_get(vbdev->cache_ctx);
+ vbdev_ocf_mngt_continue(vbdev, 0);
+ return;
+ }
+
+ vbdev->cache_ctx = calloc(1, sizeof(struct vbdev_ocf_cache_ctx));
+ if (vbdev->cache_ctx == NULL) {
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, -ENOMEM);
+ return;
+ }
+
+ vbdev_ocf_cache_ctx_get(vbdev->cache_ctx);
+ pthread_mutex_init(&vbdev->cache_ctx->lock, NULL);
+
+ rc = ocf_mngt_cache_start(vbdev_ocf_ctx, &vbdev->ocf_cache, &vbdev->cfg.cache);
+ if (rc) {
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, rc);
+ return;
+ }
+ ocf_mngt_cache_get(vbdev->ocf_cache);
+
+ ocf_cache_set_priv(vbdev->ocf_cache, vbdev->cache_ctx);
+
+ rc = create_management_queue(vbdev);
+ if (rc) {
+ SPDK_ERRLOG("Unable to create mngt_queue: %d\n", rc);
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, rc);
+ return;
+ }
+
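+ /* Two ways to bring the cache up: with cfg.loadq set we load an existing
+ * cache instance (and its cores) back from metadata already present on
+ * the cache device; otherwise we attach the device as a brand new cache. */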
+ if (vbdev->cfg.loadq) {
+ ocf_mngt_cache_load(vbdev->ocf_cache, &vbdev->cfg.device, start_cache_cmpl, vbdev);
+ } else {
+ ocf_mngt_cache_attach(vbdev->ocf_cache, &vbdev->cfg.device, start_cache_cmpl, vbdev);
+ }
+}
+
+/* Procedures called during register operation */
+vbdev_ocf_mngt_fn register_path[] = {
+ start_cache,
+ add_core,
+ finish_register,
+ NULL
+};
+
+/* Start cache instance and register OCF bdev */
+static void
+register_vbdev(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_callback cb, void *cb_arg)
+{
+ int rc;
+
+ if (!(vbdev->core.attached && vbdev->cache.attached) || vbdev->state.started) {
+ cb(-EPERM, vbdev, cb_arg);
+ return;
+ }
+
+ vbdev->state.starting = true;
+ rc = vbdev_ocf_mngt_start(vbdev, register_path, cb, cb_arg);
+ if (rc) {
+ cb(rc, vbdev, cb_arg);
+ }
+}
+
+/* Init OCF configuration options
+ * for core and cache devices */
+static void
+init_vbdev_config(struct vbdev_ocf *vbdev)
+{
+ struct vbdev_ocf_config *cfg = &vbdev->cfg;
+
+ snprintf(cfg->cache.name, sizeof(cfg->cache.name), "%s", vbdev->name);
+ snprintf(cfg->core.name, sizeof(cfg->core.name), "%s", vbdev->core.name);
+
+ /* TODO [metadata]: make configurable with persistent
+ * metadata support */
+ cfg->cache.metadata_volatile = false;
+
+ /* TODO [cache line size]: make cache line size configurable
+ * Using standard 4KiB for now */
+ cfg->cache.cache_line_size = ocf_cache_line_size_4;
+
+ /* These are suggested values that
+ * should be sufficient for most use cases */
+ cfg->cache.backfill.max_queue_size = 65536;
+ cfg->cache.backfill.queue_unblock_size = 60000;
+
+ /* TODO [cache line size] */
+ cfg->device.cache_line_size = ocf_cache_line_size_4;
+ cfg->device.force = true;
+ cfg->device.perform_test = false;
+ cfg->device.discard_on_start = false;
+
+ vbdev->cfg.cache.locked = true;
+
+ cfg->core.volume_type = SPDK_OBJECT;
+ cfg->device.volume_type = SPDK_OBJECT;
+
+ if (vbdev->cfg.loadq) {
+ /* When doing cache_load(), we need to set try_add to true,
+ * otherwise OCF will interpret this core as new
+ * instead of the inactive one */
+ vbdev->cfg.core.try_add = true;
+ }
+
+ /* Serialize bdev names in OCF UUID to interpret on future loads
+ * Core UUID is a triple of (core name, vbdev name, cache name)
+ * Cache UUID is cache bdev name
+ * The triple is first packed space-separated; the two spaces are then
+ * overwritten with NUL bytes below so each name can be read back separately */
+ cfg->device.uuid.size = strlen(vbdev->cache.name) + 1;
+ cfg->device.uuid.data = vbdev->cache.name;
+
+ snprintf(vbdev->uuid, VBDEV_OCF_MD_MAX_LEN, "%s %s %s",
+ vbdev->core.name, vbdev->name, vbdev->cache.name);
+ cfg->core.uuid.size = strlen(vbdev->uuid) + 1;
+ cfg->core.uuid.data = vbdev->uuid;
+ vbdev->uuid[strlen(vbdev->core.name)] = 0;
+ vbdev->uuid[strlen(vbdev->core.name) + 1 + strlen(vbdev->name)] = 0;
+}
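+/* For illustration (hypothetical names): with core "Malloc0", vbdev "MyOCF"
+ * and cache "Nvme0n1", the packed core UUID buffer above ends up as
+ * "Malloc0\0MyOCF\0Nvme0n1"; metadata_probe_cores_construct() later walks
+ * it with strlen() + 1 offsets to recover the three names. */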
+
+/* Allocate vbdev structure object and add it to the global list */
+static int
+init_vbdev(const char *vbdev_name,
+ const char *cache_mode_name,
+ const char *cache_name,
+ const char *core_name,
+ bool loadq)
+{
+ struct vbdev_ocf *vbdev;
+ int rc = 0;
+
+ if (spdk_bdev_get_by_name(vbdev_name) || vbdev_ocf_get_by_name(vbdev_name)) {
+ SPDK_ERRLOG("Device with name '%s' already exists\n", vbdev_name);
+ return -EPERM;
+ }
+
+ vbdev = calloc(1, sizeof(*vbdev));
+ if (!vbdev) {
+ goto error_mem;
+ }
+
+ vbdev->cache.parent = vbdev;
+ vbdev->core.parent = vbdev;
+ vbdev->cache.is_cache = true;
+ vbdev->core.is_cache = false;
+
+ if (cache_mode_name) {
+ vbdev->cfg.cache.cache_mode
+ = ocf_get_cache_mode(cache_mode_name);
+ } else if (!loadq) { /* In load path it is OK to pass NULL as cache mode */
+ SPDK_ERRLOG("No cache mode specified\n");
+ rc = -EINVAL;
+ goto error_free;
+ }
+ if (vbdev->cfg.cache.cache_mode < 0) {
+ SPDK_ERRLOG("Incorrect cache mode '%s'\n", cache_mode_name);
+ rc = -EINVAL;
+ goto error_free;
+ }
+
+ vbdev->name = strdup(vbdev_name);
+ if (!vbdev->name) {
+ goto error_mem;
+ }
+
+ vbdev->cache.name = strdup(cache_name);
+ if (!vbdev->cache.name) {
+ goto error_mem;
+ }
+
+ vbdev->core.name = strdup(core_name);
+ if (!vbdev->core.name) {
+ goto error_mem;
+ }
+
+ vbdev->cfg.loadq = loadq;
+ init_vbdev_config(vbdev);
+ TAILQ_INSERT_TAIL(&g_ocf_vbdev_head, vbdev, tailq);
+ return rc;
+
+error_mem:
+ rc = -ENOMEM;
+error_free:
+ free_vbdev(vbdev);
+ return rc;
+}
+
+/* Read configuration file at the start of SPDK application
+ * This adds vbdevs to the global list if any are mentioned in the config */
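+/* For example (hypothetical bdev names, exact syntax depends on the legacy
+ * INI-style config format), the section parsed below could look like:
+ * [OCF]
+ *   OCF MyOCF wt Nvme0n1 Malloc0
+ * i.e. one "OCF <vbdev> <mode> <cache bdev> <core bdev>" line per vbdev. */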
+static int
+vbdev_ocf_init(void)
+{
+ const char *vbdev_name, *modename, *cache_name, *core_name;
+ struct spdk_conf_section *sp;
+ int status;
+
+ status = vbdev_ocf_ctx_init();
+ if (status) {
+ SPDK_ERRLOG("OCF ctx initialization failed with=%d\n", status);
+ return status;
+ }
+
+ status = vbdev_ocf_volume_init();
+ if (status) {
+ vbdev_ocf_ctx_cleanup();
+ SPDK_ERRLOG("OCF volume initialization failed with=%d\n", status);
+ return status;
+ }
+
+ sp = spdk_conf_find_section(NULL, "OCF");
+ if (sp == NULL) {
+ return 0;
+ }
+
+ for (int i = 0; ; i++) {
+ if (!spdk_conf_section_get_nval(sp, "OCF", i)) {
+ break;
+ }
+
+ vbdev_name = spdk_conf_section_get_nmval(sp, "OCF", i, 0);
+ if (!vbdev_name) {
+ SPDK_ERRLOG("No vbdev name specified\n");
+ continue;
+ }
+
+ modename = spdk_conf_section_get_nmval(sp, "OCF", i, 1);
+ if (!modename) {
+ SPDK_ERRLOG("No modename specified for OCF vbdev '%s'\n", vbdev_name);
+ continue;
+ }
+
+ cache_name = spdk_conf_section_get_nmval(sp, "OCF", i, 2);
+ if (!cache_name) {
+ SPDK_ERRLOG("No cache device specified for OCF vbdev '%s'\n", vbdev_name);
+ continue;
+ }
+
+ core_name = spdk_conf_section_get_nmval(sp, "OCF", i, 3);
+ if (!core_name) {
+ SPDK_ERRLOG("No core device specified for OCF vbdev '%s'\n", vbdev_name);
+ continue;
+ }
+
+ status = init_vbdev(vbdev_name, modename, cache_name, core_name, false);
+ if (status) {
+ SPDK_ERRLOG("Config initialization failed with code: %d\n", status);
+ }
+ }
+
+ return status;
+}
+
+/* Called after application shutdown has started
+ * Release memory of allocated structures here */
+static void
+vbdev_ocf_module_fini(void)
+{
+ struct vbdev_ocf *vbdev;
+
+ while ((vbdev = TAILQ_FIRST(&g_ocf_vbdev_head))) {
+ TAILQ_REMOVE(&g_ocf_vbdev_head, vbdev, tailq);
+ free_vbdev(vbdev);
+ }
+
+ vbdev_ocf_volume_cleanup();
+ vbdev_ocf_ctx_cleanup();
+}
+
+/* When a base device gets unplugged this is called
+ * We will unregister the cache vbdev here
+ * When a cache device is removed, we delete every OCF bdev that used it */
+static void
+hotremove_cb(void *ctx)
+{
+ struct vbdev_ocf_base *base = ctx;
+ struct vbdev_ocf *vbdev;
+
+ if (!base->is_cache) {
+ if (base->parent->state.doing_finish) {
+ return;
+ }
+
+ SPDK_NOTICELOG("Deinitializing '%s' because its core device '%s' was removed\n",
+ base->parent->name, base->name);
+ vbdev_ocf_delete(base->parent, NULL, NULL);
+ return;
+ }
+
+ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) {
+ if (vbdev->state.doing_finish) {
+ continue;
+ }
+ if (strcmp(base->name, vbdev->cache.name) == 0) {
+ SPDK_NOTICELOG("Deinitializing '%s' because"
+ " its cache device '%s' was removed\n",
+ vbdev->name, base->name);
+ vbdev_ocf_delete(vbdev, NULL, NULL);
+ }
+ }
+}
+
+/* Open base SPDK bdev and claim it */
+static int
+attach_base(struct vbdev_ocf_base *base)
+{
+ int status;
+
+ if (base->attached) {
+ return -EALREADY;
+ }
+
+ /* If base cache bdev was already opened by other vbdev,
+ * we just copy its descriptor here */
+ if (base->is_cache) {
+ struct vbdev_ocf_base *existing = get_other_cache_base(base);
+ if (existing) {
+ base->desc = existing->desc;
+ base->management_channel = existing->management_channel;
+ base->attached = true;
+ return 0;
+ }
+ }
+
+ status = spdk_bdev_open(base->bdev, true, hotremove_cb, base, &base->desc);
+ if (status) {
+ SPDK_ERRLOG("Unable to open device '%s' for writing\n", base->name);
+ return status;
+ }
+
+ status = spdk_bdev_module_claim_bdev(base->bdev, base->desc,
+ &ocf_if);
+ if (status) {
+ SPDK_ERRLOG("Unable to claim device '%s'\n", base->name);
+ spdk_bdev_close(base->desc);
+ return status;
+ }
+
+ base->management_channel = spdk_bdev_get_io_channel(base->desc);
+ if (!base->management_channel) {
+ SPDK_ERRLOG("Unable to get io channel '%s'\n", base->name);
+ spdk_bdev_module_release_bdev(base->bdev);
+ spdk_bdev_close(base->desc);
+ return -ENOMEM;
+ }
+
+ /* Save the thread where the base device is opened */
+ base->thread = spdk_get_thread();
+
+ base->attached = true;
+ return status;
+}
+
+/* Attach base bdevs */
+static int
+attach_base_bdevs(struct vbdev_ocf *vbdev,
+ struct spdk_bdev *cache_bdev,
+ struct spdk_bdev *core_bdev)
+{
+ int rc = 0;
+
+ if (cache_bdev) {
+ vbdev->cache.bdev = cache_bdev;
+ rc |= attach_base(&vbdev->cache);
+ }
+
+ if (core_bdev) {
+ vbdev->core.bdev = core_bdev;
+ rc |= attach_base(&vbdev->core);
+ }
+
+ return rc;
+}
+
+/* Init and then start vbdev if all base devices are present */
+void
+vbdev_ocf_construct(const char *vbdev_name,
+ const char *cache_mode_name,
+ const char *cache_name,
+ const char *core_name,
+ bool loadq,
+ void (*cb)(int, struct vbdev_ocf *, void *),
+ void *cb_arg)
+{
+ int rc;
+ struct spdk_bdev *cache_bdev = spdk_bdev_get_by_name(cache_name);
+ struct spdk_bdev *core_bdev = spdk_bdev_get_by_name(core_name);
+ struct vbdev_ocf *vbdev;
+
+ rc = init_vbdev(vbdev_name, cache_mode_name, cache_name, core_name, loadq);
+ if (rc) {
+ cb(rc, NULL, cb_arg);
+ return;
+ }
+
+ vbdev = vbdev_ocf_get_by_name(vbdev_name);
+ if (vbdev == NULL) {
+ cb(-ENODEV, NULL, cb_arg);
+ return;
+ }
+
+ if (cache_bdev == NULL) {
+ SPDK_NOTICELOG("OCF bdev '%s' is waiting for cache device '%s' to connect\n",
+ vbdev->name, cache_name);
+ }
+ if (core_bdev == NULL) {
+ SPDK_NOTICELOG("OCF bdev '%s' is waiting for core device '%s' to connect\n",
+ vbdev->name, core_name);
+ }
+
+ rc = attach_base_bdevs(vbdev, cache_bdev, core_bdev);
+ if (rc) {
+ cb(rc, vbdev, cb_arg);
+ return;
+ }
+
+ if (core_bdev && cache_bdev) {
+ register_vbdev(vbdev, cb, cb_arg);
+ } else {
+ cb(0, vbdev, cb_arg);
+ }
+}
+
+/* This is called when a new device is created in the SPDK application
+ * If that device is named as one of the base bdevs of an OCF vbdev,
+ * claim and open it */
+static void
+vbdev_ocf_examine(struct spdk_bdev *bdev)
+{
+ const char *bdev_name = spdk_bdev_get_name(bdev);
+ struct vbdev_ocf *vbdev;
+
+ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) {
+ if (vbdev->state.doing_finish) {
+ continue;
+ }
+
+ if (!strcmp(bdev_name, vbdev->cache.name)) {
+ attach_base_bdevs(vbdev, bdev, NULL);
+ continue;
+ }
+ if (!strcmp(bdev_name, vbdev->core.name)) {
+ attach_base_bdevs(vbdev, NULL, bdev);
+ break;
+ }
+ }
+ spdk_bdev_module_examine_done(&ocf_if);
+}
+
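+/* Context shared by all asynchronous steps of the metadata probe below;
+ * refcnt drops to zero once the last step finishes, at which point
+ * examine_ctx_put() reports examine_done() and frees everything. */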
+struct metadata_probe_ctx {
+ struct vbdev_ocf_base base;
+ ocf_volume_t volume;
+
+ struct ocf_volume_uuid *core_uuids;
+ unsigned int uuid_count;
+
+ int result;
+ int refcnt;
+};
+
+static void
+_examine_ctx_put(void *ctx)
+{
+ struct spdk_bdev_desc *desc = ctx;
+
+ spdk_bdev_close(desc);
+}
+
+static void
+examine_ctx_put(struct metadata_probe_ctx *ctx)
+{
+ unsigned int i;
+
+ ctx->refcnt--;
+ if (ctx->refcnt > 0) {
+ return;
+ }
+
+ if (ctx->result) {
+ SPDK_ERRLOG("OCF metadata probe for bdev '%s' failed with %d\n",
+ spdk_bdev_get_name(ctx->base.bdev), ctx->result);
+ }
+
+ if (ctx->base.desc) {
+ /* Close the underlying bdev on the same thread on which it was opened. */
+ if (ctx->base.thread && ctx->base.thread != spdk_get_thread()) {
+ spdk_thread_send_msg(ctx->base.thread, _examine_ctx_put, ctx->base.desc);
+ } else {
+ spdk_bdev_close(ctx->base.desc);
+ }
+ }
+
+ if (ctx->volume) {
+ ocf_volume_destroy(ctx->volume);
+ }
+
+ if (ctx->core_uuids) {
+ for (i = 0; i < ctx->uuid_count; i++) {
+ free(ctx->core_uuids[i].data);
+ }
+ }
+ free(ctx->core_uuids);
+
+ examine_done(ctx->result, NULL, ctx->base.bdev);
+ free(ctx);
+}
+
+static void
+metadata_probe_construct_cb(int rc, struct vbdev_ocf *vbdev, void *vctx)
+{
+ struct metadata_probe_ctx *ctx = vctx;
+
+ examine_ctx_put(ctx);
+}
+
+/* This is the second callback for ocf_metadata_probe_cores()
+ * Here we create vbdev configurations based on the UUIDs */
+static void
+metadata_probe_cores_construct(void *priv, int error, unsigned int num_cores)
+{
+ struct metadata_probe_ctx *ctx = priv;
+ const char *vbdev_name;
+ const char *core_name;
+ const char *cache_name;
+ unsigned int i;
+
+ if (error) {
+ ctx->result = error;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ for (i = 0; i < num_cores; i++) {
+ core_name = ocf_uuid_to_str(&ctx->core_uuids[i]);
+ vbdev_name = core_name + strlen(core_name) + 1;
+ cache_name = vbdev_name + strlen(vbdev_name) + 1;
+
+ if (strcmp(ctx->base.bdev->name, cache_name)) {
+ SPDK_NOTICELOG("OCF metadata found on %s belongs to bdev named '%s'\n",
+ ctx->base.bdev->name, cache_name);
+ }
+
+ ctx->refcnt++;
+ vbdev_ocf_construct(vbdev_name, NULL, cache_name, core_name, true,
+ metadata_probe_construct_cb, ctx);
+ }
+
+ examine_ctx_put(ctx);
+}
+
+/* This callback is called after OCF reads core UUIDs from cache metadata
+ * Here we allocate memory for those UUIDs and call ocf_metadata_probe_cores() again */
+static void
+metadata_probe_cores_get_num(void *priv, int error, unsigned int num_cores)
+{
+ struct metadata_probe_ctx *ctx = priv;
+ unsigned int i;
+
+ if (error) {
+ ctx->result = error;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ ctx->uuid_count = num_cores;
+ ctx->core_uuids = calloc(num_cores, sizeof(struct ocf_volume_uuid));
+ if (!ctx->core_uuids) {
+ ctx->result = -ENOMEM;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ for (i = 0; i < ctx->uuid_count; i++) {
+ ctx->core_uuids[i].size = OCF_VOLUME_UUID_MAX_SIZE;
+ ctx->core_uuids[i].data = malloc(OCF_VOLUME_UUID_MAX_SIZE);
+ if (!ctx->core_uuids[i].data) {
+ ctx->result = -ENOMEM;
+ examine_ctx_put(ctx);
+ return;
+ }
+ }
+
+ ocf_metadata_probe_cores(vbdev_ocf_ctx, ctx->volume, ctx->core_uuids, ctx->uuid_count,
+ metadata_probe_cores_construct, ctx);
+}
+
+static void
+metadata_probe_cb(void *priv, int rc,
+ struct ocf_metadata_probe_status *status)
+{
+ struct metadata_probe_ctx *ctx = priv;
+
+ if (rc) {
+ /* -ENODATA means device does not have cache metadata on it */
+ if (rc != -OCF_ERR_NO_METADATA) {
+ ctx->result = rc;
+ }
+ examine_ctx_put(ctx);
+ return;
+ }
+
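+ /* First pass of the two-pass core probe: passing a NULL uuid array asks
+ * OCF only for the number of cores; metadata_probe_cores_get_num() then
+ * allocates the UUID buffers and repeats the call to actually fetch them. */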
+ ocf_metadata_probe_cores(vbdev_ocf_ctx, ctx->volume, NULL, 0,
+ metadata_probe_cores_get_num, ctx);
+}
+
+/* This is called after vbdev_ocf_examine
+ * It allows us to delay application initialization
+ * until all OCF bdevs are registered
+ * If vbdev has all of its base devices it starts asynchronously here
+ * We first check if the bdev appears in the configuration,
+ * if not we do metadata_probe() to create its configuration from bdev metadata */
+static void
+vbdev_ocf_examine_disk(struct spdk_bdev *bdev)
+{
+ const char *bdev_name = spdk_bdev_get_name(bdev);
+ struct vbdev_ocf *vbdev;
+ struct metadata_probe_ctx *ctx;
+ bool created_from_config = false;
+ int rc;
+
+ examine_start(bdev);
+
+ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) {
+ if (vbdev->state.doing_finish || vbdev->state.started) {
+ continue;
+ }
+
+ if (!strcmp(bdev_name, vbdev->cache.name)) {
+ examine_start(bdev);
+ register_vbdev(vbdev, examine_done, bdev);
+ created_from_config = true;
+ continue;
+ }
+ if (!strcmp(bdev_name, vbdev->core.name)) {
+ examine_start(bdev);
+ register_vbdev(vbdev, examine_done, bdev);
+ examine_done(0, NULL, bdev);
+ return;
+ }
+ }
+
+ /* If the device was discovered via config we do not check for metadata */
+ if (created_from_config) {
+ examine_done(0, NULL, bdev);
+ return;
+ }
+
+ /* Metadata probe path
+ * We create a temporary OCF volume and a temporary base structure
+ * to use them for ocf_metadata_probe() and for bottom adapter IOs
+ * Then we get UUIDs of core devices and create configurations based on them */
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ examine_done(-ENOMEM, NULL, bdev);
+ return;
+ }
+
+ ctx->base.bdev = bdev;
+ ctx->refcnt = 1;
+
+ rc = spdk_bdev_open(ctx->base.bdev, true, NULL, NULL, &ctx->base.desc);
+ if (rc) {
+ ctx->result = rc;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ rc = ocf_ctx_volume_create(vbdev_ocf_ctx, &ctx->volume, NULL, SPDK_OBJECT);
+ if (rc) {
+ ctx->result = rc;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ rc = ocf_volume_open(ctx->volume, &ctx->base);
+ if (rc) {
+ ctx->result = rc;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ /* Save the thread where the base device is opened */
+ ctx->base.thread = spdk_get_thread();
+
+ ocf_metadata_probe(vbdev_ocf_ctx, ctx->volume, metadata_probe_cb, ctx);
+}
+
+static int
+vbdev_ocf_get_ctx_size(void)
+{
+ return sizeof(struct bdev_ocf_data);
+}
+
+static void
+fini_start(void)
+{
+ g_fini_started = true;
+}
+
+/* Module-global function table
+ * Does not relate to vbdev instances */
+static struct spdk_bdev_module ocf_if = {
+ .name = "ocf",
+ .module_init = vbdev_ocf_init,
+ .fini_start = fini_start,
+ .module_fini = vbdev_ocf_module_fini,
+ .config_text = NULL,
+ .get_ctx_size = vbdev_ocf_get_ctx_size,
+ .examine_config = vbdev_ocf_examine,
+ .examine_disk = vbdev_ocf_examine_disk,
+};
+SPDK_BDEV_MODULE_REGISTER(ocf, &ocf_if);
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_ocf", SPDK_TRACE_VBDEV_OCF)
diff --git a/src/spdk/module/bdev/ocf/vbdev_ocf.h b/src/spdk/module/bdev/ocf/vbdev_ocf.h
new file mode 100644
index 000000000..d0fd0b183
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/vbdev_ocf.h
@@ -0,0 +1,210 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_OCF_H
+#define SPDK_VBDEV_OCF_H
+
+#include <ocf/ocf.h>
+
+#include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
+
+#define VBDEV_OCF_MD_MAX_LEN 4096
+
+struct vbdev_ocf;
+
+/* Context for OCF queue poller
+ * Used for mapping SPDK threads to OCF queues */
+struct vbdev_ocf_qctx {
+ /* OCF queue. Contains OCF requests */
+ struct ocf_queue *queue;
+ /* Poller for OCF queue. Runs OCF requests */
+ struct spdk_poller *poller;
+ /* Reference to parent vbdev */
+ struct vbdev_ocf *vbdev;
+ /* Base devices channels */
+ struct spdk_io_channel *cache_ch;
+ struct spdk_io_channel *core_ch;
+ /* If true, we have to free this context on queue stop */
+ bool allocated;
+ /* Link to per-bdev list of queue contexts */
+ TAILQ_ENTRY(vbdev_ocf_qctx) tailq;
+};
+
+/* Important states */
+struct vbdev_ocf_state {
+ /* From the moment when clean delete started */
+ bool doing_clean_delete;
+ /* From the moment when finish started */
+ bool doing_finish;
+ /* From the moment when a reset IO is received, until it is completed */
+ bool doing_reset;
+ /* From the moment when exp_bdev is registered */
+ bool started;
+ /* From the moment when register path started */
+ bool starting;
+ /* Status of last attempt for stopping this device */
+ int stop_status;
+};
+
+/*
+ * OCF cache configuration options
+ */
+struct vbdev_ocf_config {
+ /* Initial cache configuration */
+ struct ocf_mngt_cache_config cache;
+
+ /* Cache device config */
+ struct ocf_mngt_cache_device_config device;
+
+ /* Core initial config */
+ struct ocf_mngt_core_config core;
+
+ /* Load flag, if set to true, then we will try to load the cache instance from disk,
+ * otherwise we will create a new cache on that disk */
+ bool loadq;
+};
+
+/* Types for management operations */
+typedef void (*vbdev_ocf_mngt_fn)(struct vbdev_ocf *);
+typedef void (*vbdev_ocf_mngt_callback)(int, struct vbdev_ocf *, void *);
+
+/* Context for asynchronous management operations
+ * Single management operation usually contains a list of sub procedures,
+ * this structure handles sharing between those sub procedures */
+struct vbdev_ocf_mngt_ctx {
+ /* Pointer to function that is currently being executed
+ * It gets incremented on each step until it dereferences to NULL */
+ vbdev_ocf_mngt_fn *current_step;
+
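+ /* Illustration: for register_path the steps are start_cache -> add_core ->
+ * finish_register; each step calls vbdev_ocf_mngt_continue() to advance
+ * current_step, or vbdev_ocf_mngt_exit() to unwind through a rollback path. */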
+ /* Function that gets invoked by poller on each iteration */
+ vbdev_ocf_mngt_fn poller_fn;
+ /* Poller timeout time stamp - when the poller should stop with error */
+ uint64_t timeout_ts;
+
+ /* Status of management operation */
+ int status;
+
+ /* External callback and its argument */
+ vbdev_ocf_mngt_callback cb;
+ void *cb_arg;
+};
+
+/* Base device info */
+struct vbdev_ocf_base {
+ /* OCF internal name */
+ char *name;
+
+ /* True if this is a caching device */
+ bool is_cache;
+
+ /* Connected SPDK block device */
+ struct spdk_bdev *bdev;
+
+ /* SPDK device io handle */
+ struct spdk_bdev_desc *desc;
+
+ /* True if SPDK bdev has been claimed and opened for writing */
+ bool attached;
+
+ /* Channel for cleaner operations */
+ struct spdk_io_channel *management_channel;
+
+ /* Reference to main vbdev */
+ struct vbdev_ocf *parent;
+
+ /* Thread where the base device was opened */
+ struct spdk_thread *thread;
+};
+
+/*
+ * The main information provider
+ * It's also registered as io_device
+ */
+struct vbdev_ocf {
+ /* Exposed unique name */
+ char *name;
+
+ /* Base bdevs */
+ struct vbdev_ocf_base cache;
+ struct vbdev_ocf_base core;
+
+ /* Base bdevs OCF objects */
+ ocf_cache_t ocf_cache;
+ ocf_core_t ocf_core;
+
+ /* Parameters */
+ struct vbdev_ocf_config cfg;
+ struct vbdev_ocf_state state;
+
+ /* Management context */
+ struct vbdev_ocf_mngt_ctx mngt_ctx;
+ /* Cache context */
+ struct vbdev_ocf_cache_ctx *cache_ctx;
+
+ /* Exposed SPDK bdev. Registered in bdev layer */
+ struct spdk_bdev exp_bdev;
+
+ /* OCF uuid for core device of this vbdev */
+ char uuid[VBDEV_OCF_MD_MAX_LEN];
+
+ /* Link to global list of this type structures */
+ TAILQ_ENTRY(vbdev_ocf) tailq;
+};
+
+void vbdev_ocf_construct(
+ const char *vbdev_name,
+ const char *cache_mode_name,
+ const char *cache_name,
+ const char *core_name,
+ bool loadq,
+ void (*cb)(int, struct vbdev_ocf *, void *),
+ void *cb_arg);
+
+/* If vbdev is online, return its object */
+struct vbdev_ocf *vbdev_ocf_get_by_name(const char *name);
+
+/* Return matching base if parent vbdev is online */
+struct vbdev_ocf_base *vbdev_ocf_get_base_by_name(const char *name);
+
+/* Stop OCF cache and unregister SPDK bdev */
+int vbdev_ocf_delete(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg);
+
+int vbdev_ocf_delete_clean(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg);
+
+typedef void (*vbdev_ocf_foreach_fn)(struct vbdev_ocf *, void *);
+
+/* Execute fn for each OCF device that is online or waits for base devices */
+void vbdev_ocf_foreach(vbdev_ocf_foreach_fn fn, void *ctx);
+
+#endif
diff --git a/src/spdk/module/bdev/ocf/vbdev_ocf_rpc.c b/src/spdk/module/bdev/ocf/vbdev_ocf_rpc.c
new file mode 100644
index 000000000..89286fe23
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/vbdev_ocf_rpc.c
@@ -0,0 +1,362 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vbdev_ocf.h" +#include "stats.h" +#include "spdk/log.h" +#include "spdk/rpc.h" +#include "spdk/string.h" + +/* Structure to hold the parameters for this RPC method. */ +struct rpc_bdev_ocf_create { + char *name; /* master vbdev */ + char *mode; /* OCF mode (choose one) */ + char *cache_bdev_name; /* sub bdev */ + char *core_bdev_name; /* sub bdev */ +}; + +static void +free_rpc_bdev_ocf_create(struct rpc_bdev_ocf_create *r) +{ + free(r->name); + free(r->core_bdev_name); + free(r->cache_bdev_name); + free(r->mode); +} + +/* Structure to decode the input parameters for this RPC method. */ +static const struct spdk_json_object_decoder rpc_bdev_ocf_create_decoders[] = { + {"name", offsetof(struct rpc_bdev_ocf_create, name), spdk_json_decode_string}, + {"mode", offsetof(struct rpc_bdev_ocf_create, mode), spdk_json_decode_string}, + {"cache_bdev_name", offsetof(struct rpc_bdev_ocf_create, cache_bdev_name), spdk_json_decode_string}, + {"core_bdev_name", offsetof(struct rpc_bdev_ocf_create, core_bdev_name), spdk_json_decode_string}, +}; + +static void +construct_cb(int status, struct vbdev_ocf *vbdev, void *cb_arg) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + if (status) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Could not create OCF vbdev: %d", + status); + } else { + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, vbdev->name); + spdk_jsonrpc_end_result(request, w); + } +} + +static void +rpc_bdev_ocf_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_ocf_create req = {NULL}; + int ret; + + ret = spdk_json_decode_object(params, rpc_bdev_ocf_create_decoders, + SPDK_COUNTOF(rpc_bdev_ocf_create_decoders), + &req); + if (ret) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_bdev_ocf_create(&req); + return; + } + + vbdev_ocf_construct(req.name, req.mode, req.cache_bdev_name, req.core_bdev_name, false, + construct_cb, request); + free_rpc_bdev_ocf_create(&req); +} +SPDK_RPC_REGISTER("bdev_ocf_create", rpc_bdev_ocf_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ocf_create, construct_ocf_bdev) + +/* Structure to hold the parameters for this RPC method. 
*/ +struct rpc_bdev_ocf_delete { + char *name; /* master vbdev name */ +}; + +static void +free_rpc_bdev_ocf_delete(struct rpc_bdev_ocf_delete *r) +{ + free(r->name); +} + +/* Structure to decode the input parameters for this RPC method. */ +static const struct spdk_json_object_decoder rpc_bdev_ocf_delete_decoders[] = { + {"name", offsetof(struct rpc_bdev_ocf_delete, name), spdk_json_decode_string}, +}; + +static void +delete_cb(void *cb_arg, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + if (status) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Could not delete OCF vbdev: %d", + status); + } else { + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } +} + +static void +rpc_bdev_ocf_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_ocf_delete req = {NULL}; + struct vbdev_ocf *vbdev; + int status; + + status = spdk_json_decode_object(params, rpc_bdev_ocf_delete_decoders, + SPDK_COUNTOF(rpc_bdev_ocf_delete_decoders), + &req); + if (status) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto end; + } + + vbdev = vbdev_ocf_get_by_name(req.name); + if (vbdev == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(ENODEV)); + goto end; + } + + status = vbdev_ocf_delete_clean(vbdev, delete_cb, request); + if (status) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Could not delete OCF vbdev: %s", + spdk_strerror(-status)); + goto end; + } + +end: + free_rpc_bdev_ocf_delete(&req); +} +SPDK_RPC_REGISTER("bdev_ocf_delete", rpc_bdev_ocf_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ocf_delete, delete_ocf_bdev) + +/* Structure to hold the parameters for this RPC method. */ +struct rpc_bdev_ocf_get_stats { + char *name; /* master vbdev name */ +}; + +static void +free_rpc_bdev_ocf_get_stats(struct rpc_bdev_ocf_get_stats *r) +{ + free(r->name); +} + +/* Structure to decode the input parameters for this RPC method. 
*/ +static const struct spdk_json_object_decoder rpc_bdev_ocf_get_stats_decoders[] = { + {"name", offsetof(struct rpc_bdev_ocf_get_stats, name), spdk_json_decode_string}, +}; + +struct get_ocf_stats_ctx { + struct spdk_jsonrpc_request *request; + char *core_name; +}; + +static void +rpc_bdev_ocf_get_stats_cmpl(ocf_cache_t cache, void *priv, int error) +{ + struct get_ocf_stats_ctx *ctx = (struct get_ocf_stats_ctx *) priv; + struct spdk_json_write_ctx *w; + struct vbdev_ocf_stats stats; + + if (error) { + goto end; + } + + error = vbdev_ocf_stats_get(cache, ctx->core_name, &stats); + + ocf_mngt_cache_read_unlock(cache); + + if (error) { + goto end; + } + + w = spdk_jsonrpc_begin_result(ctx->request); + vbdev_ocf_stats_write_json(w, &stats); + spdk_jsonrpc_end_result(ctx->request, w); + +end: + if (error) { + spdk_jsonrpc_send_error_response_fmt(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Could not get stats: %s", + spdk_strerror(-error)); + } + free(ctx); +} + +static void +rpc_bdev_ocf_get_stats(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_ocf_get_stats req = {NULL}; + struct vbdev_ocf *vbdev; + struct get_ocf_stats_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Not enough memory to process request"); + goto end; + } + + if (spdk_json_decode_object(params, rpc_bdev_ocf_get_stats_decoders, + SPDK_COUNTOF(rpc_bdev_ocf_get_stats_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(ctx); + goto end; + } + + vbdev = vbdev_ocf_get_by_name(req.name); + if (vbdev == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(ENODEV)); + free(ctx); + goto end; + } + + ctx->core_name = vbdev->core.name; + ctx->request = request; + ocf_mngt_cache_read_lock(vbdev->ocf_cache, rpc_bdev_ocf_get_stats_cmpl, ctx); + +end: + free_rpc_bdev_ocf_get_stats(&req); +} +SPDK_RPC_REGISTER("bdev_ocf_get_stats", rpc_bdev_ocf_get_stats, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ocf_get_stats, get_ocf_stats) + +/* Structure to hold the parameters for this RPC method. */ +struct rpc_bdev_ocf_get_bdevs { + char *name; +}; + +static void +free_rpc_bdev_ocf_get_bdevs(struct rpc_bdev_ocf_get_bdevs *r) +{ + free(r->name); +} + +/* Structure to decode the input parameters for this RPC method. 
*/ +static const struct spdk_json_object_decoder rpc_bdev_ocf_get_bdevs_decoders[] = { + {"name", offsetof(struct rpc_bdev_ocf_get_bdevs, name), spdk_json_decode_string, true}, +}; + +struct bdev_get_bdevs_ctx { + char *name; + struct spdk_json_write_ctx *w; +}; + +static void +bdev_get_bdevs_fn(struct vbdev_ocf *vbdev, void *ctx) +{ + struct bdev_get_bdevs_ctx *cctx = ctx; + struct spdk_json_write_ctx *w = cctx->w; + + if (cctx->name != NULL && + strcmp(vbdev->name, cctx->name) && + strcmp(vbdev->cache.name, cctx->name) && + strcmp(vbdev->core.name, cctx->name)) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "name", vbdev->name); + spdk_json_write_named_bool(w, "started", vbdev->state.started); + + spdk_json_write_named_object_begin(w, "cache"); + spdk_json_write_named_string(w, "name", vbdev->cache.name); + spdk_json_write_named_bool(w, "attached", vbdev->cache.attached); + spdk_json_write_object_end(w); + + spdk_json_write_named_object_begin(w, "core"); + spdk_json_write_named_string(w, "name", vbdev->core.name); + spdk_json_write_named_bool(w, "attached", vbdev->core.attached); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static void +rpc_bdev_ocf_get_bdevs(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct rpc_bdev_ocf_get_bdevs req = {NULL}; + struct bdev_get_bdevs_ctx cctx; + + if (params && spdk_json_decode_object(params, rpc_bdev_ocf_get_bdevs_decoders, + SPDK_COUNTOF(rpc_bdev_ocf_get_bdevs_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto end; + } + + if (req.name) { + if (!(vbdev_ocf_get_by_name(req.name) || vbdev_ocf_get_base_by_name(req.name))) { + spdk_jsonrpc_send_error_response(request, + SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(ENODEV)); + goto end; + } + } + + w = spdk_jsonrpc_begin_result(request); + + cctx.name = req.name; + cctx.w = w; + + spdk_json_write_array_begin(w); + vbdev_ocf_foreach(bdev_get_bdevs_fn, &cctx); + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + +end: + free_rpc_bdev_ocf_get_bdevs(&req); +} +SPDK_RPC_REGISTER("bdev_ocf_get_bdevs", rpc_bdev_ocf_get_bdevs, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ocf_get_bdevs, get_ocf_bdevs) diff --git a/src/spdk/module/bdev/ocf/volume.c b/src/spdk/module/bdev/ocf/volume.c new file mode 100644 index 000000000..de683b852 --- /dev/null +++ b/src/spdk/module/bdev/ocf/volume.c @@ -0,0 +1,441 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <ocf/ocf.h> + +#include "spdk/bdev_module.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk_internal/log.h" + +#include "data.h" +#include "volume.h" +#include "ctx.h" +#include "vbdev_ocf.h" + +static int +vbdev_ocf_volume_open(ocf_volume_t volume, void *opts) +{ + struct vbdev_ocf_base **priv = ocf_volume_get_priv(volume); + struct vbdev_ocf_base *base; + + if (opts) { + base = opts; + } else { + base = vbdev_ocf_get_base_by_name(ocf_volume_get_uuid(volume)->data); + if (base == NULL) { + return -ENODEV; + } + } + + *priv = base; + + return 0; +} + +static void +vbdev_ocf_volume_close(ocf_volume_t volume) +{ +} + +static uint64_t +vbdev_ocf_volume_get_length(ocf_volume_t volume) +{ + struct vbdev_ocf_base *base = *((struct vbdev_ocf_base **)ocf_volume_get_priv(volume)); + uint64_t len; + + len = base->bdev->blocklen * base->bdev->blockcnt; + + return len; +} + +static int +vbdev_ocf_volume_io_set_data(struct ocf_io *io, ctx_data_t *data, + uint32_t offset) +{ + struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); + + io_ctx->offset = offset; + io_ctx->data = data; + + if (io_ctx->data && offset >= io_ctx->data->size) { + return -ENOBUFS; + } + + return 0; +} + +static ctx_data_t * +vbdev_ocf_volume_io_get_data(struct ocf_io *io) +{ + return ocf_get_io_ctx(io)->data; +} + +static void +vbdev_ocf_volume_io_get(struct ocf_io *io) +{ + struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); + + io_ctx->ref++; +} + +static void +vbdev_ocf_volume_io_put(struct ocf_io *io) +{ + struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); + + if (--io_ctx->ref) { + return; + } +} + +static int +get_starting_vec(struct iovec *iovs, int iovcnt, int *offset) +{ + int i; + size_t off; + + off = *offset; + + for (i = 0; i < iovcnt; i++) { + if (off < iovs[i].iov_len) { + *offset = off; + return i; + } + off -= iovs[i].iov_len; + } + + return -1; +} + +static void +initialize_cpy_vector(struct iovec *cpy_vec, int cpy_vec_len, struct iovec *orig_vec, + int orig_vec_len, + size_t offset, size_t bytes) +{ + void *curr_base; + int len, i; + + i = 0; + + while (bytes > 0) { + curr_base = orig_vec[i].iov_base + offset; + len = MIN(bytes, orig_vec[i].iov_len - offset); + + cpy_vec[i].iov_base = curr_base; + cpy_vec[i].iov_len = len; + + bytes -= len; + offset = 0; + i++; + } +} + +static void +vbdev_ocf_volume_submit_io_cb(struct spdk_bdev_io *bdev_io, bool success, void *opaque) +{ + struct ocf_io *io; + struct ocf_io_ctx *io_ctx; + + assert(opaque); + + io = opaque; + io_ctx = ocf_get_io_ctx(io); + assert(io_ctx != NULL); + + if (!success) { + io_ctx->error |= 1; + } + + if (io_ctx->iovs_allocated && bdev_io != NULL) { + env_free(bdev_io->u.bdev.iovs); + } + + if (io_ctx->error) { + 
SPDK_DEBUGLOG(SPDK_TRACE_VBDEV_OCF_VOLUME, + "base returned error on io submission: %d\n", io_ctx->error); + } + + if (io->io_queue == NULL && io_ctx->ch != NULL) { + spdk_put_io_channel(io_ctx->ch); + } + + vbdev_ocf_volume_io_put(io); + if (bdev_io) { + spdk_bdev_free_io(bdev_io); + } + + if (--io_ctx->rq_cnt == 0) { + io->end(io, io_ctx->error); + } +} + +static int +prepare_submit(struct ocf_io *io) +{ + struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); + struct vbdev_ocf_qctx *qctx; + struct vbdev_ocf_base *base; + ocf_queue_t q = io->io_queue; + ocf_cache_t cache; + struct vbdev_ocf_cache_ctx *cctx; + int rc = 0; + + io_ctx->rq_cnt++; + if (io_ctx->rq_cnt != 1) { + return 0; + } + + vbdev_ocf_volume_io_get(io); + base = *((struct vbdev_ocf_base **)ocf_volume_get_priv(ocf_io_get_volume(io))); + + if (io->io_queue == NULL) { + /* In case IO is initiated by OCF, queue is unknown + * so we have to get io channel ourselves */ + io_ctx->ch = spdk_bdev_get_io_channel(base->desc); + if (io_ctx->ch == NULL) { + return -EPERM; + } + return 0; + } + + cache = ocf_queue_get_cache(q); + cctx = ocf_cache_get_priv(cache); + + if (q == cctx->cleaner_queue || q == cctx->mngt_queue) { + io_ctx->ch = base->management_channel; + return 0; + } + + qctx = ocf_queue_get_priv(q); + if (qctx == NULL) { + return -EFAULT; + } + + if (base->is_cache) { + io_ctx->ch = qctx->cache_ch; + } else { + io_ctx->ch = qctx->core_ch; + } + + return rc; +} + +static void +vbdev_ocf_volume_submit_flush(struct ocf_io *io) +{ + struct vbdev_ocf_base *base = + *((struct vbdev_ocf_base **) + ocf_volume_get_priv(ocf_io_get_volume(io))); + struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); + int status; + + status = prepare_submit(io); + if (status) { + SPDK_ERRLOG("Preparing io failed with status=%d\n", status); + vbdev_ocf_volume_submit_io_cb(NULL, false, io); + return; + } + + status = spdk_bdev_flush( + base->desc, io_ctx->ch, + io->addr, io->bytes, + vbdev_ocf_volume_submit_io_cb, io); + if (status) { + /* Since callback is not called, we need to do it manually to free io structures */ + SPDK_ERRLOG("Submission failed with status=%d\n", status); + vbdev_ocf_volume_submit_io_cb(NULL, false, io); + } +} + +static void +vbdev_ocf_volume_submit_io(struct ocf_io *io) +{ + struct vbdev_ocf_base *base = + *((struct vbdev_ocf_base **) + ocf_volume_get_priv(ocf_io_get_volume(io))); + struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); + struct iovec *iovs; + int iovcnt, status = 0, i, offset; + uint64_t addr, len; + + if (io->flags == OCF_WRITE_FLUSH) { + vbdev_ocf_volume_submit_flush(io); + return; + } + + status = prepare_submit(io); + if (status) { + SPDK_ERRLOG("Preparing io failed with status=%d\n", status); + vbdev_ocf_volume_submit_io_cb(NULL, false, io); + return; + } + + /* IO fields */ + addr = io->addr; + len = io->bytes; + offset = io_ctx->offset; + + if (len < io_ctx->data->size) { + if (io_ctx->data->iovcnt == 1) { + if (io->dir == OCF_READ) { + status = spdk_bdev_read(base->desc, io_ctx->ch, + io_ctx->data->iovs[0].iov_base + offset, addr, len, + vbdev_ocf_volume_submit_io_cb, io); + } else if (io->dir == OCF_WRITE) { + status = spdk_bdev_write(base->desc, io_ctx->ch, + io_ctx->data->iovs[0].iov_base + offset, addr, len, + vbdev_ocf_volume_submit_io_cb, io); + } + goto end; + } else { + i = get_starting_vec(io_ctx->data->iovs, io_ctx->data->iovcnt, &offset); + + if (i < 0) { + SPDK_ERRLOG("offset bigger than data size\n"); + vbdev_ocf_volume_submit_io_cb(NULL, false, io); + return; + } + + iovcnt = io_ctx->data->iovcnt - i; + + 
io_ctx->iovs_allocated = true; + iovs = env_malloc(sizeof(*iovs) * iovcnt, ENV_MEM_NOIO); + + if (!iovs) { + SPDK_ERRLOG("allocation failed\n"); + vbdev_ocf_volume_submit_io_cb(NULL, false, io); + return; + } + + initialize_cpy_vector(iovs, io_ctx->data->iovcnt, &io_ctx->data->iovs[i], + iovcnt, offset, len); + } + } else { + iovs = io_ctx->data->iovs; + iovcnt = io_ctx->data->iovcnt; + } + + if (io->dir == OCF_READ) { + status = spdk_bdev_readv(base->desc, io_ctx->ch, + iovs, iovcnt, addr, len, vbdev_ocf_volume_submit_io_cb, io); + } else if (io->dir == OCF_WRITE) { + status = spdk_bdev_writev(base->desc, io_ctx->ch, + iovs, iovcnt, addr, len, vbdev_ocf_volume_submit_io_cb, io); + } + +end: + if (status) { + /* TODO [ENOMEM]: implement ENOMEM handling when submitting IO to base device */ + + /* Since callback is not called, we need to do it manually to free io structures */ + SPDK_ERRLOG("submission failed with status=%d\n", status); + vbdev_ocf_volume_submit_io_cb(NULL, false, io); + } +} + +static void +vbdev_ocf_volume_submit_discard(struct ocf_io *io) +{ + struct vbdev_ocf_base *base = + *((struct vbdev_ocf_base **) + ocf_volume_get_priv(ocf_io_get_volume(io))); + struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); + int status = 0; + + status = prepare_submit(io); + if (status) { + SPDK_ERRLOG("Preparing io failed with status=%d\n", status); + vbdev_ocf_volume_submit_io_cb(NULL, false, io); + return; + } + + status = spdk_bdev_unmap( + base->desc, io_ctx->ch, + io->addr, io->bytes, + vbdev_ocf_volume_submit_io_cb, io); + if (status) { + /* Since callback is not called, we need to do it manually to free io structures */ + SPDK_ERRLOG("Submission failed with status=%d\n", status); + vbdev_ocf_volume_submit_io_cb(NULL, false, io); + } +} + +static void +vbdev_ocf_volume_submit_metadata(struct ocf_io *io) +{ + /* Implement with persistent metadata support */ +} + +static unsigned int +vbdev_ocf_volume_get_max_io_size(ocf_volume_t volume) +{ + return 131072; +} + +static struct ocf_volume_properties vbdev_volume_props = { + .name = "SPDK block device", + .io_priv_size = sizeof(struct ocf_io_ctx), + .volume_priv_size = sizeof(struct vbdev_ocf_base *), + .caps = { + .atomic_writes = 0 /* to enable need to have ops->submit_metadata */ + }, + .ops = { + .open = vbdev_ocf_volume_open, + .close = vbdev_ocf_volume_close, + .get_length = vbdev_ocf_volume_get_length, + .submit_io = vbdev_ocf_volume_submit_io, + .submit_discard = vbdev_ocf_volume_submit_discard, + .submit_flush = vbdev_ocf_volume_submit_flush, + .get_max_io_size = vbdev_ocf_volume_get_max_io_size, + .submit_metadata = vbdev_ocf_volume_submit_metadata, + }, + .io_ops = { + .set_data = vbdev_ocf_volume_io_set_data, + .get_data = vbdev_ocf_volume_io_get_data, + }, +}; + +int +vbdev_ocf_volume_init(void) +{ + return ocf_ctx_register_volume_type(vbdev_ocf_ctx, SPDK_OBJECT, &vbdev_volume_props); +} + +void +vbdev_ocf_volume_cleanup(void) +{ + ocf_ctx_unregister_volume_type(vbdev_ocf_ctx, SPDK_OBJECT); +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_ocf_volume", SPDK_TRACE_VBDEV_OCF_VOLUME) diff --git a/src/spdk/module/bdev/ocf/volume.h b/src/spdk/module/bdev/ocf/volume.h new file mode 100644 index 000000000..6ae7488b5 --- /dev/null +++ b/src/spdk/module/bdev/ocf/volume.h @@ -0,0 +1,63 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VBDEV_OCF_DOBJ_H +#define VBDEV_OCF_DOBJ_H + +#include <ocf/ocf.h> + +#include "ocf_env.h" +#include "ctx.h" +#include "data.h" + +/* ocf_io context + * It is initialized from io size and offset */ +struct ocf_io_ctx { + struct bdev_ocf_data *data; + struct spdk_io_channel *ch; + uint32_t offset; + int ref; + int rq_cnt; + int error; + bool iovs_allocated; +}; + +int vbdev_ocf_volume_init(void); +void vbdev_ocf_volume_cleanup(void); + +static inline struct ocf_io_ctx *ocf_get_io_ctx(struct ocf_io *io) +{ + return ocf_io_get_priv(io); +} + +#endif diff --git a/src/spdk/module/bdev/passthru/Makefile b/src/spdk/module/bdev/passthru/Makefile new file mode 100644 index 000000000..c12b97691 --- /dev/null +++ b/src/spdk/module/bdev/passthru/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/
+
+C_SRCS = vbdev_passthru.c vbdev_passthru_rpc.c
+LIBNAME = bdev_passthru
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/passthru/vbdev_passthru.c b/src/spdk/module/bdev/passthru/vbdev_passthru.c
new file mode 100644
index 000000000..f166f3e34
--- /dev/null
+++ b/src/spdk/module/bdev/passthru/vbdev_passthru.c
@@ -0,0 +1,809 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is a simple example of a virtual block device module that passes IO
+ * down to a bdev (or bdevs) that it's configured to attach to.
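+ *
+ * As a quick usage sketch, a passthru vbdev is created over an existing
+ * bdev through the bdev_passthru_create JSON-RPC method registered in
+ * vbdev_passthru_rpc.c ("Malloc0" and "MyPT0" below are example names):
+ *
+ *   {"method": "bdev_passthru_create",
+ *    "params": {"base_bdev_name": "Malloc0", "name": "MyPT0"}}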
+ */
+
+#include "spdk/stdinc.h"
+
+#include "vbdev_passthru.h"
+#include "spdk/rpc.h"
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/util.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+
+static int vbdev_passthru_init(void);
+static void vbdev_passthru_get_spdk_running_config(FILE *fp);
+static int vbdev_passthru_get_ctx_size(void);
+static void vbdev_passthru_examine(struct spdk_bdev *bdev);
+static void vbdev_passthru_finish(void);
+static int vbdev_passthru_config_json(struct spdk_json_write_ctx *w);
+
+static struct spdk_bdev_module passthru_if = {
+	.name = "passthru",
+	.module_init = vbdev_passthru_init,
+	.config_text = vbdev_passthru_get_spdk_running_config,
+	.get_ctx_size = vbdev_passthru_get_ctx_size,
+	.examine_config = vbdev_passthru_examine,
+	.module_fini = vbdev_passthru_finish,
+	.config_json = vbdev_passthru_config_json
+};
+
+SPDK_BDEV_MODULE_REGISTER(passthru, &passthru_if)
+
+/* List of pt_bdev names and their base bdevs via configuration file.
+ * Used so we can parse the conf once at init and use this list in examine().
+ */
+struct bdev_names {
+	char *vbdev_name;
+	char *bdev_name;
+	TAILQ_ENTRY(bdev_names) link;
+};
+static TAILQ_HEAD(, bdev_names) g_bdev_names = TAILQ_HEAD_INITIALIZER(g_bdev_names);
+
+/* List of virtual bdevs and associated info for each. */
+struct vbdev_passthru {
+	struct spdk_bdev *base_bdev; /* the thing we're attaching to */
+	struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
+	struct spdk_bdev pt_bdev; /* the PT virtual bdev */
+	TAILQ_ENTRY(vbdev_passthru) link;
+	struct spdk_thread *thread; /* thread where base device is opened */
+};
+static TAILQ_HEAD(, vbdev_passthru) g_pt_nodes = TAILQ_HEAD_INITIALIZER(g_pt_nodes);
+
+/* The pt vbdev channel struct. It is allocated and freed on my behalf by the io channel code.
+ * If this vbdev needed to implement a poller or a queue for IO, this is where those things
+ * would be defined. This passthru bdev doesn't actually need to allocate a channel; it could
+ * simply pass back the channel of the bdev underneath it, but for example purposes we will
+ * present our own to the upper layers.
+ */
+struct pt_io_channel {
+	struct spdk_io_channel *base_ch; /* IO channel of base device */
+};
+
+/* Just for fun; this pt_bdev module doesn't need it, but this is essentially a per IO
+ * context that we get handed by the bdev layer.
+ */
+struct passthru_bdev_io {
+	uint8_t test;
+
+	/* bdev related */
+	struct spdk_io_channel *ch;
+
+	/* for bdev_io_wait */
+	struct spdk_bdev_io_wait_entry bdev_io_wait;
+};
+
+static void
+vbdev_passthru_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);
+
+
+/* Callback for unregistering the IO device. */
+static void
+_device_unregister_cb(void *io_device)
+{
+	struct vbdev_passthru *pt_node = io_device;
+
+	/* Done with this pt_node. */
+	free(pt_node->pt_bdev.name);
+	free(pt_node);
+}
+
+/* Wrapper for the bdev close operation. */
+static void
+_vbdev_passthru_destruct(void *ctx)
+{
+	struct spdk_bdev_desc *desc = ctx;
+
+	spdk_bdev_close(desc);
+}
+
+/* Called after we've unregistered following a hot remove callback.
+ * Our finish entry point will be called next.
+ */
+static int
+vbdev_passthru_destruct(void *ctx)
+{
+	struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx;
+
+	/* It is important to follow this exact sequence of steps for destroying
+	 * a vbdev...
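+	 * The order used below is: remove the node from the global list, release
+	 * the module's claim on the base bdev, close the base descriptor on the
+	 * thread it was opened on, and only then unregister the io_device.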
+	 */
+
+	TAILQ_REMOVE(&g_pt_nodes, pt_node, link);
+
+	/* Unclaim the underlying bdev. */
+	spdk_bdev_module_release_bdev(pt_node->base_bdev);
+
+	/* Close the underlying bdev on the same thread that it was opened on. */
+	if (pt_node->thread && pt_node->thread != spdk_get_thread()) {
+		spdk_thread_send_msg(pt_node->thread, _vbdev_passthru_destruct, pt_node->base_desc);
+	} else {
+		spdk_bdev_close(pt_node->base_desc);
+	}
+
+	/* Unregister the io_device. */
+	spdk_io_device_unregister(pt_node, _device_unregister_cb);
+
+	return 0;
+}
+
+/* Completion callback for IOs that were issued from this bdev. The original bdev_io
+ * is passed in as an arg so we'll complete that one with the appropriate status
+ * and then free the one that this module issued.
+ */
+static void
+_pt_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct spdk_bdev_io *orig_io = cb_arg;
+	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+	struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)orig_io->driver_ctx;
+
+	/* We set up this value in the submission routine, just showing here that it is
+	 * passed back to us.
+	 */
+	if (io_ctx->test != 0x5a) {
+		SPDK_ERRLOG("Error, original IO device_ctx is wrong! 0x%x\n",
+			    io_ctx->test);
+	}
+
+	/* Complete the original IO and then free the one that we created here
+	 * as a result of issuing an IO via submit_request.
+	 */
+	spdk_bdev_io_complete(orig_io, status);
+	spdk_bdev_free_io(bdev_io);
+}
+
+static void
+_pt_complete_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct spdk_bdev_io *orig_io = cb_arg;
+	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+	struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)orig_io->driver_ctx;
+
+	/* We set up this value in the submission routine, just showing here that it is
+	 * passed back to us.
+	 */
+	if (io_ctx->test != 0x5a) {
+		SPDK_ERRLOG("Error, original IO device_ctx is wrong! 0x%x\n",
+			    io_ctx->test);
+	}
+
+	/* Complete the original IO and then free the one that we created here
+	 * as a result of issuing an IO via submit_request.
+	 */
+	spdk_bdev_io_set_buf(orig_io, bdev_io->u.bdev.iovs[0].iov_base, bdev_io->u.bdev.iovs[0].iov_len);
+	spdk_bdev_io_complete(orig_io, status);
+	spdk_bdev_free_io(bdev_io);
+}
+
+static void
+vbdev_passthru_resubmit_io(void *arg)
+{
+	struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
+	struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx;
+
+	vbdev_passthru_submit_request(io_ctx->ch, bdev_io);
+}
+
+static void
+vbdev_passthru_queue_io(struct spdk_bdev_io *bdev_io)
+{
+	struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx;
+	struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(io_ctx->ch);
+	int rc;
+
+	io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
+	io_ctx->bdev_io_wait.cb_fn = vbdev_passthru_resubmit_io;
+	io_ctx->bdev_io_wait.cb_arg = bdev_io;
+
+	/* Queue the IO using the channel of the base device. */
+	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, pt_ch->base_ch, &io_ctx->bdev_io_wait);
+	if (rc != 0) {
+		SPDK_ERRLOG("Queue io failed in vbdev_passthru_queue_io, rc=%d.\n", rc);
+		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+	}
+}
+
+/* Callback for getting a buf from the bdev pool in the event that the caller passed
+ * in NULL; we need to own the buffer so it doesn't get freed by another vbdev module
+ * beneath us before we're done with it.
That won't happen in this example, but it could
+ * if this example were used as a template for something more complex.
+ */
+static void
+pt_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+	struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_passthru,
+					 pt_bdev);
+	struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(ch);
+	struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx;
+	int rc;
+
+	if (!success) {
+		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+		return;
+	}
+
+	if (bdev_io->u.bdev.md_buf == NULL) {
+		rc = spdk_bdev_readv_blocks(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.iovs,
+					    bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
+					    bdev_io->u.bdev.num_blocks, _pt_complete_io,
+					    bdev_io);
+	} else {
+		rc = spdk_bdev_readv_blocks_with_md(pt_node->base_desc, pt_ch->base_ch,
+						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+						    bdev_io->u.bdev.md_buf,
+						    bdev_io->u.bdev.offset_blocks,
+						    bdev_io->u.bdev.num_blocks,
+						    _pt_complete_io, bdev_io);
+	}
+
+	if (rc != 0) {
+		if (rc == -ENOMEM) {
+			SPDK_ERRLOG("No memory, start to queue io for passthru.\n");
+			io_ctx->ch = ch;
+			vbdev_passthru_queue_io(bdev_io);
+		} else {
+			SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+		}
+	}
+}
+
+/* Called when someone above submits IO to this pt vbdev. We're simply passing it on here
+ * via SPDK IO calls which in turn allocate another bdev IO and call our cpl callback provided
+ * below along with the original bdev_io so that we can complete it once this IO completes.
+ */
+static void
+vbdev_passthru_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+	struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_passthru, pt_bdev);
+	struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(ch);
+	struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx;
+	int rc = 0;
+
+	/* Set up a per IO context value; we don't do anything with it in the vbdev other
+	 * than confirm we get the same thing back in the completion callback just to
+	 * demonstrate.
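+	 * Note that -ENOMEM from the base bdev is not treated as fatal anywhere
+	 * in this path; the IO is parked with spdk_bdev_queue_io_wait() and
+	 * retried from vbdev_passthru_resubmit_io() when resources free up.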
+	 */
+	io_ctx->test = 0x5a;
+
+	switch (bdev_io->type) {
+	case SPDK_BDEV_IO_TYPE_READ:
+		spdk_bdev_io_get_buf(bdev_io, pt_read_get_buf_cb,
+				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+		break;
+	case SPDK_BDEV_IO_TYPE_WRITE:
+		if (bdev_io->u.bdev.md_buf == NULL) {
+			rc = spdk_bdev_writev_blocks(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.iovs,
+						     bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
+						     bdev_io->u.bdev.num_blocks, _pt_complete_io,
+						     bdev_io);
+		} else {
+			rc = spdk_bdev_writev_blocks_with_md(pt_node->base_desc, pt_ch->base_ch,
+							     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+							     bdev_io->u.bdev.md_buf,
+							     bdev_io->u.bdev.offset_blocks,
+							     bdev_io->u.bdev.num_blocks,
+							     _pt_complete_io, bdev_io);
+		}
+		break;
+	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+		rc = spdk_bdev_write_zeroes_blocks(pt_node->base_desc, pt_ch->base_ch,
+						   bdev_io->u.bdev.offset_blocks,
+						   bdev_io->u.bdev.num_blocks,
+						   _pt_complete_io, bdev_io);
+		break;
+	case SPDK_BDEV_IO_TYPE_UNMAP:
+		rc = spdk_bdev_unmap_blocks(pt_node->base_desc, pt_ch->base_ch,
+					    bdev_io->u.bdev.offset_blocks,
+					    bdev_io->u.bdev.num_blocks,
+					    _pt_complete_io, bdev_io);
+		break;
+	case SPDK_BDEV_IO_TYPE_FLUSH:
+		rc = spdk_bdev_flush_blocks(pt_node->base_desc, pt_ch->base_ch,
+					    bdev_io->u.bdev.offset_blocks,
+					    bdev_io->u.bdev.num_blocks,
+					    _pt_complete_io, bdev_io);
+		break;
+	case SPDK_BDEV_IO_TYPE_RESET:
+		rc = spdk_bdev_reset(pt_node->base_desc, pt_ch->base_ch,
+				     _pt_complete_io, bdev_io);
+		break;
+	case SPDK_BDEV_IO_TYPE_ZCOPY:
+		rc = spdk_bdev_zcopy_start(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.offset_blocks,
+					   bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.zcopy.populate,
+					   _pt_complete_zcopy_io, bdev_io);
+		break;
+	case SPDK_BDEV_IO_TYPE_ABORT:
+		rc = spdk_bdev_abort(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.abort.bio_to_abort,
+				     _pt_complete_io, bdev_io);
+		break;
+	default:
+		SPDK_ERRLOG("passthru: unknown I/O type %d\n", bdev_io->type);
+		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+		return;
+	}
+	if (rc != 0) {
+		if (rc == -ENOMEM) {
+			SPDK_ERRLOG("No memory, start to queue io for passthru.\n");
+			io_ctx->ch = ch;
+			vbdev_passthru_queue_io(bdev_io);
+		} else {
+			SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+		}
+	}
+}
+
+/* We'll just call the base bdev and let it answer; however, if we were more
+ * restrictive for some reason (or less), we could get the response back
+ * and modify it according to our purposes.
+ */
+static bool
+vbdev_passthru_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+	struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx;
+
+	return spdk_bdev_io_type_supported(pt_node->base_bdev, io_type);
+}
+
+/* We supplied this as an entry point for upper layers that want to communicate with this
+ * bdev. This is how they get a channel. We are passed the same context we provided when
+ * we created our PT vbdev in examine() which, for this bdev, is the address of one of
+ * our context nodes. From here we'll ask the SPDK channel code to fill out our channel
+ * struct and we'll keep it in our PT node.
+ */
+static struct spdk_io_channel *
+vbdev_passthru_get_io_channel(void *ctx)
+{
+	struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx;
+	struct spdk_io_channel *pt_ch = NULL;
+
+	/* The IO channel code will allocate a channel for us which consists of
+	 * the SPDK channel structure plus the size of our pt_io_channel struct
+	 * that we passed in when we registered our IO device.
It will then call + * our channel create callback to populate any elements that we need to + * update. + */ + pt_ch = spdk_get_io_channel(pt_node); + + return pt_ch; +} + +/* This is the output for bdev_get_bdevs() for this vbdev */ +static int +vbdev_passthru_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; + + spdk_json_write_name(w, "passthru"); + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&pt_node->pt_bdev)); + spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(pt_node->base_bdev)); + spdk_json_write_object_end(w); + + return 0; +} + +/* This is used to generate JSON that can configure this module to its current state. */ +static int +vbdev_passthru_config_json(struct spdk_json_write_ctx *w) +{ + struct vbdev_passthru *pt_node; + + TAILQ_FOREACH(pt_node, &g_pt_nodes, link) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_passthru_create"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(pt_node->base_bdev)); + spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&pt_node->pt_bdev)); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + } + return 0; +} + +/* We provide this callback for the SPDK channel code to create a channel using + * the channel struct we provided in our module get_io_channel() entry point. Here + * we get and save off an underlying base channel of the device below us so that + * we can communicate with the base bdev on a per channel basis. If we needed + * our own poller for this vbdev, we'd register it here. + */ +static int +pt_bdev_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct pt_io_channel *pt_ch = ctx_buf; + struct vbdev_passthru *pt_node = io_device; + + pt_ch->base_ch = spdk_bdev_get_io_channel(pt_node->base_desc); + + return 0; +} + +/* We provide this callback for the SPDK channel code to destroy a channel + * created with our create callback. We just need to undo anything we did + * when we created. If this bdev used its own poller, we'd unregister it here. + */ +static void +pt_bdev_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct pt_io_channel *pt_ch = ctx_buf; + + spdk_put_io_channel(pt_ch->base_ch); +} + +/* Create the passthru association from the bdev and vbdev name and insert + * on the global list. */ +static int +vbdev_passthru_insert_name(const char *bdev_name, const char *vbdev_name) +{ + struct bdev_names *name; + + TAILQ_FOREACH(name, &g_bdev_names, link) { + if (strcmp(vbdev_name, name->vbdev_name) == 0) { + SPDK_ERRLOG("passthru bdev %s already exists\n", vbdev_name); + return -EEXIST; + } + } + + name = calloc(1, sizeof(struct bdev_names)); + if (!name) { + SPDK_ERRLOG("could not allocate bdev_names\n"); + return -ENOMEM; + } + + name->bdev_name = strdup(bdev_name); + if (!name->bdev_name) { + SPDK_ERRLOG("could not allocate name->bdev_name\n"); + free(name); + return -ENOMEM; + } + + name->vbdev_name = strdup(vbdev_name); + if (!name->vbdev_name) { + SPDK_ERRLOG("could not allocate name->vbdev_name\n"); + free(name->bdev_name); + free(name); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&g_bdev_names, name, link); + + return 0; +} + +/* On init, just parse config file and build list of pt vbdevs and bdev name pairs. 
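+ * A matching section uses the same layout that
+ * vbdev_passthru_get_spdk_running_config() writes back out, e.g. with
+ * example names:
+ *
+ *   [Passthru]
+ *     PT Malloc0 MyPT0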
+ */
+static int
+vbdev_passthru_init(void)
+{
+	struct spdk_conf_section *sp = NULL;
+	const char *conf_bdev_name = NULL;
+	const char *conf_vbdev_name = NULL;
+	struct bdev_names *name;
+	int i, rc;
+
+	sp = spdk_conf_find_section(NULL, "Passthru");
+	if (sp == NULL) {
+		return 0;
+	}
+
+	for (i = 0; ; i++) {
+		if (!spdk_conf_section_get_nval(sp, "PT", i)) {
+			break;
+		}
+
+		conf_bdev_name = spdk_conf_section_get_nmval(sp, "PT", i, 0);
+		if (!conf_bdev_name) {
+			SPDK_ERRLOG("Passthru configuration missing bdev name\n");
+			break;
+		}
+
+		conf_vbdev_name = spdk_conf_section_get_nmval(sp, "PT", i, 1);
+		if (!conf_vbdev_name) {
+			SPDK_ERRLOG("Passthru configuration missing pt_bdev name\n");
+			break;
+		}
+
+		rc = vbdev_passthru_insert_name(conf_bdev_name, conf_vbdev_name);
+		if (rc != 0) {
+			return rc;
+		}
+	}
+	TAILQ_FOREACH(name, &g_bdev_names, link) {
+		SPDK_NOTICELOG("conf parse matched: %s\n", name->bdev_name);
+	}
+	return 0;
+}
+
+/* Called when the entire module is being torn down. */
+static void
+vbdev_passthru_finish(void)
+{
+	struct bdev_names *name;
+
+	while ((name = TAILQ_FIRST(&g_bdev_names))) {
+		TAILQ_REMOVE(&g_bdev_names, name, link);
+		free(name->bdev_name);
+		free(name->vbdev_name);
+		free(name);
+	}
+}
+
+/* During init we'll be asked how much memory we'd like passed to us
+ * in bdev_io structures as context. Here's where we specify how
+ * much context we want per IO.
+ */
+static int
+vbdev_passthru_get_ctx_size(void)
+{
+	return sizeof(struct passthru_bdev_io);
+}
+
+/* Called when SPDK wants to save the current config of this vbdev module to
+ * a file.
+ */
+static void
+vbdev_passthru_get_spdk_running_config(FILE *fp)
+{
+	struct bdev_names *names = NULL;
+
+	fprintf(fp, "\n[Passthru]\n");
+	TAILQ_FOREACH(names, &g_bdev_names, link) {
+		fprintf(fp, " PT %s %s\n", names->bdev_name, names->vbdev_name);
+	}
+	fprintf(fp, "\n");
+}
+
+/* Where vbdev_passthru_config_json() is used to generate per module JSON config data, this
+ * function is called to output any per bdev specific methods. For the PT module, there are
+ * none.
+ */
+static void
+vbdev_passthru_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+	/* No config per bdev needed */
+}
+
+/* When we register our bdev this is how we specify our entry points. */
+static const struct spdk_bdev_fn_table vbdev_passthru_fn_table = {
+	.destruct = vbdev_passthru_destruct,
+	.submit_request = vbdev_passthru_submit_request,
+	.io_type_supported = vbdev_passthru_io_type_supported,
+	.get_io_channel = vbdev_passthru_get_io_channel,
+	.dump_info_json = vbdev_passthru_dump_info_json,
+	.write_config_json = vbdev_passthru_write_config_json,
+};
+
+/* Called when the underlying base bdev goes away. */
+static void
+vbdev_passthru_base_bdev_hotremove_cb(void *ctx)
+{
+	struct vbdev_passthru *pt_node, *tmp;
+	struct spdk_bdev *bdev_find = ctx;
+
+	TAILQ_FOREACH_SAFE(pt_node, &g_pt_nodes, link, tmp) {
+		if (bdev_find == pt_node->base_bdev) {
+			spdk_bdev_unregister(&pt_node->pt_bdev, NULL, NULL);
+		}
+	}
+}
+
+/* Create and register the passthru vbdev if we find it in our list of bdev names.
+ * This can be called either by the examine path or RPC method.
+ */
+static int
+vbdev_passthru_register(struct spdk_bdev *bdev)
+{
+	struct bdev_names *name;
+	struct vbdev_passthru *pt_node;
+	int rc = 0;
+
+	/* Check our list of names from config versus this bdev and, if
+	 * there's a match, create the pt_node & bdev accordingly.
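+	 * The sequence below is: allocate and fill the pt_node, register the
+	 * io_device, open the base bdev, claim it for this module, and finally
+	 * register the new vbdev; each error path unwinds whatever was done up
+	 * to that point.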
+ */ + TAILQ_FOREACH(name, &g_bdev_names, link) { + if (strcmp(name->bdev_name, bdev->name) != 0) { + continue; + } + + SPDK_NOTICELOG("Match on %s\n", bdev->name); + pt_node = calloc(1, sizeof(struct vbdev_passthru)); + if (!pt_node) { + rc = -ENOMEM; + SPDK_ERRLOG("could not allocate pt_node\n"); + break; + } + + /* The base bdev that we're attaching to. */ + pt_node->base_bdev = bdev; + pt_node->pt_bdev.name = strdup(name->vbdev_name); + if (!pt_node->pt_bdev.name) { + rc = -ENOMEM; + SPDK_ERRLOG("could not allocate pt_bdev name\n"); + free(pt_node); + break; + } + pt_node->pt_bdev.product_name = "passthru"; + + /* Copy some properties from the underlying base bdev. */ + pt_node->pt_bdev.write_cache = bdev->write_cache; + pt_node->pt_bdev.required_alignment = bdev->required_alignment; + pt_node->pt_bdev.optimal_io_boundary = bdev->optimal_io_boundary; + pt_node->pt_bdev.blocklen = bdev->blocklen; + pt_node->pt_bdev.blockcnt = bdev->blockcnt; + + pt_node->pt_bdev.md_interleave = bdev->md_interleave; + pt_node->pt_bdev.md_len = bdev->md_len; + pt_node->pt_bdev.dif_type = bdev->dif_type; + pt_node->pt_bdev.dif_is_head_of_md = bdev->dif_is_head_of_md; + pt_node->pt_bdev.dif_check_flags = bdev->dif_check_flags; + + /* This is the context that is passed to us when the bdev + * layer calls in so we'll save our pt_bdev node here. + */ + pt_node->pt_bdev.ctxt = pt_node; + pt_node->pt_bdev.fn_table = &vbdev_passthru_fn_table; + pt_node->pt_bdev.module = &passthru_if; + TAILQ_INSERT_TAIL(&g_pt_nodes, pt_node, link); + + spdk_io_device_register(pt_node, pt_bdev_ch_create_cb, pt_bdev_ch_destroy_cb, + sizeof(struct pt_io_channel), + name->vbdev_name); + SPDK_NOTICELOG("io_device created at: 0x%p\n", pt_node); + + rc = spdk_bdev_open(bdev, true, vbdev_passthru_base_bdev_hotremove_cb, + bdev, &pt_node->base_desc); + if (rc) { + SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(bdev)); + TAILQ_REMOVE(&g_pt_nodes, pt_node, link); + spdk_io_device_unregister(pt_node, NULL); + free(pt_node->pt_bdev.name); + free(pt_node); + break; + } + SPDK_NOTICELOG("bdev opened\n"); + + /* Save the thread where the base device is opened */ + pt_node->thread = spdk_get_thread(); + + rc = spdk_bdev_module_claim_bdev(bdev, pt_node->base_desc, pt_node->pt_bdev.module); + if (rc) { + SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(bdev)); + spdk_bdev_close(pt_node->base_desc); + TAILQ_REMOVE(&g_pt_nodes, pt_node, link); + spdk_io_device_unregister(pt_node, NULL); + free(pt_node->pt_bdev.name); + free(pt_node); + break; + } + SPDK_NOTICELOG("bdev claimed\n"); + + rc = spdk_bdev_register(&pt_node->pt_bdev); + if (rc) { + SPDK_ERRLOG("could not register pt_bdev\n"); + spdk_bdev_module_release_bdev(&pt_node->pt_bdev); + spdk_bdev_close(pt_node->base_desc); + TAILQ_REMOVE(&g_pt_nodes, pt_node, link); + spdk_io_device_unregister(pt_node, NULL); + free(pt_node->pt_bdev.name); + free(pt_node); + break; + } + SPDK_NOTICELOG("pt_bdev registered\n"); + SPDK_NOTICELOG("created pt_bdev for: %s\n", name->vbdev_name); + } + + return rc; +} + +/* Create the passthru disk from the given bdev and vbdev name. */ +int +bdev_passthru_create_disk(const char *bdev_name, const char *vbdev_name) +{ + struct spdk_bdev *bdev = NULL; + int rc = 0; + + /* Insert the bdev into our global name list even if it doesn't exist yet, + * it may show up soon... 
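+	 * If it does, vbdev_passthru_examine() will see the name match and
+	 * finish creating the vbdev once the base bdev registers.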
+ */ + rc = vbdev_passthru_insert_name(bdev_name, vbdev_name); + if (rc) { + return rc; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + /* This is not an error, we tracked the name above and it still + * may show up later. + */ + SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n"); + return 0; + } + + return vbdev_passthru_register(bdev); +} + +void +bdev_passthru_delete_disk(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) +{ + struct bdev_names *name; + + if (!bdev || bdev->module != &passthru_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + /* Remove the association (vbdev, bdev) from g_bdev_names. This is required so that the + * vbdev does not get re-created if the same bdev is constructed at some other time, + * unless the underlying bdev was hot-removed. + */ + TAILQ_FOREACH(name, &g_bdev_names, link) { + if (strcmp(name->vbdev_name, bdev->name) == 0) { + TAILQ_REMOVE(&g_bdev_names, name, link); + free(name->bdev_name); + free(name->vbdev_name); + free(name); + break; + } + } + + /* Additional cleanup happens in the destruct callback. */ + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +/* Because we specified this function in our pt bdev function table when we + * registered our pt bdev, we'll get this call anytime a new bdev shows up. + * Here we need to decide if we care about it and if so what to do. We + * parsed the config file at init so we check the new bdev against the list + * we built up at that time and if the user configured us to attach to this + * bdev, here's where we do it. + */ +static void +vbdev_passthru_examine(struct spdk_bdev *bdev) +{ + vbdev_passthru_register(bdev); + + spdk_bdev_module_examine_done(&passthru_if); +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_passthru", SPDK_LOG_VBDEV_PASSTHRU) diff --git a/src/spdk/module/bdev/passthru/vbdev_passthru.h b/src/spdk/module/bdev/passthru/vbdev_passthru.h new file mode 100644 index 000000000..716e187c1 --- /dev/null +++ b/src/spdk/module/bdev/passthru/vbdev_passthru.h @@ -0,0 +1,61 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VBDEV_PASSTHRU_H +#define SPDK_VBDEV_PASSTHRU_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/bdev_module.h" + +/** + * Create new pass through bdev. + * + * \param bdev_name Bdev on which pass through vbdev will be created. + * \param vbdev_name Name of the pass through bdev. + * \return 0 on success, other on failure. + */ +int bdev_passthru_create_disk(const char *bdev_name, const char *vbdev_name); + +/** + * Delete passthru bdev. + * + * \param bdev Pointer to pass through bdev. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn. + */ +void bdev_passthru_delete_disk(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, + void *cb_arg); + +#endif /* SPDK_VBDEV_PASSTHRU_H */ diff --git a/src/spdk/module/bdev/passthru/vbdev_passthru_rpc.c b/src/spdk/module/bdev/passthru/vbdev_passthru_rpc.c new file mode 100644 index 000000000..ae4014294 --- /dev/null +++ b/src/spdk/module/bdev/passthru/vbdev_passthru_rpc.c @@ -0,0 +1,148 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vbdev_passthru.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +/* Structure to hold the parameters for this RPC method. */ +struct rpc_bdev_passthru_create { + char *base_bdev_name; + char *name; +}; + +/* Free the allocated memory resource after the RPC handling. 
*/ +static void +free_rpc_bdev_passthru_create(struct rpc_bdev_passthru_create *r) +{ + free(r->base_bdev_name); + free(r->name); +} + +/* Structure to decode the input parameters for this RPC method. */ +static const struct spdk_json_object_decoder rpc_bdev_passthru_create_decoders[] = { + {"base_bdev_name", offsetof(struct rpc_bdev_passthru_create, base_bdev_name), spdk_json_decode_string}, + {"name", offsetof(struct rpc_bdev_passthru_create, name), spdk_json_decode_string}, +}; + +/* Decode the parameters for this RPC method and properly construct the passthru + * device. Error status returned in the failed cases. + */ +static void +rpc_bdev_passthru_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_passthru_create req = {NULL}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_passthru_create_decoders, + SPDK_COUNTOF(rpc_bdev_passthru_create_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_PASSTHRU, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = bdev_passthru_create_disk(req.base_bdev_name, req.name); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, req.name); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_passthru_create(&req); +} +SPDK_RPC_REGISTER("bdev_passthru_create", rpc_bdev_passthru_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_passthru_create, construct_passthru_bdev) + +struct rpc_bdev_passthru_delete { + char *name; +}; + +static void +free_rpc_bdev_passthru_delete(struct rpc_bdev_passthru_delete *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_passthru_delete_decoders[] = { + {"name", offsetof(struct rpc_bdev_passthru_delete, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_passthru_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_passthru_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_passthru_delete req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_bdev_passthru_delete_decoders, + SPDK_COUNTOF(rpc_bdev_passthru_delete_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + bdev_passthru_delete_disk(bdev, rpc_bdev_passthru_delete_cb, request); + +cleanup: + free_rpc_bdev_passthru_delete(&req); +} +SPDK_RPC_REGISTER("bdev_passthru_delete", rpc_bdev_passthru_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_passthru_delete, delete_passthru_bdev) diff --git a/src/spdk/module/bdev/pmem/Makefile b/src/spdk/module/bdev/pmem/Makefile new file mode 100644 index 000000000..3a918be78 --- /dev/null +++ b/src/spdk/module/bdev/pmem/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = bdev_pmem.c bdev_pmem_rpc.c +LIBNAME = bdev_pmem + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/pmem/bdev_pmem.c b/src/spdk/module/bdev/pmem/bdev_pmem.c new file mode 100644 index 000000000..79ffb960a --- /dev/null +++ b/src/spdk/module/bdev/pmem/bdev_pmem.c @@ -0,0 +1,473 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/conf.h" +#include "spdk/string.h" +#include "spdk/likely.h" +#include "spdk/util.h" +#include "spdk/rpc.h" +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk/config.h" + +#include "bdev_pmem.h" +#include "libpmemblk.h" + +struct pmem_disk { + struct spdk_bdev disk; + PMEMblkpool *pool; + char pmem_file[NAME_MAX]; + TAILQ_ENTRY(pmem_disk) tailq; +}; + +static TAILQ_HEAD(, pmem_disk) g_pmem_disks = TAILQ_HEAD_INITIALIZER(g_pmem_disks); + +static int bdev_pmem_initialize(void); +static void bdev_pmem_finish(void); + +static struct spdk_bdev_module pmem_if = { + .name = "pmem", + .module_init = bdev_pmem_initialize, + .module_fini = bdev_pmem_finish, + .async_fini = true, + +}; + +SPDK_BDEV_MODULE_REGISTER(pmem, &pmem_if) + +typedef int(*spdk_bdev_pmem_io_request)(PMEMblkpool *pbp, void *buf, long long blockno); + +static int +_bdev_pmem_submit_io_read(PMEMblkpool *pbp, void *buf, long long blockno) +{ + return pmemblk_read(pbp, buf, blockno); +} + +static int +_bdev_pmem_submit_io_write(PMEMblkpool *pbp, void *buf, long long blockno) +{ + return pmemblk_write(pbp, buf, blockno); +} + +static int +bdev_pmem_destruct(void *ctx) +{ + struct pmem_disk *pdisk = ctx; + + TAILQ_REMOVE(&g_pmem_disks, pdisk, tailq); + free(pdisk->disk.name); + pmemblk_close(pdisk->pool); + free(pdisk); + + return 0; +} + +static int +bdev_pmem_check_iov_len(struct iovec *iovs, int iovcnt, size_t num_blocks, uint32_t block_size) +{ + size_t nbytes = num_blocks * block_size; + int i; + + for (i = 0; i < iovcnt; i++) { + if (spdk_unlikely(iovs[i].iov_base == NULL && iovs[i].iov_len != 0)) { + return -1; + } + + if (nbytes <= iovs[i].iov_len) { + return 0; + } + + if (spdk_unlikely(iovs[i].iov_len % block_size != 0)) { + return -1; + } + + nbytes -= iovs[i].iov_len; + } + + return -1; +} + +static void +bdev_pmem_submit_io(struct spdk_bdev_io *bdev_io, struct pmem_disk *pdisk, + struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset_blocks, size_t num_blocks, uint32_t block_size, + spdk_bdev_pmem_io_request fn) +{ + int rc; + size_t nbytes, offset, len; + enum spdk_bdev_io_status status; + + rc = bdev_pmem_check_iov_len(iov, iovcnt, num_blocks, block_size); + if (rc) { + status = SPDK_BDEV_IO_STATUS_FAILED; + goto end; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "io %lu bytes from offset %#lx\n", + num_blocks, offset_blocks); + + for (nbytes = num_blocks * block_size; nbytes > 0; iov++) { + len = spdk_min(iov->iov_len, nbytes); + nbytes -= len; + + offset = 0; + while (offset != len) { + rc = fn(pdisk->pool, iov->iov_base + offset, offset_blocks); + if (rc != 0) { + SPDK_ERRLOG("pmemblk io failed: %d (%s)\n", errno, pmemblk_errormsg()); + status = SPDK_BDEV_IO_STATUS_FAILED; + goto end; + } + + offset += block_size; + offset_blocks++; + } + } + + assert(num_blocks == offset_blocks - bdev_io->u.bdev.offset_blocks); + status = SPDK_BDEV_IO_STATUS_SUCCESS; +end: + + spdk_bdev_io_complete(bdev_io, status); +} + +static 
void +bdev_pmem_write_zeros(struct spdk_bdev_io *bdev_io, struct pmem_disk *pdisk, + struct spdk_io_channel *ch, uint64_t offset_blocks, + uint64_t num_blocks, uint32_t block_size) +{ + int rc; + enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; + + while (num_blocks > 0) { + rc = pmemblk_set_zero(pdisk->pool, offset_blocks); + if (rc != 0) { + SPDK_ERRLOG("pmemblk_set_zero failed: %d (%s)\n", errno, pmemblk_errormsg()); + status = SPDK_BDEV_IO_STATUS_FAILED; + break; + } + offset_blocks++; + num_blocks--; + } + spdk_bdev_io_complete(bdev_io, status); +} + +static void +bdev_pmem_io_get_buf_cb(struct spdk_io_channel *channel, struct spdk_bdev_io *bdev_io, + bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + bdev_pmem_submit_io(bdev_io, + bdev_io->bdev->ctxt, + channel, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + bdev_io->bdev->blocklen, + _bdev_pmem_submit_io_read); +} + +static void +bdev_pmem_submit_request(struct spdk_io_channel *channel, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_pmem_io_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_pmem_submit_io(bdev_io, + bdev_io->bdev->ctxt, + channel, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + bdev_io->bdev->blocklen, + _bdev_pmem_submit_io_write); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + bdev_pmem_write_zeros(bdev_io, + bdev_io->bdev->ctxt, + channel, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_RESET: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + break; + default: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_pmem_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + return true; + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_pmem_get_io_channel(void *ctx) +{ + return spdk_get_io_channel(&g_pmem_disks); +} + +static int +bdev_pmem_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct pmem_disk *pdisk = ctx; + + spdk_json_write_named_object_begin(w, "pmem"); + spdk_json_write_named_string(w, "pmem_file", pdisk->pmem_file); + spdk_json_write_object_end(w); + + return 0; +} + +static int +bdev_pmem_create_cb(void *io_device, void *ctx_buf) +{ + return 0; +} + +static void +bdev_pmem_destroy_cb(void *io_device, void *ctx_buf) +{ +} + +static void +bdev_pmem_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct pmem_disk *disk = bdev->ctxt; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_pmem_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_string(w, "pmem_file", disk->pmem_file); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table pmem_fn_table = { + .destruct = bdev_pmem_destruct, + .submit_request = bdev_pmem_submit_request, + 
.io_type_supported = bdev_pmem_io_type_supported, + .get_io_channel = bdev_pmem_get_io_channel, + .dump_info_json = bdev_pmem_dump_info_json, + .write_config_json = bdev_pmem_write_config_json, +}; + +int +create_pmem_disk(const char *pmem_file, const char *name, struct spdk_bdev **bdev) +{ + uint64_t num_blocks; + uint32_t block_size; + struct pmem_disk *pdisk; + int rc; + + *bdev = NULL; + + if (name == NULL) { + SPDK_ERRLOG("Missing name parameter for create_pmem_disk()\n"); + return -EINVAL; + } + + if (pmemblk_check(pmem_file, 0) != 1) { + SPDK_ERRLOG("Pool '%s' check failed: %s\n", pmem_file, pmemblk_errormsg()); + return -EIO; + } + + pdisk = calloc(1, sizeof(*pdisk)); + if (!pdisk) { + return -ENOMEM; + } + + snprintf(pdisk->pmem_file, sizeof(pdisk->pmem_file), "%s", pmem_file); + pdisk->pool = pmemblk_open(pmem_file, 0); + if (!pdisk->pool) { + SPDK_ERRLOG("Opening pmem pool '%s' failed: %d\n", pmem_file, errno); + free(pdisk); + return -errno; + } + + block_size = pmemblk_bsize(pdisk->pool); + num_blocks = pmemblk_nblock(pdisk->pool); + + if (block_size == 0) { + SPDK_ERRLOG("Block size must be more than 0 bytes\n"); + pmemblk_close(pdisk->pool); + free(pdisk); + return -EINVAL; + } + + if (num_blocks == 0) { + SPDK_ERRLOG("Disk must be more than 0 blocks\n"); + pmemblk_close(pdisk->pool); + free(pdisk); + return -EINVAL; + } + + pdisk->disk.name = strdup(name); + if (!pdisk->disk.name) { + pmemblk_close(pdisk->pool); + free(pdisk); + return -ENOMEM; + } + + pdisk->disk.product_name = "pmemblk disk"; + pdisk->disk.write_cache = 0; + pdisk->disk.blocklen = block_size; + pdisk->disk.blockcnt = num_blocks; + + pdisk->disk.ctxt = pdisk; + pdisk->disk.fn_table = &pmem_fn_table; + pdisk->disk.module = &pmem_if; + + rc = spdk_bdev_register(&pdisk->disk); + if (rc) { + pmemblk_close(pdisk->pool); + free(pdisk->disk.name); + free(pdisk); + return rc; + } + + TAILQ_INSERT_TAIL(&g_pmem_disks, pdisk, tailq); + + *bdev = &pdisk->disk; + + return 0; +} + +void +delete_pmem_disk(struct spdk_bdev *bdev, spdk_delete_pmem_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &pmem_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static void +bdev_pmem_read_conf(void) +{ + struct spdk_conf_section *sp; + struct spdk_bdev *bdev; + const char *pmem_file; + const char *bdev_name; + int i; + + sp = spdk_conf_find_section(NULL, "Pmem"); + if (sp == NULL) { + return; + } + + for (i = 0; ; i++) { + if (!spdk_conf_section_get_nval(sp, "Blk", i)) { + break; + } + + pmem_file = spdk_conf_section_get_nmval(sp, "Blk", i, 0); + if (pmem_file == NULL) { + SPDK_ERRLOG("Pmem: missing filename\n"); + continue; + } + + bdev_name = spdk_conf_section_get_nmval(sp, "Blk", i, 1); + if (bdev_name == NULL) { + SPDK_ERRLOG("Pmem: missing bdev name\n"); + continue; + } + + create_pmem_disk(pmem_file, bdev_name, &bdev); + } +} + +static int +bdev_pmem_initialize(void) +{ + const char *err = pmemblk_check_version(PMEMBLK_MAJOR_VERSION, PMEMBLK_MINOR_VERSION); + + if (err != NULL) { + SPDK_ERRLOG("Invalid libpmemblk version (expected %d.%d): %s\n", PMEMBLK_MAJOR_VERSION, + PMEMBLK_MINOR_VERSION, err); + return -1; + } + +#ifdef SPDK_CONFIG_DEBUG + setenv("PMEMBLK_LOG_LEVEL", "1", 1); +#endif + spdk_io_device_register(&g_pmem_disks, bdev_pmem_create_cb, bdev_pmem_destroy_cb, 0, "pmem_bdev"); + + bdev_pmem_read_conf(); + + return 0; + +} + +static void +bdev_pmem_finish_done(void *io_device) +{ + spdk_bdev_module_finish_done(); +} + +static void +bdev_pmem_finish(void) 
+{ + spdk_io_device_unregister(&g_pmem_disks, bdev_pmem_finish_done); +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_pmem", SPDK_LOG_BDEV_PMEM) diff --git a/src/spdk/module/bdev/pmem/bdev_pmem.h b/src/spdk/module/bdev/pmem/bdev_pmem.h new file mode 100644 index 000000000..d9292b114 --- /dev/null +++ b/src/spdk/module/bdev/pmem/bdev_pmem.h @@ -0,0 +1,64 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_PMEM_H +#define SPDK_BDEV_PMEM_H + +#include "spdk/bdev.h" + +typedef void (*spdk_delete_pmem_complete)(void *cb_arg, int bdeverrno); + +/** + * Create new pmem bdev. + * + * \param pmem_file Pointer to pmem pool file. + * \param name Bdev name. + * \param bdev output parameter for bdev when operation is successful. + * \return 0 on success. + * -EIO if pool check failed + * -EINVAL if input parameters check failed + * -ENOMEM if buffer cannot be allocated + */ +int create_pmem_disk(const char *pmem_file, const char *name, struct spdk_bdev **bdev); + +/** + * Delete pmem bdev. + * + * \param bdev Pointer to pmem bdev. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn. + */ +void delete_pmem_disk(struct spdk_bdev *bdev, spdk_delete_pmem_complete cb_fn, + void *cb_arg); + +#endif /* SPDK_BDEV_PMEM_H */ diff --git a/src/spdk/module/bdev/pmem/bdev_pmem_rpc.c b/src/spdk/module/bdev/pmem/bdev_pmem_rpc.c new file mode 100644 index 000000000..2af7c1c7a --- /dev/null +++ b/src/spdk/module/bdev/pmem/bdev_pmem_rpc.c @@ -0,0 +1,337 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bdev_pmem.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "libpmemblk.h" + +#include "spdk_internal/log.h" + +struct rpc_construct_pmem { + char *pmem_file; + char *name; +}; + +static void +free_rpc_bdev_pmem_create(struct rpc_construct_pmem *req) +{ + free(req->pmem_file); + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_construct_pmem_decoders[] = { + {"pmem_file", offsetof(struct rpc_construct_pmem, pmem_file), spdk_json_decode_string}, + {"name", offsetof(struct rpc_construct_pmem, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_pmem_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_pmem req = {}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_pmem_decoders, + SPDK_COUNTOF(rpc_construct_pmem_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + rc = create_pmem_disk(req.pmem_file, req.name, &bdev); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_pmem_create(&req); +} +SPDK_RPC_REGISTER("bdev_pmem_create", rpc_bdev_pmem_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_pmem_create, construct_pmem_bdev) + +struct rpc_delete_pmem { + char *name; +}; + +static void +free_rpc_delete_pmem(struct rpc_delete_pmem *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_pmem_decoders[] = { + {"name", offsetof(struct rpc_delete_pmem, name), spdk_json_decode_string}, +}; + +static void +_rpc_bdev_pmem_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_pmem_delete(struct spdk_jsonrpc_request *request, + const 
struct spdk_json_val *params) +{ + struct rpc_delete_pmem req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_delete_pmem_decoders, + SPDK_COUNTOF(rpc_delete_pmem_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "bdev '%s' does not exist\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + delete_pmem_disk(bdev, _rpc_bdev_pmem_delete_cb, request); + +cleanup: + free_rpc_delete_pmem(&req); +} +SPDK_RPC_REGISTER("bdev_pmem_delete", rpc_bdev_pmem_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_pmem_delete, delete_pmem_bdev) + +struct rpc_bdev_pmem_create_pool { + char *pmem_file; + uint64_t num_blocks; + uint32_t block_size; +}; + +static const struct spdk_json_object_decoder rpc_bdev_pmem_create_pool_decoders[] = { + {"pmem_file", offsetof(struct rpc_bdev_pmem_create_pool, pmem_file), spdk_json_decode_string}, + {"num_blocks", offsetof(struct rpc_bdev_pmem_create_pool, num_blocks), spdk_json_decode_uint64}, + {"block_size", offsetof(struct rpc_bdev_pmem_create_pool, block_size), spdk_json_decode_uint32}, +}; + +static void +free_rpc_bdev_pmem_create_pool(struct rpc_bdev_pmem_create_pool *req) +{ + free(req->pmem_file); +} + +static void +rpc_bdev_pmem_create_pool(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_pmem_create_pool req = {}; + struct spdk_json_write_ctx *w; + uint64_t pool_size; + PMEMblkpool *pbp; + + if (spdk_json_decode_object(params, rpc_bdev_pmem_create_pool_decoders, + SPDK_COUNTOF(rpc_bdev_pmem_create_pool_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + /* libpmemblk pool has to contain at least 256 blocks */ + if (req.num_blocks < 256) { + spdk_jsonrpc_send_error_response(request, -EINVAL, + "Pmem pool num_blocks must be at least 256"); + goto cleanup; + } + + pool_size = req.num_blocks * req.block_size; + if (pool_size < PMEMBLK_MIN_POOL) { + spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, + "Pmem pool size must be at least %zu", PMEMBLK_MIN_POOL); + goto cleanup; + } + + pbp = pmemblk_create(req.pmem_file, req.block_size, pool_size, 0666); + if (pbp == NULL) { + const char *msg = pmemblk_errormsg(); + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "pmemblk_create() failed: %s\n", msg ? msg : "(logs disabled)"); + spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "pmemblk_create failed: %s", msg ?
msg : "(logs disabled)"); + goto cleanup; + } + + pmemblk_close(pbp); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_pmem_create_pool(&req); +} +SPDK_RPC_REGISTER("bdev_pmem_create_pool", rpc_bdev_pmem_create_pool, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_pmem_create_pool, create_pmem_pool) + +struct rpc_bdev_pmem_get_pool_info { + char *pmem_file; +}; + +static const struct spdk_json_object_decoder rpc_bdev_pmem_get_pool_info_decoders[] = { + {"pmem_file", offsetof(struct rpc_bdev_pmem_get_pool_info, pmem_file), spdk_json_decode_string}, +}; + +static void +free_rpc_bdev_pmem_get_pool_info(struct rpc_bdev_pmem_get_pool_info *req) +{ + free(req->pmem_file); +} + +static void +rpc_bdev_pmem_get_pool_info(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_pmem_get_pool_info req = {}; + struct spdk_json_write_ctx *w; + size_t num_blocks, block_size; + PMEMblkpool *pbp; + + if (spdk_json_decode_object(params, rpc_bdev_pmem_get_pool_info_decoders, + SPDK_COUNTOF(rpc_bdev_pmem_get_pool_info_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + pbp = pmemblk_open(req.pmem_file, 0); + if (pbp == NULL) { + const char *msg = pmemblk_errormsg(); + + spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "pmemblk_open failed: %s", msg ? msg : "(logs disabled)"); + goto cleanup; + } + + block_size = pmemblk_bsize(pbp); + num_blocks = pmemblk_nblock(pbp); + + pmemblk_close(pbp); + + /* Check pmem pool consistency */ + if (pmemblk_check(req.pmem_file, block_size) != 1) { + const char *msg = pmemblk_errormsg(); + + spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "pmemblk_check failed: %s", msg ? 
msg : "(logs disabled)"); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + spdk_json_write_object_begin(w); + spdk_json_write_named_uint64(w, "num_blocks", num_blocks); + spdk_json_write_named_uint64(w, "block_size", block_size); + spdk_json_write_object_end(w); + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_pmem_get_pool_info(&req); +} +SPDK_RPC_REGISTER("bdev_pmem_get_pool_info", rpc_bdev_pmem_get_pool_info, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_pmem_get_pool_info, pmem_pool_info) + +struct rpc_bdev_pmem_delete_pool { + char *pmem_file; +}; + +static const struct spdk_json_object_decoder rpc_bdev_pmem_delete_pool_decoders[] = { + {"pmem_file", offsetof(struct rpc_bdev_pmem_delete_pool, pmem_file), spdk_json_decode_string}, +}; + +static void +free_rpc_bdev_pmem_delete_pool(struct rpc_bdev_pmem_delete_pool *req) +{ + free(req->pmem_file); +} + +static void +rpc_bdev_pmem_delete_pool(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_pmem_delete_pool req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_pmem_delete_pool_decoders, + SPDK_COUNTOF(rpc_bdev_pmem_delete_pool_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + /* Check if file is actually pmem pool */ + rc = pmemblk_check(req.pmem_file, 0); + if (rc != 1) { + const char *msg = pmemblk_errormsg(); + + spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "pmemblk_check failed: %s", msg ? msg : "(logs disabled)"); + goto cleanup; + } + + unlink(req.pmem_file); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_pmem_delete_pool(&req); +} +SPDK_RPC_REGISTER("bdev_pmem_delete_pool", rpc_bdev_pmem_delete_pool, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_pmem_delete_pool, delete_pmem_pool) diff --git a/src/spdk/module/bdev/raid/Makefile b/src/spdk/module/bdev/raid/Makefile new file mode 100644 index 000000000..452d32e79 --- /dev/null +++ b/src/spdk/module/bdev/raid/Makefile @@ -0,0 +1,51 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ +C_SRCS = bdev_raid.c bdev_raid_rpc.c raid0.c + +ifeq ($(CONFIG_RAID5),y) +C_SRCS += raid5.c +endif + +LIBNAME = bdev_raid + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/raid/bdev_raid.c b/src/spdk/module/bdev/raid/bdev_raid.c new file mode 100644 index 000000000..10da1a799 --- /dev/null +++ b/src/spdk/module/bdev/raid/bdev_raid.c @@ -0,0 +1,1719 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "bdev_raid.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/conf.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/json.h" + +static bool g_shutdown_started = false; + +/* raid bdev config as read from config file */ +struct raid_config g_raid_config = { + .raid_bdev_config_head = TAILQ_HEAD_INITIALIZER(g_raid_config.raid_bdev_config_head), +}; + +/* + * List of raid bdevs in the configured list; these raid bdevs are registered with + * the bdev layer + */ +struct raid_configured_tailq g_raid_bdev_configured_list = TAILQ_HEAD_INITIALIZER( + g_raid_bdev_configured_list); + +/* List of raid bdevs in the configuring list */ +struct raid_configuring_tailq g_raid_bdev_configuring_list = TAILQ_HEAD_INITIALIZER( + g_raid_bdev_configuring_list); + +/* List of all raid bdevs */ +struct raid_all_tailq g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list); + +/* List of all raid bdevs that are offline */ +struct raid_offline_tailq g_raid_bdev_offline_list = TAILQ_HEAD_INITIALIZER( + g_raid_bdev_offline_list); + +static TAILQ_HEAD(, raid_bdev_module) g_raid_modules = TAILQ_HEAD_INITIALIZER(g_raid_modules); + +static struct raid_bdev_module *raid_bdev_module_find(enum raid_level level) +{ + struct raid_bdev_module *raid_module; + + TAILQ_FOREACH(raid_module, &g_raid_modules, link) { + if (raid_module->level == level) { + return raid_module; + } + } + + return NULL; +} + +void raid_bdev_module_list_add(struct raid_bdev_module *raid_module) +{ + if (raid_bdev_module_find(raid_module->level) != NULL) { + SPDK_ERRLOG("module for raid level '%s' already registered.\n", + raid_bdev_level_to_str(raid_module->level)); + assert(false); + } else { + TAILQ_INSERT_TAIL(&g_raid_modules, raid_module, link); + } +} + +/* Function declarations */ +static void raid_bdev_examine(struct spdk_bdev *bdev); +static int raid_bdev_init(void); +static void raid_bdev_deconfigure(struct raid_bdev *raid_bdev, + raid_bdev_destruct_cb cb_fn, void *cb_arg); +static void raid_bdev_remove_base_bdev(void *ctx); + +/* + * brief: + * raid_bdev_create_cb function is a cb function for raid bdev which creates the + * hierarchy from raid bdev to base bdev io channels. It will be called per core + * params: + * io_device - pointer to raid bdev io device represented by raid_bdev + * ctx_buf - pointer to context buffer for raid bdev io channel + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_create_cb(void *io_device, void *ctx_buf) +{ + struct raid_bdev *raid_bdev = io_device; + struct raid_bdev_io_channel *raid_ch = ctx_buf; + uint8_t i; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_create_cb, %p\n", raid_ch); + + assert(raid_bdev != NULL); + assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE); + + raid_ch->num_channels = raid_bdev->num_base_bdevs; + + raid_ch->base_channel = calloc(raid_ch->num_channels, + sizeof(struct spdk_io_channel *)); + if (!raid_ch->base_channel) { + SPDK_ERRLOG("Unable to allocate base bdevs io channel\n"); + return -ENOMEM; + } + for (i = 0; i < raid_ch->num_channels; i++) { + /* + * Get the spdk_io_channel for all the base bdevs. This is used during + * split logic to send the respective child bdev ios to the respective base + * bdev io channels.
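+ * If getting any one of the channels fails, the channels acquired so far + * are released again and the whole io channel creation fails.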
+ */ + raid_ch->base_channel[i] = spdk_bdev_get_io_channel( + raid_bdev->base_bdev_info[i].desc); + if (!raid_ch->base_channel[i]) { + uint8_t j; + + for (j = 0; j < i; j++) { + spdk_put_io_channel(raid_ch->base_channel[j]); + } + free(raid_ch->base_channel); + raid_ch->base_channel = NULL; + SPDK_ERRLOG("Unable to create io channel for base bdev\n"); + return -ENOMEM; + } + } + + return 0; +} + +/* + * brief: + * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the + * hierarchy from raid bdev to base bdev io channels. It will be called per core + * params: + * io_device - pointer to raid bdev io device represented by raid_bdev + * ctx_buf - pointer to context buffer for raid bdev io channel + * returns: + * none + */ +static void +raid_bdev_destroy_cb(void *io_device, void *ctx_buf) +{ + struct raid_bdev_io_channel *raid_ch = ctx_buf; + uint8_t i; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destroy_cb\n"); + + assert(raid_ch != NULL); + assert(raid_ch->base_channel); + for (i = 0; i < raid_ch->num_channels; i++) { + /* Free base bdev channels */ + assert(raid_ch->base_channel[i] != NULL); + spdk_put_io_channel(raid_ch->base_channel[i]); + } + free(raid_ch->base_channel); + raid_ch->base_channel = NULL; +} + +/* + * brief: + * raid_bdev_cleanup is used to cleanup and free raid_bdev related data + * structures. + * params: + * raid_bdev - pointer to raid_bdev + * returns: + * none + */ +static void +raid_bdev_cleanup(struct raid_bdev *raid_bdev) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_cleanup, %p name %s, state %u, config %p\n", + raid_bdev, + raid_bdev->bdev.name, raid_bdev->state, raid_bdev->config); + if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) { + TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link); + } else if (raid_bdev->state == RAID_BDEV_STATE_OFFLINE) { + TAILQ_REMOVE(&g_raid_bdev_offline_list, raid_bdev, state_link); + } else { + assert(0); + } + TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link); + free(raid_bdev->bdev.name); + free(raid_bdev->base_bdev_info); + if (raid_bdev->config) { + raid_bdev->config->raid_bdev = NULL; + } + free(raid_bdev); +} + +/* + * brief: + * wrapper for the bdev close operation + * params: + * ctx - pointer to the base bdev descriptor to close + * returns: + * none + */ +static void +_raid_bdev_free_base_bdev_resource(void *ctx) +{ + struct spdk_bdev_desc *desc = ctx; + + spdk_bdev_close(desc); +} + +/* + * brief: + * free resource of base bdev for raid bdev + * params: + * raid_bdev - pointer to raid bdev + * base_info - raid base bdev info + * returns: + * none + */ +static void +raid_bdev_free_base_bdev_resource(struct raid_bdev *raid_bdev, + struct raid_base_bdev_info *base_info) +{ + spdk_bdev_module_release_bdev(base_info->bdev); + if (base_info->thread && base_info->thread != spdk_get_thread()) { + spdk_thread_send_msg(base_info->thread, _raid_bdev_free_base_bdev_resource, base_info->desc); + } else { + spdk_bdev_close(base_info->desc); + } + base_info->desc = NULL; + base_info->bdev = NULL; + + assert(raid_bdev->num_base_bdevs_discovered); + raid_bdev->num_base_bdevs_discovered--; +} + +/* + * brief: + * raid_bdev_destruct is the destruct function table pointer for raid bdev + * params: + * ctxt - pointer to raid_bdev + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_destruct(void *ctxt) +{ + struct raid_bdev *raid_bdev = ctxt; + struct raid_base_bdev_info *base_info; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destruct\n"); + +
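/* Record that destruct has run so that a base bdev hot-remove callback + * arriving later frees base bdev resources directly instead of trying + * to deconfigure the raid bdev again. */ +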
raid_bdev->destruct_called = true; + RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { + /* + * Close all base bdev descriptors for which call has come from below + * layers. Also close the descriptors if we have started shutdown. + */ + if (g_shutdown_started || + ((base_info->remove_scheduled == true) && + (base_info->bdev != NULL))) { + raid_bdev_free_base_bdev_resource(raid_bdev, base_info); + } + } + + if (g_shutdown_started) { + TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link); + if (raid_bdev->module->stop != NULL) { + raid_bdev->module->stop(raid_bdev); + } + raid_bdev->state = RAID_BDEV_STATE_OFFLINE; + TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link); + } + + spdk_io_device_unregister(raid_bdev, NULL); + + if (raid_bdev->num_base_bdevs_discovered == 0) { + /* Free raid_bdev when there are no base bdevs left */ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev base bdevs is 0, going to free all in destruct\n"); + raid_bdev_cleanup(raid_bdev); + } + + return 0; +} + +void +raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status) +{ + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); + + spdk_bdev_io_complete(bdev_io, status); +} + +/* + * brief: + * raid_bdev_io_complete_part - signal the completion of a part of the expected + * base bdev IOs and complete the raid_io if this is the final expected IO. + * The caller should first set raid_io->base_bdev_io_remaining. This function + * will decrement this counter by the value of the 'completed' parameter and + * complete the raid_io if the counter reaches 0. The caller is free to + * interpret the 'base_bdev_io_remaining' and 'completed' values as needed, + * it can represent e.g. blocks or IOs. + * params: + * raid_io - pointer to raid_bdev_io + * completed - the part of the raid_io that has been completed + * status - status of the base IO + * returns: + * true - if the raid_io is completed + * false - otherwise + */ +bool +raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed, + enum spdk_bdev_io_status status) +{ + assert(raid_io->base_bdev_io_remaining >= completed); + raid_io->base_bdev_io_remaining -= completed; + + if (status != SPDK_BDEV_IO_STATUS_SUCCESS) { + raid_io->base_bdev_io_status = status; + } + + if (raid_io->base_bdev_io_remaining == 0) { + raid_bdev_io_complete(raid_io, raid_io->base_bdev_io_status); + return true; + } else { + return false; + } +} + +/* + * brief: + * raid_bdev_queue_io_wait function processes the IO which failed to submit. + * It will try to queue the IOs after storing the context to bdev wait queue logic. + * params: + * raid_io - pointer to raid_bdev_io + * bdev - the block device that the IO is submitted to + * ch - io channel + * cb_fn - callback when the spdk_bdev_io for bdev becomes available + * returns: + * none + */ +void +raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, + struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn) +{ + raid_io->waitq_entry.bdev = bdev; + raid_io->waitq_entry.cb_fn = cb_fn; + raid_io->waitq_entry.cb_arg = raid_io; + spdk_bdev_queue_io_wait(bdev, ch, &raid_io->waitq_entry); +} + +static void +raid_base_bdev_reset_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct raid_bdev_io *raid_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + raid_bdev_io_complete_part(raid_io, 1, success ? 
+ SPDK_BDEV_IO_STATUS_SUCCESS : + SPDK_BDEV_IO_STATUS_FAILED); +} + +static void +raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io); + +static void +_raid_bdev_submit_reset_request(void *_raid_io) +{ + struct raid_bdev_io *raid_io = _raid_io; + + raid_bdev_submit_reset_request(raid_io); +} + +/* + * brief: + * raid_bdev_submit_reset_request function submits reset requests + * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in + * which case it will queue it for later submission + * params: + * raid_io + * returns: + * none + */ +static void +raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io) +{ + struct raid_bdev *raid_bdev; + int ret; + uint8_t i; + struct raid_base_bdev_info *base_info; + struct spdk_io_channel *base_ch; + + raid_bdev = raid_io->raid_bdev; + + if (raid_io->base_bdev_io_remaining == 0) { + raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs; + } + + while (raid_io->base_bdev_io_submitted < raid_bdev->num_base_bdevs) { + i = raid_io->base_bdev_io_submitted; + base_info = &raid_bdev->base_bdev_info[i]; + base_ch = raid_io->raid_ch->base_channel[i]; + ret = spdk_bdev_reset(base_info->desc, base_ch, + raid_base_bdev_reset_complete, raid_io); + if (ret == 0) { + raid_io->base_bdev_io_submitted++; + } else if (ret == -ENOMEM) { + raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, + _raid_bdev_submit_reset_request); + return; + } else { + SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); + assert(false); + raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + } +} + +/* + * brief: + * Callback function to spdk_bdev_io_get_buf. + * params: + * ch - pointer to raid bdev io channel + * bdev_io - pointer to parent bdev_io on raid bdev device + * success - True if buffer is allocated or false otherwise. + * returns: + * none + */ +static void +raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; + + if (!success) { + raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + raid_io->raid_bdev->module->submit_rw_request(raid_io); +} + +/* + * brief: + * raid_bdev_submit_request function is the submit_request function pointer of + * raid bdev function table. This is used to submit the io on raid_bdev to below + * layers. 
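+ * Reads are first routed through spdk_bdev_io_get_buf() so that a data + * buffer is allocated before the module's submit_rw_request() callback + * runs; writes already carry their buffer and are submitted directly.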
+ * params: + * ch - pointer to raid bdev io channel + * bdev_io - pointer to parent bdev_io on raid bdev device + * returns: + * none + */ +static void +raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; + + raid_io->raid_bdev = bdev_io->bdev->ctxt; + raid_io->raid_ch = spdk_io_channel_get_ctx(ch); + raid_io->base_bdev_io_remaining = 0; + raid_io->base_bdev_io_submitted = 0; + raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + raid_io->raid_bdev->module->submit_rw_request(raid_io); + break; + + case SPDK_BDEV_IO_TYPE_RESET: + raid_bdev_submit_reset_request(raid_io); + break; + + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_UNMAP: + raid_io->raid_bdev->module->submit_null_payload_request(raid_io); + break; + + default: + SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type); + raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); + break; + } +} + +/* + * brief: + * _raid_bdev_io_type_supported checks whether io_type is supported in + * all base bdev modules of the raid bdev module. If any of the base bdevs + * does not support it, the raid device does not support it either. + * + * params: + * raid_bdev - pointer to raid bdev context + * io_type - io type + * returns: + * true - io_type is supported + * false - io_type is not supported + */ +inline static bool +_raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type) +{ + struct raid_base_bdev_info *base_info; + + if (io_type == SPDK_BDEV_IO_TYPE_FLUSH || + io_type == SPDK_BDEV_IO_TYPE_UNMAP) { + if (raid_bdev->module->submit_null_payload_request == NULL) { + return false; + } + } + + RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { + if (base_info->bdev == NULL) { + assert(false); + continue; + } + + if (spdk_bdev_io_type_supported(base_info->bdev, io_type) == false) { + return false; + } + } + + return true; +} + +/* + * brief: + * raid_bdev_io_type_supported is the io_supported function for bdev function + * table which returns whether the particular io type is supported or not by + * raid bdev module + * params: + * ctx - pointer to raid bdev context + * type - io type + * returns: + * true - io_type is supported + * false - io_type is not supported + */ +static bool +raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + return true; + + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_UNMAP: + return _raid_bdev_io_type_supported(ctx, io_type); + + default: + return false; + } +} + +/* + * brief: + * raid_bdev_get_io_channel is the get_io_channel function table pointer for + * raid bdev.
This is used to return the io channel for this raid bdev + * params: + * ctxt - pointer to raid_bdev + * returns: + * pointer to io channel for raid bdev + */ +static struct spdk_io_channel * +raid_bdev_get_io_channel(void *ctxt) +{ + struct raid_bdev *raid_bdev = ctxt; + + return spdk_get_io_channel(raid_bdev); +} + +/* + * brief: + * raid_bdev_dump_info_json is the function table pointer for raid bdev + * params: + * ctx - pointer to raid_bdev + * w - pointer to json context + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct raid_bdev *raid_bdev = ctx; + struct raid_base_bdev_info *base_info; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_dump_config_json\n"); + assert(raid_bdev != NULL); + + /* Dump the raid bdev configuration related information */ + spdk_json_write_named_object_begin(w, "raid"); + spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size); + spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb); + spdk_json_write_named_uint32(w, "state", raid_bdev->state); + spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); + spdk_json_write_named_uint32(w, "destruct_called", raid_bdev->destruct_called); + spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs); + spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered); + spdk_json_write_name(w, "base_bdevs_list"); + spdk_json_write_array_begin(w); + RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { + if (base_info->bdev) { + spdk_json_write_string(w, base_info->bdev->name); + } else { + spdk_json_write_null(w); + } + } + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + + return 0; +} + +/* + * brief: + * raid_bdev_write_config_json is the function table pointer for raid bdev + * params: + * bdev - pointer to spdk_bdev + * w - pointer to json context + * returns: + * none + */ +static void +raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct raid_bdev *raid_bdev = bdev->ctxt; + struct raid_base_bdev_info *base_info; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_raid_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size_kb); + spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); + + spdk_json_write_named_array_begin(w, "base_bdevs"); + RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { + if (base_info->bdev) { + spdk_json_write_string(w, base_info->bdev->name); + } + } + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +/* g_raid_bdev_fn_table is the function table for raid bdev */ +static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = { + .destruct = raid_bdev_destruct, + .submit_request = raid_bdev_submit_request, + .io_type_supported = raid_bdev_io_type_supported, + .get_io_channel = raid_bdev_get_io_channel, + .dump_info_json = raid_bdev_dump_info_json, + .write_config_json = raid_bdev_write_config_json, +}; + +/* + * brief: + * raid_bdev_config_cleanup function is used to free memory for one raid_bdev in configuration + * params: + * raid_cfg - pointer to raid_bdev_config structure + * returns: + * none + */ +void +raid_bdev_config_cleanup(struct raid_bdev_config *raid_cfg) +{ + 
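/* Unlink the config from the global list first, then free the per-slot + * base bdev names, the base bdev array, the config name and the config + * itself. */ +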
uint8_t i; + + TAILQ_REMOVE(&g_raid_config.raid_bdev_config_head, raid_cfg, link); + g_raid_config.total_raid_bdev--; + + if (raid_cfg->base_bdev) { + for (i = 0; i < raid_cfg->num_base_bdevs; i++) { + free(raid_cfg->base_bdev[i].name); + } + free(raid_cfg->base_bdev); + } + free(raid_cfg->name); + free(raid_cfg); +} + +/* + * brief: + * raid_bdev_free is used to free all the raid bdev configurations. This is + * called on the bdev module exit path + * params: + * none + * returns: + * none + */ +static void +raid_bdev_free(void) +{ + struct raid_bdev_config *raid_cfg, *tmp; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_free\n"); + TAILQ_FOREACH_SAFE(raid_cfg, &g_raid_config.raid_bdev_config_head, link, tmp) { + raid_bdev_config_cleanup(raid_cfg); + } +} + +/* brief + * raid_bdev_config_find_by_name is a helper function to find raid bdev config + * by name as key. + * + * params: + * raid_name - name for raid bdev. + */ +struct raid_bdev_config * +raid_bdev_config_find_by_name(const char *raid_name) +{ + struct raid_bdev_config *raid_cfg; + + TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) { + if (!strcmp(raid_cfg->name, raid_name)) { + return raid_cfg; + } + } + + return NULL; +} + +/* + * brief + * raid_bdev_config_add function adds config for newly created raid bdev. + * + * params: + * raid_name - name for raid bdev. + * strip_size - strip size in KB + * num_base_bdevs - number of base bdevs. + * level - raid level. + * _raid_cfg - Pointer to newly added configuration + */ +int +raid_bdev_config_add(const char *raid_name, uint32_t strip_size, uint8_t num_base_bdevs, + enum raid_level level, struct raid_bdev_config **_raid_cfg) +{ + struct raid_bdev_config *raid_cfg; + + raid_cfg = raid_bdev_config_find_by_name(raid_name); + if (raid_cfg != NULL) { + SPDK_ERRLOG("Duplicate raid bdev name found in config file %s\n", + raid_name); + return -EEXIST; + } + + if (spdk_u32_is_pow2(strip_size) == false) { + SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size); + return -EINVAL; + } + + if (num_base_bdevs == 0) { + SPDK_ERRLOG("Invalid base device count %u\n", num_base_bdevs); + return -EINVAL; + } + + raid_cfg = calloc(1, sizeof(*raid_cfg)); + if (raid_cfg == NULL) { + SPDK_ERRLOG("unable to allocate memory\n"); + return -ENOMEM; + } + + raid_cfg->name = strdup(raid_name); + if (!raid_cfg->name) { + free(raid_cfg); + SPDK_ERRLOG("unable to allocate memory\n"); + return -ENOMEM; + } + raid_cfg->strip_size = strip_size; + raid_cfg->num_base_bdevs = num_base_bdevs; + raid_cfg->level = level; + + raid_cfg->base_bdev = calloc(num_base_bdevs, sizeof(*raid_cfg->base_bdev)); + if (raid_cfg->base_bdev == NULL) { + free(raid_cfg->name); + free(raid_cfg); + SPDK_ERRLOG("unable to allocate memory\n"); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&g_raid_config.raid_bdev_config_head, raid_cfg, link); + g_raid_config.total_raid_bdev++; + + *_raid_cfg = raid_cfg; + return 0; +} + +/* + * brief: + * raid_bdev_config_add_base_bdev function adds a base bdev to the raid bdev config.
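+ * The name is checked against every configured raid bdev first so that the + * same base bdev cannot be claimed by two configurations.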
+ * + * params: + * raid_cfg - pointer to raid bdev configuration + * base_bdev_name - name of base bdev + * slot - Position to add base bdev + */ +int +raid_bdev_config_add_base_bdev(struct raid_bdev_config *raid_cfg, const char *base_bdev_name, + uint8_t slot) +{ + uint8_t i; + struct raid_bdev_config *tmp; + + if (slot >= raid_cfg->num_base_bdevs) { + return -EINVAL; + } + + TAILQ_FOREACH(tmp, &g_raid_config.raid_bdev_config_head, link) { + for (i = 0; i < tmp->num_base_bdevs; i++) { + if (tmp->base_bdev[i].name != NULL) { + if (!strcmp(tmp->base_bdev[i].name, base_bdev_name)) { + SPDK_ERRLOG("duplicate base bdev name %s mentioned\n", + base_bdev_name); + return -EEXIST; + } + } + } + } + + raid_cfg->base_bdev[slot].name = strdup(base_bdev_name); + if (raid_cfg->base_bdev[slot].name == NULL) { + SPDK_ERRLOG("unable to allocate memory\n"); + return -ENOMEM; + } + + return 0; +} + +static struct { + const char *name; + enum raid_level value; +} g_raid_level_names[] = { + { "raid0", RAID0 }, + { "0", RAID0 }, + { "raid5", RAID5 }, + { "5", RAID5 }, + { } +}; + +enum raid_level raid_bdev_parse_raid_level(const char *str) +{ + unsigned int i; + + assert(str != NULL); + + for (i = 0; g_raid_level_names[i].name != NULL; i++) { + if (strcasecmp(g_raid_level_names[i].name, str) == 0) { + return g_raid_level_names[i].value; + } + } + + return INVALID_RAID_LEVEL; +} + +const char * +raid_bdev_level_to_str(enum raid_level level) +{ + unsigned int i; + + for (i = 0; g_raid_level_names[i].name != NULL; i++) { + if (g_raid_level_names[i].value == level) { + return g_raid_level_names[i].name; + } + } + + return ""; +} + +/* + * brief: + * raid_bdev_parse_raid is used to parse the raid bdev from config file based on + * pre-defined raid bdev format in config file. 
+ * Format of config file: + * [RAID1] + * Name raid1 + * StripSize 64 + * NumDevices 2 + * RaidLevel 0 + * Devices Nvme0n1 Nvme1n1 + * + * [RAID2] + * Name raid2 + * StripSize 64 + * NumDevices 3 + * RaidLevel 0 + * Devices Nvme2n1 Nvme3n1 Nvme4n1 + * + * params: + * conf_section - pointer to config section + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_parse_raid(struct spdk_conf_section *conf_section) +{ + const char *raid_name; + uint32_t strip_size; + uint8_t num_base_bdevs; + const char *raid_level_str; + enum raid_level level; + const char *base_bdev_name; + struct raid_bdev_config *raid_cfg; + int rc, i, val; + + raid_name = spdk_conf_section_get_val(conf_section, "Name"); + if (raid_name == NULL) { + SPDK_ERRLOG("raid_name is null\n"); + return -EINVAL; + } + + val = spdk_conf_section_get_intval(conf_section, "StripSize"); + if (val < 0) { + return -EINVAL; + } + strip_size = val; + + val = spdk_conf_section_get_intval(conf_section, "NumDevices"); + if (val < 0) { + return -EINVAL; + } + num_base_bdevs = val; + + raid_level_str = spdk_conf_section_get_val(conf_section, "RaidLevel"); + if (raid_level_str == NULL) { + SPDK_ERRLOG("Missing RaidLevel\n"); + return -EINVAL; + } + level = raid_bdev_parse_raid_level(raid_level_str); + if (level == INVALID_RAID_LEVEL) { + SPDK_ERRLOG("Invalid RaidLevel\n"); + return -EINVAL; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "%s %" PRIu32 " %u %u\n", + raid_name, strip_size, num_base_bdevs, level); + + rc = raid_bdev_config_add(raid_name, strip_size, num_base_bdevs, level, + &raid_cfg); + if (rc != 0) { + SPDK_ERRLOG("Failed to add raid bdev config\n"); + return rc; + } + + for (i = 0; true; i++) { + base_bdev_name = spdk_conf_section_get_nmval(conf_section, "Devices", 0, i); + if (base_bdev_name == NULL) { + break; + } + if (i >= num_base_bdevs) { + raid_bdev_config_cleanup(raid_cfg); + SPDK_ERRLOG("Number of devices mentioned is more than count\n"); + return -EINVAL; + } + + rc = raid_bdev_config_add_base_bdev(raid_cfg, base_bdev_name, i); + if (rc != 0) { + raid_bdev_config_cleanup(raid_cfg); + SPDK_ERRLOG("Failed to add base bdev to raid bdev config\n"); + return rc; + } + } + + if (i != raid_cfg->num_base_bdevs) { + raid_bdev_config_cleanup(raid_cfg); + SPDK_ERRLOG("Number of devices mentioned is less than count\n"); + return -EINVAL; + } + + rc = raid_bdev_create(raid_cfg); + if (rc != 0) { + raid_bdev_config_cleanup(raid_cfg); + SPDK_ERRLOG("Failed to create raid bdev\n"); + return rc; + } + + rc = raid_bdev_add_base_devices(raid_cfg); + if (rc != 0) { + SPDK_ERRLOG("Failed to add any base bdev to raid bdev\n"); + /* Config is not removed in this case. 
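+ * The raid bdev stays in the configuring state and the remaining base + * bdevs can still be attached later through the examine callback.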
*/ + } + + return 0; +} + +/* + * brief: + * raid_bdev_parse_config is used to find the raid bdev config sections and parse them + * params: + * none + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_parse_config(void) +{ + int ret; + struct spdk_conf_section *conf_section; + + conf_section = spdk_conf_first_section(NULL); + while (conf_section != NULL) { + if (spdk_conf_section_match_prefix(conf_section, "RAID")) { + ret = raid_bdev_parse_raid(conf_section); + if (ret < 0) { + SPDK_ERRLOG("Unable to parse raid bdev section\n"); + return ret; + } + } + conf_section = spdk_conf_next_section(conf_section); + } + + return 0; +} + +/* + * brief: + * raid_bdev_fini_start is called when bdev layer is starting the + * shutdown process + * params: + * none + * returns: + * none + */ +static void +raid_bdev_fini_start(void) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_fini_start\n"); + g_shutdown_started = true; +} + +/* + * brief: + * raid_bdev_exit is called at raid bdev module exit time by the bdev layer + * params: + * none + * returns: + * none + */ +static void +raid_bdev_exit(void) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_exit\n"); + raid_bdev_free(); +} + +/* + * brief: + * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid + * module + * params: + * none + * returns: + * size of spdk_bdev_io context for raid + */ +static int +raid_bdev_get_ctx_size(void) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_get_ctx_size\n"); + return sizeof(struct raid_bdev_io); +} + +/* + * brief: + * raid_bdev_get_running_config is used to get the configuration options. + * + * params: + * fp - pointer to the file that the configuration options will be written to. + * returns: + * none + */ +static void +raid_bdev_get_running_config(FILE *fp) +{ + struct raid_bdev *raid_bdev; + struct raid_base_bdev_info *base_info; + int index = 1; + + TAILQ_FOREACH(raid_bdev, &g_raid_bdev_configured_list, state_link) { + fprintf(fp, + "\n" + "[RAID%d]\n" + " Name %s\n" + " StripSize %" PRIu32 "\n" + " NumDevices %u\n" + " RaidLevel %s\n", + index, raid_bdev->bdev.name, raid_bdev->strip_size_kb, + raid_bdev->num_base_bdevs, + raid_bdev_level_to_str(raid_bdev->level)); + fprintf(fp, + " Devices "); + RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { + if (base_info->bdev) { + fprintf(fp, + "%s ", + base_info->bdev->name); + } + } + fprintf(fp, + "\n"); + index++; + } +} + +/* + * brief: + * raid_bdev_can_claim_bdev is the function to check if this base_bdev can be + * claimed by raid bdev or not. + * params: + * bdev_name - represents base bdev name + * _raid_cfg - pointer to raid bdev config parsed from config file + * base_bdev_slot - if bdev can be claimed, it represents the base_bdev correct + * slot. This field is only valid if return value of this function is true + * returns: + * true - if bdev can be claimed + * false - if bdev can't be claimed + */ +static bool +raid_bdev_can_claim_bdev(const char *bdev_name, struct raid_bdev_config **_raid_cfg, + uint8_t *base_bdev_slot) +{ + struct raid_bdev_config *raid_cfg; + uint8_t i; + + TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) { + for (i = 0; i < raid_cfg->num_base_bdevs; i++) { + /* + * Check if the base bdev name is part of raid bdev configuration.
+ * If match is found then return true and the slot information where + * this base bdev should be inserted in raid bdev + */ + if (!strcmp(bdev_name, raid_cfg->base_bdev[i].name)) { + *_raid_cfg = raid_cfg; + *base_bdev_slot = i; + return true; + } + } + } + + return false; +} + +static struct spdk_bdev_module g_raid_if = { + .name = "raid", + .module_init = raid_bdev_init, + .fini_start = raid_bdev_fini_start, + .module_fini = raid_bdev_exit, + .get_ctx_size = raid_bdev_get_ctx_size, + .examine_config = raid_bdev_examine, + .config_text = raid_bdev_get_running_config, + .async_init = false, + .async_fini = false, +}; +SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if) + +/* + * brief: + * raid_bdev_init is the initialization function for raid bdev module + * params: + * none + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_init(void) +{ + int ret; + + /* Parse config file for raids */ + ret = raid_bdev_parse_config(); + if (ret < 0) { + SPDK_ERRLOG("raid bdev init failed parsing\n"); + raid_bdev_free(); + return ret; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_init completed successfully\n"); + + return 0; +} + +/* + * brief: + * raid_bdev_create allocates raid bdev based on passed configuration + * params: + * raid_cfg - configuration of raid bdev + * returns: + * 0 - success + * non zero - failure + */ +int +raid_bdev_create(struct raid_bdev_config *raid_cfg) +{ + struct raid_bdev *raid_bdev; + struct spdk_bdev *raid_bdev_gen; + struct raid_bdev_module *module; + + module = raid_bdev_module_find(raid_cfg->level); + if (module == NULL) { + SPDK_ERRLOG("Unsupported raid level '%d'\n", raid_cfg->level); + return -EINVAL; + } + + assert(module->base_bdevs_min != 0); + if (raid_cfg->num_base_bdevs < module->base_bdevs_min) { + SPDK_ERRLOG("At least %u base devices required for %s\n", + module->base_bdevs_min, + raid_bdev_level_to_str(raid_cfg->level)); + return -EINVAL; + } + + raid_bdev = calloc(1, sizeof(*raid_bdev)); + if (!raid_bdev) { + SPDK_ERRLOG("Unable to allocate memory for raid bdev\n"); + return -ENOMEM; + } + + raid_bdev->module = module; + raid_bdev->num_base_bdevs = raid_cfg->num_base_bdevs; + raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs, + sizeof(struct raid_base_bdev_info)); + if (!raid_bdev->base_bdev_info) { + SPDK_ERRLOG("Unable to allocate base bdev info\n"); + free(raid_bdev); + return -ENOMEM; + } + + /* strip_size_kb is from the rpc param. strip_size is in blocks and used + * internally and set later. + */ + raid_bdev->strip_size = 0; + raid_bdev->strip_size_kb = raid_cfg->strip_size; + raid_bdev->state = RAID_BDEV_STATE_CONFIGURING; + raid_bdev->config = raid_cfg; + raid_bdev->level = raid_cfg->level; + + raid_bdev_gen = &raid_bdev->bdev; + + raid_bdev_gen->name = strdup(raid_cfg->name); + if (!raid_bdev_gen->name) { + SPDK_ERRLOG("Unable to allocate name for raid\n"); + free(raid_bdev->base_bdev_info); + free(raid_bdev); + return -ENOMEM; + } + + raid_bdev_gen->product_name = "Raid Volume"; + raid_bdev_gen->ctxt = raid_bdev; + raid_bdev_gen->fn_table = &g_raid_bdev_fn_table; + raid_bdev_gen->module = &g_raid_if; + raid_bdev_gen->write_cache = 0; + + TAILQ_INSERT_TAIL(&g_raid_bdev_configuring_list, raid_bdev, state_link); + TAILQ_INSERT_TAIL(&g_raid_bdev_list, raid_bdev, global_link); + + raid_cfg->raid_bdev = raid_bdev; + + return 0; +} + +/* + * brief + * raid_bdev_alloc_base_bdev_resource allocates resource of base bdev.
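+ * It opens a descriptor on the base bdev, claims the bdev for the raid + * module and records the opening thread so that the descriptor can later + * be closed on that same thread.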
+ * params: + * raid_bdev - pointer to raid bdev + * bdev - pointer to base bdev + * base_bdev_slot - position to add base bdev + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_alloc_base_bdev_resource(struct raid_bdev *raid_bdev, struct spdk_bdev *bdev, + uint8_t base_bdev_slot) +{ + struct spdk_bdev_desc *desc; + int rc; + + rc = spdk_bdev_open(bdev, true, raid_bdev_remove_base_bdev, bdev, &desc); + if (rc != 0) { + SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", bdev->name); + return rc; + } + + rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if); + if (rc != 0) { + SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n"); + spdk_bdev_close(desc); + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s is claimed\n", bdev->name); + + assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE); + assert(base_bdev_slot < raid_bdev->num_base_bdevs); + + raid_bdev->base_bdev_info[base_bdev_slot].thread = spdk_get_thread(); + raid_bdev->base_bdev_info[base_bdev_slot].bdev = bdev; + raid_bdev->base_bdev_info[base_bdev_slot].desc = desc; + raid_bdev->num_base_bdevs_discovered++; + assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs); + + return 0; +} + +/* + * brief: + * If raid bdev config is complete, then only register the raid bdev to + * bdev layer and remove this raid bdev from configuring list and + * insert the raid bdev to configured list + * params: + * raid_bdev - pointer to raid bdev + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_configure(struct raid_bdev *raid_bdev) +{ + uint32_t blocklen = 0; + struct spdk_bdev *raid_bdev_gen; + struct raid_base_bdev_info *base_info; + int rc = 0; + + assert(raid_bdev->state == RAID_BDEV_STATE_CONFIGURING); + assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs); + + RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { + /* Check blocklen for all base bdevs that it should be same */ + if (blocklen == 0) { + blocklen = base_info->bdev->blocklen; + } else if (blocklen != base_info->bdev->blocklen) { + /* + * Assumption is that all the base bdevs for any raid bdev should + * have same blocklen + */ + SPDK_ERRLOG("Blocklen of various bdevs not matching\n"); + return -EINVAL; + } + } + assert(blocklen > 0); + + /* The strip_size_kb is read in from user in KB. Convert to blocks here for + * internal use. 
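+ * For example, with strip_size_kb = 64 and a base bdev blocklen of 512, + * strip_size = (64 * 1024) / 512 = 128 blocks and strip_size_shift = + * spdk_u32log2(128) = 7.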
+ */ + raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / blocklen; + raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size); + raid_bdev->blocklen_shift = spdk_u32log2(blocklen); + + raid_bdev_gen = &raid_bdev->bdev; + raid_bdev_gen->blocklen = blocklen; + + rc = raid_bdev->module->start(raid_bdev); + if (rc != 0) { + SPDK_ERRLOG("raid module startup callback failed\n"); + return rc; + } + raid_bdev->state = RAID_BDEV_STATE_ONLINE; + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "io device register %p\n", raid_bdev); + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "blockcnt %lu, blocklen %u\n", + raid_bdev_gen->blockcnt, raid_bdev_gen->blocklen); + spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb, + sizeof(struct raid_bdev_io_channel), + raid_bdev->bdev.name); + rc = spdk_bdev_register(raid_bdev_gen); + if (rc != 0) { + SPDK_ERRLOG("Unable to register raid bdev; staying at configuring state\n"); + if (raid_bdev->module->stop != NULL) { + raid_bdev->module->stop(raid_bdev); + } + spdk_io_device_unregister(raid_bdev, NULL); + raid_bdev->state = RAID_BDEV_STATE_CONFIGURING; + return rc; + } + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev generic %p\n", raid_bdev_gen); + TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link); + TAILQ_INSERT_TAIL(&g_raid_bdev_configured_list, raid_bdev, state_link); + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev is created with name %s, raid_bdev %p\n", + raid_bdev_gen->name, raid_bdev); + + return 0; +} + +/* + * brief: + * If raid bdev is online and registered, change the bdev state to + * offline and unregister this raid device. Queue this raid device + * in the offline list + * params: + * raid_bdev - pointer to raid bdev + * cb_fn - callback function + * cb_arg - argument to callback function + * returns: + * none + */ +static void +raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, + void *cb_arg) +{ + if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { + if (cb_fn) { + cb_fn(cb_arg, 0); + } + return; + } + + assert(raid_bdev->num_base_bdevs == raid_bdev->num_base_bdevs_discovered); + TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link); + if (raid_bdev->module->stop != NULL) { + raid_bdev->module->stop(raid_bdev); + } + raid_bdev->state = RAID_BDEV_STATE_OFFLINE; + assert(raid_bdev->num_base_bdevs_discovered); + TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link); + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev state changing from online to offline\n"); + + spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg); +} + +/* + * brief: + * raid_bdev_find_by_base_bdev function finds the raid bdev which has + * claimed the base bdev. + * params: + * base_bdev - pointer to base bdev + * _raid_bdev - Reference to pointer to raid bdev + * _base_info - Reference to the raid base bdev info. + * returns: + * true - if the raid bdev is found. + * false - if the raid bdev is not found. + */ +static bool +raid_bdev_find_by_base_bdev(struct spdk_bdev *base_bdev, struct raid_bdev **_raid_bdev, + struct raid_base_bdev_info **_base_info) +{ + struct raid_bdev *raid_bdev; + struct raid_base_bdev_info *base_info; + + TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { + RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { + if (base_info->bdev == base_bdev) { + *_raid_bdev = raid_bdev; + *_base_info = base_info; + return true; + } + } + } + + return false; +} + +/* + * brief: + * raid_bdev_remove_base_bdev function is called by below layers when base_bdev + * is removed.
This function checks if this base bdev is part of any raid bdev + * or not. If yes, it takes necessary action on that particular raid bdev. + * params: + * ctx - pointer to the base bdev that got removed + * returns: + * none + */ +static void +raid_bdev_remove_base_bdev(void *ctx) +{ + struct spdk_bdev *base_bdev = ctx; + struct raid_bdev *raid_bdev = NULL; + struct raid_base_bdev_info *base_info; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_bdev\n"); + + /* Find the raid_bdev which has claimed this base_bdev */ + if (!raid_bdev_find_by_base_bdev(base_bdev, &raid_bdev, &base_info)) { + SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name); + return; + } + + assert(base_info->desc); + base_info->remove_scheduled = true; + + if (raid_bdev->destruct_called == true || + raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) { + /* + * The raid bdev is not registered yet or is already unregistered, + * so the cleanup is done here directly. + */ + raid_bdev_free_base_bdev_resource(raid_bdev, base_info); + if (raid_bdev->num_base_bdevs_discovered == 0) { + /* There is no base bdev for this raid, so free the raid device. */ + raid_bdev_cleanup(raid_bdev); + return; + } + } + + raid_bdev_deconfigure(raid_bdev, NULL, NULL); +} + +/* + * brief: + * Remove base bdevs from the raid bdev one by one. Skip any base bdev which + * doesn't exist. + * params: + * raid_cfg - pointer to raid bdev config. + * cb_fn - callback function + * cb_arg - argument to callback function + */ +void +raid_bdev_remove_base_devices(struct raid_bdev_config *raid_cfg, + raid_bdev_destruct_cb cb_fn, void *cb_arg) +{ + struct raid_bdev *raid_bdev; + struct raid_base_bdev_info *base_info; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_devices\n"); + + raid_bdev = raid_cfg->raid_bdev; + if (raid_bdev == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev %s doesn't exist now\n", raid_cfg->name); + if (cb_fn) { + cb_fn(cb_arg, 0); + } + return; + } + + if (raid_bdev->destroy_started) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "destroying raid bdev %s is already started\n", + raid_cfg->name); + if (cb_fn) { + cb_fn(cb_arg, -EALREADY); + } + return; + } + + raid_bdev->destroy_started = true; + + RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { + if (base_info->bdev == NULL) { + continue; + } + + assert(base_info->desc); + base_info->remove_scheduled = true; + + if (raid_bdev->destruct_called == true || + raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) { + /* + * The raid bdev is not registered yet or is already unregistered, + * so the cleanup is done here directly. + */ + raid_bdev_free_base_bdev_resource(raid_bdev, base_info); + if (raid_bdev->num_base_bdevs_discovered == 0) { + /* There is no base bdev for this raid, so free the raid device. */ + raid_bdev_cleanup(raid_bdev); + if (cb_fn) { + cb_fn(cb_arg, 0); + } + return; + } + } + } + + raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg); +} + +/* + * brief: + * raid_bdev_add_base_device function adds the base device to the existing + * raid bdev. It also claims the base device and keeps the open descriptor.
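+ * Once the last slot has been populated, raid_bdev_configure() is called + * to register the raid bdev and bring it online.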
+ * params:
+ * raid_cfg - pointer to raid bdev config
+ * bdev - pointer to base bdev
+ * base_bdev_slot - position to add base bdev
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+static int
+raid_bdev_add_base_device(struct raid_bdev_config *raid_cfg, struct spdk_bdev *bdev,
+                          uint8_t base_bdev_slot)
+{
+        struct raid_bdev *raid_bdev;
+        int rc;
+
+        raid_bdev = raid_cfg->raid_bdev;
+        if (!raid_bdev) {
+                SPDK_ERRLOG("Raid bdev '%s' is not created yet\n", raid_cfg->name);
+                return -ENODEV;
+        }
+
+        rc = raid_bdev_alloc_base_bdev_resource(raid_bdev, bdev, base_bdev_slot);
+        if (rc != 0) {
+                SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", bdev->name);
+                return rc;
+        }
+
+        assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
+
+        if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) {
+                rc = raid_bdev_configure(raid_bdev);
+                if (rc != 0) {
+                        SPDK_ERRLOG("Failed to configure raid bdev\n");
+                        return rc;
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * brief:
+ * Add base bdevs to the raid bdev one by one. Skip any base bdev which doesn't
+ * exist or fails to add. If all base bdevs are successfully added, the raid bdev
+ * moves to the configured state and becomes available. Otherwise, the raid bdev
+ * stays at the configuring state with the base bdevs that were added.
+ * params:
+ * raid_cfg - pointer to raid bdev config
+ * returns:
+ * 0 - The raid bdev moves to the configured state, or stays at the configuring
+ * state because some base bdevs do not exist yet.
+ * non zero - Failed to add a base bdev; the raid bdev stays at the configuring
+ * state with the base bdevs that were added.
+ */
+int
+raid_bdev_add_base_devices(struct raid_bdev_config *raid_cfg)
+{
+        struct spdk_bdev *base_bdev;
+        uint8_t i;
+        int rc = 0, _rc;
+
+        for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
+                base_bdev = spdk_bdev_get_by_name(raid_cfg->base_bdev[i].name);
+                if (base_bdev == NULL) {
+                        SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "base bdev %s doesn't exist now\n",
+                                      raid_cfg->base_bdev[i].name);
+                        continue;
+                }
+
+                _rc = raid_bdev_add_base_device(raid_cfg, base_bdev, i);
+                if (_rc != 0) {
+                        SPDK_ERRLOG("Failed to add base bdev %s to RAID bdev %s: %s\n",
+                                    raid_cfg->base_bdev[i].name, raid_cfg->name,
+                                    spdk_strerror(-_rc));
+                        if (rc == 0) {
+                                rc = _rc;
+                        }
+                }
+        }
+
+        return rc;
+}
+
+/*
+ * brief:
+ * raid_bdev_examine function is the examine callback invoked by lower layers
+ * such as the bdev_nvme layer. It checks whether this base bdev can be
+ * claimed by a raid bdev or not.
+ * params:
+ * bdev - pointer to base bdev
+ * returns:
+ * none
+ */
+static void
+raid_bdev_examine(struct spdk_bdev *bdev)
+{
+        struct raid_bdev_config *raid_cfg;
+        uint8_t base_bdev_slot;
+
+        if (raid_bdev_can_claim_bdev(bdev->name, &raid_cfg, &base_bdev_slot)) {
+                raid_bdev_add_base_device(raid_cfg, bdev, base_bdev_slot);
+        } else {
+                SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s can't be claimed\n",
+                              bdev->name);
+        }
+
+        spdk_bdev_module_examine_done(&g_raid_if);
+}
+
+/* Log component for the raid bdev module */
+SPDK_LOG_REGISTER_COMPONENT("bdev_raid", SPDK_LOG_BDEV_RAID)
diff --git a/src/spdk/module/bdev/raid/bdev_raid.h b/src/spdk/module/bdev/raid/bdev_raid.h
new file mode 100644
index 000000000..4acca1da6
--- /dev/null
+++ b/src/spdk/module/bdev/raid/bdev_raid.h
@@ -0,0 +1,319 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_RAID_INTERNAL_H +#define SPDK_BDEV_RAID_INTERNAL_H + +#include "spdk/bdev_module.h" + +enum raid_level { + INVALID_RAID_LEVEL = -1, + RAID0 = 0, + RAID5 = 5, +}; + +/* + * Raid state describes the state of the raid. This raid bdev can be either in + * configured list or configuring list + */ +enum raid_bdev_state { + /* raid bdev is ready and is seen by upper layers */ + RAID_BDEV_STATE_ONLINE, + + /* + * raid bdev is configuring, not all underlying bdevs are present. + * And can't be seen by upper layers. + */ + RAID_BDEV_STATE_CONFIGURING, + + /* + * In offline state, raid bdev layer will complete all incoming commands without + * submitting to underlying base nvme bdevs + */ + RAID_BDEV_STATE_OFFLINE, + + /* raid bdev max, new states should be added before this */ + RAID_BDEV_MAX +}; + +/* + * raid_base_bdev_info contains information for the base bdevs which are part of some + * raid. This structure contains the per base bdev information. Whatever is + * required per base device for raid bdev will be kept here + */ +struct raid_base_bdev_info { + /* pointer to base spdk bdev */ + struct spdk_bdev *bdev; + + /* pointer to base bdev descriptor opened by raid bdev */ + struct spdk_bdev_desc *desc; + + /* + * When underlying base device calls the hot plug function on drive removal, + * this flag will be set and later after doing some processing, base device + * descriptor will be closed + */ + bool remove_scheduled; + + /* thread where base device is opened */ + struct spdk_thread *thread; +}; + +/* + * raid_bdev_io is the context part of bdev_io. It contains the information + * related to bdev_io for a raid bdev + */ +struct raid_bdev_io { + /* The raid bdev associated with this IO */ + struct raid_bdev *raid_bdev; + + /* WaitQ entry, used only in waitq logic */ + struct spdk_bdev_io_wait_entry waitq_entry; + + /* Context of the original channel for this IO */ + struct raid_bdev_io_channel *raid_ch; + + /* Used for tracking progress on io requests sent to member disks. 
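+ * (Editorial note: these three fields implement the fan-out accounting used by
+ * raid_bdev_io_complete_part(), declared at the bottom of this header; e.g. the
+ * raid0 flush/unmap path sets base_bdev_io_remaining to the number of involved
+ * disks and each child completion calls raid_bdev_io_complete_part(raid_io, 1, status).)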
*/ + uint64_t base_bdev_io_remaining; + uint8_t base_bdev_io_submitted; + uint8_t base_bdev_io_status; +}; + +/* + * raid_bdev is the single entity structure which contains SPDK block device + * and the information related to any raid bdev either configured or + * in configuring list. io device is created on this. + */ +struct raid_bdev { + /* raid bdev device, this will get registered in bdev layer */ + struct spdk_bdev bdev; + + /* link of raid bdev to link it to configured, configuring or offline list */ + TAILQ_ENTRY(raid_bdev) state_link; + + /* link of raid bdev to link it to global raid bdev list */ + TAILQ_ENTRY(raid_bdev) global_link; + + /* pointer to config file entry */ + struct raid_bdev_config *config; + + /* array of base bdev info */ + struct raid_base_bdev_info *base_bdev_info; + + /* strip size of raid bdev in blocks */ + uint32_t strip_size; + + /* strip size of raid bdev in KB */ + uint32_t strip_size_kb; + + /* strip size bit shift for optimized calculation */ + uint32_t strip_size_shift; + + /* block length bit shift for optimized calculation */ + uint32_t blocklen_shift; + + /* state of raid bdev */ + enum raid_bdev_state state; + + /* number of base bdevs comprising raid bdev */ + uint8_t num_base_bdevs; + + /* number of base bdevs discovered */ + uint8_t num_base_bdevs_discovered; + + /* Raid Level of this raid bdev */ + enum raid_level level; + + /* Set to true if destruct is called for this raid bdev */ + bool destruct_called; + + /* Set to true if destroy of this raid bdev is started. */ + bool destroy_started; + + /* Module for RAID-level specific operations */ + struct raid_bdev_module *module; + + /* Private data for the raid module */ + void *module_private; +}; + +#define RAID_FOR_EACH_BASE_BDEV(r, i) \ + for (i = r->base_bdev_info; i < r->base_bdev_info + r->num_base_bdevs; i++) + +/* + * raid_base_bdev_config is the per base bdev data structure which contains + * information w.r.t to per base bdev during parsing config + */ +struct raid_base_bdev_config { + /* base bdev name from config file */ + char *name; +}; + +/* + * raid_bdev_config contains the raid bdev config related information after + * parsing the config file + */ +struct raid_bdev_config { + /* base bdev config per underlying bdev */ + struct raid_base_bdev_config *base_bdev; + + /* Points to already created raid bdev */ + struct raid_bdev *raid_bdev; + + char *name; + + /* strip size of this raid bdev in kilo bytes */ + uint32_t strip_size; + + /* number of base bdevs */ + uint8_t num_base_bdevs; + + /* raid level */ + enum raid_level level; + + TAILQ_ENTRY(raid_bdev_config) link; +}; + +/* + * raid_config is the top level structure representing the raid bdev config as read + * from config file for all raids + */ +struct raid_config { + /* raid bdev context from config file */ + TAILQ_HEAD(, raid_bdev_config) raid_bdev_config_head; + + /* total raid bdev from config file */ + uint8_t total_raid_bdev; +}; + +/* + * raid_bdev_io_channel is the context of spdk_io_channel for raid bdev device. It + * contains the relationship of raid bdev io channel with base bdev io channels. 
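+ * (Editorial note, hedged: base_channel[i] is presumably populated in
+ * raid_bdev_create_cb() via spdk_bdev_get_io_channel(base_info->desc) on the
+ * calling thread; raid0 then picks base_channel[pd_idx] when routing an I/O.)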
+ */ +struct raid_bdev_io_channel { + /* Array of IO channels of base bdevs */ + struct spdk_io_channel **base_channel; + + /* Number of IO channels */ + uint8_t num_channels; +}; + +/* TAIL heads for various raid bdev lists */ +TAILQ_HEAD(raid_configured_tailq, raid_bdev); +TAILQ_HEAD(raid_configuring_tailq, raid_bdev); +TAILQ_HEAD(raid_all_tailq, raid_bdev); +TAILQ_HEAD(raid_offline_tailq, raid_bdev); + +extern struct raid_configured_tailq g_raid_bdev_configured_list; +extern struct raid_configuring_tailq g_raid_bdev_configuring_list; +extern struct raid_all_tailq g_raid_bdev_list; +extern struct raid_offline_tailq g_raid_bdev_offline_list; +extern struct raid_config g_raid_config; + +typedef void (*raid_bdev_destruct_cb)(void *cb_ctx, int rc); + +int raid_bdev_create(struct raid_bdev_config *raid_cfg); +int raid_bdev_add_base_devices(struct raid_bdev_config *raid_cfg); +void raid_bdev_remove_base_devices(struct raid_bdev_config *raid_cfg, + raid_bdev_destruct_cb cb_fn, void *cb_ctx); +int raid_bdev_config_add(const char *raid_name, uint32_t strip_size, uint8_t num_base_bdevs, + enum raid_level level, struct raid_bdev_config **_raid_cfg); +int raid_bdev_config_add_base_bdev(struct raid_bdev_config *raid_cfg, + const char *base_bdev_name, uint8_t slot); +void raid_bdev_config_cleanup(struct raid_bdev_config *raid_cfg); +struct raid_bdev_config *raid_bdev_config_find_by_name(const char *raid_name); +enum raid_level raid_bdev_parse_raid_level(const char *str); +const char *raid_bdev_level_to_str(enum raid_level level); + +/* + * RAID module descriptor + */ +struct raid_bdev_module { + /* RAID level implemented by this module */ + enum raid_level level; + + /* Minimum required number of base bdevs. Must be > 0. */ + uint8_t base_bdevs_min; + + /* + * Maximum number of base bdevs that can be removed without failing + * the array. + */ + uint8_t base_bdevs_max_degraded; + + /* + * Called when the raid is starting, right before changing the state to + * online and registering the bdev. Parameters of the bdev like blockcnt + * should be set here. + * + * Non-zero return value will abort the startup process. + */ + int (*start)(struct raid_bdev *raid_bdev); + + /* + * Called when the raid is stopping, right before changing the state to + * offline and unregistering the bdev. Optional. + */ + void (*stop)(struct raid_bdev *raid_bdev); + + /* Handler for R/W requests */ + void (*submit_rw_request)(struct raid_bdev_io *raid_io); + + /* Handler for requests without payload (flush, unmap). Optional. 
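+ * (Editorial note: of the two modules in this file set, raid0 provides this
+ * handler while raid5 leaves it NULL, so flush/unmap support depends on the
+ * RAID level chosen.)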
*/ + void (*submit_null_payload_request)(struct raid_bdev_io *raid_io); + + TAILQ_ENTRY(raid_bdev_module) link; +}; + +void raid_bdev_module_list_add(struct raid_bdev_module *raid_module); + +#define __RAID_MODULE_REGISTER(line) __RAID_MODULE_REGISTER_(line) +#define __RAID_MODULE_REGISTER_(line) raid_module_register_##line + +#define RAID_MODULE_REGISTER(_module) \ +__attribute__((constructor)) static void \ +__RAID_MODULE_REGISTER(__LINE__)(void) \ +{ \ + raid_bdev_module_list_add(_module); \ +} + +bool +raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed, + enum spdk_bdev_io_status status); +void +raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, + struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn); +void +raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status); + +#endif /* SPDK_BDEV_RAID_INTERNAL_H */ diff --git a/src/spdk/module/bdev/raid/bdev_raid_rpc.c b/src/spdk/module/bdev/raid/bdev_raid_rpc.c new file mode 100644 index 000000000..1c2d070c3 --- /dev/null +++ b/src/spdk/module/bdev/raid/bdev_raid_rpc.c @@ -0,0 +1,452 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "spdk/rpc.h"
+#include "spdk/bdev.h"
+#include "bdev_raid.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+#include "spdk/env.h"
+
+#define RPC_MAX_BASE_BDEVS 255
+
+SPDK_LOG_REGISTER_COMPONENT("raidrpc", SPDK_LOG_RAID_RPC)
+
+/*
+ * Input structure for bdev_raid_get_bdevs RPC
+ */
+struct rpc_bdev_raid_get_bdevs {
+        /* category - all or online or configuring or offline */
+        char *category;
+};
+
+/*
+ * brief:
+ * free_rpc_bdev_raid_get_bdevs function frees RPC bdev_raid_get_bdevs related parameters
+ * params:
+ * req - pointer to RPC request
+ * returns:
+ * none
+ */
+static void
+free_rpc_bdev_raid_get_bdevs(struct rpc_bdev_raid_get_bdevs *req)
+{
+        free(req->category);
+}
+
+/*
+ * Decoder object for RPC bdev_raid_get_bdevs
+ */
+static const struct spdk_json_object_decoder rpc_bdev_raid_get_bdevs_decoders[] = {
+        {"category", offsetof(struct rpc_bdev_raid_get_bdevs, category), spdk_json_decode_string},
+};
+
+/*
+ * brief:
+ * rpc_bdev_raid_get_bdevs function is the RPC handler for bdev_raid_get_bdevs. It is
+ * used to list all the raid bdev names based on the input category requested. The
+ * category must be one of "all", "online", "configuring" or "offline". "all" means
+ * all the raid bdevs whether they are online, configuring or offline. "online" means
+ * raid bdevs that are registered with the bdev layer. "configuring" means raid bdevs
+ * that do not have their full configuration discovered yet. "offline" means raid
+ * bdevs that are not registered with the bdev layer and have either encountered an
+ * error or been taken offline at the user's request.
+ * params:
+ * request - pointer to json rpc request
+ * params - pointer to request parameters
+ * returns:
+ * none
+ */
+static void
+rpc_bdev_raid_get_bdevs(struct spdk_jsonrpc_request *request,
+                        const struct spdk_json_val *params)
+{
+        struct rpc_bdev_raid_get_bdevs req = {};
+        struct spdk_json_write_ctx *w;
+        struct raid_bdev *raid_bdev;
+
+        if (spdk_json_decode_object(params, rpc_bdev_raid_get_bdevs_decoders,
+                                    SPDK_COUNTOF(rpc_bdev_raid_get_bdevs_decoders),
+                                    &req)) {
+                spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+                                                 "spdk_json_decode_object failed");
+                goto cleanup;
+        }
+
+        if (!(strcmp(req.category, "all") == 0 ||
+              strcmp(req.category, "online") == 0 ||
+              strcmp(req.category, "configuring") == 0 ||
+              strcmp(req.category, "offline") == 0)) {
+                spdk_jsonrpc_send_error_response(request, -EINVAL, spdk_strerror(EINVAL));
+                goto cleanup;
+        }
+
+        w = spdk_jsonrpc_begin_result(request);
+        spdk_json_write_array_begin(w);
+
+        /* Get raid bdev list based on the category requested */
+        if (strcmp(req.category, "all") == 0) {
+                TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
+                        spdk_json_write_string(w, raid_bdev->bdev.name);
+                }
+        } else if (strcmp(req.category, "online") == 0) {
+                TAILQ_FOREACH(raid_bdev, &g_raid_bdev_configured_list, state_link) {
+                        spdk_json_write_string(w, raid_bdev->bdev.name);
+                }
+        } else if (strcmp(req.category, "configuring") == 0) {
+                TAILQ_FOREACH(raid_bdev, &g_raid_bdev_configuring_list, state_link) {
+                        spdk_json_write_string(w, raid_bdev->bdev.name);
+                }
+        } else {
+                TAILQ_FOREACH(raid_bdev, &g_raid_bdev_offline_list, state_link) {
+                        spdk_json_write_string(w, raid_bdev->bdev.name);
+                }
+        }
+        spdk_json_write_array_end(w);
+        spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+        free_rpc_bdev_raid_get_bdevs(&req);
+}
+SPDK_RPC_REGISTER("bdev_raid_get_bdevs", rpc_bdev_raid_get_bdevs, SPDK_RPC_RUNTIME)
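+/*
+ * Editorial example (illustrative, not part of the original commit): given the
+ * decoder above, a matching JSON-RPC request carries a single "category" string,
+ * e.g.
+ *   {"jsonrpc": "2.0", "id": 1, "method": "bdev_raid_get_bdevs",
+ *    "params": {"category": "online"}}
+ * and the reply is a JSON array of raid bdev names.
+ */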
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_raid_get_bdevs, get_raid_bdevs) + +/* + * Base bdevs in RPC bdev_raid_create + */ +struct rpc_bdev_raid_create_base_bdevs { + /* Number of base bdevs */ + size_t num_base_bdevs; + + /* List of base bdevs names */ + char *base_bdevs[RPC_MAX_BASE_BDEVS]; +}; + +/* + * Input structure for RPC rpc_bdev_raid_create + */ +struct rpc_bdev_raid_create { + /* Raid bdev name */ + char *name; + + /* RAID strip size KB, 'strip_size' is deprecated. */ + uint32_t strip_size; + uint32_t strip_size_kb; + + /* RAID raid level */ + enum raid_level level; + + /* Base bdevs information */ + struct rpc_bdev_raid_create_base_bdevs base_bdevs; +}; + +/* + * brief: + * free_rpc_bdev_raid_create function is to free RPC bdev_raid_create related parameters + * params: + * req - pointer to RPC request + * returns: + * none + */ +static void +free_rpc_bdev_raid_create(struct rpc_bdev_raid_create *req) +{ + size_t i; + + free(req->name); + for (i = 0; i < req->base_bdevs.num_base_bdevs; i++) { + free(req->base_bdevs.base_bdevs[i]); + } +} + +/* + * Decoder function for RPC bdev_raid_create to decode raid level + */ +static int +decode_raid_level(const struct spdk_json_val *val, void *out) +{ + int ret; + char *str = NULL; + enum raid_level level; + + ret = spdk_json_decode_string(val, &str); + if (ret == 0 && str != NULL) { + level = raid_bdev_parse_raid_level(str); + if (level == INVALID_RAID_LEVEL) { + ret = -EINVAL; + } else { + *(enum raid_level *)out = level; + } + } + + free(str); + return ret; +} + +/* + * Decoder function for RPC bdev_raid_create to decode base bdevs list + */ +static int +decode_base_bdevs(const struct spdk_json_val *val, void *out) +{ + struct rpc_bdev_raid_create_base_bdevs *base_bdevs = out; + return spdk_json_decode_array(val, spdk_json_decode_string, base_bdevs->base_bdevs, + RPC_MAX_BASE_BDEVS, &base_bdevs->num_base_bdevs, sizeof(char *)); +} + +/* + * Decoder object for RPC bdev_raid_create + */ +/* Note: strip_size is deprecated, one of the two options must be specified but not both. */ +static const struct spdk_json_object_decoder rpc_bdev_raid_create_decoders[] = { + {"name", offsetof(struct rpc_bdev_raid_create, name), spdk_json_decode_string}, + {"strip_size", offsetof(struct rpc_bdev_raid_create, strip_size), spdk_json_decode_uint32, true}, + {"strip_size_kb", offsetof(struct rpc_bdev_raid_create, strip_size_kb), spdk_json_decode_uint32, true}, + {"raid_level", offsetof(struct rpc_bdev_raid_create, level), decode_raid_level}, + {"base_bdevs", offsetof(struct rpc_bdev_raid_create, base_bdevs), decode_base_bdevs}, +}; + +/* + * brief: + * rpc_bdev_raid_create function is the RPC for creating RAID bdevs. It takes + * input as raid bdev name, raid level, strip size in KB and list of base bdev names. 
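+ * (Editorial example, hedged: based on the decoders above, a plausible request
+ * body is
+ *   {"name": "Raid0", "raid_level": "raid0", "strip_size_kb": 64,
+ *    "base_bdevs": ["Nvme0n1", "Nvme1n1"]}
+ * where the accepted raid_level spellings depend on raid_bdev_parse_raid_level()
+ * and the base bdev names are placeholders.)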
+ * params:
+ * request - pointer to json rpc request
+ * params - pointer to request parameters
+ * returns:
+ * none
+ */
+static void
+rpc_bdev_raid_create(struct spdk_jsonrpc_request *request,
+                     const struct spdk_json_val *params)
+{
+        struct rpc_bdev_raid_create req = {};
+        struct spdk_json_write_ctx *w;
+        struct raid_bdev_config *raid_cfg;
+        int rc;
+        size_t i;
+
+        if (spdk_json_decode_object(params, rpc_bdev_raid_create_decoders,
+                                    SPDK_COUNTOF(rpc_bdev_raid_create_decoders),
+                                    &req)) {
+                spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+                                                 "spdk_json_decode_object failed");
+                goto cleanup;
+        }
+
+        if (req.strip_size == 0 && req.strip_size_kb == 0) {
+                spdk_jsonrpc_send_error_response(request, EINVAL, "strip size not specified");
+                goto cleanup;
+        } else if (req.strip_size > 0 && req.strip_size_kb > 0) {
+                spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+                                                 "please use only one strip size option");
+                goto cleanup;
+        } else if (req.strip_size > 0 && req.strip_size_kb == 0) {
+                SPDK_ERRLOG("the rpc param strip_size is deprecated.\n");
+                req.strip_size_kb = req.strip_size;
+        }
+
+        rc = raid_bdev_config_add(req.name, req.strip_size_kb, req.base_bdevs.num_base_bdevs,
+                                  req.level,
+                                  &raid_cfg);
+        if (rc != 0) {
+                spdk_jsonrpc_send_error_response_fmt(request, rc,
+                                                     "Failed to add RAID bdev config %s: %s",
+                                                     req.name, spdk_strerror(-rc));
+                goto cleanup;
+        }
+
+        for (i = 0; i < req.base_bdevs.num_base_bdevs; i++) {
+                rc = raid_bdev_config_add_base_bdev(raid_cfg, req.base_bdevs.base_bdevs[i], i);
+                if (rc != 0) {
+                        raid_bdev_config_cleanup(raid_cfg);
+                        spdk_jsonrpc_send_error_response_fmt(request, rc,
+                                                             "Failed to add base bdev %s to RAID bdev config %s: %s",
+                                                             req.base_bdevs.base_bdevs[i], req.name,
+                                                             spdk_strerror(-rc));
+                        goto cleanup;
+                }
+        }
+
+        rc = raid_bdev_create(raid_cfg);
+        if (rc != 0) {
+                raid_bdev_config_cleanup(raid_cfg);
+                spdk_jsonrpc_send_error_response_fmt(request, rc,
+                                                     "Failed to create RAID bdev %s: %s",
+                                                     req.name, spdk_strerror(-rc));
+                goto cleanup;
+        }
+
+        rc = raid_bdev_add_base_devices(raid_cfg);
+        if (rc != 0) {
+                spdk_jsonrpc_send_error_response_fmt(request, rc,
+                                                     "Failed to add base bdevs to RAID bdev %s: %s",
+                                                     req.name, spdk_strerror(-rc));
+                goto cleanup;
+        }
+
+        w = spdk_jsonrpc_begin_result(request);
+        spdk_json_write_bool(w, true);
+        spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+        free_rpc_bdev_raid_create(&req);
+}
+SPDK_RPC_REGISTER("bdev_raid_create", rpc_bdev_raid_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_raid_create, construct_raid_bdev)
+
+/*
+ * Input structure for RPC deleting a raid bdev
+ */
+struct rpc_bdev_raid_delete {
+        /* raid bdev name */
+        char *name;
+};
+
+/*
+ * brief:
+ * free_rpc_bdev_raid_delete function is used to free RPC bdev_raid_delete related parameters
+ * params:
+ * req - pointer to RPC request
+ * returns:
+ * none
+ */
+static void
+free_rpc_bdev_raid_delete(struct rpc_bdev_raid_delete *req)
+{
+        free(req->name);
+}
+
+/*
+ * Decoder object for RPC bdev_raid_delete
+ */
+static const struct spdk_json_object_decoder rpc_bdev_raid_delete_decoders[] = {
+        {"name", offsetof(struct rpc_bdev_raid_delete, name), spdk_json_decode_string},
+};
+
+struct rpc_bdev_raid_delete_ctx {
+        struct rpc_bdev_raid_delete req;
+        struct raid_bdev_config *raid_cfg;
+        struct spdk_jsonrpc_request *request;
+};
+
+/*
+ * brief:
+ * bdev_raid_delete_done function is the completion callback for the raid bdev
+ * delete operation; it sends the JSON-RPC response and frees the context.
+ * params:
+ * cb_arg - pointer to the callback context.
+ * rc - return code of the deletion of the raid bdev.
+ * returns: + * none + */ +static void +bdev_raid_delete_done(void *cb_arg, int rc) +{ + struct rpc_bdev_raid_delete_ctx *ctx = cb_arg; + struct raid_bdev_config *raid_cfg; + struct spdk_jsonrpc_request *request = ctx->request; + struct spdk_json_write_ctx *w; + + if (rc != 0) { + SPDK_ERRLOG("Failed to delete raid bdev %s (%d): %s\n", + ctx->req.name, rc, spdk_strerror(-rc)); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(-rc)); + goto exit; + } + + raid_cfg = ctx->raid_cfg; + assert(raid_cfg->raid_bdev == NULL); + + raid_bdev_config_cleanup(raid_cfg); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +exit: + free_rpc_bdev_raid_delete(&ctx->req); + free(ctx); +} + +/* + * brief: + * rpc_bdev_raid_delete function is the RPC for deleting a raid bdev. It takes raid + * name as input and delete that raid bdev including freeing the base bdev + * resources. + * params: + * request - pointer to json rpc request + * params - pointer to request parameters + * returns: + * none + */ +static void +rpc_bdev_raid_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_raid_delete_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); + return; + } + + if (spdk_json_decode_object(params, rpc_bdev_raid_delete_decoders, + SPDK_COUNTOF(rpc_bdev_raid_delete_decoders), + &ctx->req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + ctx->raid_cfg = raid_bdev_config_find_by_name(ctx->req.name); + if (ctx->raid_cfg == NULL) { + spdk_jsonrpc_send_error_response_fmt(request, ENODEV, + "raid bdev %s is not found in config", + ctx->req.name); + goto cleanup; + } + + ctx->request = request; + + /* Remove all the base bdevs from this raid bdev before deleting the raid bdev */ + raid_bdev_remove_base_devices(ctx->raid_cfg, bdev_raid_delete_done, ctx); + + return; + +cleanup: + free_rpc_bdev_raid_delete(&ctx->req); + free(ctx); +} +SPDK_RPC_REGISTER("bdev_raid_delete", rpc_bdev_raid_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_raid_delete, destroy_raid_bdev) diff --git a/src/spdk/module/bdev/raid/raid0.c b/src/spdk/module/bdev/raid/raid0.c new file mode 100644 index 000000000..5632c5b7c --- /dev/null +++ b/src/spdk/module/bdev/raid/raid0.c @@ -0,0 +1,398 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bdev_raid.h" + +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +/* + * brief: + * raid0_bdev_io_completion function is called by lower layers to notify raid + * module that particular bdev_io is completed. + * params: + * bdev_io - pointer to bdev io submitted to lower layers, like child io + * success - bdev_io status + * cb_arg - function callback context (parent raid_bdev_io) + * returns: + * none + */ +static void +raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct raid_bdev_io *raid_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (success) { + raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +raid0_submit_rw_request(struct raid_bdev_io *raid_io); + +static void +_raid0_submit_rw_request(void *_raid_io) +{ + struct raid_bdev_io *raid_io = _raid_io; + + raid0_submit_rw_request(raid_io); +} + +/* + * brief: + * raid0_submit_rw_request function is used to submit I/O to the correct + * member disk for raid0 bdevs. 
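+ * (Editorial worked example: with strip_size = 128 blocks (strip_size_shift = 7)
+ * and 4 base bdevs, an 8-block I/O at offset_blocks = 1000 maps to
+ *   start_strip = 1000 >> 7 = 7, pd_idx = 7 % 4 = 3, pd_strip = 7 / 4 = 1,
+ *   offset_in_strip = 1000 & 127 = 104, pd_lba = (1 << 7) + 104 = 232,
+ * i.e. the child I/O goes to base bdev 3 at LBA 232.)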
+ * params:
+ * raid_io - pointer to raid_bdev_io
+ * returns:
+ * none
+ */
+static void
+raid0_submit_rw_request(struct raid_bdev_io *raid_io)
+{
+        struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
+        struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
+        struct raid_bdev *raid_bdev = raid_io->raid_bdev;
+        uint64_t pd_strip;
+        uint32_t offset_in_strip;
+        uint64_t pd_lba;
+        uint64_t pd_blocks;
+        uint8_t pd_idx;
+        int ret = 0;
+        uint64_t start_strip;
+        uint64_t end_strip;
+        struct raid_base_bdev_info *base_info;
+        struct spdk_io_channel *base_ch;
+
+        start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
+        end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
+                    raid_bdev->strip_size_shift;
+        if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
+                assert(false);
+                SPDK_ERRLOG("I/O spans strip boundary!\n");
+                raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+                return;
+        }
+
+        pd_strip = start_strip / raid_bdev->num_base_bdevs;
+        pd_idx = start_strip % raid_bdev->num_base_bdevs;
+        offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
+        pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
+        pd_blocks = bdev_io->u.bdev.num_blocks;
+        base_info = &raid_bdev->base_bdev_info[pd_idx];
+        if (base_info->desc == NULL) {
+                SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
+                assert(0);
+        }
+
+        /*
+         * Submit the child io to the bdev layer using the base bdev descriptor, base
+         * bdev lba, child io length in blocks, buffer, completion function and
+         * callback context
+         */
+        assert(raid_ch != NULL);
+        assert(raid_ch->base_channel);
+        base_ch = raid_ch->base_channel[pd_idx];
+        if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+                ret = spdk_bdev_readv_blocks(base_info->desc, base_ch,
+                                             bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+                                             pd_lba, pd_blocks, raid0_bdev_io_completion,
+                                             raid_io);
+        } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+                ret = spdk_bdev_writev_blocks(base_info->desc, base_ch,
+                                              bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+                                              pd_lba, pd_blocks, raid0_bdev_io_completion,
+                                              raid_io);
+        } else {
+                SPDK_ERRLOG("Received unsupported io type %u\n", bdev_io->type);
+                assert(0);
+        }
+
+        if (ret == -ENOMEM) {
+                raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
+                                        _raid0_submit_rw_request);
+        } else if (ret != 0) {
+                SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
+                assert(false);
+                raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+        }
+}
+
+/* raid0 IO range */
+struct raid_bdev_io_range {
+        uint64_t strip_size;
+        uint64_t start_strip_in_disk;
+        uint64_t end_strip_in_disk;
+        uint64_t start_offset_in_strip;
+        uint64_t end_offset_in_strip;
+        uint8_t start_disk;
+        uint8_t end_disk;
+        uint8_t n_disks_involved;
+};
+
+static inline void
+_raid0_get_io_range(struct raid_bdev_io_range *io_range,
+                    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
+                    uint64_t offset_blocks, uint64_t num_blocks)
+{
+        uint64_t start_strip;
+        uint64_t end_strip;
+
+        io_range->strip_size = strip_size;
+
+        /* The start and end strip index in raid0 bdev scope */
+        start_strip = offset_blocks >> strip_size_shift;
+        end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
+        io_range->start_strip_in_disk = start_strip / num_base_bdevs;
+        io_range->end_strip_in_disk = end_strip / num_base_bdevs;
+
+        /* The first strip may have an unaligned start LBA offset.
+         * The end strip may have an unaligned end LBA offset.
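+         * (Editorial worked example: 4 disks, strip_size 128, unmap at offset 100
+         * for 300 blocks gives start_strip 0, end_strip 3, start_disk 0, end_disk 3,
+         * start_offset_in_strip 100, end_offset_in_strip 15 and n_disks_involved 4;
+         * _raid0_split_io_range() below then yields per-disk ranges of 28, 128, 128
+         * and 16 blocks respectively, which sum back to 300.)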
+ * Strips between them certainly have aligned offset and length to boundaries. + */ + io_range->start_offset_in_strip = offset_blocks % strip_size; + io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size; + + /* The base bdev indexes in which start and end strips are located */ + io_range->start_disk = start_strip % num_base_bdevs; + io_range->end_disk = end_strip % num_base_bdevs; + + /* Calculate how many base_bdevs are involved in io operation. + * Number of base bdevs involved is between 1 and num_base_bdevs. + * It will be 1 if the first strip and last strip are the same one. + */ + io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs); +} + +static inline void +_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx, + uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk) +{ + uint64_t n_strips_in_disk; + uint64_t start_offset_in_disk; + uint64_t end_offset_in_disk; + uint64_t offset_in_disk; + uint64_t nblocks_in_disk; + uint64_t start_strip_in_disk; + uint64_t end_strip_in_disk; + + start_strip_in_disk = io_range->start_strip_in_disk; + if (disk_idx < io_range->start_disk) { + start_strip_in_disk += 1; + } + + end_strip_in_disk = io_range->end_strip_in_disk; + if (disk_idx > io_range->end_disk) { + end_strip_in_disk -= 1; + } + + assert(end_strip_in_disk >= start_strip_in_disk); + n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1; + + if (disk_idx == io_range->start_disk) { + start_offset_in_disk = io_range->start_offset_in_strip; + } else { + start_offset_in_disk = 0; + } + + if (disk_idx == io_range->end_disk) { + end_offset_in_disk = io_range->end_offset_in_strip; + } else { + end_offset_in_disk = io_range->strip_size - 1; + } + + offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size; + nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size + + end_offset_in_disk - start_offset_in_disk + 1; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0, + "raid_bdev (strip_size 0x%lx) splits IO to base_bdev (%u) at (0x%lx, 0x%lx).\n", + io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk); + + *_offset_in_disk = offset_in_disk; + *_nblocks_in_disk = nblocks_in_disk; +} + +static void +raid0_submit_null_payload_request(struct raid_bdev_io *raid_io); + +static void +_raid0_submit_null_payload_request(void *_raid_io) +{ + struct raid_bdev_io *raid_io = _raid_io; + + raid0_submit_null_payload_request(raid_io); +} + +static void +raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct raid_bdev_io *raid_io = cb_arg; + + raid_bdev_io_complete_part(raid_io, 1, success ? + SPDK_BDEV_IO_STATUS_SUCCESS : + SPDK_BDEV_IO_STATUS_FAILED); + + spdk_bdev_free_io(bdev_io); +} + +/* + * brief: + * raid0_submit_null_payload_request function submits the next batch of + * io requests with range but without payload, like FLUSH and UNMAP, to member disks; + * it will submit as many as possible unless one base io request fails with -ENOMEM, + * in which case it will queue itself for later submission. 
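+ * (Editorial note: resubmission after -ENOMEM is safe because
+ * base_bdev_io_remaining is only initialized when it is zero and
+ * base_bdev_io_submitted persists across the requeue, so children already
+ * sent to member disks are not submitted twice.)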
+ * params:
+ * raid_io - pointer to raid_bdev_io (the parent I/O on the raid bdev)
+ * returns:
+ * none
+ */
+static void
+raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
+{
+        struct spdk_bdev_io *bdev_io;
+        struct raid_bdev *raid_bdev;
+        struct raid_bdev_io_range io_range;
+        int ret;
+        struct raid_base_bdev_info *base_info;
+        struct spdk_io_channel *base_ch;
+
+        bdev_io = spdk_bdev_io_from_ctx(raid_io);
+        raid_bdev = raid_io->raid_bdev;
+
+        _raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
+                            raid_bdev->strip_size, raid_bdev->strip_size_shift,
+                            bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
+
+        if (raid_io->base_bdev_io_remaining == 0) {
+                raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
+        }
+
+        while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
+                uint8_t disk_idx;
+                uint64_t offset_in_disk;
+                uint64_t nblocks_in_disk;
+
+                /* Base bdevs are iterated from start_disk to end_disk.
+                 * It is possible that the index of start_disk is larger than end_disk's.
+                 */
+                disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
+                base_info = &raid_bdev->base_bdev_info[disk_idx];
+                base_ch = raid_io->raid_ch->base_channel[disk_idx];
+
+                _raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);
+
+                switch (bdev_io->type) {
+                case SPDK_BDEV_IO_TYPE_UNMAP:
+                        ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
+                                                     offset_in_disk, nblocks_in_disk,
+                                                     raid0_base_io_complete, raid_io);
+                        break;
+
+                case SPDK_BDEV_IO_TYPE_FLUSH:
+                        ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
+                                                     offset_in_disk, nblocks_in_disk,
+                                                     raid0_base_io_complete, raid_io);
+                        break;
+
+                default:
+                        SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
+                        assert(false);
+                        ret = -EIO;
+                }
+
+                if (ret == 0) {
+                        raid_io->base_bdev_io_submitted++;
+                } else if (ret == -ENOMEM) {
+                        raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
+                                                _raid0_submit_null_payload_request);
+                        return;
+                } else {
+                        SPDK_ERRLOG("bdev io submit error not due to ENOMEM; it should not happen\n");
+                        assert(false);
+                        raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+                        return;
+                }
+        }
+}
+
+static int raid0_start(struct raid_bdev *raid_bdev)
+{
+        uint64_t min_blockcnt = UINT64_MAX;
+        struct raid_base_bdev_info *base_info;
+
+        RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+                /* Calculate the minimum block count from all base bdevs */
+                min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
+        }
+
+        /*
+         * Take the minimum block count based approach where the total block count
+         * of the raid bdev is the number of base bdevs times the minimum block
+         * count of any base bdev.
+         */
+        SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0, "min blockcount %lu, numbasedev %u, strip size shift %u\n",
+                      min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
+        raid_bdev->bdev.blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
+                                    raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs;
+
+        if (raid_bdev->num_base_bdevs > 1) {
+                raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
+                raid_bdev->bdev.split_on_optimal_io_boundary = true;
+        } else {
+                /* There is no need to split reads/writes on a single-bdev RAID.
*/ + raid_bdev->bdev.optimal_io_boundary = 0; + raid_bdev->bdev.split_on_optimal_io_boundary = false; + } + + return 0; +} + +static struct raid_bdev_module g_raid0_module = { + .level = RAID0, + .base_bdevs_min = 1, + .start = raid0_start, + .submit_rw_request = raid0_submit_rw_request, + .submit_null_payload_request = raid0_submit_null_payload_request, +}; +RAID_MODULE_REGISTER(&g_raid0_module) + +SPDK_LOG_REGISTER_COMPONENT("bdev_raid0", SPDK_LOG_BDEV_RAID0) diff --git a/src/spdk/module/bdev/raid/raid5.c b/src/spdk/module/bdev/raid/raid5.c new file mode 100644 index 000000000..1e287c863 --- /dev/null +++ b/src/spdk/module/bdev/raid/raid5.c @@ -0,0 +1,114 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "bdev_raid.h" + +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +struct raid5_info { + /* The parent raid bdev */ + struct raid_bdev *raid_bdev; + + /* Number of data blocks in a stripe (without parity) */ + uint64_t stripe_blocks; + + /* Number of stripes on this array */ + uint64_t total_stripes; +}; + +static inline uint8_t +raid5_stripe_data_chunks_num(const struct raid_bdev *raid_bdev) +{ + return raid_bdev->num_base_bdevs - raid_bdev->module->base_bdevs_max_degraded; +} + +static void +raid5_submit_rw_request(struct raid_bdev_io *raid_io) +{ + raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +} + +static int +raid5_start(struct raid_bdev *raid_bdev) +{ + uint64_t min_blockcnt = UINT64_MAX; + struct raid_base_bdev_info *base_info; + struct raid5_info *r5info; + + r5info = calloc(1, sizeof(*r5info)); + if (!r5info) { + SPDK_ERRLOG("Failed to allocate r5info\n"); + return -ENOMEM; + } + r5info->raid_bdev = raid_bdev; + + RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { + min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt); + } + + r5info->total_stripes = min_blockcnt / raid_bdev->strip_size; + r5info->stripe_blocks = raid_bdev->strip_size * raid5_stripe_data_chunks_num(raid_bdev); + + raid_bdev->bdev.blockcnt = r5info->stripe_blocks * r5info->total_stripes; + raid_bdev->bdev.optimal_io_boundary = r5info->stripe_blocks; + raid_bdev->bdev.split_on_optimal_io_boundary = true; + + raid_bdev->module_private = r5info; + + return 0; +} + +static void +raid5_stop(struct raid_bdev *raid_bdev) +{ + struct raid5_info *r5info = raid_bdev->module_private; + + free(r5info); +} + +static struct raid_bdev_module g_raid5_module = { + .level = RAID5, + .base_bdevs_min = 3, + .base_bdevs_max_degraded = 1, + .start = raid5_start, + .stop = raid5_stop, + .submit_rw_request = raid5_submit_rw_request, +}; +RAID_MODULE_REGISTER(&g_raid5_module) + +SPDK_LOG_REGISTER_COMPONENT("bdev_raid5", SPDK_LOG_BDEV_RAID5) diff --git a/src/spdk/module/bdev/rbd/Makefile b/src/spdk/module/bdev/rbd/Makefile new file mode 100644 index 000000000..055e14dac --- /dev/null +++ b/src/spdk/module/bdev/rbd/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 3 +SO_MINOR := 0 + +C_SRCS = bdev_rbd.c bdev_rbd_rpc.c +LIBNAME = bdev_rbd + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/rbd/bdev_rbd.c b/src/spdk/module/bdev/rbd/bdev_rbd.c new file mode 100644 index 000000000..f3b2547c4 --- /dev/null +++ b/src/spdk/module/bdev/rbd/bdev_rbd.c @@ -0,0 +1,898 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "bdev_rbd.h" + +#include <rbd/librbd.h> +#include <rados/librados.h> +#include <sys/eventfd.h> + +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#define SPDK_RBD_QUEUE_DEPTH 128 + +static int bdev_rbd_count = 0; + +#define BDEV_RBD_POLL_US 50 + +struct bdev_rbd { + struct spdk_bdev disk; + char *rbd_name; + char *user_id; + char *pool_name; + char **config; + rbd_image_info_t info; + TAILQ_ENTRY(bdev_rbd) tailq; + struct spdk_poller *reset_timer; + struct spdk_bdev_io *reset_bdev_io; +}; + +struct bdev_rbd_io_channel { + rados_ioctx_t io_ctx; + rados_t cluster; + struct pollfd pfd; + rbd_image_t image; + struct bdev_rbd *disk; + struct spdk_poller *poller; +}; + +struct bdev_rbd_io { + uint64_t remaining_len; + int num_segments; + bool failed; +}; + +static void +bdev_rbd_free(struct bdev_rbd *rbd) +{ + if (!rbd) { + return; + } + + free(rbd->disk.name); + free(rbd->rbd_name); + free(rbd->user_id); + free(rbd->pool_name); + bdev_rbd_free_config(rbd->config); + free(rbd); +} + +void +bdev_rbd_free_config(char **config) +{ + char **entry; + + if (config) { + for (entry = config; *entry; entry++) { + free(*entry); + } + free(config); + } +} + +char ** +bdev_rbd_dup_config(const char *const *config) +{ + size_t count; + char **copy; + + if (!config) { + return NULL; + } + for (count = 0; config[count]; count++) {} + copy = calloc(count + 1, sizeof(*copy)); + if (!copy) { + return NULL; + } + for (count = 0; config[count]; count++) { + if (!(copy[count] = strdup(config[count]))) { + bdev_rbd_free_config(copy); + return NULL; + } + } + return copy; +} + +static int +bdev_rados_context_init(const char *user_id, const char *rbd_pool_name, const char *const *config, + rados_t *cluster, rados_ioctx_t *io_ctx) +{ + int ret; + + ret = rados_create(cluster, user_id); + if (ret < 0) { + SPDK_ERRLOG("Failed to create rados_t struct\n"); + return -1; + } + + if (config) { + const char *const *entry = config; + while (*entry) { + ret = rados_conf_set(*cluster, entry[0], entry[1]); + if (ret < 0) { + SPDK_ERRLOG("Failed to set %s = %s\n", entry[0], entry[1]); + rados_shutdown(*cluster); + return -1; + } + entry += 2; + } + } else { + ret = rados_conf_read_file(*cluster, NULL); + if (ret < 0) { + SPDK_ERRLOG("Failed to read conf file\n"); + rados_shutdown(*cluster); + return -1; + } + } + + ret = rados_connect(*cluster); + if (ret < 0) { + SPDK_ERRLOG("Failed to connect to rbd_pool\n"); + rados_shutdown(*cluster); + return -1; + } + + ret = rados_ioctx_create(*cluster, rbd_pool_name, io_ctx); + + if (ret < 0) { + SPDK_ERRLOG("Failed to create ioctx\n"); + rados_shutdown(*cluster); + return -1; + } + + return 0; +} + +static int +bdev_rbd_init(const char *user_id, const char *rbd_pool_name, const char *const *config, + const char *rbd_name, rbd_image_info_t *info) +{ + int ret; + rados_t cluster = NULL; + rados_ioctx_t io_ctx = NULL; + rbd_image_t image = NULL; + + ret = bdev_rados_context_init(user_id, rbd_pool_name, config, &cluster, &io_ctx); + if (ret < 0) { + SPDK_ERRLOG("Failed to create rados context for user_id=%s and rbd_pool=%s\n", + user_id ? 
user_id : "admin (the default)", rbd_pool_name); + return -1; + } + + ret = rbd_open(io_ctx, rbd_name, &image, NULL); + if (ret < 0) { + SPDK_ERRLOG("Failed to open specified rbd device\n"); + goto err; + } + ret = rbd_stat(image, info, sizeof(*info)); + rbd_close(image); + if (ret < 0) { + SPDK_ERRLOG("Failed to stat specified rbd device\n"); + goto err; + } + + rados_ioctx_destroy(io_ctx); + return 0; +err: + rados_ioctx_destroy(io_ctx); + rados_shutdown(cluster); + return -1; +} + +static void +bdev_rbd_exit(rbd_image_t image) +{ + rbd_flush(image); + rbd_close(image); +} + +static void +bdev_rbd_finish_aiocb(rbd_completion_t cb, void *arg) +{ + /* Doing nothing here */ +} + +static int +bdev_rbd_start_aio(rbd_image_t image, struct spdk_bdev_io *bdev_io, + void *buf, uint64_t offset, size_t len) +{ + int ret; + rbd_completion_t comp; + + ret = rbd_aio_create_completion(bdev_io, bdev_rbd_finish_aiocb, + &comp); + if (ret < 0) { + return -1; + } + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + ret = rbd_aio_read(image, offset, len, + buf, comp); + } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + ret = rbd_aio_write(image, offset, len, + buf, comp); + } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_FLUSH) { + ret = rbd_aio_flush(image, comp); + } + + if (ret < 0) { + rbd_aio_release(comp); + return -1; + } + + return 0; +} + +static int bdev_rbd_library_init(void); + +static int +bdev_rbd_get_ctx_size(void) +{ + return sizeof(struct bdev_rbd_io); +} + +static struct spdk_bdev_module rbd_if = { + .name = "rbd", + .module_init = bdev_rbd_library_init, + .get_ctx_size = bdev_rbd_get_ctx_size, + +}; +SPDK_BDEV_MODULE_REGISTER(rbd, &rbd_if) + +static int64_t +bdev_rbd_rw(struct bdev_rbd *disk, struct spdk_io_channel *ch, + struct spdk_bdev_io *bdev_io, struct iovec *iov, + int iovcnt, size_t len, uint64_t offset) +{ + struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; + struct bdev_rbd_io_channel *rbdio_ch = spdk_io_channel_get_ctx(ch); + size_t remaining = len; + int i, rc; + + rbd_io->remaining_len = 0; + rbd_io->num_segments = 0; + rbd_io->failed = false; + + for (i = 0; i < iovcnt && remaining > 0; i++) { + size_t seg_len = spdk_min(remaining, iov[i].iov_len); + + rc = bdev_rbd_start_aio(rbdio_ch->image, bdev_io, iov[i].iov_base, offset, seg_len); + if (rc) { + /* + * This bdev_rbd_start_aio() call failed, but if any previous ones were + * submitted, we need to wait for them to finish. + */ + if (rbd_io->num_segments == 0) { + /* No previous I/O submitted - return error code immediately. */ + return rc; + } + + /* Return and wait for outstanding I/O to complete. */ + rbd_io->failed = true; + return 0; + } + + rbd_io->num_segments++; + rbd_io->remaining_len += seg_len; + + offset += seg_len; + remaining -= seg_len; + } + + return 0; +} + +static int64_t +bdev_rbd_flush(struct bdev_rbd *disk, struct spdk_io_channel *ch, + struct spdk_bdev_io *bdev_io, uint64_t offset, uint64_t nbytes) +{ + struct bdev_rbd_io_channel *rbdio_ch = spdk_io_channel_get_ctx(ch); + struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; + + rbd_io->num_segments++; + return bdev_rbd_start_aio(rbdio_ch->image, bdev_io, NULL, offset, nbytes); +} + +static int +bdev_rbd_reset_timer(void *arg) +{ + struct bdev_rbd *disk = arg; + + /* + * TODO: This should check if any I/O is still in flight before completing the reset. + * For now, just complete after the timer expires. 
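+ * (Editorial sketch, hedged: a more robust approach would keep an
+ * outstanding-aio counter per bdev_rbd_io_channel, incremented in
+ * bdev_rbd_start_aio() and decremented in the completion poller, and only
+ * complete the reset once that counter reaches zero; the counter name and
+ * placement here are hypothetical, not part of the original source.)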
+ */ + spdk_bdev_io_complete(disk->reset_bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + spdk_poller_unregister(&disk->reset_timer); + disk->reset_bdev_io = NULL; + + return SPDK_POLLER_BUSY; +} + +static int +bdev_rbd_reset(struct bdev_rbd *disk, struct spdk_bdev_io *bdev_io) +{ + /* + * HACK: Since librbd doesn't provide any way to cancel outstanding aio, just kick off a + * timer to wait for in-flight I/O to complete. + */ + assert(disk->reset_bdev_io == NULL); + disk->reset_bdev_io = bdev_io; + disk->reset_timer = SPDK_POLLER_REGISTER(bdev_rbd_reset_timer, disk, 1 * 1000 * 1000); + + return 0; +} + +static int +bdev_rbd_destruct(void *ctx) +{ + struct bdev_rbd *rbd = ctx; + + spdk_io_device_unregister(rbd, NULL); + + bdev_rbd_free(rbd); + return 0; +} + +static void +bdev_rbd_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + int ret; + + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + ret = bdev_rbd_rw(bdev_io->bdev->ctxt, + ch, + bdev_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + + if (ret != 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static int _bdev_rbd_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_rbd_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + + case SPDK_BDEV_IO_TYPE_WRITE: + return bdev_rbd_rw((struct bdev_rbd *)bdev_io->bdev->ctxt, + ch, + bdev_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + + case SPDK_BDEV_IO_TYPE_FLUSH: + return bdev_rbd_flush((struct bdev_rbd *)bdev_io->bdev->ctxt, + ch, + bdev_io, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + + case SPDK_BDEV_IO_TYPE_RESET: + return bdev_rbd_reset((struct bdev_rbd *)bdev_io->bdev->ctxt, + bdev_io); + + default: + return -1; + } + return 0; +} + +static void bdev_rbd_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_rbd_submit_request(ch, bdev_io) < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_rbd_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + + default: + return false; + } +} + +static int +bdev_rbd_io_poll(void *arg) +{ + struct bdev_rbd_io_channel *ch = arg; + int i, io_status, rc; + rbd_completion_t comps[SPDK_RBD_QUEUE_DEPTH]; + struct spdk_bdev_io *bdev_io; + struct bdev_rbd_io *rbd_io; + + rc = poll(&ch->pfd, 1, 0); + + /* check the return value of poll since we have only one fd for each channel */ + if (rc != 1) { + return SPDK_POLLER_BUSY; + } + + rc = rbd_poll_io_events(ch->image, comps, SPDK_RBD_QUEUE_DEPTH); + for (i = 0; i < rc; i++) { + bdev_io = rbd_aio_get_arg(comps[i]); + rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; + io_status = rbd_aio_get_return_value(comps[i]); + + assert(rbd_io->num_segments > 0); + rbd_io->num_segments--; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + if (io_status > 0) { + /* For reads, io_status is the length */ + 
rbd_io->remaining_len -= io_status; + } + + if (rbd_io->num_segments == 0 && rbd_io->remaining_len != 0) { + rbd_io->failed = true; + } + } else { + /* For others, 0 means success */ + if (io_status != 0) { + rbd_io->failed = true; + } + } + + rbd_aio_release(comps[i]); + + if (rbd_io->num_segments == 0) { + spdk_bdev_io_complete(bdev_io, + rbd_io->failed ? SPDK_BDEV_IO_STATUS_FAILED : SPDK_BDEV_IO_STATUS_SUCCESS); + } + } + + return rc > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +} + +static void +bdev_rbd_free_channel(struct bdev_rbd_io_channel *ch) +{ + if (!ch) { + return; + } + + if (ch->image) { + bdev_rbd_exit(ch->image); + } + + if (ch->io_ctx) { + rados_ioctx_destroy(ch->io_ctx); + } + + if (ch->cluster) { + rados_shutdown(ch->cluster); + } + + if (ch->pfd.fd >= 0) { + close(ch->pfd.fd); + } +} + +static void * +bdev_rbd_handle(void *arg) +{ + struct bdev_rbd_io_channel *ch = arg; + void *ret = arg; + + if (rbd_open(ch->io_ctx, ch->disk->rbd_name, &ch->image, NULL) < 0) { + SPDK_ERRLOG("Failed to open specified rbd device\n"); + ret = NULL; + } + + return ret; +} + +static int +bdev_rbd_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_rbd_io_channel *ch = ctx_buf; + int ret; + + ch->disk = io_device; + ch->image = NULL; + ch->io_ctx = NULL; + ch->pfd.fd = -1; + + ret = bdev_rados_context_init(ch->disk->user_id, ch->disk->pool_name, + (const char *const *)ch->disk->config, + &ch->cluster, &ch->io_ctx); + if (ret < 0) { + SPDK_ERRLOG("Failed to create rados context for user_id %s and rbd_pool=%s\n", + ch->disk->user_id ? ch->disk->user_id : "admin (the default)", ch->disk->pool_name); + goto err; + } + + if (spdk_call_unaffinitized(bdev_rbd_handle, ch) == NULL) { + goto err; + } + + ch->pfd.fd = eventfd(0, EFD_NONBLOCK); + if (ch->pfd.fd < 0) { + SPDK_ERRLOG("Failed to get eventfd\n"); + goto err; + } + + ch->pfd.events = POLLIN; + ret = rbd_set_image_notification(ch->image, ch->pfd.fd, EVENT_TYPE_EVENTFD); + if (ret < 0) { + SPDK_ERRLOG("Failed to set rbd image notification\n"); + goto err; + } + + ch->poller = SPDK_POLLER_REGISTER(bdev_rbd_io_poll, ch, BDEV_RBD_POLL_US); + + return 0; + +err: + bdev_rbd_free_channel(ch); + return -1; +} + +static void +bdev_rbd_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_rbd_io_channel *io_channel = ctx_buf; + + bdev_rbd_free_channel(io_channel); + + spdk_poller_unregister(&io_channel->poller); +} + +static struct spdk_io_channel * +bdev_rbd_get_io_channel(void *ctx) +{ + struct bdev_rbd *rbd_bdev = ctx; + + return spdk_get_io_channel(rbd_bdev); +} + +static int +bdev_rbd_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct bdev_rbd *rbd_bdev = ctx; + + spdk_json_write_named_object_begin(w, "rbd"); + + spdk_json_write_named_string(w, "pool_name", rbd_bdev->pool_name); + + spdk_json_write_named_string(w, "rbd_name", rbd_bdev->rbd_name); + + if (rbd_bdev->user_id) { + spdk_json_write_named_string(w, "user_id", rbd_bdev->user_id); + } + + if (rbd_bdev->config) { + char **entry = rbd_bdev->config; + + spdk_json_write_named_object_begin(w, "config"); + while (*entry) { + spdk_json_write_named_string(w, entry[0], entry[1]); + entry += 2; + } + spdk_json_write_object_end(w); + } + + spdk_json_write_object_end(w); + + return 0; +} + +static void +bdev_rbd_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct bdev_rbd *rbd = bdev->ctxt; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_rbd_create"); + + spdk_json_write_named_object_begin(w, 
"params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_string(w, "pool_name", rbd->pool_name); + spdk_json_write_named_string(w, "rbd_name", rbd->rbd_name); + spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); + if (rbd->user_id) { + spdk_json_write_named_string(w, "user_id", rbd->user_id); + } + + if (rbd->config) { + char **entry = rbd->config; + + spdk_json_write_named_object_begin(w, "config"); + while (*entry) { + spdk_json_write_named_string(w, entry[0], entry[1]); + entry += 2; + } + spdk_json_write_object_end(w); + } + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table rbd_fn_table = { + .destruct = bdev_rbd_destruct, + .submit_request = bdev_rbd_submit_request, + .io_type_supported = bdev_rbd_io_type_supported, + .get_io_channel = bdev_rbd_get_io_channel, + .dump_info_json = bdev_rbd_dump_info_json, + .write_config_json = bdev_rbd_write_config_json, +}; + +int +bdev_rbd_create(struct spdk_bdev **bdev, const char *name, const char *user_id, + const char *pool_name, + const char *const *config, + const char *rbd_name, + uint32_t block_size) +{ + struct bdev_rbd *rbd; + int ret; + + if ((pool_name == NULL) || (rbd_name == NULL)) { + return -EINVAL; + } + + rbd = calloc(1, sizeof(struct bdev_rbd)); + if (rbd == NULL) { + SPDK_ERRLOG("Failed to allocate bdev_rbd struct\n"); + return -ENOMEM; + } + + rbd->rbd_name = strdup(rbd_name); + if (!rbd->rbd_name) { + bdev_rbd_free(rbd); + return -ENOMEM; + } + + if (user_id) { + rbd->user_id = strdup(user_id); + if (!rbd->user_id) { + bdev_rbd_free(rbd); + return -ENOMEM; + } + } + + rbd->pool_name = strdup(pool_name); + if (!rbd->pool_name) { + bdev_rbd_free(rbd); + return -ENOMEM; + } + + if (config && !(rbd->config = bdev_rbd_dup_config(config))) { + bdev_rbd_free(rbd); + return -ENOMEM; + } + + ret = bdev_rbd_init(rbd->user_id, rbd->pool_name, + (const char *const *)rbd->config, + rbd_name, &rbd->info); + if (ret < 0) { + bdev_rbd_free(rbd); + SPDK_ERRLOG("Failed to init rbd device\n"); + return ret; + } + + if (name) { + rbd->disk.name = strdup(name); + } else { + rbd->disk.name = spdk_sprintf_alloc("Ceph%d", bdev_rbd_count); + } + if (!rbd->disk.name) { + bdev_rbd_free(rbd); + return -ENOMEM; + } + rbd->disk.product_name = "Ceph Rbd Disk"; + bdev_rbd_count++; + + rbd->disk.write_cache = 0; + rbd->disk.blocklen = block_size; + rbd->disk.blockcnt = rbd->info.size / rbd->disk.blocklen; + rbd->disk.ctxt = rbd; + rbd->disk.fn_table = &rbd_fn_table; + rbd->disk.module = &rbd_if; + + SPDK_NOTICELOG("Add %s rbd disk to lun\n", rbd->disk.name); + + spdk_io_device_register(rbd, bdev_rbd_create_cb, + bdev_rbd_destroy_cb, + sizeof(struct bdev_rbd_io_channel), + rbd_name); + ret = spdk_bdev_register(&rbd->disk); + if (ret) { + spdk_io_device_unregister(rbd, NULL); + bdev_rbd_free(rbd); + return ret; + } + + *bdev = &(rbd->disk); + + return ret; +} + +void +bdev_rbd_delete(struct spdk_bdev *bdev, spdk_delete_rbd_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &rbd_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +int +bdev_rbd_resize(struct spdk_bdev *bdev, const uint64_t new_size_in_mb) +{ + struct spdk_io_channel *ch; + struct bdev_rbd_io_channel *rbd_io_ch; + int rc; + uint64_t new_size_in_byte; + uint64_t current_size_in_mb; + + if (bdev->module != &rbd_if) { + return -EINVAL; + } + + current_size_in_mb = bdev->blocklen * bdev->blockcnt / (1024 * 1024); + if (current_size_in_mb 
> new_size_in_mb) { + SPDK_ERRLOG("The new bdev size must be larger than the current bdev size.\n"); + return -EINVAL; + } + + ch = bdev_rbd_get_io_channel(bdev); + rbd_io_ch = spdk_io_channel_get_ctx(ch); + new_size_in_byte = new_size_in_mb * 1024 * 1024; + + rc = rbd_resize(rbd_io_ch->image, new_size_in_byte); + if (rc != 0) { + SPDK_ERRLOG("failed to resize the ceph bdev.\n"); + return rc; + } + + rc = spdk_bdev_notify_blockcnt_change(bdev, new_size_in_byte / bdev->blocklen); + if (rc != 0) { + SPDK_ERRLOG("failed to notify block cnt change.\n"); + return rc; + } + + return rc; +} + +static int +bdev_rbd_library_init(void) +{ + int i, rc = 0; + const char *val; + const char *pool_name; + const char *rbd_name; + struct spdk_bdev *bdev; + uint32_t block_size; + long int tmp; + + struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Ceph"); + + if (sp == NULL) { + /* + * Ceph section not found. Do not initialize any rbd LUNs. + */ + goto end; + } + + /* Init rbd block devices */ + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "Ceph", i); + if (val == NULL) { + break; + } + + /* get the Rbd_pool name */ + pool_name = spdk_conf_section_get_nmval(sp, "Ceph", i, 0); + if (pool_name == NULL) { + SPDK_ERRLOG("Ceph%d: rbd pool name needs to be provided\n", i); + rc = -1; + goto end; + } + + rbd_name = spdk_conf_section_get_nmval(sp, "Ceph", i, 1); + if (rbd_name == NULL) { + SPDK_ERRLOG("Ceph%d: format error\n", i); + rc = -1; + goto end; + } + + val = spdk_conf_section_get_nmval(sp, "Ceph", i, 2); + + if (val == NULL) { + block_size = 512; /* default value */ + } else { + tmp = spdk_strtol(val, 10); + if (tmp <= 0) { + SPDK_ERRLOG("Invalid block size\n"); + rc = -1; + goto end; + } else if (tmp & 0x1ff) { + SPDK_ERRLOG("current block_size = %ld, it should be a multiple of 512\n", + tmp); + rc = -1; + goto end; + } + block_size = (uint32_t)tmp; + } + + /* TODO(?): user_id and rbd config values */ + rc = bdev_rbd_create(&bdev, NULL, NULL, pool_name, NULL, rbd_name, block_size); + if (rc) { + goto end; + } + } + +end: + return rc; +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_rbd", SPDK_LOG_BDEV_RBD) diff --git a/src/spdk/module/bdev/rbd/bdev_rbd.h b/src/spdk/module/bdev/rbd/bdev_rbd.h new file mode 100644 index 000000000..1d16a02db --- /dev/null +++ b/src/spdk/module/bdev/rbd/bdev_rbd.h @@ -0,0 +1,68 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_RBD_H +#define SPDK_BDEV_RBD_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" + +void bdev_rbd_free_config(char **config); +char **bdev_rbd_dup_config(const char *const *config); + +typedef void (*spdk_delete_rbd_complete)(void *cb_arg, int bdeverrno); + +int bdev_rbd_create(struct spdk_bdev **bdev, const char *name, const char *user_id, + const char *pool_name, + const char *const *config, + const char *rbd_name, uint32_t block_size); +/** + * Delete rbd bdev. + * + * \param bdev Pointer to rbd bdev. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn. + */ +void bdev_rbd_delete(struct spdk_bdev *bdev, spdk_delete_rbd_complete cb_fn, + void *cb_arg); + +/** + * Resize rbd bdev. + * + * \param bdev Pointer to rbd bdev. + * \param new_size_in_mb The new size in MiB for this bdev. + */ +int bdev_rbd_resize(struct spdk_bdev *bdev, const uint64_t new_size_in_mb); + +#endif /* SPDK_BDEV_RBD_H */ diff --git a/src/spdk/module/bdev/rbd/bdev_rbd_rpc.c b/src/spdk/module/bdev/rbd/bdev_rbd_rpc.c new file mode 100644 index 000000000..c60c83a58 --- /dev/null +++ b/src/spdk/module/bdev/rbd/bdev_rbd_rpc.c @@ -0,0 +1,252 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "bdev_rbd.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +struct rpc_create_rbd { + char *name; + char *user_id; + char *pool_name; + char *rbd_name; + uint32_t block_size; + char **config; +}; + +static void +free_rpc_create_rbd(struct rpc_create_rbd *req) +{ + free(req->name); + free(req->user_id); + free(req->pool_name); + free(req->rbd_name); + bdev_rbd_free_config(req->config); +} + +static int +bdev_rbd_decode_config(const struct spdk_json_val *values, void *out) +{ + char ***map = out; + char **entry; + uint32_t i; + + if (values->type == SPDK_JSON_VAL_NULL) { + /* treated like empty object: empty config */ + *map = calloc(1, sizeof(**map)); + if (!*map) { + return -1; + } + return 0; + } + + if (values->type != SPDK_JSON_VAL_OBJECT_BEGIN) { + return -1; + } + + *map = calloc(values->len + 1, sizeof(**map)); + if (!*map) { + return -1; + } + + for (i = 0, entry = *map; i < values->len;) { + const struct spdk_json_val *name = &values[i + 1]; + const struct spdk_json_val *v = &values[i + 2]; + /* Here we catch errors like invalid types. */ + if (!(entry[0] = spdk_json_strdup(name)) || + !(entry[1] = spdk_json_strdup(v))) { + bdev_rbd_free_config(*map); + *map = NULL; + return -1; + } + i += 1 + spdk_json_val_len(v); + entry += 2; + } + + return 0; +} + +static const struct spdk_json_object_decoder rpc_create_rbd_decoders[] = { + {"name", offsetof(struct rpc_create_rbd, name), spdk_json_decode_string, true}, + {"user_id", offsetof(struct rpc_create_rbd, user_id), spdk_json_decode_string, true}, + {"pool_name", offsetof(struct rpc_create_rbd, pool_name), spdk_json_decode_string}, + {"rbd_name", offsetof(struct rpc_create_rbd, rbd_name), spdk_json_decode_string}, + {"block_size", offsetof(struct rpc_create_rbd, block_size), spdk_json_decode_uint32}, + {"config", offsetof(struct rpc_create_rbd, config), bdev_rbd_decode_config, true} +}; + +static void +rpc_bdev_rbd_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_create_rbd req = {}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *bdev; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_create_rbd_decoders, + SPDK_COUNTOF(rpc_create_rbd_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RBD, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = bdev_rbd_create(&bdev, req.name, req.user_id, req.pool_name, + (const char *const *)req.config, + req.rbd_name, + req.block_size); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_create_rbd(&req); +} +SPDK_RPC_REGISTER("bdev_rbd_create", rpc_bdev_rbd_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_rbd_create, construct_rbd_bdev) + +struct rpc_bdev_rbd_delete { + char *name; +}; + +static void +free_rpc_bdev_rbd_delete(struct rpc_bdev_rbd_delete *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_rbd_delete_decoders[] = { + {"name", offsetof(struct rpc_bdev_rbd_delete, name), spdk_json_decode_string}, +}; + +static void +_rpc_bdev_rbd_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = 
spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_rbd_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_rbd_delete req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_bdev_rbd_delete_decoders, + SPDK_COUNTOF(rpc_bdev_rbd_delete_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + bdev_rbd_delete(bdev, _rpc_bdev_rbd_delete_cb, request); + +cleanup: + free_rpc_bdev_rbd_delete(&req); +} +SPDK_RPC_REGISTER("bdev_rbd_delete", rpc_bdev_rbd_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_rbd_delete, delete_rbd_bdev) + +struct rpc_bdev_rbd_resize { + char *name; + uint64_t new_size; +}; + +static const struct spdk_json_object_decoder rpc_bdev_rbd_resize_decoders[] = { + {"name", offsetof(struct rpc_bdev_rbd_resize, name), spdk_json_decode_string}, + {"new_size", offsetof(struct rpc_bdev_rbd_resize, new_size), spdk_json_decode_uint64} +}; + +static void +free_rpc_bdev_rbd_resize(struct rpc_bdev_rbd_resize *req) +{ + free(req->name); +} + +static void +rpc_bdev_rbd_resize(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_rbd_resize req = {}; + struct spdk_bdev *bdev; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_rbd_resize_decoders, + SPDK_COUNTOF(rpc_bdev_rbd_resize_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + rc = bdev_rbd_resize(bdev, req.new_size); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +cleanup: + free_rpc_bdev_rbd_resize(&req); +} +SPDK_RPC_REGISTER("bdev_rbd_resize", rpc_bdev_rbd_resize, SPDK_RPC_RUNTIME) diff --git a/src/spdk/module/bdev/rpc/Makefile b/src/spdk/module/bdev/rpc/Makefile new file mode 100644 index 000000000..15de4fef9 --- /dev/null +++ b/src/spdk/module/bdev/rpc/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = bdev_rpc.c +LIBNAME = bdev_rpc + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/rpc/bdev_rpc.c b/src/spdk/module/bdev/rpc/bdev_rpc.c new file mode 100644 index 000000000..166ab1a42 --- /dev/null +++ b/src/spdk/module/bdev/rpc/bdev_rpc.c @@ -0,0 +1,676 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/rpc.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/histogram_data.h" +#include "spdk/base64.h" + +#include "spdk/bdev_module.h" + +struct rpc_bdev_get_iostat_ctx { + int bdev_count; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; +}; + +static void +rpc_bdev_get_iostat_cb(struct spdk_bdev *bdev, + struct spdk_bdev_io_stat *stat, void *cb_arg, int rc) +{ + struct rpc_bdev_get_iostat_ctx *ctx = cb_arg; + struct spdk_json_write_ctx *w = ctx->w; + const char *bdev_name; + + if (rc != 0) { + goto done; + } + + bdev_name = spdk_bdev_get_name(bdev); + if (bdev_name != NULL) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "name", bdev_name); + + spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); + + spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); + + spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); + + spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); + + spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); + + spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); + + spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); + + spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); + + spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); + + if (spdk_bdev_get_qd_sampling_period(bdev)) { + spdk_json_write_named_uint64(w, "queue_depth_polling_period", + spdk_bdev_get_qd_sampling_period(bdev)); + + spdk_json_write_named_uint64(w, "queue_depth", spdk_bdev_get_qd(bdev)); + + spdk_json_write_named_uint64(w, "io_time", spdk_bdev_get_io_time(bdev)); + + spdk_json_write_named_uint64(w, "weighted_io_time", + spdk_bdev_get_weighted_io_time(bdev)); + } + + spdk_json_write_object_end(w); + } + +done: + free(stat); + if (--ctx->bdev_count == 0) { + spdk_json_write_array_end(ctx->w); + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(ctx->request, ctx->w); + free(ctx); + } +} + +struct rpc_bdev_get_iostat { + char *name; +}; + +static void +free_rpc_bdev_get_iostat(struct rpc_bdev_get_iostat *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_get_iostat_decoders[] = { + {"name", offsetof(struct rpc_bdev_get_iostat, name), spdk_json_decode_string, true}, +}; + +static void +rpc_bdev_get_iostat(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_get_iostat req = {}; + struct spdk_bdev *bdev = NULL; + struct spdk_json_write_ctx *w; + struct spdk_bdev_io_stat *stat; + struct rpc_bdev_get_iostat_ctx *ctx; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_bdev_get_iostat_decoders, + SPDK_COUNTOF(rpc_bdev_get_iostat_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + free_rpc_bdev_get_iostat(&req); + return; + } + + if (req.name) { + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + free_rpc_bdev_get_iostat(&req); + return; + } + } + } + + free_rpc_bdev_get_iostat(&req); + + ctx = calloc(1, sizeof(struct rpc_bdev_get_iostat_ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("Failed to allocate 
rpc_bdev_get_iostat_ctx struct\n"); + spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); + return; + } + + w = spdk_jsonrpc_begin_result(request); + /* + * Increment initial bdev_count so that it will never reach 0 in the middle + * of iterating. + */ + ctx->bdev_count++; + ctx->request = request; + ctx->w = w; + + spdk_json_write_object_begin(w); + spdk_json_write_named_uint64(w, "tick_rate", spdk_get_ticks_hz()); + spdk_json_write_named_uint64(w, "ticks", spdk_get_ticks()); + + spdk_json_write_named_array_begin(w, "bdevs"); + + if (bdev != NULL) { + stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); + if (stat == NULL) { + SPDK_ERRLOG("Failed to allocate spdk_bdev_io_stat struct\n"); + } else { + ctx->bdev_count++; + spdk_bdev_get_device_stat(bdev, stat, rpc_bdev_get_iostat_cb, ctx); + } + } else { + for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) { + stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); + if (stat == NULL) { + SPDK_ERRLOG("Failed to allocate spdk_bdev_io_stat struct\n"); + break; + } + ctx->bdev_count++; + spdk_bdev_get_device_stat(bdev, stat, rpc_bdev_get_iostat_cb, ctx); + } + } + + if (--ctx->bdev_count == 0) { + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); + free(ctx); + } +} +SPDK_RPC_REGISTER("bdev_get_iostat", rpc_bdev_get_iostat, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_get_iostat, get_bdevs_iostat) + +static void +rpc_dump_bdev_info(struct spdk_json_write_ctx *w, + struct spdk_bdev *bdev) +{ + struct spdk_bdev_alias *tmp; + uint64_t qos_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; + int i; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "name", spdk_bdev_get_name(bdev)); + + spdk_json_write_named_array_begin(w, "aliases"); + + TAILQ_FOREACH(tmp, spdk_bdev_get_aliases(bdev), tailq) { + spdk_json_write_string(w, tmp->alias); + } + + spdk_json_write_array_end(w); + + spdk_json_write_named_string(w, "product_name", spdk_bdev_get_product_name(bdev)); + + spdk_json_write_named_uint32(w, "block_size", spdk_bdev_get_block_size(bdev)); + + spdk_json_write_named_uint64(w, "num_blocks", spdk_bdev_get_num_blocks(bdev)); + + if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { + char uuid_str[SPDK_UUID_STRING_LEN]; + + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + } + + if (spdk_bdev_get_md_size(bdev) != 0) { + spdk_json_write_named_uint32(w, "md_size", spdk_bdev_get_md_size(bdev)); + spdk_json_write_named_bool(w, "md_interleave", spdk_bdev_is_md_interleaved(bdev)); + spdk_json_write_named_uint32(w, "dif_type", spdk_bdev_get_dif_type(bdev)); + if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { + spdk_json_write_named_bool(w, "dif_is_head_of_md", spdk_bdev_is_dif_head_of_md(bdev)); + spdk_json_write_named_object_begin(w, "enabled_dif_check_types"); + spdk_json_write_named_bool(w, "reftag", + spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)); + spdk_json_write_named_bool(w, "apptag", + spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_APPTAG)); + spdk_json_write_named_bool(w, "guard", + spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)); + spdk_json_write_object_end(w); + } + } + + spdk_json_write_named_object_begin(w, "assigned_rate_limits"); + spdk_bdev_get_qos_rate_limits(bdev, qos_limits); + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + spdk_json_write_named_uint64(w,
spdk_bdev_get_qos_rpc_type(i), qos_limits[i]); + } + spdk_json_write_object_end(w); + + spdk_json_write_named_bool(w, "claimed", (bdev->internal.claim_module != NULL)); + + spdk_json_write_named_bool(w, "zoned", bdev->zoned); + if (bdev->zoned) { + spdk_json_write_named_uint64(w, "zone_size", bdev->zone_size); + spdk_json_write_named_uint64(w, "max_open_zones", bdev->max_open_zones); + spdk_json_write_named_uint64(w, "optimal_open_zones", bdev->optimal_open_zones); + } + + spdk_json_write_named_object_begin(w, "supported_io_types"); + spdk_json_write_named_bool(w, "read", + spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)); + spdk_json_write_named_bool(w, "write", + spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); + spdk_json_write_named_bool(w, "unmap", + spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)); + spdk_json_write_named_bool(w, "write_zeroes", + spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)); + spdk_json_write_named_bool(w, "flush", + spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)); + spdk_json_write_named_bool(w, "reset", + spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_RESET)); + spdk_json_write_named_bool(w, "nvme_admin", + spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN)); + spdk_json_write_named_bool(w, "nvme_io", + spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO)); + spdk_json_write_object_end(w); + + spdk_json_write_named_object_begin(w, "driver_specific"); + spdk_bdev_dump_info_json(bdev, w); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +struct rpc_bdev_get_bdevs { + char *name; +}; + +static void +free_rpc_bdev_get_bdevs(struct rpc_bdev_get_bdevs *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_get_bdevs_decoders[] = { + {"name", offsetof(struct rpc_bdev_get_bdevs, name), spdk_json_decode_string, true}, +}; + +static void +rpc_bdev_get_bdevs(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_get_bdevs req = {}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *bdev = NULL; + + if (params && spdk_json_decode_object(params, rpc_bdev_get_bdevs_decoders, + SPDK_COUNTOF(rpc_bdev_get_bdevs_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + free_rpc_bdev_get_bdevs(&req); + return; + } + + if (req.name) { + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + free_rpc_bdev_get_bdevs(&req); + return; + } + } + + free_rpc_bdev_get_bdevs(&req); + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + if (bdev != NULL) { + rpc_dump_bdev_info(w, bdev); + } else { + for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) { + rpc_dump_bdev_info(w, bdev); + } + } + + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("bdev_get_bdevs", rpc_bdev_get_bdevs, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_get_bdevs, get_bdevs) + +struct rpc_bdev_set_qd_sampling_period { + char *name; + uint64_t period; +}; + +static void +free_rpc_bdev_set_qd_sampling_period(struct rpc_bdev_set_qd_sampling_period *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder + rpc_bdev_set_qd_sampling_period_decoders[] = { 
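+ /* + * Decodes params of the form (the bdev name here is illustrative): + * {"name": "Malloc0", "period": 20} + * where "period" is the queue depth sampling interval in microseconds + * and 0 disables sampling. + */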
+ {"name", offsetof(struct rpc_bdev_set_qd_sampling_period, name), spdk_json_decode_string}, + {"period", offsetof(struct rpc_bdev_set_qd_sampling_period, period), spdk_json_decode_uint64}, +}; + +static void +rpc_bdev_set_qd_sampling_period(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_set_qd_sampling_period req = {0}; + struct spdk_bdev *bdev; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_bdev_set_qd_sampling_period_decoders, + SPDK_COUNTOF(rpc_bdev_set_qd_sampling_period_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_bdev_set_qd_sampling_period(bdev, req.period); + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_bdev_set_qd_sampling_period(&req); +} +SPDK_RPC_REGISTER("bdev_set_qd_sampling_period", + rpc_bdev_set_qd_sampling_period, + SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_set_qd_sampling_period, + set_bdev_qd_sampling_period) + +struct rpc_bdev_set_qos_limit { + char *name; + uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; +}; + +static void +free_rpc_bdev_set_qos_limit(struct rpc_bdev_set_qos_limit *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_set_qos_limit_decoders[] = { + {"name", offsetof(struct rpc_bdev_set_qos_limit, name), spdk_json_decode_string}, + { + "rw_ios_per_sec", offsetof(struct rpc_bdev_set_qos_limit, + limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT]), + spdk_json_decode_uint64, true + }, + { + "rw_mbytes_per_sec", offsetof(struct rpc_bdev_set_qos_limit, + limits[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT]), + spdk_json_decode_uint64, true + }, + { + "r_mbytes_per_sec", offsetof(struct rpc_bdev_set_qos_limit, + limits[SPDK_BDEV_QOS_R_BPS_RATE_LIMIT]), + spdk_json_decode_uint64, true + }, + { + "w_mbytes_per_sec", offsetof(struct rpc_bdev_set_qos_limit, + limits[SPDK_BDEV_QOS_W_BPS_RATE_LIMIT]), + spdk_json_decode_uint64, true + }, +}; + +static void +rpc_bdev_set_qos_limit_complete(void *cb_arg, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + if (status != 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Failed to configure rate limit: %s", + spdk_strerror(-status)); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_set_qos_limit(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_set_qos_limit req = {NULL, {UINT64_MAX, UINT64_MAX, UINT64_MAX, UINT64_MAX}}; + struct spdk_bdev *bdev; + int i; + + if (spdk_json_decode_object(params, rpc_bdev_set_qos_limit_decoders, + SPDK_COUNTOF(rpc_bdev_set_qos_limit_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); + 
spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (req.limits[i] != UINT64_MAX) { + break; + } + } + if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { + SPDK_ERRLOG("no rate limits specified\n"); + spdk_jsonrpc_send_error_response(request, -EINVAL, "No rate limits specified"); + goto cleanup; + } + + spdk_bdev_set_qos_rate_limits(bdev, req.limits, rpc_bdev_set_qos_limit_complete, request); + +cleanup: + free_rpc_bdev_set_qos_limit(&req); +} + +SPDK_RPC_REGISTER("bdev_set_qos_limit", rpc_bdev_set_qos_limit, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_set_qos_limit, set_bdev_qos_limit) + +/* SPDK_RPC_ENABLE_BDEV_HISTOGRAM */ + +struct rpc_bdev_enable_histogram_request { + char *name; + bool enable; +}; + +static void +free_rpc_bdev_enable_histogram_request(struct rpc_bdev_enable_histogram_request *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_bdev_enable_histogram_request_decoders[] = { + {"name", offsetof(struct rpc_bdev_enable_histogram_request, name), spdk_json_decode_string}, + {"enable", offsetof(struct rpc_bdev_enable_histogram_request, enable), spdk_json_decode_bool}, +}; + +static void +bdev_histogram_status_cb(void *cb_arg, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_bool(w, status == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_enable_histogram(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_enable_histogram_request req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_bdev_enable_histogram_request_decoders, + SPDK_COUNTOF(rpc_bdev_enable_histogram_request_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + spdk_bdev_histogram_enable(bdev, bdev_histogram_status_cb, request, req.enable); + +cleanup: + free_rpc_bdev_enable_histogram_request(&req); +} + +SPDK_RPC_REGISTER("bdev_enable_histogram", rpc_bdev_enable_histogram, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_enable_histogram, enable_bdev_histogram) + +/* SPDK_RPC_GET_BDEV_HISTOGRAM */ + +struct rpc_bdev_get_histogram_request { + char *name; +}; + +static const struct spdk_json_object_decoder rpc_bdev_get_histogram_request_decoders[] = { + {"name", offsetof(struct rpc_bdev_get_histogram_request, name), spdk_json_decode_string} +}; + +static void +free_rpc_bdev_get_histogram_request(struct rpc_bdev_get_histogram_request *r) +{ + free(r->name); +} + +static void +_rpc_bdev_histogram_data_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + int rc; + char *encoded_histogram; + size_t src_len, dst_len; + + + if (status != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(-status)); + goto invalid; + } + + src_len = SPDK_HISTOGRAM_NUM_BUCKETS(histogram) * sizeof(uint64_t); + dst_len = spdk_base64_get_encoded_strlen(src_len) + 1; + + encoded_histogram = malloc(dst_len); + if 
(encoded_histogram == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(ENOMEM)); + goto invalid; + } + + rc = spdk_base64_encode(encoded_histogram, histogram->bucket, src_len); + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(-rc)); + goto free_encoded_histogram; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "histogram", encoded_histogram); + spdk_json_write_named_int64(w, "bucket_shift", histogram->bucket_shift); + spdk_json_write_named_int64(w, "tsc_rate", spdk_get_ticks_hz()); + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); + +free_encoded_histogram: + free(encoded_histogram); +invalid: + spdk_histogram_data_free(histogram); +} + +static void +rpc_bdev_get_histogram(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_get_histogram_request req = {NULL}; + struct spdk_histogram_data *histogram; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_bdev_get_histogram_request_decoders, + SPDK_COUNTOF(rpc_bdev_get_histogram_request_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + histogram = spdk_histogram_data_alloc(); + if (histogram == NULL) { + spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); + goto cleanup; + } + + spdk_bdev_histogram_get(bdev, histogram, _rpc_bdev_histogram_data_cb, request); + +cleanup: + free_rpc_bdev_get_histogram_request(&req); +} + +SPDK_RPC_REGISTER("bdev_get_histogram", rpc_bdev_get_histogram, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_get_histogram, get_bdev_histogram) diff --git a/src/spdk/module/bdev/split/Makefile b/src/spdk/module/bdev/split/Makefile new file mode 100644 index 000000000..830224c62 --- /dev/null +++ b/src/spdk/module/bdev/split/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = vbdev_split.c vbdev_split_rpc.c +LIBNAME = bdev_split + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/split/vbdev_split.c b/src/spdk/module/bdev/split/vbdev_split.c new file mode 100644 index 000000000..fd175d339 --- /dev/null +++ b/src/spdk/module/bdev/split/vbdev_split.c @@ -0,0 +1,582 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This is a simple example of a virtual block device that takes a single + * bdev and slices it into multiple smaller bdevs. 
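+ * + * For example (names hypothetical): splitting a 1 GiB base bdev "Nvme0n1" + * with split_count=4 and no explicit split size yields four 256 MiB bdevs + * named "Nvme0n1p0" through "Nvme0n1p3", each mapped at consecutive + * offsets of the base bdev.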
+ */ + +#include "vbdev_split.h" + +#include "spdk/rpc.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +struct spdk_vbdev_split_config { + char *base_bdev; + unsigned split_count; + uint64_t split_size_mb; + + SPDK_BDEV_PART_TAILQ splits; + struct spdk_bdev_part_base *split_base; + + TAILQ_ENTRY(spdk_vbdev_split_config) tailq; +}; + +static TAILQ_HEAD(, spdk_vbdev_split_config) g_split_config = TAILQ_HEAD_INITIALIZER( + g_split_config); + +struct vbdev_split_channel { + struct spdk_bdev_part_channel part_ch; +}; + +struct vbdev_split_bdev_io { + struct spdk_io_channel *ch; + struct spdk_bdev_io *bdev_io; + + /* for bdev_io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; +}; + +static void vbdev_split_del_config(struct spdk_vbdev_split_config *cfg); + +static int vbdev_split_init(void); +static void vbdev_split_fini(void); +static void vbdev_split_examine(struct spdk_bdev *bdev); +static int vbdev_split_config_json(struct spdk_json_write_ctx *w); +static int vbdev_split_get_ctx_size(void); + +static void +_vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io); + +static struct spdk_bdev_module split_if = { + .name = "split", + .module_init = vbdev_split_init, + .module_fini = vbdev_split_fini, + .get_ctx_size = vbdev_split_get_ctx_size, + .examine_config = vbdev_split_examine, + .config_json = vbdev_split_config_json, +}; + +SPDK_BDEV_MODULE_REGISTER(split, &split_if) + +static void +vbdev_split_base_free(void *ctx) +{ + struct spdk_vbdev_split_config *cfg = ctx; + + vbdev_split_del_config(cfg); +} + +static int +_vbdev_split_destruct(void *ctx) +{ + struct spdk_bdev_part *part = ctx; + + return spdk_bdev_part_free(part); +} + +static void +vbdev_split_base_bdev_hotremove_cb(void *_part_base) +{ + struct spdk_bdev_part_base *part_base = _part_base; + struct spdk_vbdev_split_config *cfg = spdk_bdev_part_base_get_ctx(part_base); + + spdk_bdev_part_base_hotremove(part_base, &cfg->splits); +} + +static void +vbdev_split_resubmit_io(void *arg) +{ + struct vbdev_split_bdev_io *split_io = (struct vbdev_split_bdev_io *)arg; + + _vbdev_split_submit_request(split_io->ch, split_io->bdev_io); +} + +static void +vbdev_split_queue_io(struct vbdev_split_bdev_io *split_io) +{ + struct vbdev_split_channel *ch = spdk_io_channel_get_ctx(split_io->ch); + int rc; + + split_io->bdev_io_wait.bdev = split_io->bdev_io->bdev; + split_io->bdev_io_wait.cb_fn = vbdev_split_resubmit_io; + split_io->bdev_io_wait.cb_arg = split_io; + + rc = spdk_bdev_queue_io_wait(split_io->bdev_io->bdev, + ch->part_ch.base_ch, &split_io->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vbdev_split_queue_io, rc=%d\n", rc); + spdk_bdev_io_complete(split_io->bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +_vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct vbdev_split_channel *ch = spdk_io_channel_get_ctx(_ch); + struct vbdev_split_bdev_io *io_ctx = (struct vbdev_split_bdev_io *)bdev_io->driver_ctx; + int rc; + + rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "split: no memory, queue io.\n"); + io_ctx->ch = _ch; + io_ctx->bdev_io = bdev_io; + vbdev_split_queue_io(io_ctx); + } else { + SPDK_ERRLOG("split: error on io submission, rc=%d.\n", rc); + spdk_bdev_io_complete(bdev_io, 
SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +static void +vbdev_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + _vbdev_split_submit_request(ch, bdev_io); +} + +static void +vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, vbdev_split_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + default: + _vbdev_split_submit_request(_ch, bdev_io); + break; + } +} + +static int +vbdev_split_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct spdk_bdev_part *part = ctx; + struct spdk_bdev *split_base_bdev = spdk_bdev_part_get_base_bdev(part); + uint64_t offset_blocks = spdk_bdev_part_get_offset_blocks(part); + + spdk_json_write_named_object_begin(w, "split"); + + spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(split_base_bdev)); + spdk_json_write_named_uint64(w, "offset_blocks", offset_blocks); + + spdk_json_write_object_end(w); + + return 0; +} + +static void +vbdev_split_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* No config per bdev needed */ +} + +static struct spdk_bdev_fn_table vbdev_split_fn_table = { + .destruct = _vbdev_split_destruct, + .submit_request = vbdev_split_submit_request, + .dump_info_json = vbdev_split_dump_info_json, + .write_config_json = vbdev_split_write_config_json +}; + +static int +vbdev_split_create(struct spdk_vbdev_split_config *cfg) +{ + uint64_t split_size_blocks, offset_blocks; + uint64_t split_count, max_split_count; + uint64_t mb = 1024 * 1024; + uint64_t i; + int rc; + char *name; + struct spdk_bdev *base_bdev; + struct bdev_part_tailq *split_base_tailq; + + assert(cfg->split_count > 0); + + base_bdev = spdk_bdev_get_by_name(cfg->base_bdev); + if (!base_bdev) { + return -ENODEV; + } + + if (cfg->split_size_mb) { + if (((cfg->split_size_mb * mb) % base_bdev->blocklen) != 0) { + SPDK_ERRLOG("Split size %" PRIu64 " MB is not possible with block size " + "%" PRIu32 "\n", + cfg->split_size_mb, base_bdev->blocklen); + return -EINVAL; + } + split_size_blocks = (cfg->split_size_mb * mb) / base_bdev->blocklen; + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "Split size %" PRIu64 " MB specified by user\n", + cfg->split_size_mb); + } else { + split_size_blocks = base_bdev->blockcnt / cfg->split_count; + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "Split size not specified by user\n"); + } + + max_split_count = base_bdev->blockcnt / split_size_blocks; + split_count = cfg->split_count; + if (split_count > max_split_count) { + SPDK_WARNLOG("Split count %" PRIu64 " is greater than maximum possible split count " + "%" PRIu64 " - clamping\n", split_count, max_split_count); + split_count = max_split_count; + } + + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "base_bdev: %s split_count: %" PRIu64 + " split_size_blocks: %" PRIu64 "\n", + spdk_bdev_get_name(base_bdev), split_count, split_size_blocks); + + TAILQ_INIT(&cfg->splits); + cfg->split_base = spdk_bdev_part_base_construct(base_bdev, + vbdev_split_base_bdev_hotremove_cb, + &split_if, &vbdev_split_fn_table, + &cfg->splits, vbdev_split_base_free, cfg, + sizeof(struct vbdev_split_channel), NULL, NULL); + if (!cfg->split_base) { + SPDK_ERRLOG("Cannot construct bdev part base\n"); + return -ENOMEM; + } + + offset_blocks = 0; + for (i = 0; i < split_count; i++) { + struct spdk_bdev_part *d; + + d = 
calloc(1, sizeof(*d)); + if (d == NULL) { + SPDK_ERRLOG("could not allocate bdev part\n"); + rc = -ENOMEM; + goto err; + } + + name = spdk_sprintf_alloc("%sp%" PRIu64, cfg->base_bdev, i); + if (!name) { + SPDK_ERRLOG("could not allocate name\n"); + free(d); + rc = -ENOMEM; + goto err; + } + + rc = spdk_bdev_part_construct(d, cfg->split_base, name, offset_blocks, split_size_blocks, + "Split Disk"); + free(name); + if (rc) { + SPDK_ERRLOG("could not construct bdev part\n"); + /* name was already freed unconditionally above */ + free(d); + rc = -ENOMEM; + goto err; + } + + offset_blocks += split_size_blocks; + } + + return 0; +err: + split_base_tailq = spdk_bdev_part_base_get_tailq(cfg->split_base); + spdk_bdev_part_base_hotremove(cfg->split_base, split_base_tailq); + return rc; +} + +static void +vbdev_split_del_config(struct spdk_vbdev_split_config *cfg) +{ + TAILQ_REMOVE(&g_split_config, cfg, tailq); + free(cfg->base_bdev); + free(cfg); +} + +static void +vbdev_split_destruct_config(struct spdk_vbdev_split_config *cfg) +{ + struct bdev_part_tailq *split_base_tailq; + + if (cfg->split_base != NULL) { + split_base_tailq = spdk_bdev_part_base_get_tailq(cfg->split_base); + spdk_bdev_part_base_hotremove(cfg->split_base, split_base_tailq); + } else { + vbdev_split_del_config(cfg); + } +} + +static void +vbdev_split_clear_config(void) +{ + struct spdk_vbdev_split_config *cfg, *tmp_cfg; + + TAILQ_FOREACH_SAFE(cfg, &g_split_config, tailq, tmp_cfg) { + vbdev_split_destruct_config(cfg); + } +} + +static struct spdk_vbdev_split_config * +vbdev_split_config_find_by_base_name(const char *base_bdev_name) +{ + struct spdk_vbdev_split_config *cfg; + + TAILQ_FOREACH(cfg, &g_split_config, tailq) { + if (strcmp(cfg->base_bdev, base_bdev_name) == 0) { + return cfg; + } + } + + return NULL; +} + +static int +vbdev_split_add_config(const char *base_bdev_name, unsigned split_count, uint64_t split_size, + struct spdk_vbdev_split_config **config) +{ + struct spdk_vbdev_split_config *cfg; + assert(base_bdev_name); + + if (base_bdev_name == NULL) { + SPDK_ERRLOG("Split bdev config: no base bdev provided."); + return -EINVAL; + } + + if (split_count == 0) { + SPDK_ERRLOG("Split bdev config: split_count can't be 0."); + return -EINVAL; + } + + /* Check if we already have 'base_bdev_name' registered in config */ + cfg = vbdev_split_config_find_by_base_name(base_bdev_name); + if (cfg) { + SPDK_ERRLOG("Split bdev config for base bdev '%s' already exists.", base_bdev_name); + return -EEXIST; + } + + cfg = calloc(1, sizeof(*cfg)); + if (!cfg) { + SPDK_ERRLOG("calloc(): Out of memory"); + return -ENOMEM; + } + + cfg->base_bdev = strdup(base_bdev_name); + if (!cfg->base_bdev) { + SPDK_ERRLOG("strdup(): Out of memory"); + free(cfg); + return -ENOMEM; + } + + cfg->split_count = split_count; + cfg->split_size_mb = split_size; + TAILQ_INSERT_TAIL(&g_split_config, cfg, tailq); + if (config) { + *config = cfg; + } + + return 0; +} + +static int +vbdev_split_init(void) +{ + + struct spdk_conf_section *sp; + const char *base_bdev_name; + const char *split_count_str; + const char *split_size_str; + int rc, i, split_count, split_size; + + sp = spdk_conf_find_section(NULL, "Split"); + if (sp == NULL) { + return 0; + } + + for (i = 0; ; i++) { + if (!spdk_conf_section_get_nval(sp, "Split", i)) { + break; + } + + base_bdev_name = spdk_conf_section_get_nmval(sp, "Split", i, 0); + if (!base_bdev_name) { + SPDK_ERRLOG("Split configuration missing bdev name\n"); + rc = -EINVAL; + goto err; + } + + split_count_str =
spdk_conf_section_get_nmval(sp, "Split", i, 1); + if (!split_count_str) { + SPDK_ERRLOG("Split configuration missing split count\n"); + rc = -EINVAL; + goto err; + } + + split_count = spdk_strtol(split_count_str, 10); + if (split_count < 1) { + SPDK_ERRLOG("Invalid Split count %d\n", split_count); + rc = -EINVAL; + goto err; + } + + /* Optional split size in MB */ + split_size = 0; + split_size_str = spdk_conf_section_get_nmval(sp, "Split", i, 2); + if (split_size_str) { + split_size = spdk_strtol(split_size_str, 10); + if (split_size <= 0) { + SPDK_ERRLOG("Invalid Split size %d\n", split_size); + rc = -EINVAL; + goto err; + } + } + + rc = vbdev_split_add_config(base_bdev_name, split_count, split_size, NULL); + if (rc != 0) { + goto err; + } + } + + return 0; +err: + vbdev_split_clear_config(); + return rc; +} + +static void +vbdev_split_fini(void) +{ + vbdev_split_clear_config(); +} + +static void +vbdev_split_examine(struct spdk_bdev *bdev) +{ + struct spdk_vbdev_split_config *cfg = vbdev_split_config_find_by_base_name(bdev->name); + + if (cfg != NULL) { + assert(cfg->split_base == NULL); + + if (vbdev_split_create(cfg)) { + SPDK_ERRLOG("could not split bdev %s\n", bdev->name); + } + } + spdk_bdev_module_examine_done(&split_if); +} + +static int +vbdev_split_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_vbdev_split_config *cfg; + + TAILQ_FOREACH(cfg, &g_split_config, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_split_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "base_bdev", cfg->base_bdev); + spdk_json_write_named_uint32(w, "split_count", cfg->split_count); + spdk_json_write_named_uint64(w, "split_size_mb", cfg->split_size_mb); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + return 0; +} + +int +create_vbdev_split(const char *base_bdev_name, unsigned split_count, uint64_t split_size_mb) +{ + int rc; + struct spdk_vbdev_split_config *cfg; + + rc = vbdev_split_add_config(base_bdev_name, split_count, split_size_mb, &cfg); + if (rc) { + return rc; + } + + rc = vbdev_split_create(cfg); + if (rc == -ENODEV) { + /* It is ok if base bdev does not exist yet. */ + rc = 0; + } + + return rc; +} + +int +vbdev_split_destruct(const char *base_bdev_name) +{ + struct spdk_vbdev_split_config *cfg = vbdev_split_config_find_by_base_name(base_bdev_name); + + if (!cfg) { + SPDK_ERRLOG("Split configuration for '%s' not found\n", base_bdev_name); + return -ENOENT; + } + + vbdev_split_destruct_config(cfg); + return 0; +} + +struct spdk_bdev_part_base * +vbdev_split_get_part_base(struct spdk_bdev *bdev) +{ + struct spdk_vbdev_split_config *cfg; + + cfg = vbdev_split_config_find_by_base_name(spdk_bdev_get_name(bdev)); + + if (cfg == NULL) { + return NULL; + } + + return cfg->split_base; +} + +/* + * During init we'll be asked how much memory we'd like passed to us + * in bdev_io structures as context. Here's where we specify how + * much context we want per IO. + */ +static int +vbdev_split_get_ctx_size(void) +{ + return sizeof(struct vbdev_split_bdev_io); +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_split", SPDK_LOG_VBDEV_SPLIT) diff --git a/src/spdk/module/bdev/split/vbdev_split.h b/src/spdk/module/bdev/split/vbdev_split.h new file mode 100644 index 000000000..f468f2414 --- /dev/null +++ b/src/spdk/module/bdev/split/vbdev_split.h @@ -0,0 +1,68 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VBDEV_SPLIT_H +#define SPDK_VBDEV_SPLIT_H + +#include "spdk/bdev_module.h" + +/** + * Add the given disk name to the split config. If a bdev named \c base_bdev_name + * already exists, the split bdevs are created right away; otherwise they are + * created when the base bdev becomes available (during the examination process). + * + * \param base_bdev_name Base bdev name + * \param split_count number of splits to be created. + * \param split_size_mb size of each bdev. If 0, use base bdev size / split_count. + * \return 0 on success. Negative errno code on error. + */ +int create_vbdev_split(const char *base_bdev_name, unsigned split_count, uint64_t split_size_mb); + +/** + * Remove all created split bdevs and split config. + * + * \param base_bdev_name base bdev name + * \return 0 on success or negative errno value. + */ +int vbdev_split_destruct(const char *base_bdev_name); + +/** + * Get the spdk_bdev_part_base associated with the given split base_bdev. + * + * \param base_bdev Bdev to get the part_base from + * \return pointer to the associated spdk_bdev_part_base + * \return NULL if the base_bdev is not being split by the split module + */ +struct spdk_bdev_part_base *vbdev_split_get_part_base(struct spdk_bdev *base_bdev); + +#endif /* SPDK_VBDEV_SPLIT_H */ diff --git a/src/spdk/module/bdev/split/vbdev_split_rpc.c b/src/spdk/module/bdev/split/vbdev_split_rpc.c new file mode 100644 index 000000000..a8c6f3be0 --- /dev/null +++ b/src/spdk/module/bdev/split/vbdev_split_rpc.c @@ -0,0 +1,145 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "vbdev_split.h" +#include "spdk_internal/log.h" + +struct rpc_construct_split { + char *base_bdev; + uint32_t split_count; + uint64_t split_size_mb; +}; + +static const struct spdk_json_object_decoder rpc_construct_split_decoders[] = { + {"base_bdev", offsetof(struct rpc_construct_split, base_bdev), spdk_json_decode_string}, + {"split_count", offsetof(struct rpc_construct_split, split_count), spdk_json_decode_uint32}, + {"split_size_mb", offsetof(struct rpc_construct_split, split_size_mb), spdk_json_decode_uint64, true}, +}; + +static void +rpc_bdev_split_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_split req = {}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *base_bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_split_decoders, + SPDK_COUNTOF(rpc_construct_split_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + rc = create_vbdev_split(req.base_bdev, req.split_count, req.split_size_mb); + if (rc < 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Failed to create %"PRIu32" split bdevs from '%s': %s", + req.split_count, req.base_bdev, spdk_strerror(-rc)); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_array_begin(w); + + base_bdev = spdk_bdev_get_by_name(req.base_bdev); + if (base_bdev != NULL) { + struct spdk_bdev_part_base *split_base; + struct bdev_part_tailq *split_base_tailq; + struct spdk_bdev_part *split_part; + struct spdk_bdev *split_bdev; + + split_base = vbdev_split_get_part_base(base_bdev); + + assert(split_base != NULL); + + split_base_tailq = spdk_bdev_part_base_get_tailq(split_base); + TAILQ_FOREACH(split_part, split_base_tailq, tailq) { + split_bdev = spdk_bdev_part_get_bdev(split_part); + spdk_json_write_string(w, spdk_bdev_get_name(split_bdev)); + } + } + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + +out: + free(req.base_bdev); +} +SPDK_RPC_REGISTER("bdev_split_create", rpc_bdev_split_create, SPDK_RPC_RUNTIME) 
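+/*
+ * Example (for illustration only; "Malloc0" is a hypothetical base bdev):
+ * a JSON-RPC request as decoded by rpc_bdev_split_create() above.
+ *
+ *   {
+ *     "jsonrpc": "2.0",
+ *     "id": 1,
+ *     "method": "bdev_split_create",
+ *     "params": { "base_bdev": "Malloc0", "split_count": 4 }
+ *   }
+ *
+ * The optional "split_size_mb" parameter overrides the default split size of
+ * base bdev size / split_count. With the "%sp%" PRIu64 naming scheme used in
+ * vbdev_split_create(), the result array would be ["Malloc0p0", "Malloc0p1",
+ * "Malloc0p2", "Malloc0p3"].
+ */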
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_split_create, construct_split_vbdev) + +struct rpc_delete_split { + char *base_bdev; +}; + +static const struct spdk_json_object_decoder rpc_delete_split_decoders[] = { + {"base_bdev", offsetof(struct rpc_delete_split, base_bdev), spdk_json_decode_string}, +}; + +static void +rpc_bdev_split_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_split req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_split_decoders, + SPDK_COUNTOF(rpc_delete_split_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + rc = vbdev_split_destruct(req.base_bdev); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +out: + free(req.base_bdev); +} +SPDK_RPC_REGISTER("bdev_split_delete", rpc_bdev_split_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_split_delete, destruct_split_vbdev) diff --git a/src/spdk/module/bdev/uring/Makefile b/src/spdk/module/bdev/uring/Makefile new file mode 100644 index 000000000..2a97f1564 --- /dev/null +++ b/src/spdk/module/bdev/uring/Makefile @@ -0,0 +1,51 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = bdev_uring.c bdev_uring_rpc.c +LIBNAME = bdev_uring +LOCAL_SYS_LIBS = -luring + +ifneq ($(strip $(CONFIG_URING_PATH)),) +CFLAGS += -I$(CONFIG_URING_PATH) +LDFLAGS += -L$(CONFIG_URING_PATH) +endif + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/uring/bdev_uring.c b/src/spdk/module/bdev/uring/bdev_uring.c new file mode 100644 index 000000000..494cc4794 --- /dev/null +++ b/src/spdk/module/bdev/uring/bdev_uring.c @@ -0,0 +1,676 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "bdev_uring.h" + +#include "spdk/stdinc.h" + +#include "spdk/barrier.h" +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/fd.h" +#include "spdk/likely.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" +#include "spdk_internal/uring.h" + +struct bdev_uring_io_channel { + struct bdev_uring_group_channel *group_ch; +}; + +struct bdev_uring_group_channel { + uint64_t io_inflight; + uint64_t io_pending; + struct spdk_poller *poller; + struct io_uring uring; +}; + +struct bdev_uring_task { + uint64_t len; + struct bdev_uring_io_channel *ch; + TAILQ_ENTRY(bdev_uring_task) link; +}; + +struct bdev_uring { + struct spdk_bdev bdev; + char *filename; + int fd; + TAILQ_ENTRY(bdev_uring) link; +}; + +static int bdev_uring_init(void); +static void bdev_uring_fini(void); +static void uring_free_bdev(struct bdev_uring *uring); +static void bdev_uring_get_spdk_running_config(FILE *fp); +static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head; + +#define SPDK_URING_QUEUE_DEPTH 512 +#define MAX_EVENTS_PER_POLL 32 + +static int +bdev_uring_get_ctx_size(void) +{ + return sizeof(struct bdev_uring_task); +} + +static struct spdk_bdev_module uring_if = { + .name = "uring", + .module_init = bdev_uring_init, + .module_fini = bdev_uring_fini, + .config_text = bdev_uring_get_spdk_running_config, + .get_ctx_size = bdev_uring_get_ctx_size, +}; + +SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) + +static int +bdev_uring_open(struct bdev_uring *bdev) +{ + int fd; + + fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME); + if (fd < 0) { + /* Try without O_DIRECT for non-disk files */ + fd = open(bdev->filename, O_RDWR | O_NOATIME); + if (fd < 0) { + SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", + bdev->filename, errno, spdk_strerror(errno)); + bdev->fd = -1; + return -1; + } + } + + bdev->fd = fd; + + return 0; +} + +static int +bdev_uring_close(struct bdev_uring *bdev) +{ + int rc; + + if (bdev->fd == -1) { + return 0; + } + + rc = close(bdev->fd); + if (rc < 0) { + SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", + bdev->fd, errno, spdk_strerror(errno)); + return -1; + } + + bdev->fd = -1; + + return 0; +} + +static int64_t +bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, + struct bdev_uring_task *uring_task, + struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) +{ + struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); + struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(&group_ch->uring); + io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); + io_uring_sqe_set_data(sqe, uring_task); + uring_task->len = nbytes; + uring_task->ch = uring_ch; + + SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n", + iovcnt, nbytes, offset); + + group_ch->io_pending++; + return nbytes; +} + +static int64_t +bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, + struct bdev_uring_task *uring_task, + struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) +{ + struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); + struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(&group_ch->uring); + io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); + io_uring_sqe_set_data(sqe, uring_task); + uring_task->len = nbytes; + uring_task->ch = uring_ch; + + 
SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n", + iovcnt, nbytes, offset); + + group_ch->io_pending++; + return nbytes; +} + +static int +bdev_uring_destruct(void *ctx) +{ + struct bdev_uring *uring = ctx; + int rc = 0; + + TAILQ_REMOVE(&g_uring_bdev_head, uring, link); + rc = bdev_uring_close(uring); + if (rc < 0) { + SPDK_ERRLOG("bdev_uring_close() failed\n"); + } + spdk_io_device_unregister(uring, NULL); + uring_free_bdev(uring); + return rc; +} + +static int +bdev_uring_reap(struct io_uring *ring, int max) +{ + int i, count, ret; + struct io_uring_cqe *cqe; + struct bdev_uring_task *uring_task; + enum spdk_bdev_io_status status; + + count = 0; + for (i = 0; i < max; i++) { + ret = io_uring_peek_cqe(ring, &cqe); + if (ret != 0) { + return ret; + } + + if (cqe == NULL) { + return count; + } + + uring_task = (struct bdev_uring_task *)cqe->user_data; + if (cqe->res != (signed)uring_task->len) { + status = SPDK_BDEV_IO_STATUS_FAILED; + } else { + status = SPDK_BDEV_IO_STATUS_SUCCESS; + } + + uring_task->ch->group_ch->io_inflight--; + io_uring_cqe_seen(ring, cqe); + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); + count++; + } + + return count; +} + +static int +bdev_uring_group_poll(void *arg) +{ + struct bdev_uring_group_channel *group_ch = arg; + int to_complete, to_submit; + int count, ret; + + to_submit = group_ch->io_pending; + to_complete = group_ch->io_inflight; + + ret = 0; + if (to_submit > 0) { + /* If there are I/O to submit, use io_uring_submit here. + * It will automatically call spdk_io_uring_enter appropriately. */ + ret = io_uring_submit(&group_ch->uring); + group_ch->io_pending = 0; + group_ch->io_inflight += to_submit; + } else if (to_complete > 0) { + /* If there are I/O in flight but none to submit, we need to + * call io_uring_enter ourselves. */ + ret = spdk_io_uring_enter(group_ch->uring.ring_fd, 0, 0, + IORING_ENTER_GETEVENTS); + } + + if (ret < 0) { + return SPDK_POLLER_BUSY; + } + + count = 0; + if (to_complete > 0) { + count = bdev_uring_reap(&group_ch->uring, to_complete); + } + + if (count + to_submit > 0) { + return SPDK_POLLER_BUSY; + } else { + return SPDK_POLLER_IDLE; + } +} + +static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, + ch, + (struct bdev_uring_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, + ch, + (struct bdev_uring_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + break; + default: + SPDK_ERRLOG("Wrong io type\n"); + break; + } +} + +static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + /* Read and write operations must be performed on buffers aligned to + * bdev->required_alignment. If user specified unaligned buffers, + * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. 
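+	 * For this module, required_alignment is set to spdk_u32log2(block_size)
+	 * in create_uring_bdev(), since the backing file is opened with O_DIRECT
+	 * where possible and O_DIRECT I/O generally must be aligned to the
+	 * logical block size.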
*/ + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + default: + return -1; + } +} + +static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_uring_submit_request(ch, bdev_io) < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + return true; + default: + return false; + } +} + +static int +bdev_uring_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_uring_io_channel *ch = ctx_buf; + + ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); + + return 0; +} + +static void +bdev_uring_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_uring_io_channel *ch = ctx_buf; + + spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); +} + +static struct spdk_io_channel * +bdev_uring_get_io_channel(void *ctx) +{ + struct bdev_uring *uring = ctx; + + return spdk_get_io_channel(uring); +} + +static int +bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct bdev_uring *uring = ctx; + + spdk_json_write_named_object_begin(w, "uring"); + + spdk_json_write_named_string(w, "filename", uring->filename); + + spdk_json_write_object_end(w); + + return 0; +} + +static void +bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct bdev_uring *uring = bdev->ctxt; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_uring_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); + spdk_json_write_named_string(w, "filename", uring->filename); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table uring_fn_table = { + .destruct = bdev_uring_destruct, + .submit_request = bdev_uring_submit_request, + .io_type_supported = bdev_uring_io_type_supported, + .get_io_channel = bdev_uring_get_io_channel, + .dump_info_json = bdev_uring_dump_info_json, + .write_config_json = bdev_uring_write_json_config, +}; + +static void uring_free_bdev(struct bdev_uring *uring) +{ + if (uring == NULL) { + return; + } + free(uring->filename); + free(uring->bdev.name); + free(uring); +} + +static int +bdev_uring_group_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_uring_group_channel *ch = ctx_buf; + + if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) { + SPDK_ERRLOG("uring I/O context setup failure\n"); + return -1; + } + + ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0); + return 0; +} + +static void +bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_uring_group_channel *ch = ctx_buf; + + io_uring_queue_exit(&ch->uring); + + spdk_poller_unregister(&ch->poller); +} + +struct spdk_bdev * +create_uring_bdev(const char *name, const char *filename, uint32_t block_size) +{ + struct bdev_uring *uring; + uint32_t detected_block_size; + uint64_t bdev_size; + int rc; + + uring = calloc(1, sizeof(*uring)); + if (!uring) { + SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); + return NULL; + } + + uring->filename = strdup(filename); + if 
(!uring->filename) { + goto error_return; + } + + if (bdev_uring_open(uring)) { + SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); + goto error_return; + } + + bdev_size = spdk_fd_get_size(uring->fd); + + uring->bdev.name = strdup(name); + if (!uring->bdev.name) { + goto error_return; + } + uring->bdev.product_name = "URING bdev"; + uring->bdev.module = &uring_if; + + uring->bdev.write_cache = 1; + + detected_block_size = spdk_fd_get_blocklen(uring->fd); + if (block_size == 0) { + /* User did not specify block size - use autodetected block size. */ + if (detected_block_size == 0) { + SPDK_ERRLOG("Block size could not be auto-detected\n"); + goto error_return; + } + block_size = detected_block_size; + } else { + if (block_size < detected_block_size) { + SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " + "auto-detected block size %" PRIu32 "\n", + block_size, detected_block_size); + goto error_return; + } else if (detected_block_size != 0 && block_size != detected_block_size) { + SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " + "auto-detected block size %" PRIu32 "\n", + block_size, detected_block_size); + } + } + + if (block_size < 512) { + SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); + goto error_return; + } + + if (!spdk_u32_is_pow2(block_size)) { + SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); + goto error_return; + } + + uring->bdev.blocklen = block_size; + uring->bdev.required_alignment = spdk_u32log2(block_size); + + if (bdev_size % uring->bdev.blocklen != 0) { + SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", + bdev_size, uring->bdev.blocklen); + goto error_return; + } + + uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; + uring->bdev.ctxt = uring; + + uring->bdev.fn_table = &uring_fn_table; + + spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, + sizeof(struct bdev_uring_io_channel), + uring->bdev.name); + rc = spdk_bdev_register(&uring->bdev); + if (rc) { + spdk_io_device_unregister(uring, NULL); + goto error_return; + } + + TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); + return &uring->bdev; + +error_return: + bdev_uring_close(uring); + uring_free_bdev(uring); + return NULL; +} + +struct delete_uring_bdev_ctx { + spdk_delete_uring_complete cb_fn; + void *cb_arg; +}; + +static void +uring_bdev_unregister_cb(void *arg, int bdeverrno) +{ + struct delete_uring_bdev_ctx *ctx = arg; + + ctx->cb_fn(ctx->cb_arg, bdeverrno); + free(ctx); +} + +void +delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg) +{ + struct delete_uring_bdev_ctx *ctx; + + if (!bdev || bdev->module != &uring_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx); +} + +static int +bdev_uring_init(void) +{ + size_t i; + struct spdk_conf_section *sp; + struct spdk_bdev *bdev; + + TAILQ_INIT(&g_uring_bdev_head); + spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, + sizeof(struct bdev_uring_group_channel), + "uring_module"); + + sp = spdk_conf_find_section(NULL, "URING"); + if (!sp) { + return 0; + } + + i = 0; + while (true) { + const char *file; + const char *name; + const char *block_size_str; + uint32_t block_size = 0; + long int 
tmp; + + file = spdk_conf_section_get_nmval(sp, "URING", i, 0); + if (!file) { + break; + } + + name = spdk_conf_section_get_nmval(sp, "URING", i, 1); + if (!name) { + SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file); + i++; + continue; + } + + block_size_str = spdk_conf_section_get_nmval(sp, "URING", i, 2); + if (block_size_str) { + tmp = spdk_strtol(block_size_str, 10); + if (tmp < 0) { + SPDK_ERRLOG("Invalid block size for URING bdev with file %s\n", file); + i++; + continue; + } + block_size = (uint32_t)tmp; + } + + bdev = create_uring_bdev(name, file, block_size); + if (!bdev) { + SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file); + i++; + continue; + } + + i++; + } + + return 0; +} + +static void +bdev_uring_fini(void) +{ + spdk_io_device_unregister(&uring_if, NULL); +} + +static void +bdev_uring_get_spdk_running_config(FILE *fp) +{ + char *file; + char *name; + uint32_t block_size; + struct bdev_uring *uring; + + fprintf(fp, + "\n" + "# Users must change this section to match the /dev/sdX devices to be\n" + "# exported as iSCSI LUNs. The devices are accessed using io_uring.\n" + "# The format is:\n" + "# URING <file name> <bdev name> [<block size>]\n" + "# The file name is the backing device\n" + "# The bdev name can be referenced from elsewhere in the configuration file.\n" + "# Block size may be omitted to automatically detect the block size of a bdev.\n" + "[URING]\n"); + + TAILQ_FOREACH(uring, &g_uring_bdev_head, link) { + file = uring->filename; + name = uring->bdev.name; + block_size = uring->bdev.blocklen; + fprintf(fp, " URING %s %s %d\n", file, name, block_size); + } + fprintf(fp, "\n"); +} + +SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING) diff --git a/src/spdk/module/bdev/uring/bdev_uring.h b/src/spdk/module/bdev/uring/bdev_uring.h new file mode 100644 index 000000000..a35681832 --- /dev/null +++ b/src/spdk/module/bdev/uring/bdev_uring.h @@ -0,0 +1,50 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_BDEV_URING_H +#define SPDK_BDEV_URING_H + +#include "spdk/stdinc.h" + +#include "spdk/queue.h" +#include "spdk/bdev.h" + +#include "spdk/bdev_module.h" + +typedef void (*spdk_delete_uring_complete)(void *cb_arg, int bdeverrno); + +struct spdk_bdev *create_uring_bdev(const char *name, const char *filename, uint32_t block_size); + +void delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg); + +#endif /* SPDK_BDEV_URING_H */ diff --git a/src/spdk/module/bdev/uring/bdev_uring_rpc.c b/src/spdk/module/bdev/uring/bdev_uring_rpc.c new file mode 100644 index 000000000..e65751002 --- /dev/null +++ b/src/spdk/module/bdev/uring/bdev_uring_rpc.c @@ -0,0 +1,150 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bdev_uring.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +/* Structure to hold the parameters for this RPC method. */ +struct rpc_create_uring { + char *name; + char *filename; + uint32_t block_size; +}; + +/* Free the allocated memory resource after the RPC handling. */ +static void +free_rpc_create_uring(struct rpc_create_uring *r) +{ + free(r->name); + free(r->filename); +} + +/* Structure to decode the input parameters for this RPC method. */ +static const struct spdk_json_object_decoder rpc_create_uring_decoders[] = { + {"name", offsetof(struct rpc_create_uring, name), spdk_json_decode_string}, + {"filename", offsetof(struct rpc_create_uring, filename), spdk_json_decode_string}, + {"block_size", offsetof(struct rpc_create_uring, block_size), spdk_json_decode_uint32, true}, +}; + +/* Decode the parameters for this RPC method and properly create the uring + * device. Error status returned in the failed cases. 
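+ *
+ * Example request (for illustration only; the backing device path is
+ * hypothetical):
+ *
+ *   {
+ *     "jsonrpc": "2.0",
+ *     "id": 1,
+ *     "method": "bdev_uring_create",
+ *     "params": { "name": "uring0", "filename": "/dev/nvme0n1", "block_size": 512 }
+ *   }
+ *
+ * "block_size" is optional and is auto-detected by create_uring_bdev() when
+ * omitted. On success the result is the name of the created bdev ("uring0").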
+ */ +static void +rpc_bdev_uring_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_create_uring req = {}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_create_uring_decoders, + SPDK_COUNTOF(rpc_create_uring_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = create_uring_bdev(req.name, req.filename, req.block_size); + if (!bdev) { + SPDK_ERRLOG("Unable to create URING bdev from file %s\n", req.filename); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Unable to create URING bdev."); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, req.name); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_create_uring(&req); +} +SPDK_RPC_REGISTER("bdev_uring_create", rpc_bdev_uring_create, SPDK_RPC_RUNTIME) + +struct rpc_delete_uring { + char *name; +}; + +static void +free_rpc_delete_uring(struct rpc_delete_uring *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_uring_decoders[] = { + {"name", offsetof(struct rpc_delete_uring, name), spdk_json_decode_string}, +}; + +static void +_rpc_bdev_uring_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); + +} + +static void +rpc_bdev_uring_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_uring req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_delete_uring_decoders, + SPDK_COUNTOF(rpc_delete_uring_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + delete_uring_bdev(bdev, _rpc_bdev_uring_delete_cb, request); + +cleanup: + free_rpc_delete_uring(&req); +} +SPDK_RPC_REGISTER("bdev_uring_delete", rpc_bdev_uring_delete, SPDK_RPC_RUNTIME) diff --git a/src/spdk/module/bdev/virtio/Makefile b/src/spdk/module/bdev/virtio/Makefile new file mode 100644 index 000000000..602927afe --- /dev/null +++ b/src/spdk/module/bdev/virtio/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = bdev_virtio_scsi.c bdev_virtio_blk.c bdev_virtio_rpc.c +LIBNAME = bdev_virtio + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/virtio/bdev_virtio.h b/src/spdk/module/bdev/virtio/bdev_virtio.h new file mode 100644 index 000000000..538fab8f6 --- /dev/null +++ b/src/spdk/module/bdev/virtio/bdev_virtio.h @@ -0,0 +1,164 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_VIRTIO_H +#define SPDK_BDEV_VIRTIO_H + +#include "spdk/bdev.h" +#include "spdk/env.h" + +/** + * Callback for creating virtio bdevs. + * + * \param ctx opaque context set by the user + * \param errnum error code. 0 on success, negative errno on error. + * \param bdevs contiguous array of created bdevs + * \param bdev_cnt number of bdevs in the `bdevs` array + */ +typedef void (*bdev_virtio_create_cb)(void *ctx, int errnum, + struct spdk_bdev **bdevs, size_t bdev_cnt); + +/** + * Callback for removing virtio devices. 
+ * + * \param ctx opaque context set by the user + * \param errnum error code. 0 on success, negative errno on error. + */ +typedef void (*bdev_virtio_remove_cb)(void *ctx, int errnum); + +/** + * Connect to a vhost-user Unix domain socket and create a Virtio SCSI device. + * If the connection is successful, the device will be automatically scanned. + * The scan consists of probing the targets on the device and will result in + * creating possibly multiple Virtio SCSI bdevs - one for each target. Currently + * only one LUN per target is detected - LUN0. Note that the bdev creation is + * run asynchronously in the background. After it's finished, the `cb_fn` + * callback is called. + * + * \param name name for the virtio device. It will be inherited by all created + * bdevs, which are named in the following format: <name>t<target_id> + * \param path path to the socket + * \param num_queues max number of request virtqueues to use. `vdev` will be + * started successfully even if the host device supports fewer queues than requested. + * \param queue_size depth of each queue + * \param cb_fn function to be called after scanning all targets on the virtio + * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. + * \param cb_arg argument for the `cb_fn` + * \return zero on success (device scan is started) or negative error code. + * In case of error the \c cb_fn is not called. + */ +int bdev_virtio_user_scsi_dev_create(const char *name, const char *path, + unsigned num_queues, unsigned queue_size, + bdev_virtio_create_cb cb_fn, void *cb_arg); + +/** + * Attach virtio-pci device. This creates a Virtio SCSI device with the same + * capabilities as the vhost-user equivalent. The device will be automatically + * scanned for exposed SCSI targets. This will result in creating possibly multiple + * Virtio SCSI bdevs - one for each target. Currently only one LUN per target is + * detected - LUN0. Note that the bdev creation is run asynchronously in the + * background. After it's finished, the `cb_fn` callback is called. + * + * \param name name for the virtio device. It will be inherited by all created + * bdevs, which are named in the following format: <name>t<target_id> + * \param pci_addr PCI address of the device to attach + * \param cb_fn function to be called after scanning all targets on the virtio + * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. + * \param cb_arg argument for the `cb_fn` + * \return zero on success (device scan is started) or negative error code. + * In case of error the \c cb_fn is not called. + */ +int bdev_virtio_pci_scsi_dev_create(const char *name, struct spdk_pci_addr *pci_addr, + bdev_virtio_create_cb cb_fn, void *cb_arg); + +/** + * Remove a Virtio device with given name. This will destroy all bdevs exposed + * by this device. + * + * \param name virtio device name + * \param cb_fn function to be called after the device has been removed. + * It's optional, can be NULL. See \c bdev_virtio_remove_cb. Possible + * error codes are: + * * ENODEV - couldn't find device with given name + * * EBUSY - device is already being removed + * \param cb_arg argument for the `cb_fn` + * \return zero on success or -ENODEV if scsi dev does not exist + */ +int bdev_virtio_scsi_dev_remove(const char *name, + bdev_virtio_remove_cb cb_fn, void *cb_arg); + +/** + * Remove a Virtio blk device with the given name. + * + * \param name virtio blk device name + * \param cb_fn function to be called after removing bdev + * \param cb_arg argument for the `cb_fn` + * \return zero on success, or -ENODEV if no bdev with 'name' exists or the + * bdev named 'name' is not a virtio blk device. + */ +int bdev_virtio_blk_dev_remove(const char *name, + bdev_virtio_remove_cb cb_fn, void *cb_arg); + +/** + * List all created Virtio-SCSI devices. + * + * \param write_ctx JSON context to write into + */ +void bdev_virtio_scsi_dev_list(struct spdk_json_write_ctx *write_ctx); + +/** + * Connect to a vhost-user Unix domain socket and create a Virtio BLK bdev. + * + * \param name name for the virtio bdev + * \param path path to the socket + * \param num_queues max number of request virtqueues to use. `vdev` will be + * started successfully even if the host device supports fewer queues than requested. + * \param queue_size depth of each queue + * \return virtio-blk bdev or NULL + */ +struct spdk_bdev *bdev_virtio_user_blk_dev_create(const char *name, const char *path, + unsigned num_queues, unsigned queue_size); + +/** + * Attach virtio-pci device. This creates a Virtio BLK device with the same + * capabilities as the vhost-user equivalent. + * + * \param name name for the virtio device; the single created bdev inherits + * this name + * \param pci_addr PCI address of the device to attach + * \return virtio-blk bdev or NULL + */ +struct spdk_bdev *bdev_virtio_pci_blk_dev_create(const char *name, + struct spdk_pci_addr *pci_addr); + +#endif /* SPDK_BDEV_VIRTIO_H */ diff --git a/src/spdk/module/bdev/virtio/bdev_virtio_blk.c b/src/spdk/module/bdev/virtio/bdev_virtio_blk.c new file mode 100644 index 000000000..99653e238 --- /dev/null +++ b/src/spdk/module/bdev/virtio/bdev_virtio_blk.c @@ -0,0 +1,756 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/json.h" + +#include "spdk_internal/assert.h" +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk_internal/virtio.h" +#include "spdk_internal/vhost_user.h" + +#include <linux/virtio_blk.h> + +#include "bdev_virtio.h" + +struct virtio_blk_dev { + struct virtio_dev vdev; + struct spdk_bdev bdev; + bool readonly; + bool unmap; +}; + +struct virtio_blk_io_ctx { + struct iovec iov_req; + struct iovec iov_resp; + struct iovec iov_unmap; + struct virtio_blk_outhdr req; + struct virtio_blk_discard_write_zeroes unmap; + uint8_t resp; +}; + +struct bdev_virtio_blk_io_channel { + struct virtio_dev *vdev; + + /** Virtqueue exclusively assigned to this channel. */ + struct virtqueue *vq; + + /** Virtio response poller. */ + struct spdk_poller *poller; +}; + +/* Features desired/implemented by this driver. */ +#define VIRTIO_BLK_DEV_SUPPORTED_FEATURES \ + (1ULL << VIRTIO_BLK_F_BLK_SIZE | \ + 1ULL << VIRTIO_BLK_F_TOPOLOGY | \ + 1ULL << VIRTIO_BLK_F_MQ | \ + 1ULL << VIRTIO_BLK_F_RO | \ + 1ULL << VIRTIO_BLK_F_DISCARD | \ + 1ULL << VIRTIO_RING_F_EVENT_IDX | \ + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES) + +static int bdev_virtio_initialize(void); +static int bdev_virtio_blk_get_ctx_size(void); + +static struct spdk_bdev_module virtio_blk_if = { + .name = "virtio_blk", + .module_init = bdev_virtio_initialize, + .get_ctx_size = bdev_virtio_blk_get_ctx_size, +}; + +SPDK_BDEV_MODULE_REGISTER(virtio_blk, &virtio_blk_if) + +static int bdev_virtio_blk_ch_create_cb(void *io_device, void *ctx_buf); +static void bdev_virtio_blk_ch_destroy_cb(void *io_device, void *ctx_buf); + +static struct virtio_blk_io_ctx * +bdev_virtio_blk_init_io_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_blk_outhdr *req; + uint8_t *resp; + struct virtio_blk_discard_write_zeroes *desc; + + struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; + + req = &io_ctx->req; + resp = &io_ctx->resp; + desc = &io_ctx->unmap; + + io_ctx->iov_req.iov_base = req; + io_ctx->iov_req.iov_len = sizeof(*req); + + io_ctx->iov_resp.iov_base = resp; + io_ctx->iov_resp.iov_len = sizeof(*resp); + + io_ctx->iov_unmap.iov_base = desc; + io_ctx->iov_unmap.iov_len = sizeof(*desc); + + memset(req, 0, sizeof(*req)); + return io_ctx; +} + +static void +bdev_virtio_blk_send_io(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct bdev_virtio_blk_io_channel *virtio_channel = spdk_io_channel_get_ctx(ch); + struct virtqueue *vq = virtio_channel->vq; + struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; + int rc; + + rc = virtqueue_req_start(vq, bdev_io, bdev_io->u.bdev.iovcnt + 2); + if (rc == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + return; + } else if (rc != 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); + if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) { + virtqueue_req_add_iovs(vq, &io_ctx->iov_unmap, 1, SPDK_VIRTIO_DESC_RO); + } else { + virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->type == SPDK_BDEV_IO_TYPE_READ ? 
+ SPDK_VIRTIO_DESC_WR : SPDK_VIRTIO_DESC_RO); + } + virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); + + virtqueue_req_flush(vq); +} + +static void +bdev_virtio_command(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_blk_io_ctx *io_ctx = bdev_virtio_blk_init_io_vreq(ch, bdev_io); + struct virtio_blk_outhdr *req = &io_ctx->req; + struct virtio_blk_discard_write_zeroes *desc = &io_ctx->unmap; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + req->type = VIRTIO_BLK_T_IN; + } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + req->type = VIRTIO_BLK_T_OUT; + } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) { + req->type = VIRTIO_BLK_T_DISCARD; + desc->sector = bdev_io->u.bdev.offset_blocks * + spdk_bdev_get_block_size(bdev_io->bdev) / 512; + desc->num_sectors = bdev_io->u.bdev.num_blocks * + spdk_bdev_get_block_size(bdev_io->bdev) / 512; + desc->flags = 0; + } + + req->sector = bdev_io->u.bdev.offset_blocks * + spdk_bdev_get_block_size(bdev_io->bdev) / 512; + + bdev_virtio_blk_send_io(ch, bdev_io); +} + +static void +bdev_virtio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + bdev_virtio_command(ch, bdev_io); +} + +static int +_bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_blk_dev *bvdev = bdev_io->bdev->ctxt; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_virtio_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + case SPDK_BDEV_IO_TYPE_WRITE: + if (bvdev->readonly) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } else { + bdev_virtio_command(ch, bdev_io); + } + return 0; + case SPDK_BDEV_IO_TYPE_RESET: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; + case SPDK_BDEV_IO_TYPE_UNMAP: + if (bvdev->unmap) { + bdev_virtio_command(ch, bdev_io); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + return 0; + case SPDK_BDEV_IO_TYPE_FLUSH: + default: + return -1; + } + + SPDK_UNREACHABLE(); +} + +static void +bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_virtio_submit_request(ch, bdev_io) < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_virtio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct virtio_blk_dev *bvdev = ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + case SPDK_BDEV_IO_TYPE_WRITE: + return !bvdev->readonly; + case SPDK_BDEV_IO_TYPE_UNMAP: + return bvdev->unmap; + case SPDK_BDEV_IO_TYPE_FLUSH: + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_virtio_get_io_channel(void *ctx) +{ + struct virtio_blk_dev *bvdev = ctx; + + return spdk_get_io_channel(bvdev); +} + +static void +virtio_blk_dev_unregister_cb(void *io_device) +{ + struct virtio_blk_dev *bvdev = io_device; + struct virtio_dev *vdev = &bvdev->vdev; + + virtio_dev_stop(vdev); + virtio_dev_destruct(vdev); + spdk_bdev_destruct_done(&bvdev->bdev, 0); + free(bvdev); +} + +static int +bdev_virtio_disk_destruct(void *ctx) +{ + struct virtio_blk_dev *bvdev = ctx; + + spdk_io_device_unregister(bvdev, virtio_blk_dev_unregister_cb); + return 1; +} + +int +bdev_virtio_blk_dev_remove(const char *name, bdev_virtio_remove_cb cb_fn, void 
*cb_arg) +{ + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(name); + if (bdev == NULL) { + return -ENODEV; + } + + if (bdev->module != &virtio_blk_if) { + return -ENODEV; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); + + return 0; +} + +static int +bdev_virtio_dump_json_config(void *ctx, struct spdk_json_write_ctx *w) +{ + struct virtio_blk_dev *bvdev = ctx; + + virtio_dev_dump_json_info(&bvdev->vdev, w); + return 0; +} + +static void +bdev_virtio_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct virtio_blk_dev *bvdev = bdev->ctxt; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_virtio_attach_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bvdev->vdev.name); + spdk_json_write_named_string(w, "dev_type", "blk"); + + /* Write transport specific parameters. */ + bvdev->vdev.backend_ops->write_json_config(&bvdev->vdev, w); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table virtio_fn_table = { + .destruct = bdev_virtio_disk_destruct, + .submit_request = bdev_virtio_submit_request, + .io_type_supported = bdev_virtio_io_type_supported, + .get_io_channel = bdev_virtio_get_io_channel, + .dump_info_json = bdev_virtio_dump_json_config, + .write_config_json = bdev_virtio_write_config_json, +}; + +static void +bdev_virtio_io_cpl(struct spdk_bdev_io *bdev_io) +{ + struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; + + spdk_bdev_io_complete(bdev_io, io_ctx->resp == VIRTIO_BLK_S_OK ? + SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); +} + +static int +bdev_virtio_poll(void *arg) +{ + struct bdev_virtio_blk_io_channel *ch = arg; + void *io[32]; + uint32_t io_len[32]; + uint16_t i, cnt; + + cnt = virtio_recv_pkts(ch->vq, io, io_len, SPDK_COUNTOF(io)); + for (i = 0; i < cnt; ++i) { + bdev_virtio_io_cpl(io[i]); + } + + return cnt; +} + +static int +bdev_virtio_blk_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct virtio_blk_dev *bvdev = io_device; + struct virtio_dev *vdev = &bvdev->vdev; + struct bdev_virtio_blk_io_channel *ch = ctx_buf; + struct virtqueue *vq; + int32_t queue_idx; + + queue_idx = virtio_dev_find_and_acquire_queue(vdev, 0); + if (queue_idx < 0) { + SPDK_ERRLOG("Couldn't get an unused queue for the io_channel.\n"); + return -1; + } + + vq = vdev->vqs[queue_idx]; + + ch->vdev = vdev; + ch->vq = vq; + + ch->poller = SPDK_POLLER_REGISTER(bdev_virtio_poll, ch, 0); + return 0; +} + +static void +bdev_virtio_blk_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct virtio_blk_dev *bvdev = io_device; + struct virtio_dev *vdev = &bvdev->vdev; + struct bdev_virtio_blk_io_channel *ch = ctx_buf; + struct virtqueue *vq = ch->vq; + + spdk_poller_unregister(&ch->poller); + virtio_dev_release_queue(vdev, vq->vq_queue_index); +} + +static int +virtio_blk_dev_init(struct virtio_blk_dev *bvdev, uint16_t max_queues) +{ + struct virtio_dev *vdev = &bvdev->vdev; + struct spdk_bdev *bdev = &bvdev->bdev; + uint64_t capacity, num_blocks; + uint32_t block_size; + uint16_t host_max_queues; + int rc; + + if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE)) { + rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, blk_size), + &block_size, sizeof(block_size)); + if (rc) { + SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); + return rc; + } + + if (block_size == 0 || block_size % 512 != 0) { + 
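
A worked example, with made-up numbers, of the config-space rules virtio_blk_dev_init() applies around this point:

/*
 * blk_size must be a non-zero multiple of 512; capacity is reported in
 * 512-byte sectors regardless of blk_size.  E.g. blk_size = 4096 and
 * capacity = 1000005 sectors:
 *
 *   num_blocks = 1000005 * 512 / 4096 = 125000      (integer division)
 *   (1000005 * 512) % 4096 = 2560 != 0  ->  size is rounded down and a
 *                                           warning is logged, as coded below
 */
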
SPDK_ERRLOG("%s: invalid block size (%"PRIu32"). Must be " + "a multiple of 512.\n", vdev->name, block_size); + return -EIO; + } + } else { + block_size = 512; + } + + rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, capacity), + &capacity, sizeof(capacity)); + if (rc) { + SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); + return rc; + } + + /* `capacity` is a number of 512-byte sectors. */ + num_blocks = capacity * 512 / block_size; + if (num_blocks == 0) { + SPDK_ERRLOG("%s: size too small (size: %"PRIu64", blocksize: %"PRIu32").\n", + vdev->name, capacity * 512, block_size); + return -EIO; + } + + if ((capacity * 512) % block_size != 0) { + SPDK_WARNLOG("%s: size has been rounded down to the nearest block size boundary. " + "(block size: %"PRIu32", previous size: %"PRIu64", new size: %"PRIu64")\n", + vdev->name, block_size, capacity * 512, num_blocks * block_size); + } + + if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_MQ)) { + rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, num_queues), + &host_max_queues, sizeof(host_max_queues)); + if (rc) { + SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); + return rc; + } + } else { + host_max_queues = 1; + } + + if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_RO)) { + bvdev->readonly = true; + } + + if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { + bvdev->unmap = true; + } + + if (max_queues == 0) { + SPDK_ERRLOG("%s: requested 0 request queues (%"PRIu16" available).\n", + vdev->name, host_max_queues); + return -EINVAL; + } + + if (max_queues > host_max_queues) { + SPDK_WARNLOG("%s: requested %"PRIu16" request queues " + "but only %"PRIu16" available.\n", + vdev->name, max_queues, host_max_queues); + max_queues = host_max_queues; + } + + /* bdev is tied with the virtio device; we can reuse the name */ + bdev->name = vdev->name; + rc = virtio_dev_start(vdev, max_queues, 0); + if (rc != 0) { + return rc; + } + + bdev->product_name = "VirtioBlk Disk"; + bdev->write_cache = 0; + bdev->blocklen = block_size; + bdev->blockcnt = num_blocks; + + bdev->ctxt = bvdev; + bdev->fn_table = &virtio_fn_table; + bdev->module = &virtio_blk_if; + + spdk_io_device_register(bvdev, bdev_virtio_blk_ch_create_cb, + bdev_virtio_blk_ch_destroy_cb, + sizeof(struct bdev_virtio_blk_io_channel), + vdev->name); + + rc = spdk_bdev_register(bdev); + if (rc) { + SPDK_ERRLOG("Failed to register bdev name=%s\n", bdev->name); + spdk_io_device_unregister(bvdev, NULL); + virtio_dev_stop(vdev); + return rc; + } + + return 0; +} + +static struct virtio_blk_dev * +virtio_pci_blk_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx) +{ + static int pci_dev_counter = 0; + struct virtio_blk_dev *bvdev; + struct virtio_dev *vdev; + char *default_name = NULL; + uint16_t num_queues; + int rc; + + bvdev = calloc(1, sizeof(*bvdev)); + if (bvdev == NULL) { + SPDK_ERRLOG("virtio device calloc failed\n"); + return NULL; + } + vdev = &bvdev->vdev; + + if (name == NULL) { + default_name = spdk_sprintf_alloc("VirtioBlk%"PRIu32, pci_dev_counter++); + if (default_name == NULL) { + free(vdev); + return NULL; + } + name = default_name; + } + + rc = virtio_pci_dev_init(vdev, name, pci_ctx); + free(default_name); + + if (rc != 0) { + free(bvdev); + return NULL; + } + + rc = virtio_dev_reset(vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES); + if (rc != 0) { + virtio_dev_destruct(vdev); + free(bvdev); + return NULL; + } + + /* TODO: add a way to limit usable virtqueues */ + if 
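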
(virtio_dev_has_feature(vdev, VIRTIO_BLK_F_MQ)) {
+		rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, num_queues),
+						&num_queues, sizeof(num_queues));
+		if (rc) {
+			SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc));
+			virtio_dev_destruct(vdev);
+			free(bvdev);
+			return NULL;
+		}
+	} else {
+		num_queues = 1;
+	}
+
+	rc = virtio_blk_dev_init(bvdev, num_queues);
+	if (rc != 0) {
+		virtio_dev_destruct(vdev);
+		free(bvdev);
+		return NULL;
+	}
+
+	return bvdev;
+}
+
+static struct virtio_blk_dev *
+virtio_user_blk_dev_create(const char *name, const char *path,
+			   uint16_t num_queues, uint32_t queue_size)
+{
+	struct virtio_blk_dev *bvdev;
+	int rc;
+
+	bvdev = calloc(1, sizeof(*bvdev));
+	if (bvdev == NULL) {
+		SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path);
+		return NULL;
+	}
+
+	rc = virtio_user_dev_init(&bvdev->vdev, name, path, queue_size);
+	if (rc != 0) {
+		SPDK_ERRLOG("Failed to create virtio device %s: %s\n", name, path);
+		free(bvdev);
+		return NULL;
+	}
+
+	rc = virtio_dev_reset(&bvdev->vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES);
+	if (rc != 0) {
+		virtio_dev_destruct(&bvdev->vdev);
+		free(bvdev);
+		return NULL;
+	}
+
+	rc = virtio_blk_dev_init(bvdev, num_queues);
+	if (rc != 0) {
+		virtio_dev_destruct(&bvdev->vdev);
+		free(bvdev);
+		return NULL;
+	}
+
+	return bvdev;
+}
+
+struct bdev_virtio_pci_dev_create_ctx {
+	const char *name;
+	struct virtio_blk_dev *ret;
+};
+
+static int
+bdev_virtio_pci_blk_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx)
+{
+	struct bdev_virtio_pci_dev_create_ctx *create_ctx = ctx;
+
+	create_ctx->ret = virtio_pci_blk_dev_create(create_ctx->name, pci_ctx);
+	if (create_ctx->ret == NULL) {
+		return -1;
+	}
+
+	return 0;
+}
+
+struct spdk_bdev *
+bdev_virtio_pci_blk_dev_create(const char *name, struct spdk_pci_addr *pci_addr)
+{
+	struct bdev_virtio_pci_dev_create_ctx create_ctx;
+
+	create_ctx.name = name;
+	create_ctx.ret = NULL;
+
+	virtio_pci_dev_attach(bdev_virtio_pci_blk_dev_create_cb, &create_ctx,
+			      PCI_DEVICE_ID_VIRTIO_BLK_MODERN, pci_addr);
+
+	if (create_ctx.ret == NULL) {
+		return NULL;
+	}
+
+	return &create_ctx.ret->bdev;
+}
+
+static int
+virtio_pci_blk_dev_enumerate_cb(struct virtio_pci_ctx *pci_ctx, void *ctx)
+{
+	struct virtio_blk_dev *bvdev;
+
+	bvdev = virtio_pci_blk_dev_create(NULL, pci_ctx);
+	return bvdev == NULL ?
-1 : 0; +} + +static int +bdev_virtio_initialize(void) +{ + struct spdk_conf_section *sp; + struct virtio_blk_dev *bvdev; + char *default_name = NULL; + char *path, *type, *name; + unsigned vdev_num; + int num_queues; + bool enable_pci; + int rc = 0; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VirtioUser")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VirtioUser%u", &vdev_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + path = spdk_conf_section_get_val(sp, "Path"); + if (path == NULL) { + SPDK_ERRLOG("VirtioUserBlk%u: missing Path\n", vdev_num); + return -1; + } + + type = spdk_conf_section_get_val(sp, "Type"); + if (type == NULL || strcmp(type, "Blk") != 0) { + continue; + } + + num_queues = spdk_conf_section_get_intval(sp, "Queues"); + if (num_queues < 1) { + num_queues = 1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + default_name = spdk_sprintf_alloc("VirtioBlk%u", vdev_num); + name = default_name; + } + + bvdev = virtio_user_blk_dev_create(name, path, num_queues, 512); + free(default_name); + default_name = NULL; + + if (bvdev == NULL) { + return -1; + } + } + + sp = spdk_conf_find_section(NULL, "VirtioPci"); + if (sp == NULL) { + return 0; + } + + enable_pci = spdk_conf_section_get_boolval(sp, "Enable", false); + if (enable_pci) { + rc = virtio_pci_dev_enumerate(virtio_pci_blk_dev_enumerate_cb, NULL, + PCI_DEVICE_ID_VIRTIO_BLK_MODERN); + } + + return rc; +} + +struct spdk_bdev * +bdev_virtio_user_blk_dev_create(const char *name, const char *path, + unsigned num_queues, unsigned queue_size) +{ + struct virtio_blk_dev *bvdev; + + bvdev = virtio_user_blk_dev_create(name, path, num_queues, queue_size); + if (bvdev == NULL) { + return NULL; + } + + return &bvdev->bdev; +} + +static int +bdev_virtio_blk_get_ctx_size(void) +{ + return sizeof(struct virtio_blk_io_ctx); +} + +SPDK_LOG_REGISTER_COMPONENT("virtio_blk", SPDK_LOG_VIRTIO_BLK) diff --git a/src/spdk/module/bdev/virtio/bdev_virtio_rpc.c b/src/spdk/module/bdev/virtio/bdev_virtio_rpc.c new file mode 100644 index 000000000..3c3c276eb --- /dev/null +++ b/src/spdk/module/bdev/virtio/bdev_virtio_rpc.c @@ -0,0 +1,264 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
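
For reference, a hypothetical snippet of the legacy INI configuration that the bdev_virtio_initialize() parser above accepts; the section and key names come from the code, the values are examples only:

# Hypothetical config for the legacy parser above; values are made up.
[VirtioUser0]
  Path /tmp/vhost.0      # required - a missing Path fails initialization
  Type Blk               # this module only handles sections with Type Blk
  Queues 4               # values < 1 fall back to 1
  Name VirtioBlk0        # optional - defaults to VirtioBlk<section number>

[VirtioPci]
  Enable Yes             # enumerate modern virtio-blk PCI devices
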
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" + +#include "bdev_virtio.h" + +#define SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT 1 +#define SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE 512 + +struct rpc_remove_virtio_dev { + char *name; +}; + +static const struct spdk_json_object_decoder rpc_remove_virtio_dev[] = { + {"name", offsetof(struct rpc_remove_virtio_dev, name), spdk_json_decode_string }, +}; + +static void +rpc_bdev_virtio_detach_controller_cb(void *ctx, int errnum) +{ + struct spdk_jsonrpc_request *request = ctx; + struct spdk_json_write_ctx *w; + + if (errnum != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-errnum)); + return; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_bdev_virtio_detach_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_virtio_dev req = {NULL}; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_remove_virtio_dev, + SPDK_COUNTOF(rpc_remove_virtio_dev), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + rc = bdev_virtio_blk_dev_remove(req.name, rpc_bdev_virtio_detach_controller_cb, request); + if (rc == -ENODEV) { + rc = bdev_virtio_scsi_dev_remove(req.name, rpc_bdev_virtio_detach_controller_cb, request); + } + + if (rc != 0) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + } + +cleanup: + free(req.name); +} +SPDK_RPC_REGISTER("bdev_virtio_detach_controller", + rpc_bdev_virtio_detach_controller, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_virtio_detach_controller, remove_virtio_bdev) + +static void +rpc_bdev_virtio_scsi_get_devices(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "bdev_virtio_scsi_get_devices requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + bdev_virtio_scsi_dev_list(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("bdev_virtio_scsi_get_devices", + rpc_bdev_virtio_scsi_get_devices, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_virtio_scsi_get_devices, get_virtio_scsi_devs) + +struct rpc_bdev_virtio_attach_controller_ctx { + char *name; + char *trtype; + char *traddr; + char *dev_type; + uint32_t vq_count; + uint32_t vq_size; + struct spdk_jsonrpc_request *request; +}; + +static const struct spdk_json_object_decoder rpc_bdev_virtio_attach_controller_ctx[] = { + {"name", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, name), spdk_json_decode_string }, + {"trtype", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, trtype), spdk_json_decode_string 
}, + {"traddr", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, traddr), spdk_json_decode_string }, + {"dev_type", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, dev_type), spdk_json_decode_string }, + {"vq_count", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, vq_count), spdk_json_decode_uint32, true }, + {"vq_size", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, vq_size), spdk_json_decode_uint32, true }, +}; + +static void +free_rpc_bdev_virtio_attach_controller_ctx(struct rpc_bdev_virtio_attach_controller_ctx *req) +{ + free(req->name); + free(req->trtype); + free(req->traddr); + free(req->dev_type); + free(req); +} + +static void +rpc_create_virtio_dev_cb(void *ctx, int result, struct spdk_bdev **bdevs, size_t cnt) +{ + struct rpc_bdev_virtio_attach_controller_ctx *req = ctx; + struct spdk_json_write_ctx *w; + size_t i; + + if (result) { + spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-result)); + free_rpc_bdev_virtio_attach_controller_ctx(req); + return; + } + + w = spdk_jsonrpc_begin_result(req->request); + spdk_json_write_array_begin(w); + + for (i = 0; i < cnt; i++) { + spdk_json_write_string(w, spdk_bdev_get_name(bdevs[i])); + } + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(req->request, w); + + free_rpc_bdev_virtio_attach_controller_ctx(ctx); +} + +static void +rpc_bdev_virtio_attach_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_virtio_attach_controller_ctx *req; + struct spdk_bdev *bdev; + struct spdk_pci_addr pci_addr; + bool pci; + int rc; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("calloc() failed\n"); + spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); + return; + } + + if (spdk_json_decode_object(params, rpc_bdev_virtio_attach_controller_ctx, + SPDK_COUNTOF(rpc_bdev_virtio_attach_controller_ctx), + req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + if (strcmp(req->trtype, "pci") == 0) { + if (req->vq_count != 0 || req->vq_size != 0) { + SPDK_ERRLOG("VQ count or size is not allowed for PCI transport type\n"); + spdk_jsonrpc_send_error_response(request, EINVAL, + "vq_count or vq_size is not allowed for PCI transport type."); + goto cleanup; + } + + if (spdk_pci_addr_parse(&pci_addr, req->traddr) != 0) { + SPDK_ERRLOG("Invalid PCI address '%s'\n", req->traddr); + spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid PCI address '%s'", req->traddr); + goto cleanup; + } + + pci = true; + } else if (strcmp(req->trtype, "user") == 0) { + req->vq_count = req->vq_count == 0 ? SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT : req->vq_count; + req->vq_size = req->vq_size == 0 ? SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE : req->vq_size; + pci = false; + } else { + SPDK_ERRLOG("Invalid trtype '%s'\n", req->trtype); + spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid trtype '%s'", req->trtype); + goto cleanup; + } + + req->request = request; + if (strcmp(req->dev_type, "blk") == 0) { + if (pci) { + bdev = bdev_virtio_pci_blk_dev_create(req->name, &pci_addr); + } else { + bdev = bdev_virtio_user_blk_dev_create(req->name, req->traddr, req->vq_count, req->vq_size); + } + + /* Virtio blk doesn't use callback so call it manually to send result. */ + rc = bdev ? 0 : -EINVAL; + rpc_create_virtio_dev_cb(req, rc, &bdev, bdev ? 
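
Matching the decoder table above, a hypothetical JSON-RPC request attaching a vhost-user blk controller; vq_count and vq_size are optional and, as the handler enforces, only legal for trtype "user":

{
  "jsonrpc": "2.0",
  "id": 1,
  "method": "bdev_virtio_attach_controller",
  "params": {
    "name": "VirtioBlk0",
    "trtype": "user",
    "traddr": "/tmp/vhost.0",
    "dev_type": "blk",
    "vq_count": 2,
    "vq_size": 512
  }
}
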
1 : 0); + } else if (strcmp(req->dev_type, "scsi") == 0) { + if (pci) { + rc = bdev_virtio_pci_scsi_dev_create(req->name, &pci_addr, rpc_create_virtio_dev_cb, req); + } else { + rc = bdev_virtio_user_scsi_dev_create(req->name, req->traddr, req->vq_count, req->vq_size, + rpc_create_virtio_dev_cb, req); + } + + if (rc < 0) { + /* In case of error callback is not called so do it manually to send result. */ + rpc_create_virtio_dev_cb(req, rc, NULL, 0); + } + } else { + SPDK_ERRLOG("Invalid dev_type '%s'\n", req->dev_type); + spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid dev_type '%s'", req->dev_type); + goto cleanup; + } + + return; + +cleanup: + free_rpc_bdev_virtio_attach_controller_ctx(req); +} +SPDK_RPC_REGISTER("bdev_virtio_attach_controller", + rpc_bdev_virtio_attach_controller, SPDK_RPC_RUNTIME); +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_virtio_attach_controller, construct_virtio_dev) diff --git a/src/spdk/module/bdev/virtio/bdev_virtio_scsi.c b/src/spdk/module/bdev/virtio/bdev_virtio_scsi.c new file mode 100644 index 000000000..520b8a17d --- /dev/null +++ b/src/spdk/module/bdev/virtio/bdev_virtio_scsi.c @@ -0,0 +1,2036 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/scsi_spec.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/json.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk_internal/virtio.h" +#include "spdk_internal/vhost_user.h" + +#include <linux/virtio_scsi.h> + +#include "bdev_virtio.h" + +#define BDEV_VIRTIO_MAX_TARGET 64 +#define BDEV_VIRTIO_SCAN_PAYLOAD_SIZE 256 +#define MGMT_POLL_PERIOD_US (1000 * 5) +#define CTRLQ_RING_SIZE 16 +#define SCAN_REQUEST_RETRIES 5 + +/* Number of non-request queues - eventq and controlq */ +#define SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED 2 + +#define VIRTIO_SCSI_EVENTQ_BUFFER_COUNT 16 + +#define VIRTIO_SCSI_CONTROLQ 0 +#define VIRTIO_SCSI_EVENTQ 1 +#define VIRTIO_SCSI_REQUESTQ 2 + +static int bdev_virtio_initialize(void); +static void bdev_virtio_finish(void); + +struct virtio_scsi_dev { + /* Generic virtio device data. */ + struct virtio_dev vdev; + + /** Detected SCSI LUNs */ + TAILQ_HEAD(, virtio_scsi_disk) luns; + + /** Context for the SCSI target scan. */ + struct virtio_scsi_scan_base *scan_ctx; + + /** Controlq poller. */ + struct spdk_poller *mgmt_poller; + + /** Controlq messages to be sent. */ + struct spdk_ring *ctrlq_ring; + + /** Buffers for the eventq. */ + struct virtio_scsi_eventq_io *eventq_ios; + + /** Device marked for removal. */ + bool removed; + + /** Callback to be called after vdev removal. */ + bdev_virtio_remove_cb remove_cb; + + /** Context for the `remove_cb`. */ + void *remove_ctx; + + TAILQ_ENTRY(virtio_scsi_dev) tailq; +}; + +struct virtio_scsi_io_ctx { + struct iovec iov_req; + struct iovec iov_resp; + union { + struct virtio_scsi_cmd_req req; + struct virtio_scsi_ctrl_tmf_req tmf_req; + }; + union { + struct virtio_scsi_cmd_resp resp; + struct virtio_scsi_ctrl_tmf_resp tmf_resp; + }; +}; + +struct virtio_scsi_eventq_io { + struct iovec iov; + struct virtio_scsi_event ev; +}; + +struct virtio_scsi_scan_info { + uint64_t num_blocks; + uint32_t block_size; + uint8_t target; + bool unmap_supported; + TAILQ_ENTRY(virtio_scsi_scan_info) tailq; +}; + +struct virtio_scsi_scan_base { + struct virtio_scsi_dev *svdev; + + /** I/O channel used for the scan I/O. */ + struct bdev_virtio_io_channel *channel; + + bdev_virtio_create_cb cb_fn; + void *cb_arg; + + /** Scan all targets on the device. */ + bool full_scan; + + /** Start a full rescan after receiving next scan I/O response. */ + bool restart; + + /** Additional targets to be (re)scanned. */ + TAILQ_HEAD(, virtio_scsi_scan_info) scan_queue; + + /** Remaining attempts for sending the current request. */ + unsigned retries; + + /** If set, the last scan I/O needs to be resent */ + bool needs_resend; + + struct virtio_scsi_io_ctx io_ctx; + struct iovec iov; + uint8_t payload[BDEV_VIRTIO_SCAN_PAYLOAD_SIZE]; + + /** Scan results for the current target. */ + struct virtio_scsi_scan_info info; +}; + +struct virtio_scsi_disk { + struct spdk_bdev bdev; + struct virtio_scsi_dev *svdev; + struct virtio_scsi_scan_info info; + + /** Descriptor opened just to be notified of external bdev hotremove. */ + struct spdk_bdev_desc *notify_desc; + + /** Disk marked for removal. */ + bool removed; + TAILQ_ENTRY(virtio_scsi_disk) link; +}; + +struct bdev_virtio_io_channel { + struct virtio_scsi_dev *svdev; + + /** Virtqueue exclusively assigned to this channel. */ + struct virtqueue *vq; + + /** Virtio response poller. 
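
An editorial summary of the fixed queue indices defined above and how the rest of the file uses them:

/*
 * Fixed virtio-scsi queue layout:
 *
 *   vq 0   VIRTIO_SCSI_CONTROLQ  - TMF requests, drained by the mgmt poller
 *   vq 1   VIRTIO_SCSI_EVENTQ    - hotplug events, buffers posted up front
 *   vq 2+  VIRTIO_SCSI_REQUESTQ  - I/O; each io_channel exclusively owns one
 *
 * virtio_dev_start() is therefore told about SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED
 * (= 2) fixed queues, and channels acquire request queues at index >= 2.
 */
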
*/ + struct spdk_poller *poller; +}; + +static TAILQ_HEAD(, virtio_scsi_dev) g_virtio_scsi_devs = + TAILQ_HEAD_INITIALIZER(g_virtio_scsi_devs); + +static pthread_mutex_t g_virtio_scsi_mutex = PTHREAD_MUTEX_INITIALIZER; + +/** Module finish in progress */ +static bool g_bdev_virtio_finish = false; + +/* Features desired/implemented by this driver. */ +#define VIRTIO_SCSI_DEV_SUPPORTED_FEATURES \ + (1ULL << VIRTIO_SCSI_F_INOUT | \ + 1ULL << VIRTIO_SCSI_F_HOTPLUG | \ + 1ULL << VIRTIO_RING_F_EVENT_IDX | \ + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES) + +static void virtio_scsi_dev_unregister_cb(void *io_device); +static void virtio_scsi_dev_remove(struct virtio_scsi_dev *svdev, + bdev_virtio_remove_cb cb_fn, void *cb_arg); +static int bdev_virtio_scsi_ch_create_cb(void *io_device, void *ctx_buf); +static void bdev_virtio_scsi_ch_destroy_cb(void *io_device, void *ctx_buf); +static void process_scan_resp(struct virtio_scsi_scan_base *base); +static int bdev_virtio_mgmt_poll(void *arg); + +static int +virtio_scsi_dev_send_eventq_io(struct virtqueue *vq, struct virtio_scsi_eventq_io *io) +{ + int rc; + + rc = virtqueue_req_start(vq, io, 1); + if (rc != 0) { + return -1; + } + + virtqueue_req_add_iovs(vq, &io->iov, 1, SPDK_VIRTIO_DESC_WR); + virtqueue_req_flush(vq); + + return 0; +} + +static int +virtio_scsi_dev_init(struct virtio_scsi_dev *svdev, uint16_t max_queues) +{ + struct virtio_dev *vdev = &svdev->vdev; + struct spdk_ring *ctrlq_ring; + struct virtio_scsi_eventq_io *eventq_io; + struct virtqueue *eventq; + uint16_t i, num_events; + int rc; + + rc = virtio_dev_reset(vdev, VIRTIO_SCSI_DEV_SUPPORTED_FEATURES); + if (rc != 0) { + return rc; + } + + rc = virtio_dev_start(vdev, max_queues, SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED); + if (rc != 0) { + return rc; + } + + ctrlq_ring = spdk_ring_create(SPDK_RING_TYPE_MP_SC, CTRLQ_RING_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (ctrlq_ring == NULL) { + SPDK_ERRLOG("Failed to allocate send ring for the controlq.\n"); + return -1; + } + + rc = virtio_dev_acquire_queue(vdev, VIRTIO_SCSI_CONTROLQ); + if (rc != 0) { + SPDK_ERRLOG("Failed to acquire the controlq.\n"); + spdk_ring_free(ctrlq_ring); + return -1; + } + + rc = virtio_dev_acquire_queue(vdev, VIRTIO_SCSI_EVENTQ); + if (rc != 0) { + SPDK_ERRLOG("Failed to acquire the eventq.\n"); + virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); + spdk_ring_free(ctrlq_ring); + return -1; + } + + eventq = vdev->vqs[VIRTIO_SCSI_EVENTQ]; + num_events = spdk_min(eventq->vq_nentries, VIRTIO_SCSI_EVENTQ_BUFFER_COUNT); + svdev->eventq_ios = spdk_zmalloc(sizeof(*svdev->eventq_ios) * num_events, + 0, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); + if (svdev->eventq_ios == NULL) { + SPDK_ERRLOG("cannot allocate memory for %"PRIu16" eventq buffers\n", + num_events); + virtio_dev_release_queue(vdev, VIRTIO_SCSI_EVENTQ); + virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); + spdk_ring_free(ctrlq_ring); + return -1; + } + + for (i = 0; i < num_events; i++) { + eventq_io = &svdev->eventq_ios[i]; + eventq_io->iov.iov_base = &eventq_io->ev; + eventq_io->iov.iov_len = sizeof(eventq_io->ev); + virtio_scsi_dev_send_eventq_io(eventq, eventq_io); + } + + svdev->ctrlq_ring = ctrlq_ring; + + svdev->mgmt_poller = SPDK_POLLER_REGISTER(bdev_virtio_mgmt_poll, svdev, + MGMT_POLL_PERIOD_US); + + TAILQ_INIT(&svdev->luns); + svdev->scan_ctx = NULL; + svdev->removed = false; + svdev->remove_cb = NULL; + svdev->remove_ctx = NULL; + + spdk_io_device_register(svdev, bdev_virtio_scsi_ch_create_cb, + bdev_virtio_scsi_ch_destroy_cb, + sizeof(struct 
bdev_virtio_io_channel),
+				svdev->vdev.name);
+
+	pthread_mutex_lock(&g_virtio_scsi_mutex);
+	TAILQ_INSERT_TAIL(&g_virtio_scsi_devs, svdev, tailq);
+	pthread_mutex_unlock(&g_virtio_scsi_mutex);
+	return 0;
+}
+
+static struct virtio_scsi_dev *
+virtio_pci_scsi_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx)
+{
+	static int pci_dev_counter = 0;
+	struct virtio_scsi_dev *svdev;
+	struct virtio_dev *vdev;
+	char *default_name = NULL;
+	uint32_t num_queues;
+	int rc;
+
+	svdev = calloc(1, sizeof(*svdev));
+	if (svdev == NULL) {
+		SPDK_ERRLOG("virtio device calloc failed\n");
+		return NULL;
+	}
+
+	vdev = &svdev->vdev;
+	if (name == NULL) {
+		default_name = spdk_sprintf_alloc("VirtioScsi%"PRIu32, pci_dev_counter++);
+		if (default_name == NULL) {
+			/* free the device we allocated, not just the embedded vdev */
+			free(svdev);
+			return NULL;
+		}
+		name = default_name;
+	}
+
+	rc = virtio_pci_dev_init(vdev, name, pci_ctx);
+	free(default_name);
+
+	if (rc != 0) {
+		free(svdev);
+		return NULL;
+	}
+
+	rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_scsi_config, num_queues),
+					&num_queues, sizeof(num_queues));
+	if (rc) {
+		SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc));
+		virtio_dev_destruct(vdev);
+		free(svdev);
+		return NULL;
+	}
+
+	rc = virtio_scsi_dev_init(svdev, num_queues);
+	if (rc != 0) {
+		virtio_dev_destruct(vdev);
+		free(svdev);
+		return NULL;
+	}
+
+	return svdev;
+}
+
+static struct virtio_scsi_dev *
+virtio_user_scsi_dev_create(const char *name, const char *path,
+			    uint16_t num_queues, uint32_t queue_size)
+{
+	struct virtio_scsi_dev *svdev;
+	struct virtio_dev *vdev;
+	int rc;
+
+	svdev = calloc(1, sizeof(*svdev));
+	if (svdev == NULL) {
+		SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path);
+		return NULL;
+	}
+
+	vdev = &svdev->vdev;
+	rc = virtio_user_dev_init(vdev, name, path, queue_size);
+	if (rc != 0) {
+		SPDK_ERRLOG("Failed to create virtio device %s: %s\n", name, path);
+		free(svdev);
+		return NULL;
+	}
+
+	rc = virtio_scsi_dev_init(svdev, num_queues);
+	if (rc != 0) {
+		virtio_dev_destruct(vdev);
+		free(svdev);
+		return NULL;
+	}
+
+	return svdev;
+}
+
+static struct virtio_scsi_disk *
+virtio_scsi_dev_get_disk_by_id(struct virtio_scsi_dev *svdev, uint8_t target_id)
+{
+	struct virtio_scsi_disk *disk;
+
+	TAILQ_FOREACH(disk, &svdev->luns, link) {
+		if (disk->info.target == target_id) {
+			return disk;
+		}
+	}
+
+	return NULL;
+}
+
+static int virtio_scsi_dev_scan(struct virtio_scsi_dev *svdev,
+				bdev_virtio_create_cb cb_fn, void *cb_arg);
+static int send_scan_io(struct virtio_scsi_scan_base *base);
+static void _virtio_scsi_dev_scan_tgt(struct virtio_scsi_scan_base *base, uint8_t target);
+static int _virtio_scsi_dev_scan_next(struct virtio_scsi_scan_base *base, int rc);
+static void _virtio_scsi_dev_scan_finish(struct virtio_scsi_scan_base *base, int errnum);
+static int virtio_scsi_dev_scan_tgt(struct virtio_scsi_dev *svdev, uint8_t target);
+
+static int
+bdev_virtio_get_ctx_size(void)
+{
+	return sizeof(struct virtio_scsi_io_ctx);
+}
+
+static int
+bdev_virtio_scsi_config_json(struct spdk_json_write_ctx *w)
+{
+	struct virtio_scsi_dev *svdev;
+
+	pthread_mutex_lock(&g_virtio_scsi_mutex);
+	TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) {
+		spdk_json_write_object_begin(w);
+
+		spdk_json_write_named_string(w, "method", "bdev_virtio_attach_controller");
+
+		spdk_json_write_named_object_begin(w, "params");
+		spdk_json_write_named_string(w, "name", svdev->vdev.name);
+		spdk_json_write_named_string(w, "dev_type", "scsi");
+
+		/* Write transport specific parameters.
*/ + svdev->vdev.backend_ops->write_json_config(&svdev->vdev, w); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + } + pthread_mutex_unlock(&g_virtio_scsi_mutex); + + return 0; +} + + +static struct spdk_bdev_module virtio_scsi_if = { + .name = "virtio_scsi", + .module_init = bdev_virtio_initialize, + .module_fini = bdev_virtio_finish, + .get_ctx_size = bdev_virtio_get_ctx_size, + .config_json = bdev_virtio_scsi_config_json, + .async_init = true, + .async_fini = true, +}; + +SPDK_BDEV_MODULE_REGISTER(virtio_scsi, &virtio_scsi_if) + +static struct virtio_scsi_io_ctx * +bdev_virtio_init_io_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_cmd_req *req; + struct virtio_scsi_cmd_resp *resp; + struct virtio_scsi_disk *disk = (struct virtio_scsi_disk *)bdev_io->bdev; + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + + req = &io_ctx->req; + resp = &io_ctx->resp; + + io_ctx->iov_req.iov_base = req; + io_ctx->iov_req.iov_len = sizeof(*req); + + io_ctx->iov_resp.iov_base = resp; + io_ctx->iov_resp.iov_len = sizeof(*resp); + + memset(req, 0, sizeof(*req)); + req->lun[0] = 1; + req->lun[1] = disk->info.target; + + return io_ctx; +} + +static struct virtio_scsi_io_ctx * +bdev_virtio_init_tmf_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_ctrl_tmf_req *tmf_req; + struct virtio_scsi_ctrl_tmf_resp *tmf_resp; + struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + + tmf_req = &io_ctx->tmf_req; + tmf_resp = &io_ctx->tmf_resp; + + io_ctx->iov_req.iov_base = tmf_req; + io_ctx->iov_req.iov_len = sizeof(*tmf_req); + io_ctx->iov_resp.iov_base = tmf_resp; + io_ctx->iov_resp.iov_len = sizeof(*tmf_resp); + + memset(tmf_req, 0, sizeof(*tmf_req)); + tmf_req->lun[0] = 1; + tmf_req->lun[1] = disk->info.target; + + return io_ctx; +} + +static void +bdev_virtio_send_io(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct bdev_virtio_io_channel *virtio_channel = spdk_io_channel_get_ctx(ch); + struct virtqueue *vq = virtio_channel->vq; + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + int rc; + + rc = virtqueue_req_start(vq, bdev_io, bdev_io->u.bdev.iovcnt + 2); + if (rc == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + return; + } else if (rc != 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); + virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + SPDK_VIRTIO_DESC_WR); + } else { + virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + SPDK_VIRTIO_DESC_RO); + virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); + } + + virtqueue_req_flush(vq); +} + +static void +bdev_virtio_rw(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); + struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_io_vreq(ch, bdev_io); + struct virtio_scsi_cmd_req *req = &io_ctx->req; + bool is_write = bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE; + + if (disk->info.num_blocks > (1ULL << 32)) { + req->cdb[0] = is_write 
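
A hedged note on the 8-byte LUN field initialized above:

/*
 * virtio-scsi single-level addressing puts 1 in lun[0] and the target id
 * in lun[1]; the remaining bytes select the LUN within the target and are
 * left zero here (from the memset), so this driver always talks to LUN 0.
 */
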
? SPDK_SBC_WRITE_16 : SPDK_SBC_READ_16; + to_be64(&req->cdb[2], bdev_io->u.bdev.offset_blocks); + to_be32(&req->cdb[10], bdev_io->u.bdev.num_blocks); + } else { + req->cdb[0] = is_write ? SPDK_SBC_WRITE_10 : SPDK_SBC_READ_10; + to_be32(&req->cdb[2], bdev_io->u.bdev.offset_blocks); + to_be16(&req->cdb[7], bdev_io->u.bdev.num_blocks); + } + + bdev_virtio_send_io(ch, bdev_io); +} + +static void +bdev_virtio_reset(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct bdev_virtio_io_channel *virtio_ch = spdk_io_channel_get_ctx(ch); + struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_tmf_vreq(ch, bdev_io); + struct virtio_scsi_ctrl_tmf_req *tmf_req = &io_ctx->tmf_req; + struct virtio_scsi_dev *svdev = virtio_ch->svdev; + size_t enqueued_count; + + tmf_req->type = VIRTIO_SCSI_T_TMF; + tmf_req->subtype = VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET; + + enqueued_count = spdk_ring_enqueue(svdev->ctrlq_ring, (void **)&bdev_io, 1, NULL); + if (spdk_likely(enqueued_count == 1)) { + return; + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } +} + +static void +bdev_virtio_unmap(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +{ + struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_io_vreq(ch, bdev_io); + struct virtio_scsi_cmd_req *req = &io_ctx->req; + struct spdk_scsi_unmap_bdesc *desc, *first_desc; + uint8_t *buf; + uint64_t offset_blocks, num_blocks; + uint16_t cmd_len; + + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + buf = bdev_io->u.bdev.iovs[0].iov_base; + + offset_blocks = bdev_io->u.bdev.offset_blocks; + num_blocks = bdev_io->u.bdev.num_blocks; + + /* (n-1) * 16-byte descriptors */ + first_desc = desc = (struct spdk_scsi_unmap_bdesc *)&buf[8]; + while (num_blocks > UINT32_MAX) { + to_be64(&desc->lba, offset_blocks); + to_be32(&desc->block_count, UINT32_MAX); + memset(&desc->reserved, 0, sizeof(desc->reserved)); + offset_blocks += UINT32_MAX; + num_blocks -= UINT32_MAX; + desc++; + } + + /* The last descriptor with block_count <= UINT32_MAX */ + to_be64(&desc->lba, offset_blocks); + to_be32(&desc->block_count, num_blocks); + memset(&desc->reserved, 0, sizeof(desc->reserved)); + + /* 8-byte header + n * 16-byte block descriptor */ + cmd_len = 8 + (desc - first_desc + 1) * sizeof(struct spdk_scsi_unmap_bdesc); + + req->cdb[0] = SPDK_SBC_UNMAP; + to_be16(&req->cdb[7], cmd_len); + + /* 8-byte header */ + to_be16(&buf[0], cmd_len - 2); /* total length (excluding the length field) */ + to_be16(&buf[2], cmd_len - 8); /* length of block descriptors */ + memset(&buf[4], 0, 4); /* reserved */ + + bdev_virtio_send_io(ch, bdev_io); +} + +static void +bdev_virtio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + bdev_virtio_rw(ch, bdev_io); +} + +static int _bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_virtio_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_virtio_rw(ch, bdev_io); + return 0; + case SPDK_BDEV_IO_TYPE_RESET: + bdev_virtio_reset(ch, bdev_io); + return 0; + case SPDK_BDEV_IO_TYPE_UNMAP: { + uint64_t buf_len = 8 /* header size */ + + 
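
A worked example of the UNMAP parameter list that bdev_virtio_unmap() above builds when a range spans more than one 32-bit descriptor:

/*
 * For offset_blocks = 0 and num_blocks = UINT32_MAX + 100:
 *
 *   descriptor 0: lba = 0           block_count = 0xffffffff
 *   descriptor 1: lba = 0xffffffff  block_count = 100
 *
 *   cmd_len   = 8 + 2 * 16 = 40
 *   buf[0..1] = 38   (list length, excluding this 2-byte field)
 *   buf[2..3] = 32   (block descriptor bytes only)
 *   CDB[7..8] = 40   (parameter list length)
 */
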
(bdev_io->u.bdev.num_blocks + UINT32_MAX - 1) / + UINT32_MAX * sizeof(struct spdk_scsi_unmap_bdesc); + + if (!disk->info.unmap_supported) { + return -1; + } + + if (buf_len > SPDK_BDEV_LARGE_BUF_MAX_SIZE) { + SPDK_ERRLOG("Trying to UNMAP too many blocks: %"PRIu64"\n", + bdev_io->u.bdev.num_blocks); + return -1; + } + spdk_bdev_io_get_buf(bdev_io, bdev_virtio_unmap, buf_len); + return 0; + } + case SPDK_BDEV_IO_TYPE_FLUSH: + default: + return -1; + } + return 0; +} + +static void bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_virtio_submit_request(ch, bdev_io) < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_virtio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct virtio_scsi_disk *disk = ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + + case SPDK_BDEV_IO_TYPE_UNMAP: + return disk->info.unmap_supported; + + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_virtio_get_io_channel(void *ctx) +{ + struct virtio_scsi_disk *disk = ctx; + + return spdk_get_io_channel(disk->svdev); +} + +static int +bdev_virtio_disk_destruct(void *ctx) +{ + struct virtio_scsi_disk *disk = ctx; + struct virtio_scsi_dev *svdev = disk->svdev; + + TAILQ_REMOVE(&svdev->luns, disk, link); + free(disk->bdev.name); + free(disk); + + if (svdev->removed && TAILQ_EMPTY(&svdev->luns)) { + spdk_io_device_unregister(svdev, virtio_scsi_dev_unregister_cb); + } + + return 0; +} + +static int +bdev_virtio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct virtio_scsi_disk *disk = ctx; + + virtio_dev_dump_json_info(&disk->svdev->vdev, w); + return 0; +} + +static void +bdev_virtio_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* SCSI targets and LUNS are discovered during scan process so nothing + * to save here. 
+ */ +} + +static const struct spdk_bdev_fn_table virtio_fn_table = { + .destruct = bdev_virtio_disk_destruct, + .submit_request = bdev_virtio_submit_request, + .io_type_supported = bdev_virtio_io_type_supported, + .get_io_channel = bdev_virtio_get_io_channel, + .dump_info_json = bdev_virtio_dump_info_json, + .write_config_json = bdev_virtio_write_config_json, +}; + +static void +get_scsi_status(struct virtio_scsi_cmd_resp *resp, int *sk, int *asc, int *ascq) +{ + /* see spdk_scsi_task_build_sense_data() for sense data details */ + *sk = 0; + *asc = 0; + *ascq = 0; + + if (resp->sense_len < 3) { + return; + } + + *sk = resp->sense[2] & 0xf; + + if (resp->sense_len < 13) { + return; + } + + *asc = resp->sense[12]; + + if (resp->sense_len < 14) { + return; + } + + *ascq = resp->sense[13]; +} + +static void +bdev_virtio_io_cpl(struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + int sk, asc, ascq; + + get_scsi_status(&io_ctx->resp, &sk, &asc, &ascq); + spdk_bdev_io_complete_scsi_status(bdev_io, io_ctx->resp.status, sk, asc, ascq); +} + +static int +bdev_virtio_poll(void *arg) +{ + struct bdev_virtio_io_channel *ch = arg; + struct virtio_scsi_dev *svdev = ch->svdev; + struct virtio_scsi_scan_base *scan_ctx = svdev->scan_ctx; + void *io[32]; + uint32_t io_len[32]; + uint16_t i, cnt; + int rc; + + cnt = virtio_recv_pkts(ch->vq, (void **)io, io_len, SPDK_COUNTOF(io)); + for (i = 0; i < cnt; ++i) { + if (spdk_unlikely(scan_ctx && io[i] == &scan_ctx->io_ctx)) { + if (svdev->removed) { + _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR); + return SPDK_POLLER_BUSY; + } + + if (scan_ctx->restart) { + scan_ctx->restart = false; + scan_ctx->full_scan = true; + _virtio_scsi_dev_scan_tgt(scan_ctx, 0); + continue; + } + + process_scan_resp(scan_ctx); + continue; + } + + bdev_virtio_io_cpl(io[i]); + } + + if (spdk_unlikely(scan_ctx && scan_ctx->needs_resend)) { + if (svdev->removed) { + _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR); + return SPDK_POLLER_BUSY; + } else if (cnt == 0) { + return SPDK_POLLER_IDLE; + } + + rc = send_scan_io(scan_ctx); + if (rc != 0) { + assert(scan_ctx->retries > 0); + scan_ctx->retries--; + if (scan_ctx->retries == 0) { + SPDK_ERRLOG("Target scan failed unrecoverably with rc = %d.\n", rc); + _virtio_scsi_dev_scan_finish(scan_ctx, rc); + } + } + } + + return cnt; +} + +static void +bdev_virtio_tmf_cpl_cb(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + + if (io_ctx->tmf_resp.response == VIRTIO_SCSI_S_OK) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +bdev_virtio_tmf_cpl(struct spdk_bdev_io *bdev_io) +{ + spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_virtio_tmf_cpl_cb, bdev_io); +} + +static void +bdev_virtio_eventq_io_cpl(struct virtio_scsi_dev *svdev, struct virtio_scsi_eventq_io *io) +{ + struct virtio_scsi_event *ev = &io->ev; + struct virtio_scsi_disk *disk; + + if (ev->lun[0] != 1) { + SPDK_WARNLOG("Received an event with invalid data layout.\n"); + goto out; + } + + if (ev->event & VIRTIO_SCSI_T_EVENTS_MISSED) { + ev->event &= ~VIRTIO_SCSI_T_EVENTS_MISSED; + virtio_scsi_dev_scan(svdev, NULL, NULL); + } + + switch (ev->event) { + case VIRTIO_SCSI_T_NO_EVENT: + break; + case VIRTIO_SCSI_T_TRANSPORT_RESET: + switch (ev->reason) { + case VIRTIO_SCSI_EVT_RESET_RESCAN: + 
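
To ground the offsets used above by get_scsi_status(), an editorial example of fixed-format sense data:

/*
 *   sense[2]  = 0x06  ->  sk   = 0x6  (UNIT ATTENTION, low nibble only)
 *   sense[12] = 0x04  ->  asc  = 0x04 (LOGICAL UNIT NOT READY)
 *   sense[13] = 0x02  ->  ascq = 0x02 (INITIALIZING COMMAND REQUIRED)
 *
 * This is exactly the triple that makes the scan logic below issue a
 * START STOP UNIT instead of giving up on the target.
 */
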
virtio_scsi_dev_scan_tgt(svdev, ev->lun[1]); + break; + case VIRTIO_SCSI_EVT_RESET_REMOVED: + disk = virtio_scsi_dev_get_disk_by_id(svdev, ev->lun[1]); + if (disk != NULL) { + spdk_bdev_unregister(&disk->bdev, NULL, NULL); + } + break; + default: + break; + } + break; + default: + break; + } + +out: + virtio_scsi_dev_send_eventq_io(svdev->vdev.vqs[VIRTIO_SCSI_EVENTQ], io); +} + +static void +bdev_virtio_tmf_abort_nomem_cb(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +} + +static void +bdev_virtio_tmf_abort_ioerr_cb(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +} + +static void +bdev_virtio_tmf_abort(struct spdk_bdev_io *bdev_io, int status) +{ + spdk_msg_fn fn; + + if (status == -ENOMEM) { + fn = bdev_virtio_tmf_abort_nomem_cb; + } else { + fn = bdev_virtio_tmf_abort_ioerr_cb; + } + + spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), fn, bdev_io); +} + +static int +bdev_virtio_send_tmf_io(struct virtqueue *ctrlq, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + int rc; + + rc = virtqueue_req_start(ctrlq, bdev_io, 2); + if (rc != 0) { + return rc; + } + + virtqueue_req_add_iovs(ctrlq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); + virtqueue_req_add_iovs(ctrlq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); + + virtqueue_req_flush(ctrlq); + return 0; +} + +static int +bdev_virtio_mgmt_poll(void *arg) +{ + struct virtio_scsi_dev *svdev = arg; + struct virtio_dev *vdev = &svdev->vdev; + struct virtqueue *eventq = vdev->vqs[VIRTIO_SCSI_EVENTQ]; + struct virtqueue *ctrlq = vdev->vqs[VIRTIO_SCSI_CONTROLQ]; + struct spdk_ring *send_ring = svdev->ctrlq_ring; + void *io[16]; + uint32_t io_len[16]; + uint16_t i, cnt; + int rc; + int total = 0; + + cnt = spdk_ring_dequeue(send_ring, io, SPDK_COUNTOF(io)); + total += cnt; + for (i = 0; i < cnt; ++i) { + rc = bdev_virtio_send_tmf_io(ctrlq, io[i]); + if (rc != 0) { + bdev_virtio_tmf_abort(io[i], rc); + } + } + + cnt = virtio_recv_pkts(ctrlq, io, io_len, SPDK_COUNTOF(io)); + total += cnt; + for (i = 0; i < cnt; ++i) { + bdev_virtio_tmf_cpl(io[i]); + } + + cnt = virtio_recv_pkts(eventq, io, io_len, SPDK_COUNTOF(io)); + total += cnt; + for (i = 0; i < cnt; ++i) { + bdev_virtio_eventq_io_cpl(svdev, io[i]); + } + + return total; +} + +static int +bdev_virtio_scsi_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct virtio_scsi_dev *svdev = io_device; + struct virtio_dev *vdev = &svdev->vdev; + struct bdev_virtio_io_channel *ch = ctx_buf; + struct virtqueue *vq; + int32_t queue_idx; + + queue_idx = virtio_dev_find_and_acquire_queue(vdev, VIRTIO_SCSI_REQUESTQ); + if (queue_idx < 0) { + SPDK_ERRLOG("Couldn't get an unused queue for the io_channel.\n"); + return -1; + } + + vq = vdev->vqs[queue_idx]; + + ch->svdev = svdev; + ch->vq = vq; + + ch->poller = SPDK_POLLER_REGISTER(bdev_virtio_poll, ch, 0); + + return 0; +} + +static void +bdev_virtio_scsi_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_virtio_io_channel *ch = ctx_buf; + struct virtio_scsi_dev *svdev = ch->svdev; + struct virtio_dev *vdev = &svdev->vdev; + struct virtqueue *vq = ch->vq; + + spdk_poller_unregister(&ch->poller); + virtio_dev_release_queue(vdev, vq->vq_queue_index); +} + +static void +_virtio_scsi_dev_scan_finish(struct virtio_scsi_scan_base *base, int errnum) +{ + struct virtio_scsi_dev *svdev = base->svdev; + size_t bdevs_cnt; + struct spdk_bdev 
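
An editorial sketch of the reset (TMF) round trip implemented by the pieces above:

/*
 *   bdev_virtio_reset()              on whichever thread owns the io_channel
 *     -> spdk_ring_enqueue()         MP/SC ring; no lock on the submit path
 *   bdev_virtio_mgmt_poll()          on the thread that created the device
 *     -> bdev_virtio_send_tmf_io()   two descriptors: tmf_req RO, tmf_resp WR
 *     <- virtio_recv_pkts(ctrlq)
 *     -> spdk_thread_send_msg()      hop back, because a bdev_io must be
 *                                    completed on its submitting thread
 */
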
*bdevs[BDEV_VIRTIO_MAX_TARGET]; + struct virtio_scsi_disk *disk; + struct virtio_scsi_scan_info *tgt, *next_tgt; + + spdk_put_io_channel(spdk_io_channel_from_ctx(base->channel)); + base->svdev->scan_ctx = NULL; + + TAILQ_FOREACH_SAFE(tgt, &base->scan_queue, tailq, next_tgt) { + TAILQ_REMOVE(&base->scan_queue, tgt, tailq); + free(tgt); + } + + if (base->cb_fn == NULL) { + spdk_free(base); + return; + } + + bdevs_cnt = 0; + if (errnum == 0) { + TAILQ_FOREACH(disk, &svdev->luns, link) { + bdevs[bdevs_cnt] = &disk->bdev; + bdevs_cnt++; + } + } + + base->cb_fn(base->cb_arg, errnum, bdevs, bdevs_cnt); + spdk_free(base); +} + +static int +send_scan_io(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_io_ctx *io_ctx = &base->io_ctx; + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct virtqueue *vq = base->channel->vq; + int payload_iov_cnt = base->iov.iov_len > 0 ? 1 : 0; + int rc; + + req->lun[0] = 1; + req->lun[1] = base->info.target; + + rc = virtqueue_req_start(vq, io_ctx, 2 + payload_iov_cnt); + if (rc != 0) { + base->needs_resend = true; + return -1; + } + + virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); + virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); + virtqueue_req_add_iovs(vq, &base->iov, payload_iov_cnt, SPDK_VIRTIO_DESC_WR); + + virtqueue_req_flush(vq); + return 0; +} + +static int +send_inquiry(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct spdk_scsi_cdb_inquiry *cdb; + + memset(req, 0, sizeof(*req)); + + base->iov.iov_len = BDEV_VIRTIO_SCAN_PAYLOAD_SIZE; + cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; + cdb->opcode = SPDK_SPC_INQUIRY; + to_be16(cdb->alloc_len, BDEV_VIRTIO_SCAN_PAYLOAD_SIZE); + + return send_scan_io(base); +} + +static int +send_inquiry_vpd(struct virtio_scsi_scan_base *base, uint8_t page_code) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct spdk_scsi_cdb_inquiry *inquiry_cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; + + memset(req, 0, sizeof(*req)); + + base->iov.iov_len = BDEV_VIRTIO_SCAN_PAYLOAD_SIZE; + inquiry_cdb->opcode = SPDK_SPC_INQUIRY; + inquiry_cdb->evpd = 1; + inquiry_cdb->page_code = page_code; + to_be16(inquiry_cdb->alloc_len, base->iov.iov_len); + + return send_scan_io(base); +} + +static int +send_read_cap_10(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + + memset(req, 0, sizeof(*req)); + + base->iov.iov_len = 8; + req->cdb[0] = SPDK_SBC_READ_CAPACITY_10; + + return send_scan_io(base); +} + +static int +send_read_cap_16(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + + memset(req, 0, sizeof(*req)); + + base->iov.iov_len = 32; + req->cdb[0] = SPDK_SPC_SERVICE_ACTION_IN_16; + req->cdb[1] = SPDK_SBC_SAI_READ_CAPACITY_16; + to_be32(&req->cdb[10], base->iov.iov_len); + + return send_scan_io(base); +} + +static int +send_test_unit_ready(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + + memset(req, 0, sizeof(*req)); + req->cdb[0] = SPDK_SPC_TEST_UNIT_READY; + base->iov.iov_len = 0; + + return send_scan_io(base); +} + +static int +send_start_stop_unit(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + + memset(req, 0, sizeof(*req)); + req->cdb[0] = SPDK_SBC_START_STOP_UNIT; + req->cdb[4] = SPDK_SBC_START_STOP_UNIT_START_BIT; + base->iov.iov_len = 0; + + return send_scan_io(base); +} + +static int 
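
An editorial map of the scan sequence driven by the send_*() helpers above and the process_*() handlers that follow; each step is issued from bdev_virtio_poll() once the previous reply arrives:

/*
 *   INQUIRY (standard)
 *    -> TEST UNIT READY  -> [START STOP UNIT, if the LUN is not spun up]
 *    -> INQUIRY VPD 0x00 (supported pages)
 *    -> [INQUIRY VPD 0xb2 (thin provisioning), if advertised]
 *    -> READ CAPACITY (10)  -> [READ CAPACITY (16), if max LBA = 0xffffffff]
 *    -> virtio_scsi_dev_add_tgt() registers the bdev
 */
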
+process_scan_start_stop_unit(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + + if (resp->status == SPDK_SCSI_STATUS_GOOD) { + return send_inquiry_vpd(base, SPDK_SPC_VPD_SUPPORTED_VPD_PAGES); + } + + return -1; +} + +static int +process_scan_test_unit_ready(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + int sk, asc, ascq; + + get_scsi_status(resp, &sk, &asc, &ascq); + + /* check response, get VPD if spun up otherwise send SSU */ + if (resp->status == SPDK_SCSI_STATUS_GOOD) { + return send_inquiry_vpd(base, SPDK_SPC_VPD_SUPPORTED_VPD_PAGES); + } else if (resp->response == VIRTIO_SCSI_S_OK && + resp->status == SPDK_SCSI_STATUS_CHECK_CONDITION && + sk == SPDK_SCSI_SENSE_UNIT_ATTENTION && + asc == SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_READY) { + return send_start_stop_unit(base); + } else { + return -1; + } +} + +static int +process_scan_inquiry_standard(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + struct spdk_scsi_cdb_inquiry_data *inquiry_data = + (struct spdk_scsi_cdb_inquiry_data *)base->payload; + + if (resp->status != SPDK_SCSI_STATUS_GOOD) { + return -1; + } + + /* check to make sure its a supported device */ + if (inquiry_data->peripheral_device_type != SPDK_SPC_PERIPHERAL_DEVICE_TYPE_DISK || + inquiry_data->peripheral_qualifier != SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED) { + SPDK_WARNLOG("Unsupported peripheral device type 0x%02x (qualifier 0x%02x)\n", + inquiry_data->peripheral_device_type, + inquiry_data->peripheral_qualifier); + return -1; + } + + return send_test_unit_ready(base); +} + +static int +process_scan_inquiry_vpd_supported_vpd_pages(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + bool block_provisioning_page_supported = false; + + if (resp->status == SPDK_SCSI_STATUS_GOOD) { + const uint8_t *vpd_data = base->payload; + const uint8_t *supported_vpd_pages = vpd_data + 4; + uint16_t page_length; + uint16_t num_supported_pages; + uint16_t i; + + page_length = from_be16(vpd_data + 2); + num_supported_pages = spdk_min(page_length, base->iov.iov_len - 4); + + for (i = 0; i < num_supported_pages; i++) { + if (supported_vpd_pages[i] == SPDK_SPC_VPD_BLOCK_THIN_PROVISION) { + block_provisioning_page_supported = true; + break; + } + } + } + + if (block_provisioning_page_supported) { + return send_inquiry_vpd(base, SPDK_SPC_VPD_BLOCK_THIN_PROVISION); + } else { + return send_read_cap_10(base); + } +} + +static int +process_scan_inquiry_vpd_block_thin_provision(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + + base->info.unmap_supported = false; + + if (resp->status == SPDK_SCSI_STATUS_GOOD) { + uint8_t *vpd_data = base->payload; + + base->info.unmap_supported = !!(vpd_data[5] & SPDK_SCSI_UNMAP_LBPU); + } + + SPDK_INFOLOG(SPDK_LOG_VIRTIO, "Target %u: unmap supported = %d\n", + base->info.target, (int)base->info.unmap_supported); + + return send_read_cap_10(base); +} + +static int +process_scan_inquiry(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct spdk_scsi_cdb_inquiry *inquiry_cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; + + if ((inquiry_cdb->evpd & 1) == 0) { + return process_scan_inquiry_standard(base); + } + + switch (inquiry_cdb->page_code) { + case SPDK_SPC_VPD_SUPPORTED_VPD_PAGES: + return process_scan_inquiry_vpd_supported_vpd_pages(base); + case 
SPDK_SPC_VPD_BLOCK_THIN_PROVISION: + return process_scan_inquiry_vpd_block_thin_provision(base); + default: + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO, "Unexpected VPD page 0x%02x\n", inquiry_cdb->page_code); + return -1; + } +} + +static void +bdev_virtio_disc_notify_remove(void *remove_ctx) +{ + struct virtio_scsi_disk *disk = remove_ctx; + + disk->removed = true; + spdk_bdev_close(disk->notify_desc); +} + +/* To be called only from the thread performing target scan */ +static int +virtio_scsi_dev_add_tgt(struct virtio_scsi_dev *svdev, struct virtio_scsi_scan_info *info) +{ + struct virtio_scsi_disk *disk; + struct spdk_bdev *bdev; + int rc; + + TAILQ_FOREACH(disk, &svdev->luns, link) { + if (disk->info.target == info->target) { + /* Target is already attached and param change is not supported */ + return 0; + } + } + + if (info->block_size == 0 || info->num_blocks == 0) { + SPDK_ERRLOG("%s: invalid target %u: bs=%"PRIu32" blocks=%"PRIu64"\n", + svdev->vdev.name, info->target, info->block_size, info->num_blocks); + return -EINVAL; + } + + disk = calloc(1, sizeof(*disk)); + if (disk == NULL) { + SPDK_ERRLOG("could not allocate disk\n"); + return -ENOMEM; + } + + disk->svdev = svdev; + memcpy(&disk->info, info, sizeof(*info)); + + bdev = &disk->bdev; + bdev->name = spdk_sprintf_alloc("%st%"PRIu8, svdev->vdev.name, info->target); + if (bdev->name == NULL) { + SPDK_ERRLOG("Couldn't alloc memory for the bdev name.\n"); + free(disk); + return -ENOMEM; + } + + bdev->product_name = "Virtio SCSI Disk"; + bdev->write_cache = 0; + bdev->blocklen = disk->info.block_size; + bdev->blockcnt = disk->info.num_blocks; + + bdev->ctxt = disk; + bdev->fn_table = &virtio_fn_table; + bdev->module = &virtio_scsi_if; + + rc = spdk_bdev_register(&disk->bdev); + if (rc) { + SPDK_ERRLOG("Failed to register bdev name=%s\n", disk->bdev.name); + free(bdev->name); + free(disk); + return rc; + } + + rc = spdk_bdev_open(bdev, false, bdev_virtio_disc_notify_remove, disk, &disk->notify_desc); + if (rc) { + assert(false); + } + + TAILQ_INSERT_TAIL(&svdev->luns, disk, link); + return 0; +} + +static int +process_read_cap_10(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + uint64_t max_block; + uint32_t block_size; + uint8_t target_id = req->lun[1]; + int rc; + + if (resp->response != VIRTIO_SCSI_S_OK || resp->status != SPDK_SCSI_STATUS_GOOD) { + SPDK_ERRLOG("READ CAPACITY (10) failed for target %"PRIu8".\n", target_id); + return -1; + } + + block_size = from_be32(base->payload + 4); + max_block = from_be32(base->payload); + + if (max_block == 0xffffffff) { + return send_read_cap_16(base); + } + + base->info.num_blocks = (uint64_t)max_block + 1; + base->info.block_size = block_size; + + rc = virtio_scsi_dev_add_tgt(base->svdev, &base->info); + if (rc != 0) { + return rc; + } + + return _virtio_scsi_dev_scan_next(base, 0); +} + +static int +process_read_cap_16(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + uint8_t target_id = req->lun[1]; + int rc; + + if (resp->response != VIRTIO_SCSI_S_OK || resp->status != SPDK_SCSI_STATUS_GOOD) { + SPDK_ERRLOG("READ CAPACITY (16) failed for target %"PRIu8".\n", target_id); + return -1; + } + + base->info.num_blocks = from_be64(base->payload) + 1; + base->info.block_size = from_be32(base->payload + 8); + rc = virtio_scsi_dev_add_tgt(base->svdev, &base->info); + if (rc != 0) { + return rc; + } 
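
A worked example for the READ CAPACITY handling above; READ CAPACITY returns the last LBA, not a block count, hence the + 1:

/*
 *   READ CAP (10): payload[0..3] = 0x001fffff, payload[4..7] = 4096
 *     -> num_blocks = 0x200000, block_size = 4096  (an 8 GiB target)
 *
 *   payload[0..3] = 0xffffffff is a sentinel meaning the capacity does not
 *   fit in 32 bits; the scan then escalates to READ CAPACITY (16) for the
 *   64-bit LBA, as coded above.
 */
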
+ + return _virtio_scsi_dev_scan_next(base, 0); +} + +static void +process_scan_resp(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + int rc, sk, asc, ascq; + uint8_t target_id; + + if (base->io_ctx.iov_req.iov_len < sizeof(struct virtio_scsi_cmd_req) || + base->io_ctx.iov_resp.iov_len < sizeof(struct virtio_scsi_cmd_resp)) { + SPDK_ERRLOG("Received target scan message with invalid length.\n"); + _virtio_scsi_dev_scan_next(base, -EIO); + return; + } + + get_scsi_status(resp, &sk, &asc, &ascq); + target_id = req->lun[1]; + + if (resp->response == VIRTIO_SCSI_S_BAD_TARGET || + resp->response == VIRTIO_SCSI_S_INCORRECT_LUN) { + _virtio_scsi_dev_scan_next(base, -ENODEV); + return; + } + + if (resp->response != VIRTIO_SCSI_S_OK || + (resp->status == SPDK_SCSI_STATUS_CHECK_CONDITION && + sk != SPDK_SCSI_SENSE_ILLEGAL_REQUEST)) { + assert(base->retries > 0); + base->retries--; + if (base->retries == 0) { + SPDK_NOTICELOG("Target %"PRIu8" is present, but unavailable.\n", target_id); + SPDK_LOGDUMP(SPDK_LOG_VIRTIO, "CDB", req->cdb, sizeof(req->cdb)); + SPDK_LOGDUMP(SPDK_LOG_VIRTIO, "SENSE DATA", resp->sense, sizeof(resp->sense)); + _virtio_scsi_dev_scan_next(base, -EBUSY); + return; + } + + /* resend the same request */ + rc = send_scan_io(base); + if (rc != 0) { + /* Let response poller do the resend */ + } + return; + } + + base->retries = SCAN_REQUEST_RETRIES; + + switch (req->cdb[0]) { + case SPDK_SPC_INQUIRY: + rc = process_scan_inquiry(base); + break; + case SPDK_SPC_TEST_UNIT_READY: + rc = process_scan_test_unit_ready(base); + break; + case SPDK_SBC_START_STOP_UNIT: + rc = process_scan_start_stop_unit(base); + break; + case SPDK_SBC_READ_CAPACITY_10: + rc = process_read_cap_10(base); + break; + case SPDK_SPC_SERVICE_ACTION_IN_16: + rc = process_read_cap_16(base); + break; + default: + SPDK_ERRLOG("Received invalid target scan message: cdb[0] = %"PRIu8".\n", req->cdb[0]); + rc = -1; + break; + } + + if (rc != 0) { + if (base->needs_resend) { + return; /* Let response poller do the resend */ + } + + _virtio_scsi_dev_scan_next(base, rc); + } +} + +static int +_virtio_scsi_dev_scan_next(struct virtio_scsi_scan_base *base, int rc) +{ + struct virtio_scsi_scan_info *next; + struct virtio_scsi_disk *disk; + uint8_t target_id; + + if (base->full_scan) { + if (rc != 0) { + disk = virtio_scsi_dev_get_disk_by_id(base->svdev, + base->info.target); + if (disk != NULL) { + spdk_bdev_unregister(&disk->bdev, NULL, NULL); + } + } + + target_id = base->info.target + 1; + if (target_id < BDEV_VIRTIO_MAX_TARGET) { + _virtio_scsi_dev_scan_tgt(base, target_id); + return 0; + } + + base->full_scan = false; + } + + next = TAILQ_FIRST(&base->scan_queue); + if (next == NULL) { + _virtio_scsi_dev_scan_finish(base, 0); + return 0; + } + + TAILQ_REMOVE(&base->scan_queue, next, tailq); + target_id = next->target; + free(next); + + _virtio_scsi_dev_scan_tgt(base, target_id); + return 0; +} + +static int +virtio_pci_scsi_dev_enumerate_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) +{ + struct virtio_scsi_dev *svdev; + + svdev = virtio_pci_scsi_dev_create(NULL, pci_ctx); + return svdev == NULL ? 
-1 : 0; +} + +static int +bdev_virtio_process_config(void) +{ + struct spdk_conf_section *sp; + struct virtio_scsi_dev *svdev; + char *default_name = NULL; + char *path, *type, *name; + unsigned vdev_num; + int num_queues; + bool enable_pci; + int rc = 0; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VirtioUser")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VirtioUser%u", &vdev_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + rc = -1; + goto out; + } + + path = spdk_conf_section_get_val(sp, "Path"); + if (path == NULL) { + SPDK_ERRLOG("VirtioUser%u: missing Path\n", vdev_num); + rc = -1; + goto out; + } + + type = spdk_conf_section_get_val(sp, "Type"); + if (type != NULL && strcmp(type, "SCSI") != 0) { + continue; + } + + num_queues = spdk_conf_section_get_intval(sp, "Queues"); + if (num_queues < 1) { + num_queues = 1; + } else if (num_queues > SPDK_VIRTIO_MAX_VIRTQUEUES) { + num_queues = SPDK_VIRTIO_MAX_VIRTQUEUES; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + default_name = spdk_sprintf_alloc("VirtioScsi%u", vdev_num); + name = default_name; + } + + svdev = virtio_user_scsi_dev_create(name, path, num_queues, 512); + free(default_name); + default_name = NULL; + + if (svdev == NULL) { + rc = -1; + goto out; + } + } + + sp = spdk_conf_find_section(NULL, "VirtioPci"); + if (sp == NULL) { + return 0; + } + + enable_pci = spdk_conf_section_get_boolval(sp, "Enable", false); + if (enable_pci) { + rc = virtio_pci_dev_enumerate(virtio_pci_scsi_dev_enumerate_cb, NULL, + PCI_DEVICE_ID_VIRTIO_SCSI_MODERN); + } + +out: + return rc; +} + +static int +_virtio_scsi_dev_scan_init(struct virtio_scsi_dev *svdev) +{ + struct virtio_scsi_scan_base *base; + struct spdk_io_channel *io_ch; + struct virtio_scsi_io_ctx *io_ctx; + struct virtio_scsi_cmd_req *req; + struct virtio_scsi_cmd_resp *resp; + + io_ch = spdk_get_io_channel(svdev); + if (io_ch == NULL) { + return -EBUSY; + } + + base = spdk_zmalloc(sizeof(*base), 64, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (base == NULL) { + SPDK_ERRLOG("couldn't allocate memory for scsi target scan.\n"); + return -ENOMEM; + } + + base->svdev = svdev; + + base->channel = spdk_io_channel_get_ctx(io_ch); + TAILQ_INIT(&base->scan_queue); + svdev->scan_ctx = base; + + base->iov.iov_base = base->payload; + io_ctx = &base->io_ctx; + req = &io_ctx->req; + resp = &io_ctx->resp; + io_ctx->iov_req.iov_base = req; + io_ctx->iov_req.iov_len = sizeof(*req); + io_ctx->iov_resp.iov_base = resp; + io_ctx->iov_resp.iov_len = sizeof(*resp); + + base->retries = SCAN_REQUEST_RETRIES; + return 0; +} + +static void +_virtio_scsi_dev_scan_tgt(struct virtio_scsi_scan_base *base, uint8_t target) +{ + int rc; + + memset(&base->info, 0, sizeof(base->info)); + base->info.target = target; + + rc = send_inquiry(base); + if (rc) { + /* Let response poller do the resend */ + } +} + +static int +virtio_scsi_dev_scan(struct virtio_scsi_dev *svdev, bdev_virtio_create_cb cb_fn, + void *cb_arg) +{ + struct virtio_scsi_scan_base *base; + struct virtio_scsi_scan_info *tgt, *next_tgt; + int rc; + + if (svdev->scan_ctx) { + if (svdev->scan_ctx->full_scan) { + return -EEXIST; + } + + /* We're about to start a full rescan, so there's no need + * to scan particular targets afterwards. 
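+ * The queued per-target requests are dropped here; the restart flag set + * below makes the scan start over from the first target once the + * in-flight request completes.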
+ */ + TAILQ_FOREACH_SAFE(tgt, &svdev->scan_ctx->scan_queue, tailq, next_tgt) { + TAILQ_REMOVE(&svdev->scan_ctx->scan_queue, tgt, tailq); + free(tgt); + } + + svdev->scan_ctx->cb_fn = cb_fn; + svdev->scan_ctx->cb_arg = cb_arg; + svdev->scan_ctx->restart = true; + return 0; + } + + rc = _virtio_scsi_dev_scan_init(svdev); + if (rc != 0) { + return rc; + } + + base = svdev->scan_ctx; + base->cb_fn = cb_fn; + base->cb_arg = cb_arg; + base->full_scan = true; + + _virtio_scsi_dev_scan_tgt(base, 0); + return 0; +} + +static int +virtio_scsi_dev_scan_tgt(struct virtio_scsi_dev *svdev, uint8_t target) +{ + struct virtio_scsi_scan_base *base; + struct virtio_scsi_scan_info *info; + int rc; + + base = svdev->scan_ctx; + if (base) { + info = calloc(1, sizeof(*info)); + if (info == NULL) { + SPDK_ERRLOG("calloc failed\n"); + return -ENOMEM; + } + + info->target = target; + TAILQ_INSERT_TAIL(&base->scan_queue, info, tailq); + return 0; + } + + rc = _virtio_scsi_dev_scan_init(svdev); + if (rc != 0) { + return rc; + } + + base = svdev->scan_ctx; + base->full_scan = true; + _virtio_scsi_dev_scan_tgt(base, target); + return 0; +} + +static void +bdev_virtio_initial_scan_complete(void *ctx, int result, + struct spdk_bdev **bdevs, size_t bdevs_cnt) +{ + struct virtio_scsi_dev *svdev; + + pthread_mutex_lock(&g_virtio_scsi_mutex); + TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { + if (svdev->scan_ctx) { + /* another device is still being scanned */ + pthread_mutex_unlock(&g_virtio_scsi_mutex); + return; + } + } + + pthread_mutex_unlock(&g_virtio_scsi_mutex); + spdk_bdev_module_init_done(&virtio_scsi_if); +} + +static int +bdev_virtio_initialize(void) +{ + struct virtio_scsi_dev *svdev, *next_svdev; + int rc; + + rc = bdev_virtio_process_config(); + pthread_mutex_lock(&g_virtio_scsi_mutex); + + if (rc != 0) { + goto err_unlock; + } + + if (TAILQ_EMPTY(&g_virtio_scsi_devs)) { + goto out_unlock; + } + + /* Initialize all created devices and scan available targets */ + TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { + rc = virtio_scsi_dev_scan(svdev, bdev_virtio_initial_scan_complete, NULL); + if (rc != 0) { + goto err_unlock; + } + } + + pthread_mutex_unlock(&g_virtio_scsi_mutex); + return 0; + +err_unlock: + /* Remove any created devices */ + TAILQ_FOREACH_SAFE(svdev, &g_virtio_scsi_devs, tailq, next_svdev) { + virtio_scsi_dev_remove(svdev, NULL, NULL); + } + +out_unlock: + pthread_mutex_unlock(&g_virtio_scsi_mutex); + spdk_bdev_module_init_done(&virtio_scsi_if); + return rc; +} + +static void +_virtio_scsi_dev_unregister_cb(void *io_device) +{ + struct virtio_scsi_dev *svdev = io_device; + struct virtio_dev *vdev = &svdev->vdev; + bool finish_module; + bdev_virtio_remove_cb remove_cb; + void *remove_ctx; + + assert(spdk_ring_count(svdev->ctrlq_ring) == 0); + spdk_ring_free(svdev->ctrlq_ring); + spdk_poller_unregister(&svdev->mgmt_poller); + + virtio_dev_release_queue(vdev, VIRTIO_SCSI_EVENTQ); + virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); + + virtio_dev_stop(vdev); + virtio_dev_destruct(vdev); + + pthread_mutex_lock(&g_virtio_scsi_mutex); + TAILQ_REMOVE(&g_virtio_scsi_devs, svdev, tailq); + pthread_mutex_unlock(&g_virtio_scsi_mutex); + + remove_cb = svdev->remove_cb; + remove_ctx = svdev->remove_ctx; + spdk_free(svdev->eventq_ios); + free(svdev); + + if (remove_cb) { + remove_cb(remove_ctx, 0); + } + + finish_module = TAILQ_EMPTY(&g_virtio_scsi_devs); + + if (g_bdev_virtio_finish && finish_module) { + spdk_bdev_module_finish_done(); + } +} + +static void +virtio_scsi_dev_unregister_cb(void 
*io_device) +{ + struct virtio_scsi_dev *svdev = io_device; + struct spdk_thread *thread; + + thread = virtio_dev_queue_get_thread(&svdev->vdev, VIRTIO_SCSI_CONTROLQ); + spdk_thread_send_msg(thread, _virtio_scsi_dev_unregister_cb, io_device); +} + +static void +virtio_scsi_dev_remove(struct virtio_scsi_dev *svdev, + bdev_virtio_remove_cb cb_fn, void *cb_arg) +{ + struct virtio_scsi_disk *disk, *disk_tmp; + bool do_remove = true; + + if (svdev->removed) { + if (cb_fn) { + cb_fn(cb_arg, -EBUSY); + } + return; + } + + svdev->remove_cb = cb_fn; + svdev->remove_ctx = cb_arg; + svdev->removed = true; + + if (svdev->scan_ctx) { + /* The removal will continue after we receive a pending scan I/O. */ + return; + } + + TAILQ_FOREACH_SAFE(disk, &svdev->luns, link, disk_tmp) { + if (!disk->removed) { + spdk_bdev_unregister(&disk->bdev, NULL, NULL); + } + do_remove = false; + } + + if (do_remove) { + spdk_io_device_unregister(svdev, virtio_scsi_dev_unregister_cb); + } +} + +static void +bdev_virtio_finish(void) +{ + struct virtio_scsi_dev *svdev, *next; + + g_bdev_virtio_finish = true; + + pthread_mutex_lock(&g_virtio_scsi_mutex); + if (TAILQ_EMPTY(&g_virtio_scsi_devs)) { + pthread_mutex_unlock(&g_virtio_scsi_mutex); + spdk_bdev_module_finish_done(); + return; + } + + /* Defer module finish until all controllers are removed. */ + TAILQ_FOREACH_SAFE(svdev, &g_virtio_scsi_devs, tailq, next) { + virtio_scsi_dev_remove(svdev, NULL, NULL); + } + pthread_mutex_unlock(&g_virtio_scsi_mutex); +} + +int +bdev_virtio_user_scsi_dev_create(const char *base_name, const char *path, + unsigned num_queues, unsigned queue_size, + bdev_virtio_create_cb cb_fn, void *cb_arg) +{ + struct virtio_scsi_dev *svdev; + int rc; + + svdev = virtio_user_scsi_dev_create(base_name, path, num_queues, queue_size); + if (svdev == NULL) { + return -1; + } + + rc = virtio_scsi_dev_scan(svdev, cb_fn, cb_arg); + if (rc) { + virtio_scsi_dev_remove(svdev, NULL, NULL); + } + + return rc; +} + +struct bdev_virtio_pci_dev_create_ctx { + const char *name; + bdev_virtio_create_cb cb_fn; + void *cb_arg; +}; + +static int +bdev_virtio_pci_scsi_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) +{ + struct virtio_scsi_dev *svdev; + struct bdev_virtio_pci_dev_create_ctx *create_ctx = ctx; + int rc; + + svdev = virtio_pci_scsi_dev_create(create_ctx->name, pci_ctx); + if (svdev == NULL) { + return -1; + } + + rc = virtio_scsi_dev_scan(svdev, create_ctx->cb_fn, create_ctx->cb_arg); + if (rc) { + virtio_scsi_dev_remove(svdev, NULL, NULL); + } + + return rc; +} + +int +bdev_virtio_pci_scsi_dev_create(const char *name, struct spdk_pci_addr *pci_addr, + bdev_virtio_create_cb cb_fn, void *cb_arg) +{ + struct bdev_virtio_pci_dev_create_ctx create_ctx; + + create_ctx.name = name; + create_ctx.cb_fn = cb_fn; + create_ctx.cb_arg = cb_arg; + + return virtio_pci_dev_attach(bdev_virtio_pci_scsi_dev_create_cb, &create_ctx, + PCI_DEVICE_ID_VIRTIO_SCSI_MODERN, pci_addr); +} + +int +bdev_virtio_scsi_dev_remove(const char *name, bdev_virtio_remove_cb cb_fn, void *cb_arg) +{ + struct virtio_scsi_dev *svdev; + + pthread_mutex_lock(&g_virtio_scsi_mutex); + TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { + if (strcmp(svdev->vdev.name, name) == 0) { + break; + } + } + + if (svdev == NULL) { + pthread_mutex_unlock(&g_virtio_scsi_mutex); + SPDK_ERRLOG("Cannot find Virtio-SCSI device named '%s'\n", name); + return -ENODEV; + } + + virtio_scsi_dev_remove(svdev, cb_fn, cb_arg); + pthread_mutex_unlock(&g_virtio_scsi_mutex); + + return 0; +} + +void 
+bdev_virtio_scsi_dev_list(struct spdk_json_write_ctx *w) +{ + struct virtio_scsi_dev *svdev; + + spdk_json_write_array_begin(w); + + pthread_mutex_lock(&g_virtio_scsi_mutex); + TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "name", svdev->vdev.name); + + virtio_dev_dump_json_info(&svdev->vdev, w); + + spdk_json_write_object_end(w); + } + pthread_mutex_unlock(&g_virtio_scsi_mutex); + + spdk_json_write_array_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("virtio", SPDK_LOG_VIRTIO) diff --git a/src/spdk/module/bdev/zone_block/Makefile b/src/spdk/module/bdev/zone_block/Makefile new file mode 100644 index 000000000..3dec8a37d --- /dev/null +++ b/src/spdk/module/bdev/zone_block/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = vbdev_zone_block.c vbdev_zone_block_rpc.c +LIBNAME = bdev_zone_block + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/module/bdev/zone_block/vbdev_zone_block.c b/src/spdk/module/bdev/zone_block/vbdev_zone_block.c new file mode 100644 index 000000000..fb8b92fd2 --- /dev/null +++ b/src/spdk/module/bdev/zone_block/vbdev_zone_block.c @@ -0,0 +1,916 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "vbdev_zone_block.h" + +#include "spdk/config.h" +#include "spdk/nvme.h" +#include "spdk/bdev_zone.h" + +#include "spdk_internal/log.h" + +static int zone_block_init(void); +static int zone_block_get_ctx_size(void); +static void zone_block_finish(void); +static int zone_block_config_json(struct spdk_json_write_ctx *w); +static void zone_block_examine(struct spdk_bdev *bdev); + +static struct spdk_bdev_module bdev_zoned_if = { + .name = "bdev_zoned_block", + .module_init = zone_block_init, + .module_fini = zone_block_finish, + .config_text = NULL, + .config_json = zone_block_config_json, + .examine_config = zone_block_examine, + .get_ctx_size = zone_block_get_ctx_size, +}; + +SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if) + +/* List of block vbdev names and their base bdevs via configuration file. + * Used so we can parse the conf once at init and use this list in examine(). + */ +struct bdev_zone_block_config { + char *vbdev_name; + char *bdev_name; + uint64_t zone_capacity; + uint64_t optimal_open_zones; + TAILQ_ENTRY(bdev_zone_block_config) link; +}; +static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs); + +struct block_zone { + struct spdk_bdev_zone_info zone_info; + pthread_spinlock_t lock; +}; + +/* List of block vbdevs and associated info for each. 
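Nodes are linked in zone_block_register() and unlinked in zone_block_destruct().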
*/ +struct bdev_zone_block { + struct spdk_bdev bdev; /* the block zoned bdev */ + struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */ + struct block_zone *zones; /* array of zones */ + uint64_t num_zones; /* number of zones */ + uint64_t zone_capacity; /* zone capacity */ + uint64_t zone_shift; /* log2 of zone_size */ + TAILQ_ENTRY(bdev_zone_block) link; + struct spdk_thread *thread; /* thread where base device is opened */ +}; +static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes); + +struct zone_block_io_channel { + struct spdk_io_channel *base_ch; /* IO channel of base device */ +}; + +struct zone_block_io { + /* vbdev to which IO was issued */ + struct bdev_zone_block *bdev_zone_block; +}; + +static int +zone_block_init(void) +{ + return 0; +} + +static void +zone_block_remove_config(struct bdev_zone_block_config *name) +{ + TAILQ_REMOVE(&g_bdev_configs, name, link); + free(name->bdev_name); + free(name->vbdev_name); + free(name); +} + +static void +zone_block_finish(void) +{ + struct bdev_zone_block_config *name; + + while ((name = TAILQ_FIRST(&g_bdev_configs))) { + zone_block_remove_config(name); + } +} + +static int +zone_block_get_ctx_size(void) +{ + return sizeof(struct zone_block_io); +} + +static int +zone_block_config_json(struct spdk_json_write_ctx *w) +{ + struct bdev_zone_block *bdev_node; + struct spdk_bdev *base_bdev = NULL; + + TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) { + base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc); + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "bdev_zone_block_create"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev)); + spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev)); + spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity); + spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + } + + return 0; +} + +/* Callback for unregistering the IO device. */ +static void +_device_unregister_cb(void *io_device) +{ + struct bdev_zone_block *bdev_node = io_device; + uint64_t i; + + free(bdev_node->bdev.name); + for (i = 0; i < bdev_node->num_zones; i++) { + pthread_spin_destroy(&bdev_node->zones[i].lock); + } + free(bdev_node->zones); + free(bdev_node); +} + +static void +_zone_block_destruct(void *ctx) +{ + struct spdk_bdev_desc *desc = ctx; + + spdk_bdev_close(desc); +} + +static int +zone_block_destruct(void *ctx) +{ + struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx; + + TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link); + + /* Unclaim the underlying bdev. */ + spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc)); + + /* Close the underlying bdev on its same opened thread. */ + if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) { + spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc); + } else { + spdk_bdev_close(bdev_node->base_desc); + } + + /* Unregister the io_device. 
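The unregister callback frees the zone array and the node itself once all I/O channels have been released.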
*/ + spdk_io_device_unregister(bdev_node, _device_unregister_cb); + + return 0; +} + +static struct block_zone * +zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba) +{ + size_t index = lba >> bdev_node->zone_shift; + + if (index >= bdev_node->num_zones) { + return NULL; + } + + return &bdev_node->zones[index]; +} + +static struct block_zone * +zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba) +{ + struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba); + + if (zone && zone->zone_info.zone_id == start_lba) { + return zone; + } else { + return NULL; + } +} + +static int +zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io) +{ + struct block_zone *zone; + struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf; + uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; + size_t i; + + /* User can request info for more zones than exist, need to check both internal and user + * boundaries + */ + for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) { + zone = zone_block_get_zone_by_slba(bdev_node, zone_id); + if (!zone) { + return -EINVAL; + } + memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info)); + } + + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; +} + +static int +zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io) +{ + pthread_spin_lock(&zone->lock); + + switch (zone->zone_info.state) { + case SPDK_BDEV_ZONE_STATE_EMPTY: + case SPDK_BDEV_ZONE_STATE_OPEN: + case SPDK_BDEV_ZONE_STATE_CLOSED: + zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN; + pthread_spin_unlock(&zone->lock); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; + default: + pthread_spin_unlock(&zone->lock); + return -EINVAL; + } +} + +static void +_zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + /* Complete the original IO and then free the one that we created here + * as a result of issuing an IO via submit_request. + */ + spdk_bdev_io_complete(orig_io, status); + spdk_bdev_free_io(bdev_io); +}
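+ +/* Zone state transitions implemented by the helpers below: EMPTY, OPEN and + * CLOSED zones move to OPEN on an explicit open and on a write; OPEN, + * CLOSED and FULL zones move back to EMPTY on reset, which rewinds the + * write pointer and unmaps the zone's blocks on the base bdev; OPEN and + * CLOSED zones move to CLOSED on close; finish moves a zone to FULL + * unconditionally, as does a write that fills the zone to its capacity. + */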
+ +static int +zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, + struct block_zone *zone, struct spdk_bdev_io *bdev_io) +{ + pthread_spin_lock(&zone->lock); + + switch (zone->zone_info.state) { + case SPDK_BDEV_ZONE_STATE_EMPTY: + pthread_spin_unlock(&zone->lock); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; + case SPDK_BDEV_ZONE_STATE_OPEN: + case SPDK_BDEV_ZONE_STATE_FULL: + case SPDK_BDEV_ZONE_STATE_CLOSED: + zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY; + zone->zone_info.write_pointer = zone->zone_info.zone_id; + pthread_spin_unlock(&zone->lock); + return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch, + zone->zone_info.zone_id, zone->zone_info.capacity, + _zone_block_complete_unmap, bdev_io); + default: + pthread_spin_unlock(&zone->lock); + return -EINVAL; + } +} + +static int +zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io) +{ + pthread_spin_lock(&zone->lock); + + switch (zone->zone_info.state) { + case SPDK_BDEV_ZONE_STATE_OPEN: + case SPDK_BDEV_ZONE_STATE_CLOSED: + zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED; + pthread_spin_unlock(&zone->lock); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; + default: + pthread_spin_unlock(&zone->lock); + return -EINVAL; + } +} + +static int +zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io) +{ + pthread_spin_lock(&zone->lock); + + zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity; + zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL; + + pthread_spin_unlock(&zone->lock); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; +} + +static int +zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, + struct spdk_bdev_io *bdev_io) +{ + struct block_zone *zone; + + zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id); + if (!zone) { + return -EINVAL; + } + + switch (bdev_io->u.zone_mgmt.zone_action) { + case SPDK_BDEV_ZONE_RESET: + return zone_block_reset_zone(bdev_node, ch, zone, bdev_io); + case SPDK_BDEV_ZONE_OPEN: + return zone_block_open_zone(zone, bdev_io); + case SPDK_BDEV_ZONE_CLOSE: + return zone_block_close_zone(zone, bdev_io); + case SPDK_BDEV_ZONE_FINISH: + return zone_block_finish_zone(zone, bdev_io); + default: + return -EINVAL; + } +} + +static void +_zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) { + orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks; + } + + /* Complete the original IO and then free the one that we created here + * as a result of issuing an IO via submit_request. + */ + spdk_bdev_io_complete(orig_io, status); + spdk_bdev_free_io(bdev_io); +}
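+ +/* Writes must start exactly at the zone's write pointer, while zone appends + * are placed at the write pointer automatically and the LBA actually written + * is reported back through offset_blocks in the completion callback above. + * For example, with zone_size 0x100 and zone_capacity 0x80, zone 2 spans + * LBA 0x200-0x2ff and only LBAs 0x200-0x27f are writable. + */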
+ +static int +zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, + struct spdk_bdev_io *bdev_io) +{ + struct block_zone *zone; + uint64_t len = bdev_io->u.bdev.num_blocks; + uint64_t lba = bdev_io->u.bdev.offset_blocks; + uint64_t num_blocks_left, wp; + int rc = 0; + bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND; + + if (is_append) { + zone = zone_block_get_zone_by_slba(bdev_node, lba); + } else { + zone = zone_block_get_zone_containing_lba(bdev_node, lba); + } + if (!zone) { + SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%lx)\n", lba); + return -EINVAL; + } + + pthread_spin_lock(&zone->lock); + + switch (zone->zone_info.state) { + case SPDK_BDEV_ZONE_STATE_OPEN: + case SPDK_BDEV_ZONE_STATE_EMPTY: + case SPDK_BDEV_ZONE_STATE_CLOSED: + zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN; + break; + default: + SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state); + rc = -EINVAL; + goto write_fail; + } + + wp = zone->zone_info.write_pointer; + if (is_append) { + lba = wp; + } else { + if (lba != wp) { + SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%lx, wp 0x%lx)\n", lba, wp); + rc = -EINVAL; + goto write_fail; + } + } + + num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp; + if (len > num_blocks_left) { + SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n", lba, len, wp); + rc = -EINVAL; + goto write_fail; + } + + zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks; + assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity); + if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) { + zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL; + } + pthread_spin_unlock(&zone->lock); + + if (bdev_io->u.bdev.md_buf == NULL) { + rc = spdk_bdev_writev_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, lba, + bdev_io->u.bdev.num_blocks, _zone_block_complete_write, + bdev_io); + } else { + rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch, + bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.md_buf, + lba, bdev_io->u.bdev.num_blocks, + _zone_block_complete_write, bdev_io); + } + + return rc; + +write_fail: + pthread_spin_unlock(&zone->lock); + return rc; +} + +static void +_zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + /* Complete the original IO and then free the one that we created here + * as a result of issuing an IO via submit_request.
+ */ + spdk_bdev_io_complete(orig_io, status); + spdk_bdev_free_io(bdev_io); +} + +static int +zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, + struct spdk_bdev_io *bdev_io) +{ + struct block_zone *zone; + uint64_t len = bdev_io->u.bdev.num_blocks; + uint64_t lba = bdev_io->u.bdev.offset_blocks; + int rc; + + zone = zone_block_get_zone_containing_lba(bdev_node, lba); + if (!zone) { + SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%lx)\n", lba); + return -EINVAL; + } + + if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) { + SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%lx, len 0x%lx)\n", lba, len); + return -EINVAL; + } + + if (bdev_io->u.bdev.md_buf == NULL) { + rc = spdk_bdev_readv_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, lba, + len, _zone_block_complete_read, + bdev_io); + } else { + rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch, + bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.md_buf, + lba, len, + _zone_block_complete_read, bdev_io); + } + + return rc; +} + +static void +zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev); + struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch); + int rc = 0; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: + rc = zone_block_get_zone_info(bdev_node, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: + rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_ZONE_APPEND: + rc = zone_block_write(bdev_node, dev_ch, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_READ: + rc = zone_block_read(bdev_node, dev_ch, bdev_io); + break; + default: + SPDK_ERRLOG("vbdev_block: unknown I/O type %u\n", bdev_io->type); + rc = -ENOTSUP; + break; + } + + if (rc != 0) { + if (rc == -ENOMEM) { + SPDK_WARNLOG("ENOMEM, start to queue io for vbdev.\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + SPDK_ERRLOG("ERROR on bdev_io submission!\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +static bool +zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_ZONE_APPEND: + return true; + default: + return false; + } +} + +static struct spdk_io_channel * +zone_block_get_io_channel(void *ctx) +{ + struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx; + + return spdk_get_io_channel(bdev_node); +} + +static int +zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx; + struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc); + + spdk_json_write_name(w, "zoned_block"); + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev)); + spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev)); + spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity); + spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones); + spdk_json_write_object_end(w); + + return 0; +} + +/* When we register our vbdev this is how we specify our entry points. 
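The same table is shared by every zone_block bdev instance.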
*/ +static const struct spdk_bdev_fn_table zone_block_fn_table = { + .destruct = zone_block_destruct, + .submit_request = zone_block_submit_request, + .io_type_supported = zone_block_io_type_supported, + .get_io_channel = zone_block_get_io_channel, + .dump_info_json = zone_block_dump_info_json, +}; + +static void +zone_block_base_bdev_hotremove_cb(void *ctx) +{ + struct bdev_zone_block *bdev_node, *tmp; + struct spdk_bdev *bdev_find = ctx; + + TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) { + if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) { + spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL); + } + } +} + +static int +_zone_block_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct zone_block_io_channel *bdev_ch = ctx_buf; + struct bdev_zone_block *bdev_node = io_device; + + bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc); + if (!bdev_ch->base_ch) { + return -ENOMEM; + } + + return 0; +} + +static void +_zone_block_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct zone_block_io_channel *bdev_ch = ctx_buf; + + spdk_put_io_channel(bdev_ch->base_ch); +} + +static int +zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity, + uint64_t optimal_open_zones) +{ + struct bdev_zone_block_config *name; + + TAILQ_FOREACH(name, &g_bdev_configs, link) { + if (strcmp(vbdev_name, name->vbdev_name) == 0) { + SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name); + return -EEXIST; + } + if (strcmp(bdev_name, name->bdev_name) == 0) { + SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name); + return -EEXIST; + } + } + + name = calloc(1, sizeof(*name)); + if (!name) { + SPDK_ERRLOG("could not allocate bdev_names\n"); + return -ENOMEM; + } + + name->bdev_name = strdup(bdev_name); + if (!name->bdev_name) { + SPDK_ERRLOG("could not allocate name->bdev_name\n"); + free(name); + return -ENOMEM; + } + + name->vbdev_name = strdup(vbdev_name); + if (!name->vbdev_name) { + SPDK_ERRLOG("could not allocate name->vbdev_name\n"); + free(name->bdev_name); + free(name); + return -ENOMEM; + } + + name->zone_capacity = zone_capacity; + name->optimal_open_zones = optimal_open_zones; + + TAILQ_INSERT_TAIL(&g_bdev_configs, name, link); + + return 0; +} + +static int +zone_block_init_zone_info(struct bdev_zone_block *bdev_node) +{ + size_t i; + struct block_zone *zone; + int rc = 0; + + for (i = 0; i < bdev_node->num_zones; i++) { + zone = &bdev_node->zones[i]; + zone->zone_info.zone_id = bdev_node->bdev.zone_size * i; + zone->zone_info.capacity = bdev_node->zone_capacity; + zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity; + zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL; + if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) { + SPDK_ERRLOG("pthread_spin_init() failed\n"); + rc = -ENOMEM; + break; + } + } + + if (rc) { + for (; i > 0; i--) { + pthread_spin_destroy(&bdev_node->zones[i - 1].lock); + } + } + + return rc; +} + +static int +zone_block_register(struct spdk_bdev *base_bdev) +{ + struct bdev_zone_block_config *name, *tmp; + struct bdev_zone_block *bdev_node; + uint64_t zone_size; + int rc = 0; + + /* Check our list of names from config versus this bdev and if + * there's a match, create the bdev_node & bdev accordingly. 
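+ * This is called both from vbdev_zone_block_create() and from examine(), so + * the base bdev may show up before or after its config entry.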
+ */ + TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) { + if (strcmp(name->bdev_name, base_bdev->name) != 0) { + continue; + } + + if (spdk_bdev_is_zoned(base_bdev)) { + SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev->name); + rc = -EEXIST; + goto free_config; + } + + bdev_node = calloc(1, sizeof(struct bdev_zone_block)); + if (!bdev_node) { + rc = -ENOMEM; + SPDK_ERRLOG("could not allocate bdev_node\n"); + goto free_config; + } + + /* The base bdev that we're attaching to. */ + bdev_node->bdev.name = strdup(name->vbdev_name); + if (!bdev_node->bdev.name) { + rc = -ENOMEM; + SPDK_ERRLOG("could not allocate bdev_node name\n"); + goto strdup_failed; + } + + zone_size = spdk_align64pow2(name->zone_capacity); + if (zone_size == 0) { + rc = -EINVAL; + SPDK_ERRLOG("invalid zone size\n"); + goto roundup_failed; + } + + bdev_node->zone_shift = spdk_u64log2(zone_size); + bdev_node->num_zones = base_bdev->blockcnt / zone_size; + + /* Align num_zones to optimal_open_zones */ + bdev_node->num_zones -= bdev_node->num_zones % name->optimal_open_zones; + bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone)); + if (!bdev_node->zones) { + rc = -ENOMEM; + SPDK_ERRLOG("could not allocate zones\n"); + goto calloc_failed; + } + + bdev_node->bdev.product_name = "zone_block"; + + /* Copy some properties from the underlying base bdev. */ + bdev_node->bdev.write_cache = base_bdev->write_cache; + bdev_node->bdev.required_alignment = base_bdev->required_alignment; + bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary; + + bdev_node->bdev.blocklen = base_bdev->blocklen; + bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size; + + if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_ZONE_BLOCK, + "Lost %lu blocks due to zone capacity and base bdev size misalignment\n", + base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity); + } + + bdev_node->bdev.write_unit_size = base_bdev->write_unit_size; + + bdev_node->bdev.md_interleave = base_bdev->md_interleave; + bdev_node->bdev.md_len = base_bdev->md_len; + bdev_node->bdev.dif_type = base_bdev->dif_type; + bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md; + bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags; + + bdev_node->bdev.zoned = true; + bdev_node->bdev.ctxt = bdev_node; + bdev_node->bdev.fn_table = &zone_block_fn_table; + bdev_node->bdev.module = &bdev_zoned_if; + + /* bdev specific info */ + bdev_node->bdev.zone_size = zone_size; + + bdev_node->zone_capacity = name->zone_capacity; + bdev_node->bdev.optimal_open_zones = name->optimal_open_zones; + bdev_node->bdev.max_open_zones = 0; + rc = zone_block_init_zone_info(bdev_node); + if (rc) { + SPDK_ERRLOG("could not init zone info\n"); + goto zone_info_failed; + } + + TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link); + + spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb, + sizeof(struct zone_block_io_channel), + name->vbdev_name); + + rc = spdk_bdev_open(base_bdev, true, zone_block_base_bdev_hotremove_cb, + base_bdev, &bdev_node->base_desc); + if (rc) { + SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(base_bdev)); + goto open_failed; + } + + /* Save the thread where the base device is opened */ + bdev_node->thread = spdk_get_thread(); + + rc = spdk_bdev_module_claim_bdev(base_bdev, bdev_node->base_desc, bdev_node->bdev.module); + if (rc) { + SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(base_bdev)); + 
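/* Claiming failed: unwind through the error labels below. */ +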
goto claim_failed; + } + + rc = spdk_bdev_register(&bdev_node->bdev); + if (rc) { + SPDK_ERRLOG("could not register zoned bdev\n"); + goto register_failed; + } + } + + return rc; + +register_failed: + spdk_bdev_module_release_bdev(&bdev_node->bdev); +claim_failed: + spdk_bdev_close(bdev_node->base_desc); +open_failed: + TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link); + spdk_io_device_unregister(bdev_node, NULL); +zone_info_failed: + free(bdev_node->zones); +calloc_failed: +roundup_failed: + free(bdev_node->bdev.name); +strdup_failed: + free(bdev_node); +free_config: + zone_block_remove_config(name); + return rc; +} + +int +vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity, + uint64_t optimal_open_zones) +{ + struct spdk_bdev *bdev = NULL; + int rc = 0; + + if (zone_capacity == 0) { + SPDK_ERRLOG("Zone capacity can't be 0\n"); + return -EINVAL; + } + + if (optimal_open_zones == 0) { + SPDK_ERRLOG("Optimal open zones can't be 0\n"); + return -EINVAL; + } + + /* Insert the bdev into our global name list even if it doesn't exist yet, + * it may show up soon... + */ + rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones); + if (rc) { + return rc; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + /* This is not an error, even though the bdev is not present at this time it may + * still show up later. + */ + return 0; + } + + return zone_block_register(bdev); +} + +void +vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg) +{ + struct bdev_zone_block_config *name_node; + struct spdk_bdev *bdev = NULL; + + bdev = spdk_bdev_get_by_name(name); + if (!bdev || bdev->module != &bdev_zoned_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + TAILQ_FOREACH(name_node, &g_bdev_configs, link) { + if (strcmp(name_node->vbdev_name, bdev->name) == 0) { + zone_block_remove_config(name_node); + break; + } + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static void +zone_block_examine(struct spdk_bdev *bdev) +{ + zone_block_register(bdev); + + spdk_bdev_module_examine_done(&bdev_zoned_if); +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_zone_block", SPDK_LOG_VBDEV_ZONE_BLOCK) diff --git a/src/spdk/module/bdev/zone_block/vbdev_zone_block.h b/src/spdk/module/bdev/zone_block/vbdev_zone_block.h new file mode 100644 index 000000000..b4904c4f4 --- /dev/null +++ b/src/spdk/module/bdev/zone_block/vbdev_zone_block.h @@ -0,0 +1,47 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VBDEV_ZONE_BLOCK_H +#define SPDK_VBDEV_ZONE_BLOCK_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/bdev_module.h" + +int vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, + uint64_t zone_capacity, uint64_t optimal_open_zones); + +void vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg); + +#endif /* SPDK_VBDEV_ZONE_BLOCK_H */ diff --git a/src/spdk/module/bdev/zone_block/vbdev_zone_block_rpc.c b/src/spdk/module/bdev/zone_block/vbdev_zone_block_rpc.c new file mode 100644 index 000000000..b7f485190 --- /dev/null +++ b/src/spdk/module/bdev/zone_block/vbdev_zone_block_rpc.c @@ -0,0 +1,146 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "vbdev_zone_block.h" + +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/rpc.h" + +#include "spdk_internal/log.h" + +struct rpc_construct_zone_block { + char *name; + char *base_bdev; + uint64_t zone_capacity; + uint64_t optimal_open_zones; +}; + +static void +free_rpc_construct_zone_block(struct rpc_construct_zone_block *req) +{ + free(req->name); + free(req->base_bdev); +} + +static const struct spdk_json_object_decoder rpc_construct_zone_block_decoders[] = { + {"name", offsetof(struct rpc_construct_zone_block, name), spdk_json_decode_string}, + {"base_bdev", offsetof(struct rpc_construct_zone_block, base_bdev), spdk_json_decode_string}, + {"zone_capacity", offsetof(struct rpc_construct_zone_block, zone_capacity), spdk_json_decode_uint64}, + {"optimal_open_zones", offsetof(struct rpc_construct_zone_block, optimal_open_zones), spdk_json_decode_uint64}, +}; + +static void +rpc_zone_block_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_zone_block req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_zone_block_decoders, + SPDK_COUNTOF(rpc_construct_zone_block_decoders), + &req)) { + SPDK_ERRLOG("Failed to decode block create parameters"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto cleanup; + } + + rc = vbdev_zone_block_create(req.base_bdev, req.name, req.zone_capacity, + req.optimal_open_zones); + if (rc) { + SPDK_ERRLOG("Failed to create block zoned vbdev: %s", spdk_strerror(-rc)); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to create block zoned vbdev: %s", + spdk_strerror(-rc)); + goto cleanup; + } + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, req.name); + spdk_jsonrpc_end_result(request, w); + +cleanup: + free_rpc_construct_zone_block(&req); +} +SPDK_RPC_REGISTER("bdev_zone_block_create", rpc_zone_block_create, SPDK_RPC_RUNTIME) + +struct rpc_delete_zone_block { + char *name; +}; + +static void +free_rpc_delete_zone_block(struct rpc_delete_zone_block *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_zone_block_decoders[] = { + {"name", offsetof(struct rpc_delete_zone_block, name), spdk_json_decode_string}, +}; + +static void +_rpc_delete_zone_block_cb(void *cb_ctx, int rc) +{ + struct spdk_jsonrpc_request *request = cb_ctx; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_bool(w, rc == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +rpc_zone_block_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_zone_block attrs = {}; + + if (spdk_json_decode_object(params, rpc_delete_zone_block_decoders, + SPDK_COUNTOF(rpc_delete_zone_block_decoders), + &attrs)) { + SPDK_ERRLOG("Failed to decode block delete parameters"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto cleanup; + } + + vbdev_zone_block_delete(attrs.name, _rpc_delete_zone_block_cb, request); + +cleanup: + free_rpc_delete_zone_block(&attrs); +} +SPDK_RPC_REGISTER("bdev_zone_block_delete", rpc_zone_block_delete, SPDK_RPC_RUNTIME) |