summary refs log tree commit diff stats
path: root/src/spdk/module/bdev/raid/raid0.c
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/module/bdev/raid/raid0.c
parent Initial commit. (diff)
download ceph-upstream.tar.xz
ceph-upstream.zip
Adding upstream version 16.2.11+ds. (refs: upstream/16.2.11+ds, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/module/bdev/raid/raid0.c')
-rw-r--r-- src/spdk/module/bdev/raid/raid0.c 398
1 files changed, 398 insertions, 0 deletions
diff --git a/src/spdk/module/bdev/raid/raid0.c b/src/spdk/module/bdev/raid/raid0.c
new file mode 100644
index 000000000..5632c5b7c
--- /dev/null
+++ b/src/spdk/module/bdev/raid/raid0.c
@@ -0,0 +1,398 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_raid.h"
+
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+
+/*
+ * brief:
+ * raid0_bdev_io_completion is the completion callback handed to the bdev
+ * layer for a read/write child io. It frees the child io and then completes
+ * the parent raid io with the status that matches the child's outcome.
+ * params:
+ * bdev_io - pointer to the completed child bdev io
+ * success - true if the child io succeeded, false otherwise
+ * cb_arg - function callback context (parent raid_bdev_io)
+ * returns:
+ * none
+ */
+static void
+raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct raid_bdev_io *parent_io = cb_arg;
+
+	/* The child io is released before the parent is completed. */
+	spdk_bdev_free_io(bdev_io);
+
+	raid_bdev_io_complete(parent_io, success ?
+			      SPDK_BDEV_IO_STATUS_SUCCESS :
+			      SPDK_BDEV_IO_STATUS_FAILED);
+}
+
+/* Forward declaration: the adapter below calls back into this function. */
+static void
+raid0_submit_rw_request(struct raid_bdev_io *raid_io);
+
+/*
+ * Adapter with a void * context signature. It is the callback given to
+ * raid_bdev_queue_io_wait() so that a read/write request can be submitted
+ * again; it simply forwards the context to raid0_submit_rw_request().
+ */
+static void
+_raid0_submit_rw_request(void *_raid_io)
+{
+	raid0_submit_rw_request((struct raid_bdev_io *)_raid_io);
+}
+
+/*
+ * brief:
+ * raid0_submit_rw_request function is used to submit a read or write I/O to
+ * the correct member disk for raid0 bdevs. The request must not cross a strip
+ * boundary when the raid has more than one base bdev. Every failure path
+ * completes the parent I/O with SPDK_BDEV_IO_STATUS_FAILED, so the request
+ * cannot be left pending when asserts are compiled out.
+ * params:
+ * raid_io - parent raid I/O to submit
+ * returns:
+ * none
+ */
+static void
+raid0_submit_rw_request(struct raid_bdev_io *raid_io)
+{
+	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
+	struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
+	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
+	uint64_t pd_strip;
+	uint32_t offset_in_strip;
+	uint64_t pd_lba;
+	uint64_t pd_blocks;
+	uint8_t pd_idx;
+	int ret = 0;
+	uint64_t start_strip;
+	uint64_t end_strip;
+	struct raid_base_bdev_info *base_info;
+	struct spdk_io_channel *base_ch;
+
+	start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
+	end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
+		    raid_bdev->strip_size_shift;
+	if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
+		/* Log before asserting so the message is visible in debug builds too. */
+		SPDK_ERRLOG("I/O spans strip boundary!\n");
+		assert(false);
+		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+		return;
+	}
+
+	/* Strips are distributed round-robin over the base bdevs. */
+	pd_strip = start_strip / raid_bdev->num_base_bdevs;
+	pd_idx = start_strip % raid_bdev->num_base_bdevs;
+	offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
+	pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
+	pd_blocks = bdev_io->u.bdev.num_blocks;
+	base_info = &raid_bdev->base_bdev_info[pd_idx];
+	if (base_info->desc == NULL) {
+		SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
+		assert(0);
+		/* Without asserts, fail the I/O instead of submitting with a NULL descriptor. */
+		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+		return;
+	}
+
+	/*
+	 * Submit child io to bdev layer with using base bdev descriptors, base
+	 * bdev lba, base bdev child io length in blocks, buffer, completion
+	 * function and function callback context
+	 */
+	assert(raid_ch != NULL);
+	assert(raid_ch->base_channel);
+	base_ch = raid_ch->base_channel[pd_idx];
+	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+		ret = spdk_bdev_readv_blocks(base_info->desc, base_ch,
+					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+					     pd_lba, pd_blocks, raid0_bdev_io_completion,
+					     raid_io);
+	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+		ret = spdk_bdev_writev_blocks(base_info->desc, base_ch,
+					      bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+					      pd_lba, pd_blocks, raid0_bdev_io_completion,
+					      raid_io);
+	} else {
+		SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
+		assert(0);
+		/*
+		 * Nothing was submitted; report an error so the parent I/O is
+		 * failed below instead of being left pending forever.
+		 */
+		ret = -EIO;
+	}
+
+	if (ret == -ENOMEM) {
+		/* Out of resources: ask to be called again through the io_wait adapter. */
+		raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
+					_raid0_submit_rw_request);
+	} else if (ret != 0) {
+		SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
+		assert(false);
+		raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+	}
+}
+
+/*
+ * raid0 IO range: description of how one request maps onto the base bdevs.
+ * Filled in by _raid0_get_io_range() and consumed by _raid0_split_io_range().
+ */
+struct raid_bdev_io_range {
+ /* Strip size, in the same unit as the request's offset_blocks/num_blocks */
+ uint64_t strip_size;
+ /* Index, within a base bdev, of the strip that holds the first block */
+ uint64_t start_strip_in_disk;
+ /* Index, within a base bdev, of the strip that holds the last block */
+ uint64_t end_strip_in_disk;
+ /* Offset of the first block inside its strip */
+ uint64_t start_offset_in_strip;
+ /* Offset of the last block (inclusive) inside its strip */
+ uint64_t end_offset_in_strip;
+ /* Index of the base bdev that holds the first strip */
+ uint8_t start_disk;
+ /* Index of the base bdev that holds the last strip */
+ uint8_t end_disk;
+ /* Number of base bdevs touched by the request (at most num_base_bdevs) */
+ uint8_t n_disks_involved;
+};
+
+/*
+ * Describe how the block range [offset_blocks, offset_blocks + num_blocks)
+ * of a raid0 bdev maps onto its base bdevs and store the result in io_range.
+ *
+ * io_range - output description of the range
+ * num_base_bdevs - number of base bdevs in the raid
+ * strip_size - strip size in blocks
+ * strip_size_shift - shift used to turn a block offset into a strip index
+ * offset_blocks - first block of the request
+ * num_blocks - length of the request in blocks
+ */
+static inline void
+_raid0_get_io_range(struct raid_bdev_io_range *io_range,
+		    uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
+		    uint64_t offset_blocks, uint64_t num_blocks)
+{
+	/* Last block of the request, inclusive. */
+	const uint64_t last_block = offset_blocks + num_blocks - 1;
+	/* First and last strip index in raid0 bdev scope. */
+	const uint64_t first_strip = offset_blocks >> strip_size_shift;
+	const uint64_t last_strip = last_block >> strip_size_shift;
+
+	io_range->strip_size = strip_size;
+
+	/* Where the request starts: strip within the base bdev, base bdev index,
+	 * and the (possibly unaligned) offset inside that first strip.
+	 */
+	io_range->start_strip_in_disk = first_strip / num_base_bdevs;
+	io_range->start_disk = first_strip % num_base_bdevs;
+	io_range->start_offset_in_strip = offset_blocks % strip_size;
+
+	/* Where the request ends, expressed the same way. Strips in between are
+	 * always covered completely.
+	 */
+	io_range->end_strip_in_disk = last_strip / num_base_bdevs;
+	io_range->end_disk = last_strip % num_base_bdevs;
+	io_range->end_offset_in_strip = last_block % strip_size;
+
+	/* One base bdev per strip touched, capped at the number of base bdevs;
+	 * this is 1 when the first and last strip are the same one.
+	 */
+	io_range->n_disks_involved = spdk_min((last_strip - first_strip + 1), num_base_bdevs);
+}
+
+/*
+ * Compute the part of an I/O range that falls on one base bdev.
+ *
+ * io_range - range description produced by _raid0_get_io_range()
+ * disk_idx - index of the base bdev to compute the sub-range for
+ * _offset_in_disk - output: first block of the sub-range on that base bdev
+ * _nblocks_in_disk - output: number of blocks of the sub-range
+ */
+static inline void
+_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
+		      uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
+{
+	uint64_t n_strips_in_disk;
+	uint64_t start_offset_in_disk;
+	uint64_t end_offset_in_disk;
+	uint64_t offset_in_disk;
+	uint64_t nblocks_in_disk;
+	uint64_t start_strip_in_disk;
+	uint64_t end_strip_in_disk;
+
+	/* A base bdev located before the start disk gets its first strip one row later. */
+	start_strip_in_disk = io_range->start_strip_in_disk;
+	if (disk_idx < io_range->start_disk) {
+		start_strip_in_disk += 1;
+	}
+
+	/* A base bdev located after the end disk gets its last strip one row earlier. */
+	end_strip_in_disk = io_range->end_strip_in_disk;
+	if (disk_idx > io_range->end_disk) {
+		end_strip_in_disk -= 1;
+	}
+
+	assert(end_strip_in_disk >= start_strip_in_disk);
+	n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;
+
+	/* Only the start disk can begin at an unaligned offset inside a strip. */
+	if (disk_idx == io_range->start_disk) {
+		start_offset_in_disk = io_range->start_offset_in_strip;
+	} else {
+		start_offset_in_disk = 0;
+	}
+
+	/* Only the end disk can stop before the last block of a strip. */
+	if (disk_idx == io_range->end_disk) {
+		end_offset_in_disk = io_range->end_offset_in_strip;
+	} else {
+		end_offset_in_disk = io_range->strip_size - 1;
+	}
+
+	offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
+	nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
+			  + end_offset_in_disk - start_offset_in_disk + 1;
+
+	/*
+	 * uint64_t is not unsigned long on every platform, so the values are
+	 * cast to unsigned long long and printed with %llx.
+	 */
+	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0,
+		      "raid_bdev (strip_size 0x%llx) splits IO to base_bdev (%u) at (0x%llx, 0x%llx).\n",
+		      (unsigned long long)io_range->strip_size, disk_idx,
+		      (unsigned long long)offset_in_disk,
+		      (unsigned long long)nblocks_in_disk);
+
+	*_offset_in_disk = offset_in_disk;
+	*_nblocks_in_disk = nblocks_in_disk;
+}
+
+/* Forward declaration: the adapter below calls back into this function. */
+static void
+raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);
+
+/*
+ * Adapter with a void * context signature. It is the callback given to
+ * raid_bdev_queue_io_wait() so that a null-payload request can continue its
+ * submission; it forwards the context to raid0_submit_null_payload_request().
+ */
+static void
+_raid0_submit_null_payload_request(void *_raid_io)
+{
+	raid0_submit_null_payload_request((struct raid_bdev_io *)_raid_io);
+}
+
+/*
+ * Completion callback for a base io issued by
+ * raid0_submit_null_payload_request(). It reports the outcome of this base io
+ * to the parent raid io through raid_bdev_io_complete_part() and then frees
+ * the base io.
+ *
+ * bdev_io - completed base bdev io
+ * success - true if the base io succeeded
+ * cb_arg - callback context (parent raid_bdev_io)
+ */
+static void
+raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+	struct raid_bdev_io *parent_io = cb_arg;
+
+	/* NOTE(review): the literal 1 is presumably the number of base ios
+	 * completed by this callback; confirm against raid_bdev_io_complete_part().
+	 */
+	if (success) {
+		raid_bdev_io_complete_part(parent_io, 1, SPDK_BDEV_IO_STATUS_SUCCESS);
+	} else {
+		raid_bdev_io_complete_part(parent_io, 1, SPDK_BDEV_IO_STATUS_FAILED);
+	}
+
+	spdk_bdev_free_io(bdev_io);
+}
+
+/*
+ * brief:
+ * raid0_submit_null_payload_request function submits the next batch of
+ * io requests with range but without payload, like FLUSH and UNMAP, to member disks;
+ * it will submit as many as possible unless one base io request fails with -ENOMEM,
+ * in which case it will queue itself for later submission.
+ * params:
+ * raid_io - pointer to parent raid_bdev_io on raid bdev device
+ * returns:
+ * none
+ */
+static void
+raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
+{
+	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
+	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
+	struct raid_bdev_io_range io_range;
+
+	_raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
+			    raid_bdev->strip_size, raid_bdev->strip_size_shift,
+			    bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
+
+	/* Set the number of expected base io completions when the counter is zero. */
+	if (raid_io->base_bdev_io_remaining == 0) {
+		raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
+	}
+
+	/* base_bdev_io_submitted lives in raid_io, so a later call continues
+	 * from the first base bdev that has not been submitted to yet.
+	 */
+	while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
+		struct raid_base_bdev_info *base_info;
+		struct spdk_io_channel *base_ch;
+		uint64_t offset_in_disk;
+		uint64_t nblocks_in_disk;
+		uint8_t disk_idx;
+		int ret;
+
+		/* base_bdev is started from start_disk to end_disk.
+		 * It is possible that index of start_disk is larger than end_disk's.
+		 */
+		disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
+		base_info = &raid_bdev->base_bdev_info[disk_idx];
+		base_ch = raid_io->raid_ch->base_channel[disk_idx];
+
+		_raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);
+
+		if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) {
+			ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
+						     offset_in_disk, nblocks_in_disk,
+						     raid0_base_io_complete, raid_io);
+		} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_FLUSH) {
+			ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
+						     offset_in_disk, nblocks_in_disk,
+						     raid0_base_io_complete, raid_io);
+		} else {
+			SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
+			assert(false);
+			ret = -EIO;
+		}
+
+		if (ret == -ENOMEM) {
+			/* Out of resources: continue later through the io_wait adapter. */
+			raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
+						_raid0_submit_null_payload_request);
+			return;
+		}
+
+		if (ret != 0) {
+			/* NOTE(review): base ios submitted in earlier iterations may still
+			 * be in flight here; confirm raid_bdev_io_complete() tolerates
+			 * their later completions.
+			 */
+			SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
+			assert(false);
+			raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+			return;
+		}
+
+		raid_io->base_bdev_io_submitted++;
+	}
+}
+
+/*
+ * Start callback of the raid0 module: derives the raid bdev's block count
+ * and I/O splitting parameters from its base bdevs.
+ *
+ * raid_bdev - raid bdev being started
+ * returns 0 (this function has no failure path)
+ */
+static int raid0_start(struct raid_bdev *raid_bdev)
+{
+	uint64_t min_blockcnt = UINT64_MAX;
+	struct raid_base_bdev_info *base_info;
+
+	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+		/* Calculate minimum block count from all base bdevs */
+		min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
+	}
+
+	/*
+	 * Take the minimum block count based approach where total block count
+	 * of raid bdev is the number of base bdev times the minimum block count
+	 * of any base bdev.
+	 *
+	 * uint64_t is not unsigned long on every platform, so min_blockcnt is
+	 * cast to unsigned long long and printed with %llu.
+	 */
+	SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0, "min blockcount %llu, numbasedev %u, strip size shift %u\n",
+		      (unsigned long long)min_blockcnt, raid_bdev->num_base_bdevs,
+		      raid_bdev->strip_size_shift);
+	/* Round the per-disk block count down to a whole number of strips. */
+	raid_bdev->bdev.blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
+				    raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs;
+
+	if (raid_bdev->num_base_bdevs > 1) {
+		/* Split at strip boundaries: raid0_submit_rw_request() rejects I/O
+		 * that spans a strip boundary when there is more than one base bdev.
+		 */
+		raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
+		raid_bdev->bdev.split_on_optimal_io_boundary = true;
+	} else {
+		/* Do not need to split reads/writes on single bdev RAID modules. */
+		raid_bdev->bdev.optimal_io_boundary = 0;
+		raid_bdev->bdev.split_on_optimal_io_boundary = false;
+	}
+
+	return 0;
+}
+
+/*
+ * raid0 module descriptor: ties the RAID0 level to the start and submit
+ * functions defined in this file. base_bdevs_min is 1, so a raid0 bdev with a
+ * single base bdev is allowed.
+ */
+static struct raid_bdev_module g_raid0_module = {
+ .level = RAID0,
+ .base_bdevs_min = 1,
+ .start = raid0_start,
+ .submit_rw_request = raid0_submit_rw_request,
+ .submit_null_payload_request = raid0_submit_null_payload_request,
+};
+/* Hand the descriptor to the raid framework's registration macro. */
+RAID_MODULE_REGISTER(&g_raid0_module)
+
+/* Log component named "bdev_raid0", used as SPDK_LOG_BDEV_RAID0 by SPDK_DEBUGLOG above. */
+SPDK_LOG_REGISTER_COMPONENT("bdev_raid0", SPDK_LOG_BDEV_RAID0)