From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/spdk/lib/ftl/Makefile | 47 + src/spdk/lib/ftl/ftl_addr.h | 76 ++ src/spdk/lib/ftl/ftl_band.c | 1097 ++++++++++++++++++ src/spdk/lib/ftl/ftl_band.h | 287 +++++ src/spdk/lib/ftl/ftl_core.c | 2460 ++++++++++++++++++++++++++++++++++++++++ src/spdk/lib/ftl/ftl_core.h | 552 +++++++++ src/spdk/lib/ftl/ftl_debug.c | 169 +++ src/spdk/lib/ftl/ftl_debug.h | 73 ++ src/spdk/lib/ftl/ftl_init.c | 1688 +++++++++++++++++++++++++++ src/spdk/lib/ftl/ftl_io.c | 563 +++++++++ src/spdk/lib/ftl/ftl_io.h | 351 ++++++ src/spdk/lib/ftl/ftl_reloc.c | 860 ++++++++++++++ src/spdk/lib/ftl/ftl_reloc.h | 53 + src/spdk/lib/ftl/ftl_restore.c | 1350 ++++++++++++++++++++++ src/spdk/lib/ftl/ftl_trace.c | 361 ++++++ src/spdk/lib/ftl/ftl_trace.h | 84 ++ src/spdk/lib/ftl/spdk_ftl.map | 14 + 17 files changed, 10085 insertions(+) create mode 100644 src/spdk/lib/ftl/Makefile create mode 100644 src/spdk/lib/ftl/ftl_addr.h create mode 100644 src/spdk/lib/ftl/ftl_band.c create mode 100644 src/spdk/lib/ftl/ftl_band.h create mode 100644 src/spdk/lib/ftl/ftl_core.c create mode 100644 src/spdk/lib/ftl/ftl_core.h create mode 100644 src/spdk/lib/ftl/ftl_debug.c create mode 100644 src/spdk/lib/ftl/ftl_debug.h create mode 100644 src/spdk/lib/ftl/ftl_init.c create mode 100644 src/spdk/lib/ftl/ftl_io.c create mode 100644 src/spdk/lib/ftl/ftl_io.h create mode 100644 src/spdk/lib/ftl/ftl_reloc.c create mode 100644 src/spdk/lib/ftl/ftl_reloc.h create mode 100644 src/spdk/lib/ftl/ftl_restore.c create mode 100644 src/spdk/lib/ftl/ftl_trace.c create mode 100644 src/spdk/lib/ftl/ftl_trace.h create mode 100644 src/spdk/lib/ftl/spdk_ftl.map (limited to 'src/spdk/lib/ftl') diff --git a/src/spdk/lib/ftl/Makefile b/src/spdk/lib/ftl/Makefile new file mode 100644 index 000000000..c24274622 --- /dev/null +++ 
b/src/spdk/lib/ftl/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 2 +SO_MINOR := 0 + +C_SRCS = ftl_band.c ftl_core.c ftl_debug.c ftl_io.c ftl_reloc.c \ + ftl_restore.c ftl_init.c ftl_trace.c + +SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_ftl.map) + +LIBNAME = ftl + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/ftl/ftl_addr.h b/src/spdk/lib/ftl/ftl_addr.h new file mode 100644 index 000000000..36d2ffb00 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_addr.h @@ -0,0 +1,76 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_ADDR_H +#define FTL_ADDR_H + +#include "spdk/stdinc.h" + +/* Marks address as invalid */ +#define FTL_ADDR_INVALID (-1) +/* Marks LBA as invalid */ +#define FTL_LBA_INVALID ((uint64_t)-1) +/* Smallest data unit size */ +#define FTL_BLOCK_SIZE 4096 + +/* This structure represents on-disk address. It can have one of the following */ +/* formats: */ +/* - offset inside the disk */ +/* - cache_offset inside the cache (indicated by the cached flag) */ +/* - packed version of the two formats above (can be only used when the */ +/* offset can be represented in less than 32 bits) */ +/* Packed format is used, when possible, to avoid wasting RAM on the L2P table. */ +struct ftl_addr { + union { + struct { + uint64_t cache_offset : 63; + uint64_t cached : 1; + }; + + struct { + union { + struct { + uint32_t cache_offset : 31; + uint32_t cached : 1; + }; + + uint32_t offset; + }; + uint32_t rsvd; + } pack; + + uint64_t offset; + }; +}; + +#endif /* FTL_ADDR_H */ diff --git a/src/spdk/lib/ftl/ftl_band.c b/src/spdk/lib/ftl/ftl_band.c new file mode 100644 index 000000000..62221dcf6 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_band.c @@ -0,0 +1,1097 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/crc32.h" +#include "spdk/likely.h" +#include "spdk/util.h" +#include "spdk/ftl.h" + +#include "ftl_band.h" +#include "ftl_io.h" +#include "ftl_core.h" +#include "ftl_reloc.h" +#include "ftl_debug.h" + +/* TODO: define some signature for meta version */ +#define FTL_MD_VER 1 + +struct __attribute__((packed)) ftl_md_hdr { + /* Device instance */ + struct spdk_uuid uuid; + + /* Meta version */ + uint8_t ver; + + /* Sequence number */ + uint64_t seq; + + /* CRC32 checksum */ + uint32_t checksum; +}; + +/* End metadata layout stored on media (with all three being aligned to block size): */ +/* - header */ +/* - valid bitmap */ +/* - LBA map */ +struct __attribute__((packed)) ftl_tail_md { + struct ftl_md_hdr hdr; + + /* Max number of blocks */ + uint64_t num_blocks; + + uint8_t reserved[4059]; +}; +SPDK_STATIC_ASSERT(sizeof(struct ftl_tail_md) == FTL_BLOCK_SIZE, "Incorrect metadata size"); + +struct __attribute__((packed)) ftl_head_md { + struct ftl_md_hdr hdr; + + /* Number of defrag cycles */ + uint64_t wr_cnt; + + /* Number of surfaced LBAs */ + uint64_t lba_cnt; + + /* Transfer size */ + uint32_t xfer_size; +}; + +size_t +ftl_tail_md_hdr_num_blocks(void) +{ + return spdk_divide_round_up(sizeof(struct ftl_tail_md), FTL_BLOCK_SIZE); +} + +size_t +ftl_vld_map_num_blocks(const struct spdk_ftl_dev *dev) +{ + return spdk_divide_round_up(ftl_vld_map_size(dev), FTL_BLOCK_SIZE); +} + +size_t +ftl_lba_map_num_blocks(const struct spdk_ftl_dev *dev) +{ + return spdk_divide_round_up(ftl_get_num_blocks_in_band(dev) * sizeof(uint64_t), FTL_BLOCK_SIZE); +} + +size_t +ftl_head_md_num_blocks(const struct spdk_ftl_dev *dev) +{ + return dev->xfer_size; +} + +size_t +ftl_tail_md_num_blocks(const struct spdk_ftl_dev *dev) +{ + return spdk_divide_round_up(ftl_tail_md_hdr_num_blocks() + + ftl_vld_map_num_blocks(dev) + + ftl_lba_map_num_blocks(dev), + dev->xfer_size) * dev->xfer_size; +} + +static uint64_t +ftl_band_tail_md_offset(const struct ftl_band *band) +{ + 
return ftl_band_num_usable_blocks(band) - + ftl_tail_md_num_blocks(band->dev); +} + +int +ftl_band_full(struct ftl_band *band, size_t offset) +{ + return offset == ftl_band_tail_md_offset(band); +} + +void +ftl_band_write_failed(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + band->high_prio = 1; + + ftl_reloc_add(dev->reloc, band, 0, ftl_get_num_blocks_in_band(dev), 1, true); + ftl_band_set_state(band, FTL_BAND_STATE_CLOSED); +} + +static void +ftl_band_free_lba_map(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + + assert(band->state == FTL_BAND_STATE_CLOSED || + band->state == FTL_BAND_STATE_FREE); + assert(lba_map->ref_cnt == 0); + assert(lba_map->map != NULL); + assert(!band->high_prio); + + /* Verify that band's metadata is consistent with l2p */ + if (band->num_zones) { + assert(ftl_band_validate_md(band) == true); + } + + spdk_mempool_put(dev->lba_pool, lba_map->dma_buf); + lba_map->map = NULL; + lba_map->dma_buf = NULL; +} + +static void +_ftl_band_set_free(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_band *lband, *prev = NULL; + + /* Remove the band from the closed band list */ + LIST_REMOVE(band, list_entry); + + /* Keep the list sorted by band's write count */ + LIST_FOREACH(lband, &dev->free_bands, list_entry) { + if (lband->wr_cnt > band->wr_cnt) { + LIST_INSERT_BEFORE(lband, band, list_entry); + break; + } + prev = lband; + } + + if (!lband) { + if (LIST_EMPTY(&dev->free_bands)) { + LIST_INSERT_HEAD(&dev->free_bands, band, list_entry); + } else { + LIST_INSERT_AFTER(prev, band, list_entry); + } + } + +#if defined(DEBUG) + prev = NULL; /* debug-only check: free list must be in ascending wr_cnt order */ + LIST_FOREACH(lband, &dev->free_bands, list_entry) { + if (prev) { + assert(prev->wr_cnt <= lband->wr_cnt); + } + prev = lband; /* advance, so the next iteration compares an adjacent pair */ + } +#endif + dev->num_free++; + ftl_apply_limits(dev); +} + +static void +_ftl_band_set_preparing(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + /* Remove 
band from free list */ + LIST_REMOVE(band, list_entry); + + band->wr_cnt++; + + assert(dev->num_free > 0); + dev->num_free--; + + ftl_apply_limits(dev); +} + +static void +_ftl_band_set_closed(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + /* Set the state as free_md() checks for that */ + band->state = FTL_BAND_STATE_CLOSED; + + /* Free the lba map if there are no outstanding IOs */ + ftl_band_release_lba_map(band); + + if (spdk_likely(band->num_zones)) { + LIST_INSERT_HEAD(&dev->shut_bands, band, list_entry); + } else { + LIST_REMOVE(band, list_entry); + } +} + +static uint32_t +ftl_md_calc_crc(const struct ftl_md_hdr *hdr, size_t size) +{ + size_t checkoff = offsetof(struct ftl_md_hdr, checksum); + size_t mdoff = checkoff + sizeof(hdr->checksum); + uint32_t crc; + + crc = spdk_crc32c_update(hdr, checkoff, 0); + return spdk_crc32c_update((const char *)hdr + mdoff, size - mdoff, crc); +} + +static void +ftl_set_md_hdr(struct ftl_band *band, struct ftl_md_hdr *hdr, size_t size) +{ + hdr->seq = band->seq; + hdr->ver = FTL_MD_VER; + hdr->uuid = band->dev->uuid; + hdr->checksum = ftl_md_calc_crc(hdr, size); +} + +static int +ftl_pack_head_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_head_md *head = band->lba_map.dma_buf; + + head->wr_cnt = band->wr_cnt; + head->lba_cnt = dev->num_lbas; + head->xfer_size = dev->xfer_size; + ftl_set_md_hdr(band, &head->hdr, sizeof(struct ftl_head_md)); + + return FTL_MD_SUCCESS; +} + +static int +ftl_pack_tail_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_tail_md *tail = lba_map->dma_buf; + void *vld_offset; + + vld_offset = (char *)tail + ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE; + + /* Clear out the buffer */ + memset(tail, 0, ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE); + tail->num_blocks = ftl_get_num_blocks_in_band(dev); + + pthread_spin_lock(&lba_map->lock); + 
spdk_bit_array_store_mask(lba_map->vld, vld_offset); + pthread_spin_unlock(&lba_map->lock); + + ftl_set_md_hdr(band, &tail->hdr, ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE); + + return FTL_MD_SUCCESS; +} + +static int +ftl_md_hdr_vld(struct spdk_ftl_dev *dev, const struct ftl_md_hdr *hdr, size_t size) +{ + if (spdk_uuid_compare(&dev->uuid, &hdr->uuid) != 0) { + return FTL_MD_NO_MD; + } + + if (hdr->ver != FTL_MD_VER) { + return FTL_MD_INVALID_VER; + } + + if (ftl_md_calc_crc(hdr, size) != hdr->checksum) { + return FTL_MD_INVALID_CRC; + } + + return FTL_MD_SUCCESS; +} + +static int +ftl_unpack_tail_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + void *vld_offset; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_tail_md *tail = lba_map->dma_buf; + int rc; + + vld_offset = (char *)tail + ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE; + + rc = ftl_md_hdr_vld(dev, &tail->hdr, ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE); + if (rc) { + return rc; + } + + /* + * When restoring from a dirty shutdown it's possible old tail meta wasn't yet cleared - + * band had saved head meta, but didn't manage to send erase to all zones. + * The already found tail md header is valid, but inconsistent with the head meta. Treat + * such a band as open/without valid tail md. 
+ */ + if (band->seq != tail->hdr.seq) { + return FTL_MD_NO_MD; + } + + if (tail->num_blocks != ftl_get_num_blocks_in_band(dev)) { + return FTL_MD_INVALID_SIZE; + } + + spdk_bit_array_load_mask(lba_map->vld, vld_offset); + + return FTL_MD_SUCCESS; +} + +static int +ftl_unpack_head_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_head_md *head = band->lba_map.dma_buf; + int rc; + + rc = ftl_md_hdr_vld(dev, &head->hdr, sizeof(struct ftl_head_md)); + if (rc) { + return rc; + } + + band->seq = head->hdr.seq; + band->wr_cnt = head->wr_cnt; + + if (dev->global_md.num_lbas == 0) { + dev->global_md.num_lbas = head->lba_cnt; + } + + if (dev->global_md.num_lbas != head->lba_cnt) { + return FTL_MD_INVALID_SIZE; + } + + if (dev->xfer_size != head->xfer_size) { + return FTL_MD_INVALID_SIZE; + } + + return FTL_MD_SUCCESS; +} + +struct ftl_addr +ftl_band_tail_md_addr(struct ftl_band *band) +{ + struct ftl_addr addr = {}; + struct ftl_zone *zone; + struct spdk_ftl_dev *dev = band->dev; + size_t xfer_size = dev->xfer_size; + size_t num_req = ftl_band_tail_md_offset(band) / xfer_size; + size_t i; + + if (spdk_unlikely(!band->num_zones)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + + /* Metadata should be aligned to xfer size */ + assert(ftl_band_tail_md_offset(band) % xfer_size == 0); + + zone = CIRCLEQ_FIRST(&band->zones); + for (i = 0; i < num_req % band->num_zones; ++i) { + zone = ftl_band_next_zone(band, zone); + } + + addr.offset = (num_req / band->num_zones) * xfer_size; + addr.offset += zone->info.zone_id; + + return addr; +} + +struct ftl_addr +ftl_band_head_md_addr(struct ftl_band *band) +{ + if (spdk_unlikely(!band->num_zones)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + + return ftl_to_addr(CIRCLEQ_FIRST(&band->zones)->info.zone_id); +} + +void +ftl_band_set_state(struct ftl_band *band, enum ftl_band_state state) +{ + switch (state) { + case FTL_BAND_STATE_FREE: + assert(band->state == FTL_BAND_STATE_CLOSED); + 
_ftl_band_set_free(band); + break; + + case FTL_BAND_STATE_PREP: + assert(band->state == FTL_BAND_STATE_FREE); + _ftl_band_set_preparing(band); + break; + + case FTL_BAND_STATE_CLOSED: + if (band->state != FTL_BAND_STATE_CLOSED) { + assert(band->state == FTL_BAND_STATE_CLOSING || band->high_prio); + _ftl_band_set_closed(band); + } + break; + + default: + break; + } + + band->state = state; +} + +void +ftl_band_set_addr(struct ftl_band *band, uint64_t lba, struct ftl_addr addr) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + uint64_t offset; + + assert(lba != FTL_LBA_INVALID); + + offset = ftl_band_block_offset_from_addr(band, addr); + pthread_spin_lock(&lba_map->lock); + + lba_map->num_vld++; + lba_map->map[offset] = lba; + spdk_bit_array_set(lba_map->vld, offset); + + pthread_spin_unlock(&lba_map->lock); +} + +size_t +ftl_band_age(const struct ftl_band *band) +{ + return (size_t)(band->dev->seq - band->seq); +} + +size_t +ftl_band_num_usable_blocks(const struct ftl_band *band) +{ + return band->num_zones * ftl_get_num_blocks_in_zone(band->dev); +} + +size_t +ftl_band_user_blocks_left(const struct ftl_band *band, size_t offset) +{ + size_t tail_md_offset = ftl_band_tail_md_offset(band); + + if (spdk_unlikely(offset <= ftl_head_md_num_blocks(band->dev))) { + return ftl_band_user_blocks(band); + } + + if (spdk_unlikely(offset > tail_md_offset)) { + return 0; + } + + return tail_md_offset - offset; +} + +size_t +ftl_band_user_blocks(const struct ftl_band *band) +{ + return ftl_band_num_usable_blocks(band) - + ftl_head_md_num_blocks(band->dev) - + ftl_tail_md_num_blocks(band->dev); +} + +struct ftl_band * +ftl_band_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + size_t band_id = ftl_addr_get_band(dev, addr); + + assert(band_id < ftl_get_num_bands(dev)); + return &dev->bands[band_id]; +} + +struct ftl_zone * +ftl_band_zone_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + size_t pu_id = ftl_addr_get_punit(band->dev, addr); + + assert(pu_id 
< ftl_get_num_punits(band->dev)); + return &band->zone_buf[pu_id]; +} + +uint64_t +ftl_band_block_offset_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + assert(ftl_addr_get_band(band->dev, addr) == band->id); + assert(ftl_addr_get_punit(band->dev, addr) < ftl_get_num_punits(band->dev)); + return addr.offset % ftl_get_num_blocks_in_band(band->dev); +} + +struct ftl_addr +ftl_band_next_xfer_addr(struct ftl_band *band, struct ftl_addr addr, size_t num_blocks) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_zone *zone; + size_t num_xfers, num_stripes; + uint64_t offset; + + assert(ftl_addr_get_band(dev, addr) == band->id); + + offset = ftl_addr_get_zone_offset(dev, addr); + zone = ftl_band_zone_from_addr(band, addr); + + num_blocks += (offset % dev->xfer_size); + offset -= (offset % dev->xfer_size); + +#if defined(DEBUG) + /* Check that the number of zones has not been changed */ + struct ftl_zone *_zone; + size_t _num_zones = 0; + CIRCLEQ_FOREACH(_zone, &band->zones, circleq) { + if (spdk_likely(_zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE)) { + _num_zones++; + } + } + assert(band->num_zones == _num_zones); +#endif + assert(band->num_zones != 0); + num_stripes = (num_blocks / dev->xfer_size) / band->num_zones; + offset += num_stripes * dev->xfer_size; + num_blocks -= num_stripes * dev->xfer_size * band->num_zones; + + if (offset > ftl_get_num_blocks_in_zone(dev)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + + num_xfers = num_blocks / dev->xfer_size; + for (size_t i = 0; i < num_xfers; ++i) { + /* When the last zone is reached the block part of the address */ + /* needs to be increased by xfer_size */ + if (ftl_band_zone_is_last(band, zone)) { + offset += dev->xfer_size; + if (offset > ftl_get_num_blocks_in_zone(dev)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + } + + zone = ftl_band_next_operational_zone(band, zone); + assert(zone); + + num_blocks -= dev->xfer_size; + } + + if (num_blocks) { + offset += num_blocks; + if (offset > 
ftl_get_num_blocks_in_zone(dev)) { + return ftl_to_addr(FTL_ADDR_INVALID); + } + } + + addr.offset = zone->info.zone_id + offset; + return addr; +} + +static size_t +ftl_xfer_offset_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + struct ftl_zone *zone, *current_zone; + unsigned int punit_offset = 0; + size_t num_stripes, xfer_size = band->dev->xfer_size; + uint64_t offset; + + assert(ftl_addr_get_band(band->dev, addr) == band->id); + + offset = ftl_addr_get_zone_offset(band->dev, addr); + num_stripes = (offset / xfer_size) * band->num_zones; + + current_zone = ftl_band_zone_from_addr(band, addr); + CIRCLEQ_FOREACH(zone, &band->zones, circleq) { + if (current_zone == zone) { + break; + } + punit_offset++; + } + + return xfer_size * (num_stripes + punit_offset) + offset % xfer_size; +} + +struct ftl_addr +ftl_band_addr_from_block_offset(struct ftl_band *band, uint64_t block_off) +{ + struct ftl_addr addr = { .offset = 0 }; + + addr.offset = block_off + band->id * ftl_get_num_blocks_in_band(band->dev); + return addr; +} + +struct ftl_addr +ftl_band_next_addr(struct ftl_band *band, struct ftl_addr addr, size_t offset) +{ + uint64_t block_off = ftl_band_block_offset_from_addr(band, addr); + return ftl_band_addr_from_block_offset(band, block_off + offset); +} + +void +ftl_band_acquire_lba_map(struct ftl_band *band) +{ + assert(band->lba_map.map != NULL); + band->lba_map.ref_cnt++; +} + +int +ftl_band_alloc_lba_map(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + + assert(lba_map->ref_cnt == 0); + assert(lba_map->map == NULL); + + lba_map->dma_buf = spdk_mempool_get(dev->lba_pool); + + if (!lba_map->dma_buf) { + return -1; + } + + memset(lba_map->dma_buf, 0, ftl_lba_map_pool_elem_size(band->dev)); + + lba_map->map = (uint64_t *)((char *)lba_map->dma_buf + FTL_BLOCK_SIZE * + (ftl_tail_md_hdr_num_blocks() + ftl_vld_map_num_blocks(dev))); + + lba_map->segments = (char *)lba_map->dma_buf + 
ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE; + + ftl_band_acquire_lba_map(band); + return 0; +} + +void +ftl_band_release_lba_map(struct ftl_band *band) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + + assert(lba_map->map != NULL); + assert(lba_map->ref_cnt > 0); + lba_map->ref_cnt--; + + if (lba_map->ref_cnt == 0) { + ftl_band_free_lba_map(band); + } +} + +static void +ftl_read_md_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_md_io *md_io = (struct ftl_md_io *)io; + + if (!status) { + status = md_io->pack_fn(md_io->io.band); + } else { + status = FTL_MD_IO_FAILURE; + } + + md_io->cb_fn(io, md_io->cb_ctx, status); +} + +static struct ftl_md_io * +ftl_io_init_md_read(struct spdk_ftl_dev *dev, struct ftl_addr addr, + struct ftl_band *band, size_t num_blocks, void *buf, + ftl_io_fn fn, ftl_md_pack_fn pack_fn, ftl_io_fn cb_fn, void *cb_ctx) +{ + struct ftl_md_io *io; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .band = band, + .size = sizeof(*io), + .flags = FTL_IO_MD | FTL_IO_PHYSICAL_MODE, + .type = FTL_IO_READ, + .num_blocks = num_blocks, + .cb_fn = fn, + .iovs = { + { + .iov_base = buf, + .iov_len = num_blocks * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + }; + + io = (struct ftl_md_io *)ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->io.addr = addr; + io->pack_fn = pack_fn; + io->cb_fn = cb_fn; + io->cb_ctx = cb_ctx; + + return io; +} + +static struct ftl_io * +ftl_io_init_md_write(struct spdk_ftl_dev *dev, struct ftl_band *band, + void *data, size_t num_blocks, ftl_io_fn cb) +{ + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .band = band, + .size = sizeof(struct ftl_io), + .flags = FTL_IO_MD | FTL_IO_PHYSICAL_MODE, + .type = FTL_IO_WRITE, + .num_blocks = num_blocks, + .cb_fn = cb, + .iovs = { + { + .iov_base = data, + .iov_len = num_blocks * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + .md = NULL, + }; + + return ftl_io_init_internal(&opts); +} + +static int +ftl_band_write_md(struct 
ftl_band *band, size_t num_blocks, + ftl_md_pack_fn md_fn, ftl_io_fn cb) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_io *io; + + io = ftl_io_init_md_write(dev, band, band->lba_map.dma_buf, num_blocks, cb); + if (!io) { + return -ENOMEM; + } + + md_fn(band); + + ftl_io_write(io); + return 0; +} + +void +ftl_band_md_clear(struct ftl_band *band) +{ + band->seq = 0; + band->wr_cnt = 0; + band->lba_map.num_vld = 0; + band->lba_map.map = NULL; +} + +int +ftl_band_write_head_md(struct ftl_band *band, ftl_io_fn cb) +{ + return ftl_band_write_md(band, ftl_head_md_num_blocks(band->dev), + ftl_pack_head_md, cb); +} + +int +ftl_band_write_tail_md(struct ftl_band *band, ftl_io_fn cb) +{ + return ftl_band_write_md(band, ftl_tail_md_num_blocks(band->dev), + ftl_pack_tail_md, cb); +} + +static struct ftl_addr +ftl_band_lba_map_addr(struct ftl_band *band, size_t offset) +{ + return ftl_band_next_xfer_addr(band, band->tail_md_addr, + ftl_tail_md_hdr_num_blocks() + + ftl_vld_map_num_blocks(band->dev) + + offset); +} + +static int +ftl_band_read_md(struct ftl_band *band, size_t num_blocks, struct ftl_addr start_addr, + void *buf, ftl_io_fn fn, ftl_md_pack_fn pack_fn, ftl_io_fn cb_fn, void *cb_ctx) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_md_io *io; + + if (spdk_unlikely(!band->num_zones)) { + return -ENOENT; + } + + io = ftl_io_init_md_read(dev, start_addr, band, num_blocks, buf, fn, pack_fn, cb_fn, cb_ctx); + if (!io) { + return -ENOMEM; + } + + ftl_io_read((struct ftl_io *)io); + return 0; +} + +int +ftl_band_read_tail_md(struct ftl_band *band, struct ftl_addr addr, ftl_io_fn cb_fn, void *cb_ctx) +{ + return ftl_band_read_md(band, ftl_tail_md_num_blocks(band->dev), addr, band->lba_map.dma_buf, + ftl_read_md_cb, ftl_unpack_tail_md, cb_fn, cb_ctx); +} + +static size_t +ftl_lba_map_request_segment_done(struct ftl_lba_map_request *request, size_t offset, + size_t num_segments) +{ + size_t i, num_done = 0; + + for (i = offset; i < offset + num_segments; ++i) { 
+ if (spdk_bit_array_get(request->segments, i)) { + spdk_bit_array_clear(request->segments, i); /* clear the bit just tested, not the start of the range */ + num_done++; + } + } + + assert(request->num_pending >= num_done); + request->num_pending -= num_done; + + return num_done; +} + +static void +ftl_lba_map_set_segment_state(struct ftl_lba_map *lba_map, size_t offset, size_t num_segments, + enum ftl_lba_map_seg_state state) +{ + size_t i; + + for (i = offset; i < offset + num_segments; ++i) { + lba_map->segments[i] = state; + } +} + +static void +ftl_lba_map_request_free(struct spdk_ftl_dev *dev, struct ftl_lba_map_request *request) +{ + spdk_bit_array_clear_mask(request->segments); + spdk_mempool_put(dev->lba_request_pool, request); +} + +static void +ftl_process_lba_map_requests(struct spdk_ftl_dev *dev, struct ftl_lba_map *lba_map, size_t offset, + size_t num_segments, int status) +{ + struct ftl_lba_map_request *request, *trequest; + size_t num_done; + + LIST_FOREACH_SAFE(request, &lba_map->request_list, list_entry, trequest) { + num_done = ftl_lba_map_request_segment_done(request, offset, num_segments); + if (request->num_pending == 0 || (status && num_done)) { + request->cb(NULL, request->cb_ctx, status); + LIST_REMOVE(request, list_entry); + ftl_lba_map_request_free(dev, request); + } + } +} + +static size_t +ftl_lba_map_offset_from_addr(struct ftl_band *band, struct ftl_addr addr) +{ + size_t offset; + struct ftl_addr start_addr = ftl_band_lba_map_addr(band, 0); + + offset = ftl_xfer_offset_from_addr(band, addr) - ftl_xfer_offset_from_addr(band, start_addr); + assert(offset < ftl_lba_map_num_blocks(band->dev)); + + return offset; +} + +static void +ftl_read_lba_map_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_lba_map *lba_map = &io->band->lba_map; + uint64_t block_off; + + block_off = ftl_lba_map_offset_from_addr(io->band, io->addr); + assert(block_off + io->num_blocks <= ftl_lba_map_num_blocks(io->dev)); + + if (!status) { + ftl_lba_map_set_segment_state(lba_map, block_off, io->num_blocks, 
FTL_LBA_MAP_SEG_CACHED); + } + + ftl_process_lba_map_requests(io->dev, lba_map, block_off, io->num_blocks, status); +} + +static struct ftl_lba_map_request * +ftl_lba_map_alloc_request(struct ftl_band *band, size_t offset, size_t num_segments, + ftl_io_fn cb, void *cb_ctx) +{ + struct ftl_lba_map_request *request; + struct spdk_ftl_dev *dev = band->dev; + size_t i; + + request = spdk_mempool_get(dev->lba_request_pool); + if (!request) { + return NULL; + } + + request->cb = cb; + request->cb_ctx = cb_ctx; + request->num_pending = num_segments; + + for (i = offset; i < offset + num_segments; ++i) { + spdk_bit_array_set(request->segments, i); + } + + return request; +} + +static size_t +ftl_lba_map_num_clear_segments(struct ftl_lba_map *lba_map, + size_t offset, size_t num_segments) +{ + size_t i, cnt = 0; + + for (i = offset; i < offset + num_segments; ++i) { + if (lba_map->segments[i] != FTL_LBA_MAP_SEG_CLEAR) { + break; + } + cnt++; + } + + return cnt; +} + +int +ftl_band_read_lba_map(struct ftl_band *band, size_t offset, size_t lba_cnt, + ftl_io_fn cb_fn, void *cb_ctx) +{ + size_t num_blocks, block_off, num_read, num_segments; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_lba_map_request *request; + int rc = 0; + + block_off = offset / FTL_NUM_LBA_IN_BLOCK; + num_segments = spdk_divide_round_up(offset + lba_cnt, FTL_NUM_LBA_IN_BLOCK); + num_blocks = num_segments - block_off; + assert(block_off + num_blocks <= ftl_lba_map_num_blocks(band->dev)); + + request = ftl_lba_map_alloc_request(band, block_off, num_blocks, cb_fn, cb_ctx); + if (!request) { + return -ENOMEM; + } + + while (num_blocks) { + if (lba_map->segments[block_off] != FTL_LBA_MAP_SEG_CLEAR) { + if (lba_map->segments[block_off] == FTL_LBA_MAP_SEG_CACHED) { + ftl_lba_map_request_segment_done(request, block_off, 1); + } + num_blocks--; + block_off++; + continue; + } + + num_read = ftl_lba_map_num_clear_segments(lba_map, block_off, num_blocks); + ftl_lba_map_set_segment_state(lba_map, 
block_off, num_read, + FTL_LBA_MAP_SEG_PENDING); + + rc = ftl_band_read_md(band, num_read, ftl_band_lba_map_addr(band, block_off), + (char *)band->lba_map.map + block_off * FTL_BLOCK_SIZE, + ftl_read_lba_map_cb, NULL, cb_fn, cb_ctx); + if (rc) { + ftl_lba_map_request_free(band->dev, request); + return rc; + } + + assert(num_blocks >= num_read); + num_blocks -= num_read; + block_off += num_read; + } + + if (request->num_pending) { + LIST_INSERT_HEAD(&lba_map->request_list, request, list_entry); + } else { + cb_fn(NULL, cb_ctx, 0); + ftl_lba_map_request_free(band->dev, request); + } + + return rc; +} + +int +ftl_band_read_head_md(struct ftl_band *band, ftl_io_fn cb_fn, void *cb_ctx) +{ + return ftl_band_read_md(band, + ftl_head_md_num_blocks(band->dev), + ftl_band_head_md_addr(band), + band->lba_map.dma_buf, + ftl_read_md_cb, + ftl_unpack_head_md, + cb_fn, + cb_ctx); +} + +void +ftl_band_remove_zone(struct ftl_band *band, struct ftl_zone *zone) +{ + CIRCLEQ_REMOVE(&band->zones, zone, circleq); + band->num_zones--; +} + +int +ftl_band_write_prep(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + + if (ftl_band_alloc_lba_map(band)) { + return -1; + } + + band->seq = ++dev->seq; + return 0; +} + +struct ftl_zone * +ftl_band_next_operational_zone(struct ftl_band *band, struct ftl_zone *zone) +{ + struct ftl_zone *result = NULL; + struct ftl_zone *entry; + + if (spdk_unlikely(!band->num_zones)) { + return NULL; + } + + /* Erasing band may fail after it was assigned to wptr. */ + /* In such a case zone is no longer in band->zones queue. 
*/ + if (spdk_likely(zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE)) { + result = ftl_band_next_zone(band, zone); + } else { + CIRCLEQ_FOREACH_REVERSE(entry, &band->zones, circleq) { + if (entry->info.zone_id > zone->info.zone_id) { + result = entry; + } else { + if (!result) { + result = CIRCLEQ_FIRST(&band->zones); + } + break; + } + } + } + + return result; +} + +void +ftl_band_clear_lba_map(struct ftl_band *band) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + size_t num_segments; + + spdk_bit_array_clear_mask(lba_map->vld); + memset(lba_map->map, 0, ftl_lba_map_num_blocks(band->dev) * FTL_BLOCK_SIZE); + + /* For open band all lba map segments are already cached */ + assert(band->state == FTL_BAND_STATE_PREP); + num_segments = spdk_divide_round_up(ftl_get_num_blocks_in_band(band->dev), FTL_NUM_LBA_IN_BLOCK); + ftl_lba_map_set_segment_state(&band->lba_map, 0, num_segments, FTL_LBA_MAP_SEG_CACHED); + + lba_map->num_vld = 0; +} + +size_t +ftl_lba_map_pool_elem_size(struct spdk_ftl_dev *dev) +{ + /* Map pool element holds the whole tail md + segments map */ + return ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE + + spdk_divide_round_up(ftl_get_num_blocks_in_band(dev), FTL_NUM_LBA_IN_BLOCK); +} diff --git a/src/spdk/lib/ftl/ftl_band.h b/src/spdk/lib/ftl/ftl_band.h new file mode 100644 index 000000000..109b369a5 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_band.h @@ -0,0 +1,287 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* Per-band LBA map together with its validity bitmap and caching state */
struct ftl_lba_map {
	/* LBA/vld map lock */
	pthread_spinlock_t			lock;

	/* Number of valid LBAs */
	size_t					num_vld;

	/* LBA map's reference count */
	size_t					ref_cnt;

	/* Bitmap of valid LBAs */
	struct spdk_bit_array			*vld;

	/* LBA map (only valid for open/relocating bands) */
	uint64_t				*map;

	/*
	 * LBA map segment state map (clear, pending, cached). Indexed by the
	 * segment's block offset; values come from enum ftl_lba_map_seg_state.
	 */
	uint8_t					*segments;

	/* Requests waiting for some of their segments to be read */
	LIST_HEAD(, ftl_lba_map_request)	request_list;

	/* Metadata DMA buffer (only valid for open/relocating bands) */
	void					*dma_buf;
};
/* A band: a group of zones that is opened, written, closed and erased as a unit */
struct ftl_band {
	/* Device this band belongs to */
	struct spdk_ftl_dev			*dev;

	/* Number of operational zones */
	size_t					num_zones;

	/* Array of zones */
	struct ftl_zone				*zone_buf;

	/* List of operational zones */
	CIRCLEQ_HEAD(, ftl_zone)		zones;

	/* LBA map */
	struct ftl_lba_map			lba_map;

	/* Band's state */
	enum ftl_band_state			state;

	/* Band's index */
	unsigned int				id;

	/* Latest merit calculation */
	double					merit;

	/* High defrag priority - means that the metadata should be copied and */
	/* the band should be defragged immediately */
	int					high_prio;

	/* Sequence number */
	uint64_t				seq;

	/* Number of defrag cycles */
	uint64_t				wr_cnt;

	/* End metadata start addr */
	struct ftl_addr				tail_md_addr;

	/* Bitmap of all bands that have had their data moved onto this band */
	struct spdk_bit_array			*reloc_bitmap;
	/* Number of open bands containing data moved from this band */
	size_t					num_reloc_bands;
	/* Number of blocks currently being moved from this band */
	size_t					num_reloc_blocks;

	/* Free/shut bands' lists */
	LIST_ENTRY(ftl_band)			list_entry;

	/* High priority queue link */
	STAILQ_ENTRY(ftl_band)			prio_stailq;
};
ftl_addr addr, + size_t num_blocks); +struct ftl_addr ftl_band_next_addr(struct ftl_band *band, struct ftl_addr addr, + size_t offset); +size_t ftl_band_num_usable_blocks(const struct ftl_band *band); +size_t ftl_band_user_blocks_left(const struct ftl_band *band, size_t offset); +size_t ftl_band_user_blocks(const struct ftl_band *band); +void ftl_band_set_addr(struct ftl_band *band, uint64_t lba, + struct ftl_addr addr); +struct ftl_band *ftl_band_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr); +struct ftl_zone *ftl_band_zone_from_addr(struct ftl_band *band, struct ftl_addr); +void ftl_band_md_clear(struct ftl_band *band); +int ftl_band_read_tail_md(struct ftl_band *band, struct ftl_addr, + ftl_io_fn cb_fn, void *cb_ctx); +int ftl_band_read_head_md(struct ftl_band *band, ftl_io_fn cb_fn, void *cb_ctx); +int ftl_band_write_tail_md(struct ftl_band *band, ftl_io_fn cb); +int ftl_band_write_head_md(struct ftl_band *band, ftl_io_fn cb); +struct ftl_addr ftl_band_tail_md_addr(struct ftl_band *band); +struct ftl_addr ftl_band_head_md_addr(struct ftl_band *band); +void ftl_band_write_failed(struct ftl_band *band); +int ftl_band_full(struct ftl_band *band, size_t offset); +int ftl_band_write_prep(struct ftl_band *band); +struct ftl_zone *ftl_band_next_operational_zone(struct ftl_band *band, + struct ftl_zone *zone); +size_t ftl_lba_map_pool_elem_size(struct spdk_ftl_dev *dev); +void ftl_band_remove_zone(struct ftl_band *band, struct ftl_zone *zone); + + +static inline int +ftl_band_empty(const struct ftl_band *band) +{ + return band->lba_map.num_vld == 0; +} + +static inline struct ftl_zone * +ftl_band_next_zone(struct ftl_band *band, struct ftl_zone *zone) +{ + assert(zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE); + return CIRCLEQ_LOOP_NEXT(&band->zones, zone, circleq); +} + +static inline void +ftl_band_set_next_state(struct ftl_band *band) +{ + ftl_band_set_state(band, (band->state + 1) % FTL_BAND_STATE_MAX); +} + +static inline int 
+ftl_band_state_changing(struct ftl_band *band) +{ + return band->state == FTL_BAND_STATE_OPENING || + band->state == FTL_BAND_STATE_CLOSING; +} + +static inline int +ftl_band_block_offset_valid(struct ftl_band *band, size_t block_off) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + + pthread_spin_lock(&lba_map->lock); + if (spdk_bit_array_get(lba_map->vld, block_off)) { + pthread_spin_unlock(&lba_map->lock); + return 1; + } + + pthread_spin_unlock(&lba_map->lock); + return 0; +} + +static inline int +ftl_band_zone_is_last(struct ftl_band *band, struct ftl_zone *zone) +{ + return zone == CIRCLEQ_LAST(&band->zones); +} + +static inline int +ftl_band_zone_is_first(struct ftl_band *band, struct ftl_zone *zone) +{ + return zone == CIRCLEQ_FIRST(&band->zones); +} + +static inline int +ftl_zone_is_writable(const struct spdk_ftl_dev *dev, const struct ftl_zone *zone) +{ + bool busy = ftl_is_append_supported(dev) ? false : zone->busy; + + return (zone->info.state == SPDK_BDEV_ZONE_STATE_OPEN || + zone->info.state == SPDK_BDEV_ZONE_STATE_EMPTY) && + !busy; +} + +#endif /* FTL_BAND_H */ diff --git a/src/spdk/lib/ftl/ftl_core.c b/src/spdk/lib/ftl/ftl_core.c new file mode 100644 index 000000000..b0b448806 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_core.c @@ -0,0 +1,2460 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* Tracks a request to flush (close) all bands that are currently being written */
struct ftl_band_flush {
	/* Owner device */
	struct spdk_ftl_dev		*dev;
	/* Number of bands left to be flushed */
	size_t				num_bands;
	/* User callback */
	spdk_ftl_fn			cb_fn;
	/* Callback's argument */
	void				*cb_arg;
	/* List link */
	LIST_ENTRY(ftl_band_flush)	list_entry;
};
+ */ + bool direct_mode; + + /* Number of outstanding write requests */ + uint32_t num_outstanding; + + /* Marks that the band related to this wptr needs to be closed as soon as possible */ + bool flush; +}; + +struct ftl_flush { + /* Owner device */ + struct spdk_ftl_dev *dev; + + /* Number of batches to wait for */ + size_t num_req; + + /* Callback */ + struct { + spdk_ftl_fn fn; + void *ctx; + } cb; + + /* Batch bitmap */ + struct spdk_bit_array *bmap; + + /* List link */ + LIST_ENTRY(ftl_flush) list_entry; +}; + +static void +ftl_wptr_free(struct ftl_wptr *wptr) +{ + if (!wptr) { + return; + } + + free(wptr); +} + +static void +ftl_remove_wptr(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_band_flush *flush, *tmp; + + if (spdk_unlikely(wptr->flush)) { + LIST_FOREACH_SAFE(flush, &dev->band_flush_list, list_entry, tmp) { + assert(flush->num_bands > 0); + if (--flush->num_bands == 0) { + flush->cb_fn(flush->cb_arg, 0); + LIST_REMOVE(flush, list_entry); + free(flush); + } + } + } + + LIST_REMOVE(wptr, list_entry); + ftl_wptr_free(wptr); +} + +static struct ftl_wbuf_entry * +ftl_acquire_wbuf_entry(struct ftl_io_channel *io_channel, int io_flags) +{ + struct ftl_wbuf_entry *entry = NULL; + uint32_t qdepth; + + if (!(io_flags & FTL_IO_INTERNAL)) { + qdepth = __atomic_fetch_add(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + if (qdepth >= io_channel->qdepth_limit) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + return NULL; + } + } + + if (spdk_ring_dequeue(io_channel->free_queue, (void **)&entry, 1) != 1) { + if (!(io_flags & FTL_IO_INTERNAL)) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + } + + return NULL; + } + + assert(entry != NULL); + + ftl_evict_cache_entry(io_channel->dev, entry); + + entry->io_flags = io_flags; + entry->addr.offset = FTL_ADDR_INVALID; + entry->lba = FTL_LBA_INVALID; + entry->band = NULL; + entry->valid = false; + + return entry; +} + +static void 
+ftl_release_wbuf_entry(struct ftl_wbuf_entry *entry) +{ + struct ftl_io_channel *io_channel = entry->ioch; + + if (!(entry->io_flags & FTL_IO_INTERNAL)) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + } + + spdk_ring_enqueue(io_channel->free_queue, (void **)&entry, 1, NULL); +} + +static struct ftl_batch * +ftl_get_next_batch(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; +#define FTL_DEQUEUE_ENTRIES 128 + struct ftl_wbuf_entry *entries[FTL_DEQUEUE_ENTRIES]; + TAILQ_HEAD(, ftl_io_channel) ioch_queue; + size_t i, num_dequeued, num_remaining; + uint64_t *metadata; + + if (batch == NULL) { + batch = TAILQ_FIRST(&dev->pending_batches); + if (batch != NULL) { + TAILQ_REMOVE(&dev->pending_batches, batch, tailq); + return batch; + } + + batch = TAILQ_FIRST(&dev->free_batches); + if (spdk_unlikely(batch == NULL)) { + return NULL; + } + + assert(TAILQ_EMPTY(&batch->entries)); + assert(batch->num_entries == 0); + TAILQ_REMOVE(&dev->free_batches, batch, tailq); + } + + /* + * Keep shifting the queue to ensure fairness in IO channel selection. Each time + * ftl_get_next_batch() is called, we're starting to dequeue write buffer entries from a + * different IO channel. 
+ */ + TAILQ_INIT(&ioch_queue); + while (!TAILQ_EMPTY(&dev->ioch_queue)) { + ioch = TAILQ_FIRST(&dev->ioch_queue); + TAILQ_REMOVE(&dev->ioch_queue, ioch, tailq); + TAILQ_INSERT_TAIL(&ioch_queue, ioch, tailq); + + num_remaining = dev->xfer_size - batch->num_entries; + while (num_remaining > 0) { + num_dequeued = spdk_ring_dequeue(ioch->submit_queue, (void **)entries, + spdk_min(num_remaining, + FTL_DEQUEUE_ENTRIES)); + if (num_dequeued == 0) { + break; + } + + for (i = 0; i < num_dequeued; ++i) { + batch->iov[batch->num_entries + i].iov_base = entries[i]->payload; + batch->iov[batch->num_entries + i].iov_len = FTL_BLOCK_SIZE; + + if (batch->metadata != NULL) { + metadata = (uint64_t *)((char *)batch->metadata + + i * dev->md_size); + *metadata = entries[i]->lba; + } + + TAILQ_INSERT_TAIL(&batch->entries, entries[i], tailq); + } + + batch->num_entries += num_dequeued; + num_remaining -= num_dequeued; + } + + if (num_remaining == 0) { + break; + } + } + + TAILQ_CONCAT(&dev->ioch_queue, &ioch_queue, tailq); + + if (batch->num_entries == dev->xfer_size) { + dev->current_batch = NULL; + } else { + dev->current_batch = batch; + batch = NULL; + } + + return batch; +} + +static void +ftl_release_batch(struct spdk_ftl_dev *dev, struct ftl_batch *batch) +{ + struct ftl_wbuf_entry *entry; + + while (!TAILQ_EMPTY(&batch->entries)) { + entry = TAILQ_FIRST(&batch->entries); + TAILQ_REMOVE(&batch->entries, entry, tailq); + ftl_release_wbuf_entry(entry); + } + + batch->num_entries = 0; + TAILQ_INSERT_TAIL(&dev->free_batches, batch, tailq); +} + +static struct ftl_wbuf_entry * +ftl_get_entry_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_io_channel *ioch; + uint64_t ioch_offset, entry_offset; + + ioch_offset = addr.cache_offset & ((1 << dev->ioch_shift) - 1); + entry_offset = addr.cache_offset >> dev->ioch_shift; + ioch = dev->ioch_array[ioch_offset]; + + assert(ioch_offset < dev->conf.max_io_channels); + assert(entry_offset < ioch->num_entries); + 
assert(addr.cached == 1); + + return &ioch->wbuf_entries[entry_offset]; +} + +static struct ftl_addr +ftl_get_addr_from_entry(struct ftl_wbuf_entry *entry) +{ + struct ftl_io_channel *ioch = entry->ioch; + struct ftl_addr addr = {}; + + addr.cached = 1; + addr.cache_offset = (uint64_t)entry->index << ioch->dev->ioch_shift | ioch->index; + + return addr; +} + +static void +ftl_io_cmpl_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_io *io = cb_arg; + struct spdk_ftl_dev *dev = io->dev; + + if (spdk_unlikely(!success)) { + io->status = -EIO; + } + + ftl_trace_completion(dev, io, FTL_TRACE_COMPLETION_DISK); + + if (io->type == FTL_IO_WRITE && ftl_is_append_supported(dev)) { + assert(io->parent); + io->parent->addr.offset = spdk_bdev_io_get_append_location(bdev_io); + } + + ftl_io_dec_req(io); + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_halt_writes(struct spdk_ftl_dev *dev, struct ftl_band *band) +{ + struct ftl_wptr *wptr = NULL; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + if (wptr->band == band) { + break; + } + } + + /* If the band already has the high_prio flag set, other writes must */ + /* have failed earlier, so it's already taken care of. 
*/ + if (band->high_prio) { + assert(wptr == NULL); + return; + } + + ftl_band_write_failed(band); + ftl_remove_wptr(wptr); +} + +static struct ftl_wptr * +ftl_wptr_from_band(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr = NULL; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + if (wptr->band == band) { + return wptr; + } + } + + return NULL; +} + +static void +ftl_md_write_fail(struct ftl_io *io, int status) +{ + struct ftl_band *band = io->band; + struct ftl_wptr *wptr; + char buf[128]; + + wptr = ftl_wptr_from_band(band); + assert(wptr); + + SPDK_ERRLOG("Metadata write failed @addr: %s, status: %d\n", + ftl_addr2str(wptr->addr, buf, sizeof(buf)), status); + + ftl_halt_writes(io->dev, band); +} + +static void +ftl_md_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_band *band = io->band; + struct ftl_wptr *wptr; + size_t id; + + wptr = ftl_wptr_from_band(band); + assert(wptr); + + if (status) { + ftl_md_write_fail(io, status); + return; + } + + ftl_band_set_next_state(band); + if (band->state == FTL_BAND_STATE_CLOSED) { + if (ftl_dev_has_nv_cache(dev)) { + pthread_spin_lock(&nv_cache->lock); + nv_cache->num_available += ftl_band_user_blocks(band); + + if (spdk_unlikely(nv_cache->num_available > nv_cache->num_data_blocks)) { + nv_cache->num_available = nv_cache->num_data_blocks; + } + pthread_spin_unlock(&nv_cache->lock); + } + + /* + * Go through the reloc_bitmap, checking for all the bands that had its data moved + * onto current band and update their counters to allow them to be used for writing + * (once they're closed and empty). 
+ */ + for (id = 0; id < ftl_get_num_bands(dev); ++id) { + if (spdk_bit_array_get(band->reloc_bitmap, id)) { + assert(dev->bands[id].num_reloc_bands > 0); + dev->bands[id].num_reloc_bands--; + + spdk_bit_array_clear(band->reloc_bitmap, id); + } + } + + ftl_remove_wptr(wptr); + } +} + +static int +ftl_read_next_physical_addr(struct ftl_io *io, struct ftl_addr *addr) +{ + struct spdk_ftl_dev *dev = io->dev; + size_t num_blocks, max_blocks; + + assert(ftl_io_mode_physical(io)); + assert(io->iov_pos < io->iov_cnt); + + if (io->pos == 0) { + *addr = io->addr; + } else { + *addr = ftl_band_next_xfer_addr(io->band, io->addr, io->pos); + } + + assert(!ftl_addr_invalid(*addr)); + + /* Metadata has to be read in the way it's written (jumping across */ + /* the zones in xfer_size increments) */ + if (io->flags & FTL_IO_MD) { + max_blocks = dev->xfer_size - (addr->offset % dev->xfer_size); + num_blocks = spdk_min(ftl_io_iovec_len_left(io), max_blocks); + assert(addr->offset / dev->xfer_size == + (addr->offset + num_blocks - 1) / dev->xfer_size); + } else { + num_blocks = ftl_io_iovec_len_left(io); + } + + return num_blocks; +} + +static int +ftl_wptr_close_band(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + ftl_band_set_state(band, FTL_BAND_STATE_CLOSING); + + return ftl_band_write_tail_md(band, ftl_md_write_cb); +} + +static int +ftl_wptr_open_band(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + assert(ftl_band_zone_is_first(band, wptr->zone)); + assert(band->lba_map.num_vld == 0); + + ftl_band_clear_lba_map(band); + + assert(band->state == FTL_BAND_STATE_PREP); + ftl_band_set_state(band, FTL_BAND_STATE_OPENING); + + return ftl_band_write_head_md(band, ftl_md_write_cb); +} + +static int +ftl_submit_erase(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_band *band = io->band; + struct ftl_addr addr = io->addr; + struct ftl_io_channel *ioch; + struct ftl_zone *zone; + int rc = 0; + size_t i; + + ioch = 
ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + for (i = 0; i < io->num_blocks; ++i) { + if (i != 0) { + zone = ftl_band_next_zone(band, ftl_band_zone_from_addr(band, addr)); + assert(zone->info.state == SPDK_BDEV_ZONE_STATE_FULL); + addr.offset = zone->info.zone_id; + } + + assert(ftl_addr_get_zone_offset(dev, addr) == 0); + + ftl_trace_submission(dev, io, addr, 1); + rc = spdk_bdev_zone_management(dev->base_bdev_desc, ioch->base_ioch, addr.offset, + SPDK_BDEV_ZONE_RESET, ftl_io_cmpl_cb, io); + if (spdk_unlikely(rc)) { + ftl_io_fail(io, rc); + SPDK_ERRLOG("Vector reset failed with status: %d\n", rc); + break; + } + + ftl_io_inc_req(io); + ftl_io_advance(io, 1); + } + + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + return rc; +} + +static bool +ftl_check_core_thread(const struct spdk_ftl_dev *dev) +{ + return dev->core_thread == spdk_get_thread(); +} + +struct spdk_io_channel * +ftl_get_io_channel(const struct spdk_ftl_dev *dev) +{ + if (ftl_check_core_thread(dev)) { + return dev->ioch; + } + + return NULL; +} + +static void +ftl_erase_fail(struct ftl_io *io, int status) +{ + struct ftl_zone *zone; + struct ftl_band *band = io->band; + char buf[128]; + + SPDK_ERRLOG("Erase failed at address: %s, status: %d\n", + ftl_addr2str(io->addr, buf, sizeof(buf)), status); + + zone = ftl_band_zone_from_addr(band, io->addr); + zone->info.state = SPDK_BDEV_ZONE_STATE_OFFLINE; + ftl_band_remove_zone(band, zone); + band->tail_md_addr = ftl_band_tail_md_addr(band); +} + +static void +ftl_zone_erase_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_zone *zone; + + zone = ftl_band_zone_from_addr(io->band, io->addr); + zone->busy = false; + + if (spdk_unlikely(status)) { + ftl_erase_fail(io, status); + return; + } + + zone->info.state = SPDK_BDEV_ZONE_STATE_EMPTY; + zone->info.write_pointer = zone->info.zone_id; +} + +static int +ftl_band_erase(struct ftl_band *band) +{ + struct ftl_zone *zone; + struct ftl_io *io; + int rc = 0; + + assert(band->state == 
FTL_BAND_STATE_CLOSED || + band->state == FTL_BAND_STATE_FREE); + + ftl_band_set_state(band, FTL_BAND_STATE_PREP); + + CIRCLEQ_FOREACH(zone, &band->zones, circleq) { + if (zone->info.state == SPDK_BDEV_ZONE_STATE_EMPTY) { + continue; + } + + io = ftl_io_erase_init(band, 1, ftl_zone_erase_cb); + if (!io) { + rc = -ENOMEM; + break; + } + + zone->busy = true; + io->addr.offset = zone->info.zone_id; + rc = ftl_submit_erase(io); + if (rc) { + zone->busy = false; + assert(0); + /* TODO: change band's state back to close? */ + break; + } + } + + return rc; +} + +static struct ftl_band * +ftl_next_write_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + /* Find a free band that has all of its data moved onto other closed bands */ + LIST_FOREACH(band, &dev->free_bands, list_entry) { + assert(band->state == FTL_BAND_STATE_FREE); + if (band->num_reloc_bands == 0 && band->num_reloc_blocks == 0) { + break; + } + } + + if (spdk_unlikely(!band)) { + return NULL; + } + + if (ftl_band_erase(band)) { + /* TODO: handle erase failure */ + return NULL; + } + + return band; +} + +static struct ftl_band * +ftl_next_wptr_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + if (!dev->next_band) { + band = ftl_next_write_band(dev); + } else { + assert(dev->next_band->state == FTL_BAND_STATE_PREP); + band = dev->next_band; + dev->next_band = NULL; + } + + return band; +} + +static struct ftl_wptr * +ftl_wptr_init(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr; + + wptr = calloc(1, sizeof(*wptr)); + if (!wptr) { + return NULL; + } + + wptr->dev = dev; + wptr->band = band; + wptr->zone = CIRCLEQ_FIRST(&band->zones); + wptr->addr.offset = wptr->zone->info.zone_id; + TAILQ_INIT(&wptr->pending_queue); + + return wptr; +} + +static int +ftl_add_direct_wptr(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr; + + assert(band->state == FTL_BAND_STATE_OPEN); + + wptr = ftl_wptr_init(band); + if 
(!wptr) { + return -1; + } + + wptr->direct_mode = true; + + if (ftl_band_alloc_lba_map(band)) { + ftl_wptr_free(wptr); + return -1; + } + + LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: direct band %u\n", band->id); + ftl_trace_write_band(dev, band); + return 0; +} + +static void +ftl_close_direct_wptr(struct ftl_band *band) +{ + struct ftl_wptr *wptr = ftl_wptr_from_band(band); + + assert(wptr); + assert(wptr->direct_mode); + assert(band->state == FTL_BAND_STATE_CLOSED); + + ftl_band_release_lba_map(band); + + ftl_remove_wptr(wptr); +} + +int +ftl_band_set_direct_access(struct ftl_band *band, bool access) +{ + if (access) { + return ftl_add_direct_wptr(band); + } else { + ftl_close_direct_wptr(band); + return 0; + } +} + +static int +ftl_add_wptr(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + struct ftl_wptr *wptr; + + band = ftl_next_wptr_band(dev); + if (!band) { + return -1; + } + + wptr = ftl_wptr_init(band); + if (!wptr) { + return -1; + } + + if (ftl_band_write_prep(band)) { + ftl_wptr_free(wptr); + return -1; + } + + LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: band %u\n", band->id); + ftl_trace_write_band(dev, band); + return 0; +} + +static void +ftl_wptr_advance(struct ftl_wptr *wptr, size_t xfer_size) +{ + struct ftl_band *band = wptr->band; + struct spdk_ftl_dev *dev = wptr->dev; + struct spdk_ftl_conf *conf = &dev->conf; + size_t next_thld; + + if (spdk_unlikely(wptr->direct_mode)) { + return; + } + + wptr->offset += xfer_size; + next_thld = (ftl_band_num_usable_blocks(band) * conf->band_thld) / 100; + + if (ftl_band_full(band, wptr->offset)) { + ftl_band_set_state(band, FTL_BAND_STATE_FULL); + } + + wptr->zone->busy = true; + wptr->addr = ftl_band_next_xfer_addr(band, wptr->addr, xfer_size); + wptr->zone = ftl_band_next_operational_zone(band, wptr->zone); + + assert(!ftl_addr_invalid(wptr->addr)); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, 
"wptr: pu:%lu band:%lu, offset:%lu\n", + ftl_addr_get_punit(dev, wptr->addr), + ftl_addr_get_band(dev, wptr->addr), + wptr->addr.offset); + + if (wptr->offset >= next_thld && !dev->next_band) { + dev->next_band = ftl_next_write_band(dev); + } +} + +static size_t +ftl_wptr_user_blocks_left(const struct ftl_wptr *wptr) +{ + return ftl_band_user_blocks_left(wptr->band, wptr->offset); +} + +static bool +ftl_wptr_ready(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + /* TODO: add handling of empty bands */ + + if (spdk_unlikely(!ftl_zone_is_writable(wptr->dev, wptr->zone))) { + /* Erasing band may fail after it was assigned to wptr. */ + if (spdk_unlikely(wptr->zone->info.state == SPDK_BDEV_ZONE_STATE_OFFLINE)) { + ftl_wptr_advance(wptr, wptr->dev->xfer_size); + } + return false; + } + + /* If we're in the process of writing metadata, wait till it is */ + /* completed. */ + /* TODO: we should probably change bands once we're writing tail md */ + if (ftl_band_state_changing(band)) { + return false; + } + + if (band->state == FTL_BAND_STATE_FULL) { + if (wptr->num_outstanding == 0) { + if (ftl_wptr_close_band(wptr)) { + /* TODO: need recovery here */ + assert(false); + } + } + + return false; + } + + if (band->state != FTL_BAND_STATE_OPEN) { + if (ftl_wptr_open_band(wptr)) { + /* TODO: need recovery here */ + assert(false); + } + + return false; + } + + return true; +} + +int +ftl_flush_active_bands(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_wptr *wptr; + struct ftl_band_flush *flush; + + assert(ftl_get_core_thread(dev) == spdk_get_thread()); + + flush = calloc(1, sizeof(*flush)); + if (spdk_unlikely(!flush)) { + return -ENOMEM; + } + + LIST_INSERT_HEAD(&dev->band_flush_list, flush, list_entry); + + flush->cb_fn = cb_fn; + flush->cb_arg = cb_arg; + flush->dev = dev; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + wptr->flush = true; + flush->num_bands++; + } + + return 0; +} + +static const struct spdk_ftl_limit * 
+ftl_get_limit(const struct spdk_ftl_dev *dev, int type) +{ + assert(type < SPDK_FTL_LIMIT_MAX); + return &dev->conf.limits[type]; +} + +static bool +ftl_cache_lba_valid(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry) +{ + struct ftl_addr addr; + + /* If the LBA is invalid don't bother checking the md and l2p */ + if (spdk_unlikely(entry->lba == FTL_LBA_INVALID)) { + return false; + } + + addr = ftl_l2p_get(dev, entry->lba); + if (!(ftl_addr_cached(addr) && entry == ftl_get_entry_from_addr(dev, addr))) { + return false; + } + + return true; +} + +void +ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry) +{ + pthread_spin_lock(&entry->lock); + + if (!entry->valid) { + goto unlock; + } + + /* If the l2p wasn't updated and still points at the entry, fill it with the */ + /* on-disk address and clear the cache status bit. Otherwise, skip the l2p update */ + /* and just clear the cache status. */ + if (!ftl_cache_lba_valid(dev, entry)) { + goto clear; + } + + ftl_l2p_set(dev, entry->lba, entry->addr); +clear: + entry->valid = false; +unlock: + pthread_spin_unlock(&entry->lock); +} + +static void +ftl_pad_wbuf(struct spdk_ftl_dev *dev, size_t size) +{ + struct ftl_wbuf_entry *entry; + struct ftl_io_channel *ioch; + int flags = FTL_IO_PAD | FTL_IO_INTERNAL; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + for (size_t i = 0; i < size; ++i) { + entry = ftl_acquire_wbuf_entry(ioch, flags); + if (!entry) { + break; + } + + entry->lba = FTL_LBA_INVALID; + entry->addr = ftl_to_addr(FTL_ADDR_INVALID); + memset(entry->payload, 0, FTL_BLOCK_SIZE); + + spdk_ring_enqueue(ioch->submit_queue, (void **)&entry, 1, NULL); + } +} + +static void +ftl_remove_free_bands(struct spdk_ftl_dev *dev) +{ + while (!LIST_EMPTY(&dev->free_bands)) { + LIST_REMOVE(LIST_FIRST(&dev->free_bands), list_entry); + } + + dev->next_band = NULL; +} + +static void +ftl_wptr_pad_band(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct 
ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size, pad_size, blocks_left; + + size = batch != NULL ? batch->num_entries : 0; + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + blocks_left = ftl_wptr_user_blocks_left(wptr); + assert(size <= blocks_left); + assert(blocks_left % dev->xfer_size == 0); + pad_size = spdk_min(blocks_left - size, spdk_ring_count(ioch->free_queue)); + + ftl_pad_wbuf(dev, pad_size); +} + +static void +ftl_wptr_process_shutdown(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size; + + size = batch != NULL ? batch->num_entries : 0; + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + if (size >= dev->xfer_size) { + return; + } + + /* If we reach this point we need to remove free bands */ + /* and pad current wptr band to the end */ + ftl_remove_free_bands(dev); + ftl_wptr_pad_band(wptr); +} + +static int +ftl_shutdown_complete(struct spdk_ftl_dev *dev) +{ + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(dev->ioch); + + return !__atomic_load_n(&dev->num_inflight, __ATOMIC_SEQ_CST) && + dev->num_io_channels == 1 && LIST_EMPTY(&dev->wptr_list) && + TAILQ_EMPTY(&ioch->retry_queue); +} + +void +ftl_apply_limits(struct spdk_ftl_dev *dev) +{ + const struct spdk_ftl_limit *limit; + struct ftl_io_channel *ioch; + struct ftl_stats *stats = &dev->stats; + uint32_t qdepth_limit = 100; + int i; + + /* Clear existing limit */ + dev->limit = SPDK_FTL_LIMIT_MAX; + + for (i = SPDK_FTL_LIMIT_CRIT; i < SPDK_FTL_LIMIT_MAX; ++i) { + limit = ftl_get_limit(dev, i); + + if (dev->num_free <= limit->thld) { + qdepth_limit = limit->limit; + stats->limits[i]++; + dev->limit = i; + break; + } + } + + ftl_trace_limits(dev, dev->limit, dev->num_free); + 
TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + __atomic_store_n(&ioch->qdepth_limit, (qdepth_limit * ioch->num_entries) / 100, + __ATOMIC_SEQ_CST); + } +} + +static int +ftl_invalidate_addr_unlocked(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_band *band = ftl_band_from_addr(dev, addr); + struct ftl_lba_map *lba_map = &band->lba_map; + uint64_t offset; + + offset = ftl_band_block_offset_from_addr(band, addr); + + /* The bit might be already cleared if two writes are scheduled to the */ + /* same LBA at the same time */ + if (spdk_bit_array_get(lba_map->vld, offset)) { + assert(lba_map->num_vld > 0); + spdk_bit_array_clear(lba_map->vld, offset); + lba_map->num_vld--; + return 1; + } + + return 0; +} + +int +ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_band *band; + int rc; + + assert(!ftl_addr_cached(addr)); + band = ftl_band_from_addr(dev, addr); + + pthread_spin_lock(&band->lba_map.lock); + rc = ftl_invalidate_addr_unlocked(dev, addr); + pthread_spin_unlock(&band->lba_map.lock); + + return rc; +} + +static int +ftl_read_retry(int rc) +{ + return rc == -EAGAIN; +} + +static int +ftl_read_canceled(int rc) +{ + return rc == -EFAULT || rc == 0; +} + +static int +ftl_cache_read(struct ftl_io *io, uint64_t lba, + struct ftl_addr addr, void *buf) +{ + struct ftl_wbuf_entry *entry; + struct ftl_addr naddr; + int rc = 0; + + entry = ftl_get_entry_from_addr(io->dev, addr); + pthread_spin_lock(&entry->lock); + + naddr = ftl_l2p_get(io->dev, lba); + if (addr.offset != naddr.offset) { + rc = -1; + goto out; + } + + memcpy(buf, entry->payload, FTL_BLOCK_SIZE); +out: + pthread_spin_unlock(&entry->lock); + return rc; +} + +static int +ftl_read_next_logical_addr(struct ftl_io *io, struct ftl_addr *addr) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_addr next_addr; + size_t i; + + *addr = ftl_l2p_get(dev, ftl_io_current_lba(io)); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Read addr:%lx, lba:%lu\n", + addr->offset, 
ftl_io_current_lba(io)); + + /* If the address is invalid, skip it (the buffer should already be zero'ed) */ + if (ftl_addr_invalid(*addr)) { + return -EFAULT; + } + + if (ftl_addr_cached(*addr)) { + if (!ftl_cache_read(io, ftl_io_current_lba(io), *addr, ftl_io_iovec_addr(io))) { + return 0; + } + + /* If the state changed, we have to re-read the l2p */ + return -EAGAIN; + } + + for (i = 1; i < ftl_io_iovec_len_left(io); ++i) { + next_addr = ftl_l2p_get(dev, ftl_io_get_lba(io, io->pos + i)); + + if (ftl_addr_invalid(next_addr) || ftl_addr_cached(next_addr)) { + break; + } + + if (addr->offset + i != next_addr.offset) { + break; + } + } + + return i; +} + +static int +ftl_submit_read(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_addr addr; + int rc = 0, num_blocks; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + assert(LIST_EMPTY(&io->children)); + + while (io->pos < io->num_blocks) { + if (ftl_io_mode_physical(io)) { + num_blocks = rc = ftl_read_next_physical_addr(io, &addr); + } else { + num_blocks = rc = ftl_read_next_logical_addr(io, &addr); + } + + /* We might need to retry the read from scratch (e.g. */ + /* because write was under way and completed before */ + /* we could read it from the write buffer */ + if (ftl_read_retry(rc)) { + continue; + } + + /* We don't have to schedule the read, as it was read from cache */ + if (ftl_read_canceled(rc)) { + ftl_io_advance(io, 1); + ftl_trace_completion(io->dev, io, rc ? 
FTL_TRACE_COMPLETION_INVALID : + FTL_TRACE_COMPLETION_CACHE); + rc = 0; + continue; + } + + assert(num_blocks > 0); + + ftl_trace_submission(dev, io, addr, num_blocks); + rc = spdk_bdev_read_blocks(dev->base_bdev_desc, ioch->base_ioch, + ftl_io_iovec_addr(io), + addr.offset, + num_blocks, ftl_io_cmpl_cb, io); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + rc = 0; + } else { + ftl_io_fail(io, rc); + } + break; + } + + ftl_io_inc_req(io); + ftl_io_advance(io, num_blocks); + } + + /* If we didn't have to read anything from the device, */ + /* complete the request right away */ + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + return rc; +} + +static void +ftl_complete_flush(struct ftl_flush *flush) +{ + assert(flush->num_req == 0); + LIST_REMOVE(flush, list_entry); + + flush->cb.fn(flush->cb.ctx, 0); + + spdk_bit_array_free(&flush->bmap); + free(flush); +} + +static void +ftl_process_flush(struct spdk_ftl_dev *dev, struct ftl_batch *batch) +{ + struct ftl_flush *flush, *tflush; + size_t offset; + + LIST_FOREACH_SAFE(flush, &dev->flush_list, list_entry, tflush) { + offset = batch->index; + + if (spdk_bit_array_get(flush->bmap, offset)) { + spdk_bit_array_clear(flush->bmap, offset); + if (!(--flush->num_req)) { + ftl_complete_flush(flush); + } + } + } +} + +static void +ftl_nv_cache_wrap_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache *nv_cache = cb_arg; + + if (!success) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n"); + /* TODO: go into read-only mode */ + assert(0); + } + + pthread_spin_lock(&nv_cache->lock); + nv_cache->ready = true; + pthread_spin_unlock(&nv_cache->lock); + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_nv_cache_wrap(void *ctx) +{ + struct ftl_nv_cache *nv_cache = ctx; + int rc; + + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_wrap_cb, nv_cache); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable 
to write non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + /* TODO: go into read-only mode */ + assert(0); + } +} + +static uint64_t +ftl_reserve_nv_cache(struct ftl_nv_cache *nv_cache, size_t *num_blocks, unsigned int *phase) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + uint64_t num_available, cache_size, cache_addr = FTL_LBA_INVALID; + + cache_size = spdk_bdev_get_num_blocks(bdev); + + pthread_spin_lock(&nv_cache->lock); + if (spdk_unlikely(nv_cache->num_available == 0 || !nv_cache->ready)) { + goto out; + } + + num_available = spdk_min(nv_cache->num_available, *num_blocks); + num_available = spdk_min(num_available, dev->conf.nv_cache.max_request_cnt); + + if (spdk_unlikely(nv_cache->current_addr + num_available > cache_size)) { + *num_blocks = cache_size - nv_cache->current_addr; + } else { + *num_blocks = num_available; + } + + cache_addr = nv_cache->current_addr; + nv_cache->current_addr += *num_blocks; + nv_cache->num_available -= *num_blocks; + *phase = nv_cache->phase; + + if (nv_cache->current_addr == spdk_bdev_get_num_blocks(bdev)) { + nv_cache->current_addr = FTL_NV_CACHE_DATA_OFFSET; + nv_cache->phase = ftl_nv_cache_next_phase(nv_cache->phase); + nv_cache->ready = false; + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_nv_cache_wrap, nv_cache); + } +out: + pthread_spin_unlock(&nv_cache->lock); + return cache_addr; +} + +static struct ftl_io * +ftl_alloc_io_nv_cache(struct ftl_io *parent, size_t num_blocks) +{ + struct ftl_io_init_opts opts = { + .dev = parent->dev, + .parent = parent, + .iovcnt = 0, + .num_blocks = num_blocks, + .flags = parent->flags | FTL_IO_CACHE, + }; + + return ftl_io_init_internal(&opts); +} + +static void +ftl_nv_cache_submit_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_io *io = cb_arg; + struct ftl_nv_cache *nv_cache = &io->dev->nv_cache; + + if 
(spdk_unlikely(!success)) { + SPDK_ERRLOG("Non-volatile cache write failed at %"PRIx64"\n", io->addr.offset); + io->status = -EIO; + } + + ftl_io_dec_req(io); + if (ftl_io_done(io)) { + spdk_mempool_put(nv_cache->md_pool, io->md); + ftl_io_complete(io); + } + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_submit_nv_cache(void *ctx) +{ + struct ftl_io *io = ctx; + struct spdk_ftl_dev *dev = io->dev; + struct spdk_thread *thread; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_io_channel *ioch; + int rc; + + ioch = ftl_io_channel_get_ctx(io->ioch); + thread = spdk_io_channel_get_thread(io->ioch); + + rc = spdk_bdev_write_blocks_with_md(nv_cache->bdev_desc, ioch->cache_ioch, + ftl_io_iovec_addr(io), io->md, io->addr.offset, + io->num_blocks, ftl_nv_cache_submit_cb, io); + if (rc == -ENOMEM) { + spdk_thread_send_msg(thread, ftl_submit_nv_cache, io); + return; + } else if (rc) { + SPDK_ERRLOG("Write to persistent cache failed: %s (%"PRIu64", %"PRIu64")\n", + spdk_strerror(-rc), io->addr.offset, io->num_blocks); + spdk_mempool_put(nv_cache->md_pool, io->md); + io->status = -EIO; + ftl_io_complete(io); + return; + } + + ftl_io_advance(io, io->num_blocks); + ftl_io_inc_req(io); +} + +/* Write the value returned by ftl_nv_cache_pack_lba() for every block of the IO into the */ +/* metadata buffer, advancing by spdk_bdev_get_md_size() bytes after each block. */ +static void +ftl_nv_cache_fill_md(struct ftl_io *io, unsigned int phase) +{ + struct spdk_bdev *bdev; + struct ftl_nv_cache *nv_cache = &io->dev->nv_cache; + uint64_t block_off, lba; + /* Byte pointer: arithmetic on void * is a GNU extension and not standard C */ + char *md_buf = io->md; + + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + for (block_off = 0; block_off < io->num_blocks; ++block_off) { + lba = ftl_nv_cache_pack_lba(ftl_io_get_lba(io, block_off), phase); + memcpy(md_buf, &lba, sizeof(lba)); + md_buf += spdk_bdev_get_md_size(bdev); + } +} + +static void +_ftl_write_nv_cache(void *ctx) +{ + struct ftl_io *child, *io = ctx; + struct spdk_ftl_dev *dev = io->dev; + struct spdk_thread *thread; + unsigned int phase; + uint64_t num_blocks; + + thread = spdk_io_channel_get_thread(io->ioch); + + while (io->pos < io->num_blocks) { + 
num_blocks = ftl_io_iovec_len_left(io); + + child = ftl_alloc_io_nv_cache(io, num_blocks); + if (spdk_unlikely(!child)) { + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + return; + } + + child->md = spdk_mempool_get(dev->nv_cache.md_pool); + if (spdk_unlikely(!child->md)) { + ftl_io_free(child); + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + break; + } + + /* Reserve area on the write buffer cache */ + child->addr.offset = ftl_reserve_nv_cache(&dev->nv_cache, &num_blocks, &phase); + if (child->addr.offset == FTL_LBA_INVALID) { + spdk_mempool_put(dev->nv_cache.md_pool, child->md); + ftl_io_free(child); + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + break; + } + + /* Shrink the IO if there isn't enough room in the cache to fill the whole iovec */ + if (spdk_unlikely(num_blocks != ftl_io_iovec_len_left(io))) { + ftl_io_shrink_iovec(child, num_blocks); + } + + ftl_nv_cache_fill_md(child, phase); + ftl_submit_nv_cache(child); + } + + if (ftl_io_done(io)) { + ftl_io_complete(io); + } +} + +static void +ftl_write_nv_cache(struct ftl_io *parent) +{ + ftl_io_reset(parent); + parent->flags |= FTL_IO_CACHE; + _ftl_write_nv_cache(parent); +} + +int +ftl_nv_cache_write_header(struct ftl_nv_cache *nv_cache, bool shutdown, + spdk_bdev_io_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct ftl_nv_cache_header *hdr = nv_cache->dma_buf; + struct spdk_bdev *bdev; + struct ftl_io_channel *ioch; + + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + memset(hdr, 0, spdk_bdev_get_block_size(bdev)); + + hdr->phase = (uint8_t)nv_cache->phase; + hdr->size = spdk_bdev_get_num_blocks(bdev); + hdr->uuid = dev->uuid; + hdr->version = FTL_NV_CACHE_HEADER_VERSION; + hdr->current_addr = shutdown ? 
nv_cache->current_addr : FTL_LBA_INVALID; + hdr->checksum = spdk_crc32c_update(hdr, offsetof(struct ftl_nv_cache_header, checksum), 0); + + return spdk_bdev_write_blocks(nv_cache->bdev_desc, ioch->cache_ioch, hdr, 0, 1, + cb_fn, cb_arg); +} + +int +ftl_nv_cache_scrub(struct ftl_nv_cache *nv_cache, spdk_bdev_io_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct ftl_io_channel *ioch; + struct spdk_bdev *bdev; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + return spdk_bdev_write_zeroes_blocks(nv_cache->bdev_desc, ioch->cache_ioch, 1, + spdk_bdev_get_num_blocks(bdev) - 1, + cb_fn, cb_arg); +} + +static void +ftl_write_fail(struct ftl_io *io, int status) +{ + struct ftl_batch *batch = io->batch; + struct spdk_ftl_dev *dev = io->dev; + struct ftl_wbuf_entry *entry; + struct ftl_band *band; + char buf[128]; + + entry = TAILQ_FIRST(&batch->entries); + + band = ftl_band_from_addr(io->dev, entry->addr); + SPDK_ERRLOG("Write failed @addr: %s, status: %d\n", + ftl_addr2str(entry->addr, buf, sizeof(buf)), status); + + /* Close the band and, halt wptr and defrag */ + ftl_halt_writes(dev, band); + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + /* Invalidate meta set by process_writes() */ + ftl_invalidate_addr(dev, entry->addr); + } + + /* Reset the batch back to the write buffer to resend it later */ + TAILQ_INSERT_TAIL(&dev->pending_batches, batch, tailq); +} + +static void +ftl_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_batch *batch = io->batch; + struct ftl_wbuf_entry *entry; + struct ftl_band *band; + struct ftl_addr prev_addr, addr = io->addr; + + if (status) { + ftl_write_fail(io, status); + return; + } + + assert(io->num_blocks == dev->xfer_size); + assert(!(io->flags & FTL_IO_MD)); + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + band = entry->band; 
+ if (!(entry->io_flags & FTL_IO_PAD)) { + /* Verify that the LBA is set for user blocks */ + assert(entry->lba != FTL_LBA_INVALID); + } + + if (band != NULL) { + assert(band->num_reloc_blocks > 0); + band->num_reloc_blocks--; + } + + entry->addr = addr; + if (entry->lba != FTL_LBA_INVALID) { + pthread_spin_lock(&entry->lock); + prev_addr = ftl_l2p_get(dev, entry->lba); + + /* If the l2p was updated in the meantime, don't update band's metadata */ + if (ftl_addr_cached(prev_addr) && + entry == ftl_get_entry_from_addr(dev, prev_addr)) { + /* Setting entry's cache bit needs to be done after metadata */ + /* within the band is updated to make sure that writes */ + /* invalidating the entry clear the metadata as well */ + ftl_band_set_addr(io->band, entry->lba, entry->addr); + entry->valid = true; + } + pthread_spin_unlock(&entry->lock); + } + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write addr:%lu, lba:%lu\n", + entry->addr.offset, entry->lba); + + addr = ftl_band_next_addr(io->band, addr, 1); + } + + ftl_process_flush(dev, batch); + ftl_release_batch(dev, batch); +} + +static void +ftl_update_stats(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry) +{ + if (!(entry->io_flags & FTL_IO_INTERNAL)) { + dev->stats.write_user++; + } + dev->stats.write_total++; +} + +static void +ftl_update_l2p(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry, + struct ftl_addr addr) +{ + struct ftl_addr prev_addr; + struct ftl_wbuf_entry *prev; + struct ftl_band *band; + int valid; + bool io_weak = entry->io_flags & FTL_IO_WEAK; + + prev_addr = ftl_l2p_get(dev, entry->lba); + if (ftl_addr_invalid(prev_addr)) { + ftl_l2p_set(dev, entry->lba, addr); + return; + } + + if (ftl_addr_cached(prev_addr)) { + prev = ftl_get_entry_from_addr(dev, prev_addr); + pthread_spin_lock(&prev->lock); + + /* Re-read the L2P under the lock to protect against updates */ + /* to this LBA from other threads */ + prev_addr = ftl_l2p_get(dev, entry->lba); + + /* If the entry is no longer in cache, 
another write has been */ + /* scheduled in the meantime, so we can return to evicted path */ + if (!ftl_addr_cached(prev_addr)) { + pthread_spin_unlock(&prev->lock); + goto evicted; + } + + /* + * Relocating block could still reside in cache due to fact that write + * buffers are independent for each IO channel and enough amount of data + * (write unit size) must be collected before it will be submitted to lower + * layer. + * When previous entry wasn't overwritten invalidate old address and entry. + * Otherwise skip relocating block. + */ + if (io_weak && + /* Check if prev_addr was updated in the meantime */ + !(ftl_addr_cmp(prev_addr, ftl_get_addr_from_entry(prev)) && + /* Check if relocating address is the same as in previous entry */ + ftl_addr_cmp(prev->addr, entry->addr))) { + pthread_spin_unlock(&prev->lock); + return; + } + + /* + * If previous entry is part of cache and was written into disk remove + * and invalidate it + */ + if (prev->valid) { + ftl_invalidate_addr(dev, prev->addr); + prev->valid = false; + } + + ftl_l2p_set(dev, entry->lba, addr); + pthread_spin_unlock(&prev->lock); + return; + } + +evicted: + /* + * If the L2P's physical address is different than what we expected we don't need to + * do anything (someone's already overwritten our data). + */ + if (io_weak && !ftl_addr_cmp(prev_addr, entry->addr)) { + return; + } + + /* Lock the band containing previous physical address. This assures atomic changes to */ + /* the L2P as well as metadata. The valid bits in metadata are used to */ + /* check weak writes validity. */ + band = ftl_band_from_addr(dev, prev_addr); + pthread_spin_lock(&band->lba_map.lock); + + valid = ftl_invalidate_addr_unlocked(dev, prev_addr); + + /* If the address has been invalidated already, we don't want to update */ + /* the L2P for weak writes, as it means the write is no longer valid. 
*/ + if (!io_weak || valid) { + ftl_l2p_set(dev, entry->lba, addr); + } + + pthread_spin_unlock(&band->lba_map.lock); +} + +static struct ftl_io * +ftl_io_init_child_write(struct ftl_io *parent, struct ftl_addr addr, ftl_io_fn cb) +{ + struct ftl_io *io; + struct spdk_ftl_dev *dev = parent->dev; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .parent = parent, + .band = parent->band, + .size = sizeof(struct ftl_io), + .flags = 0, + .type = parent->type, + .num_blocks = dev->xfer_size, + .cb_fn = cb, + .iovcnt = 0, + }; + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->addr = addr; + + return io; +} + +static void +ftl_io_child_write_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_zone *zone; + struct ftl_wptr *wptr; + + zone = ftl_band_zone_from_addr(io->band, io->addr); + wptr = ftl_wptr_from_band(io->band); + + zone->busy = false; + zone->info.write_pointer += io->num_blocks; + + if (zone->info.write_pointer == zone->info.zone_id + zone->info.capacity) { + zone->info.state = SPDK_BDEV_ZONE_STATE_FULL; + } + + /* If some other write on the same band failed the write pointer would already be freed */ + if (spdk_likely(wptr)) { + wptr->num_outstanding--; + } +} + +static int +ftl_submit_child_write(struct ftl_wptr *wptr, struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_io *child; + struct ftl_addr addr; + int rc; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + if (spdk_likely(!wptr->direct_mode)) { + addr = wptr->addr; + } else { + assert(io->flags & FTL_IO_DIRECT_ACCESS); + assert(ftl_addr_get_band(dev, io->addr) == wptr->band->id); + addr = io->addr; + } + + /* Split IO to child requests and release zone immediately after child is completed */ + child = ftl_io_init_child_write(io, addr, ftl_io_child_write_cb); + if (!child) { + return -EAGAIN; + } + + wptr->num_outstanding++; + + if (ftl_is_append_supported(dev)) { + rc = 
spdk_bdev_zone_appendv(dev->base_bdev_desc, ioch->base_ioch, + child->iov, child->iov_cnt, + ftl_addr_get_zone_slba(dev, addr), + dev->xfer_size, ftl_io_cmpl_cb, child); + } else { + rc = spdk_bdev_writev_blocks(dev->base_bdev_desc, ioch->base_ioch, + child->iov, child->iov_cnt, addr.offset, + dev->xfer_size, ftl_io_cmpl_cb, child); + } + + if (rc) { + wptr->num_outstanding--; + ftl_io_fail(child, rc); + ftl_io_complete(child); + SPDK_ERRLOG("spdk_bdev_write_blocks_with_md failed with status:%d, addr:%lu\n", + rc, addr.offset); + return -EIO; + } + + ftl_io_inc_req(child); + ftl_io_advance(child, dev->xfer_size); + + return 0; +} + +static int +ftl_submit_write(struct ftl_wptr *wptr, struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + int rc = 0; + + assert(io->num_blocks % dev->xfer_size == 0); + + while (io->iov_pos < io->iov_cnt) { + /* There are no guarantees of the order of completion of NVMe IO submission queue */ + /* so wait until zone is not busy before submitting another write */ + if (!ftl_is_append_supported(dev) && wptr->zone->busy) { + TAILQ_INSERT_TAIL(&wptr->pending_queue, io, ioch_entry); + rc = -EAGAIN; + break; + } + + rc = ftl_submit_child_write(wptr, io); + if (spdk_unlikely(rc)) { + if (rc == -EAGAIN) { + TAILQ_INSERT_TAIL(&wptr->pending_queue, io, ioch_entry); + } else { + ftl_io_fail(io, rc); + } + break; + } + + ftl_trace_submission(dev, io, wptr->addr, dev->xfer_size); + ftl_wptr_advance(wptr, dev->xfer_size); + } + + if (ftl_io_done(io)) { + /* Parent IO will complete after all children are completed */ + ftl_io_complete(io); + } + + return rc; +} + +static void +ftl_flush_pad_batch(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size = 0, num_entries = 0; + + assert(batch != NULL); + assert(batch->num_entries < dev->xfer_size); + + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + num_entries = dev->xfer_size 
- batch->num_entries; + if (size < num_entries) { + ftl_pad_wbuf(dev, num_entries - size); + } +} + +static bool +ftl_check_io_channel_flush(struct spdk_ftl_dev *dev) +{ + struct ftl_io_channel *ioch; + + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + if (ioch->flush && spdk_ring_count(ioch->free_queue) != ioch->num_entries) { + return true; + } + } + + return false; +} + +static int +ftl_wptr_process_writes(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch; + struct ftl_wbuf_entry *entry; + struct ftl_io *io; + + if (spdk_unlikely(!TAILQ_EMPTY(&wptr->pending_queue))) { + io = TAILQ_FIRST(&wptr->pending_queue); + TAILQ_REMOVE(&wptr->pending_queue, io, ioch_entry); + + if (ftl_submit_write(wptr, io) == -EAGAIN) { + return 0; + } + } + + /* Make sure the band is prepared for writing */ + if (!ftl_wptr_ready(wptr)) { + return 0; + } + + if (dev->halt) { + ftl_wptr_process_shutdown(wptr); + } + + if (spdk_unlikely(wptr->flush)) { + ftl_wptr_pad_band(wptr); + } + + batch = ftl_get_next_batch(dev); + if (!batch) { + /* If there are queued flush requests we need to pad the write buffer to */ + /* force out remaining entries */ + if (!LIST_EMPTY(&dev->flush_list) || ftl_check_io_channel_flush(dev)) { + ftl_flush_pad_batch(dev); + } + + return 0; + } + + io = ftl_io_wbuf_init(dev, wptr->addr, wptr->band, batch, ftl_write_cb); + if (!io) { + goto error; + } + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + /* Update band's relocation stats if the IO comes from reloc */ + if (entry->io_flags & FTL_IO_WEAK) { + if (!spdk_bit_array_get(wptr->band->reloc_bitmap, entry->band->id)) { + spdk_bit_array_set(wptr->band->reloc_bitmap, entry->band->id); + entry->band->num_reloc_bands++; + } + } + + ftl_trace_wbuf_pop(dev, entry); + ftl_update_stats(dev, entry); + } + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write addr:%lx\n", wptr->addr.offset); + + if (ftl_submit_write(wptr, io)) { + /* TODO: we need some recovery here */ + assert(0 && "Write 
submit failed"); + if (ftl_io_done(io)) { + ftl_io_free(io); + } + } + + return dev->xfer_size; +error: + TAILQ_INSERT_TAIL(&dev->pending_batches, batch, tailq); + return 0; +} + +static int +ftl_process_writes(struct spdk_ftl_dev *dev) +{ + struct ftl_wptr *wptr, *twptr; + size_t num_active = 0; + enum ftl_band_state state; + + LIST_FOREACH_SAFE(wptr, &dev->wptr_list, list_entry, twptr) { + ftl_wptr_process_writes(wptr); + state = wptr->band->state; + + if (state != FTL_BAND_STATE_FULL && + state != FTL_BAND_STATE_CLOSING && + state != FTL_BAND_STATE_CLOSED) { + num_active++; + } + } + + if (num_active < 1) { + ftl_add_wptr(dev); + } + + return 0; +} + +static void +ftl_fill_wbuf_entry(struct ftl_wbuf_entry *entry, struct ftl_io *io) +{ + memcpy(entry->payload, ftl_io_iovec_addr(io), FTL_BLOCK_SIZE); + + if (entry->io_flags & FTL_IO_WEAK) { + entry->band = ftl_band_from_addr(io->dev, io->addr); + entry->addr = ftl_band_next_addr(entry->band, io->addr, io->pos); + entry->band->num_reloc_blocks++; + } + + entry->trace = io->trace; + entry->lba = ftl_io_current_lba(io); +} + +static int +ftl_wbuf_fill(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_wbuf_entry *entry; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + while (io->pos < io->num_blocks) { + if (ftl_io_current_lba(io) == FTL_LBA_INVALID) { + ftl_io_advance(io, 1); + continue; + } + + entry = ftl_acquire_wbuf_entry(ioch, io->flags); + if (!entry) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + return 0; + } + + ftl_fill_wbuf_entry(entry, io); + + ftl_trace_wbuf_fill(dev, io); + ftl_update_l2p(dev, entry, ftl_get_addr_from_entry(entry)); + ftl_io_advance(io, 1); + + /* Needs to be done after L2P is updated to avoid race with */ + /* write completion callback when it's processed faster than */ + /* L2P is set in update_l2p(). 
*/ + spdk_ring_enqueue(ioch->submit_queue, (void **)&entry, 1, NULL); + } + + if (ftl_io_done(io)) { + if (ftl_dev_has_nv_cache(dev) && !(io->flags & FTL_IO_BYPASS_CACHE)) { + ftl_write_nv_cache(io); + } else { + TAILQ_INSERT_TAIL(&ioch->write_cmpl_queue, io, ioch_entry); + } + } + + return 0; +} + +static bool +ftl_dev_needs_defrag(struct spdk_ftl_dev *dev) +{ + const struct spdk_ftl_limit *limit = ftl_get_limit(dev, SPDK_FTL_LIMIT_START); + + if (ftl_reloc_is_halted(dev->reloc)) { + return false; + } + + if (ftl_reloc_is_defrag_active(dev->reloc)) { + return false; + } + + if (dev->num_free <= limit->thld) { + return true; + } + + return false; +} + +static double +ftl_band_calc_merit(struct ftl_band *band, size_t *threshold_valid) +{ + size_t usable, valid, invalid; + double vld_ratio; + + /* If the band doesn't have any usable blocks it's of no use */ + usable = ftl_band_num_usable_blocks(band); + if (usable == 0) { + return 0.0; + } + + valid = threshold_valid ? (usable - *threshold_valid) : band->lba_map.num_vld; + invalid = usable - valid; + + /* Add one to avoid division by 0 */ + vld_ratio = (double)invalid / (double)(valid + 1); + return vld_ratio * ftl_band_age(band); +} + +static bool +ftl_band_needs_defrag(struct ftl_band *band, struct spdk_ftl_dev *dev) +{ + struct spdk_ftl_conf *conf = &dev->conf; + size_t thld_vld; + + /* If we're in dire need of free bands, every band is worth defragging */ + if (ftl_current_limit(dev) == SPDK_FTL_LIMIT_CRIT) { + return true; + } + + thld_vld = (ftl_band_num_usable_blocks(band) * conf->invalid_thld) / 100; + + return band->merit > ftl_band_calc_merit(band, &thld_vld); +} + +static struct ftl_band * +ftl_select_defrag_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band, *mband = NULL; + double merit = 0; + + LIST_FOREACH(band, &dev->shut_bands, list_entry) { + assert(band->state == FTL_BAND_STATE_CLOSED); + band->merit = ftl_band_calc_merit(band, NULL); + if (band->merit > merit) { + merit = band->merit; + 
mband = band; + } + } + + if (mband && !ftl_band_needs_defrag(mband, dev)) { + mband = NULL; + } + + return mband; +} + +static void +ftl_process_relocs(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + if (ftl_dev_needs_defrag(dev)) { + band = ftl_select_defrag_band(dev); + if (band) { + ftl_reloc_add(dev->reloc, band, 0, ftl_get_num_blocks_in_band(dev), 0, true); + ftl_trace_defrag_band(dev, band); + } + } + + ftl_reloc(dev->reloc); +} + +int +ftl_current_limit(const struct spdk_ftl_dev *dev) +{ + return dev->limit; +} + +void +spdk_ftl_dev_get_attrs(const struct spdk_ftl_dev *dev, struct spdk_ftl_attrs *attrs) +{ + attrs->uuid = dev->uuid; + attrs->num_blocks = dev->num_lbas; + attrs->block_size = FTL_BLOCK_SIZE; + attrs->num_zones = ftl_get_num_zones(dev); + attrs->zone_size = ftl_get_num_blocks_in_zone(dev); + attrs->conf = dev->conf; + attrs->base_bdev = spdk_bdev_get_name(spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); + + attrs->cache_bdev = NULL; + if (dev->nv_cache.bdev_desc) { + attrs->cache_bdev = spdk_bdev_get_name( + spdk_bdev_desc_get_bdev(dev->nv_cache.bdev_desc)); + } +} + +static void +_ftl_io_write(void *ctx) +{ + ftl_io_write((struct ftl_io *)ctx); +} + +static int +ftl_submit_write_leaf(struct ftl_io *io) +{ + int rc; + + rc = ftl_submit_write(ftl_wptr_from_band(io->band), io); + if (rc == -EAGAIN) { + /* EAGAIN means that the request was put on the pending queue */ + return 0; + } + + return rc; +} + +void +ftl_io_write(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(io->ioch); + + /* Put the IO on retry queue in case IO channel is not initialized */ + if (spdk_unlikely(ioch->index == FTL_IO_CHANNEL_INDEX_INVALID)) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + return; + } + + /* For normal IOs we just need to copy the data onto the write buffer */ + if (!(io->flags & FTL_IO_MD)) { + ftl_io_call_foreach_child(io, ftl_wbuf_fill); + } else { + /* Metadata has 
its own buffer, so it doesn't have to be copied, so just */ + /* send it to the core thread and schedule the write immediately */ + if (ftl_check_core_thread(dev)) { + ftl_io_call_foreach_child(io, ftl_submit_write_leaf); + } else { + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_write, io); + } + } +} + +/* Validate the request, create a user write IO and pass it to ftl_io_write(). Returns */ +/* -EINVAL for a zero or mismatched block/iovec count, -EBUSY if the device is not */ +/* initialized, -ENOMEM if the IO cannot be allocated and 0 otherwise. */ +int +spdk_ftl_write(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt, + struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_io *io; + + if (iov_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt != ftl_iovec_num_blocks(iov, iov_cnt)) { + return -EINVAL; + } + + if (!dev->initialized) { + return -EBUSY; + } + + io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_WRITE); + if (!io) { + return -ENOMEM; + } + + ftl_io_write(io); + + return 0; +} + +/* Run ftl_submit_read() on the IO through ftl_io_call_foreach_child() */ +void +ftl_io_read(struct ftl_io *io) +{ + ftl_io_call_foreach_child(io, ftl_submit_read); +} + +/* Validate the request, create a user read IO and pass it to ftl_io_read(). Returns */ +/* -EINVAL for a zero or mismatched block/iovec count, -EBUSY if the device is not */ +/* initialized, -ENOMEM if the IO cannot be allocated and 0 otherwise. */ +int +spdk_ftl_read(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt, + struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_io *io; + + if (iov_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt != ftl_iovec_num_blocks(iov, iov_cnt)) { + return -EINVAL; + } + + if (!dev->initialized) { + return -EBUSY; + } + + io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_READ); + if (!io) { + return -ENOMEM; + } + + ftl_io_read(io); + return 0; +} + +static struct ftl_flush * +ftl_flush_init(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_flush *flush; + + flush = calloc(1, sizeof(*flush)); + if (!flush) { + return NULL; + } + + flush->bmap = spdk_bit_array_create(FTL_BATCH_COUNT); + if (!flush->bmap) { + goto error; + } + + flush->dev = dev; + flush->cb.fn = cb_fn; + flush->cb.ctx = cb_arg; + + return flush; +error: + free(flush); + 
return NULL; +} + +static void +_ftl_flush(void *ctx) +{ + struct ftl_flush *flush = ctx; + struct spdk_ftl_dev *dev = flush->dev; + uint32_t i; + + /* Attach flush object to all non-empty batches */ + for (i = 0; i < FTL_BATCH_COUNT; ++i) { + if (dev->batch_array[i].num_entries > 0) { + spdk_bit_array_set(flush->bmap, i); + flush->num_req++; + } + } + + LIST_INSERT_HEAD(&dev->flush_list, flush, list_entry); + + /* If the write buffer was already empty, the flush can be completed right away */ + if (!flush->num_req) { + ftl_complete_flush(flush); + } +} + +int +ftl_flush_wbuf(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_flush *flush; + + flush = ftl_flush_init(dev, cb_fn, cb_arg); + if (!flush) { + return -ENOMEM; + } + + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_flush, flush); + return 0; +} + +int +spdk_ftl_flush(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + if (!dev->initialized) { + return -EBUSY; + } + + return ftl_flush_wbuf(dev, cb_fn, cb_arg); +} + +bool +ftl_addr_is_written(struct ftl_band *band, struct ftl_addr addr) +{ + struct ftl_zone *zone = ftl_band_zone_from_addr(band, addr); + + return addr.offset < zone->info.write_pointer; +} + +static void ftl_process_media_event(struct spdk_ftl_dev *dev, struct spdk_bdev_media_event event); + +static void +_ftl_process_media_event(void *ctx) +{ + struct ftl_media_event *event = ctx; + struct spdk_ftl_dev *dev = event->dev; + + ftl_process_media_event(dev, event->event); + spdk_mempool_put(dev->media_events_pool, event); +} + +static void +ftl_process_media_event(struct spdk_ftl_dev *dev, struct spdk_bdev_media_event event) +{ + struct ftl_band *band; + struct ftl_addr addr = { .offset = event.offset }; + size_t block_off; + + /* Off the core thread: copy the event into a pool object and forward it there */ + if (!ftl_check_core_thread(dev)) { + struct ftl_media_event *media_event; + + media_event = spdk_mempool_get(dev->media_events_pool); + if (!media_event) { + SPDK_ERRLOG("Media event lost due to lack of memory\n"); + return; + } + +
media_event->dev = dev; + media_event->event = event; + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_process_media_event, + media_event); + return; + } + + band = ftl_band_from_addr(dev, addr); + block_off = ftl_band_block_offset_from_addr(band, addr); + + ftl_reloc_add(dev->reloc, band, block_off, event.num_blocks, 0, false); +} + +void +ftl_get_media_events(struct spdk_ftl_dev *dev) +{ +#define FTL_MAX_MEDIA_EVENTS 128 + struct spdk_bdev_media_event events[FTL_MAX_MEDIA_EVENTS]; + size_t num_events, i; + + if (!dev->initialized) { + return; + } + + do { + num_events = spdk_bdev_get_media_events(dev->base_bdev_desc, + events, FTL_MAX_MEDIA_EVENTS); + + for (i = 0; i < num_events; ++i) { + ftl_process_media_event(dev, events[i]); + } + + } while (num_events); +} + +int +ftl_io_channel_poll(void *arg) +{ + struct ftl_io_channel *ch = arg; + struct ftl_io *io; + TAILQ_HEAD(, ftl_io) retry_queue; + + if (TAILQ_EMPTY(&ch->write_cmpl_queue) && TAILQ_EMPTY(&ch->retry_queue)) { + return SPDK_POLLER_IDLE; + } + + while (!TAILQ_EMPTY(&ch->write_cmpl_queue)) { + io = TAILQ_FIRST(&ch->write_cmpl_queue); + TAILQ_REMOVE(&ch->write_cmpl_queue, io, ioch_entry); + ftl_io_complete(io); + } + + /* + * Create local copy of the retry queue to prevent from infinite retrying if IO will be + * inserted to the retry queue again + */ + TAILQ_INIT(&retry_queue); + TAILQ_SWAP(&ch->retry_queue, &retry_queue, ftl_io, ioch_entry); + + while (!TAILQ_EMPTY(&retry_queue)) { + io = TAILQ_FIRST(&retry_queue); + TAILQ_REMOVE(&retry_queue, io, ioch_entry); + if (io->type == FTL_IO_WRITE) { + ftl_io_write(io); + } else { + ftl_io_read(io); + } + } + + return SPDK_POLLER_BUSY; +} + +int +ftl_task_core(void *ctx) +{ + struct spdk_ftl_dev *dev = ctx; + + if (dev->halt) { + if (ftl_shutdown_complete(dev)) { + spdk_poller_unregister(&dev->core_poller); + return SPDK_POLLER_IDLE; + } + } + + ftl_process_writes(dev); + ftl_process_relocs(dev); + + return SPDK_POLLER_BUSY; +} + 
+SPDK_LOG_REGISTER_COMPONENT("ftl_core", SPDK_LOG_FTL_CORE) diff --git a/src/spdk/lib/ftl/ftl_core.h b/src/spdk/lib/ftl/ftl_core.h new file mode 100644 index 000000000..b782ba731 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_core.h @@ -0,0 +1,552 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FTL_CORE_H +#define FTL_CORE_H + +#include "spdk/stdinc.h" +#include "spdk/uuid.h" +#include "spdk/thread.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/ftl.h" +#include "spdk/bdev.h" +#include "spdk/bdev_zone.h" + +#include "ftl_addr.h" +#include "ftl_io.h" +#include "ftl_trace.h" + +#ifdef SPDK_CONFIG_PMDK +#include "libpmem.h" +#endif /* SPDK_CONFIG_PMDK */ + +struct spdk_ftl_dev; +struct ftl_band; +struct ftl_zone; +struct ftl_io; +struct ftl_restore; +struct ftl_wptr; +struct ftl_flush; +struct ftl_reloc; +struct ftl_anm_event; +struct ftl_band_flush; + +struct ftl_stats { + /* Number of writes scheduled directly by the user */ + uint64_t write_user; + + /* Total number of writes */ + uint64_t write_total; + + /* Traces */ + struct ftl_trace trace; + + /* Number of limits applied */ + uint64_t limits[SPDK_FTL_LIMIT_MAX]; +}; + +struct ftl_global_md { + /* Device instance */ + struct spdk_uuid uuid; + /* Size of the l2p table */ + uint64_t num_lbas; +}; + +struct ftl_nv_cache { + /* Write buffer cache bdev */ + struct spdk_bdev_desc *bdev_desc; + /* Write pointer */ + uint64_t current_addr; + /* Number of available blocks left */ + uint64_t num_available; + /* Maximum number of blocks */ + uint64_t num_data_blocks; + /* + * Phase of the current cycle of writes. Each time whole cache area is filled, the phase is + * advanced. Current phase is saved in every IO's metadata, as well as in the header saved + * in the first sector. By looking at the phase of each block, it's possible to find the + * oldest block and replay the order of the writes when recovering the data from the cache. 
+ */ + unsigned int phase; + /* Indicates that the data can be written to the cache */ + bool ready; + /* Metadata pool */ + struct spdk_mempool *md_pool; + /* DMA buffer for writing the header */ + void *dma_buf; + /* Cache lock */ + pthread_spinlock_t lock; +}; + +struct ftl_batch { + /* Queue of write buffer entries, can reach up to xfer_size entries */ + TAILQ_HEAD(, ftl_wbuf_entry) entries; + /* Number of entries in the queue above */ + uint32_t num_entries; + /* Index within spdk_ftl_dev.batch_array */ + uint32_t index; + struct iovec *iov; + void *metadata; + TAILQ_ENTRY(ftl_batch) tailq; +}; + +struct spdk_ftl_dev { + /* Device instance */ + struct spdk_uuid uuid; + /* Device name */ + char *name; + /* Configuration */ + struct spdk_ftl_conf conf; + + /* Indicates the device is fully initialized */ + int initialized; + /* Indicates the device is about to be stopped */ + int halt; + /* Indicates the device is about to start stopping - use to handle multiple stop request */ + bool halt_started; + + /* Underlying device */ + struct spdk_bdev_desc *base_bdev_desc; + + /* Non-volatile write buffer cache */ + struct ftl_nv_cache nv_cache; + + /* LBA map memory pool */ + struct spdk_mempool *lba_pool; + + /* LBA map requests pool */ + struct spdk_mempool *lba_request_pool; + + /* Media management events pool */ + struct spdk_mempool *media_events_pool; + + /* Statistics */ + struct ftl_stats stats; + + /* Current sequence number */ + uint64_t seq; + + /* Array of bands */ + struct ftl_band *bands; + /* Number of operational bands */ + size_t num_bands; + /* Next write band */ + struct ftl_band *next_band; + /* Free band list */ + LIST_HEAD(, ftl_band) free_bands; + /* Closed bands list */ + LIST_HEAD(, ftl_band) shut_bands; + /* Number of free bands */ + size_t num_free; + + /* List of write pointers */ + LIST_HEAD(, ftl_wptr) wptr_list; + + /* Logical -> physical table */ + void *l2p; + /* Size of the l2p table */ + uint64_t num_lbas; + /* Size of pages mmapped 
for l2p, valid only for mapping on persistent memory */ + size_t l2p_pmem_len; + + /* Address size */ + size_t addr_len; + + /* Flush list */ + LIST_HEAD(, ftl_flush) flush_list; + /* List of band flush requests */ + LIST_HEAD(, ftl_band_flush) band_flush_list; + + /* Device specific md buffer */ + struct ftl_global_md global_md; + + /* Metadata size */ + size_t md_size; + void *md_buf; + + /* Transfer unit size */ + size_t xfer_size; + + /* Current user write limit */ + int limit; + + /* Inflight IO operations */ + uint32_t num_inflight; + + /* Manages data relocation */ + struct ftl_reloc *reloc; + + /* Thread on which the poller is running */ + struct spdk_thread *core_thread; + /* IO channel */ + struct spdk_io_channel *ioch; + /* Poller */ + struct spdk_poller *core_poller; + + /* IO channel array provides means for retrieving write buffer entries + * from their address stored in L2P. The address is divided into two + * parts - IO channel offset pointing at specific IO channel (within this + * array) and entry offset pointing at specific entry within that IO + * channel. + */ + struct ftl_io_channel **ioch_array; + TAILQ_HEAD(, ftl_io_channel) ioch_queue; + uint64_t num_io_channels; + /* Value required to shift address of a write buffer entry to retrieve + * the IO channel it's part of. The other part of the address describes + * the offset of an entry within the IO channel's entry array. + */ + uint64_t ioch_shift; + + /* Write buffer batches */ +#define FTL_BATCH_COUNT 4096 + struct ftl_batch batch_array[FTL_BATCH_COUNT]; + /* Iovec buffer used by batches */ + struct iovec *iov_buf; + /* Batch currently being filled */ + struct ftl_batch *current_batch; + /* Full and ready to be sent batches. A batch is put on this queue in + * case it's already filled, but cannot be sent.
+ */ + TAILQ_HEAD(, ftl_batch) pending_batches; + TAILQ_HEAD(, ftl_batch) free_batches; + + /* Devices' list */ + STAILQ_ENTRY(spdk_ftl_dev) stailq; +}; + +struct ftl_nv_cache_header { + /* Version of the header */ + uint32_t version; + /* UUID of the FTL device */ + struct spdk_uuid uuid; + /* Size of the non-volatile cache (in blocks) */ + uint64_t size; + /* Contains the next address to be written after clean shutdown, invalid LBA otherwise */ + uint64_t current_addr; + /* Current phase */ + uint8_t phase; + /* Checksum of the header, needs to be last element */ + uint32_t checksum; +} __attribute__((packed)); + +struct ftl_media_event { + /* Owner */ + struct spdk_ftl_dev *dev; + /* Media event */ + struct spdk_bdev_media_event event; +}; + +typedef void (*ftl_restore_fn)(struct ftl_restore *, int, void *cb_arg); + +void ftl_apply_limits(struct spdk_ftl_dev *dev); +void ftl_io_read(struct ftl_io *io); +void ftl_io_write(struct ftl_io *io); +int ftl_flush_wbuf(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg); +int ftl_current_limit(const struct spdk_ftl_dev *dev); +int ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr); +int ftl_task_core(void *ctx); +int ftl_task_read(void *ctx); +void ftl_process_anm_event(struct ftl_anm_event *event); +size_t ftl_tail_md_num_blocks(const struct spdk_ftl_dev *dev); +size_t ftl_tail_md_hdr_num_blocks(void); +size_t ftl_vld_map_num_blocks(const struct spdk_ftl_dev *dev); +size_t ftl_lba_map_num_blocks(const struct spdk_ftl_dev *dev); +size_t ftl_head_md_num_blocks(const struct spdk_ftl_dev *dev); +int ftl_restore_md(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg); +int ftl_restore_device(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg); +void ftl_restore_nv_cache(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg); +int ftl_band_set_direct_access(struct ftl_band *band, bool access); +bool ftl_addr_is_written(struct ftl_band *band, struct ftl_addr addr); +int 
ftl_flush_active_bands(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg); +int ftl_nv_cache_write_header(struct ftl_nv_cache *nv_cache, bool shutdown, + spdk_bdev_io_completion_cb cb_fn, void *cb_arg); +int ftl_nv_cache_scrub(struct ftl_nv_cache *nv_cache, spdk_bdev_io_completion_cb cb_fn, + void *cb_arg); +void ftl_get_media_events(struct spdk_ftl_dev *dev); +int ftl_io_channel_poll(void *arg); +void ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry); +struct spdk_io_channel *ftl_get_io_channel(const struct spdk_ftl_dev *dev); +struct ftl_io_channel *ftl_io_channel_get_ctx(struct spdk_io_channel *ioch); + + +#define ftl_to_addr(address) \ + (struct ftl_addr) { .offset = (uint64_t)(address) } + +#define ftl_to_addr_packed(address) \ + (struct ftl_addr) { .pack.offset = (uint32_t)(address) } + +static inline struct spdk_thread * +ftl_get_core_thread(const struct spdk_ftl_dev *dev) +{ + return dev->core_thread; +} + +static inline size_t +ftl_get_num_bands(const struct spdk_ftl_dev *dev) +{ + return dev->num_bands; +} + +static inline size_t +ftl_get_num_punits(const struct spdk_ftl_dev *dev) +{ + return spdk_bdev_get_optimal_open_zones(spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); +} + +static inline size_t +ftl_get_num_zones(const struct spdk_ftl_dev *dev) +{ + return ftl_get_num_bands(dev) * ftl_get_num_punits(dev); +} + +static inline size_t +ftl_get_num_blocks_in_zone(const struct spdk_ftl_dev *dev) +{ + return spdk_bdev_get_zone_size(spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); +} + +static inline uint64_t +ftl_get_num_blocks_in_band(const struct spdk_ftl_dev *dev) +{ + return ftl_get_num_punits(dev) * ftl_get_num_blocks_in_zone(dev); +} + +static inline uint64_t +ftl_addr_get_zone_slba(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return addr.offset -= (addr.offset % ftl_get_num_blocks_in_zone(dev)); +} + +static inline uint64_t +ftl_addr_get_band(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + 
return addr.offset / ftl_get_num_blocks_in_band(dev); +} + +static inline uint64_t +ftl_addr_get_punit(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return (addr.offset / ftl_get_num_blocks_in_zone(dev)) % ftl_get_num_punits(dev); +} + +static inline uint64_t +ftl_addr_get_zone_offset(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + return addr.offset % ftl_get_num_blocks_in_zone(dev); +} + +static inline size_t +ftl_vld_map_size(const struct spdk_ftl_dev *dev) +{ + return (size_t)spdk_divide_round_up(ftl_get_num_blocks_in_band(dev), CHAR_BIT); +} + +static inline int +ftl_addr_packed(const struct spdk_ftl_dev *dev) +{ + return dev->addr_len < 32; +} + +static inline void +ftl_l2p_lba_persist(const struct spdk_ftl_dev *dev, uint64_t lba) +{ +#ifdef SPDK_CONFIG_PMDK + size_t ftl_addr_size = ftl_addr_packed(dev) ? 4 : 8; + pmem_persist((char *)dev->l2p + (lba * ftl_addr_size), ftl_addr_size); +#else /* SPDK_CONFIG_PMDK */ + SPDK_ERRLOG("Libpmem not available, cannot flush l2p to pmem\n"); + assert(0); +#endif /* SPDK_CONFIG_PMDK */ +} + +static inline int +ftl_addr_invalid(struct ftl_addr addr) +{ + return addr.offset == ftl_to_addr(FTL_ADDR_INVALID).offset; +} + +static inline int +ftl_addr_cached(struct ftl_addr addr) +{ + return !ftl_addr_invalid(addr) && addr.cached; +} + +static inline struct ftl_addr +ftl_addr_to_packed(const struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_addr p = {}; + + if (ftl_addr_invalid(addr)) { + p = ftl_to_addr_packed(FTL_ADDR_INVALID); + } else if (ftl_addr_cached(addr)) { + p.pack.cached = 1; + p.pack.cache_offset = (uint32_t) addr.cache_offset; + } else { + p.pack.offset = (uint32_t) addr.offset; + } + + return p; +} + +static inline struct ftl_addr +ftl_addr_from_packed(const struct spdk_ftl_dev *dev, struct ftl_addr p) +{ + struct ftl_addr addr = {}; + + if (p.pack.offset == (uint32_t)FTL_ADDR_INVALID) { + addr = ftl_to_addr(FTL_ADDR_INVALID); + } else if (p.pack.cached) { + addr.cached = 1; 
+ addr.cache_offset = p.pack.cache_offset; + } else { + addr = p; + } + + return addr; +} + +#define _ftl_l2p_set(l2p, off, val, bits) \ + __atomic_store_n(((uint##bits##_t *)(l2p)) + (off), val, __ATOMIC_SEQ_CST) + +#define _ftl_l2p_set32(l2p, off, val) \ + _ftl_l2p_set(l2p, off, val, 32) + +#define _ftl_l2p_set64(l2p, off, val) \ + _ftl_l2p_set(l2p, off, val, 64) + +#define _ftl_l2p_get(l2p, off, bits) \ + __atomic_load_n(((uint##bits##_t *)(l2p)) + (off), __ATOMIC_SEQ_CST) + +#define _ftl_l2p_get32(l2p, off) \ + _ftl_l2p_get(l2p, off, 32) + +#define _ftl_l2p_get64(l2p, off) \ + _ftl_l2p_get(l2p, off, 64) + +#define ftl_addr_cmp(p1, p2) \ + ((p1).offset == (p2).offset) + +static inline void +ftl_l2p_set(struct spdk_ftl_dev *dev, uint64_t lba, struct ftl_addr addr) +{ + assert(dev->num_lbas > lba); + + if (ftl_addr_packed(dev)) { + _ftl_l2p_set32(dev->l2p, lba, ftl_addr_to_packed(dev, addr).offset); + } else { + _ftl_l2p_set64(dev->l2p, lba, addr.offset); + } + + if (dev->l2p_pmem_len != 0) { + ftl_l2p_lba_persist(dev, lba); + } +} + +static inline struct ftl_addr +ftl_l2p_get(struct spdk_ftl_dev *dev, uint64_t lba) +{ + assert(dev->num_lbas > lba); + + if (ftl_addr_packed(dev)) { + return ftl_addr_from_packed(dev, ftl_to_addr_packed( + _ftl_l2p_get32(dev->l2p, lba))); + } else { + return ftl_to_addr(_ftl_l2p_get64(dev->l2p, lba)); + } +} + +static inline bool +ftl_dev_has_nv_cache(const struct spdk_ftl_dev *dev) +{ + return dev->nv_cache.bdev_desc != NULL; +} + +#define FTL_NV_CACHE_HEADER_VERSION (1) +#define FTL_NV_CACHE_DATA_OFFSET (1) +#define FTL_NV_CACHE_PHASE_OFFSET (62) +#define FTL_NV_CACHE_PHASE_COUNT (4) +#define FTL_NV_CACHE_PHASE_MASK (3ULL << FTL_NV_CACHE_PHASE_OFFSET) +#define FTL_NV_CACHE_LBA_INVALID (FTL_LBA_INVALID & ~FTL_NV_CACHE_PHASE_MASK) + +static inline bool +ftl_nv_cache_phase_is_valid(unsigned int phase) +{ + return phase > 0 && phase <= 3; +} + +static inline unsigned int +ftl_nv_cache_next_phase(unsigned int current) +{ + static const 
unsigned int phases[] = { 0, 2, 3, 1 }; + assert(ftl_nv_cache_phase_is_valid(current)); + return phases[current]; +} + +static inline unsigned int +ftl_nv_cache_prev_phase(unsigned int current) +{ + static const unsigned int phases[] = { 0, 3, 1, 2 }; + assert(ftl_nv_cache_phase_is_valid(current)); + return phases[current]; +} + +static inline uint64_t +ftl_nv_cache_pack_lba(uint64_t lba, unsigned int phase) +{ + assert(ftl_nv_cache_phase_is_valid(phase)); + return (lba & ~FTL_NV_CACHE_PHASE_MASK) | ((uint64_t)phase << FTL_NV_CACHE_PHASE_OFFSET); +} + +static inline void +ftl_nv_cache_unpack_lba(uint64_t in_lba, uint64_t *out_lba, unsigned int *phase) +{ + *out_lba = in_lba & ~FTL_NV_CACHE_PHASE_MASK; + *phase = (in_lba & FTL_NV_CACHE_PHASE_MASK) >> FTL_NV_CACHE_PHASE_OFFSET; + + /* If the phase is invalid the block wasn't written yet, so treat the LBA as invalid too */ + if (!ftl_nv_cache_phase_is_valid(*phase) || *out_lba == FTL_NV_CACHE_LBA_INVALID) { + *out_lba = FTL_LBA_INVALID; + } +} + +static inline bool +ftl_is_append_supported(const struct spdk_ftl_dev *dev) +{ + return dev->conf.use_append; +} + +#endif /* FTL_CORE_H */ diff --git a/src/spdk/lib/ftl/ftl_debug.c b/src/spdk/lib/ftl/ftl_debug.c new file mode 100644 index 000000000..9fbb43810 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_debug.c @@ -0,0 +1,169 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk_internal/log.h" +#include "spdk/ftl.h" +#include "ftl_debug.h" +#include "ftl_band.h" + +#if defined(DEBUG) +#if defined(FTL_META_DEBUG) + +static const char *ftl_band_state_str[] = { + "free", + "prep", + "opening", + "open", + "full", + "closing", + "closed", + "max" +}; + +bool +ftl_band_validate_md(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_lba_map *lba_map = &band->lba_map; + struct ftl_addr addr_md, addr_l2p; + size_t i, size, seg_off; + bool valid = true; + + size = ftl_get_num_blocks_in_band(dev); + + pthread_spin_lock(&lba_map->lock); + for (i = 0; i < size; ++i) { + if (!spdk_bit_array_get(lba_map->vld, i)) { + continue; + } + + seg_off = i / FTL_NUM_LBA_IN_BLOCK; + if (lba_map->segments[seg_off] != FTL_LBA_MAP_SEG_CACHED) { + continue; + } + + addr_md = ftl_band_addr_from_block_offset(band, i); + addr_l2p = ftl_l2p_get(dev, lba_map->map[i]); + + if (addr_l2p.cached) { + continue; + } + + if (addr_l2p.offset != addr_md.offset) { + valid = false; + break; + } + + } + + pthread_spin_unlock(&lba_map->lock); + + return valid; +} + +void +ftl_dev_dump_bands(struct spdk_ftl_dev *dev) +{ + size_t i, total = 0; + + if (!dev->bands) { + return; + } + + ftl_debug("Bands validity:\n"); + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + if (dev->bands[i].state == FTL_BAND_STATE_FREE && + dev->bands[i].wr_cnt == 0) { + continue; + } + + if (!dev->bands[i].num_zones) { + ftl_debug(" Band %3zu: all zones are offline\n", i + 1); + continue; + } + + total += dev->bands[i].lba_map.num_vld; + ftl_debug(" Band %3zu: %8zu / %zu \tnum_zones: %zu \twr_cnt: %"PRIu64"\tmerit:" + "%10.3f\tstate: %s\n", + i + 1, dev->bands[i].lba_map.num_vld, + ftl_band_user_blocks(&dev->bands[i]), + dev->bands[i].num_zones, + dev->bands[i].wr_cnt, + dev->bands[i].merit, + ftl_band_state_str[dev->bands[i].state]); + } +} + +#endif /* defined(FTL_META_DEBUG) */ + +#if defined(FTL_DUMP_STATS) + +void +ftl_dev_dump_stats(const struct 
spdk_ftl_dev *dev) +{ + size_t i, total = 0; + char uuid[SPDK_UUID_STRING_LEN]; + double waf; + const char *limits[] = { + [SPDK_FTL_LIMIT_CRIT] = "crit", + [SPDK_FTL_LIMIT_HIGH] = "high", + [SPDK_FTL_LIMIT_LOW] = "low", + [SPDK_FTL_LIMIT_START] = "start" + }; + + if (!dev->bands) { + return; + } + + /* Count the number of valid LBAs */ + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + total += dev->bands[i].lba_map.num_vld; + } + + waf = (double)dev->stats.write_total / (double)dev->stats.write_user; + + spdk_uuid_fmt_lower(uuid, sizeof(uuid), &dev->uuid); + ftl_debug("\n"); + ftl_debug("device UUID: %s\n", uuid); + ftl_debug("total valid LBAs: %zu\n", total); + ftl_debug("total writes: %"PRIu64"\n", dev->stats.write_total); + ftl_debug("user writes: %"PRIu64"\n", dev->stats.write_user); + ftl_debug("WAF: %.4lf\n", waf); + ftl_debug("limits:\n"); + for (i = 0; i < SPDK_FTL_LIMIT_MAX; ++i) { + ftl_debug(" %5s: %"PRIu64"\n", limits[i], dev->stats.limits[i]); + } +} + +#endif /* defined(FTL_DUMP_STATS) */ +#endif /* defined(DEBUG) */ diff --git a/src/spdk/lib/ftl/ftl_debug.h b/src/spdk/lib/ftl/ftl_debug.h new file mode 100644 index 000000000..c90c92ef2 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_debug.h @@ -0,0 +1,73 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FTL_DEBUG_H +#define FTL_DEBUG_H + +#include "ftl_addr.h" +#include "ftl_band.h" +#include "ftl_core.h" + +#if defined(DEBUG) +/* Debug flags - enabled when defined */ +#define FTL_META_DEBUG 1 +#define FTL_DUMP_STATS 1 + +#define ftl_debug(msg, ...) \ + SPDK_ERRLOG(msg, ## __VA_ARGS__) +#else +#define ftl_debug(msg, ...) 
+#endif + +static inline const char * +ftl_addr2str(struct ftl_addr addr, char *buf, size_t size) +{ + snprintf(buf, size, "(%"PRIu64")", addr.offset); + return buf; +} + +#if defined(FTL_META_DEBUG) +bool ftl_band_validate_md(struct ftl_band *band); +void ftl_dev_dump_bands(struct spdk_ftl_dev *dev); +#else +#define ftl_band_validate_md(band) +#define ftl_dev_dump_bands(dev) +#endif + +#if defined(FTL_DUMP_STATS) +void ftl_dev_dump_stats(const struct spdk_ftl_dev *dev); +#else +#define ftl_dev_dump_stats(dev) +#endif + +#endif /* FTL_DEBUG_H */ diff --git a/src/spdk/lib/ftl/ftl_init.c b/src/spdk/lib/ftl/ftl_init.c new file mode 100644 index 000000000..15a8c21c9 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_init.c @@ -0,0 +1,1688 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/nvme.h" +#include "spdk/thread.h" +#include "spdk/string.h" +#include "spdk/likely.h" +#include "spdk_internal/log.h" +#include "spdk/ftl.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/bdev_zone.h" +#include "spdk/bdev_module.h" +#include "spdk/config.h" + +#include "ftl_core.h" +#include "ftl_io.h" +#include "ftl_reloc.h" +#include "ftl_band.h" +#include "ftl_debug.h" + +#ifdef SPDK_CONFIG_PMDK +#include "libpmem.h" +#endif /* SPDK_CONFIG_PMDK */ + +#define FTL_CORE_RING_SIZE 4096 +#define FTL_INIT_TIMEOUT 30 +#define FTL_NSID 1 +#define FTL_ZONE_INFO_COUNT 64 + +/* Dummy bdev module used to claim bdevs.
*/ +static struct spdk_bdev_module g_ftl_bdev_module = { + .name = "ftl_lib", +}; + +struct ftl_dev_init_ctx { + /* Owner */ + struct spdk_ftl_dev *dev; + /* Initial arguments */ + struct spdk_ftl_dev_init_opts opts; + /* IO channel for zone info retrieving */ + struct spdk_io_channel *ioch; + /* Buffer for reading zone info */ + struct spdk_bdev_zone_info info[FTL_ZONE_INFO_COUNT]; + /* Currently read zone */ + size_t zone_id; + /* User's callback */ + spdk_ftl_init_fn cb_fn; + /* Callback's argument */ + void *cb_arg; + /* Thread to call the callback on */ + struct spdk_thread *thread; + /* Poller to check if the device has been destroyed/initialized */ + struct spdk_poller *poller; + /* Status to return for halt completion callback */ + int halt_complete_status; +}; + +static STAILQ_HEAD(, spdk_ftl_dev) g_ftl_queue = STAILQ_HEAD_INITIALIZER(g_ftl_queue); +static pthread_mutex_t g_ftl_queue_lock = PTHREAD_MUTEX_INITIALIZER; +static const struct spdk_ftl_conf g_default_conf = { + .limits = { + /* 5 free bands / 0 % host writes */ + [SPDK_FTL_LIMIT_CRIT] = { .thld = 5, .limit = 0 }, + /* 10 free bands / 5 % host writes */ + [SPDK_FTL_LIMIT_HIGH] = { .thld = 10, .limit = 5 }, + /* 20 free bands / 40 % host writes */ + [SPDK_FTL_LIMIT_LOW] = { .thld = 20, .limit = 40 }, + /* 40 free bands / 100 % host writes - defrag starts running */ + [SPDK_FTL_LIMIT_START] = { .thld = 40, .limit = 100 }, + }, + /* 10 percent valid blocks */ + .invalid_thld = 10, + /* 20% spare blocks */ + .lba_rsvd = 20, + /* 6M write buffer per each IO channel */ + .write_buffer_size = 6 * 1024 * 1024, + /* 90% band fill threshold */ + .band_thld = 90, + /* Max 32 IO depth per band relocate */ + .max_reloc_qdepth = 32, + /* Max 3 active band relocates */ + .max_active_relocs = 3, + /* IO pool size per user thread (this should be adjusted to thread IO qdepth) */ + .user_io_pool_size = 2048, + /* + * If clear ftl will return error when restoring after a dirty shutdown + * If set, last band will be 
padded, ftl will restore based only on closed bands - this + * will result in lost data after recovery. + */ + .allow_open_bands = false, + .max_io_channels = 128, + .nv_cache = { + /* Maximum number of concurrent requests */ + .max_request_cnt = 2048, + /* Maximum number of blocks per request */ + .max_request_size = 16, + } +}; + +static int +ftl_band_init_md(struct ftl_band *band) +{ + struct ftl_lba_map *lba_map = &band->lba_map; + int rc; + + lba_map->vld = spdk_bit_array_create(ftl_get_num_blocks_in_band(band->dev)); + if (!lba_map->vld) { + return -ENOMEM; + } + + rc = pthread_spin_init(&lba_map->lock, PTHREAD_PROCESS_PRIVATE); + if (rc) { + spdk_bit_array_free(&lba_map->vld); + return rc; + } + ftl_band_md_clear(band); + return 0; +} + +static int +ftl_check_conf(const struct spdk_ftl_dev *dev, const struct spdk_ftl_conf *conf) +{ + size_t i; + + if (conf->invalid_thld >= 100) { + return -1; + } + if (conf->lba_rsvd >= 100) { + return -1; + } + if (conf->lba_rsvd == 0) { + return -1; + } + if (conf->write_buffer_size == 0) { + return -1; + } + if (conf->write_buffer_size % FTL_BLOCK_SIZE != 0) { + return -1; + } + + for (i = 0; i < SPDK_FTL_LIMIT_MAX; ++i) { + if (conf->limits[i].limit > 100) { + return -1; + } + } + + return 0; +} + +static int +ftl_dev_init_bands(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band, *pband; + unsigned int i; + int rc = 0; + + LIST_INIT(&dev->free_bands); + LIST_INIT(&dev->shut_bands); + + dev->num_free = 0; + dev->bands = calloc(ftl_get_num_bands(dev), sizeof(*dev->bands)); + if (!dev->bands) { + return -1; + } + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + band = &dev->bands[i]; + band->id = i; + band->dev = dev; + band->state = FTL_BAND_STATE_CLOSED; + + if (LIST_EMPTY(&dev->shut_bands)) { + LIST_INSERT_HEAD(&dev->shut_bands, band, list_entry); + } else { + LIST_INSERT_AFTER(pband, band, list_entry); + } + pband = band; + + CIRCLEQ_INIT(&band->zones); + band->zone_buf = calloc(ftl_get_num_punits(dev), 
sizeof(*band->zone_buf)); + if (!band->zone_buf) { + SPDK_ERRLOG("Failed to allocate block state table for band: [%u]\n", i); + rc = -1; + break; + } + + rc = ftl_band_init_md(band); + if (rc) { + SPDK_ERRLOG("Failed to initialize metadata structures for band [%u]\n", i); + break; + } + + band->reloc_bitmap = spdk_bit_array_create(ftl_get_num_bands(dev)); + if (!band->reloc_bitmap) { + SPDK_ERRLOG("Failed to allocate band relocation bitmap\n"); + break; + } + } + + return rc; +} + +static void +ftl_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx) +{ + struct spdk_ftl_dev *dev = event_ctx; + + switch (type) { + case SPDK_BDEV_EVENT_REMOVE: + assert(0); + break; + case SPDK_BDEV_EVENT_MEDIA_MANAGEMENT: + assert(bdev == spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); + ftl_get_media_events(dev); + default: + break; + } +} + +static int +ftl_dev_init_nv_cache(struct spdk_ftl_dev *dev, const char *bdev_name) +{ + struct spdk_bdev *bdev; + struct spdk_ftl_conf *conf = &dev->conf; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + char pool_name[128]; + int rc; + + if (!bdev_name) { + return 0; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("Unable to find bdev: %s\n", bdev_name); + return -1; + } + + if (spdk_bdev_open_ext(bdev_name, true, ftl_bdev_event_cb, + dev, &nv_cache->bdev_desc)) { + SPDK_ERRLOG("Unable to open bdev: %s\n", bdev_name); + return -1; + } + + if (spdk_bdev_module_claim_bdev(bdev, nv_cache->bdev_desc, &g_ftl_bdev_module)) { + spdk_bdev_close(nv_cache->bdev_desc); + nv_cache->bdev_desc = NULL; + SPDK_ERRLOG("Unable to claim bdev %s\n", bdev_name); + return -1; + } + + SPDK_INFOLOG(SPDK_LOG_FTL_INIT, "Using %s as write buffer cache\n", + spdk_bdev_get_name(bdev)); + + if (spdk_bdev_get_block_size(bdev) != FTL_BLOCK_SIZE) { + SPDK_ERRLOG("Unsupported block size (%d)\n", spdk_bdev_get_block_size(bdev)); + return -1; + } + + if (!spdk_bdev_is_md_separate(bdev)) { + SPDK_ERRLOG("Bdev %s 
doesn't support separate metadata buffer IO\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + if (spdk_bdev_get_md_size(bdev) < sizeof(uint64_t)) { + SPDK_ERRLOG("Bdev's %s metadata is too small (%"PRIu32")\n", + spdk_bdev_get_name(bdev), spdk_bdev_get_md_size(bdev)); + return -1; + } + + if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { + SPDK_ERRLOG("Unsupported DIF type used by bdev %s\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + /* The cache needs to be capable of storing at least two full bands. This requirement comes + * from the fact that cache works as a protection against power loss, so before the data + * inside the cache can be overwritten, the band it's stored on has to be closed. Plus one + * extra block is needed to store the header. + */ + if (spdk_bdev_get_num_blocks(bdev) < ftl_get_num_blocks_in_band(dev) * 2 + 1) { + SPDK_ERRLOG("Insufficient number of blocks for write buffer cache (available: %" + PRIu64", required: %"PRIu64")\n", spdk_bdev_get_num_blocks(bdev), + ftl_get_num_blocks_in_band(dev) * 2 + 1); + return -1; + } + + rc = snprintf(pool_name, sizeof(pool_name), "ftl-nvpool-%p", dev); + if (rc < 0 || rc >= 128) { + return -1; + } + + nv_cache->md_pool = spdk_mempool_create(pool_name, conf->nv_cache.max_request_cnt, + spdk_bdev_get_md_size(bdev) * + conf->nv_cache.max_request_size, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!nv_cache->md_pool) { + SPDK_ERRLOG("Failed to initialize non-volatile cache metadata pool\n"); + return -1; + } + + nv_cache->dma_buf = spdk_dma_zmalloc(FTL_BLOCK_SIZE, spdk_bdev_get_buf_align(bdev), NULL); + if (!nv_cache->dma_buf) { + SPDK_ERRLOG("Memory allocation failure\n"); + return -1; + } + + if (pthread_spin_init(&nv_cache->lock, PTHREAD_PROCESS_PRIVATE)) { + SPDK_ERRLOG("Failed to initialize cache lock\n"); + return -1; + } + + nv_cache->current_addr = FTL_NV_CACHE_DATA_OFFSET; + nv_cache->num_data_blocks = spdk_bdev_get_num_blocks(bdev) - 1; + 
nv_cache->num_available = nv_cache->num_data_blocks; + nv_cache->ready = false; + + return 0; +} + +void +spdk_ftl_conf_init_defaults(struct spdk_ftl_conf *conf) +{ + *conf = g_default_conf; +} + +static void +ftl_lba_map_request_ctor(struct spdk_mempool *mp, void *opaque, void *obj, unsigned obj_idx) +{ + struct ftl_lba_map_request *request = obj; + struct spdk_ftl_dev *dev = opaque; + + request->segments = spdk_bit_array_create(spdk_divide_round_up( + ftl_get_num_blocks_in_band(dev), FTL_NUM_LBA_IN_BLOCK)); +} + +static int +ftl_init_media_events_pool(struct spdk_ftl_dev *dev) +{ + char pool_name[128]; + int rc; + + rc = snprintf(pool_name, sizeof(pool_name), "ftl-media-%p", dev); + if (rc < 0 || rc >= (int)sizeof(pool_name)) { + SPDK_ERRLOG("Failed to create media pool name\n"); + return -1; + } + + dev->media_events_pool = spdk_mempool_create(pool_name, 1024, + sizeof(struct ftl_media_event), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!dev->media_events_pool) { + SPDK_ERRLOG("Failed to create media events pool\n"); + return -1; + } + + return 0; +} + +static int +ftl_init_lba_map_pools(struct spdk_ftl_dev *dev) +{ +#define POOL_NAME_LEN 128 + char pool_name[POOL_NAME_LEN]; + int rc; + + rc = snprintf(pool_name, sizeof(pool_name), "%s-%s", dev->name, "ftl-lba-pool"); + if (rc < 0 || rc >= POOL_NAME_LEN) { + return -ENAMETOOLONG; + } + + /* We need to reserve at least 2 buffers for band close / open sequence + * alone, plus additional (8) buffers for handling write errors. + * TODO: This memory pool is utilized only by core thread - it introduce + * unnecessary overhead and should be replaced by different data structure. 
+ */ + dev->lba_pool = spdk_mempool_create(pool_name, 2 + 8, + ftl_lba_map_pool_elem_size(dev), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!dev->lba_pool) { + return -ENOMEM; + } + + rc = snprintf(pool_name, sizeof(pool_name), "%s-%s", dev->name, "ftl-lbareq-pool"); + if (rc < 0 || rc >= POOL_NAME_LEN) { + return -ENAMETOOLONG; + } + + dev->lba_request_pool = spdk_mempool_create_ctor(pool_name, + dev->conf.max_reloc_qdepth * dev->conf.max_active_relocs, + sizeof(struct ftl_lba_map_request), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY, + ftl_lba_map_request_ctor, + dev); + if (!dev->lba_request_pool) { + return -ENOMEM; + } + + return 0; +} + +static void +ftl_init_wptr_list(struct spdk_ftl_dev *dev) +{ + LIST_INIT(&dev->wptr_list); + LIST_INIT(&dev->flush_list); + LIST_INIT(&dev->band_flush_list); +} + +static size_t +ftl_dev_band_max_seq(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + size_t seq = 0; + + LIST_FOREACH(band, &dev->shut_bands, list_entry) { + if (band->seq > seq) { + seq = band->seq; + } + } + + return seq; +} + +static void +_ftl_init_bands_state(void *ctx) +{ + struct ftl_band *band, *temp_band; + struct spdk_ftl_dev *dev = ctx; + + dev->seq = ftl_dev_band_max_seq(dev); + + LIST_FOREACH_SAFE(band, &dev->shut_bands, list_entry, temp_band) { + if (!band->lba_map.num_vld) { + ftl_band_set_state(band, FTL_BAND_STATE_FREE); + } + } + + ftl_reloc_resume(dev->reloc); + /* Clear the limit applications as they're incremented incorrectly by */ + /* the initialization code */ + memset(dev->stats.limits, 0, sizeof(dev->stats.limits)); +} + +static int +ftl_init_num_free_bands(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + int cnt = 0; + + LIST_FOREACH(band, &dev->shut_bands, list_entry) { + if (band->num_zones && !band->lba_map.num_vld) { + cnt++; + } + } + return cnt; +} + +static int +ftl_init_bands_state(struct spdk_ftl_dev *dev) +{ + /* TODO: Should we abort initialization or expose read only device 
*/ + /* if there is no free bands? */ + /* If we abort initialization should we depend on condition that */ + /* we have no free bands or should we have some minimal number of */ + /* free bands? */ + if (!ftl_init_num_free_bands(dev)) { + return -1; + } + + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_init_bands_state, dev); + return 0; +} + +static void +_ftl_dev_init_core_thread(void *ctx) +{ + struct spdk_ftl_dev *dev = ctx; + + dev->core_poller = SPDK_POLLER_REGISTER(ftl_task_core, dev, 0); + if (!dev->core_poller) { + SPDK_ERRLOG("Unable to register core poller\n"); + assert(0); + } + + dev->ioch = spdk_get_io_channel(dev); +} + +static int +ftl_dev_init_core_thread(struct spdk_ftl_dev *dev, const struct spdk_ftl_dev_init_opts *opts) +{ + if (!opts->core_thread) { + return -1; + } + + dev->core_thread = opts->core_thread; + + spdk_thread_send_msg(opts->core_thread, _ftl_dev_init_core_thread, dev); + return 0; +} + +static int +ftl_dev_l2p_alloc_pmem(struct spdk_ftl_dev *dev, size_t l2p_size, const char *l2p_path) +{ +#ifdef SPDK_CONFIG_PMDK + int is_pmem; + + if ((dev->l2p = pmem_map_file(l2p_path, 0, + 0, 0, &dev->l2p_pmem_len, &is_pmem)) == NULL) { + SPDK_ERRLOG("Failed to mmap l2p_path\n"); + return -1; + } + + if (!is_pmem) { + SPDK_NOTICELOG("l2p_path mapped on non-pmem device\n"); + } + + if (dev->l2p_pmem_len < l2p_size) { + SPDK_ERRLOG("l2p_path file is too small\n"); + return -1; + } + + pmem_memset_persist(dev->l2p, FTL_ADDR_INVALID, l2p_size); + + return 0; +#else /* SPDK_CONFIG_PMDK */ + SPDK_ERRLOG("Libpmem not available, cannot use pmem l2p_path\n"); + return -1; +#endif /* SPDK_CONFIG_PMDK */ +} + +static int +ftl_dev_l2p_alloc_dram(struct spdk_ftl_dev *dev, size_t l2p_size) +{ + dev->l2p = malloc(l2p_size); + if (!dev->l2p) { + SPDK_ERRLOG("Failed to allocate l2p table\n"); + return -1; + } + + memset(dev->l2p, FTL_ADDR_INVALID, l2p_size); + + return 0; +} + +static int +ftl_dev_l2p_alloc(struct spdk_ftl_dev *dev) +{ + size_t addr_size 
= dev->addr_len >= 32 ? 8 : 4; + size_t l2p_size = dev->num_lbas * addr_size; + const char *l2p_path = dev->conf.l2p_path; + + if (dev->num_lbas == 0) { + SPDK_ERRLOG("Invalid l2p table size\n"); + return -1; + } + + if (dev->l2p) { + SPDK_ERRLOG("L2p table already allocated\n"); + return -1; + } + + dev->l2p_pmem_len = 0; + if (l2p_path) { + return ftl_dev_l2p_alloc_pmem(dev, l2p_size, l2p_path); + } else { + return ftl_dev_l2p_alloc_dram(dev, l2p_size); + } +} + +static void +ftl_dev_free_init_ctx(struct ftl_dev_init_ctx *init_ctx) +{ + if (!init_ctx) { + return; + } + + if (init_ctx->ioch) { + spdk_put_io_channel(init_ctx->ioch); + } + + free(init_ctx); +} + +static void +ftl_call_init_complete_cb(void *ctx) +{ + struct ftl_dev_init_ctx *init_ctx = ctx; + struct spdk_ftl_dev *dev = init_ctx->dev; + + if (init_ctx->cb_fn != NULL) { + init_ctx->cb_fn(dev, init_ctx->cb_arg, 0); + } + + ftl_dev_free_init_ctx(init_ctx); +} + +static void +ftl_init_complete(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + pthread_mutex_lock(&g_ftl_queue_lock); + STAILQ_INSERT_HEAD(&g_ftl_queue, dev, stailq); + pthread_mutex_unlock(&g_ftl_queue_lock); + + dev->initialized = 1; + + spdk_thread_send_msg(init_ctx->thread, ftl_call_init_complete_cb, init_ctx); +} + +static void +ftl_init_fail_cb(struct spdk_ftl_dev *dev, void *ctx, int status) +{ + struct ftl_dev_init_ctx *init_ctx = ctx; + + if (init_ctx->cb_fn != NULL) { + init_ctx->cb_fn(NULL, init_ctx->cb_arg, -ENODEV); + } + + ftl_dev_free_init_ctx(init_ctx); +} + +static int ftl_dev_free(struct spdk_ftl_dev *dev, spdk_ftl_init_fn cb_fn, void *cb_arg, + struct spdk_thread *thread); + +static void +ftl_init_fail(struct ftl_dev_init_ctx *init_ctx) +{ + if (ftl_dev_free(init_ctx->dev, ftl_init_fail_cb, init_ctx, init_ctx->thread)) { + SPDK_ERRLOG("Unable to free the device\n"); + assert(0); + } +} + +static void +ftl_write_nv_cache_md_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + 
struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Writing non-volatile cache's metadata header failed\n"); + ftl_init_fail(init_ctx); + return; + } + + dev->nv_cache.ready = true; + ftl_init_complete(init_ctx); +} + +static void +ftl_clear_nv_cache_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to clear the non-volatile cache bdev\n"); + ftl_init_fail(init_ctx); + return; + } + + nv_cache->phase = 1; + if (ftl_nv_cache_write_header(nv_cache, false, ftl_write_nv_cache_md_cb, init_ctx)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n"); + ftl_init_fail(init_ctx); + } +} + +static void +_ftl_nv_cache_scrub(void *ctx) +{ + struct ftl_dev_init_ctx *init_ctx = ctx; + struct spdk_ftl_dev *dev = init_ctx->dev; + int rc; + + rc = ftl_nv_cache_scrub(&dev->nv_cache, ftl_clear_nv_cache_cb, init_ctx); + + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to clear the non-volatile cache bdev: %s\n", + spdk_strerror(-rc)); + ftl_init_fail(init_ctx); + } +} + +static int +ftl_setup_initial_state(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + struct spdk_ftl_conf *conf = &dev->conf; + size_t i; + + spdk_uuid_generate(&dev->uuid); + + dev->num_lbas = 0; + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + dev->num_lbas += ftl_band_num_usable_blocks(&dev->bands[i]); + } + + dev->num_lbas = (dev->num_lbas * (100 - conf->lba_rsvd)) / 100; + + if (ftl_dev_l2p_alloc(dev)) { + SPDK_ERRLOG("Unable to init l2p table\n"); + return -1; + } + + if (ftl_init_bands_state(dev)) { + SPDK_ERRLOG("Unable to finish the initialization\n"); + return -1; + } + + if 
(!ftl_dev_has_nv_cache(dev)) { + ftl_init_complete(init_ctx); + } else { + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_nv_cache_scrub, init_ctx); + } + + return 0; +} + +static void +ftl_restore_nv_cache_cb(struct ftl_restore *restore, int status, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + + if (spdk_unlikely(status != 0)) { + SPDK_ERRLOG("Failed to restore the non-volatile cache state\n"); + ftl_init_fail(init_ctx); + return; + } + + ftl_init_complete(init_ctx); +} + +static void +ftl_restore_device_cb(struct ftl_restore *restore, int status, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + + if (status) { + SPDK_ERRLOG("Failed to restore the device from the SSD\n"); + ftl_init_fail(init_ctx); + return; + } + + if (ftl_init_bands_state(dev)) { + SPDK_ERRLOG("Unable to finish the initialization\n"); + ftl_init_fail(init_ctx); + return; + } + + if (!ftl_dev_has_nv_cache(dev)) { + ftl_init_complete(init_ctx); + return; + } + + ftl_restore_nv_cache(restore, ftl_restore_nv_cache_cb, init_ctx); +} + +static void +ftl_restore_md_cb(struct ftl_restore *restore, int status, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + + if (status) { + SPDK_ERRLOG("Failed to restore the metadata from the SSD\n"); + goto error; + } + + /* After the metadata is read it should be possible to allocate the L2P */ + if (ftl_dev_l2p_alloc(init_ctx->dev)) { + SPDK_ERRLOG("Failed to allocate the L2P\n"); + goto error; + } + + if (ftl_restore_device(restore, ftl_restore_device_cb, init_ctx)) { + SPDK_ERRLOG("Failed to start device restoration from the SSD\n"); + goto error; + } + + return; +error: + ftl_init_fail(init_ctx); +} + +static int +ftl_restore_state(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + dev->uuid = init_ctx->opts.uuid; + + if (ftl_restore_md(dev, ftl_restore_md_cb, init_ctx)) { + SPDK_ERRLOG("Failed to start metadata restoration from 
the SSD\n"); + return -1; + } + + return 0; +} + +static void +ftl_dev_update_bands(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band, *temp_band; + size_t i; + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + band = &dev->bands[i]; + band->tail_md_addr = ftl_band_tail_md_addr(band); + } + + /* Remove band from shut_bands list to prevent further processing */ + /* if all blocks on this band are bad */ + LIST_FOREACH_SAFE(band, &dev->shut_bands, list_entry, temp_band) { + if (!band->num_zones) { + dev->num_bands--; + LIST_REMOVE(band, list_entry); + } + } +} + +static void +ftl_dev_init_state(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + ftl_dev_update_bands(dev); + + if (ftl_dev_init_core_thread(dev, &init_ctx->opts)) { + SPDK_ERRLOG("Unable to initialize device thread\n"); + ftl_init_fail(init_ctx); + return; + } + + if (init_ctx->opts.mode & SPDK_FTL_MODE_CREATE) { + if (ftl_setup_initial_state(init_ctx)) { + SPDK_ERRLOG("Failed to setup initial state of the device\n"); + ftl_init_fail(init_ctx); + return; + } + } else { + if (ftl_restore_state(init_ctx)) { + SPDK_ERRLOG("Unable to restore device's state from the SSD\n"); + ftl_init_fail(init_ctx); + return; + } + } +} + +static void ftl_dev_get_zone_info(struct ftl_dev_init_ctx *init_ctx); + +static void +ftl_dev_get_zone_info_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *init_ctx = cb_arg; + struct spdk_ftl_dev *dev = init_ctx->dev; + struct ftl_band *band; + struct ftl_zone *zone; + struct ftl_addr addr; + size_t i, zones_left, num_zones; + + spdk_bdev_free_io(bdev_io); + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to read zone info for zone id: %"PRIu64"\n", init_ctx->zone_id); + ftl_init_fail(init_ctx); + return; + } + + zones_left = ftl_get_num_zones(dev) - (init_ctx->zone_id / ftl_get_num_blocks_in_zone(dev)); + num_zones = spdk_min(zones_left, FTL_ZONE_INFO_COUNT); + + for (i = 0; i < num_zones; ++i) { + 
addr.offset = init_ctx->info[i].zone_id; + band = &dev->bands[ftl_addr_get_band(dev, addr)]; + zone = &band->zone_buf[ftl_addr_get_punit(dev, addr)]; + zone->info = init_ctx->info[i]; + + /* TODO: add support for zone capacity less than zone size */ + if (zone->info.capacity != ftl_get_num_blocks_in_zone(dev)) { + zone->info.state = SPDK_BDEV_ZONE_STATE_OFFLINE; + SPDK_ERRLOG("Zone capacity is not equal zone size for " + "zone id: %"PRIu64"\n", init_ctx->zone_id); + } + + /* Set write pointer to the last block plus one for zone in full state */ + if (zone->info.state == SPDK_BDEV_ZONE_STATE_FULL) { + zone->info.write_pointer = zone->info.zone_id + zone->info.capacity; + } + + if (zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE) { + band->num_zones++; + CIRCLEQ_INSERT_TAIL(&band->zones, zone, circleq); + } + } + + init_ctx->zone_id = init_ctx->zone_id + num_zones * ftl_get_num_blocks_in_zone(dev); + + ftl_dev_get_zone_info(init_ctx); +} + +static void +ftl_dev_get_zone_info(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + size_t zones_left, num_zones; + int rc; + + zones_left = ftl_get_num_zones(dev) - (init_ctx->zone_id / ftl_get_num_blocks_in_zone(dev)); + if (zones_left == 0) { + ftl_dev_init_state(init_ctx); + return; + } + + num_zones = spdk_min(zones_left, FTL_ZONE_INFO_COUNT); + + rc = spdk_bdev_get_zone_info(dev->base_bdev_desc, init_ctx->ioch, + init_ctx->zone_id, num_zones, init_ctx->info, + ftl_dev_get_zone_info_cb, init_ctx); + + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to read zone info for zone id: %"PRIu64"\n", init_ctx->zone_id); + ftl_init_fail(init_ctx); + } +} + +static int +ftl_dev_init_zones(struct ftl_dev_init_ctx *init_ctx) +{ + struct spdk_ftl_dev *dev = init_ctx->dev; + + init_ctx->zone_id = 0; + init_ctx->ioch = spdk_bdev_get_io_channel(dev->base_bdev_desc); + if (!init_ctx->ioch) { + SPDK_ERRLOG("Failed to get base bdev IO channel\n"); + return -1; + } + + ftl_dev_get_zone_info(init_ctx); + + 
return 0; +} + +struct _ftl_io_channel { + struct ftl_io_channel *ioch; +}; + +struct ftl_io_channel * +ftl_io_channel_get_ctx(struct spdk_io_channel *ioch) +{ + struct _ftl_io_channel *_ioch = spdk_io_channel_get_ctx(ioch); + + return _ioch->ioch; +} + +static void +ftl_io_channel_register(void *ctx) +{ + struct ftl_io_channel *ioch = ctx; + struct spdk_ftl_dev *dev = ioch->dev; + uint32_t ioch_index; + + for (ioch_index = 0; ioch_index < dev->conf.max_io_channels; ++ioch_index) { + if (dev->ioch_array[ioch_index] == NULL) { + dev->ioch_array[ioch_index] = ioch; + ioch->index = ioch_index; + break; + } + } + + assert(ioch_index < dev->conf.max_io_channels); + TAILQ_INSERT_TAIL(&dev->ioch_queue, ioch, tailq); +} + +static int +ftl_io_channel_init_wbuf(struct ftl_io_channel *ioch) +{ + struct spdk_ftl_dev *dev = ioch->dev; + struct ftl_wbuf_entry *entry; + uint32_t i; + int rc; + + ioch->num_entries = dev->conf.write_buffer_size / FTL_BLOCK_SIZE; + ioch->wbuf_entries = calloc(ioch->num_entries, sizeof(*ioch->wbuf_entries)); + if (ioch->wbuf_entries == NULL) { + SPDK_ERRLOG("Failed to allocate write buffer entry array\n"); + return -1; + } + + ioch->qdepth_limit = ioch->num_entries; + ioch->wbuf_payload = spdk_zmalloc(dev->conf.write_buffer_size, FTL_BLOCK_SIZE, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (ioch->wbuf_payload == NULL) { + SPDK_ERRLOG("Failed to allocate write buffer payload\n"); + goto error_entries; + } + + ioch->free_queue = spdk_ring_create(SPDK_RING_TYPE_SP_SC, + spdk_align32pow2(ioch->num_entries + 1), + SPDK_ENV_SOCKET_ID_ANY); + if (ioch->free_queue == NULL) { + SPDK_ERRLOG("Failed to allocate free queue\n"); + goto error_payload; + } + + ioch->submit_queue = spdk_ring_create(SPDK_RING_TYPE_SP_SC, + spdk_align32pow2(ioch->num_entries + 1), + SPDK_ENV_SOCKET_ID_ANY); + if (ioch->submit_queue == NULL) { + SPDK_ERRLOG("Failed to allocate submit queue\n"); + goto error_free_queue; + } + + for (i = 0; i < ioch->num_entries; ++i) { + entry 
= &ioch->wbuf_entries[i]; + entry->payload = (char *)ioch->wbuf_payload + i * FTL_BLOCK_SIZE; + entry->ioch = ioch; + entry->index = i; + entry->addr.offset = FTL_ADDR_INVALID; + + rc = pthread_spin_init(&entry->lock, PTHREAD_PROCESS_PRIVATE); + if (rc != 0) { + SPDK_ERRLOG("Failed to initialize spinlock\n"); + goto error_spinlock; + } + + spdk_ring_enqueue(ioch->free_queue, (void **)&entry, 1, NULL); + } + + return 0; +error_spinlock: + for (; i > 0; --i) { + pthread_spin_destroy(&ioch->wbuf_entries[i - 1].lock); + } + + spdk_ring_free(ioch->submit_queue); +error_free_queue: + spdk_ring_free(ioch->free_queue); +error_payload: + spdk_free(ioch->wbuf_payload); +error_entries: + free(ioch->wbuf_entries); + + return -1; +} + +static int +ftl_io_channel_create_cb(void *io_device, void *ctx) +{ + struct spdk_ftl_dev *dev = io_device; + struct _ftl_io_channel *_ioch = ctx; + struct ftl_io_channel *ioch; + uint32_t num_io_channels; + char mempool_name[32]; + int rc; + + num_io_channels = __atomic_fetch_add(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST); + if (num_io_channels >= dev->conf.max_io_channels) { + SPDK_ERRLOG("Reached maximum number of IO channels\n"); + __atomic_fetch_sub(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST); + return -1; + } + + ioch = calloc(1, sizeof(*ioch)); + if (ioch == NULL) { + SPDK_ERRLOG("Failed to allocate IO channel\n"); + return -1; + } + + rc = snprintf(mempool_name, sizeof(mempool_name), "ftl_io_%p", ioch); + if (rc < 0 || rc >= (int)sizeof(mempool_name)) { + SPDK_ERRLOG("Failed to create IO channel pool name\n"); + free(ioch); + return -1; + } + + ioch->cache_ioch = NULL; + ioch->index = FTL_IO_CHANNEL_INDEX_INVALID; + ioch->dev = dev; + ioch->elem_size = sizeof(struct ftl_md_io); + ioch->io_pool = spdk_mempool_create(mempool_name, + dev->conf.user_io_pool_size, + ioch->elem_size, + 0, + SPDK_ENV_SOCKET_ID_ANY); + if (!ioch->io_pool) { + SPDK_ERRLOG("Failed to create IO channel's IO pool\n"); + free(ioch); + return -1; + } + + 
ioch->base_ioch = spdk_bdev_get_io_channel(dev->base_bdev_desc); + if (!ioch->base_ioch) { + SPDK_ERRLOG("Failed to create base bdev IO channel\n"); + goto fail_ioch; + } + + if (ftl_dev_has_nv_cache(dev)) { + ioch->cache_ioch = spdk_bdev_get_io_channel(dev->nv_cache.bdev_desc); + if (!ioch->cache_ioch) { + SPDK_ERRLOG("Failed to create cache IO channel\n"); + goto fail_cache; + } + } + + TAILQ_INIT(&ioch->write_cmpl_queue); + TAILQ_INIT(&ioch->retry_queue); + ioch->poller = SPDK_POLLER_REGISTER(ftl_io_channel_poll, ioch, 0); + if (!ioch->poller) { + SPDK_ERRLOG("Failed to register IO channel poller\n"); + goto fail_poller; + } + + if (ftl_io_channel_init_wbuf(ioch)) { + SPDK_ERRLOG("Failed to initialize IO channel's write buffer\n"); + goto fail_wbuf; + } + + _ioch->ioch = ioch; + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_io_channel_register, ioch); + + return 0; +fail_wbuf: + spdk_poller_unregister(&ioch->poller); +fail_poller: + if (ioch->cache_ioch) { + spdk_put_io_channel(ioch->cache_ioch); + } +fail_cache: + spdk_put_io_channel(ioch->base_ioch); +fail_ioch: + spdk_mempool_free(ioch->io_pool); + free(ioch); + + return -1; +} + +static void +ftl_io_channel_unregister(void *ctx) +{ + struct ftl_io_channel *ioch = ctx; + struct spdk_ftl_dev *dev = ioch->dev; + uint32_t i, num_io_channels __attribute__((unused)); + + assert(ioch->index < dev->conf.max_io_channels); + assert(dev->ioch_array[ioch->index] == ioch); + + dev->ioch_array[ioch->index] = NULL; + TAILQ_REMOVE(&dev->ioch_queue, ioch, tailq); + + num_io_channels = __atomic_fetch_sub(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST); + assert(num_io_channels > 0); + + for (i = 0; i < ioch->num_entries; ++i) { + pthread_spin_destroy(&ioch->wbuf_entries[i].lock); + } + + spdk_mempool_free(ioch->io_pool); + spdk_ring_free(ioch->free_queue); + spdk_ring_free(ioch->submit_queue); + spdk_free(ioch->wbuf_payload); + free(ioch->wbuf_entries); + free(ioch); +} + +static void +_ftl_io_channel_destroy_cb(void *ctx) 
+{ + struct ftl_io_channel *ioch = ctx; + struct spdk_ftl_dev *dev = ioch->dev; + uint32_t i; + + /* Do not destroy the channel if some of its entries are still in use */ + if (spdk_ring_count(ioch->free_queue) != ioch->num_entries) { + spdk_thread_send_msg(spdk_get_thread(), _ftl_io_channel_destroy_cb, ctx); + return; + } + + /* Evict all valid entries from cache */ + for (i = 0; i < ioch->num_entries; ++i) { + ftl_evict_cache_entry(dev, &ioch->wbuf_entries[i]); + } + + spdk_poller_unregister(&ioch->poller); + + spdk_put_io_channel(ioch->base_ioch); + if (ioch->cache_ioch) { + spdk_put_io_channel(ioch->cache_ioch); + } + + ioch->base_ioch = NULL; + ioch->cache_ioch = NULL; + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_io_channel_unregister, ioch); +} + +static void +ftl_io_channel_destroy_cb(void *io_device, void *ctx) +{ + struct _ftl_io_channel *_ioch = ctx; + struct ftl_io_channel *ioch = _ioch->ioch; + + /* Mark the IO channel as being flush to force out any unwritten entries */ + ioch->flush = true; + + _ftl_io_channel_destroy_cb(ioch); +} + +static int +ftl_dev_init_io_channel(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch; + uint32_t i; + + /* Align the IO channels to nearest power of 2 to allow for easy addr bit shift */ + dev->conf.max_io_channels = spdk_align32pow2(dev->conf.max_io_channels); + dev->ioch_shift = spdk_u32log2(dev->conf.max_io_channels); + + dev->ioch_array = calloc(dev->conf.max_io_channels, sizeof(*dev->ioch_array)); + if (!dev->ioch_array) { + SPDK_ERRLOG("Failed to allocate IO channel array\n"); + return -1; + } + + if (dev->md_size > 0) { + dev->md_buf = spdk_zmalloc(dev->md_size * dev->xfer_size * FTL_BATCH_COUNT, + dev->md_size, NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA); + if (dev->md_buf == NULL) { + SPDK_ERRLOG("Failed to allocate metadata buffer\n"); + return -1; + } + } + + dev->iov_buf = calloc(FTL_BATCH_COUNT, dev->xfer_size * sizeof(struct iovec)); + if (!dev->iov_buf) { + SPDK_ERRLOG("Failed to 
allocate iovec buffer\n"); + return -1; + } + + TAILQ_INIT(&dev->free_batches); + TAILQ_INIT(&dev->pending_batches); + TAILQ_INIT(&dev->ioch_queue); + + for (i = 0; i < FTL_BATCH_COUNT; ++i) { + batch = &dev->batch_array[i]; + batch->iov = &dev->iov_buf[i * dev->xfer_size]; + batch->num_entries = 0; + batch->index = i; + TAILQ_INIT(&batch->entries); + if (dev->md_buf != NULL) { + batch->metadata = (char *)dev->md_buf + i * dev->xfer_size * dev->md_size; + } + + TAILQ_INSERT_TAIL(&dev->free_batches, batch, tailq); + } + + dev->num_io_channels = 0; + + spdk_io_device_register(dev, ftl_io_channel_create_cb, ftl_io_channel_destroy_cb, + sizeof(struct _ftl_io_channel), + NULL); + + return 0; +} + +static int +ftl_dev_init_base_bdev(struct spdk_ftl_dev *dev, const char *bdev_name) +{ + uint32_t block_size; + uint64_t num_blocks; + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("Unable to find bdev: %s\n", bdev_name); + return -1; + } + + if (!spdk_bdev_is_zoned(bdev)) { + SPDK_ERRLOG("Bdev dosen't support zone capabilities: %s\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + if (spdk_bdev_open_ext(bdev_name, true, ftl_bdev_event_cb, + dev, &dev->base_bdev_desc)) { + SPDK_ERRLOG("Unable to open bdev: %s\n", bdev_name); + return -1; + } + + if (spdk_bdev_module_claim_bdev(bdev, dev->base_bdev_desc, &g_ftl_bdev_module)) { + spdk_bdev_close(dev->base_bdev_desc); + dev->base_bdev_desc = NULL; + SPDK_ERRLOG("Unable to claim bdev %s\n", bdev_name); + return -1; + } + + dev->xfer_size = spdk_bdev_get_write_unit_size(bdev); + dev->md_size = spdk_bdev_get_md_size(bdev); + + block_size = spdk_bdev_get_block_size(bdev); + if (block_size != FTL_BLOCK_SIZE) { + SPDK_ERRLOG("Unsupported block size (%"PRIu32")\n", block_size); + return -1; + } + + num_blocks = spdk_bdev_get_num_blocks(bdev); + if (num_blocks % ftl_get_num_punits(dev)) { + SPDK_ERRLOG("Unsupported geometry. 
Base bdev block count must be multiple " + "of optimal number of zones.\n"); + return -1; + } + + if (ftl_is_append_supported(dev) && + !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZONE_APPEND)) { + SPDK_ERRLOG("Bdev dosen't support append: %s\n", + spdk_bdev_get_name(bdev)); + return -1; + } + + dev->num_bands = num_blocks / (ftl_get_num_punits(dev) * ftl_get_num_blocks_in_zone(dev)); + dev->addr_len = spdk_u64log2(num_blocks) + 1; + + return 0; +} + +static void +ftl_lba_map_request_dtor(struct spdk_mempool *mp, void *opaque, void *obj, unsigned obj_idx) +{ + struct ftl_lba_map_request *request = obj; + + spdk_bit_array_free(&request->segments); +} + +static void +ftl_release_bdev(struct spdk_bdev_desc *bdev_desc) +{ + if (!bdev_desc) { + return; + } + + spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_desc)); + spdk_bdev_close(bdev_desc); +} + +static void +ftl_dev_free_sync(struct spdk_ftl_dev *dev) +{ + struct spdk_ftl_dev *iter; + size_t i; + + if (!dev) { + return; + } + + pthread_mutex_lock(&g_ftl_queue_lock); + STAILQ_FOREACH(iter, &g_ftl_queue, stailq) { + if (iter == dev) { + STAILQ_REMOVE(&g_ftl_queue, dev, spdk_ftl_dev, stailq); + break; + } + } + pthread_mutex_unlock(&g_ftl_queue_lock); + + assert(LIST_EMPTY(&dev->wptr_list)); + assert(dev->current_batch == NULL); + + ftl_dev_dump_bands(dev); + ftl_dev_dump_stats(dev); + + if (dev->bands) { + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + free(dev->bands[i].zone_buf); + spdk_bit_array_free(&dev->bands[i].lba_map.vld); + spdk_bit_array_free(&dev->bands[i].reloc_bitmap); + } + } + + spdk_dma_free(dev->nv_cache.dma_buf); + + spdk_mempool_free(dev->lba_pool); + spdk_mempool_free(dev->nv_cache.md_pool); + spdk_mempool_free(dev->media_events_pool); + if (dev->lba_request_pool) { + spdk_mempool_obj_iter(dev->lba_request_pool, ftl_lba_map_request_dtor, NULL); + } + spdk_mempool_free(dev->lba_request_pool); + + ftl_reloc_free(dev->reloc); + + ftl_release_bdev(dev->nv_cache.bdev_desc); + 
ftl_release_bdev(dev->base_bdev_desc); + + spdk_free(dev->md_buf); + + assert(dev->num_io_channels == 0); + free(dev->ioch_array); + free(dev->iov_buf); + free(dev->name); + free(dev->bands); + if (dev->l2p_pmem_len != 0) { +#ifdef SPDK_CONFIG_PMDK + pmem_unmap(dev->l2p, dev->l2p_pmem_len); +#endif /* SPDK_CONFIG_PMDK */ + } else { + free(dev->l2p); + } + free((char *)dev->conf.l2p_path); + free(dev); +} + +int +spdk_ftl_dev_init(const struct spdk_ftl_dev_init_opts *_opts, spdk_ftl_init_fn cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev; + struct spdk_ftl_dev_init_opts opts = *_opts; + struct ftl_dev_init_ctx *init_ctx = NULL; + int rc = -ENOMEM; + + dev = calloc(1, sizeof(*dev)); + if (!dev) { + return -ENOMEM; + } + + init_ctx = calloc(1, sizeof(*init_ctx)); + if (!init_ctx) { + goto fail_sync; + } + + init_ctx->dev = dev; + init_ctx->opts = *_opts; + init_ctx->cb_fn = cb_fn; + init_ctx->cb_arg = cb_arg; + init_ctx->thread = spdk_get_thread(); + + if (!opts.conf) { + opts.conf = &g_default_conf; + } + + if (!opts.base_bdev) { + SPDK_ERRLOG("Lack of underlying device in configuration\n"); + rc = -EINVAL; + goto fail_sync; + } + + dev->conf = *opts.conf; + dev->limit = SPDK_FTL_LIMIT_MAX; + + dev->name = strdup(opts.name); + if (!dev->name) { + SPDK_ERRLOG("Unable to set device name\n"); + goto fail_sync; + } + + if (ftl_dev_init_base_bdev(dev, opts.base_bdev)) { + SPDK_ERRLOG("Unsupported underlying device\n"); + goto fail_sync; + } + + if (opts.conf->l2p_path) { + dev->conf.l2p_path = strdup(opts.conf->l2p_path); + if (!dev->conf.l2p_path) { + rc = -ENOMEM; + goto fail_sync; + } + } + + /* In case of errors, we free all of the memory in ftl_dev_free_sync(), */ + /* so we don't have to clean up in each of the init functions. 
*/ + if (ftl_check_conf(dev, opts.conf)) { + SPDK_ERRLOG("Invalid device configuration\n"); + goto fail_sync; + } + + if (ftl_init_lba_map_pools(dev)) { + SPDK_ERRLOG("Unable to init LBA map pools\n"); + goto fail_sync; + } + + if (ftl_init_media_events_pool(dev)) { + SPDK_ERRLOG("Unable to init media events pools\n"); + goto fail_sync; + } + + ftl_init_wptr_list(dev); + + if (ftl_dev_init_bands(dev)) { + SPDK_ERRLOG("Unable to initialize band array\n"); + goto fail_sync; + } + + if (ftl_dev_init_nv_cache(dev, opts.cache_bdev)) { + SPDK_ERRLOG("Unable to initialize persistent cache\n"); + goto fail_sync; + } + + dev->reloc = ftl_reloc_init(dev); + if (!dev->reloc) { + SPDK_ERRLOG("Unable to initialize reloc structures\n"); + goto fail_sync; + } + + if (ftl_dev_init_io_channel(dev)) { + SPDK_ERRLOG("Unable to initialize IO channels\n"); + goto fail_sync; + } + + if (ftl_dev_init_zones(init_ctx)) { + SPDK_ERRLOG("Failed to initialize zones\n"); + goto fail_async; + } + + return 0; +fail_sync: + ftl_dev_free_sync(dev); + ftl_dev_free_init_ctx(init_ctx); + return rc; +fail_async: + ftl_init_fail(init_ctx); + return 0; +} + +static void +_ftl_halt_defrag(void *arg) +{ + ftl_reloc_halt(((struct spdk_ftl_dev *)arg)->reloc); +} + +static void +ftl_halt_complete_cb(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + /* Make sure core IO channel has already been released */ + if (dev->num_io_channels > 0) { + spdk_thread_send_msg(spdk_get_thread(), ftl_halt_complete_cb, ctx); + return; + } + + spdk_io_device_unregister(fini_ctx->dev, NULL); + + ftl_dev_free_sync(fini_ctx->dev); + if (fini_ctx->cb_fn != NULL) { + fini_ctx->cb_fn(NULL, fini_ctx->cb_arg, fini_ctx->halt_complete_status); + } + + ftl_dev_free_init_ctx(fini_ctx); +} + +static void +ftl_put_io_channel_cb(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + spdk_put_io_channel(dev->ioch); + 
spdk_thread_send_msg(spdk_get_thread(), ftl_halt_complete_cb, ctx); +} + +static void +ftl_nv_cache_header_fini_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_dev_init_ctx *fini_ctx = cb_arg; + int rc = 0; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Failed to write non-volatile cache metadata header\n"); + rc = -EIO; + } + + fini_ctx->halt_complete_status = rc; + spdk_thread_send_msg(fini_ctx->thread, ftl_put_io_channel_cb, fini_ctx); +} + +static int +ftl_halt_poller(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + if (!dev->core_poller) { + spdk_poller_unregister(&fini_ctx->poller); + + if (ftl_dev_has_nv_cache(dev)) { + ftl_nv_cache_write_header(&dev->nv_cache, true, + ftl_nv_cache_header_fini_cb, fini_ctx); + } else { + fini_ctx->halt_complete_status = 0; + spdk_thread_send_msg(fini_ctx->thread, ftl_put_io_channel_cb, fini_ctx); + } + } + + return SPDK_POLLER_BUSY; +} + +static void +ftl_add_halt_poller(void *ctx) +{ + struct ftl_dev_init_ctx *fini_ctx = ctx; + struct spdk_ftl_dev *dev = fini_ctx->dev; + + dev->halt = 1; + + _ftl_halt_defrag(dev); + + assert(!fini_ctx->poller); + fini_ctx->poller = SPDK_POLLER_REGISTER(ftl_halt_poller, fini_ctx, 100); +} + +static int +ftl_dev_free(struct spdk_ftl_dev *dev, spdk_ftl_init_fn cb_fn, void *cb_arg, + struct spdk_thread *thread) +{ + struct ftl_dev_init_ctx *fini_ctx; + + if (dev->halt_started) { + dev->halt_started = true; + return -EBUSY; + } + + fini_ctx = calloc(1, sizeof(*fini_ctx)); + if (!fini_ctx) { + return -ENOMEM; + } + + fini_ctx->dev = dev; + fini_ctx->cb_fn = cb_fn; + fini_ctx->cb_arg = cb_arg; + fini_ctx->thread = thread; + + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_add_halt_poller, fini_ctx); + return 0; +} + +int +spdk_ftl_dev_free(struct spdk_ftl_dev *dev, spdk_ftl_init_fn cb_fn, void *cb_arg) +{ + return ftl_dev_free(dev, cb_fn, cb_arg, spdk_get_thread()); +} + 
+SPDK_LOG_REGISTER_COMPONENT("ftl_init", SPDK_LOG_FTL_INIT) diff --git a/src/spdk/lib/ftl/ftl_io.c b/src/spdk/lib/ftl/ftl_io.c new file mode 100644 index 000000000..39a845bae --- /dev/null +++ b/src/spdk/lib/ftl/ftl_io.c @@ -0,0 +1,563 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/ftl.h" +#include "spdk/likely.h" +#include "spdk/util.h" + +#include "ftl_io.h" +#include "ftl_core.h" +#include "ftl_band.h" +#include "ftl_debug.h" + +void +ftl_io_inc_req(struct ftl_io *io) +{ + struct ftl_band *band = io->band; + + if (!(io->flags & FTL_IO_CACHE) && io->type != FTL_IO_READ && io->type != FTL_IO_ERASE) { + ftl_band_acquire_lba_map(band); + } + + __atomic_fetch_add(&io->dev->num_inflight, 1, __ATOMIC_SEQ_CST); + + ++io->req_cnt; +} + +void +ftl_io_dec_req(struct ftl_io *io) +{ + struct ftl_band *band = io->band; + unsigned long num_inflight __attribute__((unused)); + + if (!(io->flags & FTL_IO_CACHE) && io->type != FTL_IO_READ && io->type != FTL_IO_ERASE) { + ftl_band_release_lba_map(band); + } + + num_inflight = __atomic_fetch_sub(&io->dev->num_inflight, 1, __ATOMIC_SEQ_CST); + + assert(num_inflight > 0); + assert(io->req_cnt > 0); + + --io->req_cnt; +} + +struct iovec * +ftl_io_iovec(struct ftl_io *io) +{ + return &io->iov[0]; +} + +uint64_t +ftl_io_get_lba(const struct ftl_io *io, size_t offset) +{ + assert(offset < io->num_blocks); + + if (io->flags & FTL_IO_VECTOR_LBA) { + return io->lba.vector[offset]; + } else { + return io->lba.single + offset; + } +} + +uint64_t +ftl_io_current_lba(const struct ftl_io *io) +{ + return ftl_io_get_lba(io, io->pos); +} + +void +ftl_io_advance(struct ftl_io *io, size_t num_blocks) +{ + struct iovec *iov = ftl_io_iovec(io); + size_t iov_blocks, block_left = num_blocks; + + io->pos += num_blocks; + + if (io->iov_cnt != 0) { + while (block_left > 0) { + assert(io->iov_pos < io->iov_cnt); + iov_blocks = iov[io->iov_pos].iov_len / FTL_BLOCK_SIZE; + + if (io->iov_off + block_left < iov_blocks) { + io->iov_off += block_left; + break; + } + + assert(iov_blocks > io->iov_off); + block_left -= (iov_blocks - io->iov_off); + io->iov_off = 0; + io->iov_pos++; + } + } + + if (io->parent) { + ftl_io_advance(io->parent, num_blocks); + } +} + +size_t 
+ftl_iovec_num_blocks(struct iovec *iov, size_t iov_cnt) +{ + size_t num_blocks = 0, i = 0; + + for (; i < iov_cnt; ++i) { + num_blocks += iov[i].iov_len / FTL_BLOCK_SIZE; + } + + return num_blocks; +} + +void * +ftl_io_iovec_addr(struct ftl_io *io) +{ + assert(io->iov_pos < io->iov_cnt); + assert(io->iov_off * FTL_BLOCK_SIZE < ftl_io_iovec(io)[io->iov_pos].iov_len); + + return (char *)ftl_io_iovec(io)[io->iov_pos].iov_base + + io->iov_off * FTL_BLOCK_SIZE; +} + +size_t +ftl_io_iovec_len_left(struct ftl_io *io) +{ + struct iovec *iov = ftl_io_iovec(io); + return iov[io->iov_pos].iov_len / FTL_BLOCK_SIZE - io->iov_off; +} + +static void +ftl_io_init_iovec(struct ftl_io *io, const struct iovec *iov, size_t iov_cnt, size_t iov_off, + size_t num_blocks) +{ + size_t offset = 0, num_left; + + io->iov_pos = 0; + io->iov_cnt = 0; + io->num_blocks = num_blocks; + + while (offset < num_blocks) { + assert(io->iov_cnt < FTL_IO_MAX_IOVEC && io->iov_cnt < iov_cnt); + + num_left = spdk_min(iov[io->iov_cnt].iov_len / FTL_BLOCK_SIZE - iov_off, + num_blocks); + io->iov[io->iov_cnt].iov_base = (char *)iov[io->iov_cnt].iov_base + + iov_off * FTL_BLOCK_SIZE; + io->iov[io->iov_cnt].iov_len = num_left * FTL_BLOCK_SIZE; + + offset += num_left; + io->iov_cnt++; + iov_off = 0; + } +} + +void +ftl_io_shrink_iovec(struct ftl_io *io, size_t num_blocks) +{ + size_t iov_off = 0, block_off = 0; + + assert(io->num_blocks >= num_blocks); + assert(io->pos == 0 && io->iov_pos == 0 && io->iov_off == 0); + + for (; iov_off < io->iov_cnt; ++iov_off) { + size_t num_iov = io->iov[iov_off].iov_len / FTL_BLOCK_SIZE; + size_t num_left = num_blocks - block_off; + + if (num_iov >= num_left) { + io->iov[iov_off].iov_len = num_left * FTL_BLOCK_SIZE; + io->iov_cnt = iov_off + 1; + io->num_blocks = num_blocks; + break; + } + + block_off += num_iov; + } +} + +static void +ftl_io_init(struct ftl_io *io, struct spdk_ftl_dev *dev, + ftl_io_fn fn, void *ctx, int flags, int type) +{ + io->flags |= flags | 
FTL_IO_INITIALIZED; + io->type = type; + io->dev = dev; + io->lba.single = FTL_LBA_INVALID; + io->addr.offset = FTL_ADDR_INVALID; + io->cb_fn = fn; + io->cb_ctx = ctx; + io->trace = ftl_trace_alloc_id(dev); +} + +struct ftl_io * +ftl_io_init_internal(const struct ftl_io_init_opts *opts) +{ + struct ftl_io *io = opts->io; + struct ftl_io *parent = opts->parent; + struct spdk_ftl_dev *dev = opts->dev; + const struct iovec *iov; + size_t iov_cnt, iov_off; + + if (!io) { + if (parent) { + io = ftl_io_alloc_child(parent); + } else { + io = ftl_io_alloc(ftl_get_io_channel(dev)); + } + + if (!io) { + return NULL; + } + } + + ftl_io_clear(io); + ftl_io_init(io, dev, opts->cb_fn, opts->cb_ctx, opts->flags | FTL_IO_INTERNAL, opts->type); + + io->batch = opts->batch; + io->band = opts->band; + io->md = opts->md; + io->iov = &io->iov_buf[0]; + + if (parent) { + if (parent->flags & FTL_IO_VECTOR_LBA) { + io->lba.vector = parent->lba.vector + parent->pos; + } else { + io->lba.single = parent->lba.single + parent->pos; + } + + iov = &parent->iov[parent->iov_pos]; + iov_cnt = parent->iov_cnt - parent->iov_pos; + iov_off = parent->iov_off; + } else { + iov = &opts->iovs[0]; + iov_cnt = opts->iovcnt; + iov_off = 0; + } + + /* Some requests (zone resets) do not use iovecs */ + if (iov_cnt > 0) { + ftl_io_init_iovec(io, iov, iov_cnt, iov_off, opts->num_blocks); + } + + if (opts->flags & FTL_IO_VECTOR_LBA) { + io->lba.vector = calloc(io->num_blocks, sizeof(uint64_t)); + if (!io->lba.vector) { + ftl_io_free(io); + return NULL; + } + } + + return io; +} + +struct ftl_io * +ftl_io_wbuf_init(struct spdk_ftl_dev *dev, struct ftl_addr addr, struct ftl_band *band, + struct ftl_batch *batch, ftl_io_fn cb) +{ + struct ftl_io *io; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .batch = batch, + .band = band, + .size = sizeof(struct ftl_io), + .flags = 0, + .type = FTL_IO_WRITE, + .num_blocks = dev->xfer_size, + .cb_fn = cb, + .iovcnt = dev->xfer_size, + .md = batch->metadata, + 
}; + + memcpy(opts.iovs, batch->iov, sizeof(struct iovec) * dev->xfer_size); + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->addr = addr; + + return io; +} + +struct ftl_io * +ftl_io_erase_init(struct ftl_band *band, size_t num_blocks, ftl_io_fn cb) +{ + struct ftl_io *io; + struct ftl_io_init_opts opts = { + .dev = band->dev, + .io = NULL, + .band = band, + .size = sizeof(struct ftl_io), + .flags = FTL_IO_PHYSICAL_MODE, + .type = FTL_IO_ERASE, + .num_blocks = 1, + .cb_fn = cb, + .iovcnt = 0, + .md = NULL, + }; + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->num_blocks = num_blocks; + + return io; +} + +static void +_ftl_user_cb(struct ftl_io *io, void *arg, int status) +{ + io->user_fn(arg, status); +} + +struct ftl_io * +ftl_io_user_init(struct spdk_io_channel *_ioch, uint64_t lba, size_t num_blocks, struct iovec *iov, + size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_ctx, int type) +{ + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(_ioch); + struct spdk_ftl_dev *dev = ioch->dev; + struct ftl_io *io; + + io = ftl_io_alloc(_ioch); + if (spdk_unlikely(!io)) { + return NULL; + } + + ftl_io_init(io, dev, _ftl_user_cb, cb_ctx, 0, type); + io->lba.single = lba; + io->user_fn = cb_fn; + io->iov = iov; + io->iov_cnt = iov_cnt; + io->num_blocks = num_blocks; + + ftl_trace_lba_io_init(io->dev, io); + return io; +} + +static void +_ftl_io_free(struct ftl_io *io) +{ + struct ftl_io_channel *ioch; + + assert(LIST_EMPTY(&io->children)); + + if (io->flags & FTL_IO_VECTOR_LBA) { + free(io->lba.vector); + } + + if (pthread_spin_destroy(&io->lock)) { + SPDK_ERRLOG("pthread_spin_destroy failed\n"); + } + + ioch = ftl_io_channel_get_ctx(io->ioch); + spdk_mempool_put(ioch->io_pool, io); +} + +static bool +ftl_io_remove_child(struct ftl_io *io) +{ + struct ftl_io *parent = io->parent; + bool parent_done; + + pthread_spin_lock(&parent->lock); + LIST_REMOVE(io, child_entry); + parent_done = parent->done && 
LIST_EMPTY(&parent->children); + parent->status = parent->status ? : io->status; + pthread_spin_unlock(&parent->lock); + + return parent_done; +} + +void +ftl_io_complete(struct ftl_io *io) +{ + struct ftl_io *parent = io->parent; + bool complete; + + io->flags &= ~FTL_IO_INITIALIZED; + + pthread_spin_lock(&io->lock); + complete = LIST_EMPTY(&io->children); + io->done = true; + pthread_spin_unlock(&io->lock); + + if (complete) { + if (io->cb_fn) { + io->cb_fn(io, io->cb_ctx, io->status); + } + + if (parent && ftl_io_remove_child(io)) { + ftl_io_complete(parent); + } + + _ftl_io_free(io); + } +} + +struct ftl_io * +ftl_io_alloc_child(struct ftl_io *parent) +{ + struct ftl_io *io; + + io = ftl_io_alloc(parent->ioch); + if (spdk_unlikely(!io)) { + return NULL; + } + + ftl_io_init(io, parent->dev, NULL, NULL, parent->flags, parent->type); + io->parent = parent; + + pthread_spin_lock(&parent->lock); + LIST_INSERT_HEAD(&parent->children, io, child_entry); + pthread_spin_unlock(&parent->lock); + + return io; +} + +void ftl_io_fail(struct ftl_io *io, int status) +{ + io->status = status; + ftl_io_advance(io, io->num_blocks - io->pos); +} + +void * +ftl_io_get_md(const struct ftl_io *io) +{ + if (!io->md) { + return NULL; + } + + return (char *)io->md + io->pos * io->dev->md_size; +} + +struct ftl_io * +ftl_io_alloc(struct spdk_io_channel *ch) +{ + struct ftl_io *io; + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(ch); + + io = spdk_mempool_get(ioch->io_pool); + if (!io) { + return NULL; + } + + memset(io, 0, ioch->elem_size); + io->ioch = ch; + + if (pthread_spin_init(&io->lock, PTHREAD_PROCESS_PRIVATE)) { + SPDK_ERRLOG("pthread_spin_init failed\n"); + spdk_mempool_put(ioch->io_pool, io); + return NULL; + } + + return io; +} + +void +ftl_io_reinit(struct ftl_io *io, ftl_io_fn cb, void *ctx, int flags, int type) +{ + ftl_io_clear(io); + ftl_io_init(io, io->dev, cb, ctx, flags, type); +} + +void +ftl_io_clear(struct ftl_io *io) +{ + ftl_io_reset(io); + + io->flags = 
0; + io->batch = NULL; + io->band = NULL; +} + +void +ftl_io_reset(struct ftl_io *io) +{ + io->req_cnt = io->pos = io->iov_pos = io->iov_off = 0; + io->done = false; +} + +void +ftl_io_free(struct ftl_io *io) +{ + struct ftl_io *parent; + + if (!io) { + return; + } + + parent = io->parent; + if (parent && ftl_io_remove_child(io)) { + ftl_io_complete(parent); + } + + _ftl_io_free(io); +} + +void +ftl_io_call_foreach_child(struct ftl_io *io, int (*callback)(struct ftl_io *)) +{ + struct ftl_io *child, *tmp; + + assert(!io->done); + + /* + * If the IO doesn't have any children, it means that it directly describes a request (i.e. + * all of the buffers, LBAs, etc. are filled). Otherwise the IO only groups together several + * requests and may be partially filled, so the callback needs to be called on all of its + * children instead. + */ + if (LIST_EMPTY(&io->children)) { + callback(io); + return; + } + + LIST_FOREACH_SAFE(child, &io->children, child_entry, tmp) { + int rc = callback(child); + if (rc) { + assert(rc != -EAGAIN); + ftl_io_fail(io, rc); + break; + } + } + + /* + * If all the callbacks were processed or an error occurred, treat this IO as completed. + * Multiple calls to ftl_io_call_foreach_child are not supported, resubmissions are supposed + * to be handled in the callback. + */ + ftl_io_complete(io); +} diff --git a/src/spdk/lib/ftl/ftl_io.h b/src/spdk/lib/ftl/ftl_io.h new file mode 100644 index 000000000..d49dc3de7 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_io.h @@ -0,0 +1,351 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
*   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef FTL_IO_H
#define FTL_IO_H

#include "spdk/stdinc.h"
#include "spdk/nvme.h"
#include "spdk/ftl.h"

#include "ftl_addr.h"
#include "ftl_trace.h"

struct spdk_ftl_dev;
struct ftl_band;
struct ftl_batch;
struct ftl_io;

typedef int (*ftl_md_pack_fn)(struct ftl_band *);
typedef void (*ftl_io_fn)(struct ftl_io *, void *, int);

/* IO flags */
enum ftl_io_flags {
	/* Indicates whether IO is already initialized */
	FTL_IO_INITIALIZED	= (1 << 0),
	/* Internal based IO (defrag, metadata etc.) */
	FTL_IO_INTERNAL		= (1 << 1),
	/* Indicates that the IO should not go through if there's */
	/* already another one scheduled to the same LBA */
	FTL_IO_WEAK		= (1 << 2),
	/* Indicates that the IO is used for padding */
	FTL_IO_PAD		= (1 << 3),
	/* The IO operates on metadata */
	FTL_IO_MD		= (1 << 4),
	/* Using physical instead of logical address */
	FTL_IO_PHYSICAL_MODE	= (1 << 5),
	/* Indicates that IO contains noncontiguous LBAs */
	FTL_IO_VECTOR_LBA	= (1 << 6),
	/* The IO is directed to non-volatile cache */
	FTL_IO_CACHE		= (1 << 7),
	/* Indicates that physical address should be taken from IO struct, */
	/* not assigned by wptr, only works if wptr is also in direct mode */
	FTL_IO_DIRECT_ACCESS	= (1 << 8),
	/* Bypass the non-volatile cache */
	FTL_IO_BYPASS_CACHE	= (1 << 9),
};

enum ftl_io_type {
	FTL_IO_READ,
	FTL_IO_WRITE,
	FTL_IO_ERASE,
};

#define FTL_IO_MAX_IOVEC 64

struct ftl_io_init_opts {
	struct spdk_ftl_dev			*dev;

	/* IO descriptor */
	struct ftl_io				*io;

	/* Parent request */
	struct ftl_io				*parent;

	/* Size of IO descriptor */
	size_t					size;

	/* IO flags */
	int					flags;

	/* IO type */
	enum ftl_io_type			type;

	/* Transfer batch, set for IO going through the write buffer */
	struct ftl_batch			*batch;

	/* Band to which the IO is directed */
	struct ftl_band				*band;

	/* Number of logical blocks */
	size_t					num_blocks;

	/* Data */
	struct iovec				iovs[FTL_IO_MAX_IOVEC];
	int					iovcnt;

	/* Metadata */
	void					*md;

	/* Callback's function */
	ftl_io_fn				cb_fn;

	/* Callback's context */
	void					*cb_ctx;
};

struct ftl_io_channel;

struct ftl_wbuf_entry {
	/* IO channel that owns the write buffer entry */
	struct ftl_io_channel			*ioch;
	/* Data payload (single block) */
	void					*payload;
	/* Index within the IO channel's wbuf_entries array */
	uint32_t				index;
	uint32_t				io_flags;
	/* Points at the band the data is copied from. Only valid for internal
	 * requests coming from reloc.
	 */
	struct ftl_band				*band;
	/* Physical address of that particular block. Valid once the data has
	 * been written out.
	 */
	struct ftl_addr				addr;
	/* Logical block address */
	uint64_t				lba;

	/* Trace ID of the requests the entry is part of */
	uint64_t				trace;

	/* Indicates that the entry was written out and is still present in the
	 * L2P table.
	 */
	bool					valid;
	/* Lock that protects the entry from being evicted from the L2P */
	pthread_spinlock_t			lock;
	TAILQ_ENTRY(ftl_wbuf_entry)		tailq;
};

#define FTL_IO_CHANNEL_INDEX_INVALID ((uint64_t)-1)

struct ftl_io_channel {
	/* Device */
	struct spdk_ftl_dev			*dev;
	/* IO pool element size */
	size_t					elem_size;
	/* Index within the IO channel array */
	uint64_t				index;
	/* IO pool */
	struct spdk_mempool			*io_pool;
	/* Underlying device IO channel */
	struct spdk_io_channel			*base_ioch;
	/* Persistent cache IO channel */
	struct spdk_io_channel			*cache_ioch;
	/* Poller used for completing write requests and retrying IO */
	struct spdk_poller			*poller;
	/* Write completion queue */
	TAILQ_HEAD(, ftl_io)			write_cmpl_queue;
	TAILQ_HEAD(, ftl_io)			retry_queue;
	TAILQ_ENTRY(ftl_io_channel)		tailq;

	/* Array of write buffer entries */
	struct ftl_wbuf_entry			*wbuf_entries;
	/* Write buffer data payload */
	void					*wbuf_payload;
	/* Number of write buffer entries */
	uint32_t				num_entries;
	/* Write buffer queues */
	struct spdk_ring			*free_queue;
	struct spdk_ring			*submit_queue;
	/* Maximum number of concurrent user writes */
	uint32_t				qdepth_limit;
	/* Current number of concurrent user writes */
	uint32_t				qdepth_current;
	/* Means that the IO channel is being flushed */
	bool					flush;
};

/* General IO descriptor */
struct ftl_io {
	/* Device */
	struct spdk_ftl_dev			*dev;

	/* IO channel */
	struct spdk_io_channel			*ioch;

	union {
		/* LBA table */
		uint64_t			*vector;

		/* First LBA */
		uint64_t			single;
	} lba;

	/* First block address */
	struct ftl_addr				addr;

	/* Number of processed blocks */
	size_t					pos;

	/* Number of blocks */
	size_t					num_blocks;

	/* IO vector pointer */
	struct iovec				*iov;

	/* IO vector buffer for internal requests */
	struct iovec				iov_buf[FTL_IO_MAX_IOVEC];

	/* Metadata */
	void					*md;

	/* Number of IO vectors */
	size_t					iov_cnt;

	/* Position within the iovec */
	size_t					iov_pos;

	/* Offset within the iovec (in blocks) */
	size_t					iov_off;

	/* Transfer batch (valid only for writes going through the write buffer) */
	struct ftl_batch			*batch;

	/* Band this IO is being written to */
	struct ftl_band				*band;

	/* Request status */
	int					status;

	/* Number of split requests */
	size_t					req_cnt;

	/* Callback's function */
	ftl_io_fn				cb_fn;

	/* Callback's context */
	void					*cb_ctx;

	/* User callback function */
	spdk_ftl_fn				user_fn;

	/* Flags */
	int					flags;

	/* IO type */
	enum ftl_io_type			type;

	/* Done flag */
	bool					done;

	/* Parent request */
	struct ftl_io				*parent;
	/* Child requests list */
	LIST_HEAD(, ftl_io)			children;
	/* Child list link */
	LIST_ENTRY(ftl_io)			child_entry;
	/* Children lock */
	pthread_spinlock_t			lock;

	/* Trace group id */
	uint64_t				trace;

	/* Used by retry and write completion queues */
	TAILQ_ENTRY(ftl_io)			ioch_entry;
};

/* Metadata IO */
struct ftl_md_io {
	/* Parent IO structure */
	struct ftl_io				io;

	/* Serialization/deserialization callback */
	ftl_md_pack_fn				pack_fn;

	/* Callback's function */
	ftl_io_fn				cb_fn;

	/* Callback's context */
	void					*cb_ctx;
};

/* Returns true when the IO carries the FTL_IO_PHYSICAL_MODE flag. */
static inline bool
ftl_io_mode_physical(const struct ftl_io *io)
{
	return io->flags & FTL_IO_PHYSICAL_MODE;
}

/* Returns true when the IO does not carry the FTL_IO_PHYSICAL_MODE flag. */
static inline bool
ftl_io_mode_logical(const struct ftl_io *io)
{
	return !ftl_io_mode_physical(io);
}

/*
 * Returns true when the IO has no outstanding requests and its position has
 * reached its block count.
 */
static inline bool
ftl_io_done(const struct ftl_io *io)
{
	return io->req_cnt == 0 && io->pos == io->num_blocks;
}

struct ftl_io *ftl_io_alloc(struct spdk_io_channel *ch);
struct ftl_io *ftl_io_alloc_child(struct ftl_io *parent);
void ftl_io_fail(struct ftl_io *io, int status);
void ftl_io_free(struct ftl_io *io);
struct ftl_io *ftl_io_init_internal(const struct ftl_io_init_opts *opts);
void ftl_io_reinit(struct ftl_io *io, ftl_io_fn cb,
		   void *ctx, int flags, int type);
void ftl_io_clear(struct ftl_io *io);
void ftl_io_inc_req(struct ftl_io *io);
void ftl_io_dec_req(struct ftl_io *io);
struct iovec *ftl_io_iovec(struct ftl_io *io);
uint64_t ftl_io_current_lba(const struct ftl_io *io);
uint64_t ftl_io_get_lba(const struct ftl_io *io, size_t offset);
void ftl_io_advance(struct ftl_io *io, size_t num_blocks);
size_t ftl_iovec_num_blocks(struct iovec *iov, size_t iov_cnt);
void *ftl_io_iovec_addr(struct ftl_io *io);
size_t ftl_io_iovec_len_left(struct ftl_io *io);
struct ftl_io *ftl_io_wbuf_init(struct spdk_ftl_dev *dev, struct ftl_addr addr,
				struct ftl_band *band, struct ftl_batch *batch, ftl_io_fn cb);
struct ftl_io *ftl_io_erase_init(struct ftl_band *band, size_t num_blocks, ftl_io_fn cb);
struct ftl_io *ftl_io_user_init(struct spdk_io_channel *ioch, uint64_t lba, size_t num_blocks,
				struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn,
				void *cb_arg, int type);
void *ftl_io_get_md(const struct ftl_io *io);
void ftl_io_complete(struct ftl_io *io);
void ftl_io_shrink_iovec(struct ftl_io *io, size_t num_blocks);
void ftl_io_process_error(struct ftl_io *io, const struct spdk_nvme_cpl *status);
void ftl_io_reset(struct ftl_io *io);
void ftl_io_call_foreach_child(struct ftl_io *io, int (*callback)(struct ftl_io *));

#endif /* FTL_IO_H */
diff --git a/src/spdk/lib/ftl/ftl_reloc.c b/src/spdk/lib/ftl/ftl_reloc.c new file mode 100644 index 000000000..e59bf4d81 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_reloc.c @@ -0,0 +1,860 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "spdk/likely.h"
#include "spdk_internal/log.h"
#include "spdk/ftl.h"

#include "ftl_reloc.h"
#include "ftl_core.h"
#include "ftl_io.h"
#include "ftl_band.h"
#include "ftl_debug.h"

/* Maximum active reloc moves */
#define FTL_RELOC_MAX_MOVES 256

struct ftl_reloc;
struct ftl_band_reloc;

/* Stage a single move is in */
enum ftl_reloc_move_state {
	FTL_RELOC_STATE_READ_LBA_MAP,
	FTL_RELOC_STATE_READ,
	FTL_RELOC_STATE_WRITE,
};

/* State of a band's relocation as a whole */
enum ftl_band_reloc_state {
	FTL_BAND_RELOC_STATE_INACTIVE,
	FTL_BAND_RELOC_STATE_PENDING,
	FTL_BAND_RELOC_STATE_ACTIVE,
	FTL_BAND_RELOC_STATE_HIGH_PRIO
};

struct ftl_reloc_move {
	struct ftl_band_reloc			*breloc;

	/* Start addr */
	struct ftl_addr				addr;

	/* Number of logical blocks */
	size_t					num_blocks;

	/* Data buffer */
	void					*data;

	/* Move state (read lba_map, read, write) */
	enum ftl_reloc_move_state		state;

	/* IO associated with move */
	struct ftl_io				*io;

	STAILQ_ENTRY(ftl_reloc_move)		entry;
};

struct ftl_band_reloc {
	struct ftl_reloc			*parent;

	/* Band being relocated */
	struct ftl_band				*band;

	/* Number of logical blocks to be relocated */
	size_t					num_blocks;

	/* Bitmap of logical blocks to be relocated */
	struct spdk_bit_array			*reloc_map;

	/*  State of the band reloc */
	enum ftl_band_reloc_state		state;

	/* The band is being defragged */
	bool					defrag;

	/* Reloc map iterator */
	struct {
		/* Array of zone offsets */
		size_t				*zone_offset;

		/* Current zone */
		size_t				zone_current;
	} iter;

	/* Number of outstanding moves */
	size_t					num_outstanding;

	/* Pool of move objects */
	struct ftl_reloc_move			*moves;

	/* Move queue */
	STAILQ_HEAD(, ftl_reloc_move)		move_queue;

	TAILQ_ENTRY(ftl_band_reloc)		entry;
};

struct ftl_reloc {
	/* Device associated with relocate */
	struct spdk_ftl_dev			*dev;

	/* Indicates relocate is about to halt */
	bool					halt;

	/* Maximum number of IOs per band */
	size_t					max_qdepth;

	/* Maximum number of active band relocates */
	size_t					max_active;

	/* Maximum transfer size (in logical blocks) per single IO */
	size_t					xfer_size;
	/* Number of bands being defragged */
	size_t					num_defrag_bands;

	/* Array of band relocates */
	struct ftl_band_reloc			*brelocs;

	/* Number of active/priority band relocates */
	size_t					num_active;

	/* Priority band relocates queue */
	TAILQ_HEAD(, ftl_band_reloc)		prio_queue;

	/* Active band relocates queue */
	TAILQ_HEAD(, ftl_band_reloc)		active_queue;

	/* Pending band relocates queue */
	TAILQ_HEAD(, ftl_band_reloc)		pending_queue;
};

/* Returns true while at least one band is counted as being defragged. */
bool
ftl_reloc_is_defrag_active(const struct ftl_reloc *reloc)
{
	return reloc->num_defrag_bands > 0;
}

/* Returns the iterator's stored offset for the zone it currently points at. */
static size_t
ftl_reloc_iter_zone_offset(struct ftl_band_reloc *breloc)
{
	size_t zone = breloc->iter.zone_current;

	return breloc->iter.zone_offset[zone];
}

/*
 * Returns non-zero when the iterator's offset in the current zone equals the
 * number of blocks in a zone.
 */
static size_t
ftl_reloc_iter_zone_done(struct ftl_band_reloc *breloc)
{
	size_t num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev);

	return ftl_reloc_iter_zone_offset(breloc) == num_blocks;
}

/*
 * Clear a block in the relocation bitmap and decrement the count of blocks
 * left to relocate. A block that is not set is left alone.
 */
static void
ftl_reloc_clr_block(struct ftl_band_reloc *breloc, size_t block_off)
{
	if (!spdk_bit_array_get(breloc->reloc_map, block_off)) {
		return;
	}

	spdk_bit_array_clear(breloc->reloc_map, block_off);
	assert(breloc->num_blocks);
	breloc->num_blocks--;
}

/*
 * Completion callback of an LBA map read (arg is the move; io is unused):
 * the move goes to the WRITE state and back onto the band's move queue.
 * A non-zero status is only caught by the assert.
 */
static void
ftl_reloc_read_lba_map_cb(struct ftl_io *io, void *arg, int status)
{
	struct ftl_reloc_move *move = arg;
	struct ftl_band_reloc *breloc = move->breloc;

	breloc->num_outstanding--;
	assert(status == 0);
	move->state = FTL_RELOC_STATE_WRITE;
	STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry);
}

/*
 * Start reading the part of the band's LBA map that covers the move and count
 * the move as outstanding. Returns the result of ftl_band_read_lba_map().
 */
static int
ftl_reloc_read_lba_map(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move)
{
	struct ftl_band *band = breloc->band;

	breloc->num_outstanding++;
	return ftl_band_read_lba_map(band, ftl_band_block_offset_from_addr(band, move->addr),
				     move->num_blocks, ftl_reloc_read_lba_map_cb, move);
}

+static void +ftl_reloc_prep(struct ftl_band_reloc *breloc) +{ + struct ftl_band *band = breloc->band; + struct ftl_reloc *reloc = breloc->parent; + struct ftl_reloc_move *move; + size_t i; + + reloc->num_active++; + + if (!band->high_prio) { + if (ftl_band_alloc_lba_map(band)) { + SPDK_ERRLOG("Failed to allocate lba map\n"); + assert(false); + } + } else { + ftl_band_acquire_lba_map(band); + } + + for (i = 0; i < reloc->max_qdepth; ++i) { + move = &breloc->moves[i]; + move->state = FTL_RELOC_STATE_READ; + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); + } +} + +static void +ftl_reloc_free_move(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move) +{ + assert(move); + spdk_dma_free(move->data); + memset(move, 0, sizeof(*move)); + move->state = FTL_RELOC_STATE_READ; +} + +static void +ftl_reloc_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_reloc_move *move = arg; + struct ftl_addr addr = move->addr; + struct ftl_band_reloc *breloc = move->breloc; + size_t i; + + breloc->num_outstanding--; + + if (status) { + SPDK_ERRLOG("Reloc write failed with status: %d\n", status); + assert(false); + return; + } + + for (i = 0; i < move->num_blocks; ++i) { + addr.offset = move->addr.offset + i; + size_t block_off = ftl_band_block_offset_from_addr(breloc->band, addr); + ftl_reloc_clr_block(breloc, block_off); + } + + ftl_reloc_free_move(breloc, move); + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); +} + +static void +ftl_reloc_read_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_reloc_move *move = arg; + struct ftl_band_reloc *breloc = move->breloc; + + breloc->num_outstanding--; + + /* TODO: We should handle fail on relocation read. We need to inform */ + /* user that this group of blocks is bad (update l2p with bad block address and */ + /* put it to lba_map/sector_lba). Maybe we could also retry read with smaller granularity? 
*/ + if (status) { + SPDK_ERRLOG("Reloc read failed with status: %d\n", status); + assert(false); + return; + } + + move->state = FTL_RELOC_STATE_READ_LBA_MAP; + move->io = NULL; + STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry); +} + +static void +ftl_reloc_iter_reset(struct ftl_band_reloc *breloc) +{ + memset(breloc->iter.zone_offset, 0, ftl_get_num_punits(breloc->band->dev) * + sizeof(*breloc->iter.zone_offset)); + breloc->iter.zone_current = 0; +} + +static size_t +ftl_reloc_iter_block_offset(struct ftl_band_reloc *breloc) +{ + size_t zone_offset = breloc->iter.zone_current * ftl_get_num_blocks_in_zone(breloc->parent->dev); + + return breloc->iter.zone_offset[breloc->iter.zone_current] + zone_offset; +} + +static void +ftl_reloc_iter_next_zone(struct ftl_band_reloc *breloc) +{ + size_t num_zones = ftl_get_num_punits(breloc->band->dev); + + breloc->iter.zone_current = (breloc->iter.zone_current + 1) % num_zones; +} + +static int +ftl_reloc_block_valid(struct ftl_band_reloc *breloc, size_t block_off) +{ + struct ftl_addr addr = ftl_band_addr_from_block_offset(breloc->band, block_off); + + return ftl_addr_is_written(breloc->band, addr) && + spdk_bit_array_get(breloc->reloc_map, block_off) && + ftl_band_block_offset_valid(breloc->band, block_off); +} + +static int +ftl_reloc_iter_next(struct ftl_band_reloc *breloc, size_t *block_off) +{ + size_t zone = breloc->iter.zone_current; + + *block_off = ftl_reloc_iter_block_offset(breloc); + + if (ftl_reloc_iter_zone_done(breloc)) { + return 0; + } + + breloc->iter.zone_offset[zone]++; + + if (!ftl_reloc_block_valid(breloc, *block_off)) { + ftl_reloc_clr_block(breloc, *block_off); + return 0; + } + + return 1; +} + +static int +ftl_reloc_first_valid_block(struct ftl_band_reloc *breloc, size_t *block_off) +{ + size_t i, num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev); + + for (i = ftl_reloc_iter_zone_offset(breloc); i < num_blocks; ++i) { + if (ftl_reloc_iter_next(breloc, block_off)) { + return 1; + } + } 
+ + return 0; +} + +static int +ftl_reloc_iter_done(struct ftl_band_reloc *breloc) +{ + size_t i; + size_t num_zones = ftl_get_num_punits(breloc->band->dev); + size_t num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev); + + for (i = 0; i < num_zones; ++i) { + if (breloc->iter.zone_offset[i] != num_blocks) { + return 0; + } + } + + return 1; +} + +static size_t +ftl_reloc_find_valid_blocks(struct ftl_band_reloc *breloc, + size_t _num_blocks, struct ftl_addr *addr) +{ + size_t block_off, num_blocks = 0; + + if (!ftl_reloc_first_valid_block(breloc, &block_off)) { + return 0; + } + + *addr = ftl_band_addr_from_block_offset(breloc->band, block_off); + + for (num_blocks = 1; num_blocks < _num_blocks; num_blocks++) { + if (!ftl_reloc_iter_next(breloc, &block_off)) { + break; + } + } + + return num_blocks; +} + +static size_t +ftl_reloc_next_blocks(struct ftl_band_reloc *breloc, struct ftl_addr *addr) +{ + size_t i, num_blocks = 0; + struct spdk_ftl_dev *dev = breloc->parent->dev; + + for (i = 0; i < ftl_get_num_punits(dev); ++i) { + num_blocks = ftl_reloc_find_valid_blocks(breloc, breloc->parent->xfer_size, addr); + ftl_reloc_iter_next_zone(breloc); + + if (num_blocks || ftl_reloc_iter_done(breloc)) { + break; + } + } + + return num_blocks; +} + +static struct ftl_io * +ftl_reloc_io_init(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move, + ftl_io_fn fn, enum ftl_io_type io_type, int flags) +{ + size_t block_off, i; + struct ftl_addr addr = move->addr; + struct ftl_io *io = NULL; + struct ftl_io_init_opts opts = { + .dev = breloc->parent->dev, + .band = breloc->band, + .size = sizeof(*io), + .flags = flags | FTL_IO_INTERNAL | FTL_IO_PHYSICAL_MODE, + .type = io_type, + .num_blocks = move->num_blocks, + .iovs = { + { + .iov_base = move->data, + .iov_len = move->num_blocks * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + .cb_fn = fn, + }; + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->cb_ctx = move; + io->addr = move->addr; + + if 
(flags & FTL_IO_VECTOR_LBA) {
+		/* Fill the per-block LBA vector from the band's LBA map */
+		for (i = 0; i < io->num_blocks; ++i, ++addr.offset) {
+			block_off = ftl_band_block_offset_from_addr(breloc->band, addr);
+
+			/* Blocks that are no longer valid are written with an invalid LBA */
+			if (!ftl_band_block_offset_valid(breloc->band, block_off)) {
+				io->lba.vector[i] = FTL_LBA_INVALID;
+				continue;
+			}
+
+			io->lba.vector[i] = breloc->band->lba_map.map[block_off];
+		}
+	}
+
+	ftl_trace_lba_io_init(io->dev, io);
+
+	return io;
+}
+
+/*
+ * Submit the write stage of a move.
+ *
+ * The write IO is created here unless the move already carries one. When the
+ * IO cannot be created the move is released (its data buffer is freed and it
+ * is reset to the READ state) and put back on the band's move queue.
+ *
+ * Returns 0 once the write has been submitted, -ENOMEM if the IO could not be
+ * created.
+ */
+static int
+ftl_reloc_write(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move)
+{
+	int io_flags = FTL_IO_WEAK | FTL_IO_VECTOR_LBA | FTL_IO_BYPASS_CACHE;
+
+	if (spdk_likely(!move->io)) {
+		move->io = ftl_reloc_io_init(breloc, move, ftl_reloc_write_cb,
+					     FTL_IO_WRITE, io_flags);
+		if (!move->io) {
+			ftl_reloc_free_move(breloc, move);
+			STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry);
+			return -ENOMEM;
+		}
+	}
+
+	breloc->num_outstanding++;
+	ftl_io_write(move->io);
+	return 0;
+}
+
+/*
+ * Submit the read stage of a move.
+ *
+ * Picks the next run of blocks to relocate from the band iterator, allocates a
+ * data buffer for it and issues the read. If the iterator yields no blocks the
+ * move is simply not re-queued and 0 is returned.
+ *
+ * On any resource failure (data buffer or IO) the move is released and put
+ * back on the band's move queue so that it is retried on a later pass instead
+ * of being dropped, and -1 is returned.
+ */
+static int
+ftl_reloc_read(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move)
+{
+	struct ftl_addr addr = {};
+
+	move->num_blocks = ftl_reloc_next_blocks(breloc, &addr);
+	move->breloc = breloc;
+	move->addr = addr;
+
+	if (!move->num_blocks) {
+		return 0;
+	}
+
+	move->data = spdk_dma_malloc(FTL_BLOCK_SIZE * move->num_blocks, 4096, NULL);
+	if (!move->data) {
+		/* Keep the move on the queue, same as the IO allocation failure below */
+		ftl_reloc_free_move(breloc, move);
+		STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry);
+		SPDK_ERRLOG("Failed to allocate data buffer for relocation.\n");
+		return -1;
+	}
+
+	move->io = ftl_reloc_io_init(breloc, move, ftl_reloc_read_cb, FTL_IO_READ, 0);
+	if (!move->io) {
+		ftl_reloc_free_move(breloc, move);
+		STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry);
+		SPDK_ERRLOG("Failed to initialize io for relocation.\n");
+		return -1;
+	}
+
+	breloc->num_outstanding++;
+	ftl_io_read(move->io);
+	return 0;
+}
+
+static void
+ftl_reloc_process_moves(struct ftl_band_reloc *breloc)
+{
+	struct ftl_reloc_move *move;
+	STAILQ_HEAD(, ftl_reloc_move) move_queue;
+	int rc = 0;
+
+	/*
+	 * When IO allocation fails, we do not want to retry immediately so keep moves on
+	 * temporary queue
+	 */
+	STAILQ_INIT(&move_queue);
+	STAILQ_SWAP(&breloc->move_queue, &move_queue,
ftl_reloc_move); + + while (!STAILQ_EMPTY(&move_queue)) { + move = STAILQ_FIRST(&move_queue); + STAILQ_REMOVE_HEAD(&move_queue, entry); + + switch (move->state) { + case FTL_RELOC_STATE_READ_LBA_MAP: + rc = ftl_reloc_read_lba_map(breloc, move); + break; + case FTL_RELOC_STATE_READ: + rc = ftl_reloc_read(breloc, move); + break; + case FTL_RELOC_STATE_WRITE: + rc = ftl_reloc_write(breloc, move); + break; + default: + assert(false); + break; + } + + if (rc) { + SPDK_ERRLOG("Move queue processing failed\n"); + assert(false); + } + } +} + +static bool +ftl_reloc_done(struct ftl_band_reloc *breloc) +{ + return !breloc->num_outstanding && STAILQ_EMPTY(&breloc->move_queue); +} + +static void +ftl_reloc_release(struct ftl_band_reloc *breloc) +{ + struct ftl_reloc *reloc = breloc->parent; + struct ftl_band *band = breloc->band; + + ftl_reloc_iter_reset(breloc); + ftl_band_release_lba_map(band); + reloc->num_active--; + + if (breloc->state == FTL_BAND_RELOC_STATE_HIGH_PRIO) { + /* High prio band must be relocated as a whole and ANM events will be ignored */ + assert(breloc->num_blocks == 0 && ftl_band_empty(band)); + TAILQ_REMOVE(&reloc->prio_queue, breloc, entry); + band->high_prio = 0; + breloc->state = FTL_BAND_RELOC_STATE_INACTIVE; + } else { + assert(breloc->state == FTL_BAND_RELOC_STATE_ACTIVE); + TAILQ_REMOVE(&reloc->active_queue, breloc, entry); + breloc->state = FTL_BAND_RELOC_STATE_INACTIVE; + + /* If we got ANM event during relocation put such band back to pending queue */ + if (breloc->num_blocks != 0) { + breloc->state = FTL_BAND_RELOC_STATE_PENDING; + TAILQ_INSERT_TAIL(&reloc->pending_queue, breloc, entry); + return; + } + } + + if (ftl_band_empty(band) && band->state == FTL_BAND_STATE_CLOSED) { + ftl_band_set_state(breloc->band, FTL_BAND_STATE_FREE); + + if (breloc->defrag) { + breloc->defrag = false; + assert(reloc->num_defrag_bands > 0); + reloc->num_defrag_bands--; + } + } +} + +static void +ftl_process_reloc(struct ftl_band_reloc *breloc) +{ + 
ftl_reloc_process_moves(breloc); + + if (ftl_reloc_done(breloc)) { + ftl_reloc_release(breloc); + } +} + +static int +ftl_band_reloc_init(struct ftl_reloc *reloc, struct ftl_band_reloc *breloc, + struct ftl_band *band) +{ + breloc->band = band; + breloc->parent = reloc; + + breloc->reloc_map = spdk_bit_array_create(ftl_get_num_blocks_in_band(reloc->dev)); + if (!breloc->reloc_map) { + SPDK_ERRLOG("Failed to initialize reloc map"); + return -1; + } + + breloc->iter.zone_offset = calloc(ftl_get_num_punits(band->dev), + sizeof(*breloc->iter.zone_offset)); + if (!breloc->iter.zone_offset) { + SPDK_ERRLOG("Failed to initialize reloc iterator"); + return -1; + } + + STAILQ_INIT(&breloc->move_queue); + + breloc->moves = calloc(reloc->max_qdepth, sizeof(*breloc->moves)); + if (!breloc->moves) { + return -1; + } + + return 0; +} + +static void +ftl_band_reloc_free(struct ftl_band_reloc *breloc) +{ + struct ftl_reloc_move *move; + + if (!breloc) { + return; + } + + assert(breloc->num_outstanding == 0); + + /* Drain write queue if there is active band relocation during shutdown */ + if (breloc->state == FTL_BAND_RELOC_STATE_ACTIVE || + breloc->state == FTL_BAND_RELOC_STATE_HIGH_PRIO) { + assert(breloc->parent->halt); + STAILQ_FOREACH(move, &breloc->move_queue, entry) { + ftl_reloc_free_move(breloc, move); + } + } + + spdk_bit_array_free(&breloc->reloc_map); + free(breloc->iter.zone_offset); + free(breloc->moves); +} + +struct ftl_reloc * +ftl_reloc_init(struct spdk_ftl_dev *dev) +{ + struct ftl_reloc *reloc; + size_t i; + + reloc = calloc(1, sizeof(*reloc)); + if (!reloc) { + return NULL; + } + + reloc->dev = dev; + reloc->halt = true; + reloc->max_qdepth = dev->conf.max_reloc_qdepth; + reloc->max_active = dev->conf.max_active_relocs; + reloc->xfer_size = dev->xfer_size; + reloc->num_defrag_bands = 0; + + if (reloc->max_qdepth > FTL_RELOC_MAX_MOVES) { + goto error; + } + + reloc->brelocs = calloc(ftl_get_num_bands(dev), sizeof(*reloc->brelocs)); + if (!reloc->brelocs) { + 
goto error; + } + + for (i = 0; i < ftl_get_num_bands(reloc->dev); ++i) { + if (ftl_band_reloc_init(reloc, &reloc->brelocs[i], &dev->bands[i])) { + goto error; + } + } + + TAILQ_INIT(&reloc->pending_queue); + TAILQ_INIT(&reloc->active_queue); + TAILQ_INIT(&reloc->prio_queue); + + return reloc; +error: + ftl_reloc_free(reloc); + return NULL; +} + +void +ftl_reloc_free(struct ftl_reloc *reloc) +{ + size_t i; + + if (!reloc) { + return; + } + + for (i = 0; i < ftl_get_num_bands(reloc->dev); ++i) { + ftl_band_reloc_free(&reloc->brelocs[i]); + } + + free(reloc->brelocs); + free(reloc); +} + +bool +ftl_reloc_is_halted(const struct ftl_reloc *reloc) +{ + return reloc->halt; +} + +void +ftl_reloc_halt(struct ftl_reloc *reloc) +{ + reloc->halt = true; +} + +void +ftl_reloc_resume(struct ftl_reloc *reloc) +{ + reloc->halt = false; +} + +void +ftl_reloc(struct ftl_reloc *reloc) +{ + struct ftl_band_reloc *breloc, *tbreloc; + + if (ftl_reloc_is_halted(reloc)) { + return; + } + + /* Process first band from priority queue and return */ + breloc = TAILQ_FIRST(&reloc->prio_queue); + if (breloc) { + ftl_process_reloc(breloc); + return; + } + + TAILQ_FOREACH_SAFE(breloc, &reloc->pending_queue, entry, tbreloc) { + if (reloc->num_active == reloc->max_active) { + break; + } + + /* Wait for band to close before relocating */ + if (breloc->band->state != FTL_BAND_STATE_CLOSED) { + continue; + } + + ftl_reloc_prep(breloc); + assert(breloc->state == FTL_BAND_RELOC_STATE_PENDING); + TAILQ_REMOVE(&reloc->pending_queue, breloc, entry); + breloc->state = FTL_BAND_RELOC_STATE_ACTIVE; + TAILQ_INSERT_HEAD(&reloc->active_queue, breloc, entry); + } + + TAILQ_FOREACH_SAFE(breloc, &reloc->active_queue, entry, tbreloc) { + assert(breloc->state == FTL_BAND_RELOC_STATE_ACTIVE); + ftl_process_reloc(breloc); + } +} + +void +ftl_reloc_add(struct ftl_reloc *reloc, struct ftl_band *band, size_t offset, + size_t num_blocks, int prio, bool is_defrag) +{ + struct ftl_band_reloc *breloc = 
&reloc->brelocs[band->id]; + size_t i; + + /* No need to add anything if already at high prio - whole band should be relocated */ + if (!prio && band->high_prio) { + return; + } + + pthread_spin_lock(&band->lba_map.lock); + if (band->lba_map.num_vld == 0) { + pthread_spin_unlock(&band->lba_map.lock); + + /* If the band is closed and has no valid blocks, free it */ + if (band->state == FTL_BAND_STATE_CLOSED) { + ftl_band_set_state(band, FTL_BAND_STATE_FREE); + } + + return; + } + pthread_spin_unlock(&band->lba_map.lock); + + for (i = offset; i < offset + num_blocks; ++i) { + if (spdk_bit_array_get(breloc->reloc_map, i)) { + continue; + } + spdk_bit_array_set(breloc->reloc_map, i); + breloc->num_blocks++; + } + + /* If the band is coming from the defrag process, mark it appropriately */ + if (is_defrag) { + assert(offset == 0 && num_blocks == ftl_get_num_blocks_in_band(band->dev)); + reloc->num_defrag_bands++; + breloc->defrag = true; + } + + if (!prio) { + if (breloc->state == FTL_BAND_RELOC_STATE_INACTIVE) { + breloc->state = FTL_BAND_RELOC_STATE_PENDING; + TAILQ_INSERT_HEAD(&reloc->pending_queue, breloc, entry); + } + } else { + bool active = false; + /* If priority band is already on pending or active queue, remove it from it */ + switch (breloc->state) { + case FTL_BAND_RELOC_STATE_PENDING: + TAILQ_REMOVE(&reloc->pending_queue, breloc, entry); + break; + case FTL_BAND_RELOC_STATE_ACTIVE: + active = true; + TAILQ_REMOVE(&reloc->active_queue, breloc, entry); + break; + default: + break; + } + + breloc->state = FTL_BAND_RELOC_STATE_HIGH_PRIO; + TAILQ_INSERT_TAIL(&reloc->prio_queue, breloc, entry); + + /* + * If band has been already on active queue it doesn't need any additional + * resources + */ + if (!active) { + ftl_reloc_prep(breloc); + } + } +} diff --git a/src/spdk/lib/ftl/ftl_reloc.h b/src/spdk/lib/ftl/ftl_reloc.h new file mode 100644 index 000000000..21f49a47d --- /dev/null +++ b/src/spdk/lib/ftl/ftl_reloc.h @@ -0,0 +1,53 @@ +/*- + * BSD LICENSE + * + * 
Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FTL_RELOC_H +#define FTL_RELOC_H + +#include "spdk/stdinc.h" +#include "spdk/ftl.h" + +struct ftl_reloc; +struct ftl_band; + +struct ftl_reloc *ftl_reloc_init(struct spdk_ftl_dev *dev); +void ftl_reloc_free(struct ftl_reloc *reloc); +void ftl_reloc_add(struct ftl_reloc *reloc, struct ftl_band *band, + size_t offset, size_t num_blocks, int prio, bool is_defrag); +void ftl_reloc(struct ftl_reloc *reloc); +void ftl_reloc_halt(struct ftl_reloc *reloc); +void ftl_reloc_resume(struct ftl_reloc *reloc); +bool ftl_reloc_is_halted(const struct ftl_reloc *reloc); +bool ftl_reloc_is_defrag_active(const struct ftl_reloc *reloc); + +#endif /* FTL_RELOC_H */ diff --git a/src/spdk/lib/ftl/ftl_restore.c b/src/spdk/lib/ftl/ftl_restore.c new file mode 100644 index 000000000..6f626645d --- /dev/null +++ b/src/spdk/lib/ftl/ftl_restore.c @@ -0,0 +1,1350 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/ftl.h" +#include "spdk/util.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/crc32.h" + +#include "ftl_core.h" +#include "ftl_band.h" +#include "ftl_io.h" + +struct ftl_restore_band { + struct ftl_restore *parent; + /* Associated band */ + struct ftl_band *band; + /* Status of retrieving this band's metadata */ + enum ftl_md_status md_status; + /* Padded queue link */ + STAILQ_ENTRY(ftl_restore_band) stailq; +}; + +struct ftl_nv_cache_restore; + +/* Describes single phase to be restored from non-volatile cache */ +struct ftl_nv_cache_range { + struct ftl_nv_cache_restore *parent; + /* Start offset */ + uint64_t start_addr; + /* Last block's address */ + uint64_t last_addr; + /* + * Number of blocks (can be smaller than the difference between the last + * and the starting block due to range overlap) + */ + uint64_t num_blocks; + /* Number of blocks already recovered */ + uint64_t num_recovered; + /* Current address during recovery */ + uint64_t current_addr; + /* Phase of the range */ + unsigned int phase; + /* Indicates whether the data from this range needs to be recovered */ + bool recovery; +}; + +struct ftl_nv_cache_block { + struct ftl_nv_cache_restore *parent; + /* Data buffer */ + void *buf; + /* Metadata buffer */ + void *md_buf; + /* Block offset within the cache */ + uint64_t offset; +}; + +struct ftl_nv_cache_restore { + struct ftl_nv_cache *nv_cache; + 
/* IO channel to use */ + struct spdk_io_channel *ioch; + /* + * Non-volatile cache ranges. The ranges can overlap, as we have no + * control over the order of completions. The phase of the range is the + * index within the table. The range with index 0 marks blocks that were + * never written. + */ + struct ftl_nv_cache_range range[FTL_NV_CACHE_PHASE_COUNT]; +#define FTL_NV_CACHE_RESTORE_DEPTH 128 + /* Non-volatile cache buffers */ + struct ftl_nv_cache_block block[FTL_NV_CACHE_RESTORE_DEPTH]; + /* Current address */ + uint64_t current_addr; + /* Number of outstanding requests */ + size_t num_outstanding; + /* Recovery/scan status */ + int status; + /* Current phase of the recovery */ + unsigned int phase; +}; + +struct ftl_restore { + struct spdk_ftl_dev *dev; + /* Completion callback (called for each phase of the restoration) */ + ftl_restore_fn cb; + /* Completion callback context */ + void *cb_arg; + /* Number of inflight IOs */ + unsigned int num_ios; + /* Current band number (index in the below bands array) */ + unsigned int current; + /* Array of bands */ + struct ftl_restore_band *bands; + /* Queue of bands to be padded (due to unsafe shutdown) */ + STAILQ_HEAD(, ftl_restore_band) pad_bands; + /* Status of the padding */ + int pad_status; + /* Metadata buffer */ + void *md_buf; + /* LBA map buffer */ + void *lba_map; + /* Indicates we're in the final phase of the restoration */ + bool final_phase; + /* Non-volatile cache recovery */ + struct ftl_nv_cache_restore nv_cache; +}; + +static int +ftl_restore_tail_md(struct ftl_restore_band *rband); +static void +ftl_pad_zone_cb(struct ftl_io *io, void *arg, int status); +static void +ftl_restore_pad_band(struct ftl_restore_band *rband); + +static void +ftl_restore_free(struct ftl_restore *restore) +{ + unsigned int i; + + if (!restore) { + return; + } + + for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) { + spdk_dma_free(restore->nv_cache.block[i].buf); + } + + spdk_dma_free(restore->md_buf); + 
free(restore->bands); + free(restore); +} + +static struct ftl_restore * +ftl_restore_init(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg) +{ + struct ftl_restore *restore; + struct ftl_restore_band *rband; + size_t i; + + restore = calloc(1, sizeof(*restore)); + if (!restore) { + goto error; + } + + restore->dev = dev; + restore->cb = cb; + restore->cb_arg = cb_arg; + restore->final_phase = false; + + restore->bands = calloc(ftl_get_num_bands(dev), sizeof(*restore->bands)); + if (!restore->bands) { + goto error; + } + + STAILQ_INIT(&restore->pad_bands); + + for (i = 0; i < ftl_get_num_bands(dev); ++i) { + rband = &restore->bands[i]; + rband->band = &dev->bands[i]; + rband->parent = restore; + rband->md_status = FTL_MD_NO_MD; + } + + /* Allocate buffer capable of holding head mds of all bands */ + restore->md_buf = spdk_dma_zmalloc(ftl_get_num_bands(dev) * ftl_head_md_num_blocks(dev) * + FTL_BLOCK_SIZE, 0, NULL); + if (!restore->md_buf) { + goto error; + } + + return restore; +error: + ftl_restore_free(restore); + return NULL; +} + +static void +ftl_restore_complete(struct ftl_restore *restore, int status) +{ + struct ftl_restore *ctx = status ? 
NULL : restore;
+	bool final_phase = restore->final_phase;
+
+	restore->cb(ctx, status, restore->cb_arg);
+	/* The restore context does not outlive a failure or the final phase */
+	if (status || final_phase) {
+		ftl_restore_free(restore);
+	}
+}
+
+/*
+ * qsort() comparator ordering restore bands by ascending sequence number.
+ * Returns a negative value, zero or a positive value as required by qsort().
+ */
+static int
+ftl_band_cmp(const void *lband, const void *rband)
+{
+	uint64_t lseq = ((const struct ftl_restore_band *)lband)->band->seq;
+	uint64_t rseq = ((const struct ftl_restore_band *)rband)->band->seq;
+
+	if (lseq < rseq) {
+		return -1;
+	}
+
+	if (lseq > rseq) {
+		return 1;
+	}
+
+	/* Equal keys must compare equal for the ordering to be consistent */
+	return 0;
+}
+
+/*
+ * Check that no band with successfully read metadata shares its sequence
+ * number with the next band on its list. Returns 0 if consistent, -1 otherwise.
+ */
+static int
+ftl_restore_check_seq(const struct ftl_restore *restore)
+{
+	const struct spdk_ftl_dev *dev = restore->dev;
+	const struct ftl_restore_band *rband;
+	const struct ftl_band *next_band;
+	size_t i;
+
+	for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+		rband = &restore->bands[i];
+		if (rband->md_status != FTL_MD_SUCCESS) {
+			continue;
+		}
+
+		next_band = LIST_NEXT(rband->band, list_entry);
+		if (next_band && rband->band->seq == next_band->seq) {
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Validate the head metadata status of every band and add the number of bands
+ * with valid metadata to *num_valid. Returns false as soon as a band reports a
+ * status other than success, no metadata or IO failure.
+ */
+static bool
+ftl_restore_head_valid(struct spdk_ftl_dev *dev, struct ftl_restore *restore, size_t *num_valid)
+{
+	struct ftl_restore_band *rband;
+	size_t i;
+
+	for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+		rband = &restore->bands[i];
+
+		if (rband->md_status != FTL_MD_SUCCESS &&
+		    rband->md_status != FTL_MD_NO_MD &&
+		    rband->md_status != FTL_MD_IO_FAILURE) {
+			SPDK_ERRLOG("Inconsistent head metadata found on band %u\n",
+				    rband->band->id);
+			return false;
+		}
+
+		if (rband->md_status == FTL_MD_SUCCESS) {
+			(*num_valid)++;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Called once all head metadata reads have finished: validates the results,
+ * sorts the bands by sequence number and completes the current restore phase.
+ */
+static void
+ftl_restore_head_complete(struct ftl_restore *restore)
+{
+	struct spdk_ftl_dev *dev = restore->dev;
+	size_t num_valid = 0;
+	int status = -EIO;
+
+	if (!ftl_restore_head_valid(dev, restore, &num_valid)) {
+		goto out;
+	}
+
+	if (num_valid == 0) {
+		SPDK_ERRLOG("Couldn't find any valid bands\n");
+		goto out;
+	}
+
+	/* Sort bands in sequence number ascending order */
+	qsort(restore->bands, ftl_get_num_bands(dev), sizeof(struct ftl_restore_band),
+	      ftl_band_cmp);
+
+	if (ftl_restore_check_seq(restore)) {
+		SPDK_ERRLOG("Band sequence consistency failed\n");
+		goto out;
+	}
+
+	dev->num_lbas = dev->global_md.num_lbas;
+	status = 0;
+out:
+	ftl_restore_complete(restore, status);
+}
+
+/* Completion callback of a single band's head metadata read */
+static void
+ftl_restore_head_cb(struct ftl_io *io, void *ctx, int status)
+{
+	struct ftl_restore_band *rband = ctx;
+	struct ftl_restore *restore = rband->parent;
+	unsigned int num_ios;
+
+	rband->md_status = status;
+	/* fetch_sub returns the previous value: 1 means this was the last read */
+	num_ios = __atomic_fetch_sub(&restore->num_ios, 1, __ATOMIC_SEQ_CST);
+	assert(num_ios > 0);
+
+	if (num_ios == 1) {
+		ftl_restore_head_complete(restore);
+	}
+}
+
+/*
+ * Issue head metadata reads for all bands. Each band gets its own slice of
+ * restore->md_buf. Reads that could not be submitted are subtracted from the
+ * outstanding IO counter after the loop.
+ */
+static void
+ftl_restore_head_md(void *ctx)
+{
+	struct ftl_restore *restore = ctx;
+	struct spdk_ftl_dev *dev = restore->dev;
+	struct ftl_restore_band *rband;
+	struct ftl_lba_map *lba_map;
+	unsigned int num_failed = 0, num_ios;
+	size_t i;
+
+	restore->num_ios = ftl_get_num_bands(dev);
+
+	for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+		rband = &restore->bands[i];
+		lba_map = &rband->band->lba_map;
+
+		lba_map->dma_buf = restore->md_buf + i * ftl_head_md_num_blocks(dev) * FTL_BLOCK_SIZE;
+
+		if (ftl_band_read_head_md(rband->band, ftl_restore_head_cb, rband)) {
+			if (spdk_likely(rband->band->num_zones)) {
+				SPDK_ERRLOG("Failed to read metadata on band %zu\n", i);
+
+				rband->md_status = FTL_MD_INVALID_CRC;
+
+				/* If the first IO fails, don't bother sending anything else */
+				if (i == 0) {
+					ftl_restore_complete(restore, -EIO);
+					/*
+					 * Completing with an error frees the restore context and
+					 * nothing has been submitted yet, so stop right here.
+					 */
+					return;
+				}
+			}
+
+			num_failed++;
+		}
+	}
+
+	if (spdk_unlikely(num_failed > 0)) {
+		num_ios = __atomic_fetch_sub(&restore->num_ios, num_failed, __ATOMIC_SEQ_CST);
+		/* Every remaining IO was a failed submission, so no callback will complete */
+		if (num_ios == num_failed) {
+			ftl_restore_complete(restore, -EIO);
+		}
+	}
+}
+
+/*
+ * Start restoring the device's metadata. The head metadata reads are started
+ * on the core thread; cb is invoked through ftl_restore_complete().
+ * Returns 0 on success, -ENOMEM if the restore context cannot be allocated.
+ */
+int
+ftl_restore_md(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg)
+{
+	struct ftl_restore *restore;
+
+	restore = ftl_restore_init(dev, cb, cb_arg);
+	if (!restore) {
+		return -ENOMEM;
+	}
+
+	spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_restore_head_md, restore);
+
+	return 0;
+}
+
+static int
+ftl_restore_l2p(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_addr addr; + uint64_t lba; + size_t i; + + for (i = 0; i < ftl_get_num_blocks_in_band(band->dev); ++i) { + if (!spdk_bit_array_get(band->lba_map.vld, i)) { + continue; + } + + lba = band->lba_map.map[i]; + if (lba >= dev->num_lbas) { + return -1; + } + + addr = ftl_l2p_get(dev, lba); + if (!ftl_addr_invalid(addr)) { + ftl_invalidate_addr(dev, addr); + } + + addr = ftl_band_addr_from_block_offset(band, i); + + ftl_band_set_addr(band, lba, addr); + ftl_l2p_set(dev, lba, addr); + } + + return 0; +} + +static struct ftl_restore_band * +ftl_restore_next_band(struct ftl_restore *restore) +{ + struct ftl_restore_band *rband; + + for (; restore->current < ftl_get_num_bands(restore->dev); ++restore->current) { + rband = &restore->bands[restore->current]; + + if (spdk_likely(rband->band->num_zones) && + rband->md_status == FTL_MD_SUCCESS) { + restore->current++; + return rband; + } + } + + return NULL; +} + +static void +ftl_nv_cache_restore_complete(struct ftl_nv_cache_restore *restore, int status) +{ + struct ftl_restore *ftl_restore = SPDK_CONTAINEROF(restore, struct ftl_restore, nv_cache); + + restore->status = restore->status ? 
: status; + if (restore->num_outstanding == 0) { + ftl_restore_complete(ftl_restore, restore->status); + } +} + +static void ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); + +static void +ftl_nv_cache_restore_done(struct ftl_nv_cache_restore *restore, uint64_t current_addr) +{ + struct ftl_nv_cache *nv_cache = restore->nv_cache; + + pthread_spin_lock(&nv_cache->lock); + nv_cache->current_addr = current_addr; + nv_cache->ready = true; + pthread_spin_unlock(&nv_cache->lock); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Enabling non-volatile cache (phase: %u, addr: %" + PRIu64")\n", nv_cache->phase, current_addr); + + ftl_nv_cache_restore_complete(restore, 0); +} + +static void +ftl_nv_cache_write_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to write the non-volatile cache metadata header\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + ftl_nv_cache_restore_done(restore, FTL_NV_CACHE_DATA_OFFSET); +} + +static void +ftl_nv_cache_scrub_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Scrubbing non-volatile cache failed\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + nv_cache->phase = 1; + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_write_header_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to write the non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, -EIO); + } +} + +static void +ftl_nv_cache_scrub_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + struct 
ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + rc = ftl_nv_cache_scrub(nv_cache, ftl_nv_cache_scrub_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to scrub the non-volatile cache: %s\n", spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_band_flush_cb(void *ctx, int status) +{ + struct ftl_nv_cache_restore *restore = ctx; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + if (spdk_unlikely(status != 0)) { + SPDK_ERRLOG("Flushing active bands failed: %s\n", spdk_strerror(-status)); + ftl_nv_cache_restore_complete(restore, status); + return; + } + + /* + * Use phase 0 to indicate that the cache is being scrubbed. If the power is lost during + * this process, we'll know it needs to be resumed. + */ + nv_cache->phase = 0; + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_scrub_header_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_wbuf_flush_cb(void *ctx, int status) +{ + struct ftl_nv_cache_restore *restore = ctx; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + int rc; + + if (spdk_unlikely(status != 0)) { + SPDK_ERRLOG("Flushing the write buffer failed: %s\n", spdk_strerror(-status)); + ftl_nv_cache_restore_complete(restore, status); + return; + } + + rc = ftl_flush_active_bands(dev, ftl_nv_cache_band_flush_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to flush active bands: %s\n", spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void 
+ftl_nv_cache_recovery_done(struct ftl_nv_cache_restore *restore) +{ + struct ftl_nv_cache *nv_cache = restore->nv_cache; + struct ftl_nv_cache_range *range_prev, *range_current; + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct spdk_bdev *bdev; + uint64_t current_addr; + int rc; + + range_prev = &restore->range[ftl_nv_cache_prev_phase(nv_cache->phase)]; + range_current = &restore->range[nv_cache->phase]; + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + /* + * If there are more than two ranges or the ranges overlap, scrub the non-volatile cache to + * make sure that any subsequent power loss will find the cache in usable state + */ + if ((range_prev->num_blocks + range_current->num_blocks < nv_cache->num_data_blocks) || + (range_prev->start_addr < range_current->last_addr && + range_current->start_addr < range_prev->last_addr)) { + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache inconsistency detected\n"); + + rc = ftl_flush_wbuf(dev, ftl_nv_cache_wbuf_flush_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to flush the write buffer: %s\n", spdk_strerror(-rc)); + ftl_nv_cache_restore_complete(restore, rc); + } + + return; + } + + /* The latest phase is the one written in the header (set in nvc_cache->phase) */ + current_addr = range_current->last_addr + 1; + + /* + * The first range might be empty (only the header was written) or the range might + * end at the last available address, in which case set current address to the + * beginning of the device. 
+ */ + if (range_current->num_blocks == 0 || current_addr >= spdk_bdev_get_num_blocks(bdev)) { + current_addr = FTL_NV_CACHE_DATA_OFFSET; + } + + ftl_nv_cache_restore_done(restore, current_addr); +} + +static void +ftl_nv_cache_recover_block(struct ftl_nv_cache_block *block) +{ + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + struct ftl_nv_cache_range *range = &restore->range[restore->phase]; + int rc; + + assert(range->current_addr <= range->last_addr); + + restore->num_outstanding++; + block->offset = range->current_addr++; + rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch, + block->buf, block->md_buf, + block->offset, 1, ftl_nv_cache_block_read_cb, + block); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n", + block->offset, spdk_strerror(-rc)); + restore->num_outstanding--; + ftl_nv_cache_restore_complete(restore, rc); + } +} + +static void +ftl_nv_cache_recover_range(struct ftl_nv_cache_restore *restore) +{ + struct ftl_nv_cache_range *range; + unsigned int phase = restore->phase; + + do { + /* Find first range with non-zero number of blocks that is marked for recovery */ + range = &restore->range[phase]; + if (range->recovery && range->num_recovered < range->num_blocks) { + break; + } + + phase = ftl_nv_cache_next_phase(phase); + } while (phase != restore->phase); + + /* There are no ranges to be recovered, we're done */ + if (range->num_recovered == range->num_blocks || !range->recovery) { + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache recovery done\n"); + ftl_nv_cache_recovery_done(restore); + return; + } + + range->current_addr = range->start_addr; + restore->phase = phase; + + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Recovering range %u %"PRIu64"-%"PRIu64" (%"PRIu64")\n", + phase, range->start_addr, range->last_addr, range->num_blocks); + + ftl_nv_cache_recover_block(&restore->block[0]); +} + +static void 
+ftl_nv_cache_write_cb(struct ftl_io *io, void *cb_arg, int status) +{ + struct ftl_nv_cache_block *block = cb_arg; + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache_range *range = &restore->range[restore->phase]; + + restore->num_outstanding--; + if (status != 0) { + SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n", + block->offset, spdk_strerror(-status)); + ftl_nv_cache_restore_complete(restore, -ENOMEM); + return; + } + + range->num_recovered++; + if (range->current_addr <= range->last_addr) { + ftl_nv_cache_recover_block(block); + } else if (restore->num_outstanding == 0) { + assert(range->num_recovered == range->num_blocks); + ftl_nv_cache_recover_range(restore); + } +} + +static struct ftl_io * +ftl_nv_cache_alloc_io(struct ftl_nv_cache_block *block, uint64_t lba) +{ + struct ftl_restore *restore = SPDK_CONTAINEROF(block->parent, struct ftl_restore, nv_cache); + struct ftl_io_init_opts opts = { + .dev = restore->dev, + .io = NULL, + .flags = FTL_IO_BYPASS_CACHE, + .type = FTL_IO_WRITE, + .num_blocks = 1, + .cb_fn = ftl_nv_cache_write_cb, + .cb_ctx = block, + .iovs = { + { + .iov_base = block->buf, + .iov_len = FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + }; + struct ftl_io *io; + + io = ftl_io_init_internal(&opts); + if (spdk_unlikely(!io)) { + return NULL; + } + + io->lba.single = lba; + return io; +} + +static void +ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_block *block = cb_arg; + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache_range *range = &restore->range[restore->phase]; + struct ftl_io *io; + unsigned int phase; + uint64_t lba; + + spdk_bdev_free_io(bdev_io); + restore->num_outstanding--; + + if (!success) { + SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64"\n", + block->offset); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + ftl_nv_cache_unpack_lba(*(uint64_t 
*)block->md_buf, &lba, &phase); + if (spdk_unlikely(phase != restore->phase)) { + if (range->current_addr < range->last_addr) { + ftl_nv_cache_recover_block(block); + } else if (restore->num_outstanding == 0) { + ftl_nv_cache_recover_range(restore); + } + + return; + } + + io = ftl_nv_cache_alloc_io(block, lba); + if (spdk_unlikely(!io)) { + SPDK_ERRLOG("Failed to allocate ftl_io during non-volatile cache recovery\n"); + ftl_nv_cache_restore_complete(restore, -ENOMEM); + return; + } + + restore->num_outstanding++; + ftl_io_write(io); +} + +/* + * Since we have no control over the order in which the requests complete in regards to their + * submission, the cache can be in either of the following states: + * - [1 1 1 1 1 1 1 1 1 1]: simplest case, whole cache contains single phase (although it should be + * very rare), + * - [1 1 1 1 3 3 3 3 3 3]: two phases, changing somewhere in the middle with no overlap. This is + * the state left by clean shutdown, + * - [1 1 1 1 3 1 3 3 3 3]: similar to the above, but this time the two ranges overlap. This + * happens when completions are reordered during unsafe shutdown, + * - [2 1 2 1 1 1 1 3 1 3]: three different phases, each one of which can overlap with + * previous/next one. The data from the oldest phase doesn't need to be + * recovered, as it was already being written to, which means it's + * already on the main storage. 
+ */ +static void +ftl_nv_cache_scan_done(struct ftl_nv_cache_restore *restore) +{ + struct ftl_nv_cache *nv_cache = restore->nv_cache; +#if defined(DEBUG) + struct ftl_nv_cache_range *range; + uint64_t i, num_blocks = 0; + + for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) { + range = &restore->range[i]; + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Range %"PRIu64": %"PRIu64"-%"PRIu64" (%" PRIu64 + ")\n", i, range->start_addr, range->last_addr, range->num_blocks); + num_blocks += range->num_blocks; + } + assert(num_blocks == nv_cache->num_data_blocks); +#endif + restore->phase = ftl_nv_cache_prev_phase(nv_cache->phase); + + /* + * Only the latest two phases need to be recovered. The third one, even if present, + * already has to be stored on the main storage, as it's already started to be + * overwritten (only present here because of reordering of requests' completions). + */ + restore->range[nv_cache->phase].recovery = true; + restore->range[restore->phase].recovery = true; + + ftl_nv_cache_recover_range(restore); +} + +static int ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block); + +static void +ftl_nv_cache_scan_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_block *block = cb_arg; + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache_range *range; + struct spdk_bdev *bdev; + unsigned int phase; + uint64_t lba; + + restore->num_outstanding--; + bdev = spdk_bdev_desc_get_bdev(restore->nv_cache->bdev_desc); + spdk_bdev_free_io(bdev_io); + + if (!success) { + SPDK_ERRLOG("Non-volatile cache scan failed on block %"PRIu64"\n", + block->offset); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + /* If we've already hit an error, don't bother with scanning anything else */ + if (spdk_unlikely(restore->status != 0)) { + ftl_nv_cache_restore_complete(restore, restore->status); + return; + } + + ftl_nv_cache_unpack_lba(*(uint64_t *)block->md_buf, &lba, &phase); + range = &restore->range[phase]; + 
range->num_blocks++; + + if (range->start_addr == FTL_LBA_INVALID || range->start_addr > block->offset) { + range->start_addr = block->offset; + } + + if (range->last_addr == FTL_LBA_INVALID || range->last_addr < block->offset) { + range->last_addr = block->offset; + } + + /* All the blocks were read, once they're all completed and we're finished */ + if (restore->current_addr == spdk_bdev_get_num_blocks(bdev)) { + if (restore->num_outstanding == 0) { + ftl_nv_cache_scan_done(restore); + } + + return; + } + + ftl_nv_cache_scan_block(block); +} + +static int +ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block) +{ + struct ftl_nv_cache_restore *restore = block->parent; + struct ftl_nv_cache *nv_cache = restore->nv_cache; + int rc; + + restore->num_outstanding++; + block->offset = restore->current_addr++; + rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch, + block->buf, block->md_buf, + block->offset, 1, ftl_nv_cache_scan_cb, + block); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Non-volatile cache scan failed on block %"PRIu64" (%s)\n", + block->offset, spdk_strerror(-rc)); + restore->num_outstanding--; + ftl_nv_cache_restore_complete(restore, rc); + return rc; + } + + return 0; +} + +static void +ftl_nv_cache_clean_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache_restore *restore = cb_arg; + + spdk_bdev_free_io(bdev_io); + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Unable to write the non-volatile cache metadata header\n"); + ftl_nv_cache_restore_complete(restore, -EIO); + return; + } + + ftl_nv_cache_restore_done(restore, restore->current_addr); +} + +static bool +ftl_nv_cache_header_valid(struct spdk_ftl_dev *dev, const struct ftl_nv_cache_header *hdr) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(dev->nv_cache.bdev_desc); + uint32_t checksum; + + checksum = spdk_crc32c_update(hdr, offsetof(struct ftl_nv_cache_header, checksum), 0); + if (checksum != hdr->checksum) { + 
SPDK_ERRLOG("Invalid header checksum (found: %"PRIu32", expected: %"PRIu32")\n", + checksum, hdr->checksum); + return false; + } + + if (hdr->version != FTL_NV_CACHE_HEADER_VERSION) { + SPDK_ERRLOG("Invalid header version (found: %"PRIu32", expected: %"PRIu32")\n", + hdr->version, FTL_NV_CACHE_HEADER_VERSION); + return false; + } + + if (hdr->size != spdk_bdev_get_num_blocks(bdev)) { + SPDK_ERRLOG("Unexpected size of the non-volatile cache bdev (%"PRIu64", expected: %" + PRIu64")\n", hdr->size, spdk_bdev_get_num_blocks(bdev)); + return false; + } + + if (spdk_uuid_compare(&hdr->uuid, &dev->uuid)) { + SPDK_ERRLOG("Invalid device UUID\n"); + return false; + } + + if (!ftl_nv_cache_phase_is_valid(hdr->phase) && hdr->phase != 0) { + return false; + } + + if ((hdr->current_addr >= spdk_bdev_get_num_blocks(bdev) || + hdr->current_addr < FTL_NV_CACHE_DATA_OFFSET) && + (hdr->current_addr != FTL_LBA_INVALID)) { + SPDK_ERRLOG("Unexpected value of non-volatile cache's current address: %"PRIu64"\n", + hdr->current_addr); + return false; + } + + return true; +} + +static void +ftl_nv_cache_read_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_restore *restore = cb_arg; + struct spdk_ftl_dev *dev = restore->dev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_nv_cache_header *hdr; + struct iovec *iov = NULL; + int iov_cnt = 0, i, rc; + + if (!success) { + SPDK_ERRLOG("Unable to read non-volatile cache metadata header\n"); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + goto out; + } + + spdk_bdev_io_get_iovec(bdev_io, &iov, &iov_cnt); + assert(iov != NULL); + hdr = iov[0].iov_base; + + if (!ftl_nv_cache_header_valid(dev, hdr)) { + ftl_restore_complete(restore, -ENOTRECOVERABLE); + goto out; + } + + /* Remember the latest phase */ + nv_cache->phase = hdr->phase; + + /* If the phase equals zero, we lost power during recovery. We need to finish it up + * by scrubbing the device once again. 
+ */ + if (hdr->phase == 0) { + SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Detected phase 0, restarting scrub\n"); + rc = ftl_nv_cache_scrub(nv_cache, ftl_nv_cache_scrub_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to scrub the non-volatile cache: %s\n", + spdk_strerror(-rc)); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + } + + goto out; + } + + /* Valid current_addr means that the shutdown was clean, so we just need to overwrite the + * header to make sure that any power loss occurring before the cache is wrapped won't be + * mistaken for a clean shutdown. + */ + if (hdr->current_addr != FTL_LBA_INVALID) { + restore->nv_cache.current_addr = hdr->current_addr; + + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_clean_header_cb, + &restore->nv_cache); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Failed to overwrite the non-volatile cache header: %s\n", + spdk_strerror(-rc)); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + } + + goto out; + } + + /* Otherwise the shutdown was unexpected, so we need to recover the data from the cache */ + restore->nv_cache.current_addr = FTL_NV_CACHE_DATA_OFFSET; + + for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) { + if (ftl_nv_cache_scan_block(&restore->nv_cache.block[i])) { + break; + } + } +out: + spdk_bdev_free_io(bdev_io); +} + +void +ftl_restore_nv_cache(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg) +{ + struct spdk_ftl_dev *dev = restore->dev; + struct spdk_bdev *bdev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_io_channel *ioch; + struct ftl_nv_cache_restore *nvc_restore = &restore->nv_cache; + struct ftl_nv_cache_block *block; + size_t alignment; + int rc, i; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + alignment = spdk_max(spdk_bdev_get_buf_align(bdev), sizeof(uint64_t)); + + nvc_restore->nv_cache = nv_cache; + nvc_restore->ioch = ioch->cache_ioch; + + restore->final_phase = 
true; + restore->cb = cb; + restore->cb_arg = cb_arg; + + for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) { + block = &nvc_restore->block[i]; + block->parent = nvc_restore; + block->buf = spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev) + + spdk_bdev_get_md_size(bdev), + alignment, NULL); + if (!block->buf) { + /* The memory will be freed in ftl_restore_free */ + SPDK_ERRLOG("Unable to allocate memory\n"); + ftl_restore_complete(restore, -ENOMEM); + return; + } + + block->md_buf = (char *)block->buf + spdk_bdev_get_block_size(bdev); + } + + for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) { + nvc_restore->range[i].parent = nvc_restore; + nvc_restore->range[i].start_addr = FTL_LBA_INVALID; + nvc_restore->range[i].last_addr = FTL_LBA_INVALID; + nvc_restore->range[i].num_blocks = 0; + nvc_restore->range[i].recovery = false; + nvc_restore->range[i].phase = i; + } + + rc = spdk_bdev_read_blocks(nv_cache->bdev_desc, ioch->cache_ioch, nv_cache->dma_buf, + 0, FTL_NV_CACHE_DATA_OFFSET, ftl_nv_cache_read_header_cb, restore); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Failed to read non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + ftl_restore_complete(restore, rc); + } +} + +static bool +ftl_pad_zone_pad_finish(struct ftl_restore_band *rband, bool direct_access) +{ + struct ftl_restore *restore = rband->parent; + struct ftl_restore_band *next_band; + size_t i, num_pad_zones = 0; + + if (spdk_unlikely(restore->pad_status && !restore->num_ios)) { + if (direct_access) { + /* In case of any errors found we want to clear direct access. */ + /* Direct access bands have their own allocated md, which would be lost */ + /* on restore complete otherwise. 
*/ + rband->band->state = FTL_BAND_STATE_CLOSED; + ftl_band_set_direct_access(rband->band, false); + } + ftl_restore_complete(restore, restore->pad_status); + return true; + } + + for (i = 0; i < rband->band->num_zones; ++i) { + if (rband->band->zone_buf[i].info.state != SPDK_BDEV_ZONE_STATE_FULL) { + num_pad_zones++; + } + } + + /* Finished all zones in a band, check if all bands are done */ + if (num_pad_zones == 0) { + if (direct_access) { + rband->band->state = FTL_BAND_STATE_CLOSED; + ftl_band_set_direct_access(rband->band, false); + } + + next_band = STAILQ_NEXT(rband, stailq); + if (!next_band) { + ftl_restore_complete(restore, restore->pad_status); + return true; + } else { + /* Start off padding in the next band */ + ftl_restore_pad_band(next_band); + return true; + } + } + + return false; +} + +static struct ftl_io * +ftl_restore_init_pad_io(struct ftl_restore_band *rband, void *buffer, + struct ftl_addr addr) +{ + struct ftl_band *band = rband->band; + struct spdk_ftl_dev *dev = band->dev; + int flags = FTL_IO_PAD | FTL_IO_INTERNAL | FTL_IO_PHYSICAL_MODE | FTL_IO_MD | + FTL_IO_DIRECT_ACCESS; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .band = band, + .size = sizeof(struct ftl_io), + .flags = flags, + .type = FTL_IO_WRITE, + .num_blocks = dev->xfer_size, + .cb_fn = ftl_pad_zone_cb, + .cb_ctx = rband, + .iovs = { + { + .iov_base = buffer, + .iov_len = dev->xfer_size * FTL_BLOCK_SIZE, + } + }, + .iovcnt = 1, + .parent = NULL, + }; + struct ftl_io *io; + + io = ftl_io_init_internal(&opts); + if (spdk_unlikely(!io)) { + return NULL; + } + + io->addr = addr; + rband->parent->num_ios++; + + return io; +} + +static void +ftl_pad_zone_cb(struct ftl_io *io, void *arg, int status) +{ + struct ftl_restore_band *rband = arg; + struct ftl_restore *restore = rband->parent; + struct ftl_band *band = io->band; + struct ftl_zone *zone; + struct ftl_io *new_io; + uint64_t offset; + + restore->num_ios--; + /* TODO check for next unit error vs early 
close error */ + if (status) { + restore->pad_status = status; + goto end; + } + + offset = io->addr.offset % ftl_get_num_blocks_in_zone(restore->dev); + if (offset + io->num_blocks == ftl_get_num_blocks_in_zone(restore->dev)) { + zone = ftl_band_zone_from_addr(band, io->addr); + zone->info.state = SPDK_BDEV_ZONE_STATE_FULL; + } else { + struct ftl_addr addr = io->addr; + addr.offset += io->num_blocks; + new_io = ftl_restore_init_pad_io(rband, io->iov[0].iov_base, addr); + if (spdk_unlikely(!new_io)) { + restore->pad_status = -ENOMEM; + goto end; + } + + ftl_io_write(new_io); + return; + } + +end: + spdk_dma_free(io->iov[0].iov_base); + ftl_pad_zone_pad_finish(rband, true); +} + +static void +ftl_restore_pad_band(struct ftl_restore_band *rband) +{ + struct ftl_restore *restore = rband->parent; + struct ftl_band *band = rband->band; + struct spdk_ftl_dev *dev = band->dev; + void *buffer = NULL; + struct ftl_io *io; + struct ftl_addr addr; + size_t i; + int rc = 0; + + /* Check if some zones are not closed */ + if (ftl_pad_zone_pad_finish(rband, false)) { + /* + * If we're here, end meta wasn't recognized, but the whole band is written + * Assume the band was padded and ignore it + */ + return; + } + + band->state = FTL_BAND_STATE_OPEN; + rc = ftl_band_set_direct_access(band, true); + if (rc) { + ftl_restore_complete(restore, rc); + return; + } + + for (i = 0; i < band->num_zones; ++i) { + if (band->zone_buf[i].info.state == SPDK_BDEV_ZONE_STATE_FULL) { + continue; + } + + addr.offset = band->zone_buf[i].info.write_pointer; + + buffer = spdk_dma_zmalloc(FTL_BLOCK_SIZE * dev->xfer_size, 0, NULL); + if (spdk_unlikely(!buffer)) { + rc = -ENOMEM; + goto error; + } + + io = ftl_restore_init_pad_io(rband, buffer, addr); + if (spdk_unlikely(!io)) { + rc = -ENOMEM; + spdk_dma_free(buffer); + goto error; + } + + ftl_io_write(io); + } + + return; + +error: + restore->pad_status = rc; + ftl_pad_zone_pad_finish(rband, true); +} + +static void +ftl_restore_pad_open_bands(void 
*ctx) +{ + struct ftl_restore *restore = ctx; + + ftl_restore_pad_band(STAILQ_FIRST(&restore->pad_bands)); +} + +static void +ftl_restore_tail_md_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_restore_band *rband = ctx; + struct ftl_restore *restore = rband->parent; + struct spdk_ftl_dev *dev = restore->dev; + + if (status) { + if (!dev->conf.allow_open_bands) { + SPDK_ERRLOG("%s while restoring tail md in band %u.\n", + spdk_strerror(-status), rband->band->id); + ftl_band_release_lba_map(rband->band); + ftl_restore_complete(restore, status); + return; + } else { + SPDK_ERRLOG("%s while restoring tail md. Will attempt to pad band %u.\n", + spdk_strerror(-status), rband->band->id); + STAILQ_INSERT_TAIL(&restore->pad_bands, rband, stailq); + } + } + + if (!status && ftl_restore_l2p(rband->band)) { + ftl_band_release_lba_map(rband->band); + ftl_restore_complete(restore, -ENOTRECOVERABLE); + return; + } + ftl_band_release_lba_map(rband->band); + + rband = ftl_restore_next_band(restore); + if (!rband) { + if (!STAILQ_EMPTY(&restore->pad_bands)) { + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_restore_pad_open_bands, + restore); + } else { + ftl_restore_complete(restore, 0); + } + + return; + } + + ftl_restore_tail_md(rband); +} + +static int +ftl_restore_tail_md(struct ftl_restore_band *rband) +{ + struct ftl_restore *restore = rband->parent; + struct ftl_band *band = rband->band; + + if (ftl_band_alloc_lba_map(band)) { + SPDK_ERRLOG("Failed to allocate lba map\n"); + ftl_restore_complete(restore, -ENOMEM); + return -ENOMEM; + } + + if (ftl_band_read_tail_md(band, band->tail_md_addr, ftl_restore_tail_md_cb, rband)) { + SPDK_ERRLOG("Failed to send tail metadata read\n"); + ftl_restore_complete(restore, -EIO); + return -EIO; + } + + return 0; +} + +int +ftl_restore_device(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg) +{ + struct spdk_ftl_dev *dev = restore->dev; + struct ftl_restore_band *rband; + + restore->current = 0; + restore->cb = 
cb; + restore->cb_arg = cb_arg; + restore->final_phase = dev->nv_cache.bdev_desc == NULL; + + /* If restore_device is called, there must be at least one valid band */ + rband = ftl_restore_next_band(restore); + assert(rband); + return ftl_restore_tail_md(rband); +} diff --git a/src/spdk/lib/ftl/ftl_trace.c b/src/spdk/lib/ftl/ftl_trace.c new file mode 100644 index 000000000..ba66323ad --- /dev/null +++ b/src/spdk/lib/ftl/ftl_trace.c @@ -0,0 +1,361 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/trace.h" + +#include "ftl_core.h" +#include "ftl_trace.h" +#include "ftl_io.h" +#include "ftl_band.h" + +#if defined(DEBUG) + +#define OWNER_FTL 0x20 +#define TRACE_GROUP_FTL 0x6 + +enum ftl_trace_source { + FTL_TRACE_SOURCE_INTERNAL, + FTL_TRACE_SOURCE_USER, + FTL_TRACE_SOURCE_MAX, +}; + +#define FTL_TPOINT_ID(id, src) SPDK_TPOINT_ID(TRACE_GROUP_FTL, (((id) << 1) | (!!(src)))) + +#define FTL_TRACE_BAND_DEFRAG(src) FTL_TPOINT_ID(0, src) +#define FTL_TRACE_BAND_WRITE(src) FTL_TPOINT_ID(1, src) +#define FTL_TRACE_LIMITS(src) FTL_TPOINT_ID(2, src) +#define FTL_TRACE_WBUF_POP(src) FTL_TPOINT_ID(3, src) + +#define FTL_TRACE_READ_SCHEDULE(src) FTL_TPOINT_ID(4, src) +#define FTL_TRACE_READ_SUBMISSION(src) FTL_TPOINT_ID(5, src) +#define FTL_TRACE_READ_COMPLETION_INVALID(src) FTL_TPOINT_ID(6, src) +#define FTL_TRACE_READ_COMPLETION_CACHE(src) FTL_TPOINT_ID(7, src) +#define FTL_TRACE_READ_COMPLETION_DISK(src) FTL_TPOINT_ID(8, src) + +#define FTL_TRACE_MD_READ_SCHEDULE(src) FTL_TPOINT_ID(9, src) +#define FTL_TRACE_MD_READ_SUBMISSION(src) FTL_TPOINT_ID(10, src) +#define FTL_TRACE_MD_READ_COMPLETION(src) FTL_TPOINT_ID(11, src) + +#define FTL_TRACE_WRITE_SCHEDULE(src) FTL_TPOINT_ID(12, src) +#define FTL_TRACE_WRITE_WBUF_FILL(src) FTL_TPOINT_ID(13, src) +#define FTL_TRACE_WRITE_SUBMISSION(src) FTL_TPOINT_ID(14, src) +#define FTL_TRACE_WRITE_COMPLETION(src) FTL_TPOINT_ID(15, src) + +#define FTL_TRACE_MD_WRITE_SCHEDULE(src) 
FTL_TPOINT_ID(16, src)
+#define FTL_TRACE_MD_WRITE_SUBMISSION(src) FTL_TPOINT_ID(17, src)
+#define FTL_TRACE_MD_WRITE_COMPLETION(src) FTL_TPOINT_ID(18, src)
+/* Erase tracepoints; src is the trace source the tracepoint is raised for. */
+#define FTL_TRACE_ERASE_SUBMISSION(src) FTL_TPOINT_ID(19, src)
+#define FTL_TRACE_ERASE_COMPLETION(src) FTL_TPOINT_ID(20, src)
+/* Registers the FTL trace owner and a description for every tracepoint of every trace source. */
+SPDK_TRACE_REGISTER_FN(ftl_trace_func, "ftl", TRACE_GROUP_FTL)
+{
+	const char source[] = { 'i', 'u' };	/* one-letter description prefix per source index */
+	char descbuf[128];
+	int i;
+
+	spdk_trace_register_owner(OWNER_FTL, 'f');
+	/* NOTE(review): source[] has two entries; presumably FTL_TRACE_SOURCE_MAX == 2 - confirm. */
+	for (i = 0; i < FTL_TRACE_SOURCE_MAX; ++i) {
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "band_defrag");
+		spdk_trace_register_description(descbuf, FTL_TRACE_BAND_DEFRAG(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "band: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "band_write");
+		spdk_trace_register_description(descbuf, FTL_TRACE_BAND_WRITE(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "band: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "limits");
+		spdk_trace_register_description(descbuf, FTL_TRACE_LIMITS(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "limits: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "rwb_pop");
+		spdk_trace_register_description(descbuf, FTL_TRACE_WBUF_POP(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+		/* Metadata read tracepoints: schedule, submission, completion. */
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_sched");
+		spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_SCHEDULE(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_submit");
+		spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_SUBMISSION(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_cmpl");
+		spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_COMPLETION(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+		/* Metadata write tracepoints: schedule, submission, completion. */
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_sched");
+		spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_SCHEDULE(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_submit");
+		spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_SUBMISSION(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_cmpl");
+		spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_COMPLETION(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+		/* Read tracepoints: schedule, submission and the three completion variants. */
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_sched");
+		spdk_trace_register_description(descbuf, FTL_TRACE_READ_SCHEDULE(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_submit");
+		spdk_trace_register_description(descbuf, FTL_TRACE_READ_SUBMISSION(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_invld");
+		spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_INVALID(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_cache");
+		spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_CACHE(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_ssd");
+		spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_DISK(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+		/* Write tracepoints: schedule, write-buffer fill, submission, completion. */
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_sched");
+		spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_SCHEDULE(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "rwb_fill");
+		spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_WBUF_FILL(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_submit");
+		spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_SUBMISSION(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_cmpl");
+		spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_COMPLETION(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+		/* Erase tracepoints: submission and completion. */
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "erase_submit");
+		spdk_trace_register_description(descbuf, FTL_TRACE_ERASE_SUBMISSION(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+		snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "erase_cmpl");
+		spdk_trace_register_description(descbuf, FTL_TRACE_ERASE_COMPLETION(i),
+						OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+	}
+}
+/* Maps an IO to its trace source: internal if FTL_IO_INTERNAL is set in io->flags, else user. */
+static uint16_t
+ftl_trace_io_source(const struct ftl_io *io)
+{
+	if (io->flags & FTL_IO_INTERNAL) {
+		return FTL_TRACE_SOURCE_INTERNAL;
+	} else {
+		return FTL_TRACE_SOURCE_USER;
+	}
+}
+/* Atomically increments trace->id and returns the value it held before the increment. */
+static uint64_t
+ftl_trace_next_id(struct ftl_trace *trace)
+{
+	assert(trace->id != FTL_TRACE_INVALID_ID);	/* the counter must not sit on the sentinel */
+	return __atomic_fetch_add(&trace->id, 1, __ATOMIC_SEQ_CST);
+}
+/* Records the band-defrag tracepoint (internal source) with band->lba_map.num_vld and band->id. */
+void
+ftl_trace_defrag_band(struct spdk_ftl_dev *dev, const struct ftl_band *band)
+{
+	struct ftl_trace *trace = &dev->stats.trace;
+
+	spdk_trace_record(FTL_TRACE_BAND_DEFRAG(FTL_TRACE_SOURCE_INTERNAL),
+			  ftl_trace_next_id(trace), 0, band->lba_map.num_vld, band->id);
+}
+/* Records the band-write tracepoint (internal source) with a fresh trace id and band->id. */
+void
+ftl_trace_write_band(struct spdk_ftl_dev *dev, const struct ftl_band *band)
+{
+	struct ftl_trace *trace = &dev->stats.trace;
+
+	spdk_trace_record(FTL_TRACE_BAND_WRITE(FTL_TRACE_SOURCE_INTERNAL),
+			  ftl_trace_next_id(trace), 0, 0, band->id);
+}
+/* Records the "schedule" tracepoint matching the IO's FTL_IO_MD flag and type (read or write). */
+void
+ftl_trace_lba_io_init(struct spdk_ftl_dev *dev, const struct ftl_io *io)
+{
+	uint16_t tpoint_id = 0, source;
+
+	assert(io->trace != FTL_TRACE_INVALID_ID);
+	source = ftl_trace_io_source(io);
+
+	if (io->flags & FTL_IO_MD) {
+		switch (io->type) {
+		case FTL_IO_READ:
+			tpoint_id = FTL_TRACE_MD_READ_SCHEDULE(source);
+			break;
+		case FTL_IO_WRITE:
+			tpoint_id = FTL_TRACE_MD_WRITE_SCHEDULE(source);
+			break;
+		default:
+			assert(0);	/* any other IO type trips the assertion */
+		}
+	} else {
+		switch (io->type) {
+		case FTL_IO_READ:
+			tpoint_id = FTL_TRACE_READ_SCHEDULE(source);
+			break;
+		case FTL_IO_WRITE:
+			tpoint_id = FTL_TRACE_WRITE_SCHEDULE(source);
+			break;
+		default:
+			assert(0);	/* any other IO type trips the assertion */
+		}
+	}
+	/* If assert() is compiled out, an unexpected type is still recorded with tpoint_id 0. */
+	spdk_trace_record(tpoint_id, io->trace, io->num_blocks, 0, ftl_io_get_lba(io, 0));
+}
+/* Records the write-buffer-fill tracepoint for the IO's source with ftl_io_current_lba(io). */
+void
+ftl_trace_wbuf_fill(struct spdk_ftl_dev *dev, const struct ftl_io *io)
+{
+	assert(io->trace != FTL_TRACE_INVALID_ID);
+
+	spdk_trace_record(FTL_TRACE_WRITE_WBUF_FILL(ftl_trace_io_source(io)), io->trace,
+			  0, 0, ftl_io_current_lba(io));
+}
+/* Records the write-buffer-pop tracepoint for an entry with its addr.offset and lba. */
+void
+ftl_trace_wbuf_pop(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry)
+{
+	uint16_t tpoint_id;
+
+	assert(entry->trace != FTL_TRACE_INVALID_ID);
+	/* Same internal/user split as ftl_trace_io_source(), but read from entry->io_flags. */
+	if (entry->io_flags & FTL_IO_INTERNAL) {
+		tpoint_id = FTL_TRACE_WBUF_POP(FTL_TRACE_SOURCE_INTERNAL);
+	} else {
+		tpoint_id = FTL_TRACE_WBUF_POP(FTL_TRACE_SOURCE_USER);
+	}
+
+	spdk_trace_record(tpoint_id, entry->trace, 0, entry->addr.offset, entry->lba);
+}
+/* Records the completion tracepoint for the IO; completion selects the non-MD read variant. */
+void
+ftl_trace_completion(struct spdk_ftl_dev *dev, const struct ftl_io *io,
+		     enum ftl_trace_completion completion)
+{
+	uint16_t tpoint_id = 0, source;
+
+	assert(io->trace != FTL_TRACE_INVALID_ID);
+	source = ftl_trace_io_source(io);
+
+	if (io->flags & FTL_IO_MD) {
+		switch (io->type) {
+		case FTL_IO_READ:
+			tpoint_id = FTL_TRACE_MD_READ_COMPLETION(source);
+			break;
+		case FTL_IO_WRITE:
+			tpoint_id = FTL_TRACE_MD_WRITE_COMPLETION(source);
+			break;
+		default:
+			assert(0);	/* metadata IO is only traced for reads and writes */
+		}
+	} else {
+		switch (io->type) {
+		case FTL_IO_READ:
+			switch (completion) {	/* no default: other values leave tpoint_id at 0 */
+			case FTL_TRACE_COMPLETION_INVALID:
+				tpoint_id = FTL_TRACE_READ_COMPLETION_INVALID(source);
+				break;
+			case FTL_TRACE_COMPLETION_CACHE:
+				tpoint_id = FTL_TRACE_READ_COMPLETION_CACHE(source);
+				break;
+			case FTL_TRACE_COMPLETION_DISK:
+				tpoint_id = FTL_TRACE_READ_COMPLETION_DISK(source);
+				break;
+			}
+			break;
+		case FTL_IO_WRITE:
+			tpoint_id = FTL_TRACE_WRITE_COMPLETION(source);
+			break;
+		case FTL_IO_ERASE:
+			tpoint_id = FTL_TRACE_ERASE_COMPLETION(source);
+			break;
+		default:
+			assert(0);
+		}
+	}
+	/* NOTE(review): io->pos - 1 presumably relies on io->pos >= 1 at completion - confirm. */
+	spdk_trace_record(tpoint_id, io->trace, 0, 0, ftl_io_get_lba(io, io->pos - 1));
+}
+/* Records the submission tracepoint for the IO with addr_cnt and addr.offset. */
+void
+ftl_trace_submission(struct spdk_ftl_dev *dev, const struct ftl_io *io, struct ftl_addr addr,
+		     size_t addr_cnt)
+{
+	uint16_t tpoint_id = 0, source;
+
+	assert(io->trace != FTL_TRACE_INVALID_ID);
+	source = ftl_trace_io_source(io);
+
+	if (io->flags & FTL_IO_MD) {
+		switch (io->type) {
+		case FTL_IO_READ:
+			tpoint_id = FTL_TRACE_MD_READ_SUBMISSION(source);
+			break;
+		case FTL_IO_WRITE:
+			tpoint_id = FTL_TRACE_MD_WRITE_SUBMISSION(source);
+			break;
+		default:
+			assert(0);	/* metadata IO is only traced for reads and writes */
+		}
+	} else {
+		switch (io->type) {
+		case FTL_IO_READ:
+			tpoint_id = FTL_TRACE_READ_SUBMISSION(source);
+			break;
+		case FTL_IO_WRITE:
+			tpoint_id = FTL_TRACE_WRITE_SUBMISSION(source);
+			break;
+		case FTL_IO_ERASE:
+			tpoint_id = FTL_TRACE_ERASE_SUBMISSION(source);
+			break;
+		default:
+			assert(0);
+		}
+	}
+
+	spdk_trace_record(tpoint_id, io->trace, addr_cnt, 0, addr.offset);
+}
+/* Records the limits tracepoint (internal source) with a fresh trace id, num_free and limit. */
+void
+ftl_trace_limits(struct spdk_ftl_dev *dev, int limit, size_t num_free)
+{
+	struct ftl_trace *trace = &dev->stats.trace;
+
+	spdk_trace_record(FTL_TRACE_LIMITS(FTL_TRACE_SOURCE_INTERNAL), ftl_trace_next_id(trace),
+			  num_free, limit, 0);
+}
+/* Returns the next trace id taken from the device's trace state (dev->stats.trace). */
+uint64_t
+ftl_trace_alloc_id(struct spdk_ftl_dev *dev)
+{
+	struct ftl_trace *trace = &dev->stats.trace;
+
+	return ftl_trace_next_id(trace);
+}
+
+#endif /* defined(DEBUG) */
diff --git a/src/spdk/lib/ftl/ftl_trace.h b/src/spdk/lib/ftl/ftl_trace.h
new file mode 100644
index 000000000..52988cff6
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_trace.h
@@ -0,0 +1,84 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef FTL_TRACE_H
+#define FTL_TRACE_H
+
+#include "ftl_addr.h"
+/* Sentinel trace id; ftl_trace_alloc_id() expands to it when DEBUG is not defined. */
+#define FTL_TRACE_INVALID_ID ((uint64_t) -1)
+/* Kind of read completion reported to ftl_trace_completion(). */
+enum ftl_trace_completion {
+	FTL_TRACE_COMPLETION_INVALID,
+	FTL_TRACE_COMPLETION_CACHE,
+	FTL_TRACE_COMPLETION_DISK,
+};
+/* Trace state: holds the counter from which event ids are handed out. */
+struct ftl_trace {
+	/* Monotonically incrementing event id */
+	uint64_t id;
+};
+/* Forward declarations of the types that the prototypes below take by pointer. */
+struct spdk_ftl_dev;
+struct ftl_trace;	/* already defined above; this redeclaration is redundant but harmless */
+struct ftl_io;
+struct ftl_wbuf_entry;
+struct ftl_band;
+/* Tracing API: functions when DEBUG is defined, otherwise macros with empty expansions. */
+#if defined(DEBUG)
+uint64_t ftl_trace_alloc_id(struct spdk_ftl_dev *dev);
+void ftl_trace_defrag_band(struct spdk_ftl_dev *dev, const struct ftl_band *band);
+void ftl_trace_write_band(struct spdk_ftl_dev *dev, const struct ftl_band *band);
+void ftl_trace_lba_io_init(struct spdk_ftl_dev *dev, const struct ftl_io *io);
+void ftl_trace_wbuf_fill(struct spdk_ftl_dev *dev, const struct ftl_io *io);
+void ftl_trace_wbuf_pop(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry);
+void ftl_trace_submission(struct spdk_ftl_dev *dev,
+			  const struct ftl_io *io,
+			  struct ftl_addr addr, size_t addr_cnt);
+void ftl_trace_completion(struct spdk_ftl_dev *dev,
+			  const struct ftl_io *io,
+			  enum ftl_trace_completion type);
+void ftl_trace_limits(struct spdk_ftl_dev *dev, int limit, size_t num_free);
+#else /* defined(DEBUG) */
+#define ftl_trace_alloc_id(dev) FTL_TRACE_INVALID_ID	/* the only stub that yields a value */
+#define ftl_trace_defrag_band(dev, band)	/* this and the stubs below expand to nothing */
+#define ftl_trace_write_band(dev, band)
+#define ftl_trace_lba_io_init(dev, io)
+#define ftl_trace_wbuf_fill(dev, io)
+#define ftl_trace_wbuf_pop(dev, entry)
+#define ftl_trace_submission(dev, io, addr, addr_cnt)
+#define ftl_trace_completion(dev, io, type)
+#define ftl_trace_limits(dev, limits, num_free)
+#endif /* defined(DEBUG) */
+
+#endif /* FTL_TRACE_H */
diff --git a/src/spdk/lib/ftl/spdk_ftl.map b/src/spdk/lib/ftl/spdk_ftl.map
new file mode 100644
index 000000000..141fd01e0
--- /dev/null
+++ b/src/spdk/lib/ftl/spdk_ftl.map
@@ -0,0 +1,14 @@
+{
+	global:
+
+	# public functions
+	spdk_ftl_dev_init;
+	spdk_ftl_dev_free;
+	spdk_ftl_conf_init_defaults;
+	spdk_ftl_dev_get_attrs;
+	spdk_ftl_read;
+	spdk_ftl_write;
+	spdk_ftl_flush;
+
+	local: *;
+};
-- cgit v1.2.3