Diffstat (limited to 'src/spdk/lib/ftl/ftl_core.c')
-rw-r--r-- | src/spdk/lib/ftl/ftl_core.c | 2460
1 file changed, 2460 insertions, 0 deletions
diff --git a/src/spdk/lib/ftl/ftl_core.c b/src/spdk/lib/ftl/ftl_core.c new file mode 100644 index 000000000..b0b448806 --- /dev/null +++ b/src/spdk/lib/ftl/ftl_core.c @@ -0,0 +1,2460 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/likely.h" +#include "spdk/stdinc.h" +#include "spdk/nvme.h" +#include "spdk/thread.h" +#include "spdk/bdev_module.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" +#include "spdk/ftl.h" +#include "spdk/crc32.h" + +#include "ftl_core.h" +#include "ftl_band.h" +#include "ftl_io.h" +#include "ftl_debug.h" +#include "ftl_reloc.h" + +struct ftl_band_flush { + struct spdk_ftl_dev *dev; + /* Number of bands left to be flushed */ + size_t num_bands; + /* User callback */ + spdk_ftl_fn cb_fn; + /* Callback's argument */ + void *cb_arg; + /* List link */ + LIST_ENTRY(ftl_band_flush) list_entry; +}; + +struct ftl_wptr { + /* Owner device */ + struct spdk_ftl_dev *dev; + + /* Current address */ + struct ftl_addr addr; + + /* Band currently being written to */ + struct ftl_band *band; + + /* Current logical block's offset */ + uint64_t offset; + + /* Current zone */ + struct ftl_zone *zone; + + /* Pending IO queue */ + TAILQ_HEAD(, ftl_io) pending_queue; + + /* List link */ + LIST_ENTRY(ftl_wptr) list_entry; + + /* + * If setup in direct mode, there will be no offset or band state update after IO. + * The zoned bdev address is not assigned by wptr, and is instead taken directly + * from the request. 
+ */ + bool direct_mode; + + /* Number of outstanding write requests */ + uint32_t num_outstanding; + + /* Marks that the band related to this wptr needs to be closed as soon as possible */ + bool flush; +}; + +struct ftl_flush { + /* Owner device */ + struct spdk_ftl_dev *dev; + + /* Number of batches to wait for */ + size_t num_req; + + /* Callback */ + struct { + spdk_ftl_fn fn; + void *ctx; + } cb; + + /* Batch bitmap */ + struct spdk_bit_array *bmap; + + /* List link */ + LIST_ENTRY(ftl_flush) list_entry; +}; + +static void +ftl_wptr_free(struct ftl_wptr *wptr) +{ + if (!wptr) { + return; + } + + free(wptr); +} + +static void +ftl_remove_wptr(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_band_flush *flush, *tmp; + + if (spdk_unlikely(wptr->flush)) { + LIST_FOREACH_SAFE(flush, &dev->band_flush_list, list_entry, tmp) { + assert(flush->num_bands > 0); + if (--flush->num_bands == 0) { + flush->cb_fn(flush->cb_arg, 0); + LIST_REMOVE(flush, list_entry); + free(flush); + } + } + } + + LIST_REMOVE(wptr, list_entry); + ftl_wptr_free(wptr); +} + +static struct ftl_wbuf_entry * +ftl_acquire_wbuf_entry(struct ftl_io_channel *io_channel, int io_flags) +{ + struct ftl_wbuf_entry *entry = NULL; + uint32_t qdepth; + + if (!(io_flags & FTL_IO_INTERNAL)) { + qdepth = __atomic_fetch_add(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + if (qdepth >= io_channel->qdepth_limit) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + return NULL; + } + } + + if (spdk_ring_dequeue(io_channel->free_queue, (void **)&entry, 1) != 1) { + if (!(io_flags & FTL_IO_INTERNAL)) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + } + + return NULL; + } + + assert(entry != NULL); + + ftl_evict_cache_entry(io_channel->dev, entry); + + entry->io_flags = io_flags; + entry->addr.offset = FTL_ADDR_INVALID; + entry->lba = FTL_LBA_INVALID; + entry->band = NULL; + entry->valid = false; + + return entry; +} + +static void +ftl_release_wbuf_entry(struct ftl_wbuf_entry *entry) +{ + struct ftl_io_channel *io_channel = entry->ioch; + + if (!(entry->io_flags & FTL_IO_INTERNAL)) { + __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST); + } + + spdk_ring_enqueue(io_channel->free_queue, (void **)&entry, 1, NULL); +} + +static struct ftl_batch * +ftl_get_next_batch(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; +#define FTL_DEQUEUE_ENTRIES 128 + struct ftl_wbuf_entry *entries[FTL_DEQUEUE_ENTRIES]; + TAILQ_HEAD(, ftl_io_channel) ioch_queue; + size_t i, num_dequeued, num_remaining; + uint64_t *metadata; + + if (batch == NULL) { + batch = TAILQ_FIRST(&dev->pending_batches); + if (batch != NULL) { + TAILQ_REMOVE(&dev->pending_batches, batch, tailq); + return batch; + } + + batch = TAILQ_FIRST(&dev->free_batches); + if (spdk_unlikely(batch == NULL)) { + return NULL; + } + + assert(TAILQ_EMPTY(&batch->entries)); + assert(batch->num_entries == 0); + TAILQ_REMOVE(&dev->free_batches, batch, tailq); + } + + /* + * Keep shifting the queue to ensure fairness in IO channel selection. Each time + * ftl_get_next_batch() is called, we're starting to dequeue write buffer entries from a + * different IO channel. 
+ */ + TAILQ_INIT(&ioch_queue); + while (!TAILQ_EMPTY(&dev->ioch_queue)) { + ioch = TAILQ_FIRST(&dev->ioch_queue); + TAILQ_REMOVE(&dev->ioch_queue, ioch, tailq); + TAILQ_INSERT_TAIL(&ioch_queue, ioch, tailq); + + num_remaining = dev->xfer_size - batch->num_entries; + while (num_remaining > 0) { + num_dequeued = spdk_ring_dequeue(ioch->submit_queue, (void **)entries, + spdk_min(num_remaining, + FTL_DEQUEUE_ENTRIES)); + if (num_dequeued == 0) { + break; + } + + for (i = 0; i < num_dequeued; ++i) { + batch->iov[batch->num_entries + i].iov_base = entries[i]->payload; + batch->iov[batch->num_entries + i].iov_len = FTL_BLOCK_SIZE; + + if (batch->metadata != NULL) { + metadata = (uint64_t *)((char *)batch->metadata + + i * dev->md_size); + *metadata = entries[i]->lba; + } + + TAILQ_INSERT_TAIL(&batch->entries, entries[i], tailq); + } + + batch->num_entries += num_dequeued; + num_remaining -= num_dequeued; + } + + if (num_remaining == 0) { + break; + } + } + + TAILQ_CONCAT(&dev->ioch_queue, &ioch_queue, tailq); + + if (batch->num_entries == dev->xfer_size) { + dev->current_batch = NULL; + } else { + dev->current_batch = batch; + batch = NULL; + } + + return batch; +} + +static void +ftl_release_batch(struct spdk_ftl_dev *dev, struct ftl_batch *batch) +{ + struct ftl_wbuf_entry *entry; + + while (!TAILQ_EMPTY(&batch->entries)) { + entry = TAILQ_FIRST(&batch->entries); + TAILQ_REMOVE(&batch->entries, entry, tailq); + ftl_release_wbuf_entry(entry); + } + + batch->num_entries = 0; + TAILQ_INSERT_TAIL(&dev->free_batches, batch, tailq); +} + +static struct ftl_wbuf_entry * +ftl_get_entry_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_io_channel *ioch; + uint64_t ioch_offset, entry_offset; + + ioch_offset = addr.cache_offset & ((1 << dev->ioch_shift) - 1); + entry_offset = addr.cache_offset >> dev->ioch_shift; + ioch = dev->ioch_array[ioch_offset]; + + assert(ioch_offset < dev->conf.max_io_channels); + assert(entry_offset < ioch->num_entries); + assert(addr.cached == 1); + + return &ioch->wbuf_entries[entry_offset]; +} + +static struct ftl_addr +ftl_get_addr_from_entry(struct ftl_wbuf_entry *entry) +{ + struct ftl_io_channel *ioch = entry->ioch; + struct ftl_addr addr = {}; + + addr.cached = 1; + addr.cache_offset = (uint64_t)entry->index << ioch->dev->ioch_shift | ioch->index; + + return addr; +} + +static void +ftl_io_cmpl_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_io *io = cb_arg; + struct spdk_ftl_dev *dev = io->dev; + + if (spdk_unlikely(!success)) { + io->status = -EIO; + } + + ftl_trace_completion(dev, io, FTL_TRACE_COMPLETION_DISK); + + if (io->type == FTL_IO_WRITE && ftl_is_append_supported(dev)) { + assert(io->parent); + io->parent->addr.offset = spdk_bdev_io_get_append_location(bdev_io); + } + + ftl_io_dec_req(io); + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_halt_writes(struct spdk_ftl_dev *dev, struct ftl_band *band) +{ + struct ftl_wptr *wptr = NULL; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + if (wptr->band == band) { + break; + } + } + + /* If the band already has the high_prio flag set, other writes must */ + /* have failed earlier, so it's already taken care of. 
*/ + if (band->high_prio) { + assert(wptr == NULL); + return; + } + + ftl_band_write_failed(band); + ftl_remove_wptr(wptr); +} + +static struct ftl_wptr * +ftl_wptr_from_band(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr = NULL; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + if (wptr->band == band) { + return wptr; + } + } + + return NULL; +} + +static void +ftl_md_write_fail(struct ftl_io *io, int status) +{ + struct ftl_band *band = io->band; + struct ftl_wptr *wptr; + char buf[128]; + + wptr = ftl_wptr_from_band(band); + assert(wptr); + + SPDK_ERRLOG("Metadata write failed @addr: %s, status: %d\n", + ftl_addr2str(wptr->addr, buf, sizeof(buf)), status); + + ftl_halt_writes(io->dev, band); +} + +static void +ftl_md_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_band *band = io->band; + struct ftl_wptr *wptr; + size_t id; + + wptr = ftl_wptr_from_band(band); + assert(wptr); + + if (status) { + ftl_md_write_fail(io, status); + return; + } + + ftl_band_set_next_state(band); + if (band->state == FTL_BAND_STATE_CLOSED) { + if (ftl_dev_has_nv_cache(dev)) { + pthread_spin_lock(&nv_cache->lock); + nv_cache->num_available += ftl_band_user_blocks(band); + + if (spdk_unlikely(nv_cache->num_available > nv_cache->num_data_blocks)) { + nv_cache->num_available = nv_cache->num_data_blocks; + } + pthread_spin_unlock(&nv_cache->lock); + } + + /* + * Go through the reloc_bitmap, checking for all the bands that had its data moved + * onto current band and update their counters to allow them to be used for writing + * (once they're closed and empty). + */ + for (id = 0; id < ftl_get_num_bands(dev); ++id) { + if (spdk_bit_array_get(band->reloc_bitmap, id)) { + assert(dev->bands[id].num_reloc_bands > 0); + dev->bands[id].num_reloc_bands--; + + spdk_bit_array_clear(band->reloc_bitmap, id); + } + } + + ftl_remove_wptr(wptr); + } +} + +static int +ftl_read_next_physical_addr(struct ftl_io *io, struct ftl_addr *addr) +{ + struct spdk_ftl_dev *dev = io->dev; + size_t num_blocks, max_blocks; + + assert(ftl_io_mode_physical(io)); + assert(io->iov_pos < io->iov_cnt); + + if (io->pos == 0) { + *addr = io->addr; + } else { + *addr = ftl_band_next_xfer_addr(io->band, io->addr, io->pos); + } + + assert(!ftl_addr_invalid(*addr)); + + /* Metadata has to be read in the way it's written (jumping across */ + /* the zones in xfer_size increments) */ + if (io->flags & FTL_IO_MD) { + max_blocks = dev->xfer_size - (addr->offset % dev->xfer_size); + num_blocks = spdk_min(ftl_io_iovec_len_left(io), max_blocks); + assert(addr->offset / dev->xfer_size == + (addr->offset + num_blocks - 1) / dev->xfer_size); + } else { + num_blocks = ftl_io_iovec_len_left(io); + } + + return num_blocks; +} + +static int +ftl_wptr_close_band(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + ftl_band_set_state(band, FTL_BAND_STATE_CLOSING); + + return ftl_band_write_tail_md(band, ftl_md_write_cb); +} + +static int +ftl_wptr_open_band(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + assert(ftl_band_zone_is_first(band, wptr->zone)); + assert(band->lba_map.num_vld == 0); + + ftl_band_clear_lba_map(band); + + assert(band->state == FTL_BAND_STATE_PREP); + ftl_band_set_state(band, FTL_BAND_STATE_OPENING); + + return ftl_band_write_head_md(band, ftl_md_write_cb); +} + +static int +ftl_submit_erase(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct 
ftl_band *band = io->band; + struct ftl_addr addr = io->addr; + struct ftl_io_channel *ioch; + struct ftl_zone *zone; + int rc = 0; + size_t i; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + for (i = 0; i < io->num_blocks; ++i) { + if (i != 0) { + zone = ftl_band_next_zone(band, ftl_band_zone_from_addr(band, addr)); + assert(zone->info.state == SPDK_BDEV_ZONE_STATE_FULL); + addr.offset = zone->info.zone_id; + } + + assert(ftl_addr_get_zone_offset(dev, addr) == 0); + + ftl_trace_submission(dev, io, addr, 1); + rc = spdk_bdev_zone_management(dev->base_bdev_desc, ioch->base_ioch, addr.offset, + SPDK_BDEV_ZONE_RESET, ftl_io_cmpl_cb, io); + if (spdk_unlikely(rc)) { + ftl_io_fail(io, rc); + SPDK_ERRLOG("Vector reset failed with status: %d\n", rc); + break; + } + + ftl_io_inc_req(io); + ftl_io_advance(io, 1); + } + + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + return rc; +} + +static bool +ftl_check_core_thread(const struct spdk_ftl_dev *dev) +{ + return dev->core_thread == spdk_get_thread(); +} + +struct spdk_io_channel * +ftl_get_io_channel(const struct spdk_ftl_dev *dev) +{ + if (ftl_check_core_thread(dev)) { + return dev->ioch; + } + + return NULL; +} + +static void +ftl_erase_fail(struct ftl_io *io, int status) +{ + struct ftl_zone *zone; + struct ftl_band *band = io->band; + char buf[128]; + + SPDK_ERRLOG("Erase failed at address: %s, status: %d\n", + ftl_addr2str(io->addr, buf, sizeof(buf)), status); + + zone = ftl_band_zone_from_addr(band, io->addr); + zone->info.state = SPDK_BDEV_ZONE_STATE_OFFLINE; + ftl_band_remove_zone(band, zone); + band->tail_md_addr = ftl_band_tail_md_addr(band); +} + +static void +ftl_zone_erase_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_zone *zone; + + zone = ftl_band_zone_from_addr(io->band, io->addr); + zone->busy = false; + + if (spdk_unlikely(status)) { + ftl_erase_fail(io, status); + return; + } + + zone->info.state = SPDK_BDEV_ZONE_STATE_EMPTY; + zone->info.write_pointer = zone->info.zone_id; +} + +static int +ftl_band_erase(struct ftl_band *band) +{ + struct ftl_zone *zone; + struct ftl_io *io; + int rc = 0; + + assert(band->state == FTL_BAND_STATE_CLOSED || + band->state == FTL_BAND_STATE_FREE); + + ftl_band_set_state(band, FTL_BAND_STATE_PREP); + + CIRCLEQ_FOREACH(zone, &band->zones, circleq) { + if (zone->info.state == SPDK_BDEV_ZONE_STATE_EMPTY) { + continue; + } + + io = ftl_io_erase_init(band, 1, ftl_zone_erase_cb); + if (!io) { + rc = -ENOMEM; + break; + } + + zone->busy = true; + io->addr.offset = zone->info.zone_id; + rc = ftl_submit_erase(io); + if (rc) { + zone->busy = false; + assert(0); + /* TODO: change band's state back to close? 
*/ + break; + } + } + + return rc; +} + +static struct ftl_band * +ftl_next_write_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + /* Find a free band that has all of its data moved onto other closed bands */ + LIST_FOREACH(band, &dev->free_bands, list_entry) { + assert(band->state == FTL_BAND_STATE_FREE); + if (band->num_reloc_bands == 0 && band->num_reloc_blocks == 0) { + break; + } + } + + if (spdk_unlikely(!band)) { + return NULL; + } + + if (ftl_band_erase(band)) { + /* TODO: handle erase failure */ + return NULL; + } + + return band; +} + +static struct ftl_band * +ftl_next_wptr_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + if (!dev->next_band) { + band = ftl_next_write_band(dev); + } else { + assert(dev->next_band->state == FTL_BAND_STATE_PREP); + band = dev->next_band; + dev->next_band = NULL; + } + + return band; +} + +static struct ftl_wptr * +ftl_wptr_init(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr; + + wptr = calloc(1, sizeof(*wptr)); + if (!wptr) { + return NULL; + } + + wptr->dev = dev; + wptr->band = band; + wptr->zone = CIRCLEQ_FIRST(&band->zones); + wptr->addr.offset = wptr->zone->info.zone_id; + TAILQ_INIT(&wptr->pending_queue); + + return wptr; +} + +static int +ftl_add_direct_wptr(struct ftl_band *band) +{ + struct spdk_ftl_dev *dev = band->dev; + struct ftl_wptr *wptr; + + assert(band->state == FTL_BAND_STATE_OPEN); + + wptr = ftl_wptr_init(band); + if (!wptr) { + return -1; + } + + wptr->direct_mode = true; + + if (ftl_band_alloc_lba_map(band)) { + ftl_wptr_free(wptr); + return -1; + } + + LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: direct band %u\n", band->id); + ftl_trace_write_band(dev, band); + return 0; +} + +static void +ftl_close_direct_wptr(struct ftl_band *band) +{ + struct ftl_wptr *wptr = ftl_wptr_from_band(band); + + assert(wptr); + assert(wptr->direct_mode); + assert(band->state == FTL_BAND_STATE_CLOSED); + + ftl_band_release_lba_map(band); + + ftl_remove_wptr(wptr); +} + +int +ftl_band_set_direct_access(struct ftl_band *band, bool access) +{ + if (access) { + return ftl_add_direct_wptr(band); + } else { + ftl_close_direct_wptr(band); + return 0; + } +} + +static int +ftl_add_wptr(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + struct ftl_wptr *wptr; + + band = ftl_next_wptr_band(dev); + if (!band) { + return -1; + } + + wptr = ftl_wptr_init(band); + if (!wptr) { + return -1; + } + + if (ftl_band_write_prep(band)) { + ftl_wptr_free(wptr); + return -1; + } + + LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: band %u\n", band->id); + ftl_trace_write_band(dev, band); + return 0; +} + +static void +ftl_wptr_advance(struct ftl_wptr *wptr, size_t xfer_size) +{ + struct ftl_band *band = wptr->band; + struct spdk_ftl_dev *dev = wptr->dev; + struct spdk_ftl_conf *conf = &dev->conf; + size_t next_thld; + + if (spdk_unlikely(wptr->direct_mode)) { + return; + } + + wptr->offset += xfer_size; + next_thld = (ftl_band_num_usable_blocks(band) * conf->band_thld) / 100; + + if (ftl_band_full(band, wptr->offset)) { + ftl_band_set_state(band, FTL_BAND_STATE_FULL); + } + + wptr->zone->busy = true; + wptr->addr = ftl_band_next_xfer_addr(band, wptr->addr, xfer_size); + wptr->zone = ftl_band_next_operational_zone(band, wptr->zone); + + assert(!ftl_addr_invalid(wptr->addr)); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: pu:%lu band:%lu, offset:%lu\n", + ftl_addr_get_punit(dev, wptr->addr), + 
ftl_addr_get_band(dev, wptr->addr), + wptr->addr.offset); + + if (wptr->offset >= next_thld && !dev->next_band) { + dev->next_band = ftl_next_write_band(dev); + } +} + +static size_t +ftl_wptr_user_blocks_left(const struct ftl_wptr *wptr) +{ + return ftl_band_user_blocks_left(wptr->band, wptr->offset); +} + +static bool +ftl_wptr_ready(struct ftl_wptr *wptr) +{ + struct ftl_band *band = wptr->band; + + /* TODO: add handling of empty bands */ + + if (spdk_unlikely(!ftl_zone_is_writable(wptr->dev, wptr->zone))) { + /* Erasing band may fail after it was assigned to wptr. */ + if (spdk_unlikely(wptr->zone->info.state == SPDK_BDEV_ZONE_STATE_OFFLINE)) { + ftl_wptr_advance(wptr, wptr->dev->xfer_size); + } + return false; + } + + /* If we're in the process of writing metadata, wait till it is */ + /* completed. */ + /* TODO: we should probably change bands once we're writing tail md */ + if (ftl_band_state_changing(band)) { + return false; + } + + if (band->state == FTL_BAND_STATE_FULL) { + if (wptr->num_outstanding == 0) { + if (ftl_wptr_close_band(wptr)) { + /* TODO: need recovery here */ + assert(false); + } + } + + return false; + } + + if (band->state != FTL_BAND_STATE_OPEN) { + if (ftl_wptr_open_band(wptr)) { + /* TODO: need recovery here */ + assert(false); + } + + return false; + } + + return true; +} + +int +ftl_flush_active_bands(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_wptr *wptr; + struct ftl_band_flush *flush; + + assert(ftl_get_core_thread(dev) == spdk_get_thread()); + + flush = calloc(1, sizeof(*flush)); + if (spdk_unlikely(!flush)) { + return -ENOMEM; + } + + LIST_INSERT_HEAD(&dev->band_flush_list, flush, list_entry); + + flush->cb_fn = cb_fn; + flush->cb_arg = cb_arg; + flush->dev = dev; + + LIST_FOREACH(wptr, &dev->wptr_list, list_entry) { + wptr->flush = true; + flush->num_bands++; + } + + return 0; +} + +static const struct spdk_ftl_limit * +ftl_get_limit(const struct spdk_ftl_dev *dev, int type) +{ + assert(type < SPDK_FTL_LIMIT_MAX); + return &dev->conf.limits[type]; +} + +static bool +ftl_cache_lba_valid(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry) +{ + struct ftl_addr addr; + + /* If the LBA is invalid don't bother checking the md and l2p */ + if (spdk_unlikely(entry->lba == FTL_LBA_INVALID)) { + return false; + } + + addr = ftl_l2p_get(dev, entry->lba); + if (!(ftl_addr_cached(addr) && entry == ftl_get_entry_from_addr(dev, addr))) { + return false; + } + + return true; +} + +void +ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry) +{ + pthread_spin_lock(&entry->lock); + + if (!entry->valid) { + goto unlock; + } + + /* If the l2p wasn't updated and still points at the entry, fill it with the */ + /* on-disk address and clear the cache status bit. Otherwise, skip the l2p update */ + /* and just clear the cache status. 
*/ + if (!ftl_cache_lba_valid(dev, entry)) { + goto clear; + } + + ftl_l2p_set(dev, entry->lba, entry->addr); +clear: + entry->valid = false; +unlock: + pthread_spin_unlock(&entry->lock); +} + +static void +ftl_pad_wbuf(struct spdk_ftl_dev *dev, size_t size) +{ + struct ftl_wbuf_entry *entry; + struct ftl_io_channel *ioch; + int flags = FTL_IO_PAD | FTL_IO_INTERNAL; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + for (size_t i = 0; i < size; ++i) { + entry = ftl_acquire_wbuf_entry(ioch, flags); + if (!entry) { + break; + } + + entry->lba = FTL_LBA_INVALID; + entry->addr = ftl_to_addr(FTL_ADDR_INVALID); + memset(entry->payload, 0, FTL_BLOCK_SIZE); + + spdk_ring_enqueue(ioch->submit_queue, (void **)&entry, 1, NULL); + } +} + +static void +ftl_remove_free_bands(struct spdk_ftl_dev *dev) +{ + while (!LIST_EMPTY(&dev->free_bands)) { + LIST_REMOVE(LIST_FIRST(&dev->free_bands), list_entry); + } + + dev->next_band = NULL; +} + +static void +ftl_wptr_pad_band(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size, pad_size, blocks_left; + + size = batch != NULL ? batch->num_entries : 0; + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + blocks_left = ftl_wptr_user_blocks_left(wptr); + assert(size <= blocks_left); + assert(blocks_left % dev->xfer_size == 0); + pad_size = spdk_min(blocks_left - size, spdk_ring_count(ioch->free_queue)); + + ftl_pad_wbuf(dev, pad_size); +} + +static void +ftl_wptr_process_shutdown(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size; + + size = batch != NULL ? 
batch->num_entries : 0; + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + if (size >= dev->xfer_size) { + return; + } + + /* If we reach this point we need to remove free bands */ + /* and pad current wptr band to the end */ + ftl_remove_free_bands(dev); + ftl_wptr_pad_band(wptr); +} + +static int +ftl_shutdown_complete(struct spdk_ftl_dev *dev) +{ + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(dev->ioch); + + return !__atomic_load_n(&dev->num_inflight, __ATOMIC_SEQ_CST) && + dev->num_io_channels == 1 && LIST_EMPTY(&dev->wptr_list) && + TAILQ_EMPTY(&ioch->retry_queue); +} + +void +ftl_apply_limits(struct spdk_ftl_dev *dev) +{ + const struct spdk_ftl_limit *limit; + struct ftl_io_channel *ioch; + struct ftl_stats *stats = &dev->stats; + uint32_t qdepth_limit = 100; + int i; + + /* Clear existing limit */ + dev->limit = SPDK_FTL_LIMIT_MAX; + + for (i = SPDK_FTL_LIMIT_CRIT; i < SPDK_FTL_LIMIT_MAX; ++i) { + limit = ftl_get_limit(dev, i); + + if (dev->num_free <= limit->thld) { + qdepth_limit = limit->limit; + stats->limits[i]++; + dev->limit = i; + break; + } + } + + ftl_trace_limits(dev, dev->limit, dev->num_free); + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + __atomic_store_n(&ioch->qdepth_limit, (qdepth_limit * ioch->num_entries) / 100, + __ATOMIC_SEQ_CST); + } +} + +static int +ftl_invalidate_addr_unlocked(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_band *band = ftl_band_from_addr(dev, addr); + struct ftl_lba_map *lba_map = &band->lba_map; + uint64_t offset; + + offset = ftl_band_block_offset_from_addr(band, addr); + + /* The bit might be already cleared if two writes are scheduled to the */ + /* same LBA at the same time */ + if (spdk_bit_array_get(lba_map->vld, offset)) { + assert(lba_map->num_vld > 0); + spdk_bit_array_clear(lba_map->vld, offset); + lba_map->num_vld--; + return 1; + } + + return 0; +} + +int +ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr) +{ + struct ftl_band *band; + int rc; + + assert(!ftl_addr_cached(addr)); + band = ftl_band_from_addr(dev, addr); + + pthread_spin_lock(&band->lba_map.lock); + rc = ftl_invalidate_addr_unlocked(dev, addr); + pthread_spin_unlock(&band->lba_map.lock); + + return rc; +} + +static int +ftl_read_retry(int rc) +{ + return rc == -EAGAIN; +} + +static int +ftl_read_canceled(int rc) +{ + return rc == -EFAULT || rc == 0; +} + +static int +ftl_cache_read(struct ftl_io *io, uint64_t lba, + struct ftl_addr addr, void *buf) +{ + struct ftl_wbuf_entry *entry; + struct ftl_addr naddr; + int rc = 0; + + entry = ftl_get_entry_from_addr(io->dev, addr); + pthread_spin_lock(&entry->lock); + + naddr = ftl_l2p_get(io->dev, lba); + if (addr.offset != naddr.offset) { + rc = -1; + goto out; + } + + memcpy(buf, entry->payload, FTL_BLOCK_SIZE); +out: + pthread_spin_unlock(&entry->lock); + return rc; +} + +static int +ftl_read_next_logical_addr(struct ftl_io *io, struct ftl_addr *addr) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_addr next_addr; + size_t i; + + *addr = ftl_l2p_get(dev, ftl_io_current_lba(io)); + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Read addr:%lx, lba:%lu\n", + addr->offset, ftl_io_current_lba(io)); + + /* If the address is invalid, skip it (the buffer should already be zero'ed) */ + if (ftl_addr_invalid(*addr)) { + return -EFAULT; + } + + if (ftl_addr_cached(*addr)) { + if (!ftl_cache_read(io, ftl_io_current_lba(io), *addr, ftl_io_iovec_addr(io))) { + return 0; + } + + /* If the state changed, we have to re-read the l2p */ + return 
-EAGAIN; + } + + for (i = 1; i < ftl_io_iovec_len_left(io); ++i) { + next_addr = ftl_l2p_get(dev, ftl_io_get_lba(io, io->pos + i)); + + if (ftl_addr_invalid(next_addr) || ftl_addr_cached(next_addr)) { + break; + } + + if (addr->offset + i != next_addr.offset) { + break; + } + } + + return i; +} + +static int +ftl_submit_read(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_addr addr; + int rc = 0, num_blocks; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + assert(LIST_EMPTY(&io->children)); + + while (io->pos < io->num_blocks) { + if (ftl_io_mode_physical(io)) { + num_blocks = rc = ftl_read_next_physical_addr(io, &addr); + } else { + num_blocks = rc = ftl_read_next_logical_addr(io, &addr); + } + + /* We might need to retry the read from scratch (e.g. */ + /* because write was under way and completed before */ + /* we could read it from the write buffer */ + if (ftl_read_retry(rc)) { + continue; + } + + /* We don't have to schedule the read, as it was read from cache */ + if (ftl_read_canceled(rc)) { + ftl_io_advance(io, 1); + ftl_trace_completion(io->dev, io, rc ? FTL_TRACE_COMPLETION_INVALID : + FTL_TRACE_COMPLETION_CACHE); + rc = 0; + continue; + } + + assert(num_blocks > 0); + + ftl_trace_submission(dev, io, addr, num_blocks); + rc = spdk_bdev_read_blocks(dev->base_bdev_desc, ioch->base_ioch, + ftl_io_iovec_addr(io), + addr.offset, + num_blocks, ftl_io_cmpl_cb, io); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + rc = 0; + } else { + ftl_io_fail(io, rc); + } + break; + } + + ftl_io_inc_req(io); + ftl_io_advance(io, num_blocks); + } + + /* If we didn't have to read anything from the device, */ + /* complete the request right away */ + if (ftl_io_done(io)) { + ftl_io_complete(io); + } + + return rc; +} + +static void +ftl_complete_flush(struct ftl_flush *flush) +{ + assert(flush->num_req == 0); + LIST_REMOVE(flush, list_entry); + + flush->cb.fn(flush->cb.ctx, 0); + + spdk_bit_array_free(&flush->bmap); + free(flush); +} + +static void +ftl_process_flush(struct spdk_ftl_dev *dev, struct ftl_batch *batch) +{ + struct ftl_flush *flush, *tflush; + size_t offset; + + LIST_FOREACH_SAFE(flush, &dev->flush_list, list_entry, tflush) { + offset = batch->index; + + if (spdk_bit_array_get(flush->bmap, offset)) { + spdk_bit_array_clear(flush->bmap, offset); + if (!(--flush->num_req)) { + ftl_complete_flush(flush); + } + } + } +} + +static void +ftl_nv_cache_wrap_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_nv_cache *nv_cache = cb_arg; + + if (!success) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n"); + /* TODO: go into read-only mode */ + assert(0); + } + + pthread_spin_lock(&nv_cache->lock); + nv_cache->ready = true; + pthread_spin_unlock(&nv_cache->lock); + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_nv_cache_wrap(void *ctx) +{ + struct ftl_nv_cache *nv_cache = ctx; + int rc; + + rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_wrap_cb, nv_cache); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Unable to write non-volatile cache metadata header: %s\n", + spdk_strerror(-rc)); + /* TODO: go into read-only mode */ + assert(0); + } +} + +static uint64_t +ftl_reserve_nv_cache(struct ftl_nv_cache *nv_cache, size_t *num_blocks, unsigned int *phase) +{ + struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + 
uint64_t num_available, cache_size, cache_addr = FTL_LBA_INVALID; + + cache_size = spdk_bdev_get_num_blocks(bdev); + + pthread_spin_lock(&nv_cache->lock); + if (spdk_unlikely(nv_cache->num_available == 0 || !nv_cache->ready)) { + goto out; + } + + num_available = spdk_min(nv_cache->num_available, *num_blocks); + num_available = spdk_min(num_available, dev->conf.nv_cache.max_request_cnt); + + if (spdk_unlikely(nv_cache->current_addr + num_available > cache_size)) { + *num_blocks = cache_size - nv_cache->current_addr; + } else { + *num_blocks = num_available; + } + + cache_addr = nv_cache->current_addr; + nv_cache->current_addr += *num_blocks; + nv_cache->num_available -= *num_blocks; + *phase = nv_cache->phase; + + if (nv_cache->current_addr == spdk_bdev_get_num_blocks(bdev)) { + nv_cache->current_addr = FTL_NV_CACHE_DATA_OFFSET; + nv_cache->phase = ftl_nv_cache_next_phase(nv_cache->phase); + nv_cache->ready = false; + spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_nv_cache_wrap, nv_cache); + } +out: + pthread_spin_unlock(&nv_cache->lock); + return cache_addr; +} + +static struct ftl_io * +ftl_alloc_io_nv_cache(struct ftl_io *parent, size_t num_blocks) +{ + struct ftl_io_init_opts opts = { + .dev = parent->dev, + .parent = parent, + .iovcnt = 0, + .num_blocks = num_blocks, + .flags = parent->flags | FTL_IO_CACHE, + }; + + return ftl_io_init_internal(&opts); +} + +static void +ftl_nv_cache_submit_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct ftl_io *io = cb_arg; + struct ftl_nv_cache *nv_cache = &io->dev->nv_cache; + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("Non-volatile cache write failed at %"PRIx64"\n", io->addr.offset); + io->status = -EIO; + } + + ftl_io_dec_req(io); + if (ftl_io_done(io)) { + spdk_mempool_put(nv_cache->md_pool, io->md); + ftl_io_complete(io); + } + + spdk_bdev_free_io(bdev_io); +} + +static void +ftl_submit_nv_cache(void *ctx) +{ + struct ftl_io *io = ctx; + struct spdk_ftl_dev *dev = io->dev; + struct spdk_thread *thread; + struct ftl_nv_cache *nv_cache = &dev->nv_cache; + struct ftl_io_channel *ioch; + int rc; + + ioch = ftl_io_channel_get_ctx(io->ioch); + thread = spdk_io_channel_get_thread(io->ioch); + + rc = spdk_bdev_write_blocks_with_md(nv_cache->bdev_desc, ioch->cache_ioch, + ftl_io_iovec_addr(io), io->md, io->addr.offset, + io->num_blocks, ftl_nv_cache_submit_cb, io); + if (rc == -ENOMEM) { + spdk_thread_send_msg(thread, ftl_submit_nv_cache, io); + return; + } else if (rc) { + SPDK_ERRLOG("Write to persistent cache failed: %s (%"PRIu64", %"PRIu64")\n", + spdk_strerror(-rc), io->addr.offset, io->num_blocks); + spdk_mempool_put(nv_cache->md_pool, io->md); + io->status = -EIO; + ftl_io_complete(io); + return; + } + + ftl_io_advance(io, io->num_blocks); + ftl_io_inc_req(io); +} + +static void +ftl_nv_cache_fill_md(struct ftl_io *io, unsigned int phase) +{ + struct spdk_bdev *bdev; + struct ftl_nv_cache *nv_cache = &io->dev->nv_cache; + uint64_t block_off, lba; + void *md_buf = io->md; + + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + for (block_off = 0; block_off < io->num_blocks; ++block_off) { + lba = ftl_nv_cache_pack_lba(ftl_io_get_lba(io, block_off), phase); + memcpy(md_buf, &lba, sizeof(lba)); + md_buf += spdk_bdev_get_md_size(bdev); + } +} + +static void +_ftl_write_nv_cache(void *ctx) +{ + struct ftl_io *child, *io = ctx; + struct spdk_ftl_dev *dev = io->dev; + struct spdk_thread *thread; + unsigned int phase; + uint64_t num_blocks; + + thread = spdk_io_channel_get_thread(io->ioch); + + while (io->pos < 
io->num_blocks) { + num_blocks = ftl_io_iovec_len_left(io); + + child = ftl_alloc_io_nv_cache(io, num_blocks); + if (spdk_unlikely(!child)) { + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + return; + } + + child->md = spdk_mempool_get(dev->nv_cache.md_pool); + if (spdk_unlikely(!child->md)) { + ftl_io_free(child); + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + break; + } + + /* Reserve area on the write buffer cache */ + child->addr.offset = ftl_reserve_nv_cache(&dev->nv_cache, &num_blocks, &phase); + if (child->addr.offset == FTL_LBA_INVALID) { + spdk_mempool_put(dev->nv_cache.md_pool, child->md); + ftl_io_free(child); + spdk_thread_send_msg(thread, _ftl_write_nv_cache, io); + break; + } + + /* Shrink the IO if there isn't enough room in the cache to fill the whole iovec */ + if (spdk_unlikely(num_blocks != ftl_io_iovec_len_left(io))) { + ftl_io_shrink_iovec(child, num_blocks); + } + + ftl_nv_cache_fill_md(child, phase); + ftl_submit_nv_cache(child); + } + + if (ftl_io_done(io)) { + ftl_io_complete(io); + } +} + +static void +ftl_write_nv_cache(struct ftl_io *parent) +{ + ftl_io_reset(parent); + parent->flags |= FTL_IO_CACHE; + _ftl_write_nv_cache(parent); +} + +int +ftl_nv_cache_write_header(struct ftl_nv_cache *nv_cache, bool shutdown, + spdk_bdev_io_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct ftl_nv_cache_header *hdr = nv_cache->dma_buf; + struct spdk_bdev *bdev; + struct ftl_io_channel *ioch; + + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + + memset(hdr, 0, spdk_bdev_get_block_size(bdev)); + + hdr->phase = (uint8_t)nv_cache->phase; + hdr->size = spdk_bdev_get_num_blocks(bdev); + hdr->uuid = dev->uuid; + hdr->version = FTL_NV_CACHE_HEADER_VERSION; + hdr->current_addr = shutdown ? 
nv_cache->current_addr : FTL_LBA_INVALID; + hdr->checksum = spdk_crc32c_update(hdr, offsetof(struct ftl_nv_cache_header, checksum), 0); + + return spdk_bdev_write_blocks(nv_cache->bdev_desc, ioch->cache_ioch, hdr, 0, 1, + cb_fn, cb_arg); +} + +int +ftl_nv_cache_scrub(struct ftl_nv_cache *nv_cache, spdk_bdev_io_completion_cb cb_fn, void *cb_arg) +{ + struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache); + struct ftl_io_channel *ioch; + struct spdk_bdev *bdev; + + ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev)); + bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc); + + return spdk_bdev_write_zeroes_blocks(nv_cache->bdev_desc, ioch->cache_ioch, 1, + spdk_bdev_get_num_blocks(bdev) - 1, + cb_fn, cb_arg); +} + +static void +ftl_write_fail(struct ftl_io *io, int status) +{ + struct ftl_batch *batch = io->batch; + struct spdk_ftl_dev *dev = io->dev; + struct ftl_wbuf_entry *entry; + struct ftl_band *band; + char buf[128]; + + entry = TAILQ_FIRST(&batch->entries); + + band = ftl_band_from_addr(io->dev, entry->addr); + SPDK_ERRLOG("Write failed @addr: %s, status: %d\n", + ftl_addr2str(entry->addr, buf, sizeof(buf)), status); + + /* Close the band and, halt wptr and defrag */ + ftl_halt_writes(dev, band); + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + /* Invalidate meta set by process_writes() */ + ftl_invalidate_addr(dev, entry->addr); + } + + /* Reset the batch back to the write buffer to resend it later */ + TAILQ_INSERT_TAIL(&dev->pending_batches, batch, tailq); +} + +static void +ftl_write_cb(struct ftl_io *io, void *arg, int status) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_batch *batch = io->batch; + struct ftl_wbuf_entry *entry; + struct ftl_band *band; + struct ftl_addr prev_addr, addr = io->addr; + + if (status) { + ftl_write_fail(io, status); + return; + } + + assert(io->num_blocks == dev->xfer_size); + assert(!(io->flags & FTL_IO_MD)); + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + band = entry->band; + if (!(entry->io_flags & FTL_IO_PAD)) { + /* Verify that the LBA is set for user blocks */ + assert(entry->lba != FTL_LBA_INVALID); + } + + if (band != NULL) { + assert(band->num_reloc_blocks > 0); + band->num_reloc_blocks--; + } + + entry->addr = addr; + if (entry->lba != FTL_LBA_INVALID) { + pthread_spin_lock(&entry->lock); + prev_addr = ftl_l2p_get(dev, entry->lba); + + /* If the l2p was updated in the meantime, don't update band's metadata */ + if (ftl_addr_cached(prev_addr) && + entry == ftl_get_entry_from_addr(dev, prev_addr)) { + /* Setting entry's cache bit needs to be done after metadata */ + /* within the band is updated to make sure that writes */ + /* invalidating the entry clear the metadata as well */ + ftl_band_set_addr(io->band, entry->lba, entry->addr); + entry->valid = true; + } + pthread_spin_unlock(&entry->lock); + } + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write addr:%lu, lba:%lu\n", + entry->addr.offset, entry->lba); + + addr = ftl_band_next_addr(io->band, addr, 1); + } + + ftl_process_flush(dev, batch); + ftl_release_batch(dev, batch); +} + +static void +ftl_update_stats(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry) +{ + if (!(entry->io_flags & FTL_IO_INTERNAL)) { + dev->stats.write_user++; + } + dev->stats.write_total++; +} + +static void +ftl_update_l2p(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry, + struct ftl_addr addr) +{ + struct ftl_addr prev_addr; + struct ftl_wbuf_entry *prev; + struct ftl_band *band; + int valid; + bool io_weak = entry->io_flags & FTL_IO_WEAK; + + 
prev_addr = ftl_l2p_get(dev, entry->lba); + if (ftl_addr_invalid(prev_addr)) { + ftl_l2p_set(dev, entry->lba, addr); + return; + } + + if (ftl_addr_cached(prev_addr)) { + prev = ftl_get_entry_from_addr(dev, prev_addr); + pthread_spin_lock(&prev->lock); + + /* Re-read the L2P under the lock to protect against updates */ + /* to this LBA from other threads */ + prev_addr = ftl_l2p_get(dev, entry->lba); + + /* If the entry is no longer in cache, another write has been */ + /* scheduled in the meantime, so we can return to evicted path */ + if (!ftl_addr_cached(prev_addr)) { + pthread_spin_unlock(&prev->lock); + goto evicted; + } + + /* + * Relocating block could still reside in cache due to fact that write + * buffers are independent for each IO channel and enough amount of data + * (write unit size) must be collected before it will be submitted to lower + * layer. + * When previous entry wasn't overwritten invalidate old address and entry. + * Otherwise skip relocating block. + */ + if (io_weak && + /* Check if prev_addr was updated in meantime */ + !(ftl_addr_cmp(prev_addr, ftl_get_addr_from_entry(prev)) && + /* Check if relocating address it the same as in previous entry */ + ftl_addr_cmp(prev->addr, entry->addr))) { + pthread_spin_unlock(&prev->lock); + return; + } + + /* + * If previous entry is part of cache and was written into disk remove + * and invalidate it + */ + if (prev->valid) { + ftl_invalidate_addr(dev, prev->addr); + prev->valid = false; + } + + ftl_l2p_set(dev, entry->lba, addr); + pthread_spin_unlock(&prev->lock); + return; + } + +evicted: + /* + * If the L2P's physical address is different than what we expected we don't need to + * do anything (someone's already overwritten our data). + */ + if (io_weak && !ftl_addr_cmp(prev_addr, entry->addr)) { + return; + } + + /* Lock the band containing previous physical address. This assures atomic changes to */ + /* the L2P as wall as metadata. The valid bits in metadata are used to */ + /* check weak writes validity. */ + band = ftl_band_from_addr(dev, prev_addr); + pthread_spin_lock(&band->lba_map.lock); + + valid = ftl_invalidate_addr_unlocked(dev, prev_addr); + + /* If the address has been invalidated already, we don't want to update */ + /* the L2P for weak writes, as it means the write is no longer valid. 
*/ + if (!io_weak || valid) { + ftl_l2p_set(dev, entry->lba, addr); + } + + pthread_spin_unlock(&band->lba_map.lock); +} + +static struct ftl_io * +ftl_io_init_child_write(struct ftl_io *parent, struct ftl_addr addr, ftl_io_fn cb) +{ + struct ftl_io *io; + struct spdk_ftl_dev *dev = parent->dev; + struct ftl_io_init_opts opts = { + .dev = dev, + .io = NULL, + .parent = parent, + .band = parent->band, + .size = sizeof(struct ftl_io), + .flags = 0, + .type = parent->type, + .num_blocks = dev->xfer_size, + .cb_fn = cb, + .iovcnt = 0, + }; + + io = ftl_io_init_internal(&opts); + if (!io) { + return NULL; + } + + io->addr = addr; + + return io; +} + +static void +ftl_io_child_write_cb(struct ftl_io *io, void *ctx, int status) +{ + struct ftl_zone *zone; + struct ftl_wptr *wptr; + + zone = ftl_band_zone_from_addr(io->band, io->addr); + wptr = ftl_wptr_from_band(io->band); + + zone->busy = false; + zone->info.write_pointer += io->num_blocks; + + if (zone->info.write_pointer == zone->info.zone_id + zone->info.capacity) { + zone->info.state = SPDK_BDEV_ZONE_STATE_FULL; + } + + /* If some other write on the same band failed the write pointer would already be freed */ + if (spdk_likely(wptr)) { + wptr->num_outstanding--; + } +} + +static int +ftl_submit_child_write(struct ftl_wptr *wptr, struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_io *child; + struct ftl_addr addr; + int rc; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + if (spdk_likely(!wptr->direct_mode)) { + addr = wptr->addr; + } else { + assert(io->flags & FTL_IO_DIRECT_ACCESS); + assert(ftl_addr_get_band(dev, io->addr) == wptr->band->id); + addr = io->addr; + } + + /* Split IO to child requests and release zone immediately after child is completed */ + child = ftl_io_init_child_write(io, addr, ftl_io_child_write_cb); + if (!child) { + return -EAGAIN; + } + + wptr->num_outstanding++; + + if (ftl_is_append_supported(dev)) { + rc = spdk_bdev_zone_appendv(dev->base_bdev_desc, ioch->base_ioch, + child->iov, child->iov_cnt, + ftl_addr_get_zone_slba(dev, addr), + dev->xfer_size, ftl_io_cmpl_cb, child); + } else { + rc = spdk_bdev_writev_blocks(dev->base_bdev_desc, ioch->base_ioch, + child->iov, child->iov_cnt, addr.offset, + dev->xfer_size, ftl_io_cmpl_cb, child); + } + + if (rc) { + wptr->num_outstanding--; + ftl_io_fail(child, rc); + ftl_io_complete(child); + SPDK_ERRLOG("spdk_bdev_write_blocks_with_md failed with status:%d, addr:%lu\n", + rc, addr.offset); + return -EIO; + } + + ftl_io_inc_req(child); + ftl_io_advance(child, dev->xfer_size); + + return 0; +} + +static int +ftl_submit_write(struct ftl_wptr *wptr, struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + int rc = 0; + + assert(io->num_blocks % dev->xfer_size == 0); + + while (io->iov_pos < io->iov_cnt) { + /* There are no guarantees of the order of completion of NVMe IO submission queue */ + /* so wait until zone is not busy before submitting another write */ + if (!ftl_is_append_supported(dev) && wptr->zone->busy) { + TAILQ_INSERT_TAIL(&wptr->pending_queue, io, ioch_entry); + rc = -EAGAIN; + break; + } + + rc = ftl_submit_child_write(wptr, io); + if (spdk_unlikely(rc)) { + if (rc == -EAGAIN) { + TAILQ_INSERT_TAIL(&wptr->pending_queue, io, ioch_entry); + } else { + ftl_io_fail(io, rc); + } + break; + } + + ftl_trace_submission(dev, io, wptr->addr, dev->xfer_size); + ftl_wptr_advance(wptr, dev->xfer_size); + } + + if (ftl_io_done(io)) { + /* Parent IO will complete after all children are completed */ + 
ftl_io_complete(io); + } + + return rc; +} + +static void +ftl_flush_pad_batch(struct spdk_ftl_dev *dev) +{ + struct ftl_batch *batch = dev->current_batch; + struct ftl_io_channel *ioch; + size_t size = 0, num_entries = 0; + + assert(batch != NULL); + assert(batch->num_entries < dev->xfer_size); + + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + size += spdk_ring_count(ioch->submit_queue); + } + + num_entries = dev->xfer_size - batch->num_entries; + if (size < num_entries) { + ftl_pad_wbuf(dev, num_entries - size); + } +} + +static bool +ftl_check_io_channel_flush(struct spdk_ftl_dev *dev) +{ + struct ftl_io_channel *ioch; + + TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) { + if (ioch->flush && spdk_ring_count(ioch->free_queue) != ioch->num_entries) { + return true; + } + } + + return false; +} + +static int +ftl_wptr_process_writes(struct ftl_wptr *wptr) +{ + struct spdk_ftl_dev *dev = wptr->dev; + struct ftl_batch *batch; + struct ftl_wbuf_entry *entry; + struct ftl_io *io; + + if (spdk_unlikely(!TAILQ_EMPTY(&wptr->pending_queue))) { + io = TAILQ_FIRST(&wptr->pending_queue); + TAILQ_REMOVE(&wptr->pending_queue, io, ioch_entry); + + if (ftl_submit_write(wptr, io) == -EAGAIN) { + return 0; + } + } + + /* Make sure the band is prepared for writing */ + if (!ftl_wptr_ready(wptr)) { + return 0; + } + + if (dev->halt) { + ftl_wptr_process_shutdown(wptr); + } + + if (spdk_unlikely(wptr->flush)) { + ftl_wptr_pad_band(wptr); + } + + batch = ftl_get_next_batch(dev); + if (!batch) { + /* If there are queued flush requests we need to pad the write buffer to */ + /* force out remaining entries */ + if (!LIST_EMPTY(&dev->flush_list) || ftl_check_io_channel_flush(dev)) { + ftl_flush_pad_batch(dev); + } + + return 0; + } + + io = ftl_io_wbuf_init(dev, wptr->addr, wptr->band, batch, ftl_write_cb); + if (!io) { + goto error; + } + + TAILQ_FOREACH(entry, &batch->entries, tailq) { + /* Update band's relocation stats if the IO comes from reloc */ + if (entry->io_flags & FTL_IO_WEAK) { + if (!spdk_bit_array_get(wptr->band->reloc_bitmap, entry->band->id)) { + spdk_bit_array_set(wptr->band->reloc_bitmap, entry->band->id); + entry->band->num_reloc_bands++; + } + } + + ftl_trace_wbuf_pop(dev, entry); + ftl_update_stats(dev, entry); + } + + SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write addr:%lx\n", wptr->addr.offset); + + if (ftl_submit_write(wptr, io)) { + /* TODO: we need some recovery here */ + assert(0 && "Write submit failed"); + if (ftl_io_done(io)) { + ftl_io_free(io); + } + } + + return dev->xfer_size; +error: + TAILQ_INSERT_TAIL(&dev->pending_batches, batch, tailq); + return 0; +} + +static int +ftl_process_writes(struct spdk_ftl_dev *dev) +{ + struct ftl_wptr *wptr, *twptr; + size_t num_active = 0; + enum ftl_band_state state; + + LIST_FOREACH_SAFE(wptr, &dev->wptr_list, list_entry, twptr) { + ftl_wptr_process_writes(wptr); + state = wptr->band->state; + + if (state != FTL_BAND_STATE_FULL && + state != FTL_BAND_STATE_CLOSING && + state != FTL_BAND_STATE_CLOSED) { + num_active++; + } + } + + if (num_active < 1) { + ftl_add_wptr(dev); + } + + return 0; +} + +static void +ftl_fill_wbuf_entry(struct ftl_wbuf_entry *entry, struct ftl_io *io) +{ + memcpy(entry->payload, ftl_io_iovec_addr(io), FTL_BLOCK_SIZE); + + if (entry->io_flags & FTL_IO_WEAK) { + entry->band = ftl_band_from_addr(io->dev, io->addr); + entry->addr = ftl_band_next_addr(entry->band, io->addr, io->pos); + entry->band->num_reloc_blocks++; + } + + entry->trace = io->trace; + entry->lba = ftl_io_current_lba(io); +} + +static int +ftl_wbuf_fill(struct 
ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch; + struct ftl_wbuf_entry *entry; + + ioch = ftl_io_channel_get_ctx(io->ioch); + + while (io->pos < io->num_blocks) { + if (ftl_io_current_lba(io) == FTL_LBA_INVALID) { + ftl_io_advance(io, 1); + continue; + } + + entry = ftl_acquire_wbuf_entry(ioch, io->flags); + if (!entry) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + return 0; + } + + ftl_fill_wbuf_entry(entry, io); + + ftl_trace_wbuf_fill(dev, io); + ftl_update_l2p(dev, entry, ftl_get_addr_from_entry(entry)); + ftl_io_advance(io, 1); + + /* Needs to be done after L2P is updated to avoid race with */ + /* write completion callback when it's processed faster than */ + /* L2P is set in update_l2p(). */ + spdk_ring_enqueue(ioch->submit_queue, (void **)&entry, 1, NULL); + } + + if (ftl_io_done(io)) { + if (ftl_dev_has_nv_cache(dev) && !(io->flags & FTL_IO_BYPASS_CACHE)) { + ftl_write_nv_cache(io); + } else { + TAILQ_INSERT_TAIL(&ioch->write_cmpl_queue, io, ioch_entry); + } + } + + return 0; +} + +static bool +ftl_dev_needs_defrag(struct spdk_ftl_dev *dev) +{ + const struct spdk_ftl_limit *limit = ftl_get_limit(dev, SPDK_FTL_LIMIT_START); + + if (ftl_reloc_is_halted(dev->reloc)) { + return false; + } + + if (ftl_reloc_is_defrag_active(dev->reloc)) { + return false; + } + + if (dev->num_free <= limit->thld) { + return true; + } + + return false; +} + +static double +ftl_band_calc_merit(struct ftl_band *band, size_t *threshold_valid) +{ + size_t usable, valid, invalid; + double vld_ratio; + + /* If the band doesn't have any usable blocks it's of no use */ + usable = ftl_band_num_usable_blocks(band); + if (usable == 0) { + return 0.0; + } + + valid = threshold_valid ? (usable - *threshold_valid) : band->lba_map.num_vld; + invalid = usable - valid; + + /* Add one to avoid division by 0 */ + vld_ratio = (double)invalid / (double)(valid + 1); + return vld_ratio * ftl_band_age(band); +} + +static bool +ftl_band_needs_defrag(struct ftl_band *band, struct spdk_ftl_dev *dev) +{ + struct spdk_ftl_conf *conf = &dev->conf; + size_t thld_vld; + + /* If we're in dire need of free bands, every band is worth defragging */ + if (ftl_current_limit(dev) == SPDK_FTL_LIMIT_CRIT) { + return true; + } + + thld_vld = (ftl_band_num_usable_blocks(band) * conf->invalid_thld) / 100; + + return band->merit > ftl_band_calc_merit(band, &thld_vld); +} + +static struct ftl_band * +ftl_select_defrag_band(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band, *mband = NULL; + double merit = 0; + + LIST_FOREACH(band, &dev->shut_bands, list_entry) { + assert(band->state == FTL_BAND_STATE_CLOSED); + band->merit = ftl_band_calc_merit(band, NULL); + if (band->merit > merit) { + merit = band->merit; + mband = band; + } + } + + if (mband && !ftl_band_needs_defrag(mband, dev)) { + mband = NULL; + } + + return mband; +} + +static void +ftl_process_relocs(struct spdk_ftl_dev *dev) +{ + struct ftl_band *band; + + if (ftl_dev_needs_defrag(dev)) { + band = ftl_select_defrag_band(dev); + if (band) { + ftl_reloc_add(dev->reloc, band, 0, ftl_get_num_blocks_in_band(dev), 0, true); + ftl_trace_defrag_band(dev, band); + } + } + + ftl_reloc(dev->reloc); +} + +int +ftl_current_limit(const struct spdk_ftl_dev *dev) +{ + return dev->limit; +} + +void +spdk_ftl_dev_get_attrs(const struct spdk_ftl_dev *dev, struct spdk_ftl_attrs *attrs) +{ + attrs->uuid = dev->uuid; + attrs->num_blocks = dev->num_lbas; + attrs->block_size = FTL_BLOCK_SIZE; + attrs->num_zones = ftl_get_num_zones(dev); + attrs->zone_size = 
ftl_get_num_blocks_in_zone(dev); + attrs->conf = dev->conf; + attrs->base_bdev = spdk_bdev_get_name(spdk_bdev_desc_get_bdev(dev->base_bdev_desc)); + + attrs->cache_bdev = NULL; + if (dev->nv_cache.bdev_desc) { + attrs->cache_bdev = spdk_bdev_get_name( + spdk_bdev_desc_get_bdev(dev->nv_cache.bdev_desc)); + } +} + +static void +_ftl_io_write(void *ctx) +{ + ftl_io_write((struct ftl_io *)ctx); +} + +static int +ftl_submit_write_leaf(struct ftl_io *io) +{ + int rc; + + rc = ftl_submit_write(ftl_wptr_from_band(io->band), io); + if (rc == -EAGAIN) { + /* EAGAIN means that the request was put on the pending queue */ + return 0; + } + + return rc; +} + +void +ftl_io_write(struct ftl_io *io) +{ + struct spdk_ftl_dev *dev = io->dev; + struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(io->ioch); + + /* Put the IO on retry queue in case IO channel is not initialized */ + if (spdk_unlikely(ioch->index == FTL_IO_CHANNEL_INDEX_INVALID)) { + TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry); + return; + } + + /* For normal IOs we just need to copy the data onto the write buffer */ + if (!(io->flags & FTL_IO_MD)) { + ftl_io_call_foreach_child(io, ftl_wbuf_fill); + } else { + /* Metadata has its own buffer, so it doesn't have to be copied, so just */ + /* send it the the core thread and schedule the write immediately */ + if (ftl_check_core_thread(dev)) { + ftl_io_call_foreach_child(io, ftl_submit_write_leaf); + } else { + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_write, io); + } + } +} + +int +spdk_ftl_write(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt, + struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_io *io; + + if (iov_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt != ftl_iovec_num_blocks(iov, iov_cnt)) { + return -EINVAL; + } + + if (!dev->initialized) { + return -EBUSY; + } + + io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_WRITE); + if (!io) { + return -ENOMEM; + } + + ftl_io_write(io); + + return 0; +} + +void +ftl_io_read(struct ftl_io *io) +{ + ftl_io_call_foreach_child(io, ftl_submit_read); +} + +int +spdk_ftl_read(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt, + struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_io *io; + + if (iov_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt == 0) { + return -EINVAL; + } + + if (lba_cnt != ftl_iovec_num_blocks(iov, iov_cnt)) { + return -EINVAL; + } + + if (!dev->initialized) { + return -EBUSY; + } + + io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_READ); + if (!io) { + return -ENOMEM; + } + + ftl_io_read(io); + return 0; +} + +static struct ftl_flush * +ftl_flush_init(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_flush *flush; + + flush = calloc(1, sizeof(*flush)); + if (!flush) { + return NULL; + } + + flush->bmap = spdk_bit_array_create(FTL_BATCH_COUNT); + if (!flush->bmap) { + goto error; + } + + flush->dev = dev; + flush->cb.fn = cb_fn; + flush->cb.ctx = cb_arg; + + return flush; +error: + free(flush); + return NULL; +} + +static void +_ftl_flush(void *ctx) +{ + struct ftl_flush *flush = ctx; + struct spdk_ftl_dev *dev = flush->dev; + uint32_t i; + + /* Attach flush object to all non-empty batches */ + for (i = 0; i < FTL_BATCH_COUNT; ++i) { + if (dev->batch_array[i].num_entries > 0) { + spdk_bit_array_set(flush->bmap, i); + flush->num_req++; + } + } + + 
LIST_INSERT_HEAD(&dev->flush_list, flush, list_entry); + + /* If the write buffer was already empty, the flush can be completed right away */ + if (!flush->num_req) { + ftl_complete_flush(flush); + } +} + +int +ftl_flush_wbuf(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + struct ftl_flush *flush; + + flush = ftl_flush_init(dev, cb_fn, cb_arg); + if (!flush) { + return -ENOMEM; + } + + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_flush, flush); + return 0; +} + +int +spdk_ftl_flush(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg) +{ + if (!dev->initialized) { + return -EBUSY; + } + + return ftl_flush_wbuf(dev, cb_fn, cb_arg); +} + +bool +ftl_addr_is_written(struct ftl_band *band, struct ftl_addr addr) +{ + struct ftl_zone *zone = ftl_band_zone_from_addr(band, addr); + + return addr.offset < zone->info.write_pointer; +} + +static void ftl_process_media_event(struct spdk_ftl_dev *dev, struct spdk_bdev_media_event event); + +static void +_ftl_process_media_event(void *ctx) +{ + struct ftl_media_event *event = ctx; + struct spdk_ftl_dev *dev = event->dev; + + ftl_process_media_event(dev, event->event); + spdk_mempool_put(dev->media_events_pool, event); +} + +static void +ftl_process_media_event(struct spdk_ftl_dev *dev, struct spdk_bdev_media_event event) +{ + struct ftl_band *band; + struct ftl_addr addr = { .offset = event.offset }; + size_t block_off; + + if (!ftl_check_core_thread(dev)) { + struct ftl_media_event *media_event; + + media_event = spdk_mempool_get(dev->media_events_pool); + if (!media_event) { + SPDK_ERRLOG("Media event lost due to lack of memory"); + return; + } + + media_event->dev = dev; + media_event->event = event; + spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_process_media_event, + media_event); + return; + } + + band = ftl_band_from_addr(dev, addr); + block_off = ftl_band_block_offset_from_addr(band, addr); + + ftl_reloc_add(dev->reloc, band, block_off, event.num_blocks, 0, false); +} + +void +ftl_get_media_events(struct spdk_ftl_dev *dev) +{ +#define FTL_MAX_MEDIA_EVENTS 128 + struct spdk_bdev_media_event events[FTL_MAX_MEDIA_EVENTS]; + size_t num_events, i; + + if (!dev->initialized) { + return; + } + + do { + num_events = spdk_bdev_get_media_events(dev->base_bdev_desc, + events, FTL_MAX_MEDIA_EVENTS); + + for (i = 0; i < num_events; ++i) { + ftl_process_media_event(dev, events[i]); + } + + } while (num_events); +} + +int +ftl_io_channel_poll(void *arg) +{ + struct ftl_io_channel *ch = arg; + struct ftl_io *io; + TAILQ_HEAD(, ftl_io) retry_queue; + + if (TAILQ_EMPTY(&ch->write_cmpl_queue) && TAILQ_EMPTY(&ch->retry_queue)) { + return SPDK_POLLER_IDLE; + } + + while (!TAILQ_EMPTY(&ch->write_cmpl_queue)) { + io = TAILQ_FIRST(&ch->write_cmpl_queue); + TAILQ_REMOVE(&ch->write_cmpl_queue, io, ioch_entry); + ftl_io_complete(io); + } + + /* + * Create local copy of the retry queue to prevent from infinite retrying if IO will be + * inserted to the retry queue again + */ + TAILQ_INIT(&retry_queue); + TAILQ_SWAP(&ch->retry_queue, &retry_queue, ftl_io, ioch_entry); + + while (!TAILQ_EMPTY(&retry_queue)) { + io = TAILQ_FIRST(&retry_queue); + TAILQ_REMOVE(&retry_queue, io, ioch_entry); + if (io->type == FTL_IO_WRITE) { + ftl_io_write(io); + } else { + ftl_io_read(io); + } + } + + return SPDK_POLLER_BUSY; +} + +int +ftl_task_core(void *ctx) +{ + struct spdk_ftl_dev *dev = ctx; + + if (dev->halt) { + if (ftl_shutdown_complete(dev)) { + spdk_poller_unregister(&dev->core_poller); + return SPDK_POLLER_IDLE; + } + } + + ftl_process_writes(dev); 
+ ftl_process_relocs(dev); + + return SPDK_POLLER_BUSY; +} + +SPDK_LOG_REGISTER_COMPONENT("ftl_core", SPDK_LOG_FTL_CORE)
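
For reference, a minimal, hypothetical caller sketch (not part of this diff) showing how the public entry points added here are expected to be used: spdk_ftl_dev_get_attrs() to discover the block size, spdk_ftl_write() to submit an iovec-based write, and an spdk_ftl_fn completion callback. Device and I/O channel setup, error cleanup, and buffer lifetime management are assumed to happen elsewhere.

#include <errno.h>
#include <stdlib.h>

#include "spdk/ftl.h"

/* Completion callback matching spdk_ftl_fn: invoked with the caller's
 * context and a status (0 on success, negative errno on failure). */
static void
io_done(void *cb_arg, int status)
{
	*(int *)cb_arg = status;
}

/* Write a single logical block at LBA 0 through the public API above. */
static int
write_one_block(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch)
{
	static int status;
	struct spdk_ftl_attrs attrs;
	struct iovec iov;
	void *buf;

	/* The device's logical block size is exposed through its attributes. */
	spdk_ftl_dev_get_attrs(dev, &attrs);

	buf = calloc(1, attrs.block_size);
	if (!buf) {
		return -ENOMEM;
	}

	iov.iov_base = buf;
	iov.iov_len = attrs.block_size;

	/* lba_cnt must equal the number of blocks covered by the iovec,
	 * otherwise spdk_ftl_write() rejects the request with -EINVAL. */
	return spdk_ftl_write(dev, ch, 0 /* lba */, 1 /* lba_cnt */, &iov, 1,
			      io_done, &status);
}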