diff options
Diffstat (limited to '')
-rw-r--r-- | drivers/lightnvm/Kconfig | 42 | ||||
-rw-r--r-- | drivers/lightnvm/Makefile | 11 | ||||
-rw-r--r-- | drivers/lightnvm/core.c | 1201 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-cache.c | 134 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-core.c | 2095 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-gc.c | 705 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-init.c | 1366 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-map.c | 188 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-rb.c | 852 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-read.c | 701 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-recovery.c | 1011 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-rl.c | 250 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-sysfs.c | 720 | ||||
-rw-r--r-- | drivers/lightnvm/pblk-write.c | 673 | ||||
-rw-r--r-- | drivers/lightnvm/pblk.h | 1444 |
15 files changed, 11393 insertions, 0 deletions
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig new file mode 100644 index 000000000..38cd49a3a --- /dev/null +++ b/drivers/lightnvm/Kconfig @@ -0,0 +1,42 @@ +# +# Open-Channel SSD NVM configuration +# + +menuconfig NVM + bool "Open-Channel SSD target support" + depends on BLOCK && PCI && BROKEN + select BLK_DEV_NVME + help + Say Y here to get to enable Open-channel SSDs. + + Open-Channel SSDs implement a set of extension to SSDs, that + exposes direct access to the underlying non-volatile memory. + + If you say N, all options in this submenu will be skipped and disabled + only do this if you know what you are doing. + +if NVM + +config NVM_PBLK + tristate "Physical Block Device Open-Channel SSD target" + select CRC32 + help + Allows an open-channel SSD to be exposed as a block device to the + host. The target assumes the device exposes raw flash and must be + explicitly managed by the host. + + Please note the disk format is considered EXPERIMENTAL for now. + +if NVM_PBLK + +config NVM_PBLK_DEBUG + bool "PBlk Debug Support" + default n + help + Enables debug support for pblk. This includes extra checks, more + vocal error messages, and extra tracking fields in the pblk sysfs + entries. + +endif # NVM_PBLK_DEBUG + +endif # NVM diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile new file mode 100644 index 000000000..97d9d7c71 --- /dev/null +++ b/drivers/lightnvm/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Open-Channel SSDs. +# + +obj-$(CONFIG_NVM) := core.o +obj-$(CONFIG_NVM_PBLK) += pblk.o +pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ + pblk-write.o pblk-cache.o pblk-read.o \ + pblk-gc.o pblk-recovery.o pblk-map.o \ + pblk-rl.o pblk-sysfs.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c new file mode 100644 index 000000000..60aa7bc5a --- /dev/null +++ b/drivers/lightnvm/core.c @@ -0,0 +1,1201 @@ +/* + * Copyright (C) 2015 IT University of Copenhagen. All rights reserved. + * Initial release: Matias Bjorling <m@bjorling.me> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include <linux/list.h> +#include <linux/types.h> +#include <linux/sem.h> +#include <linux/bitmap.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/miscdevice.h> +#include <linux/lightnvm.h> +#include <linux/sched/sysctl.h> + +static LIST_HEAD(nvm_tgt_types); +static DECLARE_RWSEM(nvm_tgtt_lock); +static LIST_HEAD(nvm_devices); +static DECLARE_RWSEM(nvm_lock); + +/* Map between virtual and physical channel and lun */ +struct nvm_ch_map { + int ch_off; + int num_lun; + int *lun_offs; +}; + +struct nvm_dev_map { + struct nvm_ch_map *chnls; + int num_ch; +}; + +static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) +{ + struct nvm_target *tgt; + + list_for_each_entry(tgt, &dev->targets, list) + if (!strcmp(name, tgt->disk->disk_name)) + return tgt; + + return NULL; +} + +static bool nvm_target_exists(const char *name) +{ + struct nvm_dev *dev; + struct nvm_target *tgt; + bool ret = false; + + down_write(&nvm_lock); + list_for_each_entry(dev, &nvm_devices, devices) { + mutex_lock(&dev->mlock); + list_for_each_entry(tgt, &dev->targets, list) { + if (!strcmp(name, tgt->disk->disk_name)) { + ret = true; + mutex_unlock(&dev->mlock); + goto out; + } + } + mutex_unlock(&dev->mlock); + } + +out: + up_write(&nvm_lock); + return ret; +} + +static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) +{ + int i; + + for (i = lun_begin; i <= lun_end; i++) { + if (test_and_set_bit(i, dev->lun_map)) { + pr_err("nvm: lun %d already allocated\n", i); + goto err; + } + } + + return 0; +err: + while (--i >= lun_begin) + clear_bit(i, dev->lun_map); + + return -EBUSY; +} + +static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin, + int lun_end) +{ + int i; + + for (i = lun_begin; i <= lun_end; i++) + WARN_ON(!test_and_clear_bit(i, dev->lun_map)); +} + +static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_dev_map *dev_map = tgt_dev->map; + int i, j; + + for (i = 0; i < dev_map->num_ch; i++) { + struct nvm_ch_map *ch_map = &dev_map->chnls[i]; + int *lun_offs = ch_map->lun_offs; + int ch = i + ch_map->ch_off; + + if (clear) { + for (j = 0; j < ch_map->num_lun; j++) { + int lun = j + lun_offs[j]; + int lunid = (ch * dev->geo.num_lun) + lun; + + WARN_ON(!test_and_clear_bit(lunid, + dev->lun_map)); + } + } + + kfree(ch_map->lun_offs); + } + + kfree(dev_map->chnls); + kfree(dev_map); + + kfree(tgt_dev->luns); + kfree(tgt_dev); +} + +static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, + u16 lun_begin, u16 lun_end, + u16 op) +{ + struct nvm_tgt_dev *tgt_dev = NULL; + struct nvm_dev_map *dev_rmap = dev->rmap; + struct nvm_dev_map *dev_map; + struct ppa_addr *luns; + int num_lun = lun_end - lun_begin + 1; + int luns_left = num_lun; + int num_ch = num_lun / dev->geo.num_lun; + int num_ch_mod = num_lun % dev->geo.num_lun; + int bch = lun_begin / dev->geo.num_lun; + int blun = lun_begin % dev->geo.num_lun; + int lunid = 0; + int lun_balanced = 1; + int sec_per_lun, prev_num_lun; + int i, j; + + num_ch = (num_ch_mod == 0) ? num_ch : num_ch + 1; + + dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); + if (!dev_map) + goto err_dev; + + dev_map->chnls = kcalloc(num_ch, sizeof(struct nvm_ch_map), GFP_KERNEL); + if (!dev_map->chnls) + goto err_chnls; + + luns = kcalloc(num_lun, sizeof(struct ppa_addr), GFP_KERNEL); + if (!luns) + goto err_luns; + + prev_num_lun = (luns_left > dev->geo.num_lun) ? + dev->geo.num_lun : luns_left; + for (i = 0; i < num_ch; i++) { + struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; + int *lun_roffs = ch_rmap->lun_offs; + struct nvm_ch_map *ch_map = &dev_map->chnls[i]; + int *lun_offs; + int luns_in_chnl = (luns_left > dev->geo.num_lun) ? + dev->geo.num_lun : luns_left; + + if (lun_balanced && prev_num_lun != luns_in_chnl) + lun_balanced = 0; + + ch_map->ch_off = ch_rmap->ch_off = bch; + ch_map->num_lun = luns_in_chnl; + + lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); + if (!lun_offs) + goto err_ch; + + for (j = 0; j < luns_in_chnl; j++) { + luns[lunid].ppa = 0; + luns[lunid].a.ch = i; + luns[lunid++].a.lun = j; + + lun_offs[j] = blun; + lun_roffs[j + blun] = blun; + } + + ch_map->lun_offs = lun_offs; + + /* when starting a new channel, lun offset is reset */ + blun = 0; + luns_left -= luns_in_chnl; + } + + dev_map->num_ch = num_ch; + + tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL); + if (!tgt_dev) + goto err_ch; + + /* Inherit device geometry from parent */ + memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); + + /* Target device only owns a portion of the physical device */ + tgt_dev->geo.num_ch = num_ch; + tgt_dev->geo.num_lun = (lun_balanced) ? prev_num_lun : -1; + tgt_dev->geo.all_luns = num_lun; + tgt_dev->geo.all_chunks = num_lun * dev->geo.num_chk; + + tgt_dev->geo.op = op; + + sec_per_lun = dev->geo.clba * dev->geo.num_chk; + tgt_dev->geo.total_secs = num_lun * sec_per_lun; + + tgt_dev->q = dev->q; + tgt_dev->map = dev_map; + tgt_dev->luns = luns; + tgt_dev->parent = dev; + + return tgt_dev; +err_ch: + while (--i >= 0) + kfree(dev_map->chnls[i].lun_offs); + kfree(luns); +err_luns: + kfree(dev_map->chnls); +err_chnls: + kfree(dev_map); +err_dev: + return tgt_dev; +} + +static const struct block_device_operations nvm_fops = { + .owner = THIS_MODULE, +}; + +static struct nvm_tgt_type *__nvm_find_target_type(const char *name) +{ + struct nvm_tgt_type *tt; + + list_for_each_entry(tt, &nvm_tgt_types, list) + if (!strcmp(name, tt->name)) + return tt; + + return NULL; +} + +static struct nvm_tgt_type *nvm_find_target_type(const char *name) +{ + struct nvm_tgt_type *tt; + + down_write(&nvm_tgtt_lock); + tt = __nvm_find_target_type(name); + up_write(&nvm_tgtt_lock); + + return tt; +} + +static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin, + int lun_end) +{ + if (lun_begin > lun_end || lun_end >= geo->all_luns) { + pr_err("nvm: lun out of bound (%u:%u > %u)\n", + lun_begin, lun_end, geo->all_luns - 1); + return -EINVAL; + } + + return 0; +} + +static int __nvm_config_simple(struct nvm_dev *dev, + struct nvm_ioctl_create_simple *s) +{ + struct nvm_geo *geo = &dev->geo; + + if (s->lun_begin == -1 && s->lun_end == -1) { + s->lun_begin = 0; + s->lun_end = geo->all_luns - 1; + } + + return nvm_config_check_luns(geo, s->lun_begin, s->lun_end); +} + +static int __nvm_config_extended(struct nvm_dev *dev, + struct nvm_ioctl_create_extended *e) +{ + if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) { + e->lun_begin = 0; + e->lun_end = dev->geo.all_luns - 1; + } + + /* op not set falls into target's default */ + if (e->op == 0xFFFF) { + e->op = NVM_TARGET_DEFAULT_OP; + } else if (e->op < NVM_TARGET_MIN_OP || e->op > NVM_TARGET_MAX_OP) { + pr_err("nvm: invalid over provisioning value\n"); + return -EINVAL; + } + + return nvm_config_check_luns(&dev->geo, e->lun_begin, e->lun_end); +} + +static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) +{ + struct nvm_ioctl_create_extended e; + struct request_queue *tqueue; + struct gendisk *tdisk; + struct nvm_tgt_type *tt; + struct nvm_target *t; + struct nvm_tgt_dev *tgt_dev; + void *targetdata; + int ret; + + switch (create->conf.type) { + case NVM_CONFIG_TYPE_SIMPLE: + ret = __nvm_config_simple(dev, &create->conf.s); + if (ret) + return ret; + + e.lun_begin = create->conf.s.lun_begin; + e.lun_end = create->conf.s.lun_end; + e.op = NVM_TARGET_DEFAULT_OP; + break; + case NVM_CONFIG_TYPE_EXTENDED: + ret = __nvm_config_extended(dev, &create->conf.e); + if (ret) + return ret; + + e = create->conf.e; + break; + default: + pr_err("nvm: config type not valid\n"); + return -EINVAL; + } + + tt = nvm_find_target_type(create->tgttype); + if (!tt) { + pr_err("nvm: target type %s not found\n", create->tgttype); + return -EINVAL; + } + + if (nvm_target_exists(create->tgtname)) { + pr_err("nvm: target name already exists (%s)\n", + create->tgtname); + return -EINVAL; + } + + ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end); + if (ret) + return ret; + + t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); + if (!t) { + ret = -ENOMEM; + goto err_reserve; + } + + tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op); + if (!tgt_dev) { + pr_err("nvm: could not create target device\n"); + ret = -ENOMEM; + goto err_t; + } + + tdisk = alloc_disk(0); + if (!tdisk) { + ret = -ENOMEM; + goto err_dev; + } + + tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node, NULL); + if (!tqueue) { + ret = -ENOMEM; + goto err_disk; + } + blk_queue_make_request(tqueue, tt->make_rq); + + strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); + tdisk->flags = GENHD_FL_EXT_DEVT; + tdisk->major = 0; + tdisk->first_minor = 0; + tdisk->fops = &nvm_fops; + tdisk->queue = tqueue; + + targetdata = tt->init(tgt_dev, tdisk, create->flags); + if (IS_ERR(targetdata)) { + ret = PTR_ERR(targetdata); + goto err_init; + } + + tdisk->private_data = targetdata; + tqueue->queuedata = targetdata; + + blk_queue_max_hw_sectors(tqueue, + (dev->geo.csecs >> 9) * NVM_MAX_VLBA); + + set_capacity(tdisk, tt->capacity(targetdata)); + add_disk(tdisk); + + if (tt->sysfs_init && tt->sysfs_init(tdisk)) { + ret = -ENOMEM; + goto err_sysfs; + } + + t->type = tt; + t->disk = tdisk; + t->dev = tgt_dev; + + mutex_lock(&dev->mlock); + list_add_tail(&t->list, &dev->targets); + mutex_unlock(&dev->mlock); + + __module_get(tt->owner); + + return 0; +err_sysfs: + if (tt->exit) + tt->exit(targetdata, true); +err_init: + blk_cleanup_queue(tqueue); + tdisk->queue = NULL; +err_disk: + put_disk(tdisk); +err_dev: + nvm_remove_tgt_dev(tgt_dev, 0); +err_t: + kfree(t); +err_reserve: + nvm_release_luns_err(dev, e.lun_begin, e.lun_end); + return ret; +} + +static void __nvm_remove_target(struct nvm_target *t, bool graceful) +{ + struct nvm_tgt_type *tt = t->type; + struct gendisk *tdisk = t->disk; + struct request_queue *q = tdisk->queue; + + del_gendisk(tdisk); + blk_cleanup_queue(q); + + if (tt->sysfs_exit) + tt->sysfs_exit(tdisk); + + if (tt->exit) + tt->exit(tdisk->private_data, graceful); + + nvm_remove_tgt_dev(t->dev, 1); + put_disk(tdisk); + module_put(t->type->owner); + + list_del(&t->list); + kfree(t); +} + +/** + * nvm_remove_tgt - Removes a target from the media manager + * @dev: device + * @remove: ioctl structure with target name to remove. + * + * Returns: + * 0: on success + * 1: on not found + * <0: on error + */ +static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) +{ + struct nvm_target *t; + + mutex_lock(&dev->mlock); + t = nvm_find_target(dev, remove->tgtname); + if (!t) { + mutex_unlock(&dev->mlock); + return 1; + } + __nvm_remove_target(t, true); + mutex_unlock(&dev->mlock); + + return 0; +} + +static int nvm_register_map(struct nvm_dev *dev) +{ + struct nvm_dev_map *rmap; + int i, j; + + rmap = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); + if (!rmap) + goto err_rmap; + + rmap->chnls = kcalloc(dev->geo.num_ch, sizeof(struct nvm_ch_map), + GFP_KERNEL); + if (!rmap->chnls) + goto err_chnls; + + for (i = 0; i < dev->geo.num_ch; i++) { + struct nvm_ch_map *ch_rmap; + int *lun_roffs; + int luns_in_chnl = dev->geo.num_lun; + + ch_rmap = &rmap->chnls[i]; + + ch_rmap->ch_off = -1; + ch_rmap->num_lun = luns_in_chnl; + + lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); + if (!lun_roffs) + goto err_ch; + + for (j = 0; j < luns_in_chnl; j++) + lun_roffs[j] = -1; + + ch_rmap->lun_offs = lun_roffs; + } + + dev->rmap = rmap; + + return 0; +err_ch: + while (--i >= 0) + kfree(rmap->chnls[i].lun_offs); +err_chnls: + kfree(rmap); +err_rmap: + return -ENOMEM; +} + +static void nvm_unregister_map(struct nvm_dev *dev) +{ + struct nvm_dev_map *rmap = dev->rmap; + int i; + + for (i = 0; i < dev->geo.num_ch; i++) + kfree(rmap->chnls[i].lun_offs); + + kfree(rmap->chnls); + kfree(rmap); +} + +static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) +{ + struct nvm_dev_map *dev_map = tgt_dev->map; + struct nvm_ch_map *ch_map = &dev_map->chnls[p->a.ch]; + int lun_off = ch_map->lun_offs[p->a.lun]; + + p->a.ch += ch_map->ch_off; + p->a.lun += lun_off; +} + +static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_dev_map *dev_rmap = dev->rmap; + struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->a.ch]; + int lun_roff = ch_rmap->lun_offs[p->a.lun]; + + p->a.ch -= ch_rmap->ch_off; + p->a.lun -= lun_roff; +} + +static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr *ppa_list, int nr_ppas) +{ + int i; + + for (i = 0; i < nr_ppas; i++) { + nvm_map_to_dev(tgt_dev, &ppa_list[i]); + ppa_list[i] = generic_to_dev_addr(tgt_dev->parent, ppa_list[i]); + } +} + +static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr *ppa_list, int nr_ppas) +{ + int i; + + for (i = 0; i < nr_ppas; i++) { + ppa_list[i] = dev_to_generic_addr(tgt_dev->parent, ppa_list[i]); + nvm_map_to_tgt(tgt_dev, &ppa_list[i]); + } +} + +static void nvm_rq_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +{ + if (rqd->nr_ppas == 1) { + nvm_ppa_tgt_to_dev(tgt_dev, &rqd->ppa_addr, 1); + return; + } + + nvm_ppa_tgt_to_dev(tgt_dev, rqd->ppa_list, rqd->nr_ppas); +} + +static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +{ + if (rqd->nr_ppas == 1) { + nvm_ppa_dev_to_tgt(tgt_dev, &rqd->ppa_addr, 1); + return; + } + + nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas); +} + +int nvm_register_tgt_type(struct nvm_tgt_type *tt) +{ + int ret = 0; + + down_write(&nvm_tgtt_lock); + if (__nvm_find_target_type(tt->name)) + ret = -EEXIST; + else + list_add(&tt->list, &nvm_tgt_types); + up_write(&nvm_tgtt_lock); + + return ret; +} +EXPORT_SYMBOL(nvm_register_tgt_type); + +void nvm_unregister_tgt_type(struct nvm_tgt_type *tt) +{ + if (!tt) + return; + + down_write(&nvm_tgtt_lock); + list_del(&tt->list); + up_write(&nvm_tgtt_lock); +} +EXPORT_SYMBOL(nvm_unregister_tgt_type); + +void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, + dma_addr_t *dma_handler) +{ + return dev->ops->dev_dma_alloc(dev, dev->dma_pool, mem_flags, + dma_handler); +} +EXPORT_SYMBOL(nvm_dev_dma_alloc); + +void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler) +{ + dev->ops->dev_dma_free(dev->dma_pool, addr, dma_handler); +} +EXPORT_SYMBOL(nvm_dev_dma_free); + +static struct nvm_dev *nvm_find_nvm_dev(const char *name) +{ + struct nvm_dev *dev; + + list_for_each_entry(dev, &nvm_devices, devices) + if (!strcmp(name, dev->name)) + return dev; + + return NULL; +} + +static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, + const struct ppa_addr *ppas, int nr_ppas) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_geo *geo = &tgt_dev->geo; + int i, plane_cnt, pl_idx; + struct ppa_addr ppa; + + if (geo->pln_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { + rqd->nr_ppas = nr_ppas; + rqd->ppa_addr = ppas[0]; + + return 0; + } + + rqd->nr_ppas = nr_ppas; + rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); + if (!rqd->ppa_list) { + pr_err("nvm: failed to allocate dma memory\n"); + return -ENOMEM; + } + + plane_cnt = geo->pln_mode; + rqd->nr_ppas *= plane_cnt; + + for (i = 0; i < nr_ppas; i++) { + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; + } + } + + return 0; +} + +static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, + struct nvm_rq *rqd) +{ + if (!rqd->ppa_list) + return; + + nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); +} + +int nvm_get_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct nvm_chk_meta *meta, + struct ppa_addr ppa, int nchks) +{ + struct nvm_dev *dev = tgt_dev->parent; + + nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1); + + return dev->ops->get_chk_meta(tgt_dev->parent, meta, + (sector_t)ppa.ppa, nchks); +} +EXPORT_SYMBOL(nvm_get_chunk_meta); + +int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, + int nr_ppas, int type) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_rq rqd; + int ret; + + if (nr_ppas > NVM_MAX_VLBA) { + pr_err("nvm: unable to update all blocks atomically\n"); + return -EINVAL; + } + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); + nvm_rq_tgt_to_dev(tgt_dev, &rqd); + + ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); + nvm_free_rqd_ppalist(tgt_dev, &rqd); + if (ret) { + pr_err("nvm: failed bb mark\n"); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL(nvm_set_tgt_bb_tbl); + +int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +{ + struct nvm_dev *dev = tgt_dev->parent; + int ret; + + if (!dev->ops->submit_io) + return -ENODEV; + + nvm_rq_tgt_to_dev(tgt_dev, rqd); + + rqd->dev = tgt_dev; + + /* In case of error, fail with right address format */ + ret = dev->ops->submit_io(dev, rqd); + if (ret) + nvm_rq_dev_to_tgt(tgt_dev, rqd); + return ret; +} +EXPORT_SYMBOL(nvm_submit_io); + +int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +{ + struct nvm_dev *dev = tgt_dev->parent; + int ret; + + if (!dev->ops->submit_io_sync) + return -ENODEV; + + nvm_rq_tgt_to_dev(tgt_dev, rqd); + + rqd->dev = tgt_dev; + + /* In case of error, fail with right address format */ + ret = dev->ops->submit_io_sync(dev, rqd); + nvm_rq_dev_to_tgt(tgt_dev, rqd); + + return ret; +} +EXPORT_SYMBOL(nvm_submit_io_sync); + +void nvm_end_io(struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *tgt_dev = rqd->dev; + + /* Convert address space */ + if (tgt_dev) + nvm_rq_dev_to_tgt(tgt_dev, rqd); + + if (rqd->end_io) + rqd->end_io(rqd); +} +EXPORT_SYMBOL(nvm_end_io); + +/* + * folds a bad block list from its plane representation to its virtual + * block representation. The fold is done in place and reduced size is + * returned. + * + * If any of the planes status are bad or grown bad block, the virtual block + * is marked bad. If not bad, the first plane state acts as the block state. + */ +int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks) +{ + struct nvm_geo *geo = &dev->geo; + int blk, offset, pl, blktype; + + if (nr_blks != geo->num_chk * geo->pln_mode) + return -EINVAL; + + for (blk = 0; blk < geo->num_chk; blk++) { + offset = blk * geo->pln_mode; + blktype = blks[offset]; + + /* Bad blocks on any planes take precedence over other types */ + for (pl = 0; pl < geo->pln_mode; pl++) { + if (blks[offset + pl] & + (NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) { + blktype = blks[offset + pl]; + break; + } + } + + blks[blk] = blktype; + } + + return geo->num_chk; +} +EXPORT_SYMBOL(nvm_bb_tbl_fold); + +int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, + u8 *blks) +{ + struct nvm_dev *dev = tgt_dev->parent; + + nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1); + + return dev->ops->get_bb_tbl(dev, ppa, blks); +} +EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); + +static int nvm_core_init(struct nvm_dev *dev) +{ + struct nvm_geo *geo = &dev->geo; + int ret; + + dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns), + sizeof(unsigned long), GFP_KERNEL); + if (!dev->lun_map) + return -ENOMEM; + + INIT_LIST_HEAD(&dev->area_list); + INIT_LIST_HEAD(&dev->targets); + mutex_init(&dev->mlock); + spin_lock_init(&dev->lock); + + ret = nvm_register_map(dev); + if (ret) + goto err_fmtype; + + return 0; +err_fmtype: + kfree(dev->lun_map); + return ret; +} + +static void nvm_free(struct nvm_dev *dev) +{ + if (!dev) + return; + + if (dev->dma_pool) + dev->ops->destroy_dma_pool(dev->dma_pool); + + nvm_unregister_map(dev); + kfree(dev->lun_map); + kfree(dev); +} + +static int nvm_init(struct nvm_dev *dev) +{ + struct nvm_geo *geo = &dev->geo; + int ret = -EINVAL; + + if (dev->ops->identity(dev)) { + pr_err("nvm: device could not be identified\n"); + goto err; + } + + pr_debug("nvm: ver:%u.%u nvm_vendor:%x\n", + geo->major_ver_id, geo->minor_ver_id, + geo->vmnt); + + ret = nvm_core_init(dev); + if (ret) { + pr_err("nvm: could not initialize core structures.\n"); + goto err; + } + + pr_info("nvm: registered %s [%u/%u/%u/%u/%u]\n", + dev->name, dev->geo.ws_min, dev->geo.ws_opt, + dev->geo.num_chk, dev->geo.all_luns, + dev->geo.num_ch); + return 0; +err: + pr_err("nvm: failed to initialize nvm\n"); + return ret; +} + +struct nvm_dev *nvm_alloc_dev(int node) +{ + return kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node); +} +EXPORT_SYMBOL(nvm_alloc_dev); + +int nvm_register(struct nvm_dev *dev) +{ + int ret; + + if (!dev->q || !dev->ops) + return -EINVAL; + + dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist"); + if (!dev->dma_pool) { + pr_err("nvm: could not create dma pool\n"); + return -ENOMEM; + } + + ret = nvm_init(dev); + if (ret) + goto err_init; + + /* register device with a supported media manager */ + down_write(&nvm_lock); + list_add(&dev->devices, &nvm_devices); + up_write(&nvm_lock); + + return 0; +err_init: + dev->ops->destroy_dma_pool(dev->dma_pool); + return ret; +} +EXPORT_SYMBOL(nvm_register); + +void nvm_unregister(struct nvm_dev *dev) +{ + struct nvm_target *t, *tmp; + + mutex_lock(&dev->mlock); + list_for_each_entry_safe(t, tmp, &dev->targets, list) { + if (t->dev->parent != dev) + continue; + __nvm_remove_target(t, false); + } + mutex_unlock(&dev->mlock); + + down_write(&nvm_lock); + list_del(&dev->devices); + up_write(&nvm_lock); + + nvm_free(dev); +} +EXPORT_SYMBOL(nvm_unregister); + +static int __nvm_configure_create(struct nvm_ioctl_create *create) +{ + struct nvm_dev *dev; + + down_write(&nvm_lock); + dev = nvm_find_nvm_dev(create->dev); + up_write(&nvm_lock); + + if (!dev) { + pr_err("nvm: device not found\n"); + return -EINVAL; + } + + return nvm_create_tgt(dev, create); +} + +static long nvm_ioctl_info(struct file *file, void __user *arg) +{ + struct nvm_ioctl_info *info; + struct nvm_tgt_type *tt; + int tgt_iter = 0; + + info = memdup_user(arg, sizeof(struct nvm_ioctl_info)); + if (IS_ERR(info)) + return -EFAULT; + + info->version[0] = NVM_VERSION_MAJOR; + info->version[1] = NVM_VERSION_MINOR; + info->version[2] = NVM_VERSION_PATCH; + + down_write(&nvm_tgtt_lock); + list_for_each_entry(tt, &nvm_tgt_types, list) { + struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; + + tgt->version[0] = tt->version[0]; + tgt->version[1] = tt->version[1]; + tgt->version[2] = tt->version[2]; + strncpy(tgt->tgtname, tt->name, NVM_TTYPE_NAME_MAX); + + tgt_iter++; + } + + info->tgtsize = tgt_iter; + up_write(&nvm_tgtt_lock); + + if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { + kfree(info); + return -EFAULT; + } + + kfree(info); + return 0; +} + +static long nvm_ioctl_get_devices(struct file *file, void __user *arg) +{ + struct nvm_ioctl_get_devices *devices; + struct nvm_dev *dev; + int i = 0; + + devices = kzalloc(sizeof(struct nvm_ioctl_get_devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; + + down_write(&nvm_lock); + list_for_each_entry(dev, &nvm_devices, devices) { + struct nvm_ioctl_device_info *info = &devices->info[i]; + + strlcpy(info->devname, dev->name, sizeof(info->devname)); + + /* kept for compatibility */ + info->bmversion[0] = 1; + info->bmversion[1] = 0; + info->bmversion[2] = 0; + strlcpy(info->bmname, "gennvm", sizeof(info->bmname)); + i++; + + if (i > 31) { + pr_err("nvm: max 31 devices can be reported.\n"); + break; + } + } + up_write(&nvm_lock); + + devices->nr_devices = i; + + if (copy_to_user(arg, devices, + sizeof(struct nvm_ioctl_get_devices))) { + kfree(devices); + return -EFAULT; + } + + kfree(devices); + return 0; +} + +static long nvm_ioctl_dev_create(struct file *file, void __user *arg) +{ + struct nvm_ioctl_create create; + + if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) + return -EFAULT; + + if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED && + create.conf.e.rsv != 0) { + pr_err("nvm: reserved config field in use\n"); + return -EINVAL; + } + + create.dev[DISK_NAME_LEN - 1] = '\0'; + create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; + create.tgtname[DISK_NAME_LEN - 1] = '\0'; + + if (create.flags != 0) { + __u32 flags = create.flags; + + /* Check for valid flags */ + if (flags & NVM_TARGET_FACTORY) + flags &= ~NVM_TARGET_FACTORY; + + if (flags) { + pr_err("nvm: flag not supported\n"); + return -EINVAL; + } + } + + return __nvm_configure_create(&create); +} + +static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) +{ + struct nvm_ioctl_remove remove; + struct nvm_dev *dev; + int ret = 0; + + if (copy_from_user(&remove, arg, sizeof(struct nvm_ioctl_remove))) + return -EFAULT; + + remove.tgtname[DISK_NAME_LEN - 1] = '\0'; + + if (remove.flags != 0) { + pr_err("nvm: no flags supported\n"); + return -EINVAL; + } + + list_for_each_entry(dev, &nvm_devices, devices) { + ret = nvm_remove_tgt(dev, &remove); + if (!ret) + break; + } + + return ret; +} + +/* kept for compatibility reasons */ +static long nvm_ioctl_dev_init(struct file *file, void __user *arg) +{ + struct nvm_ioctl_dev_init init; + + if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init))) + return -EFAULT; + + if (init.flags != 0) { + pr_err("nvm: no flags supported\n"); + return -EINVAL; + } + + return 0; +} + +/* Kept for compatibility reasons */ +static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) +{ + struct nvm_ioctl_dev_factory fact; + + if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory))) + return -EFAULT; + + fact.dev[DISK_NAME_LEN - 1] = '\0'; + + if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) + return -EINVAL; + + return 0; +} + +static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case NVM_INFO: + return nvm_ioctl_info(file, argp); + case NVM_GET_DEVICES: + return nvm_ioctl_get_devices(file, argp); + case NVM_DEV_CREATE: + return nvm_ioctl_dev_create(file, argp); + case NVM_DEV_REMOVE: + return nvm_ioctl_dev_remove(file, argp); + case NVM_DEV_INIT: + return nvm_ioctl_dev_init(file, argp); + case NVM_DEV_FACTORY: + return nvm_ioctl_dev_factory(file, argp); + } + return 0; +} + +static const struct file_operations _ctl_fops = { + .open = nonseekable_open, + .unlocked_ioctl = nvm_ctl_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice _nvm_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "lightnvm", + .nodename = "lightnvm/control", + .fops = &_ctl_fops, +}; +builtin_misc_device(_nvm_misc); diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c new file mode 100644 index 000000000..f565a56b8 --- /dev/null +++ b/drivers/lightnvm/pblk-cache.c @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-cache.c - pblk's write cache + */ + +#include "pblk.h" + +int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) +{ + struct request_queue *q = pblk->dev->q; + struct pblk_w_ctx w_ctx; + sector_t lba = pblk_get_lba(bio); + unsigned long start_time = jiffies; + unsigned int bpos, pos; + int nr_entries = pblk_get_secs(bio); + int i, ret; + + generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio), + &pblk->disk->part0); + + /* Update the write buffer head (mem) with the entries that we can + * write. The write in itself cannot fail, so there is no need to + * rollback from here on. + */ +retry: + ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos); + switch (ret) { + case NVM_IO_REQUEUE: + io_schedule(); + goto retry; + case NVM_IO_ERR: + pblk_pipeline_stop(pblk); + goto out; + } + + pblk_ppa_set_empty(&w_ctx.ppa); + w_ctx.flags = flags; + if (bio->bi_opf & REQ_PREFLUSH) { + w_ctx.flags |= PBLK_FLUSH_ENTRY; + pblk_write_kick(pblk); + } + + if (unlikely(!bio_has_data(bio))) + goto out; + + for (i = 0; i < nr_entries; i++) { + void *data = bio_data(bio); + + w_ctx.lba = lba + i; + + pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i); + pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos); + + bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); + } + + atomic64_add(nr_entries, &pblk->user_wa); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_add(nr_entries, &pblk->inflight_writes); + atomic_long_add(nr_entries, &pblk->req_writes); +#endif + + pblk_rl_inserted(&pblk->rl, nr_entries); + +out: + generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time); + pblk_write_should_kick(pblk); + return ret; +} + +/* + * On GC the incoming lbas are not necessarily sequential. Also, some of the + * lbas might not be valid entries, which are marked as empty by the GC thread + */ +int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq) +{ + struct pblk_w_ctx w_ctx; + unsigned int bpos, pos; + void *data = gc_rq->data; + int i, valid_entries; + + /* Update the write buffer head (mem) with the entries that we can + * write. The write in itself cannot fail, so there is no need to + * rollback from here on. + */ +retry: + if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) { + io_schedule(); + goto retry; + } + + w_ctx.flags = PBLK_IOTYPE_GC; + pblk_ppa_set_empty(&w_ctx.ppa); + + for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) { + if (gc_rq->lba_list[i] == ADDR_EMPTY) + continue; + + w_ctx.lba = gc_rq->lba_list[i]; + + pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); + pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line, + gc_rq->paddr_list[i], pos); + + data += PBLK_EXPOSED_PAGE_SIZE; + valid_entries++; + } + + WARN_ONCE(gc_rq->secs_to_gc != valid_entries, + "pblk: inconsistent GC write\n"); + + atomic64_add(valid_entries, &pblk->gc_wa); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_add(valid_entries, &pblk->inflight_writes); + atomic_long_add(valid_entries, &pblk->recov_gc_writes); +#endif + + pblk_write_should_kick(pblk); + return NVM_IO_OK; +} diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c new file mode 100644 index 000000000..8dce31dbf --- /dev/null +++ b/drivers/lightnvm/pblk-core.c @@ -0,0 +1,2095 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-core.c - pblk's core functionality + * + */ + +#include "pblk.h" + +static void pblk_line_mark_bb(struct work_struct *work) +{ + struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, + ws); + struct pblk *pblk = line_ws->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + struct ppa_addr *ppa = line_ws->priv; + int ret; + + ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); + if (ret) { + struct pblk_line *line; + int pos; + + line = &pblk->lines[pblk_ppa_to_line(*ppa)]; + pos = pblk_ppa_to_pos(&dev->geo, *ppa); + + pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n", + line->id, pos); + } + + kfree(ppa); + mempool_free(line_ws, &pblk->gen_ws_pool); +} + +static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, + struct ppa_addr ppa_addr) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct ppa_addr *ppa; + int pos = pblk_ppa_to_pos(geo, ppa_addr); + + pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos); + atomic_long_inc(&pblk->erase_failed); + + atomic_dec(&line->blk_in_line); + if (test_and_set_bit(pos, line->blk_bitmap)) + pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n", + line->id, pos); + + /* Not necessary to mark bad blocks on 2.0 spec. */ + if (geo->version == NVM_OCSSD_SPEC_20) + return; + + ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC); + if (!ppa) + return; + + *ppa = ppa_addr; + pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, + GFP_ATOMIC, pblk->bb_wq); +} + +static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct nvm_chk_meta *chunk; + struct pblk_line *line; + int pos; + + line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)]; + pos = pblk_ppa_to_pos(geo, rqd->ppa_addr); + chunk = &line->chks[pos]; + + atomic_dec(&line->left_seblks); + + if (rqd->error) { + chunk->state = NVM_CHK_ST_OFFLINE; + pblk_mark_bb(pblk, line, rqd->ppa_addr); + } else { + chunk->state = NVM_CHK_ST_FREE; + } + + atomic_dec(&pblk->inflight_io); +} + +/* Erase completion assumes that only one block is erased at the time */ +static void pblk_end_io_erase(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + + __pblk_end_io_erase(pblk, rqd); + mempool_free(rqd, &pblk->e_rq_pool); +} + +/* + * Get information for all chunks from the device. + * + * The caller is responsible for freeing the returned structure + */ +struct nvm_chk_meta *pblk_chunk_get_info(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct nvm_chk_meta *meta; + struct ppa_addr ppa; + unsigned long len; + int ret; + + ppa.ppa = 0; + + len = geo->all_chunks * sizeof(*meta); + meta = kzalloc(len, GFP_KERNEL); + if (!meta) + return ERR_PTR(-ENOMEM); + + ret = nvm_get_chunk_meta(dev, meta, ppa, geo->all_chunks); + if (ret) { + kfree(meta); + return ERR_PTR(-EIO); + } + + return meta; +} + +struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, + struct nvm_chk_meta *meta, + struct ppa_addr ppa) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int ch_off = ppa.m.grp * geo->num_chk * geo->num_lun; + int lun_off = ppa.m.pu * geo->num_chk; + int chk_off = ppa.m.chk; + + return meta + ch_off + lun_off + chk_off; +} + +void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, + u64 paddr) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct list_head *move_list = NULL; + + /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P + * table is modified with reclaimed sectors, a check is done to endure + * that newer updates are not overwritten. + */ + spin_lock(&line->lock); + WARN_ON(line->state == PBLK_LINESTATE_FREE); + + if (test_and_set_bit(paddr, line->invalid_bitmap)) { + WARN_ONCE(1, "pblk: double invalidate\n"); + spin_unlock(&line->lock); + return; + } + le32_add_cpu(line->vsc, -1); + + if (line->state == PBLK_LINESTATE_CLOSED) + move_list = pblk_line_gc_list(pblk, line); + spin_unlock(&line->lock); + + if (move_list) { + spin_lock(&l_mg->gc_lock); + spin_lock(&line->lock); + /* Prevent moving a line that has just been chosen for GC */ + if (line->state == PBLK_LINESTATE_GC) { + spin_unlock(&line->lock); + spin_unlock(&l_mg->gc_lock); + return; + } + spin_unlock(&line->lock); + + list_move_tail(&line->list, move_list); + spin_unlock(&l_mg->gc_lock); + } +} + +void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) +{ + struct pblk_line *line; + u64 paddr; + int line_id; + +#ifdef CONFIG_NVM_PBLK_DEBUG + /* Callers must ensure that the ppa points to a device address */ + BUG_ON(pblk_addr_in_cache(ppa)); + BUG_ON(pblk_ppa_empty(ppa)); +#endif + + line_id = pblk_ppa_to_line(ppa); + line = &pblk->lines[line_id]; + paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); + + __pblk_map_invalidate(pblk, line, paddr); +} + +static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, + unsigned int nr_secs) +{ + sector_t lba; + + spin_lock(&pblk->trans_lock); + for (lba = slba; lba < slba + nr_secs; lba++) { + struct ppa_addr ppa; + + ppa = pblk_trans_map_get(pblk, lba); + + if (!pblk_addr_in_cache(ppa) && !pblk_ppa_empty(ppa)) + pblk_map_invalidate(pblk, ppa); + + pblk_ppa_set_empty(&ppa); + pblk_trans_map_set(pblk, lba, ppa); + } + spin_unlock(&pblk->trans_lock); +} + +/* Caller must guarantee that the request is a valid type */ +struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type) +{ + mempool_t *pool; + struct nvm_rq *rqd; + int rq_size; + + switch (type) { + case PBLK_WRITE: + case PBLK_WRITE_INT: + pool = &pblk->w_rq_pool; + rq_size = pblk_w_rq_size; + break; + case PBLK_READ: + pool = &pblk->r_rq_pool; + rq_size = pblk_g_rq_size; + break; + default: + pool = &pblk->e_rq_pool; + rq_size = pblk_g_rq_size; + } + + rqd = mempool_alloc(pool, GFP_KERNEL); + memset(rqd, 0, rq_size); + + return rqd; +} + +/* Typically used on completion path. Cannot guarantee request consistency */ +void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) +{ + struct nvm_tgt_dev *dev = pblk->dev; + mempool_t *pool; + + switch (type) { + case PBLK_WRITE: + kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); + /* fall through */ + case PBLK_WRITE_INT: + pool = &pblk->w_rq_pool; + break; + case PBLK_READ: + pool = &pblk->r_rq_pool; + break; + case PBLK_ERASE: + pool = &pblk->e_rq_pool; + break; + default: + pblk_err(pblk, "trying to free unknown rqd type\n"); + return; + } + + if (rqd->meta_list) + nvm_dev_dma_free(dev->parent, rqd->meta_list, + rqd->dma_meta_list); + mempool_free(rqd, pool); +} + +void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, + int nr_pages) +{ + struct bio_vec *bv; + struct page *page; + int i, e, nbv = 0; + + for (i = 0; i < bio->bi_vcnt; i++) { + bv = &bio->bi_io_vec[i]; + page = bv->bv_page; + for (e = 0; e < bv->bv_len; e += PBLK_EXPOSED_PAGE_SIZE, nbv++) + if (nbv >= off) + mempool_free(page++, &pblk->page_bio_pool); + } +} + +int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, + int nr_pages) +{ + struct request_queue *q = pblk->dev->q; + struct page *page; + int i, ret; + + for (i = 0; i < nr_pages; i++) { + page = mempool_alloc(&pblk->page_bio_pool, flags); + + ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); + if (ret != PBLK_EXPOSED_PAGE_SIZE) { + pblk_err(pblk, "could not add page to bio\n"); + mempool_free(page, &pblk->page_bio_pool); + goto err; + } + } + + return 0; +err: + pblk_bio_free_pages(pblk, bio, (bio->bi_vcnt - i), i); + return -1; +} + +void pblk_write_kick(struct pblk *pblk) +{ + wake_up_process(pblk->writer_ts); + mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000)); +} + +void pblk_write_timer_fn(struct timer_list *t) +{ + struct pblk *pblk = from_timer(pblk, t, wtimer); + + /* kick the write thread every tick to flush outstanding data */ + pblk_write_kick(pblk); +} + +void pblk_write_should_kick(struct pblk *pblk) +{ + unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb); + + if (secs_avail >= pblk->min_write_pgs) + pblk_write_kick(pblk); +} + +static void pblk_wait_for_meta(struct pblk *pblk) +{ + do { + if (!atomic_read(&pblk->inflight_io)) + break; + + schedule(); + } while (1); +} + +static void pblk_flush_writer(struct pblk *pblk) +{ + pblk_rb_flush(&pblk->rwb); + do { + if (!pblk_rb_sync_count(&pblk->rwb)) + break; + + pblk_write_kick(pblk); + schedule(); + } while (1); +} + +struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct list_head *move_list = NULL; + int vsc = le32_to_cpu(*line->vsc); + + lockdep_assert_held(&line->lock); + + if (line->w_err_gc->has_write_err) { + if (line->gc_group != PBLK_LINEGC_WERR) { + line->gc_group = PBLK_LINEGC_WERR; + move_list = &l_mg->gc_werr_list; + pblk_rl_werr_line_in(&pblk->rl); + } + } else if (!vsc) { + if (line->gc_group != PBLK_LINEGC_FULL) { + line->gc_group = PBLK_LINEGC_FULL; + move_list = &l_mg->gc_full_list; + } + } else if (vsc < lm->high_thrs) { + if (line->gc_group != PBLK_LINEGC_HIGH) { + line->gc_group = PBLK_LINEGC_HIGH; + move_list = &l_mg->gc_high_list; + } + } else if (vsc < lm->mid_thrs) { + if (line->gc_group != PBLK_LINEGC_MID) { + line->gc_group = PBLK_LINEGC_MID; + move_list = &l_mg->gc_mid_list; + } + } else if (vsc < line->sec_in_line) { + if (line->gc_group != PBLK_LINEGC_LOW) { + line->gc_group = PBLK_LINEGC_LOW; + move_list = &l_mg->gc_low_list; + } + } else if (vsc == line->sec_in_line) { + if (line->gc_group != PBLK_LINEGC_EMPTY) { + line->gc_group = PBLK_LINEGC_EMPTY; + move_list = &l_mg->gc_empty_list; + } + } else { + line->state = PBLK_LINESTATE_CORRUPT; + line->gc_group = PBLK_LINEGC_NONE; + move_list = &l_mg->corrupt_list; + pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", + line->id, vsc, + line->sec_in_line, + lm->high_thrs, lm->mid_thrs); + } + + return move_list; +} + +void pblk_discard(struct pblk *pblk, struct bio *bio) +{ + sector_t slba = pblk_get_lba(bio); + sector_t nr_secs = pblk_get_secs(bio); + + pblk_invalidate_range(pblk, slba, nr_secs); +} + +void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) +{ + atomic_long_inc(&pblk->write_failed); +#ifdef CONFIG_NVM_PBLK_DEBUG + pblk_print_failed_rqd(pblk, rqd, rqd->error); +#endif +} + +void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) +{ + /* Empty page read is not necessarily an error (e.g., L2P recovery) */ + if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) { + atomic_long_inc(&pblk->read_empty); + return; + } + + switch (rqd->error) { + case NVM_RSP_WARN_HIGHECC: + atomic_long_inc(&pblk->read_high_ecc); + break; + case NVM_RSP_ERR_FAILECC: + case NVM_RSP_ERR_FAILCRC: + atomic_long_inc(&pblk->read_failed); + break; + default: + pblk_err(pblk, "unknown read error:%d\n", rqd->error); + } +#ifdef CONFIG_NVM_PBLK_DEBUG + pblk_print_failed_rqd(pblk, rqd, rqd->error); +#endif +} + +void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write) +{ + pblk->sec_per_write = sec_per_write; +} + +int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + + atomic_inc(&pblk->inflight_io); + +#ifdef CONFIG_NVM_PBLK_DEBUG + if (pblk_check_io(pblk, rqd)) + return NVM_IO_ERR; +#endif + + return nvm_submit_io(dev, rqd); +} + +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + + atomic_inc(&pblk->inflight_io); + +#ifdef CONFIG_NVM_PBLK_DEBUG + if (pblk_check_io(pblk, rqd)) + return NVM_IO_ERR; +#endif + + return nvm_submit_io_sync(dev, rqd); +} + +static void pblk_bio_map_addr_endio(struct bio *bio) +{ + bio_put(bio); +} + +struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, + unsigned int nr_secs, unsigned int len, + int alloc_type, gfp_t gfp_mask) +{ + struct nvm_tgt_dev *dev = pblk->dev; + void *kaddr = data; + struct page *page; + struct bio *bio; + int i, ret; + + if (alloc_type == PBLK_KMALLOC_META) + return bio_map_kern(dev->q, kaddr, len, gfp_mask); + + bio = bio_kmalloc(gfp_mask, nr_secs); + if (!bio) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_secs; i++) { + page = vmalloc_to_page(kaddr); + if (!page) { + pblk_err(pblk, "could not map vmalloc bio\n"); + bio_put(bio); + bio = ERR_PTR(-ENOMEM); + goto out; + } + + ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { + pblk_err(pblk, "could not add page to bio\n"); + bio_put(bio); + bio = ERR_PTR(-ENOMEM); + goto out; + } + + kaddr += PAGE_SIZE; + } + + bio->bi_end_io = pblk_bio_map_addr_endio; +out: + return bio; +} + +int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, + unsigned long secs_to_flush) +{ + int max = pblk->sec_per_write; + int min = pblk->min_write_pgs; + int secs_to_sync = 0; + + if (secs_avail >= max) + secs_to_sync = max; + else if (secs_avail >= min) + secs_to_sync = min * (secs_avail / min); + else if (secs_to_flush) + secs_to_sync = min; + + return secs_to_sync; +} + +void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) +{ + u64 addr; + int i; + + spin_lock(&line->lock); + addr = find_next_zero_bit(line->map_bitmap, + pblk->lm.sec_per_line, line->cur_sec); + line->cur_sec = addr - nr_secs; + + for (i = 0; i < nr_secs; i++, line->cur_sec--) + WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); + spin_unlock(&line->lock); +} + +u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) +{ + u64 addr; + int i; + + lockdep_assert_held(&line->lock); + + /* logic error: ppa out-of-bounds. Prevent generating bad address */ + if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) { + WARN(1, "pblk: page allocation out of bounds\n"); + nr_secs = pblk->lm.sec_per_line - line->cur_sec; + } + + line->cur_sec = addr = find_next_zero_bit(line->map_bitmap, + pblk->lm.sec_per_line, line->cur_sec); + for (i = 0; i < nr_secs; i++, line->cur_sec++) + WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap)); + + return addr; +} + +u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) +{ + u64 addr; + + /* Lock needed in case a write fails and a recovery needs to remap + * failed write buffer entries + */ + spin_lock(&line->lock); + addr = __pblk_alloc_page(pblk, line, nr_secs); + line->left_msecs -= nr_secs; + WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n"); + spin_unlock(&line->lock); + + return addr; +} + +u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line) +{ + u64 paddr; + + spin_lock(&line->lock); + paddr = find_next_zero_bit(line->map_bitmap, + pblk->lm.sec_per_line, line->cur_sec); + spin_unlock(&line->lock); + + return paddr; +} + +/* + * Submit emeta to one LUN in the raid line at the time to avoid a deadlock when + * taking the per LUN semaphore. + */ +static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, + void *emeta_buf, u64 paddr, int dir) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + void *ppa_list, *meta_list; + struct bio *bio; + struct nvm_rq rqd; + dma_addr_t dma_ppa_list, dma_meta_list; + int min = pblk->min_write_pgs; + int left_ppas = lm->emeta_sec[0]; + int id = line->id; + int rq_ppas, rq_len; + int cmd_op, bio_op; + int i, j; + int ret; + + if (dir == PBLK_WRITE) { + bio_op = REQ_OP_WRITE; + cmd_op = NVM_OP_PWRITE; + } else if (dir == PBLK_READ) { + bio_op = REQ_OP_READ; + cmd_op = NVM_OP_PREAD; + } else + return -EINVAL; + + meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &dma_meta_list); + if (!meta_list) + return -ENOMEM; + + ppa_list = meta_list + pblk_dma_meta_size; + dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + +next_rq: + memset(&rqd, 0, sizeof(struct nvm_rq)); + + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + rq_len = rq_ppas * geo->csecs; + + bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len, + l_mg->emeta_alloc_type, GFP_KERNEL); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + goto free_rqd_dma; + } + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, bio_op, 0); + + rqd.bio = bio; + rqd.meta_list = meta_list; + rqd.ppa_list = ppa_list; + rqd.dma_meta_list = dma_meta_list; + rqd.dma_ppa_list = dma_ppa_list; + rqd.opcode = cmd_op; + rqd.nr_ppas = rq_ppas; + + if (dir == PBLK_WRITE) { + struct pblk_sec_meta *meta_list = rqd.meta_list; + + rqd.flags = pblk_set_progr_mode(pblk, PBLK_WRITE); + for (i = 0; i < rqd.nr_ppas; ) { + spin_lock(&line->lock); + paddr = __pblk_alloc_page(pblk, line, min); + spin_unlock(&line->lock); + for (j = 0; j < min; j++, i++, paddr++) { + meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); + rqd.ppa_list[i] = + addr_to_gen_ppa(pblk, paddr, id); + } + } + } else { + for (i = 0; i < rqd.nr_ppas; ) { + struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id); + int pos = pblk_ppa_to_pos(geo, ppa); + int read_type = PBLK_READ_RANDOM; + + if (pblk_io_aligned(pblk, rq_ppas)) + read_type = PBLK_READ_SEQUENTIAL; + rqd.flags = pblk_set_read_mode(pblk, read_type); + + while (test_bit(pos, line->blk_bitmap)) { + paddr += min; + if (pblk_boundary_paddr_checks(pblk, paddr)) { + pblk_err(pblk, "corrupt emeta line:%d\n", + line->id); + bio_put(bio); + ret = -EINTR; + goto free_rqd_dma; + } + + ppa = addr_to_gen_ppa(pblk, paddr, id); + pos = pblk_ppa_to_pos(geo, ppa); + } + + if (pblk_boundary_paddr_checks(pblk, paddr + min)) { + pblk_err(pblk, "corrupt emeta line:%d\n", + line->id); + bio_put(bio); + ret = -EINTR; + goto free_rqd_dma; + } + + for (j = 0; j < min; j++, i++, paddr++) + rqd.ppa_list[i] = + addr_to_gen_ppa(pblk, paddr, line->id); + } + } + + ret = pblk_submit_io_sync(pblk, &rqd); + if (ret) { + pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); + bio_put(bio); + goto free_rqd_dma; + } + + atomic_dec(&pblk->inflight_io); + + if (rqd.error) { + if (dir == PBLK_WRITE) + pblk_log_write_err(pblk, &rqd); + else + pblk_log_read_err(pblk, &rqd); + } + + emeta_buf += rq_len; + left_ppas -= rq_ppas; + if (left_ppas) + goto next_rq; +free_rqd_dma: + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); + return ret; +} + +u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + int bit; + + /* This usually only happens on bad lines */ + bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); + if (bit >= lm->blk_per_line) + return -1; + + return bit * geo->ws_opt; +} + +static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, + u64 paddr, int dir) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct pblk_line_meta *lm = &pblk->lm; + struct bio *bio; + struct nvm_rq rqd; + __le64 *lba_list = NULL; + int i, ret; + int cmd_op, bio_op; + int flags; + + if (dir == PBLK_WRITE) { + bio_op = REQ_OP_WRITE; + cmd_op = NVM_OP_PWRITE; + flags = pblk_set_progr_mode(pblk, PBLK_WRITE); + lba_list = emeta_to_lbas(pblk, line->emeta->buf); + } else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) { + bio_op = REQ_OP_READ; + cmd_op = NVM_OP_PREAD; + flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + } else + return -EINVAL; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd.dma_meta_list); + if (!rqd.meta_list) + return -ENOMEM; + + rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; + rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; + + bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + goto free_ppa_list; + } + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, bio_op, 0); + + rqd.bio = bio; + rqd.opcode = cmd_op; + rqd.flags = flags; + rqd.nr_ppas = lm->smeta_sec; + + for (i = 0; i < lm->smeta_sec; i++, paddr++) { + struct pblk_sec_meta *meta_list = rqd.meta_list; + + rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); + + if (dir == PBLK_WRITE) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + + meta_list[i].lba = lba_list[paddr] = addr_empty; + } + } + + /* + * This I/O is sent by the write thread when a line is replace. Since + * the write thread is the only one sending write and erase commands, + * there is no need to take the LUN semaphore. + */ + ret = pblk_submit_io_sync(pblk, &rqd); + if (ret) { + pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); + bio_put(bio); + goto free_ppa_list; + } + + atomic_dec(&pblk->inflight_io); + + if (rqd.error) { + if (dir == PBLK_WRITE) { + pblk_log_write_err(pblk, &rqd); + ret = 1; + } else if (dir == PBLK_READ) + pblk_log_read_err(pblk, &rqd); + } + +free_ppa_list: + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); + + return ret; +} + +int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line) +{ + u64 bpaddr = pblk_line_smeta_start(pblk, line); + + return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV); +} + +int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, + void *emeta_buf) +{ + return pblk_line_submit_emeta_io(pblk, line, emeta_buf, + line->emeta_ssec, PBLK_READ); +} + +static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, + struct ppa_addr ppa) +{ + rqd->opcode = NVM_OP_ERASE; + rqd->ppa_addr = ppa; + rqd->nr_ppas = 1; + rqd->flags = pblk_set_progr_mode(pblk, PBLK_ERASE); + rqd->bio = NULL; +} + +static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) +{ + struct nvm_rq rqd = {NULL}; + int ret; + + pblk_setup_e_rq(pblk, &rqd, ppa); + + /* The write thread schedules erases so that it minimizes disturbances + * with writes. Thus, there is no need to take the LUN semaphore. + */ + ret = pblk_submit_io_sync(pblk, &rqd); + rqd.private = pblk; + __pblk_end_io_erase(pblk, &rqd); + + return ret; +} + +int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct ppa_addr ppa; + int ret, bit = -1; + + /* Erase only good blocks, one at a time */ + do { + spin_lock(&line->lock); + bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line, + bit + 1); + if (bit >= lm->blk_per_line) { + spin_unlock(&line->lock); + break; + } + + ppa = pblk->luns[bit].bppa; /* set ch and lun */ + ppa.a.blk = line->id; + + atomic_dec(&line->left_eblks); + WARN_ON(test_and_set_bit(bit, line->erase_bitmap)); + spin_unlock(&line->lock); + + ret = pblk_blk_erase_sync(pblk, ppa); + if (ret) { + pblk_err(pblk, "failed to erase line %d\n", line->id); + return ret; + } + } while (1); + + return 0; +} + +static void pblk_line_setup_metadata(struct pblk_line *line, + struct pblk_line_mgmt *l_mg, + struct pblk_line_meta *lm) +{ + int meta_line; + + lockdep_assert_held(&l_mg->free_lock); + +retry_meta: + meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); + if (meta_line == PBLK_DATA_LINES) { + spin_unlock(&l_mg->free_lock); + io_schedule(); + spin_lock(&l_mg->free_lock); + goto retry_meta; + } + + set_bit(meta_line, &l_mg->meta_bitmap); + line->meta_line = meta_line; + + line->smeta = l_mg->sline_meta[meta_line]; + line->emeta = l_mg->eline_meta[meta_line]; + + memset(line->smeta, 0, lm->smeta_len); + memset(line->emeta->buf, 0, lm->emeta_len[0]); + + line->emeta->mem = 0; + atomic_set(&line->emeta->sync, 0); +} + +/* For now lines are always assumed full lines. Thus, smeta former and current + * lun bitmaps are omitted. + */ +static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, + struct pblk_line *cur) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_emeta *emeta = line->emeta; + struct line_emeta *emeta_buf = emeta->buf; + struct line_smeta *smeta_buf = (struct line_smeta *)line->smeta; + int nr_blk_line; + + /* After erasing the line, new bad blocks might appear and we risk + * having an invalid line + */ + nr_blk_line = lm->blk_per_line - + bitmap_weight(line->blk_bitmap, lm->blk_per_line); + if (nr_blk_line < lm->min_blk_line) { + spin_lock(&l_mg->free_lock); + spin_lock(&line->lock); + line->state = PBLK_LINESTATE_BAD; + spin_unlock(&line->lock); + + list_add_tail(&line->list, &l_mg->bad_list); + spin_unlock(&l_mg->free_lock); + + pblk_debug(pblk, "line %d is bad\n", line->id); + + return 0; + } + + /* Run-time metadata */ + line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta); + + /* Mark LUNs allocated in this line (all for now) */ + bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len); + + smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); + memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); + smeta_buf->header.id = cpu_to_le32(line->id); + smeta_buf->header.type = cpu_to_le16(line->type); + smeta_buf->header.version_major = SMETA_VERSION_MAJOR; + smeta_buf->header.version_minor = SMETA_VERSION_MINOR; + + /* Start metadata */ + smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); + smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns); + + /* Fill metadata among lines */ + if (cur) { + memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len); + smeta_buf->prev_id = cpu_to_le32(cur->id); + cur->emeta->buf->next_id = cpu_to_le32(line->id); + } else { + smeta_buf->prev_id = cpu_to_le32(PBLK_LINE_EMPTY); + } + + /* All smeta must be set at this point */ + smeta_buf->header.crc = cpu_to_le32( + pblk_calc_meta_header_crc(pblk, &smeta_buf->header)); + smeta_buf->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta_buf)); + + /* End metadata */ + memcpy(&emeta_buf->header, &smeta_buf->header, + sizeof(struct line_header)); + + emeta_buf->header.version_major = EMETA_VERSION_MAJOR; + emeta_buf->header.version_minor = EMETA_VERSION_MINOR; + emeta_buf->header.crc = cpu_to_le32( + pblk_calc_meta_header_crc(pblk, &emeta_buf->header)); + + emeta_buf->seq_nr = cpu_to_le64(line->seq_nr); + emeta_buf->nr_lbas = cpu_to_le64(line->sec_in_line); + emeta_buf->nr_valid_lbas = cpu_to_le64(0); + emeta_buf->next_id = cpu_to_le32(PBLK_LINE_EMPTY); + emeta_buf->crc = cpu_to_le32(0); + emeta_buf->prev_id = smeta_buf->prev_id; + + return 1; +} + +static int pblk_line_alloc_bitmaps(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + + line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!line->map_bitmap) + return -ENOMEM; + + /* will be initialized using bb info from map_bitmap */ + line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!line->invalid_bitmap) { + kfree(line->map_bitmap); + line->map_bitmap = NULL; + return -ENOMEM; + } + + return 0; +} + +/* For now lines are always assumed full lines. Thus, smeta former and current + * lun bitmaps are omitted. + */ +static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, + int init) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + u64 off; + int bit = -1; + int emeta_secs; + + line->sec_in_line = lm->sec_per_line; + + /* Capture bad block information on line mapping bitmaps */ + while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line, + bit + 1)) < lm->blk_per_line) { + off = bit * geo->ws_opt; + bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off, + lm->sec_per_line); + bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, + lm->sec_per_line); + line->sec_in_line -= geo->clba; + } + + /* Mark smeta metadata sectors as bad sectors */ + bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); + off = bit * geo->ws_opt; + bitmap_set(line->map_bitmap, off, lm->smeta_sec); + line->sec_in_line -= lm->smeta_sec; + line->smeta_ssec = off; + line->cur_sec = off + lm->smeta_sec; + + if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) { + pblk_debug(pblk, "line smeta I/O failed. Retry\n"); + return 0; + } + + bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line); + + /* Mark emeta metadata sectors as bad sectors. We need to consider bad + * blocks to make sure that there are enough sectors to store emeta + */ + emeta_secs = lm->emeta_sec[0]; + off = lm->sec_per_line; + while (emeta_secs) { + off -= geo->ws_opt; + if (!test_bit(off, line->invalid_bitmap)) { + bitmap_set(line->invalid_bitmap, off, geo->ws_opt); + emeta_secs -= geo->ws_opt; + } + } + + line->emeta_ssec = off; + line->sec_in_line -= lm->emeta_sec[0]; + line->nr_valid_lbas = 0; + line->left_msecs = line->sec_in_line; + *line->vsc = cpu_to_le32(line->sec_in_line); + + if (lm->sec_per_line - line->sec_in_line != + bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) { + spin_lock(&line->lock); + line->state = PBLK_LINESTATE_BAD; + spin_unlock(&line->lock); + + list_add_tail(&line->list, &l_mg->bad_list); + pblk_err(pblk, "unexpected line %d is bad\n", line->id); + + return 0; + } + + return 1; +} + +static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int blk_to_erase = atomic_read(&line->blk_in_line); + int i; + + for (i = 0; i < lm->blk_per_line; i++) { + struct pblk_lun *rlun = &pblk->luns[i]; + int pos = pblk_ppa_to_pos(geo, rlun->bppa); + int state = line->chks[pos].state; + + /* Free chunks should not be erased */ + if (state & NVM_CHK_ST_FREE) { + set_bit(pblk_ppa_to_pos(geo, rlun->bppa), + line->erase_bitmap); + blk_to_erase--; + } + } + + return blk_to_erase; +} + +static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + int blk_in_line = atomic_read(&line->blk_in_line); + int blk_to_erase; + + /* Bad blocks do not need to be erased */ + bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line); + + spin_lock(&line->lock); + + /* If we have not written to this line, we need to mark up free chunks + * as already erased + */ + if (line->state == PBLK_LINESTATE_NEW) { + blk_to_erase = pblk_prepare_new_line(pblk, line); + line->state = PBLK_LINESTATE_FREE; + } else { + blk_to_erase = blk_in_line; + } + + if (blk_in_line < lm->min_blk_line) { + spin_unlock(&line->lock); + return -EAGAIN; + } + + if (line->state != PBLK_LINESTATE_FREE) { + WARN(1, "pblk: corrupted line %d, state %d\n", + line->id, line->state); + spin_unlock(&line->lock); + return -EINTR; + } + + line->state = PBLK_LINESTATE_OPEN; + + atomic_set(&line->left_eblks, blk_to_erase); + atomic_set(&line->left_seblks, blk_to_erase); + + line->meta_distance = lm->meta_distance; + spin_unlock(&line->lock); + + kref_init(&line->ref); + + return 0; +} + +int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + int ret; + + spin_lock(&l_mg->free_lock); + l_mg->data_line = line; + list_del(&line->list); + + ret = pblk_line_prepare(pblk, line); + if (ret) { + list_add(&line->list, &l_mg->free_list); + spin_unlock(&l_mg->free_lock); + return ret; + } + spin_unlock(&l_mg->free_lock); + + ret = pblk_line_alloc_bitmaps(pblk, line); + if (ret) + goto fail; + + if (!pblk_line_init_bb(pblk, line, 0)) { + ret = -EINTR; + goto fail; + } + + pblk_rl_free_lines_dec(&pblk->rl, line, true); + return 0; + +fail: + spin_lock(&l_mg->free_lock); + list_add(&line->list, &l_mg->free_list); + spin_unlock(&l_mg->free_lock); + + return ret; +} + +void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) +{ + kfree(line->map_bitmap); + line->map_bitmap = NULL; + line->smeta = NULL; + line->emeta = NULL; +} + +static void pblk_line_reinit(struct pblk_line *line) +{ + *line->vsc = cpu_to_le32(EMPTY_ENTRY); + + line->map_bitmap = NULL; + line->invalid_bitmap = NULL; + line->smeta = NULL; + line->emeta = NULL; +} + +void pblk_line_free(struct pblk_line *line) +{ + kfree(line->map_bitmap); + kfree(line->invalid_bitmap); + + pblk_line_reinit(line); +} + +struct pblk_line *pblk_line_get(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *line; + int ret, bit; + + lockdep_assert_held(&l_mg->free_lock); + +retry: + if (list_empty(&l_mg->free_list)) { + pblk_err(pblk, "no free lines\n"); + return NULL; + } + + line = list_first_entry(&l_mg->free_list, struct pblk_line, list); + list_del(&line->list); + l_mg->nr_free_lines--; + + bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); + if (unlikely(bit >= lm->blk_per_line)) { + spin_lock(&line->lock); + line->state = PBLK_LINESTATE_BAD; + spin_unlock(&line->lock); + + list_add_tail(&line->list, &l_mg->bad_list); + + pblk_debug(pblk, "line %d is bad\n", line->id); + goto retry; + } + + ret = pblk_line_prepare(pblk, line); + if (ret) { + switch (ret) { + case -EAGAIN: + list_add(&line->list, &l_mg->bad_list); + goto retry; + case -EINTR: + list_add(&line->list, &l_mg->corrupt_list); + goto retry; + default: + pblk_err(pblk, "failed to prepare line %d\n", line->id); + list_add(&line->list, &l_mg->free_list); + l_mg->nr_free_lines++; + return NULL; + } + } + + return line; +} + +static struct pblk_line *pblk_line_retry(struct pblk *pblk, + struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *retry_line; + +retry: + spin_lock(&l_mg->free_lock); + retry_line = pblk_line_get(pblk); + if (!retry_line) { + l_mg->data_line = NULL; + spin_unlock(&l_mg->free_lock); + return NULL; + } + + retry_line->map_bitmap = line->map_bitmap; + retry_line->invalid_bitmap = line->invalid_bitmap; + retry_line->smeta = line->smeta; + retry_line->emeta = line->emeta; + retry_line->meta_line = line->meta_line; + + pblk_line_reinit(line); + + l_mg->data_line = retry_line; + spin_unlock(&l_mg->free_lock); + + pblk_rl_free_lines_dec(&pblk->rl, line, false); + + if (pblk_line_erase(pblk, retry_line)) + goto retry; + + return retry_line; +} + +static void pblk_set_space_limit(struct pblk *pblk) +{ + struct pblk_rl *rl = &pblk->rl; + + atomic_set(&rl->rb_space, 0); +} + +struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *line; + + spin_lock(&l_mg->free_lock); + line = pblk_line_get(pblk); + if (!line) { + spin_unlock(&l_mg->free_lock); + return NULL; + } + + line->seq_nr = l_mg->d_seq_nr++; + line->type = PBLK_LINETYPE_DATA; + l_mg->data_line = line; + + pblk_line_setup_metadata(line, l_mg, &pblk->lm); + + /* Allocate next line for preparation */ + l_mg->data_next = pblk_line_get(pblk); + if (!l_mg->data_next) { + /* If we cannot get a new line, we need to stop the pipeline. + * Only allow as many writes in as we can store safely and then + * fail gracefully + */ + pblk_set_space_limit(pblk); + + l_mg->data_next = NULL; + } else { + l_mg->data_next->seq_nr = l_mg->d_seq_nr++; + l_mg->data_next->type = PBLK_LINETYPE_DATA; + } + spin_unlock(&l_mg->free_lock); + + if (pblk_line_alloc_bitmaps(pblk, line)) + return NULL; + + if (pblk_line_erase(pblk, line)) { + line = pblk_line_retry(pblk, line); + if (!line) + return NULL; + } + +retry_setup: + if (!pblk_line_init_metadata(pblk, line, NULL)) { + line = pblk_line_retry(pblk, line); + if (!line) + return NULL; + + goto retry_setup; + } + + if (!pblk_line_init_bb(pblk, line, 1)) { + line = pblk_line_retry(pblk, line); + if (!line) + return NULL; + + goto retry_setup; + } + + pblk_rl_free_lines_dec(&pblk->rl, line, true); + + return line; +} + +static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) +{ + lockdep_assert_held(&pblk->l_mg.free_lock); + + pblk_set_space_limit(pblk); + pblk->state = PBLK_STATE_STOPPING; +} + +static void pblk_line_close_meta_sync(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *line, *tline; + LIST_HEAD(list); + + spin_lock(&l_mg->close_lock); + if (list_empty(&l_mg->emeta_list)) { + spin_unlock(&l_mg->close_lock); + return; + } + + list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); + spin_unlock(&l_mg->close_lock); + + list_for_each_entry_safe(line, tline, &list, list) { + struct pblk_emeta *emeta = line->emeta; + + while (emeta->mem < lm->emeta_len[0]) { + int ret; + + ret = pblk_submit_meta_io(pblk, line); + if (ret) { + pblk_err(pblk, "sync meta line %d failed (%d)\n", + line->id, ret); + return; + } + } + } + + pblk_wait_for_meta(pblk); + flush_workqueue(pblk->close_wq); +} + +void __pblk_pipeline_flush(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + int ret; + + spin_lock(&l_mg->free_lock); + if (pblk->state == PBLK_STATE_RECOVERING || + pblk->state == PBLK_STATE_STOPPED) { + spin_unlock(&l_mg->free_lock); + return; + } + pblk->state = PBLK_STATE_RECOVERING; + spin_unlock(&l_mg->free_lock); + + pblk_flush_writer(pblk); + pblk_wait_for_meta(pblk); + + ret = pblk_recov_pad(pblk); + if (ret) { + pblk_err(pblk, "could not close data on teardown(%d)\n", ret); + return; + } + + flush_workqueue(pblk->bb_wq); + pblk_line_close_meta_sync(pblk); +} + +void __pblk_pipeline_stop(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + + spin_lock(&l_mg->free_lock); + pblk->state = PBLK_STATE_STOPPED; + l_mg->data_line = NULL; + l_mg->data_next = NULL; + spin_unlock(&l_mg->free_lock); +} + +void pblk_pipeline_stop(struct pblk *pblk) +{ + __pblk_pipeline_flush(pblk); + __pblk_pipeline_stop(pblk); +} + +struct pblk_line *pblk_line_replace_data(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *cur, *new = NULL; + unsigned int left_seblks; + + new = l_mg->data_next; + if (!new) + goto out; + + spin_lock(&l_mg->free_lock); + cur = l_mg->data_line; + l_mg->data_line = new; + + pblk_line_setup_metadata(new, l_mg, &pblk->lm); + spin_unlock(&l_mg->free_lock); + +retry_erase: + left_seblks = atomic_read(&new->left_seblks); + if (left_seblks) { + /* If line is not fully erased, erase it */ + if (atomic_read(&new->left_eblks)) { + if (pblk_line_erase(pblk, new)) + goto out; + } else { + io_schedule(); + } + goto retry_erase; + } + + if (pblk_line_alloc_bitmaps(pblk, new)) + return NULL; + +retry_setup: + if (!pblk_line_init_metadata(pblk, new, cur)) { + new = pblk_line_retry(pblk, new); + if (!new) + goto out; + + goto retry_setup; + } + + if (!pblk_line_init_bb(pblk, new, 1)) { + new = pblk_line_retry(pblk, new); + if (!new) + goto out; + + goto retry_setup; + } + + pblk_rl_free_lines_dec(&pblk->rl, new, true); + + /* Allocate next line for preparation */ + spin_lock(&l_mg->free_lock); + l_mg->data_next = pblk_line_get(pblk); + if (!l_mg->data_next) { + /* If we cannot get a new line, we need to stop the pipeline. + * Only allow as many writes in as we can store safely and then + * fail gracefully + */ + pblk_stop_writes(pblk, new); + l_mg->data_next = NULL; + } else { + l_mg->data_next->seq_nr = l_mg->d_seq_nr++; + l_mg->data_next->type = PBLK_LINETYPE_DATA; + } + spin_unlock(&l_mg->free_lock); + +out: + return new; +} + +static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; + + spin_lock(&line->lock); + WARN_ON(line->state != PBLK_LINESTATE_GC); + line->state = PBLK_LINESTATE_FREE; + line->gc_group = PBLK_LINEGC_NONE; + pblk_line_free(line); + + if (line->w_err_gc->has_write_err) { + pblk_rl_werr_line_out(&pblk->rl); + line->w_err_gc->has_write_err = 0; + } + + spin_unlock(&line->lock); + atomic_dec(&gc->pipeline_gc); + + spin_lock(&l_mg->free_lock); + list_add_tail(&line->list, &l_mg->free_list); + l_mg->nr_free_lines++; + spin_unlock(&l_mg->free_lock); + + pblk_rl_free_lines_inc(&pblk->rl, line); +} + +static void pblk_line_put_ws(struct work_struct *work) +{ + struct pblk_line_ws *line_put_ws = container_of(work, + struct pblk_line_ws, ws); + struct pblk *pblk = line_put_ws->pblk; + struct pblk_line *line = line_put_ws->line; + + __pblk_line_put(pblk, line); + mempool_free(line_put_ws, &pblk->gen_ws_pool); +} + +void pblk_line_put(struct kref *ref) +{ + struct pblk_line *line = container_of(ref, struct pblk_line, ref); + struct pblk *pblk = line->pblk; + + __pblk_line_put(pblk, line); +} + +void pblk_line_put_wq(struct kref *ref) +{ + struct pblk_line *line = container_of(ref, struct pblk_line, ref); + struct pblk *pblk = line->pblk; + struct pblk_line_ws *line_put_ws; + + line_put_ws = mempool_alloc(&pblk->gen_ws_pool, GFP_ATOMIC); + if (!line_put_ws) + return; + + line_put_ws->pblk = pblk; + line_put_ws->line = line; + line_put_ws->priv = NULL; + + INIT_WORK(&line_put_ws->ws, pblk_line_put_ws); + queue_work(pblk->r_end_wq, &line_put_ws->ws); +} + +int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) +{ + struct nvm_rq *rqd; + int err; + + rqd = pblk_alloc_rqd(pblk, PBLK_ERASE); + + pblk_setup_e_rq(pblk, rqd, ppa); + + rqd->end_io = pblk_end_io_erase; + rqd->private = pblk; + + /* The write thread schedules erases so that it minimizes disturbances + * with writes. Thus, there is no need to take the LUN semaphore. + */ + err = pblk_submit_io(pblk, rqd); + if (err) { + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + pblk_err(pblk, "could not async erase line:%d,blk:%d\n", + pblk_ppa_to_line(ppa), + pblk_ppa_to_pos(geo, ppa)); + } + + return err; +} + +struct pblk_line *pblk_line_get_data(struct pblk *pblk) +{ + return pblk->l_mg.data_line; +} + +/* For now, always erase next line */ +struct pblk_line *pblk_line_get_erase(struct pblk *pblk) +{ + return pblk->l_mg.data_next; +} + +int pblk_line_is_full(struct pblk_line *line) +{ + return (line->left_msecs == 0); +} + +static void pblk_line_should_sync_meta(struct pblk *pblk) +{ + if (pblk_rl_is_limit(&pblk->rl)) + pblk_line_close_meta_sync(pblk); +} + +void pblk_line_close(struct pblk *pblk, struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct list_head *move_list; + int i; + +#ifdef CONFIG_NVM_PBLK_DEBUG + WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), + "pblk: corrupt closed line %d\n", line->id); +#endif + + spin_lock(&l_mg->free_lock); + WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap)); + spin_unlock(&l_mg->free_lock); + + spin_lock(&l_mg->gc_lock); + spin_lock(&line->lock); + WARN_ON(line->state != PBLK_LINESTATE_OPEN); + line->state = PBLK_LINESTATE_CLOSED; + move_list = pblk_line_gc_list(pblk, line); + + list_add_tail(&line->list, move_list); + + kfree(line->map_bitmap); + line->map_bitmap = NULL; + line->smeta = NULL; + line->emeta = NULL; + + for (i = 0; i < lm->blk_per_line; i++) { + struct pblk_lun *rlun = &pblk->luns[i]; + int pos = pblk_ppa_to_pos(geo, rlun->bppa); + int state = line->chks[pos].state; + + if (!(state & NVM_CHK_ST_OFFLINE)) + state = NVM_CHK_ST_CLOSED; + } + + spin_unlock(&line->lock); + spin_unlock(&l_mg->gc_lock); +} + +void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_emeta *emeta = line->emeta; + struct line_emeta *emeta_buf = emeta->buf; + struct wa_counters *wa = emeta_to_wa(lm, emeta_buf); + + /* No need for exact vsc value; avoid a big line lock and take aprox. */ + memcpy(emeta_to_vsc(pblk, emeta_buf), l_mg->vsc_list, lm->vsc_list_len); + memcpy(emeta_to_bb(emeta_buf), line->blk_bitmap, lm->blk_bitmap_len); + + wa->user = cpu_to_le64(atomic64_read(&pblk->user_wa)); + wa->pad = cpu_to_le64(atomic64_read(&pblk->pad_wa)); + wa->gc = cpu_to_le64(atomic64_read(&pblk->gc_wa)); + + if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) { + emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); + memcpy(emeta_buf->header.uuid, pblk->instance_uuid, 16); + emeta_buf->header.id = cpu_to_le32(line->id); + emeta_buf->header.type = cpu_to_le16(line->type); + emeta_buf->header.version_major = EMETA_VERSION_MAJOR; + emeta_buf->header.version_minor = EMETA_VERSION_MINOR; + emeta_buf->header.crc = cpu_to_le32( + pblk_calc_meta_header_crc(pblk, &emeta_buf->header)); + } + + emeta_buf->nr_valid_lbas = cpu_to_le64(line->nr_valid_lbas); + emeta_buf->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, emeta_buf)); + + spin_lock(&l_mg->close_lock); + spin_lock(&line->lock); + + /* Update the in-memory start address for emeta, in case it has + * shifted due to write errors + */ + if (line->emeta_ssec != line->cur_sec) + line->emeta_ssec = line->cur_sec; + + list_add_tail(&line->list, &l_mg->emeta_list); + spin_unlock(&line->lock); + spin_unlock(&l_mg->close_lock); + + pblk_line_should_sync_meta(pblk); +} + +static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + unsigned int lba_list_size = lm->emeta_len[2]; + struct pblk_w_err_gc *w_err_gc = line->w_err_gc; + struct pblk_emeta *emeta = line->emeta; + + w_err_gc->lba_list = pblk_malloc(lba_list_size, + l_mg->emeta_alloc_type, GFP_KERNEL); + memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf), + lba_list_size); +} + +void pblk_line_close_ws(struct work_struct *work) +{ + struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, + ws); + struct pblk *pblk = line_ws->pblk; + struct pblk_line *line = line_ws->line; + struct pblk_w_err_gc *w_err_gc = line->w_err_gc; + + /* Write errors makes the emeta start address stored in smeta invalid, + * so keep a copy of the lba list until we've gc'd the line + */ + if (w_err_gc->has_write_err) + pblk_save_lba_list(pblk, line); + + pblk_line_close(pblk, line); + mempool_free(line_ws, &pblk->gen_ws_pool); +} + +void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, + void (*work)(struct work_struct *), gfp_t gfp_mask, + struct workqueue_struct *wq) +{ + struct pblk_line_ws *line_ws; + + line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask); + + line_ws->pblk = pblk; + line_ws->line = line; + line_ws->priv = priv; + + INIT_WORK(&line_ws->ws, work); + queue_work(wq, &line_ws->ws); +} + +static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, + int nr_ppas, int pos) +{ + struct pblk_lun *rlun = &pblk->luns[pos]; + int ret; + + /* + * Only send one inflight I/O per LUN. Since we map at a page + * granurality, all ppas in the I/O will map to the same LUN + */ +#ifdef CONFIG_NVM_PBLK_DEBUG + int i; + + for (i = 1; i < nr_ppas; i++) + WARN_ON(ppa_list[0].a.lun != ppa_list[i].a.lun || + ppa_list[0].a.ch != ppa_list[i].a.ch); +#endif + + ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); + if (ret == -ETIME || ret == -EINTR) + pblk_err(pblk, "taking lun semaphore timed out: err %d\n", + -ret); +} + +void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int pos = pblk_ppa_to_pos(geo, ppa_list[0]); + + __pblk_down_page(pblk, ppa_list, nr_ppas, pos); +} + +void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, + unsigned long *lun_bitmap) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int pos = pblk_ppa_to_pos(geo, ppa_list[0]); + + /* If the LUN has been locked for this same request, do no attempt to + * lock it again + */ + if (test_and_set_bit(pos, lun_bitmap)) + return; + + __pblk_down_page(pblk, ppa_list, nr_ppas, pos); +} + +void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_lun *rlun; + int pos = pblk_ppa_to_pos(geo, ppa_list[0]); + +#ifdef CONFIG_NVM_PBLK_DEBUG + int i; + + for (i = 1; i < nr_ppas; i++) + WARN_ON(ppa_list[0].a.lun != ppa_list[i].a.lun || + ppa_list[0].a.ch != ppa_list[i].a.ch); +#endif + + rlun = &pblk->luns[pos]; + up(&rlun->wr_sem); +} + +void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, + unsigned long *lun_bitmap) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_lun *rlun; + int num_lun = geo->all_luns; + int bit = -1; + + while ((bit = find_next_bit(lun_bitmap, num_lun, bit + 1)) < num_lun) { + rlun = &pblk->luns[bit]; + up(&rlun->wr_sem); + } +} + +void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) +{ + struct ppa_addr ppa_l2p; + + /* logic error: lba out-of-bounds. Ignore update */ + if (!(lba < pblk->rl.nr_secs)) { + WARN(1, "pblk: corrupted L2P map request\n"); + return; + } + + spin_lock(&pblk->trans_lock); + ppa_l2p = pblk_trans_map_get(pblk, lba); + + if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)) + pblk_map_invalidate(pblk, ppa_l2p); + + pblk_trans_map_set(pblk, lba, ppa); + spin_unlock(&pblk->trans_lock); +} + +void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) +{ + +#ifdef CONFIG_NVM_PBLK_DEBUG + /* Callers must ensure that the ppa points to a cache address */ + BUG_ON(!pblk_addr_in_cache(ppa)); + BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); +#endif + + pblk_update_map(pblk, lba, ppa); +} + +int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, + struct pblk_line *gc_line, u64 paddr_gc) +{ + struct ppa_addr ppa_l2p, ppa_gc; + int ret = 1; + +#ifdef CONFIG_NVM_PBLK_DEBUG + /* Callers must ensure that the ppa points to a cache address */ + BUG_ON(!pblk_addr_in_cache(ppa_new)); + BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); +#endif + + /* logic error: lba out-of-bounds. Ignore update */ + if (!(lba < pblk->rl.nr_secs)) { + WARN(1, "pblk: corrupted L2P map request\n"); + return 0; + } + + spin_lock(&pblk->trans_lock); + ppa_l2p = pblk_trans_map_get(pblk, lba); + ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id); + + if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) { + spin_lock(&gc_line->lock); + WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap), + "pblk: corrupted GC update"); + spin_unlock(&gc_line->lock); + + ret = 0; + goto out; + } + + pblk_trans_map_set(pblk, lba, ppa_new); +out: + spin_unlock(&pblk->trans_lock); + return ret; +} + +void pblk_update_map_dev(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache) +{ + struct ppa_addr ppa_l2p; + +#ifdef CONFIG_NVM_PBLK_DEBUG + /* Callers must ensure that the ppa points to a device address */ + BUG_ON(pblk_addr_in_cache(ppa_mapped)); +#endif + /* Invalidate and discard padded entries */ + if (lba == ADDR_EMPTY) { + atomic64_inc(&pblk->pad_wa); +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_inc(&pblk->padded_wb); +#endif + if (!pblk_ppa_empty(ppa_mapped)) + pblk_map_invalidate(pblk, ppa_mapped); + return; + } + + /* logic error: lba out-of-bounds. Ignore update */ + if (!(lba < pblk->rl.nr_secs)) { + WARN(1, "pblk: corrupted L2P map request\n"); + return; + } + + spin_lock(&pblk->trans_lock); + ppa_l2p = pblk_trans_map_get(pblk, lba); + + /* Do not update L2P if the cacheline has been updated. In this case, + * the mapped ppa must be invalidated + */ + if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) { + if (!pblk_ppa_empty(ppa_mapped)) + pblk_map_invalidate(pblk, ppa_mapped); + goto out; + } + +#ifdef CONFIG_NVM_PBLK_DEBUG + WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); +#endif + + pblk_trans_map_set(pblk, lba, ppa_mapped); +out: + spin_unlock(&pblk->trans_lock); +} + +void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, + sector_t blba, int nr_secs) +{ + int i; + + spin_lock(&pblk->trans_lock); + for (i = 0; i < nr_secs; i++) { + struct ppa_addr ppa; + + ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i); + + /* If the L2P entry maps to a line, the reference is valid */ + if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { + int line_id = pblk_ppa_to_line(ppa); + struct pblk_line *line = &pblk->lines[line_id]; + + kref_get(&line->ref); + } + } + spin_unlock(&pblk->trans_lock); +} + +void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, + u64 *lba_list, int nr_secs) +{ + u64 lba; + int i; + + spin_lock(&pblk->trans_lock); + for (i = 0; i < nr_secs; i++) { + lba = lba_list[i]; + if (lba != ADDR_EMPTY) { + /* logic error: lba out-of-bounds. Ignore update */ + if (!(lba < pblk->rl.nr_secs)) { + WARN(1, "pblk: corrupted L2P map request\n"); + continue; + } + ppas[i] = pblk_trans_map_get(pblk, lba); + } + } + spin_unlock(&pblk->trans_lock); +} diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c new file mode 100644 index 000000000..157c2567c --- /dev/null +++ b/drivers/lightnvm/pblk-gc.c @@ -0,0 +1,705 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-gc.c - pblk's garbage collector + */ + +#include "pblk.h" +#include <linux/delay.h> + +static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) +{ + if (gc_rq->data) + vfree(gc_rq->data); + kfree(gc_rq); +} + +static int pblk_gc_write(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + struct pblk_gc_rq *gc_rq, *tgc_rq; + LIST_HEAD(w_list); + + spin_lock(&gc->w_lock); + if (list_empty(&gc->w_list)) { + spin_unlock(&gc->w_lock); + return 1; + } + + list_cut_position(&w_list, &gc->w_list, gc->w_list.prev); + gc->w_entries = 0; + spin_unlock(&gc->w_lock); + + list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { + pblk_write_gc_to_cache(pblk, gc_rq); + list_del(&gc_rq->list); + kref_put(&gc_rq->line->ref, pblk_line_put); + pblk_gc_free_gc_rq(gc_rq); + } + + return 0; +} + +static void pblk_gc_writer_kick(struct pblk_gc *gc) +{ + wake_up_process(gc->gc_writer_ts); +} + +static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct list_head *move_list; + + spin_lock(&line->lock); + WARN_ON(line->state != PBLK_LINESTATE_GC); + line->state = PBLK_LINESTATE_CLOSED; + move_list = pblk_line_gc_list(pblk, line); + spin_unlock(&line->lock); + + if (move_list) { + spin_lock(&l_mg->gc_lock); + list_add_tail(&line->list, move_list); + spin_unlock(&l_mg->gc_lock); + } +} + +static void pblk_gc_line_ws(struct work_struct *work) +{ + struct pblk_line_ws *gc_rq_ws = container_of(work, + struct pblk_line_ws, ws); + struct pblk *pblk = gc_rq_ws->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line = gc_rq_ws->line; + struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; + int ret; + + up(&gc->gc_sem); + + gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs)); + if (!gc_rq->data) { + pblk_err(pblk, "could not GC line:%d (%d/%d)\n", + line->id, *line->vsc, gc_rq->nr_secs); + goto out; + } + + /* Read from GC victim block */ + ret = pblk_submit_read_gc(pblk, gc_rq); + if (ret) { + pblk_err(pblk, "failed GC read in line:%d (err:%d)\n", + line->id, ret); + goto out; + } + + if (!gc_rq->secs_to_gc) + goto out; + +retry: + spin_lock(&gc->w_lock); + if (gc->w_entries >= PBLK_GC_RQ_QD) { + spin_unlock(&gc->w_lock); + pblk_gc_writer_kick(&pblk->gc); + usleep_range(128, 256); + goto retry; + } + gc->w_entries++; + list_add_tail(&gc_rq->list, &gc->w_list); + spin_unlock(&gc->w_lock); + + pblk_gc_writer_kick(&pblk->gc); + + kfree(gc_rq_ws); + return; + +out: + pblk_gc_free_gc_rq(gc_rq); + kref_put(&line->ref, pblk_line_put); + kfree(gc_rq_ws); +} + +static __le64 *get_lba_list_from_emeta(struct pblk *pblk, + struct pblk_line *line) +{ + struct line_emeta *emeta_buf; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + unsigned int lba_list_size = lm->emeta_len[2]; + __le64 *lba_list; + int ret; + + emeta_buf = pblk_malloc(lm->emeta_len[0], + l_mg->emeta_alloc_type, GFP_KERNEL); + if (!emeta_buf) + return NULL; + + ret = pblk_line_read_emeta(pblk, line, emeta_buf); + if (ret) { + pblk_err(pblk, "line %d read emeta failed (%d)\n", + line->id, ret); + pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + return NULL; + } + + /* If this read fails, it means that emeta is corrupted. + * For now, leave the line untouched. + * TODO: Implement a recovery routine that scans and moves + * all sectors on the line. + */ + + ret = pblk_recov_check_emeta(pblk, emeta_buf); + if (ret) { + pblk_err(pblk, "inconsistent emeta (line %d)\n", + line->id); + pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + return NULL; + } + + lba_list = pblk_malloc(lba_list_size, + l_mg->emeta_alloc_type, GFP_KERNEL); + if (lba_list) + memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size); + + pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + + return lba_list; +} + +static void pblk_gc_line_prepare_ws(struct work_struct *work) +{ + struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, + ws); + struct pblk *pblk = line_ws->pblk; + struct pblk_line *line = line_ws->line; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_gc *gc = &pblk->gc; + struct pblk_line_ws *gc_rq_ws; + struct pblk_gc_rq *gc_rq; + __le64 *lba_list; + unsigned long *invalid_bitmap; + int sec_left, nr_secs, bit; + + invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!invalid_bitmap) + goto fail_free_ws; + + if (line->w_err_gc->has_write_err) { + lba_list = line->w_err_gc->lba_list; + line->w_err_gc->lba_list = NULL; + } else { + lba_list = get_lba_list_from_emeta(pblk, line); + if (!lba_list) { + pblk_err(pblk, "could not interpret emeta (line %d)\n", + line->id); + goto fail_free_invalid_bitmap; + } + } + + spin_lock(&line->lock); + bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line); + sec_left = pblk_line_vsc(line); + spin_unlock(&line->lock); + + if (sec_left < 0) { + pblk_err(pblk, "corrupted GC line (%d)\n", line->id); + goto fail_free_lba_list; + } + + bit = -1; +next_rq: + gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); + if (!gc_rq) + goto fail_free_lba_list; + + nr_secs = 0; + do { + bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line, + bit + 1); + if (bit > line->emeta_ssec) + break; + + gc_rq->paddr_list[nr_secs] = bit; + gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); + } while (nr_secs < pblk->max_write_pgs); + + if (unlikely(!nr_secs)) { + kfree(gc_rq); + goto out; + } + + gc_rq->nr_secs = nr_secs; + gc_rq->line = line; + + gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); + if (!gc_rq_ws) + goto fail_free_gc_rq; + + gc_rq_ws->pblk = pblk; + gc_rq_ws->line = line; + gc_rq_ws->priv = gc_rq; + + /* The write GC path can be much slower than the read GC one due to + * the budget imposed by the rate-limiter. Balance in case that we get + * back pressure from the write GC path. + */ + while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000))) + io_schedule(); + + kref_get(&line->ref); + + INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); + queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws); + + sec_left -= nr_secs; + if (sec_left > 0) + goto next_rq; + +out: + pblk_mfree(lba_list, l_mg->emeta_alloc_type); + kfree(line_ws); + kfree(invalid_bitmap); + + kref_put(&line->ref, pblk_line_put); + atomic_dec(&gc->read_inflight_gc); + + return; + +fail_free_gc_rq: + kfree(gc_rq); +fail_free_lba_list: + pblk_mfree(lba_list, l_mg->emeta_alloc_type); +fail_free_invalid_bitmap: + kfree(invalid_bitmap); +fail_free_ws: + kfree(line_ws); + + pblk_put_line_back(pblk, line); + kref_put(&line->ref, pblk_line_put); + atomic_dec(&gc->read_inflight_gc); + + pblk_err(pblk, "failed to GC line %d\n", line->id); +} + +static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_gc *gc = &pblk->gc; + struct pblk_line_ws *line_ws; + + pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id); + + line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); + if (!line_ws) + return -ENOMEM; + + line_ws->pblk = pblk; + line_ws->line = line; + + atomic_inc(&gc->pipeline_gc); + INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); + queue_work(gc->gc_reader_wq, &line_ws->ws); + + return 0; +} + +static void pblk_gc_reader_kick(struct pblk_gc *gc) +{ + wake_up_process(gc->gc_reader_ts); +} + +static void pblk_gc_kick(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + pblk_gc_writer_kick(gc); + pblk_gc_reader_kick(gc); + + /* If we're shutting down GC, let's not start it up again */ + if (gc->gc_enabled) { + wake_up_process(gc->gc_ts); + mod_timer(&gc->gc_timer, + jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + } +} + +static int pblk_gc_read(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line; + + spin_lock(&gc->r_lock); + if (list_empty(&gc->r_list)) { + spin_unlock(&gc->r_lock); + return 1; + } + + line = list_first_entry(&gc->r_list, struct pblk_line, list); + list_del(&line->list); + spin_unlock(&gc->r_lock); + + pblk_gc_kick(pblk); + + if (pblk_gc_line(pblk, line)) + pblk_err(pblk, "failed to GC line %d\n", line->id); + + return 0; +} + +static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, + struct list_head *group_list) +{ + struct pblk_line *line, *victim; + int line_vsc, victim_vsc; + + victim = list_first_entry(group_list, struct pblk_line, list); + list_for_each_entry(line, group_list, list) { + line_vsc = le32_to_cpu(*line->vsc); + victim_vsc = le32_to_cpu(*victim->vsc); + if (line_vsc < victim_vsc) + victim = line; + } + + return victim; +} + +static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) +{ + unsigned int nr_blocks_free, nr_blocks_need; + unsigned int werr_lines = atomic_read(&rl->werr_lines); + + nr_blocks_need = pblk_rl_high_thrs(rl); + nr_blocks_free = pblk_rl_nr_free_blks(rl); + + /* This is not critical, no need to take lock here */ + return ((werr_lines > 0) || + ((gc->gc_active) && (nr_blocks_need > nr_blocks_free))); +} + +void pblk_gc_free_full_lines(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line; + + do { + spin_lock(&l_mg->gc_lock); + if (list_empty(&l_mg->gc_full_list)) { + spin_unlock(&l_mg->gc_lock); + return; + } + + line = list_first_entry(&l_mg->gc_full_list, + struct pblk_line, list); + + spin_lock(&line->lock); + WARN_ON(line->state != PBLK_LINESTATE_CLOSED); + line->state = PBLK_LINESTATE_GC; + spin_unlock(&line->lock); + + list_del(&line->list); + spin_unlock(&l_mg->gc_lock); + + atomic_inc(&gc->pipeline_gc); + kref_put(&line->ref, pblk_line_put); + } while (1); +} + +/* + * Lines with no valid sectors will be returned to the free list immediately. If + * GC is activated - either because the free block count is under the determined + * threshold, or because it is being forced from user space - only lines with a + * high count of invalid sectors will be recycled. + */ +static void pblk_gc_run(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line; + struct list_head *group_list; + bool run_gc; + int read_inflight_gc, gc_group = 0, prev_group = 0; + + pblk_gc_free_full_lines(pblk); + + run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); + if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD)) + return; + +next_gc_group: + group_list = l_mg->gc_lists[gc_group++]; + + do { + spin_lock(&l_mg->gc_lock); + if (list_empty(group_list)) { + spin_unlock(&l_mg->gc_lock); + break; + } + + line = pblk_gc_get_victim_line(pblk, group_list); + + spin_lock(&line->lock); + WARN_ON(line->state != PBLK_LINESTATE_CLOSED); + line->state = PBLK_LINESTATE_GC; + spin_unlock(&line->lock); + + list_del(&line->list); + spin_unlock(&l_mg->gc_lock); + + spin_lock(&gc->r_lock); + list_add_tail(&line->list, &gc->r_list); + spin_unlock(&gc->r_lock); + + read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc); + pblk_gc_reader_kick(gc); + + prev_group = 1; + + /* No need to queue up more GC lines than we can handle */ + run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); + if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD) + break; + } while (1); + + if (!prev_group && pblk->rl.rb_state > gc_group && + gc_group < PBLK_GC_NR_LISTS) + goto next_gc_group; +} + +static void pblk_gc_timer(struct timer_list *t) +{ + struct pblk *pblk = from_timer(pblk, t, gc.gc_timer); + + pblk_gc_kick(pblk); +} + +static int pblk_gc_ts(void *data) +{ + struct pblk *pblk = data; + + while (!kthread_should_stop()) { + pblk_gc_run(pblk); + set_current_state(TASK_INTERRUPTIBLE); + io_schedule(); + } + + return 0; +} + +static int pblk_gc_writer_ts(void *data) +{ + struct pblk *pblk = data; + + while (!kthread_should_stop()) { + if (!pblk_gc_write(pblk)) + continue; + set_current_state(TASK_INTERRUPTIBLE); + io_schedule(); + } + + return 0; +} + +static int pblk_gc_reader_ts(void *data) +{ + struct pblk *pblk = data; + struct pblk_gc *gc = &pblk->gc; + + while (!kthread_should_stop()) { + if (!pblk_gc_read(pblk)) + continue; + set_current_state(TASK_INTERRUPTIBLE); + io_schedule(); + } + +#ifdef CONFIG_NVM_PBLK_DEBUG + pblk_info(pblk, "flushing gc pipeline, %d lines left\n", + atomic_read(&gc->pipeline_gc)); +#endif + + do { + if (!atomic_read(&gc->pipeline_gc)) + break; + + schedule(); + } while (1); + + return 0; +} + +static void pblk_gc_start(struct pblk *pblk) +{ + pblk->gc.gc_active = 1; + pblk_debug(pblk, "gc start\n"); +} + +void pblk_gc_should_start(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + if (gc->gc_enabled && !gc->gc_active) { + pblk_gc_start(pblk); + pblk_gc_kick(pblk); + } +} + +void pblk_gc_should_stop(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + if (gc->gc_active && !gc->gc_forced) + gc->gc_active = 0; +} + +void pblk_gc_should_kick(struct pblk *pblk) +{ + pblk_rl_update_rates(&pblk->rl); +} + +void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, + int *gc_active) +{ + struct pblk_gc *gc = &pblk->gc; + + spin_lock(&gc->lock); + *gc_enabled = gc->gc_enabled; + *gc_active = gc->gc_active; + spin_unlock(&gc->lock); +} + +int pblk_gc_sysfs_force(struct pblk *pblk, int force) +{ + struct pblk_gc *gc = &pblk->gc; + + if (force < 0 || force > 1) + return -EINVAL; + + spin_lock(&gc->lock); + gc->gc_forced = force; + + if (force) + gc->gc_enabled = 1; + else + gc->gc_enabled = 0; + spin_unlock(&gc->lock); + + pblk_gc_should_start(pblk); + + return 0; +} + +int pblk_gc_init(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + int ret; + + gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts"); + if (IS_ERR(gc->gc_ts)) { + pblk_err(pblk, "could not allocate GC main kthread\n"); + return PTR_ERR(gc->gc_ts); + } + + gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk, + "pblk-gc-writer-ts"); + if (IS_ERR(gc->gc_writer_ts)) { + pblk_err(pblk, "could not allocate GC writer kthread\n"); + ret = PTR_ERR(gc->gc_writer_ts); + goto fail_free_main_kthread; + } + + gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk, + "pblk-gc-reader-ts"); + if (IS_ERR(gc->gc_reader_ts)) { + pblk_err(pblk, "could not allocate GC reader kthread\n"); + ret = PTR_ERR(gc->gc_reader_ts); + goto fail_free_writer_kthread; + } + + timer_setup(&gc->gc_timer, pblk_gc_timer, 0); + mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + + gc->gc_active = 0; + gc->gc_forced = 0; + gc->gc_enabled = 1; + gc->w_entries = 0; + atomic_set(&gc->read_inflight_gc, 0); + atomic_set(&gc->pipeline_gc, 0); + + /* Workqueue that reads valid sectors from a line and submit them to the + * GC writer to be recycled. + */ + gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS); + if (!gc->gc_line_reader_wq) { + pblk_err(pblk, "could not allocate GC line reader workqueue\n"); + ret = -ENOMEM; + goto fail_free_reader_kthread; + } + + /* Workqueue that prepare lines for GC */ + gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!gc->gc_reader_wq) { + pblk_err(pblk, "could not allocate GC reader workqueue\n"); + ret = -ENOMEM; + goto fail_free_reader_line_wq; + } + + spin_lock_init(&gc->lock); + spin_lock_init(&gc->w_lock); + spin_lock_init(&gc->r_lock); + + sema_init(&gc->gc_sem, PBLK_GC_RQ_QD); + + INIT_LIST_HEAD(&gc->w_list); + INIT_LIST_HEAD(&gc->r_list); + + return 0; + +fail_free_reader_line_wq: + destroy_workqueue(gc->gc_line_reader_wq); +fail_free_reader_kthread: + kthread_stop(gc->gc_reader_ts); +fail_free_writer_kthread: + kthread_stop(gc->gc_writer_ts); +fail_free_main_kthread: + kthread_stop(gc->gc_ts); + + return ret; +} + +void pblk_gc_exit(struct pblk *pblk, bool graceful) +{ + struct pblk_gc *gc = &pblk->gc; + + gc->gc_enabled = 0; + del_timer_sync(&gc->gc_timer); + gc->gc_active = 0; + + if (gc->gc_ts) + kthread_stop(gc->gc_ts); + + if (gc->gc_reader_ts) + kthread_stop(gc->gc_reader_ts); + + if (graceful) { + flush_workqueue(gc->gc_reader_wq); + flush_workqueue(gc->gc_line_reader_wq); + } + + destroy_workqueue(gc->gc_reader_wq); + destroy_workqueue(gc->gc_line_reader_wq); + + if (gc->gc_writer_ts) + kthread_stop(gc->gc_writer_ts); +} diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c new file mode 100644 index 000000000..88b632787 --- /dev/null +++ b/drivers/lightnvm/pblk-init.c @@ -0,0 +1,1366 @@ +/* + * Copyright (C) 2015 IT University of Copenhagen (rrpc.c) + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Implementation of a physical block-device target for Open-channel SSDs. + * + * pblk-init.c - pblk's initialization. + */ + +#include "pblk.h" + +static unsigned int write_buffer_size; + +module_param(write_buffer_size, uint, 0644); +MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer"); + +static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, + *pblk_w_rq_cache; +static DECLARE_RWSEM(pblk_lock); +struct bio_set pblk_bio_set; + +static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, + struct bio *bio) +{ + int ret; + + /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap + * constraint. Writes can be of arbitrary size. + */ + if (bio_data_dir(bio) == READ) { + blk_queue_split(q, &bio); + ret = pblk_submit_read(pblk, bio); + if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED)) + bio_put(bio); + + return ret; + } + + /* Prevent deadlock in the case of a modest LUN configuration and large + * user I/Os. Unless stalled, the rate limiter leaves at least 256KB + * available for user I/O. + */ + if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl)) + blk_queue_split(q, &bio); + + return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); +} + +static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio) +{ + struct pblk *pblk = q->queuedata; + + if (bio_op(bio) == REQ_OP_DISCARD) { + pblk_discard(pblk, bio); + if (!(bio->bi_opf & REQ_PREFLUSH)) { + bio_endio(bio); + return BLK_QC_T_NONE; + } + } + + switch (pblk_rw_io(q, pblk, bio)) { + case NVM_IO_ERR: + bio_io_error(bio); + break; + case NVM_IO_DONE: + bio_endio(bio); + break; + } + + return BLK_QC_T_NONE; +} + +static size_t pblk_trans_map_size(struct pblk *pblk) +{ + int entry_size = 8; + + if (pblk->addrf_len < 32) + entry_size = 4; + + return entry_size * pblk->rl.nr_secs; +} + +#ifdef CONFIG_NVM_PBLK_DEBUG +static u32 pblk_l2p_crc(struct pblk *pblk) +{ + size_t map_size; + u32 crc = ~(u32)0; + + map_size = pblk_trans_map_size(pblk); + crc = crc32_le(crc, pblk->trans_map, map_size); + return crc; +} +#endif + +static void pblk_l2p_free(struct pblk *pblk) +{ + vfree(pblk->trans_map); +} + +static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) +{ + struct pblk_line *line = NULL; + + if (factory_init) { + pblk_setup_uuid(pblk); + } else { + line = pblk_recov_l2p(pblk); + if (IS_ERR(line)) { + pblk_err(pblk, "could not recover l2p table\n"); + return -EFAULT; + } + } + +#ifdef CONFIG_NVM_PBLK_DEBUG + pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + + /* Free full lines directly as GC has not been started yet */ + pblk_gc_free_full_lines(pblk); + + if (!line) { + /* Configure next line for user data */ + line = pblk_line_get_first_data(pblk); + if (!line) + return -EFAULT; + } + + return 0; +} + +static int pblk_l2p_init(struct pblk *pblk, bool factory_init) +{ + sector_t i; + struct ppa_addr ppa; + size_t map_size; + int ret = 0; + + map_size = pblk_trans_map_size(pblk); + pblk->trans_map = vmalloc(map_size); + if (!pblk->trans_map) + return -ENOMEM; + + pblk_ppa_set_empty(&ppa); + + for (i = 0; i < pblk->rl.nr_secs; i++) + pblk_trans_map_set(pblk, i, ppa); + + ret = pblk_l2p_recover(pblk, factory_init); + if (ret) + vfree(pblk->trans_map); + + return ret; +} + +static void pblk_rwb_free(struct pblk *pblk) +{ + if (pblk_rb_tear_down_check(&pblk->rwb)) + pblk_err(pblk, "write buffer error on tear down\n"); + + pblk_rb_data_free(&pblk->rwb); + vfree(pblk_rb_entries_ref(&pblk->rwb)); +} + +static int pblk_rwb_init(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_rb_entry *entries; + unsigned long nr_entries, buffer_size; + unsigned int power_size, power_seg_sz; + int pgs_in_buffer; + + pgs_in_buffer = (max(geo->mw_cunits, geo->ws_opt) + geo->ws_opt) + * geo->all_luns; + + if (write_buffer_size && (write_buffer_size > pgs_in_buffer)) + buffer_size = write_buffer_size; + else + buffer_size = pgs_in_buffer; + + nr_entries = pblk_rb_calculate_size(buffer_size); + + entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry))); + if (!entries) + return -ENOMEM; + + power_size = get_count_order(nr_entries); + power_seg_sz = get_count_order(geo->csecs); + + return pblk_rb_init(&pblk->rwb, entries, power_size, power_seg_sz); +} + +/* Minimum pages needed within a lun */ +#define ADDR_POOL_SIZE 64 + +static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo, + struct nvm_addrf_12 *dst) +{ + struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf; + int power_len; + + /* Re-calculate channel and lun format to adapt to configuration */ + power_len = get_count_order(geo->num_ch); + if (1 << power_len != geo->num_ch) { + pblk_err(pblk, "supports only power-of-two channel config.\n"); + return -EINVAL; + } + dst->ch_len = power_len; + + power_len = get_count_order(geo->num_lun); + if (1 << power_len != geo->num_lun) { + pblk_err(pblk, "supports only power-of-two LUN config.\n"); + return -EINVAL; + } + dst->lun_len = power_len; + + dst->blk_len = src->blk_len; + dst->pg_len = src->pg_len; + dst->pln_len = src->pln_len; + dst->sec_len = src->sec_len; + + dst->sec_offset = 0; + dst->pln_offset = dst->sec_len; + dst->ch_offset = dst->pln_offset + dst->pln_len; + dst->lun_offset = dst->ch_offset + dst->ch_len; + dst->pg_offset = dst->lun_offset + dst->lun_len; + dst->blk_offset = dst->pg_offset + dst->pg_len; + + dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; + dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset; + dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; + dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; + dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset; + dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset; + + return dst->blk_offset + src->blk_len; +} + +static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst, + struct pblk_addrf *udst) +{ + struct nvm_addrf *src = &geo->addrf; + + adst->ch_len = get_count_order(geo->num_ch); + adst->lun_len = get_count_order(geo->num_lun); + adst->chk_len = src->chk_len; + adst->sec_len = src->sec_len; + + adst->sec_offset = 0; + adst->ch_offset = adst->sec_len; + adst->lun_offset = adst->ch_offset + adst->ch_len; + adst->chk_offset = adst->lun_offset + adst->lun_len; + + adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset; + adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset; + adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset; + adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset; + + udst->sec_stripe = geo->ws_opt; + udst->ch_stripe = geo->num_ch; + udst->lun_stripe = geo->num_lun; + + udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe; + udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe; + + return adst->chk_offset + adst->chk_len; +} + +static int pblk_set_addrf(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int mod; + + switch (geo->version) { + case NVM_OCSSD_SPEC_12: + div_u64_rem(geo->clba, pblk->min_write_pgs, &mod); + if (mod) { + pblk_err(pblk, "bad configuration of sectors/pages\n"); + return -EINVAL; + } + + pblk->addrf_len = pblk_set_addrf_12(pblk, geo, + (void *)&pblk->addrf); + break; + case NVM_OCSSD_SPEC_20: + pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf, + &pblk->uaddrf); + break; + default: + pblk_err(pblk, "OCSSD revision not supported (%d)\n", + geo->version); + return -EINVAL; + } + + return 0; +} + +static int pblk_init_global_caches(struct pblk *pblk) +{ + down_write(&pblk_lock); + pblk_ws_cache = kmem_cache_create("pblk_blk_ws", + sizeof(struct pblk_line_ws), 0, 0, NULL); + if (!pblk_ws_cache) { + up_write(&pblk_lock); + return -ENOMEM; + } + + pblk_rec_cache = kmem_cache_create("pblk_rec", + sizeof(struct pblk_rec_ctx), 0, 0, NULL); + if (!pblk_rec_cache) { + kmem_cache_destroy(pblk_ws_cache); + up_write(&pblk_lock); + return -ENOMEM; + } + + pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, + 0, 0, NULL); + if (!pblk_g_rq_cache) { + kmem_cache_destroy(pblk_ws_cache); + kmem_cache_destroy(pblk_rec_cache); + up_write(&pblk_lock); + return -ENOMEM; + } + + pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, + 0, 0, NULL); + if (!pblk_w_rq_cache) { + kmem_cache_destroy(pblk_ws_cache); + kmem_cache_destroy(pblk_rec_cache); + kmem_cache_destroy(pblk_g_rq_cache); + up_write(&pblk_lock); + return -ENOMEM; + } + up_write(&pblk_lock); + + return 0; +} + +static void pblk_free_global_caches(struct pblk *pblk) +{ + kmem_cache_destroy(pblk_ws_cache); + kmem_cache_destroy(pblk_rec_cache); + kmem_cache_destroy(pblk_g_rq_cache); + kmem_cache_destroy(pblk_w_rq_cache); +} + +static int pblk_core_init(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int ret, max_write_ppas; + + atomic64_set(&pblk->user_wa, 0); + atomic64_set(&pblk->pad_wa, 0); + atomic64_set(&pblk->gc_wa, 0); + pblk->user_rst_wa = 0; + pblk->pad_rst_wa = 0; + pblk->gc_rst_wa = 0; + + atomic64_set(&pblk->nr_flush, 0); + pblk->nr_flush_rst = 0; + + pblk->min_write_pgs = geo->ws_opt; + max_write_ppas = pblk->min_write_pgs * geo->all_luns; + pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); + pblk->max_write_pgs = min_t(int, pblk->max_write_pgs, + queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT)); + pblk_set_sec_per_write(pblk, pblk->min_write_pgs); + + if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { + pblk_err(pblk, "vector list too big(%u > %u)\n", + pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS); + return -EINVAL; + } + + pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t), + GFP_KERNEL); + if (!pblk->pad_dist) + return -ENOMEM; + + if (pblk_init_global_caches(pblk)) + goto fail_free_pad_dist; + + /* Internal bios can be at most the sectors signaled by the device. */ + ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0); + if (ret) + goto free_global_caches; + + ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE, + pblk_ws_cache); + if (ret) + goto free_page_bio_pool; + + ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns, + pblk_rec_cache); + if (ret) + goto free_gen_ws_pool; + + ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns, + pblk_g_rq_cache); + if (ret) + goto free_rec_pool; + + ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns, + pblk_g_rq_cache); + if (ret) + goto free_r_rq_pool; + + ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns, + pblk_w_rq_cache); + if (ret) + goto free_e_rq_pool; + + pblk->close_wq = alloc_workqueue("pblk-close-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); + if (!pblk->close_wq) + goto free_w_rq_pool; + + pblk->bb_wq = alloc_workqueue("pblk-bb-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!pblk->bb_wq) + goto free_close_wq; + + pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!pblk->r_end_wq) + goto free_bb_wq; + + if (pblk_set_addrf(pblk)) + goto free_r_end_wq; + + INIT_LIST_HEAD(&pblk->compl_list); + INIT_LIST_HEAD(&pblk->resubmit_list); + + return 0; + +free_r_end_wq: + destroy_workqueue(pblk->r_end_wq); +free_bb_wq: + destroy_workqueue(pblk->bb_wq); +free_close_wq: + destroy_workqueue(pblk->close_wq); +free_w_rq_pool: + mempool_exit(&pblk->w_rq_pool); +free_e_rq_pool: + mempool_exit(&pblk->e_rq_pool); +free_r_rq_pool: + mempool_exit(&pblk->r_rq_pool); +free_rec_pool: + mempool_exit(&pblk->rec_pool); +free_gen_ws_pool: + mempool_exit(&pblk->gen_ws_pool); +free_page_bio_pool: + mempool_exit(&pblk->page_bio_pool); +free_global_caches: + pblk_free_global_caches(pblk); +fail_free_pad_dist: + kfree(pblk->pad_dist); + return -ENOMEM; +} + +static void pblk_core_free(struct pblk *pblk) +{ + if (pblk->close_wq) + destroy_workqueue(pblk->close_wq); + + if (pblk->r_end_wq) + destroy_workqueue(pblk->r_end_wq); + + if (pblk->bb_wq) + destroy_workqueue(pblk->bb_wq); + + mempool_exit(&pblk->page_bio_pool); + mempool_exit(&pblk->gen_ws_pool); + mempool_exit(&pblk->rec_pool); + mempool_exit(&pblk->r_rq_pool); + mempool_exit(&pblk->e_rq_pool); + mempool_exit(&pblk->w_rq_pool); + + pblk_free_global_caches(pblk); + kfree(pblk->pad_dist); +} + +static void pblk_line_mg_free(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + int i; + + kfree(l_mg->bb_template); + kfree(l_mg->bb_aux); + kfree(l_mg->vsc_list); + + for (i = 0; i < PBLK_DATA_LINES; i++) { + kfree(l_mg->sline_meta[i]); + pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type); + kfree(l_mg->eline_meta[i]); + } +} + +static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg, + struct pblk_line *line) +{ + struct pblk_w_err_gc *w_err_gc = line->w_err_gc; + + kfree(line->blk_bitmap); + kfree(line->erase_bitmap); + kfree(line->chks); + + pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type); + kfree(w_err_gc); +} + +static void pblk_lines_free(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *line; + int i; + + spin_lock(&l_mg->free_lock); + for (i = 0; i < l_mg->nr_lines; i++) { + line = &pblk->lines[i]; + + pblk_line_free(line); + pblk_line_meta_free(l_mg, line); + } + spin_unlock(&l_mg->free_lock); + + pblk_line_mg_free(pblk); + + kfree(pblk->luns); + kfree(pblk->lines); +} + +static int pblk_bb_get_tbl(struct nvm_tgt_dev *dev, struct pblk_lun *rlun, + u8 *blks, int nr_blks) +{ + struct ppa_addr ppa; + int ret; + + ppa.ppa = 0; + ppa.g.ch = rlun->bppa.g.ch; + ppa.g.lun = rlun->bppa.g.lun; + + ret = nvm_get_tgt_bb_tbl(dev, ppa, blks); + if (ret) + return ret; + + nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks); + if (nr_blks < 0) + return -EIO; + + return 0; +} + +static void *pblk_bb_get_meta(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + u8 *meta; + int i, nr_blks, blk_per_lun; + int ret; + + blk_per_lun = geo->num_chk * geo->pln_mode; + nr_blks = blk_per_lun * geo->all_luns; + + meta = kmalloc(nr_blks, GFP_KERNEL); + if (!meta) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < geo->all_luns; i++) { + struct pblk_lun *rlun = &pblk->luns[i]; + u8 *meta_pos = meta + i * blk_per_lun; + + ret = pblk_bb_get_tbl(dev, rlun, meta_pos, blk_per_lun); + if (ret) { + kfree(meta); + return ERR_PTR(-EIO); + } + } + + return meta; +} + +static void *pblk_chunk_get_meta(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + if (geo->version == NVM_OCSSD_SPEC_12) + return pblk_bb_get_meta(pblk); + else + return pblk_chunk_get_info(pblk); +} + +static int pblk_luns_init(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_lun *rlun; + int i; + + /* TODO: Implement unbalanced LUN support */ + if (geo->num_lun < 0) { + pblk_err(pblk, "unbalanced LUN config.\n"); + return -EINVAL; + } + + pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun), + GFP_KERNEL); + if (!pblk->luns) + return -ENOMEM; + + for (i = 0; i < geo->all_luns; i++) { + /* Stripe across channels */ + int ch = i % geo->num_ch; + int lun_raw = i / geo->num_ch; + int lunid = lun_raw + ch * geo->num_lun; + + rlun = &pblk->luns[i]; + rlun->bppa = dev->luns[lunid]; + + sema_init(&rlun->wr_sem, 1); + } + + return 0; +} + +/* See comment over struct line_emeta definition */ +static unsigned int calc_emeta_len(struct pblk *pblk) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + /* Round to sector size so that lba_list starts on its own sector */ + lm->emeta_sec[1] = DIV_ROUND_UP( + sizeof(struct line_emeta) + lm->blk_bitmap_len + + sizeof(struct wa_counters), geo->csecs); + lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs; + + /* Round to sector size so that vsc_list starts on its own sector */ + lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0]; + lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64), + geo->csecs); + lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs; + + lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32), + geo->csecs); + lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs; + + lm->vsc_list_len = l_mg->nr_lines * sizeof(u32); + + return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]); +} + +static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct nvm_geo *geo = &dev->geo; + sector_t provisioned; + int sec_meta, blk_meta; + + if (geo->op == NVM_TARGET_DEFAULT_OP) + pblk->op = PBLK_DEFAULT_OP; + else + pblk->op = geo->op; + + provisioned = nr_free_blks; + provisioned *= (100 - pblk->op); + sector_div(provisioned, 100); + + pblk->op_blks = nr_free_blks - provisioned; + + /* Internally pblk manages all free blocks, but all calculations based + * on user capacity consider only provisioned blocks + */ + pblk->rl.total_blocks = nr_free_blks; + pblk->rl.nr_secs = nr_free_blks * geo->clba; + + /* Consider sectors used for metadata */ + sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; + blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); + + pblk->capacity = (provisioned - blk_meta) * geo->clba; + + atomic_set(&pblk->rl.free_blocks, nr_free_blks); + atomic_set(&pblk->rl.free_user_blocks, nr_free_blks); +} + +static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line, + void *chunk_meta) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + int i, chk_per_lun, nr_bad_chks = 0; + + chk_per_lun = geo->num_chk * geo->pln_mode; + + for (i = 0; i < lm->blk_per_line; i++) { + struct pblk_lun *rlun = &pblk->luns[i]; + struct nvm_chk_meta *chunk; + int pos = pblk_ppa_to_pos(geo, rlun->bppa); + u8 *lun_bb_meta = chunk_meta + pos * chk_per_lun; + + chunk = &line->chks[pos]; + + /* + * In 1.2 spec. chunk state is not persisted by the device. Thus + * some of the values are reset each time pblk is instantiated, + * so we have to assume that the block is closed. + */ + if (lun_bb_meta[line->id] == NVM_BLK_T_FREE) + chunk->state = NVM_CHK_ST_CLOSED; + else + chunk->state = NVM_CHK_ST_OFFLINE; + + chunk->type = NVM_CHK_TP_W_SEQ; + chunk->wi = 0; + chunk->slba = -1; + chunk->cnlb = geo->clba; + chunk->wp = 0; + + if (!(chunk->state & NVM_CHK_ST_OFFLINE)) + continue; + + set_bit(pos, line->blk_bitmap); + nr_bad_chks++; + } + + return nr_bad_chks; +} + +static int pblk_setup_line_meta_20(struct pblk *pblk, struct pblk_line *line, + struct nvm_chk_meta *meta) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + int i, nr_bad_chks = 0; + + for (i = 0; i < lm->blk_per_line; i++) { + struct pblk_lun *rlun = &pblk->luns[i]; + struct nvm_chk_meta *chunk; + struct nvm_chk_meta *chunk_meta; + struct ppa_addr ppa; + int pos; + + ppa = rlun->bppa; + pos = pblk_ppa_to_pos(geo, ppa); + chunk = &line->chks[pos]; + + ppa.m.chk = line->id; + chunk_meta = pblk_chunk_get_off(pblk, meta, ppa); + + chunk->state = chunk_meta->state; + chunk->type = chunk_meta->type; + chunk->wi = chunk_meta->wi; + chunk->slba = chunk_meta->slba; + chunk->cnlb = chunk_meta->cnlb; + chunk->wp = chunk_meta->wp; + + if (chunk->type & NVM_CHK_TP_SZ_SPEC) { + WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n"); + continue; + } + + if (!(chunk->state & NVM_CHK_ST_OFFLINE)) + continue; + + set_bit(pos, line->blk_bitmap); + nr_bad_chks++; + } + + return nr_bad_chks; +} + +static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line, + void *chunk_meta, int line_id) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + long nr_bad_chks, chk_in_line; + + line->pblk = pblk; + line->id = line_id; + line->type = PBLK_LINETYPE_FREE; + line->state = PBLK_LINESTATE_NEW; + line->gc_group = PBLK_LINEGC_NONE; + line->vsc = &l_mg->vsc_list[line_id]; + spin_lock_init(&line->lock); + + if (geo->version == NVM_OCSSD_SPEC_12) + nr_bad_chks = pblk_setup_line_meta_12(pblk, line, chunk_meta); + else + nr_bad_chks = pblk_setup_line_meta_20(pblk, line, chunk_meta); + + chk_in_line = lm->blk_per_line - nr_bad_chks; + if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line || + chk_in_line < lm->min_blk_line) { + line->state = PBLK_LINESTATE_BAD; + list_add_tail(&line->list, &l_mg->bad_list); + return 0; + } + + atomic_set(&line->blk_in_line, chk_in_line); + list_add_tail(&line->list, &l_mg->free_list); + l_mg->nr_free_lines++; + + return chk_in_line; +} + +static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line) +{ + struct pblk_line_meta *lm = &pblk->lm; + + line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); + if (!line->blk_bitmap) + return -ENOMEM; + + line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); + if (!line->erase_bitmap) + goto free_blk_bitmap; + + + line->chks = kmalloc_array(lm->blk_per_line, + sizeof(struct nvm_chk_meta), GFP_KERNEL); + if (!line->chks) + goto free_erase_bitmap; + + line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL); + if (!line->w_err_gc) + goto free_chks; + + return 0; + +free_chks: + kfree(line->chks); +free_erase_bitmap: + kfree(line->erase_bitmap); +free_blk_bitmap: + kfree(line->blk_bitmap); + return -ENOMEM; +} + +static int pblk_line_mg_init(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + int i, bb_distance; + + l_mg->nr_lines = geo->num_chk; + l_mg->log_line = l_mg->data_line = NULL; + l_mg->l_seq_nr = l_mg->d_seq_nr = 0; + l_mg->nr_free_lines = 0; + bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); + + INIT_LIST_HEAD(&l_mg->free_list); + INIT_LIST_HEAD(&l_mg->corrupt_list); + INIT_LIST_HEAD(&l_mg->bad_list); + INIT_LIST_HEAD(&l_mg->gc_full_list); + INIT_LIST_HEAD(&l_mg->gc_high_list); + INIT_LIST_HEAD(&l_mg->gc_mid_list); + INIT_LIST_HEAD(&l_mg->gc_low_list); + INIT_LIST_HEAD(&l_mg->gc_empty_list); + INIT_LIST_HEAD(&l_mg->gc_werr_list); + + INIT_LIST_HEAD(&l_mg->emeta_list); + + l_mg->gc_lists[0] = &l_mg->gc_werr_list; + l_mg->gc_lists[1] = &l_mg->gc_high_list; + l_mg->gc_lists[2] = &l_mg->gc_mid_list; + l_mg->gc_lists[3] = &l_mg->gc_low_list; + + spin_lock_init(&l_mg->free_lock); + spin_lock_init(&l_mg->close_lock); + spin_lock_init(&l_mg->gc_lock); + + l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL); + if (!l_mg->vsc_list) + goto fail; + + l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!l_mg->bb_template) + goto fail_free_vsc_list; + + l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!l_mg->bb_aux) + goto fail_free_bb_template; + + /* smeta is always small enough to fit on a kmalloc memory allocation, + * emeta depends on the number of LUNs allocated to the pblk instance + */ + for (i = 0; i < PBLK_DATA_LINES; i++) { + l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL); + if (!l_mg->sline_meta[i]) + goto fail_free_smeta; + } + + /* emeta allocates three different buffers for managing metadata with + * in-memory and in-media layouts + */ + for (i = 0; i < PBLK_DATA_LINES; i++) { + struct pblk_emeta *emeta; + + emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL); + if (!emeta) + goto fail_free_emeta; + + if (lm->emeta_len[0] > KMALLOC_MAX_CACHE_SIZE) { + l_mg->emeta_alloc_type = PBLK_VMALLOC_META; + + emeta->buf = vmalloc(lm->emeta_len[0]); + if (!emeta->buf) { + kfree(emeta); + goto fail_free_emeta; + } + + emeta->nr_entries = lm->emeta_sec[0]; + l_mg->eline_meta[i] = emeta; + } else { + l_mg->emeta_alloc_type = PBLK_KMALLOC_META; + + emeta->buf = kmalloc(lm->emeta_len[0], GFP_KERNEL); + if (!emeta->buf) { + kfree(emeta); + goto fail_free_emeta; + } + + emeta->nr_entries = lm->emeta_sec[0]; + l_mg->eline_meta[i] = emeta; + } + } + + for (i = 0; i < l_mg->nr_lines; i++) + l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY); + + bb_distance = (geo->all_luns) * geo->ws_opt; + for (i = 0; i < lm->sec_per_line; i += bb_distance) + bitmap_set(l_mg->bb_template, i, geo->ws_opt); + + return 0; + +fail_free_emeta: + while (--i >= 0) { + if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META) + vfree(l_mg->eline_meta[i]->buf); + else + kfree(l_mg->eline_meta[i]->buf); + kfree(l_mg->eline_meta[i]); + } +fail_free_smeta: + for (i = 0; i < PBLK_DATA_LINES; i++) + kfree(l_mg->sline_meta[i]); + kfree(l_mg->bb_aux); +fail_free_bb_template: + kfree(l_mg->bb_template); +fail_free_vsc_list: + kfree(l_mg->vsc_list); +fail: + return -ENOMEM; +} + +static int pblk_line_meta_init(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + unsigned int smeta_len, emeta_len; + int i; + + lm->sec_per_line = geo->clba * geo->all_luns; + lm->blk_per_line = geo->all_luns; + lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); + lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); + lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); + lm->mid_thrs = lm->sec_per_line / 2; + lm->high_thrs = lm->sec_per_line / 4; + lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs; + + /* Calculate necessary pages for smeta. See comment over struct + * line_smeta definition + */ + i = 1; +add_smeta_page: + lm->smeta_sec = i * geo->ws_opt; + lm->smeta_len = lm->smeta_sec * geo->csecs; + + smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len; + if (smeta_len > lm->smeta_len) { + i++; + goto add_smeta_page; + } + + /* Calculate necessary pages for emeta. See comment over struct + * line_emeta definition + */ + i = 1; +add_emeta_page: + lm->emeta_sec[0] = i * geo->ws_opt; + lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs; + + emeta_len = calc_emeta_len(pblk); + if (emeta_len > lm->emeta_len[0]) { + i++; + goto add_emeta_page; + } + + lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0; + + lm->min_blk_line = 1; + if (geo->all_luns > 1) + lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + + lm->emeta_sec[0], geo->clba); + + if (lm->min_blk_line > lm->blk_per_line) { + pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n", + lm->blk_per_line); + return -EINVAL; + } + + return 0; +} + +static int pblk_lines_init(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *line; + void *chunk_meta; + long nr_free_chks = 0; + int i, ret; + + ret = pblk_line_meta_init(pblk); + if (ret) + return ret; + + ret = pblk_line_mg_init(pblk); + if (ret) + return ret; + + ret = pblk_luns_init(pblk); + if (ret) + goto fail_free_meta; + + chunk_meta = pblk_chunk_get_meta(pblk); + if (IS_ERR(chunk_meta)) { + ret = PTR_ERR(chunk_meta); + goto fail_free_luns; + } + + pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line), + GFP_KERNEL); + if (!pblk->lines) { + ret = -ENOMEM; + goto fail_free_chunk_meta; + } + + for (i = 0; i < l_mg->nr_lines; i++) { + line = &pblk->lines[i]; + + ret = pblk_alloc_line_meta(pblk, line); + if (ret) + goto fail_free_lines; + + nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i); + } + + if (!nr_free_chks) { + pblk_err(pblk, "too many bad blocks prevent for sane instance\n"); + ret = -EINTR; + goto fail_free_lines; + } + + pblk_set_provision(pblk, nr_free_chks); + + kfree(chunk_meta); + return 0; + +fail_free_lines: + while (--i >= 0) + pblk_line_meta_free(l_mg, &pblk->lines[i]); + kfree(pblk->lines); +fail_free_chunk_meta: + kfree(chunk_meta); +fail_free_luns: + kfree(pblk->luns); +fail_free_meta: + pblk_line_mg_free(pblk); + + return ret; +} + +static int pblk_writer_init(struct pblk *pblk) +{ + pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); + if (IS_ERR(pblk->writer_ts)) { + int err = PTR_ERR(pblk->writer_ts); + + if (err != -EINTR) + pblk_err(pblk, "could not allocate writer kthread (%d)\n", + err); + return err; + } + + timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0); + mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); + + return 0; +} + +static void pblk_writer_stop(struct pblk *pblk) +{ + /* The pipeline must be stopped and the write buffer emptied before the + * write thread is stopped + */ + WARN(pblk_rb_read_count(&pblk->rwb), + "Stopping not fully persisted write buffer\n"); + + WARN(pblk_rb_sync_count(&pblk->rwb), + "Stopping not fully synced write buffer\n"); + + del_timer_sync(&pblk->wtimer); + if (pblk->writer_ts) + kthread_stop(pblk->writer_ts); +} + +static void pblk_free(struct pblk *pblk) +{ + pblk_lines_free(pblk); + pblk_l2p_free(pblk); + pblk_rwb_free(pblk); + pblk_core_free(pblk); + + kfree(pblk); +} + +static void pblk_tear_down(struct pblk *pblk, bool graceful) +{ + if (graceful) + __pblk_pipeline_flush(pblk); + __pblk_pipeline_stop(pblk); + pblk_writer_stop(pblk); + pblk_rb_sync_l2p(&pblk->rwb); + pblk_rl_free(&pblk->rl); + + pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful); +} + +static void pblk_exit(void *private, bool graceful) +{ + struct pblk *pblk = private; + + down_write(&pblk_lock); + pblk_gc_exit(pblk, graceful); + pblk_tear_down(pblk, graceful); + +#ifdef CONFIG_NVM_PBLK_DEBUG + pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + + pblk_free(pblk); + up_write(&pblk_lock); +} + +static sector_t pblk_capacity(void *private) +{ + struct pblk *pblk = private; + + return pblk->capacity * NR_PHY_IN_LOG; +} + +static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, + int flags) +{ + struct nvm_geo *geo = &dev->geo; + struct request_queue *bqueue = dev->q; + struct request_queue *tqueue = tdisk->queue; + struct pblk *pblk; + int ret; + + pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL); + if (!pblk) + return ERR_PTR(-ENOMEM); + + pblk->dev = dev; + pblk->disk = tdisk; + pblk->state = PBLK_STATE_RUNNING; + pblk->gc.gc_enabled = 0; + + if (!(geo->version == NVM_OCSSD_SPEC_12 || + geo->version == NVM_OCSSD_SPEC_20)) { + pblk_err(pblk, "OCSSD version not supported (%u)\n", + geo->version); + kfree(pblk); + return ERR_PTR(-EINVAL); + } + + if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) { + pblk_err(pblk, "host-side L2P table not supported. (%x)\n", + geo->dom); + kfree(pblk); + return ERR_PTR(-EINVAL); + } + + spin_lock_init(&pblk->resubmit_lock); + spin_lock_init(&pblk->trans_lock); + spin_lock_init(&pblk->lock); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_set(&pblk->inflight_writes, 0); + atomic_long_set(&pblk->padded_writes, 0); + atomic_long_set(&pblk->padded_wb, 0); + atomic_long_set(&pblk->req_writes, 0); + atomic_long_set(&pblk->sub_writes, 0); + atomic_long_set(&pblk->sync_writes, 0); + atomic_long_set(&pblk->inflight_reads, 0); + atomic_long_set(&pblk->cache_reads, 0); + atomic_long_set(&pblk->sync_reads, 0); + atomic_long_set(&pblk->recov_writes, 0); + atomic_long_set(&pblk->recov_writes, 0); + atomic_long_set(&pblk->recov_gc_writes, 0); + atomic_long_set(&pblk->recov_gc_reads, 0); +#endif + + atomic_long_set(&pblk->read_failed, 0); + atomic_long_set(&pblk->read_empty, 0); + atomic_long_set(&pblk->read_high_ecc, 0); + atomic_long_set(&pblk->read_failed_gc, 0); + atomic_long_set(&pblk->write_failed, 0); + atomic_long_set(&pblk->erase_failed, 0); + + ret = pblk_core_init(pblk); + if (ret) { + pblk_err(pblk, "could not initialize core\n"); + goto fail; + } + + ret = pblk_lines_init(pblk); + if (ret) { + pblk_err(pblk, "could not initialize lines\n"); + goto fail_free_core; + } + + ret = pblk_rwb_init(pblk); + if (ret) { + pblk_err(pblk, "could not initialize write buffer\n"); + goto fail_free_lines; + } + + ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY); + if (ret) { + pblk_err(pblk, "could not initialize maps\n"); + goto fail_free_rwb; + } + + ret = pblk_writer_init(pblk); + if (ret) { + if (ret != -EINTR) + pblk_err(pblk, "could not initialize write thread\n"); + goto fail_free_l2p; + } + + ret = pblk_gc_init(pblk); + if (ret) { + pblk_err(pblk, "could not initialize gc\n"); + goto fail_stop_writer; + } + + /* inherit the size from the underlying device */ + blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue)); + blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue)); + + blk_queue_write_cache(tqueue, true, false); + + tqueue->limits.discard_granularity = geo->clba * geo->csecs; + tqueue->limits.discard_alignment = 0; + blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue); + + pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n", + geo->all_luns, pblk->l_mg.nr_lines, + (unsigned long long)pblk->rl.nr_secs, + pblk->rwb.nr_entries); + + wake_up_process(pblk->writer_ts); + + /* Check if we need to start GC */ + pblk_gc_should_kick(pblk); + + return pblk; + +fail_stop_writer: + pblk_writer_stop(pblk); +fail_free_l2p: + pblk_l2p_free(pblk); +fail_free_rwb: + pblk_rwb_free(pblk); +fail_free_lines: + pblk_lines_free(pblk); +fail_free_core: + pblk_core_free(pblk); +fail: + kfree(pblk); + return ERR_PTR(ret); +} + +/* physical block device target */ +static struct nvm_tgt_type tt_pblk = { + .name = "pblk", + .version = {1, 0, 0}, + + .make_rq = pblk_make_rq, + .capacity = pblk_capacity, + + .init = pblk_init, + .exit = pblk_exit, + + .sysfs_init = pblk_sysfs_init, + .sysfs_exit = pblk_sysfs_exit, + .owner = THIS_MODULE, +}; + +static int __init pblk_module_init(void) +{ + int ret; + + ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0); + if (ret) + return ret; + ret = nvm_register_tgt_type(&tt_pblk); + if (ret) + bioset_exit(&pblk_bio_set); + return ret; +} + +static void pblk_module_exit(void) +{ + bioset_exit(&pblk_bio_set); + nvm_unregister_tgt_type(&tt_pblk); +} + +module_init(pblk_module_init); +module_exit(pblk_module_exit); +MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>"); +MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>"); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs"); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c new file mode 100644 index 000000000..953ca31dd --- /dev/null +++ b/drivers/lightnvm/pblk-map.c @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-map.c - pblk's lba-ppa mapping strategy + * + */ + +#include "pblk.h" + +static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, + struct ppa_addr *ppa_list, + unsigned long *lun_bitmap, + struct pblk_sec_meta *meta_list, + unsigned int valid_secs) +{ + struct pblk_line *line = pblk_line_get_data(pblk); + struct pblk_emeta *emeta; + struct pblk_w_ctx *w_ctx; + __le64 *lba_list; + u64 paddr; + int nr_secs = pblk->min_write_pgs; + int i; + + if (pblk_line_is_full(line)) { + struct pblk_line *prev_line = line; + + /* If we cannot allocate a new line, make sure to store metadata + * on current line and then fail + */ + line = pblk_line_replace_data(pblk); + pblk_line_close_meta(pblk, prev_line); + + if (!line) + return -EINTR; + } + + emeta = line->emeta; + lba_list = emeta_to_lbas(pblk, emeta->buf); + + paddr = pblk_alloc_page(pblk, line, nr_secs); + + for (i = 0; i < nr_secs; i++, paddr++) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + + /* ppa to be sent to the device */ + ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); + + /* Write context for target bio completion on write buffer. Note + * that the write buffer is protected by the sync backpointer, + * and a single writer thread have access to each specific entry + * at a time. Thus, it is safe to modify the context for the + * entry we are setting up for submission without taking any + * lock or memory barrier. + */ + if (i < valid_secs) { + kref_get(&line->ref); + w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); + w_ctx->ppa = ppa_list[i]; + meta_list[i].lba = cpu_to_le64(w_ctx->lba); + lba_list[paddr] = cpu_to_le64(w_ctx->lba); + if (lba_list[paddr] != addr_empty) + line->nr_valid_lbas++; + else + atomic64_inc(&pblk->pad_wa); + } else { + lba_list[paddr] = meta_list[i].lba = addr_empty; + __pblk_map_invalidate(pblk, line, paddr); + } + } + + pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); + return 0; +} + +void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, + unsigned long *lun_bitmap, unsigned int valid_secs, + unsigned int off) +{ + struct pblk_sec_meta *meta_list = rqd->meta_list; + unsigned int map_secs; + int min = pblk->min_write_pgs; + int i; + + for (i = off; i < rqd->nr_ppas; i += min) { + map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; + if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i], + lun_bitmap, &meta_list[i], map_secs)) { + bio_put(rqd->bio); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); + pblk_pipeline_stop(pblk); + } + } +} + +/* only if erase_ppa is set, acquire erase semaphore */ +void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int sentry, unsigned long *lun_bitmap, + unsigned int valid_secs, struct ppa_addr *erase_ppa) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_sec_meta *meta_list = rqd->meta_list; + struct pblk_line *e_line, *d_line; + unsigned int map_secs; + int min = pblk->min_write_pgs; + int i, erase_lun; + + for (i = 0; i < rqd->nr_ppas; i += min) { + map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; + if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i], + lun_bitmap, &meta_list[i], map_secs)) { + bio_put(rqd->bio); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); + pblk_pipeline_stop(pblk); + } + + erase_lun = pblk_ppa_to_pos(geo, rqd->ppa_list[i]); + + /* line can change after page map. We might also be writing the + * last line. + */ + e_line = pblk_line_get_erase(pblk); + if (!e_line) + return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, + valid_secs, i + min); + + spin_lock(&e_line->lock); + if (!test_bit(erase_lun, e_line->erase_bitmap)) { + set_bit(erase_lun, e_line->erase_bitmap); + atomic_dec(&e_line->left_eblks); + + *erase_ppa = rqd->ppa_list[i]; + erase_ppa->a.blk = e_line->id; + + spin_unlock(&e_line->lock); + + /* Avoid evaluating e_line->left_eblks */ + return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, + valid_secs, i + min); + } + spin_unlock(&e_line->lock); + } + + d_line = pblk_line_get_data(pblk); + + /* line can change after page map. We might also be writing the + * last line. + */ + e_line = pblk_line_get_erase(pblk); + if (!e_line) + return; + + /* Erase blocks that are bad in this line but might not be in next */ + if (unlikely(pblk_ppa_empty(*erase_ppa)) && + bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { + int bit = -1; + +retry: + bit = find_next_bit(d_line->blk_bitmap, + lm->blk_per_line, bit + 1); + if (bit >= lm->blk_per_line) + return; + + spin_lock(&e_line->lock); + if (test_bit(bit, e_line->erase_bitmap)) { + spin_unlock(&e_line->lock); + goto retry; + } + spin_unlock(&e_line->lock); + + set_bit(bit, e_line->erase_bitmap); + atomic_dec(&e_line->left_eblks); + *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */ + erase_ppa->a.blk = e_line->id; + } +} diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c new file mode 100644 index 000000000..d22c13b55 --- /dev/null +++ b/drivers/lightnvm/pblk-rb.c @@ -0,0 +1,852 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * + * Based upon the circular ringbuffer. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-rb.c - pblk's write buffer + */ + +#include <linux/circ_buf.h> + +#include "pblk.h" + +static DECLARE_RWSEM(pblk_rb_lock); + +void pblk_rb_data_free(struct pblk_rb *rb) +{ + struct pblk_rb_pages *p, *t; + + down_write(&pblk_rb_lock); + list_for_each_entry_safe(p, t, &rb->pages, list) { + free_pages((unsigned long)page_address(p->pages), p->order); + list_del(&p->list); + kfree(p); + } + up_write(&pblk_rb_lock); +} + +/* + * Initialize ring buffer. The data and metadata buffers must be previously + * allocated and their size must be a power of two + * (Documentation/core-api/circular-buffers.rst) + */ +int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base, + unsigned int power_size, unsigned int power_seg_sz) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + unsigned int init_entry = 0; + unsigned int alloc_order = power_size; + unsigned int max_order = MAX_ORDER - 1; + unsigned int order, iter; + + down_write(&pblk_rb_lock); + rb->entries = rb_entry_base; + rb->seg_size = (1 << power_seg_sz); + rb->nr_entries = (1 << power_size); + rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; + rb->flush_point = EMPTY_ENTRY; + + spin_lock_init(&rb->w_lock); + spin_lock_init(&rb->s_lock); + + INIT_LIST_HEAD(&rb->pages); + + if (alloc_order >= max_order) { + order = max_order; + iter = (1 << (alloc_order - max_order)); + } else { + order = alloc_order; + iter = 1; + } + + do { + struct pblk_rb_entry *entry; + struct pblk_rb_pages *page_set; + void *kaddr; + unsigned long set_size; + int i; + + page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL); + if (!page_set) { + up_write(&pblk_rb_lock); + return -ENOMEM; + } + + page_set->order = order; + page_set->pages = alloc_pages(GFP_KERNEL, order); + if (!page_set->pages) { + kfree(page_set); + pblk_rb_data_free(rb); + up_write(&pblk_rb_lock); + return -ENOMEM; + } + kaddr = page_address(page_set->pages); + + entry = &rb->entries[init_entry]; + entry->data = kaddr; + entry->cacheline = pblk_cacheline_to_addr(init_entry++); + entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; + + set_size = (1 << order); + for (i = 1; i < set_size; i++) { + entry = &rb->entries[init_entry]; + entry->cacheline = pblk_cacheline_to_addr(init_entry++); + entry->data = kaddr + (i * rb->seg_size); + entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; + bio_list_init(&entry->w_ctx.bios); + } + + list_add_tail(&page_set->list, &rb->pages); + iter--; + } while (iter > 0); + up_write(&pblk_rb_lock); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_set(&rb->inflight_flush_point, 0); +#endif + + /* + * Initialize rate-limiter, which controls access to the write buffer + * but user and GC I/O + */ + pblk_rl_init(&pblk->rl, rb->nr_entries); + + return 0; +} + +/* + * pblk_rb_calculate_size -- calculate the size of the write buffer + */ +unsigned int pblk_rb_calculate_size(unsigned int nr_entries) +{ + /* Alloc a write buffer that can at least fit 128 entries */ + return (1 << max(get_count_order(nr_entries), 7)); +} + +void *pblk_rb_entries_ref(struct pblk_rb *rb) +{ + return rb->entries; +} + +static void clean_wctx(struct pblk_w_ctx *w_ctx) +{ + int flags; + + flags = READ_ONCE(w_ctx->flags); + WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY), + "pblk: overwriting unsubmitted data\n"); + + /* Release flags on context. Protect from writes and reads */ + smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY); + pblk_ppa_set_empty(&w_ctx->ppa); + w_ctx->lba = ADDR_EMPTY; +} + +#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size) +#define pblk_rb_ring_space(rb, head, tail, size) \ + (CIRC_SPACE(head, tail, size)) + +/* + * Buffer space is calculated with respect to the back pointer signaling + * synchronized entries to the media. + */ +static unsigned int pblk_rb_space(struct pblk_rb *rb) +{ + unsigned int mem = READ_ONCE(rb->mem); + unsigned int sync = READ_ONCE(rb->sync); + + return pblk_rb_ring_space(rb, mem, sync, rb->nr_entries); +} + +/* + * Buffer count is calculated with respect to the submission entry signaling the + * entries that are available to send to the media + */ +unsigned int pblk_rb_read_count(struct pblk_rb *rb) +{ + unsigned int mem = READ_ONCE(rb->mem); + unsigned int subm = READ_ONCE(rb->subm); + + return pblk_rb_ring_count(mem, subm, rb->nr_entries); +} + +unsigned int pblk_rb_sync_count(struct pblk_rb *rb) +{ + unsigned int mem = READ_ONCE(rb->mem); + unsigned int sync = READ_ONCE(rb->sync); + + return pblk_rb_ring_count(mem, sync, rb->nr_entries); +} + +unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) +{ + unsigned int subm; + + subm = READ_ONCE(rb->subm); + /* Commit read means updating submission pointer */ + smp_store_release(&rb->subm, + (subm + nr_entries) & (rb->nr_entries - 1)); + + return subm; +} + +static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct pblk_line *line; + struct pblk_rb_entry *entry; + struct pblk_w_ctx *w_ctx; + unsigned int user_io = 0, gc_io = 0; + unsigned int i; + int flags; + + for (i = 0; i < to_update; i++) { + entry = &rb->entries[rb->l2p_update]; + w_ctx = &entry->w_ctx; + + flags = READ_ONCE(entry->w_ctx.flags); + if (flags & PBLK_IOTYPE_USER) + user_io++; + else if (flags & PBLK_IOTYPE_GC) + gc_io++; + else + WARN(1, "pblk: unknown IO type\n"); + + pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, + entry->cacheline); + + line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)]; + kref_put(&line->ref, pblk_line_put); + clean_wctx(w_ctx); + rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); + } + + pblk_rl_out(&pblk->rl, user_io, gc_io); + + return 0; +} + +/* + * When we move the l2p_update pointer, we update the l2p table - lookups will + * point to the physical address instead of to the cacheline in the write buffer + * from this moment on. + */ +static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int mem, unsigned int sync) +{ + unsigned int space, count; + int ret = 0; + + lockdep_assert_held(&rb->w_lock); + + /* Update l2p only as buffer entries are being overwritten */ + space = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries); + if (space > nr_entries) + goto out; + + count = nr_entries - space; + /* l2p_update used exclusively under rb->w_lock */ + ret = __pblk_rb_update_l2p(rb, count); + +out: + return ret; +} + +/* + * Update the l2p entry for all sectors stored on the write buffer. This means + * that all future lookups to the l2p table will point to a device address, not + * to the cacheline in the write buffer. + */ +void pblk_rb_sync_l2p(struct pblk_rb *rb) +{ + unsigned int sync; + unsigned int to_update; + + spin_lock(&rb->w_lock); + + /* Protect from reads and writes */ + sync = smp_load_acquire(&rb->sync); + + to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); + __pblk_rb_update_l2p(rb, to_update); + + spin_unlock(&rb->w_lock); +} + +/* + * Write @nr_entries to ring buffer from @data buffer if there is enough space. + * Typically, 4KB data chunks coming from a bio will be copied to the ring + * buffer, thus the write will fail if not all incoming data can be copied. + * + */ +static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data, + struct pblk_w_ctx w_ctx, + struct pblk_rb_entry *entry) +{ + memcpy(entry->data, data, rb->seg_size); + + entry->w_ctx.lba = w_ctx.lba; + entry->w_ctx.ppa = w_ctx.ppa; +} + +void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, + struct pblk_w_ctx w_ctx, unsigned int ring_pos) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct pblk_rb_entry *entry; + int flags; + + entry = &rb->entries[ring_pos]; + flags = READ_ONCE(entry->w_ctx.flags); +#ifdef CONFIG_NVM_PBLK_DEBUG + /* Caller must guarantee that the entry is free */ + BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); +#endif + + __pblk_rb_write_entry(rb, data, w_ctx, entry); + + pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline); + flags = w_ctx.flags | PBLK_WRITTEN_DATA; + + /* Release flags on write context. Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); +} + +void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, + struct pblk_w_ctx w_ctx, struct pblk_line *line, + u64 paddr, unsigned int ring_pos) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct pblk_rb_entry *entry; + int flags; + + entry = &rb->entries[ring_pos]; + flags = READ_ONCE(entry->w_ctx.flags); +#ifdef CONFIG_NVM_PBLK_DEBUG + /* Caller must guarantee that the entry is free */ + BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); +#endif + + __pblk_rb_write_entry(rb, data, w_ctx, entry); + + if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr)) + entry->w_ctx.lba = ADDR_EMPTY; + + flags = w_ctx.flags | PBLK_WRITTEN_DATA; + + /* Release flags on write context. Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); +} + +static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio, + unsigned int pos) +{ + struct pblk_rb_entry *entry; + unsigned int sync, flush_point; + + pblk_rb_sync_init(rb, NULL); + sync = READ_ONCE(rb->sync); + + if (pos == sync) { + pblk_rb_sync_end(rb, NULL); + return 0; + } + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_inc(&rb->inflight_flush_point); +#endif + + flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); + entry = &rb->entries[flush_point]; + + /* Protect flush points */ + smp_store_release(&rb->flush_point, flush_point); + + if (bio) + bio_list_add(&entry->w_ctx.bios, bio); + + pblk_rb_sync_end(rb, NULL); + + return bio ? 1 : 0; +} + +static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int *pos) +{ + unsigned int mem; + unsigned int sync; + + sync = READ_ONCE(rb->sync); + mem = READ_ONCE(rb->mem); + + if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < nr_entries) + return 0; + + if (pblk_rb_update_l2p(rb, nr_entries, mem, sync)) + return 0; + + *pos = mem; + + return 1; +} + +static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int *pos) +{ + if (!__pblk_rb_may_write(rb, nr_entries, pos)) + return 0; + + /* Protect from read count */ + smp_store_release(&rb->mem, (*pos + nr_entries) & (rb->nr_entries - 1)); + return 1; +} + +void pblk_rb_flush(struct pblk_rb *rb) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + unsigned int mem = READ_ONCE(rb->mem); + + if (pblk_rb_flush_point_set(rb, NULL, mem)) + return; + + pblk_write_kick(pblk); +} + +static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int *pos, struct bio *bio, + int *io_ret) +{ + unsigned int mem; + + if (!__pblk_rb_may_write(rb, nr_entries, pos)) + return 0; + + mem = (*pos + nr_entries) & (rb->nr_entries - 1); + *io_ret = NVM_IO_DONE; + + if (bio->bi_opf & REQ_PREFLUSH) { + struct pblk *pblk = container_of(rb, struct pblk, rwb); + + atomic64_inc(&pblk->nr_flush); + if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem)) + *io_ret = NVM_IO_OK; + } + + /* Protect from read count */ + smp_store_release(&rb->mem, mem); + + return 1; +} + +/* + * Atomically check that (i) there is space on the write buffer for the + * incoming I/O, and (ii) the current I/O type has enough budget in the write + * buffer (rate-limiter). + */ +int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, + unsigned int nr_entries, unsigned int *pos) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + int io_ret; + + spin_lock(&rb->w_lock); + io_ret = pblk_rl_user_may_insert(&pblk->rl, nr_entries); + if (io_ret) { + spin_unlock(&rb->w_lock); + return io_ret; + } + + if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_ret)) { + spin_unlock(&rb->w_lock); + return NVM_IO_REQUEUE; + } + + pblk_rl_user_in(&pblk->rl, nr_entries); + spin_unlock(&rb->w_lock); + + return io_ret; +} + +/* + * Look at pblk_rb_may_write_user comment + */ +int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int *pos) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + + spin_lock(&rb->w_lock); + if (!pblk_rl_gc_may_insert(&pblk->rl, nr_entries)) { + spin_unlock(&rb->w_lock); + return 0; + } + + if (!pblk_rb_may_write(rb, nr_entries, pos)) { + spin_unlock(&rb->w_lock); + return 0; + } + + pblk_rl_gc_in(&pblk->rl, nr_entries); + spin_unlock(&rb->w_lock); + + return 1; +} + +/* + * Read available entries on rb and add them to the given bio. To avoid a memory + * copy, a page reference to the write buffer is used to be added to the bio. + * + * This function is used by the write thread to form the write bio that will + * persist data on the write buffer to the media. + */ +unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, + unsigned int pos, unsigned int nr_entries, + unsigned int count) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct request_queue *q = pblk->dev->q; + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct bio *bio = rqd->bio; + struct pblk_rb_entry *entry; + struct page *page; + unsigned int pad = 0, to_read = nr_entries; + unsigned int i; + int flags; + + if (count < nr_entries) { + pad = nr_entries - count; + to_read = count; + } + + c_ctx->sentry = pos; + c_ctx->nr_valid = to_read; + c_ctx->nr_padded = pad; + + for (i = 0; i < to_read; i++) { + entry = &rb->entries[pos]; + + /* A write has been allowed into the buffer, but data is still + * being copied to it. It is ok to busy wait. + */ +try: + flags = READ_ONCE(entry->w_ctx.flags); + if (!(flags & PBLK_WRITTEN_DATA)) { + io_schedule(); + goto try; + } + + page = virt_to_page(entry->data); + if (!page) { + pblk_err(pblk, "could not allocate write bio page\n"); + flags &= ~PBLK_WRITTEN_DATA; + flags |= PBLK_SUBMITTED_ENTRY; + /* Release flags on context. Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); + return NVM_IO_ERR; + } + + if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != + rb->seg_size) { + pblk_err(pblk, "could not add page to write bio\n"); + flags &= ~PBLK_WRITTEN_DATA; + flags |= PBLK_SUBMITTED_ENTRY; + /* Release flags on context. Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); + return NVM_IO_ERR; + } + + flags &= ~PBLK_WRITTEN_DATA; + flags |= PBLK_SUBMITTED_ENTRY; + + /* Release flags on context. Protect from writes */ + smp_store_release(&entry->w_ctx.flags, flags); + + pos = (pos + 1) & (rb->nr_entries - 1); + } + + if (pad) { + if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { + pblk_err(pblk, "could not pad page in write bio\n"); + return NVM_IO_ERR; + } + + if (pad < pblk->min_write_pgs) + atomic64_inc(&pblk->pad_dist[pad - 1]); + else + pblk_warn(pblk, "padding more than min. sectors\n"); + + atomic64_add(pad, &pblk->pad_wa); + } + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_add(pad, &pblk->padded_writes); +#endif + + return NVM_IO_OK; +} + +/* + * Copy to bio only if the lba matches the one on the given cache entry. + * Otherwise, it means that the entry has been overwritten, and the bio should + * be directed to disk. + */ +int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, + struct ppa_addr ppa, int bio_iter, bool advanced_bio) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct pblk_rb_entry *entry; + struct pblk_w_ctx *w_ctx; + struct ppa_addr l2p_ppa; + u64 pos = pblk_addr_to_cacheline(ppa); + void *data; + int flags; + int ret = 1; + + +#ifdef CONFIG_NVM_PBLK_DEBUG + /* Caller must ensure that the access will not cause an overflow */ + BUG_ON(pos >= rb->nr_entries); +#endif + entry = &rb->entries[pos]; + w_ctx = &entry->w_ctx; + flags = READ_ONCE(w_ctx->flags); + + spin_lock(&rb->w_lock); + spin_lock(&pblk->trans_lock); + l2p_ppa = pblk_trans_map_get(pblk, lba); + spin_unlock(&pblk->trans_lock); + + /* Check if the entry has been overwritten or is scheduled to be */ + if (!pblk_ppa_comp(l2p_ppa, ppa) || w_ctx->lba != lba || + flags & PBLK_WRITABLE_ENTRY) { + ret = 0; + goto out; + } + + /* Only advance the bio if it hasn't been advanced already. If advanced, + * this bio is at least a partial bio (i.e., it has partially been + * filled with data from the cache). If part of the data resides on the + * media, we will read later on + */ + if (unlikely(!advanced_bio)) + bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE); + + data = bio_data(bio); + memcpy(data, entry->data, rb->seg_size); + +out: + spin_unlock(&rb->w_lock); + return ret; +} + +struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos) +{ + unsigned int entry = pos & (rb->nr_entries - 1); + + return &rb->entries[entry].w_ctx; +} + +unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags) + __acquires(&rb->s_lock) +{ + if (flags) + spin_lock_irqsave(&rb->s_lock, *flags); + else + spin_lock_irq(&rb->s_lock); + + return rb->sync; +} + +void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags) + __releases(&rb->s_lock) +{ + lockdep_assert_held(&rb->s_lock); + + if (flags) + spin_unlock_irqrestore(&rb->s_lock, *flags); + else + spin_unlock_irq(&rb->s_lock); +} + +unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) +{ + unsigned int sync, flush_point; + lockdep_assert_held(&rb->s_lock); + + sync = READ_ONCE(rb->sync); + flush_point = READ_ONCE(rb->flush_point); + + if (flush_point != EMPTY_ENTRY) { + unsigned int secs_to_flush; + + secs_to_flush = pblk_rb_ring_count(flush_point, sync, + rb->nr_entries); + if (secs_to_flush < nr_entries) { + /* Protect flush points */ + smp_store_release(&rb->flush_point, EMPTY_ENTRY); + } + } + + sync = (sync + nr_entries) & (rb->nr_entries - 1); + + /* Protect from counts */ + smp_store_release(&rb->sync, sync); + + return sync; +} + +/* Calculate how many sectors to submit up to the current flush point. */ +unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb) +{ + unsigned int subm, sync, flush_point; + unsigned int submitted, to_flush; + + /* Protect flush points */ + flush_point = smp_load_acquire(&rb->flush_point); + if (flush_point == EMPTY_ENTRY) + return 0; + + /* Protect syncs */ + sync = smp_load_acquire(&rb->sync); + + subm = READ_ONCE(rb->subm); + submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries); + + /* The sync point itself counts as a sector to sync */ + to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1; + + return (submitted < to_flush) ? (to_flush - submitted) : 0; +} + +/* + * Scan from the current position of the sync pointer to find the entry that + * corresponds to the given ppa. This is necessary since write requests can be + * completed out of order. The assumption is that the ppa is close to the sync + * pointer thus the search will not take long. + * + * The caller of this function must guarantee that the sync pointer will no + * reach the entry while it is using the metadata associated with it. With this + * assumption in mind, there is no need to take the sync lock. + */ +struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb, + struct ppa_addr *ppa) +{ + unsigned int sync, subm, count; + unsigned int i; + + sync = READ_ONCE(rb->sync); + subm = READ_ONCE(rb->subm); + count = pblk_rb_ring_count(subm, sync, rb->nr_entries); + + for (i = 0; i < count; i++) + sync = (sync + 1) & (rb->nr_entries - 1); + + return NULL; +} + +int pblk_rb_tear_down_check(struct pblk_rb *rb) +{ + struct pblk_rb_entry *entry; + int i; + int ret = 0; + + spin_lock(&rb->w_lock); + spin_lock_irq(&rb->s_lock); + + if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && + (rb->sync == rb->l2p_update) && + (rb->flush_point == EMPTY_ENTRY)) { + goto out; + } + + if (!rb->entries) { + ret = 1; + goto out; + } + + for (i = 0; i < rb->nr_entries; i++) { + entry = &rb->entries[i]; + + if (!entry->data) { + ret = 1; + goto out; + } + } + +out: + spin_unlock_irq(&rb->s_lock); + spin_unlock(&rb->w_lock); + + return ret; +} + +unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos) +{ + return (pos & (rb->nr_entries - 1)); +} + +int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos) +{ + return (pos >= rb->nr_entries); +} + +ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) +{ + struct pblk *pblk = container_of(rb, struct pblk, rwb); + struct pblk_c_ctx *c; + ssize_t offset; + int queued_entries = 0; + + spin_lock_irq(&rb->s_lock); + list_for_each_entry(c, &pblk->compl_list, list) + queued_entries++; + spin_unlock_irq(&rb->s_lock); + + if (rb->flush_point != EMPTY_ENTRY) + offset = scnprintf(buf, PAGE_SIZE, + "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", + rb->nr_entries, + rb->mem, + rb->subm, + rb->sync, + rb->l2p_update, +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_read(&rb->inflight_flush_point), +#else + 0, +#endif + rb->flush_point, + pblk_rb_read_count(rb), + pblk_rb_space(rb), + pblk_rb_flush_point_count(rb), + queued_entries); + else + offset = scnprintf(buf, PAGE_SIZE, + "%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n", + rb->nr_entries, + rb->mem, + rb->subm, + rb->sync, + rb->l2p_update, +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_read(&rb->inflight_flush_point), +#else + 0, +#endif + pblk_rb_read_count(rb), + pblk_rb_space(rb), + pblk_rb_flush_point_count(rb), + queued_entries); + + return offset; +} diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c new file mode 100644 index 000000000..5a46d7f93 --- /dev/null +++ b/drivers/lightnvm/pblk-read.c @@ -0,0 +1,701 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-read.c - pblk's read path + */ + +#include "pblk.h" + +/* + * There is no guarantee that the value read from cache has not been updated and + * resides at another location in the cache. We guarantee though that if the + * value is read from the cache, it belongs to the mapped lba. In order to + * guarantee and order between writes and reads are ordered, a flush must be + * issued. + */ +static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, + sector_t lba, struct ppa_addr ppa, + int bio_iter, bool advanced_bio) +{ +#ifdef CONFIG_NVM_PBLK_DEBUG + /* Callers must ensure that the ppa points to a cache address */ + BUG_ON(pblk_ppa_empty(ppa)); + BUG_ON(!pblk_addr_in_cache(ppa)); +#endif + + return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa, + bio_iter, advanced_bio); +} + +static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, + struct bio *bio, sector_t blba, + unsigned long *read_bitmap) +{ + struct pblk_sec_meta *meta_list = rqd->meta_list; + struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; + int nr_secs = rqd->nr_ppas; + bool advanced_bio = false; + int i, j = 0; + + pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs); + + for (i = 0; i < nr_secs; i++) { + struct ppa_addr p = ppas[i]; + sector_t lba = blba + i; + +retry: + if (pblk_ppa_empty(p)) { + WARN_ON(test_and_set_bit(i, read_bitmap)); + meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); + + if (unlikely(!advanced_bio)) { + bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); + advanced_bio = true; + } + + goto next; + } + + /* Try to read from write buffer. The address is later checked + * on the write buffer to prevent retrieving overwritten data. + */ + if (pblk_addr_in_cache(p)) { + if (!pblk_read_from_cache(pblk, bio, lba, p, i, + advanced_bio)) { + pblk_lookup_l2p_seq(pblk, &p, lba, 1); + goto retry; + } + WARN_ON(test_and_set_bit(i, read_bitmap)); + meta_list[i].lba = cpu_to_le64(lba); + advanced_bio = true; +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_inc(&pblk->cache_reads); +#endif + } else { + /* Read from media non-cached sectors */ + rqd->ppa_list[j++] = p; + } + +next: + if (advanced_bio) + bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); + } + + if (pblk_io_aligned(pblk, nr_secs)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_add(nr_secs, &pblk->inflight_reads); +#endif +} + + +static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, + sector_t blba) +{ + struct pblk_sec_meta *meta_lba_list = rqd->meta_list; + int nr_lbas = rqd->nr_ppas; + int i; + + for (i = 0; i < nr_lbas; i++) { + u64 lba = le64_to_cpu(meta_lba_list[i].lba); + + if (lba == ADDR_EMPTY) + continue; + + if (lba != blba + i) { +#ifdef CONFIG_NVM_PBLK_DEBUG + struct ppa_addr *p; + + p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr; + print_ppa(pblk, p, "seq", i); +#endif + pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", + lba, (u64)blba + i); + WARN_ON(1); + } + } +} + +/* + * There can be holes in the lba list. + */ +static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, + u64 *lba_list, int nr_lbas) +{ + struct pblk_sec_meta *meta_lba_list = rqd->meta_list; + int i, j; + + for (i = 0, j = 0; i < nr_lbas; i++) { + u64 lba = lba_list[i]; + u64 meta_lba; + + if (lba == ADDR_EMPTY) + continue; + + meta_lba = le64_to_cpu(meta_lba_list[j].lba); + + if (lba != meta_lba) { +#ifdef CONFIG_NVM_PBLK_DEBUG + struct ppa_addr *p; + int nr_ppas = rqd->nr_ppas; + + p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr; + print_ppa(pblk, p, "seq", j); +#endif + pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", + lba, meta_lba); + WARN_ON(1); + } + + j++; + } + + WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n"); +} + +static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct ppa_addr *ppa_list; + int i; + + ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; + + for (i = 0; i < rqd->nr_ppas; i++) { + struct ppa_addr ppa = ppa_list[i]; + struct pblk_line *line; + + line = &pblk->lines[pblk_ppa_to_line(ppa)]; + kref_put(&line->ref, pblk_line_put_wq); + } +} + +static void pblk_end_user_read(struct bio *bio) +{ +#ifdef CONFIG_NVM_PBLK_DEBUG + WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n"); +#endif + bio_endio(bio); +} + +static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, + bool put_line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct bio *int_bio = rqd->bio; + unsigned long start_time = r_ctx->start_time; + + generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time); + + if (rqd->error) + pblk_log_read_err(pblk, rqd); + + pblk_read_check_seq(pblk, rqd, r_ctx->lba); + + if (int_bio) + bio_put(int_bio); + + if (put_line) + pblk_read_put_rqd_kref(pblk, rqd); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); + atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); +#endif + + pblk_free_rqd(pblk, rqd, PBLK_READ); + atomic_dec(&pblk->inflight_io); +} + +static void pblk_end_io_read(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct bio *bio = (struct bio *)r_ctx->private; + + pblk_end_user_read(bio); + __pblk_end_io_read(pblk, rqd, true); +} + +static void pblk_end_partial_read(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct pblk_pr_ctx *pr_ctx = r_ctx->private; + struct bio *new_bio = rqd->bio; + struct bio *bio = pr_ctx->orig_bio; + struct bio_vec src_bv, dst_bv; + struct pblk_sec_meta *meta_list = rqd->meta_list; + int bio_init_idx = pr_ctx->bio_init_idx; + unsigned long *read_bitmap = pr_ctx->bitmap; + int nr_secs = pr_ctx->orig_nr_secs; + int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); + __le64 *lba_list_mem, *lba_list_media; + void *src_p, *dst_p; + int hole, i; + + if (unlikely(nr_holes == 1)) { + struct ppa_addr ppa; + + ppa = rqd->ppa_addr; + rqd->ppa_list = pr_ctx->ppa_ptr; + rqd->dma_ppa_list = pr_ctx->dma_ppa_list; + rqd->ppa_list[0] = ppa; + } + + /* Re-use allocated memory for intermediate lbas */ + lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); + lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); + + for (i = 0; i < nr_secs; i++) { + lba_list_media[i] = meta_list[i].lba; + meta_list[i].lba = lba_list_mem[i]; + } + + /* Fill the holes in the original bio */ + i = 0; + hole = find_first_zero_bit(read_bitmap, nr_secs); + do { + int line_id = pblk_ppa_to_line(rqd->ppa_list[i]); + struct pblk_line *line = &pblk->lines[line_id]; + + kref_put(&line->ref, pblk_line_put); + + meta_list[hole].lba = lba_list_media[i]; + + src_bv = new_bio->bi_io_vec[i++]; + dst_bv = bio->bi_io_vec[bio_init_idx + hole]; + + src_p = kmap_atomic(src_bv.bv_page); + dst_p = kmap_atomic(dst_bv.bv_page); + + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, + PBLK_EXPOSED_PAGE_SIZE); + + kunmap_atomic(src_p); + kunmap_atomic(dst_p); + + mempool_free(src_bv.bv_page, &pblk->page_bio_pool); + + hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1); + } while (hole < nr_secs); + + bio_put(new_bio); + kfree(pr_ctx); + + /* restore original request */ + rqd->bio = NULL; + rqd->nr_ppas = nr_secs; + + bio_endio(bio); + __pblk_end_io_read(pblk, rqd, false); +} + +static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int bio_init_idx, + unsigned long *read_bitmap, + int nr_holes) +{ + struct pblk_sec_meta *meta_list = rqd->meta_list; + struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct pblk_pr_ctx *pr_ctx; + struct bio *new_bio, *bio = r_ctx->private; + __le64 *lba_list_mem; + int nr_secs = rqd->nr_ppas; + int i; + + /* Re-use allocated memory for intermediate lbas */ + lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); + + new_bio = bio_alloc(GFP_KERNEL, nr_holes); + + if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) + goto fail_bio_put; + + if (nr_holes != new_bio->bi_vcnt) { + WARN_ONCE(1, "pblk: malformed bio\n"); + goto fail_free_pages; + } + + pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL); + if (!pr_ctx) + goto fail_free_pages; + + for (i = 0; i < nr_secs; i++) + lba_list_mem[i] = meta_list[i].lba; + + new_bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(new_bio, REQ_OP_READ, 0); + + rqd->bio = new_bio; + rqd->nr_ppas = nr_holes; + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + + pr_ctx->ppa_ptr = NULL; + pr_ctx->orig_bio = bio; + bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA); + pr_ctx->bio_init_idx = bio_init_idx; + pr_ctx->orig_nr_secs = nr_secs; + r_ctx->private = pr_ctx; + + if (unlikely(nr_holes == 1)) { + pr_ctx->ppa_ptr = rqd->ppa_list; + pr_ctx->dma_ppa_list = rqd->dma_ppa_list; + rqd->ppa_addr = rqd->ppa_list[0]; + } + return 0; + +fail_free_pages: + pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt); +fail_bio_put: + bio_put(new_bio); + + return -ENOMEM; +} + +static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int bio_init_idx, + unsigned long *read_bitmap, int nr_secs) +{ + int nr_holes; + int ret; + + nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); + + if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap, + nr_holes)) + return NVM_IO_ERR; + + rqd->end_io = pblk_end_partial_read; + + ret = pblk_submit_io(pblk, rqd); + if (ret) { + bio_put(rqd->bio); + pblk_err(pblk, "partial read IO submission failed\n"); + goto err; + } + + return NVM_IO_OK; + +err: + pblk_err(pblk, "failed to perform partial read\n"); + + /* Free allocated pages in new bio */ + pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt); + __pblk_end_io_read(pblk, rqd, false); + return NVM_IO_ERR; +} + +static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, + sector_t lba, unsigned long *read_bitmap) +{ + struct pblk_sec_meta *meta_list = rqd->meta_list; + struct ppa_addr ppa; + + pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_inc(&pblk->inflight_reads); +#endif + +retry: + if (pblk_ppa_empty(ppa)) { + WARN_ON(test_and_set_bit(0, read_bitmap)); + meta_list[0].lba = cpu_to_le64(ADDR_EMPTY); + return; + } + + /* Try to read from write buffer. The address is later checked on the + * write buffer to prevent retrieving overwritten data. + */ + if (pblk_addr_in_cache(ppa)) { + if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0, 1)) { + pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); + goto retry; + } + + WARN_ON(test_and_set_bit(0, read_bitmap)); + meta_list[0].lba = cpu_to_le64(lba); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_inc(&pblk->cache_reads); +#endif + } else { + rqd->ppa_addr = ppa; + } + + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); +} + +int pblk_submit_read(struct pblk *pblk, struct bio *bio) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct request_queue *q = dev->q; + sector_t blba = pblk_get_lba(bio); + unsigned int nr_secs = pblk_get_secs(bio); + struct pblk_g_ctx *r_ctx; + struct nvm_rq *rqd; + unsigned int bio_init_idx; + DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA); + int ret = NVM_IO_ERR; + + /* logic error: lba out-of-bounds. Ignore read request */ + if (blba >= pblk->rl.nr_secs || nr_secs > PBLK_MAX_REQ_ADDRS) { + WARN(1, "pblk: read lba out of bounds (lba:%llu, nr:%d)\n", + (unsigned long long)blba, nr_secs); + return NVM_IO_ERR; + } + + generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio), + &pblk->disk->part0); + + bitmap_zero(read_bitmap, nr_secs); + + rqd = pblk_alloc_rqd(pblk, PBLK_READ); + + rqd->opcode = NVM_OP_PREAD; + rqd->nr_ppas = nr_secs; + rqd->bio = NULL; /* cloned bio if needed */ + rqd->private = pblk; + rqd->end_io = pblk_end_io_read; + + r_ctx = nvm_rq_to_pdu(rqd); + r_ctx->start_time = jiffies; + r_ctx->lba = blba; + r_ctx->private = bio; /* original bio */ + + /* Save the index for this bio's start. This is needed in case + * we need to fill a partial read. + */ + bio_init_idx = pblk_get_bi_idx(bio); + + rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd->dma_meta_list); + if (!rqd->meta_list) { + pblk_err(pblk, "not able to allocate ppa list\n"); + goto fail_rqd_free; + } + + if (nr_secs > 1) { + rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; + rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; + + pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap); + } else { + pblk_read_rq(pblk, rqd, bio, blba, read_bitmap); + } + + if (bitmap_full(read_bitmap, nr_secs)) { + atomic_inc(&pblk->inflight_io); + __pblk_end_io_read(pblk, rqd, false); + return NVM_IO_DONE; + } + + /* All sectors are to be read from the device */ + if (bitmap_empty(read_bitmap, rqd->nr_ppas)) { + struct bio *int_bio = NULL; + + /* Clone read bio to deal with read errors internally */ + int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); + if (!int_bio) { + pblk_err(pblk, "could not clone read bio\n"); + goto fail_end_io; + } + + rqd->bio = int_bio; + + if (pblk_submit_io(pblk, rqd)) { + pblk_err(pblk, "read IO submission failed\n"); + ret = NVM_IO_ERR; + goto fail_end_io; + } + + return NVM_IO_OK; + } + + /* The read bio request could be partially filled by the write buffer, + * but there are some holes that need to be read from the drive. + */ + ret = pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap, + nr_secs); + if (ret) + goto fail_meta_free; + + return NVM_IO_OK; + +fail_meta_free: + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); +fail_rqd_free: + pblk_free_rqd(pblk, rqd, PBLK_READ); + return ret; +fail_end_io: + __pblk_end_io_read(pblk, rqd, false); + return ret; +} + +static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_line *line, u64 *lba_list, + u64 *paddr_list_gc, unsigned int nr_secs) +{ + struct ppa_addr ppa_list_l2p[PBLK_MAX_REQ_ADDRS]; + struct ppa_addr ppa_gc; + int valid_secs = 0; + int i; + + pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs); + + for (i = 0; i < nr_secs; i++) { + if (lba_list[i] == ADDR_EMPTY) + continue; + + ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id); + if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) { + paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY; + continue; + } + + rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; + } + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_add(valid_secs, &pblk->inflight_reads); +#endif + + return valid_secs; +} + +static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_line *line, sector_t lba, + u64 paddr_gc) +{ + struct ppa_addr ppa_l2p, ppa_gc; + int valid_secs = 0; + + if (lba == ADDR_EMPTY) + goto out; + + /* logic error: lba out-of-bounds */ + if (lba >= pblk->rl.nr_secs) { + WARN(1, "pblk: read lba out of bounds\n"); + goto out; + } + + spin_lock(&pblk->trans_lock); + ppa_l2p = pblk_trans_map_get(pblk, lba); + spin_unlock(&pblk->trans_lock); + + ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id); + if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) + goto out; + + rqd->ppa_addr = ppa_l2p; + valid_secs = 1; + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_inc(&pblk->inflight_reads); +#endif + +out: + return valid_secs; +} + +int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct bio *bio; + struct nvm_rq rqd; + int data_len; + int ret = NVM_IO_OK; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd.dma_meta_list); + if (!rqd.meta_list) + return -ENOMEM; + + if (gc_rq->nr_secs > 1) { + rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; + rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; + + gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line, + gc_rq->lba_list, + gc_rq->paddr_list, + gc_rq->nr_secs); + if (gc_rq->secs_to_gc == 1) + rqd.ppa_addr = rqd.ppa_list[0]; + } else { + gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line, + gc_rq->lba_list[0], + gc_rq->paddr_list[0]); + } + + if (!(gc_rq->secs_to_gc)) + goto out; + + data_len = (gc_rq->secs_to_gc) * geo->csecs; + bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, + PBLK_VMALLOC_META, GFP_KERNEL); + if (IS_ERR(bio)) { + pblk_err(pblk, "could not allocate GC bio (%lu)\n", + PTR_ERR(bio)); + goto err_free_dma; + } + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + rqd.opcode = NVM_OP_PREAD; + rqd.nr_ppas = gc_rq->secs_to_gc; + rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + rqd.bio = bio; + + if (pblk_submit_io_sync(pblk, &rqd)) { + ret = -EIO; + pblk_err(pblk, "GC read request failed\n"); + goto err_free_bio; + } + + pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs); + + atomic_dec(&pblk->inflight_io); + + if (rqd.error) { + atomic_long_inc(&pblk->read_failed_gc); +#ifdef CONFIG_NVM_PBLK_DEBUG + pblk_print_failed_rqd(pblk, &rqd, rqd.error); +#endif + } + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); + atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); + atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); +#endif + +out: + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); + return ret; + +err_free_bio: + bio_put(bio); +err_free_dma: + nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); + return ret; +} diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c new file mode 100644 index 000000000..df75d9cae --- /dev/null +++ b/drivers/lightnvm/pblk-recovery.c @@ -0,0 +1,1011 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial: Javier Gonzalez <javier@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-recovery.c - pblk's recovery path + */ + +#include "pblk.h" + +int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf) +{ + u32 crc; + + crc = pblk_calc_emeta_crc(pblk, emeta_buf); + if (le32_to_cpu(emeta_buf->crc) != crc) + return 1; + + if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) + return 1; + + return 0; +} + +static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_emeta *emeta = line->emeta; + struct line_emeta *emeta_buf = emeta->buf; + __le64 *lba_list; + u64 data_start, data_end; + u64 nr_valid_lbas, nr_lbas = 0; + u64 i; + + lba_list = emeta_to_lbas(pblk, emeta_buf); + if (!lba_list) + return 1; + + data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; + data_end = line->emeta_ssec; + nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); + + for (i = data_start; i < data_end; i++) { + struct ppa_addr ppa; + int pos; + + ppa = addr_to_gen_ppa(pblk, i, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + + /* Do not update bad blocks */ + if (test_bit(pos, line->blk_bitmap)) + continue; + + if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) { + spin_lock(&line->lock); + if (test_and_set_bit(i, line->invalid_bitmap)) + WARN_ONCE(1, "pblk: rec. double invalidate:\n"); + else + le32_add_cpu(line->vsc, -1); + spin_unlock(&line->lock); + + continue; + } + + pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa); + nr_lbas++; + } + + if (nr_valid_lbas != nr_lbas) + pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n", + line->id, nr_valid_lbas, nr_lbas); + + line->left_msecs = 0; + + return 0; +} + +static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); + + return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] - + nr_bb * geo->clba; +} + +struct pblk_recov_alloc { + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct nvm_rq *rqd; + void *data; + dma_addr_t dma_ppa_list; + dma_addr_t dma_meta_list; +}; + +static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, + struct pblk_recov_alloc p, u64 r_ptr) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct nvm_rq *rqd; + struct bio *bio; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + u64 r_ptr_int; + int left_ppas; + int rq_ppas, rq_len; + int i, j; + int ret = 0; + + ppa_list = p.ppa_list; + meta_list = p.meta_list; + rqd = p.rqd; + data = p.data; + dma_ppa_list = p.dma_ppa_list; + dma_meta_list = p.dma_meta_list; + + left_ppas = line->cur_sec - r_ptr; + if (!left_ppas) + return 0; + + r_ptr_int = r_ptr; + +next_read_rq: + memset(rqd, 0, pblk_g_rq_size); + + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + if (!rq_ppas) + rq_ppas = pblk->min_write_pgs; + rq_len = rq_ppas * geo->csecs; + + bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + rqd->bio = bio; + rqd->opcode = NVM_OP_PREAD; + rqd->meta_list = meta_list; + rqd->nr_ppas = rq_ppas; + rqd->ppa_list = ppa_list; + rqd->dma_ppa_list = dma_ppa_list; + rqd->dma_meta_list = dma_meta_list; + + if (pblk_io_aligned(pblk, rq_ppas)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + + for (i = 0; i < rqd->nr_ppas; ) { + struct ppa_addr ppa; + int pos; + + ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + + while (test_bit(pos, line->blk_bitmap)) { + r_ptr_int += pblk->min_write_pgs; + ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + } + + for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++) + rqd->ppa_list[i] = + addr_to_gen_ppa(pblk, r_ptr_int, line->id); + } + + /* If read fails, more padding is needed */ + ret = pblk_submit_io_sync(pblk, rqd); + if (ret) { + pblk_err(pblk, "I/O submission failed: %d\n", ret); + return ret; + } + + atomic_dec(&pblk->inflight_io); + + /* At this point, the read should not fail. If it does, it is a problem + * we cannot recover from here. Need FTL log. + */ + if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { + pblk_err(pblk, "L2P recovery failed (%d)\n", rqd->error); + return -EINTR; + } + + for (i = 0; i < rqd->nr_ppas; i++) { + u64 lba = le64_to_cpu(meta_list[i].lba); + + if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs) + continue; + + pblk_update_map(pblk, lba, rqd->ppa_list[i]); + } + + left_ppas -= rq_ppas; + if (left_ppas > 0) + goto next_read_rq; + + return 0; +} + +static void pblk_recov_complete(struct kref *ref) +{ + struct pblk_pad_rq *pad_rq = container_of(ref, struct pblk_pad_rq, ref); + + complete(&pad_rq->wait); +} + +static void pblk_end_io_recov(struct nvm_rq *rqd) +{ + struct pblk_pad_rq *pad_rq = rqd->private; + struct pblk *pblk = pad_rq->pblk; + + pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); + + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); + + atomic_dec(&pblk->inflight_io); + kref_put(&pad_rq->ref, pblk_recov_complete); +} + +static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line, + int left_ppas) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct pblk_pad_rq *pad_rq; + struct nvm_rq *rqd; + struct bio *bio; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); + u64 w_ptr = line->cur_sec; + int left_line_ppas, rq_ppas, rq_len; + int i, j; + int ret = 0; + + spin_lock(&line->lock); + left_line_ppas = line->left_msecs; + spin_unlock(&line->lock); + + pad_rq = kmalloc(sizeof(struct pblk_pad_rq), GFP_KERNEL); + if (!pad_rq) + return -ENOMEM; + + data = vzalloc(array_size(pblk->max_write_pgs, geo->csecs)); + if (!data) { + ret = -ENOMEM; + goto free_rq; + } + + pad_rq->pblk = pblk; + init_completion(&pad_rq->wait); + kref_init(&pad_rq->ref); + +next_pad_rq: + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + if (rq_ppas < pblk->min_write_pgs) { + pblk_err(pblk, "corrupted pad line %d\n", line->id); + goto fail_free_pad; + } + + rq_len = rq_ppas * geo->csecs; + + meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); + if (!meta_list) { + ret = -ENOMEM; + goto fail_free_pad; + } + + ppa_list = (void *)(meta_list) + pblk_dma_meta_size; + dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + + bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, + PBLK_VMALLOC_META, GFP_KERNEL); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + goto fail_free_meta; + } + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); + + rqd->bio = bio; + rqd->opcode = NVM_OP_PWRITE; + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); + rqd->meta_list = meta_list; + rqd->nr_ppas = rq_ppas; + rqd->ppa_list = ppa_list; + rqd->dma_ppa_list = dma_ppa_list; + rqd->dma_meta_list = dma_meta_list; + rqd->end_io = pblk_end_io_recov; + rqd->private = pad_rq; + + for (i = 0; i < rqd->nr_ppas; ) { + struct ppa_addr ppa; + int pos; + + w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); + ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + + while (test_bit(pos, line->blk_bitmap)) { + w_ptr += pblk->min_write_pgs; + ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + } + + for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { + struct ppa_addr dev_ppa; + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + + dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); + + pblk_map_invalidate(pblk, dev_ppa); + lba_list[w_ptr] = meta_list[i].lba = addr_empty; + rqd->ppa_list[i] = dev_ppa; + } + } + + kref_get(&pad_rq->ref); + pblk_down_page(pblk, rqd->ppa_list, rqd->nr_ppas); + + ret = pblk_submit_io(pblk, rqd); + if (ret) { + pblk_err(pblk, "I/O submission failed: %d\n", ret); + pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); + goto fail_free_bio; + } + + left_line_ppas -= rq_ppas; + left_ppas -= rq_ppas; + if (left_ppas && left_line_ppas) + goto next_pad_rq; + + kref_put(&pad_rq->ref, pblk_recov_complete); + + if (!wait_for_completion_io_timeout(&pad_rq->wait, + msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { + pblk_err(pblk, "pad write timed out\n"); + ret = -ETIME; + } + + if (!pblk_line_is_full(line)) + pblk_err(pblk, "corrupted padded line: %d\n", line->id); + + vfree(data); +free_rq: + kfree(pad_rq); + return ret; + +fail_free_bio: + bio_put(bio); +fail_free_meta: + nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); +fail_free_pad: + kfree(pad_rq); + vfree(data); + return ret; +} + +/* When this function is called, it means that not all upper pages have been + * written in a page that contains valid data. In order to recover this data, we + * first find the write pointer on the device, then we pad all necessary + * sectors, and finally attempt to read the valid data + */ +static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, + struct pblk_recov_alloc p) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct nvm_rq *rqd; + struct bio *bio; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + u64 w_ptr = 0, r_ptr; + int rq_ppas, rq_len; + int i, j; + int ret = 0; + int rec_round; + int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec; + + ppa_list = p.ppa_list; + meta_list = p.meta_list; + rqd = p.rqd; + data = p.data; + dma_ppa_list = p.dma_ppa_list; + dma_meta_list = p.dma_meta_list; + + /* we could recover up until the line write pointer */ + r_ptr = line->cur_sec; + rec_round = 0; + +next_rq: + memset(rqd, 0, pblk_g_rq_size); + + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + if (!rq_ppas) + rq_ppas = pblk->min_write_pgs; + rq_len = rq_ppas * geo->csecs; + + bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + rqd->bio = bio; + rqd->opcode = NVM_OP_PREAD; + rqd->meta_list = meta_list; + rqd->nr_ppas = rq_ppas; + rqd->ppa_list = ppa_list; + rqd->dma_ppa_list = dma_ppa_list; + rqd->dma_meta_list = dma_meta_list; + + if (pblk_io_aligned(pblk, rq_ppas)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + + for (i = 0; i < rqd->nr_ppas; ) { + struct ppa_addr ppa; + int pos; + + w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); + ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + + while (test_bit(pos, line->blk_bitmap)) { + w_ptr += pblk->min_write_pgs; + ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + } + + for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) + rqd->ppa_list[i] = + addr_to_gen_ppa(pblk, w_ptr, line->id); + } + + ret = pblk_submit_io_sync(pblk, rqd); + if (ret) { + pblk_err(pblk, "I/O submission failed: %d\n", ret); + return ret; + } + + atomic_dec(&pblk->inflight_io); + + /* This should not happen since the read failed during normal recovery, + * but the media works funny sometimes... + */ + if (!rec_round++ && !rqd->error) { + rec_round = 0; + for (i = 0; i < rqd->nr_ppas; i++, r_ptr++) { + u64 lba = le64_to_cpu(meta_list[i].lba); + + if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs) + continue; + + pblk_update_map(pblk, lba, rqd->ppa_list[i]); + } + } + + /* Reached the end of the written line */ + if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) { + int pad_secs, nr_error_bits, bit; + int ret; + + bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas); + nr_error_bits = rqd->nr_ppas - bit; + + /* Roll back failed sectors */ + line->cur_sec -= nr_error_bits; + line->left_msecs += nr_error_bits; + bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits); + + pad_secs = pblk_pad_distance(pblk); + if (pad_secs > line->left_msecs) + pad_secs = line->left_msecs; + + ret = pblk_recov_pad_oob(pblk, line, pad_secs); + if (ret) + pblk_err(pblk, "OOB padding failed (err:%d)\n", ret); + + ret = pblk_recov_read_oob(pblk, line, p, r_ptr); + if (ret) + pblk_err(pblk, "OOB read failed (err:%d)\n", ret); + + left_ppas = 0; + } + + left_ppas -= rq_ppas; + if (left_ppas > 0) + goto next_rq; + + return ret; +} + +static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, + struct pblk_recov_alloc p, int *done) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct nvm_rq *rqd; + struct bio *bio; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + u64 paddr; + int rq_ppas, rq_len; + int i, j; + int ret = 0; + int left_ppas = pblk_calc_sec_in_line(pblk, line); + + ppa_list = p.ppa_list; + meta_list = p.meta_list; + rqd = p.rqd; + data = p.data; + dma_ppa_list = p.dma_ppa_list; + dma_meta_list = p.dma_meta_list; + + *done = 1; + +next_rq: + memset(rqd, 0, pblk_g_rq_size); + + rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); + if (!rq_ppas) + rq_ppas = pblk->min_write_pgs; + rq_len = rq_ppas * geo->csecs; + + bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + rqd->bio = bio; + rqd->opcode = NVM_OP_PREAD; + rqd->meta_list = meta_list; + rqd->nr_ppas = rq_ppas; + rqd->ppa_list = ppa_list; + rqd->dma_ppa_list = dma_ppa_list; + rqd->dma_meta_list = dma_meta_list; + + if (pblk_io_aligned(pblk, rq_ppas)) + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); + else + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + + for (i = 0; i < rqd->nr_ppas; ) { + struct ppa_addr ppa; + int pos; + + paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); + ppa = addr_to_gen_ppa(pblk, paddr, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + + while (test_bit(pos, line->blk_bitmap)) { + paddr += pblk->min_write_pgs; + ppa = addr_to_gen_ppa(pblk, paddr, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + } + + for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++) + rqd->ppa_list[i] = + addr_to_gen_ppa(pblk, paddr, line->id); + } + + ret = pblk_submit_io_sync(pblk, rqd); + if (ret) { + pblk_err(pblk, "I/O submission failed: %d\n", ret); + bio_put(bio); + return ret; + } + + atomic_dec(&pblk->inflight_io); + + /* Reached the end of the written line */ + if (rqd->error) { + int nr_error_bits, bit; + + bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas); + nr_error_bits = rqd->nr_ppas - bit; + + /* Roll back failed sectors */ + line->cur_sec -= nr_error_bits; + line->left_msecs += nr_error_bits; + bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits); + + left_ppas = 0; + rqd->nr_ppas = bit; + + if (rqd->error != NVM_RSP_ERR_EMPTYPAGE) + *done = 0; + } + + for (i = 0; i < rqd->nr_ppas; i++) { + u64 lba = le64_to_cpu(meta_list[i].lba); + + if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs) + continue; + + pblk_update_map(pblk, lba, rqd->ppa_list[i]); + } + + left_ppas -= rq_ppas; + if (left_ppas > 0) + goto next_rq; + + return ret; +} + +/* Scan line for lbas on out of bound area */ +static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct nvm_rq *rqd; + struct ppa_addr *ppa_list; + struct pblk_sec_meta *meta_list; + struct pblk_recov_alloc p; + void *data; + dma_addr_t dma_ppa_list, dma_meta_list; + int done, ret = 0; + + meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); + if (!meta_list) + return -ENOMEM; + + ppa_list = (void *)(meta_list) + pblk_dma_meta_size; + dma_ppa_list = dma_meta_list + pblk_dma_meta_size; + + data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto free_meta_list; + } + + rqd = pblk_alloc_rqd(pblk, PBLK_READ); + + p.ppa_list = ppa_list; + p.meta_list = meta_list; + p.rqd = rqd; + p.data = data; + p.dma_ppa_list = dma_ppa_list; + p.dma_meta_list = dma_meta_list; + + ret = pblk_recov_scan_oob(pblk, line, p, &done); + if (ret) { + pblk_err(pblk, "could not recover L2P from OOB\n"); + goto out; + } + + if (!done) { + ret = pblk_recov_scan_all_oob(pblk, line, p); + if (ret) { + pblk_err(pblk, "could not recover L2P from OOB\n"); + goto out; + } + } + + if (pblk_line_is_full(line)) + pblk_line_recov_close(pblk, line); + +out: + kfree(data); +free_meta_list: + nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); + + return ret; +} + +/* Insert lines ordered by sequence number (seq_num) on list */ +static void pblk_recov_line_add_ordered(struct list_head *head, + struct pblk_line *line) +{ + struct pblk_line *t = NULL; + + list_for_each_entry(t, head, list) + if (t->seq_nr > line->seq_nr) + break; + + __list_add(&line->list, t->list.prev, &t->list); +} + +static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + unsigned int emeta_secs; + u64 emeta_start; + struct ppa_addr ppa; + int pos; + + emeta_secs = lm->emeta_sec[0]; + emeta_start = lm->sec_per_line; + + while (emeta_secs) { + emeta_start--; + ppa = addr_to_gen_ppa(pblk, emeta_start, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + if (!test_bit(pos, line->blk_bitmap)) + emeta_secs--; + } + + return emeta_start; +} + +static int pblk_recov_check_line_version(struct pblk *pblk, + struct line_emeta *emeta) +{ + struct line_header *header = &emeta->header; + + if (header->version_major != EMETA_VERSION_MAJOR) { + pblk_err(pblk, "line major version mismatch: %d, expected: %d\n", + header->version_major, EMETA_VERSION_MAJOR); + return 1; + } + +#ifdef CONFIG_NVM_PBLK_DEBUG + if (header->version_minor > EMETA_VERSION_MINOR) + pblk_info(pblk, "newer line minor version found: %d\n", + header->version_minor); +#endif + + return 0; +} + +static void pblk_recov_wa_counters(struct pblk *pblk, + struct line_emeta *emeta) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct line_header *header = &emeta->header; + struct wa_counters *wa = emeta_to_wa(lm, emeta); + + /* WA counters were introduced in emeta version 0.2 */ + if (header->version_major > 0 || header->version_minor >= 2) { + u64 user = le64_to_cpu(wa->user); + u64 pad = le64_to_cpu(wa->pad); + u64 gc = le64_to_cpu(wa->gc); + + atomic64_set(&pblk->user_wa, user); + atomic64_set(&pblk->pad_wa, pad); + atomic64_set(&pblk->gc_wa, gc); + + pblk->user_rst_wa = user; + pblk->pad_rst_wa = pad; + pblk->gc_rst_wa = gc; + } +} + +static int pblk_line_was_written(struct pblk_line *line, + struct pblk *pblk) +{ + + struct pblk_line_meta *lm = &pblk->lm; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct nvm_chk_meta *chunk; + struct ppa_addr bppa; + int smeta_blk; + + if (line->state == PBLK_LINESTATE_BAD) + return 0; + + smeta_blk = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); + if (smeta_blk >= lm->blk_per_line) + return 0; + + bppa = pblk->luns[smeta_blk].bppa; + chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)]; + + if (chunk->state & NVM_CHK_ST_FREE) + return 0; + + return 1; +} + +struct pblk_line *pblk_recov_l2p(struct pblk *pblk) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *line, *tline, *data_line = NULL; + struct pblk_smeta *smeta; + struct pblk_emeta *emeta; + struct line_smeta *smeta_buf; + int found_lines = 0, recovered_lines = 0, open_lines = 0; + int is_next = 0; + int meta_line; + int i, valid_uuid = 0; + LIST_HEAD(recov_list); + + /* TODO: Implement FTL snapshot */ + + /* Scan recovery - takes place when FTL snapshot fails */ + spin_lock(&l_mg->free_lock); + meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); + set_bit(meta_line, &l_mg->meta_bitmap); + smeta = l_mg->sline_meta[meta_line]; + emeta = l_mg->eline_meta[meta_line]; + smeta_buf = (struct line_smeta *)smeta; + spin_unlock(&l_mg->free_lock); + + /* Order data lines using their sequence number */ + for (i = 0; i < l_mg->nr_lines; i++) { + u32 crc; + + line = &pblk->lines[i]; + + memset(smeta, 0, lm->smeta_len); + line->smeta = smeta; + line->lun_bitmap = ((void *)(smeta_buf)) + + sizeof(struct line_smeta); + + if (!pblk_line_was_written(line, pblk)) + continue; + + /* Lines that cannot be read are assumed as not written here */ + if (pblk_line_read_smeta(pblk, line)) + continue; + + crc = pblk_calc_smeta_crc(pblk, smeta_buf); + if (le32_to_cpu(smeta_buf->crc) != crc) + continue; + + if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) + continue; + + if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) { + pblk_err(pblk, "found incompatible line version %u\n", + smeta_buf->header.version_major); + return ERR_PTR(-EINVAL); + } + + /* The first valid instance uuid is used for initialization */ + if (!valid_uuid) { + memcpy(pblk->instance_uuid, smeta_buf->header.uuid, 16); + valid_uuid = 1; + } + + if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) { + pblk_debug(pblk, "ignore line %u due to uuid mismatch\n", + i); + continue; + } + + /* Update line metadata */ + spin_lock(&line->lock); + line->id = le32_to_cpu(smeta_buf->header.id); + line->type = le16_to_cpu(smeta_buf->header.type); + line->seq_nr = le64_to_cpu(smeta_buf->seq_nr); + spin_unlock(&line->lock); + + /* Update general metadata */ + spin_lock(&l_mg->free_lock); + if (line->seq_nr >= l_mg->d_seq_nr) + l_mg->d_seq_nr = line->seq_nr + 1; + l_mg->nr_free_lines--; + spin_unlock(&l_mg->free_lock); + + if (pblk_line_recov_alloc(pblk, line)) + goto out; + + pblk_recov_line_add_ordered(&recov_list, line); + found_lines++; + pblk_debug(pblk, "recovering data line %d, seq:%llu\n", + line->id, smeta_buf->seq_nr); + } + + if (!found_lines) { + pblk_setup_uuid(pblk); + + spin_lock(&l_mg->free_lock); + WARN_ON_ONCE(!test_and_clear_bit(meta_line, + &l_mg->meta_bitmap)); + spin_unlock(&l_mg->free_lock); + + goto out; + } + + /* Verify closed blocks and recover this portion of L2P table*/ + list_for_each_entry_safe(line, tline, &recov_list, list) { + recovered_lines++; + + line->emeta_ssec = pblk_line_emeta_start(pblk, line); + line->emeta = emeta; + memset(line->emeta->buf, 0, lm->emeta_len[0]); + + if (pblk_line_read_emeta(pblk, line, line->emeta->buf)) { + pblk_recov_l2p_from_oob(pblk, line); + goto next; + } + + if (pblk_recov_check_emeta(pblk, line->emeta->buf)) { + pblk_recov_l2p_from_oob(pblk, line); + goto next; + } + + if (pblk_recov_check_line_version(pblk, line->emeta->buf)) + return ERR_PTR(-EINVAL); + + pblk_recov_wa_counters(pblk, line->emeta->buf); + + if (pblk_recov_l2p_from_emeta(pblk, line)) + pblk_recov_l2p_from_oob(pblk, line); + +next: + if (pblk_line_is_full(line)) { + struct list_head *move_list; + + spin_lock(&line->lock); + line->state = PBLK_LINESTATE_CLOSED; + move_list = pblk_line_gc_list(pblk, line); + spin_unlock(&line->lock); + + spin_lock(&l_mg->gc_lock); + list_move_tail(&line->list, move_list); + spin_unlock(&l_mg->gc_lock); + + kfree(line->map_bitmap); + line->map_bitmap = NULL; + line->smeta = NULL; + line->emeta = NULL; + } else { + if (open_lines > 1) + pblk_err(pblk, "failed to recover L2P\n"); + + open_lines++; + line->meta_line = meta_line; + data_line = line; + } + } + + if (!open_lines) { + spin_lock(&l_mg->free_lock); + WARN_ON_ONCE(!test_and_clear_bit(meta_line, + &l_mg->meta_bitmap)); + spin_unlock(&l_mg->free_lock); + pblk_line_replace_data(pblk); + } else { + spin_lock(&l_mg->free_lock); + /* Allocate next line for preparation */ + l_mg->data_next = pblk_line_get(pblk); + if (l_mg->data_next) { + l_mg->data_next->seq_nr = l_mg->d_seq_nr++; + l_mg->data_next->type = PBLK_LINETYPE_DATA; + is_next = 1; + } + spin_unlock(&l_mg->free_lock); + } + + if (is_next) + pblk_line_erase(pblk, l_mg->data_next); + +out: + if (found_lines != recovered_lines) + pblk_err(pblk, "failed to recover all found lines %d/%d\n", + found_lines, recovered_lines); + + return data_line; +} + +/* + * Pad current line + */ +int pblk_recov_pad(struct pblk *pblk) +{ + struct pblk_line *line; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + int left_msecs; + int ret = 0; + + spin_lock(&l_mg->free_lock); + line = l_mg->data_line; + left_msecs = line->left_msecs; + spin_unlock(&l_mg->free_lock); + + ret = pblk_recov_pad_oob(pblk, line, left_msecs); + if (ret) { + pblk_err(pblk, "tear down padding failed (%d)\n", ret); + return ret; + } + + pblk_line_close_meta(pblk, line); + return ret; +} diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c new file mode 100644 index 000000000..6a0616a6f --- /dev/null +++ b/drivers/lightnvm/pblk-rl.c @@ -0,0 +1,250 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-rl.c - pblk's rate limiter for user I/O + * + */ + +#include "pblk.h" + +static void pblk_rl_kick_u_timer(struct pblk_rl *rl) +{ + mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000)); +} + +int pblk_rl_is_limit(struct pblk_rl *rl) +{ + int rb_space; + + rb_space = atomic_read(&rl->rb_space); + + return (rb_space == 0); +} + +int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries) +{ + int rb_user_cnt = atomic_read(&rl->rb_user_cnt); + int rb_space = atomic_read(&rl->rb_space); + + if (unlikely(rb_space >= 0) && (rb_space - nr_entries < 0)) + return NVM_IO_ERR; + + if (rb_user_cnt >= rl->rb_user_max) + return NVM_IO_REQUEUE; + + return NVM_IO_OK; +} + +void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries) +{ + int rb_space = atomic_read(&rl->rb_space); + + if (unlikely(rb_space >= 0)) + atomic_sub(nr_entries, &rl->rb_space); +} + +int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) +{ + int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt); + int rb_user_active; + + /* If there is no user I/O let GC take over space on the write buffer */ + rb_user_active = READ_ONCE(rl->rb_user_active); + return (!(rb_gc_cnt >= rl->rb_gc_max && rb_user_active)); +} + +void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries) +{ + atomic_add(nr_entries, &rl->rb_user_cnt); + + /* Release user I/O state. Protect from GC */ + smp_store_release(&rl->rb_user_active, 1); + pblk_rl_kick_u_timer(rl); +} + +void pblk_rl_werr_line_in(struct pblk_rl *rl) +{ + atomic_inc(&rl->werr_lines); +} + +void pblk_rl_werr_line_out(struct pblk_rl *rl) +{ + atomic_dec(&rl->werr_lines); +} + +void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries) +{ + atomic_add(nr_entries, &rl->rb_gc_cnt); +} + +void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc) +{ + atomic_sub(nr_user, &rl->rb_user_cnt); + atomic_sub(nr_gc, &rl->rb_gc_cnt); +} + +unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) +{ + return atomic_read(&rl->free_blocks); +} + +unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl) +{ + return atomic_read(&rl->free_user_blocks); +} + +static void __pblk_rl_update_rates(struct pblk_rl *rl, + unsigned long free_blocks) +{ + struct pblk *pblk = container_of(rl, struct pblk, rl); + int max = rl->rb_budget; + int werr_gc_needed = atomic_read(&rl->werr_lines); + + if (free_blocks >= rl->high) { + if (werr_gc_needed) { + /* Allocate a small budget for recovering + * lines with write errors + */ + rl->rb_gc_max = 1 << rl->rb_windows_pw; + rl->rb_user_max = max - rl->rb_gc_max; + rl->rb_state = PBLK_RL_WERR; + } else { + rl->rb_user_max = max; + rl->rb_gc_max = 0; + rl->rb_state = PBLK_RL_OFF; + } + } else if (free_blocks < rl->high) { + int shift = rl->high_pw - rl->rb_windows_pw; + int user_windows = free_blocks >> shift; + int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW; + + rl->rb_user_max = user_max; + rl->rb_gc_max = max - user_max; + + if (free_blocks <= rl->rsv_blocks) { + rl->rb_user_max = 0; + rl->rb_gc_max = max; + } + + /* In the worst case, we will need to GC lines in the low list + * (high valid sector count). If there are lines to GC on high + * or mid lists, these will be prioritized + */ + rl->rb_state = PBLK_RL_LOW; + } + + if (rl->rb_state != PBLK_RL_OFF) + pblk_gc_should_start(pblk); + else + pblk_gc_should_stop(pblk); +} + +void pblk_rl_update_rates(struct pblk_rl *rl) +{ + __pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl)); +} + +void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) +{ + int blk_in_line = atomic_read(&line->blk_in_line); + int free_blocks; + + atomic_add(blk_in_line, &rl->free_blocks); + free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks); + + __pblk_rl_update_rates(rl, free_blocks); +} + +void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, + bool used) +{ + int blk_in_line = atomic_read(&line->blk_in_line); + int free_blocks; + + atomic_sub(blk_in_line, &rl->free_blocks); + + if (used) + free_blocks = atomic_sub_return(blk_in_line, + &rl->free_user_blocks); + else + free_blocks = atomic_read(&rl->free_user_blocks); + + __pblk_rl_update_rates(rl, free_blocks); +} + +int pblk_rl_high_thrs(struct pblk_rl *rl) +{ + return rl->high; +} + +int pblk_rl_max_io(struct pblk_rl *rl) +{ + return rl->rb_max_io; +} + +static void pblk_rl_u_timer(struct timer_list *t) +{ + struct pblk_rl *rl = from_timer(rl, t, u_timer); + + /* Release user I/O state. Protect from GC */ + smp_store_release(&rl->rb_user_active, 0); +} + +void pblk_rl_free(struct pblk_rl *rl) +{ + del_timer(&rl->u_timer); +} + +void pblk_rl_init(struct pblk_rl *rl, int budget) +{ + struct pblk *pblk = container_of(rl, struct pblk, rl); + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE; + int sec_meta, blk_meta; + + unsigned int rb_windows; + + /* Consider sectors used for metadata */ + sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; + blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); + + rl->high = pblk->op_blks - blk_meta - lm->blk_per_line; + rl->high_pw = get_count_order(rl->high); + + rl->rsv_blocks = min_blocks; + + /* This will always be a power-of-2 */ + rb_windows = budget / PBLK_MAX_REQ_ADDRS; + rl->rb_windows_pw = get_count_order(rb_windows); + + /* To start with, all buffer is available to user I/O writers */ + rl->rb_budget = budget; + rl->rb_user_max = budget; + rl->rb_max_io = budget >> 1; + rl->rb_gc_max = 0; + rl->rb_state = PBLK_RL_HIGH; + + atomic_set(&rl->rb_user_cnt, 0); + atomic_set(&rl->rb_gc_cnt, 0); + atomic_set(&rl->rb_space, -1); + atomic_set(&rl->werr_lines, 0); + + timer_setup(&rl->u_timer, pblk_rl_u_timer, 0); + + rl->rb_user_active = 0; + rl->rb_gc_active = 0; +} diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c new file mode 100644 index 000000000..bdc86ee4c --- /dev/null +++ b/drivers/lightnvm/pblk-sysfs.c @@ -0,0 +1,720 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Implementation of a physical block-device target for Open-channel SSDs. + * + * pblk-sysfs.c - pblk's sysfs + * + */ + +#include "pblk.h" + +static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_lun *rlun; + ssize_t sz = 0; + int i; + + for (i = 0; i < geo->all_luns; i++) { + int active = 1; + + rlun = &pblk->luns[i]; + if (!down_trylock(&rlun->wr_sem)) { + active = 0; + up(&rlun->wr_sem); + } + sz += snprintf(page + sz, PAGE_SIZE - sz, + "pblk: pos:%d, ch:%d, lun:%d - %d\n", + i, + rlun->bppa.a.ch, + rlun->bppa.a.lun, + active); + } + + return sz; +} + +static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) +{ + int free_blocks, free_user_blocks, total_blocks; + int rb_user_max, rb_user_cnt; + int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; + + free_blocks = pblk_rl_nr_free_blks(&pblk->rl); + free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl); + rb_user_max = pblk->rl.rb_user_max; + rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); + rb_gc_max = pblk->rl.rb_gc_max; + rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt); + rb_budget = pblk->rl.rb_budget; + rb_state = pblk->rl.rb_state; + + total_blocks = pblk->rl.total_blocks; + + return snprintf(page, PAGE_SIZE, + "u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n", + rb_user_cnt, + rb_user_max, + rb_gc_cnt, + rb_gc_max, + rb_state, + rb_budget, + pblk->rl.high, + free_blocks, + free_user_blocks, + total_blocks, + READ_ONCE(pblk->rl.rb_user_active)); +} + +static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page) +{ + int gc_enabled, gc_active; + + pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active); + return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n", + gc_enabled, gc_active); +} + +static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page) +{ + ssize_t sz; + + sz = snprintf(page, PAGE_SIZE, + "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n", + atomic_long_read(&pblk->read_failed), + atomic_long_read(&pblk->read_high_ecc), + atomic_long_read(&pblk->read_empty), + atomic_long_read(&pblk->read_failed_gc), + atomic_long_read(&pblk->write_failed), + atomic_long_read(&pblk->erase_failed)); + + return sz; +} + +static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page) +{ + return pblk_rb_sysfs(&pblk->rwb, page); +} + +static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + ssize_t sz = 0; + + if (geo->version == NVM_OCSSD_SPEC_12) { + struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; + struct nvm_addrf_12 *gppaf = (struct nvm_addrf_12 *)&geo->addrf; + + sz = snprintf(page, PAGE_SIZE, + "g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", + pblk->addrf_len, + ppaf->blk_offset, ppaf->blk_len, + ppaf->pg_offset, ppaf->pg_len, + ppaf->lun_offset, ppaf->lun_len, + ppaf->ch_offset, ppaf->ch_len, + ppaf->pln_offset, ppaf->pln_len, + ppaf->sec_offset, ppaf->sec_len); + + sz += snprintf(page + sz, PAGE_SIZE - sz, + "d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", + gppaf->blk_offset, gppaf->blk_len, + gppaf->pg_offset, gppaf->pg_len, + gppaf->lun_offset, gppaf->lun_len, + gppaf->ch_offset, gppaf->ch_len, + gppaf->pln_offset, gppaf->pln_len, + gppaf->sec_offset, gppaf->sec_len); + } else { + struct nvm_addrf *ppaf = &pblk->addrf; + struct nvm_addrf *gppaf = &geo->addrf; + + sz = snprintf(page, PAGE_SIZE, + "pblk:(s:%d)ch:%d/%d,lun:%d/%d,chk:%d/%d/sec:%d/%d\n", + pblk->addrf_len, + ppaf->ch_offset, ppaf->ch_len, + ppaf->lun_offset, ppaf->lun_len, + ppaf->chk_offset, ppaf->chk_len, + ppaf->sec_offset, ppaf->sec_len); + + sz += snprintf(page + sz, PAGE_SIZE - sz, + "device:ch:%d/%d,lun:%d/%d,chk:%d/%d,sec:%d/%d\n", + gppaf->ch_offset, gppaf->ch_len, + gppaf->lun_offset, gppaf->lun_len, + gppaf->chk_offset, gppaf->chk_len, + gppaf->sec_offset, gppaf->sec_len); + } + + return sz; +} + +static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *line; + ssize_t sz = 0; + int nr_free_lines; + int cur_data, cur_log; + int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0; + int d_line_cnt = 0, l_line_cnt = 0; + int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0; + int gc_werr = 0; + + int bad = 0, cor = 0; + int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; + int map_weight = 0, meta_weight = 0; + + spin_lock(&l_mg->free_lock); + cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1; + cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1; + nr_free_lines = l_mg->nr_free_lines; + + list_for_each_entry(line, &l_mg->free_list, list) + free_line_cnt++; + spin_unlock(&l_mg->free_lock); + + spin_lock(&l_mg->close_lock); + list_for_each_entry(line, &l_mg->emeta_list, list) + emeta_line_cnt++; + spin_unlock(&l_mg->close_lock); + + spin_lock(&l_mg->gc_lock); + list_for_each_entry(line, &l_mg->gc_full_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_full++; + } + + list_for_each_entry(line, &l_mg->gc_high_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_high++; + } + + list_for_each_entry(line, &l_mg->gc_mid_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_mid++; + } + + list_for_each_entry(line, &l_mg->gc_low_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_low++; + } + + list_for_each_entry(line, &l_mg->gc_empty_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_empty++; + } + + list_for_each_entry(line, &l_mg->gc_werr_list, list) { + if (line->type == PBLK_LINETYPE_DATA) + d_line_cnt++; + else if (line->type == PBLK_LINETYPE_LOG) + l_line_cnt++; + closed_line_cnt++; + gc_werr++; + } + + list_for_each_entry(line, &l_mg->bad_list, list) + bad++; + list_for_each_entry(line, &l_mg->corrupt_list, list) + cor++; + spin_unlock(&l_mg->gc_lock); + + spin_lock(&l_mg->free_lock); + if (l_mg->data_line) { + cur_sec = l_mg->data_line->cur_sec; + msecs = l_mg->data_line->left_msecs; + vsc = le32_to_cpu(*l_mg->data_line->vsc); + sec_in_line = l_mg->data_line->sec_in_line; + meta_weight = bitmap_weight(&l_mg->meta_bitmap, + PBLK_DATA_LINES); + + spin_lock(&l_mg->data_line->lock); + if (l_mg->data_line->map_bitmap) + map_weight = bitmap_weight(l_mg->data_line->map_bitmap, + lm->sec_per_line); + else + map_weight = 0; + spin_unlock(&l_mg->data_line->lock); + } + spin_unlock(&l_mg->free_lock); + + if (nr_free_lines != free_line_cnt) + pblk_err(pblk, "corrupted free line list:%d/%d\n", + nr_free_lines, free_line_cnt); + + sz = snprintf(page, PAGE_SIZE - sz, + "line: nluns:%d, nblks:%d, nsecs:%d\n", + geo->all_luns, lm->blk_per_line, lm->sec_per_line); + + sz += snprintf(page + sz, PAGE_SIZE - sz, + "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", + cur_data, cur_log, + nr_free_lines, + emeta_line_cnt, meta_weight, + closed_line_cnt, + bad, cor, + d_line_cnt, l_line_cnt, + l_mg->nr_lines); + + sz += snprintf(page + sz, PAGE_SIZE - sz, + "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr: %d, queue:%d\n", + gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr, + atomic_read(&pblk->gc.read_inflight_gc)); + + sz += snprintf(page + sz, PAGE_SIZE - sz, + "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", + cur_data, cur_sec, msecs, vsc, sec_in_line, + map_weight, lm->sec_per_line, + atomic_read(&pblk->inflight_io)); + + return sz; +} + +static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + ssize_t sz = 0; + + sz = snprintf(page, PAGE_SIZE - sz, + "smeta - len:%d, secs:%d\n", + lm->smeta_len, lm->smeta_sec); + sz += snprintf(page + sz, PAGE_SIZE - sz, + "emeta - len:%d, sec:%d, bb_start:%d\n", + lm->emeta_len[0], lm->emeta_sec[0], + lm->emeta_bb); + sz += snprintf(page + sz, PAGE_SIZE - sz, + "bitmap lengths: sec:%d, blk:%d, lun:%d\n", + lm->sec_bitmap_len, + lm->blk_bitmap_len, + lm->lun_bitmap_len); + sz += snprintf(page + sz, PAGE_SIZE - sz, + "blk_line:%d, sec_line:%d, sec_blk:%d\n", + lm->blk_per_line, + lm->sec_per_line, + geo->clba); + + return sz; +} + +static ssize_t pblk_sysfs_get_sec_per_write(struct pblk *pblk, char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write); +} + +static ssize_t pblk_get_write_amp(u64 user, u64 gc, u64 pad, + char *page) +{ + int sz; + + sz = snprintf(page, PAGE_SIZE, + "user:%lld gc:%lld pad:%lld WA:", + user, gc, pad); + + if (!user) { + sz += snprintf(page + sz, PAGE_SIZE - sz, "NaN\n"); + } else { + u64 wa_int; + u32 wa_frac; + + wa_int = (user + gc + pad) * 100000; + wa_int = div64_u64(wa_int, user); + wa_int = div_u64_rem(wa_int, 100000, &wa_frac); + + sz += snprintf(page + sz, PAGE_SIZE - sz, "%llu.%05u\n", + wa_int, wa_frac); + } + + return sz; +} + +static ssize_t pblk_sysfs_get_write_amp_mileage(struct pblk *pblk, char *page) +{ + return pblk_get_write_amp(atomic64_read(&pblk->user_wa), + atomic64_read(&pblk->gc_wa), atomic64_read(&pblk->pad_wa), + page); +} + +static ssize_t pblk_sysfs_get_write_amp_trip(struct pblk *pblk, char *page) +{ + return pblk_get_write_amp( + atomic64_read(&pblk->user_wa) - pblk->user_rst_wa, + atomic64_read(&pblk->gc_wa) - pblk->gc_rst_wa, + atomic64_read(&pblk->pad_wa) - pblk->pad_rst_wa, page); +} + +static long long bucket_percentage(unsigned long long bucket, + unsigned long long total) +{ + int p = bucket * 100; + + p = div_u64(p, total); + + return p; +} + +static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page) +{ + int sz = 0; + unsigned long long total; + unsigned long long total_buckets = 0; + int buckets = pblk->min_write_pgs - 1; + int i; + + total = atomic64_read(&pblk->nr_flush) - pblk->nr_flush_rst; + if (!total) { + for (i = 0; i < (buckets + 1); i++) + sz += snprintf(page + sz, PAGE_SIZE - sz, + "%d:0 ", i); + sz += snprintf(page + sz, PAGE_SIZE - sz, "\n"); + + return sz; + } + + for (i = 0; i < buckets; i++) + total_buckets += atomic64_read(&pblk->pad_dist[i]); + + sz += snprintf(page + sz, PAGE_SIZE - sz, "0:%lld%% ", + bucket_percentage(total - total_buckets, total)); + + for (i = 0; i < buckets; i++) { + unsigned long long p; + + p = bucket_percentage(atomic64_read(&pblk->pad_dist[i]), + total); + sz += snprintf(page + sz, PAGE_SIZE - sz, "%d:%lld%% ", + i + 1, p); + } + sz += snprintf(page + sz, PAGE_SIZE - sz, "\n"); + + return sz; +} + +#ifdef CONFIG_NVM_PBLK_DEBUG +static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) +{ + return snprintf(page, PAGE_SIZE, + "%lu\t%lu\t%ld\t%llu\t%ld\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n", + atomic_long_read(&pblk->inflight_writes), + atomic_long_read(&pblk->inflight_reads), + atomic_long_read(&pblk->req_writes), + (u64)atomic64_read(&pblk->nr_flush), + atomic_long_read(&pblk->padded_writes), + atomic_long_read(&pblk->padded_wb), + atomic_long_read(&pblk->sub_writes), + atomic_long_read(&pblk->sync_writes), + atomic_long_read(&pblk->recov_writes), + atomic_long_read(&pblk->recov_gc_writes), + atomic_long_read(&pblk->recov_gc_reads), + atomic_long_read(&pblk->cache_reads), + atomic_long_read(&pblk->sync_reads)); +} +#endif + +static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page, + size_t len) +{ + size_t c_len; + int force; + + c_len = strcspn(page, "\n"); + if (c_len >= len) + return -EINVAL; + + if (kstrtouint(page, 0, &force)) + return -EINVAL; + + pblk_gc_sysfs_force(pblk, force); + + return len; +} + +static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk, + const char *page, size_t len) +{ + size_t c_len; + int sec_per_write; + + c_len = strcspn(page, "\n"); + if (c_len >= len) + return -EINVAL; + + if (kstrtouint(page, 0, &sec_per_write)) + return -EINVAL; + + if (sec_per_write < pblk->min_write_pgs + || sec_per_write > pblk->max_write_pgs + || sec_per_write % pblk->min_write_pgs != 0) + return -EINVAL; + + pblk_set_sec_per_write(pblk, sec_per_write); + + return len; +} + +static ssize_t pblk_sysfs_set_write_amp_trip(struct pblk *pblk, + const char *page, size_t len) +{ + size_t c_len; + int reset_value; + + c_len = strcspn(page, "\n"); + if (c_len >= len) + return -EINVAL; + + if (kstrtouint(page, 0, &reset_value)) + return -EINVAL; + + if (reset_value != 0) + return -EINVAL; + + pblk->user_rst_wa = atomic64_read(&pblk->user_wa); + pblk->pad_rst_wa = atomic64_read(&pblk->pad_wa); + pblk->gc_rst_wa = atomic64_read(&pblk->gc_wa); + + return len; +} + + +static ssize_t pblk_sysfs_set_padding_dist(struct pblk *pblk, + const char *page, size_t len) +{ + size_t c_len; + int reset_value; + int buckets = pblk->min_write_pgs - 1; + int i; + + c_len = strcspn(page, "\n"); + if (c_len >= len) + return -EINVAL; + + if (kstrtouint(page, 0, &reset_value)) + return -EINVAL; + + if (reset_value != 0) + return -EINVAL; + + for (i = 0; i < buckets; i++) + atomic64_set(&pblk->pad_dist[i], 0); + + pblk->nr_flush_rst = atomic64_read(&pblk->nr_flush); + + return len; +} + +static struct attribute sys_write_luns = { + .name = "write_luns", + .mode = 0444, +}; + +static struct attribute sys_rate_limiter_attr = { + .name = "rate_limiter", + .mode = 0444, +}; + +static struct attribute sys_gc_state = { + .name = "gc_state", + .mode = 0444, +}; + +static struct attribute sys_errors_attr = { + .name = "errors", + .mode = 0444, +}; + +static struct attribute sys_rb_attr = { + .name = "write_buffer", + .mode = 0444, +}; + +static struct attribute sys_stats_ppaf_attr = { + .name = "ppa_format", + .mode = 0444, +}; + +static struct attribute sys_lines_attr = { + .name = "lines", + .mode = 0444, +}; + +static struct attribute sys_lines_info_attr = { + .name = "lines_info", + .mode = 0444, +}; + +static struct attribute sys_gc_force = { + .name = "gc_force", + .mode = 0200, +}; + +static struct attribute sys_max_sec_per_write = { + .name = "max_sec_per_write", + .mode = 0644, +}; + +static struct attribute sys_write_amp_mileage = { + .name = "write_amp_mileage", + .mode = 0444, +}; + +static struct attribute sys_write_amp_trip = { + .name = "write_amp_trip", + .mode = 0644, +}; + +static struct attribute sys_padding_dist = { + .name = "padding_dist", + .mode = 0644, +}; + +#ifdef CONFIG_NVM_PBLK_DEBUG +static struct attribute sys_stats_debug_attr = { + .name = "stats", + .mode = 0444, +}; +#endif + +static struct attribute *pblk_attrs[] = { + &sys_write_luns, + &sys_rate_limiter_attr, + &sys_errors_attr, + &sys_gc_state, + &sys_gc_force, + &sys_max_sec_per_write, + &sys_rb_attr, + &sys_stats_ppaf_attr, + &sys_lines_attr, + &sys_lines_info_attr, + &sys_write_amp_mileage, + &sys_write_amp_trip, + &sys_padding_dist, +#ifdef CONFIG_NVM_PBLK_DEBUG + &sys_stats_debug_attr, +#endif + NULL, +}; + +static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct pblk *pblk = container_of(kobj, struct pblk, kobj); + + if (strcmp(attr->name, "rate_limiter") == 0) + return pblk_sysfs_rate_limiter(pblk, buf); + else if (strcmp(attr->name, "write_luns") == 0) + return pblk_sysfs_luns_show(pblk, buf); + else if (strcmp(attr->name, "gc_state") == 0) + return pblk_sysfs_gc_state_show(pblk, buf); + else if (strcmp(attr->name, "errors") == 0) + return pblk_sysfs_stats(pblk, buf); + else if (strcmp(attr->name, "write_buffer") == 0) + return pblk_sysfs_write_buffer(pblk, buf); + else if (strcmp(attr->name, "ppa_format") == 0) + return pblk_sysfs_ppaf(pblk, buf); + else if (strcmp(attr->name, "lines") == 0) + return pblk_sysfs_lines(pblk, buf); + else if (strcmp(attr->name, "lines_info") == 0) + return pblk_sysfs_lines_info(pblk, buf); + else if (strcmp(attr->name, "max_sec_per_write") == 0) + return pblk_sysfs_get_sec_per_write(pblk, buf); + else if (strcmp(attr->name, "write_amp_mileage") == 0) + return pblk_sysfs_get_write_amp_mileage(pblk, buf); + else if (strcmp(attr->name, "write_amp_trip") == 0) + return pblk_sysfs_get_write_amp_trip(pblk, buf); + else if (strcmp(attr->name, "padding_dist") == 0) + return pblk_sysfs_get_padding_dist(pblk, buf); +#ifdef CONFIG_NVM_PBLK_DEBUG + else if (strcmp(attr->name, "stats") == 0) + return pblk_sysfs_stats_debug(pblk, buf); +#endif + return 0; +} + +static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct pblk *pblk = container_of(kobj, struct pblk, kobj); + + if (strcmp(attr->name, "gc_force") == 0) + return pblk_sysfs_gc_force(pblk, buf, len); + else if (strcmp(attr->name, "max_sec_per_write") == 0) + return pblk_sysfs_set_sec_per_write(pblk, buf, len); + else if (strcmp(attr->name, "write_amp_trip") == 0) + return pblk_sysfs_set_write_amp_trip(pblk, buf, len); + else if (strcmp(attr->name, "padding_dist") == 0) + return pblk_sysfs_set_padding_dist(pblk, buf, len); + return 0; +} + +static const struct sysfs_ops pblk_sysfs_ops = { + .show = pblk_sysfs_show, + .store = pblk_sysfs_store, +}; + +static struct kobj_type pblk_ktype = { + .sysfs_ops = &pblk_sysfs_ops, + .default_attrs = pblk_attrs, +}; + +int pblk_sysfs_init(struct gendisk *tdisk) +{ + struct pblk *pblk = tdisk->private_data; + struct device *parent_dev = disk_to_dev(pblk->disk); + int ret; + + ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype, + kobject_get(&parent_dev->kobj), + "%s", "pblk"); + if (ret) { + pblk_err(pblk, "could not register\n"); + return ret; + } + + kobject_uevent(&pblk->kobj, KOBJ_ADD); + return 0; +} + +void pblk_sysfs_exit(struct gendisk *tdisk) +{ + struct pblk *pblk = tdisk->private_data; + + kobject_uevent(&pblk->kobj, KOBJ_REMOVE); + kobject_del(&pblk->kobj); + kobject_put(&pblk->kobj); +} diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c new file mode 100644 index 000000000..c3e038d4b --- /dev/null +++ b/drivers/lightnvm/pblk-write.c @@ -0,0 +1,673 @@ +/* + * Copyright (C) 2016 CNEX Labs + * Initial release: Javier Gonzalez <javier@cnexlabs.com> + * Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * pblk-write.c - pblk's write path from write buffer to media + */ + +#include "pblk.h" + +static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_c_ctx *c_ctx) +{ + struct bio *original_bio; + struct pblk_rb *rwb = &pblk->rwb; + unsigned long ret; + int i; + + for (i = 0; i < c_ctx->nr_valid; i++) { + struct pblk_w_ctx *w_ctx; + int pos = c_ctx->sentry + i; + int flags; + + w_ctx = pblk_rb_w_ctx(rwb, pos); + flags = READ_ONCE(w_ctx->flags); + + if (flags & PBLK_FLUSH_ENTRY) { + flags &= ~PBLK_FLUSH_ENTRY; + /* Release flags on context. Protect from writes */ + smp_store_release(&w_ctx->flags, flags); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_dec(&rwb->inflight_flush_point); +#endif + } + + while ((original_bio = bio_list_pop(&w_ctx->bios))) + bio_endio(original_bio); + } + + if (c_ctx->nr_padded) + pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, + c_ctx->nr_padded); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); +#endif + + ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); + + bio_put(rqd->bio); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); + + return ret; +} + +static unsigned long pblk_end_queued_w_bio(struct pblk *pblk, + struct nvm_rq *rqd, + struct pblk_c_ctx *c_ctx) +{ + list_del(&c_ctx->list); + return pblk_end_w_bio(pblk, rqd, c_ctx); +} + +static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_c_ctx *c_ctx) +{ + struct pblk_c_ctx *c, *r; + unsigned long flags; + unsigned long pos; + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes); +#endif + + pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap); + + pos = pblk_rb_sync_init(&pblk->rwb, &flags); + if (pos == c_ctx->sentry) { + pos = pblk_end_w_bio(pblk, rqd, c_ctx); + +retry: + list_for_each_entry_safe(c, r, &pblk->compl_list, list) { + rqd = nvm_rq_from_c_ctx(c); + if (c->sentry == pos) { + pos = pblk_end_queued_w_bio(pblk, rqd, c); + goto retry; + } + } + } else { + WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd); + list_add_tail(&c_ctx->list, &pblk->compl_list); + } + pblk_rb_sync_end(&pblk->rwb, &flags); +} + +/* Map remaining sectors in chunk, starting from ppa */ +static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line *line; + struct ppa_addr map_ppa = *ppa; + u64 paddr; + int done = 0; + + line = &pblk->lines[pblk_ppa_to_line(*ppa)]; + spin_lock(&line->lock); + + while (!done) { + paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa); + + if (!test_and_set_bit(paddr, line->map_bitmap)) + line->left_msecs--; + + if (!test_and_set_bit(paddr, line->invalid_bitmap)) + le32_add_cpu(line->vsc, -1); + + if (geo->version == NVM_OCSSD_SPEC_12) { + map_ppa.ppa++; + if (map_ppa.g.pg == geo->num_pg) + done = 1; + } else { + map_ppa.m.sec++; + if (map_ppa.m.sec == geo->clba) + done = 1; + } + } + + line->w_err_gc->has_write_err = 1; + spin_unlock(&line->lock); +} + +static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry, + unsigned int nr_entries) +{ + struct pblk_rb *rb = &pblk->rwb; + struct pblk_rb_entry *entry; + struct pblk_line *line; + struct pblk_w_ctx *w_ctx; + struct ppa_addr ppa_l2p; + int flags; + unsigned int pos, i; + + spin_lock(&pblk->trans_lock); + pos = sentry; + for (i = 0; i < nr_entries; i++) { + entry = &rb->entries[pos]; + w_ctx = &entry->w_ctx; + + /* Check if the lba has been overwritten */ + if (w_ctx->lba != ADDR_EMPTY) { + ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); + if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) + w_ctx->lba = ADDR_EMPTY; + } + + /* Mark up the entry as submittable again */ + flags = READ_ONCE(w_ctx->flags); + flags |= PBLK_WRITTEN_DATA; + /* Release flags on write context. Protect from writes */ + smp_store_release(&w_ctx->flags, flags); + + /* Decrese the reference count to the line as we will + * re-map these entries + */ + line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)]; + kref_put(&line->ref, pblk_line_put); + + pos = (pos + 1) & (rb->nr_entries - 1); + } + spin_unlock(&pblk->trans_lock); +} + +static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx) +{ + struct pblk_c_ctx *r_ctx; + + r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL); + if (!r_ctx) + return; + + r_ctx->lun_bitmap = NULL; + r_ctx->sentry = c_ctx->sentry; + r_ctx->nr_valid = c_ctx->nr_valid; + r_ctx->nr_padded = c_ctx->nr_padded; + + spin_lock(&pblk->resubmit_lock); + list_add_tail(&r_ctx->list, &pblk->resubmit_list); + spin_unlock(&pblk->resubmit_lock); + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes); +#endif +} + +static void pblk_submit_rec(struct work_struct *work) +{ + struct pblk_rec_ctx *recovery = + container_of(work, struct pblk_rec_ctx, ws_rec); + struct pblk *pblk = recovery->pblk; + struct nvm_rq *rqd = recovery->rqd; + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct ppa_addr *ppa_list; + + pblk_log_write_err(pblk, rqd); + + if (rqd->nr_ppas == 1) + ppa_list = &rqd->ppa_addr; + else + ppa_list = rqd->ppa_list; + + pblk_map_remaining(pblk, ppa_list); + pblk_queue_resubmit(pblk, c_ctx); + + pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap); + if (c_ctx->nr_padded) + pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, + c_ctx->nr_padded); + bio_put(rqd->bio); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); + mempool_free(recovery, &pblk->rec_pool); + + atomic_dec(&pblk->inflight_io); +} + + +static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct pblk_rec_ctx *recovery; + + recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC); + if (!recovery) { + pblk_err(pblk, "could not allocate recovery work\n"); + return; + } + + recovery->pblk = pblk; + recovery->rqd = rqd; + + INIT_WORK(&recovery->ws_rec, pblk_submit_rec); + queue_work(pblk->close_wq, &recovery->ws_rec); +} + +static void pblk_end_io_write(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + + if (rqd->error) { + pblk_end_w_fail(pblk, rqd); + return; + } +#ifdef CONFIG_NVM_PBLK_DEBUG + else + WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); +#endif + + pblk_complete_write(pblk, rqd, c_ctx); + atomic_dec(&pblk->inflight_io); +} + +static void pblk_end_io_write_meta(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); + struct pblk_line *line = m_ctx->private; + struct pblk_emeta *emeta = line->emeta; + int sync; + + pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); + + if (rqd->error) { + pblk_log_write_err(pblk, rqd); + pblk_err(pblk, "metadata I/O failed. Line %d\n", line->id); + line->w_err_gc->has_write_err = 1; + } + + sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); + if (sync == emeta->nr_entries) + pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, + GFP_ATOMIC, pblk->close_wq); + + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); + + atomic_dec(&pblk->inflight_io); +} + +static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int nr_secs, + nvm_end_io_fn(*end_io)) +{ + struct nvm_tgt_dev *dev = pblk->dev; + + /* Setup write request */ + rqd->opcode = NVM_OP_PWRITE; + rqd->nr_ppas = nr_secs; + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); + rqd->private = pblk; + rqd->end_io = end_io; + + rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, + &rqd->dma_meta_list); + if (!rqd->meta_list) + return -ENOMEM; + + rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; + rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; + + return 0; +} + +static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, + struct ppa_addr *erase_ppa) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *e_line = pblk_line_get_erase(pblk); + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + unsigned int valid = c_ctx->nr_valid; + unsigned int padded = c_ctx->nr_padded; + unsigned int nr_secs = valid + padded; + unsigned long *lun_bitmap; + int ret; + + lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); + if (!lun_bitmap) + return -ENOMEM; + c_ctx->lun_bitmap = lun_bitmap; + + ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write); + if (ret) { + kfree(lun_bitmap); + return ret; + } + + if (likely(!e_line || !atomic_read(&e_line->left_eblks))) + pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0); + else + pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, + valid, erase_ppa); + + return 0; +} + +static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, + unsigned int secs_to_flush) +{ + int secs_to_sync; + + secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush); + +#ifdef CONFIG_NVM_PBLK_DEBUG + if ((!secs_to_sync && secs_to_flush) + || (secs_to_sync < 0) + || (secs_to_sync > secs_avail && !secs_to_flush)) { + pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n", + secs_avail, secs_to_sync, secs_to_flush); + } +#endif + + return secs_to_sync; +} + +int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_emeta *emeta = meta_line->emeta; + struct pblk_g_ctx *m_ctx; + struct bio *bio; + struct nvm_rq *rqd; + void *data; + u64 paddr; + int rq_ppas = pblk->min_write_pgs; + int id = meta_line->id; + int rq_len; + int i, j; + int ret; + + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); + + m_ctx = nvm_rq_to_pdu(rqd); + m_ctx->private = meta_line; + + rq_len = rq_ppas * geo->csecs; + data = ((void *)emeta->buf) + emeta->mem; + + bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, + l_mg->emeta_alloc_type, GFP_KERNEL); + if (IS_ERR(bio)) { + pblk_err(pblk, "failed to map emeta io"); + ret = PTR_ERR(bio); + goto fail_free_rqd; + } + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + rqd->bio = bio; + + ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta); + if (ret) + goto fail_free_bio; + + for (i = 0; i < rqd->nr_ppas; ) { + spin_lock(&meta_line->lock); + paddr = __pblk_alloc_page(pblk, meta_line, rq_ppas); + spin_unlock(&meta_line->lock); + for (j = 0; j < rq_ppas; j++, i++, paddr++) + rqd->ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id); + } + + spin_lock(&l_mg->close_lock); + emeta->mem += rq_len; + if (emeta->mem >= lm->emeta_len[0]) + list_del(&meta_line->list); + spin_unlock(&l_mg->close_lock); + + pblk_down_page(pblk, rqd->ppa_list, rqd->nr_ppas); + + ret = pblk_submit_io(pblk, rqd); + if (ret) { + pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); + goto fail_rollback; + } + + return NVM_IO_OK; + +fail_rollback: + pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); + spin_lock(&l_mg->close_lock); + pblk_dealloc_page(pblk, meta_line, rq_ppas); + list_add(&meta_line->list, &meta_line->list); + spin_unlock(&l_mg->close_lock); +fail_free_bio: + bio_put(bio); +fail_free_rqd: + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); + return ret; +} + +static inline bool pblk_valid_meta_ppa(struct pblk *pblk, + struct pblk_line *meta_line, + struct nvm_rq *data_rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd); + struct pblk_line *data_line = pblk_line_get_data(pblk); + struct ppa_addr ppa, ppa_opt; + u64 paddr; + int pos_opt; + + /* Schedule a metadata I/O that is half the distance from the data I/O + * with regards to the number of LUNs forming the pblk instance. This + * balances LUN conflicts across every I/O. + * + * When the LUN configuration changes (e.g., due to GC), this distance + * can align, which would result on metadata and data I/Os colliding. In + * this case, modify the distance to not be optimal, but move the + * optimal in the right direction. + */ + paddr = pblk_lookup_page(pblk, meta_line); + ppa = addr_to_gen_ppa(pblk, paddr, 0); + ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); + pos_opt = pblk_ppa_to_pos(geo, ppa_opt); + + if (test_bit(pos_opt, data_c_ctx->lun_bitmap) || + test_bit(pos_opt, data_line->blk_bitmap)) + return true; + + if (unlikely(pblk_ppa_comp(ppa_opt, ppa))) + data_line->meta_distance--; + + return false; +} + +static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk, + struct nvm_rq *data_rqd) +{ + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line *meta_line; + + spin_lock(&l_mg->close_lock); + if (list_empty(&l_mg->emeta_list)) { + spin_unlock(&l_mg->close_lock); + return NULL; + } + meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); + if (meta_line->emeta->mem >= lm->emeta_len[0]) { + spin_unlock(&l_mg->close_lock); + return NULL; + } + spin_unlock(&l_mg->close_lock); + + if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd)) + return NULL; + + return meta_line; +} + +static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct ppa_addr erase_ppa; + struct pblk_line *meta_line; + int err; + + pblk_ppa_set_empty(&erase_ppa); + + /* Assign lbas to ppas and populate request structure */ + err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); + if (err) { + pblk_err(pblk, "could not setup write request: %d\n", err); + return NVM_IO_ERR; + } + + meta_line = pblk_should_submit_meta_io(pblk, rqd); + + /* Submit data write for current data line */ + err = pblk_submit_io(pblk, rqd); + if (err) { + pblk_err(pblk, "data I/O submission failed: %d\n", err); + return NVM_IO_ERR; + } + + if (!pblk_ppa_empty(erase_ppa)) { + /* Submit erase for next data line */ + if (pblk_blk_erase_async(pblk, erase_ppa)) { + struct pblk_line *e_line = pblk_line_get_erase(pblk); + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int bit; + + atomic_inc(&e_line->left_eblks); + bit = pblk_ppa_to_pos(geo, erase_ppa); + WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap)); + } + } + + if (meta_line) { + /* Submit metadata write for previous data line */ + err = pblk_submit_meta_io(pblk, meta_line); + if (err) { + pblk_err(pblk, "metadata I/O submission failed: %d", + err); + return NVM_IO_ERR; + } + } + + return NVM_IO_OK; +} + +static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct bio *bio = rqd->bio; + + if (c_ctx->nr_padded) + pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid, + c_ctx->nr_padded); +} + +static int pblk_submit_write(struct pblk *pblk) +{ + struct bio *bio; + struct nvm_rq *rqd; + unsigned int secs_avail, secs_to_sync, secs_to_com; + unsigned int secs_to_flush; + unsigned long pos; + unsigned int resubmit; + + spin_lock(&pblk->resubmit_lock); + resubmit = !list_empty(&pblk->resubmit_list); + spin_unlock(&pblk->resubmit_lock); + + /* Resubmit failed writes first */ + if (resubmit) { + struct pblk_c_ctx *r_ctx; + + spin_lock(&pblk->resubmit_lock); + r_ctx = list_first_entry(&pblk->resubmit_list, + struct pblk_c_ctx, list); + list_del(&r_ctx->list); + spin_unlock(&pblk->resubmit_lock); + + secs_avail = r_ctx->nr_valid; + pos = r_ctx->sentry; + + pblk_prepare_resubmit(pblk, pos, secs_avail); + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, + secs_avail); + + kfree(r_ctx); + } else { + /* If there are no sectors in the cache, + * flushes (bios without data) will be cleared on + * the cache threads + */ + secs_avail = pblk_rb_read_count(&pblk->rwb); + if (!secs_avail) + return 1; + + secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); + if (!secs_to_flush && secs_avail < pblk->min_write_pgs) + return 1; + + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, + secs_to_flush); + if (secs_to_sync > pblk->max_write_pgs) { + pblk_err(pblk, "bad buffer sync calculation\n"); + return 1; + } + + secs_to_com = (secs_to_sync > secs_avail) ? + secs_avail : secs_to_sync; + pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); + } + + bio = bio_alloc(GFP_KERNEL, secs_to_sync); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); + rqd->bio = bio; + + if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, + secs_avail)) { + pblk_err(pblk, "corrupted write bio\n"); + goto fail_put_bio; + } + + if (pblk_submit_io_set(pblk, rqd)) + goto fail_free_bio; + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_long_add(secs_to_sync, &pblk->sub_writes); +#endif + + return 0; + +fail_free_bio: + pblk_free_write_rqd(pblk, rqd); +fail_put_bio: + bio_put(bio); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); + + return 1; +} + +int pblk_write_ts(void *data) +{ + struct pblk *pblk = data; + + while (!kthread_should_stop()) { + if (!pblk_submit_write(pblk)) + continue; + set_current_state(TASK_INTERRUPTIBLE); + io_schedule(); + } + + return 0; +} diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h new file mode 100644 index 000000000..4760af7b6 --- /dev/null +++ b/drivers/lightnvm/pblk.h @@ -0,0 +1,1444 @@ +/* + * Copyright (C) 2015 IT University of Copenhagen (rrpc.h) + * Copyright (C) 2016 CNEX Labs + * Initial release: Matias Bjorling <matias@cnexlabs.com> + * Write buffering: Javier Gonzalez <javier@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Implementation of a Physical Block-device target for Open-channel SSDs. + * + */ + +#ifndef PBLK_H_ +#define PBLK_H_ + +#include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/bio.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/vmalloc.h> +#include <linux/crc32.h> +#include <linux/uuid.h> + +#include <linux/lightnvm.h> + +/* Run only GC if less than 1/X blocks are free */ +#define GC_LIMIT_INVERSE 5 +#define GC_TIME_MSECS 1000 + +#define PBLK_SECTOR (512) +#define PBLK_EXPOSED_PAGE_SIZE (4096) +#define PBLK_MAX_REQ_ADDRS (64) +#define PBLK_MAX_REQ_ADDRS_PW (6) + +#define PBLK_NR_CLOSE_JOBS (4) + +#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) + +#define PBLK_COMMAND_TIMEOUT_MS 30000 + +/* Max 512 LUNs per device */ +#define PBLK_MAX_LUNS_BITMAP (4) + +#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) + +/* Static pool sizes */ +#define PBLK_GEN_WS_POOL_SIZE (2) + +#define PBLK_DEFAULT_OP (11) + +enum { + PBLK_READ = READ, + PBLK_WRITE = WRITE,/* Write from write buffer */ + PBLK_WRITE_INT, /* Internal write - no write buffer */ + PBLK_READ_RECOV, /* Recovery read - errors allowed */ + PBLK_ERASE, +}; + +enum { + /* IO Types */ + PBLK_IOTYPE_USER = 1 << 0, + PBLK_IOTYPE_GC = 1 << 1, + + /* Write buffer flags */ + PBLK_FLUSH_ENTRY = 1 << 2, + PBLK_WRITTEN_DATA = 1 << 3, + PBLK_SUBMITTED_ENTRY = 1 << 4, + PBLK_WRITABLE_ENTRY = 1 << 5, +}; + +enum { + PBLK_BLK_ST_OPEN = 0x1, + PBLK_BLK_ST_CLOSED = 0x2, +}; + +struct pblk_sec_meta { + u64 reserved; + __le64 lba; +}; + +/* The number of GC lists and the rate-limiter states go together. This way the + * rate-limiter can dictate how much GC is needed based on resource utilization. + */ +#define PBLK_GC_NR_LISTS 4 + +enum { + PBLK_RL_OFF = 0, + PBLK_RL_WERR = 1, + PBLK_RL_HIGH = 2, + PBLK_RL_MID = 3, + PBLK_RL_LOW = 4 +}; + +#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) +#define pblk_dma_ppa_size (sizeof(u64) * PBLK_MAX_REQ_ADDRS) + +/* write buffer completion context */ +struct pblk_c_ctx { + struct list_head list; /* Head for out-of-order completion */ + + unsigned long *lun_bitmap; /* Luns used on current request */ + unsigned int sentry; + unsigned int nr_valid; + unsigned int nr_padded; +}; + +/* read context */ +struct pblk_g_ctx { + void *private; + unsigned long start_time; + u64 lba; +}; + +/* partial read context */ +struct pblk_pr_ctx { + struct bio *orig_bio; + DECLARE_BITMAP(bitmap, NVM_MAX_VLBA); + unsigned int orig_nr_secs; + unsigned int bio_init_idx; + void *ppa_ptr; + dma_addr_t dma_ppa_list; +}; + +/* Pad context */ +struct pblk_pad_rq { + struct pblk *pblk; + struct completion wait; + struct kref ref; +}; + +/* Recovery context */ +struct pblk_rec_ctx { + struct pblk *pblk; + struct nvm_rq *rqd; + struct work_struct ws_rec; +}; + +/* Write context */ +struct pblk_w_ctx { + struct bio_list bios; /* Original bios - used for completion + * in REQ_FUA, REQ_FLUSH case + */ + u64 lba; /* Logic addr. associated with entry */ + struct ppa_addr ppa; /* Physic addr. associated with entry */ + int flags; /* Write context flags */ +}; + +struct pblk_rb_entry { + struct ppa_addr cacheline; /* Cacheline for this entry */ + void *data; /* Pointer to data on this entry */ + struct pblk_w_ctx w_ctx; /* Context for this entry */ + struct list_head index; /* List head to enable indexes */ +}; + +#define EMPTY_ENTRY (~0U) + +struct pblk_rb_pages { + struct page *pages; + int order; + struct list_head list; +}; + +struct pblk_rb { + struct pblk_rb_entry *entries; /* Ring buffer entries */ + unsigned int mem; /* Write offset - points to next + * writable entry in memory + */ + unsigned int subm; /* Read offset - points to last entry + * that has been submitted to the media + * to be persisted + */ + unsigned int sync; /* Synced - backpointer that signals + * the last submitted entry that has + * been successfully persisted to media + */ + unsigned int flush_point; /* Sync point - last entry that must be + * flushed to the media. Used with + * REQ_FLUSH and REQ_FUA + */ + unsigned int l2p_update; /* l2p update point - next entry for + * which l2p mapping will be updated to + * contain a device ppa address (instead + * of a cacheline + */ + unsigned int nr_entries; /* Number of entries in write buffer - + * must be a power of two + */ + unsigned int seg_size; /* Size of the data segments being + * stored on each entry. Typically this + * will be 4KB + */ + + struct list_head pages; /* List of data pages */ + + spinlock_t w_lock; /* Write lock */ + spinlock_t s_lock; /* Sync lock */ + +#ifdef CONFIG_NVM_PBLK_DEBUG + atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ +#endif +}; + +#define PBLK_RECOVERY_SECTORS 16 + +struct pblk_lun { + struct ppa_addr bppa; + struct semaphore wr_sem; +}; + +struct pblk_gc_rq { + struct pblk_line *line; + void *data; + u64 paddr_list[PBLK_MAX_REQ_ADDRS]; + u64 lba_list[PBLK_MAX_REQ_ADDRS]; + int nr_secs; + int secs_to_gc; + struct list_head list; +}; + +struct pblk_gc { + /* These states are not protected by a lock since (i) they are in the + * fast path, and (ii) they are not critical. + */ + int gc_active; + int gc_enabled; + int gc_forced; + + struct task_struct *gc_ts; + struct task_struct *gc_writer_ts; + struct task_struct *gc_reader_ts; + + struct workqueue_struct *gc_line_reader_wq; + struct workqueue_struct *gc_reader_wq; + + struct timer_list gc_timer; + + struct semaphore gc_sem; + atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */ + atomic_t pipeline_gc; /* Number of lines in the GC pipeline - + * started reads to finished writes + */ + int w_entries; + + struct list_head w_list; + struct list_head r_list; + + spinlock_t lock; + spinlock_t w_lock; + spinlock_t r_lock; +}; + +struct pblk_rl { + unsigned int high; /* Upper threshold for rate limiter (free run - + * user I/O rate limiter + */ + unsigned int high_pw; /* High rounded up as a power of 2 */ + +#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ +#define PBLK_USER_LOW_THRS 10 /* Aggressive GC at 10% available blocks */ + + int rb_windows_pw; /* Number of rate windows in the write buffer + * given as a power-of-2. This guarantees that + * when user I/O is being rate limited, there + * will be reserved enough space for the GC to + * place its payload. A window is of + * pblk->max_write_pgs size, which in NVMe is + * 64, i.e., 256kb. + */ + int rb_budget; /* Total number of entries available for I/O */ + int rb_user_max; /* Max buffer entries available for user I/O */ + int rb_gc_max; /* Max buffer entries available for GC I/O */ + int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ + int rb_state; /* Rate-limiter current state */ + int rb_max_io; /* Maximum size for an I/O giving the config */ + + atomic_t rb_user_cnt; /* User I/O buffer counter */ + atomic_t rb_gc_cnt; /* GC I/O buffer counter */ + atomic_t rb_space; /* Space limit in case of reaching capacity */ + + int rsv_blocks; /* Reserved blocks for GC */ + + int rb_user_active; + int rb_gc_active; + + atomic_t werr_lines; /* Number of write error lines that needs gc */ + + struct timer_list u_timer; + + unsigned long long nr_secs; + unsigned long total_blocks; + + atomic_t free_blocks; /* Total number of free blocks (+ OP) */ + atomic_t free_user_blocks; /* Number of user free blocks (no OP) */ +}; + +#define PBLK_LINE_EMPTY (~0U) + +enum { + /* Line Types */ + PBLK_LINETYPE_FREE = 0, + PBLK_LINETYPE_LOG = 1, + PBLK_LINETYPE_DATA = 2, + + /* Line state */ + PBLK_LINESTATE_NEW = 9, + PBLK_LINESTATE_FREE = 10, + PBLK_LINESTATE_OPEN = 11, + PBLK_LINESTATE_CLOSED = 12, + PBLK_LINESTATE_GC = 13, + PBLK_LINESTATE_BAD = 14, + PBLK_LINESTATE_CORRUPT = 15, + + /* GC group */ + PBLK_LINEGC_NONE = 20, + PBLK_LINEGC_EMPTY = 21, + PBLK_LINEGC_LOW = 22, + PBLK_LINEGC_MID = 23, + PBLK_LINEGC_HIGH = 24, + PBLK_LINEGC_FULL = 25, + PBLK_LINEGC_WERR = 26 +}; + +#define PBLK_MAGIC 0x70626c6b /*pblk*/ + +/* emeta/smeta persistent storage format versions: + * Changes in major version requires offline migration. + * Changes in minor version are handled automatically during + * recovery. + */ + +#define SMETA_VERSION_MAJOR (0) +#define SMETA_VERSION_MINOR (1) + +#define EMETA_VERSION_MAJOR (0) +#define EMETA_VERSION_MINOR (2) + +struct line_header { + __le32 crc; + __le32 identifier; /* pblk identifier */ + __u8 uuid[16]; /* instance uuid */ + __le16 type; /* line type */ + __u8 version_major; /* version major */ + __u8 version_minor; /* version minor */ + __le32 id; /* line id for current line */ +}; + +struct line_smeta { + struct line_header header; + + __le32 crc; /* Full structure including struct crc */ + /* Previous line metadata */ + __le32 prev_id; /* Line id for previous line */ + + /* Current line metadata */ + __le64 seq_nr; /* Sequence number for current line */ + + /* Active writers */ + __le32 window_wr_lun; /* Number of parallel LUNs to write */ + + __le32 rsvd[2]; + + __le64 lun_bitmap[]; +}; + + +/* + * Metadata layout in media: + * First sector: + * 1. struct line_emeta + * 2. bad block bitmap (u64 * window_wr_lun) + * 3. write amplification counters + * Mid sectors (start at lbas_sector): + * 3. nr_lbas (u64) forming lba list + * Last sectors (start at vsc_sector): + * 4. u32 valid sector count (vsc) for all lines (~0U: free line) + */ +struct line_emeta { + struct line_header header; + + __le32 crc; /* Full structure including struct crc */ + + /* Previous line metadata */ + __le32 prev_id; /* Line id for prev line */ + + /* Current line metadata */ + __le64 seq_nr; /* Sequence number for current line */ + + /* Active writers */ + __le32 window_wr_lun; /* Number of parallel LUNs to write */ + + /* Bookkeeping for recovery */ + __le32 next_id; /* Line id for next line */ + __le64 nr_lbas; /* Number of lbas mapped in line */ + __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */ + __le64 bb_bitmap[]; /* Updated bad block bitmap for line */ +}; + + +/* Write amplification counters stored on media */ +struct wa_counters { + __le64 user; /* Number of user written sectors */ + __le64 gc; /* Number of sectors written by GC*/ + __le64 pad; /* Number of padded sectors */ +}; + +struct pblk_emeta { + struct line_emeta *buf; /* emeta buffer in media format */ + int mem; /* Write offset - points to next + * writable entry in memory + */ + atomic_t sync; /* Synced - backpointer that signals the + * last entry that has been successfully + * persisted to media + */ + unsigned int nr_entries; /* Number of emeta entries */ +}; + +struct pblk_smeta { + struct line_smeta *buf; /* smeta buffer in persistent format */ +}; + +struct pblk_w_err_gc { + int has_write_err; + __le64 *lba_list; +}; + +struct pblk_line { + struct pblk *pblk; + unsigned int id; /* Line number corresponds to the + * block line + */ + unsigned int seq_nr; /* Unique line sequence number */ + + int state; /* PBLK_LINESTATE_X */ + int type; /* PBLK_LINETYPE_X */ + int gc_group; /* PBLK_LINEGC_X */ + struct list_head list; /* Free, GC lists */ + + unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */ + + struct nvm_chk_meta *chks; /* Chunks forming line */ + + struct pblk_smeta *smeta; /* Start metadata */ + struct pblk_emeta *emeta; /* End medatada */ + + int meta_line; /* Metadata line id */ + int meta_distance; /* Distance between data and metadata */ + + u64 smeta_ssec; /* Sector where smeta starts */ + u64 emeta_ssec; /* Sector where emeta starts */ + + unsigned int sec_in_line; /* Number of usable secs in line */ + + atomic_t blk_in_line; /* Number of good blocks in line */ + unsigned long *blk_bitmap; /* Bitmap for valid/invalid blocks */ + unsigned long *erase_bitmap; /* Bitmap for erased blocks */ + + unsigned long *map_bitmap; /* Bitmap for mapped sectors in line */ + unsigned long *invalid_bitmap; /* Bitmap for invalid sectors in line */ + + atomic_t left_eblks; /* Blocks left for erasing */ + atomic_t left_seblks; /* Blocks left for sync erasing */ + + int left_msecs; /* Sectors left for mapping */ + unsigned int cur_sec; /* Sector map pointer */ + unsigned int nr_valid_lbas; /* Number of valid lbas in line */ + + __le32 *vsc; /* Valid sector count in line */ + + struct kref ref; /* Write buffer L2P references */ + + struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */ + + spinlock_t lock; /* Necessary for invalid_bitmap only */ +}; + +#define PBLK_DATA_LINES 4 + +enum { + PBLK_KMALLOC_META = 1, + PBLK_VMALLOC_META = 2, +}; + +enum { + PBLK_EMETA_TYPE_HEADER = 1, /* struct line_emeta first sector */ + PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */ + PBLK_EMETA_TYPE_VSC = 3, /* vsc list - type: __le32 */ +}; + +struct pblk_line_mgmt { + int nr_lines; /* Total number of full lines */ + int nr_free_lines; /* Number of full lines in free list */ + + /* Free lists - use free_lock */ + struct list_head free_list; /* Full lines ready to use */ + struct list_head corrupt_list; /* Full lines corrupted */ + struct list_head bad_list; /* Full lines bad */ + + /* GC lists - use gc_lock */ + struct list_head *gc_lists[PBLK_GC_NR_LISTS]; + struct list_head gc_high_list; /* Full lines ready to GC, high isc */ + struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */ + struct list_head gc_low_list; /* Full lines ready to GC, low isc */ + + struct list_head gc_werr_list; /* Write err recovery list */ + + struct list_head gc_full_list; /* Full lines ready to GC, no valid */ + struct list_head gc_empty_list; /* Full lines close, all valid */ + + struct pblk_line *log_line; /* Current FTL log line */ + struct pblk_line *data_line; /* Current data line */ + struct pblk_line *log_next; /* Next FTL log line */ + struct pblk_line *data_next; /* Next data line */ + + struct list_head emeta_list; /* Lines queued to schedule emeta */ + + __le32 *vsc_list; /* Valid sector counts for all lines */ + + /* Metadata allocation type: VMALLOC | KMALLOC */ + int emeta_alloc_type; + + /* Pre-allocated metadata for data lines */ + struct pblk_smeta *sline_meta[PBLK_DATA_LINES]; + struct pblk_emeta *eline_meta[PBLK_DATA_LINES]; + unsigned long meta_bitmap; + + /* Helpers for fast bitmap calculations */ + unsigned long *bb_template; + unsigned long *bb_aux; + + unsigned long d_seq_nr; /* Data line unique sequence number */ + unsigned long l_seq_nr; /* Log line unique sequence number */ + + spinlock_t free_lock; + spinlock_t close_lock; + spinlock_t gc_lock; +}; + +struct pblk_line_meta { + unsigned int smeta_len; /* Total length for smeta */ + unsigned int smeta_sec; /* Sectors needed for smeta */ + + unsigned int emeta_len[4]; /* Lengths for emeta: + * [0]: Total + * [1]: struct line_emeta + + * bb_bitmap + struct wa_counters + * [2]: L2P portion + * [3]: vsc + */ + unsigned int emeta_sec[4]; /* Sectors needed for emeta. Same layout + * as emeta_len + */ + + unsigned int emeta_bb; /* Boundary for bb that affects emeta */ + + unsigned int vsc_list_len; /* Length for vsc list */ + unsigned int sec_bitmap_len; /* Length for sector bitmap in line */ + unsigned int blk_bitmap_len; /* Length for block bitmap in line */ + unsigned int lun_bitmap_len; /* Length for lun bitmap in line */ + + unsigned int blk_per_line; /* Number of blocks in a full line */ + unsigned int sec_per_line; /* Number of sectors in a line */ + unsigned int dsec_per_line; /* Number of data sectors in a line */ + unsigned int min_blk_line; /* Min. number of good blocks in line */ + + unsigned int mid_thrs; /* Threshold for GC mid list */ + unsigned int high_thrs; /* Threshold for GC high list */ + + unsigned int meta_distance; /* Distance between data and metadata */ +}; + +enum { + PBLK_STATE_RUNNING = 0, + PBLK_STATE_STOPPING = 1, + PBLK_STATE_RECOVERING = 2, + PBLK_STATE_STOPPED = 3, +}; + +/* Internal format to support not power-of-2 device formats */ +struct pblk_addrf { + /* gen to dev */ + int sec_stripe; + int ch_stripe; + int lun_stripe; + + /* dev to gen */ + int sec_lun_stripe; + int sec_ws_stripe; +}; + +struct pblk { + struct nvm_tgt_dev *dev; + struct gendisk *disk; + + struct kobject kobj; + + struct pblk_lun *luns; + + struct pblk_line *lines; /* Line array */ + struct pblk_line_mgmt l_mg; /* Line management */ + struct pblk_line_meta lm; /* Line metadata */ + + struct nvm_addrf addrf; /* Aligned address format */ + struct pblk_addrf uaddrf; /* Unaligned address format */ + int addrf_len; + + struct pblk_rb rwb; + + int state; /* pblk line state */ + + int min_write_pgs; /* Minimum amount of pages required by controller */ + int max_write_pgs; /* Maximum amount of pages supported by controller */ + + sector_t capacity; /* Device capacity when bad blocks are subtracted */ + + int op; /* Percentage of device used for over-provisioning */ + int op_blks; /* Number of blocks used for over-provisioning */ + + /* pblk provisioning values. Used by rate limiter */ + struct pblk_rl rl; + + int sec_per_write; + + unsigned char instance_uuid[16]; + + /* Persistent write amplification counters, 4kb sector I/Os */ + atomic64_t user_wa; /* Sectors written by user */ + atomic64_t gc_wa; /* Sectors written by GC */ + atomic64_t pad_wa; /* Padded sectors written */ + + /* Reset values for delta write amplification measurements */ + u64 user_rst_wa; + u64 gc_rst_wa; + u64 pad_rst_wa; + + /* Counters used for calculating padding distribution */ + atomic64_t *pad_dist; /* Padding distribution buckets */ + u64 nr_flush_rst; /* Flushes reset value for pad dist.*/ + atomic64_t nr_flush; /* Number of flush/fua I/O */ + +#ifdef CONFIG_NVM_PBLK_DEBUG + /* Non-persistent debug counters, 4kb sector I/Os */ + atomic_long_t inflight_writes; /* Inflight writes (user and gc) */ + atomic_long_t padded_writes; /* Sectors padded due to flush/fua */ + atomic_long_t padded_wb; /* Sectors padded in write buffer */ + atomic_long_t req_writes; /* Sectors stored on write buffer */ + atomic_long_t sub_writes; /* Sectors submitted from buffer */ + atomic_long_t sync_writes; /* Sectors synced to media */ + atomic_long_t inflight_reads; /* Inflight sector read requests */ + atomic_long_t cache_reads; /* Read requests that hit the cache */ + atomic_long_t sync_reads; /* Completed sector read requests */ + atomic_long_t recov_writes; /* Sectors submitted from recovery */ + atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */ + atomic_long_t recov_gc_reads; /* Sectors submitted from read GC */ +#endif + + spinlock_t lock; + + atomic_long_t read_failed; + atomic_long_t read_empty; + atomic_long_t read_high_ecc; + atomic_long_t read_failed_gc; + atomic_long_t write_failed; + atomic_long_t erase_failed; + + atomic_t inflight_io; /* General inflight I/O counter */ + + struct task_struct *writer_ts; + + /* Simple translation map of logical addresses to physical addresses. + * The logical addresses is known by the host system, while the physical + * addresses are used when writing to the disk block device. + */ + unsigned char *trans_map; + spinlock_t trans_lock; + + struct list_head compl_list; + + spinlock_t resubmit_lock; /* Resubmit list lock */ + struct list_head resubmit_list; /* Resubmit list for failed writes*/ + + mempool_t page_bio_pool; + mempool_t gen_ws_pool; + mempool_t rec_pool; + mempool_t r_rq_pool; + mempool_t w_rq_pool; + mempool_t e_rq_pool; + + struct workqueue_struct *close_wq; + struct workqueue_struct *bb_wq; + struct workqueue_struct *r_end_wq; + + struct timer_list wtimer; + + struct pblk_gc gc; +}; + +struct pblk_line_ws { + struct pblk *pblk; + struct pblk_line *line; + void *priv; + struct work_struct ws; +}; + +#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) +#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) + +#define pblk_err(pblk, fmt, ...) \ + pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) +#define pblk_info(pblk, fmt, ...) \ + pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) +#define pblk_warn(pblk, fmt, ...) \ + pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) +#define pblk_debug(pblk, fmt, ...) \ + pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) + +/* + * pblk ring buffer operations + */ +int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base, + unsigned int power_size, unsigned int power_seg_sz); +unsigned int pblk_rb_calculate_size(unsigned int nr_entries); +void *pblk_rb_entries_ref(struct pblk_rb *rb); +int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, + unsigned int nr_entries, unsigned int *pos); +int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, + unsigned int *pos); +void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, + struct pblk_w_ctx w_ctx, unsigned int pos); +void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, + struct pblk_w_ctx w_ctx, struct pblk_line *line, + u64 paddr, unsigned int pos); +struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); +void pblk_rb_flush(struct pblk_rb *rb); + +void pblk_rb_sync_l2p(struct pblk_rb *rb); +unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, + unsigned int pos, unsigned int nr_entries, + unsigned int count); +int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, + struct ppa_addr ppa, int bio_iter, bool advanced_bio); +unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries); + +unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags); +unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries); +struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb, + struct ppa_addr *ppa); +void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); +unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb); + +unsigned int pblk_rb_read_count(struct pblk_rb *rb); +unsigned int pblk_rb_sync_count(struct pblk_rb *rb); +unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos); + +int pblk_rb_tear_down_check(struct pblk_rb *rb); +int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos); +void pblk_rb_data_free(struct pblk_rb *rb); +ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); + +/* + * pblk core + */ +struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type); +void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type); +void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); +int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, + struct pblk_c_ctx *c_ctx); +void pblk_discard(struct pblk *pblk, struct bio *bio); +struct nvm_chk_meta *pblk_chunk_get_info(struct pblk *pblk); +struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, + struct nvm_chk_meta *lp, + struct ppa_addr ppa); +void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); +void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); +struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, + unsigned int nr_secs, unsigned int len, + int alloc_type, gfp_t gfp_mask); +struct pblk_line *pblk_line_get(struct pblk *pblk); +struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); +struct pblk_line *pblk_line_replace_data(struct pblk *pblk); +int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); +void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); +struct pblk_line *pblk_line_get_data(struct pblk *pblk); +struct pblk_line *pblk_line_get_erase(struct pblk *pblk); +int pblk_line_erase(struct pblk *pblk, struct pblk_line *line); +int pblk_line_is_full(struct pblk_line *line); +void pblk_line_free(struct pblk_line *line); +void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); +void pblk_line_close(struct pblk *pblk, struct pblk_line *line); +void pblk_line_close_ws(struct work_struct *work); +void pblk_pipeline_stop(struct pblk *pblk); +void __pblk_pipeline_stop(struct pblk *pblk); +void __pblk_pipeline_flush(struct pblk *pblk); +void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, + void (*work)(struct work_struct *), gfp_t gfp_mask, + struct workqueue_struct *wq); +u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); +int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line); +int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, + void *emeta_buf); +int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); +void pblk_line_put(struct kref *ref); +void pblk_line_put_wq(struct kref *ref); +struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); +u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); +void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); +u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); +u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); +int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, + unsigned long secs_to_flush); +void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); +void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, + unsigned long *lun_bitmap); +void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); +void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, + unsigned long *lun_bitmap); +int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, + int nr_pages); +void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, + int nr_pages); +void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa); +void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, + u64 paddr); +void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa); +void pblk_update_map_cache(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa); +void pblk_update_map_dev(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa, struct ppa_addr entry_line); +int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, + struct pblk_line *gc_line, u64 paddr); +void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, + u64 *lba_list, int nr_secs); +void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, + sector_t blba, int nr_secs); + +/* + * pblk user I/O write path + */ +int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, + unsigned long flags); +int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); + +/* + * pblk map + */ +void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int sentry, unsigned long *lun_bitmap, + unsigned int valid_secs, struct ppa_addr *erase_ppa); +void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, + unsigned long *lun_bitmap, unsigned int valid_secs, + unsigned int off); + +/* + * pblk write thread + */ +int pblk_write_ts(void *data); +void pblk_write_timer_fn(struct timer_list *t); +void pblk_write_should_kick(struct pblk *pblk); +void pblk_write_kick(struct pblk *pblk); + +/* + * pblk read path + */ +extern struct bio_set pblk_bio_set; +int pblk_submit_read(struct pblk *pblk, struct bio *bio); +int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); +/* + * pblk recovery + */ +struct pblk_line *pblk_recov_l2p(struct pblk *pblk); +int pblk_recov_pad(struct pblk *pblk); +int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); + +/* + * pblk gc + */ +#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ +#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ +#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ +#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ + +int pblk_gc_init(struct pblk *pblk); +void pblk_gc_exit(struct pblk *pblk, bool graceful); +void pblk_gc_should_start(struct pblk *pblk); +void pblk_gc_should_stop(struct pblk *pblk); +void pblk_gc_should_kick(struct pblk *pblk); +void pblk_gc_free_full_lines(struct pblk *pblk); +void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, + int *gc_active); +int pblk_gc_sysfs_force(struct pblk *pblk, int force); + +/* + * pblk rate limiter + */ +void pblk_rl_init(struct pblk_rl *rl, int budget); +void pblk_rl_free(struct pblk_rl *rl); +void pblk_rl_update_rates(struct pblk_rl *rl); +int pblk_rl_high_thrs(struct pblk_rl *rl); +unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); +unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl); +int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); +void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); +void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); +int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); +void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); +void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); +int pblk_rl_max_io(struct pblk_rl *rl); +void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); +void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, + bool used); +int pblk_rl_is_limit(struct pblk_rl *rl); + +void pblk_rl_werr_line_in(struct pblk_rl *rl); +void pblk_rl_werr_line_out(struct pblk_rl *rl); + +/* + * pblk sysfs + */ +int pblk_sysfs_init(struct gendisk *tdisk); +void pblk_sysfs_exit(struct gendisk *tdisk); + +static inline void *pblk_malloc(size_t size, int type, gfp_t flags) +{ + if (type == PBLK_KMALLOC_META) + return kmalloc(size, flags); + return vmalloc(size); +} + +static inline void pblk_mfree(void *ptr, int type) +{ + if (type == PBLK_KMALLOC_META) + kfree(ptr); + else + vfree(ptr); +} + +static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx) +{ + return c_ctx - sizeof(struct nvm_rq); +} + +static inline void *emeta_to_bb(struct line_emeta *emeta) +{ + return emeta->bb_bitmap; +} + +static inline void *emeta_to_wa(struct pblk_line_meta *lm, + struct line_emeta *emeta) +{ + return emeta->bb_bitmap + lm->blk_bitmap_len; +} + +static inline void *emeta_to_lbas(struct pblk *pblk, struct line_emeta *emeta) +{ + return ((void *)emeta + pblk->lm.emeta_len[1]); +} + +static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) +{ + return (emeta_to_lbas(pblk, emeta) + pblk->lm.emeta_len[2]); +} + +static inline int pblk_line_vsc(struct pblk_line *line) +{ + return le32_to_cpu(*line->vsc); +} + +static inline int pblk_pad_distance(struct pblk *pblk) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + return geo->mw_cunits * geo->all_luns * geo->ws_opt; +} + +static inline int pblk_ppa_to_line(struct ppa_addr p) +{ + return p.a.blk; +} + +static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) +{ + return p.a.lun * geo->num_ch + p.a.ch; +} + +static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr, + u64 line_id) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct ppa_addr ppa; + + if (geo->version == NVM_OCSSD_SPEC_12) { + struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; + + ppa.ppa = 0; + ppa.g.blk = line_id; + ppa.g.pg = (paddr & ppaf->pg_mask) >> ppaf->pg_offset; + ppa.g.lun = (paddr & ppaf->lun_mask) >> ppaf->lun_offset; + ppa.g.ch = (paddr & ppaf->ch_mask) >> ppaf->ch_offset; + ppa.g.pl = (paddr & ppaf->pln_mask) >> ppaf->pln_offset; + ppa.g.sec = (paddr & ppaf->sec_mask) >> ppaf->sec_offset; + } else { + struct pblk_addrf *uaddrf = &pblk->uaddrf; + int secs, chnls, luns; + + ppa.ppa = 0; + + ppa.m.chk = line_id; + + paddr = div_u64_rem(paddr, uaddrf->sec_stripe, &secs); + ppa.m.sec = secs; + + paddr = div_u64_rem(paddr, uaddrf->ch_stripe, &chnls); + ppa.m.grp = chnls; + + paddr = div_u64_rem(paddr, uaddrf->lun_stripe, &luns); + ppa.m.pu = luns; + + ppa.m.sec += uaddrf->sec_stripe * paddr; + } + + return ppa; +} + +static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk, + struct ppa_addr p) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + u64 paddr; + + if (geo->version == NVM_OCSSD_SPEC_12) { + struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; + + paddr = (u64)p.g.ch << ppaf->ch_offset; + paddr |= (u64)p.g.lun << ppaf->lun_offset; + paddr |= (u64)p.g.pg << ppaf->pg_offset; + paddr |= (u64)p.g.pl << ppaf->pln_offset; + paddr |= (u64)p.g.sec << ppaf->sec_offset; + } else { + struct pblk_addrf *uaddrf = &pblk->uaddrf; + u64 secs = p.m.sec; + int sec_stripe; + + paddr = (u64)p.m.grp * uaddrf->sec_stripe; + paddr += (u64)p.m.pu * uaddrf->sec_lun_stripe; + + secs = div_u64_rem(secs, uaddrf->sec_stripe, &sec_stripe); + paddr += secs * uaddrf->sec_ws_stripe; + paddr += sec_stripe; + } + + return paddr; +} + +static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) +{ + struct ppa_addr ppa64; + + ppa64.ppa = 0; + + if (ppa32 == -1) { + ppa64.ppa = ADDR_EMPTY; + } else if (ppa32 & (1U << 31)) { + ppa64.c.line = ppa32 & ((~0U) >> 1); + ppa64.c.is_cached = 1; + } else { + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + if (geo->version == NVM_OCSSD_SPEC_12) { + struct nvm_addrf_12 *ppaf = + (struct nvm_addrf_12 *)&pblk->addrf; + + ppa64.g.ch = (ppa32 & ppaf->ch_mask) >> + ppaf->ch_offset; + ppa64.g.lun = (ppa32 & ppaf->lun_mask) >> + ppaf->lun_offset; + ppa64.g.blk = (ppa32 & ppaf->blk_mask) >> + ppaf->blk_offset; + ppa64.g.pg = (ppa32 & ppaf->pg_mask) >> + ppaf->pg_offset; + ppa64.g.pl = (ppa32 & ppaf->pln_mask) >> + ppaf->pln_offset; + ppa64.g.sec = (ppa32 & ppaf->sec_mask) >> + ppaf->sec_offset; + } else { + struct nvm_addrf *lbaf = &pblk->addrf; + + ppa64.m.grp = (ppa32 & lbaf->ch_mask) >> + lbaf->ch_offset; + ppa64.m.pu = (ppa32 & lbaf->lun_mask) >> + lbaf->lun_offset; + ppa64.m.chk = (ppa32 & lbaf->chk_mask) >> + lbaf->chk_offset; + ppa64.m.sec = (ppa32 & lbaf->sec_mask) >> + lbaf->sec_offset; + } + } + + return ppa64; +} + +static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) +{ + u32 ppa32 = 0; + + if (ppa64.ppa == ADDR_EMPTY) { + ppa32 = ~0U; + } else if (ppa64.c.is_cached) { + ppa32 |= ppa64.c.line; + ppa32 |= 1U << 31; + } else { + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + + if (geo->version == NVM_OCSSD_SPEC_12) { + struct nvm_addrf_12 *ppaf = + (struct nvm_addrf_12 *)&pblk->addrf; + + ppa32 |= ppa64.g.ch << ppaf->ch_offset; + ppa32 |= ppa64.g.lun << ppaf->lun_offset; + ppa32 |= ppa64.g.blk << ppaf->blk_offset; + ppa32 |= ppa64.g.pg << ppaf->pg_offset; + ppa32 |= ppa64.g.pl << ppaf->pln_offset; + ppa32 |= ppa64.g.sec << ppaf->sec_offset; + } else { + struct nvm_addrf *lbaf = &pblk->addrf; + + ppa32 |= ppa64.m.grp << lbaf->ch_offset; + ppa32 |= ppa64.m.pu << lbaf->lun_offset; + ppa32 |= ppa64.m.chk << lbaf->chk_offset; + ppa32 |= ppa64.m.sec << lbaf->sec_offset; + } + } + + return ppa32; +} + +static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk, + sector_t lba) +{ + struct ppa_addr ppa; + + if (pblk->addrf_len < 32) { + u32 *map = (u32 *)pblk->trans_map; + + ppa = pblk_ppa32_to_ppa64(pblk, map[lba]); + } else { + struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map; + + ppa = map[lba]; + } + + return ppa; +} + +static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa) +{ + if (pblk->addrf_len < 32) { + u32 *map = (u32 *)pblk->trans_map; + + map[lba] = pblk_ppa64_to_ppa32(pblk, ppa); + } else { + u64 *map = (u64 *)pblk->trans_map; + + map[lba] = ppa.ppa; + } +} + +static inline int pblk_ppa_empty(struct ppa_addr ppa_addr) +{ + return (ppa_addr.ppa == ADDR_EMPTY); +} + +static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr) +{ + ppa_addr->ppa = ADDR_EMPTY; +} + +static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa) +{ + return (lppa.ppa == rppa.ppa); +} + +static inline int pblk_addr_in_cache(struct ppa_addr ppa) +{ + return (ppa.ppa != ADDR_EMPTY && ppa.c.is_cached); +} + +static inline int pblk_addr_to_cacheline(struct ppa_addr ppa) +{ + return ppa.c.line; +} + +static inline struct ppa_addr pblk_cacheline_to_addr(int addr) +{ + struct ppa_addr p; + + p.c.line = addr; + p.c.is_cached = 1; + + return p; +} + +static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, + struct line_header *header) +{ + u32 crc = ~(u32)0; + + crc = crc32_le(crc, (unsigned char *)header + sizeof(crc), + sizeof(struct line_header) - sizeof(crc)); + + return crc; +} + +static inline u32 pblk_calc_smeta_crc(struct pblk *pblk, + struct line_smeta *smeta) +{ + struct pblk_line_meta *lm = &pblk->lm; + u32 crc = ~(u32)0; + + crc = crc32_le(crc, (unsigned char *)smeta + + sizeof(struct line_header) + sizeof(crc), + lm->smeta_len - + sizeof(struct line_header) - sizeof(crc)); + + return crc; +} + +static inline u32 pblk_calc_emeta_crc(struct pblk *pblk, + struct line_emeta *emeta) +{ + struct pblk_line_meta *lm = &pblk->lm; + u32 crc = ~(u32)0; + + crc = crc32_le(crc, (unsigned char *)emeta + + sizeof(struct line_header) + sizeof(crc), + lm->emeta_len[0] - + sizeof(struct line_header) - sizeof(crc)); + + return crc; +} + +static inline int pblk_set_progr_mode(struct pblk *pblk, int type) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int flags; + + if (geo->version == NVM_OCSSD_SPEC_20) + return 0; + + flags = geo->pln_mode >> 1; + + if (type == PBLK_WRITE) + flags |= NVM_IO_SCRAMBLE_ENABLE; + + return flags; +} + +enum { + PBLK_READ_RANDOM = 0, + PBLK_READ_SEQUENTIAL = 1, +}; + +static inline int pblk_set_read_mode(struct pblk *pblk, int type) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + int flags; + + if (geo->version == NVM_OCSSD_SPEC_20) + return 0; + + flags = NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE; + if (type == PBLK_READ_SEQUENTIAL) + flags |= geo->pln_mode >> 1; + + return flags; +} + +static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs) +{ + return !(nr_secs % pblk->min_write_pgs); +} + +#ifdef CONFIG_NVM_PBLK_DEBUG +static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p, + char *msg, int error) +{ + struct nvm_geo *geo = &pblk->dev->geo; + + if (p->c.is_cached) { + pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n", + msg, error, (u64)p->c.line); + } else if (geo->version == NVM_OCSSD_SPEC_12) { + pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", + msg, error, + p->g.ch, p->g.lun, p->g.blk, + p->g.pg, p->g.pl, p->g.sec); + } else { + pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n", + msg, error, + p->m.grp, p->m.pu, p->m.chk, p->m.sec); + } +} + +static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, + int error) +{ + int bit = -1; + + if (rqd->nr_ppas == 1) { + print_ppa(pblk, &rqd->ppa_addr, "rqd", error); + return; + } + + while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, + bit + 1)) < rqd->nr_ppas) { + print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error); + } + + pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status); +} + +static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, + struct ppa_addr *ppas, int nr_ppas) +{ + struct nvm_geo *geo = &tgt_dev->geo; + struct ppa_addr *ppa; + int i; + + for (i = 0; i < nr_ppas; i++) { + ppa = &ppas[i]; + + if (geo->version == NVM_OCSSD_SPEC_12) { + if (!ppa->c.is_cached && + ppa->g.ch < geo->num_ch && + ppa->g.lun < geo->num_lun && + ppa->g.pl < geo->num_pln && + ppa->g.blk < geo->num_chk && + ppa->g.pg < geo->num_pg && + ppa->g.sec < geo->ws_min) + continue; + } else { + if (!ppa->c.is_cached && + ppa->m.grp < geo->num_ch && + ppa->m.pu < geo->num_lun && + ppa->m.chk < geo->num_chk && + ppa->m.sec < geo->clba) + continue; + } + + print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i); + + return 1; + } + return 0; +} + +static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct ppa_addr *ppa_list; + + ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; + + if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { + WARN_ON(1); + return -EINVAL; + } + + if (rqd->opcode == NVM_OP_PWRITE) { + struct pblk_line *line; + struct ppa_addr ppa; + int i; + + for (i = 0; i < rqd->nr_ppas; i++) { + ppa = ppa_list[i]; + line = &pblk->lines[pblk_ppa_to_line(ppa)]; + + spin_lock(&line->lock); + if (line->state != PBLK_LINESTATE_OPEN) { + pblk_err(pblk, "bad ppa: line:%d,state:%d\n", + line->id, line->state); + WARN_ON(1); + spin_unlock(&line->lock); + return -EINVAL; + } + spin_unlock(&line->lock); + } + } + + return 0; +} +#endif + +static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) +{ + struct pblk_line_meta *lm = &pblk->lm; + + if (paddr > lm->sec_per_line) + return 1; + + return 0; +} + +static inline unsigned int pblk_get_bi_idx(struct bio *bio) +{ + return bio->bi_iter.bi_idx; +} + +static inline sector_t pblk_get_lba(struct bio *bio) +{ + return bio->bi_iter.bi_sector / NR_PHY_IN_LOG; +} + +static inline unsigned int pblk_get_secs(struct bio *bio) +{ + return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; +} + +static inline void pblk_setup_uuid(struct pblk *pblk) +{ + uuid_le uuid; + + uuid_le_gen(&uuid); + memcpy(pblk->instance_uuid, uuid.b, 16); +} +#endif /* PBLK_H_ */ |