summaryrefslogtreecommitdiffstats
path: root/drivers/lightnvm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--drivers/lightnvm/Kconfig42
-rw-r--r--drivers/lightnvm/Makefile11
-rw-r--r--drivers/lightnvm/core.c1201
-rw-r--r--drivers/lightnvm/pblk-cache.c134
-rw-r--r--drivers/lightnvm/pblk-core.c2095
-rw-r--r--drivers/lightnvm/pblk-gc.c705
-rw-r--r--drivers/lightnvm/pblk-init.c1366
-rw-r--r--drivers/lightnvm/pblk-map.c188
-rw-r--r--drivers/lightnvm/pblk-rb.c852
-rw-r--r--drivers/lightnvm/pblk-read.c701
-rw-r--r--drivers/lightnvm/pblk-recovery.c1011
-rw-r--r--drivers/lightnvm/pblk-rl.c250
-rw-r--r--drivers/lightnvm/pblk-sysfs.c720
-rw-r--r--drivers/lightnvm/pblk-write.c673
-rw-r--r--drivers/lightnvm/pblk.h1444
15 files changed, 11393 insertions, 0 deletions
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
new file mode 100644
index 000000000..38cd49a3a
--- /dev/null
+++ b/drivers/lightnvm/Kconfig
@@ -0,0 +1,42 @@
+#
+# Open-Channel SSD NVM configuration
+#
+
+menuconfig NVM
+ bool "Open-Channel SSD target support"
+ depends on BLOCK && PCI && BROKEN
+ select BLK_DEV_NVME
+ help
+ Say Y here to enable Open-Channel SSDs.
+
+ Open-Channel SSDs implement a set of extensions to SSDs that
+ expose direct access to the underlying non-volatile memory.
+
+ If you say N, all options in this submenu will be skipped and
+ disabled. Only do this if you know what you are doing.
+
+if NVM
+
+config NVM_PBLK
+ tristate "Physical Block Device Open-Channel SSD target"
+ select CRC32
+ help
+ Allows an open-channel SSD to be exposed as a block device to the
+ host. The target assumes the device exposes raw flash and must be
+ explicitly managed by the host.
+
+ Please note the disk format is considered EXPERIMENTAL for now.
+
+if NVM_PBLK
+
+config NVM_PBLK_DEBUG
+ bool "PBlk Debug Support"
+ default n
+ help
+ Enables debug support for pblk. This includes extra checks, more
+ vocal error messages, and extra tracking fields in the pblk sysfs
+ entries.
+
+endif # NVM_PBLK
+
+endif # NVM
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
new file mode 100644
index 000000000..97d9d7c71
--- /dev/null
+++ b/drivers/lightnvm/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for Open-Channel SSDs.
+#
+
+# Core LightNVM support, selected by CONFIG_NVM.
+obj-$(CONFIG_NVM) := core.o
+# pblk target, selected by CONFIG_NVM_PBLK; pblk.o is assembled from the
+# objects listed in pblk-y below.
+obj-$(CONFIG_NVM_PBLK) += pblk.o
+pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
+ pblk-write.o pblk-cache.o pblk-read.o \
+ pblk-gc.o pblk-recovery.o pblk-map.o \
+ pblk-rl.o pblk-sysfs.o
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
new file mode 100644
index 000000000..60aa7bc5a
--- /dev/null
+++ b/drivers/lightnvm/core.c
@@ -0,0 +1,1201 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen. All rights reserved.
+ * Initial release: Matias Bjorling <m@bjorling.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/sem.h>
+#include <linux/bitmap.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/miscdevice.h>
+#include <linux/lightnvm.h>
+#include <linux/sched/sysctl.h>
+
+static LIST_HEAD(nvm_tgt_types);
+static DECLARE_RWSEM(nvm_tgtt_lock);
+static LIST_HEAD(nvm_devices);
+static DECLARE_RWSEM(nvm_lock);
+
+/*
+ * Map between virtual and physical channel and lun.
+ *
+ * In a target map the offsets are added to a target-relative address to get
+ * the device address (see nvm_map_to_dev()); in the device's reverse map they
+ * are subtracted again (see nvm_map_to_tgt()).
+ */
+struct nvm_ch_map {
+ int ch_off; /* channel offset between target and device numbering */
+ int num_lun; /* number of entries allocated in lun_offs */
+ int *lun_offs; /* one lun offset per lun of this channel */
+};
+
+/* Per-target (or per-device reverse) collection of channel maps. */
+struct nvm_dev_map {
+ struct nvm_ch_map *chnls; /* array of channel maps */
+ int num_ch; /* entries in chnls; set for target maps by nvm_create_tgt_dev() */
+};
+
+/*
+ * Return the target on @dev whose disk is called @name, or NULL when no
+ * such target is on dev->targets. No lock is taken here; the caller in this
+ * file (nvm_remove_tgt) holds dev->mlock around the call.
+ */
+static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
+{
+	struct nvm_target *cur;
+
+	list_for_each_entry(cur, &dev->targets, list) {
+		if (strcmp(name, cur->disk->disk_name) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Report whether any registered device already carries a target whose disk
+ * name equals @name. Walks nvm_devices under nvm_lock and each device's
+ * target list under that device's mlock.
+ */
+static bool nvm_target_exists(const char *name)
+{
+	struct nvm_dev *dev;
+	bool found = false;
+
+	down_write(&nvm_lock);
+	list_for_each_entry(dev, &nvm_devices, devices) {
+		mutex_lock(&dev->mlock);
+		found = nvm_find_target(dev, name) != NULL;
+		mutex_unlock(&dev->mlock);
+
+		if (found)
+			break;
+	}
+	up_write(&nvm_lock);
+
+	return found;
+}
+
+/*
+ * Mark luns lun_begin..lun_end (inclusive) as taken in dev->lun_map.
+ *
+ * Returns 0 when every bit was free. If one is already set, the bits set by
+ * this call are cleared again and -EBUSY is returned.
+ */
+static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end)
+{
+	int lun;
+
+	for (lun = lun_begin; lun <= lun_end; lun++) {
+		if (!test_and_set_bit(lun, dev->lun_map))
+			continue;
+
+		pr_err("nvm: lun %d already allocated\n", lun);
+
+		/* undo only what we reserved before hitting the conflict */
+		while (--lun >= lun_begin)
+			clear_bit(lun, dev->lun_map);
+
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/*
+ * Give back luns lun_begin..lun_end (inclusive) in dev->lun_map, warning
+ * about any bit that was not set.
+ */
+static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin,
+				 int lun_end)
+{
+	int lun = lun_begin;
+
+	while (lun <= lun_end) {
+		WARN_ON(!test_and_clear_bit(lun, dev->lun_map));
+		lun++;
+	}
+}
+
+/*
+ * Free a target device together with its channel/lun map and lun array.
+ * When @clear is non-zero the luns covered by the map are also released in
+ * the parent device's lun_map (with a warning for bits that were not set).
+ */
+static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct nvm_dev_map *map = tgt_dev->map;
+	int c, l;
+
+	for (c = 0; c < map->num_ch; c++) {
+		struct nvm_ch_map *ch_map = &map->chnls[c];
+		int dev_ch = c + ch_map->ch_off;
+
+		for (l = 0; clear && l < ch_map->num_lun; l++) {
+			int dev_lun = l + ch_map->lun_offs[l];
+			int bit = (dev_ch * dev->geo.num_lun) + dev_lun;
+
+			WARN_ON(!test_and_clear_bit(bit, dev->lun_map));
+		}
+
+		kfree(ch_map->lun_offs);
+	}
+
+	kfree(map->chnls);
+	kfree(map);
+
+	kfree(tgt_dev->luns);
+	kfree(tgt_dev);
+}
+
+/*
+ * nvm_create_tgt_dev - build a target device over a range of luns
+ * @dev: parent device
+ * @lun_begin: first device lun (inclusive)
+ * @lun_end: last device lun (inclusive)
+ * @op: over-provisioning value stored in the target geometry
+ *
+ * Allocates the target's channel/lun map and lun address array, records the
+ * matching offsets in the parent's reverse map (dev->rmap), and derives the
+ * target geometry from the parent's.
+ *
+ * Returns the new target device, or NULL if any allocation fails (everything
+ * allocated here is freed again in that case).
+ */
+static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
+ u16 lun_begin, u16 lun_end,
+ u16 op)
+{
+ struct nvm_tgt_dev *tgt_dev = NULL;
+ struct nvm_dev_map *dev_rmap = dev->rmap;
+ struct nvm_dev_map *dev_map;
+ struct ppa_addr *luns;
+ int num_lun = lun_end - lun_begin + 1;
+ int luns_left = num_lun;
+ int num_ch = num_lun / dev->geo.num_lun;
+ int num_ch_mod = num_lun % dev->geo.num_lun;
+ int bch = lun_begin / dev->geo.num_lun;
+ int blun = lun_begin % dev->geo.num_lun;
+ int lunid = 0;
+ int lun_balanced = 1;
+ int sec_per_lun, prev_num_lun;
+ int i, j;
+
+ /* round the channel count up when the last channel is only partly used */
+ num_ch = (num_ch_mod == 0) ? num_ch : num_ch + 1;
+
+ dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL);
+ if (!dev_map)
+ goto err_dev;
+
+ dev_map->chnls = kcalloc(num_ch, sizeof(struct nvm_ch_map), GFP_KERNEL);
+ if (!dev_map->chnls)
+ goto err_chnls;
+
+ luns = kcalloc(num_lun, sizeof(struct ppa_addr), GFP_KERNEL);
+ if (!luns)
+ goto err_luns;
+
+ /* lun count of the first channel; later channels are compared to it */
+ prev_num_lun = (luns_left > dev->geo.num_lun) ?
+ dev->geo.num_lun : luns_left;
+ for (i = 0; i < num_ch; i++) {
+ struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
+ int *lun_roffs = ch_rmap->lun_offs;
+ struct nvm_ch_map *ch_map = &dev_map->chnls[i];
+ int *lun_offs;
+ int luns_in_chnl = (luns_left > dev->geo.num_lun) ?
+ dev->geo.num_lun : luns_left;
+
+ if (lun_balanced && prev_num_lun != luns_in_chnl)
+ lun_balanced = 0;
+
+ /* forward and reverse map share the same channel offset */
+ ch_map->ch_off = ch_rmap->ch_off = bch;
+ ch_map->num_lun = luns_in_chnl;
+
+ lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
+ if (!lun_offs)
+ goto err_ch;
+
+ for (j = 0; j < luns_in_chnl; j++) {
+ /* target-relative address of this lun */
+ luns[lunid].ppa = 0;
+ luns[lunid].a.ch = i;
+ luns[lunid++].a.lun = j;
+
+ lun_offs[j] = blun;
+ lun_roffs[j + blun] = blun;
+ }
+
+ ch_map->lun_offs = lun_offs;
+
+ /* when starting a new channel, lun offset is reset */
+ blun = 0;
+ luns_left -= luns_in_chnl;
+ }
+
+ dev_map->num_ch = num_ch;
+
+ tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL);
+ if (!tgt_dev)
+ goto err_ch;
+
+ /* Inherit device geometry from parent */
+ memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
+
+ /* Target device only owns a portion of the physical device */
+ tgt_dev->geo.num_ch = num_ch;
+ /* -1 marks a target whose channels do not all hold the same lun count */
+ tgt_dev->geo.num_lun = (lun_balanced) ? prev_num_lun : -1;
+ tgt_dev->geo.all_luns = num_lun;
+ tgt_dev->geo.all_chunks = num_lun * dev->geo.num_chk;
+
+ tgt_dev->geo.op = op;
+
+ sec_per_lun = dev->geo.clba * dev->geo.num_chk;
+ tgt_dev->geo.total_secs = num_lun * sec_per_lun;
+
+ tgt_dev->q = dev->q;
+ tgt_dev->map = dev_map;
+ tgt_dev->luns = luns;
+ tgt_dev->parent = dev;
+
+ return tgt_dev;
+err_ch:
+ /* i is the number of channels whose lun_offs were already attached */
+ while (--i >= 0)
+ kfree(dev_map->chnls[i].lun_offs);
+ kfree(luns);
+err_luns:
+ kfree(dev_map->chnls);
+err_chnls:
+ kfree(dev_map);
+err_dev:
+ /* tgt_dev is still NULL on every error path */
+ return tgt_dev;
+}
+
+/* Block device operations installed on every target disk; only .owner is set. */
+static const struct block_device_operations nvm_fops = {
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Look up a registered target type by name. Performs no locking itself; the
+ * callers in this file hold nvm_tgtt_lock.
+ */
+static struct nvm_tgt_type *__nvm_find_target_type(const char *name)
+{
+	struct nvm_tgt_type *cur;
+
+	list_for_each_entry(cur, &nvm_tgt_types, list) {
+		if (strcmp(name, cur->name) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Locked wrapper around __nvm_find_target_type(): takes nvm_tgtt_lock for
+ * the duration of the lookup and returns the match, or NULL.
+ */
+static struct nvm_tgt_type *nvm_find_target_type(const char *name)
+{
+	struct nvm_tgt_type *found;
+
+	down_write(&nvm_tgtt_lock);
+	found = __nvm_find_target_type(name);
+	up_write(&nvm_tgtt_lock);
+
+	return found;
+}
+
+/*
+ * Validate a lun range against the device geometry: the range must not be
+ * reversed and lun_end must be below geo->all_luns.
+ *
+ * Returns 0 when valid, -EINVAL (after logging) otherwise.
+ */
+static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin,
+				 int lun_end)
+{
+	if (lun_begin <= lun_end && lun_end < geo->all_luns)
+		return 0;
+
+	pr_err("nvm: lun out of bound (%u:%u > %u)\n",
+	       lun_begin, lun_end, geo->all_luns - 1);
+
+	return -EINVAL;
+}
+
+/*
+ * Resolve a "simple" create request. A begin/end pair of -1/-1 selects every
+ * lun of the device; the resulting range is then validated.
+ */
+static int __nvm_config_simple(struct nvm_dev *dev,
+			       struct nvm_ioctl_create_simple *s)
+{
+	struct nvm_geo *geo = &dev->geo;
+	bool whole_device = (s->lun_begin == -1) && (s->lun_end == -1);
+
+	if (whole_device) {
+		s->lun_begin = 0;
+		s->lun_end = geo->all_luns - 1;
+	}
+
+	return nvm_config_check_luns(geo, s->lun_begin, s->lun_end);
+}
+
+/*
+ * Resolve an "extended" create request. A begin/end pair of 0xFFFF/0xFFFF
+ * selects every lun; an op of 0xFFFF selects NVM_TARGET_DEFAULT_OP, any other
+ * op must lie within [NVM_TARGET_MIN_OP, NVM_TARGET_MAX_OP]. The lun range
+ * is validated last.
+ */
+static int __nvm_config_extended(struct nvm_dev *dev,
+				 struct nvm_ioctl_create_extended *e)
+{
+	struct nvm_geo *geo = &dev->geo;
+
+	if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
+		e->lun_begin = 0;
+		e->lun_end = geo->all_luns - 1;
+	}
+
+	if (e->op != 0xFFFF) {
+		if (e->op < NVM_TARGET_MIN_OP || e->op > NVM_TARGET_MAX_OP) {
+			pr_err("nvm: invalid over provisioning value\n");
+			return -EINVAL;
+		}
+	} else {
+		/* op not set falls into target's default */
+		e->op = NVM_TARGET_DEFAULT_OP;
+	}
+
+	return nvm_config_check_luns(geo, e->lun_begin, e->lun_end);
+}
+
+/*
+ * nvm_create_tgt - create a target of the requested type on @dev
+ * @dev: device to carve the target out of
+ * @create: ioctl request (config type, lun range, target type and name, flags)
+ *
+ * Validates the configuration, reserves the lun range, builds the target
+ * device, disk and queue, calls the target type's init hook, publishes the
+ * disk and finally links the target into dev->targets.
+ *
+ * Returns 0 on success or a negative errno; on failure everything set up so
+ * far is unwound through the labels at the end of the function.
+ */
+static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
+{
+ struct nvm_ioctl_create_extended e;
+ struct request_queue *tqueue;
+ struct gendisk *tdisk;
+ struct nvm_tgt_type *tt;
+ struct nvm_target *t;
+ struct nvm_tgt_dev *tgt_dev;
+ void *targetdata;
+ int ret;
+
+ /* normalise both config flavours into the extended form held in e */
+ switch (create->conf.type) {
+ case NVM_CONFIG_TYPE_SIMPLE:
+ ret = __nvm_config_simple(dev, &create->conf.s);
+ if (ret)
+ return ret;
+
+ e.lun_begin = create->conf.s.lun_begin;
+ e.lun_end = create->conf.s.lun_end;
+ e.op = NVM_TARGET_DEFAULT_OP;
+ break;
+ case NVM_CONFIG_TYPE_EXTENDED:
+ ret = __nvm_config_extended(dev, &create->conf.e);
+ if (ret)
+ return ret;
+
+ e = create->conf.e;
+ break;
+ default:
+ pr_err("nvm: config type not valid\n");
+ return -EINVAL;
+ }
+
+ tt = nvm_find_target_type(create->tgttype);
+ if (!tt) {
+ pr_err("nvm: target type %s not found\n", create->tgttype);
+ return -EINVAL;
+ }
+
+ if (nvm_target_exists(create->tgtname)) {
+ pr_err("nvm: target name already exists (%s)\n",
+ create->tgtname);
+ return -EINVAL;
+ }
+
+ ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end);
+ if (ret)
+ return ret;
+
+ t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
+ if (!t) {
+ ret = -ENOMEM;
+ goto err_reserve;
+ }
+
+ tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op);
+ if (!tgt_dev) {
+ pr_err("nvm: could not create target device\n");
+ ret = -ENOMEM;
+ goto err_t;
+ }
+
+ tdisk = alloc_disk(0);
+ if (!tdisk) {
+ ret = -ENOMEM;
+ goto err_dev;
+ }
+
+ tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node, NULL);
+ if (!tqueue) {
+ ret = -ENOMEM;
+ goto err_disk;
+ }
+ blk_queue_make_request(tqueue, tt->make_rq);
+
+ strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name));
+ tdisk->flags = GENHD_FL_EXT_DEVT;
+ tdisk->major = 0;
+ tdisk->first_minor = 0;
+ tdisk->fops = &nvm_fops;
+ tdisk->queue = tqueue;
+
+ /* init reports failure through an ERR_PTR-encoded return value */
+ targetdata = tt->init(tgt_dev, tdisk, create->flags);
+ if (IS_ERR(targetdata)) {
+ ret = PTR_ERR(targetdata);
+ goto err_init;
+ }
+
+ tdisk->private_data = targetdata;
+ tqueue->queuedata = targetdata;
+
+ blk_queue_max_hw_sectors(tqueue,
+ (dev->geo.csecs >> 9) * NVM_MAX_VLBA);
+
+ set_capacity(tdisk, tt->capacity(targetdata));
+ add_disk(tdisk);
+
+ if (tt->sysfs_init && tt->sysfs_init(tdisk)) {
+ ret = -ENOMEM;
+ goto err_sysfs;
+ }
+
+ t->type = tt;
+ t->disk = tdisk;
+ t->dev = tgt_dev;
+
+ mutex_lock(&dev->mlock);
+ list_add_tail(&t->list, &dev->targets);
+ mutex_unlock(&dev->mlock);
+
+ __module_get(tt->owner);
+
+ return 0;
+err_sysfs:
+ /*
+ * NOTE(review): add_disk() has already run when this label is reached,
+ * yet no del_gendisk() is visible on this path - confirm intended.
+ */
+ if (tt->exit)
+ tt->exit(targetdata, true);
+err_init:
+ blk_cleanup_queue(tqueue);
+ tdisk->queue = NULL;
+err_disk:
+ put_disk(tdisk);
+err_dev:
+ /* 0: lun bits are released separately below, not by this call */
+ nvm_remove_tgt_dev(tgt_dev, 0);
+err_t:
+ kfree(t);
+err_reserve:
+ nvm_release_luns_err(dev, e.lun_begin, e.lun_end);
+ return ret;
+}
+
+/*
+ * Tear down one target: remove its disk and queue, run the target type's
+ * sysfs_exit/exit hooks, free the target device (releasing its luns), drop
+ * the module reference and unlink and free the target itself.
+ *
+ * @graceful is passed through to the target type's exit hook. Both callers
+ * in this file hold the owning device's mlock.
+ */
+static void __nvm_remove_target(struct nvm_target *t, bool graceful)
+{
+ struct nvm_tgt_type *tt = t->type;
+ struct gendisk *tdisk = t->disk;
+ struct request_queue *q = tdisk->queue;
+
+ del_gendisk(tdisk);
+ blk_cleanup_queue(q);
+
+ if (tt->sysfs_exit)
+ tt->sysfs_exit(tdisk);
+
+ if (tt->exit)
+ tt->exit(tdisk->private_data, graceful);
+
+ /* 1: also clear this target's bits in the parent's lun_map */
+ nvm_remove_tgt_dev(t->dev, 1);
+ put_disk(tdisk);
+ module_put(t->type->owner);
+
+ list_del(&t->list);
+ kfree(t);
+}
+
+/**
+ * nvm_remove_tgt - Removes a target from the media manager
+ * @dev: device to search
+ * @remove: ioctl structure carrying the name of the target to remove
+ *
+ * The lookup and the removal both happen under dev->mlock.
+ *
+ * Returns:
+ * 0: the target was found on @dev and removed
+ * 1: no target with that name exists on @dev
+ */
+static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
+{
+	struct nvm_target *victim;
+	int ret = 1;
+
+	mutex_lock(&dev->mlock);
+	victim = nvm_find_target(dev, remove->tgtname);
+	if (victim) {
+		__nvm_remove_target(victim, true);
+		ret = 0;
+	}
+	mutex_unlock(&dev->mlock);
+
+	return ret;
+}
+
+/*
+ * nvm_register_map - allocate the device's reverse (device-to-target) map
+ * @dev: device whose geometry (num_ch, num_lun) sizes the map
+ *
+ * One nvm_ch_map is allocated per channel, each with one lun offset per lun.
+ * Channel and lun offsets start out as -1; nvm_create_tgt_dev() overwrites
+ * them for the luns a target takes over. On success the map is stored in
+ * dev->rmap.
+ *
+ * Returns 0 on success or -ENOMEM, in which case every allocation made here
+ * has been released again.
+ */
+static int nvm_register_map(struct nvm_dev *dev)
+{
+	struct nvm_dev_map *rmap;
+	int i, j;
+
+	rmap = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL);
+	if (!rmap)
+		goto err_rmap;
+
+	rmap->chnls = kcalloc(dev->geo.num_ch, sizeof(struct nvm_ch_map),
+			      GFP_KERNEL);
+	if (!rmap->chnls)
+		goto err_chnls;
+
+	/* rmap comes from kmalloc(); do not leave num_ch uninitialised */
+	rmap->num_ch = dev->geo.num_ch;
+
+	for (i = 0; i < dev->geo.num_ch; i++) {
+		struct nvm_ch_map *ch_rmap;
+		int *lun_roffs;
+		int luns_in_chnl = dev->geo.num_lun;
+
+		ch_rmap = &rmap->chnls[i];
+
+		ch_rmap->ch_off = -1;
+		ch_rmap->num_lun = luns_in_chnl;
+
+		lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
+		if (!lun_roffs)
+			goto err_ch;
+
+		for (j = 0; j < luns_in_chnl; j++)
+			lun_roffs[j] = -1;
+
+		ch_rmap->lun_offs = lun_roffs;
+	}
+
+	dev->rmap = rmap;
+
+	return 0;
+err_ch:
+	/* channels [0, i) already own a lun table */
+	while (--i >= 0)
+		kfree(rmap->chnls[i].lun_offs);
+	/* the channel array itself was leaked on this path before */
+	kfree(rmap->chnls);
+err_chnls:
+	kfree(rmap);
+err_rmap:
+	return -ENOMEM;
+}
+
+/*
+ * Free the device's reverse map: one lun offset table per channel, then the
+ * channel array and the map itself.
+ */
+static void nvm_unregister_map(struct nvm_dev *dev)
+{
+	struct nvm_dev_map *rmap = dev->rmap;
+	struct nvm_ch_map *chnls = rmap->chnls;
+	int ch;
+
+	for (ch = 0; ch < dev->geo.num_ch; ch++)
+		kfree(chnls[ch].lun_offs);
+
+	kfree(chnls);
+	kfree(rmap);
+}
+
+/*
+ * Translate the channel/lun of @p from target-relative to device numbering
+ * by adding the offsets recorded in the target's map.
+ */
+static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
+{
+	struct nvm_dev_map *map = tgt_dev->map;
+	/* selected with the still-untranslated channel index */
+	struct nvm_ch_map *ch = &map->chnls[p->a.ch];
+
+	p->a.lun += ch->lun_offs[p->a.lun];
+	p->a.ch += ch->ch_off;
+}
+
+/*
+ * Translate the channel/lun of @p from device numbering back to
+ * target-relative numbering by subtracting the offsets recorded in the
+ * parent device's reverse map.
+ */
+static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
+{
+	struct nvm_dev_map *rmap = tgt_dev->parent->rmap;
+	/* selected with the device channel index, before it is rewritten */
+	struct nvm_ch_map *ch = &rmap->chnls[p->a.ch];
+
+	p->a.lun -= ch->lun_offs[p->a.lun];
+	p->a.ch -= ch->ch_off;
+}
+
+/*
+ * Convert @nr_ppas addresses in place from target space to the device
+ * format: first apply the channel/lun offsets, then generic_to_dev_addr().
+ */
+static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev,
+			       struct ppa_addr *ppa_list, int nr_ppas)
+{
+	int idx;
+
+	for (idx = 0; idx < nr_ppas; idx++) {
+		struct ppa_addr *ppa = &ppa_list[idx];
+
+		nvm_map_to_dev(tgt_dev, ppa);
+		*ppa = generic_to_dev_addr(tgt_dev->parent, *ppa);
+	}
+}
+
+/*
+ * Convert @nr_ppas addresses in place from the device format back to target
+ * space: first dev_to_generic_addr(), then remove the channel/lun offsets.
+ */
+static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev,
+			       struct ppa_addr *ppa_list, int nr_ppas)
+{
+	int idx;
+
+	for (idx = 0; idx < nr_ppas; idx++) {
+		struct ppa_addr *ppa = &ppa_list[idx];
+
+		*ppa = dev_to_generic_addr(tgt_dev->parent, *ppa);
+		nvm_map_to_tgt(tgt_dev, ppa);
+	}
+}
+
+/*
+ * Convert every address of a request from target to device space. A request
+ * with a single address keeps it inline in rqd->ppa_addr; otherwise the
+ * addresses live in rqd->ppa_list.
+ */
+static void nvm_rq_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
+{
+	struct ppa_addr *ppas;
+
+	ppas = (rqd->nr_ppas == 1) ? &rqd->ppa_addr : rqd->ppa_list;
+
+	nvm_ppa_tgt_to_dev(tgt_dev, ppas, rqd->nr_ppas);
+}
+
+/*
+ * Convert every address of a request from device back to target space. A
+ * request with a single address keeps it inline in rqd->ppa_addr; otherwise
+ * the addresses live in rqd->ppa_list.
+ */
+static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
+{
+	struct ppa_addr *ppas;
+
+	ppas = (rqd->nr_ppas == 1) ? &rqd->ppa_addr : rqd->ppa_list;
+
+	nvm_ppa_dev_to_tgt(tgt_dev, ppas, rqd->nr_ppas);
+}
+
+/*
+ * Add a target type to the global list.
+ *
+ * Returns 0 on success or -EEXIST when a type with the same name is already
+ * registered. The check and the insertion happen under nvm_tgtt_lock.
+ */
+int nvm_register_tgt_type(struct nvm_tgt_type *tt)
+{
+	int ret = -EEXIST;
+
+	down_write(&nvm_tgtt_lock);
+	if (!__nvm_find_target_type(tt->name)) {
+		list_add(&tt->list, &nvm_tgt_types);
+		ret = 0;
+	}
+	up_write(&nvm_tgtt_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(nvm_register_tgt_type);
+
+/*
+ * Remove a target type from the global list under nvm_tgtt_lock. A NULL
+ * @tt is ignored.
+ */
+void nvm_unregister_tgt_type(struct nvm_tgt_type *tt)
+{
+	if (tt) {
+		down_write(&nvm_tgtt_lock);
+		list_del(&tt->list);
+		up_write(&nvm_tgtt_lock);
+	}
+}
+EXPORT_SYMBOL(nvm_unregister_tgt_type);
+
+/*
+ * Allocate from the device's DMA pool through the driver's dev_dma_alloc
+ * hook; the hook's return value is passed straight back to the caller.
+ */
+void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags,
+			dma_addr_t *dma_handler)
+{
+	void *buf;
+
+	buf = dev->ops->dev_dma_alloc(dev, dev->dma_pool, mem_flags,
+				      dma_handler);
+
+	return buf;
+}
+EXPORT_SYMBOL(nvm_dev_dma_alloc);
+
+/*
+ * Return a buffer to the device's DMA pool through the driver's
+ * dev_dma_free hook.
+ */
+void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler)
+{
+	const struct nvm_dev_ops *ops = dev->ops;
+
+	ops->dev_dma_free(dev->dma_pool, addr, dma_handler);
+}
+EXPORT_SYMBOL(nvm_dev_dma_free);
+
+/*
+ * Return the registered device called @name, or NULL. No lock is taken
+ * here; the caller in this file (__nvm_configure_create) holds nvm_lock.
+ */
+static struct nvm_dev *nvm_find_nvm_dev(const char *name)
+{
+	struct nvm_dev *cur;
+
+	list_for_each_entry(cur, &nvm_devices, devices) {
+		if (strcmp(name, cur->name) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Fill the address fields of @rqd from @ppas.
+ *
+ * A single address in single-plane mode is stored inline in rqd->ppa_addr.
+ * Otherwise a DMA list is allocated and every input address is expanded to
+ * one entry per plane, laid out plane-major: entry (plane * nr_ppas + i)
+ * holds ppas[i] with its plane field set to that plane. rqd->nr_ppas ends up
+ * as nr_ppas multiplied by the plane mode.
+ *
+ * Returns 0 on success or -ENOMEM if the list cannot be allocated.
+ */
+static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
+			const struct ppa_addr *ppas, int nr_ppas)
+{
+	struct nvm_geo *geo = &tgt_dev->geo;
+	int planes, pl, idx;
+
+	rqd->nr_ppas = nr_ppas;
+
+	if (geo->pln_mode == NVM_PLANE_SINGLE && nr_ppas == 1) {
+		rqd->ppa_addr = ppas[0];
+		return 0;
+	}
+
+	rqd->ppa_list = nvm_dev_dma_alloc(tgt_dev->parent, GFP_KERNEL,
+					  &rqd->dma_ppa_list);
+	if (!rqd->ppa_list) {
+		pr_err("nvm: failed to allocate dma memory\n");
+		return -ENOMEM;
+	}
+
+	planes = geo->pln_mode;
+	rqd->nr_ppas *= planes;
+
+	for (pl = 0; pl < planes; pl++) {
+		for (idx = 0; idx < nr_ppas; idx++) {
+			struct ppa_addr ppa = ppas[idx];
+
+			ppa.g.pl = pl;
+			rqd->ppa_list[(pl * nr_ppas) + idx] = ppa;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Release the DMA address list of @rqd, if it has one.
+ */
+static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev,
+				 struct nvm_rq *rqd)
+{
+	if (rqd->ppa_list)
+		nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list,
+				 rqd->dma_ppa_list);
+}
+
+/*
+ * Fetch chunk metadata through the driver's get_chk_meta hook. @ppa is
+ * given in target space and converted to the device format first (the
+ * conversion works on the by-value copy); the hook's result is returned.
+ */
+int nvm_get_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct nvm_chk_meta *meta,
+		       struct ppa_addr ppa, int nchks)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+	sector_t slba;
+
+	nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1);
+	slba = (sector_t)ppa.ppa;
+
+	return dev->ops->get_chk_meta(dev, meta, slba, nchks);
+}
+EXPORT_SYMBOL(nvm_get_chunk_meta);
+
+/*
+ * nvm_set_tgt_bb_tbl - mark blocks in the device's bad block table
+ * @tgt_dev: target issuing the update
+ * @ppas: block addresses in target space
+ * @nr_ppas: number of entries in @ppas (at most NVM_MAX_VLBA)
+ * @type: block state handed to the driver's set_bb_tbl hook
+ *
+ * Builds a temporary request holding the addresses, converts them to the
+ * device format and passes them to the driver.
+ *
+ * Returns 0 on success, -EINVAL when @nr_ppas is too large or the driver
+ * reports a failure, or the error from nvm_set_rqd_ppalist() (-ENOMEM) when
+ * the address list cannot be allocated.
+ */
+int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
+		       int nr_ppas, int type)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct nvm_rq rqd;
+	int ret;
+
+	if (nr_ppas > NVM_MAX_VLBA) {
+		pr_err("nvm: unable to update all blocks atomically\n");
+		return -EINVAL;
+	}
+
+	memset(&rqd, 0, sizeof(struct nvm_rq));
+
+	/*
+	 * Without this check a failed allocation leaves rqd.ppa_list NULL
+	 * and the address conversion below would dereference it.
+	 */
+	ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas);
+	if (ret)
+		return ret;
+
+	nvm_rq_tgt_to_dev(tgt_dev, &rqd);
+
+	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
+	nvm_free_rqd_ppalist(tgt_dev, &rqd);
+	if (ret) {
+		pr_err("nvm: failed bb mark\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(nvm_set_tgt_bb_tbl);
+
+/*
+ * Submit a request through the driver's submit_io hook.
+ *
+ * Addresses are converted to the device format before submission. If the
+ * hook returns an error they are converted back, so the caller sees its own
+ * address format again. Returns -ENODEV when the driver has no submit_io
+ * hook, otherwise the hook's return value.
+ */
+int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+	int ret;
+
+	if (!dev->ops->submit_io)
+		return -ENODEV;
+
+	nvm_rq_tgt_to_dev(tgt_dev, rqd);
+	rqd->dev = tgt_dev;
+
+	ret = dev->ops->submit_io(dev, rqd);
+	if (!ret)
+		return 0;
+
+	/* In case of error, fail with right address format */
+	nvm_rq_dev_to_tgt(tgt_dev, rqd);
+
+	return ret;
+}
+EXPORT_SYMBOL(nvm_submit_io);
+
+/*
+ * Submit a request through the driver's submit_io_sync hook.
+ *
+ * Addresses are converted to the device format before the call and always
+ * converted back to target space afterwards, whatever the hook returned.
+ * Returns -ENODEV when the driver has no submit_io_sync hook, otherwise the
+ * hook's return value.
+ */
+int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+	int status;
+
+	if (!dev->ops->submit_io_sync)
+		return -ENODEV;
+
+	nvm_rq_tgt_to_dev(tgt_dev, rqd);
+	rqd->dev = tgt_dev;
+
+	status = dev->ops->submit_io_sync(dev, rqd);
+
+	/* hand the request back in the caller's address format */
+	nvm_rq_dev_to_tgt(tgt_dev, rqd);
+
+	return status;
+}
+EXPORT_SYMBOL(nvm_submit_io_sync);
+
+/*
+ * Complete a request: when it is bound to a target device, convert its
+ * addresses back to target space, then invoke the end_io callback if one is
+ * set.
+ */
+void nvm_end_io(struct nvm_rq *rqd)
+{
+	/* Convert address space */
+	if (rqd->dev)
+		nvm_rq_dev_to_tgt(rqd->dev, rqd);
+
+	if (rqd->end_io)
+		rqd->end_io(rqd);
+}
+EXPORT_SYMBOL(nvm_end_io);
+
+/*
+ * Fold a bad block list from its per-plane representation into one entry
+ * per virtual block. The fold happens in place at the front of @blks.
+ *
+ * For each block the state of the first plane that has NVM_BLK_T_BAD or
+ * NVM_BLK_T_GRWN_BAD set wins; if no plane is bad, the state of plane 0 is
+ * used.
+ *
+ * Returns the folded size (geo->num_chk), or -EINVAL when @nr_blks is not
+ * num_chk * pln_mode.
+ */
+int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
+{
+	struct nvm_geo *geo = &dev->geo;
+	const int bad_mask = NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD;
+	int blk, pl;
+
+	if (nr_blks != geo->num_chk * geo->pln_mode)
+		return -EINVAL;
+
+	for (blk = 0; blk < geo->num_chk; blk++) {
+		int base = blk * geo->pln_mode;
+		int state = blks[base];
+
+		/* Bad blocks on any planes take precedence over other types */
+		for (pl = 0; pl < geo->pln_mode; pl++) {
+			if (!(blks[base + pl] & bad_mask))
+				continue;
+
+			state = blks[base + pl];
+			break;
+		}
+
+		/* blk <= base, so unread plane entries are never clobbered */
+		blks[blk] = state;
+	}
+
+	return geo->num_chk;
+}
+EXPORT_SYMBOL(nvm_bb_tbl_fold);
+
+/*
+ * Read the bad block table for @ppa through the driver's get_bb_tbl hook.
+ * @ppa is given in target space and converted to the device format first
+ * (the conversion works on the by-value copy); the hook's result is
+ * returned.
+ */
+int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
+		       u8 *blks)
+{
+	struct nvm_dev *parent = tgt_dev->parent;
+
+	nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1);
+
+	return parent->ops->get_bb_tbl(parent, ppa, blks);
+}
+EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
+
+/*
+ * Set up the core per-device state: the lun reservation bitmap (one bit per
+ * lun), the area and target lists, the locks and the reverse map.
+ *
+ * Returns 0 on success, -ENOMEM if the bitmap cannot be allocated, or the
+ * error from nvm_register_map() (the bitmap is freed again in that case).
+ */
+static int nvm_core_init(struct nvm_dev *dev)
+{
+	int nr_longs = BITS_TO_LONGS(dev->geo.all_luns);
+	int ret;
+
+	dev->lun_map = kcalloc(nr_longs, sizeof(unsigned long), GFP_KERNEL);
+	if (!dev->lun_map)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&dev->area_list);
+	INIT_LIST_HEAD(&dev->targets);
+	mutex_init(&dev->mlock);
+	spin_lock_init(&dev->lock);
+
+	ret = nvm_register_map(dev);
+	if (ret)
+		kfree(dev->lun_map);
+
+	return ret;
+}
+
+/*
+ * Release a device: destroy its DMA pool if one exists, free the reverse
+ * map and the lun bitmap, then the device structure itself. A NULL @dev is
+ * ignored.
+ */
+static void nvm_free(struct nvm_dev *dev)
+{
+	void *pool;
+
+	if (!dev)
+		return;
+
+	pool = dev->dma_pool;
+	if (pool)
+		dev->ops->destroy_dma_pool(pool);
+
+	nvm_unregister_map(dev);
+	kfree(dev->lun_map);
+	kfree(dev);
+}
+
+/*
+ * Identify the device through the driver's identity hook and initialise the
+ * core structures.
+ *
+ * Returns 0 on success, -EINVAL when identification fails, or the error
+ * from nvm_core_init().
+ */
+static int nvm_init(struct nvm_dev *dev)
+{
+	struct nvm_geo *geo = &dev->geo;
+	int ret;
+
+	if (dev->ops->identity(dev)) {
+		pr_err("nvm: device could not be identified\n");
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	pr_debug("nvm: ver:%u.%u nvm_vendor:%x\n",
+		 geo->major_ver_id, geo->minor_ver_id,
+		 geo->vmnt);
+
+	ret = nvm_core_init(dev);
+	if (ret) {
+		pr_err("nvm: could not initialize core structures.\n");
+		goto fail;
+	}
+
+	pr_info("nvm: registered %s [%u/%u/%u/%u/%u]\n",
+		dev->name, geo->ws_min, geo->ws_opt,
+		geo->num_chk, geo->all_luns,
+		geo->num_ch);
+
+	return 0;
+
+fail:
+	pr_err("nvm: failed to initialize nvm\n");
+	return ret;
+}
+
+/*
+ * Allocate a zeroed nvm_dev on NUMA node @node. Returns whatever
+ * kzalloc_node() returned.
+ */
+struct nvm_dev *nvm_alloc_dev(int node)
+{
+	struct nvm_dev *dev;
+
+	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+
+	return dev;
+}
+EXPORT_SYMBOL(nvm_alloc_dev);
+
+/*
+ * Register a device with the LightNVM core.
+ *
+ * The device must have a request queue and an ops table. A DMA pool named
+ * "ppalist" is created, the device is initialised and then added to
+ * nvm_devices under nvm_lock.
+ *
+ * Returns 0 on success, -EINVAL for a missing queue/ops, -ENOMEM when the
+ * pool cannot be created, or the error from nvm_init() (the pool is
+ * destroyed again in that case).
+ */
+int nvm_register(struct nvm_dev *dev)
+{
+	int ret;
+
+	if (!dev->q || !dev->ops)
+		return -EINVAL;
+
+	dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist");
+	if (!dev->dma_pool) {
+		pr_err("nvm: could not create dma pool\n");
+		return -ENOMEM;
+	}
+
+	ret = nvm_init(dev);
+	if (ret) {
+		dev->ops->destroy_dma_pool(dev->dma_pool);
+		return ret;
+	}
+
+	/* register device with a supported media manager */
+	down_write(&nvm_lock);
+	list_add(&dev->devices, &nvm_devices);
+	up_write(&nvm_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(nvm_register);
+
+/*
+ * Unregister a device: remove (non-gracefully) every target whose parent is
+ * @dev, take the device off nvm_devices and free it.
+ */
+void nvm_unregister(struct nvm_dev *dev)
+{
+	struct nvm_target *tgt, *next;
+
+	mutex_lock(&dev->mlock);
+	list_for_each_entry_safe(tgt, next, &dev->targets, list) {
+		if (tgt->dev->parent == dev)
+			__nvm_remove_target(tgt, false);
+	}
+	mutex_unlock(&dev->mlock);
+
+	down_write(&nvm_lock);
+	list_del(&dev->devices);
+	up_write(&nvm_lock);
+
+	nvm_free(dev);
+}
+EXPORT_SYMBOL(nvm_unregister);
+
+/*
+ * Look up the device named in the create request (under nvm_lock) and
+ * create the target on it. Returns -EINVAL when the device is unknown,
+ * otherwise the result of nvm_create_tgt().
+ */
+static int __nvm_configure_create(struct nvm_ioctl_create *create)
+{
+	struct nvm_dev *dev;
+
+	down_write(&nvm_lock);
+	dev = nvm_find_nvm_dev(create->dev);
+	up_write(&nvm_lock);
+
+	if (dev)
+		return nvm_create_tgt(dev, create);
+
+	pr_err("nvm: device not found\n");
+	return -EINVAL;
+}
+
+/*
+ * NVM_INFO ioctl: report the subsystem version and the registered target
+ * types to user space.
+ *
+ * The user's structure is copied in, the version and target entries are
+ * filled and the structure is copied back. At most ARRAY_SIZE(info->tgts)
+ * target types are reported; further ones are silently left out so the
+ * array is never overrun.
+ *
+ * Returns 0 on success, the memdup_user() error when the request cannot be
+ * copied in, or -EFAULT when the result cannot be copied out.
+ */
+static long nvm_ioctl_info(struct file *file, void __user *arg)
+{
+	struct nvm_ioctl_info *info;
+	struct nvm_tgt_type *tt;
+	int tgt_iter = 0;
+
+	info = memdup_user(arg, sizeof(struct nvm_ioctl_info));
+	if (IS_ERR(info))
+		return PTR_ERR(info);
+
+	info->version[0] = NVM_VERSION_MAJOR;
+	info->version[1] = NVM_VERSION_MINOR;
+	info->version[2] = NVM_VERSION_PATCH;
+
+	down_write(&nvm_tgtt_lock);
+	list_for_each_entry(tt, &nvm_tgt_types, list) {
+		struct nvm_ioctl_info_tgt *tgt;
+
+		/* never write past the fixed-size reply array */
+		if (tgt_iter >= ARRAY_SIZE(info->tgts))
+			break;
+
+		tgt = &info->tgts[tgt_iter];
+
+		tgt->version[0] = tt->version[0];
+		tgt->version[1] = tt->version[1];
+		tgt->version[2] = tt->version[2];
+		strncpy(tgt->tgtname, tt->name, NVM_TTYPE_NAME_MAX);
+		/* strncpy() does not terminate a name that fills the buffer */
+		tgt->tgtname[NVM_TTYPE_NAME_MAX - 1] = '\0';
+
+		tgt_iter++;
+	}
+
+	info->tgtsize = tgt_iter;
+	up_write(&nvm_tgtt_lock);
+
+	if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) {
+		kfree(info);
+		return -EFAULT;
+	}
+
+	kfree(info);
+	return 0;
+}
+
+/*
+ * NVM_GET_DEVICES ioctl: report the registered devices to user space.
+ *
+ * One entry of devices->info[] is filled per device while holding nvm_lock.
+ * The reply array has a fixed size; once it is full the remaining devices
+ * are not reported and an error is logged.
+ *
+ * Returns 0 on success, -ENOMEM when the reply cannot be allocated, or
+ * -EFAULT when it cannot be copied to user space.
+ */
+static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
+{
+	struct nvm_ioctl_get_devices *devices;
+	struct nvm_dev *dev;
+	int i = 0;
+
+	devices = kzalloc(sizeof(struct nvm_ioctl_get_devices), GFP_KERNEL);
+	if (!devices)
+		return -ENOMEM;
+
+	down_write(&nvm_lock);
+	list_for_each_entry(dev, &nvm_devices, devices) {
+		struct nvm_ioctl_device_info *info;
+
+		/*
+		 * Check before touching the next slot: testing "i > 31"
+		 * after the increment let one more device be written one
+		 * entry past the end of devices->info[].
+		 */
+		if (i >= ARRAY_SIZE(devices->info)) {
+			pr_err("nvm: max 31 devices can be reported.\n");
+			break;
+		}
+
+		info = &devices->info[i];
+
+		strlcpy(info->devname, dev->name, sizeof(info->devname));
+
+		/* kept for compatibility */
+		info->bmversion[0] = 1;
+		info->bmversion[1] = 0;
+		info->bmversion[2] = 0;
+		strlcpy(info->bmname, "gennvm", sizeof(info->bmname));
+		i++;
+	}
+	up_write(&nvm_lock);
+
+	devices->nr_devices = i;
+
+	if (copy_to_user(arg, devices,
+			 sizeof(struct nvm_ioctl_get_devices))) {
+		kfree(devices);
+		return -EFAULT;
+	}
+
+	kfree(devices);
+	return 0;
+}
+
+/*
+ * NVM_DEV_CREATE ioctl: copy in the request, sanity-check it and create the
+ * target.
+ *
+ * Rejected with -EINVAL: an extended config with a non-zero reserved field,
+ * and any flag other than NVM_TARGET_FACTORY. The three name fields are
+ * forcibly NUL-terminated. Returns -EFAULT when the request cannot be
+ * copied in, otherwise the result of __nvm_configure_create().
+ */
+static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
+{
+	struct nvm_ioctl_create create;
+
+	if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create)))
+		return -EFAULT;
+
+	if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED &&
+	    create.conf.e.rsv != 0) {
+		pr_err("nvm: reserved config field in use\n");
+		return -EINVAL;
+	}
+
+	/* user space is not trusted to terminate its strings */
+	create.dev[DISK_NAME_LEN - 1] = '\0';
+	create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0';
+	create.tgtname[DISK_NAME_LEN - 1] = '\0';
+
+	/* Check for valid flags: only NVM_TARGET_FACTORY may be set */
+	if (create.flags & ~NVM_TARGET_FACTORY) {
+		pr_err("nvm: flag not supported\n");
+		return -EINVAL;
+	}
+
+	return __nvm_configure_create(&create);
+}
+
+/*
+ * NVM_DEV_REMOVE ioctl: remove the named target from whichever registered
+ * device carries it.
+ *
+ * Returns -EFAULT when the request cannot be copied in, -EINVAL when any
+ * flag is set, 0 when a target was removed, and otherwise the last result
+ * of nvm_remove_tgt() (1 for "not found"; 0 when no device is registered).
+ *
+ * NOTE(review): nvm_devices is walked here without taking nvm_lock, unlike
+ * the other walkers in this file - confirm whether that is intended.
+ */
+static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
+{
+	struct nvm_ioctl_remove remove;
+	struct nvm_dev *dev;
+	int ret = 0;
+
+	if (copy_from_user(&remove, arg, sizeof(struct nvm_ioctl_remove)))
+		return -EFAULT;
+
+	remove.tgtname[DISK_NAME_LEN - 1] = '\0';
+
+	if (remove.flags != 0) {
+		pr_err("nvm: no flags supported\n");
+		return -EINVAL;
+	}
+
+	list_for_each_entry(dev, &nvm_devices, devices) {
+		ret = nvm_remove_tgt(dev, &remove);
+		if (ret == 0)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * NVM_DEV_INIT ioctl, kept for compatibility reasons: the request is only
+ * validated. Returns -EFAULT when it cannot be copied in, -EINVAL when any
+ * flag is set, 0 otherwise.
+ */
+static long nvm_ioctl_dev_init(struct file *file, void __user *arg)
+{
+	struct nvm_ioctl_dev_init init;
+
+	if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init)))
+		return -EFAULT;
+
+	if (init.flags == 0)
+		return 0;
+
+	pr_err("nvm: no flags supported\n");
+	return -EINVAL;
+}
+
+/*
+ * NVM_DEV_FACTORY ioctl, kept for compatibility reasons: the request is
+ * only validated. Returns -EFAULT when it cannot be copied in, -EINVAL when
+ * a flag bit outside (NVM_FACTORY_NR_BITS - 1) is set, 0 otherwise.
+ */
+static long nvm_ioctl_dev_factory(struct file *file, void __user *arg)
+{
+	struct nvm_ioctl_dev_factory fact;
+	bool bad_flags;
+
+	if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory)))
+		return -EFAULT;
+
+	fact.dev[DISK_NAME_LEN - 1] = '\0';
+
+	bad_flags = (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) != 0;
+
+	return bad_flags ? -EINVAL : 0;
+}
+
+/*
+ * ioctl entry point of the control device. Requires CAP_SYS_ADMIN (-EPERM
+ * otherwise) and dispatches to the handler for @cmd; a command that is not
+ * listed here yields 0.
+ */
+static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	long ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case NVM_INFO:
+		ret = nvm_ioctl_info(file, argp);
+		break;
+	case NVM_GET_DEVICES:
+		ret = nvm_ioctl_get_devices(file, argp);
+		break;
+	case NVM_DEV_CREATE:
+		ret = nvm_ioctl_dev_create(file, argp);
+		break;
+	case NVM_DEV_REMOVE:
+		ret = nvm_ioctl_dev_remove(file, argp);
+		break;
+	case NVM_DEV_INIT:
+		ret = nvm_ioctl_dev_init(file, argp);
+		break;
+	case NVM_DEV_FACTORY:
+		ret = nvm_ioctl_dev_factory(file, argp);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+/* File operations of the control device; all requests go through ioctl. */
+static const struct file_operations _ctl_fops = {
+ .open = nonseekable_open,
+ .unlocked_ioctl = nvm_ctl_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Misc device for the control interface: dynamic minor, name "lightnvm",
+ * node name "lightnvm/control", served by _ctl_fops.
+ */
+static struct miscdevice _nvm_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "lightnvm",
+ .nodename = "lightnvm/control",
+ .fops = &_ctl_fops,
+};
+builtin_misc_device(_nvm_misc);
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
new file mode 100644
index 000000000..f565a56b8
--- /dev/null
+++ b/drivers/lightnvm/pblk-cache.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-cache.c - pblk's write cache
+ */
+
+#include "pblk.h"
+
+/*
+ * Copy the payload of a user write bio into pblk's write buffer.
+ *
+ * Room for pblk_get_secs(bio) entries is requested through
+ * pblk_rb_may_write_user(). While that call answers NVM_IO_REQUEUE the request
+ * is retried after io_schedule(); on NVM_IO_ERR the pipeline is stopped and no
+ * data is copied. I/O accounting is started on entry and ended on every exit
+ * path.
+ *
+ * Returns the last value obtained from pblk_rb_may_write_user().
+ */
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
+{
+ struct request_queue *q = pblk->dev->q;
+ struct pblk_w_ctx w_ctx;
+ sector_t lba = pblk_get_lba(bio);
+ unsigned long start_time = jiffies;
+ unsigned int bpos, pos;
+ int nr_entries = pblk_get_secs(bio);
+ int i, ret;
+
+ generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio),
+ &pblk->disk->part0);
+
+ /* Update the write buffer head (mem) with the entries that we can
+ * write. The write in itself cannot fail, so there is no need to
+ * rollback from here on.
+ */
+retry:
+ ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos);
+ switch (ret) {
+ case NVM_IO_REQUEUE:
+ io_schedule();
+ goto retry;
+ case NVM_IO_ERR:
+ pblk_pipeline_stop(pblk);
+ goto out;
+ }
+
+ pblk_ppa_set_empty(&w_ctx.ppa);
+ w_ctx.flags = flags;
+ /* A preflush bio tags its entries as flush entries and kicks the
+ * writer right away.
+ */
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ w_ctx.flags |= PBLK_FLUSH_ENTRY;
+ pblk_write_kick(pblk);
+ }
+
+ /* Nothing to copy for a bio without data */
+ if (unlikely(!bio_has_data(bio)))
+ goto out;
+
+ /* One buffer entry per PBLK_EXPOSED_PAGE_SIZE chunk of the bio */
+ for (i = 0; i < nr_entries; i++) {
+ void *data = bio_data(bio);
+
+ w_ctx.lba = lba + i;
+
+ pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i);
+ pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos);
+
+ bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+ }
+
+ atomic64_add(nr_entries, &pblk->user_wa);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_add(nr_entries, &pblk->inflight_writes);
+ atomic_long_add(nr_entries, &pblk->req_writes);
+#endif
+
+ pblk_rl_inserted(&pblk->rl, nr_entries);
+
+out:
+ generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time);
+ pblk_write_should_kick(pblk);
+ return ret;
+}
+
+/*
+ * Copy the sectors of a GC request into pblk's write buffer.
+ *
+ * On GC the incoming lbas are not necessarily sequential. Also, some of the
+ * lbas might not be valid entries, which are marked as empty by the GC thread
+ * (ADDR_EMPTY) and are skipped here. The data buffer is consumed only for the
+ * entries that are written.
+ *
+ * Blocks (io_schedule() and retry) until pblk_rb_may_write_gc() grants room
+ * for gc_rq->secs_to_gc entries. Always returns NVM_IO_OK.
+ */
+int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
+{
+ struct pblk_w_ctx w_ctx;
+ unsigned int bpos, pos;
+ void *data = gc_rq->data;
+ int i, valid_entries;
+
+ /* Update the write buffer head (mem) with the entries that we can
+ * write. The write in itself cannot fail, so there is no need to
+ * rollback from here on.
+ */
+retry:
+ if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) {
+ io_schedule();
+ goto retry;
+ }
+
+ w_ctx.flags = PBLK_IOTYPE_GC;
+ pblk_ppa_set_empty(&w_ctx.ppa);
+
+ for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) {
+ if (gc_rq->lba_list[i] == ADDR_EMPTY)
+ continue;
+
+ w_ctx.lba = gc_rq->lba_list[i];
+
+ /* Buffer positions advance only with valid entries */
+ pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries);
+ pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line,
+ gc_rq->paddr_list[i], pos);
+
+ data += PBLK_EXPOSED_PAGE_SIZE;
+ valid_entries++;
+ }
+
+ /* The reserved count must match what was actually written */
+ WARN_ONCE(gc_rq->secs_to_gc != valid_entries,
+ "pblk: inconsistent GC write\n");
+
+ atomic64_add(valid_entries, &pblk->gc_wa);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_add(valid_entries, &pblk->inflight_writes);
+ atomic_long_add(valid_entries, &pblk->recov_gc_writes);
+#endif
+
+ pblk_write_should_kick(pblk);
+ return NVM_IO_OK;
+}
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
new file mode 100644
index 000000000..8dce31dbf
--- /dev/null
+++ b/drivers/lightnvm/pblk-core.c
@@ -0,0 +1,2095 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-core.c - pblk's core functionality
+ *
+ */
+
+#include "pblk.h"
+
+/*
+ * Work handler that marks one block as grown bad on the device.
+ *
+ * The address is taken from line_ws->priv. A failure of nvm_set_tgt_bb_tbl()
+ * is only logged. The address buffer is kfree()d and the work item is
+ * returned to gen_ws_pool in all cases.
+ */
+static void pblk_line_mark_bb(struct work_struct *work)
+{
+ struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
+ ws);
+ struct pblk *pblk = line_ws->pblk;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct ppa_addr *ppa = line_ws->priv;
+ int ret;
+
+ ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD);
+ if (ret) {
+ struct pblk_line *line;
+ int pos;
+
+ line = &pblk->lines[pblk_ppa_to_line(*ppa)];
+ pos = pblk_ppa_to_pos(&dev->geo, *ppa);
+
+ pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n",
+ line->id, pos);
+ }
+
+ kfree(ppa);
+ mempool_free(line_ws, &pblk->gen_ws_pool);
+}
+
+/*
+ * Account a failed erase and record the block as bad in the line.
+ *
+ * Bumps the erase_failed counter, drops the line's blk_in_line count and sets
+ * the block's bit in blk_bitmap (logging an error if it was already set).
+ * Unless the geometry reports the 2.0 spec, a copy of the address is queued on
+ * bb_wq so that pblk_line_mark_bb() marks it on the device; if the atomic
+ * allocation of that copy fails, the device-side marking is silently skipped.
+ */
+static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
+ struct ppa_addr ppa_addr)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct ppa_addr *ppa;
+ int pos = pblk_ppa_to_pos(geo, ppa_addr);
+
+ pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos);
+ atomic_long_inc(&pblk->erase_failed);
+
+ atomic_dec(&line->blk_in_line);
+ if (test_and_set_bit(pos, line->blk_bitmap))
+ pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n",
+ line->id, pos);
+
+ /* Not necessary to mark bad blocks on 2.0 spec. */
+ if (geo->version == NVM_OCSSD_SPEC_20)
+ return;
+
+ /* Freed by pblk_line_mark_bb() once the work has run */
+ ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC);
+ if (!ppa)
+ return;
+
+ *ppa = ppa_addr;
+ pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb,
+ GFP_ATOMIC, pblk->bb_wq);
+}
+
+/*
+ * Common erase completion: update the line and chunk state for the erased
+ * address in rqd->ppa_addr.
+ *
+ * Decrements the line's left_seblks, sets the chunk state to OFFLINE (and
+ * marks the block bad) when rqd->error is set or to FREE otherwise, and
+ * drops pblk->inflight_io.
+ */
+static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct nvm_chk_meta *chunk;
+ struct pblk_line *line;
+ int pos;
+
+ line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)];
+ pos = pblk_ppa_to_pos(geo, rqd->ppa_addr);
+ chunk = &line->chks[pos];
+
+ atomic_dec(&line->left_seblks);
+
+ if (rqd->error) {
+ chunk->state = NVM_CHK_ST_OFFLINE;
+ pblk_mark_bb(pblk, line, rqd->ppa_addr);
+ } else {
+ chunk->state = NVM_CHK_ST_FREE;
+ }
+
+ atomic_dec(&pblk->inflight_io);
+}
+
+/* Erase completion assumes that only one block is erased at the time.
+ *
+ * Runs the common erase completion on the request and then returns the
+ * request to e_rq_pool. The pblk instance is taken from rqd->private.
+ */
+static void pblk_end_io_erase(struct nvm_rq *rqd)
+{
+ struct pblk *pblk = rqd->private;
+
+ __pblk_end_io_erase(pblk, rqd);
+ mempool_free(rqd, &pblk->e_rq_pool);
+}
+
+/*
+ * Get information for all chunks from the device.
+ *
+ * Allocates a zeroed array of geo->all_chunks entries and fills it through
+ * nvm_get_chunk_meta(), starting at address 0.
+ *
+ * Returns the array on success, ERR_PTR(-ENOMEM) if it cannot be allocated
+ * and ERR_PTR(-EIO) if the device query fails.
+ *
+ * The caller is responsible for freeing the returned structure
+ */
+struct nvm_chk_meta *pblk_chunk_get_info(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct nvm_chk_meta *meta;
+	struct ppa_addr ppa;
+	int ret;
+
+	ppa.ppa = 0;
+
+	/* kcalloc() fails cleanly if count * size would overflow */
+	meta = kcalloc(geo->all_chunks, sizeof(*meta), GFP_KERNEL);
+	if (!meta)
+		return ERR_PTR(-ENOMEM);
+
+	ret = nvm_get_chunk_meta(dev, meta, ppa, geo->all_chunks);
+	if (ret) {
+		kfree(meta);
+		return ERR_PTR(-EIO);
+	}
+
+	return meta;
+}
+
+/*
+ * Return the entry of @meta that corresponds to @ppa.
+ *
+ * The index is computed from the group, parallel unit and chunk fields of the
+ * address, using the geometry's num_chk and num_lun.
+ */
+struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk,
+					struct nvm_chk_meta *meta,
+					struct ppa_addr ppa)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	int idx;
+
+	idx = ppa.m.grp * geo->num_chk * geo->num_lun;
+	idx += ppa.m.pu * geo->num_chk;
+	idx += ppa.m.chk;
+
+	return &meta[idx];
+}
+
+/*
+ * Mark sector @paddr of @line as invalid and decrement the line's valid
+ * sector count. A second invalidation of the same sector only warns.
+ *
+ * If the line is closed and pblk_line_gc_list() selects a new list for it, the
+ * line is moved to the tail of that list, unless it has meanwhile entered the
+ * GC state.
+ */
+void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct list_head *move_list = NULL;
+
+ /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P
+ * table is modified with reclaimed sectors, a check is done to ensure
+ * that newer updates are not overwritten.
+ */
+ spin_lock(&line->lock);
+ WARN_ON(line->state == PBLK_LINESTATE_FREE);
+
+ if (test_and_set_bit(paddr, line->invalid_bitmap)) {
+ WARN_ONCE(1, "pblk: double invalidate\n");
+ spin_unlock(&line->lock);
+ return;
+ }
+ le32_add_cpu(line->vsc, -1);
+
+ if (line->state == PBLK_LINESTATE_CLOSED)
+ move_list = pblk_line_gc_list(pblk, line);
+ spin_unlock(&line->lock);
+
+ if (move_list) {
+ /* gc_lock is taken before line->lock; the line lock was
+ * released above, so the state is checked again here.
+ */
+ spin_lock(&l_mg->gc_lock);
+ spin_lock(&line->lock);
+ /* Prevent moving a line that has just been chosen for GC */
+ if (line->state == PBLK_LINESTATE_GC) {
+ spin_unlock(&line->lock);
+ spin_unlock(&l_mg->gc_lock);
+ return;
+ }
+ spin_unlock(&line->lock);
+
+ list_move_tail(&line->list, move_list);
+ spin_unlock(&l_mg->gc_lock);
+ }
+}
+
+/*
+ * Invalidate the sector addressed by the device address @ppa.
+ *
+ * The address is translated into its line and its in-line sector address and
+ * handed to __pblk_map_invalidate().
+ */
+void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
+{
+	struct pblk_line *line;
+	u64 paddr;
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	/* Callers must ensure that the ppa points to a device address */
+	BUG_ON(pblk_addr_in_cache(ppa));
+	BUG_ON(pblk_ppa_empty(ppa));
+#endif
+
+	line = &pblk->lines[pblk_ppa_to_line(ppa)];
+	paddr = pblk_dev_ppa_to_line_addr(pblk, ppa);
+
+	__pblk_map_invalidate(pblk, line, paddr);
+}
+
+/*
+ * Drop the mapping of @nr_secs logical sectors starting at @slba.
+ *
+ * Under trans_lock, every lba that currently maps to a device address has
+ * that address invalidated, and every lba in the range is then mapped to the
+ * empty address.
+ */
+static void pblk_invalidate_range(struct pblk *pblk, sector_t slba,
+				  unsigned int nr_secs)
+{
+	sector_t end = slba + nr_secs;
+	sector_t cur;
+
+	spin_lock(&pblk->trans_lock);
+	for (cur = slba; cur < end; cur++) {
+		struct ppa_addr mapped = pblk_trans_map_get(pblk, cur);
+
+		/* Cached and empty entries have no device sector to drop */
+		if (!pblk_addr_in_cache(mapped) && !pblk_ppa_empty(mapped))
+			pblk_map_invalidate(pblk, mapped);
+
+		pblk_ppa_set_empty(&mapped);
+		pblk_trans_map_set(pblk, cur, mapped);
+	}
+	spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * Allocate a zeroed request from the mempool that matches @type.
+ *
+ * PBLK_WRITE and PBLK_WRITE_INT use w_rq_pool, PBLK_READ uses r_rq_pool and
+ * any other type uses e_rq_pool.
+ *
+ * Caller must guarantee that the request is a valid type
+ */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type)
+{
+	mempool_t *pool = &pblk->e_rq_pool;
+	int rq_size = pblk_g_rq_size;
+	struct nvm_rq *rqd;
+
+	if (type == PBLK_WRITE || type == PBLK_WRITE_INT) {
+		pool = &pblk->w_rq_pool;
+		rq_size = pblk_w_rq_size;
+	} else if (type == PBLK_READ) {
+		pool = &pblk->r_rq_pool;
+	}
+
+	rqd = mempool_alloc(pool, GFP_KERNEL);
+	memset(rqd, 0, rq_size);
+
+	return rqd;
+}
+
+/* Typically used on completion path. Cannot guarantee request consistency
+ *
+ * Releases a request of the given @type: for PBLK_WRITE the lun_bitmap of the
+ * request's completion context is kfree()d first; the DMA metadata list is
+ * freed if one is attached; the request then goes back to the mempool that
+ * matches @type. An unknown type is logged and nothing is freed.
+ */
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ mempool_t *pool;
+
+ switch (type) {
+ case PBLK_WRITE:
+ kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
+ /* fall through */
+ case PBLK_WRITE_INT:
+ pool = &pblk->w_rq_pool;
+ break;
+ case PBLK_READ:
+ pool = &pblk->r_rq_pool;
+ break;
+ case PBLK_ERASE:
+ pool = &pblk->e_rq_pool;
+ break;
+ default:
+ pblk_err(pblk, "trying to free unknown rqd type\n");
+ return;
+ }
+
+ if (rqd->meta_list)
+ nvm_dev_dma_free(dev->parent, rqd->meta_list,
+ rqd->dma_meta_list);
+ mempool_free(rqd, pool);
+}
+
+/*
+ * Return the pages of @bio to page_bio_pool, skipping the first @off pages.
+ *
+ * A bio vector may cover several pages; pages are counted in units of
+ * PBLK_EXPOSED_PAGE_SIZE across all vectors. Every page from index @off to
+ * the end of the bio is freed; @nr_pages is currently not used to bound the
+ * range.
+ */
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+			 int nr_pages)
+{
+	struct bio_vec *bv;
+	struct page *page;
+	int i, e, nbv = 0;
+
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		bv = &bio->bi_io_vec[i];
+		page = bv->bv_page;
+		/*
+		 * Advance the page pointer for skipped pages too, so that
+		 * when @off falls inside a multi-page vector the pages that
+		 * are freed are the ones at and after @off, not the leading
+		 * ones.
+		 */
+		for (e = 0; e < bv->bv_len;
+		     e += PBLK_EXPOSED_PAGE_SIZE, nbv++, page++) {
+			if (nbv >= off)
+				mempool_free(page, &pblk->page_bio_pool);
+		}
+	}
+}
+
+/*
+ * Append @nr_pages pages from page_bio_pool to @bio, each added with a length
+ * of PBLK_EXPOSED_PAGE_SIZE.
+ *
+ * Returns 0 on success. If a page cannot be added, that page is returned to
+ * the pool, the pages added so far are released through
+ * pblk_bio_free_pages() and -1 is returned.
+ */
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+ int nr_pages)
+{
+ struct request_queue *q = pblk->dev->q;
+ struct page *page;
+ int i, ret;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = mempool_alloc(&pblk->page_bio_pool, flags);
+
+ ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
+ if (ret != PBLK_EXPOSED_PAGE_SIZE) {
+ pblk_err(pblk, "could not add page to bio\n");
+ mempool_free(page, &pblk->page_bio_pool);
+ goto err;
+ }
+ }
+
+ return 0;
+err:
+ /* i pages were added before the failure */
+ pblk_bio_free_pages(pblk, bio, (bio->bi_vcnt - i), i);
+ return -1;
+}
+
+/* Wake up the writer thread and re-arm the write timer to fire 1000 ms from
+ * now.
+ */
+void pblk_write_kick(struct pblk *pblk)
+{
+ wake_up_process(pblk->writer_ts);
+ mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000));
+}
+
+/* Write timer callback; pblk_write_kick() re-arms the timer, so this runs
+ * again on the next expiry.
+ */
+void pblk_write_timer_fn(struct timer_list *t)
+{
+ struct pblk *pblk = from_timer(pblk, t, wtimer);
+
+ /* kick the write thread every tick to flush outstanding data */
+ pblk_write_kick(pblk);
+}
+
+/* Kick the writer only once the write buffer's read count has reached
+ * min_write_pgs.
+ */
+void pblk_write_should_kick(struct pblk *pblk)
+{
+	unsigned int pending = pblk_rb_read_count(&pblk->rwb);
+
+	if (pending < pblk->min_write_pgs)
+		return;
+
+	pblk_write_kick(pblk);
+}
+
+/* Yield the CPU repeatedly until no I/O is accounted as in flight */
+static void pblk_wait_for_meta(struct pblk *pblk)
+{
+	while (atomic_read(&pblk->inflight_io))
+		schedule();
+}
+
+/* Flush the write buffer, then keep kicking the writer and yielding until the
+ * buffer's sync count drops to zero.
+ */
+static void pblk_flush_writer(struct pblk *pblk)
+{
+	pblk_rb_flush(&pblk->rwb);
+
+	while (pblk_rb_sync_count(&pblk->rwb)) {
+		pblk_write_kick(pblk);
+		schedule();
+	}
+}
+
+/*
+ * Pick the GC list a line belongs on, based on its valid sector count (vsc)
+ * and on whether it has seen a write error.
+ *
+ * The line's gc_group is updated when it changes. A vsc larger than
+ * sec_in_line marks the line as corrupt. Must be called with line->lock
+ * held.
+ *
+ * Returns the list the line should be moved to, or NULL if the line already
+ * belongs to the matching group.
+ */
+struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct list_head *move_list = NULL;
+ int vsc = le32_to_cpu(*line->vsc);
+
+ lockdep_assert_held(&line->lock);
+
+ /* A write error takes precedence over the vsc thresholds */
+ if (line->w_err_gc->has_write_err) {
+ if (line->gc_group != PBLK_LINEGC_WERR) {
+ line->gc_group = PBLK_LINEGC_WERR;
+ move_list = &l_mg->gc_werr_list;
+ pblk_rl_werr_line_in(&pblk->rl);
+ }
+ } else if (!vsc) {
+ if (line->gc_group != PBLK_LINEGC_FULL) {
+ line->gc_group = PBLK_LINEGC_FULL;
+ move_list = &l_mg->gc_full_list;
+ }
+ } else if (vsc < lm->high_thrs) {
+ if (line->gc_group != PBLK_LINEGC_HIGH) {
+ line->gc_group = PBLK_LINEGC_HIGH;
+ move_list = &l_mg->gc_high_list;
+ }
+ } else if (vsc < lm->mid_thrs) {
+ if (line->gc_group != PBLK_LINEGC_MID) {
+ line->gc_group = PBLK_LINEGC_MID;
+ move_list = &l_mg->gc_mid_list;
+ }
+ } else if (vsc < line->sec_in_line) {
+ if (line->gc_group != PBLK_LINEGC_LOW) {
+ line->gc_group = PBLK_LINEGC_LOW;
+ move_list = &l_mg->gc_low_list;
+ }
+ } else if (vsc == line->sec_in_line) {
+ if (line->gc_group != PBLK_LINEGC_EMPTY) {
+ line->gc_group = PBLK_LINEGC_EMPTY;
+ move_list = &l_mg->gc_empty_list;
+ }
+ } else {
+ /* vsc > sec_in_line: the count cannot be trusted */
+ line->state = PBLK_LINESTATE_CORRUPT;
+ line->gc_group = PBLK_LINEGC_NONE;
+ move_list = &l_mg->corrupt_list;
+ pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
+ line->id, vsc,
+ line->sec_in_line,
+ lm->high_thrs, lm->mid_thrs);
+ }
+
+ return move_list;
+}
+
+/* Handle a discard bio by invalidating the logical sector range it covers */
+void pblk_discard(struct pblk *pblk, struct bio *bio)
+{
+ sector_t slba = pblk_get_lba(bio);
+ sector_t nr_secs = pblk_get_secs(bio);
+
+ pblk_invalidate_range(pblk, slba, nr_secs);
+}
+
+/* Count a failed write; with CONFIG_NVM_PBLK_DEBUG the failed request is
+ * also printed.
+ */
+void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ atomic_long_inc(&pblk->write_failed);
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+}
+
+/*
+ * Account a read error according to rqd->error.
+ *
+ * Empty-page reads are only counted. High-ECC warnings and ECC/CRC failures
+ * bump their own counters; any other code is logged as unknown. With
+ * CONFIG_NVM_PBLK_DEBUG, everything except an empty-page read also prints the
+ * failed request.
+ */
+void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	switch (rqd->error) {
+	case NVM_RSP_ERR_EMPTYPAGE:
+		/* Empty page read is not necessarily an error (e.g., L2P
+		 * recovery)
+		 */
+		atomic_long_inc(&pblk->read_empty);
+		return;
+	case NVM_RSP_WARN_HIGHECC:
+		atomic_long_inc(&pblk->read_high_ecc);
+		break;
+	case NVM_RSP_ERR_FAILECC:
+	case NVM_RSP_ERR_FAILCRC:
+		atomic_long_inc(&pblk->read_failed);
+		break;
+	default:
+		pblk_err(pblk, "unknown read error:%d\n", rqd->error);
+	}
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+}
+
+/* Set the sec_per_write value, which pblk_calc_secs() uses as the upper bound
+ * of a single write.
+ */
+void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write)
+{
+ pblk->sec_per_write = sec_per_write;
+}
+
+/*
+ * Submit a request to the device through nvm_submit_io() and account it in
+ * pblk->inflight_io.
+ *
+ * With CONFIG_NVM_PBLK_DEBUG the request is checked first and NVM_IO_ERR is
+ * returned if the check fails. Otherwise returns nvm_submit_io()'s result.
+ *
+ * NOTE(review): inflight_io is incremented before the check and the
+ * submission and is not decremented here when an error is returned; confirm
+ * that every caller accounts for that.
+ */
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+
+ atomic_inc(&pblk->inflight_io);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ if (pblk_check_io(pblk, rqd))
+ return NVM_IO_ERR;
+#endif
+
+ return nvm_submit_io(dev, rqd);
+}
+
+/*
+ * Same as pblk_submit_io(), but the request is submitted through
+ * nvm_submit_io_sync().
+ *
+ * NOTE(review): inflight_io is incremented here and must be decremented by
+ * the caller; it is not undone here when an error is returned.
+ */
+int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+
+ atomic_inc(&pblk->inflight_io);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ if (pblk_check_io(pblk, rqd))
+ return NVM_IO_ERR;
+#endif
+
+ return nvm_submit_io_sync(dev, rqd);
+}
+
+/* Completion callback for bios built by pblk_bio_map_addr(): drops the bio
+ * reference.
+ */
+static void pblk_bio_map_addr_endio(struct bio *bio)
+{
+ bio_put(bio);
+}
+
+/*
+ * Build a bio that covers the kernel buffer @data.
+ *
+ * For PBLK_KMALLOC_META buffers the work is delegated to bio_map_kern() with
+ * @len bytes. For any other @alloc_type the buffer is treated as vmalloc
+ * memory: @nr_secs pages are looked up with vmalloc_to_page() and added one
+ * PAGE_SIZE at a time.
+ *
+ * Returns the bio, or an ERR_PTR() on failure (-ENOMEM for the failures
+ * handled in this function).
+ */
+struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
+ unsigned int nr_secs, unsigned int len,
+ int alloc_type, gfp_t gfp_mask)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ void *kaddr = data;
+ struct page *page;
+ struct bio *bio;
+ int i, ret;
+
+ if (alloc_type == PBLK_KMALLOC_META)
+ return bio_map_kern(dev->q, kaddr, len, gfp_mask);
+
+ bio = bio_kmalloc(gfp_mask, nr_secs);
+ if (!bio)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < nr_secs; i++) {
+ page = vmalloc_to_page(kaddr);
+ if (!page) {
+ pblk_err(pblk, "could not map vmalloc bio\n");
+ bio_put(bio);
+ bio = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
+ pblk_err(pblk, "could not add page to bio\n");
+ bio_put(bio);
+ bio = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ kaddr += PAGE_SIZE;
+ }
+
+ /* The bio releases itself on completion */
+ bio->bi_end_io = pblk_bio_map_addr_endio;
+out:
+ return bio;
+}
+
+/*
+ * Compute how many sectors to write next.
+ *
+ * Returns sec_per_write when at least that many sectors are available,
+ * otherwise the largest multiple of min_write_pgs that fits in @secs_avail.
+ * With fewer than min_write_pgs available, returns min_write_pgs if
+ * @secs_to_flush is non-zero and 0 otherwise.
+ */
+int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
+		   unsigned long secs_to_flush)
+{
+	int max = pblk->sec_per_write;
+	int min = pblk->min_write_pgs;
+
+	if (secs_avail >= max)
+		return max;
+
+	if (secs_avail >= min)
+		return min * (secs_avail / min);
+
+	return secs_to_flush ? min : 0;
+}
+
+/*
+ * Give back the @nr_secs sectors that precede the next free sector of @line.
+ *
+ * With addr being the first unmapped sector at or after line->cur_sec, the
+ * sectors [addr - nr_secs, addr - 1] are cleared in map_bitmap and cur_sec is
+ * rewound to addr - nr_secs. A warning is raised for every sector in that
+ * range that was not marked as mapped.
+ */
+void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
+{
+	u64 addr;
+	int i;
+
+	spin_lock(&line->lock);
+	addr = find_next_zero_bit(line->map_bitmap,
+				  pblk->lm.sec_per_line, line->cur_sec);
+	line->cur_sec = addr - nr_secs;
+
+	/*
+	 * Walk upwards from the rewound position: walking downwards would
+	 * clear sectors below the released range and leave its upper part
+	 * mapped.
+	 */
+	for (i = 0; i < nr_secs; i++)
+		WARN_ON(!test_and_clear_bit(line->cur_sec + i,
+					    line->map_bitmap));
+	spin_unlock(&line->lock);
+}
+
+/*
+ * Reserve @nr_secs consecutive sectors in @line, starting at the first
+ * unmapped sector at or after line->cur_sec. Must be called with line->lock
+ * held.
+ *
+ * The sectors are marked in map_bitmap and cur_sec is left pointing past the
+ * reserved range. A request that would go beyond sec_per_line warns and is
+ * truncated.
+ *
+ * Returns the in-line address of the first reserved sector.
+ */
+u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
+{
+ u64 addr;
+ int i;
+
+ lockdep_assert_held(&line->lock);
+
+ /* logic error: ppa out-of-bounds. Prevent generating bad address */
+ if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) {
+ WARN(1, "pblk: page allocation out of bounds\n");
+ nr_secs = pblk->lm.sec_per_line - line->cur_sec;
+ }
+
+ line->cur_sec = addr = find_next_zero_bit(line->map_bitmap,
+ pblk->lm.sec_per_line, line->cur_sec);
+ for (i = 0; i < nr_secs; i++, line->cur_sec++)
+ WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap));
+
+ return addr;
+}
+
+/*
+ * Locked variant of __pblk_alloc_page() that also subtracts @nr_secs from the
+ * line's left_msecs, warning if that count becomes negative.
+ *
+ * Returns the in-line address of the first reserved sector.
+ */
+u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
+{
+ u64 addr;
+
+ /* Lock needed in case a write fails and a recovery needs to remap
+ * failed write buffer entries
+ */
+ spin_lock(&line->lock);
+ addr = __pblk_alloc_page(pblk, line, nr_secs);
+ line->left_msecs -= nr_secs;
+ WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n");
+ spin_unlock(&line->lock);
+
+ return addr;
+}
+
+/*
+ * Return the in-line address of the first unmapped sector at or after
+ * line->cur_sec, without reserving it. The lookup runs under line->lock.
+ */
+u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line)
+{
+	u64 next_free;
+
+	spin_lock(&line->lock);
+	next_free = find_next_zero_bit(line->map_bitmap,
+				       pblk->lm.sec_per_line, line->cur_sec);
+	spin_unlock(&line->lock);
+
+	return next_free;
+}
+
+/*
+ * Submit emeta to one LUN in the raid line at the time to avoid a deadlock when
+ * taking the per LUN semaphore.
+ *
+ * Reads (PBLK_READ) or writes (PBLK_WRITE) the line's end metadata between
+ * @emeta_buf and the device, starting at in-line address @paddr. The transfer
+ * is split into synchronous requests sized by pblk_calc_secs(). On writes the
+ * sectors are allocated from the line; on reads, positions flagged in
+ * blk_bitmap are skipped.
+ *
+ * One DMA buffer holds both the metadata list and the ppa list and is shared
+ * by all requests. It is owned by this function and is released through the
+ * local pointers on every exit path, because the request descriptor is
+ * re-initialised at the top of each iteration and may not yet reference the
+ * buffer when an error is hit.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported @dir, -ENOMEM if the DMA
+ * buffer cannot be allocated, -EINTR if the emeta addresses run past the
+ * line, or the error reported by bio mapping or I/O submission.
+ */
+static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
+				     void *emeta_buf, u64 paddr, int dir)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	void *ppa_list, *meta_list;
+	struct bio *bio;
+	struct nvm_rq rqd;
+	dma_addr_t dma_ppa_list, dma_meta_list;
+	int min = pblk->min_write_pgs;
+	int left_ppas = lm->emeta_sec[0];
+	int id = line->id;
+	int rq_ppas, rq_len;
+	int cmd_op, bio_op;
+	int i, j;
+	int ret;
+
+	if (dir == PBLK_WRITE) {
+		bio_op = REQ_OP_WRITE;
+		cmd_op = NVM_OP_PWRITE;
+	} else if (dir == PBLK_READ) {
+		bio_op = REQ_OP_READ;
+		cmd_op = NVM_OP_PREAD;
+	} else
+		return -EINVAL;
+
+	meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+				      &dma_meta_list);
+	if (!meta_list)
+		return -ENOMEM;
+
+	/* The ppa list lives in the same DMA buffer, after the metadata */
+	ppa_list = meta_list + pblk_dma_meta_size;
+	dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+
+next_rq:
+	memset(&rqd, 0, sizeof(struct nvm_rq));
+
+	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+	rq_len = rq_ppas * geo->csecs;
+
+	bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
+				l_mg->emeta_alloc_type, GFP_KERNEL);
+	if (IS_ERR(bio)) {
+		ret = PTR_ERR(bio);
+		goto free_rqd_dma;
+	}
+
+	bio->bi_iter.bi_sector = 0; /* internal bio */
+	bio_set_op_attrs(bio, bio_op, 0);
+
+	rqd.bio = bio;
+	rqd.meta_list = meta_list;
+	rqd.ppa_list = ppa_list;
+	rqd.dma_meta_list = dma_meta_list;
+	rqd.dma_ppa_list = dma_ppa_list;
+	rqd.opcode = cmd_op;
+	rqd.nr_ppas = rq_ppas;
+
+	if (dir == PBLK_WRITE) {
+		struct pblk_sec_meta *sec_meta = rqd.meta_list;
+
+		rqd.flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
+		for (i = 0; i < rqd.nr_ppas; ) {
+			spin_lock(&line->lock);
+			paddr = __pblk_alloc_page(pblk, line, min);
+			spin_unlock(&line->lock);
+			for (j = 0; j < min; j++, i++, paddr++) {
+				sec_meta[i].lba = cpu_to_le64(ADDR_EMPTY);
+				rqd.ppa_list[i] =
+					addr_to_gen_ppa(pblk, paddr, id);
+			}
+		}
+	} else {
+		for (i = 0; i < rqd.nr_ppas; ) {
+			struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
+			int pos = pblk_ppa_to_pos(geo, ppa);
+			int read_type = PBLK_READ_RANDOM;
+
+			if (pblk_io_aligned(pblk, rq_ppas))
+				read_type = PBLK_READ_SEQUENTIAL;
+			rqd.flags = pblk_set_read_mode(pblk, read_type);
+
+			/* Skip positions flagged in the line's blk_bitmap */
+			while (test_bit(pos, line->blk_bitmap)) {
+				paddr += min;
+				if (pblk_boundary_paddr_checks(pblk, paddr)) {
+					pblk_err(pblk, "corrupt emeta line:%d\n",
+						 line->id);
+					bio_put(bio);
+					ret = -EINTR;
+					goto free_rqd_dma;
+				}
+
+				ppa = addr_to_gen_ppa(pblk, paddr, id);
+				pos = pblk_ppa_to_pos(geo, ppa);
+			}
+
+			if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
+				pblk_err(pblk, "corrupt emeta line:%d\n",
+					 line->id);
+				bio_put(bio);
+				ret = -EINTR;
+				goto free_rqd_dma;
+			}
+
+			for (j = 0; j < min; j++, i++, paddr++)
+				rqd.ppa_list[i] =
+					addr_to_gen_ppa(pblk, paddr, line->id);
+		}
+	}
+
+	ret = pblk_submit_io_sync(pblk, &rqd);
+	if (ret) {
+		pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
+		bio_put(bio);
+		goto free_rqd_dma;
+	}
+
+	atomic_dec(&pblk->inflight_io);
+
+	if (rqd.error) {
+		if (dir == PBLK_WRITE)
+			pblk_log_write_err(pblk, &rqd);
+		else
+			pblk_log_read_err(pblk, &rqd);
+	}
+
+	emeta_buf += rq_len;
+	left_ppas -= rq_ppas;
+	if (left_ppas)
+		goto next_rq;
+free_rqd_dma:
+	/*
+	 * Free through the locals: rqd is zeroed at next_rq, so on a bio
+	 * mapping failure rqd.meta_list is NULL and the buffer would leak.
+	 */
+	nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
+	return ret;
+}
+
+/*
+ * Return the in-line address where smeta starts: the first position that is
+ * clear in blk_bitmap, multiplied by the geometry's ws_opt.
+ *
+ * Returns -1 (as u64) when every position of the line is set in blk_bitmap.
+ */
+u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	int first_good;
+
+	first_good = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+	if (first_good < lm->blk_per_line)
+		return first_good * pblk->dev->geo.ws_opt;
+
+	/* This usually only happens on bad lines */
+	return -1;
+}
+
+/*
+ * Read or write the start metadata (smeta) of @line with one synchronous
+ * request covering lm->smeta_sec sectors, starting at in-line address @paddr.
+ *
+ * @dir is PBLK_WRITE, PBLK_READ or PBLK_READ_RECOV. On writes, the lba of
+ * every smeta sector is recorded as ADDR_EMPTY both in the request metadata
+ * and in the line's emeta lba list.
+ *
+ * Returns 0 on success, 1 if a write completed with an error, -EINVAL for an
+ * unsupported @dir, -ENOMEM if the DMA buffer cannot be allocated, or the
+ * error reported by bio mapping or I/O submission. A read that completes with
+ * an error still returns 0; it is logged only for PBLK_READ.
+ */
+static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr, int dir)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct bio *bio;
+ struct nvm_rq rqd;
+ __le64 *lba_list = NULL;
+ int i, ret;
+ int cmd_op, bio_op;
+ int flags;
+
+ if (dir == PBLK_WRITE) {
+ bio_op = REQ_OP_WRITE;
+ cmd_op = NVM_OP_PWRITE;
+ flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
+ lba_list = emeta_to_lbas(pblk, line->emeta->buf);
+ } else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) {
+ bio_op = REQ_OP_READ;
+ cmd_op = NVM_OP_PREAD;
+ flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
+ } else
+ return -EINVAL;
+
+ memset(&rqd, 0, sizeof(struct nvm_rq));
+
+ rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd.dma_meta_list);
+ if (!rqd.meta_list)
+ return -ENOMEM;
+
+ /* The ppa list lives in the same DMA buffer, after the metadata */
+ rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size;
+ rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size;
+
+ bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
+ if (IS_ERR(bio)) {
+ ret = PTR_ERR(bio);
+ goto free_ppa_list;
+ }
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, bio_op, 0);
+
+ rqd.bio = bio;
+ rqd.opcode = cmd_op;
+ rqd.flags = flags;
+ rqd.nr_ppas = lm->smeta_sec;
+
+ for (i = 0; i < lm->smeta_sec; i++, paddr++) {
+ struct pblk_sec_meta *meta_list = rqd.meta_list;
+
+ rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+
+ if (dir == PBLK_WRITE) {
+ __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
+
+ meta_list[i].lba = lba_list[paddr] = addr_empty;
+ }
+ }
+
+ /*
+ * This I/O is sent by the write thread when a line is replaced. Since
+ * the write thread is the only one sending write and erase commands,
+ * there is no need to take the LUN semaphore.
+ */
+ ret = pblk_submit_io_sync(pblk, &rqd);
+ if (ret) {
+ pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
+ bio_put(bio);
+ goto free_ppa_list;
+ }
+
+ atomic_dec(&pblk->inflight_io);
+
+ if (rqd.error) {
+ if (dir == PBLK_WRITE) {
+ pblk_log_write_err(pblk, &rqd);
+ ret = 1;
+ } else if (dir == PBLK_READ)
+ pblk_log_read_err(pblk, &rqd);
+ }
+
+free_ppa_list:
+ nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
+
+ return ret;
+}
+
+/*
+ * Read the line's smeta in PBLK_READ_RECOV mode from the address given by
+ * pblk_line_smeta_start().
+ *
+ * NOTE(review): pblk_line_smeta_start() can return -1 when the line has no
+ * usable position; that value is passed on unchecked - confirm callers only
+ * use this on lines with at least one good block.
+ */
+int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
+{
+ u64 bpaddr = pblk_line_smeta_start(pblk, line);
+
+ return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV);
+}
+
+/* Read the line's emeta into @emeta_buf, starting at line->emeta_ssec.
+ * Returns the result of pblk_line_submit_emeta_io().
+ */
+int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
+ void *emeta_buf)
+{
+ return pblk_line_submit_emeta_io(pblk, line, emeta_buf,
+ line->emeta_ssec, PBLK_READ);
+}
+
+/* Fill @rqd as a single-address erase request for @ppa; no bio is attached */
+static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct ppa_addr ppa)
+{
+ rqd->opcode = NVM_OP_ERASE;
+ rqd->ppa_addr = ppa;
+ rqd->nr_ppas = 1;
+ rqd->flags = pblk_set_progr_mode(pblk, PBLK_ERASE);
+ rqd->bio = NULL;
+}
+
+/*
+ * Erase the block at @ppa synchronously using an on-stack request.
+ *
+ * The common erase completion is run regardless of the submission result, so
+ * line/chunk state and inflight_io are always updated. Returns the result of
+ * pblk_submit_io_sync().
+ */
+static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
+{
+ struct nvm_rq rqd = {NULL};
+ int ret;
+
+ pblk_setup_e_rq(pblk, &rqd, ppa);
+
+ /* The write thread schedules erases so that it minimizes disturbances
+ * with writes. Thus, there is no need to take the LUN semaphore.
+ */
+ ret = pblk_submit_io_sync(pblk, &rqd);
+ rqd.private = pblk;
+ __pblk_end_io_erase(pblk, &rqd);
+
+ return ret;
+}
+
+/*
+ * Erase every block of @line that is not yet set in erase_bitmap, one block
+ * at a time.
+ *
+ * Each block is marked in erase_bitmap and accounted in left_eblks under
+ * line->lock before its synchronous erase is issued.
+ *
+ * Returns 0 when all blocks were processed, or the error of the first erase
+ * that failed to submit.
+ */
+int pblk_line_erase(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct ppa_addr ppa;
+ int ret, bit = -1;
+
+ /* Erase only good blocks, one at a time */
+ do {
+ spin_lock(&line->lock);
+ bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line,
+ bit + 1);
+ if (bit >= lm->blk_per_line) {
+ spin_unlock(&line->lock);
+ break;
+ }
+
+ ppa = pblk->luns[bit].bppa; /* set ch and lun */
+ ppa.a.blk = line->id;
+
+ atomic_dec(&line->left_eblks);
+ WARN_ON(test_and_set_bit(bit, line->erase_bitmap));
+ spin_unlock(&line->lock);
+
+ /* The erase itself is issued with the line lock released */
+ ret = pblk_blk_erase_sync(pblk, ppa);
+ if (ret) {
+ pblk_err(pblk, "failed to erase line %d\n", line->id);
+ return ret;
+ }
+ } while (1);
+
+ return 0;
+}
+
+/*
+ * Attach a free pair of smeta/emeta buffers to @line.
+ *
+ * Must be called with l_mg->free_lock held. If no slot is free in
+ * meta_bitmap, the lock is dropped around io_schedule() and the search is
+ * retried. The chosen buffers are zeroed and the emeta bookkeeping (mem,
+ * sync) is reset.
+ */
+static void pblk_line_setup_metadata(struct pblk_line *line,
+ struct pblk_line_mgmt *l_mg,
+ struct pblk_line_meta *lm)
+{
+ int meta_line;
+
+ lockdep_assert_held(&l_mg->free_lock);
+
+retry_meta:
+ meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+ if (meta_line == PBLK_DATA_LINES) {
+ spin_unlock(&l_mg->free_lock);
+ io_schedule();
+ spin_lock(&l_mg->free_lock);
+ goto retry_meta;
+ }
+
+ set_bit(meta_line, &l_mg->meta_bitmap);
+ line->meta_line = meta_line;
+
+ line->smeta = l_mg->sline_meta[meta_line];
+ line->emeta = l_mg->eline_meta[meta_line];
+
+ memset(line->smeta, 0, lm->smeta_len);
+ memset(line->emeta->buf, 0, lm->emeta_len[0]);
+
+ line->emeta->mem = 0;
+ atomic_set(&line->emeta->sync, 0);
+}
+
+/* For now lines are always assumed full lines. Thus, smeta former and current
+ * lun bitmaps are omitted.
+ *
+ * Fill the in-memory smeta and emeta headers of @line. When @cur is given,
+ * the two lines are linked through smeta prev_id / cur's emeta next_id and
+ * cur's lun bitmap is copied.
+ *
+ * Returns 1 on success. Returns 0 if the line has fewer usable blocks than
+ * lm->min_blk_line, in which case it is marked bad and added to bad_list.
+ */
+static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
+ struct pblk_line *cur)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_emeta *emeta = line->emeta;
+ struct line_emeta *emeta_buf = emeta->buf;
+ struct line_smeta *smeta_buf = (struct line_smeta *)line->smeta;
+ int nr_blk_line;
+
+ /* After erasing the line, new bad blocks might appear and we risk
+ * having an invalid line
+ */
+ nr_blk_line = lm->blk_per_line -
+ bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+ if (nr_blk_line < lm->min_blk_line) {
+ spin_lock(&l_mg->free_lock);
+ spin_lock(&line->lock);
+ line->state = PBLK_LINESTATE_BAD;
+ spin_unlock(&line->lock);
+
+ list_add_tail(&line->list, &l_mg->bad_list);
+ spin_unlock(&l_mg->free_lock);
+
+ pblk_debug(pblk, "line %d is bad\n", line->id);
+
+ return 0;
+ }
+
+ /* Run-time metadata */
+ line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta);
+
+ /* Mark LUNs allocated in this line (all for now) */
+ bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len);
+
+ smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
+ memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16);
+ smeta_buf->header.id = cpu_to_le32(line->id);
+ smeta_buf->header.type = cpu_to_le16(line->type);
+ smeta_buf->header.version_major = SMETA_VERSION_MAJOR;
+ smeta_buf->header.version_minor = SMETA_VERSION_MINOR;
+
+ /* Start metadata */
+ smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
+ smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns);
+
+ /* Fill metadata among lines */
+ if (cur) {
+ memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len);
+ smeta_buf->prev_id = cpu_to_le32(cur->id);
+ cur->emeta->buf->next_id = cpu_to_le32(line->id);
+ } else {
+ smeta_buf->prev_id = cpu_to_le32(PBLK_LINE_EMPTY);
+ }
+
+ /* All smeta must be set at this point */
+ smeta_buf->header.crc = cpu_to_le32(
+ pblk_calc_meta_header_crc(pblk, &smeta_buf->header));
+ smeta_buf->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta_buf));
+
+ /* End metadata */
+ memcpy(&emeta_buf->header, &smeta_buf->header,
+ sizeof(struct line_header));
+
+ /* The copied header gets emeta's own version, so its crc is redone */
+ emeta_buf->header.version_major = EMETA_VERSION_MAJOR;
+ emeta_buf->header.version_minor = EMETA_VERSION_MINOR;
+ emeta_buf->header.crc = cpu_to_le32(
+ pblk_calc_meta_header_crc(pblk, &emeta_buf->header));
+
+ emeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
+ emeta_buf->nr_lbas = cpu_to_le64(line->sec_in_line);
+ emeta_buf->nr_valid_lbas = cpu_to_le64(0);
+ emeta_buf->next_id = cpu_to_le32(PBLK_LINE_EMPTY);
+ emeta_buf->crc = cpu_to_le32(0);
+ emeta_buf->prev_id = smeta_buf->prev_id;
+
+ return 1;
+}
+
+/*
+ * Allocate the map and invalid bitmaps of @line, each lm->sec_bitmap_len
+ * bytes long. Only the map bitmap is zeroed here.
+ *
+ * Returns 0 on success or -ENOMEM, in which case neither bitmap is left
+ * allocated by this function and line->map_bitmap is NULL.
+ */
+static int pblk_line_alloc_bitmaps(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+
+	line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!line->map_bitmap)
+		goto fail;
+
+	/* will be initialized using bb info from map_bitmap */
+	line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (line->invalid_bitmap)
+		return 0;
+
+	kfree(line->map_bitmap);
+	line->map_bitmap = NULL;
+fail:
+	return -ENOMEM;
+}
+
+/* For now lines are always assumed full lines. Thus, smeta former and current
+ * lun bitmaps are omitted.
+ *
+ * Initialise the sector bitmaps and counters of @line: bad blocks, smeta
+ * sectors and emeta sectors are all excluded from the sectors available for
+ * data. When @init is set, smeta is also written to the device.
+ *
+ * Returns 1 on success. Returns 0 if the smeta write fails, or if the
+ * resulting bitmap does not match the computed sector count, in which case
+ * the line is marked bad and added to bad_list.
+ */
+static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
+ int init)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ u64 off;
+ int bit = -1;
+ int emeta_secs;
+
+ line->sec_in_line = lm->sec_per_line;
+
+ /* Capture bad block information on line mapping bitmaps */
+ while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line,
+ bit + 1)) < lm->blk_per_line) {
+ off = bit * geo->ws_opt;
+ bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off,
+ lm->sec_per_line);
+ bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
+ lm->sec_per_line);
+ line->sec_in_line -= geo->clba;
+ }
+
+ /* Mark smeta metadata sectors as bad sectors */
+ bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+ off = bit * geo->ws_opt;
+ bitmap_set(line->map_bitmap, off, lm->smeta_sec);
+ line->sec_in_line -= lm->smeta_sec;
+ line->smeta_ssec = off;
+ line->cur_sec = off + lm->smeta_sec;
+
+ if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
+ pblk_debug(pblk, "line smeta I/O failed. Retry\n");
+ return 0;
+ }
+
+ bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line);
+
+ /* Mark emeta metadata sectors as bad sectors. We need to consider bad
+ * blocks to make sure that there are enough sectors to store emeta
+ */
+ emeta_secs = lm->emeta_sec[0];
+ off = lm->sec_per_line;
+ /* Walk backwards from the end of the line in ws_opt-sized steps */
+ while (emeta_secs) {
+ off -= geo->ws_opt;
+ if (!test_bit(off, line->invalid_bitmap)) {
+ bitmap_set(line->invalid_bitmap, off, geo->ws_opt);
+ emeta_secs -= geo->ws_opt;
+ }
+ }
+
+ line->emeta_ssec = off;
+ line->sec_in_line -= lm->emeta_sec[0];
+ line->nr_valid_lbas = 0;
+ line->left_msecs = line->sec_in_line;
+ *line->vsc = cpu_to_le32(line->sec_in_line);
+
+ /* Every sector excluded above must show up in invalid_bitmap */
+ if (lm->sec_per_line - line->sec_in_line !=
+ bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) {
+ spin_lock(&line->lock);
+ line->state = PBLK_LINESTATE_BAD;
+ spin_unlock(&line->lock);
+
+ list_add_tail(&line->list, &l_mg->bad_list);
+ pblk_err(pblk, "unexpected line %d is bad\n", line->id);
+
+ return 0;
+ }
+
+ return 1;
+}
+
+/* Work out how many blocks of a never-written line still need an erase.
+ *
+ * Every chunk whose state has NVM_CHK_ST_FREE set is flagged in
+ * line->erase_bitmap and is not counted.
+ *
+ * Returns line->blk_in_line minus the number of free chunks found.
+ */
+static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	int pending = atomic_read(&line->blk_in_line);
+	int lun;
+
+	for (lun = 0; lun < pblk->lm.blk_per_line; lun++) {
+		int pos = pblk_ppa_to_pos(geo, pblk->luns[lun].bppa);
+
+		if (!(line->chks[pos].state & NVM_CHK_ST_FREE))
+			continue;
+
+		/* Free chunks should not be erased */
+		set_bit(pos, line->erase_bitmap);
+		pending--;
+	}
+
+	return pending;
+}
+
+/* Move a line taken off the free list into the OPEN state.
+ *
+ * Returns 0 on success, -EAGAIN if the line has fewer than lm->min_blk_line
+ * blocks, or -EINTR if the line is not in the FREE state when it is expected
+ * to be. On success the erase counters and the line reference count are
+ * (re)initialized.
+ */
+static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	int nr_blks = atomic_read(&line->blk_in_line);
+	int nr_erase = nr_blks;
+	int ret;
+
+	/* Bad blocks do not need to be erased */
+	bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line);
+
+	spin_lock(&line->lock);
+
+	/* A line that has never been written may contain chunks that are
+	 * already free; those are marked as erased up front.
+	 */
+	if (line->state == PBLK_LINESTATE_NEW) {
+		nr_erase = pblk_prepare_new_line(pblk, line);
+		line->state = PBLK_LINESTATE_FREE;
+	}
+
+	if (nr_blks < lm->min_blk_line) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
+	if (line->state != PBLK_LINESTATE_FREE) {
+		WARN(1, "pblk: corrupted line %d, state %d\n",
+						line->id, line->state);
+		ret = -EINTR;
+		goto out_unlock;
+	}
+
+	line->state = PBLK_LINESTATE_OPEN;
+
+	atomic_set(&line->left_eblks, nr_erase);
+	atomic_set(&line->left_seblks, nr_erase);
+
+	line->meta_distance = lm->meta_distance;
+	spin_unlock(&line->lock);
+
+	kref_init(&line->ref);
+
+	return 0;
+
+out_unlock:
+	spin_unlock(&line->lock);
+	return ret;
+}
+
+/* Claim @line as the current data line during recovery: take it off its
+ * list, prepare it, allocate its bitmaps and rebuild its bad-block state
+ * without writing smeta.
+ *
+ * Returns 0 on success. On failure the line is added back to
+ * l_mg->free_list and the error from pblk_line_prepare() or
+ * pblk_line_alloc_bitmaps(), or -EINTR if pblk_line_init_bb() fails, is
+ * returned.
+ *
+ * NOTE(review): on the failure paths l_mg->data_line is left pointing at
+ * @line, and when pblk_line_init_bb() fails the bitmaps allocated just before
+ * are not released here - confirm whether callers handle both.
+ */
+int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	int ret;
+
+	spin_lock(&l_mg->free_lock);
+	l_mg->data_line = line;
+	list_del(&line->list);
+
+	ret = pblk_line_prepare(pblk, line);
+	if (ret)
+		goto requeue_locked;
+	spin_unlock(&l_mg->free_lock);
+
+	ret = pblk_line_alloc_bitmaps(pblk, line);
+	if (ret)
+		goto requeue;
+
+	if (!pblk_line_init_bb(pblk, line, 0)) {
+		ret = -EINTR;
+		goto requeue;
+	}
+
+	pblk_rl_free_lines_dec(&pblk->rl, line, true);
+	return 0;
+
+requeue:
+	spin_lock(&l_mg->free_lock);
+requeue_locked:
+	list_add(&line->list, &l_mg->free_list);
+	spin_unlock(&l_mg->free_lock);
+
+	return ret;
+}
+
+/* Drop the resources a line held during recovery: the mapping bitmap is
+ * freed and the smeta/emeta pointers are cleared. invalid_bitmap is left
+ * untouched.
+ */
+void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
+{
+	kfree(line->map_bitmap);
+	line->map_bitmap = NULL;
+
+	line->emeta = NULL;
+	line->smeta = NULL;
+}
+
+/* Detach the per-use resources from @line without freeing them and reset
+ * its valid sector count to EMPTY_ENTRY.
+ */
+static void pblk_line_reinit(struct pblk_line *line)
+{
+	line->emeta = NULL;
+	line->smeta = NULL;
+	line->invalid_bitmap = NULL;
+	line->map_bitmap = NULL;
+
+	*line->vsc = cpu_to_le32(EMPTY_ENTRY);
+}
+
+/* Free both sector bitmaps of @line, then reset the line through
+ * pblk_line_reinit().
+ */
+void pblk_line_free(struct pblk_line *line)
+{
+	kfree(line->invalid_bitmap);
+	kfree(line->map_bitmap);
+
+	pblk_line_reinit(line);
+}
+
+/* Take lines off the head of the free list until one can be prepared.
+ *
+ * Lines with no good block, or rejected by pblk_line_prepare() with -EAGAIN,
+ * go to the bad list; lines rejected with -EINTR go to the corrupt list. Any
+ * other prepare error puts the line back on the free list and gives up.
+ *
+ * Must be called with l_mg->free_lock held. Returns the prepared line or
+ * NULL.
+ */
+struct pblk_line *pblk_line_get(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line *line;
+	int err, good_blk;
+
+	lockdep_assert_held(&l_mg->free_lock);
+
+	for (;;) {
+		if (list_empty(&l_mg->free_list)) {
+			pblk_err(pblk, "no free lines\n");
+			return NULL;
+		}
+
+		line = list_first_entry(&l_mg->free_list, struct pblk_line,
+					list);
+		list_del(&line->list);
+		l_mg->nr_free_lines--;
+
+		good_blk = find_first_zero_bit(line->blk_bitmap,
+					       lm->blk_per_line);
+		if (unlikely(good_blk >= lm->blk_per_line)) {
+			spin_lock(&line->lock);
+			line->state = PBLK_LINESTATE_BAD;
+			spin_unlock(&line->lock);
+
+			list_add_tail(&line->list, &l_mg->bad_list);
+
+			pblk_debug(pblk, "line %d is bad\n", line->id);
+			continue;
+		}
+
+		err = pblk_line_prepare(pblk, line);
+		if (!err)
+			return line;
+
+		if (err == -EAGAIN) {
+			list_add(&line->list, &l_mg->bad_list);
+			continue;
+		}
+
+		if (err == -EINTR) {
+			list_add(&line->list, &l_mg->corrupt_list);
+			continue;
+		}
+
+		pblk_err(pblk, "failed to prepare line %d\n", line->id);
+		list_add(&line->list, &l_mg->free_list);
+		l_mg->nr_free_lines++;
+		return NULL;
+	}
+}
+
+/* Replace a line that failed during setup with a fresh one.
+ *
+ * The bitmaps, smeta/emeta buffers and meta_line slot of @line are handed
+ * over to a newly obtained line, @line is reset, and the new line becomes
+ * l_mg->data_line. The new line is then erased; if that erase fails the
+ * procedure is repeated with yet another line.
+ *
+ * Returns the replacement line, or NULL (with l_mg->data_line cleared) when
+ * no further line can be obtained.
+ */
+static struct pblk_line *pblk_line_retry(struct pblk *pblk,
+					 struct pblk_line *line)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line *retry_line;
+
+retry:
+	spin_lock(&l_mg->free_lock);
+	retry_line = pblk_line_get(pblk);
+	if (!retry_line) {
+		l_mg->data_line = NULL;
+		spin_unlock(&l_mg->free_lock);
+		return NULL;
+	}
+
+	retry_line->map_bitmap = line->map_bitmap;
+	retry_line->invalid_bitmap = line->invalid_bitmap;
+	retry_line->smeta = line->smeta;
+	retry_line->emeta = line->emeta;
+	retry_line->meta_line = line->meta_line;
+
+	pblk_line_reinit(line);
+
+	l_mg->data_line = retry_line;
+	spin_unlock(&l_mg->free_lock);
+
+	pblk_rl_free_lines_dec(&pblk->rl, line, false);
+
+	if (pblk_line_erase(pblk, retry_line)) {
+		/* The resources now belong to retry_line, so that is the line
+		 * the next attempt must take them from. Copying again from
+		 * the already reset @line would hand over NULL pointers.
+		 */
+		line = retry_line;
+		goto retry;
+	}
+
+	return retry_line;
+}
+
+/* Set the rate limiter's rb_space counter to zero. */
+static void pblk_set_space_limit(struct pblk *pblk)
+{
+	atomic_set(&pblk->rl.rb_space, 0);
+}
+
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
+{ /* Pick and initialize the first data line and reserve the one after it.
+ *
+ * Returns the initialized line, or NULL when no line could be obtained,
+ * the bitmaps could not be allocated, or pblk_line_retry() ran out of
+ * replacement lines.
+ */
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *line;
+
+ spin_lock(&l_mg->free_lock);
+ line = pblk_line_get(pblk);
+ if (!line) {
+ spin_unlock(&l_mg->free_lock);
+ return NULL;
+ }
+
+ line->seq_nr = l_mg->d_seq_nr++;
+ line->type = PBLK_LINETYPE_DATA;
+ l_mg->data_line = line;
+
+ pblk_line_setup_metadata(line, l_mg, &pblk->lm);
+
+ /* Allocate next line for preparation */
+ l_mg->data_next = pblk_line_get(pblk);
+ if (!l_mg->data_next) {
+ /* If we cannot get a new line, we need to stop the pipeline.
+ * Only allow as many writes in as we can store safely and then
+ * fail gracefully
+ */
+ pblk_set_space_limit(pblk);
+
+ l_mg->data_next = NULL;
+ } else {
+ l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+ l_mg->data_next->type = PBLK_LINETYPE_DATA;
+ }
+ spin_unlock(&l_mg->free_lock);
+
+ if (pblk_line_alloc_bitmaps(pblk, line))
+ return NULL; /* NOTE(review): l_mg->data_line still points at line
+ * on this path - confirm callers expect that
+ */
+
+ if (pblk_line_erase(pblk, line)) {
+ line = pblk_line_retry(pblk, line);
+ if (!line)
+ return NULL;
+ }
+
+retry_setup: /* re-entered with a replacement line after each failure */
+ if (!pblk_line_init_metadata(pblk, line, NULL)) {
+ line = pblk_line_retry(pblk, line);
+ if (!line)
+ return NULL;
+
+ goto retry_setup;
+ }
+
+ if (!pblk_line_init_bb(pblk, line, 1)) {
+ line = pblk_line_retry(pblk, line);
+ if (!line)
+ return NULL;
+
+ goto retry_setup;
+ }
+
+ pblk_rl_free_lines_dec(&pblk->rl, line, true);
+
+ return line;
+}
+
+/* Zero the write-buffer space limit and flag the instance as stopping.
+ * Must be called with l_mg.free_lock held; @line is not used.
+ */
+static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+	lockdep_assert_held(&l_mg->free_lock);
+
+	pblk_set_space_limit(pblk);
+	pblk->state = PBLK_STATE_STOPPING;
+}
+
+/* Synchronously push out the emeta of every line queued on
+ * l_mg->emeta_list.
+ *
+ * The queue is moved to a private list under close_lock, emeta I/O is
+ * submitted for each line until emeta->mem reaches lm->emeta_len[0], and
+ * finally the function waits for metadata I/O and flushes close_wq. A
+ * submission error aborts the whole operation.
+ */
+static void pblk_line_close_meta_sync(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line *line, *next;
+	LIST_HEAD(pending);
+
+	spin_lock(&l_mg->close_lock);
+	if (!list_empty(&l_mg->emeta_list))
+		list_cut_position(&pending, &l_mg->emeta_list,
+				  l_mg->emeta_list.prev);
+	spin_unlock(&l_mg->close_lock);
+
+	/* Nothing was queued */
+	if (list_empty(&pending))
+		return;
+
+	list_for_each_entry_safe(line, next, &pending, list) {
+		struct pblk_emeta *emeta = line->emeta;
+
+		while (emeta->mem < lm->emeta_len[0]) {
+			int err = pblk_submit_meta_io(pblk, line);
+
+			if (err) {
+				pblk_err(pblk, "sync meta line %d failed (%d)\n",
+							line->id, err);
+				return;
+			}
+		}
+	}
+
+	pblk_wait_for_meta(pblk);
+	flush_workqueue(pblk->close_wq);
+}
+
+/* Drain the write pipeline: flush the writer, wait for metadata I/O, pad
+ * open data through pblk_recov_pad(), then flush bb_wq and sync emeta.
+ *
+ * Does nothing if the instance is already RECOVERING or STOPPED; otherwise
+ * the state is switched to RECOVERING first.
+ */
+void __pblk_pipeline_flush(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	bool skip;
+	int err;
+
+	spin_lock(&l_mg->free_lock);
+	skip = pblk->state == PBLK_STATE_RECOVERING ||
+	       pblk->state == PBLK_STATE_STOPPED;
+	if (!skip)
+		pblk->state = PBLK_STATE_RECOVERING;
+	spin_unlock(&l_mg->free_lock);
+
+	if (skip)
+		return;
+
+	pblk_flush_writer(pblk);
+	pblk_wait_for_meta(pblk);
+
+	err = pblk_recov_pad(pblk);
+	if (err) {
+		pblk_err(pblk, "could not close data on teardown(%d)\n", err);
+		return;
+	}
+
+	flush_workqueue(pblk->bb_wq);
+	pblk_line_close_meta_sync(pblk);
+}
+
+/* Mark the instance STOPPED and forget the current and next data lines,
+ * all under l_mg->free_lock.
+ */
+void __pblk_pipeline_stop(struct pblk *pblk)
+{
+	spinlock_t *lock = &pblk->l_mg.free_lock;
+
+	spin_lock(lock);
+	pblk->state = PBLK_STATE_STOPPED;
+	pblk->l_mg.data_line = NULL;
+	pblk->l_mg.data_next = NULL;
+	spin_unlock(lock);
+}
+
+/* Flush the write pipeline and then stop it. */
+void pblk_pipeline_stop(struct pblk *pblk)
+{
+	__pblk_pipeline_flush(pblk);
+
+	__pblk_pipeline_stop(pblk);
+}
+
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
+{ /* Promote l_mg->data_next to the current data line, finish erasing and
+ * initializing it, and reserve a new next line.
+ *
+ * Returns the new current data line. Returns NULL when there is no next
+ * line, when bitmap allocation fails, or when pblk_line_retry() runs out
+ * of replacement lines.
+ *
+ * NOTE(review): data_next is sampled before free_lock is taken, and a
+ * failing pblk_line_erase() below returns the line rather than NULL -
+ * confirm both are intended.
+ */
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *cur, *new = NULL;
+ unsigned int left_seblks;
+
+ new = l_mg->data_next;
+ if (!new)
+ goto out;
+
+ spin_lock(&l_mg->free_lock);
+ cur = l_mg->data_line;
+ l_mg->data_line = new;
+
+ pblk_line_setup_metadata(new, l_mg, &pblk->lm);
+ spin_unlock(&l_mg->free_lock);
+
+retry_erase: /* loop until left_seblks drops to zero */
+ left_seblks = atomic_read(&new->left_seblks);
+ if (left_seblks) {
+ /* If line is not fully erased, erase it. When no erase is left
+ * to issue (left_eblks == 0) just yield and poll again.
+ */
+ if (atomic_read(&new->left_eblks)) {
+ if (pblk_line_erase(pblk, new))
+ goto out;
+ } else {
+ io_schedule();
+ }
+ goto retry_erase;
+ }
+
+ if (pblk_line_alloc_bitmaps(pblk, new))
+ return NULL;
+
+retry_setup: /* re-entered with a replacement line after each failure */
+ if (!pblk_line_init_metadata(pblk, new, cur)) {
+ new = pblk_line_retry(pblk, new);
+ if (!new)
+ goto out;
+
+ goto retry_setup;
+ }
+
+ if (!pblk_line_init_bb(pblk, new, 1)) {
+ new = pblk_line_retry(pblk, new);
+ if (!new)
+ goto out;
+
+ goto retry_setup;
+ }
+
+ pblk_rl_free_lines_dec(&pblk->rl, new, true);
+
+ /* Allocate next line for preparation */
+ spin_lock(&l_mg->free_lock);
+ l_mg->data_next = pblk_line_get(pblk);
+ if (!l_mg->data_next) {
+ /* If we cannot get a new line, we need to stop the pipeline.
+ * Only allow as many writes in as we can store safely and then
+ * fail gracefully
+ */
+ pblk_stop_writes(pblk, new);
+ l_mg->data_next = NULL;
+ } else {
+ l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+ l_mg->data_next->type = PBLK_LINETYPE_DATA;
+ }
+ spin_unlock(&l_mg->free_lock);
+
+out:
+ return new;
+}
+
+/* Return a garbage-collected line to the free list.
+ *
+ * Under line->lock the line goes from GC to FREE, its bitmaps are released
+ * and any pending write-error accounting is cleared. The line is then
+ * appended to l_mg->free_list and the rate limiter is informed.
+ */
+static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_w_err_gc *w_err_gc;
+
+	spin_lock(&line->lock);
+	WARN_ON(line->state != PBLK_LINESTATE_GC);
+	line->state = PBLK_LINESTATE_FREE;
+	line->gc_group = PBLK_LINEGC_NONE;
+	pblk_line_free(line);
+
+	w_err_gc = line->w_err_gc;
+	if (w_err_gc->has_write_err) {
+		pblk_rl_werr_line_out(&pblk->rl);
+		w_err_gc->has_write_err = 0;
+	}
+	spin_unlock(&line->lock);
+
+	atomic_dec(&pblk->gc.pipeline_gc);
+
+	spin_lock(&l_mg->free_lock);
+	list_add_tail(&line->list, &l_mg->free_list);
+	l_mg->nr_free_lines++;
+	spin_unlock(&l_mg->free_lock);
+
+	pblk_rl_free_lines_inc(&pblk->rl, line);
+}
+
+/* Work handler: release the line carried by the work item, then give the
+ * work item back to gen_ws_pool.
+ */
+static void pblk_line_put_ws(struct work_struct *work)
+{
+	struct pblk_line_ws *put_ws;
+	struct pblk *pblk;
+
+	put_ws = container_of(work, struct pblk_line_ws, ws);
+	pblk = put_ws->pblk;
+
+	__pblk_line_put(pblk, put_ws->line);
+	mempool_free(put_ws, &pblk->gen_ws_pool);
+}
+
+/* kref release callback: free the line that embeds @ref right away. */
+void pblk_line_put(struct kref *ref)
+{
+	struct pblk_line *line = container_of(ref, struct pblk_line, ref);
+
+	__pblk_line_put(line->pblk, line);
+}
+
+/* kref release callback that defers the actual release of the line to
+ * r_end_wq. The work item is allocated with GFP_ATOMIC; if that fails the
+ * function returns without releasing the line.
+ */
+void pblk_line_put_wq(struct kref *ref)
+{
+	struct pblk_line *line = container_of(ref, struct pblk_line, ref);
+	struct pblk *pblk = line->pblk;
+	struct pblk_line_ws *put_ws;
+
+	put_ws = mempool_alloc(&pblk->gen_ws_pool, GFP_ATOMIC);
+	if (!put_ws)
+		return;
+
+	put_ws->priv = NULL;
+	put_ws->line = line;
+	put_ws->pblk = pblk;
+
+	INIT_WORK(&put_ws->ws, pblk_line_put_ws);
+	queue_work(pblk->r_end_wq, &put_ws->ws);
+}
+
+/* Submit an asynchronous erase for the block addressed by @ppa.
+ * Completion is handled by pblk_end_io_erase().
+ *
+ * Returns the result of pblk_submit_io(); a failure is logged.
+ */
+int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
+{
+	struct nvm_geo *geo;
+	struct nvm_rq *rqd;
+	int err;
+
+	rqd = pblk_alloc_rqd(pblk, PBLK_ERASE);
+
+	pblk_setup_e_rq(pblk, rqd, ppa);
+
+	rqd->end_io = pblk_end_io_erase;
+	rqd->private = pblk;
+
+	/* The write thread schedules erases so that it minimizes disturbances
+	 * with writes. Thus, there is no need to take the LUN semaphore.
+	 */
+	err = pblk_submit_io(pblk, rqd);
+	if (!err)
+		return 0;
+
+	geo = &pblk->dev->geo;
+	pblk_err(pblk, "could not async erase line:%d,blk:%d\n",
+					pblk_ppa_to_line(ppa),
+					pblk_ppa_to_pos(geo, ppa));
+
+	return err;
+}
+
+/* Return the current data line (l_mg.data_line); no lock is taken. */
+struct pblk_line *pblk_line_get_data(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+	return l_mg->data_line;
+}
+
+/* Return the line to erase. For now this is always the next data line
+ * (l_mg.data_next); no lock is taken.
+ */
+struct pblk_line *pblk_line_get_erase(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+	return l_mg->data_next;
+}
+
+/* Return 1 when the line has no sectors left to map (left_msecs == 0),
+ * 0 otherwise.
+ */
+int pblk_line_is_full(struct pblk_line *line)
+{
+	return !line->left_msecs;
+}
+
+/* Write emeta out synchronously, but only when pblk_rl_is_limit() reports
+ * that the rate limiter is at its limit.
+ */
+static void pblk_line_should_sync_meta(struct pblk *pblk)
+{
+	if (!pblk_rl_is_limit(&pblk->rl))
+		return;
+
+	pblk_line_close_meta_sync(pblk);
+}
+
+/* Close a fully written line.
+ *
+ * Releases the line's emeta slot in l_mg->meta_bitmap, moves the line from
+ * OPEN to CLOSED, queues it on the GC list chosen by pblk_line_gc_list(),
+ * frees the mapping bitmap and records every chunk of the line that is not
+ * offline as closed.
+ */
+void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct list_head *move_list;
+	int i;
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line),
+				"pblk: corrupt closed line %d\n", line->id);
+#endif
+
+	spin_lock(&l_mg->free_lock);
+	WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap));
+	spin_unlock(&l_mg->free_lock);
+
+	spin_lock(&l_mg->gc_lock);
+	spin_lock(&line->lock);
+	WARN_ON(line->state != PBLK_LINESTATE_OPEN);
+	line->state = PBLK_LINESTATE_CLOSED;
+	move_list = pblk_line_gc_list(pblk, line);
+
+	list_add_tail(&line->list, move_list);
+
+	kfree(line->map_bitmap);
+	line->map_bitmap = NULL;
+	line->smeta = NULL;
+	line->emeta = NULL;
+
+	for (i = 0; i < lm->blk_per_line; i++) {
+		struct pblk_lun *rlun = &pblk->luns[i];
+		int pos = pblk_ppa_to_pos(geo, rlun->bppa);
+
+		/* Update the chunk itself; assigning to a local copy of the
+		 * state would leave the chunk metadata unchanged.
+		 */
+		if (!(line->chks[pos].state & NVM_CHK_ST_OFFLINE))
+			line->chks[pos].state = NVM_CHK_ST_CLOSED;
+	}
+
+	spin_unlock(&line->lock);
+	spin_unlock(&l_mg->gc_lock);
+}
+
+/* Fill in the end-of-line metadata buffer of @line and queue the line on
+ * l_mg->emeta_list.
+ *
+ * The vsc list, bad block bitmap and write-amplification counters are copied
+ * into the buffer, the header is (re)built if it does not carry PBLK_MAGIC,
+ * and the valid LBA count and CRC are stored. Afterwards emeta is synced if
+ * the rate limiter asks for it.
+ */
+void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct line_emeta *emeta_buf = line->emeta->buf;
+	struct wa_counters *counters = emeta_to_wa(lm, emeta_buf);
+
+	/* No need for exact vsc value; avoid a big line lock and take an
+	 * approximation.
+	 */
+	memcpy(emeta_to_vsc(pblk, emeta_buf), l_mg->vsc_list, lm->vsc_list_len);
+	memcpy(emeta_to_bb(emeta_buf), line->blk_bitmap, lm->blk_bitmap_len);
+
+	counters->user = cpu_to_le64(atomic64_read(&pblk->user_wa));
+	counters->pad = cpu_to_le64(atomic64_read(&pblk->pad_wa));
+	counters->gc = cpu_to_le64(atomic64_read(&pblk->gc_wa));
+
+	/* Rebuild the header when it has not been set up with our magic */
+	if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) {
+		emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
+		memcpy(emeta_buf->header.uuid, pblk->instance_uuid, 16);
+		emeta_buf->header.id = cpu_to_le32(line->id);
+		emeta_buf->header.type = cpu_to_le16(line->type);
+		emeta_buf->header.version_major = EMETA_VERSION_MAJOR;
+		emeta_buf->header.version_minor = EMETA_VERSION_MINOR;
+		emeta_buf->header.crc = cpu_to_le32(
+			pblk_calc_meta_header_crc(pblk, &emeta_buf->header));
+	}
+
+	emeta_buf->nr_valid_lbas = cpu_to_le64(line->nr_valid_lbas);
+	emeta_buf->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, emeta_buf));
+
+	spin_lock(&l_mg->close_lock);
+	spin_lock(&line->lock);
+
+	/* Update the in-memory start address for emeta, in case it has
+	 * shifted due to write errors
+	 */
+	if (line->emeta_ssec != line->cur_sec)
+		line->emeta_ssec = line->cur_sec;
+
+	list_add_tail(&line->list, &l_mg->emeta_list);
+	spin_unlock(&line->lock);
+	spin_unlock(&l_mg->close_lock);
+
+	pblk_line_should_sync_meta(pblk);
+}
+
+/* Keep a private copy of the line's LBA list (lm->emeta_len[2] bytes, taken
+ * from the in-memory emeta buffer) in line->w_err_gc->lba_list.
+ *
+ * If the copy cannot be allocated an error is logged and lba_list is left
+ * NULL.
+ */
+static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	unsigned int lba_list_size = lm->emeta_len[2];
+	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+	struct pblk_emeta *emeta = line->emeta;
+
+	w_err_gc->lba_list = pblk_malloc(lba_list_size,
+					 l_mg->emeta_alloc_type, GFP_KERNEL);
+	if (!w_err_gc->lba_list) {
+		/* Copying into a failed allocation would dereference NULL */
+		pblk_err(pblk, "could not save lba list (line %d)\n",
+								line->id);
+		return;
+	}
+
+	memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf),
+				lba_list_size);
+}
+
+/* Work handler that closes the line carried by the work item and returns
+ * the work item to gen_ws_pool.
+ */
+void pblk_line_close_ws(struct work_struct *work)
+{
+	struct pblk_line_ws *close_ws;
+	struct pblk_line *line;
+	struct pblk *pblk;
+
+	close_ws = container_of(work, struct pblk_line_ws, ws);
+	pblk = close_ws->pblk;
+	line = close_ws->line;
+
+	/* Write errors makes the emeta start address stored in smeta invalid,
+	 * so keep a copy of the lba list until we've gc'd the line
+	 */
+	if (line->w_err_gc->has_write_err)
+		pblk_save_lba_list(pblk, line);
+
+	pblk_line_close(pblk, line);
+	mempool_free(close_ws, &pblk->gen_ws_pool);
+}
+
+/* Allocate a generic line work item from gen_ws_pool, fill it with @pblk,
+ * @line and @priv, and queue @work on @wq.
+ *
+ * @gfp_mask is passed straight to mempool_alloc(). If the allocation fails
+ * an error is logged and the work is not queued.
+ */
+void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+		     void (*work)(struct work_struct *), gfp_t gfp_mask,
+		     struct workqueue_struct *wq)
+{
+	struct pblk_line_ws *line_ws;
+
+	line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask);
+	if (!line_ws) {
+		/* Do not dereference a failed allocation */
+		pblk_err(pblk, "could not allocate line work item\n");
+		return;
+	}
+
+	line_ws->pblk = pblk;
+	line_ws->line = line;
+	line_ws->priv = priv;
+
+	INIT_WORK(&line_ws->ws, work);
+	queue_work(wq, &line_ws->ws);
+}
+
+/* Take the write semaphore of the LUN at index @pos, waiting up to 30
+ * seconds. A timeout or interruption is logged; the caller is not told.
+ */
+static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
+			     int nr_ppas, int pos)
+{
+	struct pblk_lun *rlun = &pblk->luns[pos];
+	int ret;
+
+	/*
+	 * Only send one inflight I/O per LUN. Since we map at a page
+	 * granularity, all ppas in the I/O will map to the same LUN
+	 */
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	int i;
+
+	for (i = 1; i < nr_ppas; i++)
+		WARN_ON(ppa_list[0].a.lun != ppa_list[i].a.lun ||
+				ppa_list[0].a.ch != ppa_list[i].a.ch);
+#endif
+
+	ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
+	switch (ret) {
+	case -ETIME:
+	case -EINTR:
+		pblk_err(pblk, "taking lun semaphore timed out: err %d\n",
+				-ret);
+		break;
+	default:
+		break;
+	}
+}
+
+/* Take the write semaphore of the LUN that the first address in @ppa_list
+ * maps to.
+ */
+void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+
+	__pblk_down_page(pblk, ppa_list, nr_ppas,
+			 pblk_ppa_to_pos(geo, ppa_list[0]));
+}
+
+/* Take the write semaphore of the LUN that the first address in @ppa_list
+ * maps to, recording it in @lun_bitmap so that it is taken only once per
+ * request.
+ */
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+		  unsigned long *lun_bitmap)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
+
+	/* If the LUN has been locked for this same request, do not attempt to
+	 * lock it again
+	 */
+	if (!test_and_set_bit(pos, lun_bitmap))
+		__pblk_down_page(pblk, ppa_list, nr_ppas, pos);
+}
+
+/* Release the write semaphore of the LUN that the first address in
+ * @ppa_list maps to.
+ */
+void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	int i;
+
+	/* All addresses are expected to sit on the same channel and LUN */
+	for (i = 1; i < nr_ppas; i++)
+		WARN_ON(ppa_list[0].a.lun != ppa_list[i].a.lun ||
+				ppa_list[0].a.ch != ppa_list[i].a.ch);
+#endif
+
+	up(&pblk->luns[pos].wr_sem);
+}
+
+/* Release the write semaphore of every LUN whose bit is set in
+ * @lun_bitmap (geo->all_luns bits wide). @ppa_list and @nr_ppas are not
+ * used.
+ */
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+		unsigned long *lun_bitmap)
+{
+	int num_lun = pblk->dev->geo.all_luns;
+	int bit;
+
+	for (bit = find_next_bit(lun_bitmap, num_lun, 0);
+	     bit < num_lun;
+	     bit = find_next_bit(lun_bitmap, num_lun, bit + 1))
+		up(&pblk->luns[bit].wr_sem);
+}
+
+/* Point @lba at @ppa in the L2P table.
+ *
+ * If the previous mapping is a non-empty device address it is invalidated
+ * first. Out-of-range LBAs trigger a warning and are ignored.
+ */
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+	struct ppa_addr old;
+
+	/* logic error: lba out-of-bounds. Ignore update */
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: corrupted L2P map request\n");
+		return;
+	}
+
+	spin_lock(&pblk->trans_lock);
+
+	old = pblk_trans_map_get(pblk, lba);
+	if (!pblk_addr_in_cache(old) && !pblk_ppa_empty(old))
+		pblk_map_invalidate(pblk, old);
+
+	pblk_trans_map_set(pblk, lba, ppa);
+
+	spin_unlock(&pblk->trans_lock);
+}
+
+/* Point @lba at the write-buffer entry @ppa in the L2P table. With debug
+ * support enabled the address is checked to be an in-bounds cache address.
+ */
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	/* Callers must ensure that the ppa points to a cache address */
+	BUG_ON(!pblk_addr_in_cache(ppa));
+	BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+	pblk_update_map(pblk, lba, ppa);
+}
+
+/* Redirect @lba to the cache address @ppa_new on behalf of GC, but only if
+ * the L2P table still maps it to sector @paddr_gc of @gc_line.
+ *
+ * Returns 1 if the mapping was updated. Returns 0 if @lba is out of range
+ * or if the L2P entry no longer points at the GC'ed sector.
+ */
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new,
+		       struct pblk_line *gc_line, u64 paddr_gc)
+{
+	struct ppa_addr ppa_l2p, ppa_gc;
+	int updated = 0;
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	/* Callers must ensure that the ppa points to a cache address */
+	BUG_ON(!pblk_addr_in_cache(ppa_new));
+	BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new)));
+#endif
+
+	/* logic error: lba out-of-bounds. Ignore update */
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: corrupted L2P map request\n");
+		return 0;
+	}
+
+	spin_lock(&pblk->trans_lock);
+	ppa_l2p = pblk_trans_map_get(pblk, lba);
+	ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id);
+
+	if (pblk_ppa_comp(ppa_l2p, ppa_gc)) {
+		pblk_trans_map_set(pblk, lba, ppa_new);
+		updated = 1;
+	} else {
+		/* The entry moved on; the old sector must be invalid by now */
+		spin_lock(&gc_line->lock);
+		WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap),
+						"pblk: corrupted GC update");
+		spin_unlock(&gc_line->lock);
+	}
+
+	spin_unlock(&pblk->trans_lock);
+	return updated;
+}
+
+/* Replace the cache mapping of @lba with the device address @ppa_mapped.
+ *
+ * Padded entries (lba == ADDR_EMPTY) are only accounted and invalidated.
+ * The L2P entry is updated only if it still equals @ppa_cache; otherwise
+ * @ppa_mapped is invalidated when it is not empty.
+ */
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
+			 struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache)
+{
+	struct ppa_addr ppa_l2p;
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	/* Callers must ensure that the ppa points to a device address */
+	BUG_ON(pblk_addr_in_cache(ppa_mapped));
+#endif
+	/* Invalidate and discard padded entries */
+	if (lba == ADDR_EMPTY) {
+		atomic64_inc(&pblk->pad_wa);
+#ifdef CONFIG_NVM_PBLK_DEBUG
+		atomic_long_inc(&pblk->padded_wb);
+#endif
+		if (!pblk_ppa_empty(ppa_mapped))
+			pblk_map_invalidate(pblk, ppa_mapped);
+		return;
+	}
+
+	/* logic error: lba out-of-bounds. Ignore update */
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: corrupted L2P map request\n");
+		return;
+	}
+
+	spin_lock(&pblk->trans_lock);
+	ppa_l2p = pblk_trans_map_get(pblk, lba);
+
+	if (pblk_ppa_comp(ppa_l2p, ppa_cache)) {
+#ifdef CONFIG_NVM_PBLK_DEBUG
+		WARN_ON(!pblk_addr_in_cache(ppa_l2p) &&
+			!pblk_ppa_empty(ppa_l2p));
+#endif
+		pblk_trans_map_set(pblk, lba, ppa_mapped);
+	} else if (!pblk_ppa_empty(ppa_mapped)) {
+		/* Do not update L2P if the cacheline has been updated. In
+		 * this case, the mapped ppa must be invalidated
+		 */
+		pblk_map_invalidate(pblk, ppa_mapped);
+	}
+
+	spin_unlock(&pblk->trans_lock);
+}
+
+/* Look up @nr_secs consecutive LBAs starting at @blba and store their
+ * mappings in @ppas. For every entry that maps to a line (neither empty nor
+ * a cache address) a reference on that line is taken.
+ */
+void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+			 sector_t blba, int nr_secs)
+{
+	int i;
+
+	spin_lock(&pblk->trans_lock);
+	for (i = 0; i < nr_secs; i++) {
+		struct ppa_addr ppa = pblk_trans_map_get(pblk, blba + i);
+
+		ppas[i] = ppa;
+
+		if (pblk_ppa_empty(ppa) || pblk_addr_in_cache(ppa))
+			continue;
+
+		/* If the L2P entry maps to a line, the reference is valid */
+		kref_get(&pblk->lines[pblk_ppa_to_line(ppa)].ref);
+	}
+	spin_unlock(&pblk->trans_lock);
+}
+
+/* Look up the mapping of each LBA in @lba_list and store it at the same
+ * index in @ppas. Entries equal to ADDR_EMPTY, and out-of-range LBAs (which
+ * also trigger a warning), leave the matching @ppas slot untouched.
+ */
+void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
+			  u64 *lba_list, int nr_secs)
+{
+	int i;
+
+	spin_lock(&pblk->trans_lock);
+	for (i = 0; i < nr_secs; i++) {
+		u64 lba = lba_list[i];
+
+		if (lba == ADDR_EMPTY)
+			continue;
+
+		/* logic error: lba out-of-bounds. Ignore update */
+		if (lba >= pblk->rl.nr_secs) {
+			WARN(1, "pblk: corrupted L2P map request\n");
+			continue;
+		}
+
+		ppas[i] = pblk_trans_map_get(pblk, lba);
+	}
+	spin_unlock(&pblk->trans_lock);
+}
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
new file mode 100644
index 000000000..157c2567c
--- /dev/null
+++ b/drivers/lightnvm/pblk-gc.c
@@ -0,0 +1,705 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-gc.c - pblk's garbage collector
+ */
+
+#include "pblk.h"
+#include <linux/delay.h>
+
+/* Free a GC request together with its data buffer. vfree() accepts a NULL
+ * pointer, so the buffer needs no explicit check.
+ */
+static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
+{
+	vfree(gc_rq->data);
+	kfree(gc_rq);
+}
+
+/* Drain the GC write list: every queued request is written to the cache,
+ * its line reference is dropped and the request is freed.
+ *
+ * Returns 1 if the list was empty, 0 if requests were processed.
+ */
+static int pblk_gc_write(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+	struct pblk_gc_rq *rq, *next;
+	LIST_HEAD(batch);
+
+	spin_lock(&gc->w_lock);
+	if (list_empty(&gc->w_list)) {
+		spin_unlock(&gc->w_lock);
+		return 1;
+	}
+
+	/* Take the whole queue in one go and process it unlocked */
+	list_cut_position(&batch, &gc->w_list, gc->w_list.prev);
+	gc->w_entries = 0;
+	spin_unlock(&gc->w_lock);
+
+	list_for_each_entry_safe(rq, next, &batch, list) {
+		pblk_write_gc_to_cache(pblk, rq);
+		list_del(&rq->list);
+		kref_put(&rq->line->ref, pblk_line_put);
+		pblk_gc_free_gc_rq(rq);
+	}
+
+	return 0;
+}
+
+/* Wake up the GC writer thread. */
+static void pblk_gc_writer_kick(struct pblk_gc *gc)
+{
+	struct task_struct *writer = gc->gc_writer_ts;
+
+	wake_up_process(writer);
+}
+
+/* Undo the selection of @line for GC: move it from GC back to CLOSED and,
+ * if pblk_line_gc_list() names a list, queue it there again.
+ */
+static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct list_head *target;
+
+	spin_lock(&line->lock);
+	WARN_ON(line->state != PBLK_LINESTATE_GC);
+	line->state = PBLK_LINESTATE_CLOSED;
+	target = pblk_line_gc_list(pblk, line);
+	spin_unlock(&line->lock);
+
+	if (!target)
+		return;
+
+	spin_lock(&l_mg->gc_lock);
+	list_add_tail(&line->list, target);
+	spin_unlock(&l_mg->gc_lock);
+}
+
+/* Work handler for one GC read request.
+ *
+ * Releases gc_sem, allocates the data buffer, reads the victim sectors and,
+ * if any sector is left to move, queues the request on the GC write list
+ * (waiting while the list holds PBLK_GC_RQ_QD entries) and kicks the writer.
+ * On any failure, or when nothing is left to move, the request is freed and
+ * the line reference dropped.
+ */
+static void pblk_gc_line_ws(struct work_struct *work)
+{
+	struct pblk_line_ws *gc_rq_ws = container_of(work,
+						struct pblk_line_ws, ws);
+	struct pblk *pblk = gc_rq_ws->pblk;
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_gc *gc = &pblk->gc;
+	struct pblk_line *line = gc_rq_ws->line;
+	struct pblk_gc_rq *gc_rq = gc_rq_ws->priv;
+	int ret;
+
+	up(&gc->gc_sem);
+
+	gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs));
+	if (!gc_rq->data) {
+		/* vsc is stored little-endian; convert before printing */
+		pblk_err(pblk, "could not GC line:%d (%d/%d)\n",
+					line->id, le32_to_cpu(*line->vsc),
+					gc_rq->nr_secs);
+		goto out;
+	}
+
+	/* Read from GC victim block */
+	ret = pblk_submit_read_gc(pblk, gc_rq);
+	if (ret) {
+		pblk_err(pblk, "failed GC read in line:%d (err:%d)\n",
+								line->id, ret);
+		goto out;
+	}
+
+	if (!gc_rq->secs_to_gc)
+		goto out;
+
+retry:
+	spin_lock(&gc->w_lock);
+	if (gc->w_entries >= PBLK_GC_RQ_QD) {
+		/* Write list is full: wake the writer and try again shortly */
+		spin_unlock(&gc->w_lock);
+		pblk_gc_writer_kick(&pblk->gc);
+		usleep_range(128, 256);
+		goto retry;
+	}
+	gc->w_entries++;
+	list_add_tail(&gc_rq->list, &gc->w_list);
+	spin_unlock(&gc->w_lock);
+
+	pblk_gc_writer_kick(&pblk->gc);
+
+	kfree(gc_rq_ws);
+	return;
+
+out:
+	pblk_gc_free_gc_rq(gc_rq);
+	kref_put(&line->ref, pblk_line_put);
+	kfree(gc_rq_ws);
+}
+
+/* Read the emeta of @line from the device, validate it and return a newly
+ * allocated copy of its LBA list (lm->emeta_len[2] bytes).
+ *
+ * Returns NULL if an allocation fails, the read fails or the emeta is
+ * inconsistent. The caller owns the returned buffer.
+ */
+static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
+				       struct pblk_line *line)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	unsigned int lba_list_size = lm->emeta_len[2];
+	struct line_emeta *emeta_buf;
+	__le64 *lba_list = NULL;
+	int ret;
+
+	emeta_buf = pblk_malloc(lm->emeta_len[0],
+				l_mg->emeta_alloc_type, GFP_KERNEL);
+	if (!emeta_buf)
+		return NULL;
+
+	ret = pblk_line_read_emeta(pblk, line, emeta_buf);
+	if (ret) {
+		pblk_err(pblk, "line %d read emeta failed (%d)\n",
+				line->id, ret);
+		goto free_emeta;
+	}
+
+	/* If this read fails, it means that emeta is corrupted.
+	 * For now, leave the line untouched.
+	 * TODO: Implement a recovery routine that scans and moves
+	 * all sectors on the line.
+	 */
+
+	ret = pblk_recov_check_emeta(pblk, emeta_buf);
+	if (ret) {
+		pblk_err(pblk, "inconsistent emeta (line %d)\n",
+				line->id);
+		goto free_emeta;
+	}
+
+	lba_list = pblk_malloc(lba_list_size,
+			       l_mg->emeta_alloc_type, GFP_KERNEL);
+	if (lba_list)
+		memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size);
+
+free_emeta:
+	pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+
+	return lba_list;
+}
+
+/* Work handler that splits a victim line into GC read requests.
+ *
+ * The LBA list comes from the copy saved at close time when the line had
+ * write errors, otherwise from emeta on the device. The line's invalid
+ * bitmap is snapshotted under line->lock and the sectors still valid are
+ * batched into requests of at most pblk->max_write_pgs sectors, each queued
+ * on gc_line_reader_wq with its own line reference.
+ *
+ * On failure the line is put back on its GC list. In both cases the
+ * reference held for this work item is dropped and read_inflight_gc is
+ * decremented.
+ */
+static void pblk_gc_line_prepare_ws(struct work_struct *work)
+{
+	struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
+									ws);
+	struct pblk *pblk = line_ws->pblk;
+	struct pblk_line *line = line_ws->line;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_gc *gc = &pblk->gc;
+	struct pblk_line_ws *gc_rq_ws;
+	struct pblk_gc_rq *gc_rq;
+	__le64 *lba_list;
+	unsigned long *invalid_bitmap;
+	int sec_left, nr_secs, bit;
+
+	invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!invalid_bitmap)
+		goto fail_free_ws;
+
+	if (line->w_err_gc->has_write_err) {
+		lba_list = line->w_err_gc->lba_list;
+		line->w_err_gc->lba_list = NULL;
+		/* The saved copy is handed over only once, and may never have
+		 * been allocated. Without it the sectors cannot be mapped
+		 * back to LBAs.
+		 */
+		if (!lba_list) {
+			pblk_err(pblk, "missing lba list (line %d)\n",
+								line->id);
+			goto fail_free_invalid_bitmap;
+		}
+	} else {
+		lba_list = get_lba_list_from_emeta(pblk, line);
+		if (!lba_list) {
+			pblk_err(pblk, "could not interpret emeta (line %d)\n",
+					line->id);
+			goto fail_free_invalid_bitmap;
+		}
+	}
+
+	spin_lock(&line->lock);
+	bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line);
+	sec_left = pblk_line_vsc(line);
+	spin_unlock(&line->lock);
+
+	if (sec_left < 0) {
+		pblk_err(pblk, "corrupted GC line (%d)\n", line->id);
+		goto fail_free_lba_list;
+	}
+
+	bit = -1;
+next_rq:
+	gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL);
+	if (!gc_rq)
+		goto fail_free_lba_list;
+
+	/* Collect the next batch of sectors that are still valid */
+	nr_secs = 0;
+	do {
+		bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line,
+								bit + 1);
+		if (bit > line->emeta_ssec)
+			break;
+
+		gc_rq->paddr_list[nr_secs] = bit;
+		gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]);
+	} while (nr_secs < pblk->max_write_pgs);
+
+	if (unlikely(!nr_secs)) {
+		kfree(gc_rq);
+		goto out;
+	}
+
+	gc_rq->nr_secs = nr_secs;
+	gc_rq->line = line;
+
+	gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
+	if (!gc_rq_ws)
+		goto fail_free_gc_rq;
+
+	gc_rq_ws->pblk = pblk;
+	gc_rq_ws->line = line;
+	gc_rq_ws->priv = gc_rq;
+
+	/* The write GC path can be much slower than the read GC one due to
+	 * the budget imposed by the rate-limiter. Balance in case that we get
+	 * back pressure from the write GC path.
+	 */
+	while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000)))
+		io_schedule();
+
+	kref_get(&line->ref);
+
+	INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws);
+	queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws);
+
+	sec_left -= nr_secs;
+	if (sec_left > 0)
+		goto next_rq;
+
+out:
+	pblk_mfree(lba_list, l_mg->emeta_alloc_type);
+	kfree(line_ws);
+	kfree(invalid_bitmap);
+
+	kref_put(&line->ref, pblk_line_put);
+	atomic_dec(&gc->read_inflight_gc);
+
+	return;
+
+fail_free_gc_rq:
+	kfree(gc_rq);
+fail_free_lba_list:
+	pblk_mfree(lba_list, l_mg->emeta_alloc_type);
+fail_free_invalid_bitmap:
+	kfree(invalid_bitmap);
+fail_free_ws:
+	kfree(line_ws);
+
+	pblk_put_line_back(pblk, line);
+	kref_put(&line->ref, pblk_line_put);
+	atomic_dec(&gc->read_inflight_gc);
+
+	pblk_err(pblk, "failed to GC line %d\n", line->id);
+}
+
+/*
+ * Hand a victim line over to the GC reader workqueue.
+ *
+ * Allocates a work item describing @line, bumps the count of lines in the GC
+ * pipeline and queues pblk_gc_line_prepare_ws() on gc->gc_reader_wq.
+ *
+ * Returns 0 on success, or -ENOMEM if the work item cannot be allocated; in
+ * that case nothing is queued and the pipeline counter is left untouched.
+ */
+static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_ws *prep_ws;
+
+	pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id);
+
+	prep_ws = kmalloc(sizeof(*prep_ws), GFP_KERNEL);
+	if (!prep_ws)
+		return -ENOMEM;
+
+	prep_ws->pblk = pblk;
+	prep_ws->line = line;
+
+	atomic_inc(&pblk->gc.pipeline_gc);
+	INIT_WORK(&prep_ws->ws, pblk_gc_line_prepare_ws);
+	queue_work(pblk->gc.gc_reader_wq, &prep_ws->ws);
+
+	return 0;
+}
+
+/* Wake the GC reader kthread (the one running pblk_gc_reader_ts()). */
+static void pblk_gc_reader_kick(struct pblk_gc *gc)
+{
+	struct task_struct *reader = gc->gc_reader_ts;
+
+	wake_up_process(reader);
+}
+
+/*
+ * Kick the whole GC machinery: the writer and reader threads are always woken;
+ * the main GC thread is woken and the periodic timer re-armed only while GC is
+ * enabled.
+ */
+static void pblk_gc_kick(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+
+	pblk_gc_writer_kick(gc);
+	pblk_gc_reader_kick(gc);
+
+	/* If we're shutting down GC, let's not start it up again */
+	if (!gc->gc_enabled)
+		return;
+
+	wake_up_process(gc->gc_ts);
+	mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
+}
+
+/*
+ * Pop one line from the GC read list and start reclaiming it.
+ *
+ * Returns 1 when there is nothing to do right now: either the read list is
+ * empty, or the line could not be queued and was put back on the list. The
+ * reader kthread then sleeps until it is kicked again. Returns 0 when a line
+ * was dispatched and the caller should immediately try the next one.
+ */
+static int pblk_gc_read(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+	struct pblk_line *line;
+
+	spin_lock(&gc->r_lock);
+	if (list_empty(&gc->r_list)) {
+		spin_unlock(&gc->r_lock);
+		return 1;
+	}
+
+	line = list_first_entry(&gc->r_list, struct pblk_line, list);
+	list_del(&line->list);
+	spin_unlock(&gc->r_lock);
+
+	pblk_gc_kick(pblk);
+
+	if (pblk_gc_line(pblk, line)) {
+		pblk_err(pblk, "failed to GC line %d\n", line->id);
+
+		/*
+		 * pblk_gc_line() only fails before anything is queued. By now
+		 * the line is off its group list, marked as being in GC and
+		 * accounted in read_inflight_gc, so dropping it here would
+		 * strand it. Put it back on the read list and report "nothing
+		 * to do" so the reader sleeps instead of spinning on the
+		 * allocation failure; the next kick retries it.
+		 */
+		spin_lock(&gc->r_lock);
+		list_add_tail(&line->list, &gc->r_list);
+		spin_unlock(&gc->r_lock);
+
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Pick the line with the fewest valid sectors from @group_list.
+ *
+ * The list must not be empty (the visible caller checks this while holding
+ * l_mg->gc_lock). On ties the entry closest to the head of the list wins.
+ */
+static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk,
+						 struct list_head *group_list)
+{
+	struct pblk_line *best = list_first_entry(group_list, struct pblk_line,
+						  list);
+	struct pblk_line *cur;
+
+	list_for_each_entry(cur, group_list, list) {
+		/* Counts are compared as signed ints, as stored in the locals */
+		int cur_vsc = le32_to_cpu(*cur->vsc);
+		int best_vsc = le32_to_cpu(*best->vsc);
+
+		if (cur_vsc < best_vsc)
+			best = cur;
+	}
+
+	return best;
+}
+
+/*
+ * Decide whether GC has work to do: always when lines with write errors are
+ * pending, otherwise only while GC is active and the number of free blocks is
+ * below the rate-limiter's high threshold.
+ */
+static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl)
+{
+	unsigned int pending_werr = atomic_read(&rl->werr_lines);
+	unsigned int blks_wanted = pblk_rl_high_thrs(rl);
+	unsigned int blks_avail = pblk_rl_nr_free_blks(rl);
+
+	/* This is not critical, no need to take lock here */
+	if (pending_werr > 0)
+		return true;
+
+	if (!gc->gc_active)
+		return false;
+
+	return blks_wanted > blks_avail;
+}
+
+/*
+ * Drain l_mg->gc_full_list: every line on it is moved to GC state, unlinked,
+ * counted in the GC pipeline and has one reference dropped.
+ *
+ * gc_lock is released around the reference drop and re-taken before looking
+ * at the list again.
+ */
+void pblk_gc_free_full_lines(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_gc *gc = &pblk->gc;
+	struct pblk_line *full;
+
+	spin_lock(&l_mg->gc_lock);
+	while (!list_empty(&l_mg->gc_full_list)) {
+		full = list_first_entry(&l_mg->gc_full_list,
+					struct pblk_line, list);
+
+		spin_lock(&full->lock);
+		WARN_ON(full->state != PBLK_LINESTATE_CLOSED);
+		full->state = PBLK_LINESTATE_GC;
+		spin_unlock(&full->lock);
+
+		list_del(&full->list);
+		spin_unlock(&l_mg->gc_lock);
+
+		atomic_inc(&gc->pipeline_gc);
+		kref_put(&full->ref, pblk_line_put);
+
+		spin_lock(&l_mg->gc_lock);
+	}
+	spin_unlock(&l_mg->gc_lock);
+}
+
+/*
+ * One pass of the main GC thread.
+ *
+ * Lines on the "full" GC list are released first, unconditionally. After that,
+ * and only if pblk_gc_should_run() agrees and fewer than PBLK_GC_L_QD lines are
+ * already in flight, victim lines are moved from the GC group lists to the
+ * reader list. Groups are visited in gc_lists[] order; a later group is only
+ * considered when nothing was taken from the earlier ones and the rate
+ * limiter's rb_state is above the group index.
+ */
+static void pblk_gc_run(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_gc *gc = &pblk->gc;
+	struct pblk_line *victim;
+	struct list_head *group_list;
+	int inflight, gc_group = 0, queued = 0;
+
+	pblk_gc_free_full_lines(pblk);
+
+	if (!pblk_gc_should_run(gc, &pblk->rl))
+		return;
+	if (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD)
+		return;
+
+	do {
+		group_list = l_mg->gc_lists[gc_group++];
+
+		for (;;) {
+			spin_lock(&l_mg->gc_lock);
+			if (list_empty(group_list)) {
+				spin_unlock(&l_mg->gc_lock);
+				break;
+			}
+
+			victim = pblk_gc_get_victim_line(pblk, group_list);
+
+			spin_lock(&victim->lock);
+			WARN_ON(victim->state != PBLK_LINESTATE_CLOSED);
+			victim->state = PBLK_LINESTATE_GC;
+			spin_unlock(&victim->lock);
+
+			list_del(&victim->list);
+			spin_unlock(&l_mg->gc_lock);
+
+			/* Hand the line over to the reader thread */
+			spin_lock(&gc->r_lock);
+			list_add_tail(&victim->list, &gc->r_list);
+			spin_unlock(&gc->r_lock);
+
+			inflight = atomic_inc_return(&gc->read_inflight_gc);
+			pblk_gc_reader_kick(gc);
+
+			queued = 1;
+
+			/* No need to queue up more GC lines than we can handle */
+			if (!pblk_gc_should_run(gc, &pblk->rl) ||
+			    inflight >= PBLK_GC_L_QD)
+				break;
+		}
+	} while (!queued && pblk->rl.rb_state > gc_group &&
+		 gc_group < PBLK_GC_NR_LISTS);
+}
+
+/* Periodic GC timer callback: recover the owning pblk instance and kick GC. */
+static void pblk_gc_timer(struct timer_list *t)
+{
+	struct pblk *pblk;
+
+	pblk = from_timer(pblk, t, gc.gc_timer);
+	pblk_gc_kick(pblk);
+}
+
+/*
+ * Main GC kthread: run one GC pass, then sleep interruptibly until woken
+ * again. Exits with 0 once the thread is asked to stop.
+ */
+static int pblk_gc_ts(void *data)
+{
+	struct pblk *pblk = data;
+
+	for (;;) {
+		if (kthread_should_stop())
+			break;
+
+		pblk_gc_run(pblk);
+		set_current_state(TASK_INTERRUPTIBLE);
+		io_schedule();
+	}
+
+	return 0;
+}
+
+/*
+ * GC writer kthread: keep calling pblk_gc_write() for as long as it returns
+ * zero; on a non-zero return sleep interruptibly until woken.
+ */
+static int pblk_gc_writer_ts(void *data)
+{
+	struct pblk *pblk = data;
+
+	while (!kthread_should_stop()) {
+		if (pblk_gc_write(pblk)) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			io_schedule();
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * GC reader kthread: dispatch lines from the read list while pblk_gc_read()
+ * returns zero, sleep otherwise. Once asked to stop, the thread does not exit
+ * until the pipeline_gc counter has dropped to zero.
+ */
+static int pblk_gc_reader_ts(void *data)
+{
+	struct pblk *pblk = data;
+	struct pblk_gc *gc = &pblk->gc;
+
+	while (!kthread_should_stop()) {
+		if (pblk_gc_read(pblk)) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			io_schedule();
+		}
+	}
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	pblk_info(pblk, "flushing gc pipeline, %d lines left\n",
+		atomic_read(&gc->pipeline_gc));
+#endif
+
+	/* Yield the CPU until every line has left the pipeline */
+	while (atomic_read(&gc->pipeline_gc))
+		schedule();
+
+	return 0;
+}
+
+/* Mark GC as active and log the transition. */
+static void pblk_gc_start(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+
+	gc->gc_active = 1;
+	pblk_debug(pblk, "gc start\n");
+}
+
+/* Activate and kick GC, but only if it is enabled and not already active. */
+void pblk_gc_should_start(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+
+	if (!gc->gc_enabled || gc->gc_active)
+		return;
+
+	pblk_gc_start(pblk);
+	pblk_gc_kick(pblk);
+}
+
+/* Deactivate GC unless it is being forced. */
+void pblk_gc_should_stop(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+
+	if (!gc->gc_active)
+		return;
+	if (gc->gc_forced)
+		return;
+
+	gc->gc_active = 0;
+}
+
+/* Ask the rate limiter to refresh its rates. */
+void pblk_gc_should_kick(struct pblk *pblk)
+{
+	struct pblk_rl *rl = &pblk->rl;
+
+	pblk_rl_update_rates(rl);
+}
+
+/*
+ * Report the GC enabled/active flags through @gc_enabled and @gc_active.
+ * Both flags are sampled together under gc->lock.
+ */
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+			      int *gc_active)
+{
+	struct pblk_gc *gc = &pblk->gc;
+	int enabled, active;
+
+	spin_lock(&gc->lock);
+	enabled = gc->gc_enabled;
+	active = gc->gc_active;
+	spin_unlock(&gc->lock);
+
+	*gc_enabled = enabled;
+	*gc_active = active;
+}
+
+/*
+ * Set the "forced GC" flag from sysfs.
+ *
+ * @force must be 0 or 1; anything else yields -EINVAL. The enabled flag is
+ * set to the same value as the forced flag, under gc->lock, after which GC is
+ * started if it is enabled and not yet active. Returns 0 on success.
+ */
+int pblk_gc_sysfs_force(struct pblk *pblk, int force)
+{
+	struct pblk_gc *gc = &pblk->gc;
+
+	if (force != 0 && force != 1)
+		return -EINVAL;
+
+	spin_lock(&gc->lock);
+	gc->gc_forced = force;
+	gc->gc_enabled = force ? 1 : 0;
+	spin_unlock(&gc->lock);
+
+	pblk_gc_should_start(pblk);
+
+	return 0;
+}
+
+/*
+ * Set up garbage collection for a pblk instance: state flags and counters,
+ * locks, lists, the request semaphore, the three GC kthreads (created but not
+ * woken here), the two GC workqueues and the periodic GC timer.
+ *
+ * Returns 0 on success or a negative errno. On failure every kthread and
+ * workqueue created so far is torn down and the timer is left un-armed.
+ */
+int pblk_gc_init(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+	int ret;
+
+	/*
+	 * Everything that cannot fail is initialized first, so that the
+	 * threads and the timer callback never observe uninitialized locks
+	 * or lists.
+	 */
+	gc->gc_active = 0;
+	gc->gc_forced = 0;
+	gc->gc_enabled = 1;
+	gc->w_entries = 0;
+	atomic_set(&gc->read_inflight_gc, 0);
+	atomic_set(&gc->pipeline_gc, 0);
+
+	spin_lock_init(&gc->lock);
+	spin_lock_init(&gc->w_lock);
+	spin_lock_init(&gc->r_lock);
+
+	sema_init(&gc->gc_sem, PBLK_GC_RQ_QD);
+
+	INIT_LIST_HEAD(&gc->w_list);
+	INIT_LIST_HEAD(&gc->r_list);
+
+	/* Only set up here; the timer is armed once nothing can fail anymore */
+	timer_setup(&gc->gc_timer, pblk_gc_timer, 0);
+
+	gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
+	if (IS_ERR(gc->gc_ts)) {
+		pblk_err(pblk, "could not allocate GC main kthread\n");
+		return PTR_ERR(gc->gc_ts);
+	}
+
+	gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
+							"pblk-gc-writer-ts");
+	if (IS_ERR(gc->gc_writer_ts)) {
+		pblk_err(pblk, "could not allocate GC writer kthread\n");
+		ret = PTR_ERR(gc->gc_writer_ts);
+		goto fail_free_main_kthread;
+	}
+
+	gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk,
+							"pblk-gc-reader-ts");
+	if (IS_ERR(gc->gc_reader_ts)) {
+		pblk_err(pblk, "could not allocate GC reader kthread\n");
+		ret = PTR_ERR(gc->gc_reader_ts);
+		goto fail_free_writer_kthread;
+	}
+
+	/* Workqueue that reads valid sectors from a line and submits them to
+	 * the GC writer to be recycled.
+	 */
+	gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq",
+			WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS);
+	if (!gc->gc_line_reader_wq) {
+		pblk_err(pblk, "could not allocate GC line reader workqueue\n");
+		ret = -ENOMEM;
+		goto fail_free_reader_kthread;
+	}
+
+	/* Workqueue that prepares lines for GC */
+	gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq",
+					WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!gc->gc_reader_wq) {
+		pblk_err(pblk, "could not allocate GC reader workqueue\n");
+		ret = -ENOMEM;
+		goto fail_free_reader_line_wq;
+	}
+
+	/*
+	 * Arm the timer last: its callback kicks the GC threads, which must
+	 * not happen on an error path after they have been stopped.
+	 */
+	mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
+
+	return 0;
+
+fail_free_reader_line_wq:
+	destroy_workqueue(gc->gc_line_reader_wq);
+fail_free_reader_kthread:
+	kthread_stop(gc->gc_reader_ts);
+fail_free_writer_kthread:
+	kthread_stop(gc->gc_writer_ts);
+fail_free_main_kthread:
+	kthread_stop(gc->gc_ts);
+
+	return ret;
+}
+
+/*
+ * Tear down garbage collection.
+ *
+ * GC is disabled and its timer deleted before anything else. The main and
+ * reader threads are stopped next, then the workqueues are (optionally
+ * flushed and) destroyed, and the writer thread is stopped last. With
+ * @graceful set, pending work on both GC workqueues is flushed before they
+ * are destroyed.
+ */
+void pblk_gc_exit(struct pblk *pblk, bool graceful)
+{
+	struct pblk_gc *gc = &pblk->gc;
+
+	/* Stop new GC rounds from being scheduled */
+	gc->gc_enabled = 0;
+	del_timer_sync(&gc->gc_timer);
+	gc->gc_active = 0;
+
+	if (gc->gc_ts != NULL)
+		kthread_stop(gc->gc_ts);
+
+	if (gc->gc_reader_ts != NULL)
+		kthread_stop(gc->gc_reader_ts);
+
+	if (graceful) {
+		flush_workqueue(gc->gc_reader_wq);
+		flush_workqueue(gc->gc_line_reader_wq);
+	}
+
+	destroy_workqueue(gc->gc_reader_wq);
+	destroy_workqueue(gc->gc_line_reader_wq);
+
+	if (gc->gc_writer_ts != NULL)
+		kthread_stop(gc->gc_writer_ts);
+}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
new file mode 100644
index 000000000..88b632787
--- /dev/null
+++ b/drivers/lightnvm/pblk-init.c
@@ -0,0 +1,1366 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-init.c - pblk's initialization.
+ */
+
+#include "pblk.h"
+
+/* Requested write buffer size, in entries; 0 leaves the computed default */
+static unsigned int write_buffer_size;
+
+module_param(write_buffer_size, uint, 0644);
+MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer");
+
+/* Slab caches created by pblk_init_global_caches() */
+static struct kmem_cache *pblk_ws_cache;
+static struct kmem_cache *pblk_rec_cache;
+static struct kmem_cache *pblk_g_rq_cache;
+static struct kmem_cache *pblk_w_rq_cache;
+
+/* Taken for writing while the caches above are being created */
+static DECLARE_RWSEM(pblk_lock);
+
+struct bio_set pblk_bio_set;
+
+/*
+ * Dispatch a read or write bio.
+ *
+ * Writes go to the write cache and are split first only when they exceed the
+ * rate limiter's maximum I/O size. Reads are always passed through
+ * blk_queue_split() before being submitted; a cloned bio that completed
+ * synchronously (NVM_IO_DONE) has its reference dropped here.
+ *
+ * Returns the NVM_IO_* status of the underlying submission.
+ */
+static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
+			  struct bio *bio)
+{
+	int status;
+
+	if (bio_data_dir(bio) != READ) {
+		/* Prevent deadlock in the case of a modest LUN configuration
+		 * and large user I/Os. Unless stalled, the rate limiter
+		 * leaves at least 256KB available for user I/O.
+		 */
+		if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
+			blk_queue_split(q, &bio);
+
+		return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
+	}
+
+	/* Read requests must be <= 256kb due to NVMe's 64 bit completion
+	 * bitmap constraint. Writes can be of arbitrary size.
+	 */
+	blk_queue_split(q, &bio);
+	status = pblk_submit_read(pblk, bio);
+	if (status == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
+		bio_put(bio);
+
+	return status;
+}
+
+/*
+ * make_request entry point for a pblk target.
+ *
+ * Discards are handled inline and completed immediately unless they also
+ * carry REQ_PREFLUSH, in which case they continue down the read/write path.
+ * The bio is completed here when that path reports NVM_IO_ERR or NVM_IO_DONE;
+ * any other status leaves completion to someone else. Always returns
+ * BLK_QC_T_NONE.
+ */
+static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
+{
+	struct pblk *pblk = q->queuedata;
+	int status;
+
+	if (bio_op(bio) == REQ_OP_DISCARD) {
+		pblk_discard(pblk, bio);
+		if (!(bio->bi_opf & REQ_PREFLUSH)) {
+			bio_endio(bio);
+			return BLK_QC_T_NONE;
+		}
+	}
+
+	status = pblk_rw_io(q, pblk, bio);
+	if (status == NVM_IO_ERR)
+		bio_io_error(bio);
+	else if (status == NVM_IO_DONE)
+		bio_endio(bio);
+
+	return BLK_QC_T_NONE;
+}
+
+/*
+ * Size in bytes of the L2P translation map: one entry per sector, 4 bytes
+ * wide when the address format is shorter than 32 bits and 8 bytes otherwise.
+ */
+static size_t pblk_trans_map_size(struct pblk *pblk)
+{
+	const int entry_size = (pblk->addrf_len < 32) ? 4 : 8;
+
+	return entry_size * pblk->rl.nr_secs;
+}
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+/* CRC32 (seed ~0) over the whole L2P translation map; debug builds only. */
+static u32 pblk_l2p_crc(struct pblk *pblk)
+{
+	size_t len = pblk_trans_map_size(pblk);
+
+	return crc32_le(~(u32)0, pblk->trans_map, len);
+}
+#endif
+
+/* Release the vmalloc'ed L2P translation map. */
+static void pblk_l2p_free(struct pblk *pblk)
+{
+	void *map = pblk->trans_map;
+
+	vfree(map);
+}
+
+/*
+ * Bring the L2P table to a usable state.
+ *
+ * On a factory init only a new instance UUID is set up; otherwise the table
+ * is recovered from the device. Full lines are then released, and if recovery
+ * did not hand back a line, the first data line is set up.
+ *
+ * Returns 0 on success and -EFAULT if recovery fails or no first data line
+ * can be obtained.
+ */
+static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
+{
+	struct pblk_line *line = NULL;
+
+	if (!factory_init) {
+		line = pblk_recov_l2p(pblk);
+		if (IS_ERR(line)) {
+			pblk_err(pblk, "could not recover l2p table\n");
+			return -EFAULT;
+		}
+	} else {
+		pblk_setup_uuid(pblk);
+	}
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#endif
+
+	/* Free full lines directly as GC has not been started yet */
+	pblk_gc_free_full_lines(pblk);
+
+	if (line)
+		return 0;
+
+	/* Configure next line for user data */
+	line = pblk_line_get_first_data(pblk);
+
+	return line ? 0 : -EFAULT;
+}
+
+/*
+ * Allocate the L2P translation map, mark every entry as empty and run L2P
+ * recovery on top of it.
+ *
+ * Returns 0 on success, -ENOMEM if the map cannot be allocated, or the error
+ * from pblk_l2p_recover(), in which case the map is freed again.
+ */
+static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
+{
+	struct ppa_addr empty;
+	sector_t lba;
+	int ret;
+
+	pblk->trans_map = vmalloc(pblk_trans_map_size(pblk));
+	if (!pblk->trans_map)
+		return -ENOMEM;
+
+	pblk_ppa_set_empty(&empty);
+
+	for (lba = 0; lba < pblk->rl.nr_secs; lba++)
+		pblk_trans_map_set(pblk, lba, empty);
+
+	ret = pblk_l2p_recover(pblk, factory_init);
+	if (ret)
+		vfree(pblk->trans_map);
+
+	return ret;
+}
+
+/*
+ * Release the write buffer: run the tear-down consistency check (a failure is
+ * only logged), free the buffer data and finally the entry array.
+ */
+static void pblk_rwb_free(struct pblk *pblk)
+{
+	struct pblk_rb *rb = &pblk->rwb;
+
+	if (pblk_rb_tear_down_check(rb))
+		pblk_err(pblk, "write buffer error on tear down\n");
+
+	pblk_rb_data_free(rb);
+	vfree(pblk_rb_entries_ref(rb));
+}
+
+/*
+ * Size and create the write ring buffer.
+ *
+ * The default size is derived from the device geometry; the
+ * write_buffer_size module parameter is honoured only when it asks for more
+ * than that default. The final entry count comes from
+ * pblk_rb_calculate_size().
+ *
+ * Returns 0 on success, -ENOMEM if the entry array cannot be allocated, or
+ * the error reported by pblk_rb_init().
+ */
+static int pblk_rwb_init(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_rb_entry *entries;
+	unsigned long nr_entries, buffer_size;
+	unsigned int power_size, power_seg_sz;
+	int pgs_in_buffer;
+	int ret;
+
+	pgs_in_buffer = (max(geo->mw_cunits, geo->ws_opt) + geo->ws_opt)
+								* geo->all_luns;
+
+	if (write_buffer_size && (write_buffer_size > pgs_in_buffer))
+		buffer_size = write_buffer_size;
+	else
+		buffer_size = pgs_in_buffer;
+
+	nr_entries = pblk_rb_calculate_size(buffer_size);
+
+	entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry)));
+	if (!entries)
+		return -ENOMEM;
+
+	power_size = get_count_order(nr_entries);
+	power_seg_sz = get_count_order(geo->csecs);
+
+	ret = pblk_rb_init(&pblk->rwb, entries, power_size, power_seg_sz);
+	if (ret) {
+		/*
+		 * The entry array was allocated here, so release it here when
+		 * the ring buffer could not be set up; otherwise it leaks.
+		 * NOTE(review): assumes pblk_rb_init() does not free @entries
+		 * itself on failure -- confirm against pblk-rb.c.
+		 */
+		vfree(entries);
+	}
+
+	return ret;
+}
+
+/* Minimum pages needed within a LUN. NOTE(review): no user is visible in this part of the file -- confirm it is still referenced. */
+#define ADDR_POOL_SIZE 64
+
+/*
+ * Build the OCSSD 1.2 address format used by this pblk instance.
+ *
+ * Channel and LUN field widths are recomputed from the geometry and must
+ * describe power-of-two counts; the remaining widths are copied from the
+ * device format. Fields are packed from bit 0 upwards in the order
+ * sector, plane, channel, LUN, page, block.
+ *
+ * Returns the total number of address bits, or -EINVAL for a channel or LUN
+ * count that is not a power of two.
+ */
+static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
+			     struct nvm_addrf_12 *dst)
+{
+	struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
+	int order;
+
+	/* Re-calculate channel and lun format to adapt to configuration */
+	order = get_count_order(geo->num_ch);
+	if (geo->num_ch != 1 << order) {
+		pblk_err(pblk, "supports only power-of-two channel config.\n");
+		return -EINVAL;
+	}
+	dst->ch_len = order;
+
+	order = get_count_order(geo->num_lun);
+	if (geo->num_lun != 1 << order) {
+		pblk_err(pblk, "supports only power-of-two LUN config.\n");
+		return -EINVAL;
+	}
+	dst->lun_len = order;
+
+	/* The other widths are taken over from the device as they are */
+	dst->sec_len = src->sec_len;
+	dst->pln_len = src->pln_len;
+	dst->pg_len = src->pg_len;
+	dst->blk_len = src->blk_len;
+
+	/* Each field starts where the previous one ends */
+	dst->sec_offset = 0;
+	dst->pln_offset = dst->sec_len;
+	dst->ch_offset = dst->pln_offset + dst->pln_len;
+	dst->lun_offset = dst->ch_offset + dst->ch_len;
+	dst->pg_offset = dst->lun_offset + dst->lun_len;
+	dst->blk_offset = dst->pg_offset + dst->pg_len;
+
+	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
+	dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset;
+	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
+	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
+	dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset;
+	dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset;
+
+	return dst->blk_offset + dst->blk_len;
+}
+
+/*
+ * Build the OCSSD 2.0 address format (@adst) and the striping description
+ * (@udst) used by this pblk instance.
+ *
+ * Channel and LUN widths are derived from the geometry; chunk and sector
+ * widths are copied from the device format. Fields are packed from bit 0
+ * upwards in the order sector, channel, LUN, chunk.
+ *
+ * Returns the total number of address bits.
+ */
+static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst,
+			     struct pblk_addrf *udst)
+{
+	struct nvm_addrf *src = &geo->addrf;
+
+	/* Field widths */
+	adst->ch_len = get_count_order(geo->num_ch);
+	adst->lun_len = get_count_order(geo->num_lun);
+	adst->chk_len = src->chk_len;
+	adst->sec_len = src->sec_len;
+
+	/* Field positions */
+	adst->sec_offset = 0;
+	adst->ch_offset = adst->sec_len;
+	adst->lun_offset = adst->ch_offset + adst->ch_len;
+	adst->chk_offset = adst->lun_offset + adst->lun_len;
+
+	/* Field masks */
+	adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset;
+	adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset;
+	adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset;
+	adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset;
+
+	/* Striping: sectors first, then channels, then LUNs */
+	udst->sec_stripe = geo->ws_opt;
+	udst->ch_stripe = geo->num_ch;
+	udst->lun_stripe = geo->num_lun;
+
+	udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe;
+	udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe;
+
+	return adst->chk_offset + adst->chk_len;
+}
+
+/*
+ * Select and build the address format matching the device's OCSSD revision
+ * and record its length in pblk->addrf_len.
+ *
+ * For 1.2 devices the sectors per chunk must be a multiple of the minimum
+ * write size. Returns 0 on success, -EINVAL for a bad 1.2 sector/page
+ * configuration or an unsupported revision.
+ */
+static int pblk_set_addrf(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	int mod;
+
+	if (geo->version == NVM_OCSSD_SPEC_12) {
+		div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
+		if (mod) {
+			pblk_err(pblk, "bad configuration of sectors/pages\n");
+			return -EINVAL;
+		}
+
+		pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
+							(void *)&pblk->addrf);
+	} else if (geo->version == NVM_OCSSD_SPEC_20) {
+		pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
+							&pblk->uaddrf);
+	} else {
+		pblk_err(pblk, "OCSSD revision not supported (%d)\n",
+								geo->version);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Create the four file-scope slab caches while holding pblk_lock for writing.
+ *
+ * Returns 0 on success. If any cache cannot be created, the caches created
+ * before it are destroyed again and -ENOMEM is returned.
+ */
+static int pblk_init_global_caches(struct pblk *pblk)
+{
+	int ret = -ENOMEM;
+
+	down_write(&pblk_lock);
+
+	pblk_ws_cache = kmem_cache_create("pblk_blk_ws",
+				sizeof(struct pblk_line_ws), 0, 0, NULL);
+	if (!pblk_ws_cache)
+		goto out_unlock;
+
+	pblk_rec_cache = kmem_cache_create("pblk_rec",
+				sizeof(struct pblk_rec_ctx), 0, 0, NULL);
+	if (!pblk_rec_cache) {
+		kmem_cache_destroy(pblk_ws_cache);
+		goto out_unlock;
+	}
+
+	pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size,
+				0, 0, NULL);
+	if (!pblk_g_rq_cache) {
+		kmem_cache_destroy(pblk_ws_cache);
+		kmem_cache_destroy(pblk_rec_cache);
+		goto out_unlock;
+	}
+
+	pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
+				0, 0, NULL);
+	if (!pblk_w_rq_cache) {
+		kmem_cache_destroy(pblk_ws_cache);
+		kmem_cache_destroy(pblk_rec_cache);
+		kmem_cache_destroy(pblk_g_rq_cache);
+		goto out_unlock;
+	}
+
+	ret = 0;
+
+out_unlock:
+	up_write(&pblk_lock);
+
+	return ret;
+}
+
+/* Destroy the four file-scope slab caches, in the order they were created. */
+static void pblk_free_global_caches(struct pblk *pblk)
+{
+	struct kmem_cache *caches[] = {
+		pblk_ws_cache,
+		pblk_rec_cache,
+		pblk_g_rq_cache,
+		pblk_w_rq_cache,
+	};
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(caches); i++)
+		kmem_cache_destroy(caches[i]);
+}
+
+/*
+ * Initialize the core resources of a pblk instance: write-amplification and
+ * flush counters, write size limits, the padding distribution array, the
+ * global slab caches, the mempools, the close/bad-block/read-end workqueues
+ * and the address format.
+ *
+ * Returns 0 on success or a negative errno describing the step that failed
+ * (-EINVAL for an oversized vector list or an unsupported address format,
+ * -ENOMEM for allocation failures, or whatever a mempool init reports).
+ * Everything set up before the failing step is released again.
+ */
+static int pblk_core_init(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	int ret, max_write_ppas;
+
+	atomic64_set(&pblk->user_wa, 0);
+	atomic64_set(&pblk->pad_wa, 0);
+	atomic64_set(&pblk->gc_wa, 0);
+	pblk->user_rst_wa = 0;
+	pblk->pad_rst_wa = 0;
+	pblk->gc_rst_wa = 0;
+
+	atomic64_set(&pblk->nr_flush, 0);
+	pblk->nr_flush_rst = 0;
+
+	/* Largest write: bounded by geometry, NVM_MAX_VLBA and the queue */
+	pblk->min_write_pgs = geo->ws_opt;
+	max_write_ppas = pblk->min_write_pgs * geo->all_luns;
+	pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
+	pblk->max_write_pgs = min_t(int, pblk->max_write_pgs,
+		queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT));
+	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
+
+	if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+		pblk_err(pblk, "vector list too big(%u > %u)\n",
+				pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
+		return -EINVAL;
+	}
+
+	pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
+								GFP_KERNEL);
+	if (!pblk->pad_dist)
+		return -ENOMEM;
+
+	ret = pblk_init_global_caches(pblk);
+	if (ret)
+		goto fail_free_pad_dist;
+
+	/* Internal bios can be at most the sectors signaled by the device. */
+	ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0);
+	if (ret)
+		goto free_global_caches;
+
+	ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE,
+				     pblk_ws_cache);
+	if (ret)
+		goto free_page_bio_pool;
+
+	ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns,
+				     pblk_rec_cache);
+	if (ret)
+		goto free_gen_ws_pool;
+
+	ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns,
+				     pblk_g_rq_cache);
+	if (ret)
+		goto free_rec_pool;
+
+	ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns,
+				     pblk_g_rq_cache);
+	if (ret)
+		goto free_r_rq_pool;
+
+	ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns,
+				     pblk_w_rq_cache);
+	if (ret)
+		goto free_e_rq_pool;
+
+	/* alloc_workqueue() reports failure as NULL, so name the error here */
+	ret = -ENOMEM;
+
+	pblk->close_wq = alloc_workqueue("pblk-close-wq",
+			WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS);
+	if (!pblk->close_wq)
+		goto free_w_rq_pool;
+
+	pblk->bb_wq = alloc_workqueue("pblk-bb-wq",
+			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+	if (!pblk->bb_wq)
+		goto free_close_wq;
+
+	pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq",
+			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+	if (!pblk->r_end_wq)
+		goto free_bb_wq;
+
+	/* Report the real reason (e.g. -EINVAL) rather than -ENOMEM */
+	ret = pblk_set_addrf(pblk);
+	if (ret)
+		goto free_r_end_wq;
+
+	INIT_LIST_HEAD(&pblk->compl_list);
+	INIT_LIST_HEAD(&pblk->resubmit_list);
+
+	return 0;
+
+free_r_end_wq:
+	destroy_workqueue(pblk->r_end_wq);
+free_bb_wq:
+	destroy_workqueue(pblk->bb_wq);
+free_close_wq:
+	destroy_workqueue(pblk->close_wq);
+free_w_rq_pool:
+	mempool_exit(&pblk->w_rq_pool);
+free_e_rq_pool:
+	mempool_exit(&pblk->e_rq_pool);
+free_r_rq_pool:
+	mempool_exit(&pblk->r_rq_pool);
+free_rec_pool:
+	mempool_exit(&pblk->rec_pool);
+free_gen_ws_pool:
+	mempool_exit(&pblk->gen_ws_pool);
+free_page_bio_pool:
+	mempool_exit(&pblk->page_bio_pool);
+free_global_caches:
+	pblk_free_global_caches(pblk);
+fail_free_pad_dist:
+	kfree(pblk->pad_dist);
+	return ret;
+}
+
+/*
+ * Release what pblk_core_init() set up: the workqueues that exist, the
+ * mempools, the global slab caches and the padding distribution array.
+ */
+static void pblk_core_free(struct pblk *pblk)
+{
+	struct workqueue_struct *wqs[] = {
+		pblk->close_wq,
+		pblk->r_end_wq,
+		pblk->bb_wq,
+	};
+	mempool_t *pools[] = {
+		&pblk->page_bio_pool,
+		&pblk->gen_ws_pool,
+		&pblk->rec_pool,
+		&pblk->r_rq_pool,
+		&pblk->e_rq_pool,
+		&pblk->w_rq_pool,
+	};
+	unsigned int i;
+
+	/* A workqueue pointer may be NULL; skip those */
+	for (i = 0; i < ARRAY_SIZE(wqs); i++) {
+		if (wqs[i])
+			destroy_workqueue(wqs[i]);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(pools); i++)
+		mempool_exit(pools[i]);
+
+	pblk_free_global_caches(pblk);
+	kfree(pblk->pad_dist);
+}
+
+/*
+ * Free the line-management metadata: the bad-block template and auxiliary
+ * bitmaps, the valid-sector-count list and the per-data-line smeta/emeta
+ * buffers.
+ */
+static void pblk_line_mg_free(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	int slot;
+
+	kfree(l_mg->bb_template);
+	kfree(l_mg->bb_aux);
+	kfree(l_mg->vsc_list);
+
+	for (slot = 0; slot < PBLK_DATA_LINES; slot++) {
+		kfree(l_mg->sline_meta[slot]);
+
+		/* The emeta buffer goes first, then the descriptor owning it */
+		pblk_mfree(l_mg->eline_meta[slot]->buf,
+			   l_mg->emeta_alloc_type);
+		kfree(l_mg->eline_meta[slot]);
+	}
+}
+
+/*
+ * Free the per-line metadata of @line: its block and erase bitmaps, chunk
+ * array and write-error GC context (including that context's lba list).
+ */
+static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
+				struct pblk_line *line)
+{
+	struct pblk_w_err_gc *werr = line->w_err_gc;
+
+	kfree(line->blk_bitmap);
+	kfree(line->erase_bitmap);
+	kfree(line->chks);
+
+	pblk_mfree(werr->lba_list, l_mg->emeta_alloc_type);
+	kfree(werr);
+}
+
+/*
+ * Free every line and its metadata (under l_mg->free_lock), then the
+ * line-management metadata and finally the LUN and line arrays.
+ */
+static void pblk_lines_free(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	int id;
+
+	spin_lock(&l_mg->free_lock);
+	for (id = 0; id < l_mg->nr_lines; id++) {
+		struct pblk_line *line = &pblk->lines[id];
+
+		pblk_line_free(line);
+		pblk_line_meta_free(l_mg, line);
+	}
+	spin_unlock(&l_mg->free_lock);
+
+	pblk_line_mg_free(pblk);
+
+	kfree(pblk->luns);
+	kfree(pblk->lines);
+}
+
+/*
+ * Read the bad-block table of one LUN into @blks and fold it with
+ * nvm_bb_tbl_fold().
+ *
+ * Returns 0 on success, the error from nvm_get_tgt_bb_tbl() if the table
+ * cannot be read, or -EIO if folding fails.
+ */
+static int pblk_bb_get_tbl(struct nvm_tgt_dev *dev, struct pblk_lun *rlun,
+			   u8 *blks, int nr_blks)
+{
+	struct ppa_addr lun_ppa;
+	int err;
+
+	/* Address only the channel/LUN pair; every other field stays zero */
+	lun_ppa.ppa = 0;
+	lun_ppa.g.ch = rlun->bppa.g.ch;
+	lun_ppa.g.lun = rlun->bppa.g.lun;
+
+	err = nvm_get_tgt_bb_tbl(dev, lun_ppa, blks);
+	if (err)
+		return err;
+
+	if (nvm_bb_tbl_fold(dev->parent, blks, nr_blks) < 0)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Collect the bad-block tables of all LUNs into one kmalloc'ed buffer, one
+ * region of (num_chk * pln_mode) bytes per LUN, in pblk->luns[] order.
+ *
+ * Returns the buffer (to be kfree'd by the caller), ERR_PTR(-ENOMEM) if it
+ * cannot be allocated, or ERR_PTR(-EIO) if any table cannot be retrieved.
+ */
+static void *pblk_bb_get_meta(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	int blk_per_lun = geo->num_chk * geo->pln_mode;
+	int nr_blks = blk_per_lun * geo->all_luns;
+	u8 *tbl;
+	int lun;
+
+	tbl = kmalloc(nr_blks, GFP_KERNEL);
+	if (!tbl)
+		return ERR_PTR(-ENOMEM);
+
+	for (lun = 0; lun < geo->all_luns; lun++) {
+		u8 *lun_tbl = tbl + lun * blk_per_lun;
+
+		if (pblk_bb_get_tbl(dev, &pblk->luns[lun], lun_tbl,
+				    blk_per_lun)) {
+			kfree(tbl);
+			return ERR_PTR(-EIO);
+		}
+	}
+
+	return tbl;
+}
+
+/*
+ * Fetch chunk metadata in the form matching the device revision: folded
+ * bad-block tables for OCSSD 1.2, chunk info for everything else.
+ */
+static void *pblk_chunk_get_meta(struct pblk *pblk)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+
+	if (geo->version != NVM_OCSSD_SPEC_12)
+		return pblk_chunk_get_info(pblk);
+
+	return pblk_bb_get_meta(pblk);
+}
+
+/*
+ * Allocate pblk->luns[] and map each entry to a device LUN so that
+ * consecutive entries fall on consecutive channels. Every LUN gets a write
+ * semaphore initialized to 1.
+ *
+ * Returns 0 on success, -EINVAL for a negative LUN count, or -ENOMEM.
+ */
+static int pblk_luns_init(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	int idx;
+
+	/* TODO: Implement unbalanced LUN support */
+	if (geo->num_lun < 0) {
+		pblk_err(pblk, "unbalanced LUN config.\n");
+		return -EINVAL;
+	}
+
+	pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
+								GFP_KERNEL);
+	if (!pblk->luns)
+		return -ENOMEM;
+
+	for (idx = 0; idx < geo->all_luns; idx++) {
+		struct pblk_lun *rlun = &pblk->luns[idx];
+		/* Stripe across channels */
+		int ch = idx % geo->num_ch;
+		int lun_in_ch = idx / geo->num_ch;
+		int dev_lun = lun_in_ch + ch * geo->num_lun;
+
+		rlun->bppa = dev->luns[dev_lun];
+		sema_init(&rlun->wr_sem, 1);
+	}
+
+	return 0;
+}
+
+/*
+ * Compute the sector counts and byte lengths of emeta regions 1-3 and return
+ * the sum of their byte lengths. Each region is rounded up to whole sectors.
+ * Region 0 (lm->emeta_sec[0]) is only read here, to derive the number of
+ * data sectors per line. See comment over struct line_emeta definition.
+ */
+static unsigned int calc_emeta_len(struct pblk *pblk)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	size_t hdr_bytes, lba_bytes, vsc_bytes;
+
+	/* Round to sector size so that lba_list starts on its own sector */
+	hdr_bytes = sizeof(struct line_emeta) + lm->blk_bitmap_len +
+			sizeof(struct wa_counters);
+	lm->emeta_sec[1] = DIV_ROUND_UP(hdr_bytes, geo->csecs);
+	lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs;
+
+	/* Round to sector size so that vsc_list starts on its own sector */
+	lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0];
+	lba_bytes = lm->dsec_per_line * sizeof(u64);
+	lm->emeta_sec[2] = DIV_ROUND_UP(lba_bytes, geo->csecs);
+	lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs;
+
+	/* One 32-bit valid-sector count per line */
+	vsc_bytes = l_mg->nr_lines * sizeof(u32);
+	lm->emeta_sec[3] = DIV_ROUND_UP(vsc_bytes, geo->csecs);
+	lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs;
+
+	lm->vsc_list_len = vsc_bytes;
+
+	return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]);
+}
+
+/*
+ * Derive over-provisioning and capacity figures from the number of free
+ * blocks.
+ *
+ * The over-provisioning percentage is the target default unless the geometry
+ * specifies one. The rate limiter is told about all free blocks, while the
+ * exported capacity only counts the provisioned share minus the blocks
+ * needed for line metadata.
+ */
+static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct nvm_geo *geo = &dev->geo;
+	sector_t provisioned = nr_free_blks;
+	int sec_meta, blk_meta;
+
+	pblk->op = (geo->op == NVM_TARGET_DEFAULT_OP) ?
+					PBLK_DEFAULT_OP : geo->op;
+
+	/* provisioned = nr_free_blks * (100 - op) / 100 */
+	provisioned *= (100 - pblk->op);
+	sector_div(provisioned, 100);
+
+	pblk->op_blks = nr_free_blks - provisioned;
+
+	/* Internally pblk manages all free blocks, but all calculations based
+	 * on user capacity consider only provisioned blocks
+	 */
+	pblk->rl.total_blocks = nr_free_blks;
+	pblk->rl.nr_secs = nr_free_blks * geo->clba;
+
+	/* Consider sectors used for metadata */
+	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
+	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
+
+	pblk->capacity = (provisioned - blk_meta) * geo->clba;
+
+	atomic_set(&pblk->rl.free_blocks, nr_free_blks);
+	atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
+}
+
+/*
+ * Fill in the chunk descriptors of @line for an OCSSD 1.2 device from the
+ * bad-block tables in @chunk_meta, and mark offline chunks in the line's
+ * block bitmap.
+ *
+ * Returns the number of offline (bad) chunks in the line.
+ */
+static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line,
+				   void *chunk_meta)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	int chk_per_lun = geo->num_chk * geo->pln_mode;
+	int i, nr_bad_chks = 0;
+
+	for (i = 0; i < lm->blk_per_line; i++) {
+		int pos = pblk_ppa_to_pos(geo, pblk->luns[i].bppa);
+		u8 *lun_bb_meta = chunk_meta + pos * chk_per_lun;
+		struct nvm_chk_meta *chunk = &line->chks[pos];
+
+		/*
+		 * In 1.2 spec. chunk state is not persisted by the device. Thus
+		 * some of the values are reset each time pblk is instantiated,
+		 * so we have to assume that the block is closed.
+		 */
+		chunk->state = (lun_bb_meta[line->id] == NVM_BLK_T_FREE) ?
+				NVM_CHK_ST_CLOSED : NVM_CHK_ST_OFFLINE;
+
+		chunk->type = NVM_CHK_TP_W_SEQ;
+		chunk->wi = 0;
+		chunk->slba = -1;
+		chunk->cnlb = geo->clba;
+		chunk->wp = 0;
+
+		if (chunk->state & NVM_CHK_ST_OFFLINE) {
+			set_bit(pos, line->blk_bitmap);
+			nr_bad_chks++;
+		}
+	}
+
+	return nr_bad_chks;
+}
+
+/*
+ * Fill in the chunk descriptors of @line for an OCSSD 2.0 device by copying
+ * them from the device-reported chunk metadata in @meta, and mark offline
+ * chunks in the line's block bitmap. Chunks flagged NVM_CHK_TP_SZ_SPEC only
+ * trigger a one-time warning and are not counted.
+ *
+ * Returns the number of offline (bad) chunks in the line.
+ */
+static int pblk_setup_line_meta_20(struct pblk *pblk, struct pblk_line *line,
+				   struct nvm_chk_meta *meta)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	int i, nr_bad_chks = 0;
+
+	for (i = 0; i < lm->blk_per_line; i++) {
+		struct ppa_addr ppa = pblk->luns[i].bppa;
+		int pos = pblk_ppa_to_pos(geo, ppa);
+		struct nvm_chk_meta *chunk = &line->chks[pos];
+		struct nvm_chk_meta *reported;
+
+		/* The line id selects the chunk within the LUN */
+		ppa.m.chk = line->id;
+		reported = pblk_chunk_get_off(pblk, meta, ppa);
+
+		chunk->state = reported->state;
+		chunk->type = reported->type;
+		chunk->wi = reported->wi;
+		chunk->slba = reported->slba;
+		chunk->cnlb = reported->cnlb;
+		chunk->wp = reported->wp;
+
+		if (chunk->type & NVM_CHK_TP_SZ_SPEC) {
+			WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n");
+			continue;
+		}
+
+		if (chunk->state & NVM_CHK_ST_OFFLINE) {
+			set_bit(pos, line->blk_bitmap);
+			nr_bad_chks++;
+		}
+	}
+
+	return nr_bad_chks;
+}
+
+/*
+ * Initialize line @line_id and place it on the free or the bad list.
+ *
+ * The line is bad when the bad-chunk count is out of range or fewer than
+ * lm->min_blk_line chunks remain usable; it then goes to l_mg->bad_list and
+ * 0 is returned. Otherwise the line goes to l_mg->free_list, the free-line
+ * count is bumped and the number of usable chunks is returned.
+ */
+static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line,
+				 void *chunk_meta, int line_id)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	long nr_bad_chks, chk_in_line;
+	bool unusable;
+
+	line->pblk = pblk;
+	line->id = line_id;
+	line->type = PBLK_LINETYPE_FREE;
+	line->state = PBLK_LINESTATE_NEW;
+	line->gc_group = PBLK_LINEGC_NONE;
+	line->vsc = &l_mg->vsc_list[line_id];
+	spin_lock_init(&line->lock);
+
+	if (geo->version != NVM_OCSSD_SPEC_12)
+		nr_bad_chks = pblk_setup_line_meta_20(pblk, line, chunk_meta);
+	else
+		nr_bad_chks = pblk_setup_line_meta_12(pblk, line, chunk_meta);
+
+	chk_in_line = lm->blk_per_line - nr_bad_chks;
+
+	unusable = nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line ||
+		   chk_in_line < lm->min_blk_line;
+	if (unusable) {
+		line->state = PBLK_LINESTATE_BAD;
+		list_add_tail(&line->list, &l_mg->bad_list);
+		return 0;
+	}
+
+	atomic_set(&line->blk_in_line, chk_in_line);
+	list_add_tail(&line->list, &l_mg->free_list);
+	l_mg->nr_free_lines++;
+
+	return chk_in_line;
+}
+
+/*
+ * Allocate the per-line metadata of @line: zeroed block and erase bitmaps,
+ * the chunk descriptor array and a zeroed write-error GC context.
+ *
+ * Returns 0 on success or -ENOMEM, in which case everything allocated here
+ * has been freed again.
+ */
+static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+
+	line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+	if (line->blk_bitmap == NULL)
+		return -ENOMEM;
+
+	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+	if (line->erase_bitmap == NULL)
+		goto err_blk_bitmap;
+
+	line->chks = kmalloc_array(lm->blk_per_line, sizeof(*line->chks),
+				   GFP_KERNEL);
+	if (line->chks == NULL)
+		goto err_erase_bitmap;
+
+	line->w_err_gc = kzalloc(sizeof(*line->w_err_gc), GFP_KERNEL);
+	if (line->w_err_gc == NULL)
+		goto err_chks;
+
+	return 0;
+
+err_chks:
+	kfree(line->chks);
+err_erase_bitmap:
+	kfree(line->erase_bitmap);
+err_blk_bitmap:
+	kfree(line->blk_bitmap);
+	return -ENOMEM;
+}
+
+/*
+ * Set up the line management structure: counters, line lists, locks, the
+ * valid-sector-count list, the bad-block template bitmaps and the smeta/emeta
+ * buffers for the PBLK_DATA_LINES lines that can be open at once.
+ *
+ * Returns 0 on success or -ENOMEM, in which case every buffer allocated here
+ * has been released.
+ */
+static int pblk_line_mg_init(struct pblk *pblk)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	int i, bb_distance;
+
+	l_mg->nr_lines = geo->num_chk;
+	l_mg->data_line = NULL;
+	l_mg->log_line = NULL;
+	l_mg->d_seq_nr = 0;
+	l_mg->l_seq_nr = 0;
+	l_mg->nr_free_lines = 0;
+	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+
+	INIT_LIST_HEAD(&l_mg->free_list);
+	INIT_LIST_HEAD(&l_mg->corrupt_list);
+	INIT_LIST_HEAD(&l_mg->bad_list);
+	INIT_LIST_HEAD(&l_mg->gc_full_list);
+	INIT_LIST_HEAD(&l_mg->gc_high_list);
+	INIT_LIST_HEAD(&l_mg->gc_mid_list);
+	INIT_LIST_HEAD(&l_mg->gc_low_list);
+	INIT_LIST_HEAD(&l_mg->gc_empty_list);
+	INIT_LIST_HEAD(&l_mg->gc_werr_list);
+
+	INIT_LIST_HEAD(&l_mg->emeta_list);
+
+	/* Lookup table of GC lists, indexed 0..3 */
+	l_mg->gc_lists[0] = &l_mg->gc_werr_list;
+	l_mg->gc_lists[1] = &l_mg->gc_high_list;
+	l_mg->gc_lists[2] = &l_mg->gc_mid_list;
+	l_mg->gc_lists[3] = &l_mg->gc_low_list;
+
+	spin_lock_init(&l_mg->free_lock);
+	spin_lock_init(&l_mg->close_lock);
+	spin_lock_init(&l_mg->gc_lock);
+
+	l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL);
+	if (!l_mg->vsc_list)
+		goto fail;
+
+	l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!l_mg->bb_template)
+		goto fail_free_vsc_list;
+
+	l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!l_mg->bb_aux)
+		goto fail_free_bb_template;
+
+	/* smeta is always small enough to fit on a kmalloc memory allocation,
+	 * emeta depends on the number of LUNs allocated to the pblk instance
+	 */
+	for (i = 0; i < PBLK_DATA_LINES; i++) {
+		l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL);
+		if (!l_mg->sline_meta[i])
+			goto fail_free_smeta;
+	}
+
+	/* emeta allocates three different buffers for managing metadata with
+	 * in-memory and in-media layouts
+	 */
+	for (i = 0; i < PBLK_DATA_LINES; i++) {
+		struct pblk_emeta *emeta;
+
+		emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL);
+		if (!emeta)
+			goto fail_free_emeta;
+
+		/* Buffers too large for a kmalloc cache come from vmalloc */
+		if (lm->emeta_len[0] > KMALLOC_MAX_CACHE_SIZE) {
+			l_mg->emeta_alloc_type = PBLK_VMALLOC_META;
+			emeta->buf = vmalloc(lm->emeta_len[0]);
+		} else {
+			l_mg->emeta_alloc_type = PBLK_KMALLOC_META;
+			emeta->buf = kmalloc(lm->emeta_len[0], GFP_KERNEL);
+		}
+
+		if (!emeta->buf) {
+			kfree(emeta);
+			goto fail_free_emeta;
+		}
+
+		emeta->nr_entries = lm->emeta_sec[0];
+		l_mg->eline_meta[i] = emeta;
+	}
+
+	for (i = 0; i < l_mg->nr_lines; i++)
+		l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY);
+
+	/* Mark the first ws_opt sectors of every (all_luns * ws_opt) stride */
+	bb_distance = (geo->all_luns) * geo->ws_opt;
+	for (i = 0; i < lm->sec_per_line; i += bb_distance)
+		bitmap_set(l_mg->bb_template, i, geo->ws_opt);
+
+	return 0;
+
+fail_free_emeta:
+	/* i indexes the emeta slot that failed; release the ones before it */
+	for (i--; i >= 0; i--) {
+		if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
+			vfree(l_mg->eline_meta[i]->buf);
+		else
+			kfree(l_mg->eline_meta[i]->buf);
+		kfree(l_mg->eline_meta[i]);
+	}
+fail_free_smeta:
+	for (i = 0; i < PBLK_DATA_LINES; i++)
+		kfree(l_mg->sline_meta[i]);
+	kfree(l_mg->bb_aux);
+fail_free_bb_template:
+	kfree(l_mg->bb_template);
+fail_free_vsc_list:
+	kfree(l_mg->vsc_list);
+fail:
+	return -ENOMEM;
+}
+
+/*
+ * Derive the per-line geometry (sector and block counts, bitmap lengths, GC
+ * thresholds) and the sizes of the start/end line metadata from the device
+ * geometry. Returns 0, or -EINVAL when a line cannot hold the minimum number
+ * of blocks required.
+ */
+static int pblk_line_meta_init(struct pblk *pblk)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	unsigned int smeta_len, emeta_len;
+	int i;
+
+	lm->sec_per_line = geo->clba * geo->all_luns;
+	lm->blk_per_line = geo->all_luns;
+	lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
+	lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
+	lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
+	lm->mid_thrs = lm->sec_per_line / 2;
+	lm->high_thrs = lm->sec_per_line / 4;
+	lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;
+
+	/* Calculate necessary pages for smeta. See comment over struct
+	 * line_smeta definition. Grow in steps of ws_opt sectors until the
+	 * header plus the LUN bitmap fits.
+	 */
+	smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len;
+	i = 0;
+	do {
+		i++;
+		lm->smeta_sec = i * geo->ws_opt;
+		lm->smeta_len = lm->smeta_sec * geo->csecs;
+	} while (smeta_len > lm->smeta_len);
+
+	/* Calculate necessary pages for emeta. See comment over struct
+	 * line_emeta definition. The required length is re-evaluated on
+	 * every step.
+	 */
+	i = 0;
+	do {
+		i++;
+		lm->emeta_sec[0] = i * geo->ws_opt;
+		lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs;
+
+		emeta_len = calc_emeta_len(pblk);
+	} while (emeta_len > lm->emeta_len[0]);
+
+	lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;
+
+	lm->min_blk_line = 1;
+	if (geo->all_luns > 1)
+		lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
+					lm->emeta_sec[0], geo->clba);
+
+	if (lm->min_blk_line <= lm->blk_per_line)
+		return 0;
+
+	pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n",
+						lm->blk_per_line);
+	return -EINVAL;
+}
+
+/*
+ * Build the line array of a pblk instance: compute the line geometry, set up
+ * line management and LUNs, fetch the chunk metadata and initialise every
+ * line from it. Fails with -EINTR when no usable chunk is left at all.
+ * Returns 0 on success or a negative errno.
+ */
+static int pblk_lines_init(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	void *chunk_meta;
+	long free_chks = 0;
+	int i, ret;
+
+	ret = pblk_line_meta_init(pblk);
+	if (ret)
+		return ret;
+
+	ret = pblk_line_mg_init(pblk);
+	if (ret)
+		return ret;
+
+	ret = pblk_luns_init(pblk);
+	if (ret)
+		goto err_free_meta;
+
+	chunk_meta = pblk_chunk_get_meta(pblk);
+	if (IS_ERR(chunk_meta)) {
+		ret = PTR_ERR(chunk_meta);
+		goto err_free_luns;
+	}
+
+	pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
+								GFP_KERNEL);
+	if (!pblk->lines) {
+		ret = -ENOMEM;
+		goto err_free_chunk_meta;
+	}
+
+	for (i = 0; i < l_mg->nr_lines; i++) {
+		struct pblk_line *line = &pblk->lines[i];
+
+		ret = pblk_alloc_line_meta(pblk, line);
+		if (ret)
+			goto err_free_lines;
+
+		/* Accumulate the usable chunks contributed by this line */
+		free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i);
+	}
+
+	if (free_chks == 0) {
+		pblk_err(pblk, "too many bad blocks prevent for sane instance\n");
+		ret = -EINTR;
+		goto err_free_lines;
+	}
+
+	pblk_set_provision(pblk, free_chks);
+
+	/* The chunk report is only needed while setting up the lines */
+	kfree(chunk_meta);
+	return 0;
+
+err_free_lines:
+	/* i is the number of lines whose metadata was allocated */
+	for (i--; i >= 0; i--)
+		pblk_line_meta_free(l_mg, &pblk->lines[i]);
+	kfree(pblk->lines);
+err_free_chunk_meta:
+	kfree(chunk_meta);
+err_free_luns:
+	kfree(pblk->luns);
+err_free_meta:
+	pblk_line_mg_free(pblk);
+
+	return ret;
+}
+
+/*
+ * Create the writer kthread and arm the write timer to fire 100ms from now.
+ * The thread is only created here; pblk_init() wakes it up once the rest of
+ * the instance is ready.
+ *
+ * Returns 0 on success or the error reported by kthread_create(). On
+ * failure pblk->writer_ts is reset to NULL.
+ */
+static int pblk_writer_init(struct pblk *pblk)
+{
+	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
+	if (IS_ERR(pblk->writer_ts)) {
+		int err = PTR_ERR(pblk->writer_ts);
+
+		/* Do not leave an ERR_PTR behind: pblk_writer_stop() guards
+		 * kthread_stop() with a plain NULL check.
+		 */
+		pblk->writer_ts = NULL;
+
+		if (err != -EINTR)
+			pblk_err(pblk, "could not allocate writer kthread (%d)\n",
+					err);
+		return err;
+	}
+
+	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
+	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
+
+	return 0;
+}
+
+/*
+ * Stop the write timer and the writer kthread. Warns if the write buffer
+ * still holds entries that were not submitted or not synced.
+ */
+static void pblk_writer_stop(struct pblk *pblk)
+{
+	struct pblk_rb *rwb = &pblk->rwb;
+
+	/* The pipeline must be stopped and the write buffer emptied before the
+	 * write thread is stopped
+	 */
+	WARN(pblk_rb_read_count(rwb),
+			"Stopping not fully persisted write buffer\n");
+
+	WARN(pblk_rb_sync_count(rwb),
+			"Stopping not fully synced write buffer\n");
+
+	del_timer_sync(&pblk->wtimer);
+
+	if (!pblk->writer_ts)
+		return;
+
+	kthread_stop(pblk->writer_ts);
+}
+/*
+ * Release a pblk instance: calls pblk_lines_free(), pblk_l2p_free(),
+ * pblk_rwb_free() and pblk_core_free() in that order, then frees @pblk itself.
+ */
+static void pblk_free(struct pblk *pblk)
+{
+ pblk_lines_free(pblk);
+ pblk_l2p_free(pblk);
+ pblk_rwb_free(pblk);
+ pblk_core_free(pblk);
+
+ kfree(pblk);
+}
+/*
+ * Quiesce a running instance before it is freed. When @graceful is set the
+ * pipeline is flushed first; in all cases the pipeline is then stopped, the
+ * writer is stopped, pblk_rb_sync_l2p() is run on the write buffer and the
+ * rate limiter is freed.
+ */
+static void pblk_tear_down(struct pblk *pblk, bool graceful)
+{
+ if (graceful)
+ __pblk_pipeline_flush(pblk);
+ __pblk_pipeline_stop(pblk);
+ pblk_writer_stop(pblk);
+ pblk_rb_sync_l2p(&pblk->rwb);
+ pblk_rl_free(&pblk->rl);
+
+ pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful);
+}
+/*
+ * Target exit callback (tt_pblk.exit). @private is the struct pblk returned
+ * by pblk_init(). With pblk_lock held for writing, calls pblk_gc_exit(),
+ * tears the instance down and frees it; @graceful is forwarded to both.
+ */
+static void pblk_exit(void *private, bool graceful)
+{
+ struct pblk *pblk = private;
+
+ down_write(&pblk_lock);
+ pblk_gc_exit(pblk, graceful);
+ pblk_tear_down(pblk, graceful);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#endif
+
+ pblk_free(pblk);
+ up_write(&pblk_lock);
+}
+
+/*
+ * Target capacity callback (tt_pblk.capacity): the instance capacity scaled
+ * by NR_PHY_IN_LOG.
+ */
+static sector_t pblk_capacity(void *private)
+{
+	const struct pblk *pblk = private;
+	sector_t nr_sectors = pblk->capacity * NR_PHY_IN_LOG;
+
+	return nr_sectors;
+}
+
+/*
+ * Target init callback (tt_pblk.init): allocate a pblk instance on top of
+ * @dev and bring it up for the target disk @tdisk.
+ *
+ * Only OCSSD 1.2 and 2.0 geometries are accepted, and a 1.2 device that sets
+ * NVM_RSP_L2P in its dom field is rejected. Subsystems are initialised in the
+ * order core, lines, write buffer, L2P, writer thread, GC and unwound in
+ * reverse on failure. NVM_TARGET_FACTORY in @flags is forwarded to
+ * pblk_l2p_init().
+ *
+ * Returns the new instance, or an ERR_PTR() carrying a negative errno.
+ */
+static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
+		       int flags)
+{
+	struct nvm_geo *geo = &dev->geo;
+	struct request_queue *bqueue = dev->q;
+	struct request_queue *tqueue = tdisk->queue;
+	struct pblk *pblk;
+	int ret;
+
+	pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
+	if (!pblk)
+		return ERR_PTR(-ENOMEM);
+
+	pblk->dev = dev;
+	pblk->disk = tdisk;
+	pblk->state = PBLK_STATE_RUNNING;
+	pblk->gc.gc_enabled = 0;
+
+	if (!(geo->version == NVM_OCSSD_SPEC_12 ||
+					geo->version == NVM_OCSSD_SPEC_20)) {
+		pblk_err(pblk, "OCSSD version not supported (%u)\n",
+							geo->version);
+		kfree(pblk);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) {
+		pblk_err(pblk, "host-side L2P table not supported. (%x)\n",
+							geo->dom);
+		kfree(pblk);
+		return ERR_PTR(-EINVAL);
+	}
+
+	spin_lock_init(&pblk->resubmit_lock);
+	spin_lock_init(&pblk->trans_lock);
+	spin_lock_init(&pblk->lock);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	/* Debug-only I/O accounting; each counter is reset exactly once */
+	atomic_long_set(&pblk->inflight_writes, 0);
+	atomic_long_set(&pblk->padded_writes, 0);
+	atomic_long_set(&pblk->padded_wb, 0);
+	atomic_long_set(&pblk->req_writes, 0);
+	atomic_long_set(&pblk->sub_writes, 0);
+	atomic_long_set(&pblk->sync_writes, 0);
+	atomic_long_set(&pblk->inflight_reads, 0);
+	atomic_long_set(&pblk->cache_reads, 0);
+	atomic_long_set(&pblk->sync_reads, 0);
+	atomic_long_set(&pblk->recov_writes, 0);
+	atomic_long_set(&pblk->recov_gc_writes, 0);
+	atomic_long_set(&pblk->recov_gc_reads, 0);
+#endif
+
+	atomic_long_set(&pblk->read_failed, 0);
+	atomic_long_set(&pblk->read_empty, 0);
+	atomic_long_set(&pblk->read_high_ecc, 0);
+	atomic_long_set(&pblk->read_failed_gc, 0);
+	atomic_long_set(&pblk->write_failed, 0);
+	atomic_long_set(&pblk->erase_failed, 0);
+
+	ret = pblk_core_init(pblk);
+	if (ret) {
+		pblk_err(pblk, "could not initialize core\n");
+		goto fail;
+	}
+
+	ret = pblk_lines_init(pblk);
+	if (ret) {
+		pblk_err(pblk, "could not initialize lines\n");
+		goto fail_free_core;
+	}
+
+	ret = pblk_rwb_init(pblk);
+	if (ret) {
+		pblk_err(pblk, "could not initialize write buffer\n");
+		goto fail_free_lines;
+	}
+
+	ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
+	if (ret) {
+		pblk_err(pblk, "could not initialize maps\n");
+		goto fail_free_rwb;
+	}
+
+	ret = pblk_writer_init(pblk);
+	if (ret) {
+		/* -EINTR is not reported as an error here */
+		if (ret != -EINTR)
+			pblk_err(pblk, "could not initialize write thread\n");
+		goto fail_free_l2p;
+	}
+
+	ret = pblk_gc_init(pblk);
+	if (ret) {
+		pblk_err(pblk, "could not initialize gc\n");
+		goto fail_stop_writer;
+	}
+
+	/* inherit the size from the underlying device */
+	blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
+	blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
+
+	blk_queue_write_cache(tqueue, true, false);
+
+	/* Discards are advertised with a granularity of one chunk */
+	tqueue->limits.discard_granularity = geo->clba * geo->csecs;
+	tqueue->limits.discard_alignment = 0;
+	blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);
+
+	pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
+			geo->all_luns, pblk->l_mg.nr_lines,
+			(unsigned long long)pblk->rl.nr_secs,
+			pblk->rwb.nr_entries);
+
+	/* The writer thread was only created so far; start it now */
+	wake_up_process(pblk->writer_ts);
+
+	/* Check if we need to start GC */
+	pblk_gc_should_kick(pblk);
+
+	return pblk;
+
+fail_stop_writer:
+	pblk_writer_stop(pblk);
+fail_free_l2p:
+	pblk_l2p_free(pblk);
+fail_free_rwb:
+	pblk_rwb_free(pblk);
+fail_free_lines:
+	pblk_lines_free(pblk);
+fail_free_core:
+	pblk_core_free(pblk);
+fail:
+	kfree(pblk);
+	return ERR_PTR(ret);
+}
+
+/*
+ * physical block device target: the target type descriptor that
+ * pblk_module_init() registers through nvm_register_tgt_type().
+ */
+static struct nvm_tgt_type tt_pblk = {
+ .name = "pblk",
+ .version = {1, 0, 0},
+
+ .make_rq = pblk_make_rq,
+ .capacity = pblk_capacity,
+
+ .init = pblk_init,
+ .exit = pblk_exit,
+
+ .sysfs_init = pblk_sysfs_init,
+ .sysfs_exit = pblk_sysfs_exit,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Module load: set up the pblk bio set, then register the pblk target type.
+ * If registration fails the bio set is torn down again.
+ */
+static int __init pblk_module_init(void)
+{
+	int ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
+
+	if (ret)
+		return ret;
+
+	ret = nvm_register_tgt_type(&tt_pblk);
+	if (!ret)
+		return 0;
+
+	bioset_exit(&pblk_bio_set);
+	return ret;
+}
+
+/*
+ * Module unload: undo pblk_module_init() in reverse order. The target type
+ * is unregistered first so the bio set is only destroyed once the type is no
+ * longer registered.
+ */
+static void __exit pblk_module_exit(void)
+{
+	nvm_unregister_tgt_type(&tt_pblk);
+	bioset_exit(&pblk_bio_set);
+}
+
+module_init(pblk_module_init);
+module_exit(pblk_module_exit);
+MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
+MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
new file mode 100644
index 000000000..953ca31dd
--- /dev/null
+++ b/drivers/lightnvm/pblk-map.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-map.c - pblk's lba-ppa mapping strategy
+ *
+ */
+
+#include "pblk.h"
+/*
+ * Map one write unit (pblk->min_write_pgs sectors) onto the current data line.
+ *
+ * @ppa_list is filled with the device address of each allocated sector. The
+ * first @valid_secs sectors take their lba from the write buffer contexts
+ * starting at @sentry and each takes a reference on the line; the remaining
+ * sectors are recorded as ADDR_EMPTY and invalidated. lbas are stored both in
+ * @meta_list and in the lba list of the line's emeta. Finally pblk_down_rq()
+ * is called on the mapped ppas with @lun_bitmap.
+ *
+ * If the data line is full it is replaced first and the metadata of the
+ * previous line is closed. Returns -EINTR when no replacement line is
+ * available, 0 otherwise.
+ */
+static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
+ struct ppa_addr *ppa_list,
+ unsigned long *lun_bitmap,
+ struct pblk_sec_meta *meta_list,
+ unsigned int valid_secs)
+{
+ struct pblk_line *line = pblk_line_get_data(pblk);
+ struct pblk_emeta *emeta;
+ struct pblk_w_ctx *w_ctx;
+ __le64 *lba_list;
+ u64 paddr;
+ int nr_secs = pblk->min_write_pgs;
+ int i;
+
+ if (pblk_line_is_full(line)) {
+ struct pblk_line *prev_line = line;
+
+ /* If we cannot allocate a new line, make sure to store metadata
+ * on current line and then fail
+ */
+ line = pblk_line_replace_data(pblk);
+ pblk_line_close_meta(pblk, prev_line);
+
+ if (!line)
+ return -EINTR;
+ }
+
+ emeta = line->emeta;
+ lba_list = emeta_to_lbas(pblk, emeta->buf);
+
+ paddr = pblk_alloc_page(pblk, line, nr_secs);
+
+ for (i = 0; i < nr_secs; i++, paddr++) {
+ __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
+
+ /* ppa to be sent to the device */
+ ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+
+ /* Write context for target bio completion on write buffer. Note
+ * that the write buffer is protected by the sync backpointer,
+ * and a single writer thread have access to each specific entry
+ * at a time. Thus, it is safe to modify the context for the
+ * entry we are setting up for submission without taking any
+ * lock or memory barrier.
+ */
+ if (i < valid_secs) {
+ kref_get(&line->ref);
+ w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
+ w_ctx->ppa = ppa_list[i];
+ meta_list[i].lba = cpu_to_le64(w_ctx->lba);
+ lba_list[paddr] = cpu_to_le64(w_ctx->lba);
+ if (lba_list[paddr] != addr_empty)
+ line->nr_valid_lbas++;
+ else
+ atomic64_inc(&pblk->pad_wa);
+ } else {
+ lba_list[paddr] = meta_list[i].lba = addr_empty;
+ __pblk_map_invalidate(pblk, line, paddr);
+ }
+ }
+
+ pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
+ return 0;
+}
+
+/*
+ * Map the sectors of @rqd from offset @off onwards, one write unit
+ * (pblk->min_write_pgs sectors) at a time. @sentry is the write buffer
+ * position of the first entry of the request and @valid_secs the number of
+ * sectors that carry data; a unit straddling @valid_secs maps only
+ * (valid_secs % min) valid sectors.
+ *
+ * On a mapping failure the bio and the request are released and the pipeline
+ * is stopped. Mapping stops right there: @rqd has been handed to
+ * pblk_free_rqd() and must not be dereferenced (or released) again.
+ */
+void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+		 unsigned long *lun_bitmap, unsigned int valid_secs,
+		 unsigned int off)
+{
+	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	unsigned int map_secs;
+	int min = pblk->min_write_pgs;
+	int i;
+
+	for (i = off; i < rqd->nr_ppas; i += min) {
+		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+		if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+					lun_bitmap, &meta_list[i], map_secs)) {
+			bio_put(rqd->bio);
+			pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+			pblk_pipeline_stop(pblk);
+			return;
+		}
+	}
+}
+
+/*
+ * Map the sectors of @rqd and, while doing so, pick one block of the next
+ * line to erase.
+ *
+ * Only if erase_ppa is set, acquire erase semaphore. After each mapped write
+ * unit, the LUN it landed on is checked against the erase line's erase
+ * bitmap; the first LUN not yet scheduled is claimed, written to @erase_ppa
+ * and the rest of the request is mapped by pblk_map_rq(). If nothing was
+ * claimed and the data line has bad blocks, a block that is bad in the data
+ * line but not yet scheduled in the erase line is claimed instead.
+ *
+ * On a mapping failure the bio and the request are released, the pipeline is
+ * stopped and the function returns, since @rqd must not be used after
+ * pblk_free_rqd().
+ */
+void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+		       unsigned int sentry, unsigned long *lun_bitmap,
+		       unsigned int valid_secs, struct ppa_addr *erase_ppa)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	struct pblk_line *e_line, *d_line;
+	unsigned int map_secs;
+	int min = pblk->min_write_pgs;
+	int i, erase_lun;
+
+	for (i = 0; i < rqd->nr_ppas; i += min) {
+		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+		if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+					lun_bitmap, &meta_list[i], map_secs)) {
+			bio_put(rqd->bio);
+			pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+			pblk_pipeline_stop(pblk);
+			return;
+		}
+
+		erase_lun = pblk_ppa_to_pos(geo, rqd->ppa_list[i]);
+
+		/* line can change after page map. We might also be writing the
+		 * last line.
+		 */
+		e_line = pblk_line_get_erase(pblk);
+		if (!e_line) {
+			pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
+							valid_secs, i + min);
+			return;
+		}
+
+		spin_lock(&e_line->lock);
+		if (!test_bit(erase_lun, e_line->erase_bitmap)) {
+			set_bit(erase_lun, e_line->erase_bitmap);
+			atomic_dec(&e_line->left_eblks);
+
+			*erase_ppa = rqd->ppa_list[i];
+			erase_ppa->a.blk = e_line->id;
+
+			spin_unlock(&e_line->lock);
+
+			/* Avoid evaluating e_line->left_eblks */
+			pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
+							valid_secs, i + min);
+			return;
+		}
+		spin_unlock(&e_line->lock);
+	}
+
+	d_line = pblk_line_get_data(pblk);
+
+	/* line can change after page map. We might also be writing the
+	 * last line.
+	 */
+	e_line = pblk_line_get_erase(pblk);
+	if (!e_line)
+		return;
+
+	/* Erase blocks that are bad in this line but might not be in next */
+	if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
+			bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) {
+		int bit = -1;
+
+retry:
+		bit = find_next_bit(d_line->blk_bitmap,
+						lm->blk_per_line, bit + 1);
+		if (bit >= lm->blk_per_line)
+			return;
+
+		spin_lock(&e_line->lock);
+		if (test_bit(bit, e_line->erase_bitmap)) {
+			spin_unlock(&e_line->lock);
+			goto retry;
+		}
+
+		/* Claim the block while still holding the line lock, as the
+		 * mapping loop above does, so that the test and the set
+		 * cannot be separated by another update of the bitmap.
+		 */
+		set_bit(bit, e_line->erase_bitmap);
+		atomic_dec(&e_line->left_eblks);
+		spin_unlock(&e_line->lock);
+
+		*erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */
+		erase_ppa->a.blk = e_line->id;
+	}
+}
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
new file mode 100644
index 000000000..d22c13b55
--- /dev/null
+++ b/drivers/lightnvm/pblk-rb.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * Based upon the circular ringbuffer.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rb.c - pblk's write buffer
+ */
+
+#include <linux/circ_buf.h>
+
+#include "pblk.h"
+
+static DECLARE_RWSEM(pblk_rb_lock);
+
+/*
+ * Release every page set on rb->pages together with its descriptor. The
+ * list is walked with pblk_rb_lock held for writing.
+ */
+void pblk_rb_data_free(struct pblk_rb *rb)
+{
+	struct pblk_rb_pages *set, *next;
+
+	down_write(&pblk_rb_lock);
+	list_for_each_entry_safe(set, next, &rb->pages, list) {
+		/* Unlink first, then give back the pages and the descriptor */
+		list_del(&set->list);
+		free_pages((unsigned long)page_address(set->pages),
+			   set->order);
+		kfree(set);
+	}
+	up_write(&pblk_rb_lock);
+}
+
+/*
+ * Initialize ring buffer. The data and metadata buffers must be previously
+ * allocated and their size must be a power of two
+ * (Documentation/core-api/circular-buffers.rst)
+ *
+ * @rb_entry_base: entry array; (1 << @power_size) entries are initialised
+ * @power_size: log2 of the number of entries
+ * @power_seg_sz: log2 of the size in bytes of one entry's data segment
+ *
+ * Data pages are allocated in sets of at most order (MAX_ORDER - 1); each
+ * set backs (1 << order) consecutive entries and is linked on rb->pages.
+ *
+ * Returns 0 on success. On allocation failure every page set allocated so
+ * far is released and -ENOMEM is returned.
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+		 unsigned int power_size, unsigned int power_seg_sz)
+{
+	struct pblk *pblk = container_of(rb, struct pblk, rwb);
+	unsigned int init_entry = 0;
+	unsigned int alloc_order = power_size;
+	unsigned int max_order = MAX_ORDER - 1;
+	unsigned int order, iter;
+
+	down_write(&pblk_rb_lock);
+	rb->entries = rb_entry_base;
+	rb->seg_size = (1 << power_seg_sz);
+	rb->nr_entries = (1 << power_size);
+	rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
+	rb->flush_point = EMPTY_ENTRY;
+
+	spin_lock_init(&rb->w_lock);
+	spin_lock_init(&rb->s_lock);
+
+	INIT_LIST_HEAD(&rb->pages);
+
+	/* Split the buffer into several page sets if it exceeds max_order */
+	if (alloc_order >= max_order) {
+		order = max_order;
+		iter = (1 << (alloc_order - max_order));
+	} else {
+		order = alloc_order;
+		iter = 1;
+	}
+
+	do {
+		struct pblk_rb_entry *entry;
+		struct pblk_rb_pages *page_set;
+		void *kaddr;
+		unsigned long set_size;
+		unsigned long i;
+
+		page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL);
+		if (!page_set)
+			goto fail_unlock;
+
+		page_set->order = order;
+		page_set->pages = alloc_pages(GFP_KERNEL, order);
+		if (!page_set->pages) {
+			kfree(page_set);
+			goto fail_unlock;
+		}
+		kaddr = page_address(page_set->pages);
+
+		/* Every entry of the set, the first one included, gets its
+		 * data pointer, cacheline address, flags and bio list set up.
+		 */
+		set_size = (1UL << order);
+		for (i = 0; i < set_size; i++) {
+			entry = &rb->entries[init_entry];
+			entry->cacheline = pblk_cacheline_to_addr(init_entry++);
+			entry->data = kaddr + (i * rb->seg_size);
+			entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
+			bio_list_init(&entry->w_ctx.bios);
+		}
+
+		list_add_tail(&page_set->list, &rb->pages);
+		iter--;
+	} while (iter > 0);
+	up_write(&pblk_rb_lock);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	atomic_set(&rb->inflight_flush_point, 0);
+#endif
+
+	/*
+	 * Initialize rate-limiter, which controls access to the write buffer
+	 * by user and GC I/O
+	 */
+	pblk_rl_init(&pblk->rl, rb->nr_entries);
+
+	return 0;
+
+fail_unlock:
+	/* pblk_rb_data_free() takes pblk_rb_lock itself, so the lock has to
+	 * be dropped before the page sets allocated so far are released.
+	 */
+	up_write(&pblk_rb_lock);
+	pblk_rb_data_free(rb);
+	return -ENOMEM;
+}
+
+/*
+ * pblk_rb_calculate_size -- calculate the size of the write buffer
+ *
+ * The size is 1 << order, where order is get_count_order(@nr_entries)
+ * clamped to a minimum of 7.
+ */
+unsigned int pblk_rb_calculate_size(unsigned int nr_entries)
+{
+	int order = get_count_order(nr_entries);
+
+	/* Alloc a write buffer that can at least fit 128 entries */
+	if (order < 7)
+		order = 7;
+
+	return (1 << order);
+}
+/* Return the entry array that was handed to pblk_rb_init(). */
+void *pblk_rb_entries_ref(struct pblk_rb *rb)
+{
+ return rb->entries;
+}
+/*
+ * Reset a write context so that its buffer entry can be reused: the flags go
+ * back to PBLK_WRITABLE_ENTRY (release store), the ppa is set empty and the
+ * lba becomes ADDR_EMPTY. Warns once if the context was not marked
+ * PBLK_SUBMITTED_ENTRY.
+ */
+static void clean_wctx(struct pblk_w_ctx *w_ctx)
+{
+ int flags;
+
+ flags = READ_ONCE(w_ctx->flags);
+ WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY),
+ "pblk: overwriting unsubmitted data\n");
+
+ /* Release flags on context. Protect from writes and reads */
+ smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
+ pblk_ppa_set_empty(&w_ctx->ppa);
+ w_ctx->lba = ADDR_EMPTY;
+}
+/*
+ * Thin wrappers over CIRC_CNT() and CIRC_SPACE(). The rb argument of
+ * pblk_rb_ring_space() is not used by the expansion.
+ */
+#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size)
+#define pblk_rb_ring_space(rb, head, tail, size) \
+ (CIRC_SPACE(head, tail, size))
+
+/*
+ * Free space in the buffer: the distance the write pointer (mem) can still
+ * advance before reaching the sync back pointer, i.e. the entries already
+ * synchronized to the media.
+ */
+static unsigned int pblk_rb_space(struct pblk_rb *rb)
+{
+	const unsigned int head = READ_ONCE(rb->mem);
+	const unsigned int tail = READ_ONCE(rb->sync);
+
+	return pblk_rb_ring_space(rb, head, tail, rb->nr_entries);
+}
+
+/*
+ * Number of entries between the submission pointer (subm) and the write
+ * pointer (mem), i.e. entries that are available to be sent to the media.
+ */
+unsigned int pblk_rb_read_count(struct pblk_rb *rb)
+{
+	const unsigned int head = READ_ONCE(rb->mem);
+	const unsigned int tail = READ_ONCE(rb->subm);
+
+	return pblk_rb_ring_count(head, tail, rb->nr_entries);
+}
+
+/*
+ * Number of entries between the sync pointer and the write pointer (mem).
+ */
+unsigned int pblk_rb_sync_count(struct pblk_rb *rb)
+{
+	const unsigned int head = READ_ONCE(rb->mem);
+	const unsigned int tail = READ_ONCE(rb->sync);
+
+	return pblk_rb_ring_count(head, tail, rb->nr_entries);
+}
+
+/*
+ * Advance the submission pointer by @nr_entries (wrapping at the buffer
+ * size) and return its previous value.
+ */
+unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
+{
+	const unsigned int prev_subm = READ_ONCE(rb->subm);
+	const unsigned int next_subm =
+			(prev_subm + nr_entries) & (rb->nr_entries - 1);
+
+	/* Commit read means updating submission pointer */
+	smp_store_release(&rb->subm, next_subm);
+
+	return prev_subm;
+}
+/*
+ * Move the l2p_update pointer forward by @to_update entries. For each entry
+ * pblk_update_map_dev() is called with the entry's lba, ppa and cacheline,
+ * one reference on the line addressed by the ppa is dropped and the write
+ * context is cleaned. The numbers of user and GC entries processed are
+ * passed to pblk_rl_out(). Always returns 0.
+ */
+static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_line *line;
+ struct pblk_rb_entry *entry;
+ struct pblk_w_ctx *w_ctx;
+ unsigned int user_io = 0, gc_io = 0;
+ unsigned int i;
+ int flags;
+
+ for (i = 0; i < to_update; i++) {
+ entry = &rb->entries[rb->l2p_update];
+ w_ctx = &entry->w_ctx;
+
+ flags = READ_ONCE(entry->w_ctx.flags);
+ if (flags & PBLK_IOTYPE_USER)
+ user_io++;
+ else if (flags & PBLK_IOTYPE_GC)
+ gc_io++;
+ else
+ WARN(1, "pblk: unknown IO type\n");
+
+ pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
+ entry->cacheline);
+
+ line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
+ kref_put(&line->ref, pblk_line_put);
+ clean_wctx(w_ctx);
+ rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1);
+ }
+
+ pblk_rl_out(&pblk->rl, user_io, gc_io);
+
+ return 0;
+}
+
+/*
+ * When we move the l2p_update pointer, we update the l2p table - lookups will
+ * point to the physical address instead of to the cacheline in the write buffer
+ * from this moment on.
+ *
+ * Only as many entries are updated as the incoming @nr_entries would
+ * overwrite past the l2p_update pointer. Must be called with rb->w_lock held.
+ */
+static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
+			      unsigned int mem, unsigned int sync)
+{
+	unsigned int untouched;
+
+	lockdep_assert_held(&rb->w_lock);
+
+	/* Update l2p only as buffer entries are being overwritten */
+	untouched = pblk_rb_ring_space(rb, mem, rb->l2p_update,
+				       rb->nr_entries);
+	if (untouched > nr_entries)
+		return 0;
+
+	/* l2p_update used exclusively under rb->w_lock */
+	return __pblk_rb_update_l2p(rb, nr_entries - untouched);
+}
+
+/*
+ * Update the l2p entry for all sectors stored on the write buffer. This means
+ * that all future lookups to the l2p table will point to a device address, not
+ * to the cacheline in the write buffer.
+ *
+ * Covers every entry between the l2p_update pointer and the sync pointer.
+ */
+void pblk_rb_sync_l2p(struct pblk_rb *rb)
+{
+	unsigned int sync_pos, pending;
+
+	spin_lock(&rb->w_lock);
+
+	/* Protect from reads and writes */
+	sync_pos = smp_load_acquire(&rb->sync);
+	pending = pblk_rb_ring_count(sync_pos, rb->l2p_update,
+				     rb->nr_entries);
+
+	__pblk_rb_update_l2p(rb, pending);
+
+	spin_unlock(&rb->w_lock);
+}
+
+/*
+ * Copy one segment (rb->seg_size bytes) from @data into the data area of
+ * @entry and record the lba and ppa of @w_ctx in the entry's write context.
+ * Typically the data is a 4KB chunk coming from a bio. The entry flags are
+ * not touched here; the callers in this file publish them afterwards.
+ */
+static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx,
+ struct pblk_rb_entry *entry)
+{
+ memcpy(entry->data, data, rb->seg_size);
+
+ entry->w_ctx.lba = w_ctx.lba;
+ entry->w_ctx.ppa = w_ctx.ppa;
+}
+/*
+ * Fill the buffer entry at @ring_pos with user data: copy @data and the
+ * lba/ppa of @w_ctx into the entry, call pblk_update_map_cache() for the lba
+ * with the entry's cacheline, and finally publish the entry by storing
+ * w_ctx.flags | PBLK_WRITTEN_DATA with release semantics.
+ */
+void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, unsigned int ring_pos)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_rb_entry *entry;
+ int flags;
+
+ entry = &rb->entries[ring_pos];
+ flags = READ_ONCE(entry->w_ctx.flags);
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ /* Caller must guarantee that the entry is free */
+ BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
+#endif
+
+ __pblk_rb_write_entry(rb, data, w_ctx, entry);
+
+ pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline);
+ flags = w_ctx.flags | PBLK_WRITTEN_DATA;
+
+ /* Release flags on write context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+}
+/*
+ * Fill the buffer entry at @ring_pos with GC data read from @line at @paddr.
+ * Works like pblk_rb_write_entry_user(), except that the mapping is updated
+ * through pblk_update_map_gc(); when that call returns 0 the lba stored in
+ * the entry is reset to ADDR_EMPTY. The entry is published by storing
+ * w_ctx.flags | PBLK_WRITTEN_DATA with release semantics.
+ */
+void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, struct pblk_line *line,
+ u64 paddr, unsigned int ring_pos)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_rb_entry *entry;
+ int flags;
+
+ entry = &rb->entries[ring_pos];
+ flags = READ_ONCE(entry->w_ctx.flags);
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ /* Caller must guarantee that the entry is free */
+ BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
+#endif
+
+ __pblk_rb_write_entry(rb, data, w_ctx, entry);
+
+ if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr))
+ entry->w_ctx.lba = ADDR_EMPTY;
+
+ flags = w_ctx.flags | PBLK_WRITTEN_DATA;
+
+ /* Release flags on write context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+}
+/*
+ * Set a flush point on the entry preceding @pos (wrapping around the buffer)
+ * and, if @bio is given, queue it on that entry's bio list.
+ *
+ * The work is done between pblk_rb_sync_init() and pblk_rb_sync_end(). If
+ * @pos equals the current sync pointer nothing is set and 0 is returned.
+ * Otherwise returns 1 when @bio was attached to the flush point and 0 when
+ * no bio was given.
+ */
+static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
+ unsigned int pos)
+{
+ struct pblk_rb_entry *entry;
+ unsigned int sync, flush_point;
+
+ pblk_rb_sync_init(rb, NULL);
+ sync = READ_ONCE(rb->sync);
+
+ if (pos == sync) {
+ pblk_rb_sync_end(rb, NULL);
+ return 0;
+ }
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_inc(&rb->inflight_flush_point);
+#endif
+
+ flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
+ entry = &rb->entries[flush_point];
+
+ /* Protect flush points */
+ smp_store_release(&rb->flush_point, flush_point);
+
+ if (bio)
+ bio_list_add(&entry->w_ctx.bios, bio);
+
+ pblk_rb_sync_end(rb, NULL);
+
+ return bio ? 1 : 0;
+}
+
+/*
+ * Check whether @nr_entries fit in the buffer and, if so, run the l2p update
+ * for the entries about to be overwritten. On success *@pos receives the
+ * current write pointer and 1 is returned; otherwise 0. The write pointer
+ * itself is not moved here.
+ */
+static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
+			       unsigned int *pos)
+{
+	unsigned int sync = READ_ONCE(rb->sync);
+	unsigned int mem = READ_ONCE(rb->mem);
+	unsigned int room = pblk_rb_ring_space(rb, mem, sync, rb->nr_entries);
+
+	if (room < nr_entries || pblk_rb_update_l2p(rb, nr_entries, mem, sync))
+		return 0;
+
+	*pos = mem;
+
+	return 1;
+}
+
+/*
+ * Reserve @nr_entries in the buffer: on success *@pos is the position of the
+ * first reserved entry, the write pointer is advanced past the reservation
+ * and 1 is returned. Returns 0 if the entries cannot be admitted.
+ */
+static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
+			     unsigned int *pos)
+{
+	unsigned int next_mem;
+
+	if (__pblk_rb_may_write(rb, nr_entries, pos) == 0)
+		return 0;
+
+	next_mem = (*pos + nr_entries) & (rb->nr_entries - 1);
+
+	/* Protect from read count */
+	smp_store_release(&rb->mem, next_mem);
+	return 1;
+}
+/*
+ * Set a flush point at the current write pointer, without a bio, and kick
+ * the writer.
+ * NOTE(review): with a NULL bio pblk_rb_flush_point_set() returns 0 on every
+ * path, so the early return below looks unreachable - confirm.
+ */
+void pblk_rb_flush(struct pblk_rb *rb)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ unsigned int mem = READ_ONCE(rb->mem);
+
+ if (pblk_rb_flush_point_set(rb, NULL, mem))
+ return;
+
+ pblk_write_kick(pblk);
+}
+/*
+ * Reserve @nr_entries for a user write and handle REQ_PREFLUSH on @bio.
+ *
+ * Returns 0 if the entries cannot be admitted. Otherwise *@pos is the first
+ * reserved position, the write pointer is advanced and 1 is returned, with
+ * *@io_ret set to NVM_IO_DONE, or to NVM_IO_OK when @bio carries
+ * REQ_PREFLUSH and was attached to a flush point placed at the new write
+ * pointer.
+ */
+static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int *pos, struct bio *bio,
+ int *io_ret)
+{
+ unsigned int mem;
+
+ if (!__pblk_rb_may_write(rb, nr_entries, pos))
+ return 0;
+
+ mem = (*pos + nr_entries) & (rb->nr_entries - 1);
+ *io_ret = NVM_IO_DONE;
+
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+
+ atomic64_inc(&pblk->nr_flush);
+ if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem))
+ *io_ret = NVM_IO_OK;
+ }
+
+ /* Protect from read count */
+ smp_store_release(&rb->mem, mem);
+
+ return 1;
+}
+
+/*
+ * Atomically check that (i) there is space on the write buffer for the
+ * incoming I/O, and (ii) the current I/O type has enough budget in the write
+ * buffer (rate-limiter).
+ *
+ * Both checks run under rb->w_lock. Returns the rate-limiter verdict if it
+ * is non-zero, NVM_IO_REQUEUE if the buffer has no room, and otherwise the
+ * I/O status chosen by pblk_rb_may_write_flush() after accounting the
+ * entries as user I/O.
+ */
+int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
+			   unsigned int nr_entries, unsigned int *pos)
+{
+	struct pblk *pblk = container_of(rb, struct pblk, rwb);
+	int io_ret;
+
+	spin_lock(&rb->w_lock);
+
+	io_ret = pblk_rl_user_may_insert(&pblk->rl, nr_entries);
+	if (io_ret)
+		goto unlock;
+
+	if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_ret)) {
+		io_ret = NVM_IO_REQUEUE;
+		goto unlock;
+	}
+
+	pblk_rl_user_in(&pblk->rl, nr_entries);
+
+unlock:
+	spin_unlock(&rb->w_lock);
+	return io_ret;
+}
+
+/*
+ * GC counterpart of pblk_rb_may_write_user(): under rb->w_lock, check the
+ * GC budget of the rate-limiter and the space on the write buffer. Returns 1
+ * and accounts the entries as GC I/O when both checks pass, 0 otherwise.
+ */
+int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
+			 unsigned int *pos)
+{
+	struct pblk *pblk = container_of(rb, struct pblk, rwb);
+	int admitted = 0;
+
+	spin_lock(&rb->w_lock);
+	if (pblk_rl_gc_may_insert(&pblk->rl, nr_entries) &&
+	    pblk_rb_may_write(rb, nr_entries, pos)) {
+		pblk_rl_gc_in(&pblk->rl, nr_entries);
+		admitted = 1;
+	}
+	spin_unlock(&rb->w_lock);
+
+	return admitted;
+}
+
+/*
+ * Read available entries on rb and add them to the given bio. To avoid a memory
+ * copy, a page reference to the write buffer is used to be added to the bio.
+ *
+ * This function is used by the write thread to form the write bio that will
+ * persist data on the write buffer to the media.
+ *
+ * @pos is the first buffer entry to read, @nr_entries the number of sectors the
+ * bio must carry and @count the number of entries to take from the buffer. When
+ * @count is smaller than @nr_entries the difference is filled with padding
+ * pages. The starting entry, the number of valid entries and the number of
+ * padded entries are recorded in the request's write context.
+ *
+ * Returns NVM_IO_OK on success and NVM_IO_ERR if a page could not be added to
+ * the bio.
+ */
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
+ unsigned int pos, unsigned int nr_entries,
+ unsigned int count)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct request_queue *q = pblk->dev->q;
+ struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+ struct bio *bio = rqd->bio;
+ struct pblk_rb_entry *entry;
+ struct page *page;
+ unsigned int pad = 0, to_read = nr_entries;
+ unsigned int i;
+ int flags;
+
+ /* Fewer buffered entries than requested: the remainder is padded */
+ if (count < nr_entries) {
+ pad = nr_entries - count;
+ to_read = count;
+ }
+
+ c_ctx->sentry = pos;
+ c_ctx->nr_valid = to_read;
+ c_ctx->nr_padded = pad;
+
+ for (i = 0; i < to_read; i++) {
+ entry = &rb->entries[pos];
+
+ /* A write has been allowed into the buffer, but data is still
+ * being copied to it. It is ok to busy wait.
+ */
+try:
+ flags = READ_ONCE(entry->w_ctx.flags);
+ if (!(flags & PBLK_WRITTEN_DATA)) {
+ io_schedule();
+ goto try;
+ }
+
+ page = virt_to_page(entry->data);
+ if (!page) {
+ pblk_err(pblk, "could not allocate write bio page\n");
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_SUBMITTED_ENTRY;
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+ return NVM_IO_ERR;
+ }
+
+ if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) !=
+ rb->seg_size) {
+ pblk_err(pblk, "could not add page to write bio\n");
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_SUBMITTED_ENTRY;
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+ return NVM_IO_ERR;
+ }
+
+ /* Entry is now part of the bio: mark it as submitted */
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_SUBMITTED_ENTRY;
+
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+
+ /* Advance with wrap-around; the mask assumes nr_entries is a
+ * power of two.
+ */
+ pos = (pos + 1) & (rb->nr_entries - 1);
+ }
+
+ if (pad) {
+ if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) {
+ pblk_err(pblk, "could not pad page in write bio\n");
+ return NVM_IO_ERR;
+ }
+
+ /* Account the padding size in the padding distribution */
+ if (pad < pblk->min_write_pgs)
+ atomic64_inc(&pblk->pad_dist[pad - 1]);
+ else
+ pblk_warn(pblk, "padding more than min. sectors\n");
+
+ atomic64_add(pad, &pblk->pad_wa);
+ }
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_add(pad, &pblk->padded_writes);
+#endif
+
+ return NVM_IO_OK;
+}
+
+/*
+ * Copy to bio only if the lba matches the one on the given cache entry.
+ * Otherwise, it means that the entry has been overwritten, and the bio should
+ * be directed to disk.
+ *
+ * @ppa is the cache address the caller obtained from the L2P table for @lba.
+ * @bio_iter is the sector index inside @bio and @advanced_bio tells whether the
+ * caller has already advanced the bio to that sector.
+ *
+ * Returns 1 if the data was copied into the bio, 0 if the entry is no longer
+ * valid for @lba and nothing was copied.
+ */
+int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
+ struct ppa_addr ppa, int bio_iter, bool advanced_bio)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_rb_entry *entry;
+ struct pblk_w_ctx *w_ctx;
+ struct ppa_addr l2p_ppa;
+ u64 pos = pblk_addr_to_cacheline(ppa);
+ void *data;
+ int flags;
+ int ret = 1;
+
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ /* Caller must ensure that the access will not cause an overflow */
+ BUG_ON(pos >= rb->nr_entries);
+#endif
+ entry = &rb->entries[pos];
+ w_ctx = &entry->w_ctx;
+ /* Flags are sampled once, before the write lock is taken */
+ flags = READ_ONCE(w_ctx->flags);
+
+ spin_lock(&rb->w_lock);
+ /* Re-read the current mapping so a concurrent update is detected */
+ spin_lock(&pblk->trans_lock);
+ l2p_ppa = pblk_trans_map_get(pblk, lba);
+ spin_unlock(&pblk->trans_lock);
+
+ /* Check if the entry has been overwritten or is scheduled to be */
+ if (!pblk_ppa_comp(l2p_ppa, ppa) || w_ctx->lba != lba ||
+ flags & PBLK_WRITABLE_ENTRY) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Only advance the bio if it hasn't been advanced already. If advanced,
+ * this bio is at least a partial bio (i.e., it has partially been
+ * filled with data from the cache). If part of the data resides on the
+ * media, we will read later on
+ */
+ if (unlikely(!advanced_bio))
+ bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE);
+
+ data = bio_data(bio);
+ memcpy(data, entry->data, rb->seg_size);
+
+out:
+ spin_unlock(&rb->w_lock);
+ return ret;
+}
+
+/*
+ * Return the write context of the buffer entry that pos maps to once it has
+ * been wrapped to the size of the ring.
+ */
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos)
+{
+	struct pblk_rb_entry *slot;
+
+	slot = &rb->entries[pos & (rb->nr_entries - 1)];
+
+	return &slot->w_ctx;
+}
+
+/*
+ * Take the sync lock and return the current sync pointer. When flags is
+ * non-NULL the interrupt state is saved into it, otherwise interrupts are
+ * unconditionally disabled. Must be paired with pblk_rb_sync_end().
+ */
+unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags)
+	__acquires(&rb->s_lock)
+{
+	if (!flags)
+		spin_lock_irq(&rb->s_lock);
+	else
+		spin_lock_irqsave(&rb->s_lock, *flags);
+
+	return rb->sync;
+}
+
+/*
+ * Release the sync lock taken by pblk_rb_sync_init(). The same flags pointer
+ * (or NULL) that was given to pblk_rb_sync_init() must be passed here so the
+ * matching unlock variant is used.
+ */
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
+	__releases(&rb->s_lock)
+{
+	lockdep_assert_held(&rb->s_lock);
+
+	if (!flags)
+		spin_unlock_irq(&rb->s_lock);
+	else
+		spin_unlock_irqrestore(&rb->s_lock, *flags);
+}
+
+/*
+ * Move the sync pointer forward by nr_entries (with wrap-around) and return its
+ * new value. If a flush point is set and lies within the entries being synced,
+ * the flush point is cleared. The caller must hold rb->s_lock.
+ */
+unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
+{
+ unsigned int sync, flush_point;
+ lockdep_assert_held(&rb->s_lock);
+
+ sync = READ_ONCE(rb->sync);
+ flush_point = READ_ONCE(rb->flush_point);
+
+ if (flush_point != EMPTY_ENTRY) {
+ unsigned int secs_to_flush;
+
+ /* Distance from the current sync pointer to the flush point */
+ secs_to_flush = pblk_rb_ring_count(flush_point, sync,
+ rb->nr_entries);
+ if (secs_to_flush < nr_entries) {
+ /* Protect flush points */
+ smp_store_release(&rb->flush_point, EMPTY_ENTRY);
+ }
+ }
+
+ sync = (sync + nr_entries) & (rb->nr_entries - 1);
+
+ /* Protect from counts */
+ smp_store_release(&rb->sync, sync);
+
+ return sync;
+}
+
+/*
+ * Calculate how many sectors to submit up to the current flush point.
+ *
+ * Returns 0 when no flush point is set or when everything up to the flush point
+ * has already been submitted; otherwise the number of entries between the
+ * submit pointer and the flush point (inclusive).
+ */
+unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb)
+{
+ unsigned int subm, sync, flush_point;
+ unsigned int submitted, to_flush;
+
+ /* Protect flush points */
+ flush_point = smp_load_acquire(&rb->flush_point);
+ if (flush_point == EMPTY_ENTRY)
+ return 0;
+
+ /* Protect syncs */
+ sync = smp_load_acquire(&rb->sync);
+
+ /* Entries already submitted but not yet synced */
+ subm = READ_ONCE(rb->subm);
+ submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries);
+
+ /* The sync point itself counts as a sector to sync */
+ to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1;
+
+ return (submitted < to_flush) ? (to_flush - submitted) : 0;
+}
+
+/*
+ * Scan from the current position of the sync pointer to find the entry that
+ * corresponds to the given ppa. This is necessary since write requests can be
+ * completed out of order. The assumption is that the ppa is close to the sync
+ * pointer thus the search will not take long.
+ *
+ * The caller of this function must guarantee that the sync pointer will not
+ * reach the entry while it is using the metadata associated with it. With this
+ * assumption in mind, there is no need to take the sync lock.
+ *
+ * NOTE(review): as written, the loop only walks a local copy of the sync
+ * pointer across the submitted-but-not-synced range; @ppa is never compared
+ * against any entry and the function always returns NULL. Confirm whether the
+ * lookup was intentionally stubbed out.
+ */
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+ struct ppa_addr *ppa)
+{
+ unsigned int sync, subm, count;
+ unsigned int i;
+
+ sync = READ_ONCE(rb->sync);
+ subm = READ_ONCE(rb->subm);
+ count = pblk_rb_ring_count(subm, sync, rb->nr_entries);
+
+ for (i = 0; i < count; i++)
+ sync = (sync + 1) & (rb->nr_entries - 1);
+
+ return NULL;
+}
+
+/*
+ * Sanity check run with both the write lock and the sync lock held.
+ *
+ * Returns 0 when the mem, subm, sync and l2p_update pointers all coincide and
+ * no flush point is pending. Otherwise the entries are inspected: 1 is returned
+ * if the entry array is missing or any entry has no data buffer, 0 if every
+ * entry still has its data buffer.
+ */
+int pblk_rb_tear_down_check(struct pblk_rb *rb)
+{
+ struct pblk_rb_entry *entry;
+ int i;
+ int ret = 0;
+
+ /* Lock order: write lock first, then sync lock with IRQs disabled */
+ spin_lock(&rb->w_lock);
+ spin_lock_irq(&rb->s_lock);
+
+ if ((rb->mem == rb->subm) && (rb->subm == rb->sync) &&
+ (rb->sync == rb->l2p_update) &&
+ (rb->flush_point == EMPTY_ENTRY)) {
+ goto out;
+ }
+
+ if (!rb->entries) {
+ ret = 1;
+ goto out;
+ }
+
+ for (i = 0; i < rb->nr_entries; i++) {
+ entry = &rb->entries[i];
+
+ if (!entry->data) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock_irq(&rb->s_lock);
+ spin_unlock(&rb->w_lock);
+
+ return ret;
+}
+
+/* Fold pos into the ring by masking it with the buffer size minus one. */
+unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos)
+{
+	unsigned int mask = rb->nr_entries - 1;
+
+	return pos & mask;
+}
+
+/* Return 1 if pos lies outside the ring buffer, 0 if it is a valid index. */
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos)
+{
+	if (pos < rb->nr_entries)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Format a one-line snapshot of the ring buffer state into buf (at most
+ * PAGE_SIZE bytes) and return the number of bytes written.
+ *
+ * The line holds, in order: buffer size, mem, subm, sync and l2p_update
+ * pointers, the number of in-flight flush points (0 unless
+ * CONFIG_NVM_PBLK_DEBUG is set), the flush point (or "NULL" if none is set),
+ * then read count / free space / flush point count, and finally the number of
+ * entries on the completion list.
+ */
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_c_ctx *c;
+ ssize_t offset;
+ int queued_entries = 0;
+
+ /* The completion list is walked under the sync lock */
+ spin_lock_irq(&rb->s_lock);
+ list_for_each_entry(c, &pblk->compl_list, list)
+ queued_entries++;
+ spin_unlock_irq(&rb->s_lock);
+
+ /* The pointers below are read without holding any lock */
+ if (rb->flush_point != EMPTY_ENTRY)
+ offset = scnprintf(buf, PAGE_SIZE,
+ "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
+ rb->nr_entries,
+ rb->mem,
+ rb->subm,
+ rb->sync,
+ rb->l2p_update,
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_read(&rb->inflight_flush_point),
+#else
+ 0,
+#endif
+ rb->flush_point,
+ pblk_rb_read_count(rb),
+ pblk_rb_space(rb),
+ pblk_rb_flush_point_count(rb),
+ queued_entries);
+ else
+ offset = scnprintf(buf, PAGE_SIZE,
+ "%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n",
+ rb->nr_entries,
+ rb->mem,
+ rb->subm,
+ rb->sync,
+ rb->l2p_update,
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_read(&rb->inflight_flush_point),
+#else
+ 0,
+#endif
+ pblk_rb_read_count(rb),
+ pblk_rb_space(rb),
+ pblk_rb_flush_point_count(rb),
+ queued_entries);
+
+ return offset;
+}
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
new file mode 100644
index 000000000..5a46d7f93
--- /dev/null
+++ b/drivers/lightnvm/pblk-read.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-read.c - pblk's read path
+ */
+
+#include "pblk.h"
+
+/*
+ * There is no guarantee that the value read from cache has not been updated and
+ * resides at another location in the cache. We guarantee though that if the
+ * value is read from the cache, it belongs to the mapped lba. In order to
+ * guarantee ordering between writes and reads, a flush must be issued.
+ *
+ * Returns the result of pblk_rb_copy_to_bio(): non-zero if the sector was
+ * copied from the write buffer into the bio, 0 if the cache entry no longer
+ * holds the data for this lba.
+ */
+static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
+ sector_t lba, struct ppa_addr ppa,
+ int bio_iter, bool advanced_bio)
+{
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ /* Callers must ensure that the ppa points to a cache address */
+ BUG_ON(pblk_ppa_empty(ppa));
+ BUG_ON(!pblk_addr_in_cache(ppa));
+#endif
+
+ return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa,
+ bio_iter, advanced_bio);
+}
+
+/*
+ * Resolve a multi-sector read starting at blba. The L2P table is looked up for
+ * every sector and each sector is classified as:
+ *  - unmapped: its bit is set in read_bitmap and its metadata lba is set to
+ *    ADDR_EMPTY;
+ *  - cached: the data is copied from the write buffer into the bio and its bit
+ *    is set in read_bitmap;
+ *  - on media: its ppa is appended to rqd->ppa_list for a device read.
+ *
+ * On return, the clear bits in read_bitmap are the sectors that still need to
+ * be read from the device.
+ */
+static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct bio *bio, sector_t blba,
+ unsigned long *read_bitmap)
+{
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
+ struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+ int nr_secs = rqd->nr_ppas;
+ bool advanced_bio = false;
+ int i, j = 0;
+
+ pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
+
+ for (i = 0; i < nr_secs; i++) {
+ struct ppa_addr p = ppas[i];
+ sector_t lba = blba + i;
+
+retry:
+ if (pblk_ppa_empty(p)) {
+ WARN_ON(test_and_set_bit(i, read_bitmap));
+ meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
+
+ /* First sector handled here: skip the bio past the
+ * sectors left for the device read.
+ */
+ if (unlikely(!advanced_bio)) {
+ bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
+ advanced_bio = true;
+ }
+
+ goto next;
+ }
+
+ /* Try to read from write buffer. The address is later checked
+ * on the write buffer to prevent retrieving overwritten data.
+ */
+ if (pblk_addr_in_cache(p)) {
+ if (!pblk_read_from_cache(pblk, bio, lba, p, i,
+ advanced_bio)) {
+ /* Entry changed under us: look up the mapping
+ * again and re-classify this sector.
+ */
+ pblk_lookup_l2p_seq(pblk, &p, lba, 1);
+ goto retry;
+ }
+ WARN_ON(test_and_set_bit(i, read_bitmap));
+ meta_list[i].lba = cpu_to_le64(lba);
+ advanced_bio = true;
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_inc(&pblk->cache_reads);
+#endif
+ } else {
+ /* Read from media non-cached sectors */
+ rqd->ppa_list[j++] = p;
+ }
+
+next:
+ /* Once the bio has been advanced, keep it in step with i */
+ if (advanced_bio)
+ bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+ }
+
+ if (pblk_io_aligned(pblk, nr_secs))
+ rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
+ else
+ rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_add(nr_secs, &pblk->inflight_reads);
+#endif
+}
+
+
+/*
+ * Check that the LBAs returned in the request metadata match the sequential
+ * range starting at blba. Sectors whose metadata LBA is ADDR_EMPTY are
+ * skipped. A mismatch is reported with an error message and a warning; the
+ * request itself is not modified.
+ */
+static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
+				sector_t blba)
+{
+	struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+	int nr_lbas = rqd->nr_ppas;
+	int i;
+
+	for (i = 0; i < nr_lbas; i++) {
+		u64 lba = le64_to_cpu(meta_lba_list[i].lba);
+
+		if (lba == ADDR_EMPTY)
+			continue;
+
+		if (lba != blba + i) {
+#ifdef CONFIG_NVM_PBLK_DEBUG
+			struct ppa_addr *p;
+
+			/* Single-sector requests carry their address in
+			 * ppa_addr; the list is only valid for nr_ppas > 1.
+			 */
+			p = (nr_lbas > 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr;
+			print_ppa(pblk, p, "seq", i);
+#endif
+			pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
+				 lba, (u64)blba + i);
+			WARN_ON(1);
+		}
+	}
+}
+
+/*
+ * Check that the LBAs returned in the request metadata match the ones in
+ * lba_list. There can be holes in the lba list: entries equal to ADDR_EMPTY
+ * are skipped and do not consume a metadata slot, so lba_list is indexed by i
+ * while the metadata (and the ppa list) is indexed by j.
+ *
+ * A mismatch is reported with an error message and a warning. A final warning
+ * is raised if the number of non-empty LBAs differs from rqd->nr_ppas.
+ */
+static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
+				 u64 *lba_list, int nr_lbas)
+{
+	struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+	int i, j;
+
+	for (i = 0, j = 0; i < nr_lbas; i++) {
+		u64 lba = lba_list[i];
+		u64 meta_lba;
+
+		if (lba == ADDR_EMPTY)
+			continue;
+
+		meta_lba = le64_to_cpu(meta_lba_list[j].lba);
+
+		if (lba != meta_lba) {
+#ifdef CONFIG_NVM_PBLK_DEBUG
+			struct ppa_addr *p;
+			int nr_ppas = rqd->nr_ppas;
+
+			/* Single-sector requests carry their address in
+			 * ppa_addr; the list is only valid for nr_ppas > 1.
+			 */
+			p = (nr_ppas > 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr;
+			print_ppa(pblk, p, "seq", j);
+#endif
+			pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
+				 lba, meta_lba);
+			WARN_ON(1);
+		}
+
+		j++;
+	}
+
+	WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n");
+}
+
+/*
+ * Drop one line reference for every address in the request. Single-sector
+ * requests keep their address in rqd->ppa_addr, larger ones in rqd->ppa_list.
+ */
+static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	struct ppa_addr *addrs = &rqd->ppa_addr;
+	int idx;
+
+	if (rqd->nr_ppas > 1)
+		addrs = rqd->ppa_list;
+
+	for (idx = 0; idx < rqd->nr_ppas; idx++) {
+		struct ppa_addr ppa = addrs[idx];
+		struct pblk_line *line = &pblk->lines[pblk_ppa_to_line(ppa)];
+
+		kref_put(&line->ref, pblk_line_put_wq);
+	}
+}
+
+/*
+ * Complete the user's read bio. In debug builds a one-time warning is emitted
+ * if the bio carries an error status.
+ */
+static void pblk_end_user_read(struct bio *bio)
+{
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
+#endif
+ bio_endio(bio);
+}
+
+/*
+ * Common read completion: finish I/O accounting, log a device error if there
+ * was one, verify the returned LBAs, drop the internal bio (if any) and release
+ * the request. When put_line is true the line references held for each address
+ * in the request are dropped as well.
+ *
+ * rqd is freed here and must not be used by the caller afterwards.
+ */
+static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
+ bool put_line)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+ struct bio *int_bio = rqd->bio;
+ unsigned long start_time = r_ctx->start_time;
+
+ generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time);
+
+ if (rqd->error)
+ pblk_log_read_err(pblk, rqd);
+
+ pblk_read_check_seq(pblk, rqd, r_ctx->lba);
+
+ if (int_bio)
+ bio_put(int_bio);
+
+ if (put_line)
+ pblk_read_put_rqd_kref(pblk, rqd);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
+ atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
+#endif
+
+ pblk_free_rqd(pblk, rqd, PBLK_READ);
+ atomic_dec(&pblk->inflight_io);
+}
+
+/*
+ * Completion handler for reads served entirely from the device: complete the
+ * user bio stored in the request context, then release the request and the
+ * line references it holds.
+ */
+static void pblk_end_io_read(struct nvm_rq *rqd)
+{
+	struct pblk *pblk = rqd->private;
+	struct pblk_g_ctx *ctx = nvm_rq_to_pdu(rqd);
+	struct bio *user_bio = ctx->private;
+
+	pblk_end_user_read(user_bio);
+	__pblk_end_io_read(pblk, rqd, true);
+}
+
+/*
+ * Completion handler for a partial read, i.e. a read where some sectors were
+ * served from the write buffer and the remaining "holes" were read from the
+ * device into an internal bio.
+ *
+ * The data read from the device is copied into the corresponding holes of the
+ * original bio, the metadata LBAs are merged back into their original
+ * positions, the internal bio and the partial read context are released, and
+ * the original bio is completed.
+ */
+static void pblk_end_partial_read(struct nvm_rq *rqd)
+{
+	struct pblk *pblk = rqd->private;
+	struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+	struct pblk_pr_ctx *pr_ctx = r_ctx->private;
+	struct bio *new_bio = rqd->bio;
+	struct bio *bio = pr_ctx->orig_bio;
+	struct bio_vec src_bv, dst_bv;
+	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	int bio_init_idx = pr_ctx->bio_init_idx;
+	unsigned long *read_bitmap = pr_ctx->bitmap;
+	int nr_secs = pr_ctx->orig_nr_secs;
+	int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
+	__le64 *lba_list_mem, *lba_list_media;
+	void *src_p, *dst_p;
+	int hole, i;
+
+	/* A single hole was submitted through ppa_addr: restore the list
+	 * pointers saved at setup time so the code below can index ppa_list.
+	 */
+	if (unlikely(nr_holes == 1)) {
+		struct ppa_addr ppa;
+
+		ppa = rqd->ppa_addr;
+		rqd->ppa_list = pr_ctx->ppa_ptr;
+		rqd->dma_ppa_list = pr_ctx->dma_ppa_list;
+		rqd->ppa_list[0] = ppa;
+	}
+
+	/* Re-use allocated memory for intermediate lbas */
+	lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
+	lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
+
+	/* Stash the LBAs returned by the device and restore the LBAs that were
+	 * saved before submission.
+	 */
+	for (i = 0; i < nr_secs; i++) {
+		lba_list_media[i] = meta_list[i].lba;
+		meta_list[i].lba = lba_list_mem[i];
+	}
+
+	/* Fill the holes in the original bio */
+	i = 0;
+	hole = find_first_zero_bit(read_bitmap, nr_secs);
+	do {
+		int line_id = pblk_ppa_to_line(rqd->ppa_list[i]);
+		struct pblk_line *line = &pblk->lines[line_id];
+
+		kref_put(&line->ref, pblk_line_put);
+
+		meta_list[hole].lba = lba_list_media[i];
+
+		src_bv = new_bio->bi_io_vec[i++];
+		dst_bv = bio->bi_io_vec[bio_init_idx + hole];
+
+		src_p = kmap_atomic(src_bv.bv_page);
+		dst_p = kmap_atomic(dst_bv.bv_page);
+
+		memcpy(dst_p + dst_bv.bv_offset,
+			src_p + src_bv.bv_offset,
+			PBLK_EXPOSED_PAGE_SIZE);
+
+		/* Atomic kmaps are stack based: unmap in reverse order */
+		kunmap_atomic(dst_p);
+		kunmap_atomic(src_p);
+
+		mempool_free(src_bv.bv_page, &pblk->page_bio_pool);
+
+		hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
+	} while (hole < nr_secs);
+
+	bio_put(new_bio);
+	kfree(pr_ctx);
+
+	/* restore original request */
+	rqd->bio = NULL;
+	rqd->nr_ppas = nr_secs;
+
+	bio_endio(bio);
+	__pblk_end_io_read(pblk, rqd, false);
+}
+
+/*
+ * Prepare rqd for reading only the holes of a partially cached read.
+ *
+ * An internal bio with one page per hole is allocated and attached to the
+ * request, the current metadata LBAs are saved next to the ppa list, and a
+ * partial read context (original bio, read bitmap, bio start index, original
+ * sector count) is stored in the request context for the completion handler.
+ *
+ * Returns 0 on success and -ENOMEM if the pages or the context could not be
+ * allocated; on failure the internal bio and its pages are released.
+ */
+static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned int bio_init_idx,
+ unsigned long *read_bitmap,
+ int nr_holes)
+{
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
+ struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+ struct pblk_pr_ctx *pr_ctx;
+ struct bio *new_bio, *bio = r_ctx->private;
+ __le64 *lba_list_mem;
+ int nr_secs = rqd->nr_ppas;
+ int i;
+
+ /* Re-use allocated memory for intermediate lbas */
+ lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
+
+ new_bio = bio_alloc(GFP_KERNEL, nr_holes);
+
+ if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
+ goto fail_bio_put;
+
+ if (nr_holes != new_bio->bi_vcnt) {
+ WARN_ONCE(1, "pblk: malformed bio\n");
+ goto fail_free_pages;
+ }
+
+ pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
+ if (!pr_ctx)
+ goto fail_free_pages;
+
+ /* Save the LBAs set while resolving the request */
+ for (i = 0; i < nr_secs; i++)
+ lba_list_mem[i] = meta_list[i].lba;
+
+ new_bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
+
+ /* From here on the request only describes the holes */
+ rqd->bio = new_bio;
+ rqd->nr_ppas = nr_holes;
+ rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
+
+ pr_ctx->ppa_ptr = NULL;
+ pr_ctx->orig_bio = bio;
+ bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
+ pr_ctx->bio_init_idx = bio_init_idx;
+ pr_ctx->orig_nr_secs = nr_secs;
+ r_ctx->private = pr_ctx;
+
+ /* A single hole is submitted through ppa_addr; remember the list
+ * pointers so that the completion handler can restore them.
+ */
+ if (unlikely(nr_holes == 1)) {
+ pr_ctx->ppa_ptr = rqd->ppa_list;
+ pr_ctx->dma_ppa_list = rqd->dma_ppa_list;
+ rqd->ppa_addr = rqd->ppa_list[0];
+ }
+ return 0;
+
+fail_free_pages:
+ pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
+fail_bio_put:
+ bio_put(new_bio);
+
+ return -ENOMEM;
+}
+
+/*
+ * Read from the device the sectors of a request that could not be served from
+ * the write buffer (the clear bits in read_bitmap).
+ *
+ * Returns NVM_IO_OK once the internal request has been submitted; completion
+ * is handled by pblk_end_partial_read(). Returns NVM_IO_ERR if the partial
+ * read could not be set up or submitted.
+ *
+ * NOTE(review): on submission failure the request is released here through
+ * __pblk_end_io_read(), whereas on setup failure it is left untouched. Callers
+ * cannot tell the two apart from the return value - confirm that they do not
+ * release rqd a second time.
+ */
+static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
+				 unsigned int bio_init_idx,
+				 unsigned long *read_bitmap, int nr_secs)
+{
+	struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+	int nr_holes;
+
+	nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
+
+	if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap,
+				    nr_holes))
+		return NVM_IO_ERR;
+
+	rqd->end_io = pblk_end_partial_read;
+
+	if (pblk_submit_io(pblk, rqd)) {
+		pblk_err(pblk, "partial read IO submission failed\n");
+		goto err;
+	}
+
+	return NVM_IO_OK;
+
+err:
+	pblk_err(pblk, "failed to perform partial read\n");
+
+	/* Free allocated pages in new bio. The bio itself must stay alive
+	 * until its pages are released; the reference is dropped by
+	 * __pblk_end_io_read() below.
+	 */
+	pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt);
+
+	/* The completion handler will not run: release the partial read
+	 * context that pblk_setup_partial_read() stored in the request.
+	 */
+	kfree(r_ctx->private);
+
+	__pblk_end_io_read(pblk, rqd, false);
+	return NVM_IO_ERR;
+}
+
+/*
+ * Resolve a single-sector read. If the lba is unmapped, bit 0 of read_bitmap is
+ * set and the metadata lba is set to ADDR_EMPTY. If the lba is mapped to the
+ * write buffer, the data is copied into the bio and bit 0 of read_bitmap is
+ * set. Otherwise the physical address is stored in rqd->ppa_addr so that the
+ * caller can read it from the device.
+ */
+static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
+ sector_t lba, unsigned long *read_bitmap)
+{
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
+ struct ppa_addr ppa;
+
+ pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_inc(&pblk->inflight_reads);
+#endif
+
+retry:
+ if (pblk_ppa_empty(ppa)) {
+ WARN_ON(test_and_set_bit(0, read_bitmap));
+ meta_list[0].lba = cpu_to_le64(ADDR_EMPTY);
+ return;
+ }
+
+ /* Try to read from write buffer. The address is later checked on the
+ * write buffer to prevent retrieving overwritten data.
+ */
+ if (pblk_addr_in_cache(ppa)) {
+ if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0, 1)) {
+ /* Entry changed under us: look the mapping up again */
+ pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+ goto retry;
+ }
+
+ WARN_ON(test_and_set_bit(0, read_bitmap));
+ meta_list[0].lba = cpu_to_le64(lba);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_inc(&pblk->cache_reads);
+#endif
+ } else {
+ rqd->ppa_addr = ppa;
+ }
+
+ rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
+}
+
+/*
+ * Entry point of the user read path.
+ *
+ * Each sector of the bio is resolved through the L2P table. Sectors that are
+ * unmapped or that live in the write buffer are served immediately; the rest
+ * are read from the device, either with a clone of the user bio (no sector was
+ * served from the buffer) or through a partial read (some were).
+ *
+ * Returns NVM_IO_DONE if the whole request was served without device I/O,
+ * NVM_IO_OK if a device read was submitted (the bio is completed from the
+ * request's end_io handler), and NVM_IO_ERR on failure.
+ */
+int pblk_submit_read(struct pblk *pblk, struct bio *bio)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct request_queue *q = dev->q;
+ sector_t blba = pblk_get_lba(bio);
+ unsigned int nr_secs = pblk_get_secs(bio);
+ struct pblk_g_ctx *r_ctx;
+ struct nvm_rq *rqd;
+ unsigned int bio_init_idx;
+ DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA);
+ int ret = NVM_IO_ERR;
+
+ /* logic error: lba out-of-bounds. Ignore read request */
+ if (blba >= pblk->rl.nr_secs || nr_secs > PBLK_MAX_REQ_ADDRS) {
+ WARN(1, "pblk: read lba out of bounds (lba:%llu, nr:%d)\n",
+ (unsigned long long)blba, nr_secs);
+ return NVM_IO_ERR;
+ }
+
+ generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio),
+ &pblk->disk->part0);
+
+ /* A set bit means the sector has been served without device I/O */
+ bitmap_zero(read_bitmap, nr_secs);
+
+ rqd = pblk_alloc_rqd(pblk, PBLK_READ);
+
+ rqd->opcode = NVM_OP_PREAD;
+ rqd->nr_ppas = nr_secs;
+ rqd->bio = NULL; /* cloned bio if needed */
+ rqd->private = pblk;
+ rqd->end_io = pblk_end_io_read;
+
+ r_ctx = nvm_rq_to_pdu(rqd);
+ r_ctx->start_time = jiffies;
+ r_ctx->lba = blba;
+ r_ctx->private = bio; /* original bio */
+
+ /* Save the index for this bio's start. This is needed in case
+ * we need to fill a partial read.
+ */
+ bio_init_idx = pblk_get_bi_idx(bio);
+
+ rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd->dma_meta_list);
+ if (!rqd->meta_list) {
+ pblk_err(pblk, "not able to allocate ppa list\n");
+ goto fail_rqd_free;
+ }
+
+ if (nr_secs > 1) {
+ /* The ppa list shares the DMA buffer with the metadata */
+ rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
+ rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
+
+ pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap);
+ } else {
+ pblk_read_rq(pblk, rqd, bio, blba, read_bitmap);
+ }
+
+ /* Every sector was served from the buffer or is unmapped */
+ if (bitmap_full(read_bitmap, nr_secs)) {
+ /* Balances the decrement done in __pblk_end_io_read() */
+ atomic_inc(&pblk->inflight_io);
+ __pblk_end_io_read(pblk, rqd, false);
+ return NVM_IO_DONE;
+ }
+
+ /* All sectors are to be read from the device */
+ if (bitmap_empty(read_bitmap, rqd->nr_ppas)) {
+ struct bio *int_bio = NULL;
+
+ /* Clone read bio to deal with read errors internally */
+ int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
+ if (!int_bio) {
+ pblk_err(pblk, "could not clone read bio\n");
+ goto fail_end_io;
+ }
+
+ rqd->bio = int_bio;
+
+ if (pblk_submit_io(pblk, rqd)) {
+ pblk_err(pblk, "read IO submission failed\n");
+ ret = NVM_IO_ERR;
+ goto fail_end_io;
+ }
+
+ return NVM_IO_OK;
+ }
+
+ /* The read bio request could be partially filled by the write buffer,
+ * but there are some holes that need to be read from the drive.
+ */
+ ret = pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap,
+ nr_secs);
+ if (ret)
+ goto fail_meta_free;
+
+ return NVM_IO_OK;
+
+ /* NOTE(review): when pblk_partial_read_bio() fails at submission time it
+ * has already released rqd through __pblk_end_io_read(); the two labels
+ * below would then touch and free it again - confirm.
+ */
+fail_meta_free:
+ nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
+fail_rqd_free:
+ pblk_free_rqd(pblk, rqd, PBLK_READ);
+ return ret;
+fail_end_io:
+ __pblk_end_io_read(pblk, rqd, false);
+ return ret;
+}
+
+/*
+ * Build the ppa list for a multi-sector GC read. An lba is only read if the
+ * L2P table still maps it to the address being garbage collected; lbas that
+ * have moved are marked ADDR_EMPTY in both lba_list and paddr_list_gc.
+ *
+ * Returns the number of sectors placed in rqd->ppa_list.
+ */
+static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
+			      struct pblk_line *line, u64 *lba_list,
+			      u64 *paddr_list_gc, unsigned int nr_secs)
+{
+	struct ppa_addr l2p_ppas[PBLK_MAX_REQ_ADDRS];
+	int nr_valid = 0;
+	int i;
+
+	pblk_lookup_l2p_rand(pblk, l2p_ppas, lba_list, nr_secs);
+
+	for (i = 0; i < nr_secs; i++) {
+		struct ppa_addr gc_ppa;
+
+		if (lba_list[i] == ADDR_EMPTY)
+			continue;
+
+		gc_ppa = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id);
+		if (pblk_ppa_comp(l2p_ppas[i], gc_ppa)) {
+			rqd->ppa_list[nr_valid++] = l2p_ppas[i];
+			continue;
+		}
+
+		/* The lba has been remapped: nothing to collect here */
+		lba_list[i] = ADDR_EMPTY;
+		paddr_list_gc[i] = ADDR_EMPTY;
+	}
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	atomic_long_add(nr_valid, &pblk->inflight_reads);
+#endif
+
+	return nr_valid;
+}
+
+/*
+ * Single-sector variant of read_ppalist_rq_gc(). The sector is only read if
+ * the lba is valid and the L2P table still maps it to the address being
+ * garbage collected.
+ *
+ * Returns 1 if rqd->ppa_addr has been set up for the read, 0 otherwise.
+ */
+static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
+		      struct pblk_line *line, sector_t lba,
+		      u64 paddr_gc)
+{
+	struct ppa_addr mapped, expected;
+
+	if (lba == ADDR_EMPTY)
+		return 0;
+
+	/* logic error: lba out-of-bounds */
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: read lba out of bounds\n");
+		return 0;
+	}
+
+	spin_lock(&pblk->trans_lock);
+	mapped = pblk_trans_map_get(pblk, lba);
+	spin_unlock(&pblk->trans_lock);
+
+	expected = addr_to_gen_ppa(pblk, paddr_gc, line->id);
+	if (!pblk_ppa_comp(mapped, expected))
+		return 0;
+
+	rqd->ppa_addr = mapped;
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	atomic_long_inc(&pblk->inflight_reads);
+#endif
+
+	return 1;
+}
+
+/*
+ * Synchronously read the still-valid sectors of a GC request into
+ * gc_rq->data.
+ *
+ * The lbas in gc_rq are first checked against the L2P table; the number of
+ * sectors that are still valid is stored in gc_rq->secs_to_gc. If none is
+ * valid, nothing is read and NVM_IO_OK is returned.
+ *
+ * Returns NVM_IO_OK on success, -ENOMEM if the metadata buffer cannot be
+ * allocated, the error reported by pblk_bio_map_addr() if the bio cannot be
+ * built, and -EIO if the read submission fails.
+ */
+int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct bio *bio;
+	struct nvm_rq rqd;
+	int data_len;
+	int ret = NVM_IO_OK;
+
+	memset(&rqd, 0, sizeof(struct nvm_rq));
+
+	rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+							&rqd.dma_meta_list);
+	if (!rqd.meta_list)
+		return -ENOMEM;
+
+	if (gc_rq->nr_secs > 1) {
+		/* The ppa list shares the DMA buffer with the metadata */
+		rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size;
+		rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size;
+
+		gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line,
+							gc_rq->lba_list,
+							gc_rq->paddr_list,
+							gc_rq->nr_secs);
+		/* Single-sector requests are addressed through ppa_addr */
+		if (gc_rq->secs_to_gc == 1)
+			rqd.ppa_addr = rqd.ppa_list[0];
+	} else {
+		gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line,
+							gc_rq->lba_list[0],
+							gc_rq->paddr_list[0]);
+	}
+
+	if (!(gc_rq->secs_to_gc))
+		goto out;
+
+	data_len = (gc_rq->secs_to_gc) * geo->csecs;
+	bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
+						PBLK_VMALLOC_META, GFP_KERNEL);
+	if (IS_ERR(bio)) {
+		/* Propagate the failure: without it the caller would see
+		 * NVM_IO_OK and treat gc_rq->data as valid.
+		 */
+		ret = PTR_ERR(bio);
+		pblk_err(pblk, "could not allocate GC bio (%ld)\n",
+				PTR_ERR(bio));
+		goto err_free_dma;
+	}
+
+	bio->bi_iter.bi_sector = 0; /* internal bio */
+	bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+	rqd.opcode = NVM_OP_PREAD;
+	rqd.nr_ppas = gc_rq->secs_to_gc;
+	rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
+	rqd.bio = bio;
+
+	if (pblk_submit_io_sync(pblk, &rqd)) {
+		ret = -EIO;
+		pblk_err(pblk, "GC read request failed\n");
+		goto err_free_bio;
+	}
+
+	pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs);
+
+	atomic_dec(&pblk->inflight_io);
+
+	if (rqd.error) {
+		atomic_long_inc(&pblk->read_failed_gc);
+#ifdef CONFIG_NVM_PBLK_DEBUG
+		pblk_print_failed_rqd(pblk, &rqd, rqd.error);
+#endif
+	}
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads);
+	atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads);
+	atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads);
+#endif
+
+out:
+	nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
+	return ret;
+
+err_free_bio:
+	bio_put(bio);
+err_free_dma:
+	nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
+	return ret;
+}
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
new file mode 100644
index 000000000..df75d9cae
--- /dev/null
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -0,0 +1,1011 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-recovery.c - pblk's recovery path
+ */
+
+#include "pblk.h"
+
+/*
+ * Sanity-check an emeta buffer read back from the media.
+ *
+ * Returns 0 when the CRC stored in @emeta_buf matches the value computed by
+ * pblk_calc_emeta_crc() and the header identifier equals PBLK_MAGIC;
+ * returns 1 otherwise.
+ */
+int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
+{
+	u32 computed = pblk_calc_emeta_crc(pblk, emeta_buf);
+	u32 stored = le32_to_cpu(emeta_buf->crc);
+
+	if (stored != computed)
+		return 1;
+
+	return le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC ? 1 : 0;
+}
+
+/*
+ * Rebuild the L2P entries of @line from the lba list stored in its emeta.
+ *
+ * Every sector between the end of smeta and line->emeta_ssec is visited.
+ * Sectors on blocks flagged in line->blk_bitmap are skipped, sectors whose
+ * lba is ADDR_EMPTY are marked invalid (decrementing line->vsc), and all
+ * others are inserted into the map with pblk_update_map().
+ *
+ * Returns 1 if the lba list cannot be located in the emeta buffer, 0
+ * otherwise. A mismatch against emeta's nr_valid_lbas is only logged.
+ */
+static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_emeta *emeta = line->emeta;
+	struct line_emeta *emeta_buf = emeta->buf;
+	__le64 *lba_list = emeta_to_lbas(pblk, emeta_buf);
+	u64 first_sec, last_sec, paddr;
+	u64 expected_lbas, mapped_lbas = 0;
+
+	if (!lba_list)
+		return 1;
+
+	first_sec = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
+	last_sec = line->emeta_ssec;
+	expected_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas);
+
+	for (paddr = first_sec; paddr < last_sec; paddr++) {
+		struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+		int pos = pblk_ppa_to_pos(geo, ppa);
+
+		/* Do not update bad blocks */
+		if (test_bit(pos, line->blk_bitmap))
+			continue;
+
+		if (le64_to_cpu(lba_list[paddr]) != ADDR_EMPTY) {
+			pblk_update_map(pblk, le64_to_cpu(lba_list[paddr]),
+					ppa);
+			mapped_lbas++;
+			continue;
+		}
+
+		/* Padded/empty sector: account for it as invalid */
+		spin_lock(&line->lock);
+		if (!test_and_set_bit(paddr, line->invalid_bitmap))
+			le32_add_cpu(line->vsc, -1);
+		else
+			WARN_ONCE(1, "pblk: rec. double invalidate:\n");
+		spin_unlock(&line->lock);
+	}
+
+	if (expected_lbas != mapped_lbas)
+		pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n",
+				line->id, expected_lbas, mapped_lbas);
+
+	line->left_msecs = 0;
+
+	return 0;
+}
+
+/*
+ * Number of sectors in @line left once smeta, emeta (emeta_sec[0]) and the
+ * sectors of every block set in line->blk_bitmap have been subtracted.
+ */
+static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct nvm_geo *geo = &pblk->dev->geo;
+	int bad_blks = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+
+	return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] -
+				bad_blks * geo->clba;
+}
+
+/*
+ * Bundle of pre-allocated resources handed by value to the OOB scan/read
+ * helpers in this file, so a single request, data buffer and pair of DMA
+ * lists can be reused for every read issued while recovering one line.
+ */
+struct pblk_recov_alloc {
+ struct ppa_addr *ppa_list; /* assigned to rqd->ppa_list by the helpers */
+ struct pblk_sec_meta *meta_list; /* per-sector OOB metadata; .lba is read */
+ struct nvm_rq *rqd; /* request that is zeroed and refilled per read */
+ void *data; /* kernel buffer mapped with bio_map_kern() */
+ dma_addr_t dma_ppa_list; /* DMA address paired with ppa_list */
+ dma_addr_t dma_meta_list; /* DMA address paired with meta_list */
+};
+
+/*
+ * Read back the sectors of @line between @r_ptr and line->cur_sec and restore
+ * their L2P mappings from the lbas found in the OOB metadata.
+ *
+ * @p carries the request, data buffer and DMA lists that are reused for every
+ * read. While building the ppa list, groups of min_write_pgs sectors that
+ * land on a block set in line->blk_bitmap are skipped.
+ *
+ * Returns 0 on success (or when there is nothing to read), the error reported
+ * by bio_map_kern() or pblk_submit_io_sync(), or -EINTR when the request
+ * completes with an error other than NVM_RSP_WARN_HIGHECC.
+ */
+static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
+			       struct pblk_recov_alloc p, u64 r_ptr)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct ppa_addr *ppa_list;
+	struct pblk_sec_meta *meta_list;
+	struct nvm_rq *rqd;
+	struct bio *bio;
+	void *data;
+	dma_addr_t dma_ppa_list, dma_meta_list;
+	u64 r_ptr_int;
+	int left_ppas;
+	int rq_ppas, rq_len;
+	int i, j;
+	int ret = 0;
+
+	ppa_list = p.ppa_list;
+	meta_list = p.meta_list;
+	rqd = p.rqd;
+	data = p.data;
+	dma_ppa_list = p.dma_ppa_list;
+	dma_meta_list = p.dma_meta_list;
+
+	left_ppas = line->cur_sec - r_ptr;
+	if (!left_ppas)
+		return 0;
+
+	r_ptr_int = r_ptr;
+
+next_read_rq:
+	memset(rqd, 0, pblk_g_rq_size);
+
+	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+	if (!rq_ppas)
+		rq_ppas = pblk->min_write_pgs;
+	rq_len = rq_ppas * geo->csecs;
+
+	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	bio->bi_iter.bi_sector = 0; /* internal bio */
+	bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+	rqd->bio = bio;
+	rqd->opcode = NVM_OP_PREAD;
+	rqd->meta_list = meta_list;
+	rqd->nr_ppas = rq_ppas;
+	rqd->ppa_list = ppa_list;
+	rqd->dma_ppa_list = dma_ppa_list;
+	rqd->dma_meta_list = dma_meta_list;
+
+	if (pblk_io_aligned(pblk, rq_ppas))
+		rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
+	else
+		rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
+
+	for (i = 0; i < rqd->nr_ppas; ) {
+		struct ppa_addr ppa;
+		int pos;
+
+		ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+		pos = pblk_ppa_to_pos(geo, ppa);
+
+		/* Step over groups of sectors that sit on a bad block */
+		while (test_bit(pos, line->blk_bitmap)) {
+			r_ptr_int += pblk->min_write_pgs;
+			ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+			pos = pblk_ppa_to_pos(geo, ppa);
+		}
+
+		for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
+			rqd->ppa_list[i] =
+				addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+	}
+
+	/* If read fails, more padding is needed */
+	ret = pblk_submit_io_sync(pblk, rqd);
+	if (ret) {
+		pblk_err(pblk, "I/O submission failed: %d\n", ret);
+		/* Drop the bio mapped above, as pblk_recov_scan_oob() does */
+		bio_put(bio);
+		return ret;
+	}
+
+	atomic_dec(&pblk->inflight_io);
+
+	/* At this point, the read should not fail. If it does, it is a problem
+	 * we cannot recover from here. Need FTL log.
+	 */
+	if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
+		pblk_err(pblk, "L2P recovery failed (%d)\n", rqd->error);
+		return -EINTR;
+	}
+
+	for (i = 0; i < rqd->nr_ppas; i++) {
+		u64 lba = le64_to_cpu(meta_list[i].lba);
+
+		/* Ignore padded sectors and lbas outside the exported range */
+		if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+			continue;
+
+		pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+	}
+
+	left_ppas -= rq_ppas;
+	if (left_ppas > 0)
+		goto next_read_rq;
+
+	return 0;
+}
+
+/* kref release callback: signal the completion embedded in the pad request */
+static void pblk_recov_complete(struct kref *ref)
+{
+	complete(&container_of(ref, struct pblk_pad_rq, ref)->wait);
+}
+
+/*
+ * end_io handler installed on the pad writes built by pblk_recov_pad_oob().
+ * Releases the pages and the request, then drops this request's reference on
+ * the shared pad context found in rqd->private.
+ */
+static void pblk_end_io_recov(struct nvm_rq *rqd)
+{
+	struct pblk_pad_rq *pad = rqd->private;
+	struct pblk *pblk = pad->pblk;
+
+	/* Counterpart of the pblk_down_page() done before submission */
+	pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
+	pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
+	atomic_dec(&pblk->inflight_io);
+
+	/* When the last reference goes, pblk_recov_complete() fires */
+	kref_put(&pad->ref, pblk_recov_complete);
+}
+
+/*
+ * Pad @line with up to @left_ppas empty sectors.
+ *
+ * Pad writes are built in a loop and submitted asynchronously with
+ * pblk_submit_io(); each one holds a reference on a shared pblk_pad_rq that
+ * is dropped from pblk_end_io_recov(). Once every request has been submitted
+ * the initial reference is dropped and the function waits (with a timeout of
+ * PBLK_COMMAND_TIMEOUT_MS) for the completion to fire.
+ *
+ * Every padded sector is invalidated in the line, and its lba is recorded as
+ * ADDR_EMPTY both in the emeta lba list and in the OOB metadata.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, the error from
+ * pblk_bio_map_addr()/pblk_submit_io(), or -ETIME if the wait times out.
+ */
+static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
+ int left_ppas)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct ppa_addr *ppa_list;
+ struct pblk_sec_meta *meta_list;
+ struct pblk_pad_rq *pad_rq;
+ struct nvm_rq *rqd;
+ struct bio *bio;
+ void *data;
+ dma_addr_t dma_ppa_list, dma_meta_list;
+ __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf);
+ u64 w_ptr = line->cur_sec;
+ int left_line_ppas, rq_ppas, rq_len;
+ int i, j;
+ int ret = 0;
+
+ spin_lock(&line->lock);
+ left_line_ppas = line->left_msecs;
+ spin_unlock(&line->lock);
+
+ pad_rq = kmalloc(sizeof(struct pblk_pad_rq), GFP_KERNEL);
+ if (!pad_rq)
+ return -ENOMEM;
+
+ /* One zeroed buffer is shared as payload by every pad write */
+ data = vzalloc(array_size(pblk->max_write_pgs, geo->csecs));
+ if (!data) {
+ ret = -ENOMEM;
+ goto free_rq;
+ }
+
+ pad_rq->pblk = pblk;
+ init_completion(&pad_rq->wait);
+ kref_init(&pad_rq->ref);
+
+next_pad_rq:
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ if (rq_ppas < pblk->min_write_pgs) {
+ pblk_err(pblk, "corrupted pad line %d\n", line->id);
+ /* NOTE(review): ret is not updated on this path, so the
+ * function returns whatever ret currently holds (0 on the
+ * first iteration) - confirm this is intended.
+ */
+ goto fail_free_pad;
+ }
+
+ rq_len = rq_ppas * geo->csecs;
+
+ meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
+ if (!meta_list) {
+ ret = -ENOMEM;
+ goto fail_free_pad;
+ }
+
+ /* The ppa list lives in the same DMA allocation, after the metadata */
+ ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
+ dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+
+ bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
+ PBLK_VMALLOC_META, GFP_KERNEL);
+ if (IS_ERR(bio)) {
+ ret = PTR_ERR(bio);
+ goto fail_free_meta;
+ }
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
+
+ rqd->bio = bio;
+ rqd->opcode = NVM_OP_PWRITE;
+ rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
+ rqd->meta_list = meta_list;
+ rqd->nr_ppas = rq_ppas;
+ rqd->ppa_list = ppa_list;
+ rqd->dma_ppa_list = dma_ppa_list;
+ rqd->dma_meta_list = dma_meta_list;
+ rqd->end_io = pblk_end_io_recov;
+ rqd->private = pad_rq;
+
+ for (i = 0; i < rqd->nr_ppas; ) {
+ struct ppa_addr ppa;
+ int pos;
+
+ w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+ ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+ pos = pblk_ppa_to_pos(geo, ppa);
+
+ /* Step over groups of sectors that sit on a bad block */
+ while (test_bit(pos, line->blk_bitmap)) {
+ w_ptr += pblk->min_write_pgs;
+ ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+ pos = pblk_ppa_to_pos(geo, ppa);
+ }
+
+ for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) {
+ struct ppa_addr dev_ppa;
+ __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
+
+ dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+
+ pblk_map_invalidate(pblk, dev_ppa);
+ lba_list[w_ptr] = meta_list[i].lba = addr_empty;
+ rqd->ppa_list[i] = dev_ppa;
+ }
+ }
+
+ /* Reference held by the in-flight request; dropped in end_io */
+ kref_get(&pad_rq->ref);
+ pblk_down_page(pblk, rqd->ppa_list, rqd->nr_ppas);
+
+ ret = pblk_submit_io(pblk, rqd);
+ if (ret) {
+ pblk_err(pblk, "I/O submission failed: %d\n", ret);
+ pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
+ /* NOTE(review): rqd from pblk_alloc_rqd() is not released on
+ * this path, and the fail_* labels below free pad_rq and data
+ * even if requests submitted in earlier iterations are still
+ * in flight (their end_io dereferences pad_rq) - confirm.
+ */
+ goto fail_free_bio;
+ }
+
+ left_line_ppas -= rq_ppas;
+ left_ppas -= rq_ppas;
+ if (left_ppas && left_line_ppas)
+ goto next_pad_rq;
+
+ /* Drop the initial reference taken by kref_init() */
+ kref_put(&pad_rq->ref, pblk_recov_complete);
+
+ if (!wait_for_completion_io_timeout(&pad_rq->wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pblk_err(pblk, "pad write timed out\n");
+ ret = -ETIME;
+ /* NOTE(review): execution continues and frees data/pad_rq
+ * below even though the completion has not fired - confirm
+ * no request can still reference them after a timeout.
+ */
+ }
+
+ if (!pblk_line_is_full(line))
+ pblk_err(pblk, "corrupted padded line: %d\n", line->id);
+
+ vfree(data);
+free_rq:
+ kfree(pad_rq);
+ return ret;
+
+fail_free_bio:
+ bio_put(bio);
+fail_free_meta:
+ nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
+fail_free_pad:
+ kfree(pad_rq);
+ vfree(data);
+ return ret;
+}
+
+/* When this function is called, it means that not all upper pages have been
+ * written in a page that contains valid data. In order to recover this data, we
+ * first find the write pointer on the device, then we pad all necessary
+ * sectors, and finally attempt to read the valid data
+ *
+ * @p carries the request, data buffer and DMA lists reused for every read.
+ *
+ * Returns 0, or the error reported by bio_map_kern()/pblk_submit_io_sync().
+ * Failures of the padding and read-back steps triggered when the end of the
+ * written area is found are logged but not propagated to the caller.
+ */
+static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
+				   struct pblk_recov_alloc p)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct ppa_addr *ppa_list;
+	struct pblk_sec_meta *meta_list;
+	struct nvm_rq *rqd;
+	struct bio *bio;
+	void *data;
+	dma_addr_t dma_ppa_list, dma_meta_list;
+	u64 w_ptr = 0, r_ptr;
+	int rq_ppas, rq_len;
+	int i, j;
+	int ret = 0;
+	int rec_round;
+	int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec;
+
+	ppa_list = p.ppa_list;
+	meta_list = p.meta_list;
+	rqd = p.rqd;
+	data = p.data;
+	dma_ppa_list = p.dma_ppa_list;
+	dma_meta_list = p.dma_meta_list;
+
+	/* we could recover up until the line write pointer */
+	r_ptr = line->cur_sec;
+	rec_round = 0;
+
+next_rq:
+	memset(rqd, 0, pblk_g_rq_size);
+
+	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+	if (!rq_ppas)
+		rq_ppas = pblk->min_write_pgs;
+	rq_len = rq_ppas * geo->csecs;
+
+	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	bio->bi_iter.bi_sector = 0; /* internal bio */
+	bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+	rqd->bio = bio;
+	rqd->opcode = NVM_OP_PREAD;
+	rqd->meta_list = meta_list;
+	rqd->nr_ppas = rq_ppas;
+	rqd->ppa_list = ppa_list;
+	rqd->dma_ppa_list = dma_ppa_list;
+	rqd->dma_meta_list = dma_meta_list;
+
+	if (pblk_io_aligned(pblk, rq_ppas))
+		rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
+	else
+		rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
+
+	for (i = 0; i < rqd->nr_ppas; ) {
+		struct ppa_addr ppa;
+		int pos;
+
+		w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+		ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+		pos = pblk_ppa_to_pos(geo, ppa);
+
+		/* Step over groups of sectors that sit on a bad block */
+		while (test_bit(pos, line->blk_bitmap)) {
+			w_ptr += pblk->min_write_pgs;
+			ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+			pos = pblk_ppa_to_pos(geo, ppa);
+		}
+
+		for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
+			rqd->ppa_list[i] =
+				addr_to_gen_ppa(pblk, w_ptr, line->id);
+	}
+
+	ret = pblk_submit_io_sync(pblk, rqd);
+	if (ret) {
+		pblk_err(pblk, "I/O submission failed: %d\n", ret);
+		/* Drop the bio mapped above, as pblk_recov_scan_oob() does */
+		bio_put(bio);
+		return ret;
+	}
+
+	atomic_dec(&pblk->inflight_io);
+
+	/* This should not happen since the read failed during normal recovery,
+	 * but the media works funny sometimes...
+	 */
+	if (!rec_round++ && !rqd->error) {
+		rec_round = 0;
+		for (i = 0; i < rqd->nr_ppas; i++, r_ptr++) {
+			u64 lba = le64_to_cpu(meta_list[i].lba);
+
+			if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+				continue;
+
+			pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+		}
+	}
+
+	/* Reached the end of the written line */
+	if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) {
+		int pad_secs, nr_error_bits, bit;
+		/* Kept separate from the function's ret on purpose: these
+		 * steps are best effort and only reported through the log.
+		 */
+		int oob_ret;
+
+		bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
+		nr_error_bits = rqd->nr_ppas - bit;
+
+		/* Roll back failed sectors */
+		line->cur_sec -= nr_error_bits;
+		line->left_msecs += nr_error_bits;
+		bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits);
+
+		pad_secs = pblk_pad_distance(pblk);
+		if (pad_secs > line->left_msecs)
+			pad_secs = line->left_msecs;
+
+		oob_ret = pblk_recov_pad_oob(pblk, line, pad_secs);
+		if (oob_ret)
+			pblk_err(pblk, "OOB padding failed (err:%d)\n",
+					oob_ret);
+
+		oob_ret = pblk_recov_read_oob(pblk, line, p, r_ptr);
+		if (oob_ret)
+			pblk_err(pblk, "OOB read failed (err:%d)\n", oob_ret);
+
+		left_ppas = 0;
+	}
+
+	left_ppas -= rq_ppas;
+	if (left_ppas > 0)
+		goto next_rq;
+
+	return ret;
+}
+
+/*
+ * Scan @line from its current write position, reading groups of sectors and
+ * restoring L2P mappings from the lbas found in the OOB metadata.
+ *
+ * @p carries the request, data buffer and DMA lists reused for every read.
+ * *@done is set to 1 on entry and cleared to 0 when a read completes with an
+ * error other than NVM_RSP_ERR_EMPTYPAGE. On any read error the sectors from
+ * the first failed one onwards are rolled back in the line and scanning
+ * stops after the successfully read prefix has been mapped.
+ *
+ * Returns 0, or the error from bio_map_kern()/pblk_submit_io_sync().
+ */
+static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
+ struct pblk_recov_alloc p, int *done)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct ppa_addr *ppa_list;
+ struct pblk_sec_meta *meta_list;
+ struct nvm_rq *rqd;
+ struct bio *bio;
+ void *data;
+ dma_addr_t dma_ppa_list, dma_meta_list;
+ u64 paddr;
+ int rq_ppas, rq_len;
+ int i, j;
+ int ret = 0;
+ int left_ppas = pblk_calc_sec_in_line(pblk, line);
+
+ ppa_list = p.ppa_list;
+ meta_list = p.meta_list;
+ rqd = p.rqd;
+ data = p.data;
+ dma_ppa_list = p.dma_ppa_list;
+ dma_meta_list = p.dma_meta_list;
+
+ *done = 1;
+
+next_rq:
+ memset(rqd, 0, pblk_g_rq_size);
+
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ if (!rq_ppas)
+ rq_ppas = pblk->min_write_pgs;
+ rq_len = rq_ppas * geo->csecs;
+
+ bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+ rqd->bio = bio;
+ rqd->opcode = NVM_OP_PREAD;
+ rqd->meta_list = meta_list;
+ rqd->nr_ppas = rq_ppas;
+ rqd->ppa_list = ppa_list;
+ rqd->dma_ppa_list = dma_ppa_list;
+ rqd->dma_meta_list = dma_meta_list;
+
+ if (pblk_io_aligned(pblk, rq_ppas))
+ rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
+ else
+ rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
+
+ for (i = 0; i < rqd->nr_ppas; ) {
+ struct ppa_addr ppa;
+ int pos;
+
+ paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+ ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+ pos = pblk_ppa_to_pos(geo, ppa);
+
+ /* Step over groups of sectors that sit on a bad block */
+ while (test_bit(pos, line->blk_bitmap)) {
+ paddr += pblk->min_write_pgs;
+ ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+ pos = pblk_ppa_to_pos(geo, ppa);
+ }
+
+ for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++)
+ rqd->ppa_list[i] =
+ addr_to_gen_ppa(pblk, paddr, line->id);
+ }
+
+ ret = pblk_submit_io_sync(pblk, rqd);
+ if (ret) {
+ pblk_err(pblk, "I/O submission failed: %d\n", ret);
+ bio_put(bio);
+ return ret;
+ }
+
+ atomic_dec(&pblk->inflight_io);
+
+ /* Reached the end of the written line */
+ if (rqd->error) {
+ int nr_error_bits, bit;
+
+ /* First sector flagged in ppa_status; all from there are undone */
+ bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
+ nr_error_bits = rqd->nr_ppas - bit;
+
+ /* Roll back failed sectors */
+ line->cur_sec -= nr_error_bits;
+ line->left_msecs += nr_error_bits;
+ bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits);
+
+ /* Stop after this request and only map the sectors before bit */
+ left_ppas = 0;
+ rqd->nr_ppas = bit;
+
+ if (rqd->error != NVM_RSP_ERR_EMPTYPAGE)
+ *done = 0;
+ }
+
+ for (i = 0; i < rqd->nr_ppas; i++) {
+ u64 lba = le64_to_cpu(meta_list[i].lba);
+
+ /* Ignore padded sectors and lbas outside the exported range */
+ if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+ continue;
+
+ pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+ }
+
+ left_ppas -= rq_ppas;
+ if (left_ppas > 0)
+ goto next_rq;
+
+ return ret;
+}
+
+/*
+ * Scan line for lbas on the out-of-band (OOB) area.
+ *
+ * Allocates the DMA metadata/ppa lists, a data buffer and a read request,
+ * then runs pblk_recov_scan_oob() and, if that scan reports it is not done,
+ * pblk_recov_scan_all_oob(). A line that ends up full is closed with
+ * pblk_line_recov_close().
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error returned
+ * by the scan helpers.
+ */
+static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct nvm_rq *rqd;
+	struct ppa_addr *ppa_list;
+	struct pblk_sec_meta *meta_list;
+	struct pblk_recov_alloc p;
+	void *data;
+	dma_addr_t dma_ppa_list, dma_meta_list;
+	int done, ret = 0;
+
+	meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
+	if (!meta_list)
+		return -ENOMEM;
+
+	/* The ppa list lives in the same DMA allocation, after the metadata */
+	ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
+	dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+
+	data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL);
+	if (!data) {
+		ret = -ENOMEM;
+		goto free_meta_list;
+	}
+
+	rqd = pblk_alloc_rqd(pblk, PBLK_READ);
+
+	p.ppa_list = ppa_list;
+	p.meta_list = meta_list;
+	p.rqd = rqd;
+	p.data = data;
+	p.dma_ppa_list = dma_ppa_list;
+	p.dma_meta_list = dma_meta_list;
+
+	ret = pblk_recov_scan_oob(pblk, line, p, &done);
+	if (ret) {
+		pblk_err(pblk, "could not recover L2P from OOB\n");
+		goto out;
+	}
+
+	if (!done) {
+		ret = pblk_recov_scan_all_oob(pblk, line, p);
+		if (ret) {
+			pblk_err(pblk, "could not recover L2P from OOB\n");
+			goto out;
+		}
+	}
+
+	if (pblk_line_is_full(line))
+		pblk_line_recov_close(pblk, line);
+
+out:
+	/* Give the request back to its pool; it was previously leaked. The
+	 * DMA metadata is released explicitly below, so only the request
+	 * itself is returned here.
+	 * NOTE(review): assumes PBLK_READ requests are allocated from
+	 * pblk->r_rq_pool - confirm against pblk_alloc_rqd().
+	 */
+	mempool_free(rqd, &pblk->r_rq_pool);
+	kfree(data);
+free_meta_list:
+	nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
+
+	return ret;
+}
+
+/* Insert @line into @head, keeping the list sorted by ascending seq_nr */
+static void pblk_recov_line_add_ordered(struct list_head *head,
+					struct pblk_line *line)
+{
+	struct list_head *pos;
+
+	/* Stop at the first entry with a larger seq_nr, or back at @head */
+	list_for_each(pos, head) {
+		struct pblk_line *cur = list_entry(pos, struct pblk_line, list);
+
+		if (cur->seq_nr > line->seq_nr)
+			break;
+	}
+
+	/* Link @line immediately before @pos */
+	__list_add(&line->list, pos->prev, pos);
+}
+
+/*
+ * Find the line-relative sector at which emeta starts in @line.
+ *
+ * Walks backwards from the end of the line, counting only sectors whose
+ * block is not set in line->blk_bitmap, until emeta_sec[0] sectors have been
+ * accounted for.
+ */
+static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	unsigned int remaining = lm->emeta_sec[0];
+	u64 paddr = lm->sec_per_line;
+
+	while (remaining) {
+		struct ppa_addr ppa;
+
+		paddr--;
+		ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+		if (!test_bit(pblk_ppa_to_pos(geo, ppa), line->blk_bitmap))
+			remaining--;
+	}
+
+	return paddr;
+}
+
+/*
+ * Check the emeta header version of a recovered line.
+ *
+ * Returns 1 (after logging) when the major version differs from
+ * EMETA_VERSION_MAJOR, 0 otherwise. With CONFIG_NVM_PBLK_DEBUG a newer minor
+ * version is reported but still accepted.
+ */
+static int pblk_recov_check_line_version(struct pblk *pblk,
+					 struct line_emeta *emeta)
+{
+	struct line_header *hdr = &emeta->header;
+
+	if (hdr->version_major == EMETA_VERSION_MAJOR) {
+#ifdef CONFIG_NVM_PBLK_DEBUG
+		if (hdr->version_minor > EMETA_VERSION_MINOR)
+			pblk_info(pblk, "newer line minor version found: %d\n",
+					hdr->version_minor);
+#endif
+		return 0;
+	}
+
+	pblk_err(pblk, "line major version mismatch: %d, expected: %d\n",
+		 hdr->version_major, EMETA_VERSION_MAJOR);
+	return 1;
+}
+
+/*
+ * Restore the user/pad/gc write-amplification counters from @emeta, both
+ * into the live atomic counters and into the *_rst_wa fields. Nothing is
+ * done for emeta older than version 0.2.
+ */
+static void pblk_recov_wa_counters(struct pblk *pblk,
+				   struct line_emeta *emeta)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct line_header *hdr = &emeta->header;
+	struct wa_counters *wa = emeta_to_wa(lm, emeta);
+	u64 user, pad, gc;
+
+	/* WA counters were introduced in emeta version 0.2 */
+	if (!(hdr->version_major > 0 || hdr->version_minor >= 2))
+		return;
+
+	user = le64_to_cpu(wa->user);
+	pad = le64_to_cpu(wa->pad);
+	gc = le64_to_cpu(wa->gc);
+
+	atomic64_set(&pblk->user_wa, user);
+	atomic64_set(&pblk->pad_wa, pad);
+	atomic64_set(&pblk->gc_wa, gc);
+
+	pblk->user_rst_wa = user;
+	pblk->pad_rst_wa = pad;
+	pblk->gc_rst_wa = gc;
+}
+
+/*
+ * Tell whether @line may hold data worth recovering.
+ *
+ * Returns 0 if the line is in PBLK_LINESTATE_BAD, if every block in it is
+ * flagged in line->blk_bitmap, or if the chunk backing its first good block
+ * is in the NVM_CHK_ST_FREE state; returns 1 otherwise.
+ */
+static int pblk_line_was_written(struct pblk_line *line,
+				 struct pblk *pblk)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct nvm_geo *geo = &pblk->dev->geo;
+	struct nvm_chk_meta *chunk;
+	struct ppa_addr bppa;
+	int first_good;
+
+	if (line->state == PBLK_LINESTATE_BAD)
+		return 0;
+
+	/* smeta is placed on the first block that is not bad */
+	first_good = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+	if (first_good >= lm->blk_per_line)
+		return 0;
+
+	bppa = pblk->luns[first_good].bppa;
+	chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)];
+
+	return (chunk->state & NVM_CHK_ST_FREE) ? 0 : 1;
+}
+
+/*
+ * Scan-based recovery of the L2P table.
+ *
+ * Pass 1 reads smeta of every line that appears to have been written,
+ * validates it (CRC, magic, major version, instance uuid) and queues the
+ * line on a local list ordered by sequence number. Pass 2 walks that list
+ * and restores mappings from emeta, falling back to an OOB scan when emeta
+ * cannot be read, fails validation, or has no usable lba list. Full lines
+ * are closed and moved to a GC list; a line that is not full is returned as
+ * the data line to keep writing to.
+ *
+ * Returns the open data line, NULL if no open line was found (or nothing
+ * could be recovered), or ERR_PTR(-EINVAL) on an incompatible smeta/emeta
+ * major version.
+ */
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *line, *tline, *data_line = NULL;
+ struct pblk_smeta *smeta;
+ struct pblk_emeta *emeta;
+ struct line_smeta *smeta_buf;
+ int found_lines = 0, recovered_lines = 0, open_lines = 0;
+ int is_next = 0;
+ int meta_line;
+ int i, valid_uuid = 0;
+ LIST_HEAD(recov_list);
+
+ /* TODO: Implement FTL snapshot */
+
+ /* Scan recovery - takes place when FTL snapshot fails */
+ /* Reserve one smeta/emeta buffer pair, shared by every line scanned */
+ spin_lock(&l_mg->free_lock);
+ meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+ set_bit(meta_line, &l_mg->meta_bitmap);
+ smeta = l_mg->sline_meta[meta_line];
+ emeta = l_mg->eline_meta[meta_line];
+ smeta_buf = (struct line_smeta *)smeta;
+ spin_unlock(&l_mg->free_lock);
+
+ /* Order data lines using their sequence number */
+ for (i = 0; i < l_mg->nr_lines; i++) {
+ u32 crc;
+
+ line = &pblk->lines[i];
+
+ memset(smeta, 0, lm->smeta_len);
+ line->smeta = smeta;
+ line->lun_bitmap = ((void *)(smeta_buf)) +
+ sizeof(struct line_smeta);
+
+ if (!pblk_line_was_written(line, pblk))
+ continue;
+
+ /* Lines that cannot be read are assumed as not written here */
+ if (pblk_line_read_smeta(pblk, line))
+ continue;
+
+ crc = pblk_calc_smeta_crc(pblk, smeta_buf);
+ if (le32_to_cpu(smeta_buf->crc) != crc)
+ continue;
+
+ if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC)
+ continue;
+
+ if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) {
+ pblk_err(pblk, "found incompatible line version %u\n",
+ smeta_buf->header.version_major);
+ /* NOTE(review): returns with meta_line still set in
+ * l_mg->meta_bitmap - confirm the caller's teardown
+ * makes that harmless.
+ */
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* The first valid instance uuid is used for initialization */
+ if (!valid_uuid) {
+ memcpy(pblk->instance_uuid, smeta_buf->header.uuid, 16);
+ valid_uuid = 1;
+ }
+
+ if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) {
+ pblk_debug(pblk, "ignore line %u due to uuid mismatch\n",
+ i);
+ continue;
+ }
+
+ /* Update line metadata */
+ spin_lock(&line->lock);
+ line->id = le32_to_cpu(smeta_buf->header.id);
+ line->type = le16_to_cpu(smeta_buf->header.type);
+ line->seq_nr = le64_to_cpu(smeta_buf->seq_nr);
+ spin_unlock(&line->lock);
+
+ /* Update general metadata */
+ spin_lock(&l_mg->free_lock);
+ if (line->seq_nr >= l_mg->d_seq_nr)
+ l_mg->d_seq_nr = line->seq_nr + 1;
+ l_mg->nr_free_lines--;
+ spin_unlock(&l_mg->free_lock);
+
+ if (pblk_line_recov_alloc(pblk, line))
+ goto out;
+
+ pblk_recov_line_add_ordered(&recov_list, line);
+ found_lines++;
+ /* NOTE(review): seq_nr is passed here without the le64_to_cpu()
+ * conversion applied a few lines above; line->seq_nr holds the
+ * converted value.
+ */
+ pblk_debug(pblk, "recovering data line %d, seq:%llu\n",
+ line->id, smeta_buf->seq_nr);
+ }
+
+ if (!found_lines) {
+ /* Nothing to recover: new instance uuid, release the buffers */
+ pblk_setup_uuid(pblk);
+
+ spin_lock(&l_mg->free_lock);
+ WARN_ON_ONCE(!test_and_clear_bit(meta_line,
+ &l_mg->meta_bitmap));
+ spin_unlock(&l_mg->free_lock);
+
+ goto out;
+ }
+
+ /* Verify closed blocks and recover this portion of L2P table*/
+ list_for_each_entry_safe(line, tline, &recov_list, list) {
+ recovered_lines++;
+
+ line->emeta_ssec = pblk_line_emeta_start(pblk, line);
+ line->emeta = emeta;
+ memset(line->emeta->buf, 0, lm->emeta_len[0]);
+
+ /* Unreadable or invalid emeta: fall back to scanning OOB */
+ if (pblk_line_read_emeta(pblk, line, line->emeta->buf)) {
+ pblk_recov_l2p_from_oob(pblk, line);
+ goto next;
+ }
+
+ if (pblk_recov_check_emeta(pblk, line->emeta->buf)) {
+ pblk_recov_l2p_from_oob(pblk, line);
+ goto next;
+ }
+
+ if (pblk_recov_check_line_version(pblk, line->emeta->buf))
+ return ERR_PTR(-EINVAL);
+
+ pblk_recov_wa_counters(pblk, line->emeta->buf);
+
+ if (pblk_recov_l2p_from_emeta(pblk, line))
+ pblk_recov_l2p_from_oob(pblk, line);
+
+next:
+ if (pblk_line_is_full(line)) {
+ struct list_head *move_list;
+
+ spin_lock(&line->lock);
+ line->state = PBLK_LINESTATE_CLOSED;
+ move_list = pblk_line_gc_list(pblk, line);
+ spin_unlock(&line->lock);
+
+ spin_lock(&l_mg->gc_lock);
+ list_move_tail(&line->list, move_list);
+ spin_unlock(&l_mg->gc_lock);
+
+ kfree(line->map_bitmap);
+ line->map_bitmap = NULL;
+ line->smeta = NULL;
+ line->emeta = NULL;
+ } else {
+ /* NOTE(review): this check runs before the increment, so
+ * the error is only logged from the third open line on -
+ * confirm that is the intended threshold.
+ */
+ if (open_lines > 1)
+ pblk_err(pblk, "failed to recover L2P\n");
+
+ open_lines++;
+ line->meta_line = meta_line;
+ data_line = line;
+ }
+ }
+
+ if (!open_lines) {
+ /* No open line keeps the metadata buffers: release them */
+ spin_lock(&l_mg->free_lock);
+ WARN_ON_ONCE(!test_and_clear_bit(meta_line,
+ &l_mg->meta_bitmap));
+ spin_unlock(&l_mg->free_lock);
+ pblk_line_replace_data(pblk);
+ } else {
+ spin_lock(&l_mg->free_lock);
+ /* Allocate next line for preparation */
+ l_mg->data_next = pblk_line_get(pblk);
+ if (l_mg->data_next) {
+ l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+ l_mg->data_next->type = PBLK_LINETYPE_DATA;
+ is_next = 1;
+ }
+ spin_unlock(&l_mg->free_lock);
+ }
+
+ /* Erase outside of free_lock */
+ if (is_next)
+ pblk_line_erase(pblk, l_mg->data_next);
+
+out:
+ if (found_lines != recovered_lines)
+ pblk_err(pblk, "failed to recover all found lines %d/%d\n",
+ found_lines, recovered_lines);
+
+ return data_line;
+}
+
+/*
+ * Pad the current data line with its remaining sectors and, if padding
+ * succeeds, close its metadata. Returns 0 on success or the error reported
+ * by pblk_recov_pad_oob().
+ */
+int pblk_recov_pad(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line *line;
+	int left_msecs;
+	int ret;
+
+	spin_lock(&l_mg->free_lock);
+	line = l_mg->data_line;
+	left_msecs = line->left_msecs;
+	spin_unlock(&l_mg->free_lock);
+
+	ret = pblk_recov_pad_oob(pblk, line, left_msecs);
+	if (!ret) {
+		pblk_line_close_meta(pblk, line);
+		return ret;
+	}
+
+	pblk_err(pblk, "tear down padding failed (%d)\n", ret);
+	return ret;
+}
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
new file mode 100644
index 000000000..6a0616a6f
--- /dev/null
+++ b/drivers/lightnvm/pblk-rl.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rl.c - pblk's rate limiter for user I/O
+ *
+ */
+
+#include "pblk.h"
+
+/* (Re)arm the user-activity timer to expire 5000 ms from now */
+static void pblk_rl_kick_u_timer(struct pblk_rl *rl)
+{
+	unsigned long expires = jiffies + msecs_to_jiffies(5000);
+
+	mod_timer(&rl->u_timer, expires);
+}
+
+/* Returns non-zero when the rb_space counter has reached exactly zero */
+int pblk_rl_is_limit(struct pblk_rl *rl)
+{
+	return atomic_read(&rl->rb_space) == 0;
+}
+
+/*
+ * Decide whether @nr_entries user entries may be inserted.
+ *
+ * Returns NVM_IO_ERR when rb_space is being tracked (non-negative) and does
+ * not cover @nr_entries, NVM_IO_REQUEUE when the user entry count has
+ * reached rb_user_max, and NVM_IO_OK otherwise.
+ */
+int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries)
+{
+	int user_cnt = atomic_read(&rl->rb_user_cnt);
+	int space = atomic_read(&rl->rb_space);
+
+	if (unlikely(space >= 0) && (space - nr_entries < 0))
+		return NVM_IO_ERR;
+
+	return user_cnt >= rl->rb_user_max ? NVM_IO_REQUEUE : NVM_IO_OK;
+}
+
+/*
+ * Account for @nr_entries inserted entries against rb_space. A negative
+ * rb_space means the counter is not being tracked and is left untouched.
+ */
+void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries)
+{
+	int space = atomic_read(&rl->rb_space);
+
+	if (!unlikely(space >= 0))
+		return;
+
+	atomic_sub(nr_entries, &rl->rb_space);
+}
+
+/*
+ * Returns non-zero when GC may insert entries: either the GC entry count is
+ * below rb_gc_max or no user I/O is currently flagged as active.
+ * @nr_entries is not used by the check.
+ */
+int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries)
+{
+	int gc_cnt = atomic_read(&rl->rb_gc_cnt);
+	int user_active;
+
+	/* If there is no user I/O let GC take over space on the write buffer */
+	user_active = READ_ONCE(rl->rb_user_active);
+
+	return (gc_cnt < rl->rb_gc_max || !user_active);
+}
+
+/*
+ * Account for @nr_entries user entries, flag user I/O as active and re-arm
+ * the timer whose handler (pblk_rl_u_timer) clears that flag again.
+ */
+void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
+{
+ atomic_add(nr_entries, &rl->rb_user_cnt);
+
+ /* Release user I/O state. Protect from GC */
+ smp_store_release(&rl->rb_user_active, 1);
+ pblk_rl_kick_u_timer(rl);
+}
+
+/* Increment rl->werr_lines, the counter read by __pblk_rl_update_rates() */
+void pblk_rl_werr_line_in(struct pblk_rl *rl)
+{
+ atomic_inc(&rl->werr_lines);
+}
+
+/* Decrement rl->werr_lines; counterpart of pblk_rl_werr_line_in() */
+void pblk_rl_werr_line_out(struct pblk_rl *rl)
+{
+ atomic_dec(&rl->werr_lines);
+}
+
+/* Account for @nr_entries GC entries in rl->rb_gc_cnt */
+void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
+{
+ atomic_add(nr_entries, &rl->rb_gc_cnt);
+}
+
+/*
+ * Remove @nr_user entries from the user count and @nr_gc entries from the GC
+ * count; counterpart of pblk_rl_user_in() and pblk_rl_gc_in().
+ */
+void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc)
+{
+ atomic_sub(nr_user, &rl->rb_user_cnt);
+ atomic_sub(nr_gc, &rl->rb_gc_cnt);
+}
+
+/* Returns the current value of the rl->free_blocks counter */
+unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
+{
+ return atomic_read(&rl->free_blocks);
+}
+
+/* Returns the current value of the rl->free_user_blocks counter */
+unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl)
+{
+ return atomic_read(&rl->free_user_blocks);
+}
+
+/*
+ * Recompute how the write buffer budget is split between user I/O and GC for
+ * the given number of free blocks, then start or stop GC accordingly.
+ *
+ *  - free_blocks >= rl->high: the whole budget goes to user I/O
+ *    (PBLK_RL_OFF), unless lines with write errors are pending, in which
+ *    case a small GC share is carved out (PBLK_RL_WERR).
+ *  - free_blocks <  rl->high: the user share shrinks with the number of free
+ *    blocks and drops to zero at or below rl->rsv_blocks (PBLK_RL_LOW).
+ */
+static void __pblk_rl_update_rates(struct pblk_rl *rl,
+				   unsigned long free_blocks)
+{
+	struct pblk *pblk = container_of(rl, struct pblk, rl);
+	int budget = rl->rb_budget;
+	int werr_gc_needed = atomic_read(&rl->werr_lines);
+
+	if (free_blocks < rl->high) {
+		int shift = rl->high_pw - rl->rb_windows_pw;
+		int user_windows = free_blocks >> shift;
+		int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW;
+
+		rl->rb_user_max = user_max;
+		rl->rb_gc_max = budget - user_max;
+
+		if (free_blocks <= rl->rsv_blocks) {
+			rl->rb_user_max = 0;
+			rl->rb_gc_max = budget;
+		}
+
+		/* In the worst case, we will need to GC lines in the low list
+		 * (high valid sector count). If there are lines to GC on high
+		 * or mid lists, these will be prioritized
+		 */
+		rl->rb_state = PBLK_RL_LOW;
+	} else if (werr_gc_needed) {
+		/* Allocate a small budget for recovering
+		 * lines with write errors
+		 */
+		rl->rb_gc_max = 1 << rl->rb_windows_pw;
+		rl->rb_user_max = budget - rl->rb_gc_max;
+		rl->rb_state = PBLK_RL_WERR;
+	} else {
+		rl->rb_user_max = budget;
+		rl->rb_gc_max = 0;
+		rl->rb_state = PBLK_RL_OFF;
+	}
+
+	if (rl->rb_state == PBLK_RL_OFF)
+		pblk_gc_should_stop(pblk);
+	else
+		pblk_gc_should_start(pblk);
+}
+
+/* Re-evaluate the rate limiter using the current count of free user blocks */
+void pblk_rl_update_rates(struct pblk_rl *rl)
+{
+	unsigned long free_user_blks = pblk_rl_nr_user_free_blks(rl);
+
+	__pblk_rl_update_rates(rl, free_user_blks);
+}
+
+/*
+ * Credit the blocks of @line to both free-block counters and re-evaluate the
+ * rates with the new count of free user blocks.
+ */
+void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
+{
+	int nr_blks = atomic_read(&line->blk_in_line);
+	int free_user_blks;
+
+	atomic_add(nr_blks, &rl->free_blocks);
+	free_user_blks = atomic_add_return(nr_blks, &rl->free_user_blocks);
+
+	__pblk_rl_update_rates(rl, free_user_blks);
+}
+
+/*
+ * Debit the blocks of @line from rl->free_blocks and, only when @used is
+ * true, from rl->free_user_blocks as well; then re-evaluate the rates with
+ * the resulting count of free user blocks.
+ */
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
+			    bool used)
+{
+	int nr_blks = atomic_read(&line->blk_in_line);
+	int free_user_blks;
+
+	atomic_sub(nr_blks, &rl->free_blocks);
+
+	free_user_blks = used ?
+		atomic_sub_return(nr_blks, &rl->free_user_blocks) :
+		atomic_read(&rl->free_user_blocks);
+
+	__pblk_rl_update_rates(rl, free_user_blks);
+}
+
+/* Returns rl->high, the free-block threshold used by __pblk_rl_update_rates() */
+int pblk_rl_high_thrs(struct pblk_rl *rl)
+{
+ return rl->high;
+}
+
+/* Returns rl->rb_max_io, set to half the budget by pblk_rl_init() */
+int pblk_rl_max_io(struct pblk_rl *rl)
+{
+ return rl->rb_max_io;
+}
+
+/*
+ * Handler for rl->u_timer (installed by pblk_rl_init()): clears the
+ * rb_user_active flag that pblk_rl_user_in() sets.
+ */
+static void pblk_rl_u_timer(struct timer_list *t)
+{
+ struct pblk_rl *rl = from_timer(rl, t, u_timer);
+
+ /* Release user I/O state. Protect from GC */
+ smp_store_release(&rl->rb_user_active, 0);
+}
+
+/*
+ * Tear down the rate limiter by deactivating its user-activity timer.
+ * NOTE(review): del_timer() does not wait for a handler that is already
+ * running - confirm whether del_timer_sync() is needed on this path.
+ */
+void pblk_rl_free(struct pblk_rl *rl)
+{
+ del_timer(&rl->u_timer);
+}
+
+/*
+ * Initialize the rate limiter embedded in struct pblk.
+ *
+ * @budget is the number of write buffer entries to distribute. The high
+ * threshold is derived from the over-provisioned blocks minus the blocks
+ * consumed by line metadata and one extra line; the reserved threshold is
+ * PBLK_GC_RSV_LINE lines worth of blocks. All of the budget initially goes
+ * to user I/O.
+ */
+void pblk_rl_init(struct pblk_rl *rl, int budget)
+{
+	struct pblk *pblk = container_of(rl, struct pblk, rl);
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE;
+	int sec_meta, blk_meta;
+	unsigned int rb_windows;
+
+	/* Consider sectors used for metadata */
+	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
+	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
+
+	/* Free-block thresholds */
+	rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
+	rl->high_pw = get_count_order(rl->high);
+	rl->rsv_blocks = min_blocks;
+
+	/* This will always be a power-of-2 */
+	rb_windows = budget / PBLK_MAX_REQ_ADDRS;
+	rl->rb_windows_pw = get_count_order(rb_windows);
+
+	/* To start with, all buffer is available to user I/O writers */
+	rl->rb_budget = budget;
+	rl->rb_user_max = budget;
+	rl->rb_max_io = budget >> 1;
+	rl->rb_gc_max = 0;
+	rl->rb_state = PBLK_RL_HIGH;
+
+	/* Counters; rb_space starts negative, i.e. not tracked */
+	atomic_set(&rl->rb_user_cnt, 0);
+	atomic_set(&rl->rb_gc_cnt, 0);
+	atomic_set(&rl->rb_space, -1);
+	atomic_set(&rl->werr_lines, 0);
+
+	timer_setup(&rl->u_timer, pblk_rl_u_timer, 0);
+
+	rl->rb_user_active = 0;
+	rl->rb_gc_active = 0;
+}
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
new file mode 100644
index 000000000..bdc86ee4c
--- /dev/null
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-sysfs.c - pblk's sysfs
+ *
+ */
+
+#include "pblk.h"
+
+/*
+ * sysfs "write_luns": emit one line per LUN with its position, channel,
+ * LUN number and whether its write semaphore is currently taken.
+ *
+ * @pblk: target instance
+ * @page: sysfs output buffer, PAGE_SIZE bytes long
+ *
+ * Returns the number of bytes stored in @page (never more than
+ * PAGE_SIZE - 1); output is truncated if the device has more LUNs than
+ * fit in one page.
+ */
+static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_lun *rlun;
+	ssize_t sz = 0;
+	int i;
+
+	for (i = 0; i < geo->all_luns; i++) {
+		int active = 1;
+
+		rlun = &pblk->luns[i];
+		/* Trylock succeeding means nobody was holding the semaphore */
+		if (!down_trylock(&rlun->wr_sem)) {
+			active = 0;
+			up(&rlun->wr_sem);
+		}
+		/*
+		 * scnprintf() returns the bytes actually stored, so sz stays
+		 * below PAGE_SIZE. With snprintf() the would-be length was
+		 * accumulated, letting "PAGE_SIZE - sz" underflow on devices
+		 * with many LUNs.
+		 */
+		sz += scnprintf(page + sz, PAGE_SIZE - sz,
+				"pblk: pos:%d, ch:%d, lun:%d - %d\n",
+					i,
+					rlun->bppa.a.ch,
+					rlun->bppa.a.lun,
+					active);
+	}
+
+	return sz;
+}
+
+/*
+ * sysfs "rate_limiter": print a one-line snapshot of the rate limiter's
+ * user/GC buffer counters, state, budget, high threshold, free block
+ * counts and the user-activity flag.
+ */
+static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
+{
+	struct pblk_rl *rl = &pblk->rl;
+	int nr_free = pblk_rl_nr_free_blks(rl);
+	int nr_user_free = pblk_rl_nr_user_free_blks(rl);
+	int user_max = rl->rb_user_max;
+	int user_cnt = atomic_read(&rl->rb_user_cnt);
+	int gc_max = rl->rb_gc_max;
+	int gc_cnt = atomic_read(&rl->rb_gc_cnt);
+	int budget = rl->rb_budget;
+	int state = rl->rb_state;
+	int nr_total = rl->total_blocks;
+
+	return snprintf(page, PAGE_SIZE,
+		"u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n",
+				user_cnt, user_max,
+				gc_cnt, gc_max,
+				state,
+				budget,
+				rl->high,
+				nr_free, nr_user_free, nr_total,
+				READ_ONCE(rl->rb_user_active));
+}
+
+/*
+ * sysfs "gc_state": print the two GC flags reported by
+ * pblk_gc_sysfs_state_show().
+ */
+static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page)
+{
+	int enabled, active;
+	ssize_t len;
+
+	pblk_gc_sysfs_state_show(pblk, &enabled, &active);
+
+	len = snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n",
+					enabled, active);
+	return len;
+}
+
+/*
+ * sysfs "errors": print the read/write/erase failure counters kept in
+ * the pblk instance.
+ */
+static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page)
+{
+	return snprintf(page, PAGE_SIZE,
+			"read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n",
+			atomic_long_read(&pblk->read_failed),
+			atomic_long_read(&pblk->read_high_ecc),
+			atomic_long_read(&pblk->read_empty),
+			atomic_long_read(&pblk->read_failed_gc),
+			atomic_long_read(&pblk->write_failed),
+			atomic_long_read(&pblk->erase_failed));
+}
+
+/* sysfs "write_buffer": delegate to the ring buffer's own sysfs dump. */
+static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page)
+{
+	struct pblk_rb *rb = &pblk->rwb;
+
+	return pblk_rb_sysfs(rb, page);
+}
+
+/*
+ * sysfs "ppa_format": print the address format used by pblk followed by
+ * the one reported in the device geometry, as offset/length pairs. The
+ * field set depends on whether the geometry version is
+ * NVM_OCSSD_SPEC_12.
+ */
+static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	struct nvm_addrf_12 *tgt12, *dev12;
+	ssize_t len;
+
+	if (geo->version != NVM_OCSSD_SPEC_12) {
+		struct nvm_addrf *tgt = &pblk->addrf;
+		struct nvm_addrf *hw = &geo->addrf;
+
+		len = snprintf(page, PAGE_SIZE,
+			"pblk:(s:%d)ch:%d/%d,lun:%d/%d,chk:%d/%d/sec:%d/%d\n",
+			pblk->addrf_len,
+			tgt->ch_offset, tgt->ch_len,
+			tgt->lun_offset, tgt->lun_len,
+			tgt->chk_offset, tgt->chk_len,
+			tgt->sec_offset, tgt->sec_len);
+
+		len += snprintf(page + len, PAGE_SIZE - len,
+			"device:ch:%d/%d,lun:%d/%d,chk:%d/%d,sec:%d/%d\n",
+			hw->ch_offset, hw->ch_len,
+			hw->lun_offset, hw->lun_len,
+			hw->chk_offset, hw->chk_len,
+			hw->sec_offset, hw->sec_len);
+
+		return len;
+	}
+
+	/* 1.2 layout: the same storage is viewed through nvm_addrf_12 */
+	tgt12 = (struct nvm_addrf_12 *)&pblk->addrf;
+	dev12 = (struct nvm_addrf_12 *)&geo->addrf;
+
+	len = snprintf(page, PAGE_SIZE,
+		"g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+		pblk->addrf_len,
+		tgt12->blk_offset, tgt12->blk_len,
+		tgt12->pg_offset, tgt12->pg_len,
+		tgt12->lun_offset, tgt12->lun_len,
+		tgt12->ch_offset, tgt12->ch_len,
+		tgt12->pln_offset, tgt12->pln_len,
+		tgt12->sec_offset, tgt12->sec_len);
+
+	len += snprintf(page + len, PAGE_SIZE - len,
+		"d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+		dev12->blk_offset, dev12->blk_len,
+		dev12->pg_offset, dev12->pg_len,
+		dev12->lun_offset, dev12->lun_len,
+		dev12->ch_offset, dev12->ch_len,
+		dev12->pln_offset, dev12->pln_len,
+		dev12->sec_offset, dev12->sec_len);
+
+	return len;
+}
+
+/* Count the lines linked on @head. Caller holds the lock guarding it. */
+static int pblk_sysfs_count_lines(struct list_head *head)
+{
+	struct pblk_line *line;
+	int nr = 0;
+
+	list_for_each_entry(line, head, list)
+		nr++;
+
+	return nr;
+}
+
+/*
+ * Count the lines on one GC list and classify them: *d_cnt is bumped for
+ * every PBLK_LINETYPE_DATA line, *l_cnt for every PBLK_LINETYPE_LOG line.
+ * Returns the number of lines on the list. Caller holds l_mg->gc_lock.
+ */
+static int pblk_sysfs_count_gc_lines(struct list_head *head,
+				     int *d_cnt, int *l_cnt)
+{
+	struct pblk_line *line;
+	int nr = 0;
+
+	list_for_each_entry(line, head, list) {
+		if (line->type == PBLK_LINETYPE_DATA)
+			(*d_cnt)++;
+		else if (line->type == PBLK_LINETYPE_LOG)
+			(*l_cnt)++;
+		nr++;
+	}
+
+	return nr;
+}
+
+/*
+ * sysfs "lines": print a summary of line management state - geometry,
+ * current data/log line ids, free/emeta/closed/bad/corrupt line counts,
+ * per-GC-list counts and details of the current data line.
+ *
+ * @pblk: target instance
+ * @page: sysfs output buffer, PAGE_SIZE bytes long
+ *
+ * Each list is walked under the lock the original code used for it
+ * (free_lock, close_lock, gc_lock). Also logs an error if the free list
+ * length disagrees with l_mg->nr_free_lines.
+ *
+ * Returns the number of bytes stored in @page.
+ */
+static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	ssize_t sz;
+	int nr_free_lines;
+	int cur_data, cur_log;
+	int free_line_cnt, closed_line_cnt, emeta_line_cnt;
+	int d_line_cnt = 0, l_line_cnt = 0;
+	int gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr;
+	int bad, cor;
+	int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0;
+	int map_weight = 0, meta_weight = 0;
+
+	spin_lock(&l_mg->free_lock);
+	cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1;
+	cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1;
+	nr_free_lines = l_mg->nr_free_lines;
+	free_line_cnt = pblk_sysfs_count_lines(&l_mg->free_list);
+	spin_unlock(&l_mg->free_lock);
+
+	spin_lock(&l_mg->close_lock);
+	emeta_line_cnt = pblk_sysfs_count_lines(&l_mg->emeta_list);
+	spin_unlock(&l_mg->close_lock);
+
+	spin_lock(&l_mg->gc_lock);
+	gc_full = pblk_sysfs_count_gc_lines(&l_mg->gc_full_list,
+					    &d_line_cnt, &l_line_cnt);
+	gc_high = pblk_sysfs_count_gc_lines(&l_mg->gc_high_list,
+					    &d_line_cnt, &l_line_cnt);
+	gc_mid = pblk_sysfs_count_gc_lines(&l_mg->gc_mid_list,
+					   &d_line_cnt, &l_line_cnt);
+	gc_low = pblk_sysfs_count_gc_lines(&l_mg->gc_low_list,
+					   &d_line_cnt, &l_line_cnt);
+	gc_empty = pblk_sysfs_count_gc_lines(&l_mg->gc_empty_list,
+					     &d_line_cnt, &l_line_cnt);
+	gc_werr = pblk_sysfs_count_gc_lines(&l_mg->gc_werr_list,
+					    &d_line_cnt, &l_line_cnt);
+	/* Every line sitting on a GC list is reported as closed */
+	closed_line_cnt = gc_full + gc_high + gc_mid + gc_low + gc_empty +
+								gc_werr;
+
+	bad = pblk_sysfs_count_lines(&l_mg->bad_list);
+	cor = pblk_sysfs_count_lines(&l_mg->corrupt_list);
+	spin_unlock(&l_mg->gc_lock);
+
+	spin_lock(&l_mg->free_lock);
+	if (l_mg->data_line) {
+		cur_sec = l_mg->data_line->cur_sec;
+		msecs = l_mg->data_line->left_msecs;
+		vsc = le32_to_cpu(*l_mg->data_line->vsc);
+		sec_in_line = l_mg->data_line->sec_in_line;
+		meta_weight = bitmap_weight(&l_mg->meta_bitmap,
+							PBLK_DATA_LINES);
+
+		/* map_bitmap may be absent; map_weight then stays 0 */
+		spin_lock(&l_mg->data_line->lock);
+		if (l_mg->data_line->map_bitmap)
+			map_weight = bitmap_weight(l_mg->data_line->map_bitmap,
+							lm->sec_per_line);
+		spin_unlock(&l_mg->data_line->lock);
+	}
+	spin_unlock(&l_mg->free_lock);
+
+	if (nr_free_lines != free_line_cnt)
+		pblk_err(pblk, "corrupted free line list:%d/%d\n",
+						nr_free_lines, free_line_cnt);
+
+	/*
+	 * scnprintf() returns the bytes actually stored, keeping
+	 * "PAGE_SIZE - sz" positive whatever the output length.
+	 */
+	sz = scnprintf(page, PAGE_SIZE,
+		"line: nluns:%d, nblks:%d, nsecs:%d\n",
+		geo->all_luns, lm->blk_per_line, lm->sec_per_line);
+
+	sz += scnprintf(page + sz, PAGE_SIZE - sz,
+		"lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n",
+					cur_data, cur_log,
+					nr_free_lines,
+					emeta_line_cnt, meta_weight,
+					closed_line_cnt,
+					bad, cor,
+					d_line_cnt, l_line_cnt,
+					l_mg->nr_lines);
+
+	sz += scnprintf(page + sz, PAGE_SIZE - sz,
+		"GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr: %d, queue:%d\n",
+			gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr,
+			atomic_read(&pblk->gc.read_inflight_gc));
+
+	sz += scnprintf(page + sz, PAGE_SIZE - sz,
+		"data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n",
+			cur_data, cur_sec, msecs, vsc, sec_in_line,
+			map_weight, lm->sec_per_line,
+			atomic_read(&pblk->inflight_io));
+
+	return sz;
+}
+
+/*
+ * sysfs "lines_info": print the static per-line layout values held in
+ * pblk->lm (smeta/emeta sizes, bitmap lengths, blocks and sectors per
+ * line) plus geo->clba.
+ */
+static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct nvm_geo *geo = &pblk->dev->geo;
+	ssize_t len;
+
+	len = snprintf(page, PAGE_SIZE,
+			"smeta - len:%d, secs:%d\n",
+			lm->smeta_len, lm->smeta_sec);
+
+	len += snprintf(page + len, PAGE_SIZE - len,
+			"emeta - len:%d, sec:%d, bb_start:%d\n",
+			lm->emeta_len[0], lm->emeta_sec[0], lm->emeta_bb);
+
+	len += snprintf(page + len, PAGE_SIZE - len,
+			"bitmap lengths: sec:%d, blk:%d, lun:%d\n",
+			lm->sec_bitmap_len, lm->blk_bitmap_len,
+			lm->lun_bitmap_len);
+
+	len += snprintf(page + len, PAGE_SIZE - len,
+			"blk_line:%d, sec_line:%d, sec_blk:%d\n",
+			lm->blk_per_line, lm->sec_per_line, geo->clba);
+
+	return len;
+}
+
+/* sysfs "max_sec_per_write" (read side): print pblk->sec_per_write. */
+static ssize_t pblk_sysfs_get_sec_per_write(struct pblk *pblk, char *page)
+{
+	ssize_t len = snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write);
+
+	return len;
+}
+
+/*
+ * Format write-amplification figures into a sysfs page.
+ *
+ * @user: user counter
+ * @gc:   GC counter
+ * @pad:  padding counter
+ * @page: sysfs output buffer, PAGE_SIZE bytes long
+ *
+ * Prints the three counters followed by "WA:" and the ratio
+ * (user + gc + pad) / user with five decimal places, or "NaN" when
+ * @user is zero (which also avoids dividing by zero).
+ *
+ * Returns the number of bytes written to @page.
+ */
+static ssize_t pblk_get_write_amp(u64 user, u64 gc, u64 pad,
+				  char *page)
+{
+	ssize_t sz;
+
+	/* The counters are unsigned 64-bit: print them with %llu */
+	sz = snprintf(page, PAGE_SIZE,
+			"user:%llu gc:%llu pad:%llu WA:",
+			user, gc, pad);
+
+	if (!user) {
+		sz += snprintf(page + sz, PAGE_SIZE - sz, "NaN\n");
+	} else {
+		u64 wa_int;
+		u32 wa_frac;
+
+		/* Fixed point: scale by 10^5, divide, then split int/frac */
+		wa_int = (user + gc + pad) * 100000;
+		wa_int = div64_u64(wa_int, user);
+		wa_int = div_u64_rem(wa_int, 100000, &wa_frac);
+
+		sz += snprintf(page + sz, PAGE_SIZE - sz, "%llu.%05u\n",
+							wa_int, wa_frac);
+	}
+
+	return sz;
+}
+
+/*
+ * sysfs "write_amp_mileage": write amplification computed from the raw
+ * user/gc/pad counters, with no reset baseline subtracted.
+ */
+static ssize_t pblk_sysfs_get_write_amp_mileage(struct pblk *pblk, char *page)
+{
+	u64 user = atomic64_read(&pblk->user_wa);
+	u64 gc = atomic64_read(&pblk->gc_wa);
+	u64 pad = atomic64_read(&pblk->pad_wa);
+
+	return pblk_get_write_amp(user, gc, pad, page);
+}
+
+/*
+ * sysfs "write_amp_trip" (read side): write amplification computed from
+ * the counters minus the *_rst_wa baselines stored in the pblk instance.
+ */
+static ssize_t pblk_sysfs_get_write_amp_trip(struct pblk *pblk, char *page)
+{
+	u64 user = atomic64_read(&pblk->user_wa) - pblk->user_rst_wa;
+	u64 gc = atomic64_read(&pblk->gc_wa) - pblk->gc_rst_wa;
+	u64 pad = atomic64_read(&pblk->pad_wa) - pblk->pad_rst_wa;
+
+	return pblk_get_write_amp(user, gc, pad, page);
+}
+
+/*
+ * Return @bucket as an integer percentage of @total (rounded down).
+ *
+ * @bucket: count in one bucket
+ * @total:  total count; must be non-zero (the caller checks this)
+ *
+ * The computation is done entirely in 64 bits. Storing "bucket * 100"
+ * in an int truncated it once the product exceeded INT_MAX, and
+ * div_u64() takes a 32-bit divisor, so a large @total was truncated as
+ * well (possibly down to zero).
+ */
+static long long bucket_percentage(unsigned long long bucket,
+				   unsigned long long total)
+{
+	u64 scaled = bucket * 100;
+
+	return div64_u64(scaled, total);
+}
+
+/*
+ * sysfs "padding_dist" (read side): print "<index>:<percent>% " for
+ * every bucket, where bucket 0 is the flush count not accounted for in
+ * pad_dist[] and buckets 1..min_write_pgs-1 map to pad_dist[0..]. When
+ * no flush has been counted since the last reset, "<index>:0 " is
+ * printed for every bucket instead.
+ */
+static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page)
+{
+	int nr_buckets = pblk->min_write_pgs - 1;
+	unsigned long long nr_flush;
+	unsigned long long nr_padded = 0;
+	int len = 0;
+	int b;
+
+	nr_flush = atomic64_read(&pblk->nr_flush) - pblk->nr_flush_rst;
+
+	if (nr_flush) {
+		for (b = 0; b < nr_buckets; b++)
+			nr_padded += atomic64_read(&pblk->pad_dist[b]);
+
+		len += snprintf(page + len, PAGE_SIZE - len, "0:%lld%% ",
+			bucket_percentage(nr_flush - nr_padded, nr_flush));
+
+		for (b = 0; b < nr_buckets; b++) {
+			unsigned long long pct;
+
+			pct = bucket_percentage(
+					atomic64_read(&pblk->pad_dist[b]),
+					nr_flush);
+			len += snprintf(page + len, PAGE_SIZE - len,
+					"%d:%lld%% ", b + 1, pct);
+		}
+	} else {
+		/* Nothing flushed: avoid dividing by zero, report zeros */
+		for (b = 0; b <= nr_buckets; b++)
+			len += snprintf(page + len, PAGE_SIZE - len,
+					"%d:0 ", b);
+	}
+
+	len += snprintf(page + len, PAGE_SIZE - len, "\n");
+
+	return len;
+}
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+/*
+ * sysfs "stats" (debug builds only): print the debug counters kept in
+ * the pblk instance as one tab-separated line.
+ */
+static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
+{
+	ssize_t len;
+
+	len = snprintf(page, PAGE_SIZE,
+		"%lu\t%lu\t%ld\t%llu\t%ld\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n",
+		atomic_long_read(&pblk->inflight_writes),
+		atomic_long_read(&pblk->inflight_reads),
+		atomic_long_read(&pblk->req_writes),
+		(u64)atomic64_read(&pblk->nr_flush),
+		atomic_long_read(&pblk->padded_writes),
+		atomic_long_read(&pblk->padded_wb),
+		atomic_long_read(&pblk->sub_writes),
+		atomic_long_read(&pblk->sync_writes),
+		atomic_long_read(&pblk->recov_writes),
+		atomic_long_read(&pblk->recov_gc_writes),
+		atomic_long_read(&pblk->recov_gc_reads),
+		atomic_long_read(&pblk->cache_reads),
+		atomic_long_read(&pblk->sync_reads));
+
+	return len;
+}
+#endif
+
+/*
+ * sysfs "gc_force" (write side): parse an unsigned integer from @page
+ * and hand it to pblk_gc_sysfs_force().
+ *
+ * @pblk: target instance
+ * @page: user input
+ * @len:  number of input bytes
+ *
+ * Returns @len on success, or -EINVAL if no newline is found within the
+ * first @len bytes or the value does not parse as an unsigned integer.
+ */
+static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page,
+				   size_t len)
+{
+	size_t c_len;
+	/* kstrtouint() stores through an unsigned int pointer */
+	unsigned int force;
+
+	c_len = strcspn(page, "\n");
+	if (c_len >= len)
+		return -EINVAL;
+
+	if (kstrtouint(page, 0, &force))
+		return -EINVAL;
+
+	pblk_gc_sysfs_force(pblk, force);
+
+	return len;
+}
+
+/*
+ * sysfs "max_sec_per_write" (write side): set the number of sectors per
+ * write.
+ *
+ * @pblk: target instance
+ * @page: user input
+ * @len:  number of input bytes
+ *
+ * The value must lie within [min_write_pgs, max_write_pgs] and be a
+ * multiple of min_write_pgs.
+ *
+ * Returns @len on success, or -EINVAL if no newline is found within the
+ * first @len bytes, the value does not parse, or it is out of range.
+ */
+static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk,
+					     const char *page, size_t len)
+{
+	size_t c_len;
+	/* kstrtouint() stores through an unsigned int pointer */
+	unsigned int val;
+	int sec_per_write;
+
+	c_len = strcspn(page, "\n");
+	if (c_len >= len)
+		return -EINVAL;
+
+	if (kstrtouint(page, 0, &val))
+		return -EINVAL;
+
+	/* Anything that does not fit an int cannot be a valid setting */
+	if (val > INT_MAX)
+		return -EINVAL;
+	sec_per_write = val;
+
+	if (sec_per_write < pblk->min_write_pgs
+				|| sec_per_write > pblk->max_write_pgs
+				|| sec_per_write % pblk->min_write_pgs != 0)
+		return -EINVAL;
+
+	pblk_set_sec_per_write(pblk, sec_per_write);
+
+	return len;
+}
+
+/*
+ * sysfs "write_amp_trip" (write side): writing 0 records the current
+ * user/pad/gc counters as the new baseline for the trip figures.
+ *
+ * @pblk: target instance
+ * @page: user input
+ * @len:  number of input bytes
+ *
+ * Returns @len on success, or -EINVAL if no newline is found within the
+ * first @len bytes, the value does not parse, or it is not 0.
+ */
+static ssize_t pblk_sysfs_set_write_amp_trip(struct pblk *pblk,
+					      const char *page, size_t len)
+{
+	size_t c_len;
+	/* kstrtouint() stores through an unsigned int pointer */
+	unsigned int reset_value;
+
+	c_len = strcspn(page, "\n");
+	if (c_len >= len)
+		return -EINVAL;
+
+	if (kstrtouint(page, 0, &reset_value))
+		return -EINVAL;
+
+	if (reset_value != 0)
+		return -EINVAL;
+
+	pblk->user_rst_wa = atomic64_read(&pblk->user_wa);
+	pblk->pad_rst_wa = atomic64_read(&pblk->pad_wa);
+	pblk->gc_rst_wa = atomic64_read(&pblk->gc_wa);
+
+	return len;
+}
+
+
+/*
+ * sysfs "padding_dist" (write side): writing 0 zeroes every pad_dist[]
+ * bucket and records the current flush count as the new baseline.
+ *
+ * @pblk: target instance
+ * @page: user input
+ * @len:  number of input bytes
+ *
+ * Returns @len on success, or -EINVAL if no newline is found within the
+ * first @len bytes, the value does not parse, or it is not 0.
+ */
+static ssize_t pblk_sysfs_set_padding_dist(struct pblk *pblk,
+					    const char *page, size_t len)
+{
+	size_t c_len;
+	/* kstrtouint() stores through an unsigned int pointer */
+	unsigned int reset_value;
+	int buckets = pblk->min_write_pgs - 1;
+	int i;
+
+	c_len = strcspn(page, "\n");
+	if (c_len >= len)
+		return -EINVAL;
+
+	if (kstrtouint(page, 0, &reset_value))
+		return -EINVAL;
+
+	if (reset_value != 0)
+		return -EINVAL;
+
+	for (i = 0; i < buckets; i++)
+		atomic64_set(&pblk->pad_dist[i], 0);
+
+	pblk->nr_flush_rst = atomic64_read(&pblk->nr_flush);
+
+	return len;
+}
+
+/*
+ * sysfs attributes exported by pblk. The .name of each attribute is the
+ * key that pblk_sysfs_show() / pblk_sysfs_store() match with strcmp(), so
+ * a rename here must be mirrored there. Mode 0444 attributes are
+ * read-only, 0200 write-only and 0644 readable and writable.
+ */
+static struct attribute sys_write_luns = {
+ .name = "write_luns",
+ .mode = 0444,
+};
+
+static struct attribute sys_rate_limiter_attr = {
+ .name = "rate_limiter",
+ .mode = 0444,
+};
+
+static struct attribute sys_gc_state = {
+ .name = "gc_state",
+ .mode = 0444,
+};
+
+static struct attribute sys_errors_attr = {
+ .name = "errors",
+ .mode = 0444,
+};
+
+static struct attribute sys_rb_attr = {
+ .name = "write_buffer",
+ .mode = 0444,
+};
+
+static struct attribute sys_stats_ppaf_attr = {
+ .name = "ppa_format",
+ .mode = 0444,
+};
+
+static struct attribute sys_lines_attr = {
+ .name = "lines",
+ .mode = 0444,
+};
+
+static struct attribute sys_lines_info_attr = {
+ .name = "lines_info",
+ .mode = 0444,
+};
+
+static struct attribute sys_gc_force = {
+ .name = "gc_force",
+ .mode = 0200,
+};
+
+static struct attribute sys_max_sec_per_write = {
+ .name = "max_sec_per_write",
+ .mode = 0644,
+};
+
+static struct attribute sys_write_amp_mileage = {
+ .name = "write_amp_mileage",
+ .mode = 0444,
+};
+
+static struct attribute sys_write_amp_trip = {
+ .name = "write_amp_trip",
+ .mode = 0644,
+};
+
+static struct attribute sys_padding_dist = {
+ .name = "padding_dist",
+ .mode = 0644,
+};
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+/* Only present when pblk debug support is compiled in */
+static struct attribute sys_stats_debug_attr = {
+ .name = "stats",
+ .mode = 0444,
+};
+#endif
+
+/* NULL-terminated attribute table installed through pblk_ktype */
+static struct attribute *pblk_attrs[] = {
+ &sys_write_luns,
+ &sys_rate_limiter_attr,
+ &sys_errors_attr,
+ &sys_gc_state,
+ &sys_gc_force,
+ &sys_max_sec_per_write,
+ &sys_rb_attr,
+ &sys_stats_ppaf_attr,
+ &sys_lines_attr,
+ &sys_lines_info_attr,
+ &sys_write_amp_mileage,
+ &sys_write_amp_trip,
+ &sys_padding_dist,
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ &sys_stats_debug_attr,
+#endif
+ NULL,
+};
+
+/*
+ * sysfs_ops.show for the pblk kobject: dispatch on the attribute name to
+ * the matching formatter. Returns whatever the formatter returns, or 0
+ * when the name matches none of the known attributes.
+ */
+static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
+			       char *buf)
+{
+	struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+	const char *name = attr->name;
+
+	if (!strcmp(name, "rate_limiter"))
+		return pblk_sysfs_rate_limiter(pblk, buf);
+	if (!strcmp(name, "write_luns"))
+		return pblk_sysfs_luns_show(pblk, buf);
+	if (!strcmp(name, "gc_state"))
+		return pblk_sysfs_gc_state_show(pblk, buf);
+	if (!strcmp(name, "errors"))
+		return pblk_sysfs_stats(pblk, buf);
+	if (!strcmp(name, "write_buffer"))
+		return pblk_sysfs_write_buffer(pblk, buf);
+	if (!strcmp(name, "ppa_format"))
+		return pblk_sysfs_ppaf(pblk, buf);
+	if (!strcmp(name, "lines"))
+		return pblk_sysfs_lines(pblk, buf);
+	if (!strcmp(name, "lines_info"))
+		return pblk_sysfs_lines_info(pblk, buf);
+	if (!strcmp(name, "max_sec_per_write"))
+		return pblk_sysfs_get_sec_per_write(pblk, buf);
+	if (!strcmp(name, "write_amp_mileage"))
+		return pblk_sysfs_get_write_amp_mileage(pblk, buf);
+	if (!strcmp(name, "write_amp_trip"))
+		return pblk_sysfs_get_write_amp_trip(pblk, buf);
+	if (!strcmp(name, "padding_dist"))
+		return pblk_sysfs_get_padding_dist(pblk, buf);
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	if (!strcmp(name, "stats"))
+		return pblk_sysfs_stats_debug(pblk, buf);
+#endif
+	return 0;
+}
+
+/*
+ * sysfs_ops.store for the pblk kobject: dispatch on the attribute name
+ * to the matching setter. Returns whatever the setter returns, or 0 when
+ * the name matches none of the writable attributes.
+ */
+static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr,
+				const char *buf, size_t len)
+{
+	struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+	const char *name = attr->name;
+
+	if (!strcmp(name, "gc_force"))
+		return pblk_sysfs_gc_force(pblk, buf, len);
+	if (!strcmp(name, "max_sec_per_write"))
+		return pblk_sysfs_set_sec_per_write(pblk, buf, len);
+	if (!strcmp(name, "write_amp_trip"))
+		return pblk_sysfs_set_write_amp_trip(pblk, buf, len);
+	if (!strcmp(name, "padding_dist"))
+		return pblk_sysfs_set_padding_dist(pblk, buf, len);
+
+	return 0;
+}
+
+/* Routes every attribute read/write through the two dispatchers above */
+static const struct sysfs_ops pblk_sysfs_ops = {
+ .show = pblk_sysfs_show,
+ .store = pblk_sysfs_store,
+};
+
+/*
+ * kobject type for pblk->kobj. No .release callback is set here.
+ * NOTE(review): presumably the kobject's memory is owned by the
+ * enclosing struct pblk and freed with it - confirm.
+ */
+static struct kobj_type pblk_ktype = {
+ .sysfs_ops = &pblk_sysfs_ops,
+ .default_attrs = pblk_attrs,
+};
+
+/*
+ * Register the "pblk" kobject (and with it the attributes in pblk_attrs)
+ * under the device of the pblk disk, then emit a KOBJ_ADD uevent.
+ *
+ * @tdisk: target disk whose private_data is the pblk instance
+ *
+ * Returns 0 on success or the error from kobject_init_and_add().
+ *
+ * NOTE(review): the explicit kobject_get() on the parent has no matching
+ * put in this file - confirm whether that extra reference is intended.
+ */
+int pblk_sysfs_init(struct gendisk *tdisk)
+{
+	struct pblk *pblk = tdisk->private_data;
+	struct device *parent_dev = disk_to_dev(pblk->disk);
+	int ret;
+
+	ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype,
+					kobject_get(&parent_dev->kobj),
+					"%s", "pblk");
+	if (ret) {
+		pblk_err(pblk, "could not register\n");
+		/*
+		 * kobject_init_and_add() leaves an initialized kobject
+		 * behind even when it fails; it must be released with
+		 * kobject_put() so what it allocated (e.g. the name) is
+		 * freed.
+		 */
+		kobject_put(&pblk->kobj);
+		return ret;
+	}
+
+	kobject_uevent(&pblk->kobj, KOBJ_ADD);
+	return 0;
+}
+
+/*
+ * Undo pblk_sysfs_init(): emit KOBJ_REMOVE, delete the kobject from
+ * sysfs and drop the reference on it.
+ */
+void pblk_sysfs_exit(struct gendisk *tdisk)
+{
+	struct pblk *pblk = tdisk->private_data;
+	struct kobject *kobj = &pblk->kobj;
+
+	kobject_uevent(kobj, KOBJ_REMOVE);
+	kobject_del(kobj);
+	kobject_put(kobj);
+}
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
new file mode 100644
index 000000000..c3e038d4b
--- /dev/null
+++ b/drivers/lightnvm/pblk-write.c
@@ -0,0 +1,673 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-write.c - pblk's write path from write buffer to media
+ */
+
+#include "pblk.h"
+
+/*
+ * Finish a completed write request: complete the user bios attached to
+ * its valid buffer entries, release padding pages, advance the buffer's
+ * sync position and free the request.
+ *
+ * @pblk:  target instance
+ * @rqd:   completed write request; freed before returning
+ * @c_ctx: completion context of @rqd (first entry and entry counts)
+ *
+ * Returns the value of pblk_rb_sync_advance().
+ */
+static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx)
+{
+ struct bio *original_bio;
+ struct pblk_rb *rwb = &pblk->rwb;
+ unsigned long ret;
+ int i;
+
+ /* Walk the nr_valid buffer entries starting at sentry */
+ for (i = 0; i < c_ctx->nr_valid; i++) {
+ struct pblk_w_ctx *w_ctx;
+ int pos = c_ctx->sentry + i;
+ int flags;
+
+ /* NOTE(review): presumably pblk_rb_w_ctx() wraps pos - confirm */
+ w_ctx = pblk_rb_w_ctx(rwb, pos);
+ flags = READ_ONCE(w_ctx->flags);
+
+ if (flags & PBLK_FLUSH_ENTRY) {
+ flags &= ~PBLK_FLUSH_ENTRY;
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&w_ctx->flags, flags);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_dec(&rwb->inflight_flush_point);
+#endif
+ }
+
+ /* Complete every bio that was queued on this entry */
+ while ((original_bio = bio_list_pop(&w_ctx->bios)))
+ bio_endio(original_bio);
+ }
+
+ /* Padding pages follow the nr_valid data pages in the bio */
+ if (c_ctx->nr_padded)
+ pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
+ c_ctx->nr_padded);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_add(rqd->nr_ppas, &pblk->sync_writes);
+#endif
+
+ ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
+
+ bio_put(rqd->bio);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+
+ return ret;
+}
+
+/*
+ * Finish a write request that had been parked on a list: unlink its
+ * completion context, then complete it like any other write.
+ * Returns the value of pblk_end_w_bio().
+ */
+static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
+					   struct nvm_rq *rqd,
+					   struct pblk_c_ctx *c_ctx)
+{
+	unsigned long sync_pos;
+
+	list_del(&c_ctx->list);
+	sync_pos = pblk_end_w_bio(pblk, rqd, c_ctx);
+
+	return sync_pos;
+}
+
+/*
+ * Complete a successful write in buffer order.
+ *
+ * If the request starts at the current sync position it is finished
+ * right away, and any parked requests on pblk->compl_list that become
+ * next in line are finished too. Otherwise the request is parked on
+ * pblk->compl_list until the ones before it complete.
+ */
+static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx)
+{
+ struct pblk_c_ctx *c, *r;
+ unsigned long flags;
+ unsigned long pos;
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
+#endif
+
+ pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
+
+ /* NOTE(review): presumably takes the sync lock, saving irq flags */
+ pos = pblk_rb_sync_init(&pblk->rwb, &flags);
+ if (pos == c_ctx->sentry) {
+ pos = pblk_end_w_bio(pblk, rqd, c_ctx);
+
+retry:
+ /* Restart the scan after each hit: pos moved and the list changed */
+ list_for_each_entry_safe(c, r, &pblk->compl_list, list) {
+ rqd = nvm_rq_from_c_ctx(c);
+ if (c->sentry == pos) {
+ pos = pblk_end_queued_w_bio(pblk, rqd, c);
+ goto retry;
+ }
+ }
+ } else {
+ /* Out of order: park until the earlier requests complete */
+ WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd);
+ list_add_tail(&c_ctx->list, &pblk->compl_list);
+ }
+ pblk_rb_sync_end(&pblk->rwb, &flags);
+}
+
+/*
+ * Map remaining sectors in chunk, starting from ppa.
+ *
+ * Every sector from *ppa up to the end of the chunk is marked in the
+ * line's map bitmap and invalid bitmap, adjusting left_msecs and the
+ * valid sector count for bits that were not already set. The line is
+ * then flagged as having seen a write error.
+ */
+static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line *line;
+ struct ppa_addr map_ppa = *ppa;
+ u64 paddr;
+ int done = 0;
+
+ line = &pblk->lines[pblk_ppa_to_line(*ppa)];
+ spin_lock(&line->lock);
+
+ while (!done) {
+ paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa);
+
+ /* Only account for sectors that were not mapped before */
+ if (!test_and_set_bit(paddr, line->map_bitmap))
+ line->left_msecs--;
+
+ /* Only drop the valid count for newly invalidated sectors */
+ if (!test_and_set_bit(paddr, line->invalid_bitmap))
+ le32_add_cpu(line->vsc, -1);
+
+ /* Step to the next sector; stop at the end of the chunk */
+ if (geo->version == NVM_OCSSD_SPEC_12) {
+ map_ppa.ppa++;
+ if (map_ppa.g.pg == geo->num_pg)
+ done = 1;
+ } else {
+ map_ppa.m.sec++;
+ if (map_ppa.m.sec == geo->clba)
+ done = 1;
+ }
+ }
+
+ line->w_err_gc->has_write_err = 1;
+ spin_unlock(&line->lock);
+}
+
+/*
+ * Make a range of write buffer entries submittable again after a failed
+ * write.
+ *
+ * @pblk:       target instance
+ * @sentry:     position of the first entry in the write buffer
+ * @nr_entries: number of consecutive entries to process
+ *
+ * Runs under pblk->trans_lock. For each entry, the lba is dropped if the
+ * L2P table no longer points at the entry's cacheline, the entry is
+ * flagged PBLK_WRITTEN_DATA, and the reference held on the line of its
+ * previous mapping is put.
+ */
+static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
+ unsigned int nr_entries)
+{
+ struct pblk_rb *rb = &pblk->rwb;
+ struct pblk_rb_entry *entry;
+ struct pblk_line *line;
+ struct pblk_w_ctx *w_ctx;
+ struct ppa_addr ppa_l2p;
+ int flags;
+ unsigned int pos, i;
+
+ spin_lock(&pblk->trans_lock);
+ pos = sentry;
+ for (i = 0; i < nr_entries; i++) {
+ entry = &rb->entries[pos];
+ w_ctx = &entry->w_ctx;
+
+ /* Check if the lba has been overwritten */
+ if (w_ctx->lba != ADDR_EMPTY) {
+ ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
+ if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
+ w_ctx->lba = ADDR_EMPTY;
+ }
+
+ /* Mark up the entry as submittable again */
+ flags = READ_ONCE(w_ctx->flags);
+ flags |= PBLK_WRITTEN_DATA;
+ /* Release flags on write context. Protect from writes */
+ smp_store_release(&w_ctx->flags, flags);
+
+ /* Decrease the reference count to the line as we will
+ * re-map these entries
+ */
+ line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
+ kref_put(&line->ref, pblk_line_put);
+
+ /* Advance with wrap-around; the mask assumes a power-of-2 size */
+ pos = (pos + 1) & (rb->nr_entries - 1);
+ }
+ spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * Queue the buffer entries of a failed write for resubmission by adding
+ * a copy of the relevant completion context fields to
+ * pblk->resubmit_list.
+ *
+ * NOTE(review): if the allocation fails the function returns silently
+ * and nothing is queued - confirm that callers can tolerate this.
+ */
+static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
+{
+ struct pblk_c_ctx *r_ctx;
+
+ r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL);
+ if (!r_ctx)
+ return;
+
+ /* Only the entry range is carried over; no LUN bitmap is attached */
+ r_ctx->lun_bitmap = NULL;
+ r_ctx->sentry = c_ctx->sentry;
+ r_ctx->nr_valid = c_ctx->nr_valid;
+ r_ctx->nr_padded = c_ctx->nr_padded;
+
+ spin_lock(&pblk->resubmit_lock);
+ list_add_tail(&r_ctx->list, &pblk->resubmit_list);
+ spin_unlock(&pblk->resubmit_lock);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes);
+#endif
+}
+
+/*
+ * Work item run for a failed write (queued by pblk_end_w_fail()).
+ *
+ * Logs the error, maps out the rest of the chunk containing the first
+ * ppa of the request, queues the request's buffer entries for
+ * resubmission and then releases everything the failed request held:
+ * per-LUN state, padding pages, the bio, the request and the recovery
+ * context itself.
+ */
+static void pblk_submit_rec(struct work_struct *work)
+{
+ struct pblk_rec_ctx *recovery =
+ container_of(work, struct pblk_rec_ctx, ws_rec);
+ struct pblk *pblk = recovery->pblk;
+ struct nvm_rq *rqd = recovery->rqd;
+ struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+ struct ppa_addr *ppa_list;
+
+ pblk_log_write_err(pblk, rqd);
+
+ /* A single-sector request carries its address inline in ppa_addr */
+ if (rqd->nr_ppas == 1)
+ ppa_list = &rqd->ppa_addr;
+ else
+ ppa_list = rqd->ppa_list;
+
+ pblk_map_remaining(pblk, ppa_list);
+ pblk_queue_resubmit(pblk, c_ctx);
+
+ /* NOTE(review): uses rqd->ppa_list, not the ppa_list chosen above */
+ pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
+ if (c_ctx->nr_padded)
+ pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
+ c_ctx->nr_padded);
+ bio_put(rqd->bio);
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+ mempool_free(recovery, &pblk->rec_pool);
+
+ atomic_dec(&pblk->inflight_io);
+}
+
+
+/*
+ * Handle a failed write: allocate a recovery context for it and defer
+ * the actual recovery (pblk_submit_rec) to pblk->close_wq.
+ *
+ * NOTE(review): if the recovery context cannot be allocated the function
+ * only logs and returns; @rqd is neither completed nor freed on that
+ * path - confirm whether this can happen in practice.
+ */
+static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct pblk_rec_ctx *recovery;
+
+ /* GFP_ATOMIC: the allocation must not sleep here */
+ recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC);
+ if (!recovery) {
+ pblk_err(pblk, "could not allocate recovery work\n");
+ return;
+ }
+
+ recovery->pblk = pblk;
+ recovery->rqd = rqd;
+
+ INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
+ queue_work(pblk->close_wq, &recovery->ws_rec);
+}
+
+/*
+ * end_io callback for data writes (installed by pblk_setup_w_rq()).
+ *
+ * A failed request is handed to pblk_end_w_fail(), which takes over its
+ * completion. A successful one is completed in buffer order and the
+ * in-flight I/O counter is dropped.
+ */
+static void pblk_end_io_write(struct nvm_rq *rqd)
+{
+ struct pblk *pblk = rqd->private;
+ struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+
+ if (rqd->error) {
+ pblk_end_w_fail(pblk, rqd);
+ return;
+ }
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ /* This else pairs with the if above and exists only in debug builds */
+ else
+ WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
+#endif
+
+ pblk_complete_write(pblk, rqd, c_ctx);
+ atomic_dec(&pblk->inflight_io);
+}
+
+/*
+ * end_io callback for emeta writes (installed by pblk_submit_meta_io()).
+ *
+ * Releases the pages of the request, records a write error on the line
+ * if the I/O failed, and accounts the written sectors in emeta->sync.
+ * Once the whole emeta has been written, the line close work is
+ * scheduled on pblk->close_wq.
+ */
+static void pblk_end_io_write_meta(struct nvm_rq *rqd)
+{
+ struct pblk *pblk = rqd->private;
+ struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd);
+ struct pblk_line *line = m_ctx->private;
+ struct pblk_emeta *emeta = line->emeta;
+ int sync;
+
+ pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
+
+ if (rqd->error) {
+ pblk_log_write_err(pblk, rqd);
+ pblk_err(pblk, "metadata I/O failed. Line %d\n", line->id);
+ line->w_err_gc->has_write_err = 1;
+ }
+
+ /* Failed requests are counted too, so the line still gets closed */
+ sync = atomic_add_return(rqd->nr_ppas, &emeta->sync);
+ if (sync == emeta->nr_entries)
+ pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws,
+ GFP_ATOMIC, pblk->close_wq);
+
+ pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
+
+ atomic_dec(&pblk->inflight_io);
+}
+
+/*
+ * Fill in the common fields of a write request and allocate its DMA
+ * area.
+ *
+ * @pblk:    target instance
+ * @rqd:     request to set up
+ * @nr_secs: number of sectors in the request
+ * @end_io:  completion callback to install
+ *
+ * The ppa list lives in the same DMA allocation as the metadata list, at
+ * offset pblk_dma_meta_size.
+ *
+ * Returns 0 on success or -ENOMEM if the DMA allocation fails.
+ */
+static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+			   unsigned int nr_secs,
+			   nvm_end_io_fn(*end_io))
+{
+	/* Setup write request */
+	rqd->private = pblk;
+	rqd->end_io = end_io;
+	rqd->opcode = NVM_OP_PWRITE;
+	rqd->nr_ppas = nr_secs;
+	rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
+
+	rqd->meta_list = nvm_dev_dma_alloc(pblk->dev->parent, GFP_KERNEL,
+							&rqd->dma_meta_list);
+	if (rqd->meta_list) {
+		rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
+		rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+/*
+ * Prepare a data write request: allocate its LUN bitmap and DMA area and
+ * map its buffer entries to physical addresses.
+ *
+ * @pblk:      target instance
+ * @rqd:       request whose pdu (struct pblk_c_ctx) already holds
+ *             sentry, nr_valid and nr_padded
+ * @erase_ppa: passed to pblk_map_erase_rq() when the erase line still
+ *             has blocks left to erase; otherwise untouched here
+ *
+ * Returns 0 on success or a negative errno. On failure the LUN bitmap is
+ * freed and c_ctx->lun_bitmap is reset to NULL so that no pointer to
+ * freed memory is left behind in the context.
+ */
+static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+			   struct ppa_addr *erase_ppa)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line *e_line = pblk_line_get_erase(pblk);
+	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+	unsigned int valid = c_ctx->nr_valid;
+	unsigned int padded = c_ctx->nr_padded;
+	unsigned int nr_secs = valid + padded;
+	unsigned long *lun_bitmap;
+	int ret;
+
+	lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
+	if (!lun_bitmap)
+		return -ENOMEM;
+	c_ctx->lun_bitmap = lun_bitmap;
+
+	ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write);
+	if (ret) {
+		/* Do not leave the context pointing at freed memory */
+		c_ctx->lun_bitmap = NULL;
+		kfree(lun_bitmap);
+		return ret;
+	}
+
+	/* Plain mapping unless the erase line still needs blocks erased */
+	if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
+		pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
+	else
+		pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
+							valid, erase_ppa);
+
+	return 0;
+}
+
+/*
+ * Return how many sectors to write next, as computed by pblk_calc_secs()
+ * from the sectors available and the sectors that must be flushed.
+ * Debug builds additionally log results that look inconsistent.
+ */
+static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
+				  unsigned int secs_to_flush)
+{
+	int nr_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	if ((!nr_sync && secs_to_flush) ||
+	    (nr_sync < 0) ||
+	    (nr_sync > secs_avail && !secs_to_flush)) {
+		pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n",
+				secs_avail, nr_sync, secs_to_flush);
+	}
+#endif
+
+	return nr_sync;
+}
+
+/*
+ * Submit the next chunk of a closing line's end metadata (emeta).
+ *
+ * @pblk:      target instance
+ * @meta_line: line whose emeta is being written
+ *
+ * One request of pblk->min_write_pgs sectors is built from the emeta
+ * buffer at offset emeta->mem, pages are allocated for it on the line,
+ * and the offset is advanced. When the last chunk has been taken, the
+ * line is removed from l_mg->emeta_list.
+ *
+ * Returns NVM_IO_OK on success or a negative errno. On submission
+ * failure the page allocation and the emeta offset are rolled back and,
+ * if the line had been unlinked, it is put back on l_mg->emeta_list.
+ */
+int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_emeta *emeta = meta_line->emeta;
+	struct pblk_g_ctx *m_ctx;
+	struct bio *bio;
+	struct nvm_rq *rqd;
+	void *data;
+	u64 paddr;
+	int rq_ppas = pblk->min_write_pgs;
+	int id = meta_line->id;
+	int rq_len;
+	int i, j;
+	int ret;
+	bool unlinked = false;
+
+	rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
+
+	m_ctx = nvm_rq_to_pdu(rqd);
+	m_ctx->private = meta_line;
+
+	rq_len = rq_ppas * geo->csecs;
+	data = ((void *)emeta->buf) + emeta->mem;
+
+	bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
+					l_mg->emeta_alloc_type, GFP_KERNEL);
+	if (IS_ERR(bio)) {
+		pblk_err(pblk, "failed to map emeta io\n");
+		ret = PTR_ERR(bio);
+		goto fail_free_rqd;
+	}
+	bio->bi_iter.bi_sector = 0; /* internal bio */
+	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+	rqd->bio = bio;
+
+	ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta);
+	if (ret)
+		goto fail_free_bio;
+
+	for (i = 0; i < rqd->nr_ppas; ) {
+		spin_lock(&meta_line->lock);
+		paddr = __pblk_alloc_page(pblk, meta_line, rq_ppas);
+		spin_unlock(&meta_line->lock);
+		for (j = 0; j < rq_ppas; j++, i++, paddr++)
+			rqd->ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id);
+	}
+
+	spin_lock(&l_mg->close_lock);
+	emeta->mem += rq_len;
+	if (emeta->mem >= lm->emeta_len[0]) {
+		/* Last chunk taken: nothing left to schedule for this line */
+		list_del(&meta_line->list);
+		unlinked = true;
+	}
+	spin_unlock(&l_mg->close_lock);
+
+	pblk_down_page(pblk, rqd->ppa_list, rqd->nr_ppas);
+
+	ret = pblk_submit_io(pblk, rqd);
+	if (ret) {
+		pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
+		goto fail_rollback;
+	}
+
+	return NVM_IO_OK;
+
+fail_rollback:
+	pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
+	spin_lock(&l_mg->close_lock);
+	pblk_dealloc_page(pblk, meta_line, rq_ppas);
+	/* This chunk was never written: make it eligible again */
+	emeta->mem -= rq_len;
+	/*
+	 * Re-link only if the line was unlinked above, and onto the emeta
+	 * list head. Adding the entry to itself, or re-adding an entry
+	 * that is still linked, corrupts the list.
+	 */
+	if (unlinked)
+		list_add(&meta_line->list, &l_mg->emeta_list);
+	spin_unlock(&l_mg->close_lock);
+fail_free_bio:
+	bio_put(bio);
+fail_free_rqd:
+	pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
+	return ret;
+}
+
+/*
+ * Decide whether a metadata write for @meta_line may be scheduled
+ * alongside the data write @data_rqd.
+ *
+ * Returns true if the position at the line's meta_distance from the next
+ * metadata page is set in the data request's LUN bitmap or in the data
+ * line's block bitmap; false otherwise. As a side effect, meta_distance
+ * is decremented when the two compared addresses match.
+ */
+static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
+ struct pblk_line *meta_line,
+ struct nvm_rq *data_rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd);
+ struct pblk_line *data_line = pblk_line_get_data(pblk);
+ struct ppa_addr ppa, ppa_opt;
+ u64 paddr;
+ int pos_opt;
+
+ /* Schedule a metadata I/O that is half the distance from the data I/O
+ * with regards to the number of LUNs forming the pblk instance. This
+ * balances LUN conflicts across every I/O.
+ *
+ * When the LUN configuration changes (e.g., due to GC), this distance
+ * can align, which would result on metadata and data I/Os colliding. In
+ * this case, modify the distance to not be optimal, but move the
+ * optimal in the right direction.
+ */
+ paddr = pblk_lookup_page(pblk, meta_line);
+ ppa = addr_to_gen_ppa(pblk, paddr, 0);
+ ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0);
+ pos_opt = pblk_ppa_to_pos(geo, ppa_opt);
+
+ if (test_bit(pos_opt, data_c_ctx->lun_bitmap) ||
+ test_bit(pos_opt, data_line->blk_bitmap))
+ return true;
+
+ /* Shrink the distance when both addresses compare as matching */
+ if (unlikely(pblk_ppa_comp(ppa_opt, ppa)))
+ data_line->meta_distance--;
+
+ return false;
+}
+
+/*
+ * Select the line whose emeta should be written together with @data_rqd.
+ *
+ * The candidate is the first entry of l_mg->emeta_list, inspected under
+ * l_mg->close_lock. It is rejected when the list is empty, when its emeta
+ * write offset has already reached the total emeta length
+ * (lm->emeta_len[0]), or when pblk_valid_meta_ppa() returns false for it.
+ *
+ * Returns the selected line, or NULL when no metadata I/O should be issued.
+ */
+static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
+						    struct nvm_rq *data_rqd)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line *candidate = NULL;
+
+	spin_lock(&l_mg->close_lock);
+	if (!list_empty(&l_mg->emeta_list)) {
+		candidate = list_first_entry(&l_mg->emeta_list,
+					     struct pblk_line, list);
+		/* Nothing left to write for this line's emeta */
+		if (candidate->emeta->mem >= lm->emeta_len[0])
+			candidate = NULL;
+	}
+	spin_unlock(&l_mg->close_lock);
+
+	if (!candidate)
+		return NULL;
+
+	return pblk_valid_meta_ppa(pblk, candidate, data_rqd) ? candidate : NULL;
+}
+
+/*
+ * Set up and submit the write request @rqd plus the I/Os that accompany it.
+ *
+ * In order:
+ *  1. pblk_setup_w_rq() fills in @rqd and may set an erase address.
+ *  2. pblk_should_submit_meta_io() picks a line whose emeta can be written
+ *     alongside this request (may be NULL).
+ *  3. The data request is submitted with pblk_submit_io().
+ *  4. If an erase address was set, an asynchronous erase is issued; when that
+ *     call returns non-zero the erase line's left_eblks counter and
+ *     erase_bitmap bit are restored.
+ *  5. If a metadata line was picked, its emeta write is submitted.
+ *
+ * Returns NVM_IO_OK on success and NVM_IO_ERR if setup, data submission or
+ * metadata submission fails.
+ *
+ * NOTE(review): a metadata submission failure is reported after the data
+ * request has already been handed to pblk_submit_io(); pblk_submit_write()
+ * reacts to a non-zero return by releasing @rqd and its bio. Whether that is
+ * safe for an in-flight request cannot be confirmed from this file - verify.
+ */
+static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	struct ppa_addr erase_ppa;
+	struct pblk_line *meta_line;
+	int err;
+
+	pblk_ppa_set_empty(&erase_ppa);
+
+	/* Assign lbas to ppas and populate request structure */
+	err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
+	if (err) {
+		pblk_err(pblk, "could not setup write request: %d\n", err);
+		return NVM_IO_ERR;
+	}
+
+	/* Must be evaluated before the data I/O is submitted */
+	meta_line = pblk_should_submit_meta_io(pblk, rqd);
+
+	/* Submit data write for current data line */
+	err = pblk_submit_io(pblk, rqd);
+	if (err) {
+		pblk_err(pblk, "data I/O submission failed: %d\n", err);
+		return NVM_IO_ERR;
+	}
+
+	if (!pblk_ppa_empty(erase_ppa)) {
+		/* Submit erase for next data line */
+		if (pblk_blk_erase_async(pblk, erase_ppa)) {
+			struct pblk_line *e_line = pblk_line_get_erase(pblk);
+			struct nvm_tgt_dev *dev = pblk->dev;
+			struct nvm_geo *geo = &dev->geo;
+			int bit;
+
+			/* Erase was not issued: put the block back as pending */
+			atomic_inc(&e_line->left_eblks);
+			bit = pblk_ppa_to_pos(geo, erase_ppa);
+			WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
+		}
+	}
+
+	if (meta_line) {
+		/* Submit metadata write for previous data line */
+		err = pblk_submit_meta_io(pblk, meta_line);
+		if (err) {
+			/* Terminate the message like every other pblk_err() */
+			pblk_err(pblk, "metadata I/O submission failed: %d\n",
+				 err);
+			return NVM_IO_ERR;
+		}
+	}
+
+	return NVM_IO_OK;
+}
+
+/*
+ * Release the padding pages attached to a write request's bio.
+ *
+ * Does nothing unless the request's completion context records padded
+ * sectors; in that case the nr_padded pages that follow the first nr_valid
+ * ones are handed to pblk_bio_free_pages().
+ */
+static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	struct pblk_c_ctx *ctx = nvm_rq_to_pdu(rqd);
+
+	if (!ctx->nr_padded)
+		return;
+
+	pblk_bio_free_pages(pblk, rqd->bio, ctx->nr_valid, ctx->nr_padded);
+}
+
+/*
+ * Build and submit one write request from the write buffer.
+ *
+ * Entries queued on pblk->resubmit_list take priority: the first one is
+ * removed, prepared again with pblk_prepare_resubmit() and its context freed.
+ * Otherwise the request is sized from the ring buffer read count and flush
+ * point count, and the read pointer is committed for the sectors consumed.
+ *
+ * Returns 0 when a request was submitted. Returns 1 when there was nothing
+ * to submit, when the sync calculation exceeded max_write_pgs, or when
+ * filling the bio or submitting the I/O set failed.
+ */
+static int pblk_submit_write(struct pblk *pblk)
+{
+	struct pblk_rb *rwb = &pblk->rwb;
+	unsigned int avail, to_sync;
+	unsigned int have_resubmit;
+	unsigned long start;
+	struct nvm_rq *rqd;
+	struct bio *bio;
+
+	spin_lock(&pblk->resubmit_lock);
+	have_resubmit = !list_empty(&pblk->resubmit_list);
+	spin_unlock(&pblk->resubmit_lock);
+
+	/* Resubmit failed writes first */
+	if (have_resubmit) {
+		struct pblk_c_ctx *failed;
+
+		spin_lock(&pblk->resubmit_lock);
+		failed = list_first_entry(&pblk->resubmit_list,
+					  struct pblk_c_ctx, list);
+		list_del(&failed->list);
+		spin_unlock(&pblk->resubmit_lock);
+
+		avail = failed->nr_valid;
+		start = failed->sentry;
+
+		pblk_prepare_resubmit(pblk, start, avail);
+		to_sync = pblk_calc_secs_to_sync(pblk, avail, avail);
+
+		kfree(failed);
+	} else {
+		unsigned int to_flush, to_commit;
+
+		/* If there are no sectors in the cache,
+		 * flushes (bios without data) will be cleared on
+		 * the cache threads
+		 */
+		avail = pblk_rb_read_count(rwb);
+		if (!avail)
+			return 1;
+
+		/* Without a pending flush, wait for a minimum-sized write */
+		to_flush = pblk_rb_flush_point_count(rwb);
+		if (!to_flush && avail < pblk->min_write_pgs)
+			return 1;
+
+		to_sync = pblk_calc_secs_to_sync(pblk, avail, to_flush);
+		if (to_sync > pblk->max_write_pgs) {
+			pblk_err(pblk, "bad buffer sync calculation\n");
+			return 1;
+		}
+
+		/* Never commit more entries than the buffer holds */
+		if (to_sync > avail)
+			to_commit = avail;
+		else
+			to_commit = to_sync;
+		start = pblk_rb_read_commit(rwb, to_commit);
+	}
+
+	bio = bio_alloc(GFP_KERNEL, to_sync);
+
+	bio->bi_iter.bi_sector = 0; /* internal bio */
+	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+	rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
+	rqd->bio = bio;
+
+	if (pblk_rb_read_to_bio(rwb, rqd, start, to_sync, avail)) {
+		pblk_err(pblk, "corrupted write bio\n");
+		goto out_put_bio;
+	}
+
+	if (pblk_submit_io_set(pblk, rqd))
+		goto out_free_pages;
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+	atomic_long_add(to_sync, &pblk->sub_writes);
+#endif
+
+	return 0;
+
+out_free_pages:
+	pblk_free_write_rqd(pblk, rqd);
+out_put_bio:
+	bio_put(bio);
+	pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+
+	return 1;
+}
+
+/*
+ * Write thread entry point; @data is the struct pblk instance.
+ *
+ * Keeps calling pblk_submit_write() until the kthread is asked to stop.
+ * Whenever that call returns non-zero the thread marks itself
+ * TASK_INTERRUPTIBLE and yields through io_schedule(). Always returns 0.
+ */
+int pblk_write_ts(void *data)
+{
+	struct pblk *pblk = data;
+
+	while (!kthread_should_stop()) {
+		if (pblk_submit_write(pblk)) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			io_schedule();
+		}
+	}
+
+	return 0;
+}
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
new file mode 100644
index 000000000..4760af7b6
--- /dev/null
+++ b/drivers/lightnvm/pblk.h
@@ -0,0 +1,1444 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.h)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Matias Bjorling <matias@cnexlabs.com>
+ * Write buffering: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a Physical Block-device target for Open-channel SSDs.
+ *
+ */
+
+#ifndef PBLK_H_
+#define PBLK_H_
+
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+#include <linux/crc32.h>
+#include <linux/uuid.h>
+
+#include <linux/lightnvm.h>
+
+/* Run only GC if less than 1/X blocks are free */
+#define GC_LIMIT_INVERSE 5
+#define GC_TIME_MSECS 1000
+
+/* Sizes in bytes */
+#define PBLK_SECTOR (512)
+#define PBLK_EXPOSED_PAGE_SIZE (4096)
+/* Upper bound on addresses per request; also sizes the per-request arrays */
+#define PBLK_MAX_REQ_ADDRS (64)
+/* 1 << 6 == 64; presumably log2 of PBLK_MAX_REQ_ADDRS - confirm */
+#define PBLK_MAX_REQ_ADDRS_PW (6)
+
+#define PBLK_NR_CLOSE_JOBS (4)
+
+#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16)
+
+#define PBLK_COMMAND_TIMEOUT_MS 30000
+
+/* Max 512 LUNs per device */
+#define PBLK_MAX_LUNS_BITMAP (4)
+
+/* Number of PBLK_SECTOR units in one exposed page (4096 / 512 = 8) */
+#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
+
+/* Static pool sizes */
+#define PBLK_GEN_WS_POOL_SIZE (2)
+
+/* Presumably the default for pblk->op (over-provisioning percentage) */
+#define PBLK_DEFAULT_OP (11)
+
+/* Request types, passed as the type argument of pblk_alloc_rqd() and
+ * pblk_free_rqd(). The first two reuse the block layer READ/WRITE values.
+ */
+enum {
+ PBLK_READ = READ,
+ PBLK_WRITE = WRITE,/* Write from write buffer */
+ PBLK_WRITE_INT, /* Internal write - no write buffer */
+ PBLK_READ_RECOV, /* Recovery read - errors allowed */
+ PBLK_ERASE,
+};
+
+/* Single-bit flags; each value occupies its own bit so they can be combined.
+ * Presumably stored in pblk_w_ctx.flags - confirm against users.
+ */
+enum {
+ /* IO Types */
+ PBLK_IOTYPE_USER = 1 << 0,
+ PBLK_IOTYPE_GC = 1 << 1,
+
+ /* Write buffer flags */
+ PBLK_FLUSH_ENTRY = 1 << 2,
+ PBLK_WRITTEN_DATA = 1 << 3,
+ PBLK_SUBMITTED_ENTRY = 1 << 4,
+ PBLK_WRITABLE_ENTRY = 1 << 5,
+};
+
+/* Block state values (open / closed); users are outside this header */
+enum {
+ PBLK_BLK_ST_OPEN = 0x1,
+ PBLK_BLK_ST_CLOSED = 0x2,
+};
+
+/* Per-sector metadata record: one reserved 64-bit word followed by the
+ * little-endian lba. pblk_dma_meta_size reserves PBLK_MAX_REQ_ADDRS of these.
+ */
+struct pblk_sec_meta {
+ u64 reserved;
+ __le64 lba;
+};
+
+/* The number of GC lists and the rate-limiter states go together. This way the
+ * rate-limiter can dictate how much GC is needed based on resource utilization.
+ */
+#define PBLK_GC_NR_LISTS 4
+
+/* Rate-limiter states; presumably the values held in pblk_rl.rb_state */
+enum {
+ PBLK_RL_OFF = 0,
+ PBLK_RL_WERR = 1,
+ PBLK_RL_HIGH = 2,
+ PBLK_RL_MID = 3,
+ PBLK_RL_LOW = 4
+};
+
+/* Byte sizes of two per-request arrays of PBLK_MAX_REQ_ADDRS entries each:
+ * one of struct pblk_sec_meta and one of u64.
+ */
+#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
+#define pblk_dma_ppa_size (sizeof(u64) * PBLK_MAX_REQ_ADDRS)
+
+/* write buffer completion context
+ *
+ * Obtained from a write request with nvm_rq_to_pdu(). The same structure is
+ * queued on pblk->resubmit_list (through @list) for failed writes.
+ */
+struct pblk_c_ctx {
+ struct list_head list; /* Head for out-of-order completion */
+
+ unsigned long *lun_bitmap; /* Luns used on current request */
+ unsigned int sentry; /* Write buffer position of the first entry */
+ unsigned int nr_valid; /* Entries carrying data */
+ unsigned int nr_padded; /* Padding entries following the valid ones */
+};
+
+/* read context - sized together with struct nvm_rq by pblk_g_rq_size */
+struct pblk_g_ctx {
+ void *private;
+ unsigned long start_time;
+ u64 lba;
+};
+
+/* partial read context */
+struct pblk_pr_ctx {
+ struct bio *orig_bio;
+ DECLARE_BITMAP(bitmap, NVM_MAX_VLBA); /* One bit per sector, up to NVM_MAX_VLBA */
+ unsigned int orig_nr_secs;
+ unsigned int bio_init_idx;
+ void *ppa_ptr;
+ dma_addr_t dma_ppa_list;
+};
+
+/* Pad context: a completion to wait on plus a reference count */
+struct pblk_pad_rq {
+ struct pblk *pblk;
+ struct completion wait;
+ struct kref ref;
+};
+
+/* Recovery context: pairs a request with the work item that processes it */
+struct pblk_rec_ctx {
+ struct pblk *pblk;
+ struct nvm_rq *rqd;
+ struct work_struct ws_rec;
+};
+
+/* Write context - embedded in every write buffer entry (pblk_rb_entry) */
+struct pblk_w_ctx {
+ struct bio_list bios; /* Original bios - used for completion
+ * in REQ_FUA, REQ_FLUSH case
+ */
+ u64 lba; /* Logical addr. associated with entry */
+ struct ppa_addr ppa; /* Physical addr. associated with entry */
+ int flags; /* Write context flags */
+};
+
+/* One slot of the write ring buffer (see pblk_rb.entries) */
+struct pblk_rb_entry {
+ struct ppa_addr cacheline; /* Cacheline for this entry */
+ void *data; /* Pointer to data on this entry */
+ struct pblk_w_ctx w_ctx; /* Context for this entry */
+ struct list_head index; /* List head to enable indexes */
+};
+
+/* All-ones unsigned int sentinel */
+#define EMPTY_ENTRY (~0U)
+
+/* A page allocation of the given order, linked on pblk_rb.pages */
+struct pblk_rb_pages {
+ struct page *pages;
+ int order;
+ struct list_head list;
+};
+
+/* Write ring buffer: an array of entries plus the offsets that track how far
+ * each stage (write, submit, sync, flush, l2p update) has progressed.
+ */
+struct pblk_rb {
+ struct pblk_rb_entry *entries; /* Ring buffer entries */
+ unsigned int mem; /* Write offset - points to next
+ * writable entry in memory
+ */
+ unsigned int subm; /* Read offset - points to last entry
+ * that has been submitted to the media
+ * to be persisted
+ */
+ unsigned int sync; /* Synced - backpointer that signals
+ * the last submitted entry that has
+ * been successfully persisted to media
+ */
+ unsigned int flush_point; /* Sync point - last entry that must be
+ * flushed to the media. Used with
+ * REQ_FLUSH and REQ_FUA
+ */
+ unsigned int l2p_update; /* l2p update point - next entry for
+ * which l2p mapping will be updated to
+ * contain a device ppa address (instead
+ * of a cacheline)
+ */
+ unsigned int nr_entries; /* Number of entries in write buffer -
+ * must be a power of two
+ */
+ unsigned int seg_size; /* Size of the data segments being
+ * stored on each entry. Typically this
+ * will be 4KB
+ */
+
+ struct list_head pages; /* List of data pages */
+
+ spinlock_t w_lock; /* Write lock */
+ spinlock_t s_lock; /* Sync lock */
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */
+#endif
+};
+
+#define PBLK_RECOVERY_SECTORS 16
+
+/* Per-LUN state: an address for the LUN and a write semaphore */
+struct pblk_lun {
+ struct ppa_addr bppa;
+ struct semaphore wr_sem;
+};
+
+/* One garbage-collection request against @line; the address and lba arrays
+ * hold at most PBLK_MAX_REQ_ADDRS entries each.
+ */
+struct pblk_gc_rq {
+ struct pblk_line *line;
+ void *data;
+ u64 paddr_list[PBLK_MAX_REQ_ADDRS];
+ u64 lba_list[PBLK_MAX_REQ_ADDRS];
+ int nr_secs;
+ int secs_to_gc;
+ struct list_head list;
+};
+
+/* Garbage collection state: control flags, worker threads and workqueues,
+ * and the read/write request lists with their locks.
+ */
+struct pblk_gc {
+ /* These states are not protected by a lock since (i) they are in the
+ * fast path, and (ii) they are not critical.
+ */
+ int gc_active;
+ int gc_enabled;
+ int gc_forced;
+
+ struct task_struct *gc_ts;
+ struct task_struct *gc_writer_ts;
+ struct task_struct *gc_reader_ts;
+
+ struct workqueue_struct *gc_line_reader_wq;
+ struct workqueue_struct *gc_reader_wq;
+
+ struct timer_list gc_timer;
+
+ struct semaphore gc_sem;
+ atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */
+ atomic_t pipeline_gc; /* Number of lines in the GC pipeline -
+ * started reads to finished writes
+ */
+ int w_entries;
+
+ struct list_head w_list;
+ struct list_head r_list;
+
+ spinlock_t lock;
+ spinlock_t w_lock;
+ spinlock_t r_lock;
+};
+
+/* Rate limiter state: thresholds, write buffer budgets for user and GC I/O,
+ * and free block accounting.
+ */
+struct pblk_rl {
+ unsigned int high; /* Upper threshold for rate limiter (free run -
+ * user I/O rate limiter)
+ */
+ unsigned int high_pw; /* High rounded up as a power of 2 */
+
+#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */
+#define PBLK_USER_LOW_THRS 10 /* Aggressive GC at 10% available blocks */
+
+ int rb_windows_pw; /* Number of rate windows in the write buffer
+ * given as a power-of-2. This guarantees that
+ * when user I/O is being rate limited, there
+ * will be reserved enough space for the GC to
+ * place its payload. A window is of
+ * pblk->max_write_pgs size, which in NVMe is
+ * 64, i.e., 256kb.
+ */
+ int rb_budget; /* Total number of entries available for I/O */
+ int rb_user_max; /* Max buffer entries available for user I/O */
+ int rb_gc_max; /* Max buffer entries available for GC I/O */
+ int rb_gc_rsv; /* Reserved buffer entries for GC I/O */
+ int rb_state; /* Rate-limiter current state */
+ int rb_max_io; /* Maximum size for an I/O giving the config */
+
+ atomic_t rb_user_cnt; /* User I/O buffer counter */
+ atomic_t rb_gc_cnt; /* GC I/O buffer counter */
+ atomic_t rb_space; /* Space limit in case of reaching capacity */
+
+ int rsv_blocks; /* Reserved blocks for GC */
+
+ int rb_user_active;
+ int rb_gc_active;
+
+ atomic_t werr_lines; /* Number of write error lines that needs gc */
+
+ struct timer_list u_timer;
+
+ unsigned long long nr_secs;
+ unsigned long total_blocks;
+
+ atomic_t free_blocks; /* Total number of free blocks (+ OP) */
+ atomic_t free_user_blocks; /* Number of user free blocks (no OP) */
+};
+
+/* All-ones unsigned int sentinel */
+#define PBLK_LINE_EMPTY (~0U)
+
+/* Line type, state and GC group constants. The three groups use disjoint
+ * value ranges (0-2, 9-15, 20-26); per the field comments in struct pblk_line
+ * they are stored in its type, state and gc_group members respectively.
+ */
+enum {
+ /* Line Types */
+ PBLK_LINETYPE_FREE = 0,
+ PBLK_LINETYPE_LOG = 1,
+ PBLK_LINETYPE_DATA = 2,
+
+ /* Line state */
+ PBLK_LINESTATE_NEW = 9,
+ PBLK_LINESTATE_FREE = 10,
+ PBLK_LINESTATE_OPEN = 11,
+ PBLK_LINESTATE_CLOSED = 12,
+ PBLK_LINESTATE_GC = 13,
+ PBLK_LINESTATE_BAD = 14,
+ PBLK_LINESTATE_CORRUPT = 15,
+
+ /* GC group */
+ PBLK_LINEGC_NONE = 20,
+ PBLK_LINEGC_EMPTY = 21,
+ PBLK_LINEGC_LOW = 22,
+ PBLK_LINEGC_MID = 23,
+ PBLK_LINEGC_HIGH = 24,
+ PBLK_LINEGC_FULL = 25,
+ PBLK_LINEGC_WERR = 26
+};
+
+/* The bytes 0x70 0x62 0x6c 0x6b are the ASCII characters 'p' 'b' 'l' 'k' */
+#define PBLK_MAGIC 0x70626c6b /*pblk*/
+
+/* emeta/smeta persistent storage format versions:
+ * Changes in major version requires offline migration.
+ * Changes in minor version are handled automatically during
+ * recovery.
+ */
+
+#define SMETA_VERSION_MAJOR (0)
+#define SMETA_VERSION_MINOR (1)
+
+#define EMETA_VERSION_MAJOR (0)
+#define EMETA_VERSION_MINOR (2)
+
+/* Common header that opens both struct line_smeta and struct line_emeta.
+ * Multi-byte fields are little-endian.
+ */
+struct line_header {
+ __le32 crc;
+ __le32 identifier; /* pblk identifier */
+ __u8 uuid[16]; /* instance uuid */
+ __le16 type; /* line type */
+ __u8 version_major; /* version major */
+ __u8 version_minor; /* version minor */
+ __le32 id; /* line id for current line */
+};
+
+/* Start-of-line metadata in its stored, little-endian form. Ends in a
+ * flexible array, so the total size depends on the LUN bitmap length.
+ */
+struct line_smeta {
+ struct line_header header;
+
+ __le32 crc; /* Full structure including struct crc */
+ /* Previous line metadata */
+ __le32 prev_id; /* Line id for previous line */
+
+ /* Current line metadata */
+ __le64 seq_nr; /* Sequence number for current line */
+
+ /* Active writers */
+ __le32 window_wr_lun; /* Number of parallel LUNs to write */
+
+ __le32 rsvd[2];
+
+ __le64 lun_bitmap[];
+};
+
+
+/*
+ * Metadata layout in media:
+ * First sector:
+ * 1. struct line_emeta
+ * 2. bad block bitmap (u64 * window_wr_lun)
+ * 3. write amplification counters
+ * Mid sectors (start at lbas_sector):
+ * 4. nr_lbas (u64) forming lba list
+ * Last sectors (start at vsc_sector):
+ * 5. u32 valid sector count (vsc) for all lines (~0U: free line)
+ */
+struct line_emeta {
+ struct line_header header;
+
+ __le32 crc; /* Full structure including struct crc */
+
+ /* Previous line metadata */
+ __le32 prev_id; /* Line id for prev line */
+
+ /* Current line metadata */
+ __le64 seq_nr; /* Sequence number for current line */
+
+ /* Active writers */
+ __le32 window_wr_lun; /* Number of parallel LUNs to write */
+
+ /* Bookkeeping for recovery */
+ __le32 next_id; /* Line id for next line */
+ __le64 nr_lbas; /* Number of lbas mapped in line */
+ __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */
+ __le64 bb_bitmap[]; /* Updated bad block bitmap for line */
+};
+
+
+/* Write amplification counters stored on media (little-endian) */
+struct wa_counters {
+ __le64 user; /* Number of user written sectors */
+ __le64 gc; /* Number of sectors written by GC */
+ __le64 pad; /* Number of padded sectors */
+};
+
+/* In-memory handle for a line's end metadata. The write path advances @mem
+ * as emeta is submitted and treats the line as done once @mem reaches the
+ * total emeta length (pblk_line_meta.emeta_len[0]).
+ */
+struct pblk_emeta {
+ struct line_emeta *buf; /* emeta buffer in media format */
+ int mem; /* Write offset - points to next
+ * writable entry in memory
+ */
+ atomic_t sync; /* Synced - backpointer that signals the
+ * last entry that has been successfully
+ * persisted to media
+ */
+ unsigned int nr_entries; /* Number of emeta entries */
+};
+
+/* In-memory handle for a line's start metadata */
+struct pblk_smeta {
+ struct line_smeta *buf; /* smeta buffer in persistent format */
+};
+
+/* Write error gc recovery metadata, referenced from pblk_line.w_err_gc */
+struct pblk_w_err_gc {
+ int has_write_err;
+ __le64 *lba_list;
+};
+
+/* A line: identity, state, start/end metadata, the bitmaps describing its
+ * blocks and sectors, and mapping progress.
+ */
+struct pblk_line {
+ struct pblk *pblk;
+ unsigned int id; /* Line number corresponds to the
+ * block line
+ */
+ unsigned int seq_nr; /* Unique line sequence number */
+
+ int state; /* PBLK_LINESTATE_X */
+ int type; /* PBLK_LINETYPE_X */
+ int gc_group; /* PBLK_LINEGC_X */
+ struct list_head list; /* Free, GC lists */
+
+ unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */
+
+ struct nvm_chk_meta *chks; /* Chunks forming line */
+
+ struct pblk_smeta *smeta; /* Start metadata */
+ struct pblk_emeta *emeta; /* End metadata */
+
+ int meta_line; /* Metadata line id */
+ int meta_distance; /* Distance between data and metadata */
+
+ u64 smeta_ssec; /* Sector where smeta starts */
+ u64 emeta_ssec; /* Sector where emeta starts */
+
+ unsigned int sec_in_line; /* Number of usable secs in line */
+
+ atomic_t blk_in_line; /* Number of good blocks in line */
+ unsigned long *blk_bitmap; /* Bitmap for valid/invalid blocks */
+ unsigned long *erase_bitmap; /* Bitmap for erased blocks */
+
+ unsigned long *map_bitmap; /* Bitmap for mapped sectors in line */
+ unsigned long *invalid_bitmap; /* Bitmap for invalid sectors in line */
+
+ atomic_t left_eblks; /* Blocks left for erasing */
+ atomic_t left_seblks; /* Blocks left for sync erasing */
+
+ int left_msecs; /* Sectors left for mapping */
+ unsigned int cur_sec; /* Sector map pointer */
+ unsigned int nr_valid_lbas; /* Number of valid lbas in line */
+
+ __le32 *vsc; /* Valid sector count in line */
+
+ struct kref ref; /* Write buffer L2P references */
+
+ struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */
+
+ spinlock_t lock; /* Necessary for invalid_bitmap only */
+};
+
+/* Sizes the pre-allocated sline_meta / eline_meta arrays in pblk_line_mgmt */
+#define PBLK_DATA_LINES 4
+
+/* Metadata allocation type; see pblk_line_mgmt.emeta_alloc_type */
+enum {
+ PBLK_KMALLOC_META = 1,
+ PBLK_VMALLOC_META = 2,
+};
+
+/* Identifiers for the three emeta regions */
+enum {
+ PBLK_EMETA_TYPE_HEADER = 1, /* struct line_emeta first sector */
+ PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */
+ PBLK_EMETA_TYPE_VSC = 3, /* vsc list - type: __le32 */
+};
+
+/* Line management: the free, GC and emeta lists, the current and next
+ * log/data lines, pre-allocated line metadata, and the locks guarding them.
+ */
+struct pblk_line_mgmt {
+ int nr_lines; /* Total number of full lines */
+ int nr_free_lines; /* Number of full lines in free list */
+
+ /* Free lists - use free_lock */
+ struct list_head free_list; /* Full lines ready to use */
+ struct list_head corrupt_list; /* Full lines corrupted */
+ struct list_head bad_list; /* Full lines bad */
+
+ /* GC lists - use gc_lock */
+ struct list_head *gc_lists[PBLK_GC_NR_LISTS];
+ struct list_head gc_high_list; /* Full lines ready to GC, high isc */
+ struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */
+ struct list_head gc_low_list; /* Full lines ready to GC, low isc */
+
+ struct list_head gc_werr_list; /* Write err recovery list */
+
+ struct list_head gc_full_list; /* Full lines ready to GC, no valid */
+ struct list_head gc_empty_list; /* Full lines close, all valid */
+
+ struct pblk_line *log_line; /* Current FTL log line */
+ struct pblk_line *data_line; /* Current data line */
+ struct pblk_line *log_next; /* Next FTL log line */
+ struct pblk_line *data_next; /* Next data line */
+
+ /* The write path accesses this list under close_lock */
+ struct list_head emeta_list; /* Lines queued to schedule emeta */
+
+ __le32 *vsc_list; /* Valid sector counts for all lines */
+
+ /* Metadata allocation type: VMALLOC | KMALLOC */
+ int emeta_alloc_type;
+
+ /* Pre-allocated metadata for data lines */
+ struct pblk_smeta *sline_meta[PBLK_DATA_LINES];
+ struct pblk_emeta *eline_meta[PBLK_DATA_LINES];
+ unsigned long meta_bitmap;
+
+ /* Helpers for fast bitmap calculations */
+ unsigned long *bb_template;
+ unsigned long *bb_aux;
+
+ unsigned long d_seq_nr; /* Data line unique sequence number */
+ unsigned long l_seq_nr; /* Log line unique sequence number */
+
+ spinlock_t free_lock;
+ spinlock_t close_lock;
+ spinlock_t gc_lock;
+};
+
+/* Geometry shared by all lines: metadata lengths and sector counts, bitmap
+ * lengths, per-line block/sector counts and GC thresholds.
+ */
+struct pblk_line_meta {
+ unsigned int smeta_len; /* Total length for smeta */
+ unsigned int smeta_sec; /* Sectors needed for smeta */
+
+ unsigned int emeta_len[4]; /* Lengths for emeta:
+ * [0]: Total
+ * [1]: struct line_emeta +
+ * bb_bitmap + struct wa_counters
+ * [2]: L2P portion
+ * [3]: vsc
+ */
+ unsigned int emeta_sec[4]; /* Sectors needed for emeta. Same layout
+ * as emeta_len
+ */
+
+ unsigned int emeta_bb; /* Boundary for bb that affects emeta */
+
+ unsigned int vsc_list_len; /* Length for vsc list */
+ unsigned int sec_bitmap_len; /* Length for sector bitmap in line */
+ unsigned int blk_bitmap_len; /* Length for block bitmap in line */
+ unsigned int lun_bitmap_len; /* Length for lun bitmap in line */
+
+ unsigned int blk_per_line; /* Number of blocks in a full line */
+ unsigned int sec_per_line; /* Number of sectors in a line */
+ unsigned int dsec_per_line; /* Number of data sectors in a line */
+ unsigned int min_blk_line; /* Min. number of good blocks in line */
+
+ unsigned int mid_thrs; /* Threshold for GC mid list */
+ unsigned int high_thrs; /* Threshold for GC high list */
+
+ unsigned int meta_distance; /* Distance between data and metadata */
+};
+
+/* Instance states; presumably the values stored in pblk->state - confirm */
+enum {
+ PBLK_STATE_RUNNING = 0,
+ PBLK_STATE_STOPPING = 1,
+ PBLK_STATE_RECOVERING = 2,
+ PBLK_STATE_STOPPED = 3,
+};
+
+/* Internal format to support not power-of-2 device formats.
+ * Stored in struct pblk as uaddrf, the unaligned address format.
+ */
+struct pblk_addrf {
+ /* gen to dev */
+ int sec_stripe;
+ int ch_stripe;
+ int lun_stripe;
+
+ /* dev to gen */
+ int sec_lun_stripe;
+ int sec_ws_stripe;
+};
+
+/* Top-level state of one pblk instance: the backing device and disk, line
+ * bookkeeping, the write ring buffer, rate limiter, counters, the L2P
+ * translation map, memory pools, workqueues and GC state.
+ */
+struct pblk {
+ struct nvm_tgt_dev *dev;
+ struct gendisk *disk; /* disk_name is used as the log prefix */
+
+ struct kobject kobj;
+
+ struct pblk_lun *luns;
+
+ struct pblk_line *lines; /* Line array */
+ struct pblk_line_mgmt l_mg; /* Line management */
+ struct pblk_line_meta lm; /* Line metadata */
+
+ struct nvm_addrf addrf; /* Aligned address format */
+ struct pblk_addrf uaddrf; /* Unaligned address format */
+ int addrf_len;
+
+ struct pblk_rb rwb;
+
+ int state; /* pblk line state */
+
+ int min_write_pgs; /* Minimum amount of pages required by controller */
+ int max_write_pgs; /* Maximum amount of pages supported by controller */
+
+ sector_t capacity; /* Device capacity when bad blocks are subtracted */
+
+ int op; /* Percentage of device used for over-provisioning */
+ int op_blks; /* Number of blocks used for over-provisioning */
+
+ /* pblk provisioning values. Used by rate limiter */
+ struct pblk_rl rl;
+
+ int sec_per_write;
+
+ unsigned char instance_uuid[16];
+
+ /* Persistent write amplification counters, 4kb sector I/Os */
+ atomic64_t user_wa; /* Sectors written by user */
+ atomic64_t gc_wa; /* Sectors written by GC */
+ atomic64_t pad_wa; /* Padded sectors written */
+
+ /* Reset values for delta write amplification measurements */
+ u64 user_rst_wa;
+ u64 gc_rst_wa;
+ u64 pad_rst_wa;
+
+ /* Counters used for calculating padding distribution */
+ atomic64_t *pad_dist; /* Padding distribution buckets */
+ u64 nr_flush_rst; /* Flushes reset value for pad dist.*/
+ atomic64_t nr_flush; /* Number of flush/fua I/O */
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+ /* Non-persistent debug counters, 4kb sector I/Os */
+ atomic_long_t inflight_writes; /* Inflight writes (user and gc) */
+ atomic_long_t padded_writes; /* Sectors padded due to flush/fua */
+ atomic_long_t padded_wb; /* Sectors padded in write buffer */
+ atomic_long_t req_writes; /* Sectors stored on write buffer */
+ atomic_long_t sub_writes; /* Sectors submitted from buffer */
+ atomic_long_t sync_writes; /* Sectors synced to media */
+ atomic_long_t inflight_reads; /* Inflight sector read requests */
+ atomic_long_t cache_reads; /* Read requests that hit the cache */
+ atomic_long_t sync_reads; /* Completed sector read requests */
+ atomic_long_t recov_writes; /* Sectors submitted from recovery */
+ atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */
+ atomic_long_t recov_gc_reads; /* Sectors submitted from read GC */
+#endif
+
+ spinlock_t lock;
+
+ atomic_long_t read_failed;
+ atomic_long_t read_empty;
+ atomic_long_t read_high_ecc;
+ atomic_long_t read_failed_gc;
+ atomic_long_t write_failed;
+ atomic_long_t erase_failed;
+
+ atomic_t inflight_io; /* General inflight I/O counter */
+
+ struct task_struct *writer_ts;
+
+ /* Simple translation map of logical addresses to physical addresses.
+ * The logical addresses are known by the host system, while the
+ * physical addresses are used when writing to the disk block device.
+ */
+ unsigned char *trans_map;
+ spinlock_t trans_lock;
+
+ struct list_head compl_list;
+
+ spinlock_t resubmit_lock; /* Resubmit list lock */
+ struct list_head resubmit_list; /* Resubmit list for failed writes*/
+
+ mempool_t page_bio_pool;
+ mempool_t gen_ws_pool;
+ mempool_t rec_pool;
+ mempool_t r_rq_pool;
+ mempool_t w_rq_pool;
+ mempool_t e_rq_pool;
+
+ struct workqueue_struct *close_wq;
+ struct workqueue_struct *bb_wq;
+ struct workqueue_struct *r_end_wq;
+
+ struct timer_list wtimer;
+
+ struct pblk_gc gc;
+};
+
+/* Work item bound to a line; @priv is an opaque pointer for the work
+ * function (see the priv argument of pblk_gen_run_ws()).
+ */
+struct pblk_line_ws {
+ struct pblk *pblk;
+ struct pblk_line *line;
+ void *priv;
+ struct work_struct ws;
+};
+
+/* Allocation sizes for a request followed by its read (g) or write (c) context */
+#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx))
+#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
+
+/*
+ * Logging helpers. Each forwards to the matching pr_*() call and prefixes the
+ * message with "pblk <disk name>: ". @fmt must be a string literal because it
+ * is concatenated with the prefix at compile time; callers supply the trailing
+ * newline. The pblk argument is parenthesized so that any pointer expression
+ * can be passed.
+ */
+#define pblk_err(pblk, fmt, ...) \
+	pr_err("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
+#define pblk_info(pblk, fmt, ...) \
+	pr_info("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
+#define pblk_warn(pblk, fmt, ...) \
+	pr_warn("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
+#define pblk_debug(pblk, fmt, ...) \
+	pr_debug("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
+
+/*
+ * pblk ring buffer operations
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+ unsigned int power_size, unsigned int power_seg_sz);
+unsigned int pblk_rb_calculate_size(unsigned int nr_entries);
+void *pblk_rb_entries_ref(struct pblk_rb *rb);
+int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
+ unsigned int nr_entries, unsigned int *pos);
+int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int *pos);
+void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, unsigned int pos);
+void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, struct pblk_line *line,
+ u64 paddr, unsigned int pos);
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos);
+void pblk_rb_flush(struct pblk_rb *rb);
+
+void pblk_rb_sync_l2p(struct pblk_rb *rb);
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
+ unsigned int pos, unsigned int nr_entries,
+ unsigned int count);
+int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
+ struct ppa_addr ppa, int bio_iter, bool advanced_bio);
+unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
+
+unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags);
+unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+ struct ppa_addr *ppa);
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
+unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb);
+
+unsigned int pblk_rb_read_count(struct pblk_rb *rb);
+unsigned int pblk_rb_sync_count(struct pblk_rb *rb);
+unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos);
+
+int pblk_rb_tear_down_check(struct pblk_rb *rb);
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos);
+void pblk_rb_data_free(struct pblk_rb *rb);
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf);
+
+/*
+ * pblk core
+ */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type);
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type);
+void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write);
+int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx);
+void pblk_discard(struct pblk *pblk, struct bio *bio);
+struct nvm_chk_meta *pblk_chunk_get_info(struct pblk *pblk);
+struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk,
+ struct nvm_chk_meta *lp,
+ struct ppa_addr ppa);
+void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
+void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line);
+struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
+ unsigned int nr_secs, unsigned int len,
+ int alloc_type, gfp_t gfp_mask);
+struct pblk_line *pblk_line_get(struct pblk *pblk);
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
+int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
+struct pblk_line *pblk_line_get_data(struct pblk *pblk);
+struct pblk_line *pblk_line_get_erase(struct pblk *pblk);
+int pblk_line_erase(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_is_full(struct pblk_line *line);
+void pblk_line_free(struct pblk_line *line);
+void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_close(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_close_ws(struct work_struct *work);
+void pblk_pipeline_stop(struct pblk *pblk);
+void __pblk_pipeline_stop(struct pblk *pblk);
+void __pblk_pipeline_flush(struct pblk *pblk);
+void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+ void (*work)(struct work_struct *), gfp_t gfp_mask,
+ struct workqueue_struct *wq);
+u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
+ void *emeta_buf);
+int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa);
+void pblk_line_put(struct kref *ref);
+void pblk_line_put_wq(struct kref *ref);
+struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line);
+u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line);
+void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
+u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
+u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
+int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
+ unsigned long secs_to_flush);
+void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas);
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+ unsigned long *lun_bitmap);
+void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas);
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+ unsigned long *lun_bitmap);
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+ int nr_pages);
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+ int nr_pages);
+void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa);
+void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr);
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa);
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba,
+ struct ppa_addr ppa);
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
+ struct ppa_addr ppa, struct ppa_addr entry_line);
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+ struct pblk_line *gc_line, u64 paddr);
+void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
+ u64 *lba_list, int nr_secs);
+void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+ sector_t blba, int nr_secs);
+
+/*
+ * pblk user I/O write path
+ */
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
+ unsigned long flags);
+int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
+
+/*
+ * pblk map
+ */
+void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned int sentry, unsigned long *lun_bitmap,
+ unsigned int valid_secs, struct ppa_addr *erase_ppa);
+void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+ unsigned long *lun_bitmap, unsigned int valid_secs,
+ unsigned int off);
+
+/*
+ * pblk write thread
+ */
+int pblk_write_ts(void *data);
+void pblk_write_timer_fn(struct timer_list *t);
+void pblk_write_should_kick(struct pblk *pblk);
+void pblk_write_kick(struct pblk *pblk);
+
+/*
+ * pblk read path
+ */
+extern struct bio_set pblk_bio_set;
+int pblk_submit_read(struct pblk *pblk, struct bio *bio);
+int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
+/*
+ * pblk recovery
+ */
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
+int pblk_recov_pad(struct pblk *pblk);
+int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
+
+/*
+ * pblk gc
+ */
+#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */
+#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */
+#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */
+#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */
+
+int pblk_gc_init(struct pblk *pblk);
+void pblk_gc_exit(struct pblk *pblk, bool graceful);
+void pblk_gc_should_start(struct pblk *pblk);
+void pblk_gc_should_stop(struct pblk *pblk);
+void pblk_gc_should_kick(struct pblk *pblk);
+void pblk_gc_free_full_lines(struct pblk *pblk);
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+ int *gc_active);
+int pblk_gc_sysfs_force(struct pblk *pblk, int force);
+
+/*
+ * pblk rate limiter
+ */
+void pblk_rl_init(struct pblk_rl *rl, int budget);
+void pblk_rl_free(struct pblk_rl *rl);
+void pblk_rl_update_rates(struct pblk_rl *rl);
+int pblk_rl_high_thrs(struct pblk_rl *rl);
+unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
+unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl);
+int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
+int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
+int pblk_rl_max_io(struct pblk_rl *rl);
+void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
+ bool used);
+int pblk_rl_is_limit(struct pblk_rl *rl);
+
+void pblk_rl_werr_line_in(struct pblk_rl *rl);
+void pblk_rl_werr_line_out(struct pblk_rl *rl);
+
+/*
+ * pblk sysfs
+ */
+int pblk_sysfs_init(struct gendisk *tdisk);
+void pblk_sysfs_exit(struct gendisk *tdisk);
+
+/*
+ * Allocate @size bytes with the allocator selected by @type.
+ *
+ * PBLK_KMALLOC_META requests are served by kmalloc() using @flags. Any other
+ * @type is served by vmalloc(), in which case @flags is not used.
+ */
+static inline void *pblk_malloc(size_t size, int type, gfp_t flags)
+{
+	if (type != PBLK_KMALLOC_META)
+		return vmalloc(size);
+
+	return kmalloc(size, flags);
+}
+
+/*
+ * Release @ptr with the deallocator matching @type: kfree() for
+ * PBLK_KMALLOC_META, vfree() for every other type.
+ */
+static inline void pblk_mfree(void *ptr, int type)
+{
+	if (type != PBLK_KMALLOC_META) {
+		vfree(ptr);
+		return;
+	}
+
+	kfree(ptr);
+}
+
+/*
+ * Return the address located sizeof(struct nvm_rq) bytes before @c_ctx,
+ * typed as a request.
+ *
+ * NOTE(review): presumably the context is stored immediately after its
+ * struct nvm_rq - confirm against the allocation site.
+ */
+static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx)
+{
+	void *rqd = c_ctx - sizeof(struct nvm_rq);
+
+	return rqd;
+}
+
+/* Return a pointer to the bb_bitmap member of @emeta. */
+static inline void *emeta_to_bb(struct line_emeta *emeta)
+{
+	void *bb = emeta->bb_bitmap;
+
+	return bb;
+}
+
+/*
+ * Return the address that follows the bb_bitmap member of @emeta by
+ * lm->blk_bitmap_len elements.
+ *
+ * NOTE(review): the offset is applied with the pointer arithmetic of
+ * bb_bitmap's element type, which is not visible here - confirm that
+ * blk_bitmap_len is expressed in those units and not in bytes.
+ */
+static inline void *emeta_to_wa(struct pblk_line_meta *lm,
+				struct line_emeta *emeta)
+{
+	void *wa = emeta->bb_bitmap + lm->blk_bitmap_len;
+
+	return wa;
+}
+
+/* Return the address pblk->lm.emeta_len[1] bytes past the start of @emeta. */
+static inline void *emeta_to_lbas(struct pblk *pblk, struct line_emeta *emeta)
+{
+	void *base = emeta;
+
+	return base + pblk->lm.emeta_len[1];
+}
+
+/*
+ * Return the address pblk->lm.emeta_len[2] bytes past the pointer returned
+ * by emeta_to_lbas() for @emeta.
+ */
+static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta)
+{
+	void *lbas = emeta_to_lbas(pblk, emeta);
+
+	return lbas + pblk->lm.emeta_len[2];
+}
+
+/* Read the value line->vsc points to and convert it from little endian. */
+static inline int pblk_line_vsc(struct pblk_line *line)
+{
+	int vsc = le32_to_cpu(*line->vsc);
+
+	return vsc;
+}
+
+/*
+ * Return the product of the device geometry fields mw_cunits, all_luns and
+ * ws_opt.
+ */
+static inline int pblk_pad_distance(struct pblk *pblk)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+
+	return geo->mw_cunits * geo->all_luns * geo->ws_opt;
+}
+
+/* Return the line id of @p, which is its a.blk field. */
+static inline int pblk_ppa_to_line(struct ppa_addr p)
+{
+	int line_id = p.a.blk;
+
+	return line_id;
+}
+
+/*
+ * Flatten the (lun, ch) pair of @p into a single index:
+ * lun * geo->num_ch + ch.
+ */
+static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+{
+	return p.a.ch + p.a.lun * geo->num_ch;
+}
+
+/*
+ * Build a device address from a line-relative sector address.
+ *
+ * @pblk:    target instance; supplies the geometry and address formats
+ * @paddr:   sector address inside the line
+ * @line_id: line the address belongs to; stored as the block/chunk id
+ *
+ * For OCSSD 1.2 geometry each field is extracted from @paddr with the
+ * mask/offset pairs in pblk->addrf. For any other version @paddr is
+ * decomposed by successive division by the sec_stripe, ch_stripe and
+ * lun_stripe values in pblk->uaddrf; the quotient left after the last
+ * division is scaled by sec_stripe and added to the sector field.
+ *
+ * Returns the assembled address. Bits not written by either path are zero.
+ */
+static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
+					      u64 line_id)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct ppa_addr ppa;
+
+	ppa.ppa = 0;
+
+	if (geo->version == NVM_OCSSD_SPEC_12) {
+		struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf;
+
+		ppa.g.blk = line_id;
+		ppa.g.pg = (paddr & ppaf->pg_mask) >> ppaf->pg_offset;
+		ppa.g.lun = (paddr & ppaf->lun_mask) >> ppaf->lun_offset;
+		ppa.g.ch = (paddr & ppaf->ch_mask) >> ppaf->ch_offset;
+		ppa.g.pl = (paddr & ppaf->pln_mask) >> ppaf->pln_offset;
+		ppa.g.sec = (paddr & ppaf->sec_mask) >> ppaf->sec_offset;
+	} else {
+		struct pblk_addrf *uaddrf = &pblk->uaddrf;
+		/* div_u64_rem() stores its remainder through a u32 pointer */
+		u32 secs, chnls, luns;
+
+		ppa.m.chk = line_id;
+
+		paddr = div_u64_rem(paddr, uaddrf->sec_stripe, &secs);
+		ppa.m.sec = secs;
+
+		paddr = div_u64_rem(paddr, uaddrf->ch_stripe, &chnls);
+		ppa.m.grp = chnls;
+
+		paddr = div_u64_rem(paddr, uaddrf->lun_stripe, &luns);
+		ppa.m.pu = luns;
+
+		/* whatever is left counts whole sector stripes */
+		ppa.m.sec += uaddrf->sec_stripe * paddr;
+	}
+
+	return ppa;
+}
+
+/*
+ * Convert device address @p into a line-relative sector address.
+ *
+ * For OCSSD 1.2 geometry the ch, lun, pg, pl and sec fields are shifted to
+ * the offsets given by pblk->addrf and ORed together. For any other version
+ * the address is the sum of: grp * sec_stripe, pu * sec_lun_stripe,
+ * (sec / sec_stripe) * sec_ws_stripe and sec % sec_stripe, with the stripe
+ * sizes taken from pblk->uaddrf.
+ *
+ * Returns the computed line address.
+ */
+static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
+					    struct ppa_addr p)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	u64 paddr;
+
+	if (geo->version == NVM_OCSSD_SPEC_12) {
+		struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf;
+
+		paddr = (u64)p.g.ch << ppaf->ch_offset;
+		paddr |= (u64)p.g.lun << ppaf->lun_offset;
+		paddr |= (u64)p.g.pg << ppaf->pg_offset;
+		paddr |= (u64)p.g.pl << ppaf->pln_offset;
+		paddr |= (u64)p.g.sec << ppaf->sec_offset;
+	} else {
+		struct pblk_addrf *uaddrf = &pblk->uaddrf;
+		u64 secs = p.m.sec;
+		/*
+		 * Unsigned on purpose: div_u64_rem() writes through a u32
+		 * pointer, and a signed remainder would be sign-extended
+		 * when added to the 64-bit address below.
+		 */
+		u32 sec_stripe;
+
+		paddr = (u64)p.m.grp * uaddrf->sec_stripe;
+		paddr += (u64)p.m.pu * uaddrf->sec_lun_stripe;
+
+		secs = div_u64_rem(secs, uaddrf->sec_stripe, &sec_stripe);
+		paddr += secs * uaddrf->sec_ws_stripe;
+		paddr += sec_stripe;
+	}
+
+	return paddr;
+}
+
+/*
+ * Expand a packed 32-bit address into a 64-bit struct ppa_addr.
+ *
+ * An all-ones @ppa32 decodes to ADDR_EMPTY. When bit 31 is set the low 31
+ * bits become c.line and c.is_cached is set. Otherwise the fields are
+ * extracted with the masks/offsets in pblk->addrf, using the 1.2 layout
+ * (ch/lun/blk/pg/pl/sec) when geo->version is NVM_OCSSD_SPEC_12 and the
+ * grp/pu/chk/sec layout for any other version.
+ */
+static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
+{
+	struct ppa_addr ppa64;
+	struct nvm_geo *geo;
+
+	ppa64.ppa = 0;
+
+	if (ppa32 == ~0U) {
+		ppa64.ppa = ADDR_EMPTY;
+		return ppa64;
+	}
+
+	if (ppa32 & (1U << 31)) {
+		ppa64.c.line = ppa32 & ((~0U) >> 1);
+		ppa64.c.is_cached = 1;
+		return ppa64;
+	}
+
+	geo = &pblk->dev->geo;
+
+	if (geo->version == NVM_OCSSD_SPEC_12) {
+		struct nvm_addrf_12 *f12 = (struct nvm_addrf_12 *)&pblk->addrf;
+
+		ppa64.g.ch = (ppa32 & f12->ch_mask) >> f12->ch_offset;
+		ppa64.g.lun = (ppa32 & f12->lun_mask) >> f12->lun_offset;
+		ppa64.g.blk = (ppa32 & f12->blk_mask) >> f12->blk_offset;
+		ppa64.g.pg = (ppa32 & f12->pg_mask) >> f12->pg_offset;
+		ppa64.g.pl = (ppa32 & f12->pln_mask) >> f12->pln_offset;
+		ppa64.g.sec = (ppa32 & f12->sec_mask) >> f12->sec_offset;
+	} else {
+		struct nvm_addrf *f = &pblk->addrf;
+
+		ppa64.m.grp = (ppa32 & f->ch_mask) >> f->ch_offset;
+		ppa64.m.pu = (ppa32 & f->lun_mask) >> f->lun_offset;
+		ppa64.m.chk = (ppa32 & f->chk_mask) >> f->chk_offset;
+		ppa64.m.sec = (ppa32 & f->sec_mask) >> f->sec_offset;
+	}
+
+	return ppa64;
+}
+
+/*
+ * Pack a 64-bit struct ppa_addr into 32 bits.
+ *
+ * ADDR_EMPTY packs to all ones. A cached address packs to its c.line value
+ * with bit 31 set. Any other address is assembled by shifting each field to
+ * the offset given in pblk->addrf, using the 1.2 field set when geo->version
+ * is NVM_OCSSD_SPEC_12 and the grp/pu/chk/sec field set otherwise.
+ */
+static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
+{
+	struct nvm_geo *geo;
+	u32 ppa32 = 0;
+
+	if (ppa64.ppa == ADDR_EMPTY)
+		return ~0U;
+
+	if (ppa64.c.is_cached) {
+		ppa32 |= ppa64.c.line;
+		ppa32 |= 1U << 31;
+		return ppa32;
+	}
+
+	geo = &pblk->dev->geo;
+
+	if (geo->version == NVM_OCSSD_SPEC_12) {
+		struct nvm_addrf_12 *f12 = (struct nvm_addrf_12 *)&pblk->addrf;
+
+		ppa32 |= ppa64.g.ch << f12->ch_offset;
+		ppa32 |= ppa64.g.lun << f12->lun_offset;
+		ppa32 |= ppa64.g.blk << f12->blk_offset;
+		ppa32 |= ppa64.g.pg << f12->pg_offset;
+		ppa32 |= ppa64.g.pl << f12->pln_offset;
+		ppa32 |= ppa64.g.sec << f12->sec_offset;
+	} else {
+		struct nvm_addrf *f = &pblk->addrf;
+
+		ppa32 |= ppa64.m.grp << f->ch_offset;
+		ppa32 |= ppa64.m.pu << f->lun_offset;
+		ppa32 |= ppa64.m.chk << f->chk_offset;
+		ppa32 |= ppa64.m.sec << f->sec_offset;
+	}
+
+	return ppa32;
+}
+
+/*
+ * Fetch the entry stored for @lba in pblk->trans_map.
+ *
+ * When pblk->addrf_len is below 32 the map is read as an array of u32 and
+ * the entry is expanded with pblk_ppa32_to_ppa64(); otherwise the map is
+ * read as an array of struct ppa_addr.
+ */
+static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
+						 sector_t lba)
+{
+	if (pblk->addrf_len < 32) {
+		u32 *map32 = (u32 *)pblk->trans_map;
+
+		return pblk_ppa32_to_ppa64(pblk, map32[lba]);
+	}
+
+	return ((struct ppa_addr *)pblk->trans_map)[lba];
+}
+
+/*
+ * Store @ppa as the entry for @lba in pblk->trans_map.
+ *
+ * When pblk->addrf_len is below 32 the map is written as an array of u32
+ * and @ppa is packed with pblk_ppa64_to_ppa32(); otherwise the raw 64-bit
+ * value is written.
+ */
+static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
+				      struct ppa_addr ppa)
+{
+	if (pblk->addrf_len < 32) {
+		u32 *map32 = (u32 *)pblk->trans_map;
+
+		map32[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+		return;
+	}
+
+	((u64 *)pblk->trans_map)[lba] = ppa.ppa;
+}
+
+/* Return 1 when @ppa_addr holds ADDR_EMPTY, 0 otherwise. */
+static inline int pblk_ppa_empty(struct ppa_addr ppa_addr)
+{
+	return ppa_addr.ppa != ADDR_EMPTY ? 0 : 1;
+}
+
+/* Overwrite the address @ppa_addr points to with ADDR_EMPTY. */
+static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr)
+{
+	(*ppa_addr).ppa = ADDR_EMPTY;
+}
+
+/* Return true when the raw 64-bit values of @lppa and @rppa are equal. */
+static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa)
+{
+	return !(lppa.ppa != rppa.ppa);
+}
+
+/*
+ * Return 1 when @ppa is not ADDR_EMPTY and has its c.is_cached bit set,
+ * 0 otherwise.
+ */
+static inline int pblk_addr_in_cache(struct ppa_addr ppa)
+{
+	if (ppa.ppa == ADDR_EMPTY)
+		return 0;
+
+	return ppa.c.is_cached ? 1 : 0;
+}
+
+/* Return the c.line field of @ppa. */
+static inline int pblk_addr_to_cacheline(struct ppa_addr ppa)
+{
+	int line = ppa.c.line;
+
+	return line;
+}
+
+/*
+ * Build an address whose c.line is @addr and whose c.is_cached bit is set.
+ *
+ * NOTE(review): only the c.line and c.is_cached fields of the local are
+ * written before it is returned - confirm that those two fields cover the
+ * whole struct ppa_addr.
+ */
+static inline struct ppa_addr pblk_cacheline_to_addr(int addr)
+{
+	struct ppa_addr cached;
+
+	cached.c.is_cached = 1;
+	cached.c.line = addr;
+
+	return cached;
+}
+
+/*
+ * Compute crc32_le(), seeded with all ones, over @header excluding its
+ * first sizeof(u32) bytes. @pblk is not used.
+ */
+static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk,
+					    struct line_header *header)
+{
+	u32 crc = ~(u32)0;
+	unsigned char *payload = (unsigned char *)header + sizeof(crc);
+	size_t len = sizeof(struct line_header) - sizeof(crc);
+
+	return crc32_le(crc, payload, len);
+}
+
+/*
+ * Compute crc32_le(), seeded with all ones, over the part of @smeta that
+ * follows the line header plus one u32, up to lm->smeta_len bytes from the
+ * start of @smeta.
+ */
+static inline u32 pblk_calc_smeta_crc(struct pblk *pblk,
+				      struct line_smeta *smeta)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	u32 crc = ~(u32)0;
+	unsigned char *payload = (unsigned char *)smeta +
+				 sizeof(struct line_header) + sizeof(crc);
+	size_t len = lm->smeta_len - sizeof(struct line_header) - sizeof(crc);
+
+	return crc32_le(crc, payload, len);
+}
+
+/*
+ * Compute crc32_le(), seeded with all ones, over the part of @emeta that
+ * follows the line header plus one u32, up to lm->emeta_len[0] bytes from
+ * the start of @emeta.
+ */
+static inline u32 pblk_calc_emeta_crc(struct pblk *pblk,
+				      struct line_emeta *emeta)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	u32 crc = ~(u32)0;
+	unsigned char *payload = (unsigned char *)emeta +
+				 sizeof(struct line_header) + sizeof(crc);
+	size_t len = lm->emeta_len[0] - sizeof(struct line_header) -
+		     sizeof(crc);
+
+	return crc32_le(crc, payload, len);
+}
+
+/*
+ * Compute the request flags for a program operation.
+ *
+ * Returns 0 when geo->version is NVM_OCSSD_SPEC_20. Otherwise returns
+ * geo->pln_mode >> 1, with NVM_IO_SCRAMBLE_ENABLE added when @type is
+ * PBLK_WRITE.
+ */
+static inline int pblk_set_progr_mode(struct pblk *pblk, int type)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	int flags;
+
+	if (geo->version == NVM_OCSSD_SPEC_20)
+		return 0;
+
+	flags = geo->pln_mode >> 1;
+
+	return (type == PBLK_WRITE) ? (flags | NVM_IO_SCRAMBLE_ENABLE) : flags;
+}
+
+/* Read access types accepted by pblk_set_read_mode() */
+enum {
+	/* plane mode bits are not added to the read flags */
+	PBLK_READ_RANDOM	= 0,
+	/* plane mode bits are added to the read flags */
+	PBLK_READ_SEQUENTIAL	= 1,
+};
+
+/*
+ * Compute the request flags for a read operation.
+ *
+ * Returns 0 when geo->version is NVM_OCSSD_SPEC_20. Otherwise returns
+ * NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE, with geo->pln_mode >> 1 added
+ * when @type is PBLK_READ_SEQUENTIAL.
+ */
+static inline int pblk_set_read_mode(struct pblk *pblk, int type)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	int flags = NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE;
+
+	if (geo->version == NVM_OCSSD_SPEC_20)
+		return 0;
+
+	if (type != PBLK_READ_SEQUENTIAL)
+		return flags;
+
+	return flags | (geo->pln_mode >> 1);
+}
+
+/* Return 1 when @nr_secs is a multiple of pblk->min_write_pgs, else 0. */
+static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs)
+{
+	return (nr_secs % pblk->min_write_pgs) == 0;
+}
+
+#ifdef CONFIG_NVM_PBLK_DEBUG
+/*
+ * Log address @p through pblk_err(), prefixed with @msg and @error (the
+ * latter printed in hex).
+ *
+ * A cached address prints its cache line. Otherwise the 1.2 field set is
+ * printed when geo->version is NVM_OCSSD_SPEC_12 and the grp/pu/chk/sec
+ * field set for any other version.
+ */
+static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p,
+			     char *msg, int error)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+
+	if (p->c.is_cached) {
+		pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n",
+			 msg, error, (u64)p->c.line);
+		return;
+	}
+
+	if (geo->version == NVM_OCSSD_SPEC_12) {
+		pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
+			 msg, error,
+			 p->g.ch, p->g.lun, p->g.blk,
+			 p->g.pg, p->g.pl, p->g.sec);
+		return;
+	}
+
+	pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n",
+		 msg, error,
+		 p->m.grp, p->m.pu, p->m.chk, p->m.sec);
+}
+
+/*
+ * Log the addresses of request @rqd together with @error.
+ *
+ * A single-address request prints rqd->ppa_addr and returns. Otherwise
+ * every ppa_list entry whose bit is set in rqd->ppa_status is printed,
+ * followed by a summary line with @error and the raw status word.
+ */
+static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
+					 int error)
+{
+	int bit;
+
+	if (rqd->nr_ppas == 1) {
+		print_ppa(pblk, &rqd->ppa_addr, "rqd", error);
+		return;
+	}
+
+	for (bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, 0);
+	     bit < rqd->nr_ppas;
+	     bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas,
+				 bit + 1))
+		print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error);
+
+	pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
+}
+
+/*
+ * Check that each of the @nr_ppas addresses in @ppas is a non-cached
+ * address whose fields are below the limits in the device geometry.
+ *
+ * The 1.2 field set is checked when geo->version is NVM_OCSSD_SPEC_12 and
+ * the grp/pu/chk/sec field set otherwise. The first offending address is
+ * printed (with its index as the error value) and scanning stops.
+ *
+ * Returns 0 when every address passes, 1 on the first failure.
+ */
+static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
+					   struct ppa_addr *ppas, int nr_ppas)
+{
+	struct nvm_geo *geo = &tgt_dev->geo;
+	int i;
+
+	for (i = 0; i < nr_ppas; i++) {
+		struct ppa_addr *ppa = &ppas[i];
+		int in_bounds;
+
+		if (ppa->c.is_cached)
+			in_bounds = 0;
+		else if (geo->version == NVM_OCSSD_SPEC_12)
+			in_bounds = ppa->g.ch < geo->num_ch &&
+				    ppa->g.lun < geo->num_lun &&
+				    ppa->g.pl < geo->num_pln &&
+				    ppa->g.blk < geo->num_chk &&
+				    ppa->g.pg < geo->num_pg &&
+				    ppa->g.sec < geo->ws_min;
+		else
+			in_bounds = ppa->m.grp < geo->num_ch &&
+				    ppa->m.pu < geo->num_lun &&
+				    ppa->m.chk < geo->num_chk &&
+				    ppa->m.sec < geo->clba;
+
+		if (in_bounds)
+			continue;
+
+		print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i);
+
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Sanity-check the addresses of request @rqd.
+ *
+ * All addresses must pass pblk_boundary_ppa_checks(). For NVM_OP_PWRITE
+ * requests, the line each address maps to must additionally be in
+ * PBLK_LINESTATE_OPEN; the state is read under the line lock.
+ *
+ * Returns 0 when the request passes, -EINVAL (after a WARN_ON) otherwise.
+ */
+static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct ppa_addr *ppa_list;
+	int i;
+
+	/* single-address requests carry the address inline */
+	if (rqd->nr_ppas > 1)
+		ppa_list = rqd->ppa_list;
+	else
+		ppa_list = &rqd->ppa_addr;
+
+	if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	if (rqd->opcode != NVM_OP_PWRITE)
+		return 0;
+
+	for (i = 0; i < rqd->nr_ppas; i++) {
+		struct ppa_addr ppa = ppa_list[i];
+		struct pblk_line *line = &pblk->lines[pblk_ppa_to_line(ppa)];
+
+		spin_lock(&line->lock);
+		if (line->state == PBLK_LINESTATE_OPEN) {
+			spin_unlock(&line->lock);
+			continue;
+		}
+
+		pblk_err(pblk, "bad ppa: line:%d,state:%d\n",
+			 line->id, line->state);
+		WARN_ON(1);
+		spin_unlock(&line->lock);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+#endif
+
+/*
+ * Return 1 when @paddr is greater than lm->sec_per_line, 0 otherwise.
+ *
+ * NOTE(review): a @paddr equal to sec_per_line is accepted; presumably
+ * callers also pass end-exclusive addresses - confirm before tightening
+ * the comparison.
+ */
+static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+
+	return (paddr > lm->sec_per_line) ? 1 : 0;
+}
+
+/* Return the bi_idx field of @bio's iterator. */
+static inline unsigned int pblk_get_bi_idx(struct bio *bio)
+{
+	unsigned int idx = bio->bi_iter.bi_idx;
+
+	return idx;
+}
+
+/* Return the bi_sector field of @bio's iterator divided by NR_PHY_IN_LOG. */
+static inline sector_t pblk_get_lba(struct bio *bio)
+{
+	sector_t lba = bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
+
+	return lba;
+}
+
+/*
+ * Return the bi_size field of @bio's iterator divided by
+ * PBLK_EXPOSED_PAGE_SIZE.
+ */
+static inline unsigned int pblk_get_secs(struct bio *bio)
+{
+	unsigned int nr_secs = bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
+
+	return nr_secs;
+}
+
+/*
+ * Generate a UUID with uuid_le_gen() and copy its 16 bytes into
+ * pblk->instance_uuid.
+ */
+static inline void pblk_setup_uuid(struct pblk *pblk)
+{
+	uuid_le fresh;
+
+	uuid_le_gen(&fresh);
+	memcpy(pblk->instance_uuid, fresh.b, 16);
+}
+#endif /* PBLK_H_ */