summaryrefslogtreecommitdiffstats
path: root/drivers/nvdimm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--drivers/nvdimm/Kconfig115
-rw-r--r--drivers/nvdimm/Makefile29
-rw-r--r--drivers/nvdimm/badrange.c293
-rw-r--r--drivers/nvdimm/blk.c351
-rw-r--r--drivers/nvdimm/btt.c1753
-rw-r--r--drivers/nvdimm/btt.h248
-rw-r--r--drivers/nvdimm/btt_devs.c375
-rw-r--r--drivers/nvdimm/bus.c1268
-rw-r--r--drivers/nvdimm/claim.c342
-rw-r--r--drivers/nvdimm/core.c473
-rw-r--r--drivers/nvdimm/dax_devs.c141
-rw-r--r--drivers/nvdimm/dimm.c142
-rw-r--r--drivers/nvdimm/dimm_devs.c717
-rw-r--r--drivers/nvdimm/e820.c94
-rw-r--r--drivers/nvdimm/label.c1212
-rw-r--r--drivers/nvdimm/label.h156
-rw-r--r--drivers/nvdimm/namespace_devs.c2622
-rw-r--r--drivers/nvdimm/nd-core.h137
-rw-r--r--drivers/nvdimm/nd.h430
-rw-r--r--drivers/nvdimm/of_pmem.c119
-rw-r--r--drivers/nvdimm/pfn.h58
-rw-r--r--drivers/nvdimm/pfn_devs.c748
-rw-r--r--drivers/nvdimm/pmem.c613
-rw-r--r--drivers/nvdimm/pmem.h43
-rw-r--r--drivers/nvdimm/region.c168
-rw-r--r--drivers/nvdimm/region_devs.c1235
26 files changed, 13882 insertions, 0 deletions
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
new file mode 100644
index 000000000..9d36473dc
--- /dev/null
+++ b/drivers/nvdimm/Kconfig
@@ -0,0 +1,115 @@
+menuconfig LIBNVDIMM
+ tristate "NVDIMM (Non-Volatile Memory Device) Support"
+ depends on PHYS_ADDR_T_64BIT
+ depends on HAS_IOMEM
+ depends on BLK_DEV
+ help
+ Generic support for non-volatile memory devices including
+ ACPI-6-NFIT defined resources. On platforms that define an
+ NFIT, or otherwise can discover NVDIMM resources, a libnvdimm
+ bus is registered to advertise PMEM (persistent memory)
+ namespaces (/dev/pmemX) and BLK (sliding mmio window(s))
+ namespaces (/dev/ndblkX.Y). A PMEM namespace refers to a
+ memory resource that may span multiple DIMMs and support DAX
+ (see CONFIG_DAX). A BLK namespace refers to an NVDIMM control
+ region which exposes an mmio register set for windowed access
+ mode to non-volatile memory.
+
+if LIBNVDIMM
+
+config BLK_DEV_PMEM
+ tristate "PMEM: Persistent memory block device support"
+ default LIBNVDIMM
+ select DAX_DRIVER
+ select ND_BTT if BTT
+ select ND_PFN if NVDIMM_PFN
+ help
+ Memory ranges for PMEM are described by either an NFIT
+ (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a
+ non-standard OEM-specific E820 memory type (type-12, see
+ CONFIG_X86_PMEM_LEGACY), or it is manually specified by the
+ 'memmap=nn[KMG]!ss[KMG]' kernel command line (see
+ Documentation/admin-guide/kernel-parameters.rst). This driver converts
+ these persistent memory ranges into block devices that are
+ capable of DAX (direct-access) file system mappings. See
+ Documentation/nvdimm/nvdimm.txt for more details.
+
+ Say Y if you want to use an NVDIMM
+
+config ND_BLK
+ tristate "BLK: Block data window (aperture) device support"
+ default LIBNVDIMM
+ select ND_BTT if BTT
+ help
+ Support NVDIMMs, or other devices, that implement a BLK-mode
+ access capability. BLK-mode access uses memory-mapped-i/o
+ apertures to access persistent media.
+
+ Say Y if your platform firmware emits an ACPI.NFIT table
+ (CONFIG_ACPI_NFIT), or otherwise exposes BLK-mode
+ capabilities.
+
+config ND_CLAIM
+ bool
+
+config ND_BTT
+ tristate
+
+config BTT
+ bool "BTT: Block Translation Table (atomic sector updates)"
+ default y if LIBNVDIMM
+ select ND_CLAIM
+ help
+ The Block Translation Table (BTT) provides atomic sector
+ update semantics for persistent memory devices, so that
+ applications that rely on sector writes not being torn (a
+ guarantee that typical disks provide) can continue to do so.
+ The BTT manifests itself as an alternate personality for an
+ NVDIMM namespace, i.e. a namespace can be in raw mode (pmemX,
+ ndblkX.Y, etc...), or 'sectored' mode, (pmemXs, ndblkX.Ys,
+ etc...).
+
+ Select Y if unsure
+
+config ND_PFN
+ tristate
+
+config NVDIMM_PFN
+ bool "PFN: Map persistent (device) memory"
+ default LIBNVDIMM
+ depends on ZONE_DEVICE
+ select ND_CLAIM
+ help
+ Map persistent memory, i.e. advertise it to the memory
+ management sub-system. By default persistent memory does
+ not support direct I/O, RDMA, or any other usage that
+ requires a 'struct page' to mediate an I/O request. This
+ driver allocates and initializes the infrastructure needed
+ to support those use cases.
+
+ Select Y if unsure
+
+config NVDIMM_DAX
+ bool "NVDIMM DAX: Raw access to persistent memory"
+ default LIBNVDIMM
+ depends on NVDIMM_PFN
+ help
+ Support raw device dax access to a persistent memory
+ namespace. For environments that want to hard partition
+ persistent memory, this capability provides a mechanism to
+ sub-divide a namespace into character devices that can only be
+ accessed via DAX (mmap(2)).
+
+ Select Y if unsure
+
+config OF_PMEM
+ tristate "Device-tree support for persistent memory regions"
+ depends on OF
+ default LIBNVDIMM
+ help
+ Allows regions of persistent memory to be described in the
+ device-tree.
+
+ Select Y if unsure.
+
+endif
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
new file mode 100644
index 000000000..e8847045d
--- /dev/null
+++ b/drivers/nvdimm/Makefile
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
+obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
+obj-$(CONFIG_ND_BTT) += nd_btt.o
+obj-$(CONFIG_ND_BLK) += nd_blk.o
+obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
+obj-$(CONFIG_OF_PMEM) += of_pmem.o
+
+nd_pmem-y := pmem.o
+
+nd_btt-y := btt.o
+
+nd_blk-y := blk.o
+
+nd_e820-y := e820.o
+
+libnvdimm-y := core.o
+libnvdimm-y += bus.o
+libnvdimm-y += dimm_devs.o
+libnvdimm-y += dimm.o
+libnvdimm-y += region_devs.o
+libnvdimm-y += region.o
+libnvdimm-y += namespace_devs.o
+libnvdimm-y += label.o
+libnvdimm-y += badrange.o
+libnvdimm-$(CONFIG_ND_CLAIM) += claim.o
+libnvdimm-$(CONFIG_BTT) += btt_devs.o
+libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o
+libnvdimm-$(CONFIG_NVDIMM_DAX) += dax_devs.o
diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c
new file mode 100644
index 000000000..e068d72b4
--- /dev/null
+++ b/drivers/nvdimm/badrange.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/libnvdimm.h>
+#include <linux/badblocks.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/ctype.h>
+#include <linux/ndctl.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include "nd-core.h"
+#include "nd.h"
+
+void badrange_init(struct badrange *badrange)
+{
+ INIT_LIST_HEAD(&badrange->list);
+ spin_lock_init(&badrange->lock);
+}
+EXPORT_SYMBOL_GPL(badrange_init);
+
+static void append_badrange_entry(struct badrange *badrange,
+ struct badrange_entry *bre, u64 addr, u64 length)
+{
+ lockdep_assert_held(&badrange->lock);
+ bre->start = addr;
+ bre->length = length;
+ list_add_tail(&bre->list, &badrange->list);
+}
+
+static int alloc_and_append_badrange_entry(struct badrange *badrange,
+ u64 addr, u64 length, gfp_t flags)
+{
+ struct badrange_entry *bre;
+
+ bre = kzalloc(sizeof(*bre), flags);
+ if (!bre)
+ return -ENOMEM;
+
+ append_badrange_entry(badrange, bre, addr, length);
+ return 0;
+}
+
+static int add_badrange(struct badrange *badrange, u64 addr, u64 length)
+{
+ struct badrange_entry *bre, *bre_new;
+
+ spin_unlock(&badrange->lock);
+ bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL);
+ spin_lock(&badrange->lock);
+
+ if (list_empty(&badrange->list)) {
+ if (!bre_new)
+ return -ENOMEM;
+ append_badrange_entry(badrange, bre_new, addr, length);
+ return 0;
+ }
+
+ /*
+ * There is a chance this is a duplicate, check for those first.
+ * This will be the common case as ARS_STATUS returns all known
+ * errors in the SPA space, and we can't query it per region
+ */
+ list_for_each_entry(bre, &badrange->list, list)
+ if (bre->start == addr) {
+ /* If length has changed, update this list entry */
+ if (bre->length != length)
+ bre->length = length;
+ kfree(bre_new);
+ return 0;
+ }
+
+ /*
+ * If not a duplicate or a simple length update, add the entry as is,
+ * as any overlapping ranges will get resolved when the list is consumed
+ * and converted to badblocks
+ */
+ if (!bre_new)
+ return -ENOMEM;
+ append_badrange_entry(badrange, bre_new, addr, length);
+
+ return 0;
+}
+
+int badrange_add(struct badrange *badrange, u64 addr, u64 length)
+{
+ int rc;
+
+ spin_lock(&badrange->lock);
+ rc = add_badrange(badrange, addr, length);
+ spin_unlock(&badrange->lock);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(badrange_add);
+
+void badrange_forget(struct badrange *badrange, phys_addr_t start,
+ unsigned int len)
+{
+ struct list_head *badrange_list = &badrange->list;
+ u64 clr_end = start + len - 1;
+ struct badrange_entry *bre, *next;
+
+ spin_lock(&badrange->lock);
+
+ /*
+ * [start, clr_end] is the badrange interval being cleared.
+ * [bre->start, bre_end] is the badrange_list entry we're comparing
+ * the above interval against. The badrange list entry may need
+ * to be modified (update either start or length), deleted, or
+ * split into two based on the overlap characteristics
+ */
+
+ list_for_each_entry_safe(bre, next, badrange_list, list) {
+ u64 bre_end = bre->start + bre->length - 1;
+
+ /* Skip intervals with no intersection */
+ if (bre_end < start)
+ continue;
+ if (bre->start > clr_end)
+ continue;
+ /* Delete completely overlapped badrange entries */
+ if ((bre->start >= start) && (bre_end <= clr_end)) {
+ list_del(&bre->list);
+ kfree(bre);
+ continue;
+ }
+ /* Adjust start point of partially cleared entries */
+ if ((start <= bre->start) && (clr_end > bre->start)) {
+ bre->length -= clr_end - bre->start + 1;
+ bre->start = clr_end + 1;
+ continue;
+ }
+ /* Adjust bre->length for partial clearing at the tail end */
+ if ((bre->start < start) && (bre_end <= clr_end)) {
+ /* bre->start remains the same */
+ bre->length = start - bre->start;
+ continue;
+ }
+ /*
+ * If clearing in the middle of an entry, we split it into
+ * two by modifying the current entry to represent one half of
+ * the split, and adding a new entry for the second half.
+ */
+ if ((bre->start < start) && (bre_end > clr_end)) {
+ u64 new_start = clr_end + 1;
+ u64 new_len = bre_end - new_start + 1;
+
+ /* Add new entry covering the right half */
+ alloc_and_append_badrange_entry(badrange, new_start,
+ new_len, GFP_NOWAIT);
+ /* Adjust this entry to cover the left half */
+ bre->length = start - bre->start;
+ continue;
+ }
+ }
+ spin_unlock(&badrange->lock);
+}
+EXPORT_SYMBOL_GPL(badrange_forget);
+
+static void set_badblock(struct badblocks *bb, sector_t s, int num)
+{
+ dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n",
+ (u64) s * 512, (u64) num * 512);
+ /* this isn't an error as the hardware will still throw an exception */
+ if (badblocks_set(bb, s, num, 1))
+ dev_info_once(bb->dev, "%s: failed for sector %llx\n",
+ __func__, (u64) s);
+}
+
+/**
+ * __add_badblock_range() - Convert a physical address range to bad sectors
+ * @bb: badblocks instance to populate
+ * @ns_offset: namespace offset where the error range begins (in bytes)
+ * @len: number of bytes of badrange to be added
+ *
+ * This assumes that the range provided with (ns_offset, len) is within
+ * the bounds of physical addresses for this namespace, i.e. lies in the
+ * interval [ns_start, ns_start + ns_size)
+ */
+static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
+{
+ const unsigned int sector_size = 512;
+ sector_t start_sector, end_sector;
+ u64 num_sectors;
+ u32 rem;
+
+ start_sector = div_u64(ns_offset, sector_size);
+ end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
+ if (rem)
+ end_sector++;
+ num_sectors = end_sector - start_sector;
+
+ if (unlikely(num_sectors > (u64)INT_MAX)) {
+ u64 remaining = num_sectors;
+ sector_t s = start_sector;
+
+ while (remaining) {
+ int done = min_t(u64, remaining, INT_MAX);
+
+ set_badblock(bb, s, done);
+ remaining -= done;
+ s += done;
+ }
+ } else
+ set_badblock(bb, start_sector, num_sectors);
+}
+
+static void badblocks_populate(struct badrange *badrange,
+ struct badblocks *bb, const struct resource *res)
+{
+ struct badrange_entry *bre;
+
+ if (list_empty(&badrange->list))
+ return;
+
+ list_for_each_entry(bre, &badrange->list, list) {
+ u64 bre_end = bre->start + bre->length - 1;
+
+ /* Discard intervals with no intersection */
+ if (bre_end < res->start)
+ continue;
+ if (bre->start > res->end)
+ continue;
+ /* Deal with any overlap after start of the namespace */
+ if (bre->start >= res->start) {
+ u64 start = bre->start;
+ u64 len;
+
+ if (bre_end <= res->end)
+ len = bre->length;
+ else
+ len = res->start + resource_size(res)
+ - bre->start;
+ __add_badblock_range(bb, start - res->start, len);
+ continue;
+ }
+ /*
+ * Deal with overlap for badrange starting before
+ * the namespace.
+ */
+ if (bre->start < res->start) {
+ u64 len;
+
+ if (bre_end < res->end)
+ len = bre->start + bre->length - res->start;
+ else
+ len = resource_size(res);
+ __add_badblock_range(bb, 0, len);
+ }
+ }
+}
+
+/**
+ * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks
+ * @region: parent region of the range to interrogate
+ * @bb: badblocks instance to populate
+ * @res: resource range to consider
+ *
+ * The badrange list generated during bus initialization may contain
+ * multiple, possibly overlapping physical address ranges. Compare each
+ * of these ranges to the resource range currently being initialized,
+ * and add badblocks entries for all matching sub-ranges
+ */
+void nvdimm_badblocks_populate(struct nd_region *nd_region,
+ struct badblocks *bb, const struct resource *res)
+{
+ struct nvdimm_bus *nvdimm_bus;
+
+ if (!is_memory(&nd_region->dev)) {
+ dev_WARN_ONCE(&nd_region->dev, 1,
+ "%s only valid for pmem regions\n", __func__);
+ return;
+ }
+ nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+
+ nvdimm_bus_lock(&nvdimm_bus->dev);
+ badblocks_populate(&nvdimm_bus->badrange, bb, res);
+ nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
new file mode 100644
index 000000000..db45c6bbb
--- /dev/null
+++ b/drivers/nvdimm/blk.c
@@ -0,0 +1,351 @@
+/*
+ * NVDIMM Block Window Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/nd.h>
+#include <linux/sizes.h>
+#include "nd.h"
+
+static u32 nsblk_meta_size(struct nd_namespace_blk *nsblk)
+{
+ return nsblk->lbasize - ((nsblk->lbasize >= 4096) ? 4096 : 512);
+}
+
+static u32 nsblk_internal_lbasize(struct nd_namespace_blk *nsblk)
+{
+ return roundup(nsblk->lbasize, INT_LBASIZE_ALIGNMENT);
+}
+
+static u32 nsblk_sector_size(struct nd_namespace_blk *nsblk)
+{
+ return nsblk->lbasize - nsblk_meta_size(nsblk);
+}
+
+static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk,
+ resource_size_t ns_offset, unsigned int len)
+{
+ int i;
+
+ for (i = 0; i < nsblk->num_resources; i++) {
+ if (ns_offset < resource_size(nsblk->res[i])) {
+ if (ns_offset + len > resource_size(nsblk->res[i])) {
+ dev_WARN_ONCE(&nsblk->common.dev, 1,
+ "illegal request\n");
+ return SIZE_MAX;
+ }
+ return nsblk->res[i]->start + ns_offset;
+ }
+ ns_offset -= resource_size(nsblk->res[i]);
+ }
+
+ dev_WARN_ONCE(&nsblk->common.dev, 1, "request out of range\n");
+ return SIZE_MAX;
+}
+
+static struct nd_blk_region *to_ndbr(struct nd_namespace_blk *nsblk)
+{
+ struct nd_region *nd_region;
+ struct device *parent;
+
+ parent = nsblk->common.dev.parent;
+ nd_region = container_of(parent, struct nd_region, dev);
+ return container_of(nd_region, struct nd_blk_region, nd_region);
+}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk,
+ struct bio_integrity_payload *bip, u64 lba, int rw)
+{
+ struct nd_blk_region *ndbr = to_ndbr(nsblk);
+ unsigned int len = nsblk_meta_size(nsblk);
+ resource_size_t dev_offset, ns_offset;
+ u32 internal_lbasize, sector_size;
+ int err = 0;
+
+ internal_lbasize = nsblk_internal_lbasize(nsblk);
+ sector_size = nsblk_sector_size(nsblk);
+ ns_offset = lba * internal_lbasize + sector_size;
+ dev_offset = to_dev_offset(nsblk, ns_offset, len);
+ if (dev_offset == SIZE_MAX)
+ return -EIO;
+
+ while (len) {
+ unsigned int cur_len;
+ struct bio_vec bv;
+ void *iobuf;
+
+ bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
+ /*
+ * The 'bv' obtained from bvec_iter_bvec has its .bv_len and
+ * .bv_offset already adjusted for iter->bi_bvec_done, and we
+ * can use those directly
+ */
+
+ cur_len = min(len, bv.bv_len);
+ iobuf = kmap_atomic(bv.bv_page);
+ err = ndbr->do_io(ndbr, dev_offset, iobuf + bv.bv_offset,
+ cur_len, rw);
+ kunmap_atomic(iobuf);
+ if (err)
+ return err;
+
+ len -= cur_len;
+ dev_offset += cur_len;
+ if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len))
+ return -EIO;
+ }
+
+ return err;
+}
+
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk,
+ struct bio_integrity_payload *bip, u64 lba, int rw)
+{
+ return 0;
+}
+#endif
+
+static int nsblk_do_bvec(struct nd_namespace_blk *nsblk,
+ struct bio_integrity_payload *bip, struct page *page,
+ unsigned int len, unsigned int off, int rw, sector_t sector)
+{
+ struct nd_blk_region *ndbr = to_ndbr(nsblk);
+ resource_size_t dev_offset, ns_offset;
+ u32 internal_lbasize, sector_size;
+ int err = 0;
+ void *iobuf;
+ u64 lba;
+
+ internal_lbasize = nsblk_internal_lbasize(nsblk);
+ sector_size = nsblk_sector_size(nsblk);
+ while (len) {
+ unsigned int cur_len;
+
+ /*
+ * If we don't have an integrity payload, we don't have to
+ * split the bvec into sectors, as this would cause unnecessary
+ * Block Window setup/move steps. the do_io routine is capable
+ * of handling len <= PAGE_SIZE.
+ */
+ cur_len = bip ? min(len, sector_size) : len;
+
+ lba = div_u64(sector << SECTOR_SHIFT, sector_size);
+ ns_offset = lba * internal_lbasize;
+ dev_offset = to_dev_offset(nsblk, ns_offset, cur_len);
+ if (dev_offset == SIZE_MAX)
+ return -EIO;
+
+ iobuf = kmap_atomic(page);
+ err = ndbr->do_io(ndbr, dev_offset, iobuf + off, cur_len, rw);
+ kunmap_atomic(iobuf);
+ if (err)
+ return err;
+
+ if (bip) {
+ err = nd_blk_rw_integrity(nsblk, bip, lba, rw);
+ if (err)
+ return err;
+ }
+ len -= cur_len;
+ off += cur_len;
+ sector += sector_size >> SECTOR_SHIFT;
+ }
+
+ return err;
+}
+
+static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct bio_integrity_payload *bip;
+ struct nd_namespace_blk *nsblk;
+ struct bvec_iter iter;
+ unsigned long start;
+ struct bio_vec bvec;
+ int err = 0, rw;
+ bool do_acct;
+
+ if (!bio_integrity_prep(bio))
+ return BLK_QC_T_NONE;
+
+ bip = bio_integrity(bio);
+ nsblk = q->queuedata;
+ rw = bio_data_dir(bio);
+ do_acct = nd_iostat_start(bio, &start);
+ bio_for_each_segment(bvec, bio, iter) {
+ unsigned int len = bvec.bv_len;
+
+ BUG_ON(len > PAGE_SIZE);
+ err = nsblk_do_bvec(nsblk, bip, bvec.bv_page, len,
+ bvec.bv_offset, rw, iter.bi_sector);
+ if (err) {
+ dev_dbg(&nsblk->common.dev,
+ "io error in %s sector %lld, len %d,\n",
+ (rw == READ) ? "READ" : "WRITE",
+ (unsigned long long) iter.bi_sector, len);
+ bio->bi_status = errno_to_blk_status(err);
+ break;
+ }
+ }
+ if (do_acct)
+ nd_iostat_end(bio, start);
+
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
+}
+
+static int nsblk_rw_bytes(struct nd_namespace_common *ndns,
+ resource_size_t offset, void *iobuf, size_t n, int rw,
+ unsigned long flags)
+{
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev);
+ struct nd_blk_region *ndbr = to_ndbr(nsblk);
+ resource_size_t dev_offset;
+
+ dev_offset = to_dev_offset(nsblk, offset, n);
+
+ if (unlikely(offset + n > nsblk->size)) {
+ dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
+ return -EFAULT;
+ }
+
+ if (dev_offset == SIZE_MAX)
+ return -EIO;
+
+ return ndbr->do_io(ndbr, dev_offset, iobuf, n, rw);
+}
+
+static const struct block_device_operations nd_blk_fops = {
+ .owner = THIS_MODULE,
+ .revalidate_disk = nvdimm_revalidate_disk,
+};
+
+static void nd_blk_release_queue(void *q)
+{
+ blk_cleanup_queue(q);
+}
+
+static void nd_blk_release_disk(void *disk)
+{
+ del_gendisk(disk);
+ put_disk(disk);
+}
+
+static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
+{
+ struct device *dev = &nsblk->common.dev;
+ resource_size_t available_disk_size;
+ struct request_queue *q;
+ struct gendisk *disk;
+ u64 internal_nlba;
+
+ internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk));
+ available_disk_size = internal_nlba * nsblk_sector_size(nsblk);
+
+ q = blk_alloc_queue(GFP_KERNEL);
+ if (!q)
+ return -ENOMEM;
+ if (devm_add_action_or_reset(dev, nd_blk_release_queue, q))
+ return -ENOMEM;
+
+ blk_queue_make_request(q, nd_blk_make_request);
+ blk_queue_max_hw_sectors(q, UINT_MAX);
+ blk_queue_logical_block_size(q, nsblk_sector_size(nsblk));
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+ q->queuedata = nsblk;
+
+ disk = alloc_disk(0);
+ if (!disk)
+ return -ENOMEM;
+
+ disk->first_minor = 0;
+ disk->fops = &nd_blk_fops;
+ disk->queue = q;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name);
+
+ if (devm_add_action_or_reset(dev, nd_blk_release_disk, disk))
+ return -ENOMEM;
+
+ if (nsblk_meta_size(nsblk)) {
+ int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));
+
+ if (rc)
+ return rc;
+ }
+
+ set_capacity(disk, available_disk_size >> SECTOR_SHIFT);
+ device_add_disk(dev, disk, NULL);
+ revalidate_disk(disk);
+ return 0;
+}
+
+static int nd_blk_probe(struct device *dev)
+{
+ struct nd_namespace_common *ndns;
+ struct nd_namespace_blk *nsblk;
+
+ ndns = nvdimm_namespace_common_probe(dev);
+ if (IS_ERR(ndns))
+ return PTR_ERR(ndns);
+
+ nsblk = to_nd_namespace_blk(&ndns->dev);
+ nsblk->size = nvdimm_namespace_capacity(ndns);
+ dev_set_drvdata(dev, nsblk);
+
+ ndns->rw_bytes = nsblk_rw_bytes;
+ if (is_nd_btt(dev))
+ return nvdimm_namespace_attach_btt(ndns);
+ else if (nd_btt_probe(dev, ndns) == 0) {
+ /* we'll come back as btt-blk */
+ return -ENXIO;
+ } else
+ return nsblk_attach_disk(nsblk);
+}
+
+static int nd_blk_remove(struct device *dev)
+{
+ if (is_nd_btt(dev))
+ nvdimm_namespace_detach_btt(to_nd_btt(dev));
+ return 0;
+}
+
+static struct nd_device_driver nd_blk_driver = {
+ .probe = nd_blk_probe,
+ .remove = nd_blk_remove,
+ .drv = {
+ .name = "nd_blk",
+ },
+ .type = ND_DRIVER_NAMESPACE_BLK,
+};
+
+static int __init nd_blk_init(void)
+{
+ return nd_driver_register(&nd_blk_driver);
+}
+
+static void __exit nd_blk_exit(void)
+{
+ driver_unregister(&nd_blk_driver.drv);
+}
+
+MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_BLK);
+module_init(nd_blk_init);
+module_exit(nd_blk_exit);
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
new file mode 100644
index 000000000..b6823e66a
--- /dev/null
+++ b/drivers/nvdimm/btt.c
@@ -0,0 +1,1753 @@
+/*
+ * Block Translation Table
+ * Copyright (c) 2014-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/highmem.h>
+#include <linux/debugfs.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/hdreg.h>
+#include <linux/genhd.h>
+#include <linux/sizes.h>
+#include <linux/ndctl.h>
+#include <linux/fs.h>
+#include <linux/nd.h>
+#include <linux/backing-dev.h>
+#include "btt.h"
+#include "nd.h"
+
+enum log_ent_request {
+ LOG_NEW_ENT = 0,
+ LOG_OLD_ENT
+};
+
+static struct device *to_dev(struct arena_info *arena)
+{
+ return &arena->nd_btt->dev;
+}
+
+static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset)
+{
+ return offset + nd_btt->initial_offset;
+}
+
+static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
+ void *buf, size_t n, unsigned long flags)
+{
+ struct nd_btt *nd_btt = arena->nd_btt;
+ struct nd_namespace_common *ndns = nd_btt->ndns;
+
+ /* arena offsets may be shifted from the base of the device */
+ offset = adjust_initial_offset(nd_btt, offset);
+ return nvdimm_read_bytes(ndns, offset, buf, n, flags);
+}
+
+static int arena_write_bytes(struct arena_info *arena, resource_size_t offset,
+ void *buf, size_t n, unsigned long flags)
+{
+ struct nd_btt *nd_btt = arena->nd_btt;
+ struct nd_namespace_common *ndns = nd_btt->ndns;
+
+ /* arena offsets may be shifted from the base of the device */
+ offset = adjust_initial_offset(nd_btt, offset);
+ return nvdimm_write_bytes(ndns, offset, buf, n, flags);
+}
+
+static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
+{
+ int ret;
+
+ /*
+ * infooff and info2off should always be at least 512B aligned.
+ * We rely on that to make sure rw_bytes does error clearing
+ * correctly, so make sure that is the case.
+ */
+ dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512),
+ "arena->infooff: %#llx is unaligned\n", arena->infooff);
+ dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512),
+ "arena->info2off: %#llx is unaligned\n", arena->info2off);
+
+ ret = arena_write_bytes(arena, arena->info2off, super,
+ sizeof(struct btt_sb), 0);
+ if (ret)
+ return ret;
+
+ return arena_write_bytes(arena, arena->infooff, super,
+ sizeof(struct btt_sb), 0);
+}
+
+static int btt_info_read(struct arena_info *arena, struct btt_sb *super)
+{
+ return arena_read_bytes(arena, arena->infooff, super,
+ sizeof(struct btt_sb), 0);
+}
+
+/*
+ * 'raw' version of btt_map write
+ * Assumptions:
+ * mapping is in little-endian
+ * mapping contains 'E' and 'Z' flags as desired
+ */
+static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping,
+ unsigned long flags)
+{
+ u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
+
+ if (unlikely(lba >= arena->external_nlba))
+ dev_err_ratelimited(to_dev(arena),
+ "%s: lba %#x out of range (max: %#x)\n",
+ __func__, lba, arena->external_nlba);
+ return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags);
+}
+
+static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
+ u32 z_flag, u32 e_flag, unsigned long rwb_flags)
+{
+ u32 ze;
+ __le32 mapping_le;
+
+ /*
+ * This 'mapping' is supposed to be just the LBA mapping, without
+ * any flags set, so strip the flag bits.
+ */
+ mapping = ent_lba(mapping);
+
+ ze = (z_flag << 1) + e_flag;
+ switch (ze) {
+ case 0:
+ /*
+ * We want to set neither of the Z or E flags, and
+ * in the actual layout, this means setting the bit
+ * positions of both to '1' to indicate a 'normal'
+ * map entry
+ */
+ mapping |= MAP_ENT_NORMAL;
+ break;
+ case 1:
+ mapping |= (1 << MAP_ERR_SHIFT);
+ break;
+ case 2:
+ mapping |= (1 << MAP_TRIM_SHIFT);
+ break;
+ default:
+ /*
+ * The case where Z and E are both sent in as '1' could be
+ * construed as a valid 'normal' case, but we decide not to,
+ * to avoid confusion
+ */
+ dev_err_ratelimited(to_dev(arena),
+ "Invalid use of Z and E flags\n");
+ return -EIO;
+ }
+
+ mapping_le = cpu_to_le32(mapping);
+ return __btt_map_write(arena, lba, mapping_le, rwb_flags);
+}
+
+static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
+ int *trim, int *error, unsigned long rwb_flags)
+{
+ int ret;
+ __le32 in;
+ u32 raw_mapping, postmap, ze, z_flag, e_flag;
+ u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
+
+ if (unlikely(lba >= arena->external_nlba))
+ dev_err_ratelimited(to_dev(arena),
+ "%s: lba %#x out of range (max: %#x)\n",
+ __func__, lba, arena->external_nlba);
+
+ ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags);
+ if (ret)
+ return ret;
+
+ raw_mapping = le32_to_cpu(in);
+
+ z_flag = ent_z_flag(raw_mapping);
+ e_flag = ent_e_flag(raw_mapping);
+ ze = (z_flag << 1) + e_flag;
+ postmap = ent_lba(raw_mapping);
+
+ /* Reuse the {z,e}_flag variables for *trim and *error */
+ z_flag = 0;
+ e_flag = 0;
+
+ switch (ze) {
+ case 0:
+ /* Initial state. Return postmap = premap */
+ *mapping = lba;
+ break;
+ case 1:
+ *mapping = postmap;
+ e_flag = 1;
+ break;
+ case 2:
+ *mapping = postmap;
+ z_flag = 1;
+ break;
+ case 3:
+ *mapping = postmap;
+ break;
+ default:
+ return -EIO;
+ }
+
+ if (trim)
+ *trim = z_flag;
+ if (error)
+ *error = e_flag;
+
+ return ret;
+}
+
+static int btt_log_group_read(struct arena_info *arena, u32 lane,
+ struct log_group *log)
+{
+ return arena_read_bytes(arena,
+ arena->logoff + (lane * LOG_GRP_SIZE), log,
+ LOG_GRP_SIZE, 0);
+}
+
+static struct dentry *debugfs_root;
+
+static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
+ int idx)
+{
+ char dirname[32];
+ struct dentry *d;
+
+ /* If for some reason, parent bttN was not created, exit */
+ if (!parent)
+ return;
+
+ snprintf(dirname, 32, "arena%d", idx);
+ d = debugfs_create_dir(dirname, parent);
+ if (IS_ERR_OR_NULL(d))
+ return;
+ a->debugfs_dir = d;
+
+ debugfs_create_x64("size", S_IRUGO, d, &a->size);
+ debugfs_create_x64("external_lba_start", S_IRUGO, d,
+ &a->external_lba_start);
+ debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba);
+ debugfs_create_u32("internal_lbasize", S_IRUGO, d,
+ &a->internal_lbasize);
+ debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba);
+ debugfs_create_u32("external_lbasize", S_IRUGO, d,
+ &a->external_lbasize);
+ debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree);
+ debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major);
+ debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor);
+ debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff);
+ debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff);
+ debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff);
+ debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff);
+ debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
+ debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
+ debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
+ debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]);
+ debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]);
+}
+
+static void btt_debugfs_init(struct btt *btt)
+{
+ int i = 0;
+ struct arena_info *arena;
+
+ btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev),
+ debugfs_root);
+ if (IS_ERR_OR_NULL(btt->debugfs_dir))
+ return;
+
+ list_for_each_entry(arena, &btt->arena_list, list) {
+ arena_debugfs_init(arena, btt->debugfs_dir, i);
+ i++;
+ }
+}
+
+static u32 log_seq(struct log_group *log, int log_idx)
+{
+ return le32_to_cpu(log->ent[log_idx].seq);
+}
+
+/*
+ * This function accepts two log entries, and uses the
+ * sequence number to find the 'older' entry.
+ * It also updates the sequence number in this old entry to
+ * make it the 'new' one if the mark_flag is set.
+ * Finally, it returns which of the entries was the older one.
+ *
+ * TODO The logic feels a bit kludge-y. make it better..
+ */
+static int btt_log_get_old(struct arena_info *a, struct log_group *log)
+{
+ int idx0 = a->log_index[0];
+ int idx1 = a->log_index[1];
+ int old;
+
+ /*
+ * the first ever time this is seen, the entry goes into [0]
+ * the next time, the following logic works out to put this
+ * (next) entry into [1]
+ */
+ if (log_seq(log, idx0) == 0) {
+ log->ent[idx0].seq = cpu_to_le32(1);
+ return 0;
+ }
+
+ if (log_seq(log, idx0) == log_seq(log, idx1))
+ return -EINVAL;
+ if (log_seq(log, idx0) + log_seq(log, idx1) > 5)
+ return -EINVAL;
+
+ if (log_seq(log, idx0) < log_seq(log, idx1)) {
+ if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1)
+ old = 0;
+ else
+ old = 1;
+ } else {
+ if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1)
+ old = 1;
+ else
+ old = 0;
+ }
+
+ return old;
+}
+
+/*
+ * This function copies the desired (old/new) log entry into ent if
+ * it is not NULL. It returns the sub-slot number (0 or 1)
+ * where the desired log entry was found. Negative return values
+ * indicate errors.
+ */
+static int btt_log_read(struct arena_info *arena, u32 lane,
+ struct log_entry *ent, int old_flag)
+{
+ int ret;
+ int old_ent, ret_ent;
+ struct log_group log;
+
+ ret = btt_log_group_read(arena, lane, &log);
+ if (ret)
+ return -EIO;
+
+ old_ent = btt_log_get_old(arena, &log);
+ if (old_ent < 0 || old_ent > 1) {
+ dev_err(to_dev(arena),
+ "log corruption (%d): lane %d seq [%d, %d]\n",
+ old_ent, lane, log.ent[arena->log_index[0]].seq,
+ log.ent[arena->log_index[1]].seq);
+ /* TODO set error state? */
+ return -EIO;
+ }
+
+ ret_ent = (old_flag ? old_ent : (1 - old_ent));
+
+ if (ent != NULL)
+ memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE);
+
+ return ret_ent;
+}
+
+/*
+ * This function commits a log entry to media
+ * It does _not_ prepare the freelist entry for the next write
+ * btt_flog_write is the wrapper for updating the freelist elements
+ */
+static int __btt_log_write(struct arena_info *arena, u32 lane,
+ u32 sub, struct log_entry *ent, unsigned long flags)
+{
+ int ret;
+ u32 group_slot = arena->log_index[sub];
+ unsigned int log_half = LOG_ENT_SIZE / 2;
+ void *src = ent;
+ u64 ns_off;
+
+ ns_off = arena->logoff + (lane * LOG_GRP_SIZE) +
+ (group_slot * LOG_ENT_SIZE);
+ /* split the 16B write into atomic, durable halves */
+ ret = arena_write_bytes(arena, ns_off, src, log_half, flags);
+ if (ret)
+ return ret;
+
+ ns_off += log_half;
+ src += log_half;
+ return arena_write_bytes(arena, ns_off, src, log_half, flags);
+}
+
+static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub,
+ struct log_entry *ent)
+{
+ int ret;
+
+ ret = __btt_log_write(arena, lane, sub, ent, NVDIMM_IO_ATOMIC);
+ if (ret)
+ return ret;
+
+ /* prepare the next free entry */
+ arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
+ if (++(arena->freelist[lane].seq) == 4)
+ arena->freelist[lane].seq = 1;
+ if (ent_e_flag(le32_to_cpu(ent->old_map)))
+ arena->freelist[lane].has_err = 1;
+ arena->freelist[lane].block = ent_lba(le32_to_cpu(ent->old_map));
+
+ return ret;
+}
+
+/*
+ * This function initializes the BTT map to the initial state, which is
+ * all-zeroes, and indicates an identity mapping
+ */
+static int btt_map_init(struct arena_info *arena)
+{
+ int ret = -EINVAL;
+ void *zerobuf;
+ size_t offset = 0;
+ size_t chunk_size = SZ_2M;
+ size_t mapsize = arena->logoff - arena->mapoff;
+
+ zerobuf = kzalloc(chunk_size, GFP_KERNEL);
+ if (!zerobuf)
+ return -ENOMEM;
+
+ /*
+ * mapoff should always be at least 512B aligned. We rely on that to
+ * make sure rw_bytes does error clearing correctly, so make sure that
+ * is the case.
+ */
+ dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512),
+ "arena->mapoff: %#llx is unaligned\n", arena->mapoff);
+
+ while (mapsize) {
+ size_t size = min(mapsize, chunk_size);
+
+ dev_WARN_ONCE(to_dev(arena), size < 512,
+ "chunk size: %#zx is unaligned\n", size);
+ ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
+ size, 0);
+ if (ret)
+ goto free;
+
+ offset += size;
+ mapsize -= size;
+ cond_resched();
+ }
+
+ free:
+ kfree(zerobuf);
+ return ret;
+}
+
+/*
+ * This function initializes the BTT log with 'fake' entries pointing
+ * to the initial reserved set of blocks as being free
+ */
+static int btt_log_init(struct arena_info *arena)
+{
+ size_t logsize = arena->info2off - arena->logoff;
+ size_t chunk_size = SZ_4K, offset = 0;
+ struct log_entry ent;
+ void *zerobuf;
+ int ret;
+ u32 i;
+
+ zerobuf = kzalloc(chunk_size, GFP_KERNEL);
+ if (!zerobuf)
+ return -ENOMEM;
+ /*
+ * logoff should always be at least 512B aligned. We rely on that to
+ * make sure rw_bytes does error clearing correctly, so make sure that
+ * is the case.
+ */
+ dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512),
+ "arena->logoff: %#llx is unaligned\n", arena->logoff);
+
+ while (logsize) {
+ size_t size = min(logsize, chunk_size);
+
+ dev_WARN_ONCE(to_dev(arena), size < 512,
+ "chunk size: %#zx is unaligned\n", size);
+ ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf,
+ size, 0);
+ if (ret)
+ goto free;
+
+ offset += size;
+ logsize -= size;
+ cond_resched();
+ }
+
+ for (i = 0; i < arena->nfree; i++) {
+ ent.lba = cpu_to_le32(i);
+ ent.old_map = cpu_to_le32(arena->external_nlba + i);
+ ent.new_map = cpu_to_le32(arena->external_nlba + i);
+ ent.seq = cpu_to_le32(LOG_SEQ_INIT);
+ ret = __btt_log_write(arena, i, 0, &ent, 0);
+ if (ret)
+ goto free;
+ }
+
+ free:
+ kfree(zerobuf);
+ return ret;
+}
+
+static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
+{
+ return arena->dataoff + ((u64)lba * arena->internal_lbasize);
+}
+
+static int arena_clear_freelist_error(struct arena_info *arena, u32 lane)
+{
+ int ret = 0;
+
+ if (arena->freelist[lane].has_err) {
+ void *zero_page = page_address(ZERO_PAGE(0));
+ u32 lba = arena->freelist[lane].block;
+ u64 nsoff = to_namespace_offset(arena, lba);
+ unsigned long len = arena->sector_size;
+
+ mutex_lock(&arena->err_lock);
+
+ while (len) {
+ unsigned long chunk = min(len, PAGE_SIZE);
+
+ ret = arena_write_bytes(arena, nsoff, zero_page,
+ chunk, 0);
+ if (ret)
+ break;
+ len -= chunk;
+ nsoff += chunk;
+ if (len == 0)
+ arena->freelist[lane].has_err = 0;
+ }
+ mutex_unlock(&arena->err_lock);
+ }
+ return ret;
+}
+
+static int btt_freelist_init(struct arena_info *arena)
+{
+ int new, ret;
+ struct log_entry log_new;
+ u32 i, map_entry, log_oldmap, log_newmap;
+
+ arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry),
+ GFP_KERNEL);
+ if (!arena->freelist)
+ return -ENOMEM;
+
+ for (i = 0; i < arena->nfree; i++) {
+ new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT);
+ if (new < 0)
+ return new;
+
+ /* old and new map entries with any flags stripped out */
+ log_oldmap = ent_lba(le32_to_cpu(log_new.old_map));
+ log_newmap = ent_lba(le32_to_cpu(log_new.new_map));
+
+ /* sub points to the next one to be overwritten */
+ arena->freelist[i].sub = 1 - new;
+ arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
+ arena->freelist[i].block = log_oldmap;
+
+ /*
+ * FIXME: if error clearing fails during init, we want to make
+ * the BTT read-only
+ */
+ if (ent_e_flag(le32_to_cpu(log_new.old_map)) &&
+ !ent_normal(le32_to_cpu(log_new.old_map))) {
+ arena->freelist[i].has_err = 1;
+ ret = arena_clear_freelist_error(arena, i);
+ if (ret)
+ dev_err_ratelimited(to_dev(arena),
+ "Unable to clear known errors\n");
+ }
+
+ /* This implies a newly created or untouched flog entry */
+ if (log_oldmap == log_newmap)
+ continue;
+
+ /* Check if map recovery is needed */
+ ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ /*
+ * The map_entry from btt_read_map is stripped of any flag bits,
+ * so use the stripped out versions from the log as well for
+ * testing whether recovery is needed. For restoration, use the
+ * 'raw' version of the log entries as that captured what we
+ * were going to write originally.
+ */
+ if ((log_newmap != map_entry) && (log_oldmap == map_entry)) {
+ /*
+ * Last transaction wrote the flog, but wasn't able
+ * to complete the map write. So fix up the map.
+ */
+ ret = btt_map_write(arena, le32_to_cpu(log_new.lba),
+ le32_to_cpu(log_new.new_map), 0, 0, 0);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static bool ent_is_padding(struct log_entry *ent)
+{
+ return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0)
+ && (ent->seq == 0);
+}
+
+/*
+ * Detecting valid log indices: We read a log group (see the comments in btt.h
+ * for a description of a 'log_group' and its 'slots'), and iterate over its
+ * four slots. We expect that a padding slot will be all-zeroes, and use this
+ * to detect a padding slot vs. an actual entry.
+ *
+ * If a log_group is in the initial state, i.e. hasn't been used since the
+ * creation of this BTT layout, it will have three of the four slots with
+ * zeroes. We skip over these log_groups for the detection of log_index. If
+ * all log_groups are in the initial state (i.e. the BTT has never been
+ * written to), it is safe to assume the 'new format' of log entries in slots
+ * (0, 1).
+ */
+static int log_set_indices(struct arena_info *arena)
+{
+ bool idx_set = false, initial_state = true;
+ int ret, log_index[2] = {-1, -1};
+ u32 i, j, next_idx = 0;
+ struct log_group log;
+ u32 pad_count = 0;
+
+ for (i = 0; i < arena->nfree; i++) {
+ ret = btt_log_group_read(arena, i, &log);
+ if (ret < 0)
+ return ret;
+
+ for (j = 0; j < 4; j++) {
+ if (!idx_set) {
+ if (ent_is_padding(&log.ent[j])) {
+ pad_count++;
+ continue;
+ } else {
+ /* Skip if index has been recorded */
+ if ((next_idx == 1) &&
+ (j == log_index[0]))
+ continue;
+ /* valid entry, record index */
+ log_index[next_idx] = j;
+ next_idx++;
+ }
+ if (next_idx == 2) {
+ /* two valid entries found */
+ idx_set = true;
+ } else if (next_idx > 2) {
+ /* too many valid indices */
+ return -ENXIO;
+ }
+ } else {
+ /*
+ * once the indices have been set, just verify
+ * that all subsequent log groups are either in
+ * their initial state or follow the same
+ * indices.
+ */
+ if (j == log_index[0]) {
+ /* entry must be 'valid' */
+ if (ent_is_padding(&log.ent[j]))
+ return -ENXIO;
+ } else if (j == log_index[1]) {
+ ;
+ /*
+ * log_index[1] can be padding if the
+ * lane never got used and it is still
+ * in the initial state (three 'padding'
+ * entries)
+ */
+ } else {
+ /* entry must be invalid (padding) */
+ if (!ent_is_padding(&log.ent[j]))
+ return -ENXIO;
+ }
+ }
+ }
+ /*
+ * If any of the log_groups have more than one valid,
+ * non-padding entry, then the we are no longer in the
+ * initial_state
+ */
+ if (pad_count < 3)
+ initial_state = false;
+ pad_count = 0;
+ }
+
+ if (!initial_state && !idx_set)
+ return -ENXIO;
+
+ /*
+ * If all the entries in the log were in the initial state,
+ * assume new padding scheme
+ */
+ if (initial_state)
+ log_index[1] = 1;
+
+ /*
+ * Only allow the known permutations of log/padding indices,
+ * i.e. (0, 1), and (0, 2)
+ */
+ if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2)))
+ ; /* known index possibilities */
+ else {
+ dev_err(to_dev(arena), "Found an unknown padding scheme\n");
+ return -ENXIO;
+ }
+
+ arena->log_index[0] = log_index[0];
+ arena->log_index[1] = log_index[1];
+ dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]);
+ dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]);
+ return 0;
+}
+
+static int btt_rtt_init(struct arena_info *arena)
+{
+ arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
+ if (arena->rtt == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int btt_maplocks_init(struct arena_info *arena)
+{
+ u32 i;
+
+ arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock),
+ GFP_KERNEL);
+ if (!arena->map_locks)
+ return -ENOMEM;
+
+ for (i = 0; i < arena->nfree; i++)
+ spin_lock_init(&arena->map_locks[i].lock);
+
+ return 0;
+}
+
+static struct arena_info *alloc_arena(struct btt *btt, size_t size,
+ size_t start, size_t arena_off)
+{
+ struct arena_info *arena;
+ u64 logsize, mapsize, datasize;
+ u64 available = size;
+
+ arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL);
+ if (!arena)
+ return NULL;
+ arena->nd_btt = btt->nd_btt;
+ arena->sector_size = btt->sector_size;
+ mutex_init(&arena->err_lock);
+
+ if (!size)
+ return arena;
+
+ arena->size = size;
+ arena->external_lba_start = start;
+ arena->external_lbasize = btt->lbasize;
+ arena->internal_lbasize = roundup(arena->external_lbasize,
+ INT_LBASIZE_ALIGNMENT);
+ arena->nfree = BTT_DEFAULT_NFREE;
+ arena->version_major = btt->nd_btt->version_major;
+ arena->version_minor = btt->nd_btt->version_minor;
+
+ if (available % BTT_PG_SIZE)
+ available -= (available % BTT_PG_SIZE);
+
+ /* Two pages are reserved for the super block and its copy */
+ available -= 2 * BTT_PG_SIZE;
+
+ /* The log takes a fixed amount of space based on nfree */
+ logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE);
+ available -= logsize;
+
+ /* Calculate optimal split between map and data area */
+ arena->internal_nlba = div_u64(available - BTT_PG_SIZE,
+ arena->internal_lbasize + MAP_ENT_SIZE);
+ arena->external_nlba = arena->internal_nlba - arena->nfree;
+
+ mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE);
+ datasize = available - mapsize;
+
+ /* 'Absolute' values, relative to start of storage space */
+ arena->infooff = arena_off;
+ arena->dataoff = arena->infooff + BTT_PG_SIZE;
+ arena->mapoff = arena->dataoff + datasize;
+ arena->logoff = arena->mapoff + mapsize;
+ arena->info2off = arena->logoff + logsize;
+
+ /* Default log indices are (0,1) */
+ arena->log_index[0] = 0;
+ arena->log_index[1] = 1;
+ return arena;
+}
+
+static void free_arenas(struct btt *btt)
+{
+ struct arena_info *arena, *next;
+
+ list_for_each_entry_safe(arena, next, &btt->arena_list, list) {
+ list_del(&arena->list);
+ kfree(arena->rtt);
+ kfree(arena->map_locks);
+ kfree(arena->freelist);
+ debugfs_remove_recursive(arena->debugfs_dir);
+ kfree(arena);
+ }
+}
+
+/*
+ * This function reads an existing valid btt superblock and
+ * populates the corresponding arena_info struct
+ */
+static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super,
+ u64 arena_off)
+{
+ arena->internal_nlba = le32_to_cpu(super->internal_nlba);
+ arena->internal_lbasize = le32_to_cpu(super->internal_lbasize);
+ arena->external_nlba = le32_to_cpu(super->external_nlba);
+ arena->external_lbasize = le32_to_cpu(super->external_lbasize);
+ arena->nfree = le32_to_cpu(super->nfree);
+ arena->version_major = le16_to_cpu(super->version_major);
+ arena->version_minor = le16_to_cpu(super->version_minor);
+
+ arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off +
+ le64_to_cpu(super->nextoff));
+ arena->infooff = arena_off;
+ arena->dataoff = arena_off + le64_to_cpu(super->dataoff);
+ arena->mapoff = arena_off + le64_to_cpu(super->mapoff);
+ arena->logoff = arena_off + le64_to_cpu(super->logoff);
+ arena->info2off = arena_off + le64_to_cpu(super->info2off);
+
+ arena->size = (le64_to_cpu(super->nextoff) > 0)
+ ? (le64_to_cpu(super->nextoff))
+ : (arena->info2off - arena->infooff + BTT_PG_SIZE);
+
+ arena->flags = le32_to_cpu(super->flags);
+}
+
+static int discover_arenas(struct btt *btt)
+{
+ int ret = 0;
+ struct arena_info *arena;
+ struct btt_sb *super;
+ size_t remaining = btt->rawsize;
+ u64 cur_nlba = 0;
+ size_t cur_off = 0;
+ int num_arenas = 0;
+
+ super = kzalloc(sizeof(*super), GFP_KERNEL);
+ if (!super)
+ return -ENOMEM;
+
+ while (remaining) {
+ /* Alloc memory for arena */
+ arena = alloc_arena(btt, 0, 0, 0);
+ if (!arena) {
+ ret = -ENOMEM;
+ goto out_super;
+ }
+
+ arena->infooff = cur_off;
+ ret = btt_info_read(arena, super);
+ if (ret)
+ goto out;
+
+ if (!nd_btt_arena_is_valid(btt->nd_btt, super)) {
+ if (remaining == btt->rawsize) {
+ btt->init_state = INIT_NOTFOUND;
+ dev_info(to_dev(arena), "No existing arenas\n");
+ goto out;
+ } else {
+ dev_err(to_dev(arena),
+ "Found corrupted metadata!\n");
+ ret = -ENODEV;
+ goto out;
+ }
+ }
+
+ arena->external_lba_start = cur_nlba;
+ parse_arena_meta(arena, super, cur_off);
+
+ ret = log_set_indices(arena);
+ if (ret) {
+ dev_err(to_dev(arena),
+ "Unable to deduce log/padding indices\n");
+ goto out;
+ }
+
+ ret = btt_freelist_init(arena);
+ if (ret)
+ goto out;
+
+ ret = btt_rtt_init(arena);
+ if (ret)
+ goto out;
+
+ ret = btt_maplocks_init(arena);
+ if (ret)
+ goto out;
+
+ list_add_tail(&arena->list, &btt->arena_list);
+
+ remaining -= arena->size;
+ cur_off += arena->size;
+ cur_nlba += arena->external_nlba;
+ num_arenas++;
+
+ if (arena->nextoff == 0)
+ break;
+ }
+ btt->num_arenas = num_arenas;
+ btt->nlba = cur_nlba;
+ btt->init_state = INIT_READY;
+
+ kfree(super);
+ return ret;
+
+ out:
+ kfree(arena);
+ free_arenas(btt);
+ out_super:
+ kfree(super);
+ return ret;
+}
+
+static int create_arenas(struct btt *btt)
+{
+ size_t remaining = btt->rawsize;
+ size_t cur_off = 0;
+
+ while (remaining) {
+ struct arena_info *arena;
+ size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining);
+
+ remaining -= arena_size;
+ if (arena_size < ARENA_MIN_SIZE)
+ break;
+
+ arena = alloc_arena(btt, arena_size, btt->nlba, cur_off);
+ if (!arena) {
+ free_arenas(btt);
+ return -ENOMEM;
+ }
+ btt->nlba += arena->external_nlba;
+ if (remaining >= ARENA_MIN_SIZE)
+ arena->nextoff = arena->size;
+ else
+ arena->nextoff = 0;
+ cur_off += arena_size;
+ list_add_tail(&arena->list, &btt->arena_list);
+ }
+
+ return 0;
+}
+
+/*
+ * This function completes arena initialization by writing
+ * all the metadata.
+ * It is only called for an uninitialized arena when a write
+ * to that arena occurs for the first time.
+ */
+static int btt_arena_write_layout(struct arena_info *arena)
+{
+ int ret;
+ u64 sum;
+ struct btt_sb *super;
+ struct nd_btt *nd_btt = arena->nd_btt;
+ const u8 *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev);
+
+ ret = btt_map_init(arena);
+ if (ret)
+ return ret;
+
+ ret = btt_log_init(arena);
+ if (ret)
+ return ret;
+
+ super = kzalloc(sizeof(struct btt_sb), GFP_NOIO);
+ if (!super)
+ return -ENOMEM;
+
+ strncpy(super->signature, BTT_SIG, BTT_SIG_LEN);
+ memcpy(super->uuid, nd_btt->uuid, 16);
+ memcpy(super->parent_uuid, parent_uuid, 16);
+ super->flags = cpu_to_le32(arena->flags);
+ super->version_major = cpu_to_le16(arena->version_major);
+ super->version_minor = cpu_to_le16(arena->version_minor);
+ super->external_lbasize = cpu_to_le32(arena->external_lbasize);
+ super->external_nlba = cpu_to_le32(arena->external_nlba);
+ super->internal_lbasize = cpu_to_le32(arena->internal_lbasize);
+ super->internal_nlba = cpu_to_le32(arena->internal_nlba);
+ super->nfree = cpu_to_le32(arena->nfree);
+ super->infosize = cpu_to_le32(sizeof(struct btt_sb));
+ super->nextoff = cpu_to_le64(arena->nextoff);
+ /*
+ * Subtract arena->infooff (arena start) so numbers are relative
+ * to 'this' arena
+ */
+ super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff);
+ super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff);
+ super->logoff = cpu_to_le64(arena->logoff - arena->infooff);
+ super->info2off = cpu_to_le64(arena->info2off - arena->infooff);
+
+ super->flags = 0;
+ sum = nd_sb_checksum((struct nd_gen_sb *) super);
+ super->checksum = cpu_to_le64(sum);
+
+ ret = btt_info_write(arena, super);
+
+ kfree(super);
+ return ret;
+}
+
+/*
+ * This function completes the initialization for the BTT namespace
+ * such that it is ready to accept IOs
+ */
+static int btt_meta_init(struct btt *btt)
+{
+ int ret = 0;
+ struct arena_info *arena;
+
+ mutex_lock(&btt->init_lock);
+ list_for_each_entry(arena, &btt->arena_list, list) {
+ ret = btt_arena_write_layout(arena);
+ if (ret)
+ goto unlock;
+
+ ret = btt_freelist_init(arena);
+ if (ret)
+ goto unlock;
+
+ ret = btt_rtt_init(arena);
+ if (ret)
+ goto unlock;
+
+ ret = btt_maplocks_init(arena);
+ if (ret)
+ goto unlock;
+ }
+
+ btt->init_state = INIT_READY;
+
+ unlock:
+ mutex_unlock(&btt->init_lock);
+ return ret;
+}
+
+static u32 btt_meta_size(struct btt *btt)
+{
+ return btt->lbasize - btt->sector_size;
+}
+
+/*
+ * This function calculates the arena in which the given LBA lies
+ * by doing a linear walk. This is acceptable since we expect only
+ * a few arenas. If we have backing devices that get much larger,
+ * we can construct a balanced binary tree of arenas at init time
+ * so that this range search becomes faster.
+ */
+static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap,
+ struct arena_info **arena)
+{
+ struct arena_info *arena_list;
+ __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size);
+
+ list_for_each_entry(arena_list, &btt->arena_list, list) {
+ if (lba < arena_list->external_nlba) {
+ *arena = arena_list;
+ *premap = lba;
+ return 0;
+ }
+ lba -= arena_list->external_nlba;
+ }
+
+ return -EIO;
+}
+
+/*
+ * The following (lock_map, unlock_map) are mostly just to improve
+ * readability, since they index into an array of locks
+ */
+static void lock_map(struct arena_info *arena, u32 premap)
+ __acquires(&arena->map_locks[idx].lock)
+{
+ u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;
+
+ spin_lock(&arena->map_locks[idx].lock);
+}
+
+static void unlock_map(struct arena_info *arena, u32 premap)
+ __releases(&arena->map_locks[idx].lock)
+{
+ u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;
+
+ spin_unlock(&arena->map_locks[idx].lock);
+}
+
+static int btt_data_read(struct arena_info *arena, struct page *page,
+ unsigned int off, u32 lba, u32 len)
+{
+ int ret;
+ u64 nsoff = to_namespace_offset(arena, lba);
+ void *mem = kmap_atomic(page);
+
+ ret = arena_read_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC);
+ kunmap_atomic(mem);
+
+ return ret;
+}
+
+static int btt_data_write(struct arena_info *arena, u32 lba,
+ struct page *page, unsigned int off, u32 len)
+{
+ int ret;
+ u64 nsoff = to_namespace_offset(arena, lba);
+ void *mem = kmap_atomic(page);
+
+ ret = arena_write_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC);
+ kunmap_atomic(mem);
+
+ return ret;
+}
+
+static void zero_fill_data(struct page *page, unsigned int off, u32 len)
+{
+ void *mem = kmap_atomic(page);
+
+ memset(mem + off, 0, len);
+ kunmap_atomic(mem);
+}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
+ struct arena_info *arena, u32 postmap, int rw)
+{
+ unsigned int len = btt_meta_size(btt);
+ u64 meta_nsoff;
+ int ret = 0;
+
+ if (bip == NULL)
+ return 0;
+
+ meta_nsoff = to_namespace_offset(arena, postmap) + btt->sector_size;
+
+ while (len) {
+ unsigned int cur_len;
+ struct bio_vec bv;
+ void *mem;
+
+ bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
+ /*
+ * The 'bv' obtained from bvec_iter_bvec has its .bv_len and
+ * .bv_offset already adjusted for iter->bi_bvec_done, and we
+ * can use those directly
+ */
+
+ cur_len = min(len, bv.bv_len);
+ mem = kmap_atomic(bv.bv_page);
+ if (rw)
+ ret = arena_write_bytes(arena, meta_nsoff,
+ mem + bv.bv_offset, cur_len,
+ NVDIMM_IO_ATOMIC);
+ else
+ ret = arena_read_bytes(arena, meta_nsoff,
+ mem + bv.bv_offset, cur_len,
+ NVDIMM_IO_ATOMIC);
+
+ kunmap_atomic(mem);
+ if (ret)
+ return ret;
+
+ len -= cur_len;
+ meta_nsoff += cur_len;
+ if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len))
+ return -EIO;
+ }
+
+ return ret;
+}
+
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
+ struct arena_info *arena, u32 postmap, int rw)
+{
+ return 0;
+}
+#endif
+
+static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
+ struct page *page, unsigned int off, sector_t sector,
+ unsigned int len)
+{
+ int ret = 0;
+ int t_flag, e_flag;
+ struct arena_info *arena = NULL;
+ u32 lane = 0, premap, postmap;
+
+ while (len) {
+ u32 cur_len;
+
+ lane = nd_region_acquire_lane(btt->nd_region);
+
+ ret = lba_to_arena(btt, sector, &premap, &arena);
+ if (ret)
+ goto out_lane;
+
+ cur_len = min(btt->sector_size, len);
+
+ ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag,
+ NVDIMM_IO_ATOMIC);
+ if (ret)
+ goto out_lane;
+
+ /*
+ * We loop to make sure that the post map LBA didn't change
+ * from under us between writing the RTT and doing the actual
+ * read.
+ */
+ while (1) {
+ u32 new_map;
+ int new_t, new_e;
+
+ if (t_flag) {
+ zero_fill_data(page, off, cur_len);
+ goto out_lane;
+ }
+
+ if (e_flag) {
+ ret = -EIO;
+ goto out_lane;
+ }
+
+ arena->rtt[lane] = RTT_VALID | postmap;
+ /*
+ * Barrier to make sure this write is not reordered
+ * to do the verification map_read before the RTT store
+ */
+ barrier();
+
+ ret = btt_map_read(arena, premap, &new_map, &new_t,
+ &new_e, NVDIMM_IO_ATOMIC);
+ if (ret)
+ goto out_rtt;
+
+ if ((postmap == new_map) && (t_flag == new_t) &&
+ (e_flag == new_e))
+ break;
+
+ postmap = new_map;
+ t_flag = new_t;
+ e_flag = new_e;
+ }
+
+ ret = btt_data_read(arena, page, off, postmap, cur_len);
+ if (ret) {
+ /* Media error - set the e_flag */
+ if (btt_map_write(arena, premap, postmap, 0, 1, NVDIMM_IO_ATOMIC))
+ dev_warn_ratelimited(to_dev(arena),
+ "Error persistently tracking bad blocks at %#x\n",
+ premap);
+ goto out_rtt;
+ }
+
+ if (bip) {
+ ret = btt_rw_integrity(btt, bip, arena, postmap, READ);
+ if (ret)
+ goto out_rtt;
+ }
+
+ arena->rtt[lane] = RTT_INVALID;
+ nd_region_release_lane(btt->nd_region, lane);
+
+ len -= cur_len;
+ off += cur_len;
+ sector += btt->sector_size >> SECTOR_SHIFT;
+ }
+
+ return 0;
+
+ out_rtt:
+ arena->rtt[lane] = RTT_INVALID;
+ out_lane:
+ nd_region_release_lane(btt->nd_region, lane);
+ return ret;
+}
+
+/*
+ * Normally, arena_{read,write}_bytes will take care of the initial offset
+ * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem,
+ * we need the final, raw namespace offset here
+ */
+static bool btt_is_badblock(struct btt *btt, struct arena_info *arena,
+ u32 postmap)
+{
+ u64 nsoff = adjust_initial_offset(arena->nd_btt,
+ to_namespace_offset(arena, postmap));
+ sector_t phys_sector = nsoff >> 9;
+
+ return is_bad_pmem(btt->phys_bb, phys_sector, arena->internal_lbasize);
+}
+
+static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
+ sector_t sector, struct page *page, unsigned int off,
+ unsigned int len)
+{
+ int ret = 0;
+ struct arena_info *arena = NULL;
+ u32 premap = 0, old_postmap, new_postmap, lane = 0, i;
+ struct log_entry log;
+ int sub;
+
+ while (len) {
+ u32 cur_len;
+ int e_flag;
+
+ retry:
+ lane = nd_region_acquire_lane(btt->nd_region);
+
+ ret = lba_to_arena(btt, sector, &premap, &arena);
+ if (ret)
+ goto out_lane;
+ cur_len = min(btt->sector_size, len);
+
+ if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) {
+ ret = -EIO;
+ goto out_lane;
+ }
+
+ if (btt_is_badblock(btt, arena, arena->freelist[lane].block))
+ arena->freelist[lane].has_err = 1;
+
+ if (mutex_is_locked(&arena->err_lock)
+ || arena->freelist[lane].has_err) {
+ nd_region_release_lane(btt->nd_region, lane);
+
+ ret = arena_clear_freelist_error(arena, lane);
+ if (ret)
+ return ret;
+
+ /* OK to acquire a different lane/free block */
+ goto retry;
+ }
+
+ new_postmap = arena->freelist[lane].block;
+
+ /* Wait if the new block is being read from */
+ for (i = 0; i < arena->nfree; i++)
+ while (arena->rtt[i] == (RTT_VALID | new_postmap))
+ cpu_relax();
+
+
+ if (new_postmap >= arena->internal_nlba) {
+ ret = -EIO;
+ goto out_lane;
+ }
+
+ ret = btt_data_write(arena, new_postmap, page, off, cur_len);
+ if (ret)
+ goto out_lane;
+
+ if (bip) {
+ ret = btt_rw_integrity(btt, bip, arena, new_postmap,
+ WRITE);
+ if (ret)
+ goto out_lane;
+ }
+
+ lock_map(arena, premap);
+ ret = btt_map_read(arena, premap, &old_postmap, NULL, &e_flag,
+ NVDIMM_IO_ATOMIC);
+ if (ret)
+ goto out_map;
+ if (old_postmap >= arena->internal_nlba) {
+ ret = -EIO;
+ goto out_map;
+ }
+ if (e_flag)
+ set_e_flag(old_postmap);
+
+ log.lba = cpu_to_le32(premap);
+ log.old_map = cpu_to_le32(old_postmap);
+ log.new_map = cpu_to_le32(new_postmap);
+ log.seq = cpu_to_le32(arena->freelist[lane].seq);
+ sub = arena->freelist[lane].sub;
+ ret = btt_flog_write(arena, lane, sub, &log);
+ if (ret)
+ goto out_map;
+
+ ret = btt_map_write(arena, premap, new_postmap, 0, 0,
+ NVDIMM_IO_ATOMIC);
+ if (ret)
+ goto out_map;
+
+ unlock_map(arena, premap);
+ nd_region_release_lane(btt->nd_region, lane);
+
+ if (e_flag) {
+ ret = arena_clear_freelist_error(arena, lane);
+ if (ret)
+ return ret;
+ }
+
+ len -= cur_len;
+ off += cur_len;
+ sector += btt->sector_size >> SECTOR_SHIFT;
+ }
+
+ return 0;
+
+ out_map:
+ unlock_map(arena, premap);
+ out_lane:
+ nd_region_release_lane(btt->nd_region, lane);
+ return ret;
+}
+
+static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
+ struct page *page, unsigned int len, unsigned int off,
+ unsigned int op, sector_t sector)
+{
+ int ret;
+
+ if (!op_is_write(op)) {
+ ret = btt_read_pg(btt, bip, page, off, sector, len);
+ flush_dcache_page(page);
+ } else {
+ flush_dcache_page(page);
+ ret = btt_write_pg(btt, bip, sector, page, off, len);
+ }
+
+ return ret;
+}
+
+static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ struct btt *btt = q->queuedata;
+ struct bvec_iter iter;
+ unsigned long start;
+ struct bio_vec bvec;
+ int err = 0;
+ bool do_acct;
+
+ if (!bio_integrity_prep(bio))
+ return BLK_QC_T_NONE;
+
+ do_acct = nd_iostat_start(bio, &start);
+ bio_for_each_segment(bvec, bio, iter) {
+ unsigned int len = bvec.bv_len;
+
+ if (len > PAGE_SIZE || len < btt->sector_size ||
+ len % btt->sector_size) {
+ dev_err_ratelimited(&btt->nd_btt->dev,
+ "unaligned bio segment (len: %d)\n", len);
+ bio->bi_status = BLK_STS_IOERR;
+ break;
+ }
+
+ err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
+ bio_op(bio), iter.bi_sector);
+ if (err) {
+ dev_err(&btt->nd_btt->dev,
+ "io error in %s sector %lld, len %d,\n",
+ (op_is_write(bio_op(bio))) ? "WRITE" :
+ "READ",
+ (unsigned long long) iter.bi_sector, len);
+ bio->bi_status = errno_to_blk_status(err);
+ break;
+ }
+ }
+ if (do_acct)
+ nd_iostat_end(bio, start);
+
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
+}
+
+static int btt_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, unsigned int op)
+{
+ struct btt *btt = bdev->bd_disk->private_data;
+ int rc;
+ unsigned int len;
+
+ len = hpage_nr_pages(page) * PAGE_SIZE;
+ rc = btt_do_bvec(btt, NULL, page, len, 0, op, sector);
+ if (rc == 0)
+ page_endio(page, op_is_write(op), 0);
+
+ return rc;
+}
+
+
+static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+ /* some standard values */
+ geo->heads = 1 << 6;
+ geo->sectors = 1 << 5;
+ geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+ return 0;
+}
+
+static const struct block_device_operations btt_fops = {
+ .owner = THIS_MODULE,
+ .rw_page = btt_rw_page,
+ .getgeo = btt_getgeo,
+ .revalidate_disk = nvdimm_revalidate_disk,
+};
+
+static int btt_blk_init(struct btt *btt)
+{
+ struct nd_btt *nd_btt = btt->nd_btt;
+ struct nd_namespace_common *ndns = nd_btt->ndns;
+
+ /* create a new disk and request queue for btt */
+ btt->btt_queue = blk_alloc_queue(GFP_KERNEL);
+ if (!btt->btt_queue)
+ return -ENOMEM;
+
+ btt->btt_disk = alloc_disk(0);
+ if (!btt->btt_disk) {
+ blk_cleanup_queue(btt->btt_queue);
+ return -ENOMEM;
+ }
+
+ nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
+ btt->btt_disk->first_minor = 0;
+ btt->btt_disk->fops = &btt_fops;
+ btt->btt_disk->private_data = btt;
+ btt->btt_disk->queue = btt->btt_queue;
+ btt->btt_disk->flags = GENHD_FL_EXT_DEVT;
+ btt->btt_disk->queue->backing_dev_info->capabilities |=
+ BDI_CAP_SYNCHRONOUS_IO;
+
+ blk_queue_make_request(btt->btt_queue, btt_make_request);
+ blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
+ blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX);
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_queue);
+ btt->btt_queue->queuedata = btt;
+
+ if (btt_meta_size(btt)) {
+ int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));
+
+ if (rc) {
+ del_gendisk(btt->btt_disk);
+ put_disk(btt->btt_disk);
+ blk_cleanup_queue(btt->btt_queue);
+ return rc;
+ }
+ }
+ set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
+ device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL);
+ btt->nd_btt->size = btt->nlba * (u64)btt->sector_size;
+ revalidate_disk(btt->btt_disk);
+
+ return 0;
+}
+
+static void btt_blk_cleanup(struct btt *btt)
+{
+ del_gendisk(btt->btt_disk);
+ put_disk(btt->btt_disk);
+ blk_cleanup_queue(btt->btt_queue);
+}
+
+/**
+ * btt_init - initialize a block translation table for the given device
+ * @nd_btt: device with BTT geometry and backing device info
+ * @rawsize: raw size in bytes of the backing device
+ * @lbasize: lba size of the backing device
+ * @uuid: A uuid for the backing device - this is stored on media
+ * @maxlane: maximum number of parallel requests the device can handle
+ *
+ * Initialize a Block Translation Table on a backing device to provide
+ * single sector power fail atomicity.
+ *
+ * Context:
+ * Might sleep.
+ *
+ * Returns:
+ * Pointer to a new struct btt on success, NULL on failure.
+ */
+static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
+ u32 lbasize, u8 *uuid, struct nd_region *nd_region)
+{
+ int ret;
+ struct btt *btt;
+ struct nd_namespace_io *nsio;
+ struct device *dev = &nd_btt->dev;
+
+ btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL);
+ if (!btt)
+ return NULL;
+
+ btt->nd_btt = nd_btt;
+ btt->rawsize = rawsize;
+ btt->lbasize = lbasize;
+ btt->sector_size = ((lbasize >= 4096) ? 4096 : 512);
+ INIT_LIST_HEAD(&btt->arena_list);
+ mutex_init(&btt->init_lock);
+ btt->nd_region = nd_region;
+ nsio = to_nd_namespace_io(&nd_btt->ndns->dev);
+ btt->phys_bb = &nsio->bb;
+
+ ret = discover_arenas(btt);
+ if (ret) {
+ dev_err(dev, "init: error in arena_discover: %d\n", ret);
+ return NULL;
+ }
+
+ if (btt->init_state != INIT_READY && nd_region->ro) {
+ dev_warn(dev, "%s is read-only, unable to init btt metadata\n",
+ dev_name(&nd_region->dev));
+ return NULL;
+ } else if (btt->init_state != INIT_READY) {
+ btt->num_arenas = (rawsize / ARENA_MAX_SIZE) +
+ ((rawsize % ARENA_MAX_SIZE) ? 1 : 0);
+ dev_dbg(dev, "init: %d arenas for %llu rawsize\n",
+ btt->num_arenas, rawsize);
+
+ ret = create_arenas(btt);
+ if (ret) {
+ dev_info(dev, "init: create_arenas: %d\n", ret);
+ return NULL;
+ }
+
+ ret = btt_meta_init(btt);
+ if (ret) {
+ dev_err(dev, "init: error in meta_init: %d\n", ret);
+ return NULL;
+ }
+ }
+
+ ret = btt_blk_init(btt);
+ if (ret) {
+ dev_err(dev, "init: error in blk_init: %d\n", ret);
+ return NULL;
+ }
+
+ btt_debugfs_init(btt);
+
+ return btt;
+}
+
+/**
+ * btt_fini - de-initialize a BTT
+ * @btt: the BTT handle that was generated by btt_init
+ *
+ * De-initialize a Block Translation Table on device removal
+ *
+ * Context:
+ * Might sleep.
+ */
+static void btt_fini(struct btt *btt)
+{
+ if (btt) {
+ btt_blk_cleanup(btt);
+ free_arenas(btt);
+ debugfs_remove_recursive(btt->debugfs_dir);
+ }
+}
+
+int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
+{
+ struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
+ struct nd_region *nd_region;
+ struct btt_sb *btt_sb;
+ struct btt *btt;
+ size_t rawsize;
+
+ if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) {
+ dev_dbg(&nd_btt->dev, "incomplete btt configuration\n");
+ return -ENODEV;
+ }
+
+ btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
+ if (!btt_sb)
+ return -ENOMEM;
+
+ /*
+ * If this returns < 0, that is ok as it just means there wasn't
+ * an existing BTT, and we're creating a new one. We still need to
+ * call this as we need the version dependent fields in nd_btt to be
+ * set correctly based on the holder class
+ */
+ nd_btt_version(nd_btt, ndns, btt_sb);
+
+ rawsize = nvdimm_namespace_capacity(ndns) - nd_btt->initial_offset;
+ if (rawsize < ARENA_MIN_SIZE) {
+ dev_dbg(&nd_btt->dev, "%s must be at least %ld bytes\n",
+ dev_name(&ndns->dev),
+ ARENA_MIN_SIZE + nd_btt->initial_offset);
+ return -ENXIO;
+ }
+ nd_region = to_nd_region(nd_btt->dev.parent);
+ btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid,
+ nd_region);
+ if (!btt)
+ return -ENOMEM;
+ nd_btt->btt = btt;
+
+ return 0;
+}
+EXPORT_SYMBOL(nvdimm_namespace_attach_btt);
+
+int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt)
+{
+ struct btt *btt = nd_btt->btt;
+
+ btt_fini(btt);
+ nd_btt->btt = NULL;
+
+ return 0;
+}
+EXPORT_SYMBOL(nvdimm_namespace_detach_btt);
+
+static int __init nd_btt_init(void)
+{
+ int rc = 0;
+
+ debugfs_root = debugfs_create_dir("btt", NULL);
+ if (IS_ERR_OR_NULL(debugfs_root))
+ rc = -ENXIO;
+
+ return rc;
+}
+
+static void __exit nd_btt_exit(void)
+{
+ debugfs_remove_recursive(debugfs_root);
+}
+
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT);
+MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
+module_init(nd_btt_init);
+module_exit(nd_btt_exit);
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
new file mode 100644
index 000000000..ddff49c70
--- /dev/null
+++ b/drivers/nvdimm/btt.h
@@ -0,0 +1,248 @@
+/*
+ * Block Translation Table library
+ * Copyright (c) 2014-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_BTT_H
+#define _LINUX_BTT_H
+
+#include <linux/badblocks.h>
+#include <linux/types.h>
+
+#define BTT_SIG_LEN 16
+#define BTT_SIG "BTT_ARENA_INFO\0"
+#define MAP_ENT_SIZE 4
+#define MAP_TRIM_SHIFT 31
+#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT)
+#define MAP_ERR_SHIFT 30
+#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
+#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
+#define MAP_ENT_NORMAL 0xC0000000
+#define LOG_GRP_SIZE sizeof(struct log_group)
+#define LOG_ENT_SIZE sizeof(struct log_entry)
+#define ARENA_MIN_SIZE (1UL << 24) /* 16 MB */
+#define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */
+#define RTT_VALID (1UL << 31)
+#define RTT_INVALID 0
+#define BTT_PG_SIZE 4096
+#define BTT_DEFAULT_NFREE ND_MAX_LANES
+#define LOG_SEQ_INIT 1
+
+#define IB_FLAG_ERROR 0x00000001
+#define IB_FLAG_ERROR_MASK 0x00000001
+
+#define ent_lba(ent) (ent & MAP_LBA_MASK)
+#define ent_e_flag(ent) (!!(ent & MAP_ERR_MASK))
+#define ent_z_flag(ent) (!!(ent & MAP_TRIM_MASK))
+#define set_e_flag(ent) (ent |= MAP_ERR_MASK)
+/* 'normal' is both e and z flags set */
+#define ent_normal(ent) (ent_e_flag(ent) && ent_z_flag(ent))
+
+enum btt_init_state {
+ INIT_UNCHECKED = 0,
+ INIT_NOTFOUND,
+ INIT_READY
+};
+
+/*
+ * A log group represents one log 'lane', and consists of four log entries.
+ * Two of the four entries are valid entries, and the remaining two are
+ * padding. Due to an old bug in the padding location, we need to perform a
+ * test to determine the padding scheme being used, and use that scheme
+ * thereafter.
+ *
+ * In kernels prior to 4.15, 'log group' would have actual log entries at
+ * indices (0, 2) and padding at indices (1, 3), where as the correct/updated
+ * format has log entries at indices (0, 1) and padding at indices (2, 3).
+ *
+ * Old (pre 4.15) format:
+ * +-----------------+-----------------+
+ * | ent[0] | ent[1] |
+ * | 16B | 16B |
+ * | lba/old/new/seq | pad |
+ * +-----------------------------------+
+ * | ent[2] | ent[3] |
+ * | 16B | 16B |
+ * | lba/old/new/seq | pad |
+ * +-----------------+-----------------+
+ *
+ * New format:
+ * +-----------------+-----------------+
+ * | ent[0] | ent[1] |
+ * | 16B | 16B |
+ * | lba/old/new/seq | lba/old/new/seq |
+ * +-----------------------------------+
+ * | ent[2] | ent[3] |
+ * | 16B | 16B |
+ * | pad | pad |
+ * +-----------------+-----------------+
+ *
+ * We detect during start-up which format is in use, and set
+ * arena->log_index[(0, 1)] with the detected format.
+ */
+
+struct log_entry {
+ __le32 lba;
+ __le32 old_map;
+ __le32 new_map;
+ __le32 seq;
+};
+
+struct log_group {
+ struct log_entry ent[4];
+};
+
+struct btt_sb {
+ u8 signature[BTT_SIG_LEN];
+ u8 uuid[16];
+ u8 parent_uuid[16];
+ __le32 flags;
+ __le16 version_major;
+ __le16 version_minor;
+ __le32 external_lbasize;
+ __le32 external_nlba;
+ __le32 internal_lbasize;
+ __le32 internal_nlba;
+ __le32 nfree;
+ __le32 infosize;
+ __le64 nextoff;
+ __le64 dataoff;
+ __le64 mapoff;
+ __le64 logoff;
+ __le64 info2off;
+ u8 padding[3968];
+ __le64 checksum;
+};
+
+struct free_entry {
+ u32 block;
+ u8 sub;
+ u8 seq;
+ u8 has_err;
+};
+
+struct aligned_lock {
+ union {
+ spinlock_t lock;
+ u8 cacheline_padding[L1_CACHE_BYTES];
+ };
+};
+
+/**
+ * struct arena_info - handle for an arena
+ * @size: Size in bytes this arena occupies on the raw device.
+ * This includes arena metadata.
+ * @external_lba_start: The first external LBA in this arena.
+ * @internal_nlba: Number of internal blocks available in the arena
+ * including nfree reserved blocks
+ * @internal_lbasize: Internal and external lba sizes may be different as
+ * we can round up 'odd' external lbasizes such as 520B
+ * to be aligned.
+ * @external_nlba: Number of blocks contributed by the arena to the number
+ * reported to upper layers. (internal_nlba - nfree)
+ * @external_lbasize: LBA size as exposed to upper layers.
+ * @nfree: A reserve number of 'free' blocks that is used to
+ * handle incoming writes.
+ * @version_major: Metadata layout version major.
+ * @version_minor: Metadata layout version minor.
+ * @sector_size: The Linux sector size - 512 or 4096
+ * @nextoff: Offset in bytes to the start of the next arena.
+ * @infooff: Offset in bytes to the info block of this arena.
+ * @dataoff: Offset in bytes to the data area of this arena.
+ * @mapoff: Offset in bytes to the map area of this arena.
+ * @logoff: Offset in bytes to the log area of this arena.
+ * @info2off: Offset in bytes to the backup info block of this arena.
+ * @freelist: Pointer to in-memory list of free blocks
+ * @rtt: Pointer to in-memory "Read Tracking Table"
+ * @map_locks: Spinlocks protecting concurrent map writes
+ * @nd_btt: Pointer to parent nd_btt structure.
+ * @list: List head for list of arenas
+ * @debugfs_dir: Debugfs dentry
+ * @flags: Arena flags - may signify error states.
+ * @err_lock: Mutex for synchronizing error clearing.
+ * @log_index: Indices of the valid log entries in a log_group
+ *
+ * arena_info is a per-arena handle. Once an arena is narrowed down for an
+ * IO, this struct is passed around for the duration of the IO.
+ */
+struct arena_info {
+ u64 size; /* Total bytes for this arena */
+ u64 external_lba_start;
+ u32 internal_nlba;
+ u32 internal_lbasize;
+ u32 external_nlba;
+ u32 external_lbasize;
+ u32 nfree;
+ u16 version_major;
+ u16 version_minor;
+ u32 sector_size;
+ /* Byte offsets to the different on-media structures */
+ u64 nextoff;
+ u64 infooff;
+ u64 dataoff;
+ u64 mapoff;
+ u64 logoff;
+ u64 info2off;
+ /* Pointers to other in-memory structures for this arena */
+ struct free_entry *freelist;
+ u32 *rtt;
+ struct aligned_lock *map_locks;
+ struct nd_btt *nd_btt;
+ struct list_head list;
+ struct dentry *debugfs_dir;
+ /* Arena flags */
+ u32 flags;
+ struct mutex err_lock;
+ int log_index[2];
+};
+
+/**
+ * struct btt - handle for a BTT instance
+ * @btt_disk: Pointer to the gendisk for BTT device
+ * @btt_queue: Pointer to the request queue for the BTT device
+ * @arena_list: Head of the list of arenas
+ * @debugfs_dir: Debugfs dentry
+ * @nd_btt: Parent nd_btt struct
+ * @nlba: Number of logical blocks exposed to the upper layers
+ * after removing the amount of space needed by metadata
+ * @rawsize: Total size in bytes of the available backing device
+ * @lbasize: LBA size as requested and presented to upper layers.
+ * This is sector_size + size of any metadata.
+ * @sector_size: The Linux sector size - 512 or 4096
+ * @lanes: Per-lane spinlocks
+ * @init_lock: Mutex used for the BTT initialization
+ * @init_state: Flag describing the initialization state for the BTT
+ * @num_arenas: Number of arenas in the BTT instance
+ * @phys_bb: Pointer to the namespace's badblocks structure
+ */
+struct btt {
+ struct gendisk *btt_disk;
+ struct request_queue *btt_queue;
+ struct list_head arena_list;
+ struct dentry *debugfs_dir;
+ struct nd_btt *nd_btt;
+ u64 nlba;
+ unsigned long long rawsize;
+ u32 lbasize;
+ u32 sector_size;
+ struct nd_region *nd_region;
+ struct mutex init_lock;
+ int init_state;
+ int num_arenas;
+ struct badblocks *phys_bb;
+};
+
+bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super);
+int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns,
+ struct btt_sb *btt_sb);
+
+#endif
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
new file mode 100644
index 000000000..9486acc08
--- /dev/null
+++ b/drivers/nvdimm/btt_devs.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "nd-core.h"
+#include "btt.h"
+#include "nd.h"
+
+static void nd_btt_release(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ dev_dbg(dev, "trace\n");
+ nd_detach_ndns(&nd_btt->dev, &nd_btt->ndns);
+ ida_simple_remove(&nd_region->btt_ida, nd_btt->id);
+ kfree(nd_btt->uuid);
+ kfree(nd_btt);
+}
+
+static struct device_type nd_btt_device_type = {
+ .name = "nd_btt",
+ .release = nd_btt_release,
+};
+
+bool is_nd_btt(struct device *dev)
+{
+ return dev->type == &nd_btt_device_type;
+}
+EXPORT_SYMBOL(is_nd_btt);
+
+struct nd_btt *to_nd_btt(struct device *dev)
+{
+ struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev);
+
+ WARN_ON(!is_nd_btt(dev));
+ return nd_btt;
+}
+EXPORT_SYMBOL(to_nd_btt);
+
+static const unsigned long btt_lbasize_supported[] = { 512, 520, 528,
+ 4096, 4104, 4160, 4224, 0 };
+
+static ssize_t sector_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ return nd_size_select_show(nd_btt->lbasize, btt_lbasize_supported, buf);
+}
+
+static ssize_t sector_size_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ rc = nd_size_select_store(dev, buf, &nd_btt->lbasize,
+ btt_lbasize_supported);
+ dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
+ buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(sector_size);
+
+static ssize_t uuid_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ if (nd_btt->uuid)
+ return sprintf(buf, "%pUb\n", nd_btt->uuid);
+ return sprintf(buf, "\n");
+}
+
+static ssize_t uuid_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len);
+ dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
+ buf[len - 1] == '\n' ? "" : "\n");
+ device_unlock(dev);
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+static ssize_t namespace_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ rc = sprintf(buf, "%s\n", nd_btt->ndns
+ ? dev_name(&nd_btt->ndns->dev) : "");
+ nvdimm_bus_unlock(dev);
+ return rc;
+}
+
+static ssize_t namespace_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ rc = nd_namespace_store(dev, &nd_btt->ndns, buf, len);
+ dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
+ buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RW(namespace);
+
+static ssize_t size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ if (dev->driver)
+ rc = sprintf(buf, "%llu\n", nd_btt->size);
+ else {
+ /* no size to convey if the btt instance is disabled */
+ rc = -ENXIO;
+ }
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(size);
+
+static ssize_t log_zero_flags_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "Y\n");
+}
+static DEVICE_ATTR_RO(log_zero_flags);
+
+static struct attribute *nd_btt_attributes[] = {
+ &dev_attr_sector_size.attr,
+ &dev_attr_namespace.attr,
+ &dev_attr_uuid.attr,
+ &dev_attr_size.attr,
+ &dev_attr_log_zero_flags.attr,
+ NULL,
+};
+
+static struct attribute_group nd_btt_attribute_group = {
+ .attrs = nd_btt_attributes,
+};
+
+static const struct attribute_group *nd_btt_attribute_groups[] = {
+ &nd_btt_attribute_group,
+ &nd_device_attribute_group,
+ &nd_numa_attribute_group,
+ NULL,
+};
+
+static struct device *__nd_btt_create(struct nd_region *nd_region,
+ unsigned long lbasize, u8 *uuid,
+ struct nd_namespace_common *ndns)
+{
+ struct nd_btt *nd_btt;
+ struct device *dev;
+
+ nd_btt = kzalloc(sizeof(*nd_btt), GFP_KERNEL);
+ if (!nd_btt)
+ return NULL;
+
+ nd_btt->id = ida_simple_get(&nd_region->btt_ida, 0, 0, GFP_KERNEL);
+ if (nd_btt->id < 0)
+ goto out_nd_btt;
+
+ nd_btt->lbasize = lbasize;
+ if (uuid) {
+ uuid = kmemdup(uuid, 16, GFP_KERNEL);
+ if (!uuid)
+ goto out_put_id;
+ }
+ nd_btt->uuid = uuid;
+ dev = &nd_btt->dev;
+ dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id);
+ dev->parent = &nd_region->dev;
+ dev->type = &nd_btt_device_type;
+ dev->groups = nd_btt_attribute_groups;
+ device_initialize(&nd_btt->dev);
+ if (ndns && !__nd_attach_ndns(&nd_btt->dev, ndns, &nd_btt->ndns)) {
+ dev_dbg(&ndns->dev, "failed, already claimed by %s\n",
+ dev_name(ndns->claim));
+ put_device(dev);
+ return NULL;
+ }
+ return dev;
+
+out_put_id:
+ ida_simple_remove(&nd_region->btt_ida, nd_btt->id);
+
+out_nd_btt:
+ kfree(nd_btt);
+ return NULL;
+}
+
+struct device *nd_btt_create(struct nd_region *nd_region)
+{
+ struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL);
+
+ __nd_device_register(dev);
+ return dev;
+}
+
+/**
+ * nd_btt_arena_is_valid - check if the metadata layout is valid
+ * @nd_btt: device with BTT geometry and backing device info
+ * @super: pointer to the arena's info block being tested
+ *
+ * Check consistency of the btt info block with itself by validating
+ * the checksum, and with the parent namespace by verifying the
+ * parent_uuid contained in the info block with the one supplied in.
+ *
+ * Returns:
+ * false for an invalid info block, true for a valid one
+ */
+bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super)
+{
+ const u8 *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev);
+ u64 checksum;
+
+ if (memcmp(super->signature, BTT_SIG, BTT_SIG_LEN) != 0)
+ return false;
+
+ if (!guid_is_null((guid_t *)&super->parent_uuid))
+ if (memcmp(super->parent_uuid, parent_uuid, 16) != 0)
+ return false;
+
+ checksum = le64_to_cpu(super->checksum);
+ super->checksum = 0;
+ if (checksum != nd_sb_checksum((struct nd_gen_sb *) super))
+ return false;
+ super->checksum = cpu_to_le64(checksum);
+
+ /* TODO: figure out action for this */
+ if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0)
+ dev_info(&nd_btt->dev, "Found arena with an error flag\n");
+
+ return true;
+}
+EXPORT_SYMBOL(nd_btt_arena_is_valid);
+
+int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns,
+ struct btt_sb *btt_sb)
+{
+ if (ndns->claim_class == NVDIMM_CCLASS_BTT2) {
+ /* Probe/setup for BTT v2.0 */
+ nd_btt->initial_offset = 0;
+ nd_btt->version_major = 2;
+ nd_btt->version_minor = 0;
+ if (nvdimm_read_bytes(ndns, 0, btt_sb, sizeof(*btt_sb), 0))
+ return -ENXIO;
+ if (!nd_btt_arena_is_valid(nd_btt, btt_sb))
+ return -ENODEV;
+ if ((le16_to_cpu(btt_sb->version_major) != 2) ||
+ (le16_to_cpu(btt_sb->version_minor) != 0))
+ return -ENODEV;
+ } else {
+ /*
+ * Probe/setup for BTT v1.1 (NVDIMM_CCLASS_NONE or
+ * NVDIMM_CCLASS_BTT)
+ */
+ nd_btt->initial_offset = SZ_4K;
+ nd_btt->version_major = 1;
+ nd_btt->version_minor = 1;
+ if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0))
+ return -ENXIO;
+ if (!nd_btt_arena_is_valid(nd_btt, btt_sb))
+ return -ENODEV;
+ if ((le16_to_cpu(btt_sb->version_major) != 1) ||
+ (le16_to_cpu(btt_sb->version_minor) != 1))
+ return -ENODEV;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(nd_btt_version);
+
+static int __nd_btt_probe(struct nd_btt *nd_btt,
+ struct nd_namespace_common *ndns, struct btt_sb *btt_sb)
+{
+ int rc;
+
+ if (!btt_sb || !ndns || !nd_btt)
+ return -ENODEV;
+
+ if (nvdimm_namespace_capacity(ndns) < SZ_16M)
+ return -ENXIO;
+
+ rc = nd_btt_version(nd_btt, ndns, btt_sb);
+ if (rc < 0)
+ return rc;
+
+ nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize);
+ nd_btt->uuid = kmemdup(btt_sb->uuid, 16, GFP_KERNEL);
+ if (!nd_btt->uuid)
+ return -ENOMEM;
+
+ __nd_device_register(&nd_btt->dev);
+
+ return 0;
+}
+
+int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns)
+{
+ int rc;
+ struct device *btt_dev;
+ struct btt_sb *btt_sb;
+ struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+
+ if (ndns->force_raw)
+ return -ENODEV;
+
+ switch (ndns->claim_class) {
+ case NVDIMM_CCLASS_NONE:
+ case NVDIMM_CCLASS_BTT:
+ case NVDIMM_CCLASS_BTT2:
+ break;
+ default:
+ return -ENODEV;
+ }
+
+ nvdimm_bus_lock(&ndns->dev);
+ btt_dev = __nd_btt_create(nd_region, 0, NULL, ndns);
+ nvdimm_bus_unlock(&ndns->dev);
+ if (!btt_dev)
+ return -ENOMEM;
+ btt_sb = devm_kzalloc(dev, sizeof(*btt_sb), GFP_KERNEL);
+ rc = __nd_btt_probe(to_nd_btt(btt_dev), ndns, btt_sb);
+ dev_dbg(dev, "btt: %s\n", rc == 0 ? dev_name(btt_dev) : "<none>");
+ if (rc < 0) {
+ struct nd_btt *nd_btt = to_nd_btt(btt_dev);
+
+ nd_detach_ndns(btt_dev, &nd_btt->ndns);
+ put_device(btt_dev);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(nd_btt_probe);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
new file mode 100644
index 000000000..48a070a37
--- /dev/null
+++ b/drivers/nvdimm/bus.c
@@ -0,0 +1,1268 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/libnvdimm.h>
+#include <linux/sched/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/fcntl.h>
+#include <linux/async.h>
+#include <linux/genhd.h>
+#include <linux/ndctl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/nd.h>
+#include "nd-core.h"
+#include "nd.h"
+#include "pfn.h"
+
+int nvdimm_major;
+static int nvdimm_bus_major;
+static struct class *nd_class;
+static DEFINE_IDA(nd_ida);
+
+static int to_nd_device_type(struct device *dev)
+{
+ if (is_nvdimm(dev))
+ return ND_DEVICE_DIMM;
+ else if (is_memory(dev))
+ return ND_DEVICE_REGION_PMEM;
+ else if (is_nd_blk(dev))
+ return ND_DEVICE_REGION_BLK;
+ else if (is_nd_dax(dev))
+ return ND_DEVICE_DAX_PMEM;
+ else if (is_nd_region(dev->parent))
+ return nd_region_to_nstype(to_nd_region(dev->parent));
+
+ return 0;
+}
+
+static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ /*
+ * Ensure that region devices always have their numa node set as
+ * early as possible.
+ */
+ if (is_nd_region(dev))
+ set_dev_node(dev, to_nd_region(dev)->numa_node);
+ return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT,
+ to_nd_device_type(dev));
+}
+
+static struct module *to_bus_provider(struct device *dev)
+{
+ /* pin bus providers while regions are enabled */
+ if (is_nd_region(dev)) {
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ return nvdimm_bus->nd_desc->module;
+ }
+ return NULL;
+}
+
+static void nvdimm_bus_probe_start(struct nvdimm_bus *nvdimm_bus)
+{
+ nvdimm_bus_lock(&nvdimm_bus->dev);
+ nvdimm_bus->probe_active++;
+ nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+
+static void nvdimm_bus_probe_end(struct nvdimm_bus *nvdimm_bus)
+{
+ nvdimm_bus_lock(&nvdimm_bus->dev);
+ if (--nvdimm_bus->probe_active == 0)
+ wake_up(&nvdimm_bus->wait);
+ nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+
+static int nvdimm_bus_probe(struct device *dev)
+{
+ struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
+ struct module *provider = to_bus_provider(dev);
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ int rc;
+
+ if (!try_module_get(provider))
+ return -ENXIO;
+
+ dev_dbg(&nvdimm_bus->dev, "START: %s.probe(%s)\n",
+ dev->driver->name, dev_name(dev));
+
+ nvdimm_bus_probe_start(nvdimm_bus);
+ rc = nd_drv->probe(dev);
+ if (rc == 0)
+ nd_region_probe_success(nvdimm_bus, dev);
+ else
+ nd_region_disable(nvdimm_bus, dev);
+ nvdimm_bus_probe_end(nvdimm_bus);
+
+ dev_dbg(&nvdimm_bus->dev, "END: %s.probe(%s) = %d\n", dev->driver->name,
+ dev_name(dev), rc);
+
+ if (rc != 0)
+ module_put(provider);
+ return rc;
+}
+
+static int nvdimm_bus_remove(struct device *dev)
+{
+ struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
+ struct module *provider = to_bus_provider(dev);
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ int rc = 0;
+
+ if (nd_drv->remove)
+ rc = nd_drv->remove(dev);
+ nd_region_disable(nvdimm_bus, dev);
+
+ dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
+ dev_name(dev), rc);
+ module_put(provider);
+ return rc;
+}
+
+static void nvdimm_bus_shutdown(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ struct nd_device_driver *nd_drv = NULL;
+
+ if (dev->driver)
+ nd_drv = to_nd_device_driver(dev->driver);
+
+ if (nd_drv && nd_drv->shutdown) {
+ nd_drv->shutdown(dev);
+ dev_dbg(&nvdimm_bus->dev, "%s.shutdown(%s)\n",
+ dev->driver->name, dev_name(dev));
+ }
+}
+
+void nd_device_notify(struct device *dev, enum nvdimm_event event)
+{
+ device_lock(dev);
+ if (dev->driver) {
+ struct nd_device_driver *nd_drv;
+
+ nd_drv = to_nd_device_driver(dev->driver);
+ if (nd_drv->notify)
+ nd_drv->notify(dev, event);
+ }
+ device_unlock(dev);
+}
+EXPORT_SYMBOL(nd_device_notify);
+
+void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+
+ if (!nvdimm_bus)
+ return;
+
+ /* caller is responsible for holding a reference on the device */
+ nd_device_notify(&nd_region->dev, event);
+}
+EXPORT_SYMBOL_GPL(nvdimm_region_notify);
+
+struct clear_badblocks_context {
+ resource_size_t phys, cleared;
+};
+
+static int nvdimm_clear_badblocks_region(struct device *dev, void *data)
+{
+ struct clear_badblocks_context *ctx = data;
+ struct nd_region *nd_region;
+ resource_size_t ndr_end;
+ sector_t sector;
+
+ /* make sure device is a region */
+ if (!is_memory(dev))
+ return 0;
+
+ nd_region = to_nd_region(dev);
+ ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1;
+
+ /* make sure we are in the region */
+ if (ctx->phys < nd_region->ndr_start
+ || (ctx->phys + ctx->cleared) > ndr_end)
+ return 0;
+
+ sector = (ctx->phys - nd_region->ndr_start) / 512;
+ badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512);
+
+ if (nd_region->bb_state)
+ sysfs_notify_dirent(nd_region->bb_state);
+
+ return 0;
+}
+
+static void nvdimm_clear_badblocks_regions(struct nvdimm_bus *nvdimm_bus,
+ phys_addr_t phys, u64 cleared)
+{
+ struct clear_badblocks_context ctx = {
+ .phys = phys,
+ .cleared = cleared,
+ };
+
+ device_for_each_child(&nvdimm_bus->dev, &ctx,
+ nvdimm_clear_badblocks_region);
+}
+
+static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
+ phys_addr_t phys, u64 cleared)
+{
+ if (cleared > 0)
+ badrange_forget(&nvdimm_bus->badrange, phys, cleared);
+
+ if (cleared > 0 && cleared / 512)
+ nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
+}
+
+long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
+ unsigned int len)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ struct nvdimm_bus_descriptor *nd_desc;
+ struct nd_cmd_clear_error clear_err;
+ struct nd_cmd_ars_cap ars_cap;
+ u32 clear_err_unit, mask;
+ unsigned int noio_flag;
+ int cmd_rc, rc;
+
+ if (!nvdimm_bus)
+ return -ENXIO;
+
+ nd_desc = nvdimm_bus->nd_desc;
+ /*
+ * if ndctl does not exist, it's PMEM_LEGACY and
+ * we want to just pretend everything is handled.
+ */
+ if (!nd_desc->ndctl)
+ return len;
+
+ memset(&ars_cap, 0, sizeof(ars_cap));
+ ars_cap.address = phys;
+ ars_cap.length = len;
+ noio_flag = memalloc_noio_save();
+ rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, &ars_cap,
+ sizeof(ars_cap), &cmd_rc);
+ memalloc_noio_restore(noio_flag);
+ if (rc < 0)
+ return rc;
+ if (cmd_rc < 0)
+ return cmd_rc;
+ clear_err_unit = ars_cap.clear_err_unit;
+ if (!clear_err_unit || !is_power_of_2(clear_err_unit))
+ return -ENXIO;
+
+ mask = clear_err_unit - 1;
+ if ((phys | len) & mask)
+ return -ENXIO;
+ memset(&clear_err, 0, sizeof(clear_err));
+ clear_err.address = phys;
+ clear_err.length = len;
+ noio_flag = memalloc_noio_save();
+ rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_CLEAR_ERROR, &clear_err,
+ sizeof(clear_err), &cmd_rc);
+ memalloc_noio_restore(noio_flag);
+ if (rc < 0)
+ return rc;
+ if (cmd_rc < 0)
+ return cmd_rc;
+
+ nvdimm_account_cleared_poison(nvdimm_bus, phys, clear_err.cleared);
+
+ return clear_err.cleared;
+}
+EXPORT_SYMBOL_GPL(nvdimm_clear_poison);
+
+static int nvdimm_bus_match(struct device *dev, struct device_driver *drv);
+
+static struct bus_type nvdimm_bus_type = {
+ .name = "nd",
+ .uevent = nvdimm_bus_uevent,
+ .match = nvdimm_bus_match,
+ .probe = nvdimm_bus_probe,
+ .remove = nvdimm_bus_remove,
+ .shutdown = nvdimm_bus_shutdown,
+};
+
+static void nvdimm_bus_release(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus;
+
+ nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
+ ida_simple_remove(&nd_ida, nvdimm_bus->id);
+ kfree(nvdimm_bus);
+}
+
+static bool is_nvdimm_bus(struct device *dev)
+{
+ return dev->release == nvdimm_bus_release;
+}
+
+struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev)
+{
+ struct device *dev;
+
+ for (dev = nd_dev; dev; dev = dev->parent)
+ if (is_nvdimm_bus(dev))
+ break;
+ dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n");
+ if (dev)
+ return to_nvdimm_bus(dev);
+ return NULL;
+}
+
+struct nvdimm_bus *to_nvdimm_bus(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus;
+
+ nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
+ WARN_ON(!is_nvdimm_bus(dev));
+ return nvdimm_bus;
+}
+EXPORT_SYMBOL_GPL(to_nvdimm_bus);
+
+struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
+ struct nvdimm_bus_descriptor *nd_desc)
+{
+ struct nvdimm_bus *nvdimm_bus;
+ int rc;
+
+ nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL);
+ if (!nvdimm_bus)
+ return NULL;
+ INIT_LIST_HEAD(&nvdimm_bus->list);
+ INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
+ init_waitqueue_head(&nvdimm_bus->wait);
+ nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
+ mutex_init(&nvdimm_bus->reconfig_mutex);
+ badrange_init(&nvdimm_bus->badrange);
+ if (nvdimm_bus->id < 0) {
+ kfree(nvdimm_bus);
+ return NULL;
+ }
+ nvdimm_bus->nd_desc = nd_desc;
+ nvdimm_bus->dev.parent = parent;
+ nvdimm_bus->dev.release = nvdimm_bus_release;
+ nvdimm_bus->dev.groups = nd_desc->attr_groups;
+ nvdimm_bus->dev.bus = &nvdimm_bus_type;
+ nvdimm_bus->dev.of_node = nd_desc->of_node;
+ dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id);
+ rc = device_register(&nvdimm_bus->dev);
+ if (rc) {
+ dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc);
+ goto err;
+ }
+
+ return nvdimm_bus;
+ err:
+ put_device(&nvdimm_bus->dev);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_register);
+
+void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
+{
+ if (!nvdimm_bus)
+ return;
+ device_unregister(&nvdimm_bus->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_unregister);
+
+static int child_unregister(struct device *dev, void *data)
+{
+ /*
+ * the singular ndctl class device per bus needs to be
+ * "device_destroy"ed, so skip it here
+ *
+ * i.e. remove classless children
+ */
+ if (dev->class)
+ /* pass */;
+ else
+ nd_device_unregister(dev, ND_SYNC);
+ return 0;
+}
+
+static void free_badrange_list(struct list_head *badrange_list)
+{
+ struct badrange_entry *bre, *next;
+
+ list_for_each_entry_safe(bre, next, badrange_list, list) {
+ list_del(&bre->list);
+ kfree(bre);
+ }
+ list_del_init(badrange_list);
+}
+
+static int nd_bus_remove(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+
+ mutex_lock(&nvdimm_bus_list_mutex);
+ list_del_init(&nvdimm_bus->list);
+ mutex_unlock(&nvdimm_bus_list_mutex);
+
+ wait_event(nvdimm_bus->wait,
+ atomic_read(&nvdimm_bus->ioctl_active) == 0);
+
+ nd_synchronize();
+ device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
+
+ spin_lock(&nvdimm_bus->badrange.lock);
+ free_badrange_list(&nvdimm_bus->badrange.list);
+ spin_unlock(&nvdimm_bus->badrange.lock);
+
+ nvdimm_bus_destroy_ndctl(nvdimm_bus);
+
+ return 0;
+}
+
+static int nd_bus_probe(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+ int rc;
+
+ rc = nvdimm_bus_create_ndctl(nvdimm_bus);
+ if (rc)
+ return rc;
+
+ mutex_lock(&nvdimm_bus_list_mutex);
+ list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list);
+ mutex_unlock(&nvdimm_bus_list_mutex);
+
+ /* enable bus provider attributes to look up their local context */
+ dev_set_drvdata(dev, nvdimm_bus->nd_desc);
+
+ return 0;
+}
+
+static struct nd_device_driver nd_bus_driver = {
+ .probe = nd_bus_probe,
+ .remove = nd_bus_remove,
+ .drv = {
+ .name = "nd_bus",
+ .suppress_bind_attrs = true,
+ .bus = &nvdimm_bus_type,
+ .owner = THIS_MODULE,
+ .mod_name = KBUILD_MODNAME,
+ },
+};
+
+static int nvdimm_bus_match(struct device *dev, struct device_driver *drv)
+{
+ struct nd_device_driver *nd_drv = to_nd_device_driver(drv);
+
+ if (is_nvdimm_bus(dev) && nd_drv == &nd_bus_driver)
+ return true;
+
+ return !!test_bit(to_nd_device_type(dev), &nd_drv->type);
+}
+
+static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain);
+
+void nd_synchronize(void)
+{
+ async_synchronize_full_domain(&nd_async_domain);
+}
+EXPORT_SYMBOL_GPL(nd_synchronize);
+
+static void nd_async_device_register(void *d, async_cookie_t cookie)
+{
+ struct device *dev = d;
+
+ if (device_add(dev) != 0) {
+ dev_err(dev, "%s: failed\n", __func__);
+ put_device(dev);
+ }
+ put_device(dev);
+ if (dev->parent)
+ put_device(dev->parent);
+}
+
+static void nd_async_device_unregister(void *d, async_cookie_t cookie)
+{
+ struct device *dev = d;
+
+ /* flush bus operations before delete */
+ nvdimm_bus_lock(dev);
+ nvdimm_bus_unlock(dev);
+
+ device_unregister(dev);
+ put_device(dev);
+}
+
+void __nd_device_register(struct device *dev)
+{
+ if (!dev)
+ return;
+ dev->bus = &nvdimm_bus_type;
+ if (dev->parent)
+ get_device(dev->parent);
+ get_device(dev);
+ async_schedule_domain(nd_async_device_register, dev,
+ &nd_async_domain);
+}
+
+void nd_device_register(struct device *dev)
+{
+ device_initialize(dev);
+ __nd_device_register(dev);
+}
+EXPORT_SYMBOL(nd_device_register);
+
+void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
+{
+ bool killed;
+
+ switch (mode) {
+ case ND_ASYNC:
+ /*
+ * In the async case this is being triggered with the
+ * device lock held and the unregistration work needs to
+ * be moved out of line iff this is thread has won the
+ * race to schedule the deletion.
+ */
+ if (!kill_device(dev))
+ return;
+
+ get_device(dev);
+ async_schedule_domain(nd_async_device_unregister, dev,
+ &nd_async_domain);
+ break;
+ case ND_SYNC:
+ /*
+ * In the sync case the device is being unregistered due
+ * to a state change of the parent. Claim the kill state
+ * to synchronize against other unregistration requests,
+ * or otherwise let the async path handle it if the
+ * unregistration was already queued.
+ */
+ device_lock(dev);
+ killed = kill_device(dev);
+ device_unlock(dev);
+
+ if (!killed)
+ return;
+
+ nd_synchronize();
+ device_unregister(dev);
+ break;
+ }
+}
+EXPORT_SYMBOL(nd_device_unregister);
+
+/**
+ * __nd_driver_register() - register a region or a namespace driver
+ * @nd_drv: driver to register
+ * @owner: automatically set by nd_driver_register() macro
+ * @mod_name: automatically set by nd_driver_register() macro
+ */
+int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner,
+ const char *mod_name)
+{
+ struct device_driver *drv = &nd_drv->drv;
+
+ if (!nd_drv->type) {
+ pr_debug("driver type bitmask not set (%pf)\n",
+ __builtin_return_address(0));
+ return -EINVAL;
+ }
+
+ if (!nd_drv->probe) {
+ pr_debug("%s ->probe() must be specified\n", mod_name);
+ return -EINVAL;
+ }
+
+ drv->bus = &nvdimm_bus_type;
+ drv->owner = owner;
+ drv->mod_name = mod_name;
+
+ return driver_register(drv);
+}
+EXPORT_SYMBOL(__nd_driver_register);
+
+int nvdimm_revalidate_disk(struct gendisk *disk)
+{
+ struct device *dev = disk_to_dev(disk)->parent;
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ int disk_ro = get_disk_ro(disk);
+
+ /*
+ * Upgrade to read-only if the region is read-only preserve as
+ * read-only if the disk is already read-only.
+ */
+ if (disk_ro || nd_region->ro == disk_ro)
+ return 0;
+
+ dev_info(dev, "%s read-only, marking %s read-only\n",
+ dev_name(&nd_region->dev), disk->disk_name);
+ set_disk_ro(disk, 1);
+
+ return 0;
+
+}
+EXPORT_SYMBOL(nvdimm_revalidate_disk);
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, ND_DEVICE_MODALIAS_FMT "\n",
+ to_nd_device_type(dev));
+}
+static DEVICE_ATTR_RO(modalias);
+
+static ssize_t devtype_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%s\n", dev->type->name);
+}
+static DEVICE_ATTR_RO(devtype);
+
+static struct attribute *nd_device_attributes[] = {
+ &dev_attr_modalias.attr,
+ &dev_attr_devtype.attr,
+ NULL,
+};
+
+/*
+ * nd_device_attribute_group - generic attributes for all devices on an nd bus
+ */
+struct attribute_group nd_device_attribute_group = {
+ .attrs = nd_device_attributes,
+};
+EXPORT_SYMBOL_GPL(nd_device_attribute_group);
+
+static ssize_t numa_node_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", dev_to_node(dev));
+}
+static DEVICE_ATTR_RO(numa_node);
+
+static struct attribute *nd_numa_attributes[] = {
+ &dev_attr_numa_node.attr,
+ NULL,
+};
+
+static umode_t nd_numa_attr_visible(struct kobject *kobj, struct attribute *a,
+ int n)
+{
+ if (!IS_ENABLED(CONFIG_NUMA))
+ return 0;
+
+ return a->mode;
+}
+
+/*
+ * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus
+ */
+struct attribute_group nd_numa_attribute_group = {
+ .attrs = nd_numa_attributes,
+ .is_visible = nd_numa_attr_visible,
+};
+EXPORT_SYMBOL_GPL(nd_numa_attribute_group);
+
+int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus)
+{
+ dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id);
+ struct device *dev;
+
+ dev = device_create(nd_class, &nvdimm_bus->dev, devt, nvdimm_bus,
+ "ndctl%d", nvdimm_bus->id);
+
+ if (IS_ERR(dev))
+ dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %ld\n",
+ nvdimm_bus->id, PTR_ERR(dev));
+ return PTR_ERR_OR_ZERO(dev);
+}
+
+void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus)
+{
+ device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id));
+}
+
+static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = {
+ [ND_CMD_IMPLEMENTED] = { },
+ [ND_CMD_SMART] = {
+ .out_num = 2,
+ .out_sizes = { 4, 128, },
+ },
+ [ND_CMD_SMART_THRESHOLD] = {
+ .out_num = 2,
+ .out_sizes = { 4, 8, },
+ },
+ [ND_CMD_DIMM_FLAGS] = {
+ .out_num = 2,
+ .out_sizes = { 4, 4 },
+ },
+ [ND_CMD_GET_CONFIG_SIZE] = {
+ .out_num = 3,
+ .out_sizes = { 4, 4, 4, },
+ },
+ [ND_CMD_GET_CONFIG_DATA] = {
+ .in_num = 2,
+ .in_sizes = { 4, 4, },
+ .out_num = 2,
+ .out_sizes = { 4, UINT_MAX, },
+ },
+ [ND_CMD_SET_CONFIG_DATA] = {
+ .in_num = 3,
+ .in_sizes = { 4, 4, UINT_MAX, },
+ .out_num = 1,
+ .out_sizes = { 4, },
+ },
+ [ND_CMD_VENDOR] = {
+ .in_num = 3,
+ .in_sizes = { 4, 4, UINT_MAX, },
+ .out_num = 3,
+ .out_sizes = { 4, 4, UINT_MAX, },
+ },
+ [ND_CMD_CALL] = {
+ .in_num = 2,
+ .in_sizes = { sizeof(struct nd_cmd_pkg), UINT_MAX, },
+ .out_num = 1,
+ .out_sizes = { UINT_MAX, },
+ },
+};
+
+const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd)
+{
+ if (cmd < ARRAY_SIZE(__nd_cmd_dimm_descs))
+ return &__nd_cmd_dimm_descs[cmd];
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_dimm_desc);
+
+static const struct nd_cmd_desc __nd_cmd_bus_descs[] = {
+ [ND_CMD_IMPLEMENTED] = { },
+ [ND_CMD_ARS_CAP] = {
+ .in_num = 2,
+ .in_sizes = { 8, 8, },
+ .out_num = 4,
+ .out_sizes = { 4, 4, 4, 4, },
+ },
+ [ND_CMD_ARS_START] = {
+ .in_num = 5,
+ .in_sizes = { 8, 8, 2, 1, 5, },
+ .out_num = 2,
+ .out_sizes = { 4, 4, },
+ },
+ [ND_CMD_ARS_STATUS] = {
+ .out_num = 3,
+ .out_sizes = { 4, 4, UINT_MAX, },
+ },
+ [ND_CMD_CLEAR_ERROR] = {
+ .in_num = 2,
+ .in_sizes = { 8, 8, },
+ .out_num = 3,
+ .out_sizes = { 4, 4, 8, },
+ },
+ [ND_CMD_CALL] = {
+ .in_num = 2,
+ .in_sizes = { sizeof(struct nd_cmd_pkg), UINT_MAX, },
+ .out_num = 1,
+ .out_sizes = { UINT_MAX, },
+ },
+};
+
+const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd)
+{
+ if (cmd < ARRAY_SIZE(__nd_cmd_bus_descs))
+ return &__nd_cmd_bus_descs[cmd];
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_bus_desc);
+
+u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
+ const struct nd_cmd_desc *desc, int idx, void *buf)
+{
+ if (idx >= desc->in_num)
+ return UINT_MAX;
+
+ if (desc->in_sizes[idx] < UINT_MAX)
+ return desc->in_sizes[idx];
+
+ if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && idx == 2) {
+ struct nd_cmd_set_config_hdr *hdr = buf;
+
+ return hdr->in_length;
+ } else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) {
+ struct nd_cmd_vendor_hdr *hdr = buf;
+
+ return hdr->in_length;
+ } else if (cmd == ND_CMD_CALL) {
+ struct nd_cmd_pkg *pkg = buf;
+
+ return pkg->nd_size_in;
+ }
+
+ return UINT_MAX;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_in_size);
+
+u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
+ const struct nd_cmd_desc *desc, int idx, const u32 *in_field,
+ const u32 *out_field, unsigned long remainder)
+{
+ if (idx >= desc->out_num)
+ return UINT_MAX;
+
+ if (desc->out_sizes[idx] < UINT_MAX)
+ return desc->out_sizes[idx];
+
+ if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && idx == 1)
+ return in_field[1];
+ else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2)
+ return out_field[1];
+ else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 2) {
+ /*
+ * Per table 9-276 ARS Data in ACPI 6.1, out_field[1] is
+ * "Size of Output Buffer in bytes, including this
+ * field."
+ */
+ if (out_field[1] < 4)
+ return 0;
+ /*
+ * ACPI 6.1 is ambiguous if 'status' is included in the
+ * output size. If we encounter an output size that
+ * overshoots the remainder by 4 bytes, assume it was
+ * including 'status'.
+ */
+ if (out_field[1] - 4 == remainder)
+ return remainder;
+ return out_field[1] - 8;
+ } else if (cmd == ND_CMD_CALL) {
+ struct nd_cmd_pkg *pkg = (struct nd_cmd_pkg *) in_field;
+
+ return pkg->nd_size_out;
+ }
+
+
+ return UINT_MAX;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_out_size);
+
+void wait_nvdimm_bus_probe_idle(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ do {
+ if (nvdimm_bus->probe_active == 0)
+ break;
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+ wait_event(nvdimm_bus->wait,
+ nvdimm_bus->probe_active == 0);
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ } while (true);
+}
+
+static int nd_pmem_forget_poison_check(struct device *dev, void *data)
+{
+ struct nd_cmd_clear_error *clear_err =
+ (struct nd_cmd_clear_error *)data;
+ struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
+ struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL;
+ struct nd_dax *nd_dax = is_nd_dax(dev) ? to_nd_dax(dev) : NULL;
+ struct nd_namespace_common *ndns = NULL;
+ struct nd_namespace_io *nsio;
+ resource_size_t offset = 0, end_trunc = 0, start, end, pstart, pend;
+
+ if (nd_dax || !dev->driver)
+ return 0;
+
+ start = clear_err->address;
+ end = clear_err->address + clear_err->cleared - 1;
+
+ if (nd_btt || nd_pfn || nd_dax) {
+ if (nd_btt)
+ ndns = nd_btt->ndns;
+ else if (nd_pfn)
+ ndns = nd_pfn->ndns;
+ else if (nd_dax)
+ ndns = nd_dax->nd_pfn.ndns;
+
+ if (!ndns)
+ return 0;
+ } else
+ ndns = to_ndns(dev);
+
+ nsio = to_nd_namespace_io(&ndns->dev);
+ pstart = nsio->res.start + offset;
+ pend = nsio->res.end - end_trunc;
+
+ if ((pstart >= start) && (pend <= end))
+ return -EBUSY;
+
+ return 0;
+
+}
+
+static int nd_ns_forget_poison_check(struct device *dev, void *data)
+{
+ return device_for_each_child(dev, data, nd_pmem_forget_poison_check);
+}
+
+/* set_config requires an idle interleave set */
+static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus,
+ struct nvdimm *nvdimm, unsigned int cmd, void *data)
+{
+ struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+
+ /* ask the bus provider if it would like to block this request */
+ if (nd_desc->clear_to_send) {
+ int rc = nd_desc->clear_to_send(nd_desc, nvdimm, cmd);
+
+ if (rc)
+ return rc;
+ }
+
+ /* require clear error to go through the pmem driver */
+ if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR)
+ return device_for_each_child(&nvdimm_bus->dev, data,
+ nd_ns_forget_poison_check);
+
+ if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA)
+ return 0;
+
+ /* prevent label manipulation while the kernel owns label updates */
+ wait_nvdimm_bus_probe_idle(&nvdimm_bus->dev);
+ if (atomic_read(&nvdimm->busy))
+ return -EBUSY;
+ return 0;
+}
+
+static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
+ int read_only, unsigned int ioctl_cmd, unsigned long arg)
+{
+ struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+ const struct nd_cmd_desc *desc = NULL;
+ unsigned int cmd = _IOC_NR(ioctl_cmd);
+ struct device *dev = &nvdimm_bus->dev;
+ void __user *p = (void __user *) arg;
+ char *out_env = NULL, *in_env = NULL;
+ const char *cmd_name, *dimm_name;
+ u32 in_len = 0, out_len = 0;
+ unsigned int func = cmd;
+ unsigned long cmd_mask;
+ struct nd_cmd_pkg pkg;
+ int rc, i, cmd_rc;
+ void *buf = NULL;
+ u64 buf_len = 0;
+
+ if (nvdimm) {
+ desc = nd_cmd_dimm_desc(cmd);
+ cmd_name = nvdimm_cmd_name(cmd);
+ cmd_mask = nvdimm->cmd_mask;
+ dimm_name = dev_name(&nvdimm->dev);
+ } else {
+ desc = nd_cmd_bus_desc(cmd);
+ cmd_name = nvdimm_bus_cmd_name(cmd);
+ cmd_mask = nd_desc->cmd_mask;
+ dimm_name = "bus";
+ }
+
+ if (cmd == ND_CMD_CALL) {
+ if (copy_from_user(&pkg, p, sizeof(pkg)))
+ return -EFAULT;
+ }
+
+ if (!desc ||
+ (desc->out_num + desc->in_num == 0) ||
+ cmd > ND_CMD_CALL ||
+ !test_bit(cmd, &cmd_mask))
+ return -ENOTTY;
+
+ /* fail write commands (when read-only) */
+ if (read_only)
+ switch (cmd) {
+ case ND_CMD_VENDOR:
+ case ND_CMD_SET_CONFIG_DATA:
+ case ND_CMD_ARS_START:
+ case ND_CMD_CLEAR_ERROR:
+ case ND_CMD_CALL:
+ dev_dbg(dev, "'%s' command while read-only.\n",
+ nvdimm ? nvdimm_cmd_name(cmd)
+ : nvdimm_bus_cmd_name(cmd));
+ return -EPERM;
+ default:
+ break;
+ }
+
+ /* process an input envelope */
+ in_env = kzalloc(ND_CMD_MAX_ENVELOPE, GFP_KERNEL);
+ if (!in_env)
+ return -ENOMEM;
+ for (i = 0; i < desc->in_num; i++) {
+ u32 in_size, copy;
+
+ in_size = nd_cmd_in_size(nvdimm, cmd, desc, i, in_env);
+ if (in_size == UINT_MAX) {
+ dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n",
+ __func__, dimm_name, cmd_name, i);
+ rc = -ENXIO;
+ goto out;
+ }
+ if (in_len < ND_CMD_MAX_ENVELOPE)
+ copy = min_t(u32, ND_CMD_MAX_ENVELOPE - in_len, in_size);
+ else
+ copy = 0;
+ if (copy && copy_from_user(&in_env[in_len], p + in_len, copy)) {
+ rc = -EFAULT;
+ goto out;
+ }
+ in_len += in_size;
+ }
+
+ if (cmd == ND_CMD_CALL) {
+ func = pkg.nd_command;
+ dev_dbg(dev, "%s, idx: %llu, in: %u, out: %u, len %llu\n",
+ dimm_name, pkg.nd_command,
+ in_len, out_len, buf_len);
+ }
+
+ /* process an output envelope */
+ out_env = kzalloc(ND_CMD_MAX_ENVELOPE, GFP_KERNEL);
+ if (!out_env) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < desc->out_num; i++) {
+ u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i,
+ (u32 *) in_env, (u32 *) out_env, 0);
+ u32 copy;
+
+ if (out_size == UINT_MAX) {
+ dev_dbg(dev, "%s unknown output size cmd: %s field: %d\n",
+ dimm_name, cmd_name, i);
+ rc = -EFAULT;
+ goto out;
+ }
+ if (out_len < ND_CMD_MAX_ENVELOPE)
+ copy = min_t(u32, ND_CMD_MAX_ENVELOPE - out_len, out_size);
+ else
+ copy = 0;
+ if (copy && copy_from_user(&out_env[out_len],
+ p + in_len + out_len, copy)) {
+ rc = -EFAULT;
+ goto out;
+ }
+ out_len += out_size;
+ }
+
+ buf_len = (u64) out_len + (u64) in_len;
+ if (buf_len > ND_IOCTL_MAX_BUFLEN) {
+ dev_dbg(dev, "%s cmd: %s buf_len: %llu > %d\n", dimm_name,
+ cmd_name, buf_len, ND_IOCTL_MAX_BUFLEN);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ buf = vmalloc(buf_len);
+ if (!buf) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (copy_from_user(buf, p, buf_len)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, func, buf);
+ if (rc)
+ goto out_unlock;
+
+ rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, &cmd_rc);
+ if (rc < 0)
+ goto out_unlock;
+
+ if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) {
+ struct nd_cmd_clear_error *clear_err = buf;
+
+ nvdimm_account_cleared_poison(nvdimm_bus, clear_err->address,
+ clear_err->cleared);
+ }
+
+ if (copy_to_user(p, buf, buf_len))
+ rc = -EFAULT;
+
+out_unlock:
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+out:
+ kfree(in_env);
+ kfree(out_env);
+ vfree(buf);
+ return rc;
+}
+
+enum nd_ioctl_mode {
+ BUS_IOCTL,
+ DIMM_IOCTL,
+};
+
+static int match_dimm(struct device *dev, void *data)
+{
+ long id = (long) data;
+
+ if (is_nvdimm(dev)) {
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ return nvdimm->id == id;
+ }
+
+ return 0;
+}
+
+static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+ enum nd_ioctl_mode mode)
+
+{
+ struct nvdimm_bus *nvdimm_bus, *found = NULL;
+ long id = (long) file->private_data;
+ struct nvdimm *nvdimm = NULL;
+ int rc, ro;
+
+ ro = ((file->f_flags & O_ACCMODE) == O_RDONLY);
+ mutex_lock(&nvdimm_bus_list_mutex);
+ list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
+ if (mode == DIMM_IOCTL) {
+ struct device *dev;
+
+ dev = device_find_child(&nvdimm_bus->dev,
+ file->private_data, match_dimm);
+ if (!dev)
+ continue;
+ nvdimm = to_nvdimm(dev);
+ found = nvdimm_bus;
+ } else if (nvdimm_bus->id == id) {
+ found = nvdimm_bus;
+ }
+
+ if (found) {
+ atomic_inc(&nvdimm_bus->ioctl_active);
+ break;
+ }
+ }
+ mutex_unlock(&nvdimm_bus_list_mutex);
+
+ if (!found)
+ return -ENXIO;
+
+ nvdimm_bus = found;
+ rc = __nd_ioctl(nvdimm_bus, nvdimm, ro, cmd, arg);
+
+ if (nvdimm)
+ put_device(&nvdimm->dev);
+ if (atomic_dec_and_test(&nvdimm_bus->ioctl_active))
+ wake_up(&nvdimm_bus->wait);
+
+ return rc;
+}
+
+static long bus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ return nd_ioctl(file, cmd, arg, BUS_IOCTL);
+}
+
+static long dimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ return nd_ioctl(file, cmd, arg, DIMM_IOCTL);
+}
+
+static int nd_open(struct inode *inode, struct file *file)
+{
+ long minor = iminor(inode);
+
+ file->private_data = (void *) minor;
+ return 0;
+}
+
+static const struct file_operations nvdimm_bus_fops = {
+ .owner = THIS_MODULE,
+ .open = nd_open,
+ .unlocked_ioctl = bus_ioctl,
+ .compat_ioctl = bus_ioctl,
+ .llseek = noop_llseek,
+};
+
+static const struct file_operations nvdimm_fops = {
+ .owner = THIS_MODULE,
+ .open = nd_open,
+ .unlocked_ioctl = dimm_ioctl,
+ .compat_ioctl = dimm_ioctl,
+ .llseek = noop_llseek,
+};
+
+int __init nvdimm_bus_init(void)
+{
+ int rc;
+
+ rc = bus_register(&nvdimm_bus_type);
+ if (rc)
+ return rc;
+
+ rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops);
+ if (rc < 0)
+ goto err_bus_chrdev;
+ nvdimm_bus_major = rc;
+
+ rc = register_chrdev(0, "dimmctl", &nvdimm_fops);
+ if (rc < 0)
+ goto err_dimm_chrdev;
+ nvdimm_major = rc;
+
+ nd_class = class_create(THIS_MODULE, "nd");
+ if (IS_ERR(nd_class)) {
+ rc = PTR_ERR(nd_class);
+ goto err_class;
+ }
+
+ rc = driver_register(&nd_bus_driver.drv);
+ if (rc)
+ goto err_nd_bus;
+
+ return 0;
+
+ err_nd_bus:
+ class_destroy(nd_class);
+ err_class:
+ unregister_chrdev(nvdimm_major, "dimmctl");
+ err_dimm_chrdev:
+ unregister_chrdev(nvdimm_bus_major, "ndctl");
+ err_bus_chrdev:
+ bus_unregister(&nvdimm_bus_type);
+
+ return rc;
+}
+
+void nvdimm_bus_exit(void)
+{
+ driver_unregister(&nd_bus_driver.drv);
+ class_destroy(nd_class);
+ unregister_chrdev(nvdimm_bus_major, "ndctl");
+ unregister_chrdev(nvdimm_major, "dimmctl");
+ bus_unregister(&nvdimm_bus_type);
+ ida_destroy(&nd_ida);
+}
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
new file mode 100644
index 000000000..fb667bf46
--- /dev/null
+++ b/drivers/nvdimm/claim.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/device.h>
+#include <linux/sizes.h>
+#include "nd-core.h"
+#include "pmem.h"
+#include "pfn.h"
+#include "btt.h"
+#include "nd.h"
+
+void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns)
+{
+ struct nd_namespace_common *ndns = *_ndns;
+ struct nvdimm_bus *nvdimm_bus;
+
+ if (!ndns)
+ return;
+
+ nvdimm_bus = walk_to_nvdimm_bus(&ndns->dev);
+ lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
+ dev_WARN_ONCE(dev, ndns->claim != dev, "%s: invalid claim\n", __func__);
+ ndns->claim = NULL;
+ *_ndns = NULL;
+ put_device(&ndns->dev);
+}
+
+void nd_detach_ndns(struct device *dev,
+ struct nd_namespace_common **_ndns)
+{
+ struct nd_namespace_common *ndns = *_ndns;
+
+ if (!ndns)
+ return;
+ get_device(&ndns->dev);
+ nvdimm_bus_lock(&ndns->dev);
+ __nd_detach_ndns(dev, _ndns);
+ nvdimm_bus_unlock(&ndns->dev);
+ put_device(&ndns->dev);
+}
+
+bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
+ struct nd_namespace_common **_ndns)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&attach->dev);
+
+ if (attach->claim)
+ return false;
+ lockdep_assert_held(&nvdimm_bus->reconfig_mutex);
+ dev_WARN_ONCE(dev, *_ndns, "%s: invalid claim\n", __func__);
+ attach->claim = dev;
+ *_ndns = attach;
+ get_device(&attach->dev);
+ return true;
+}
+
+bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
+ struct nd_namespace_common **_ndns)
+{
+ bool claimed;
+
+ nvdimm_bus_lock(&attach->dev);
+ claimed = __nd_attach_ndns(dev, attach, _ndns);
+ nvdimm_bus_unlock(&attach->dev);
+ return claimed;
+}
+
+static int namespace_match(struct device *dev, void *data)
+{
+ char *name = data;
+
+ return strcmp(name, dev_name(dev)) == 0;
+}
+
+static bool is_idle(struct device *dev, struct nd_namespace_common *ndns)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct device *seed = NULL;
+
+ if (is_nd_btt(dev))
+ seed = nd_region->btt_seed;
+ else if (is_nd_pfn(dev))
+ seed = nd_region->pfn_seed;
+ else if (is_nd_dax(dev))
+ seed = nd_region->dax_seed;
+
+ if (seed == dev || ndns || dev->driver)
+ return false;
+ return true;
+}
+
+struct nd_pfn *to_nd_pfn_safe(struct device *dev)
+{
+ /*
+ * pfn device attributes are re-used by dax device instances, so we
+ * need to be careful to correct device-to-nd_pfn conversion.
+ */
+ if (is_nd_pfn(dev))
+ return to_nd_pfn(dev);
+
+ if (is_nd_dax(dev)) {
+ struct nd_dax *nd_dax = to_nd_dax(dev);
+
+ return &nd_dax->nd_pfn;
+ }
+
+ WARN_ON(1);
+ return NULL;
+}
+
+static void nd_detach_and_reset(struct device *dev,
+ struct nd_namespace_common **_ndns)
+{
+ /* detach the namespace and destroy / reset the device */
+ __nd_detach_ndns(dev, _ndns);
+ if (is_idle(dev, *_ndns)) {
+ nd_device_unregister(dev, ND_ASYNC);
+ } else if (is_nd_btt(dev)) {
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ nd_btt->lbasize = 0;
+ kfree(nd_btt->uuid);
+ nd_btt->uuid = NULL;
+ } else if (is_nd_pfn(dev) || is_nd_dax(dev)) {
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+
+ kfree(nd_pfn->uuid);
+ nd_pfn->uuid = NULL;
+ nd_pfn->mode = PFN_MODE_NONE;
+ }
+}
+
+ssize_t nd_namespace_store(struct device *dev,
+ struct nd_namespace_common **_ndns, const char *buf,
+ size_t len)
+{
+ struct nd_namespace_common *ndns;
+ struct device *found;
+ char *name;
+
+ if (dev->driver) {
+ dev_dbg(dev, "namespace already active\n");
+ return -EBUSY;
+ }
+
+ name = kstrndup(buf, len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+ strim(name);
+
+ if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0)
+ /* pass */;
+ else {
+ len = -EINVAL;
+ goto out;
+ }
+
+ ndns = *_ndns;
+ if (strcmp(name, "") == 0) {
+ nd_detach_and_reset(dev, _ndns);
+ goto out;
+ } else if (ndns) {
+ dev_dbg(dev, "namespace already set to: %s\n",
+ dev_name(&ndns->dev));
+ len = -EBUSY;
+ goto out;
+ }
+
+ found = device_find_child(dev->parent, name, namespace_match);
+ if (!found) {
+ dev_dbg(dev, "'%s' not found under %s\n", name,
+ dev_name(dev->parent));
+ len = -ENODEV;
+ goto out;
+ }
+
+ ndns = to_ndns(found);
+
+ switch (ndns->claim_class) {
+ case NVDIMM_CCLASS_NONE:
+ break;
+ case NVDIMM_CCLASS_BTT:
+ case NVDIMM_CCLASS_BTT2:
+ if (!is_nd_btt(dev)) {
+ len = -EBUSY;
+ goto out_attach;
+ }
+ break;
+ case NVDIMM_CCLASS_PFN:
+ if (!is_nd_pfn(dev)) {
+ len = -EBUSY;
+ goto out_attach;
+ }
+ break;
+ case NVDIMM_CCLASS_DAX:
+ if (!is_nd_dax(dev)) {
+ len = -EBUSY;
+ goto out_attach;
+ }
+ break;
+ default:
+ len = -EBUSY;
+ goto out_attach;
+ break;
+ }
+
+ if (__nvdimm_namespace_capacity(ndns) < SZ_16M) {
+ dev_dbg(dev, "%s too small to host\n", name);
+ len = -ENXIO;
+ goto out_attach;
+ }
+
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(dev));
+ if (!__nd_attach_ndns(dev, ndns, _ndns)) {
+ dev_dbg(dev, "%s already claimed\n",
+ dev_name(&ndns->dev));
+ len = -EBUSY;
+ }
+
+ out_attach:
+ put_device(&ndns->dev); /* from device_find_child */
+ out:
+ kfree(name);
+ return len;
+}
+
+/*
+ * nd_sb_checksum: compute checksum for a generic info block
+ *
+ * Returns a fletcher64 checksum of everything in the given info block
+ * except the last field (since that's where the checksum lives).
+ */
+u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb)
+{
+ u64 sum;
+ __le64 sum_save;
+
+ BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K);
+ BUILD_BUG_ON(sizeof(struct nd_pfn_sb) != SZ_4K);
+ BUILD_BUG_ON(sizeof(struct nd_gen_sb) != SZ_4K);
+
+ sum_save = nd_gen_sb->checksum;
+ nd_gen_sb->checksum = 0;
+ sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1);
+ nd_gen_sb->checksum = sum_save;
+ return sum;
+}
+EXPORT_SYMBOL(nd_sb_checksum);
+
+static int nsio_rw_bytes(struct nd_namespace_common *ndns,
+ resource_size_t offset, void *buf, size_t size, int rw,
+ unsigned long flags)
+{
+ struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+ unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
+ sector_t sector = offset >> 9;
+ int rc = 0;
+
+ if (unlikely(!size))
+ return 0;
+
+ if (unlikely(offset + size > nsio->size)) {
+ dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
+ return -EFAULT;
+ }
+
+ if (rw == READ) {
+ if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
+ return -EIO;
+ if (memcpy_mcsafe(buf, nsio->addr + offset, size) != 0)
+ return -EIO;
+ return 0;
+ }
+
+ if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
+ if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)
+ && !(flags & NVDIMM_IO_ATOMIC)) {
+ long cleared;
+
+ might_sleep();
+ cleared = nvdimm_clear_poison(&ndns->dev,
+ nsio->res.start + offset, size);
+ if (cleared < size)
+ rc = -EIO;
+ if (cleared > 0 && cleared / 512) {
+ cleared /= 512;
+ badblocks_clear(&nsio->bb, sector, cleared);
+ }
+ arch_invalidate_pmem(nsio->addr + offset, size);
+ } else
+ rc = -EIO;
+ }
+
+ memcpy_flushcache(nsio->addr + offset, buf, size);
+ nvdimm_flush(to_nd_region(ndns->dev.parent));
+
+ return rc;
+}
+
+int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio)
+{
+ struct resource *res = &nsio->res;
+ struct nd_namespace_common *ndns = &nsio->common;
+
+ nsio->size = resource_size(res);
+ if (!devm_request_mem_region(dev, res->start, resource_size(res),
+ dev_name(&ndns->dev))) {
+ dev_warn(dev, "could not reserve region %pR\n", res);
+ return -EBUSY;
+ }
+
+ ndns->rw_bytes = nsio_rw_bytes;
+ if (devm_init_badblocks(dev, &nsio->bb))
+ return -ENOMEM;
+ nvdimm_badblocks_populate(to_nd_region(ndns->dev.parent), &nsio->bb,
+ &nsio->res);
+
+ nsio->addr = devm_memremap(dev, res->start, resource_size(res),
+ ARCH_MEMREMAP_PMEM);
+
+ return PTR_ERR_OR_ZERO(nsio->addr);
+}
+EXPORT_SYMBOL_GPL(devm_nsio_enable);
+
+void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio)
+{
+ struct resource *res = &nsio->res;
+
+ devm_memunmap(dev, nsio->addr);
+ devm_exit_badblocks(dev, &nsio->bb);
+ devm_release_mem_region(dev, res->start, resource_size(res));
+}
+EXPORT_SYMBOL_GPL(devm_nsio_disable);
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
new file mode 100644
index 000000000..acce05085
--- /dev/null
+++ b/drivers/nvdimm/core.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/libnvdimm.h>
+#include <linux/badblocks.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/ctype.h>
+#include <linux/ndctl.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include "nd-core.h"
+#include "nd.h"
+
+LIST_HEAD(nvdimm_bus_list);
+DEFINE_MUTEX(nvdimm_bus_list_mutex);
+
+void nvdimm_bus_lock(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!nvdimm_bus)
+ return;
+ mutex_lock(&nvdimm_bus->reconfig_mutex);
+}
+EXPORT_SYMBOL(nvdimm_bus_lock);
+
+void nvdimm_bus_unlock(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!nvdimm_bus)
+ return;
+ mutex_unlock(&nvdimm_bus->reconfig_mutex);
+}
+EXPORT_SYMBOL(nvdimm_bus_unlock);
+
+bool is_nvdimm_bus_locked(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!nvdimm_bus)
+ return false;
+ return mutex_is_locked(&nvdimm_bus->reconfig_mutex);
+}
+EXPORT_SYMBOL(is_nvdimm_bus_locked);
+
+struct nvdimm_map {
+ struct nvdimm_bus *nvdimm_bus;
+ struct list_head list;
+ resource_size_t offset;
+ unsigned long flags;
+ size_t size;
+ union {
+ void *mem;
+ void __iomem *iomem;
+ };
+ struct kref kref;
+};
+
+static struct nvdimm_map *find_nvdimm_map(struct device *dev,
+ resource_size_t offset)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ struct nvdimm_map *nvdimm_map;
+
+ list_for_each_entry(nvdimm_map, &nvdimm_bus->mapping_list, list)
+ if (nvdimm_map->offset == offset)
+ return nvdimm_map;
+ return NULL;
+}
+
+static struct nvdimm_map *alloc_nvdimm_map(struct device *dev,
+ resource_size_t offset, size_t size, unsigned long flags)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ struct nvdimm_map *nvdimm_map;
+
+ nvdimm_map = kzalloc(sizeof(*nvdimm_map), GFP_KERNEL);
+ if (!nvdimm_map)
+ return NULL;
+
+ INIT_LIST_HEAD(&nvdimm_map->list);
+ nvdimm_map->nvdimm_bus = nvdimm_bus;
+ nvdimm_map->offset = offset;
+ nvdimm_map->flags = flags;
+ nvdimm_map->size = size;
+ kref_init(&nvdimm_map->kref);
+
+ if (!request_mem_region(offset, size, dev_name(&nvdimm_bus->dev))) {
+ dev_err(&nvdimm_bus->dev, "failed to request %pa + %zd for %s\n",
+ &offset, size, dev_name(dev));
+ goto err_request_region;
+ }
+
+ if (flags)
+ nvdimm_map->mem = memremap(offset, size, flags);
+ else
+ nvdimm_map->iomem = ioremap(offset, size);
+
+ if (!nvdimm_map->mem)
+ goto err_map;
+
+ dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev), "%s: bus unlocked!",
+ __func__);
+ list_add(&nvdimm_map->list, &nvdimm_bus->mapping_list);
+
+ return nvdimm_map;
+
+ err_map:
+ release_mem_region(offset, size);
+ err_request_region:
+ kfree(nvdimm_map);
+ return NULL;
+}
+
+static void nvdimm_map_release(struct kref *kref)
+{
+ struct nvdimm_bus *nvdimm_bus;
+ struct nvdimm_map *nvdimm_map;
+
+ nvdimm_map = container_of(kref, struct nvdimm_map, kref);
+ nvdimm_bus = nvdimm_map->nvdimm_bus;
+
+ dev_dbg(&nvdimm_bus->dev, "%pa\n", &nvdimm_map->offset);
+ list_del(&nvdimm_map->list);
+ if (nvdimm_map->flags)
+ memunmap(nvdimm_map->mem);
+ else
+ iounmap(nvdimm_map->iomem);
+ release_mem_region(nvdimm_map->offset, nvdimm_map->size);
+ kfree(nvdimm_map);
+}
+
+static void nvdimm_map_put(void *data)
+{
+ struct nvdimm_map *nvdimm_map = data;
+ struct nvdimm_bus *nvdimm_bus = nvdimm_map->nvdimm_bus;
+
+ nvdimm_bus_lock(&nvdimm_bus->dev);
+ kref_put(&nvdimm_map->kref, nvdimm_map_release);
+ nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+
+/**
+ * devm_nvdimm_memremap - map a resource that is shared across regions
+ * @dev: device that will own a reference to the shared mapping
+ * @offset: physical base address of the mapping
+ * @size: mapping size
+ * @flags: memremap flags, or, if zero, perform an ioremap instead
+ */
+void *devm_nvdimm_memremap(struct device *dev, resource_size_t offset,
+ size_t size, unsigned long flags)
+{
+ struct nvdimm_map *nvdimm_map;
+
+ nvdimm_bus_lock(dev);
+ nvdimm_map = find_nvdimm_map(dev, offset);
+ if (!nvdimm_map)
+ nvdimm_map = alloc_nvdimm_map(dev, offset, size, flags);
+ else
+ kref_get(&nvdimm_map->kref);
+ nvdimm_bus_unlock(dev);
+
+ if (!nvdimm_map)
+ return NULL;
+
+ if (devm_add_action_or_reset(dev, nvdimm_map_put, nvdimm_map))
+ return NULL;
+
+ return nvdimm_map->mem;
+}
+EXPORT_SYMBOL_GPL(devm_nvdimm_memremap);
+
+u64 nd_fletcher64(void *addr, size_t len, bool le)
+{
+ u32 *buf = addr;
+ u32 lo32 = 0;
+ u64 hi32 = 0;
+ int i;
+
+ for (i = 0; i < len / sizeof(u32); i++) {
+ lo32 += le ? le32_to_cpu((__le32) buf[i]) : buf[i];
+ hi32 += lo32;
+ }
+
+ return hi32 << 32 | lo32;
+}
+EXPORT_SYMBOL_GPL(nd_fletcher64);
+
+struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus)
+{
+ /* struct nvdimm_bus definition is private to libnvdimm */
+ return nvdimm_bus->nd_desc;
+}
+EXPORT_SYMBOL_GPL(to_nd_desc);
+
+struct device *to_nvdimm_bus_dev(struct nvdimm_bus *nvdimm_bus)
+{
+ /* struct nvdimm_bus definition is private to libnvdimm */
+ return &nvdimm_bus->dev;
+}
+EXPORT_SYMBOL_GPL(to_nvdimm_bus_dev);
+
+static bool is_uuid_sep(char sep)
+{
+ if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0')
+ return true;
+ return false;
+}
+
+static int nd_uuid_parse(struct device *dev, u8 *uuid_out, const char *buf,
+ size_t len)
+{
+ const char *str = buf;
+ u8 uuid[16];
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ if (!isxdigit(str[0]) || !isxdigit(str[1])) {
+ dev_dbg(dev, "pos: %d buf[%zd]: %c buf[%zd]: %c\n",
+ i, str - buf, str[0],
+ str + 1 - buf, str[1]);
+ return -EINVAL;
+ }
+
+ uuid[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]);
+ str += 2;
+ if (is_uuid_sep(*str))
+ str++;
+ }
+
+ memcpy(uuid_out, uuid, sizeof(uuid));
+ return 0;
+}
+
+/**
+ * nd_uuid_store: common implementation for writing 'uuid' sysfs attributes
+ * @dev: container device for the uuid property
+ * @uuid_out: uuid buffer to replace
+ * @buf: raw sysfs buffer to parse
+ *
+ * Enforce that uuids can only be changed while the device is disabled
+ * (driver detached)
+ * LOCKING: expects device_lock() is held on entry
+ */
+int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
+ size_t len)
+{
+ u8 uuid[16];
+ int rc;
+
+ if (dev->driver)
+ return -EBUSY;
+
+ rc = nd_uuid_parse(dev, uuid, buf, len);
+ if (rc)
+ return rc;
+
+ kfree(*uuid_out);
+ *uuid_out = kmemdup(uuid, sizeof(uuid), GFP_KERNEL);
+ if (!(*uuid_out))
+ return -ENOMEM;
+
+ return 0;
+}
+
+ssize_t nd_size_select_show(unsigned long current_size,
+ const unsigned long *supported, char *buf)
+{
+ ssize_t len = 0;
+ int i;
+
+ for (i = 0; supported[i]; i++)
+ if (current_size == supported[i])
+ len += sprintf(buf + len, "[%ld] ", supported[i]);
+ else
+ len += sprintf(buf + len, "%ld ", supported[i]);
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
+ssize_t nd_size_select_store(struct device *dev, const char *buf,
+ unsigned long *current_size, const unsigned long *supported)
+{
+ unsigned long lbasize;
+ int rc, i;
+
+ if (dev->driver)
+ return -EBUSY;
+
+ rc = kstrtoul(buf, 0, &lbasize);
+ if (rc)
+ return rc;
+
+ for (i = 0; supported[i]; i++)
+ if (lbasize == supported[i])
+ break;
+
+ if (supported[i]) {
+ *current_size = lbasize;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+
+static ssize_t commands_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int cmd, len = 0;
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+ struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+
+ for_each_set_bit(cmd, &nd_desc->cmd_mask, BITS_PER_LONG)
+ len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd));
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+static DEVICE_ATTR_RO(commands);
+
+static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus)
+{
+ struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+ struct device *parent = nvdimm_bus->dev.parent;
+
+ if (nd_desc->provider_name)
+ return nd_desc->provider_name;
+ else if (parent)
+ return dev_name(parent);
+ else
+ return "unknown";
+}
+
+static ssize_t provider_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+
+ return sprintf(buf, "%s\n", nvdimm_bus_provider(nvdimm_bus));
+}
+static DEVICE_ATTR_RO(provider);
+
+static int flush_namespaces(struct device *dev, void *data)
+{
+ device_lock(dev);
+ device_unlock(dev);
+ return 0;
+}
+
+static int flush_regions_dimms(struct device *dev, void *data)
+{
+ device_lock(dev);
+ device_unlock(dev);
+ device_for_each_child(dev, NULL, flush_namespaces);
+ return 0;
+}
+
+static ssize_t wait_probe_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+ struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+ int rc;
+
+ if (nd_desc->flush_probe) {
+ rc = nd_desc->flush_probe(nd_desc);
+ if (rc)
+ return rc;
+ }
+ nd_synchronize();
+ device_for_each_child(dev, NULL, flush_regions_dimms);
+ return sprintf(buf, "1\n");
+}
+static DEVICE_ATTR_RO(wait_probe);
+
+static struct attribute *nvdimm_bus_attributes[] = {
+ &dev_attr_commands.attr,
+ &dev_attr_wait_probe.attr,
+ &dev_attr_provider.attr,
+ NULL,
+};
+
+struct attribute_group nvdimm_bus_attribute_group = {
+ .attrs = nvdimm_bus_attributes,
+};
+EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
+
+int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
+{
+ return badrange_add(&nvdimm_bus->badrange, addr, length);
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange);
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
+{
+ struct blk_integrity bi;
+
+ if (meta_size == 0)
+ return 0;
+
+ memset(&bi, 0, sizeof(bi));
+
+ bi.tuple_size = meta_size;
+ bi.tag_size = meta_size;
+
+ blk_integrity_register(disk, &bi);
+ blk_queue_max_integrity_segments(disk->queue, 1);
+
+ return 0;
+}
+EXPORT_SYMBOL(nd_integrity_init);
+
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
+{
+ return 0;
+}
+EXPORT_SYMBOL(nd_integrity_init);
+
+#endif
+
+static __init int libnvdimm_init(void)
+{
+ int rc;
+
+ rc = nvdimm_bus_init();
+ if (rc)
+ return rc;
+ rc = nvdimm_init();
+ if (rc)
+ goto err_dimm;
+ rc = nd_region_init();
+ if (rc)
+ goto err_region;
+
+ nd_label_init();
+
+ return 0;
+ err_region:
+ nvdimm_exit();
+ err_dimm:
+ nvdimm_bus_exit();
+ return rc;
+}
+
+static __exit void libnvdimm_exit(void)
+{
+ WARN_ON(!list_empty(&nvdimm_bus_list));
+ nd_region_exit();
+ nvdimm_exit();
+ nvdimm_bus_exit();
+ nd_region_devs_exit();
+ nvdimm_devs_exit();
+}
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+subsys_initcall(libnvdimm_init);
+module_exit(libnvdimm_exit);
diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c
new file mode 100644
index 000000000..326f02ffc
--- /dev/null
+++ b/drivers/nvdimm/dax_devs.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright(c) 2013-2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/device.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include "nd-core.h"
+#include "pfn.h"
+#include "nd.h"
+
+static void nd_dax_release(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_dax *nd_dax = to_nd_dax(dev);
+ struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
+
+ dev_dbg(dev, "trace\n");
+ nd_detach_ndns(dev, &nd_pfn->ndns);
+ ida_simple_remove(&nd_region->dax_ida, nd_pfn->id);
+ kfree(nd_pfn->uuid);
+ kfree(nd_dax);
+}
+
+static struct device_type nd_dax_device_type = {
+ .name = "nd_dax",
+ .release = nd_dax_release,
+};
+
+bool is_nd_dax(struct device *dev)
+{
+ return dev ? dev->type == &nd_dax_device_type : false;
+}
+EXPORT_SYMBOL(is_nd_dax);
+
+struct nd_dax *to_nd_dax(struct device *dev)
+{
+ struct nd_dax *nd_dax = container_of(dev, struct nd_dax, nd_pfn.dev);
+
+ WARN_ON(!is_nd_dax(dev));
+ return nd_dax;
+}
+EXPORT_SYMBOL(to_nd_dax);
+
+static const struct attribute_group *nd_dax_attribute_groups[] = {
+ &nd_pfn_attribute_group,
+ &nd_device_attribute_group,
+ &nd_numa_attribute_group,
+ NULL,
+};
+
+static struct nd_dax *nd_dax_alloc(struct nd_region *nd_region)
+{
+ struct nd_pfn *nd_pfn;
+ struct nd_dax *nd_dax;
+ struct device *dev;
+
+ nd_dax = kzalloc(sizeof(*nd_dax), GFP_KERNEL);
+ if (!nd_dax)
+ return NULL;
+
+ nd_pfn = &nd_dax->nd_pfn;
+ nd_pfn->id = ida_simple_get(&nd_region->dax_ida, 0, 0, GFP_KERNEL);
+ if (nd_pfn->id < 0) {
+ kfree(nd_dax);
+ return NULL;
+ }
+
+ dev = &nd_pfn->dev;
+ dev_set_name(dev, "dax%d.%d", nd_region->id, nd_pfn->id);
+ dev->groups = nd_dax_attribute_groups;
+ dev->type = &nd_dax_device_type;
+ dev->parent = &nd_region->dev;
+
+ return nd_dax;
+}
+
+struct device *nd_dax_create(struct nd_region *nd_region)
+{
+ struct device *dev = NULL;
+ struct nd_dax *nd_dax;
+
+ if (!is_memory(&nd_region->dev))
+ return NULL;
+
+ nd_dax = nd_dax_alloc(nd_region);
+ if (nd_dax)
+ dev = nd_pfn_devinit(&nd_dax->nd_pfn, NULL);
+ __nd_device_register(dev);
+ return dev;
+}
+
+int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns)
+{
+ int rc;
+ struct nd_dax *nd_dax;
+ struct device *dax_dev;
+ struct nd_pfn *nd_pfn;
+ struct nd_pfn_sb *pfn_sb;
+ struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+
+ if (ndns->force_raw)
+ return -ENODEV;
+
+ switch (ndns->claim_class) {
+ case NVDIMM_CCLASS_NONE:
+ case NVDIMM_CCLASS_DAX:
+ break;
+ default:
+ return -ENODEV;
+ }
+
+ nvdimm_bus_lock(&ndns->dev);
+ nd_dax = nd_dax_alloc(nd_region);
+ nd_pfn = &nd_dax->nd_pfn;
+ dax_dev = nd_pfn_devinit(nd_pfn, ndns);
+ nvdimm_bus_unlock(&ndns->dev);
+ if (!dax_dev)
+ return -ENOMEM;
+ pfn_sb = devm_kmalloc(dev, sizeof(*pfn_sb), GFP_KERNEL);
+ nd_pfn->pfn_sb = pfn_sb;
+ rc = nd_pfn_validate(nd_pfn, DAX_SIG);
+ dev_dbg(dev, "dax: %s\n", rc == 0 ? dev_name(dax_dev) : "<none>");
+ if (rc < 0) {
+ nd_detach_ndns(dax_dev, &nd_pfn->ndns);
+ put_device(dax_dev);
+ } else
+ __nd_device_register(dax_dev);
+
+ return rc;
+}
+EXPORT_SYMBOL(nd_dax_probe);
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
new file mode 100644
index 000000000..6c8fb7590
--- /dev/null
+++ b/drivers/nvdimm/dimm.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/sizes.h>
+#include <linux/ndctl.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/nd.h>
+#include "label.h"
+#include "nd.h"
+
+static int nvdimm_probe(struct device *dev)
+{
+ struct nvdimm_drvdata *ndd;
+ int rc;
+
+ rc = nvdimm_check_config_data(dev);
+ if (rc) {
+ /* not required for non-aliased nvdimm, ex. NVDIMM-N */
+ if (rc == -ENOTTY)
+ rc = 0;
+ return rc;
+ }
+
+ /* reset locked, to be validated below... */
+ nvdimm_clear_locked(dev);
+
+ ndd = kzalloc(sizeof(*ndd), GFP_KERNEL);
+ if (!ndd)
+ return -ENOMEM;
+
+ dev_set_drvdata(dev, ndd);
+ ndd->dpa.name = dev_name(dev);
+ ndd->ns_current = -1;
+ ndd->ns_next = -1;
+ ndd->dpa.start = 0;
+ ndd->dpa.end = -1;
+ ndd->dev = dev;
+ get_device(dev);
+ kref_init(&ndd->kref);
+
+ /*
+ * EACCES failures reading the namespace label-area-properties
+ * are interpreted as the DIMM capacity being locked but the
+ * namespace labels themselves being accessible.
+ */
+ rc = nvdimm_init_nsarea(ndd);
+ if (rc == -EACCES) {
+ /*
+ * See nvdimm_namespace_common_probe() where we fail to
+ * allow namespaces to probe while the DIMM is locked,
+ * but we do allow for namespace enumeration.
+ */
+ nvdimm_set_locked(dev);
+ rc = 0;
+ }
+ if (rc)
+ goto err;
+
+ /*
+ * EACCES failures reading the namespace label-data are
+ * interpreted as the label area being locked in addition to the
+ * DIMM capacity. We fail the dimm probe to prevent regions from
+ * attempting to parse the label area.
+ */
+ rc = nvdimm_init_config_data(ndd);
+ if (rc == -EACCES)
+ nvdimm_set_locked(dev);
+ if (rc)
+ goto err;
+
+ dev_dbg(dev, "config data size: %d\n", ndd->nsarea.config_size);
+
+ nvdimm_bus_lock(dev);
+ ndd->ns_current = nd_label_validate(ndd);
+ ndd->ns_next = nd_label_next_nsindex(ndd->ns_current);
+ nd_label_copy(ndd, to_next_namespace_index(ndd),
+ to_current_namespace_index(ndd));
+ if (ndd->ns_current >= 0) {
+ rc = nd_label_reserve_dpa(ndd);
+ if (rc == 0)
+ nvdimm_set_aliasing(dev);
+ }
+ nvdimm_bus_unlock(dev);
+
+ if (rc)
+ goto err;
+
+ return 0;
+
+ err:
+ put_ndd(ndd);
+ return rc;
+}
+
+static int nvdimm_remove(struct device *dev)
+{
+ struct nvdimm_drvdata *ndd = dev_get_drvdata(dev);
+
+ if (!ndd)
+ return 0;
+
+ nvdimm_bus_lock(dev);
+ dev_set_drvdata(dev, NULL);
+ nvdimm_bus_unlock(dev);
+ put_ndd(ndd);
+
+ return 0;
+}
+
+static struct nd_device_driver nvdimm_driver = {
+ .probe = nvdimm_probe,
+ .remove = nvdimm_remove,
+ .drv = {
+ .name = "nvdimm",
+ },
+ .type = ND_DRIVER_DIMM,
+};
+
+int __init nvdimm_init(void)
+{
+ return nd_driver_register(&nvdimm_driver);
+}
+
+void nvdimm_exit(void)
+{
+ driver_unregister(&nvdimm_driver.drv);
+}
+
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DIMM);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
new file mode 100644
index 000000000..f0e0e3b42
--- /dev/null
+++ b/drivers/nvdimm/dimm_devs.c
@@ -0,0 +1,717 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/vmalloc.h>
+#include <linux/device.h>
+#include <linux/ndctl.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "nd-core.h"
+#include "label.h"
+#include "pmem.h"
+#include "nd.h"
+
+static DEFINE_IDA(dimm_ida);
+
+/*
+ * Retrieve bus and dimm handle and return if this bus supports
+ * get_config_data commands
+ */
+int nvdimm_check_config_data(struct device *dev)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ if (!nvdimm->cmd_mask ||
+ !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) {
+ if (test_bit(NDD_ALIASING, &nvdimm->flags))
+ return -ENXIO;
+ else
+ return -ENOTTY;
+ }
+
+ return 0;
+}
+
+static int validate_dimm(struct nvdimm_drvdata *ndd)
+{
+ int rc;
+
+ if (!ndd)
+ return -EINVAL;
+
+ rc = nvdimm_check_config_data(ndd->dev);
+ if (rc)
+ dev_dbg(ndd->dev, "%pf: %s error: %d\n",
+ __builtin_return_address(0), __func__, rc);
+ return rc;
+}
+
+/**
+ * nvdimm_init_nsarea - determine the geometry of a dimm's namespace area
+ * @nvdimm: dimm to initialize
+ */
+int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd)
+{
+ struct nd_cmd_get_config_size *cmd = &ndd->nsarea;
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+ struct nvdimm_bus_descriptor *nd_desc;
+ int rc = validate_dimm(ndd);
+ int cmd_rc = 0;
+
+ if (rc)
+ return rc;
+
+ if (cmd->config_size)
+ return 0; /* already valid */
+
+ memset(cmd, 0, sizeof(*cmd));
+ nd_desc = nvdimm_bus->nd_desc;
+ rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
+ ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), &cmd_rc);
+ if (rc < 0)
+ return rc;
+ return cmd_rc;
+}
+
+int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+ int rc = validate_dimm(ndd), cmd_rc = 0;
+ struct nd_cmd_get_config_data_hdr *cmd;
+ struct nvdimm_bus_descriptor *nd_desc;
+ u32 max_cmd_size, config_size;
+ size_t offset;
+
+ if (rc)
+ return rc;
+
+ if (ndd->data)
+ return 0;
+
+ if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0
+ || ndd->nsarea.config_size < ND_LABEL_MIN_SIZE) {
+ dev_dbg(ndd->dev, "failed to init config data area: (%d:%d)\n",
+ ndd->nsarea.max_xfer, ndd->nsarea.config_size);
+ return -ENXIO;
+ }
+
+ ndd->data = kvmalloc(ndd->nsarea.config_size, GFP_KERNEL);
+ if (!ndd->data)
+ return -ENOMEM;
+
+ max_cmd_size = min_t(u32, PAGE_SIZE, ndd->nsarea.max_xfer);
+ cmd = kzalloc(max_cmd_size + sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ nd_desc = nvdimm_bus->nd_desc;
+ for (config_size = ndd->nsarea.config_size, offset = 0;
+ config_size; config_size -= cmd->in_length,
+ offset += cmd->in_length) {
+ cmd->in_length = min(config_size, max_cmd_size);
+ cmd->in_offset = offset;
+ rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
+ ND_CMD_GET_CONFIG_DATA, cmd,
+ cmd->in_length + sizeof(*cmd), &cmd_rc);
+ if (rc < 0)
+ break;
+ if (cmd_rc < 0) {
+ rc = cmd_rc;
+ break;
+ }
+ memcpy(ndd->data + offset, cmd->out_buf, cmd->in_length);
+ }
+ dev_dbg(ndd->dev, "len: %zu rc: %d\n", offset, rc);
+ kfree(cmd);
+
+ return rc;
+}
+
+int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
+ void *buf, size_t len)
+{
+ size_t max_cmd_size, buf_offset;
+ struct nd_cmd_set_config_hdr *cmd;
+ int rc = validate_dimm(ndd), cmd_rc = 0;
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+ struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+
+ if (rc)
+ return rc;
+
+ if (!ndd->data)
+ return -ENXIO;
+
+ if (offset + len > ndd->nsarea.config_size)
+ return -ENXIO;
+
+ max_cmd_size = min_t(u32, PAGE_SIZE, len);
+ max_cmd_size = min_t(u32, max_cmd_size, ndd->nsarea.max_xfer);
+ cmd = kzalloc(max_cmd_size + sizeof(*cmd) + sizeof(u32), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ for (buf_offset = 0; len; len -= cmd->in_length,
+ buf_offset += cmd->in_length) {
+ size_t cmd_size;
+
+ cmd->in_offset = offset + buf_offset;
+ cmd->in_length = min(max_cmd_size, len);
+ memcpy(cmd->in_buf, buf + buf_offset, cmd->in_length);
+
+ /* status is output in the last 4-bytes of the command buffer */
+ cmd_size = sizeof(*cmd) + cmd->in_length + sizeof(u32);
+
+ rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
+ ND_CMD_SET_CONFIG_DATA, cmd, cmd_size, &cmd_rc);
+ if (rc < 0)
+ break;
+ if (cmd_rc < 0) {
+ rc = cmd_rc;
+ break;
+ }
+ }
+ kfree(cmd);
+
+ return rc;
+}
+
+void nvdimm_set_aliasing(struct device *dev)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ set_bit(NDD_ALIASING, &nvdimm->flags);
+}
+
+void nvdimm_set_locked(struct device *dev)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ set_bit(NDD_LOCKED, &nvdimm->flags);
+}
+
+void nvdimm_clear_locked(struct device *dev)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ clear_bit(NDD_LOCKED, &nvdimm->flags);
+}
+
+static void nvdimm_release(struct device *dev)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ ida_simple_remove(&dimm_ida, nvdimm->id);
+ kfree(nvdimm);
+}
+
+static struct device_type nvdimm_device_type = {
+ .name = "nvdimm",
+ .release = nvdimm_release,
+};
+
+bool is_nvdimm(struct device *dev)
+{
+ return dev->type == &nvdimm_device_type;
+}
+
+struct nvdimm *to_nvdimm(struct device *dev)
+{
+ struct nvdimm *nvdimm = container_of(dev, struct nvdimm, dev);
+
+ WARN_ON(!is_nvdimm(dev));
+ return nvdimm;
+}
+EXPORT_SYMBOL_GPL(to_nvdimm);
+
+struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr)
+{
+ struct nd_region *nd_region = &ndbr->nd_region;
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+
+ return nd_mapping->nvdimm;
+}
+EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm);
+
+unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr)
+{
+ /* pmem mapping properties are private to libnvdimm */
+ return ARCH_MEMREMAP_PMEM;
+}
+EXPORT_SYMBOL_GPL(nd_blk_memremap_flags);
+
+struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping)
+{
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev));
+
+ return dev_get_drvdata(&nvdimm->dev);
+}
+EXPORT_SYMBOL(to_ndd);
+
+void nvdimm_drvdata_release(struct kref *kref)
+{
+ struct nvdimm_drvdata *ndd = container_of(kref, typeof(*ndd), kref);
+ struct device *dev = ndd->dev;
+ struct resource *res, *_r;
+
+ dev_dbg(dev, "trace\n");
+ nvdimm_bus_lock(dev);
+ for_each_dpa_resource_safe(ndd, res, _r)
+ nvdimm_free_dpa(ndd, res);
+ nvdimm_bus_unlock(dev);
+
+ kvfree(ndd->data);
+ kfree(ndd);
+ put_device(dev);
+}
+
+void get_ndd(struct nvdimm_drvdata *ndd)
+{
+ kref_get(&ndd->kref);
+}
+
+void put_ndd(struct nvdimm_drvdata *ndd)
+{
+ if (ndd)
+ kref_put(&ndd->kref, nvdimm_drvdata_release);
+}
+
+const char *nvdimm_name(struct nvdimm *nvdimm)
+{
+ return dev_name(&nvdimm->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_name);
+
+struct kobject *nvdimm_kobj(struct nvdimm *nvdimm)
+{
+ return &nvdimm->dev.kobj;
+}
+EXPORT_SYMBOL_GPL(nvdimm_kobj);
+
+unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm)
+{
+ return nvdimm->cmd_mask;
+}
+EXPORT_SYMBOL_GPL(nvdimm_cmd_mask);
+
+void *nvdimm_provider_data(struct nvdimm *nvdimm)
+{
+ if (nvdimm)
+ return nvdimm->provider_data;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nvdimm_provider_data);
+
+static ssize_t commands_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ int cmd, len = 0;
+
+ if (!nvdimm->cmd_mask)
+ return sprintf(buf, "\n");
+
+ for_each_set_bit(cmd, &nvdimm->cmd_mask, BITS_PER_LONG)
+ len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd));
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+static DEVICE_ATTR_RO(commands);
+
+static ssize_t flags_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ return sprintf(buf, "%s%s\n",
+ test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "",
+ test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : "");
+}
+static DEVICE_ATTR_RO(flags);
+
+static ssize_t state_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+
+ /*
+ * The state may be in the process of changing, userspace should
+ * quiesce probing if it wants a static answer
+ */
+ nvdimm_bus_lock(dev);
+ nvdimm_bus_unlock(dev);
+ return sprintf(buf, "%s\n", atomic_read(&nvdimm->busy)
+ ? "active" : "idle");
+}
+static DEVICE_ATTR_RO(state);
+
+static ssize_t __available_slots_show(struct nvdimm_drvdata *ndd, char *buf)
+{
+ struct device *dev;
+ ssize_t rc;
+ u32 nfree;
+
+ if (!ndd)
+ return -ENXIO;
+
+ dev = ndd->dev;
+ nvdimm_bus_lock(dev);
+ nfree = nd_label_nfree(ndd);
+ if (nfree - 1 > nfree) {
+ dev_WARN_ONCE(dev, 1, "we ate our last label?\n");
+ nfree = 0;
+ } else
+ nfree--;
+ rc = sprintf(buf, "%d\n", nfree);
+ nvdimm_bus_unlock(dev);
+ return rc;
+}
+
+static ssize_t available_slots_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ ssize_t rc;
+
+ device_lock(dev);
+ rc = __available_slots_show(dev_get_drvdata(dev), buf);
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(available_slots);
+
+static struct attribute *nvdimm_attributes[] = {
+ &dev_attr_state.attr,
+ &dev_attr_flags.attr,
+ &dev_attr_commands.attr,
+ &dev_attr_available_slots.attr,
+ NULL,
+};
+
+struct attribute_group nvdimm_attribute_group = {
+ .attrs = nvdimm_attributes,
+};
+EXPORT_SYMBOL_GPL(nvdimm_attribute_group);
+
+struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
+ const struct attribute_group **groups, unsigned long flags,
+ unsigned long cmd_mask, int num_flush,
+ struct resource *flush_wpq)
+{
+ struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL);
+ struct device *dev;
+
+ if (!nvdimm)
+ return NULL;
+
+ nvdimm->id = ida_simple_get(&dimm_ida, 0, 0, GFP_KERNEL);
+ if (nvdimm->id < 0) {
+ kfree(nvdimm);
+ return NULL;
+ }
+ nvdimm->provider_data = provider_data;
+ nvdimm->flags = flags;
+ nvdimm->cmd_mask = cmd_mask;
+ nvdimm->num_flush = num_flush;
+ nvdimm->flush_wpq = flush_wpq;
+ atomic_set(&nvdimm->busy, 0);
+ dev = &nvdimm->dev;
+ dev_set_name(dev, "nmem%d", nvdimm->id);
+ dev->parent = &nvdimm_bus->dev;
+ dev->type = &nvdimm_device_type;
+ dev->devt = MKDEV(nvdimm_major, nvdimm->id);
+ dev->groups = groups;
+ nd_device_register(dev);
+
+ return nvdimm;
+}
+EXPORT_SYMBOL_GPL(nvdimm_create);
+
+int alias_dpa_busy(struct device *dev, void *data)
+{
+ resource_size_t map_end, blk_start, new;
+ struct blk_alloc_info *info = data;
+ struct nd_mapping *nd_mapping;
+ struct nd_region *nd_region;
+ struct nvdimm_drvdata *ndd;
+ struct resource *res;
+ int i;
+
+ if (!is_memory(dev))
+ return 0;
+
+ nd_region = to_nd_region(dev);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ nd_mapping = &nd_region->mapping[i];
+ if (nd_mapping->nvdimm == info->nd_mapping->nvdimm)
+ break;
+ }
+
+ if (i >= nd_region->ndr_mappings)
+ return 0;
+
+ ndd = to_ndd(nd_mapping);
+ map_end = nd_mapping->start + nd_mapping->size - 1;
+ blk_start = nd_mapping->start;
+
+ /*
+ * In the allocation case ->res is set to free space that we are
+ * looking to validate against PMEM aliasing collision rules
+ * (i.e. BLK is allocated after all aliased PMEM).
+ */
+ if (info->res) {
+ if (info->res->start >= nd_mapping->start
+ && info->res->start < map_end)
+ /* pass */;
+ else
+ return 0;
+ }
+
+ retry:
+ /*
+ * Find the free dpa from the end of the last pmem allocation to
+ * the end of the interleave-set mapping.
+ */
+ for_each_dpa_resource(ndd, res) {
+ if (strncmp(res->name, "pmem", 4) != 0)
+ continue;
+ if ((res->start >= blk_start && res->start < map_end)
+ || (res->end >= blk_start
+ && res->end <= map_end)) {
+ new = max(blk_start, min(map_end + 1, res->end + 1));
+ if (new != blk_start) {
+ blk_start = new;
+ goto retry;
+ }
+ }
+ }
+
+ /* update the free space range with the probed blk_start */
+ if (info->res && blk_start > info->res->start) {
+ info->res->start = max(info->res->start, blk_start);
+ if (info->res->start > info->res->end)
+ info->res->end = info->res->start - 1;
+ return 1;
+ }
+
+ info->available -= blk_start - nd_mapping->start;
+
+ return 0;
+}
+
+/**
+ * nd_blk_available_dpa - account the unused dpa of BLK region
+ * @nd_mapping: container of dpa-resource-root + labels
+ *
+ * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges, but
+ * we arrange for them to never start at an lower dpa than the last
+ * PMEM allocation in an aliased region.
+ */
+resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct blk_alloc_info info = {
+ .nd_mapping = nd_mapping,
+ .available = nd_mapping->size,
+ .res = NULL,
+ };
+ struct resource *res;
+
+ if (!ndd)
+ return 0;
+
+ device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy);
+
+ /* now account for busy blk allocations in unaliased dpa */
+ for_each_dpa_resource(ndd, res) {
+ if (strncmp(res->name, "blk", 3) != 0)
+ continue;
+ info.available -= resource_size(res);
+ }
+
+ return info.available;
+}
+
+/**
+ * nd_pmem_max_contiguous_dpa - For the given dimm+region, return the max
+ * contiguous unallocated dpa range.
+ * @nd_region: constrain available space check to this reference region
+ * @nd_mapping: container of dpa-resource-root + labels
+ */
+resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nvdimm_bus *nvdimm_bus;
+ resource_size_t max = 0;
+ struct resource *res;
+
+ /* if a dimm is disabled the available capacity is zero */
+ if (!ndd)
+ return 0;
+
+ nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+ if (__reserve_free_pmem(&nd_region->dev, nd_mapping->nvdimm))
+ return 0;
+ for_each_dpa_resource(ndd, res) {
+ if (strcmp(res->name, "pmem-reserve") != 0)
+ continue;
+ if (resource_size(res) > max)
+ max = resource_size(res);
+ }
+ release_free_pmem(nvdimm_bus, nd_mapping);
+ return max;
+}
+
+/**
+ * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa
+ * @nd_mapping: container of dpa-resource-root + labels
+ * @nd_region: constrain available space check to this reference region
+ * @overlap: calculate available space assuming this level of overlap
+ *
+ * Validate that a PMEM label, if present, aligns with the start of an
+ * interleave set and truncate the available size at the lowest BLK
+ * overlap point.
+ *
+ * The expectation is that this routine is called multiple times as it
+ * probes for the largest BLK encroachment for any single member DIMM of
+ * the interleave set. Once that value is determined the PMEM-limit for
+ * the set can be established.
+ */
+resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, resource_size_t *overlap)
+{
+ resource_size_t map_start, map_end, busy = 0, available, blk_start;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res;
+ const char *reason;
+
+ if (!ndd)
+ return 0;
+
+ map_start = nd_mapping->start;
+ map_end = map_start + nd_mapping->size - 1;
+ blk_start = max(map_start, map_end + 1 - *overlap);
+ for_each_dpa_resource(ndd, res) {
+ if (res->start >= map_start && res->start < map_end) {
+ if (strncmp(res->name, "blk", 3) == 0)
+ blk_start = min(blk_start,
+ max(map_start, res->start));
+ else if (res->end > map_end) {
+ reason = "misaligned to iset";
+ goto err;
+ } else
+ busy += resource_size(res);
+ } else if (res->end >= map_start && res->end <= map_end) {
+ if (strncmp(res->name, "blk", 3) == 0) {
+ /*
+ * If a BLK allocation overlaps the start of
+ * PMEM the entire interleave set may now only
+ * be used for BLK.
+ */
+ blk_start = map_start;
+ } else
+ busy += resource_size(res);
+ } else if (map_start > res->start && map_start < res->end) {
+ /* total eclipse of the mapping */
+ busy += nd_mapping->size;
+ blk_start = map_start;
+ }
+ }
+
+ *overlap = map_end + 1 - blk_start;
+ available = blk_start - map_start;
+ if (busy < available)
+ return available - busy;
+ return 0;
+
+ err:
+ nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason);
+ return 0;
+}
+
+void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res)
+{
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev));
+ kfree(res->name);
+ __release_region(&ndd->dpa, res->start, resource_size(res));
+}
+
+struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
+ struct nd_label_id *label_id, resource_size_t start,
+ resource_size_t n)
+{
+ char *name = kmemdup(label_id, sizeof(*label_id), GFP_KERNEL);
+ struct resource *res;
+
+ if (!name)
+ return NULL;
+
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev));
+ res = __request_region(&ndd->dpa, start, n, name, 0);
+ if (!res)
+ kfree(name);
+ return res;
+}
+
+/**
+ * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id
+ * @nvdimm: container of dpa-resource-root + labels
+ * @label_id: dpa resource name of the form {pmem|blk}-<human readable uuid>
+ */
+resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
+ struct nd_label_id *label_id)
+{
+ resource_size_t allocated = 0;
+ struct resource *res;
+
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id->id) == 0)
+ allocated += resource_size(res);
+
+ return allocated;
+}
+
+static int count_dimms(struct device *dev, void *c)
+{
+ int *count = c;
+
+ if (is_nvdimm(dev))
+ (*count)++;
+ return 0;
+}
+
+int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count)
+{
+ int count = 0;
+ /* Flush any possible dimm registration failures */
+ nd_synchronize();
+
+ device_for_each_child(&nvdimm_bus->dev, &count, count_dimms);
+ dev_dbg(&nvdimm_bus->dev, "count: %d\n", count);
+ if (count != dimm_count)
+ return -ENXIO;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count);
+
+void __exit nvdimm_devs_exit(void)
+{
+ ida_destroy(&dimm_ida);
+}
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
new file mode 100644
index 000000000..521eaf53a
--- /dev/null
+++ b/drivers/nvdimm/e820.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ * Copyright (c) 2015, Intel Corporation.
+ */
+#include <linux/platform_device.h>
+#include <linux/memory_hotplug.h>
+#include <linux/libnvdimm.h>
+#include <linux/module.h>
+
+static const struct attribute_group *e820_pmem_attribute_groups[] = {
+ &nvdimm_bus_attribute_group,
+ NULL,
+};
+
+static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
+ &nd_region_attribute_group,
+ &nd_device_attribute_group,
+ NULL,
+};
+
+static int e820_pmem_remove(struct platform_device *pdev)
+{
+ struct nvdimm_bus *nvdimm_bus = platform_get_drvdata(pdev);
+
+ nvdimm_bus_unregister(nvdimm_bus);
+ return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int e820_range_to_nid(resource_size_t addr)
+{
+ return memory_add_physaddr_to_nid(addr);
+}
+#else
+static int e820_range_to_nid(resource_size_t addr)
+{
+ return NUMA_NO_NODE;
+}
+#endif
+
+static int e820_register_one(struct resource *res, void *data)
+{
+ struct nd_region_desc ndr_desc;
+ struct nvdimm_bus *nvdimm_bus = data;
+
+ memset(&ndr_desc, 0, sizeof(ndr_desc));
+ ndr_desc.res = res;
+ ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
+ ndr_desc.numa_node = e820_range_to_nid(res->start);
+ set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
+ if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
+ return -ENXIO;
+ return 0;
+}
+
+static int e820_pmem_probe(struct platform_device *pdev)
+{
+ static struct nvdimm_bus_descriptor nd_desc;
+ struct device *dev = &pdev->dev;
+ struct nvdimm_bus *nvdimm_bus;
+ int rc = -ENXIO;
+
+ nd_desc.attr_groups = e820_pmem_attribute_groups;
+ nd_desc.provider_name = "e820";
+ nd_desc.module = THIS_MODULE;
+ nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
+ if (!nvdimm_bus)
+ goto err;
+ platform_set_drvdata(pdev, nvdimm_bus);
+
+ rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+ IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one);
+ if (rc)
+ goto err;
+ return 0;
+err:
+ nvdimm_bus_unregister(nvdimm_bus);
+ dev_err(dev, "failed to register legacy persistent memory ranges\n");
+ return rc;
+}
+
+static struct platform_driver e820_pmem_driver = {
+ .probe = e820_pmem_probe,
+ .remove = e820_pmem_remove,
+ .driver = {
+ .name = "e820_pmem",
+ },
+};
+
+module_platform_driver(e820_pmem_driver);
+
+MODULE_ALIAS("platform:e820_pmem*");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
new file mode 100644
index 000000000..19e3469d5
--- /dev/null
+++ b/drivers/nvdimm/label.c
@@ -0,0 +1,1212 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/device.h>
+#include <linux/ndctl.h>
+#include <linux/uuid.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/nd.h>
+#include "nd-core.h"
+#include "label.h"
+#include "nd.h"
+
+static guid_t nvdimm_btt_guid;
+static guid_t nvdimm_btt2_guid;
+static guid_t nvdimm_pfn_guid;
+static guid_t nvdimm_dax_guid;
+
+static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0";
+
+static u32 best_seq(u32 a, u32 b)
+{
+ a &= NSINDEX_SEQ_MASK;
+ b &= NSINDEX_SEQ_MASK;
+
+ if (a == 0 || a == b)
+ return b;
+ else if (b == 0)
+ return a;
+ else if (nd_inc_seq(a) == b)
+ return b;
+ else
+ return a;
+}
+
+unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd)
+{
+ return ndd->nslabel_size;
+}
+
+static size_t __sizeof_namespace_index(u32 nslot)
+{
+ return ALIGN(sizeof(struct nd_namespace_index) + DIV_ROUND_UP(nslot, 8),
+ NSINDEX_ALIGN);
+}
+
+static int __nvdimm_num_label_slots(struct nvdimm_drvdata *ndd,
+ size_t index_size)
+{
+ return (ndd->nsarea.config_size - index_size * 2) /
+ sizeof_namespace_label(ndd);
+}
+
+int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd)
+{
+ u32 tmp_nslot, n;
+
+ tmp_nslot = ndd->nsarea.config_size / sizeof_namespace_label(ndd);
+ n = __sizeof_namespace_index(tmp_nslot) / NSINDEX_ALIGN;
+
+ return __nvdimm_num_label_slots(ndd, NSINDEX_ALIGN * n);
+}
+
+size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
+{
+ u32 nslot, space, size;
+
+ /*
+ * Per UEFI 2.7, the minimum size of the Label Storage Area is large
+ * enough to hold 2 index blocks and 2 labels. The minimum index
+ * block size is 256 bytes, and the minimum label size is 256 bytes.
+ */
+ nslot = nvdimm_num_label_slots(ndd);
+ space = ndd->nsarea.config_size - nslot * sizeof_namespace_label(ndd);
+ size = __sizeof_namespace_index(nslot) * 2;
+ if (size <= space && nslot >= 2)
+ return size / 2;
+
+ dev_err(ndd->dev, "label area (%d) too small to host (%d byte) labels\n",
+ ndd->nsarea.config_size, sizeof_namespace_label(ndd));
+ return 0;
+}
+
+static int __nd_label_validate(struct nvdimm_drvdata *ndd)
+{
+ /*
+ * On media label format consists of two index blocks followed
+ * by an array of labels. None of these structures are ever
+ * updated in place. A sequence number tracks the current
+ * active index and the next one to write, while labels are
+ * written to free slots.
+ *
+ * +------------+
+ * | |
+ * | nsindex0 |
+ * | |
+ * +------------+
+ * | |
+ * | nsindex1 |
+ * | |
+ * +------------+
+ * | label0 |
+ * +------------+
+ * | label1 |
+ * +------------+
+ * | |
+ * ....nslot...
+ * | |
+ * +------------+
+ * | labelN |
+ * +------------+
+ */
+ struct nd_namespace_index *nsindex[] = {
+ to_namespace_index(ndd, 0),
+ to_namespace_index(ndd, 1),
+ };
+ const int num_index = ARRAY_SIZE(nsindex);
+ struct device *dev = ndd->dev;
+ bool valid[2] = { 0 };
+ int i, num_valid = 0;
+ u32 seq;
+
+ for (i = 0; i < num_index; i++) {
+ u32 nslot;
+ u8 sig[NSINDEX_SIG_LEN];
+ u64 sum_save, sum, size;
+ unsigned int version, labelsize;
+
+ memcpy(sig, nsindex[i]->sig, NSINDEX_SIG_LEN);
+ if (memcmp(sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN) != 0) {
+ dev_dbg(dev, "nsindex%d signature invalid\n", i);
+ continue;
+ }
+
+ /* label sizes larger than 128 arrived with v1.2 */
+ version = __le16_to_cpu(nsindex[i]->major) * 100
+ + __le16_to_cpu(nsindex[i]->minor);
+ if (version >= 102)
+ labelsize = 1 << (7 + nsindex[i]->labelsize);
+ else
+ labelsize = 128;
+
+ if (labelsize != sizeof_namespace_label(ndd)) {
+ dev_dbg(dev, "nsindex%d labelsize %d invalid\n",
+ i, nsindex[i]->labelsize);
+ continue;
+ }
+
+ sum_save = __le64_to_cpu(nsindex[i]->checksum);
+ nsindex[i]->checksum = __cpu_to_le64(0);
+ sum = nd_fletcher64(nsindex[i], sizeof_namespace_index(ndd), 1);
+ nsindex[i]->checksum = __cpu_to_le64(sum_save);
+ if (sum != sum_save) {
+ dev_dbg(dev, "nsindex%d checksum invalid\n", i);
+ continue;
+ }
+
+ seq = __le32_to_cpu(nsindex[i]->seq);
+ if ((seq & NSINDEX_SEQ_MASK) == 0) {
+ dev_dbg(dev, "nsindex%d sequence: %#x invalid\n", i, seq);
+ continue;
+ }
+
+ /* sanity check the index against expected values */
+ if (__le64_to_cpu(nsindex[i]->myoff)
+ != i * sizeof_namespace_index(ndd)) {
+ dev_dbg(dev, "nsindex%d myoff: %#llx invalid\n",
+ i, (unsigned long long)
+ __le64_to_cpu(nsindex[i]->myoff));
+ continue;
+ }
+ if (__le64_to_cpu(nsindex[i]->otheroff)
+ != (!i) * sizeof_namespace_index(ndd)) {
+ dev_dbg(dev, "nsindex%d otheroff: %#llx invalid\n",
+ i, (unsigned long long)
+ __le64_to_cpu(nsindex[i]->otheroff));
+ continue;
+ }
+
+ size = __le64_to_cpu(nsindex[i]->mysize);
+ if (size > sizeof_namespace_index(ndd)
+ || size < sizeof(struct nd_namespace_index)) {
+ dev_dbg(dev, "nsindex%d mysize: %#llx invalid\n", i, size);
+ continue;
+ }
+
+ nslot = __le32_to_cpu(nsindex[i]->nslot);
+ if (nslot * sizeof_namespace_label(ndd)
+ + 2 * sizeof_namespace_index(ndd)
+ > ndd->nsarea.config_size) {
+ dev_dbg(dev, "nsindex%d nslot: %u invalid, config_size: %#x\n",
+ i, nslot, ndd->nsarea.config_size);
+ continue;
+ }
+ valid[i] = true;
+ num_valid++;
+ }
+
+ switch (num_valid) {
+ case 0:
+ break;
+ case 1:
+ for (i = 0; i < num_index; i++)
+ if (valid[i])
+ return i;
+ /* can't have num_valid > 0 but valid[] = { false, false } */
+ WARN_ON(1);
+ break;
+ default:
+ /* pick the best index... */
+ seq = best_seq(__le32_to_cpu(nsindex[0]->seq),
+ __le32_to_cpu(nsindex[1]->seq));
+ if (seq == (__le32_to_cpu(nsindex[1]->seq) & NSINDEX_SEQ_MASK))
+ return 1;
+ else
+ return 0;
+ break;
+ }
+
+ return -1;
+}
+
+int nd_label_validate(struct nvdimm_drvdata *ndd)
+{
+ /*
+ * In order to probe for and validate namespace index blocks we
+ * need to know the size of the labels, and we can't trust the
+ * size of the labels until we validate the index blocks.
+ * Resolve this dependency loop by probing for known label
+ * sizes, but default to v1.2 256-byte namespace labels if
+ * discovery fails.
+ */
+ int label_size[] = { 128, 256 };
+ int i, rc;
+
+ for (i = 0; i < ARRAY_SIZE(label_size); i++) {
+ ndd->nslabel_size = label_size[i];
+ rc = __nd_label_validate(ndd);
+ if (rc >= 0)
+ return rc;
+ }
+
+ return -1;
+}
+
+void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst,
+ struct nd_namespace_index *src)
+{
+ if (dst && src)
+ /* pass */;
+ else
+ return;
+
+ memcpy(dst, src, sizeof_namespace_index(ndd));
+}
+
+static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd)
+{
+ void *base = to_namespace_index(ndd, 0);
+
+ return base + 2 * sizeof_namespace_index(ndd);
+}
+
+static int to_slot(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_label *nd_label)
+{
+ unsigned long label, base;
+
+ label = (unsigned long) nd_label;
+ base = (unsigned long) nd_label_base(ndd);
+
+ return (label - base) / sizeof_namespace_label(ndd);
+}
+
+static struct nd_namespace_label *to_label(struct nvdimm_drvdata *ndd, int slot)
+{
+ unsigned long label, base;
+
+ base = (unsigned long) nd_label_base(ndd);
+ label = base + sizeof_namespace_label(ndd) * slot;
+
+ return (struct nd_namespace_label *) label;
+}
+
+#define for_each_clear_bit_le(bit, addr, size) \
+ for ((bit) = find_next_zero_bit_le((addr), (size), 0); \
+ (bit) < (size); \
+ (bit) = find_next_zero_bit_le((addr), (size), (bit) + 1))
+
+/**
+ * preamble_index - common variable initialization for nd_label_* routines
+ * @ndd: dimm container for the relevant label set
+ * @idx: namespace_index index
+ * @nsindex_out: on return set to the currently active namespace index
+ * @free: on return set to the free label bitmap in the index
+ * @nslot: on return set to the number of slots in the label space
+ */
+static bool preamble_index(struct nvdimm_drvdata *ndd, int idx,
+ struct nd_namespace_index **nsindex_out,
+ unsigned long **free, u32 *nslot)
+{
+ struct nd_namespace_index *nsindex;
+
+ nsindex = to_namespace_index(ndd, idx);
+ if (nsindex == NULL)
+ return false;
+
+ *free = (unsigned long *) nsindex->free;
+ *nslot = __le32_to_cpu(nsindex->nslot);
+ *nsindex_out = nsindex;
+
+ return true;
+}
+
+char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags)
+{
+ if (!label_id || !uuid)
+ return NULL;
+ snprintf(label_id->id, ND_LABEL_ID_SIZE, "%s-%pUb",
+ flags & NSLABEL_FLAG_LOCAL ? "blk" : "pmem", uuid);
+ return label_id->id;
+}
+
+static bool preamble_current(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_index **nsindex,
+ unsigned long **free, u32 *nslot)
+{
+ return preamble_index(ndd, ndd->ns_current, nsindex,
+ free, nslot);
+}
+
+static bool preamble_next(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_index **nsindex,
+ unsigned long **free, u32 *nslot)
+{
+ return preamble_index(ndd, ndd->ns_next, nsindex,
+ free, nslot);
+}
+
+static bool slot_valid(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_label *nd_label, u32 slot)
+{
+ /* check that we are written where we expect to be written */
+ if (slot != __le32_to_cpu(nd_label->slot))
+ return false;
+
+ /* check that DPA allocations are page aligned */
+ if ((__le64_to_cpu(nd_label->dpa)
+ | __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
+ return false;
+
+ /* check checksum */
+ if (namespace_label_has(ndd, checksum)) {
+ u64 sum, sum_save;
+
+ sum_save = __le64_to_cpu(nd_label->checksum);
+ nd_label->checksum = __cpu_to_le64(0);
+ sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1);
+ nd_label->checksum = __cpu_to_le64(sum_save);
+ if (sum != sum_save) {
+ dev_dbg(ndd->dev, "fail checksum. slot: %d expect: %#llx\n",
+ slot, sum);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot, slot;
+
+ if (!preamble_current(ndd, &nsindex, &free, &nslot))
+ return 0; /* no label, nothing to reserve */
+
+ for_each_clear_bit_le(slot, free, nslot) {
+ struct nd_namespace_label *nd_label;
+ struct nd_region *nd_region = NULL;
+ u8 label_uuid[NSLABEL_UUID_LEN];
+ struct nd_label_id label_id;
+ struct resource *res;
+ u32 flags;
+
+ nd_label = to_label(ndd, slot);
+
+ if (!slot_valid(ndd, nd_label, slot))
+ continue;
+
+ memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+ flags = __le32_to_cpu(nd_label->flags);
+ nd_label_gen_id(&label_id, label_uuid, flags);
+ res = nvdimm_allocate_dpa(ndd, &label_id,
+ __le64_to_cpu(nd_label->dpa),
+ __le64_to_cpu(nd_label->rawsize));
+ nd_dbg_dpa(nd_region, ndd, res, "reserve\n");
+ if (!res)
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+int nd_label_active_count(struct nvdimm_drvdata *ndd)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot, slot;
+ int count = 0;
+
+ if (!preamble_current(ndd, &nsindex, &free, &nslot))
+ return 0;
+
+ for_each_clear_bit_le(slot, free, nslot) {
+ struct nd_namespace_label *nd_label;
+
+ nd_label = to_label(ndd, slot);
+
+ if (!slot_valid(ndd, nd_label, slot)) {
+ u32 label_slot = __le32_to_cpu(nd_label->slot);
+ u64 size = __le64_to_cpu(nd_label->rawsize);
+ u64 dpa = __le64_to_cpu(nd_label->dpa);
+
+ dev_dbg(ndd->dev,
+ "slot%d invalid slot: %d dpa: %llx size: %llx\n",
+ slot, label_slot, dpa, size);
+ continue;
+ }
+ count++;
+ }
+ return count;
+}
+
+struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot, slot;
+
+ if (!preamble_current(ndd, &nsindex, &free, &nslot))
+ return NULL;
+
+ for_each_clear_bit_le(slot, free, nslot) {
+ struct nd_namespace_label *nd_label;
+
+ nd_label = to_label(ndd, slot);
+ if (!slot_valid(ndd, nd_label, slot))
+ continue;
+
+ if (n-- == 0)
+ return to_label(ndd, slot);
+ }
+
+ return NULL;
+}
+
+u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot, slot;
+
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return UINT_MAX;
+
+ WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
+
+ slot = find_next_bit_le(free, nslot, 0);
+ if (slot == nslot)
+ return UINT_MAX;
+
+ clear_bit_le(slot, free);
+
+ return slot;
+}
+
+bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot;
+
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return false;
+
+ WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
+
+ if (slot < nslot)
+ return !test_and_set_bit_le(slot, free);
+ return false;
+}
+
+u32 nd_label_nfree(struct nvdimm_drvdata *ndd)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long *free;
+ u32 nslot;
+
+ WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
+
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return nvdimm_num_label_slots(ndd);
+
+ return bitmap_weight(free, nslot);
+}
+
+static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq,
+ unsigned long flags)
+{
+ struct nd_namespace_index *nsindex;
+ unsigned long offset;
+ u64 checksum;
+ u32 nslot;
+ int rc;
+
+ nsindex = to_namespace_index(ndd, index);
+ if (flags & ND_NSINDEX_INIT)
+ nslot = nvdimm_num_label_slots(ndd);
+ else
+ nslot = __le32_to_cpu(nsindex->nslot);
+
+ memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN);
+ memset(&nsindex->flags, 0, 3);
+ nsindex->labelsize = sizeof_namespace_label(ndd) >> 8;
+ nsindex->seq = __cpu_to_le32(seq);
+ offset = (unsigned long) nsindex
+ - (unsigned long) to_namespace_index(ndd, 0);
+ nsindex->myoff = __cpu_to_le64(offset);
+ nsindex->mysize = __cpu_to_le64(sizeof_namespace_index(ndd));
+ offset = (unsigned long) to_namespace_index(ndd,
+ nd_label_next_nsindex(index))
+ - (unsigned long) to_namespace_index(ndd, 0);
+ nsindex->otheroff = __cpu_to_le64(offset);
+ offset = (unsigned long) nd_label_base(ndd)
+ - (unsigned long) to_namespace_index(ndd, 0);
+ nsindex->labeloff = __cpu_to_le64(offset);
+ nsindex->nslot = __cpu_to_le32(nslot);
+ nsindex->major = __cpu_to_le16(1);
+ if (sizeof_namespace_label(ndd) < 256)
+ nsindex->minor = __cpu_to_le16(1);
+ else
+ nsindex->minor = __cpu_to_le16(2);
+ nsindex->checksum = __cpu_to_le64(0);
+ if (flags & ND_NSINDEX_INIT) {
+ unsigned long *free = (unsigned long *) nsindex->free;
+ u32 nfree = ALIGN(nslot, BITS_PER_LONG);
+ int last_bits, i;
+
+ memset(nsindex->free, 0xff, nfree / 8);
+ for (i = 0, last_bits = nfree - nslot; i < last_bits; i++)
+ clear_bit_le(nslot + i, free);
+ }
+ checksum = nd_fletcher64(nsindex, sizeof_namespace_index(ndd), 1);
+ nsindex->checksum = __cpu_to_le64(checksum);
+ rc = nvdimm_set_config_data(ndd, __le64_to_cpu(nsindex->myoff),
+ nsindex, sizeof_namespace_index(ndd));
+ if (rc < 0)
+ return rc;
+
+ if (flags & ND_NSINDEX_INIT)
+ return 0;
+
+ /* copy the index we just wrote to the new 'next' */
+ WARN_ON(index != ndd->ns_next);
+ nd_label_copy(ndd, to_current_namespace_index(ndd), nsindex);
+ ndd->ns_current = nd_label_next_nsindex(ndd->ns_current);
+ ndd->ns_next = nd_label_next_nsindex(ndd->ns_next);
+ WARN_ON(ndd->ns_current == ndd->ns_next);
+
+ return 0;
+}
+
+static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_label *nd_label)
+{
+ return (unsigned long) nd_label
+ - (unsigned long) to_namespace_index(ndd, 0);
+}
+
+enum nvdimm_claim_class to_nvdimm_cclass(guid_t *guid)
+{
+ if (guid_equal(guid, &nvdimm_btt_guid))
+ return NVDIMM_CCLASS_BTT;
+ else if (guid_equal(guid, &nvdimm_btt2_guid))
+ return NVDIMM_CCLASS_BTT2;
+ else if (guid_equal(guid, &nvdimm_pfn_guid))
+ return NVDIMM_CCLASS_PFN;
+ else if (guid_equal(guid, &nvdimm_dax_guid))
+ return NVDIMM_CCLASS_DAX;
+ else if (guid_equal(guid, &guid_null))
+ return NVDIMM_CCLASS_NONE;
+
+ return NVDIMM_CCLASS_UNKNOWN;
+}
+
+static const guid_t *to_abstraction_guid(enum nvdimm_claim_class claim_class,
+ guid_t *target)
+{
+ if (claim_class == NVDIMM_CCLASS_BTT)
+ return &nvdimm_btt_guid;
+ else if (claim_class == NVDIMM_CCLASS_BTT2)
+ return &nvdimm_btt2_guid;
+ else if (claim_class == NVDIMM_CCLASS_PFN)
+ return &nvdimm_pfn_guid;
+ else if (claim_class == NVDIMM_CCLASS_DAX)
+ return &nvdimm_dax_guid;
+ else if (claim_class == NVDIMM_CCLASS_UNKNOWN) {
+ /*
+ * If we're modifying a namespace for which we don't
+ * know the claim_class, don't touch the existing guid.
+ */
+ return target;
+ } else
+ return &guid_null;
+}
+
+static void reap_victim(struct nd_mapping *nd_mapping,
+ struct nd_label_ent *victim)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ u32 slot = to_slot(ndd, victim->label);
+
+ dev_dbg(ndd->dev, "free: %d\n", slot);
+ nd_label_free_slot(ndd, slot);
+ victim->label = NULL;
+}
+
+static int __pmem_label_update(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm,
+ int pos, unsigned long flags)
+{
+ struct nd_namespace_common *ndns = &nspm->nsio.common;
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_label *nd_label;
+ struct nd_namespace_index *nsindex;
+ struct nd_label_ent *label_ent;
+ struct nd_label_id label_id;
+ struct resource *res;
+ unsigned long *free;
+ u32 nslot, slot;
+ size_t offset;
+ u64 cookie;
+ int rc;
+
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return -ENXIO;
+
+ cookie = nd_region_interleave_set_cookie(nd_region, nsindex);
+ nd_label_gen_id(&label_id, nspm->uuid, 0);
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id.id) == 0)
+ break;
+
+ if (!res) {
+ WARN_ON_ONCE(1);
+ return -ENXIO;
+ }
+
+ /* allocate and write the label to the staging (next) index */
+ slot = nd_label_alloc_slot(ndd);
+ if (slot == UINT_MAX)
+ return -ENXIO;
+ dev_dbg(ndd->dev, "allocated: %d\n", slot);
+
+ nd_label = to_label(ndd, slot);
+ memset(nd_label, 0, sizeof_namespace_label(ndd));
+ memcpy(nd_label->uuid, nspm->uuid, NSLABEL_UUID_LEN);
+ if (nspm->alt_name)
+ memcpy(nd_label->name, nspm->alt_name, NSLABEL_NAME_LEN);
+ nd_label->flags = __cpu_to_le32(flags);
+ nd_label->nlabel = __cpu_to_le16(nd_region->ndr_mappings);
+ nd_label->position = __cpu_to_le16(pos);
+ nd_label->isetcookie = __cpu_to_le64(cookie);
+ nd_label->rawsize = __cpu_to_le64(resource_size(res));
+ nd_label->lbasize = __cpu_to_le64(nspm->lbasize);
+ nd_label->dpa = __cpu_to_le64(res->start);
+ nd_label->slot = __cpu_to_le32(slot);
+ if (namespace_label_has(ndd, type_guid))
+ guid_copy(&nd_label->type_guid, &nd_set->type_guid);
+ if (namespace_label_has(ndd, abstraction_guid))
+ guid_copy(&nd_label->abstraction_guid,
+ to_abstraction_guid(ndns->claim_class,
+ &nd_label->abstraction_guid));
+ if (namespace_label_has(ndd, checksum)) {
+ u64 sum;
+
+ nd_label->checksum = __cpu_to_le64(0);
+ sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1);
+ nd_label->checksum = __cpu_to_le64(sum);
+ }
+ nd_dbg_dpa(nd_region, ndd, res, "\n");
+
+ /* update label */
+ offset = nd_label_offset(ndd, nd_label);
+ rc = nvdimm_set_config_data(ndd, offset, nd_label,
+ sizeof_namespace_label(ndd));
+ if (rc < 0)
+ return rc;
+
+ /* Garbage collect the previous label */
+ mutex_lock(&nd_mapping->lock);
+ list_for_each_entry(label_ent, &nd_mapping->labels, list) {
+ if (!label_ent->label)
+ continue;
+ if (test_and_clear_bit(ND_LABEL_REAP, &label_ent->flags)
+ || memcmp(nspm->uuid, label_ent->label->uuid,
+ NSLABEL_UUID_LEN) == 0)
+ reap_victim(nd_mapping, label_ent);
+ }
+
+ /* update index */
+ rc = nd_label_write_index(ndd, ndd->ns_next,
+ nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
+ if (rc == 0) {
+ list_for_each_entry(label_ent, &nd_mapping->labels, list)
+ if (!label_ent->label) {
+ label_ent->label = nd_label;
+ nd_label = NULL;
+ break;
+ }
+ dev_WARN_ONCE(&nspm->nsio.common.dev, nd_label,
+ "failed to track label: %d\n",
+ to_slot(ndd, nd_label));
+ if (nd_label)
+ rc = -ENXIO;
+ }
+ mutex_unlock(&nd_mapping->lock);
+
+ return rc;
+}
+
+static bool is_old_resource(struct resource *res, struct resource **list, int n)
+{
+ int i;
+
+ if (res->flags & DPA_RESOURCE_ADJUSTED)
+ return false;
+ for (i = 0; i < n; i++)
+ if (res == list[i])
+ return true;
+ return false;
+}
+
+static struct resource *to_resource(struct nvdimm_drvdata *ndd,
+ struct nd_namespace_label *nd_label)
+{
+ struct resource *res;
+
+ for_each_dpa_resource(ndd, res) {
+ if (res->start != __le64_to_cpu(nd_label->dpa))
+ continue;
+ if (resource_size(res) != __le64_to_cpu(nd_label->rawsize))
+ continue;
+ return res;
+ }
+
+ return NULL;
+}
+
+/*
+ * 1/ Account all the labels that can be freed after this update
+ * 2/ Allocate and write the label to the staging (next) index
+ * 3/ Record the resources in the namespace device
+ */
+static int __blk_label_update(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk,
+ int num_labels)
+{
+ int i, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO;
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+ struct nd_namespace_common *ndns = &nsblk->common;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_label *nd_label;
+ struct nd_label_ent *label_ent, *e;
+ struct nd_namespace_index *nsindex;
+ unsigned long *free, *victim_map = NULL;
+ struct resource *res, **old_res_list;
+ struct nd_label_id label_id;
+ u8 uuid[NSLABEL_UUID_LEN];
+ int min_dpa_idx = 0;
+ LIST_HEAD(list);
+ u32 nslot, slot;
+
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return -ENXIO;
+
+ old_res_list = nsblk->res;
+ nfree = nd_label_nfree(ndd);
+ old_num_resources = nsblk->num_resources;
+ nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+
+ /*
+ * We need to loop over the old resources a few times, which seems a
+ * bit inefficient, but we need to know that we have the label
+ * space before we start mutating the tracking structures.
+ * Otherwise the recovery method of last resort for userspace is
+ * disable and re-enable the parent region.
+ */
+ alloc = 0;
+ for_each_dpa_resource(ndd, res) {
+ if (strcmp(res->name, label_id.id) != 0)
+ continue;
+ if (!is_old_resource(res, old_res_list, old_num_resources))
+ alloc++;
+ }
+
+ victims = 0;
+ if (old_num_resources) {
+ /* convert old local-label-map to dimm-slot victim-map */
+ victim_map = kcalloc(BITS_TO_LONGS(nslot), sizeof(long),
+ GFP_KERNEL);
+ if (!victim_map)
+ return -ENOMEM;
+
+ /* mark unused labels for garbage collection */
+ for_each_clear_bit_le(slot, free, nslot) {
+ nd_label = to_label(ndd, slot);
+ memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+ if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+ res = to_resource(ndd, nd_label);
+ if (res && is_old_resource(res, old_res_list,
+ old_num_resources))
+ continue;
+ slot = to_slot(ndd, nd_label);
+ set_bit(slot, victim_map);
+ victims++;
+ }
+ }
+
+ /* don't allow updates that consume the last label */
+ if (nfree - alloc < 0 || nfree - alloc + victims < 1) {
+ dev_info(&nsblk->common.dev, "insufficient label space\n");
+ kfree(victim_map);
+ return -ENOSPC;
+ }
+ /* from here on we need to abort on error */
+
+
+ /* assign all resources to the namespace before writing the labels */
+ nsblk->res = NULL;
+ nsblk->num_resources = 0;
+ for_each_dpa_resource(ndd, res) {
+ if (strcmp(res->name, label_id.id) != 0)
+ continue;
+ if (!nsblk_add_resource(nd_region, ndd, nsblk, res->start)) {
+ rc = -ENOMEM;
+ goto abort;
+ }
+ }
+
+ /* release slots associated with any invalidated UUIDs */
+ mutex_lock(&nd_mapping->lock);
+ list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list)
+ if (test_and_clear_bit(ND_LABEL_REAP, &label_ent->flags)) {
+ reap_victim(nd_mapping, label_ent);
+ list_move(&label_ent->list, &list);
+ }
+ mutex_unlock(&nd_mapping->lock);
+
+ /*
+ * Find the resource associated with the first label in the set
+ * per the v1.2 namespace specification.
+ */
+ for (i = 0; i < nsblk->num_resources; i++) {
+ struct resource *min = nsblk->res[min_dpa_idx];
+
+ res = nsblk->res[i];
+ if (res->start < min->start)
+ min_dpa_idx = i;
+ }
+
+ for (i = 0; i < nsblk->num_resources; i++) {
+ size_t offset;
+
+ res = nsblk->res[i];
+ if (is_old_resource(res, old_res_list, old_num_resources))
+ continue; /* carry-over */
+ slot = nd_label_alloc_slot(ndd);
+ if (slot == UINT_MAX) {
+ rc = -ENXIO;
+ goto abort;
+ }
+ dev_dbg(ndd->dev, "allocated: %d\n", slot);
+
+ nd_label = to_label(ndd, slot);
+ memset(nd_label, 0, sizeof_namespace_label(ndd));
+ memcpy(nd_label->uuid, nsblk->uuid, NSLABEL_UUID_LEN);
+ if (nsblk->alt_name)
+ memcpy(nd_label->name, nsblk->alt_name,
+ NSLABEL_NAME_LEN);
+ nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_LOCAL);
+
+ /*
+ * Use the presence of the type_guid as a flag to
+ * determine isetcookie usage and nlabel + position
+ * policy for blk-aperture namespaces.
+ */
+ if (namespace_label_has(ndd, type_guid)) {
+ if (i == min_dpa_idx) {
+ nd_label->nlabel = __cpu_to_le16(nsblk->num_resources);
+ nd_label->position = __cpu_to_le16(0);
+ } else {
+ nd_label->nlabel = __cpu_to_le16(0xffff);
+ nd_label->position = __cpu_to_le16(0xffff);
+ }
+ nd_label->isetcookie = __cpu_to_le64(nd_set->cookie2);
+ } else {
+ nd_label->nlabel = __cpu_to_le16(0); /* N/A */
+ nd_label->position = __cpu_to_le16(0); /* N/A */
+ nd_label->isetcookie = __cpu_to_le64(0); /* N/A */
+ }
+
+ nd_label->dpa = __cpu_to_le64(res->start);
+ nd_label->rawsize = __cpu_to_le64(resource_size(res));
+ nd_label->lbasize = __cpu_to_le64(nsblk->lbasize);
+ nd_label->slot = __cpu_to_le32(slot);
+ if (namespace_label_has(ndd, type_guid))
+ guid_copy(&nd_label->type_guid, &nd_set->type_guid);
+ if (namespace_label_has(ndd, abstraction_guid))
+ guid_copy(&nd_label->abstraction_guid,
+ to_abstraction_guid(ndns->claim_class,
+ &nd_label->abstraction_guid));
+
+ if (namespace_label_has(ndd, checksum)) {
+ u64 sum;
+
+ nd_label->checksum = __cpu_to_le64(0);
+ sum = nd_fletcher64(nd_label,
+ sizeof_namespace_label(ndd), 1);
+ nd_label->checksum = __cpu_to_le64(sum);
+ }
+
+ /* update label */
+ offset = nd_label_offset(ndd, nd_label);
+ rc = nvdimm_set_config_data(ndd, offset, nd_label,
+ sizeof_namespace_label(ndd));
+ if (rc < 0)
+ goto abort;
+ }
+
+ /* free up now unused slots in the new index */
+ for_each_set_bit(slot, victim_map, victim_map ? nslot : 0) {
+ dev_dbg(ndd->dev, "free: %d\n", slot);
+ nd_label_free_slot(ndd, slot);
+ }
+
+ /* update index */
+ rc = nd_label_write_index(ndd, ndd->ns_next,
+ nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
+ if (rc)
+ goto abort;
+
+ /*
+ * Now that the on-dimm labels are up to date, fix up the tracking
+ * entries in nd_mapping->labels
+ */
+ nlabel = 0;
+ mutex_lock(&nd_mapping->lock);
+ list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) {
+ nd_label = label_ent->label;
+ if (!nd_label)
+ continue;
+ nlabel++;
+ memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+ if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+ nlabel--;
+ list_move(&label_ent->list, &list);
+ label_ent->label = NULL;
+ }
+ list_splice_tail_init(&list, &nd_mapping->labels);
+ mutex_unlock(&nd_mapping->lock);
+
+ if (nlabel + nsblk->num_resources > num_labels) {
+ /*
+ * Bug, we can't end up with more resources than
+ * available labels
+ */
+ WARN_ON_ONCE(1);
+ rc = -ENXIO;
+ goto out;
+ }
+
+ mutex_lock(&nd_mapping->lock);
+ label_ent = list_first_entry_or_null(&nd_mapping->labels,
+ typeof(*label_ent), list);
+ if (!label_ent) {
+ WARN_ON(1);
+ mutex_unlock(&nd_mapping->lock);
+ rc = -ENXIO;
+ goto out;
+ }
+ for_each_clear_bit_le(slot, free, nslot) {
+ nd_label = to_label(ndd, slot);
+ memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+ if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+ res = to_resource(ndd, nd_label);
+ res->flags &= ~DPA_RESOURCE_ADJUSTED;
+ dev_vdbg(&nsblk->common.dev, "assign label slot: %d\n", slot);
+ list_for_each_entry_from(label_ent, &nd_mapping->labels, list) {
+ if (label_ent->label)
+ continue;
+ label_ent->label = nd_label;
+ nd_label = NULL;
+ break;
+ }
+ if (nd_label)
+ dev_WARN(&nsblk->common.dev,
+ "failed to track label slot%d\n", slot);
+ }
+ mutex_unlock(&nd_mapping->lock);
+
+ out:
+ kfree(old_res_list);
+ kfree(victim_map);
+ return rc;
+
+ abort:
+ /*
+ * 1/ repair the allocated label bitmap in the index
+ * 2/ restore the resource list
+ */
+ nd_label_copy(ndd, nsindex, to_current_namespace_index(ndd));
+ kfree(nsblk->res);
+ nsblk->res = old_res_list;
+ nsblk->num_resources = old_num_resources;
+ old_res_list = NULL;
+ goto out;
+}
+
+static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
+{
+ int i, old_num_labels = 0;
+ struct nd_label_ent *label_ent;
+ struct nd_namespace_index *nsindex;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+
+ mutex_lock(&nd_mapping->lock);
+ list_for_each_entry(label_ent, &nd_mapping->labels, list)
+ old_num_labels++;
+ mutex_unlock(&nd_mapping->lock);
+
+ /*
+ * We need to preserve all the old labels for the mapping so
+ * they can be garbage collected after writing the new labels.
+ */
+ for (i = old_num_labels; i < num_labels; i++) {
+ label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL);
+ if (!label_ent)
+ return -ENOMEM;
+ mutex_lock(&nd_mapping->lock);
+ list_add_tail(&label_ent->list, &nd_mapping->labels);
+ mutex_unlock(&nd_mapping->lock);
+ }
+
+ if (ndd->ns_current == -1 || ndd->ns_next == -1)
+ /* pass */;
+ else
+ return max(num_labels, old_num_labels);
+
+ nsindex = to_namespace_index(ndd, 0);
+ memset(nsindex, 0, ndd->nsarea.config_size);
+ for (i = 0; i < 2; i++) {
+ int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT);
+
+ if (rc)
+ return rc;
+ }
+ ndd->ns_next = 1;
+ ndd->ns_current = 0;
+
+ return max(num_labels, old_num_labels);
+}
+
+static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_ent *label_ent, *e;
+ struct nd_namespace_index *nsindex;
+ u8 label_uuid[NSLABEL_UUID_LEN];
+ unsigned long *free;
+ LIST_HEAD(list);
+ u32 nslot, slot;
+ int active = 0;
+
+ if (!uuid)
+ return 0;
+
+ /* no index || no labels == nothing to delete */
+ if (!preamble_next(ndd, &nsindex, &free, &nslot))
+ return 0;
+
+ mutex_lock(&nd_mapping->lock);
+ list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) {
+ struct nd_namespace_label *nd_label = label_ent->label;
+
+ if (!nd_label)
+ continue;
+ active++;
+ memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+ if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+ active--;
+ slot = to_slot(ndd, nd_label);
+ nd_label_free_slot(ndd, slot);
+ dev_dbg(ndd->dev, "free: %d\n", slot);
+ list_move_tail(&label_ent->list, &list);
+ label_ent->label = NULL;
+ }
+ list_splice_tail_init(&list, &nd_mapping->labels);
+
+ if (active == 0) {
+ nd_mapping_free_labels(nd_mapping);
+ dev_dbg(ndd->dev, "no more active labels\n");
+ }
+ mutex_unlock(&nd_mapping->lock);
+
+ return nd_label_write_index(ndd, ndd->ns_next,
+ nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
+}
+
+int nd_pmem_namespace_label_update(struct nd_region *nd_region,
+ struct nd_namespace_pmem *nspm, resource_size_t size)
+{
+ int i, rc;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res;
+ int count = 0;
+
+ if (size == 0) {
+ rc = del_labels(nd_mapping, nspm->uuid);
+ if (rc)
+ return rc;
+ continue;
+ }
+
+ for_each_dpa_resource(ndd, res)
+ if (strncmp(res->name, "pmem", 4) == 0)
+ count++;
+ WARN_ON_ONCE(!count);
+
+ rc = init_labels(nd_mapping, count);
+ if (rc < 0)
+ return rc;
+
+ rc = __pmem_label_update(nd_region, nd_mapping, nspm, i,
+ NSLABEL_FLAG_UPDATING);
+ if (rc)
+ return rc;
+ }
+
+ if (size == 0)
+ return 0;
+
+ /* Clear the UPDATING flag per UEFI 2.7 expectations */
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+
+ rc = __pmem_label_update(nd_region, nd_mapping, nspm, i, 0);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+int nd_blk_namespace_label_update(struct nd_region *nd_region,
+ struct nd_namespace_blk *nsblk, resource_size_t size)
+{
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct resource *res;
+ int count = 0;
+
+ if (size == 0)
+ return del_labels(nd_mapping, nsblk->uuid);
+
+ for_each_dpa_resource(to_ndd(nd_mapping), res)
+ count++;
+
+ count = init_labels(nd_mapping, count);
+ if (count < 0)
+ return count;
+
+ return __blk_label_update(nd_region, nd_mapping, nsblk, count);
+}
+
+int __init nd_label_init(void)
+{
+ WARN_ON(guid_parse(NVDIMM_BTT_GUID, &nvdimm_btt_guid));
+ WARN_ON(guid_parse(NVDIMM_BTT2_GUID, &nvdimm_btt2_guid));
+ WARN_ON(guid_parse(NVDIMM_PFN_GUID, &nvdimm_pfn_guid));
+ WARN_ON(guid_parse(NVDIMM_DAX_GUID, &nvdimm_dax_guid));
+
+ return 0;
+}
diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h
new file mode 100644
index 000000000..52f9fcada
--- /dev/null
+++ b/drivers/nvdimm/label.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __LABEL_H__
+#define __LABEL_H__
+
+#include <linux/ndctl.h>
+#include <linux/sizes.h>
+#include <linux/uuid.h>
+#include <linux/io.h>
+
+enum {
+ NSINDEX_SIG_LEN = 16,
+ NSINDEX_ALIGN = 256,
+ NSINDEX_SEQ_MASK = 0x3,
+ NSLABEL_UUID_LEN = 16,
+ NSLABEL_NAME_LEN = 64,
+ NSLABEL_FLAG_ROLABEL = 0x1, /* read-only label */
+ NSLABEL_FLAG_LOCAL = 0x2, /* DIMM-local namespace */
+ NSLABEL_FLAG_BTT = 0x4, /* namespace contains a BTT */
+ NSLABEL_FLAG_UPDATING = 0x8, /* label being updated */
+ BTT_ALIGN = 4096, /* all btt structures */
+ BTTINFO_SIG_LEN = 16,
+ BTTINFO_UUID_LEN = 16,
+ BTTINFO_FLAG_ERROR = 0x1, /* error state (read-only) */
+ BTTINFO_MAJOR_VERSION = 1,
+ ND_LABEL_MIN_SIZE = 256 * 4, /* see sizeof_namespace_index() */
+ ND_LABEL_ID_SIZE = 50,
+ ND_NSINDEX_INIT = 0x1,
+};
+
+/**
+ * struct nd_namespace_index - label set superblock
+ * @sig: NAMESPACE_INDEX\0
+ * @flags: placeholder
+ * @seq: sequence number for this index
+ * @myoff: offset of this index in label area
+ * @mysize: size of this index struct
+ * @otheroff: offset of other index
+ * @labeloff: offset of first label slot
+ * @nslot: total number of label slots
+ * @major: label area major version
+ * @minor: label area minor version
+ * @checksum: fletcher64 of all fields
+ * @free[0]: bitmap, nlabel bits
+ *
+ * The size of free[] is rounded up so the total struct size is a
+ * multiple of NSINDEX_ALIGN bytes. Any bits this allocates beyond
+ * nlabel bits must be zero.
+ */
+struct nd_namespace_index {
+ u8 sig[NSINDEX_SIG_LEN];
+ u8 flags[3];
+ u8 labelsize;
+ __le32 seq;
+ __le64 myoff;
+ __le64 mysize;
+ __le64 otheroff;
+ __le64 labeloff;
+ __le32 nslot;
+ __le16 major;
+ __le16 minor;
+ __le64 checksum;
+ u8 free[0];
+};
+
+/**
+ * struct nd_namespace_label - namespace superblock
+ * @uuid: UUID per RFC 4122
+ * @name: optional name (NULL-terminated)
+ * @flags: see NSLABEL_FLAG_*
+ * @nlabel: num labels to describe this ns
+ * @position: labels position in set
+ * @isetcookie: interleave set cookie
+ * @lbasize: LBA size in bytes or 0 for pmem
+ * @dpa: DPA of NVM range on this DIMM
+ * @rawsize: size of namespace
+ * @slot: slot of this label in label area
+ * @unused: must be zero
+ */
+struct nd_namespace_label {
+ u8 uuid[NSLABEL_UUID_LEN];
+ u8 name[NSLABEL_NAME_LEN];
+ __le32 flags;
+ __le16 nlabel;
+ __le16 position;
+ __le64 isetcookie;
+ __le64 lbasize;
+ __le64 dpa;
+ __le64 rawsize;
+ __le32 slot;
+ /*
+ * Accessing fields past this point should be gated by a
+ * namespace_label_has() check.
+ */
+ u8 align;
+ u8 reserved[3];
+ guid_t type_guid;
+ guid_t abstraction_guid;
+ u8 reserved2[88];
+ __le64 checksum;
+};
+
+#define NVDIMM_BTT_GUID "8aed63a2-29a2-4c66-8b12-f05d15d3922a"
+#define NVDIMM_BTT2_GUID "18633bfc-1735-4217-8ac9-17239282d3f8"
+#define NVDIMM_PFN_GUID "266400ba-fb9f-4677-bcb0-968f11d0d225"
+#define NVDIMM_DAX_GUID "97a86d9c-3cdd-4eda-986f-5068b4f80088"
+
+/**
+ * struct nd_label_id - identifier string for dpa allocation
+ * @id: "{blk|pmem}-<namespace uuid>"
+ */
+struct nd_label_id {
+ char id[ND_LABEL_ID_SIZE];
+};
+
+/*
+ * If the 'best' index is invalid, so is the 'next' index. Otherwise,
+ * the next index is MOD(index+1, 2)
+ */
+static inline int nd_label_next_nsindex(int index)
+{
+ if (index < 0)
+ return -1;
+
+ return (index + 1) % 2;
+}
+
+struct nvdimm_drvdata;
+int nd_label_validate(struct nvdimm_drvdata *ndd);
+void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst,
+ struct nd_namespace_index *src);
+size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd);
+int nd_label_active_count(struct nvdimm_drvdata *ndd);
+struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n);
+u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd);
+bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot);
+u32 nd_label_nfree(struct nvdimm_drvdata *ndd);
+enum nvdimm_claim_class to_nvdimm_cclass(guid_t *guid);
+struct nd_region;
+struct nd_namespace_pmem;
+struct nd_namespace_blk;
+int nd_pmem_namespace_label_update(struct nd_region *nd_region,
+ struct nd_namespace_pmem *nspm, resource_size_t size);
+int nd_blk_namespace_label_update(struct nd_region *nd_region,
+ struct nd_namespace_blk *nsblk, resource_size_t size);
+#endif /* __LABEL_H__ */
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
new file mode 100644
index 000000000..63640c315
--- /dev/null
+++ b/drivers/nvdimm/namespace_devs.c
@@ -0,0 +1,2622 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/sort.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/nd.h>
+#include "nd-core.h"
+#include "pmem.h"
+#include "nd.h"
+
+static void namespace_io_release(struct device *dev)
+{
+ struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+ kfree(nsio);
+}
+
+static void namespace_pmem_release(struct device *dev)
+{
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+
+ if (nspm->id >= 0)
+ ida_simple_remove(&nd_region->ns_ida, nspm->id);
+ kfree(nspm->alt_name);
+ kfree(nspm->uuid);
+ kfree(nspm);
+}
+
+static void namespace_blk_release(struct device *dev)
+{
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+
+ if (nsblk->id >= 0)
+ ida_simple_remove(&nd_region->ns_ida, nsblk->id);
+ kfree(nsblk->alt_name);
+ kfree(nsblk->uuid);
+ kfree(nsblk->res);
+ kfree(nsblk);
+}
+
+static const struct device_type namespace_io_device_type = {
+ .name = "nd_namespace_io",
+ .release = namespace_io_release,
+};
+
+static const struct device_type namespace_pmem_device_type = {
+ .name = "nd_namespace_pmem",
+ .release = namespace_pmem_release,
+};
+
+static const struct device_type namespace_blk_device_type = {
+ .name = "nd_namespace_blk",
+ .release = namespace_blk_release,
+};
+
+static bool is_namespace_pmem(const struct device *dev)
+{
+ return dev ? dev->type == &namespace_pmem_device_type : false;
+}
+
+static bool is_namespace_blk(const struct device *dev)
+{
+ return dev ? dev->type == &namespace_blk_device_type : false;
+}
+
+static bool is_namespace_io(const struct device *dev)
+{
+ return dev ? dev->type == &namespace_io_device_type : false;
+}
+
+static int is_uuid_busy(struct device *dev, void *data)
+{
+ u8 *uuid1 = data, *uuid2 = NULL;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ uuid2 = nspm->uuid;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ uuid2 = nsblk->uuid;
+ } else if (is_nd_btt(dev)) {
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ uuid2 = nd_btt->uuid;
+ } else if (is_nd_pfn(dev)) {
+ struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+
+ uuid2 = nd_pfn->uuid;
+ }
+
+ if (uuid2 && memcmp(uuid1, uuid2, NSLABEL_UUID_LEN) == 0)
+ return -EBUSY;
+
+ return 0;
+}
+
+static int is_namespace_uuid_busy(struct device *dev, void *data)
+{
+ if (is_nd_region(dev))
+ return device_for_each_child(dev, data, is_uuid_busy);
+ return 0;
+}
+
+/**
+ * nd_is_uuid_unique - verify that no other namespace has @uuid
+ * @dev: any device on a nvdimm_bus
+ * @uuid: uuid to check
+ */
+bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!nvdimm_bus)
+ return false;
+ WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
+ if (device_for_each_child(&nvdimm_bus->dev, uuid,
+ is_namespace_uuid_busy) != 0)
+ return false;
+ return true;
+}
+
+bool pmem_should_map_pages(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_namespace_common *ndns = to_ndns(dev);
+ struct nd_namespace_io *nsio;
+
+ if (!IS_ENABLED(CONFIG_ZONE_DEVICE))
+ return false;
+
+ if (!test_bit(ND_REGION_PAGEMAP, &nd_region->flags))
+ return false;
+
+ if (is_nd_pfn(dev) || is_nd_btt(dev))
+ return false;
+
+ if (ndns->force_raw)
+ return false;
+
+ nsio = to_nd_namespace_io(dev);
+ if (region_intersects(nsio->res.start, resource_size(&nsio->res),
+ IORESOURCE_SYSTEM_RAM,
+ IORES_DESC_NONE) == REGION_MIXED)
+ return false;
+
+ return ARCH_MEMREMAP_PMEM == MEMREMAP_WB;
+}
+EXPORT_SYMBOL(pmem_should_map_pages);
+
+unsigned int pmem_sector_size(struct nd_namespace_common *ndns)
+{
+ if (is_namespace_pmem(&ndns->dev)) {
+ struct nd_namespace_pmem *nspm;
+
+ nspm = to_nd_namespace_pmem(&ndns->dev);
+ if (nspm->lbasize == 0 || nspm->lbasize == 512)
+ /* default */;
+ else if (nspm->lbasize == 4096)
+ return 4096;
+ else
+ dev_WARN(&ndns->dev, "unsupported sector size: %ld\n",
+ nspm->lbasize);
+ }
+
+ /*
+ * There is no namespace label (is_namespace_io()), or the label
+ * indicates the default sector size.
+ */
+ return 512;
+}
+EXPORT_SYMBOL(pmem_sector_size);
+
+const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
+ char *name)
+{
+ struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+ const char *suffix = NULL;
+
+ if (ndns->claim && is_nd_btt(ndns->claim))
+ suffix = "s";
+
+ if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) {
+ int nsidx = 0;
+
+ if (is_namespace_pmem(&ndns->dev)) {
+ struct nd_namespace_pmem *nspm;
+
+ nspm = to_nd_namespace_pmem(&ndns->dev);
+ nsidx = nspm->id;
+ }
+
+ if (nsidx)
+ sprintf(name, "pmem%d.%d%s", nd_region->id, nsidx,
+ suffix ? suffix : "");
+ else
+ sprintf(name, "pmem%d%s", nd_region->id,
+ suffix ? suffix : "");
+ } else if (is_namespace_blk(&ndns->dev)) {
+ struct nd_namespace_blk *nsblk;
+
+ nsblk = to_nd_namespace_blk(&ndns->dev);
+ sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id,
+ suffix ? suffix : "");
+ } else {
+ return NULL;
+ }
+
+ return name;
+}
+EXPORT_SYMBOL(nvdimm_namespace_disk_name);
+
+const u8 *nd_dev_to_uuid(struct device *dev)
+{
+ static const u8 null_uuid[16];
+
+ if (!dev)
+ return null_uuid;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ return nspm->uuid;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ return nsblk->uuid;
+ } else
+ return null_uuid;
+}
+EXPORT_SYMBOL(nd_dev_to_uuid);
+
+static ssize_t nstype_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+
+ return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region));
+}
+static DEVICE_ATTR_RO(nstype);
+
+static ssize_t __alt_name_store(struct device *dev, const char *buf,
+ const size_t len)
+{
+ char *input, *pos, *alt_name, **ns_altname;
+ ssize_t rc;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ ns_altname = &nspm->alt_name;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ ns_altname = &nsblk->alt_name;
+ } else
+ return -ENXIO;
+
+ if (dev->driver || to_ndns(dev)->claim)
+ return -EBUSY;
+
+ input = kmemdup(buf, len + 1, GFP_KERNEL);
+ if (!input)
+ return -ENOMEM;
+
+ input[len] = '\0';
+ pos = strim(input);
+ if (strlen(pos) + 1 > NSLABEL_NAME_LEN) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ alt_name = kzalloc(NSLABEL_NAME_LEN, GFP_KERNEL);
+ if (!alt_name) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ kfree(*ns_altname);
+ *ns_altname = alt_name;
+ sprintf(*ns_altname, "%s", pos);
+ rc = len;
+
+out:
+ kfree(input);
+ return rc;
+}
+
+static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk)
+{
+ struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_id label_id;
+ resource_size_t size = 0;
+ struct resource *res;
+
+ if (!nsblk->uuid)
+ return 0;
+ nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id.id) == 0)
+ size += resource_size(res);
+ return size;
+}
+
+static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
+{
+ struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_id label_id;
+ struct resource *res;
+ int count, i;
+
+ if (!nsblk->uuid || !nsblk->lbasize || !ndd)
+ return false;
+
+ count = 0;
+ nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+ for_each_dpa_resource(ndd, res) {
+ if (strcmp(res->name, label_id.id) != 0)
+ continue;
+ /*
+ * Resources with unacknowledged adjustments indicate a
+ * failure to update labels
+ */
+ if (res->flags & DPA_RESOURCE_ADJUSTED)
+ return false;
+ count++;
+ }
+
+ /* These values match after a successful label update */
+ if (count != nsblk->num_resources)
+ return false;
+
+ for (i = 0; i < nsblk->num_resources; i++) {
+ struct resource *found = NULL;
+
+ for_each_dpa_resource(ndd, res)
+ if (res == nsblk->res[i]) {
+ found = res;
+ break;
+ }
+ /* stale resource */
+ if (!found)
+ return false;
+ }
+
+ return true;
+}
+
+resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
+{
+ resource_size_t size;
+
+ nvdimm_bus_lock(&nsblk->common.dev);
+ size = __nd_namespace_blk_validate(nsblk);
+ nvdimm_bus_unlock(&nsblk->common.dev);
+
+ return size;
+}
+EXPORT_SYMBOL(nd_namespace_blk_validate);
+
+
+static int nd_namespace_label_update(struct nd_region *nd_region,
+ struct device *dev)
+{
+ dev_WARN_ONCE(dev, dev->driver || to_ndns(dev)->claim,
+ "namespace must be idle during label update\n");
+ if (dev->driver || to_ndns(dev)->claim)
+ return 0;
+
+ /*
+ * Only allow label writes that will result in a valid namespace
+ * or deletion of an existing namespace.
+ */
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+ resource_size_t size = resource_size(&nspm->nsio.res);
+
+ if (size == 0 && nspm->uuid)
+ /* delete allocation */;
+ else if (!nspm->uuid)
+ return 0;
+
+ return nd_pmem_namespace_label_update(nd_region, nspm, size);
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+ resource_size_t size = nd_namespace_blk_size(nsblk);
+
+ if (size == 0 && nsblk->uuid)
+ /* delete allocation */;
+ else if (!nsblk->uuid || !nsblk->lbasize)
+ return 0;
+
+ return nd_blk_namespace_label_update(nd_region, nsblk, size);
+ } else
+ return -ENXIO;
+}
+
+static ssize_t alt_name_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ ssize_t rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ rc = __alt_name_store(dev, buf, len);
+ if (rc >= 0)
+ rc = nd_namespace_label_update(nd_region, dev);
+ dev_dbg(dev, "%s(%zd)\n", rc < 0 ? "fail " : "", rc);
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc < 0 ? rc : len;
+}
+
+static ssize_t alt_name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ char *ns_altname;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ ns_altname = nspm->alt_name;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ ns_altname = nsblk->alt_name;
+ } else
+ return -ENXIO;
+
+ return sprintf(buf, "%s\n", ns_altname ? ns_altname : "");
+}
+static DEVICE_ATTR_RW(alt_name);
+
+static int scan_free(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, struct nd_label_id *label_id,
+ resource_size_t n)
+{
+ bool is_blk = strncmp(label_id->id, "blk", 3) == 0;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ int rc = 0;
+
+ while (n) {
+ struct resource *res, *last;
+ resource_size_t new_start;
+
+ last = NULL;
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id->id) == 0)
+ last = res;
+ res = last;
+ if (!res)
+ return 0;
+
+ if (n >= resource_size(res)) {
+ n -= resource_size(res);
+ nd_dbg_dpa(nd_region, ndd, res, "delete %d\n", rc);
+ nvdimm_free_dpa(ndd, res);
+ /* retry with last resource deleted */
+ continue;
+ }
+
+ /*
+ * Keep BLK allocations relegated to high DPA as much as
+ * possible
+ */
+ if (is_blk)
+ new_start = res->start + n;
+ else
+ new_start = res->start;
+
+ rc = adjust_resource(res, new_start, resource_size(res) - n);
+ if (rc == 0)
+ res->flags |= DPA_RESOURCE_ADJUSTED;
+ nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc);
+ break;
+ }
+
+ return rc;
+}
+
+/**
+ * shrink_dpa_allocation - for each dimm in region free n bytes for label_id
+ * @nd_region: the set of dimms to reclaim @n bytes from
+ * @label_id: unique identifier for the namespace consuming this dpa range
+ * @n: number of bytes per-dimm to release
+ *
+ * Assumes resources are ordered. Starting from the end try to
+ * adjust_resource() the allocation to @n, but if @n is larger than the
+ * allocation delete it and find the 'new' last allocation in the label
+ * set.
+ */
+static int shrink_dpa_allocation(struct nd_region *nd_region,
+ struct nd_label_id *label_id, resource_size_t n)
+{
+ int i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ int rc;
+
+ rc = scan_free(nd_region, nd_mapping, label_id, n);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+static resource_size_t init_dpa_allocation(struct nd_label_id *label_id,
+ struct nd_region *nd_region, struct nd_mapping *nd_mapping,
+ resource_size_t n)
+{
+ bool is_blk = strncmp(label_id->id, "blk", 3) == 0;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ resource_size_t first_dpa;
+ struct resource *res;
+ int rc = 0;
+
+ /* allocate blk from highest dpa first */
+ if (is_blk)
+ first_dpa = nd_mapping->start + nd_mapping->size - n;
+ else
+ first_dpa = nd_mapping->start;
+
+ /* first resource allocation for this label-id or dimm */
+ res = nvdimm_allocate_dpa(ndd, label_id, first_dpa, n);
+ if (!res)
+ rc = -EBUSY;
+
+ nd_dbg_dpa(nd_region, ndd, res, "init %d\n", rc);
+ return rc ? n : 0;
+}
+
+
+/**
+ * space_valid() - validate free dpa space against constraints
+ * @nd_region: hosting region of the free space
+ * @ndd: dimm device data for debug
+ * @label_id: namespace id to allocate space
+ * @prev: potential allocation that precedes free space
+ * @next: allocation that follows the given free space range
+ * @exist: first allocation with same id in the mapping
+ * @n: range that must satisfied for pmem allocations
+ * @valid: free space range to validate
+ *
+ * BLK-space is valid as long as it does not precede a PMEM
+ * allocation in a given region. PMEM-space must be contiguous
+ * and adjacent to an existing existing allocation (if one
+ * exists). If reserving PMEM any space is valid.
+ */
+static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd,
+ struct nd_label_id *label_id, struct resource *prev,
+ struct resource *next, struct resource *exist,
+ resource_size_t n, struct resource *valid)
+{
+ bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0;
+ bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
+
+ if (valid->start >= valid->end)
+ goto invalid;
+
+ if (is_reserve)
+ return;
+
+ if (!is_pmem) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nvdimm_bus *nvdimm_bus;
+ struct blk_alloc_info info = {
+ .nd_mapping = nd_mapping,
+ .available = nd_mapping->size,
+ .res = valid,
+ };
+
+ WARN_ON(!is_nd_blk(&nd_region->dev));
+ nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+ device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy);
+ return;
+ }
+
+ /* allocation needs to be contiguous, so this is all or nothing */
+ if (resource_size(valid) < n)
+ goto invalid;
+
+ /* we've got all the space we need and no existing allocation */
+ if (!exist)
+ return;
+
+ /* allocation needs to be contiguous with the existing namespace */
+ if (valid->start == exist->end + 1
+ || valid->end == exist->start - 1)
+ return;
+
+ invalid:
+ /* truncate @valid size to 0 */
+ valid->end = valid->start - 1;
+}
+
+enum alloc_loc {
+ ALLOC_ERR = 0, ALLOC_BEFORE, ALLOC_MID, ALLOC_AFTER,
+};
+
+static resource_size_t scan_allocate(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, struct nd_label_id *label_id,
+ resource_size_t n)
+{
+ resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1;
+ bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res, *exist = NULL, valid;
+ const resource_size_t to_allocate = n;
+ int first;
+
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(label_id->id, res->name) == 0)
+ exist = res;
+
+ valid.start = nd_mapping->start;
+ valid.end = mapping_end;
+ valid.name = "free space";
+ retry:
+ first = 0;
+ for_each_dpa_resource(ndd, res) {
+ struct resource *next = res->sibling, *new_res = NULL;
+ resource_size_t allocate, available = 0;
+ enum alloc_loc loc = ALLOC_ERR;
+ const char *action;
+ int rc = 0;
+
+ /* ignore resources outside this nd_mapping */
+ if (res->start > mapping_end)
+ continue;
+ if (res->end < nd_mapping->start)
+ continue;
+
+ /* space at the beginning of the mapping */
+ if (!first++ && res->start > nd_mapping->start) {
+ valid.start = nd_mapping->start;
+ valid.end = res->start - 1;
+ space_valid(nd_region, ndd, label_id, NULL, next, exist,
+ to_allocate, &valid);
+ available = resource_size(&valid);
+ if (available)
+ loc = ALLOC_BEFORE;
+ }
+
+ /* space between allocations */
+ if (!loc && next) {
+ valid.start = res->start + resource_size(res);
+ valid.end = min(mapping_end, next->start - 1);
+ space_valid(nd_region, ndd, label_id, res, next, exist,
+ to_allocate, &valid);
+ available = resource_size(&valid);
+ if (available)
+ loc = ALLOC_MID;
+ }
+
+ /* space at the end of the mapping */
+ if (!loc && !next) {
+ valid.start = res->start + resource_size(res);
+ valid.end = mapping_end;
+ space_valid(nd_region, ndd, label_id, res, next, exist,
+ to_allocate, &valid);
+ available = resource_size(&valid);
+ if (available)
+ loc = ALLOC_AFTER;
+ }
+
+ if (!loc || !available)
+ continue;
+ allocate = min(available, n);
+ switch (loc) {
+ case ALLOC_BEFORE:
+ if (strcmp(res->name, label_id->id) == 0) {
+ /* adjust current resource up */
+ rc = adjust_resource(res, res->start - allocate,
+ resource_size(res) + allocate);
+ action = "cur grow up";
+ } else
+ action = "allocate";
+ break;
+ case ALLOC_MID:
+ if (strcmp(next->name, label_id->id) == 0) {
+ /* adjust next resource up */
+ rc = adjust_resource(next, next->start
+ - allocate, resource_size(next)
+ + allocate);
+ new_res = next;
+ action = "next grow up";
+ } else if (strcmp(res->name, label_id->id) == 0) {
+ action = "grow down";
+ } else
+ action = "allocate";
+ break;
+ case ALLOC_AFTER:
+ if (strcmp(res->name, label_id->id) == 0)
+ action = "grow down";
+ else
+ action = "allocate";
+ break;
+ default:
+ return n;
+ }
+
+ if (strcmp(action, "allocate") == 0) {
+ /* BLK allocate bottom up */
+ if (!is_pmem)
+ valid.start += available - allocate;
+
+ new_res = nvdimm_allocate_dpa(ndd, label_id,
+ valid.start, allocate);
+ if (!new_res)
+ rc = -EBUSY;
+ } else if (strcmp(action, "grow down") == 0) {
+ /* adjust current resource down */
+ rc = adjust_resource(res, res->start, resource_size(res)
+ + allocate);
+ if (rc == 0)
+ res->flags |= DPA_RESOURCE_ADJUSTED;
+ }
+
+ if (!new_res)
+ new_res = res;
+
+ nd_dbg_dpa(nd_region, ndd, new_res, "%s(%d) %d\n",
+ action, loc, rc);
+
+ if (rc)
+ return n;
+
+ n -= allocate;
+ if (n) {
+ /*
+ * Retry scan with newly inserted resources.
+ * For example, if we did an ALLOC_BEFORE
+ * insertion there may also have been space
+ * available for an ALLOC_AFTER insertion, so we
+ * need to check this same resource again
+ */
+ goto retry;
+ } else
+ return 0;
+ }
+
+ /*
+ * If we allocated nothing in the BLK case it may be because we are in
+ * an initial "pmem-reserve pass". Only do an initial BLK allocation
+ * when none of the DPA space is reserved.
+ */
+ if ((is_pmem || !ndd->dpa.child) && n == to_allocate)
+ return init_dpa_allocation(label_id, nd_region, nd_mapping, n);
+ return n;
+}
+
+static int merge_dpa(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, struct nd_label_id *label_id)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res;
+
+ if (strncmp("pmem", label_id->id, 4) == 0)
+ return 0;
+ retry:
+ for_each_dpa_resource(ndd, res) {
+ int rc;
+ struct resource *next = res->sibling;
+ resource_size_t end = res->start + resource_size(res);
+
+ if (!next || strcmp(res->name, label_id->id) != 0
+ || strcmp(next->name, label_id->id) != 0
+ || end != next->start)
+ continue;
+ end += resource_size(next);
+ nvdimm_free_dpa(ndd, next);
+ rc = adjust_resource(res, res->start, end - res->start);
+ nd_dbg_dpa(nd_region, ndd, res, "merge %d\n", rc);
+ if (rc)
+ return rc;
+ res->flags |= DPA_RESOURCE_ADJUSTED;
+ goto retry;
+ }
+
+ return 0;
+}
+
+int __reserve_free_pmem(struct device *dev, void *data)
+{
+ struct nvdimm *nvdimm = data;
+ struct nd_region *nd_region;
+ struct nd_label_id label_id;
+ int i;
+
+ if (!is_memory(dev))
+ return 0;
+
+ nd_region = to_nd_region(dev);
+ if (nd_region->ndr_mappings == 0)
+ return 0;
+
+ memset(&label_id, 0, sizeof(label_id));
+ strcat(label_id.id, "pmem-reserve");
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ resource_size_t n, rem = 0;
+
+ if (nd_mapping->nvdimm != nvdimm)
+ continue;
+
+ n = nd_pmem_available_dpa(nd_region, nd_mapping, &rem);
+ if (n == 0)
+ return 0;
+ rem = scan_allocate(nd_region, nd_mapping, &label_id, n);
+ dev_WARN_ONCE(&nd_region->dev, rem,
+ "pmem reserve underrun: %#llx of %#llx bytes\n",
+ (unsigned long long) n - rem,
+ (unsigned long long) n);
+ return rem ? -ENXIO : 0;
+ }
+
+ return 0;
+}
+
+void release_free_pmem(struct nvdimm_bus *nvdimm_bus,
+ struct nd_mapping *nd_mapping)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res, *_res;
+
+ for_each_dpa_resource_safe(ndd, res, _res)
+ if (strcmp(res->name, "pmem-reserve") == 0)
+ nvdimm_free_dpa(ndd, res);
+}
+
+static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus,
+ struct nd_mapping *nd_mapping)
+{
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+ int rc;
+
+ rc = device_for_each_child(&nvdimm_bus->dev, nvdimm,
+ __reserve_free_pmem);
+ if (rc)
+ release_free_pmem(nvdimm_bus, nd_mapping);
+ return rc;
+}
+
+/**
+ * grow_dpa_allocation - for each dimm allocate n bytes for @label_id
+ * @nd_region: the set of dimms to allocate @n more bytes from
+ * @label_id: unique identifier for the namespace consuming this dpa range
+ * @n: number of bytes per-dimm to add to the existing allocation
+ *
+ * Assumes resources are ordered. For BLK regions, first consume
+ * BLK-only available DPA free space, then consume PMEM-aliased DPA
+ * space starting at the highest DPA. For PMEM regions start
+ * allocations from the start of an interleave set and end at the first
+ * BLK allocation or the end of the interleave set, whichever comes
+ * first.
+ */
+static int grow_dpa_allocation(struct nd_region *nd_region,
+ struct nd_label_id *label_id, resource_size_t n)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+ bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
+ int i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ resource_size_t rem = n;
+ int rc, j;
+
+ /*
+ * In the BLK case try once with all unallocated PMEM
+ * reserved, and once without
+ */
+ for (j = is_pmem; j < 2; j++) {
+ bool blk_only = j == 0;
+
+ if (blk_only) {
+ rc = reserve_free_pmem(nvdimm_bus, nd_mapping);
+ if (rc)
+ return rc;
+ }
+ rem = scan_allocate(nd_region, nd_mapping,
+ label_id, rem);
+ if (blk_only)
+ release_free_pmem(nvdimm_bus, nd_mapping);
+
+ /* try again and allow encroachments into PMEM */
+ if (rem == 0)
+ break;
+ }
+
+ dev_WARN_ONCE(&nd_region->dev, rem,
+ "allocation underrun: %#llx of %#llx bytes\n",
+ (unsigned long long) n - rem,
+ (unsigned long long) n);
+ if (rem)
+ return -ENXIO;
+
+ rc = merge_dpa(nd_region, nd_mapping, label_id);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+static void nd_namespace_pmem_set_resource(struct nd_region *nd_region,
+ struct nd_namespace_pmem *nspm, resource_size_t size)
+{
+ struct resource *res = &nspm->nsio.res;
+ resource_size_t offset = 0;
+
+ if (size && !nspm->uuid) {
+ WARN_ON_ONCE(1);
+ size = 0;
+ }
+
+ if (size && nspm->uuid) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_id label_id;
+ struct resource *res;
+
+ if (!ndd) {
+ size = 0;
+ goto out;
+ }
+
+ nd_label_gen_id(&label_id, nspm->uuid, 0);
+
+ /* calculate a spa offset from the dpa allocation offset */
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id.id) == 0) {
+ offset = (res->start - nd_mapping->start)
+ * nd_region->ndr_mappings;
+ goto out;
+ }
+
+ WARN_ON_ONCE(1);
+ size = 0;
+ }
+
+ out:
+ res->start = nd_region->ndr_start + offset;
+ res->end = res->start + size - 1;
+}
+
+static bool uuid_not_set(const u8 *uuid, struct device *dev, const char *where)
+{
+ if (!uuid) {
+ dev_dbg(dev, "%s: uuid not set\n", where);
+ return true;
+ }
+ return false;
+}
+
+static ssize_t __size_store(struct device *dev, unsigned long long val)
+{
+ resource_size_t allocated = 0, available = 0;
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_namespace_common *ndns = to_ndns(dev);
+ struct nd_mapping *nd_mapping;
+ struct nvdimm_drvdata *ndd;
+ struct nd_label_id label_id;
+ u32 flags = 0, remainder;
+ int rc, i, id = -1;
+ u8 *uuid = NULL;
+
+ if (dev->driver || ndns->claim)
+ return -EBUSY;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ uuid = nspm->uuid;
+ id = nspm->id;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ uuid = nsblk->uuid;
+ flags = NSLABEL_FLAG_LOCAL;
+ id = nsblk->id;
+ }
+
+ /*
+ * We need a uuid for the allocation-label and dimm(s) on which
+ * to store the label.
+ */
+ if (uuid_not_set(uuid, dev, __func__))
+ return -ENXIO;
+ if (nd_region->ndr_mappings == 0) {
+ dev_dbg(dev, "not associated with dimm(s)\n");
+ return -ENXIO;
+ }
+
+ div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder);
+ if (remainder) {
+ dev_dbg(dev, "%llu is not %dK aligned\n", val,
+ (SZ_4K * nd_region->ndr_mappings) / SZ_1K);
+ return -EINVAL;
+ }
+
+ nd_label_gen_id(&label_id, uuid, flags);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ nd_mapping = &nd_region->mapping[i];
+ ndd = to_ndd(nd_mapping);
+
+ /*
+ * All dimms in an interleave set, or the base dimm for a blk
+ * region, need to be enabled for the size to be changed.
+ */
+ if (!ndd)
+ return -ENXIO;
+
+ allocated += nvdimm_allocated_dpa(ndd, &label_id);
+ }
+ available = nd_region_allocatable_dpa(nd_region);
+
+ if (val > available + allocated)
+ return -ENOSPC;
+
+ if (val == allocated)
+ return 0;
+
+ val = div_u64(val, nd_region->ndr_mappings);
+ allocated = div_u64(allocated, nd_region->ndr_mappings);
+ if (val < allocated)
+ rc = shrink_dpa_allocation(nd_region, &label_id,
+ allocated - val);
+ else
+ rc = grow_dpa_allocation(nd_region, &label_id, val - allocated);
+
+ if (rc)
+ return rc;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ nd_namespace_pmem_set_resource(nd_region, nspm,
+ val * nd_region->ndr_mappings);
+ }
+
+ /*
+ * Try to delete the namespace if we deleted all of its
+ * allocation, this is not the seed or 0th device for the
+ * region, and it is not actively claimed by a btt, pfn, or dax
+ * instance.
+ */
+ if (val == 0 && id != 0 && nd_region->ns_seed != dev && !ndns->claim)
+ nd_device_unregister(dev, ND_ASYNC);
+
+ return rc;
+}
+
+static ssize_t size_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ unsigned long long val;
+ u8 **uuid = NULL;
+ int rc;
+
+ rc = kstrtoull(buf, 0, &val);
+ if (rc)
+ return rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ rc = __size_store(dev, val);
+ if (rc >= 0)
+ rc = nd_namespace_label_update(nd_region, dev);
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ uuid = &nspm->uuid;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ uuid = &nsblk->uuid;
+ }
+
+ if (rc == 0 && val == 0 && uuid) {
+ /* setting size zero == 'delete namespace' */
+ kfree(*uuid);
+ *uuid = NULL;
+ }
+
+ dev_dbg(dev, "%llx %s (%d)\n", val, rc < 0 ? "fail" : "success", rc);
+
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc < 0 ? rc : len;
+}
+
+resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns)
+{
+ struct device *dev = &ndns->dev;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ return resource_size(&nspm->nsio.res);
+ } else if (is_namespace_blk(dev)) {
+ return nd_namespace_blk_size(to_nd_namespace_blk(dev));
+ } else if (is_namespace_io(dev)) {
+ struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+ return resource_size(&nsio->res);
+ } else
+ WARN_ONCE(1, "unknown namespace type\n");
+ return 0;
+}
+
+resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns)
+{
+ resource_size_t size;
+
+ nvdimm_bus_lock(&ndns->dev);
+ size = __nvdimm_namespace_capacity(ndns);
+ nvdimm_bus_unlock(&ndns->dev);
+
+ return size;
+}
+EXPORT_SYMBOL(nvdimm_namespace_capacity);
+
+bool nvdimm_namespace_locked(struct nd_namespace_common *ndns)
+{
+ int i;
+ bool locked = false;
+ struct device *dev = &ndns->dev;
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ if (test_bit(NDD_LOCKED, &nvdimm->flags)) {
+ dev_dbg(dev, "%s locked\n", nvdimm_name(nvdimm));
+ locked = true;
+ }
+ }
+ return locked;
+}
+EXPORT_SYMBOL(nvdimm_namespace_locked);
+
+static ssize_t size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)
+ nvdimm_namespace_capacity(to_ndns(dev)));
+}
+static DEVICE_ATTR(size, 0444, size_show, size_store);
+
+static u8 *namespace_to_uuid(struct device *dev)
+{
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ return nspm->uuid;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ return nsblk->uuid;
+ } else
+ return ERR_PTR(-ENXIO);
+}
+
+static ssize_t uuid_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u8 *uuid = namespace_to_uuid(dev);
+
+ if (IS_ERR(uuid))
+ return PTR_ERR(uuid);
+ if (uuid)
+ return sprintf(buf, "%pUb\n", uuid);
+ return sprintf(buf, "\n");
+}
+
+/**
+ * namespace_update_uuid - check for a unique uuid and whether we're "renaming"
+ * @nd_region: parent region so we can updates all dimms in the set
+ * @dev: namespace type for generating label_id
+ * @new_uuid: incoming uuid
+ * @old_uuid: reference to the uuid storage location in the namespace object
+ */
+static int namespace_update_uuid(struct nd_region *nd_region,
+ struct device *dev, u8 *new_uuid, u8 **old_uuid)
+{
+ u32 flags = is_namespace_blk(dev) ? NSLABEL_FLAG_LOCAL : 0;
+ struct nd_label_id old_label_id;
+ struct nd_label_id new_label_id;
+ int i;
+
+ if (!nd_is_uuid_unique(dev, new_uuid))
+ return -EINVAL;
+
+ if (*old_uuid == NULL)
+ goto out;
+
+ /*
+ * If we've already written a label with this uuid, then it's
+ * too late to rename because we can't reliably update the uuid
+ * without losing the old namespace. Userspace must delete this
+ * namespace to abandon the old uuid.
+ */
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+
+ /*
+ * This check by itself is sufficient because old_uuid
+ * would be NULL above if this uuid did not exist in the
+ * currently written set.
+ *
+ * FIXME: can we delete uuid with zero dpa allocated?
+ */
+ if (list_empty(&nd_mapping->labels))
+ return -EBUSY;
+ }
+
+ nd_label_gen_id(&old_label_id, *old_uuid, flags);
+ nd_label_gen_id(&new_label_id, new_uuid, flags);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_ent *label_ent;
+ struct resource *res;
+
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, old_label_id.id) == 0)
+ sprintf((void *) res->name, "%s",
+ new_label_id.id);
+
+ mutex_lock(&nd_mapping->lock);
+ list_for_each_entry(label_ent, &nd_mapping->labels, list) {
+ struct nd_namespace_label *nd_label = label_ent->label;
+ struct nd_label_id label_id;
+
+ if (!nd_label)
+ continue;
+ nd_label_gen_id(&label_id, nd_label->uuid,
+ __le32_to_cpu(nd_label->flags));
+ if (strcmp(old_label_id.id, label_id.id) == 0)
+ set_bit(ND_LABEL_REAP, &label_ent->flags);
+ }
+ mutex_unlock(&nd_mapping->lock);
+ }
+ kfree(*old_uuid);
+ out:
+ *old_uuid = new_uuid;
+ return 0;
+}
+
+static ssize_t uuid_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ u8 *uuid = NULL;
+ ssize_t rc = 0;
+ u8 **ns_uuid;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ ns_uuid = &nspm->uuid;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ ns_uuid = &nsblk->uuid;
+ } else
+ return -ENXIO;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ if (to_ndns(dev)->claim)
+ rc = -EBUSY;
+ if (rc >= 0)
+ rc = nd_uuid_store(dev, &uuid, buf, len);
+ if (rc >= 0)
+ rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid);
+ if (rc >= 0)
+ rc = nd_namespace_label_update(nd_region, dev);
+ else
+ kfree(uuid);
+ dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
+ buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc < 0 ? rc : len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+static ssize_t resource_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct resource *res;
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ res = &nspm->nsio.res;
+ } else if (is_namespace_io(dev)) {
+ struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+ res = &nsio->res;
+ } else
+ return -ENXIO;
+
+ /* no address to convey if the namespace has no allocation */
+ if (resource_size(res) == 0)
+ return -ENXIO;
+ return sprintf(buf, "%#llx\n", (unsigned long long) res->start);
+}
+static DEVICE_ATTR_RO(resource);
+
+static const unsigned long blk_lbasize_supported[] = { 512, 520, 528,
+ 4096, 4104, 4160, 4224, 0 };
+
+static const unsigned long pmem_lbasize_supported[] = { 512, 4096, 0 };
+
+static ssize_t sector_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ return nd_size_select_show(nsblk->lbasize,
+ blk_lbasize_supported, buf);
+ }
+
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ return nd_size_select_show(nspm->lbasize,
+ pmem_lbasize_supported, buf);
+ }
+ return -ENXIO;
+}
+
+static ssize_t sector_size_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ const unsigned long *supported;
+ unsigned long *lbasize;
+ ssize_t rc = 0;
+
+ if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ lbasize = &nsblk->lbasize;
+ supported = blk_lbasize_supported;
+ } else if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ lbasize = &nspm->lbasize;
+ supported = pmem_lbasize_supported;
+ } else
+ return -ENXIO;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ if (to_ndns(dev)->claim)
+ rc = -EBUSY;
+ if (rc >= 0)
+ rc = nd_size_select_store(dev, buf, lbasize, supported);
+ if (rc >= 0)
+ rc = nd_namespace_label_update(nd_region, dev);
+ dev_dbg(dev, "result: %zd %s: %s%s", rc, rc < 0 ? "tried" : "wrote",
+ buf, buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(sector_size);
+
+static ssize_t dpa_extents_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_label_id label_id;
+ int count = 0, i;
+ u8 *uuid = NULL;
+ u32 flags = 0;
+
+ nvdimm_bus_lock(dev);
+ if (is_namespace_pmem(dev)) {
+ struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+ uuid = nspm->uuid;
+ flags = 0;
+ } else if (is_namespace_blk(dev)) {
+ struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+ uuid = nsblk->uuid;
+ flags = NSLABEL_FLAG_LOCAL;
+ }
+
+ if (!uuid)
+ goto out;
+
+ nd_label_gen_id(&label_id, uuid, flags);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct resource *res;
+
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id.id) == 0)
+ count++;
+ }
+ out:
+ nvdimm_bus_unlock(dev);
+
+ return sprintf(buf, "%d\n", count);
+}
+static DEVICE_ATTR_RO(dpa_extents);
+
+static int btt_claim_class(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ int i, loop_bitmask = 0;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_index *nsindex;
+
+ /*
+ * If any of the DIMMs do not support labels the only
+ * possible BTT format is v1.
+ */
+ if (!ndd) {
+ loop_bitmask = 0;
+ break;
+ }
+
+ nsindex = to_namespace_index(ndd, ndd->ns_current);
+ if (nsindex == NULL)
+ loop_bitmask |= 1;
+ else {
+ /* check whether existing labels are v1.1 or v1.2 */
+ if (__le16_to_cpu(nsindex->major) == 1
+ && __le16_to_cpu(nsindex->minor) == 1)
+ loop_bitmask |= 2;
+ else
+ loop_bitmask |= 4;
+ }
+ }
+ /*
+ * If nsindex is null loop_bitmask's bit 0 will be set, and if an index
+ * block is found, a v1.1 label for any mapping will set bit 1, and a
+ * v1.2 label will set bit 2.
+ *
+ * At the end of the loop, at most one of the three bits must be set.
+ * If multiple bits were set, it means the different mappings disagree
+ * about their labels, and this must be cleaned up first.
+ *
+ * If all the label index blocks are found to agree, nsindex of NULL
+ * implies labels haven't been initialized yet, and when they will,
+ * they will be of the 1.2 format, so we can assume BTT2.0
+ *
+ * If 1.1 labels are found, we enforce BTT1.1, and if 1.2 labels are
+ * found, we enforce BTT2.0
+ *
+ * If the loop was never entered, default to BTT1.1 (legacy namespaces)
+ */
+ switch (loop_bitmask) {
+ case 0:
+ case 2:
+ return NVDIMM_CCLASS_BTT;
+ case 1:
+ case 4:
+ return NVDIMM_CCLASS_BTT2;
+ default:
+ return -ENXIO;
+ }
+}
+
+static ssize_t holder_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_namespace_common *ndns = to_ndns(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ rc = sprintf(buf, "%s\n", ndns->claim ? dev_name(ndns->claim) : "");
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(holder);
+
+static ssize_t __holder_class_store(struct device *dev, const char *buf)
+{
+ struct nd_namespace_common *ndns = to_ndns(dev);
+
+ if (dev->driver || ndns->claim)
+ return -EBUSY;
+
+ if (strcmp(buf, "btt") == 0 || strcmp(buf, "btt\n") == 0)
+ ndns->claim_class = btt_claim_class(dev);
+ else if (strcmp(buf, "pfn") == 0 || strcmp(buf, "pfn\n") == 0)
+ ndns->claim_class = NVDIMM_CCLASS_PFN;
+ else if (strcmp(buf, "dax") == 0 || strcmp(buf, "dax\n") == 0)
+ ndns->claim_class = NVDIMM_CCLASS_DAX;
+ else if (strcmp(buf, "") == 0 || strcmp(buf, "\n") == 0)
+ ndns->claim_class = NVDIMM_CCLASS_NONE;
+ else
+ return -EINVAL;
+
+ /* btt_claim_class() could've returned an error */
+ if (ndns->claim_class < 0)
+ return ndns->claim_class;
+
+ return 0;
+}
+
+static ssize_t holder_class_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ ssize_t rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ rc = __holder_class_store(dev, buf);
+ if (rc >= 0)
+ rc = nd_namespace_label_update(nd_region, dev);
+ dev_dbg(dev, "%s(%zd)\n", rc < 0 ? "fail " : "", rc);
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc < 0 ? rc : len;
+}
+
+static ssize_t holder_class_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_namespace_common *ndns = to_ndns(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ if (ndns->claim_class == NVDIMM_CCLASS_NONE)
+ rc = sprintf(buf, "\n");
+ else if ((ndns->claim_class == NVDIMM_CCLASS_BTT) ||
+ (ndns->claim_class == NVDIMM_CCLASS_BTT2))
+ rc = sprintf(buf, "btt\n");
+ else if (ndns->claim_class == NVDIMM_CCLASS_PFN)
+ rc = sprintf(buf, "pfn\n");
+ else if (ndns->claim_class == NVDIMM_CCLASS_DAX)
+ rc = sprintf(buf, "dax\n");
+ else
+ rc = sprintf(buf, "<unknown>\n");
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RW(holder_class);
+
+static ssize_t mode_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_namespace_common *ndns = to_ndns(dev);
+ struct device *claim;
+ char *mode;
+ ssize_t rc;
+
+ device_lock(dev);
+ claim = ndns->claim;
+ if (claim && is_nd_btt(claim))
+ mode = "safe";
+ else if (claim && is_nd_pfn(claim))
+ mode = "memory";
+ else if (claim && is_nd_dax(claim))
+ mode = "dax";
+ else if (!claim && pmem_should_map_pages(dev))
+ mode = "memory";
+ else
+ mode = "raw";
+ rc = sprintf(buf, "%s\n", mode);
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(mode);
+
+static ssize_t force_raw_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ bool force_raw;
+ int rc = strtobool(buf, &force_raw);
+
+ if (rc)
+ return rc;
+
+ to_ndns(dev)->force_raw = force_raw;
+ return len;
+}
+
+static ssize_t force_raw_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", to_ndns(dev)->force_raw);
+}
+static DEVICE_ATTR_RW(force_raw);
+
+static struct attribute *nd_namespace_attributes[] = {
+ &dev_attr_nstype.attr,
+ &dev_attr_size.attr,
+ &dev_attr_mode.attr,
+ &dev_attr_uuid.attr,
+ &dev_attr_holder.attr,
+ &dev_attr_resource.attr,
+ &dev_attr_alt_name.attr,
+ &dev_attr_force_raw.attr,
+ &dev_attr_sector_size.attr,
+ &dev_attr_dpa_extents.attr,
+ &dev_attr_holder_class.attr,
+ NULL,
+};
+
+static umode_t namespace_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+
+ if (a == &dev_attr_resource.attr) {
+ if (is_namespace_blk(dev))
+ return 0;
+ return 0400;
+ }
+
+ if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
+ if (a == &dev_attr_size.attr)
+ return 0644;
+
+ return a->mode;
+ }
+
+ if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr
+ || a == &dev_attr_holder.attr
+ || a == &dev_attr_holder_class.attr
+ || a == &dev_attr_force_raw.attr
+ || a == &dev_attr_mode.attr)
+ return a->mode;
+
+ return 0;
+}
+
+static struct attribute_group nd_namespace_attribute_group = {
+ .attrs = nd_namespace_attributes,
+ .is_visible = namespace_visible,
+};
+
+static const struct attribute_group *nd_namespace_attribute_groups[] = {
+ &nd_device_attribute_group,
+ &nd_namespace_attribute_group,
+ &nd_numa_attribute_group,
+ NULL,
+};
+
+struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
+{
+ struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
+ struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL;
+ struct nd_dax *nd_dax = is_nd_dax(dev) ? to_nd_dax(dev) : NULL;
+ struct nd_namespace_common *ndns = NULL;
+ resource_size_t size;
+
+ if (nd_btt || nd_pfn || nd_dax) {
+ if (nd_btt)
+ ndns = nd_btt->ndns;
+ else if (nd_pfn)
+ ndns = nd_pfn->ndns;
+ else if (nd_dax)
+ ndns = nd_dax->nd_pfn.ndns;
+
+ if (!ndns)
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * Flush any in-progess probes / removals in the driver
+ * for the raw personality of this namespace.
+ */
+ device_lock(&ndns->dev);
+ device_unlock(&ndns->dev);
+ if (ndns->dev.driver) {
+ dev_dbg(&ndns->dev, "is active, can't bind %s\n",
+ dev_name(dev));
+ return ERR_PTR(-EBUSY);
+ }
+ if (dev_WARN_ONCE(&ndns->dev, ndns->claim != dev,
+ "host (%s) vs claim (%s) mismatch\n",
+ dev_name(dev),
+ dev_name(ndns->claim)))
+ return ERR_PTR(-ENXIO);
+ } else {
+ ndns = to_ndns(dev);
+ if (ndns->claim) {
+ dev_dbg(dev, "claimed by %s, failing probe\n",
+ dev_name(ndns->claim));
+
+ return ERR_PTR(-ENXIO);
+ }
+ }
+
+ if (nvdimm_namespace_locked(ndns))
+ return ERR_PTR(-EACCES);
+
+ size = nvdimm_namespace_capacity(ndns);
+ if (size < ND_MIN_NAMESPACE_SIZE) {
+ dev_dbg(&ndns->dev, "%pa, too small must be at least %#x\n",
+ &size, ND_MIN_NAMESPACE_SIZE);
+ return ERR_PTR(-ENODEV);
+ }
+
+ if (is_namespace_pmem(&ndns->dev)) {
+ struct nd_namespace_pmem *nspm;
+
+ nspm = to_nd_namespace_pmem(&ndns->dev);
+ if (uuid_not_set(nspm->uuid, &ndns->dev, __func__))
+ return ERR_PTR(-ENODEV);
+ } else if (is_namespace_blk(&ndns->dev)) {
+ struct nd_namespace_blk *nsblk;
+
+ nsblk = to_nd_namespace_blk(&ndns->dev);
+ if (uuid_not_set(nsblk->uuid, &ndns->dev, __func__))
+ return ERR_PTR(-ENODEV);
+ if (!nsblk->lbasize) {
+ dev_dbg(&ndns->dev, "sector size not set\n");
+ return ERR_PTR(-ENODEV);
+ }
+ if (!nd_namespace_blk_validate(nsblk))
+ return ERR_PTR(-ENODEV);
+ }
+
+ return ndns;
+}
+EXPORT_SYMBOL(nvdimm_namespace_common_probe);
+
+static struct device **create_namespace_io(struct nd_region *nd_region)
+{
+ struct nd_namespace_io *nsio;
+ struct device *dev, **devs;
+ struct resource *res;
+
+ nsio = kzalloc(sizeof(*nsio), GFP_KERNEL);
+ if (!nsio)
+ return NULL;
+
+ devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL);
+ if (!devs) {
+ kfree(nsio);
+ return NULL;
+ }
+
+ dev = &nsio->common.dev;
+ dev->type = &namespace_io_device_type;
+ dev->parent = &nd_region->dev;
+ res = &nsio->res;
+ res->name = dev_name(&nd_region->dev);
+ res->flags = IORESOURCE_MEM;
+ res->start = nd_region->ndr_start;
+ res->end = res->start + nd_region->ndr_size - 1;
+
+ devs[0] = dev;
+ return devs;
+}
+
+static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid,
+ u64 cookie, u16 pos)
+{
+ struct nd_namespace_label *found = NULL;
+ int i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_ent *label_ent;
+ bool found_uuid = false;
+
+ list_for_each_entry(label_ent, &nd_mapping->labels, list) {
+ struct nd_namespace_label *nd_label = label_ent->label;
+ u16 position, nlabel;
+ u64 isetcookie;
+
+ if (!nd_label)
+ continue;
+ isetcookie = __le64_to_cpu(nd_label->isetcookie);
+ position = __le16_to_cpu(nd_label->position);
+ nlabel = __le16_to_cpu(nd_label->nlabel);
+
+ if (isetcookie != cookie)
+ continue;
+
+ if (memcmp(nd_label->uuid, uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+
+ if (namespace_label_has(ndd, type_guid)
+ && !guid_equal(&nd_set->type_guid,
+ &nd_label->type_guid)) {
+ dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n",
+ nd_set->type_guid.b,
+ nd_label->type_guid.b);
+ continue;
+ }
+
+ if (found_uuid) {
+ dev_dbg(ndd->dev, "duplicate entry for uuid\n");
+ return false;
+ }
+ found_uuid = true;
+ if (nlabel != nd_region->ndr_mappings)
+ continue;
+ if (position != pos)
+ continue;
+ found = nd_label;
+ break;
+ }
+ if (found)
+ break;
+ }
+ return found != NULL;
+}
+
+static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
+{
+ int i;
+
+ if (!pmem_id)
+ return -ENODEV;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_label *nd_label = NULL;
+ u64 hw_start, hw_end, pmem_start, pmem_end;
+ struct nd_label_ent *label_ent;
+
+ lockdep_assert_held(&nd_mapping->lock);
+ list_for_each_entry(label_ent, &nd_mapping->labels, list) {
+ nd_label = label_ent->label;
+ if (!nd_label)
+ continue;
+ if (memcmp(nd_label->uuid, pmem_id, NSLABEL_UUID_LEN) == 0)
+ break;
+ nd_label = NULL;
+ }
+
+ if (!nd_label) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ /*
+ * Check that this label is compliant with the dpa
+ * range published in NFIT
+ */
+ hw_start = nd_mapping->start;
+ hw_end = hw_start + nd_mapping->size;
+ pmem_start = __le64_to_cpu(nd_label->dpa);
+ pmem_end = pmem_start + __le64_to_cpu(nd_label->rawsize);
+ if (pmem_start >= hw_start && pmem_start < hw_end
+ && pmem_end <= hw_end && pmem_end > hw_start)
+ /* pass */;
+ else {
+ dev_dbg(&nd_region->dev, "%s invalid label for %pUb\n",
+ dev_name(ndd->dev), nd_label->uuid);
+ return -EINVAL;
+ }
+
+ /* move recently validated label to the front of the list */
+ list_move(&label_ent->list, &nd_mapping->labels);
+ }
+ return 0;
+}
+
+/**
+ * create_namespace_pmem - validate interleave set labelling, retrieve label0
+ * @nd_region: region with mappings to validate
+ * @nspm: target namespace to create
+ * @nd_label: target pmem namespace label to evaluate
+ */
+static struct device *create_namespace_pmem(struct nd_region *nd_region,
+ struct nd_namespace_index *nsindex,
+ struct nd_namespace_label *nd_label)
+{
+ u64 cookie = nd_region_interleave_set_cookie(nd_region, nsindex);
+ u64 altcookie = nd_region_interleave_set_altcookie(nd_region);
+ struct nd_label_ent *label_ent;
+ struct nd_namespace_pmem *nspm;
+ struct nd_mapping *nd_mapping;
+ resource_size_t size = 0;
+ struct resource *res;
+ struct device *dev;
+ int rc = 0;
+ u16 i;
+
+ if (cookie == 0) {
+ dev_dbg(&nd_region->dev, "invalid interleave-set-cookie\n");
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (__le64_to_cpu(nd_label->isetcookie) != cookie) {
+ dev_dbg(&nd_region->dev, "invalid cookie in label: %pUb\n",
+ nd_label->uuid);
+ if (__le64_to_cpu(nd_label->isetcookie) != altcookie)
+ return ERR_PTR(-EAGAIN);
+
+ dev_dbg(&nd_region->dev, "valid altcookie in label: %pUb\n",
+ nd_label->uuid);
+ }
+
+ nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
+ if (!nspm)
+ return ERR_PTR(-ENOMEM);
+
+ nspm->id = -1;
+ dev = &nspm->nsio.common.dev;
+ dev->type = &namespace_pmem_device_type;
+ dev->parent = &nd_region->dev;
+ res = &nspm->nsio.res;
+ res->name = dev_name(&nd_region->dev);
+ res->flags = IORESOURCE_MEM;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ if (has_uuid_at_pos(nd_region, nd_label->uuid, cookie, i))
+ continue;
+ if (has_uuid_at_pos(nd_region, nd_label->uuid, altcookie, i))
+ continue;
+ break;
+ }
+
+ if (i < nd_region->ndr_mappings) {
+ struct nvdimm *nvdimm = nd_region->mapping[i].nvdimm;
+
+ /*
+ * Give up if we don't find an instance of a uuid at each
+ * position (from 0 to nd_region->ndr_mappings - 1), or if we
+ * find a dimm with two instances of the same uuid.
+ */
+ dev_err(&nd_region->dev, "%s missing label for %pUb\n",
+ nvdimm_name(nvdimm), nd_label->uuid);
+ rc = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * Fix up each mapping's 'labels' to have the validated pmem label for
+ * that position at labels[0], and NULL at labels[1]. In the process,
+ * check that the namespace aligns with interleave-set. We know
+ * that it does not overlap with any blk namespaces by virtue of
+ * the dimm being enabled (i.e. nd_label_reserve_dpa()
+ * succeeded).
+ */
+ rc = select_pmem_id(nd_region, nd_label->uuid);
+ if (rc)
+ goto err;
+
+ /* Calculate total size and populate namespace properties from label0 */
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_namespace_label *label0;
+ struct nvdimm_drvdata *ndd;
+
+ nd_mapping = &nd_region->mapping[i];
+ label_ent = list_first_entry_or_null(&nd_mapping->labels,
+ typeof(*label_ent), list);
+ label0 = label_ent ? label_ent->label : NULL;
+
+ if (!label0) {
+ WARN_ON(1);
+ continue;
+ }
+
+ size += __le64_to_cpu(label0->rawsize);
+ if (__le16_to_cpu(label0->position) != 0)
+ continue;
+ WARN_ON(nspm->alt_name || nspm->uuid);
+ nspm->alt_name = kmemdup((void __force *) label0->name,
+ NSLABEL_NAME_LEN, GFP_KERNEL);
+ nspm->uuid = kmemdup((void __force *) label0->uuid,
+ NSLABEL_UUID_LEN, GFP_KERNEL);
+ nspm->lbasize = __le64_to_cpu(label0->lbasize);
+ ndd = to_ndd(nd_mapping);
+ if (namespace_label_has(ndd, abstraction_guid))
+ nspm->nsio.common.claim_class
+ = to_nvdimm_cclass(&label0->abstraction_guid);
+
+ }
+
+ if (!nspm->alt_name || !nspm->uuid) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ nd_namespace_pmem_set_resource(nd_region, nspm, size);
+
+ return dev;
+ err:
+ namespace_pmem_release(dev);
+ switch (rc) {
+ case -EINVAL:
+ dev_dbg(&nd_region->dev, "invalid label(s)\n");
+ break;
+ case -ENODEV:
+ dev_dbg(&nd_region->dev, "label not found\n");
+ break;
+ default:
+ dev_dbg(&nd_region->dev, "unexpected err: %d\n", rc);
+ break;
+ }
+ return ERR_PTR(rc);
+}
+
+struct resource *nsblk_add_resource(struct nd_region *nd_region,
+ struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk,
+ resource_size_t start)
+{
+ struct nd_label_id label_id;
+ struct resource *res;
+
+ nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+ res = krealloc(nsblk->res,
+ sizeof(void *) * (nsblk->num_resources + 1),
+ GFP_KERNEL);
+ if (!res)
+ return NULL;
+ nsblk->res = (struct resource **) res;
+ for_each_dpa_resource(ndd, res)
+ if (strcmp(res->name, label_id.id) == 0
+ && res->start == start) {
+ nsblk->res[nsblk->num_resources++] = res;
+ return res;
+ }
+ return NULL;
+}
+
+static struct device *nd_namespace_blk_create(struct nd_region *nd_region)
+{
+ struct nd_namespace_blk *nsblk;
+ struct device *dev;
+
+ if (!is_nd_blk(&nd_region->dev))
+ return NULL;
+
+ nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
+ if (!nsblk)
+ return NULL;
+
+ dev = &nsblk->common.dev;
+ dev->type = &namespace_blk_device_type;
+ nsblk->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL);
+ if (nsblk->id < 0) {
+ kfree(nsblk);
+ return NULL;
+ }
+ dev_set_name(dev, "namespace%d.%d", nd_region->id, nsblk->id);
+ dev->parent = &nd_region->dev;
+ dev->groups = nd_namespace_attribute_groups;
+
+ return &nsblk->common.dev;
+}
+
+static struct device *nd_namespace_pmem_create(struct nd_region *nd_region)
+{
+ struct nd_namespace_pmem *nspm;
+ struct resource *res;
+ struct device *dev;
+
+ if (!is_memory(&nd_region->dev))
+ return NULL;
+
+ nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
+ if (!nspm)
+ return NULL;
+
+ dev = &nspm->nsio.common.dev;
+ dev->type = &namespace_pmem_device_type;
+ dev->parent = &nd_region->dev;
+ res = &nspm->nsio.res;
+ res->name = dev_name(&nd_region->dev);
+ res->flags = IORESOURCE_MEM;
+
+ nspm->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL);
+ if (nspm->id < 0) {
+ kfree(nspm);
+ return NULL;
+ }
+ dev_set_name(dev, "namespace%d.%d", nd_region->id, nspm->id);
+ dev->parent = &nd_region->dev;
+ dev->groups = nd_namespace_attribute_groups;
+ nd_namespace_pmem_set_resource(nd_region, nspm, 0);
+
+ return dev;
+}
+
+void nd_region_create_ns_seed(struct nd_region *nd_region)
+{
+ WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+
+ if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_IO)
+ return;
+
+ if (is_nd_blk(&nd_region->dev))
+ nd_region->ns_seed = nd_namespace_blk_create(nd_region);
+ else
+ nd_region->ns_seed = nd_namespace_pmem_create(nd_region);
+
+ /*
+ * Seed creation failures are not fatal, provisioning is simply
+ * disabled until memory becomes available
+ */
+ if (!nd_region->ns_seed)
+ dev_err(&nd_region->dev, "failed to create %s namespace\n",
+ is_nd_blk(&nd_region->dev) ? "blk" : "pmem");
+ else
+ nd_device_register(nd_region->ns_seed);
+}
+
+void nd_region_create_dax_seed(struct nd_region *nd_region)
+{
+ WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+ nd_region->dax_seed = nd_dax_create(nd_region);
+ /*
+ * Seed creation failures are not fatal, provisioning is simply
+ * disabled until memory becomes available
+ */
+ if (!nd_region->dax_seed)
+ dev_err(&nd_region->dev, "failed to create dax namespace\n");
+}
+
+void nd_region_create_pfn_seed(struct nd_region *nd_region)
+{
+ WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+ nd_region->pfn_seed = nd_pfn_create(nd_region);
+ /*
+ * Seed creation failures are not fatal, provisioning is simply
+ * disabled until memory becomes available
+ */
+ if (!nd_region->pfn_seed)
+ dev_err(&nd_region->dev, "failed to create pfn namespace\n");
+}
+
+void nd_region_create_btt_seed(struct nd_region *nd_region)
+{
+ WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+ nd_region->btt_seed = nd_btt_create(nd_region);
+ /*
+ * Seed creation failures are not fatal, provisioning is simply
+ * disabled until memory becomes available
+ */
+ if (!nd_region->btt_seed)
+ dev_err(&nd_region->dev, "failed to create btt namespace\n");
+}
+
+static int add_namespace_resource(struct nd_region *nd_region,
+ struct nd_namespace_label *nd_label, struct device **devs,
+ int count)
+{
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ int i;
+
+ for (i = 0; i < count; i++) {
+ u8 *uuid = namespace_to_uuid(devs[i]);
+ struct resource *res;
+
+ if (IS_ERR_OR_NULL(uuid)) {
+ WARN_ON(1);
+ continue;
+ }
+
+ if (memcmp(uuid, nd_label->uuid, NSLABEL_UUID_LEN) != 0)
+ continue;
+ if (is_namespace_blk(devs[i])) {
+ res = nsblk_add_resource(nd_region, ndd,
+ to_nd_namespace_blk(devs[i]),
+ __le64_to_cpu(nd_label->dpa));
+ if (!res)
+ return -ENXIO;
+ nd_dbg_dpa(nd_region, ndd, res, "%d assign\n", count);
+ } else {
+ dev_err(&nd_region->dev,
+ "error: conflicting extents for uuid: %pUb\n",
+ nd_label->uuid);
+ return -ENXIO;
+ }
+ break;
+ }
+
+ return i;
+}
+
+static struct device *create_namespace_blk(struct nd_region *nd_region,
+ struct nd_namespace_label *nd_label, int count)
+{
+
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_blk *nsblk;
+ char name[NSLABEL_NAME_LEN];
+ struct device *dev = NULL;
+ struct resource *res;
+
+ if (namespace_label_has(ndd, type_guid)) {
+ if (!guid_equal(&nd_set->type_guid, &nd_label->type_guid)) {
+ dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n",
+ nd_set->type_guid.b,
+ nd_label->type_guid.b);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ if (nd_label->isetcookie != __cpu_to_le64(nd_set->cookie2)) {
+ dev_dbg(ndd->dev, "expect cookie %#llx got %#llx\n",
+ nd_set->cookie2,
+ __le64_to_cpu(nd_label->isetcookie));
+ return ERR_PTR(-EAGAIN);
+ }
+ }
+
+ nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
+ if (!nsblk)
+ return ERR_PTR(-ENOMEM);
+ dev = &nsblk->common.dev;
+ dev->type = &namespace_blk_device_type;
+ dev->parent = &nd_region->dev;
+ nsblk->id = -1;
+ nsblk->lbasize = __le64_to_cpu(nd_label->lbasize);
+ nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN,
+ GFP_KERNEL);
+ if (namespace_label_has(ndd, abstraction_guid))
+ nsblk->common.claim_class
+ = to_nvdimm_cclass(&nd_label->abstraction_guid);
+ if (!nsblk->uuid)
+ goto blk_err;
+ memcpy(name, nd_label->name, NSLABEL_NAME_LEN);
+ if (name[0]) {
+ nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN,
+ GFP_KERNEL);
+ if (!nsblk->alt_name)
+ goto blk_err;
+ }
+ res = nsblk_add_resource(nd_region, ndd, nsblk,
+ __le64_to_cpu(nd_label->dpa));
+ if (!res)
+ goto blk_err;
+ nd_dbg_dpa(nd_region, ndd, res, "%d: assign\n", count);
+ return dev;
+ blk_err:
+ namespace_blk_release(dev);
+ return ERR_PTR(-ENXIO);
+}
+
+static int cmp_dpa(const void *a, const void *b)
+{
+ const struct device *dev_a = *(const struct device **) a;
+ const struct device *dev_b = *(const struct device **) b;
+ struct nd_namespace_blk *nsblk_a, *nsblk_b;
+ struct nd_namespace_pmem *nspm_a, *nspm_b;
+
+ if (is_namespace_io(dev_a))
+ return 0;
+
+ if (is_namespace_blk(dev_a)) {
+ nsblk_a = to_nd_namespace_blk(dev_a);
+ nsblk_b = to_nd_namespace_blk(dev_b);
+
+ return memcmp(&nsblk_a->res[0]->start, &nsblk_b->res[0]->start,
+ sizeof(resource_size_t));
+ }
+
+ nspm_a = to_nd_namespace_pmem(dev_a);
+ nspm_b = to_nd_namespace_pmem(dev_b);
+
+ return memcmp(&nspm_a->nsio.res.start, &nspm_b->nsio.res.start,
+ sizeof(resource_size_t));
+}
+
+static struct device **scan_labels(struct nd_region *nd_region)
+{
+ int i, count = 0;
+ struct device *dev, **devs = NULL;
+ struct nd_label_ent *label_ent, *e;
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ resource_size_t map_end = nd_mapping->start + nd_mapping->size - 1;
+
+ /* "safe" because create_namespace_pmem() might list_move() label_ent */
+ list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) {
+ struct nd_namespace_label *nd_label = label_ent->label;
+ struct device **__devs;
+ u32 flags;
+
+ if (!nd_label)
+ continue;
+ flags = __le32_to_cpu(nd_label->flags);
+ if (is_nd_blk(&nd_region->dev)
+ == !!(flags & NSLABEL_FLAG_LOCAL))
+ /* pass, region matches label type */;
+ else
+ continue;
+
+ /* skip labels that describe extents outside of the region */
+ if (__le64_to_cpu(nd_label->dpa) < nd_mapping->start ||
+ __le64_to_cpu(nd_label->dpa) > map_end)
+ continue;
+
+ i = add_namespace_resource(nd_region, nd_label, devs, count);
+ if (i < 0)
+ goto err;
+ if (i < count)
+ continue;
+ __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL);
+ if (!__devs)
+ goto err;
+ memcpy(__devs, devs, sizeof(dev) * count);
+ kfree(devs);
+ devs = __devs;
+
+ if (is_nd_blk(&nd_region->dev))
+ dev = create_namespace_blk(nd_region, nd_label, count);
+ else {
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_index *nsindex;
+
+ nsindex = to_namespace_index(ndd, ndd->ns_current);
+ dev = create_namespace_pmem(nd_region, nsindex, nd_label);
+ }
+
+ if (IS_ERR(dev)) {
+ switch (PTR_ERR(dev)) {
+ case -EAGAIN:
+ /* skip invalid labels */
+ continue;
+ case -ENODEV:
+ /* fallthrough to seed creation */
+ break;
+ default:
+ goto err;
+ }
+ } else
+ devs[count++] = dev;
+
+ }
+
+ dev_dbg(&nd_region->dev, "discovered %d %s namespace%s\n",
+ count, is_nd_blk(&nd_region->dev)
+ ? "blk" : "pmem", count == 1 ? "" : "s");
+
+ if (count == 0) {
+ /* Publish a zero-sized namespace for userspace to configure. */
+ nd_mapping_free_labels(nd_mapping);
+
+ devs = kcalloc(2, sizeof(dev), GFP_KERNEL);
+ if (!devs)
+ goto err;
+ if (is_nd_blk(&nd_region->dev)) {
+ struct nd_namespace_blk *nsblk;
+
+ nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
+ if (!nsblk)
+ goto err;
+ dev = &nsblk->common.dev;
+ dev->type = &namespace_blk_device_type;
+ } else {
+ struct nd_namespace_pmem *nspm;
+
+ nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
+ if (!nspm)
+ goto err;
+ dev = &nspm->nsio.common.dev;
+ dev->type = &namespace_pmem_device_type;
+ nd_namespace_pmem_set_resource(nd_region, nspm, 0);
+ }
+ dev->parent = &nd_region->dev;
+ devs[count++] = dev;
+ } else if (is_memory(&nd_region->dev)) {
+ /* clean unselected labels */
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct list_head *l, *e;
+ LIST_HEAD(list);
+ int j;
+
+ nd_mapping = &nd_region->mapping[i];
+ if (list_empty(&nd_mapping->labels)) {
+ WARN_ON(1);
+ continue;
+ }
+
+ j = count;
+ list_for_each_safe(l, e, &nd_mapping->labels) {
+ if (!j--)
+ break;
+ list_move_tail(l, &list);
+ }
+ nd_mapping_free_labels(nd_mapping);
+ list_splice_init(&list, &nd_mapping->labels);
+ }
+ }
+
+ if (count > 1)
+ sort(devs, count, sizeof(struct device *), cmp_dpa, NULL);
+
+ return devs;
+
+ err:
+ if (devs) {
+ for (i = 0; devs[i]; i++)
+ if (is_nd_blk(&nd_region->dev))
+ namespace_blk_release(devs[i]);
+ else
+ namespace_pmem_release(devs[i]);
+ kfree(devs);
+ }
+ return NULL;
+}
+
+static struct device **create_namespaces(struct nd_region *nd_region)
+{
+ struct nd_mapping *nd_mapping;
+ struct device **devs;
+ int i;
+
+ if (nd_region->ndr_mappings == 0)
+ return NULL;
+
+ /* lock down all mappings while we scan labels */
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ nd_mapping = &nd_region->mapping[i];
+ mutex_lock_nested(&nd_mapping->lock, i);
+ }
+
+ devs = scan_labels(nd_region);
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ int reverse = nd_region->ndr_mappings - 1 - i;
+
+ nd_mapping = &nd_region->mapping[reverse];
+ mutex_unlock(&nd_mapping->lock);
+ }
+
+ return devs;
+}
+
+static int init_active_labels(struct nd_region *nd_region)
+{
+ int i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+ struct nd_label_ent *label_ent;
+ int count, j;
+
+ /*
+ * If the dimm is disabled then we may need to prevent
+ * the region from being activated.
+ */
+ if (!ndd) {
+ if (test_bit(NDD_LOCKED, &nvdimm->flags))
+ /* fail, label data may be unreadable */;
+ else if (test_bit(NDD_ALIASING, &nvdimm->flags))
+ /* fail, labels needed to disambiguate dpa */;
+ else
+ return 0;
+
+ dev_err(&nd_region->dev, "%s: is %s, failing probe\n",
+ dev_name(&nd_mapping->nvdimm->dev),
+ test_bit(NDD_LOCKED, &nvdimm->flags)
+ ? "locked" : "disabled");
+ return -ENXIO;
+ }
+ nd_mapping->ndd = ndd;
+ atomic_inc(&nvdimm->busy);
+ get_ndd(ndd);
+
+ count = nd_label_active_count(ndd);
+ dev_dbg(ndd->dev, "count: %d\n", count);
+ if (!count)
+ continue;
+ for (j = 0; j < count; j++) {
+ struct nd_namespace_label *label;
+
+ label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL);
+ if (!label_ent)
+ break;
+ label = nd_label_active(ndd, j);
+ label_ent->label = label;
+
+ mutex_lock(&nd_mapping->lock);
+ list_add_tail(&label_ent->list, &nd_mapping->labels);
+ mutex_unlock(&nd_mapping->lock);
+ }
+
+ if (j >= count)
+ continue;
+
+ mutex_lock(&nd_mapping->lock);
+ nd_mapping_free_labels(nd_mapping);
+ mutex_unlock(&nd_mapping->lock);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int nd_region_register_namespaces(struct nd_region *nd_region, int *err)
+{
+ struct device **devs = NULL;
+ int i, rc = 0, type;
+
+ *err = 0;
+ nvdimm_bus_lock(&nd_region->dev);
+ rc = init_active_labels(nd_region);
+ if (rc) {
+ nvdimm_bus_unlock(&nd_region->dev);
+ return rc;
+ }
+
+ type = nd_region_to_nstype(nd_region);
+ switch (type) {
+ case ND_DEVICE_NAMESPACE_IO:
+ devs = create_namespace_io(nd_region);
+ break;
+ case ND_DEVICE_NAMESPACE_PMEM:
+ case ND_DEVICE_NAMESPACE_BLK:
+ devs = create_namespaces(nd_region);
+ break;
+ default:
+ break;
+ }
+ nvdimm_bus_unlock(&nd_region->dev);
+
+ if (!devs)
+ return -ENODEV;
+
+ for (i = 0; devs[i]; i++) {
+ struct device *dev = devs[i];
+ int id;
+
+ if (type == ND_DEVICE_NAMESPACE_BLK) {
+ struct nd_namespace_blk *nsblk;
+
+ nsblk = to_nd_namespace_blk(dev);
+ id = ida_simple_get(&nd_region->ns_ida, 0, 0,
+ GFP_KERNEL);
+ nsblk->id = id;
+ } else if (type == ND_DEVICE_NAMESPACE_PMEM) {
+ struct nd_namespace_pmem *nspm;
+
+ nspm = to_nd_namespace_pmem(dev);
+ id = ida_simple_get(&nd_region->ns_ida, 0, 0,
+ GFP_KERNEL);
+ nspm->id = id;
+ } else
+ id = i;
+
+ if (id < 0)
+ break;
+ dev_set_name(dev, "namespace%d.%d", nd_region->id, id);
+ dev->groups = nd_namespace_attribute_groups;
+ nd_device_register(dev);
+ }
+ if (i)
+ nd_region->ns_seed = devs[0];
+
+ if (devs[i]) {
+ int j;
+
+ for (j = i; devs[j]; j++) {
+ struct device *dev = devs[j];
+
+ device_initialize(dev);
+ put_device(dev);
+ }
+ *err = j - i;
+ /*
+ * All of the namespaces we tried to register failed, so
+ * fail region activation.
+ */
+ if (*err == 0)
+ rc = -ENODEV;
+ }
+ kfree(devs);
+
+ if (rc == -ENODEV)
+ return rc;
+
+ return i;
+}
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
new file mode 100644
index 000000000..adf62a6c0
--- /dev/null
+++ b/drivers/nvdimm/nd-core.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ND_CORE_H__
+#define __ND_CORE_H__
+#include <linux/libnvdimm.h>
+#include <linux/device.h>
+#include <linux/libnvdimm.h>
+#include <linux/sizes.h>
+#include <linux/mutex.h>
+#include <linux/nd.h>
+
+extern struct list_head nvdimm_bus_list;
+extern struct mutex nvdimm_bus_list_mutex;
+extern int nvdimm_major;
+
+struct nvdimm_bus {
+ struct nvdimm_bus_descriptor *nd_desc;
+ wait_queue_head_t wait;
+ struct list_head list;
+ struct device dev;
+ int id, probe_active;
+ atomic_t ioctl_active;
+ struct list_head mapping_list;
+ struct mutex reconfig_mutex;
+ struct badrange badrange;
+};
+
+struct nvdimm {
+ unsigned long flags;
+ void *provider_data;
+ unsigned long cmd_mask;
+ struct device dev;
+ atomic_t busy;
+ int id, num_flush;
+ struct resource *flush_wpq;
+};
+
+/**
+ * struct blk_alloc_info - tracking info for BLK dpa scanning
+ * @nd_mapping: blk region mapping boundaries
+ * @available: decremented in alias_dpa_busy as aliased PMEM is scanned
+ * @busy: decremented in blk_dpa_busy to account for ranges already
+ * handled by alias_dpa_busy
+ * @res: alias_dpa_busy interprets this a free space range that needs to
+ * be truncated to the valid BLK allocation starting DPA, blk_dpa_busy
+ * treats it as a busy range that needs the aliased PMEM ranges
+ * truncated.
+ */
+struct blk_alloc_info {
+ struct nd_mapping *nd_mapping;
+ resource_size_t available, busy;
+ struct resource *res;
+};
+
+bool is_nvdimm(struct device *dev);
+bool is_nd_pmem(struct device *dev);
+bool is_nd_volatile(struct device *dev);
+bool is_nd_blk(struct device *dev);
+static inline bool is_nd_region(struct device *dev)
+{
+ return is_nd_pmem(dev) || is_nd_blk(dev) || is_nd_volatile(dev);
+}
+static inline bool is_memory(struct device *dev)
+{
+ return is_nd_pmem(dev) || is_nd_volatile(dev);
+}
+struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
+int __init nvdimm_bus_init(void);
+void nvdimm_bus_exit(void);
+void nvdimm_devs_exit(void);
+void nd_region_devs_exit(void);
+void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+struct nd_region;
+void nd_region_create_ns_seed(struct nd_region *nd_region);
+void nd_region_create_btt_seed(struct nd_region *nd_region);
+void nd_region_create_pfn_seed(struct nd_region *nd_region);
+void nd_region_create_dax_seed(struct nd_region *nd_region);
+void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
+void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
+void nd_synchronize(void);
+int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus);
+int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus);
+int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus);
+void __nd_device_register(struct device *dev);
+int nd_match_dimm(struct device *dev, void *data);
+struct nd_label_id;
+char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags);
+bool nd_is_uuid_unique(struct device *dev, u8 *uuid);
+struct nd_region;
+struct nvdimm_drvdata;
+struct nd_mapping;
+void nd_mapping_free_labels(struct nd_mapping *nd_mapping);
+
+int __reserve_free_pmem(struct device *dev, void *data);
+void release_free_pmem(struct nvdimm_bus *nvdimm_bus,
+ struct nd_mapping *nd_mapping);
+
+resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping);
+resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region);
+resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping, resource_size_t *overlap);
+resource_size_t nd_blk_available_dpa(struct nd_region *nd_region);
+resource_size_t nd_region_available_dpa(struct nd_region *nd_region);
+int nd_region_conflict(struct nd_region *nd_region, resource_size_t start,
+ resource_size_t size);
+resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
+ struct nd_label_id *label_id);
+int alias_dpa_busy(struct device *dev, void *data);
+struct resource *nsblk_add_resource(struct nd_region *nd_region,
+ struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk,
+ resource_size_t start);
+int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd);
+void get_ndd(struct nvdimm_drvdata *ndd);
+resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
+void nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns);
+void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns);
+bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
+ struct nd_namespace_common **_ndns);
+bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
+ struct nd_namespace_common **_ndns);
+ssize_t nd_namespace_store(struct device *dev,
+ struct nd_namespace_common **_ndns, const char *buf,
+ size_t len);
+struct nd_pfn *to_nd_pfn_safe(struct device *dev);
+#endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
new file mode 100644
index 000000000..01e194a58
--- /dev/null
+++ b/drivers/nvdimm/nd.h
@@ -0,0 +1,430 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ND_H__
+#define __ND_H__
+#include <linux/libnvdimm.h>
+#include <linux/badblocks.h>
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/ndctl.h>
+#include <linux/types.h>
+#include <linux/nd.h>
+#include "label.h"
+
+enum {
+ /*
+ * Limits the maximum number of block apertures a dimm can
+ * support and is an input to the geometry/on-disk-format of a
+ * BTT instance
+ */
+ ND_MAX_LANES = 256,
+ INT_LBASIZE_ALIGNMENT = 64,
+ NVDIMM_IO_ATOMIC = 1,
+};
+
+struct nvdimm_drvdata {
+ struct device *dev;
+ int nslabel_size;
+ struct nd_cmd_get_config_size nsarea;
+ void *data;
+ int ns_current, ns_next;
+ struct resource dpa;
+ struct kref kref;
+};
+
+struct nd_region_data {
+ int ns_count;
+ int ns_active;
+ unsigned int hints_shift;
+ void __iomem *flush_wpq[0];
+};
+
+static inline void __iomem *ndrd_get_flush_wpq(struct nd_region_data *ndrd,
+ int dimm, int hint)
+{
+ unsigned int num = 1 << ndrd->hints_shift;
+ unsigned int mask = num - 1;
+
+ return ndrd->flush_wpq[dimm * num + (hint & mask)];
+}
+
+static inline void ndrd_set_flush_wpq(struct nd_region_data *ndrd, int dimm,
+ int hint, void __iomem *flush)
+{
+ unsigned int num = 1 << ndrd->hints_shift;
+ unsigned int mask = num - 1;
+
+ ndrd->flush_wpq[dimm * num + (hint & mask)] = flush;
+}
+
+static inline struct nd_namespace_index *to_namespace_index(
+ struct nvdimm_drvdata *ndd, int i)
+{
+ if (i < 0)
+ return NULL;
+
+ return ndd->data + sizeof_namespace_index(ndd) * i;
+}
+
+static inline struct nd_namespace_index *to_current_namespace_index(
+ struct nvdimm_drvdata *ndd)
+{
+ return to_namespace_index(ndd, ndd->ns_current);
+}
+
+static inline struct nd_namespace_index *to_next_namespace_index(
+ struct nvdimm_drvdata *ndd)
+{
+ return to_namespace_index(ndd, ndd->ns_next);
+}
+
+unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd);
+
+#define namespace_label_has(ndd, field) \
+ (offsetof(struct nd_namespace_label, field) \
+ < sizeof_namespace_label(ndd))
+
+#define nd_dbg_dpa(r, d, res, fmt, arg...) \
+ dev_dbg((r) ? &(r)->dev : (d)->dev, "%s: %.13s: %#llx @ %#llx " fmt, \
+ (r) ? dev_name((d)->dev) : "", res ? res->name : "null", \
+ (unsigned long long) (res ? resource_size(res) : 0), \
+ (unsigned long long) (res ? res->start : 0), ##arg)
+
+#define for_each_dpa_resource(ndd, res) \
+ for (res = (ndd)->dpa.child; res; res = res->sibling)
+
+#define for_each_dpa_resource_safe(ndd, res, next) \
+ for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \
+ res; res = next, next = next ? next->sibling : NULL)
+
+struct nd_percpu_lane {
+ int count;
+ spinlock_t lock;
+};
+
+enum nd_label_flags {
+ ND_LABEL_REAP,
+};
+struct nd_label_ent {
+ struct list_head list;
+ unsigned long flags;
+ struct nd_namespace_label *label;
+};
+
+enum nd_mapping_lock_class {
+ ND_MAPPING_CLASS0,
+ ND_MAPPING_UUID_SCAN,
+};
+
+struct nd_mapping {
+ struct nvdimm *nvdimm;
+ u64 start;
+ u64 size;
+ int position;
+ struct list_head labels;
+ struct mutex lock;
+ /*
+ * @ndd is for private use at region enable / disable time for
+ * get_ndd() + put_ndd(), all other nd_mapping to ndd
+ * conversions use to_ndd() which respects enabled state of the
+ * nvdimm.
+ */
+ struct nvdimm_drvdata *ndd;
+};
+
+struct nd_region {
+ struct device dev;
+ struct ida ns_ida;
+ struct ida btt_ida;
+ struct ida pfn_ida;
+ struct ida dax_ida;
+ unsigned long flags;
+ struct device *ns_seed;
+ struct device *btt_seed;
+ struct device *pfn_seed;
+ struct device *dax_seed;
+ u16 ndr_mappings;
+ u64 ndr_size;
+ u64 ndr_start;
+ int id, num_lanes, ro, numa_node;
+ void *provider_data;
+ struct kernfs_node *bb_state;
+ struct badblocks bb;
+ struct nd_interleave_set *nd_set;
+ struct nd_percpu_lane __percpu *lane;
+ struct nd_mapping mapping[0];
+};
+
+struct nd_blk_region {
+ int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+ int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
+ void *iobuf, u64 len, int rw);
+ void *blk_provider_data;
+ struct nd_region nd_region;
+};
+
+/*
+ * Lookup next in the repeating sequence of 01, 10, and 11.
+ */
+static inline unsigned nd_inc_seq(unsigned seq)
+{
+ static const unsigned next[] = { 0, 2, 3, 1 };
+
+ return next[seq & 3];
+}
+
+struct btt;
+struct nd_btt {
+ struct device dev;
+ struct nd_namespace_common *ndns;
+ struct btt *btt;
+ unsigned long lbasize;
+ u64 size;
+ u8 *uuid;
+ int id;
+ int initial_offset;
+ u16 version_major;
+ u16 version_minor;
+};
+
+enum nd_pfn_mode {
+ PFN_MODE_NONE,
+ PFN_MODE_RAM,
+ PFN_MODE_PMEM,
+};
+
+struct nd_pfn {
+ int id;
+ u8 *uuid;
+ struct device dev;
+ unsigned long align;
+ unsigned long npfns;
+ enum nd_pfn_mode mode;
+ struct nd_pfn_sb *pfn_sb;
+ struct nd_namespace_common *ndns;
+};
+
+struct nd_dax {
+ struct nd_pfn nd_pfn;
+};
+
+enum nd_async_mode {
+ ND_SYNC,
+ ND_ASYNC,
+};
+
+int nd_integrity_init(struct gendisk *disk, unsigned long meta_size);
+void wait_nvdimm_bus_probe_idle(struct device *dev);
+void nd_device_register(struct device *dev);
+void nd_device_unregister(struct device *dev, enum nd_async_mode mode);
+void nd_device_notify(struct device *dev, enum nvdimm_event event);
+int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
+ size_t len);
+ssize_t nd_size_select_show(unsigned long current_size,
+ const unsigned long *supported, char *buf);
+ssize_t nd_size_select_store(struct device *dev, const char *buf,
+ unsigned long *current_size, const unsigned long *supported);
+int __init nvdimm_init(void);
+int __init nd_region_init(void);
+int __init nd_label_init(void);
+void nvdimm_exit(void);
+void nd_region_exit(void);
+struct nvdimm;
+struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping);
+int nvdimm_check_config_data(struct device *dev);
+int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd);
+int nvdimm_init_config_data(struct nvdimm_drvdata *ndd);
+int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
+ void *buf, size_t len);
+long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
+ unsigned int len);
+void nvdimm_set_aliasing(struct device *dev);
+void nvdimm_set_locked(struct device *dev);
+void nvdimm_clear_locked(struct device *dev);
+struct nd_btt *to_nd_btt(struct device *dev);
+
+struct nd_gen_sb {
+ char reserved[SZ_4K - 8];
+ __le64 checksum;
+};
+
+u64 nd_sb_checksum(struct nd_gen_sb *sb);
+#if IS_ENABLED(CONFIG_BTT)
+int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns);
+bool is_nd_btt(struct device *dev);
+struct device *nd_btt_create(struct nd_region *nd_region);
+#else
+static inline int nd_btt_probe(struct device *dev,
+ struct nd_namespace_common *ndns)
+{
+ return -ENODEV;
+}
+
+static inline bool is_nd_btt(struct device *dev)
+{
+ return false;
+}
+
+static inline struct device *nd_btt_create(struct nd_region *nd_region)
+{
+ return NULL;
+}
+#endif
+
+struct nd_pfn *to_nd_pfn(struct device *dev);
+#if IS_ENABLED(CONFIG_NVDIMM_PFN)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE
+#else
+#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE
+#endif
+
+int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
+bool is_nd_pfn(struct device *dev);
+struct device *nd_pfn_create(struct nd_region *nd_region);
+struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
+ struct nd_namespace_common *ndns);
+int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig);
+extern struct attribute_group nd_pfn_attribute_group;
+#else
+static inline int nd_pfn_probe(struct device *dev,
+ struct nd_namespace_common *ndns)
+{
+ return -ENODEV;
+}
+
+static inline bool is_nd_pfn(struct device *dev)
+{
+ return false;
+}
+
+static inline struct device *nd_pfn_create(struct nd_region *nd_region)
+{
+ return NULL;
+}
+
+static inline int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
+{
+ return -ENODEV;
+}
+#endif
+
+struct nd_dax *to_nd_dax(struct device *dev);
+#if IS_ENABLED(CONFIG_NVDIMM_DAX)
+int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns);
+bool is_nd_dax(struct device *dev);
+struct device *nd_dax_create(struct nd_region *nd_region);
+#else
+static inline int nd_dax_probe(struct device *dev,
+ struct nd_namespace_common *ndns)
+{
+ return -ENODEV;
+}
+
+static inline bool is_nd_dax(struct device *dev)
+{
+ return false;
+}
+
+static inline struct device *nd_dax_create(struct nd_region *nd_region)
+{
+ return NULL;
+}
+#endif
+
+int nd_region_to_nstype(struct nd_region *nd_region);
+int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
+u64 nd_region_interleave_set_cookie(struct nd_region *nd_region,
+ struct nd_namespace_index *nsindex);
+u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region);
+void nvdimm_bus_lock(struct device *dev);
+void nvdimm_bus_unlock(struct device *dev);
+bool is_nvdimm_bus_locked(struct device *dev);
+int nvdimm_revalidate_disk(struct gendisk *disk);
+void nvdimm_drvdata_release(struct kref *kref);
+void put_ndd(struct nvdimm_drvdata *ndd);
+int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd);
+void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res);
+struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
+ struct nd_label_id *label_id, resource_size_t start,
+ resource_size_t n);
+resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
+bool nvdimm_namespace_locked(struct nd_namespace_common *ndns);
+struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev);
+int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
+int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt);
+const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
+ char *name);
+unsigned int pmem_sector_size(struct nd_namespace_common *ndns);
+void nvdimm_badblocks_populate(struct nd_region *nd_region,
+ struct badblocks *bb, const struct resource *res);
+#if IS_ENABLED(CONFIG_ND_CLAIM)
+int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap);
+int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio);
+void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio);
+#else
+static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+ struct dev_pagemap *pgmap)
+{
+ return -ENXIO;
+}
+static inline int devm_nsio_enable(struct device *dev,
+ struct nd_namespace_io *nsio)
+{
+ return -ENXIO;
+}
+static inline void devm_nsio_disable(struct device *dev,
+ struct nd_namespace_io *nsio)
+{
+}
+#endif
+int nd_blk_region_init(struct nd_region *nd_region);
+int nd_region_activate(struct nd_region *nd_region);
+void __nd_iostat_start(struct bio *bio, unsigned long *start);
+static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
+{
+ struct gendisk *disk = bio->bi_disk;
+
+ if (!blk_queue_io_stat(disk->queue))
+ return false;
+
+ *start = jiffies;
+ generic_start_io_acct(disk->queue, bio_op(bio), bio_sectors(bio),
+ &disk->part0);
+ return true;
+}
+static inline void nd_iostat_end(struct bio *bio, unsigned long start)
+{
+ struct gendisk *disk = bio->bi_disk;
+
+ generic_end_io_acct(disk->queue, bio_op(bio), &disk->part0, start);
+}
+static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
+ unsigned int len)
+{
+ if (bb->count) {
+ sector_t first_bad;
+ int num_bad;
+
+ return !!badblocks_check(bb, sector, len / 512, &first_bad,
+ &num_bad);
+ }
+
+ return false;
+}
+resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk);
+const u8 *nd_dev_to_uuid(struct device *dev);
+bool pmem_should_map_pages(struct device *dev);
+#endif /* __ND_H__ */
diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c
new file mode 100644
index 000000000..0a701837d
--- /dev/null
+++ b/drivers/nvdimm/of_pmem.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#define pr_fmt(fmt) "of_pmem: " fmt
+
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/libnvdimm.h>
+#include <linux/module.h>
+#include <linux/ioport.h>
+#include <linux/slab.h>
+
+static const struct attribute_group *region_attr_groups[] = {
+ &nd_region_attribute_group,
+ &nd_device_attribute_group,
+ NULL,
+};
+
+static const struct attribute_group *bus_attr_groups[] = {
+ &nvdimm_bus_attribute_group,
+ NULL,
+};
+
+struct of_pmem_private {
+ struct nvdimm_bus_descriptor bus_desc;
+ struct nvdimm_bus *bus;
+};
+
+static int of_pmem_region_probe(struct platform_device *pdev)
+{
+ struct of_pmem_private *priv;
+ struct device_node *np;
+ struct nvdimm_bus *bus;
+ bool is_volatile;
+ int i;
+
+ np = dev_of_node(&pdev->dev);
+ if (!np)
+ return -ENXIO;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->bus_desc.attr_groups = bus_attr_groups;
+ priv->bus_desc.provider_name = "of_pmem";
+ priv->bus_desc.module = THIS_MODULE;
+ priv->bus_desc.of_node = np;
+
+ priv->bus = bus = nvdimm_bus_register(&pdev->dev, &priv->bus_desc);
+ if (!bus) {
+ kfree(priv);
+ return -ENODEV;
+ }
+ platform_set_drvdata(pdev, priv);
+
+ is_volatile = !!of_find_property(np, "volatile", NULL);
+ dev_dbg(&pdev->dev, "Registering %s regions from %pOF\n",
+ is_volatile ? "volatile" : "non-volatile", np);
+
+ for (i = 0; i < pdev->num_resources; i++) {
+ struct nd_region_desc ndr_desc;
+ struct nd_region *region;
+
+ /*
+ * NB: libnvdimm copies the data from ndr_desc into it's own
+ * structures so passing a stack pointer is fine.
+ */
+ memset(&ndr_desc, 0, sizeof(ndr_desc));
+ ndr_desc.attr_groups = region_attr_groups;
+ ndr_desc.numa_node = dev_to_node(&pdev->dev);
+ ndr_desc.res = &pdev->resource[i];
+ ndr_desc.of_node = np;
+ set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
+
+ if (is_volatile)
+ region = nvdimm_volatile_region_create(bus, &ndr_desc);
+ else
+ region = nvdimm_pmem_region_create(bus, &ndr_desc);
+
+ if (!region)
+ dev_warn(&pdev->dev, "Unable to register region %pR from %pOF\n",
+ ndr_desc.res, np);
+ else
+ dev_dbg(&pdev->dev, "Registered region %pR from %pOF\n",
+ ndr_desc.res, np);
+ }
+
+ return 0;
+}
+
+static int of_pmem_region_remove(struct platform_device *pdev)
+{
+ struct of_pmem_private *priv = platform_get_drvdata(pdev);
+
+ nvdimm_bus_unregister(priv->bus);
+ kfree(priv);
+
+ return 0;
+}
+
+static const struct of_device_id of_pmem_region_match[] = {
+ { .compatible = "pmem-region" },
+ { },
+};
+
+static struct platform_driver of_pmem_region_driver = {
+ .probe = of_pmem_region_probe,
+ .remove = of_pmem_region_remove,
+ .driver = {
+ .name = "of_pmem",
+ .owner = THIS_MODULE,
+ .of_match_table = of_pmem_region_match,
+ },
+};
+
+module_platform_driver(of_pmem_region_driver);
+MODULE_DEVICE_TABLE(of, of_pmem_region_match);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("IBM Corporation");
diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
new file mode 100644
index 000000000..e901e3a3b
--- /dev/null
+++ b/drivers/nvdimm/pfn.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2014-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __NVDIMM_PFN_H
+#define __NVDIMM_PFN_H
+
+#include <linux/types.h>
+#include <linux/mmzone.h>
+
+#define PFN_SIG_LEN 16
+#define PFN_SIG "NVDIMM_PFN_INFO\0"
+#define DAX_SIG "NVDIMM_DAX_INFO\0"
+
+struct nd_pfn_sb {
+ u8 signature[PFN_SIG_LEN];
+ u8 uuid[16];
+ u8 parent_uuid[16];
+ __le32 flags;
+ __le16 version_major;
+ __le16 version_minor;
+ __le64 dataoff; /* relative to namespace_base + start_pad */
+ __le64 npfns;
+ __le32 mode;
+ /* minor-version-1 additions for section alignment */
+ __le32 start_pad;
+ __le32 end_trunc;
+ /* minor-version-2 record the base alignment of the mapping */
+ __le32 align;
+ /* minor-version-3 guarantee the padding and flags are zero */
+ u8 padding[4000];
+ __le64 checksum;
+};
+
+#ifdef CONFIG_SPARSEMEM
+#define PFN_SECTION_ALIGN_DOWN(x) SECTION_ALIGN_DOWN(x)
+#define PFN_SECTION_ALIGN_UP(x) SECTION_ALIGN_UP(x)
+#else
+/*
+ * In this case ZONE_DEVICE=n and we will disable 'pfn' device support,
+ * but we still want pmem to compile.
+ */
+#define PFN_SECTION_ALIGN_DOWN(x) (x)
+#define PFN_SECTION_ALIGN_UP(x) (x)
+#endif
+
+#define PHYS_SECTION_ALIGN_DOWN(x) PFN_PHYS(PFN_SECTION_ALIGN_DOWN(PHYS_PFN(x)))
+#define PHYS_SECTION_ALIGN_UP(x) PFN_PHYS(PFN_SECTION_ALIGN_UP(PHYS_PFN(x)))
+#endif /* __NVDIMM_PFN_H */
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
new file mode 100644
index 000000000..86ed09b2a
--- /dev/null
+++ b/drivers/nvdimm/pfn_devs.c
@@ -0,0 +1,748 @@
+/*
+ * Copyright(c) 2013-2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/memremap.h>
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "nd-core.h"
+#include "pfn.h"
+#include "nd.h"
+
+static void nd_pfn_release(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+
+ dev_dbg(dev, "trace\n");
+ nd_detach_ndns(&nd_pfn->dev, &nd_pfn->ndns);
+ ida_simple_remove(&nd_region->pfn_ida, nd_pfn->id);
+ kfree(nd_pfn->uuid);
+ kfree(nd_pfn);
+}
+
+static struct device_type nd_pfn_device_type = {
+ .name = "nd_pfn",
+ .release = nd_pfn_release,
+};
+
+bool is_nd_pfn(struct device *dev)
+{
+ return dev ? dev->type == &nd_pfn_device_type : false;
+}
+EXPORT_SYMBOL(is_nd_pfn);
+
+struct nd_pfn *to_nd_pfn(struct device *dev)
+{
+ struct nd_pfn *nd_pfn = container_of(dev, struct nd_pfn, dev);
+
+ WARN_ON(!is_nd_pfn(dev));
+ return nd_pfn;
+}
+EXPORT_SYMBOL(to_nd_pfn);
+
+static ssize_t mode_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+
+ switch (nd_pfn->mode) {
+ case PFN_MODE_RAM:
+ return sprintf(buf, "ram\n");
+ case PFN_MODE_PMEM:
+ return sprintf(buf, "pmem\n");
+ default:
+ return sprintf(buf, "none\n");
+ }
+}
+
+static ssize_t mode_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+ ssize_t rc = 0;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ if (dev->driver)
+ rc = -EBUSY;
+ else {
+ size_t n = len - 1;
+
+ if (strncmp(buf, "pmem\n", n) == 0
+ || strncmp(buf, "pmem", n) == 0) {
+ nd_pfn->mode = PFN_MODE_PMEM;
+ } else if (strncmp(buf, "ram\n", n) == 0
+ || strncmp(buf, "ram", n) == 0)
+ nd_pfn->mode = PFN_MODE_RAM;
+ else if (strncmp(buf, "none\n", n) == 0
+ || strncmp(buf, "none", n) == 0)
+ nd_pfn->mode = PFN_MODE_NONE;
+ else
+ rc = -EINVAL;
+ }
+ dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
+ buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(mode);
+
+static ssize_t align_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+
+ return sprintf(buf, "%ld\n", nd_pfn->align);
+}
+
+static const unsigned long *nd_pfn_supported_alignments(void)
+{
+ /*
+ * This needs to be a non-static variable because the *_SIZE
+ * macros aren't always constants.
+ */
+ const unsigned long supported_alignments[] = {
+ PAGE_SIZE,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ HPAGE_PMD_SIZE,
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ HPAGE_PUD_SIZE,
+#endif
+#endif
+ 0,
+ };
+ static unsigned long data[ARRAY_SIZE(supported_alignments)];
+
+ memcpy(data, supported_alignments, sizeof(data));
+
+ return data;
+}
+
+static ssize_t align_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ rc = nd_size_select_store(dev, buf, &nd_pfn->align,
+ nd_pfn_supported_alignments());
+ dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
+ buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(align);
+
+static ssize_t uuid_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+
+ if (nd_pfn->uuid)
+ return sprintf(buf, "%pUb\n", nd_pfn->uuid);
+ return sprintf(buf, "\n");
+}
+
+static ssize_t uuid_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ rc = nd_uuid_store(dev, &nd_pfn->uuid, buf, len);
+ dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
+ buf[len - 1] == '\n' ? "" : "\n");
+ device_unlock(dev);
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+static ssize_t namespace_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ rc = sprintf(buf, "%s\n", nd_pfn->ndns
+ ? dev_name(&nd_pfn->ndns->dev) : "");
+ nvdimm_bus_unlock(dev);
+ return rc;
+}
+
+static ssize_t namespace_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ rc = nd_namespace_store(dev, &nd_pfn->ndns, buf, len);
+ dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
+ buf[len - 1] == '\n' ? "" : "\n");
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RW(namespace);
+
+static ssize_t resource_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ if (dev->driver) {
+ struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+ u64 offset = __le64_to_cpu(pfn_sb->dataoff);
+ struct nd_namespace_common *ndns = nd_pfn->ndns;
+ u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
+ struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+
+ rc = sprintf(buf, "%#llx\n", (unsigned long long) nsio->res.start
+ + start_pad + offset);
+ } else {
+ /* no address to convey if the pfn instance is disabled */
+ rc = -ENXIO;
+ }
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(resource);
+
+static ssize_t size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ if (dev->driver) {
+ struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+ u64 offset = __le64_to_cpu(pfn_sb->dataoff);
+ struct nd_namespace_common *ndns = nd_pfn->ndns;
+ u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
+ u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+ struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+
+ rc = sprintf(buf, "%llu\n", (unsigned long long)
+ resource_size(&nsio->res) - start_pad
+ - end_trunc - offset);
+ } else {
+ /* no size to convey if the pfn instance is disabled */
+ rc = -ENXIO;
+ }
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(size);
+
+static ssize_t supported_alignments_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return nd_size_select_show(0, nd_pfn_supported_alignments(), buf);
+}
+static DEVICE_ATTR_RO(supported_alignments);
+
+static struct attribute *nd_pfn_attributes[] = {
+ &dev_attr_mode.attr,
+ &dev_attr_namespace.attr,
+ &dev_attr_uuid.attr,
+ &dev_attr_align.attr,
+ &dev_attr_resource.attr,
+ &dev_attr_size.attr,
+ &dev_attr_supported_alignments.attr,
+ NULL,
+};
+
+static umode_t pfn_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ if (a == &dev_attr_resource.attr)
+ return 0400;
+ return a->mode;
+}
+
+struct attribute_group nd_pfn_attribute_group = {
+ .attrs = nd_pfn_attributes,
+ .is_visible = pfn_visible,
+};
+
+static const struct attribute_group *nd_pfn_attribute_groups[] = {
+ &nd_pfn_attribute_group,
+ &nd_device_attribute_group,
+ &nd_numa_attribute_group,
+ NULL,
+};
+
+struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
+ struct nd_namespace_common *ndns)
+{
+ struct device *dev;
+
+ if (!nd_pfn)
+ return NULL;
+
+ nd_pfn->mode = PFN_MODE_NONE;
+ nd_pfn->align = PFN_DEFAULT_ALIGNMENT;
+ dev = &nd_pfn->dev;
+ device_initialize(&nd_pfn->dev);
+ if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
+ dev_dbg(&ndns->dev, "failed, already claimed by %s\n",
+ dev_name(ndns->claim));
+ put_device(dev);
+ return NULL;
+ }
+ return dev;
+}
+
+static struct nd_pfn *nd_pfn_alloc(struct nd_region *nd_region)
+{
+ struct nd_pfn *nd_pfn;
+ struct device *dev;
+
+ nd_pfn = kzalloc(sizeof(*nd_pfn), GFP_KERNEL);
+ if (!nd_pfn)
+ return NULL;
+
+ nd_pfn->id = ida_simple_get(&nd_region->pfn_ida, 0, 0, GFP_KERNEL);
+ if (nd_pfn->id < 0) {
+ kfree(nd_pfn);
+ return NULL;
+ }
+
+ dev = &nd_pfn->dev;
+ dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id);
+ dev->groups = nd_pfn_attribute_groups;
+ dev->type = &nd_pfn_device_type;
+ dev->parent = &nd_region->dev;
+
+ return nd_pfn;
+}
+
+struct device *nd_pfn_create(struct nd_region *nd_region)
+{
+ struct nd_pfn *nd_pfn;
+ struct device *dev;
+
+ if (!is_memory(&nd_region->dev))
+ return NULL;
+
+ nd_pfn = nd_pfn_alloc(nd_region);
+ dev = nd_pfn_devinit(nd_pfn, NULL);
+
+ __nd_device_register(dev);
+ return dev;
+}
+
+/**
+ * nd_pfn_validate - read and validate info-block
+ * @nd_pfn: fsdax namespace runtime state / properties
+ * @sig: 'devdax' or 'fsdax' signature
+ *
+ * Upon return the info-block buffer contents (->pfn_sb) are
+ * indeterminate when validation fails, and a coherent info-block
+ * otherwise.
+ */
+int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
+{
+ u64 checksum, offset;
+ enum nd_pfn_mode mode;
+ struct nd_namespace_io *nsio;
+ unsigned long align, start_pad;
+ struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+ struct nd_namespace_common *ndns = nd_pfn->ndns;
+ const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev);
+
+ if (!pfn_sb || !ndns)
+ return -ENODEV;
+
+ if (!is_memory(nd_pfn->dev.parent))
+ return -ENODEV;
+
+ if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0))
+ return -ENXIO;
+
+ if (memcmp(pfn_sb->signature, sig, PFN_SIG_LEN) != 0)
+ return -ENODEV;
+
+ checksum = le64_to_cpu(pfn_sb->checksum);
+ pfn_sb->checksum = 0;
+ if (checksum != nd_sb_checksum((struct nd_gen_sb *) pfn_sb))
+ return -ENODEV;
+ pfn_sb->checksum = cpu_to_le64(checksum);
+
+ if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0)
+ return -ENODEV;
+
+ if (__le16_to_cpu(pfn_sb->version_minor) < 1) {
+ pfn_sb->start_pad = 0;
+ pfn_sb->end_trunc = 0;
+ }
+
+ if (__le16_to_cpu(pfn_sb->version_minor) < 2)
+ pfn_sb->align = 0;
+
+ switch (le32_to_cpu(pfn_sb->mode)) {
+ case PFN_MODE_RAM:
+ case PFN_MODE_PMEM:
+ break;
+ default:
+ return -ENXIO;
+ }
+
+ align = le32_to_cpu(pfn_sb->align);
+ offset = le64_to_cpu(pfn_sb->dataoff);
+ start_pad = le32_to_cpu(pfn_sb->start_pad);
+ if (align == 0)
+ align = 1UL << ilog2(offset);
+ mode = le32_to_cpu(pfn_sb->mode);
+
+ if (!nd_pfn->uuid) {
+ /*
+ * When probing a namepace via nd_pfn_probe() the uuid
+ * is NULL (see: nd_pfn_devinit()) we init settings from
+ * pfn_sb
+ */
+ nd_pfn->uuid = kmemdup(pfn_sb->uuid, 16, GFP_KERNEL);
+ if (!nd_pfn->uuid)
+ return -ENOMEM;
+ nd_pfn->align = align;
+ nd_pfn->mode = mode;
+ } else {
+ /*
+ * When probing a pfn / dax instance we validate the
+ * live settings against the pfn_sb
+ */
+ if (memcmp(nd_pfn->uuid, pfn_sb->uuid, 16) != 0)
+ return -ENODEV;
+
+ /*
+ * If the uuid validates, but other settings mismatch
+ * return EINVAL because userspace has managed to change
+ * the configuration without specifying new
+ * identification.
+ */
+ if (nd_pfn->align != align || nd_pfn->mode != mode) {
+ dev_err(&nd_pfn->dev,
+ "init failed, settings mismatch\n");
+ dev_dbg(&nd_pfn->dev, "align: %lx:%lx mode: %d:%d\n",
+ nd_pfn->align, align, nd_pfn->mode,
+ mode);
+ return -EINVAL;
+ }
+ }
+
+ if (align > nvdimm_namespace_capacity(ndns)) {
+ dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n",
+ align, nvdimm_namespace_capacity(ndns));
+ return -EINVAL;
+ }
+
+ /*
+ * These warnings are verbose because they can only trigger in
+ * the case where the physical address alignment of the
+ * namespace has changed since the pfn superblock was
+ * established.
+ */
+ nsio = to_nd_namespace_io(&ndns->dev);
+ if (offset >= resource_size(&nsio->res)) {
+ dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n",
+ dev_name(&ndns->dev));
+ return -EBUSY;
+ }
+
+ if ((align && !IS_ALIGNED(nsio->res.start + offset + start_pad, align))
+ || !IS_ALIGNED(offset, PAGE_SIZE)) {
+ dev_err(&nd_pfn->dev,
+ "bad offset: %#llx dax disabled align: %#lx\n",
+ offset, align);
+ return -ENXIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(nd_pfn_validate);
+
+int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
+{
+ int rc;
+ struct nd_pfn *nd_pfn;
+ struct device *pfn_dev;
+ struct nd_pfn_sb *pfn_sb;
+ struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+
+ if (ndns->force_raw)
+ return -ENODEV;
+
+ switch (ndns->claim_class) {
+ case NVDIMM_CCLASS_NONE:
+ case NVDIMM_CCLASS_PFN:
+ break;
+ default:
+ return -ENODEV;
+ }
+
+ nvdimm_bus_lock(&ndns->dev);
+ nd_pfn = nd_pfn_alloc(nd_region);
+ pfn_dev = nd_pfn_devinit(nd_pfn, ndns);
+ nvdimm_bus_unlock(&ndns->dev);
+ if (!pfn_dev)
+ return -ENOMEM;
+ pfn_sb = devm_kmalloc(dev, sizeof(*pfn_sb), GFP_KERNEL);
+ nd_pfn = to_nd_pfn(pfn_dev);
+ nd_pfn->pfn_sb = pfn_sb;
+ rc = nd_pfn_validate(nd_pfn, PFN_SIG);
+ dev_dbg(dev, "pfn: %s\n", rc == 0 ? dev_name(pfn_dev) : "<none>");
+ if (rc < 0) {
+ nd_detach_ndns(pfn_dev, &nd_pfn->ndns);
+ put_device(pfn_dev);
+ } else
+ __nd_device_register(pfn_dev);
+
+ return rc;
+}
+EXPORT_SYMBOL(nd_pfn_probe);
+
+/*
+ * We hotplug memory at section granularity, pad the reserved area from
+ * the previous section base to the namespace base address.
+ */
+static unsigned long init_altmap_base(resource_size_t base)
+{
+ unsigned long base_pfn = PHYS_PFN(base);
+
+ return PFN_SECTION_ALIGN_DOWN(base_pfn);
+}
+
+static unsigned long init_altmap_reserve(resource_size_t base)
+{
+ unsigned long reserve = PFN_UP(SZ_8K);
+ unsigned long base_pfn = PHYS_PFN(base);
+
+ reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn);
+ return reserve;
+}
+
+static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
+{
+ struct resource *res = &pgmap->res;
+ struct vmem_altmap *altmap = &pgmap->altmap;
+ struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+ u64 offset = le64_to_cpu(pfn_sb->dataoff);
+ u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
+ u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+ struct nd_namespace_common *ndns = nd_pfn->ndns;
+ struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+ resource_size_t base = nsio->res.start + start_pad;
+ struct vmem_altmap __altmap = {
+ .base_pfn = init_altmap_base(base),
+ .reserve = init_altmap_reserve(base),
+ };
+
+ memcpy(res, &nsio->res, sizeof(*res));
+ res->start += start_pad;
+ res->end -= end_trunc;
+
+ if (nd_pfn->mode == PFN_MODE_RAM) {
+ if (offset < SZ_8K)
+ return -EINVAL;
+ nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
+ pgmap->altmap_valid = false;
+ } else if (nd_pfn->mode == PFN_MODE_PMEM) {
+ nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res)
+ - offset) / PAGE_SIZE);
+ if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
+ dev_info(&nd_pfn->dev,
+ "number of pfns truncated from %lld to %ld\n",
+ le64_to_cpu(nd_pfn->pfn_sb->npfns),
+ nd_pfn->npfns);
+ memcpy(altmap, &__altmap, sizeof(*altmap));
+ altmap->free = PHYS_PFN(offset - SZ_8K);
+ altmap->alloc = 0;
+ pgmap->altmap_valid = true;
+ } else
+ return -ENXIO;
+
+ return 0;
+}
+
+static u64 phys_pmem_align_down(struct nd_pfn *nd_pfn, u64 phys)
+{
+ return min_t(u64, PHYS_SECTION_ALIGN_DOWN(phys),
+ ALIGN_DOWN(phys, nd_pfn->align));
+}
+
+/*
+ * Check if pmem collides with 'System RAM', or other regions when
+ * section aligned. Trim it accordingly.
+ */
+static void trim_pfn_device(struct nd_pfn *nd_pfn, u32 *start_pad, u32 *end_trunc)
+{
+ struct nd_namespace_common *ndns = nd_pfn->ndns;
+ struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+ struct nd_region *nd_region = to_nd_region(nd_pfn->dev.parent);
+ const resource_size_t start = nsio->res.start;
+ const resource_size_t end = start + resource_size(&nsio->res);
+ resource_size_t adjust, size;
+
+ *start_pad = 0;
+ *end_trunc = 0;
+
+ adjust = start - PHYS_SECTION_ALIGN_DOWN(start);
+ size = resource_size(&nsio->res) + adjust;
+ if (region_intersects(start - adjust, size, IORESOURCE_SYSTEM_RAM,
+ IORES_DESC_NONE) == REGION_MIXED
+ || nd_region_conflict(nd_region, start - adjust, size))
+ *start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
+
+ /* Now check that end of the range does not collide. */
+ adjust = PHYS_SECTION_ALIGN_UP(end) - end;
+ size = resource_size(&nsio->res) + adjust;
+ if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
+ IORES_DESC_NONE) == REGION_MIXED
+ || !IS_ALIGNED(end, nd_pfn->align)
+ || nd_region_conflict(nd_region, start, size))
+ *end_trunc = end - phys_pmem_align_down(nd_pfn, end);
+}
+
+static int nd_pfn_init(struct nd_pfn *nd_pfn)
+{
+ u32 dax_label_reserve = is_nd_dax(&nd_pfn->dev) ? SZ_128K : 0;
+ struct nd_namespace_common *ndns = nd_pfn->ndns;
+ struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+ resource_size_t start, size;
+ struct nd_region *nd_region;
+ u32 start_pad, end_trunc;
+ struct nd_pfn_sb *pfn_sb;
+ unsigned long npfns;
+ phys_addr_t offset;
+ const char *sig;
+ u64 checksum;
+ int rc;
+
+ pfn_sb = devm_kmalloc(&nd_pfn->dev, sizeof(*pfn_sb), GFP_KERNEL);
+ if (!pfn_sb)
+ return -ENOMEM;
+
+ nd_pfn->pfn_sb = pfn_sb;
+ if (is_nd_dax(&nd_pfn->dev))
+ sig = DAX_SIG;
+ else
+ sig = PFN_SIG;
+
+ rc = nd_pfn_validate(nd_pfn, sig);
+ if (rc != -ENODEV)
+ return rc;
+
+ /* no info block, do init */;
+ memset(pfn_sb, 0, sizeof(*pfn_sb));
+
+ nd_region = to_nd_region(nd_pfn->dev.parent);
+ if (nd_region->ro) {
+ dev_info(&nd_pfn->dev,
+ "%s is read-only, unable to init metadata\n",
+ dev_name(&nd_region->dev));
+ return -ENXIO;
+ }
+
+ memset(pfn_sb, 0, sizeof(*pfn_sb));
+
+ trim_pfn_device(nd_pfn, &start_pad, &end_trunc);
+ if (start_pad + end_trunc)
+ dev_info(&nd_pfn->dev, "%s alignment collision, truncate %d bytes\n",
+ dev_name(&ndns->dev), start_pad + end_trunc);
+
+ /*
+ * Note, we use 64 here for the standard size of struct page,
+ * debugging options may cause it to be larger in which case the
+ * implementation will limit the pfns advertised through
+ * ->direct_access() to those that are included in the memmap.
+ */
+ start = nsio->res.start + start_pad;
+ size = resource_size(&nsio->res);
+ npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K)
+ / PAGE_SIZE);
+ if (nd_pfn->mode == PFN_MODE_PMEM) {
+ /*
+ * The altmap should be padded out to the block size used
+ * when populating the vmemmap. This *should* be equal to
+ * PMD_SIZE for most architectures.
+ */
+ offset = ALIGN(start + SZ_8K + 64 * npfns + dax_label_reserve,
+ max(nd_pfn->align, PMD_SIZE)) - start;
+ } else if (nd_pfn->mode == PFN_MODE_RAM)
+ offset = ALIGN(start + SZ_8K + dax_label_reserve,
+ nd_pfn->align) - start;
+ else
+ return -ENXIO;
+
+ if (offset + start_pad + end_trunc >= size) {
+ dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
+ dev_name(&ndns->dev));
+ return -ENXIO;
+ }
+
+ npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
+ pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
+ pfn_sb->dataoff = cpu_to_le64(offset);
+ pfn_sb->npfns = cpu_to_le64(npfns);
+ memcpy(pfn_sb->signature, sig, PFN_SIG_LEN);
+ memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
+ memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
+ pfn_sb->version_major = cpu_to_le16(1);
+ pfn_sb->version_minor = cpu_to_le16(3);
+ pfn_sb->start_pad = cpu_to_le32(start_pad);
+ pfn_sb->end_trunc = cpu_to_le32(end_trunc);
+ pfn_sb->align = cpu_to_le32(nd_pfn->align);
+ checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
+ pfn_sb->checksum = cpu_to_le64(checksum);
+
+ return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0);
+}
+
+/*
+ * Determine the effective resource range and vmem_altmap from an nd_pfn
+ * instance.
+ */
+int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
+{
+ int rc;
+
+ if (!nd_pfn->uuid || !nd_pfn->ndns)
+ return -ENODEV;
+
+ rc = nd_pfn_init(nd_pfn);
+ if (rc)
+ return rc;
+
+ /* we need a valid pfn_sb before we can init a dev_pagemap */
+ return __nvdimm_setup_pfn(nd_pfn, pgmap);
+}
+EXPORT_SYMBOL_GPL(nvdimm_setup_pfn);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
new file mode 100644
index 000000000..d6f981f6e
--- /dev/null
+++ b/drivers/nvdimm/pmem.c
@@ -0,0 +1,613 @@
+/*
+ * Persistent Memory Driver
+ *
+ * Copyright (c) 2014-2015, Intel Corporation.
+ * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
+ * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <asm/cacheflush.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/set_memory.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/badblocks.h>
+#include <linux/memremap.h>
+#include <linux/vmalloc.h>
+#include <linux/blk-mq.h>
+#include <linux/pfn_t.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/dax.h>
+#include <linux/nd.h>
+#include <linux/backing-dev.h>
+#include "pmem.h"
+#include "pfn.h"
+#include "nd.h"
+#include "nd-core.h"
+
+static struct device *to_dev(struct pmem_device *pmem)
+{
+ /*
+ * nvdimm bus services need a 'dev' parameter, and we record the device
+ * at init in bb.dev.
+ */
+ return pmem->bb.dev;
+}
+
+static struct nd_region *to_region(struct pmem_device *pmem)
+{
+ return to_nd_region(to_dev(pmem)->parent);
+}
+
+static void hwpoison_clear(struct pmem_device *pmem,
+ phys_addr_t phys, unsigned int len)
+{
+ unsigned long pfn_start, pfn_end, pfn;
+
+ /* only pmem in the linear map supports HWPoison */
+ if (is_vmalloc_addr(pmem->virt_addr))
+ return;
+
+ pfn_start = PHYS_PFN(phys);
+ pfn_end = pfn_start + PHYS_PFN(len);
+ for (pfn = pfn_start; pfn < pfn_end; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ /*
+ * Note, no need to hold a get_dev_pagemap() reference
+ * here since we're in the driver I/O path and
+ * outstanding I/O requests pin the dev_pagemap.
+ */
+ if (test_and_clear_pmem_poison(page))
+ clear_mce_nospec(pfn);
+ }
+}
+
+static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
+ phys_addr_t offset, unsigned int len)
+{
+ struct device *dev = to_dev(pmem);
+ sector_t sector;
+ long cleared;
+ blk_status_t rc = BLK_STS_OK;
+
+ sector = (offset - pmem->data_offset) / 512;
+
+ cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
+ if (cleared < len)
+ rc = BLK_STS_IOERR;
+ if (cleared > 0 && cleared / 512) {
+ hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
+ cleared /= 512;
+ dev_dbg(dev, "%#llx clear %ld sector%s\n",
+ (unsigned long long) sector, cleared,
+ cleared > 1 ? "s" : "");
+ badblocks_clear(&pmem->bb, sector, cleared);
+ if (pmem->bb_state)
+ sysfs_notify_dirent(pmem->bb_state);
+ }
+
+ arch_invalidate_pmem(pmem->virt_addr + offset, len);
+
+ return rc;
+}
+
+static void write_pmem(void *pmem_addr, struct page *page,
+ unsigned int off, unsigned int len)
+{
+ unsigned int chunk;
+ void *mem;
+
+ while (len) {
+ mem = kmap_atomic(page);
+ chunk = min_t(unsigned int, len, PAGE_SIZE - off);
+ memcpy_flushcache(pmem_addr, mem + off, chunk);
+ kunmap_atomic(mem);
+ len -= chunk;
+ off = 0;
+ page++;
+ pmem_addr += chunk;
+ }
+}
+
+static blk_status_t read_pmem(struct page *page, unsigned int off,
+ void *pmem_addr, unsigned int len)
+{
+ unsigned int chunk;
+ unsigned long rem;
+ void *mem;
+
+ while (len) {
+ mem = kmap_atomic(page);
+ chunk = min_t(unsigned int, len, PAGE_SIZE - off);
+ rem = memcpy_mcsafe(mem + off, pmem_addr, chunk);
+ kunmap_atomic(mem);
+ if (rem)
+ return BLK_STS_IOERR;
+ len -= chunk;
+ off = 0;
+ page++;
+ pmem_addr += chunk;
+ }
+ return BLK_STS_OK;
+}
+
+static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+ unsigned int len, unsigned int off, unsigned int op,
+ sector_t sector)
+{
+ blk_status_t rc = BLK_STS_OK;
+ bool bad_pmem = false;
+ phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+ void *pmem_addr = pmem->virt_addr + pmem_off;
+
+ if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+ bad_pmem = true;
+
+ if (!op_is_write(op)) {
+ if (unlikely(bad_pmem))
+ rc = BLK_STS_IOERR;
+ else {
+ rc = read_pmem(page, off, pmem_addr, len);
+ flush_dcache_page(page);
+ }
+ } else {
+ /*
+ * Note that we write the data both before and after
+ * clearing poison. The write before clear poison
+ * handles situations where the latest written data is
+ * preserved and the clear poison operation simply marks
+ * the address range as valid without changing the data.
+ * In this case application software can assume that an
+ * interrupted write will either return the new good
+ * data or an error.
+ *
+ * However, if pmem_clear_poison() leaves the data in an
+ * indeterminate state we need to perform the write
+ * after clear poison.
+ */
+ flush_dcache_page(page);
+ write_pmem(pmem_addr, page, off, len);
+ if (unlikely(bad_pmem)) {
+ rc = pmem_clear_poison(pmem, pmem_off, len);
+ write_pmem(pmem_addr, page, off, len);
+ }
+ }
+
+ return rc;
+}
+
+static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
+{
+ blk_status_t rc = 0;
+ bool do_acct;
+ unsigned long start;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ struct pmem_device *pmem = q->queuedata;
+ struct nd_region *nd_region = to_region(pmem);
+
+ if (bio->bi_opf & REQ_PREFLUSH)
+ nvdimm_flush(nd_region);
+
+ do_acct = nd_iostat_start(bio, &start);
+ bio_for_each_segment(bvec, bio, iter) {
+ rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
+ bvec.bv_offset, bio_op(bio), iter.bi_sector);
+ if (rc) {
+ bio->bi_status = rc;
+ break;
+ }
+ }
+ if (do_acct)
+ nd_iostat_end(bio, start);
+
+ if (bio->bi_opf & REQ_FUA)
+ nvdimm_flush(nd_region);
+
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
+}
+
+static int pmem_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, unsigned int op)
+{
+ struct pmem_device *pmem = bdev->bd_queue->queuedata;
+ blk_status_t rc;
+
+ rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
+ 0, op, sector);
+
+ /*
+ * The ->rw_page interface is subtle and tricky. The core
+ * retries on any error, so we can only invoke page_endio() in
+ * the successful completion case. Otherwise, we'll see crashes
+ * caused by double completion.
+ */
+ if (rc == 0)
+ page_endio(page, op_is_write(op), 0);
+
+ return blk_status_to_errno(rc);
+}
+
+/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
+__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
+{
+ resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
+
+ if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
+ PFN_PHYS(nr_pages))))
+ return -EIO;
+
+ if (kaddr)
+ *kaddr = pmem->virt_addr + offset;
+ if (pfn)
+ *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+
+ /*
+ * If badblocks are present, limit known good range to the
+ * requested range.
+ */
+ if (unlikely(pmem->bb.count))
+ return nr_pages;
+ return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
+}
+
+static const struct block_device_operations pmem_fops = {
+ .owner = THIS_MODULE,
+ .rw_page = pmem_rw_page,
+ .revalidate_disk = nvdimm_revalidate_disk,
+};
+
+static long pmem_dax_direct_access(struct dax_device *dax_dev,
+ pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
+{
+ struct pmem_device *pmem = dax_get_private(dax_dev);
+
+ return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
+}
+
+/*
+ * Use the 'no check' versions of copy_from_iter_flushcache() and
+ * copy_to_iter_mcsafe() to bypass HARDENED_USERCOPY overhead. Bounds
+ * checking, both file offset and device offset, is handled by
+ * dax_iomap_actor()
+ */
+static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ return _copy_from_iter_flushcache(addr, bytes, i);
+}
+
+static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ return _copy_to_iter_mcsafe(addr, bytes, i);
+}
+
+static const struct dax_operations pmem_dax_ops = {
+ .direct_access = pmem_dax_direct_access,
+ .copy_from_iter = pmem_copy_from_iter,
+ .copy_to_iter = pmem_copy_to_iter,
+};
+
+static const struct attribute_group *pmem_attribute_groups[] = {
+ &dax_attribute_group,
+ NULL,
+};
+
+static void pmem_release_queue(void *q)
+{
+ blk_cleanup_queue(q);
+}
+
+static void pmem_freeze_queue(struct percpu_ref *ref)
+{
+ struct request_queue *q;
+
+ q = container_of(ref, typeof(*q), q_usage_counter);
+ blk_freeze_queue_start(q);
+}
+
+static void pmem_release_disk(void *__pmem)
+{
+ struct pmem_device *pmem = __pmem;
+
+ kill_dax(pmem->dax_dev);
+ put_dax(pmem->dax_dev);
+ del_gendisk(pmem->disk);
+ put_disk(pmem->disk);
+}
+
+static void pmem_release_pgmap_ops(void *__pgmap)
+{
+ dev_pagemap_put_ops();
+}
+
+static void fsdax_pagefree(struct page *page, void *data)
+{
+ wake_up_var(&page->_refcount);
+}
+
+static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
+{
+ dev_pagemap_get_ops();
+ if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
+ return -ENOMEM;
+ pgmap->type = MEMORY_DEVICE_FS_DAX;
+ pgmap->page_free = fsdax_pagefree;
+
+ return 0;
+}
+
+static int pmem_attach_disk(struct device *dev,
+ struct nd_namespace_common *ndns)
+{
+ struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ int nid = dev_to_node(dev), fua;
+ struct resource *res = &nsio->res;
+ struct resource bb_res;
+ struct nd_pfn *nd_pfn = NULL;
+ struct dax_device *dax_dev;
+ struct nd_pfn_sb *pfn_sb;
+ struct pmem_device *pmem;
+ struct request_queue *q;
+ struct device *gendev;
+ struct gendisk *disk;
+ void *addr;
+ int rc;
+
+ pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
+ if (!pmem)
+ return -ENOMEM;
+
+ /* while nsio_rw_bytes is active, parse a pfn info block if present */
+ if (is_nd_pfn(dev)) {
+ nd_pfn = to_nd_pfn(dev);
+ rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap);
+ if (rc)
+ return rc;
+ }
+
+ /* we're attaching a block device, disable raw namespace access */
+ devm_nsio_disable(dev, nsio);
+
+ dev_set_drvdata(dev, pmem);
+ pmem->phys_addr = res->start;
+ pmem->size = resource_size(res);
+ fua = nvdimm_has_flush(nd_region);
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) || fua < 0) {
+ dev_warn(dev, "unable to guarantee persistence of writes\n");
+ fua = 0;
+ }
+
+ if (!devm_request_mem_region(dev, res->start, resource_size(res),
+ dev_name(&ndns->dev))) {
+ dev_warn(dev, "could not reserve region %pR\n", res);
+ return -EBUSY;
+ }
+
+ q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev), NULL);
+ if (!q)
+ return -ENOMEM;
+
+ if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+ return -ENOMEM;
+
+ pmem->pfn_flags = PFN_DEV;
+ pmem->pgmap.ref = &q->q_usage_counter;
+ pmem->pgmap.kill = pmem_freeze_queue;
+ if (is_nd_pfn(dev)) {
+ if (setup_pagemap_fsdax(dev, &pmem->pgmap))
+ return -ENOMEM;
+ addr = devm_memremap_pages(dev, &pmem->pgmap);
+ pfn_sb = nd_pfn->pfn_sb;
+ pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
+ pmem->pfn_pad = resource_size(res) -
+ resource_size(&pmem->pgmap.res);
+ pmem->pfn_flags |= PFN_MAP;
+ memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
+ bb_res.start += pmem->data_offset;
+ } else if (pmem_should_map_pages(dev)) {
+ memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
+ pmem->pgmap.altmap_valid = false;
+ if (setup_pagemap_fsdax(dev, &pmem->pgmap))
+ return -ENOMEM;
+ addr = devm_memremap_pages(dev, &pmem->pgmap);
+ pmem->pfn_flags |= PFN_MAP;
+ memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
+ } else {
+ addr = devm_memremap(dev, pmem->phys_addr,
+ pmem->size, ARCH_MEMREMAP_PMEM);
+ memcpy(&bb_res, &nsio->res, sizeof(bb_res));
+ }
+
+ if (IS_ERR(addr))
+ return PTR_ERR(addr);
+ pmem->virt_addr = addr;
+
+ blk_queue_write_cache(q, true, fua);
+ blk_queue_make_request(q, pmem_make_request);
+ blk_queue_physical_block_size(q, PAGE_SIZE);
+ blk_queue_logical_block_size(q, pmem_sector_size(ndns));
+ blk_queue_max_hw_sectors(q, UINT_MAX);
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+ if (pmem->pfn_flags & PFN_MAP)
+ blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+ q->queuedata = pmem;
+
+ disk = alloc_disk_node(0, nid);
+ if (!disk)
+ return -ENOMEM;
+ pmem->disk = disk;
+
+ disk->fops = &pmem_fops;
+ disk->queue = q;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
+ nvdimm_namespace_disk_name(ndns, disk->disk_name);
+ set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
+ / 512);
+ if (devm_init_badblocks(dev, &pmem->bb))
+ return -ENOMEM;
+ nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
+ disk->bb = &pmem->bb;
+
+ dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
+ if (!dax_dev) {
+ put_disk(disk);
+ return -ENOMEM;
+ }
+ dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
+ pmem->dax_dev = dax_dev;
+
+ gendev = disk_to_dev(disk);
+ gendev->groups = pmem_attribute_groups;
+
+ device_add_disk(dev, disk, NULL);
+ if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
+ return -ENOMEM;
+
+ revalidate_disk(disk);
+
+ pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd,
+ "badblocks");
+ if (!pmem->bb_state)
+ dev_warn(dev, "'badblocks' notification disabled\n");
+
+ return 0;
+}
+
+static int nd_pmem_probe(struct device *dev)
+{
+ struct nd_namespace_common *ndns;
+
+ ndns = nvdimm_namespace_common_probe(dev);
+ if (IS_ERR(ndns))
+ return PTR_ERR(ndns);
+
+ if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
+ return -ENXIO;
+
+ if (is_nd_btt(dev))
+ return nvdimm_namespace_attach_btt(ndns);
+
+ if (is_nd_pfn(dev))
+ return pmem_attach_disk(dev, ndns);
+
+ /* if we find a valid info-block we'll come back as that personality */
+ if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
+ || nd_dax_probe(dev, ndns) == 0)
+ return -ENXIO;
+
+ /* ...otherwise we're just a raw pmem device */
+ return pmem_attach_disk(dev, ndns);
+}
+
+static int nd_pmem_remove(struct device *dev)
+{
+ struct pmem_device *pmem = dev_get_drvdata(dev);
+
+ if (is_nd_btt(dev))
+ nvdimm_namespace_detach_btt(to_nd_btt(dev));
+ else {
+ /*
+ * Note, this assumes device_lock() context to not race
+ * nd_pmem_notify()
+ */
+ sysfs_put(pmem->bb_state);
+ pmem->bb_state = NULL;
+ }
+ nvdimm_flush(to_nd_region(dev->parent));
+
+ return 0;
+}
+
+static void nd_pmem_shutdown(struct device *dev)
+{
+ nvdimm_flush(to_nd_region(dev->parent));
+}
+
+static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
+{
+ struct nd_region *nd_region;
+ resource_size_t offset = 0, end_trunc = 0;
+ struct nd_namespace_common *ndns;
+ struct nd_namespace_io *nsio;
+ struct resource res;
+ struct badblocks *bb;
+ struct kernfs_node *bb_state;
+
+ if (event != NVDIMM_REVALIDATE_POISON)
+ return;
+
+ if (is_nd_btt(dev)) {
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ ndns = nd_btt->ndns;
+ nd_region = to_nd_region(ndns->dev.parent);
+ nsio = to_nd_namespace_io(&ndns->dev);
+ bb = &nsio->bb;
+ bb_state = NULL;
+ } else {
+ struct pmem_device *pmem = dev_get_drvdata(dev);
+
+ nd_region = to_region(pmem);
+ bb = &pmem->bb;
+ bb_state = pmem->bb_state;
+
+ if (is_nd_pfn(dev)) {
+ struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+ struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+
+ ndns = nd_pfn->ndns;
+ offset = pmem->data_offset +
+ __le32_to_cpu(pfn_sb->start_pad);
+ end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+ } else {
+ ndns = to_ndns(dev);
+ }
+
+ nsio = to_nd_namespace_io(&ndns->dev);
+ }
+
+ res.start = nsio->res.start + offset;
+ res.end = nsio->res.end - end_trunc;
+ nvdimm_badblocks_populate(nd_region, bb, &res);
+ if (bb_state)
+ sysfs_notify_dirent(bb_state);
+}
+
+MODULE_ALIAS("pmem");
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
+static struct nd_device_driver nd_pmem_driver = {
+ .probe = nd_pmem_probe,
+ .remove = nd_pmem_remove,
+ .notify = nd_pmem_notify,
+ .shutdown = nd_pmem_shutdown,
+ .drv = {
+ .name = "nd_pmem",
+ },
+ .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
+};
+
+module_nd_driver(nd_pmem_driver);
+
+MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
new file mode 100644
index 000000000..59cfe13ea
--- /dev/null
+++ b/drivers/nvdimm/pmem.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NVDIMM_PMEM_H__
+#define __NVDIMM_PMEM_H__
+#include <linux/page-flags.h>
+#include <linux/badblocks.h>
+#include <linux/types.h>
+#include <linux/pfn_t.h>
+#include <linux/fs.h>
+
+/* this definition is in it's own header for tools/testing/nvdimm to consume */
+struct pmem_device {
+ /* One contiguous memory region per device */
+ phys_addr_t phys_addr;
+ /* when non-zero this device is hosting a 'pfn' instance */
+ phys_addr_t data_offset;
+ u64 pfn_flags;
+ void *virt_addr;
+ /* immutable base size of the namespace */
+ size_t size;
+ /* trim size when namespace capacity has been section aligned */
+ u32 pfn_pad;
+ struct kernfs_node *bb_state;
+ struct badblocks bb;
+ struct dax_device *dax_dev;
+ struct gendisk *disk;
+ struct dev_pagemap pgmap;
+};
+
+long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn);
+
+#ifdef CONFIG_MEMORY_FAILURE
+static inline bool test_and_clear_pmem_poison(struct page *page)
+{
+ return TestClearPageHWPoison(page);
+}
+#else
+static inline bool test_and_clear_pmem_poison(struct page *page)
+{
+ return false;
+}
+#endif
+#endif /* __NVDIMM_PMEM_H__ */
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
new file mode 100644
index 000000000..22224b21c
--- /dev/null
+++ b/drivers/nvdimm/region.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/nd.h>
+#include "nd-core.h"
+#include "nd.h"
+
+static int nd_region_probe(struct device *dev)
+{
+ int err, rc;
+ static unsigned long once;
+ struct nd_region_data *ndrd;
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ if (nd_region->num_lanes > num_online_cpus()
+ && nd_region->num_lanes < num_possible_cpus()
+ && !test_and_set_bit(0, &once)) {
+ dev_dbg(dev, "online cpus (%d) < concurrent i/o lanes (%d) < possible cpus (%d)\n",
+ num_online_cpus(), nd_region->num_lanes,
+ num_possible_cpus());
+ dev_dbg(dev, "setting nr_cpus=%d may yield better libnvdimm device performance\n",
+ nd_region->num_lanes);
+ }
+
+ rc = nd_region_activate(nd_region);
+ if (rc)
+ return rc;
+
+ rc = nd_blk_region_init(nd_region);
+ if (rc)
+ return rc;
+
+ if (is_memory(&nd_region->dev)) {
+ struct resource ndr_res;
+
+ if (devm_init_badblocks(dev, &nd_region->bb))
+ return -ENODEV;
+ nd_region->bb_state = sysfs_get_dirent(nd_region->dev.kobj.sd,
+ "badblocks");
+ if (!nd_region->bb_state)
+ dev_warn(&nd_region->dev,
+ "'badblocks' notification disabled\n");
+ ndr_res.start = nd_region->ndr_start;
+ ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1;
+ nvdimm_badblocks_populate(nd_region, &nd_region->bb, &ndr_res);
+ }
+
+ rc = nd_region_register_namespaces(nd_region, &err);
+ if (rc < 0)
+ return rc;
+
+ ndrd = dev_get_drvdata(dev);
+ ndrd->ns_active = rc;
+ ndrd->ns_count = rc + err;
+
+ if (rc && err && rc == err)
+ return -ENODEV;
+
+ nd_region->btt_seed = nd_btt_create(nd_region);
+ nd_region->pfn_seed = nd_pfn_create(nd_region);
+ nd_region->dax_seed = nd_dax_create(nd_region);
+ if (err == 0)
+ return 0;
+
+ /*
+ * Given multiple namespaces per region, we do not want to
+ * disable all the successfully registered peer namespaces upon
+ * a single registration failure. If userspace is missing a
+ * namespace that it expects it can disable/re-enable the region
+ * to retry discovery after correcting the failure.
+ * <regionX>/namespaces returns the current
+ * "<async-registered>/<total>" namespace count.
+ */
+ dev_err(dev, "failed to register %d namespace%s, continuing...\n",
+ err, err == 1 ? "" : "s");
+ return 0;
+}
+
+static int child_unregister(struct device *dev, void *data)
+{
+ nd_device_unregister(dev, ND_SYNC);
+ return 0;
+}
+
+static int nd_region_remove(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ device_for_each_child(dev, NULL, child_unregister);
+
+ /* flush attribute readers and disable */
+ nvdimm_bus_lock(dev);
+ nd_region->ns_seed = NULL;
+ nd_region->btt_seed = NULL;
+ nd_region->pfn_seed = NULL;
+ nd_region->dax_seed = NULL;
+ dev_set_drvdata(dev, NULL);
+ nvdimm_bus_unlock(dev);
+
+ /*
+ * Note, this assumes device_lock() context to not race
+ * nd_region_notify()
+ */
+ sysfs_put(nd_region->bb_state);
+ nd_region->bb_state = NULL;
+
+ return 0;
+}
+
+static int child_notify(struct device *dev, void *data)
+{
+ nd_device_notify(dev, *(enum nvdimm_event *) data);
+ return 0;
+}
+
+static void nd_region_notify(struct device *dev, enum nvdimm_event event)
+{
+ if (event == NVDIMM_REVALIDATE_POISON) {
+ struct nd_region *nd_region = to_nd_region(dev);
+ struct resource res;
+
+ if (is_memory(&nd_region->dev)) {
+ res.start = nd_region->ndr_start;
+ res.end = nd_region->ndr_start +
+ nd_region->ndr_size - 1;
+ nvdimm_badblocks_populate(nd_region,
+ &nd_region->bb, &res);
+ if (nd_region->bb_state)
+ sysfs_notify_dirent(nd_region->bb_state);
+ }
+ }
+ device_for_each_child(dev, &event, child_notify);
+}
+
+static struct nd_device_driver nd_region_driver = {
+ .probe = nd_region_probe,
+ .remove = nd_region_remove,
+ .notify = nd_region_notify,
+ .drv = {
+ .name = "nd_region",
+ },
+ .type = ND_DRIVER_REGION_BLK | ND_DRIVER_REGION_PMEM,
+};
+
+int __init nd_region_init(void)
+{
+ return nd_driver_register(&nd_region_driver);
+}
+
+void nd_region_exit(void)
+{
+ driver_unregister(&nd_region_driver.drv);
+}
+
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM);
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_BLK);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
new file mode 100644
index 000000000..609fc4505
--- /dev/null
+++ b/drivers/nvdimm/region_devs.c
@@ -0,0 +1,1235 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/sort.h>
+#include <linux/io.h>
+#include <linux/nd.h>
+#include "nd-core.h"
+#include "nd.h"
+
+/*
+ * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is
+ * irrelevant.
+ */
+#include <linux/io-64-nonatomic-hi-lo.h>
+
+static DEFINE_IDA(region_ida);
+static DEFINE_PER_CPU(int, flush_idx);
+
+static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm,
+ struct nd_region_data *ndrd)
+{
+ int i, j;
+
+ dev_dbg(dev, "%s: map %d flush address%s\n", nvdimm_name(nvdimm),
+ nvdimm->num_flush, nvdimm->num_flush == 1 ? "" : "es");
+ for (i = 0; i < (1 << ndrd->hints_shift); i++) {
+ struct resource *res = &nvdimm->flush_wpq[i];
+ unsigned long pfn = PHYS_PFN(res->start);
+ void __iomem *flush_page;
+
+ /* check if flush hints share a page */
+ for (j = 0; j < i; j++) {
+ struct resource *res_j = &nvdimm->flush_wpq[j];
+ unsigned long pfn_j = PHYS_PFN(res_j->start);
+
+ if (pfn == pfn_j)
+ break;
+ }
+
+ if (j < i)
+ flush_page = (void __iomem *) ((unsigned long)
+ ndrd_get_flush_wpq(ndrd, dimm, j)
+ & PAGE_MASK);
+ else
+ flush_page = devm_nvdimm_ioremap(dev,
+ PFN_PHYS(pfn), PAGE_SIZE);
+ if (!flush_page)
+ return -ENXIO;
+ ndrd_set_flush_wpq(ndrd, dimm, i, flush_page
+ + (res->start & ~PAGE_MASK));
+ }
+
+ return 0;
+}
+
+int nd_region_activate(struct nd_region *nd_region)
+{
+ int i, j, num_flush = 0;
+ struct nd_region_data *ndrd;
+ struct device *dev = &nd_region->dev;
+ size_t flush_data_size = sizeof(void *);
+
+ nvdimm_bus_lock(&nd_region->dev);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ /* at least one null hint slot per-dimm for the "no-hint" case */
+ flush_data_size += sizeof(void *);
+ num_flush = min_not_zero(num_flush, nvdimm->num_flush);
+ if (!nvdimm->num_flush)
+ continue;
+ flush_data_size += nvdimm->num_flush * sizeof(void *);
+ }
+ nvdimm_bus_unlock(&nd_region->dev);
+
+ ndrd = devm_kzalloc(dev, sizeof(*ndrd) + flush_data_size, GFP_KERNEL);
+ if (!ndrd)
+ return -ENOMEM;
+ dev_set_drvdata(dev, ndrd);
+
+ if (!num_flush)
+ return 0;
+
+ ndrd->hints_shift = ilog2(num_flush);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+ int rc = nvdimm_map_flush(&nd_region->dev, nvdimm, i, ndrd);
+
+ if (rc)
+ return rc;
+ }
+
+ /*
+ * Clear out entries that are duplicates. This should prevent the
+ * extra flushings.
+ */
+ for (i = 0; i < nd_region->ndr_mappings - 1; i++) {
+ /* ignore if NULL already */
+ if (!ndrd_get_flush_wpq(ndrd, i, 0))
+ continue;
+
+ for (j = i + 1; j < nd_region->ndr_mappings; j++)
+ if (ndrd_get_flush_wpq(ndrd, i, 0) ==
+ ndrd_get_flush_wpq(ndrd, j, 0))
+ ndrd_set_flush_wpq(ndrd, j, 0, NULL);
+ }
+
+ return 0;
+}
+
+static void nd_region_release(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ u16 i;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ put_device(&nvdimm->dev);
+ }
+ free_percpu(nd_region->lane);
+ ida_simple_remove(&region_ida, nd_region->id);
+ if (is_nd_blk(dev))
+ kfree(to_nd_blk_region(dev));
+ else
+ kfree(nd_region);
+}
+
+static struct device_type nd_blk_device_type = {
+ .name = "nd_blk",
+ .release = nd_region_release,
+};
+
+static struct device_type nd_pmem_device_type = {
+ .name = "nd_pmem",
+ .release = nd_region_release,
+};
+
+static struct device_type nd_volatile_device_type = {
+ .name = "nd_volatile",
+ .release = nd_region_release,
+};
+
+bool is_nd_pmem(struct device *dev)
+{
+ return dev ? dev->type == &nd_pmem_device_type : false;
+}
+
+bool is_nd_blk(struct device *dev)
+{
+ return dev ? dev->type == &nd_blk_device_type : false;
+}
+
+bool is_nd_volatile(struct device *dev)
+{
+ return dev ? dev->type == &nd_volatile_device_type : false;
+}
+
+struct nd_region *to_nd_region(struct device *dev)
+{
+ struct nd_region *nd_region = container_of(dev, struct nd_region, dev);
+
+ WARN_ON(dev->type->release != nd_region_release);
+ return nd_region;
+}
+EXPORT_SYMBOL_GPL(to_nd_region);
+
+struct device *nd_region_dev(struct nd_region *nd_region)
+{
+ if (!nd_region)
+ return NULL;
+ return &nd_region->dev;
+}
+EXPORT_SYMBOL_GPL(nd_region_dev);
+
+struct nd_blk_region *to_nd_blk_region(struct device *dev)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ WARN_ON(!is_nd_blk(dev));
+ return container_of(nd_region, struct nd_blk_region, nd_region);
+}
+EXPORT_SYMBOL_GPL(to_nd_blk_region);
+
+void *nd_region_provider_data(struct nd_region *nd_region)
+{
+ return nd_region->provider_data;
+}
+EXPORT_SYMBOL_GPL(nd_region_provider_data);
+
+void *nd_blk_region_provider_data(struct nd_blk_region *ndbr)
+{
+ return ndbr->blk_provider_data;
+}
+EXPORT_SYMBOL_GPL(nd_blk_region_provider_data);
+
+void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data)
+{
+ ndbr->blk_provider_data = data;
+}
+EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data);
+
+/**
+ * nd_region_to_nstype() - region to an integer namespace type
+ * @nd_region: region-device to interrogate
+ *
+ * This is the 'nstype' attribute of a region as well, an input to the
+ * MODALIAS for namespace devices, and bit number for a nvdimm_bus to match
+ * namespace devices with namespace drivers.
+ */
+int nd_region_to_nstype(struct nd_region *nd_region)
+{
+ if (is_memory(&nd_region->dev)) {
+ u16 i, alias;
+
+ for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ if (test_bit(NDD_ALIASING, &nvdimm->flags))
+ alias++;
+ }
+ if (alias)
+ return ND_DEVICE_NAMESPACE_PMEM;
+ else
+ return ND_DEVICE_NAMESPACE_IO;
+ } else if (is_nd_blk(&nd_region->dev)) {
+ return ND_DEVICE_NAMESPACE_BLK;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(nd_region_to_nstype);
+
+static ssize_t size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ unsigned long long size = 0;
+
+ if (is_memory(dev)) {
+ size = nd_region->ndr_size;
+ } else if (nd_region->ndr_mappings == 1) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+
+ size = nd_mapping->size;
+ }
+
+ return sprintf(buf, "%llu\n", size);
+}
+static DEVICE_ATTR_RO(size);
+
+static ssize_t deep_flush_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ /*
+ * NOTE: in the nvdimm_has_flush() error case this attribute is
+ * not visible.
+ */
+ return sprintf(buf, "%d\n", nvdimm_has_flush(nd_region));
+}
+
+static ssize_t deep_flush_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ bool flush;
+ int rc = strtobool(buf, &flush);
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ if (rc)
+ return rc;
+ if (!flush)
+ return -EINVAL;
+ nvdimm_flush(nd_region);
+
+ return len;
+}
+static DEVICE_ATTR_RW(deep_flush);
+
+static ssize_t mappings_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ return sprintf(buf, "%d\n", nd_region->ndr_mappings);
+}
+static DEVICE_ATTR_RO(mappings);
+
+static ssize_t nstype_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region));
+}
+static DEVICE_ATTR_RO(nstype);
+
+static ssize_t set_cookie_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+ ssize_t rc = 0;
+
+ if (is_memory(dev) && nd_set)
+ /* pass, should be precluded by region_visible */;
+ else
+ return -ENXIO;
+
+ /*
+ * The cookie to show depends on which specification of the
+ * labels we are using. If there are not labels then default to
+ * the v1.1 namespace label cookie definition. To read all this
+ * data we need to wait for probing to settle.
+ */
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ if (nd_region->ndr_mappings) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+
+ if (ndd) {
+ struct nd_namespace_index *nsindex;
+
+ nsindex = to_namespace_index(ndd, ndd->ns_current);
+ rc = sprintf(buf, "%#llx\n",
+ nd_region_interleave_set_cookie(nd_region,
+ nsindex));
+ }
+ }
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ if (rc)
+ return rc;
+ return sprintf(buf, "%#llx\n", nd_set->cookie1);
+}
+static DEVICE_ATTR_RO(set_cookie);
+
+resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
+{
+ resource_size_t blk_max_overlap = 0, available, overlap;
+ int i;
+
+ WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+
+ retry:
+ available = 0;
+ overlap = blk_max_overlap;
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+
+ /* if a dimm is disabled the available capacity is zero */
+ if (!ndd)
+ return 0;
+
+ if (is_memory(&nd_region->dev)) {
+ available += nd_pmem_available_dpa(nd_region,
+ nd_mapping, &overlap);
+ if (overlap > blk_max_overlap) {
+ blk_max_overlap = overlap;
+ goto retry;
+ }
+ } else if (is_nd_blk(&nd_region->dev))
+ available += nd_blk_available_dpa(nd_region);
+ }
+
+ return available;
+}
+
+resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region)
+{
+ resource_size_t available = 0;
+ int i;
+
+ if (is_memory(&nd_region->dev))
+ available = PHYS_ADDR_MAX;
+
+ WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+
+ if (is_memory(&nd_region->dev))
+ available = min(available,
+ nd_pmem_max_contiguous_dpa(nd_region,
+ nd_mapping));
+ else if (is_nd_blk(&nd_region->dev))
+ available += nd_blk_available_dpa(nd_region);
+ }
+ if (is_memory(&nd_region->dev))
+ return available * nd_region->ndr_mappings;
+ return available;
+}
+
+static ssize_t available_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ unsigned long long available = 0;
+
+ /*
+ * Flush in-flight updates and grab a snapshot of the available
+ * size. Of course, this value is potentially invalidated the
+ * memory nvdimm_bus_lock() is dropped, but that's userspace's
+ * problem to not race itself.
+ */
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ available = nd_region_available_dpa(nd_region);
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return sprintf(buf, "%llu\n", available);
+}
+static DEVICE_ATTR_RO(available_size);
+
+static ssize_t max_available_extent_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ unsigned long long available = 0;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ available = nd_region_allocatable_dpa(nd_region);
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return sprintf(buf, "%llu\n", available);
+}
+static DEVICE_ATTR_RO(max_available_extent);
+
+static ssize_t init_namespaces_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region_data *ndrd = dev_get_drvdata(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ if (ndrd)
+ rc = sprintf(buf, "%d/%d\n", ndrd->ns_active, ndrd->ns_count);
+ else
+ rc = -ENXIO;
+ nvdimm_bus_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(init_namespaces);
+
+static ssize_t namespace_seed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ if (nd_region->ns_seed)
+ rc = sprintf(buf, "%s\n", dev_name(nd_region->ns_seed));
+ else
+ rc = sprintf(buf, "\n");
+ nvdimm_bus_unlock(dev);
+ return rc;
+}
+static DEVICE_ATTR_RO(namespace_seed);
+
+static ssize_t btt_seed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ if (nd_region->btt_seed)
+ rc = sprintf(buf, "%s\n", dev_name(nd_region->btt_seed));
+ else
+ rc = sprintf(buf, "\n");
+ nvdimm_bus_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(btt_seed);
+
+static ssize_t pfn_seed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ if (nd_region->pfn_seed)
+ rc = sprintf(buf, "%s\n", dev_name(nd_region->pfn_seed));
+ else
+ rc = sprintf(buf, "\n");
+ nvdimm_bus_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(pfn_seed);
+
+static ssize_t dax_seed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ ssize_t rc;
+
+ nvdimm_bus_lock(dev);
+ if (nd_region->dax_seed)
+ rc = sprintf(buf, "%s\n", dev_name(nd_region->dax_seed));
+ else
+ rc = sprintf(buf, "\n");
+ nvdimm_bus_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(dax_seed);
+
+static ssize_t read_only_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ return sprintf(buf, "%d\n", nd_region->ro);
+}
+
+static ssize_t read_only_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ bool ro;
+ int rc = strtobool(buf, &ro);
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ if (rc)
+ return rc;
+
+ nd_region->ro = ro;
+ return len;
+}
+static DEVICE_ATTR_RW(read_only);
+
+static ssize_t region_badblocks_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ if (dev->driver)
+ rc = badblocks_show(&nd_region->bb, buf, 0);
+ else
+ rc = -ENXIO;
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR(badblocks, 0444, region_badblocks_show, NULL);
+
+static ssize_t resource_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ return sprintf(buf, "%#llx\n", nd_region->ndr_start);
+}
+static DEVICE_ATTR_RO(resource);
+
+static ssize_t persistence_domain_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ if (test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags))
+ return sprintf(buf, "cpu_cache\n");
+ else if (test_bit(ND_REGION_PERSIST_MEMCTRL, &nd_region->flags))
+ return sprintf(buf, "memory_controller\n");
+ else
+ return sprintf(buf, "\n");
+}
+static DEVICE_ATTR_RO(persistence_domain);
+
+static struct attribute *nd_region_attributes[] = {
+ &dev_attr_size.attr,
+ &dev_attr_nstype.attr,
+ &dev_attr_mappings.attr,
+ &dev_attr_btt_seed.attr,
+ &dev_attr_pfn_seed.attr,
+ &dev_attr_dax_seed.attr,
+ &dev_attr_deep_flush.attr,
+ &dev_attr_read_only.attr,
+ &dev_attr_set_cookie.attr,
+ &dev_attr_available_size.attr,
+ &dev_attr_max_available_extent.attr,
+ &dev_attr_namespace_seed.attr,
+ &dev_attr_init_namespaces.attr,
+ &dev_attr_badblocks.attr,
+ &dev_attr_resource.attr,
+ &dev_attr_persistence_domain.attr,
+ NULL,
+};
+
+static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, typeof(*dev), kobj);
+ struct nd_region *nd_region = to_nd_region(dev);
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+ int type = nd_region_to_nstype(nd_region);
+
+ if (!is_memory(dev) && a == &dev_attr_pfn_seed.attr)
+ return 0;
+
+ if (!is_memory(dev) && a == &dev_attr_dax_seed.attr)
+ return 0;
+
+ if (!is_memory(dev) && a == &dev_attr_badblocks.attr)
+ return 0;
+
+ if (a == &dev_attr_resource.attr) {
+ if (is_memory(dev))
+ return 0400;
+ else
+ return 0;
+ }
+
+ if (a == &dev_attr_deep_flush.attr) {
+ int has_flush = nvdimm_has_flush(nd_region);
+
+ if (has_flush == 1)
+ return a->mode;
+ else if (has_flush == 0)
+ return 0444;
+ else
+ return 0;
+ }
+
+ if (a == &dev_attr_persistence_domain.attr) {
+ if ((nd_region->flags & (BIT(ND_REGION_PERSIST_CACHE)
+ | BIT(ND_REGION_PERSIST_MEMCTRL))) == 0)
+ return 0;
+ return a->mode;
+ }
+
+ if (a != &dev_attr_set_cookie.attr
+ && a != &dev_attr_available_size.attr)
+ return a->mode;
+
+ if ((type == ND_DEVICE_NAMESPACE_PMEM
+ || type == ND_DEVICE_NAMESPACE_BLK)
+ && a == &dev_attr_available_size.attr)
+ return a->mode;
+ else if (is_memory(dev) && nd_set)
+ return a->mode;
+
+ return 0;
+}
+
+struct attribute_group nd_region_attribute_group = {
+ .attrs = nd_region_attributes,
+ .is_visible = region_visible,
+};
+EXPORT_SYMBOL_GPL(nd_region_attribute_group);
+
+u64 nd_region_interleave_set_cookie(struct nd_region *nd_region,
+ struct nd_namespace_index *nsindex)
+{
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+
+ if (!nd_set)
+ return 0;
+
+ if (nsindex && __le16_to_cpu(nsindex->major) == 1
+ && __le16_to_cpu(nsindex->minor) == 1)
+ return nd_set->cookie1;
+ return nd_set->cookie2;
+}
+
+u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region)
+{
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+
+ if (nd_set)
+ return nd_set->altcookie;
+ return 0;
+}
+
+void nd_mapping_free_labels(struct nd_mapping *nd_mapping)
+{
+ struct nd_label_ent *label_ent, *e;
+
+ lockdep_assert_held(&nd_mapping->lock);
+ list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) {
+ list_del(&label_ent->list);
+ kfree(label_ent);
+ }
+}
+
+/*
+ * Upon successful probe/remove, take/release a reference on the
+ * associated interleave set (if present), and plant new btt + namespace
+ * seeds. Also, on the removal of a BLK region, notify the provider to
+ * disable the region.
+ */
+static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
+ struct device *dev, bool probe)
+{
+ struct nd_region *nd_region;
+
+ if (!probe && is_nd_region(dev)) {
+ int i;
+
+ nd_region = to_nd_region(dev);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = nd_mapping->ndd;
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ mutex_lock(&nd_mapping->lock);
+ nd_mapping_free_labels(nd_mapping);
+ mutex_unlock(&nd_mapping->lock);
+
+ put_ndd(ndd);
+ nd_mapping->ndd = NULL;
+ if (ndd)
+ atomic_dec(&nvdimm->busy);
+ }
+ }
+ if (dev->parent && is_nd_region(dev->parent) && probe) {
+ nd_region = to_nd_region(dev->parent);
+ nvdimm_bus_lock(dev);
+ if (nd_region->ns_seed == dev)
+ nd_region_create_ns_seed(nd_region);
+ nvdimm_bus_unlock(dev);
+ }
+ if (is_nd_btt(dev) && probe) {
+ struct nd_btt *nd_btt = to_nd_btt(dev);
+
+ nd_region = to_nd_region(dev->parent);
+ nvdimm_bus_lock(dev);
+ if (nd_region->btt_seed == dev)
+ nd_region_create_btt_seed(nd_region);
+ if (nd_region->ns_seed == &nd_btt->ndns->dev)
+ nd_region_create_ns_seed(nd_region);
+ nvdimm_bus_unlock(dev);
+ }
+ if (is_nd_pfn(dev) && probe) {
+ struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+
+ nd_region = to_nd_region(dev->parent);
+ nvdimm_bus_lock(dev);
+ if (nd_region->pfn_seed == dev)
+ nd_region_create_pfn_seed(nd_region);
+ if (nd_region->ns_seed == &nd_pfn->ndns->dev)
+ nd_region_create_ns_seed(nd_region);
+ nvdimm_bus_unlock(dev);
+ }
+ if (is_nd_dax(dev) && probe) {
+ struct nd_dax *nd_dax = to_nd_dax(dev);
+
+ nd_region = to_nd_region(dev->parent);
+ nvdimm_bus_lock(dev);
+ if (nd_region->dax_seed == dev)
+ nd_region_create_dax_seed(nd_region);
+ if (nd_region->ns_seed == &nd_dax->nd_pfn.ndns->dev)
+ nd_region_create_ns_seed(nd_region);
+ nvdimm_bus_unlock(dev);
+ }
+}
+
+void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
+{
+ nd_region_notify_driver_action(nvdimm_bus, dev, true);
+}
+
+void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev)
+{
+ nd_region_notify_driver_action(nvdimm_bus, dev, false);
+}
+
+static ssize_t mappingN(struct device *dev, char *buf, int n)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ struct nd_mapping *nd_mapping;
+ struct nvdimm *nvdimm;
+
+ if (n >= nd_region->ndr_mappings)
+ return -ENXIO;
+ nd_mapping = &nd_region->mapping[n];
+ nvdimm = nd_mapping->nvdimm;
+
+ return sprintf(buf, "%s,%llu,%llu,%d\n", dev_name(&nvdimm->dev),
+ nd_mapping->start, nd_mapping->size,
+ nd_mapping->position);
+}
+
+#define REGION_MAPPING(idx) \
+static ssize_t mapping##idx##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return mappingN(dev, buf, idx); \
+} \
+static DEVICE_ATTR_RO(mapping##idx)
+
+/*
+ * 32 should be enough for a while, even in the presence of socket
+ * interleave a 32-way interleave set is a degenerate case.
+ */
+REGION_MAPPING(0);
+REGION_MAPPING(1);
+REGION_MAPPING(2);
+REGION_MAPPING(3);
+REGION_MAPPING(4);
+REGION_MAPPING(5);
+REGION_MAPPING(6);
+REGION_MAPPING(7);
+REGION_MAPPING(8);
+REGION_MAPPING(9);
+REGION_MAPPING(10);
+REGION_MAPPING(11);
+REGION_MAPPING(12);
+REGION_MAPPING(13);
+REGION_MAPPING(14);
+REGION_MAPPING(15);
+REGION_MAPPING(16);
+REGION_MAPPING(17);
+REGION_MAPPING(18);
+REGION_MAPPING(19);
+REGION_MAPPING(20);
+REGION_MAPPING(21);
+REGION_MAPPING(22);
+REGION_MAPPING(23);
+REGION_MAPPING(24);
+REGION_MAPPING(25);
+REGION_MAPPING(26);
+REGION_MAPPING(27);
+REGION_MAPPING(28);
+REGION_MAPPING(29);
+REGION_MAPPING(30);
+REGION_MAPPING(31);
+
+static umode_t mapping_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct nd_region *nd_region = to_nd_region(dev);
+
+ if (n < nd_region->ndr_mappings)
+ return a->mode;
+ return 0;
+}
+
+static struct attribute *mapping_attributes[] = {
+ &dev_attr_mapping0.attr,
+ &dev_attr_mapping1.attr,
+ &dev_attr_mapping2.attr,
+ &dev_attr_mapping3.attr,
+ &dev_attr_mapping4.attr,
+ &dev_attr_mapping5.attr,
+ &dev_attr_mapping6.attr,
+ &dev_attr_mapping7.attr,
+ &dev_attr_mapping8.attr,
+ &dev_attr_mapping9.attr,
+ &dev_attr_mapping10.attr,
+ &dev_attr_mapping11.attr,
+ &dev_attr_mapping12.attr,
+ &dev_attr_mapping13.attr,
+ &dev_attr_mapping14.attr,
+ &dev_attr_mapping15.attr,
+ &dev_attr_mapping16.attr,
+ &dev_attr_mapping17.attr,
+ &dev_attr_mapping18.attr,
+ &dev_attr_mapping19.attr,
+ &dev_attr_mapping20.attr,
+ &dev_attr_mapping21.attr,
+ &dev_attr_mapping22.attr,
+ &dev_attr_mapping23.attr,
+ &dev_attr_mapping24.attr,
+ &dev_attr_mapping25.attr,
+ &dev_attr_mapping26.attr,
+ &dev_attr_mapping27.attr,
+ &dev_attr_mapping28.attr,
+ &dev_attr_mapping29.attr,
+ &dev_attr_mapping30.attr,
+ &dev_attr_mapping31.attr,
+ NULL,
+};
+
+struct attribute_group nd_mapping_attribute_group = {
+ .is_visible = mapping_visible,
+ .attrs = mapping_attributes,
+};
+EXPORT_SYMBOL_GPL(nd_mapping_attribute_group);
+
+int nd_blk_region_init(struct nd_region *nd_region)
+{
+ struct device *dev = &nd_region->dev;
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+ if (!is_nd_blk(dev))
+ return 0;
+
+ if (nd_region->ndr_mappings < 1) {
+ dev_dbg(dev, "invalid BLK region\n");
+ return -ENXIO;
+ }
+
+ return to_nd_blk_region(dev)->enable(nvdimm_bus, dev);
+}
+
+/**
+ * nd_region_acquire_lane - allocate and lock a lane
+ * @nd_region: region id and number of lanes possible
+ *
+ * A lane correlates to a BLK-data-window and/or a log slot in the BTT.
+ * We optimize for the common case where there are 256 lanes, one
+ * per-cpu. For larger systems we need to lock to share lanes. For now
+ * this implementation assumes the cost of maintaining an allocator for
+ * free lanes is on the order of the lock hold time, so it implements a
+ * static lane = cpu % num_lanes mapping.
+ *
+ * In the case of a BTT instance on top of a BLK namespace a lane may be
+ * acquired recursively. We lock on the first instance.
+ *
+ * In the case of a BTT instance on top of PMEM, we only acquire a lane
+ * for the BTT metadata updates.
+ */
+unsigned int nd_region_acquire_lane(struct nd_region *nd_region)
+{
+ unsigned int cpu, lane;
+
+ cpu = get_cpu();
+ if (nd_region->num_lanes < nr_cpu_ids) {
+ struct nd_percpu_lane *ndl_lock, *ndl_count;
+
+ lane = cpu % nd_region->num_lanes;
+ ndl_count = per_cpu_ptr(nd_region->lane, cpu);
+ ndl_lock = per_cpu_ptr(nd_region->lane, lane);
+ if (ndl_count->count++ == 0)
+ spin_lock(&ndl_lock->lock);
+ } else
+ lane = cpu;
+
+ return lane;
+}
+EXPORT_SYMBOL(nd_region_acquire_lane);
+
+void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane)
+{
+ if (nd_region->num_lanes < nr_cpu_ids) {
+ unsigned int cpu = get_cpu();
+ struct nd_percpu_lane *ndl_lock, *ndl_count;
+
+ ndl_count = per_cpu_ptr(nd_region->lane, cpu);
+ ndl_lock = per_cpu_ptr(nd_region->lane, lane);
+ if (--ndl_count->count == 0)
+ spin_unlock(&ndl_lock->lock);
+ put_cpu();
+ }
+ put_cpu();
+}
+EXPORT_SYMBOL(nd_region_release_lane);
+
+static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc, struct device_type *dev_type,
+ const char *caller)
+{
+ struct nd_region *nd_region;
+ struct device *dev;
+ void *region_buf;
+ unsigned int i;
+ int ro = 0;
+
+ for (i = 0; i < ndr_desc->num_mappings; i++) {
+ struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
+ struct nvdimm *nvdimm = mapping->nvdimm;
+
+ if ((mapping->start | mapping->size) % SZ_4K) {
+ dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
+ caller, dev_name(&nvdimm->dev), i);
+
+ return NULL;
+ }
+
+ if (test_bit(NDD_UNARMED, &nvdimm->flags))
+ ro = 1;
+ }
+
+ if (dev_type == &nd_blk_device_type) {
+ struct nd_blk_region_desc *ndbr_desc;
+ struct nd_blk_region *ndbr;
+
+ ndbr_desc = to_blk_region_desc(ndr_desc);
+ ndbr = kzalloc(sizeof(*ndbr) + sizeof(struct nd_mapping)
+ * ndr_desc->num_mappings,
+ GFP_KERNEL);
+ if (ndbr) {
+ nd_region = &ndbr->nd_region;
+ ndbr->enable = ndbr_desc->enable;
+ ndbr->do_io = ndbr_desc->do_io;
+ }
+ region_buf = ndbr;
+ } else {
+ nd_region = kzalloc(sizeof(struct nd_region)
+ + sizeof(struct nd_mapping)
+ * ndr_desc->num_mappings,
+ GFP_KERNEL);
+ region_buf = nd_region;
+ }
+
+ if (!region_buf)
+ return NULL;
+ nd_region->id = ida_simple_get(&region_ida, 0, 0, GFP_KERNEL);
+ if (nd_region->id < 0)
+ goto err_id;
+
+ nd_region->lane = alloc_percpu(struct nd_percpu_lane);
+ if (!nd_region->lane)
+ goto err_percpu;
+
+ for (i = 0; i < nr_cpu_ids; i++) {
+ struct nd_percpu_lane *ndl;
+
+ ndl = per_cpu_ptr(nd_region->lane, i);
+ spin_lock_init(&ndl->lock);
+ ndl->count = 0;
+ }
+
+ for (i = 0; i < ndr_desc->num_mappings; i++) {
+ struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
+ struct nvdimm *nvdimm = mapping->nvdimm;
+
+ nd_region->mapping[i].nvdimm = nvdimm;
+ nd_region->mapping[i].start = mapping->start;
+ nd_region->mapping[i].size = mapping->size;
+ nd_region->mapping[i].position = mapping->position;
+ INIT_LIST_HEAD(&nd_region->mapping[i].labels);
+ mutex_init(&nd_region->mapping[i].lock);
+
+ get_device(&nvdimm->dev);
+ }
+ nd_region->ndr_mappings = ndr_desc->num_mappings;
+ nd_region->provider_data = ndr_desc->provider_data;
+ nd_region->nd_set = ndr_desc->nd_set;
+ nd_region->num_lanes = ndr_desc->num_lanes;
+ nd_region->flags = ndr_desc->flags;
+ nd_region->ro = ro;
+ nd_region->numa_node = ndr_desc->numa_node;
+ ida_init(&nd_region->ns_ida);
+ ida_init(&nd_region->btt_ida);
+ ida_init(&nd_region->pfn_ida);
+ ida_init(&nd_region->dax_ida);
+ dev = &nd_region->dev;
+ dev_set_name(dev, "region%d", nd_region->id);
+ dev->parent = &nvdimm_bus->dev;
+ dev->type = dev_type;
+ dev->groups = ndr_desc->attr_groups;
+ dev->of_node = ndr_desc->of_node;
+ nd_region->ndr_size = resource_size(ndr_desc->res);
+ nd_region->ndr_start = ndr_desc->res->start;
+ nd_device_register(dev);
+
+ return nd_region;
+
+ err_percpu:
+ ida_simple_remove(&region_ida, nd_region->id);
+ err_id:
+ kfree(region_buf);
+ return NULL;
+}
+
+struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc)
+{
+ ndr_desc->num_lanes = ND_MAX_LANES;
+ return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type,
+ __func__);
+}
+EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create);
+
+struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc)
+{
+ if (ndr_desc->num_mappings > 1)
+ return NULL;
+ ndr_desc->num_lanes = min(ndr_desc->num_lanes, ND_MAX_LANES);
+ return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type,
+ __func__);
+}
+EXPORT_SYMBOL_GPL(nvdimm_blk_region_create);
+
+struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc)
+{
+ ndr_desc->num_lanes = ND_MAX_LANES;
+ return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type,
+ __func__);
+}
+EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
+
+/**
+ * nvdimm_flush - flush any posted write queues between the cpu and pmem media
+ * @nd_region: blk or interleaved pmem region
+ */
+void nvdimm_flush(struct nd_region *nd_region)
+{
+ struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
+ int i, idx;
+
+ /*
+ * Try to encourage some diversity in flush hint addresses
+ * across cpus assuming a limited number of flush hints.
+ */
+ idx = this_cpu_read(flush_idx);
+ idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));
+
+ /*
+ * The first wmb() is needed to 'sfence' all previous writes
+ * such that they are architecturally visible for the platform
+ * buffer flush. Note that we've already arranged for pmem
+ * writes to avoid the cache via memcpy_flushcache(). The final
+ * wmb() ensures ordering for the NVDIMM flush write.
+ */
+ wmb();
+ for (i = 0; i < nd_region->ndr_mappings; i++)
+ if (ndrd_get_flush_wpq(ndrd, i, 0))
+ writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
+ wmb();
+}
+EXPORT_SYMBOL_GPL(nvdimm_flush);
+
+/**
+ * nvdimm_has_flush - determine write flushing requirements
+ * @nd_region: blk or interleaved pmem region
+ *
+ * Returns 1 if writes require flushing
+ * Returns 0 if writes do not require flushing
+ * Returns -ENXIO if flushing capability can not be determined
+ */
+int nvdimm_has_flush(struct nd_region *nd_region)
+{
+ int i;
+
+ /* no nvdimm or pmem api == flushing capability unknown */
+ if (nd_region->ndr_mappings == 0
+ || !IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API))
+ return -ENXIO;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ /* flush hints present / available */
+ if (nvdimm->num_flush)
+ return 1;
+ }
+
+ /*
+ * The platform defines dimm devices without hints, assume
+ * platform persistence mechanism like ADR
+ */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvdimm_has_flush);
+
+int nvdimm_has_cache(struct nd_region *nd_region)
+{
+ return is_nd_pmem(&nd_region->dev) &&
+ !test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags);
+}
+EXPORT_SYMBOL_GPL(nvdimm_has_cache);
+
+struct conflict_context {
+ struct nd_region *nd_region;
+ resource_size_t start, size;
+};
+
+static int region_conflict(struct device *dev, void *data)
+{
+ struct nd_region *nd_region;
+ struct conflict_context *ctx = data;
+ resource_size_t res_end, region_end, region_start;
+
+ if (!is_memory(dev))
+ return 0;
+
+ nd_region = to_nd_region(dev);
+ if (nd_region == ctx->nd_region)
+ return 0;
+
+ res_end = ctx->start + ctx->size;
+ region_start = nd_region->ndr_start;
+ region_end = region_start + nd_region->ndr_size;
+ if (ctx->start >= region_start && ctx->start < region_end)
+ return -EBUSY;
+ if (res_end > region_start && res_end <= region_end)
+ return -EBUSY;
+ return 0;
+}
+
+int nd_region_conflict(struct nd_region *nd_region, resource_size_t start,
+ resource_size_t size)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+ struct conflict_context ctx = {
+ .nd_region = nd_region,
+ .start = start,
+ .size = size,
+ };
+
+ return device_for_each_child(&nvdimm_bus->dev, &ctx, region_conflict);
+}
+
+void __exit nd_region_devs_exit(void)
+{
+ ida_destroy(&region_ida);
+}