author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-11 08:27:49 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-11 08:27:49 +0000
commit     ace9429bb58fd418f0c81d4c2835699bddf6bde6
tree       b2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/nvdimm
parent     Initial commit.
Adding upstream version 6.6.15.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/nvdimm')
30 files changed, 15231 insertions, 0 deletions
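Among the files added below, drivers/nvdimm/badrange.c exports a small API (badrange_init(), badrange_add(), badrange_forget(), nvdimm_badblocks_populate()) for tracking poisoned media ranges. The following is a hypothetical kernel-side sketch of how a bus provider such as the ACPI NFIT driver might drive that API; struct my_bus and the my_bus_* helpers are invented here for illustration and are not part of this commit:

/*
 * Hypothetical sketch: feeding media errors into the badrange list
 * implemented in drivers/nvdimm/badrange.c below. Only the badrange_*
 * calls are real; the my_bus_* names are made up.
 */
#include <linux/types.h>
#include <linux/libnvdimm.h>

struct my_bus {
        struct badrange badrange;       /* list + lock, see badrange_init() */
};

static void my_bus_init(struct my_bus *bus)
{
        badrange_init(&bus->badrange);
}

/* e.g. called when an ARS status query reports a poisoned SPA range */
static int my_bus_note_poison(struct my_bus *bus, u64 spa, u64 len)
{
        /* duplicates and overlaps are resolved when converted to badblocks */
        return badrange_add(&bus->badrange, spa, len);
}

/* e.g. called after the platform reports the range as cleared */
static void my_bus_clear_poison(struct my_bus *bus, phys_addr_t spa,
                                unsigned int len)
{
        badrange_forget(&bus->badrange, spa, len);
}

nvdimm_badblocks_populate() then converts whatever has accumulated in the list into block-layer badblocks entries for the resource range of the region being initialized.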
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig new file mode 100644 index 0000000000..77b06d54cc --- /dev/null +++ b/drivers/nvdimm/Kconfig @@ -0,0 +1,148 @@ +# SPDX-License-Identifier: GPL-2.0-only +menuconfig LIBNVDIMM + tristate "NVDIMM (Non-Volatile Memory Device) Support" + depends on PHYS_ADDR_T_64BIT + depends on HAS_IOMEM + depends on BLK_DEV + select MEMREGION + help + Generic support for non-volatile memory devices including + ACPI-6-NFIT defined resources. On platforms that define an + NFIT, or otherwise can discover NVDIMM resources, a libnvdimm + bus is registered to advertise PMEM (persistent memory) + namespaces (/dev/pmemX). A PMEM namespace refers to a + memory resource that may span multiple DIMMs and support DAX + (see CONFIG_DAX). + +if LIBNVDIMM + +config BLK_DEV_PMEM + tristate "PMEM: Persistent memory block device support" + default LIBNVDIMM + select DAX + select ND_BTT if BTT + select ND_PFN if NVDIMM_PFN + help + Memory ranges for PMEM are described by either an NFIT + (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a + non-standard OEM-specific E820 memory type (type-12, see + CONFIG_X86_PMEM_LEGACY), or it is manually specified by the + 'memmap=nn[KMG]!ss[KMG]' kernel command line (see + Documentation/admin-guide/kernel-parameters.rst). This driver converts + these persistent memory ranges into block devices that are + capable of DAX (direct-access) file system mappings. See + Documentation/driver-api/nvdimm/nvdimm.rst for more details. + + Say Y if you want to use an NVDIMM + +config ND_CLAIM + bool + +config ND_BTT + tristate + +config BTT + bool "BTT: Block Translation Table (atomic sector updates)" + default y if LIBNVDIMM + select ND_CLAIM + help + The Block Translation Table (BTT) provides atomic sector + update semantics for persistent memory devices, so that + applications that rely on sector writes not being torn (a + guarantee that typical disks provide) can continue to do so. + The BTT manifests itself as an alternate personality for an + NVDIMM namespace, i.e. a namespace can be in raw mode pmemX, + or 'sectored' mode. + + Select Y if unsure + +config ND_PFN + tristate + +config NVDIMM_PFN + bool "PFN: Map persistent (device) memory" + default LIBNVDIMM + depends on ZONE_DEVICE + select ND_CLAIM + help + Map persistent memory, i.e. advertise it to the memory + management sub-system. By default persistent memory does + not support direct I/O, RDMA, or any other usage that + requires a 'struct page' to mediate an I/O request. This + driver allocates and initializes the infrastructure needed + to support those use cases. + + Select Y if unsure + +config NVDIMM_DAX + bool "NVDIMM DAX: Raw access to persistent memory" + default LIBNVDIMM + depends on NVDIMM_PFN + help + Support raw device dax access to a persistent memory + namespace. For environments that want to hard partition + persistent memory, this capability provides a mechanism to + sub-divide a namespace into character devices that can only be + accessed via DAX (mmap(2)). + + Select Y if unsure + +config OF_PMEM + tristate "Device-tree support for persistent memory regions" + depends on OF + default LIBNVDIMM + help + Allows regions of persistent memory to be described in the + device-tree. + + Select Y if unsure. 
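The BTT option above gets its atomic-sector behavior by remapping each external LBA through a 4-byte map entry whose two top bits carry the 'zero' and 'error' flags; the encoding is defined in drivers/nvdimm/btt.h further down in this commit. A self-contained sketch of that encoding, with the macro values copied from btt.h (unsigned literals used here so the sketch builds cleanly outside the kernel) and a purely illustrative decode in main():

#include <stdint.h>
#include <stdio.h>

#define MAP_TRIM_SHIFT  31
#define MAP_TRIM_MASK   (1u << MAP_TRIM_SHIFT)
#define MAP_ERR_SHIFT   30
#define MAP_ERR_MASK    (1u << MAP_ERR_SHIFT)
#define MAP_LBA_MASK    (~((1u << MAP_TRIM_SHIFT) | (1u << MAP_ERR_SHIFT)))
#define MAP_ENT_NORMAL  0xC0000000      /* both flag bits set: an ordinary, valid mapping */

#define ent_lba(ent)    ((ent) & MAP_LBA_MASK)
#define ent_e_flag(ent) (!!((ent) & MAP_ERR_MASK))
#define ent_z_flag(ent) (!!((ent) & MAP_TRIM_MASK))

int main(void)
{
        /* a 'normal' entry: external LBA currently mapped to internal block 0x1234 */
        uint32_t ent = MAP_ENT_NORMAL | 0x1234;

        printf("postmap=%#x err_bit=%d trim_bit=%d\n",
               ent_lba(ent), ent_e_flag(ent), ent_z_flag(ent));

        /* an all-zero entry is the initial state: identity mapping, never written */
        return 0;
}

btt_map_read() in btt.c below interprets the two bits together: both clear is the initial identity mapping, the error-only and zero-only combinations mark bad or discarded data, and both set (MAP_ENT_NORMAL) is an ordinary mapping.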
+ +config NVDIMM_KEYS + def_bool y + depends on ENCRYPTED_KEYS + depends on (LIBNVDIMM=ENCRYPTED_KEYS) || LIBNVDIMM=m + +config NVDIMM_KMSAN + bool + depends on KMSAN + help + KMSAN, and other memory debug facilities, increase the size of + 'struct page' to contain extra metadata. This collides with + the NVDIMM capability to store a potentially + larger-than-"System RAM" size 'struct page' array in a + reservation of persistent memory rather than limited / + precious DRAM. However, that reservation needs to persist for + the life of the given NVDIMM namespace. If you are using KMSAN + to debug an issue unrelated to NVDIMMs or DAX then say N to this + option. Otherwise, say Y but understand that any namespaces + (with the page array stored pmem) created with this build of + the kernel will permanently reserve and strand excess + capacity compared to the CONFIG_KMSAN=n case. + + Select N if unsure. + +config NVDIMM_TEST_BUILD + tristate "Build the unit test core" + depends on m + depends on COMPILE_TEST && X86_64 + default m if COMPILE_TEST + help + Build the core of the unit test infrastructure. The result of + this build is non-functional for unit test execution, but it + otherwise helps catch build errors induced by changes to the + core devm_memremap_pages() implementation and other + infrastructure. + +config NVDIMM_SECURITY_TEST + bool "Enable NVDIMM security unit tests" + depends on NVDIMM_KEYS + help + The NVDIMM and CXL subsystems support unit testing of their device + security state machines. The NVDIMM_SECURITY_TEST option disables CPU + cache maintenance operations around events like secure erase and + overwrite. Also, when enabled, the NVDIMM subsystem core helps the unit + test implement a mock state machine. + + Select N if unsure. + +endif diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile new file mode 100644 index 0000000000..ba0296dca9 --- /dev/null +++ b/drivers/nvdimm/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o +obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o +obj-$(CONFIG_ND_BTT) += nd_btt.o +obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o +obj-$(CONFIG_OF_PMEM) += of_pmem.o +obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o + +nd_pmem-y := pmem.o + +nd_btt-y := btt.o + +nd_e820-y := e820.o + +libnvdimm-y := core.o +libnvdimm-y += bus.o +libnvdimm-y += dimm_devs.o +libnvdimm-$(CONFIG_PERF_EVENTS) += nd_perf.o +libnvdimm-y += dimm.o +libnvdimm-y += region_devs.o +libnvdimm-y += region.o +libnvdimm-y += namespace_devs.o +libnvdimm-y += label.o +libnvdimm-y += badrange.o +libnvdimm-$(CONFIG_ND_CLAIM) += claim.o +libnvdimm-$(CONFIG_BTT) += btt_devs.o +libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o +libnvdimm-$(CONFIG_NVDIMM_DAX) += dax_devs.o +libnvdimm-$(CONFIG_NVDIMM_KEYS) += security.o + +TOOLS := ../../tools +TEST_SRC := $(TOOLS)/testing/nvdimm/test +obj-$(CONFIG_NVDIMM_TEST_BUILD) += $(TEST_SRC)/iomap.o diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c new file mode 100644 index 0000000000..aaf6e215a8 --- /dev/null +++ b/drivers/nvdimm/badrange.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. 
+ */ +#include <linux/libnvdimm.h> +#include <linux/badblocks.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/ctype.h> +#include <linux/ndctl.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/io.h> +#include "nd-core.h" +#include "nd.h" + +void badrange_init(struct badrange *badrange) +{ + INIT_LIST_HEAD(&badrange->list); + spin_lock_init(&badrange->lock); +} +EXPORT_SYMBOL_GPL(badrange_init); + +static void append_badrange_entry(struct badrange *badrange, + struct badrange_entry *bre, u64 addr, u64 length) +{ + lockdep_assert_held(&badrange->lock); + bre->start = addr; + bre->length = length; + list_add_tail(&bre->list, &badrange->list); +} + +static int alloc_and_append_badrange_entry(struct badrange *badrange, + u64 addr, u64 length, gfp_t flags) +{ + struct badrange_entry *bre; + + bre = kzalloc(sizeof(*bre), flags); + if (!bre) + return -ENOMEM; + + append_badrange_entry(badrange, bre, addr, length); + return 0; +} + +static int add_badrange(struct badrange *badrange, u64 addr, u64 length) +{ + struct badrange_entry *bre, *bre_new; + + spin_unlock(&badrange->lock); + bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL); + spin_lock(&badrange->lock); + + if (list_empty(&badrange->list)) { + if (!bre_new) + return -ENOMEM; + append_badrange_entry(badrange, bre_new, addr, length); + return 0; + } + + /* + * There is a chance this is a duplicate, check for those first. + * This will be the common case as ARS_STATUS returns all known + * errors in the SPA space, and we can't query it per region + */ + list_for_each_entry(bre, &badrange->list, list) + if (bre->start == addr) { + /* If length has changed, update this list entry */ + if (bre->length != length) + bre->length = length; + kfree(bre_new); + return 0; + } + + /* + * If not a duplicate or a simple length update, add the entry as is, + * as any overlapping ranges will get resolved when the list is consumed + * and converted to badblocks + */ + if (!bre_new) + return -ENOMEM; + append_badrange_entry(badrange, bre_new, addr, length); + + return 0; +} + +int badrange_add(struct badrange *badrange, u64 addr, u64 length) +{ + int rc; + + spin_lock(&badrange->lock); + rc = add_badrange(badrange, addr, length); + spin_unlock(&badrange->lock); + + return rc; +} +EXPORT_SYMBOL_GPL(badrange_add); + +void badrange_forget(struct badrange *badrange, phys_addr_t start, + unsigned int len) +{ + struct list_head *badrange_list = &badrange->list; + u64 clr_end = start + len - 1; + struct badrange_entry *bre, *next; + + spin_lock(&badrange->lock); + + /* + * [start, clr_end] is the badrange interval being cleared. + * [bre->start, bre_end] is the badrange_list entry we're comparing + * the above interval against. 
The badrange list entry may need + * to be modified (update either start or length), deleted, or + * split into two based on the overlap characteristics + */ + + list_for_each_entry_safe(bre, next, badrange_list, list) { + u64 bre_end = bre->start + bre->length - 1; + + /* Skip intervals with no intersection */ + if (bre_end < start) + continue; + if (bre->start > clr_end) + continue; + /* Delete completely overlapped badrange entries */ + if ((bre->start >= start) && (bre_end <= clr_end)) { + list_del(&bre->list); + kfree(bre); + continue; + } + /* Adjust start point of partially cleared entries */ + if ((start <= bre->start) && (clr_end > bre->start)) { + bre->length -= clr_end - bre->start + 1; + bre->start = clr_end + 1; + continue; + } + /* Adjust bre->length for partial clearing at the tail end */ + if ((bre->start < start) && (bre_end <= clr_end)) { + /* bre->start remains the same */ + bre->length = start - bre->start; + continue; + } + /* + * If clearing in the middle of an entry, we split it into + * two by modifying the current entry to represent one half of + * the split, and adding a new entry for the second half. + */ + if ((bre->start < start) && (bre_end > clr_end)) { + u64 new_start = clr_end + 1; + u64 new_len = bre_end - new_start + 1; + + /* Add new entry covering the right half */ + alloc_and_append_badrange_entry(badrange, new_start, + new_len, GFP_NOWAIT); + /* Adjust this entry to cover the left half */ + bre->length = start - bre->start; + continue; + } + } + spin_unlock(&badrange->lock); +} +EXPORT_SYMBOL_GPL(badrange_forget); + +static void set_badblock(struct badblocks *bb, sector_t s, int num) +{ + dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n", + (u64) s * 512, (u64) num * 512); + /* this isn't an error as the hardware will still throw an exception */ + if (badblocks_set(bb, s, num, 1)) + dev_info_once(bb->dev, "%s: failed for sector %llx\n", + __func__, (u64) s); +} + +/** + * __add_badblock_range() - Convert a physical address range to bad sectors + * @bb: badblocks instance to populate + * @ns_offset: namespace offset where the error range begins (in bytes) + * @len: number of bytes of badrange to be added + * + * This assumes that the range provided with (ns_offset, len) is within + * the bounds of physical addresses for this namespace, i.e. 
lies in the + * interval [ns_start, ns_start + ns_size) + */ +static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) +{ + const unsigned int sector_size = 512; + sector_t start_sector, end_sector; + u64 num_sectors; + u32 rem; + + start_sector = div_u64(ns_offset, sector_size); + end_sector = div_u64_rem(ns_offset + len, sector_size, &rem); + if (rem) + end_sector++; + num_sectors = end_sector - start_sector; + + if (unlikely(num_sectors > (u64)INT_MAX)) { + u64 remaining = num_sectors; + sector_t s = start_sector; + + while (remaining) { + int done = min_t(u64, remaining, INT_MAX); + + set_badblock(bb, s, done); + remaining -= done; + s += done; + } + } else + set_badblock(bb, start_sector, num_sectors); +} + +static void badblocks_populate(struct badrange *badrange, + struct badblocks *bb, const struct range *range) +{ + struct badrange_entry *bre; + + if (list_empty(&badrange->list)) + return; + + list_for_each_entry(bre, &badrange->list, list) { + u64 bre_end = bre->start + bre->length - 1; + + /* Discard intervals with no intersection */ + if (bre_end < range->start) + continue; + if (bre->start > range->end) + continue; + /* Deal with any overlap after start of the namespace */ + if (bre->start >= range->start) { + u64 start = bre->start; + u64 len; + + if (bre_end <= range->end) + len = bre->length; + else + len = range->start + range_len(range) + - bre->start; + __add_badblock_range(bb, start - range->start, len); + continue; + } + /* + * Deal with overlap for badrange starting before + * the namespace. + */ + if (bre->start < range->start) { + u64 len; + + if (bre_end < range->end) + len = bre->start + bre->length - range->start; + else + len = range_len(range); + __add_badblock_range(bb, 0, len); + } + } +} + +/** + * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks + * @region: parent region of the range to interrogate + * @bb: badblocks instance to populate + * @res: resource range to consider + * + * The badrange list generated during bus initialization may contain + * multiple, possibly overlapping physical address ranges. Compare each + * of these ranges to the resource range currently being initialized, + * and add badblocks entries for all matching sub-ranges + */ +void nvdimm_badblocks_populate(struct nd_region *nd_region, + struct badblocks *bb, const struct range *range) +{ + struct nvdimm_bus *nvdimm_bus; + + if (!is_memory(&nd_region->dev)) { + dev_WARN_ONCE(&nd_region->dev, 1, + "%s only valid for pmem regions\n", __func__); + return; + } + nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + + nvdimm_bus_lock(&nvdimm_bus->dev); + badblocks_populate(&nvdimm_bus->badrange, bb, range); + nvdimm_bus_unlock(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c new file mode 100644 index 0000000000..d5593b0dc7 --- /dev/null +++ b/drivers/nvdimm/btt.c @@ -0,0 +1,1725 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Block Translation Table + * Copyright (c) 2014-2015, Intel Corporation. 
+ */ +#include <linux/highmem.h> +#include <linux/debugfs.h> +#include <linux/blkdev.h> +#include <linux/pagemap.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/hdreg.h> +#include <linux/sizes.h> +#include <linux/ndctl.h> +#include <linux/fs.h> +#include <linux/nd.h> +#include <linux/backing-dev.h> +#include "btt.h" +#include "nd.h" + +enum log_ent_request { + LOG_NEW_ENT = 0, + LOG_OLD_ENT +}; + +static struct device *to_dev(struct arena_info *arena) +{ + return &arena->nd_btt->dev; +} + +static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset) +{ + return offset + nd_btt->initial_offset; +} + +static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n, unsigned long flags) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets may be shifted from the base of the device */ + offset = adjust_initial_offset(nd_btt, offset); + return nvdimm_read_bytes(ndns, offset, buf, n, flags); +} + +static int arena_write_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n, unsigned long flags) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets may be shifted from the base of the device */ + offset = adjust_initial_offset(nd_btt, offset); + return nvdimm_write_bytes(ndns, offset, buf, n, flags); +} + +static int btt_info_write(struct arena_info *arena, struct btt_sb *super) +{ + int ret; + + /* + * infooff and info2off should always be at least 512B aligned. + * We rely on that to make sure rw_bytes does error clearing + * correctly, so make sure that is the case. + */ + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512), + "arena->infooff: %#llx is unaligned\n", arena->infooff); + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512), + "arena->info2off: %#llx is unaligned\n", arena->info2off); + + ret = arena_write_bytes(arena, arena->info2off, super, + sizeof(struct btt_sb), 0); + if (ret) + return ret; + + return arena_write_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb), 0); +} + +static int btt_info_read(struct arena_info *arena, struct btt_sb *super) +{ + return arena_read_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb), 0); +} + +/* + * 'raw' version of btt_map write + * Assumptions: + * mapping is in little-endian + * mapping contains 'E' and 'Z' flags as desired + */ +static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping, + unsigned long flags) +{ + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + if (unlikely(lba >= arena->external_nlba)) + dev_err_ratelimited(to_dev(arena), + "%s: lba %#x out of range (max: %#x)\n", + __func__, lba, arena->external_nlba); + return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags); +} + +static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, + u32 z_flag, u32 e_flag, unsigned long rwb_flags) +{ + u32 ze; + __le32 mapping_le; + + /* + * This 'mapping' is supposed to be just the LBA mapping, without + * any flags set, so strip the flag bits. 
+ */ + mapping = ent_lba(mapping); + + ze = (z_flag << 1) + e_flag; + switch (ze) { + case 0: + /* + * We want to set neither of the Z or E flags, and + * in the actual layout, this means setting the bit + * positions of both to '1' to indicate a 'normal' + * map entry + */ + mapping |= MAP_ENT_NORMAL; + break; + case 1: + mapping |= (1 << MAP_ERR_SHIFT); + break; + case 2: + mapping |= (1 << MAP_TRIM_SHIFT); + break; + default: + /* + * The case where Z and E are both sent in as '1' could be + * construed as a valid 'normal' case, but we decide not to, + * to avoid confusion + */ + dev_err_ratelimited(to_dev(arena), + "Invalid use of Z and E flags\n"); + return -EIO; + } + + mapping_le = cpu_to_le32(mapping); + return __btt_map_write(arena, lba, mapping_le, rwb_flags); +} + +static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, + int *trim, int *error, unsigned long rwb_flags) +{ + int ret; + __le32 in; + u32 raw_mapping, postmap, ze, z_flag, e_flag; + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + if (unlikely(lba >= arena->external_nlba)) + dev_err_ratelimited(to_dev(arena), + "%s: lba %#x out of range (max: %#x)\n", + __func__, lba, arena->external_nlba); + + ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags); + if (ret) + return ret; + + raw_mapping = le32_to_cpu(in); + + z_flag = ent_z_flag(raw_mapping); + e_flag = ent_e_flag(raw_mapping); + ze = (z_flag << 1) + e_flag; + postmap = ent_lba(raw_mapping); + + /* Reuse the {z,e}_flag variables for *trim and *error */ + z_flag = 0; + e_flag = 0; + + switch (ze) { + case 0: + /* Initial state. Return postmap = premap */ + *mapping = lba; + break; + case 1: + *mapping = postmap; + e_flag = 1; + break; + case 2: + *mapping = postmap; + z_flag = 1; + break; + case 3: + *mapping = postmap; + break; + default: + return -EIO; + } + + if (trim) + *trim = z_flag; + if (error) + *error = e_flag; + + return ret; +} + +static int btt_log_group_read(struct arena_info *arena, u32 lane, + struct log_group *log) +{ + return arena_read_bytes(arena, + arena->logoff + (lane * LOG_GRP_SIZE), log, + LOG_GRP_SIZE, 0); +} + +static struct dentry *debugfs_root; + +static void arena_debugfs_init(struct arena_info *a, struct dentry *parent, + int idx) +{ + char dirname[32]; + struct dentry *d; + + /* If for some reason, parent bttN was not created, exit */ + if (!parent) + return; + + snprintf(dirname, 32, "arena%d", idx); + d = debugfs_create_dir(dirname, parent); + if (IS_ERR_OR_NULL(d)) + return; + a->debugfs_dir = d; + + debugfs_create_x64("size", S_IRUGO, d, &a->size); + debugfs_create_x64("external_lba_start", S_IRUGO, d, + &a->external_lba_start); + debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba); + debugfs_create_u32("internal_lbasize", S_IRUGO, d, + &a->internal_lbasize); + debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba); + debugfs_create_u32("external_lbasize", S_IRUGO, d, + &a->external_lbasize); + debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree); + debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major); + debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor); + debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff); + debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff); + debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff); + debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff); + debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff); + debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off); + debugfs_create_x32("flags", S_IRUGO, d, 
&a->flags); + debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]); + debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]); +} + +static void btt_debugfs_init(struct btt *btt) +{ + int i = 0; + struct arena_info *arena; + + btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev), + debugfs_root); + if (IS_ERR_OR_NULL(btt->debugfs_dir)) + return; + + list_for_each_entry(arena, &btt->arena_list, list) { + arena_debugfs_init(arena, btt->debugfs_dir, i); + i++; + } +} + +static u32 log_seq(struct log_group *log, int log_idx) +{ + return le32_to_cpu(log->ent[log_idx].seq); +} + +/* + * This function accepts two log entries, and uses the + * sequence number to find the 'older' entry. + * It also updates the sequence number in this old entry to + * make it the 'new' one if the mark_flag is set. + * Finally, it returns which of the entries was the older one. + * + * TODO The logic feels a bit kludge-y. make it better.. + */ +static int btt_log_get_old(struct arena_info *a, struct log_group *log) +{ + int idx0 = a->log_index[0]; + int idx1 = a->log_index[1]; + int old; + + /* + * the first ever time this is seen, the entry goes into [0] + * the next time, the following logic works out to put this + * (next) entry into [1] + */ + if (log_seq(log, idx0) == 0) { + log->ent[idx0].seq = cpu_to_le32(1); + return 0; + } + + if (log_seq(log, idx0) == log_seq(log, idx1)) + return -EINVAL; + if (log_seq(log, idx0) + log_seq(log, idx1) > 5) + return -EINVAL; + + if (log_seq(log, idx0) < log_seq(log, idx1)) { + if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1) + old = 0; + else + old = 1; + } else { + if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1) + old = 1; + else + old = 0; + } + + return old; +} + +/* + * This function copies the desired (old/new) log entry into ent if + * it is not NULL. It returns the sub-slot number (0 or 1) + * where the desired log entry was found. Negative return values + * indicate errors. + */ +static int btt_log_read(struct arena_info *arena, u32 lane, + struct log_entry *ent, int old_flag) +{ + int ret; + int old_ent, ret_ent; + struct log_group log; + + ret = btt_log_group_read(arena, lane, &log); + if (ret) + return -EIO; + + old_ent = btt_log_get_old(arena, &log); + if (old_ent < 0 || old_ent > 1) { + dev_err(to_dev(arena), + "log corruption (%d): lane %d seq [%d, %d]\n", + old_ent, lane, log.ent[arena->log_index[0]].seq, + log.ent[arena->log_index[1]].seq); + /* TODO set error state? */ + return -EIO; + } + + ret_ent = (old_flag ? 
old_ent : (1 - old_ent)); + + if (ent != NULL) + memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE); + + return ret_ent; +} + +/* + * This function commits a log entry to media + * It does _not_ prepare the freelist entry for the next write + * btt_flog_write is the wrapper for updating the freelist elements + */ +static int __btt_log_write(struct arena_info *arena, u32 lane, + u32 sub, struct log_entry *ent, unsigned long flags) +{ + int ret; + u32 group_slot = arena->log_index[sub]; + unsigned int log_half = LOG_ENT_SIZE / 2; + void *src = ent; + u64 ns_off; + + ns_off = arena->logoff + (lane * LOG_GRP_SIZE) + + (group_slot * LOG_ENT_SIZE); + /* split the 16B write into atomic, durable halves */ + ret = arena_write_bytes(arena, ns_off, src, log_half, flags); + if (ret) + return ret; + + ns_off += log_half; + src += log_half; + return arena_write_bytes(arena, ns_off, src, log_half, flags); +} + +static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, + struct log_entry *ent) +{ + int ret; + + ret = __btt_log_write(arena, lane, sub, ent, NVDIMM_IO_ATOMIC); + if (ret) + return ret; + + /* prepare the next free entry */ + arena->freelist[lane].sub = 1 - arena->freelist[lane].sub; + if (++(arena->freelist[lane].seq) == 4) + arena->freelist[lane].seq = 1; + if (ent_e_flag(le32_to_cpu(ent->old_map))) + arena->freelist[lane].has_err = 1; + arena->freelist[lane].block = ent_lba(le32_to_cpu(ent->old_map)); + + return ret; +} + +/* + * This function initializes the BTT map to the initial state, which is + * all-zeroes, and indicates an identity mapping + */ +static int btt_map_init(struct arena_info *arena) +{ + int ret = -EINVAL; + void *zerobuf; + size_t offset = 0; + size_t chunk_size = SZ_2M; + size_t mapsize = arena->logoff - arena->mapoff; + + zerobuf = kzalloc(chunk_size, GFP_KERNEL); + if (!zerobuf) + return -ENOMEM; + + /* + * mapoff should always be at least 512B aligned. We rely on that to + * make sure rw_bytes does error clearing correctly, so make sure that + * is the case. + */ + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512), + "arena->mapoff: %#llx is unaligned\n", arena->mapoff); + + while (mapsize) { + size_t size = min(mapsize, chunk_size); + + dev_WARN_ONCE(to_dev(arena), size < 512, + "chunk size: %#zx is unaligned\n", size); + ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf, + size, 0); + if (ret) + goto free; + + offset += size; + mapsize -= size; + cond_resched(); + } + + free: + kfree(zerobuf); + return ret; +} + +/* + * This function initializes the BTT log with 'fake' entries pointing + * to the initial reserved set of blocks as being free + */ +static int btt_log_init(struct arena_info *arena) +{ + size_t logsize = arena->info2off - arena->logoff; + size_t chunk_size = SZ_4K, offset = 0; + struct log_entry ent; + void *zerobuf; + int ret; + u32 i; + + zerobuf = kzalloc(chunk_size, GFP_KERNEL); + if (!zerobuf) + return -ENOMEM; + /* + * logoff should always be at least 512B aligned. We rely on that to + * make sure rw_bytes does error clearing correctly, so make sure that + * is the case. 
+ */ + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512), + "arena->logoff: %#llx is unaligned\n", arena->logoff); + + while (logsize) { + size_t size = min(logsize, chunk_size); + + dev_WARN_ONCE(to_dev(arena), size < 512, + "chunk size: %#zx is unaligned\n", size); + ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf, + size, 0); + if (ret) + goto free; + + offset += size; + logsize -= size; + cond_resched(); + } + + for (i = 0; i < arena->nfree; i++) { + ent.lba = cpu_to_le32(i); + ent.old_map = cpu_to_le32(arena->external_nlba + i); + ent.new_map = cpu_to_le32(arena->external_nlba + i); + ent.seq = cpu_to_le32(LOG_SEQ_INIT); + ret = __btt_log_write(arena, i, 0, &ent, 0); + if (ret) + goto free; + } + + free: + kfree(zerobuf); + return ret; +} + +static u64 to_namespace_offset(struct arena_info *arena, u64 lba) +{ + return arena->dataoff + ((u64)lba * arena->internal_lbasize); +} + +static int arena_clear_freelist_error(struct arena_info *arena, u32 lane) +{ + int ret = 0; + + if (arena->freelist[lane].has_err) { + void *zero_page = page_address(ZERO_PAGE(0)); + u32 lba = arena->freelist[lane].block; + u64 nsoff = to_namespace_offset(arena, lba); + unsigned long len = arena->sector_size; + + mutex_lock(&arena->err_lock); + + while (len) { + unsigned long chunk = min(len, PAGE_SIZE); + + ret = arena_write_bytes(arena, nsoff, zero_page, + chunk, 0); + if (ret) + break; + len -= chunk; + nsoff += chunk; + if (len == 0) + arena->freelist[lane].has_err = 0; + } + mutex_unlock(&arena->err_lock); + } + return ret; +} + +static int btt_freelist_init(struct arena_info *arena) +{ + int new, ret; + struct log_entry log_new; + u32 i, map_entry, log_oldmap, log_newmap; + + arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry), + GFP_KERNEL); + if (!arena->freelist) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) { + new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT); + if (new < 0) + return new; + + /* old and new map entries with any flags stripped out */ + log_oldmap = ent_lba(le32_to_cpu(log_new.old_map)); + log_newmap = ent_lba(le32_to_cpu(log_new.new_map)); + + /* sub points to the next one to be overwritten */ + arena->freelist[i].sub = 1 - new; + arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq)); + arena->freelist[i].block = log_oldmap; + + /* + * FIXME: if error clearing fails during init, we want to make + * the BTT read-only + */ + if (ent_e_flag(le32_to_cpu(log_new.old_map)) && + !ent_normal(le32_to_cpu(log_new.old_map))) { + arena->freelist[i].has_err = 1; + ret = arena_clear_freelist_error(arena, i); + if (ret) + dev_err_ratelimited(to_dev(arena), + "Unable to clear known errors\n"); + } + + /* This implies a newly created or untouched flog entry */ + if (log_oldmap == log_newmap) + continue; + + /* Check if map recovery is needed */ + ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry, + NULL, NULL, 0); + if (ret) + return ret; + + /* + * The map_entry from btt_read_map is stripped of any flag bits, + * so use the stripped out versions from the log as well for + * testing whether recovery is needed. For restoration, use the + * 'raw' version of the log entries as that captured what we + * were going to write originally. + */ + if ((log_newmap != map_entry) && (log_oldmap == map_entry)) { + /* + * Last transaction wrote the flog, but wasn't able + * to complete the map write. So fix up the map. 
+ */ + ret = btt_map_write(arena, le32_to_cpu(log_new.lba), + le32_to_cpu(log_new.new_map), 0, 0, 0); + if (ret) + return ret; + } + } + + return 0; +} + +static bool ent_is_padding(struct log_entry *ent) +{ + return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0) + && (ent->seq == 0); +} + +/* + * Detecting valid log indices: We read a log group (see the comments in btt.h + * for a description of a 'log_group' and its 'slots'), and iterate over its + * four slots. We expect that a padding slot will be all-zeroes, and use this + * to detect a padding slot vs. an actual entry. + * + * If a log_group is in the initial state, i.e. hasn't been used since the + * creation of this BTT layout, it will have three of the four slots with + * zeroes. We skip over these log_groups for the detection of log_index. If + * all log_groups are in the initial state (i.e. the BTT has never been + * written to), it is safe to assume the 'new format' of log entries in slots + * (0, 1). + */ +static int log_set_indices(struct arena_info *arena) +{ + bool idx_set = false, initial_state = true; + int ret, log_index[2] = {-1, -1}; + u32 i, j, next_idx = 0; + struct log_group log; + u32 pad_count = 0; + + for (i = 0; i < arena->nfree; i++) { + ret = btt_log_group_read(arena, i, &log); + if (ret < 0) + return ret; + + for (j = 0; j < 4; j++) { + if (!idx_set) { + if (ent_is_padding(&log.ent[j])) { + pad_count++; + continue; + } else { + /* Skip if index has been recorded */ + if ((next_idx == 1) && + (j == log_index[0])) + continue; + /* valid entry, record index */ + log_index[next_idx] = j; + next_idx++; + } + if (next_idx == 2) { + /* two valid entries found */ + idx_set = true; + } else if (next_idx > 2) { + /* too many valid indices */ + return -ENXIO; + } + } else { + /* + * once the indices have been set, just verify + * that all subsequent log groups are either in + * their initial state or follow the same + * indices. + */ + if (j == log_index[0]) { + /* entry must be 'valid' */ + if (ent_is_padding(&log.ent[j])) + return -ENXIO; + } else if (j == log_index[1]) { + ; + /* + * log_index[1] can be padding if the + * lane never got used and it is still + * in the initial state (three 'padding' + * entries) + */ + } else { + /* entry must be invalid (padding) */ + if (!ent_is_padding(&log.ent[j])) + return -ENXIO; + } + } + } + /* + * If any of the log_groups have more than one valid, + * non-padding entry, then the we are no longer in the + * initial_state + */ + if (pad_count < 3) + initial_state = false; + pad_count = 0; + } + + if (!initial_state && !idx_set) + return -ENXIO; + + /* + * If all the entries in the log were in the initial state, + * assume new padding scheme + */ + if (initial_state) + log_index[1] = 1; + + /* + * Only allow the known permutations of log/padding indices, + * i.e. 
(0, 1), and (0, 2) + */ + if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2))) + ; /* known index possibilities */ + else { + dev_err(to_dev(arena), "Found an unknown padding scheme\n"); + return -ENXIO; + } + + arena->log_index[0] = log_index[0]; + arena->log_index[1] = log_index[1]; + dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]); + dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]); + return 0; +} + +static int btt_rtt_init(struct arena_info *arena) +{ + arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL); + if (arena->rtt == NULL) + return -ENOMEM; + + return 0; +} + +static int btt_maplocks_init(struct arena_info *arena) +{ + u32 i; + + arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock), + GFP_KERNEL); + if (!arena->map_locks) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) + spin_lock_init(&arena->map_locks[i].lock); + + return 0; +} + +static struct arena_info *alloc_arena(struct btt *btt, size_t size, + size_t start, size_t arena_off) +{ + struct arena_info *arena; + u64 logsize, mapsize, datasize; + u64 available = size; + + arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL); + if (!arena) + return NULL; + arena->nd_btt = btt->nd_btt; + arena->sector_size = btt->sector_size; + mutex_init(&arena->err_lock); + + if (!size) + return arena; + + arena->size = size; + arena->external_lba_start = start; + arena->external_lbasize = btt->lbasize; + arena->internal_lbasize = roundup(arena->external_lbasize, + INT_LBASIZE_ALIGNMENT); + arena->nfree = BTT_DEFAULT_NFREE; + arena->version_major = btt->nd_btt->version_major; + arena->version_minor = btt->nd_btt->version_minor; + + if (available % BTT_PG_SIZE) + available -= (available % BTT_PG_SIZE); + + /* Two pages are reserved for the super block and its copy */ + available -= 2 * BTT_PG_SIZE; + + /* The log takes a fixed amount of space based on nfree */ + logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE); + available -= logsize; + + /* Calculate optimal split between map and data area */ + arena->internal_nlba = div_u64(available - BTT_PG_SIZE, + arena->internal_lbasize + MAP_ENT_SIZE); + arena->external_nlba = arena->internal_nlba - arena->nfree; + + mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE); + datasize = available - mapsize; + + /* 'Absolute' values, relative to start of storage space */ + arena->infooff = arena_off; + arena->dataoff = arena->infooff + BTT_PG_SIZE; + arena->mapoff = arena->dataoff + datasize; + arena->logoff = arena->mapoff + mapsize; + arena->info2off = arena->logoff + logsize; + + /* Default log indices are (0,1) */ + arena->log_index[0] = 0; + arena->log_index[1] = 1; + return arena; +} + +static void free_arenas(struct btt *btt) +{ + struct arena_info *arena, *next; + + list_for_each_entry_safe(arena, next, &btt->arena_list, list) { + list_del(&arena->list); + kfree(arena->rtt); + kfree(arena->map_locks); + kfree(arena->freelist); + debugfs_remove_recursive(arena->debugfs_dir); + kfree(arena); + } +} + +/* + * This function reads an existing valid btt superblock and + * populates the corresponding arena_info struct + */ +static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super, + u64 arena_off) +{ + arena->internal_nlba = le32_to_cpu(super->internal_nlba); + arena->internal_lbasize = le32_to_cpu(super->internal_lbasize); + arena->external_nlba = le32_to_cpu(super->external_nlba); + arena->external_lbasize = le32_to_cpu(super->external_lbasize); + arena->nfree = 
le32_to_cpu(super->nfree); + arena->version_major = le16_to_cpu(super->version_major); + arena->version_minor = le16_to_cpu(super->version_minor); + + arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off + + le64_to_cpu(super->nextoff)); + arena->infooff = arena_off; + arena->dataoff = arena_off + le64_to_cpu(super->dataoff); + arena->mapoff = arena_off + le64_to_cpu(super->mapoff); + arena->logoff = arena_off + le64_to_cpu(super->logoff); + arena->info2off = arena_off + le64_to_cpu(super->info2off); + + arena->size = (le64_to_cpu(super->nextoff) > 0) + ? (le64_to_cpu(super->nextoff)) + : (arena->info2off - arena->infooff + BTT_PG_SIZE); + + arena->flags = le32_to_cpu(super->flags); +} + +static int discover_arenas(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + struct btt_sb *super; + size_t remaining = btt->rawsize; + u64 cur_nlba = 0; + size_t cur_off = 0; + int num_arenas = 0; + + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return -ENOMEM; + + while (remaining) { + /* Alloc memory for arena */ + arena = alloc_arena(btt, 0, 0, 0); + if (!arena) { + ret = -ENOMEM; + goto out_super; + } + + arena->infooff = cur_off; + ret = btt_info_read(arena, super); + if (ret) + goto out; + + if (!nd_btt_arena_is_valid(btt->nd_btt, super)) { + if (remaining == btt->rawsize) { + btt->init_state = INIT_NOTFOUND; + dev_info(to_dev(arena), "No existing arenas\n"); + goto out; + } else { + dev_err(to_dev(arena), + "Found corrupted metadata!\n"); + ret = -ENODEV; + goto out; + } + } + + arena->external_lba_start = cur_nlba; + parse_arena_meta(arena, super, cur_off); + + ret = log_set_indices(arena); + if (ret) { + dev_err(to_dev(arena), + "Unable to deduce log/padding indices\n"); + goto out; + } + + ret = btt_freelist_init(arena); + if (ret) + goto out; + + ret = btt_rtt_init(arena); + if (ret) + goto out; + + ret = btt_maplocks_init(arena); + if (ret) + goto out; + + list_add_tail(&arena->list, &btt->arena_list); + + remaining -= arena->size; + cur_off += arena->size; + cur_nlba += arena->external_nlba; + num_arenas++; + + if (arena->nextoff == 0) + break; + } + btt->num_arenas = num_arenas; + btt->nlba = cur_nlba; + btt->init_state = INIT_READY; + + kfree(super); + return ret; + + out: + kfree(arena); + free_arenas(btt); + out_super: + kfree(super); + return ret; +} + +static int create_arenas(struct btt *btt) +{ + size_t remaining = btt->rawsize; + size_t cur_off = 0; + + while (remaining) { + struct arena_info *arena; + size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining); + + remaining -= arena_size; + if (arena_size < ARENA_MIN_SIZE) + break; + + arena = alloc_arena(btt, arena_size, btt->nlba, cur_off); + if (!arena) { + free_arenas(btt); + return -ENOMEM; + } + btt->nlba += arena->external_nlba; + if (remaining >= ARENA_MIN_SIZE) + arena->nextoff = arena->size; + else + arena->nextoff = 0; + cur_off += arena_size; + list_add_tail(&arena->list, &btt->arena_list); + } + + return 0; +} + +/* + * This function completes arena initialization by writing + * all the metadata. + * It is only called for an uninitialized arena when a write + * to that arena occurs for the first time. 
+ */ +static int btt_arena_write_layout(struct arena_info *arena) +{ + int ret; + u64 sum; + struct btt_sb *super; + struct nd_btt *nd_btt = arena->nd_btt; + const uuid_t *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev); + + ret = btt_map_init(arena); + if (ret) + return ret; + + ret = btt_log_init(arena); + if (ret) + return ret; + + super = kzalloc(sizeof(struct btt_sb), GFP_NOIO); + if (!super) + return -ENOMEM; + + strncpy(super->signature, BTT_SIG, BTT_SIG_LEN); + export_uuid(super->uuid, nd_btt->uuid); + export_uuid(super->parent_uuid, parent_uuid); + super->flags = cpu_to_le32(arena->flags); + super->version_major = cpu_to_le16(arena->version_major); + super->version_minor = cpu_to_le16(arena->version_minor); + super->external_lbasize = cpu_to_le32(arena->external_lbasize); + super->external_nlba = cpu_to_le32(arena->external_nlba); + super->internal_lbasize = cpu_to_le32(arena->internal_lbasize); + super->internal_nlba = cpu_to_le32(arena->internal_nlba); + super->nfree = cpu_to_le32(arena->nfree); + super->infosize = cpu_to_le32(sizeof(struct btt_sb)); + super->nextoff = cpu_to_le64(arena->nextoff); + /* + * Subtract arena->infooff (arena start) so numbers are relative + * to 'this' arena + */ + super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff); + super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff); + super->logoff = cpu_to_le64(arena->logoff - arena->infooff); + super->info2off = cpu_to_le64(arena->info2off - arena->infooff); + + super->flags = 0; + sum = nd_sb_checksum((struct nd_gen_sb *) super); + super->checksum = cpu_to_le64(sum); + + ret = btt_info_write(arena, super); + + kfree(super); + return ret; +} + +/* + * This function completes the initialization for the BTT namespace + * such that it is ready to accept IOs + */ +static int btt_meta_init(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + + mutex_lock(&btt->init_lock); + list_for_each_entry(arena, &btt->arena_list, list) { + ret = btt_arena_write_layout(arena); + if (ret) + goto unlock; + + ret = btt_freelist_init(arena); + if (ret) + goto unlock; + + ret = btt_rtt_init(arena); + if (ret) + goto unlock; + + ret = btt_maplocks_init(arena); + if (ret) + goto unlock; + } + + btt->init_state = INIT_READY; + + unlock: + mutex_unlock(&btt->init_lock); + return ret; +} + +static u32 btt_meta_size(struct btt *btt) +{ + return btt->lbasize - btt->sector_size; +} + +/* + * This function calculates the arena in which the given LBA lies + * by doing a linear walk. This is acceptable since we expect only + * a few arenas. If we have backing devices that get much larger, + * we can construct a balanced binary tree of arenas at init time + * so that this range search becomes faster. 
+ */ +static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap, + struct arena_info **arena) +{ + struct arena_info *arena_list; + __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size); + + list_for_each_entry(arena_list, &btt->arena_list, list) { + if (lba < arena_list->external_nlba) { + *arena = arena_list; + *premap = lba; + return 0; + } + lba -= arena_list->external_nlba; + } + + return -EIO; +} + +/* + * The following (lock_map, unlock_map) are mostly just to improve + * readability, since they index into an array of locks + */ +static void lock_map(struct arena_info *arena, u32 premap) + __acquires(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_lock(&arena->map_locks[idx].lock); +} + +static void unlock_map(struct arena_info *arena, u32 premap) + __releases(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_unlock(&arena->map_locks[idx].lock); +} + +static int btt_data_read(struct arena_info *arena, struct page *page, + unsigned int off, u32 lba, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_read_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC); + kunmap_atomic(mem); + + return ret; +} + +static int btt_data_write(struct arena_info *arena, u32 lba, + struct page *page, unsigned int off, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_write_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC); + kunmap_atomic(mem); + + return ret; +} + +static void zero_fill_data(struct page *page, unsigned int off, u32 len) +{ + void *mem = kmap_atomic(page); + + memset(mem + off, 0, len); + kunmap_atomic(mem); +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip, + struct arena_info *arena, u32 postmap, int rw) +{ + unsigned int len = btt_meta_size(btt); + u64 meta_nsoff; + int ret = 0; + + if (bip == NULL) + return 0; + + meta_nsoff = to_namespace_offset(arena, postmap) + btt->sector_size; + + while (len) { + unsigned int cur_len; + struct bio_vec bv; + void *mem; + + bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + /* + * The 'bv' obtained from bvec_iter_bvec has its .bv_len and + * .bv_offset already adjusted for iter->bi_bvec_done, and we + * can use those directly + */ + + cur_len = min(len, bv.bv_len); + mem = bvec_kmap_local(&bv); + if (rw) + ret = arena_write_bytes(arena, meta_nsoff, mem, cur_len, + NVDIMM_IO_ATOMIC); + else + ret = arena_read_bytes(arena, meta_nsoff, mem, cur_len, + NVDIMM_IO_ATOMIC); + + kunmap_local(mem); + if (ret) + return ret; + + len -= cur_len; + meta_nsoff += cur_len; + if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len)) + return -EIO; + } + + return ret; +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip, + struct arena_info *arena, u32 postmap, int rw) +{ + return 0; +} +#endif + +static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, + struct page *page, unsigned int off, sector_t sector, + unsigned int len) +{ + int ret = 0; + int t_flag, e_flag; + struct arena_info *arena = NULL; + u32 lane = 0, premap, postmap; + + while (len) { + u32 cur_len; + + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + + cur_len = 
min(btt->sector_size, len); + + ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag, + NVDIMM_IO_ATOMIC); + if (ret) + goto out_lane; + + /* + * We loop to make sure that the post map LBA didn't change + * from under us between writing the RTT and doing the actual + * read. + */ + while (1) { + u32 new_map; + int new_t, new_e; + + if (t_flag) { + zero_fill_data(page, off, cur_len); + goto out_lane; + } + + if (e_flag) { + ret = -EIO; + goto out_lane; + } + + arena->rtt[lane] = RTT_VALID | postmap; + /* + * Barrier to make sure this write is not reordered + * to do the verification map_read before the RTT store + */ + barrier(); + + ret = btt_map_read(arena, premap, &new_map, &new_t, + &new_e, NVDIMM_IO_ATOMIC); + if (ret) + goto out_rtt; + + if ((postmap == new_map) && (t_flag == new_t) && + (e_flag == new_e)) + break; + + postmap = new_map; + t_flag = new_t; + e_flag = new_e; + } + + ret = btt_data_read(arena, page, off, postmap, cur_len); + if (ret) { + /* Media error - set the e_flag */ + if (btt_map_write(arena, premap, postmap, 0, 1, NVDIMM_IO_ATOMIC)) + dev_warn_ratelimited(to_dev(arena), + "Error persistently tracking bad blocks at %#x\n", + premap); + goto out_rtt; + } + + if (bip) { + ret = btt_rw_integrity(btt, bip, arena, postmap, READ); + if (ret) + goto out_rtt; + } + + arena->rtt[lane] = RTT_INVALID; + nd_region_release_lane(btt->nd_region, lane); + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_rtt: + arena->rtt[lane] = RTT_INVALID; + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +/* + * Normally, arena_{read,write}_bytes will take care of the initial offset + * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem, + * we need the final, raw namespace offset here + */ +static bool btt_is_badblock(struct btt *btt, struct arena_info *arena, + u32 postmap) +{ + u64 nsoff = adjust_initial_offset(arena->nd_btt, + to_namespace_offset(arena, postmap)); + sector_t phys_sector = nsoff >> 9; + + return is_bad_pmem(btt->phys_bb, phys_sector, arena->internal_lbasize); +} + +static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, + sector_t sector, struct page *page, unsigned int off, + unsigned int len) +{ + int ret = 0; + struct arena_info *arena = NULL; + u32 premap = 0, old_postmap, new_postmap, lane = 0, i; + struct log_entry log; + int sub; + + while (len) { + u32 cur_len; + int e_flag; + + retry: + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + cur_len = min(btt->sector_size, len); + + if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) { + ret = -EIO; + goto out_lane; + } + + if (btt_is_badblock(btt, arena, arena->freelist[lane].block)) + arena->freelist[lane].has_err = 1; + + if (mutex_is_locked(&arena->err_lock) + || arena->freelist[lane].has_err) { + nd_region_release_lane(btt->nd_region, lane); + + ret = arena_clear_freelist_error(arena, lane); + if (ret) + return ret; + + /* OK to acquire a different lane/free block */ + goto retry; + } + + new_postmap = arena->freelist[lane].block; + + /* Wait if the new block is being read from */ + for (i = 0; i < arena->nfree; i++) + while (arena->rtt[i] == (RTT_VALID | new_postmap)) + cpu_relax(); + + + if (new_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_lane; + } + + ret = btt_data_write(arena, new_postmap, page, off, cur_len); + if (ret) + goto out_lane; + + if (bip) { + ret = btt_rw_integrity(btt, 
bip, arena, new_postmap, + WRITE); + if (ret) + goto out_lane; + } + + lock_map(arena, premap); + ret = btt_map_read(arena, premap, &old_postmap, NULL, &e_flag, + NVDIMM_IO_ATOMIC); + if (ret) + goto out_map; + if (old_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_map; + } + if (e_flag) + set_e_flag(old_postmap); + + log.lba = cpu_to_le32(premap); + log.old_map = cpu_to_le32(old_postmap); + log.new_map = cpu_to_le32(new_postmap); + log.seq = cpu_to_le32(arena->freelist[lane].seq); + sub = arena->freelist[lane].sub; + ret = btt_flog_write(arena, lane, sub, &log); + if (ret) + goto out_map; + + ret = btt_map_write(arena, premap, new_postmap, 0, 0, + NVDIMM_IO_ATOMIC); + if (ret) + goto out_map; + + unlock_map(arena, premap); + nd_region_release_lane(btt->nd_region, lane); + + if (e_flag) { + ret = arena_clear_freelist_error(arena, lane); + if (ret) + return ret; + } + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_map: + unlock_map(arena, premap); + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, + struct page *page, unsigned int len, unsigned int off, + enum req_op op, sector_t sector) +{ + int ret; + + if (!op_is_write(op)) { + ret = btt_read_pg(btt, bip, page, off, sector, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + ret = btt_write_pg(btt, bip, sector, page, off, len); + } + + return ret; +} + +static void btt_submit_bio(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct btt *btt = bio->bi_bdev->bd_disk->private_data; + struct bvec_iter iter; + unsigned long start; + struct bio_vec bvec; + int err = 0; + bool do_acct; + + if (!bio_integrity_prep(bio)) + return; + + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); + if (do_acct) + start = bio_start_io_acct(bio); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + + if (len > PAGE_SIZE || len < btt->sector_size || + len % btt->sector_size) { + dev_err_ratelimited(&btt->nd_btt->dev, + "unaligned bio segment (len: %d)\n", len); + bio->bi_status = BLK_STS_IOERR; + break; + } + + err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, + bio_op(bio), iter.bi_sector); + if (err) { + dev_err(&btt->nd_btt->dev, + "io error in %s sector %lld, len %d,\n", + (op_is_write(bio_op(bio))) ? 
"WRITE" : + "READ", + (unsigned long long) iter.bi_sector, len); + bio->bi_status = errno_to_blk_status(err); + break; + } + } + if (do_acct) + bio_end_io_acct(bio, start); + + bio_endio(bio); +} + +static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + +static const struct block_device_operations btt_fops = { + .owner = THIS_MODULE, + .submit_bio = btt_submit_bio, + .getgeo = btt_getgeo, +}; + +static int btt_blk_init(struct btt *btt) +{ + struct nd_btt *nd_btt = btt->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + int rc = -ENOMEM; + + btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE); + if (!btt->btt_disk) + return -ENOMEM; + + nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); + btt->btt_disk->first_minor = 0; + btt->btt_disk->fops = &btt_fops; + btt->btt_disk->private_data = btt; + + blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size); + blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX); + blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); + + if (btt_meta_size(btt)) { + rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt)); + if (rc) + goto out_cleanup_disk; + } + + set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); + rc = device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL); + if (rc) + goto out_cleanup_disk; + + btt->nd_btt->size = btt->nlba * (u64)btt->sector_size; + nvdimm_check_and_set_ro(btt->btt_disk); + + return 0; + +out_cleanup_disk: + put_disk(btt->btt_disk); + return rc; +} + +static void btt_blk_cleanup(struct btt *btt) +{ + del_gendisk(btt->btt_disk); + put_disk(btt->btt_disk); +} + +/** + * btt_init - initialize a block translation table for the given device + * @nd_btt: device with BTT geometry and backing device info + * @rawsize: raw size in bytes of the backing device + * @lbasize: lba size of the backing device + * @uuid: A uuid for the backing device - this is stored on media + * @maxlane: maximum number of parallel requests the device can handle + * + * Initialize a Block Translation Table on a backing device to provide + * single sector power fail atomicity. + * + * Context: + * Might sleep. + * + * Returns: + * Pointer to a new struct btt on success, NULL on failure. + */ +static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, + u32 lbasize, uuid_t *uuid, + struct nd_region *nd_region) +{ + int ret; + struct btt *btt; + struct nd_namespace_io *nsio; + struct device *dev = &nd_btt->dev; + + btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL); + if (!btt) + return NULL; + + btt->nd_btt = nd_btt; + btt->rawsize = rawsize; + btt->lbasize = lbasize; + btt->sector_size = ((lbasize >= 4096) ? 4096 : 512); + INIT_LIST_HEAD(&btt->arena_list); + mutex_init(&btt->init_lock); + btt->nd_region = nd_region; + nsio = to_nd_namespace_io(&nd_btt->ndns->dev); + btt->phys_bb = &nsio->bb; + + ret = discover_arenas(btt); + if (ret) { + dev_err(dev, "init: error in arena_discover: %d\n", ret); + return NULL; + } + + if (btt->init_state != INIT_READY && nd_region->ro) { + dev_warn(dev, "%s is read-only, unable to init btt metadata\n", + dev_name(&nd_region->dev)); + return NULL; + } else if (btt->init_state != INIT_READY) { + btt->num_arenas = (rawsize / ARENA_MAX_SIZE) + + ((rawsize % ARENA_MAX_SIZE) ? 
1 : 0); + dev_dbg(dev, "init: %d arenas for %llu rawsize\n", + btt->num_arenas, rawsize); + + ret = create_arenas(btt); + if (ret) { + dev_info(dev, "init: create_arenas: %d\n", ret); + return NULL; + } + + ret = btt_meta_init(btt); + if (ret) { + dev_err(dev, "init: error in meta_init: %d\n", ret); + return NULL; + } + } + + ret = btt_blk_init(btt); + if (ret) { + dev_err(dev, "init: error in blk_init: %d\n", ret); + return NULL; + } + + btt_debugfs_init(btt); + + return btt; +} + +/** + * btt_fini - de-initialize a BTT + * @btt: the BTT handle that was generated by btt_init + * + * De-initialize a Block Translation Table on device removal + * + * Context: + * Might sleep. + */ +static void btt_fini(struct btt *btt) +{ + if (btt) { + btt_blk_cleanup(btt); + free_arenas(btt); + debugfs_remove_recursive(btt->debugfs_dir); + } +} + +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt = to_nd_btt(ndns->claim); + struct nd_region *nd_region; + struct btt_sb *btt_sb; + struct btt *btt; + size_t size, rawsize; + int rc; + + if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) { + dev_dbg(&nd_btt->dev, "incomplete btt configuration\n"); + return -ENODEV; + } + + btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL); + if (!btt_sb) + return -ENOMEM; + + size = nvdimm_namespace_capacity(ndns); + rc = devm_namespace_enable(&nd_btt->dev, ndns, size); + if (rc) + return rc; + + /* + * If this returns < 0, that is ok as it just means there wasn't + * an existing BTT, and we're creating a new one. We still need to + * call this as we need the version dependent fields in nd_btt to be + * set correctly based on the holder class + */ + nd_btt_version(nd_btt, ndns, btt_sb); + + rawsize = size - nd_btt->initial_offset; + if (rawsize < ARENA_MIN_SIZE) { + dev_dbg(&nd_btt->dev, "%s must be at least %ld bytes\n", + dev_name(&ndns->dev), + ARENA_MIN_SIZE + nd_btt->initial_offset); + return -ENXIO; + } + nd_region = to_nd_region(nd_btt->dev.parent); + btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid, + nd_region); + if (!btt) + return -ENOMEM; + nd_btt->btt = btt; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_attach_btt); + +int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt) +{ + struct btt *btt = nd_btt->btt; + + btt_fini(btt); + nd_btt->btt = NULL; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_detach_btt); + +static int __init nd_btt_init(void) +{ + int rc = 0; + + debugfs_root = debugfs_create_dir("btt", NULL); + if (IS_ERR_OR_NULL(debugfs_root)) + rc = -ENXIO; + + return rc; +} + +static void __exit nd_btt_exit(void) +{ + debugfs_remove_recursive(debugfs_root); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT); +MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>"); +MODULE_LICENSE("GPL v2"); +module_init(nd_btt_init); +module_exit(nd_btt_exit); diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h new file mode 100644 index 0000000000..0c76c0333f --- /dev/null +++ b/drivers/nvdimm/btt.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Block Translation Table library + * Copyright (c) 2014-2015, Intel Corporation. 
+ */ + +#ifndef _LINUX_BTT_H +#define _LINUX_BTT_H + +#include <linux/types.h> + +#define BTT_SIG_LEN 16 +#define BTT_SIG "BTT_ARENA_INFO\0" +#define MAP_ENT_SIZE 4 +#define MAP_TRIM_SHIFT 31 +#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT) +#define MAP_ERR_SHIFT 30 +#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT) +#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT))) +#define MAP_ENT_NORMAL 0xC0000000 +#define LOG_GRP_SIZE sizeof(struct log_group) +#define LOG_ENT_SIZE sizeof(struct log_entry) +#define ARENA_MIN_SIZE (1UL << 24) /* 16 MB */ +#define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */ +#define RTT_VALID (1UL << 31) +#define RTT_INVALID 0 +#define BTT_PG_SIZE 4096 +#define BTT_DEFAULT_NFREE ND_MAX_LANES +#define LOG_SEQ_INIT 1 + +#define IB_FLAG_ERROR 0x00000001 +#define IB_FLAG_ERROR_MASK 0x00000001 + +#define ent_lba(ent) (ent & MAP_LBA_MASK) +#define ent_e_flag(ent) (!!(ent & MAP_ERR_MASK)) +#define ent_z_flag(ent) (!!(ent & MAP_TRIM_MASK)) +#define set_e_flag(ent) (ent |= MAP_ERR_MASK) +/* 'normal' is both e and z flags set */ +#define ent_normal(ent) (ent_e_flag(ent) && ent_z_flag(ent)) + +enum btt_init_state { + INIT_UNCHECKED = 0, + INIT_NOTFOUND, + INIT_READY +}; + +/* + * A log group represents one log 'lane', and consists of four log entries. + * Two of the four entries are valid entries, and the remaining two are + * padding. Due to an old bug in the padding location, we need to perform a + * test to determine the padding scheme being used, and use that scheme + * thereafter. + * + * In kernels prior to 4.15, 'log group' would have actual log entries at + * indices (0, 2) and padding at indices (1, 3), where as the correct/updated + * format has log entries at indices (0, 1) and padding at indices (2, 3). + * + * Old (pre 4.15) format: + * +-----------------+-----------------+ + * | ent[0] | ent[1] | + * | 16B | 16B | + * | lba/old/new/seq | pad | + * +-----------------------------------+ + * | ent[2] | ent[3] | + * | 16B | 16B | + * | lba/old/new/seq | pad | + * +-----------------+-----------------+ + * + * New format: + * +-----------------+-----------------+ + * | ent[0] | ent[1] | + * | 16B | 16B | + * | lba/old/new/seq | lba/old/new/seq | + * +-----------------------------------+ + * | ent[2] | ent[3] | + * | 16B | 16B | + * | pad | pad | + * +-----------------+-----------------+ + * + * We detect during start-up which format is in use, and set + * arena->log_index[(0, 1)] with the detected format. + */ + +struct log_entry { + __le32 lba; + __le32 old_map; + __le32 new_map; + __le32 seq; +}; + +struct log_group { + struct log_entry ent[4]; +}; + +struct btt_sb { + u8 signature[BTT_SIG_LEN]; + u8 uuid[16]; + u8 parent_uuid[16]; + __le32 flags; + __le16 version_major; + __le16 version_minor; + __le32 external_lbasize; + __le32 external_nlba; + __le32 internal_lbasize; + __le32 internal_nlba; + __le32 nfree; + __le32 infosize; + __le64 nextoff; + __le64 dataoff; + __le64 mapoff; + __le64 logoff; + __le64 info2off; + u8 padding[3968]; + __le64 checksum; +}; + +struct free_entry { + u32 block; + u8 sub; + u8 seq; + u8 has_err; +}; + +struct aligned_lock { + union { + spinlock_t lock; + u8 cacheline_padding[L1_CACHE_BYTES]; + }; +}; + +/** + * struct arena_info - handle for an arena + * @size: Size in bytes this arena occupies on the raw device. + * This includes arena metadata. + * @external_lba_start: The first external LBA in this arena. 
+ * @internal_nlba: Number of internal blocks available in the arena + * including nfree reserved blocks + * @internal_lbasize: Internal and external lba sizes may be different as + * we can round up 'odd' external lbasizes such as 520B + * to be aligned. + * @external_nlba: Number of blocks contributed by the arena to the number + * reported to upper layers. (internal_nlba - nfree) + * @external_lbasize: LBA size as exposed to upper layers. + * @nfree: A reserve number of 'free' blocks that is used to + * handle incoming writes. + * @version_major: Metadata layout version major. + * @version_minor: Metadata layout version minor. + * @sector_size: The Linux sector size - 512 or 4096 + * @nextoff: Offset in bytes to the start of the next arena. + * @infooff: Offset in bytes to the info block of this arena. + * @dataoff: Offset in bytes to the data area of this arena. + * @mapoff: Offset in bytes to the map area of this arena. + * @logoff: Offset in bytes to the log area of this arena. + * @info2off: Offset in bytes to the backup info block of this arena. + * @freelist: Pointer to in-memory list of free blocks + * @rtt: Pointer to in-memory "Read Tracking Table" + * @map_locks: Spinlocks protecting concurrent map writes + * @nd_btt: Pointer to parent nd_btt structure. + * @list: List head for list of arenas + * @debugfs_dir: Debugfs dentry + * @flags: Arena flags - may signify error states. + * @err_lock: Mutex for synchronizing error clearing. + * @log_index: Indices of the valid log entries in a log_group + * + * arena_info is a per-arena handle. Once an arena is narrowed down for an + * IO, this struct is passed around for the duration of the IO. + */ +struct arena_info { + u64 size; /* Total bytes for this arena */ + u64 external_lba_start; + u32 internal_nlba; + u32 internal_lbasize; + u32 external_nlba; + u32 external_lbasize; + u32 nfree; + u16 version_major; + u16 version_minor; + u32 sector_size; + /* Byte offsets to the different on-media structures */ + u64 nextoff; + u64 infooff; + u64 dataoff; + u64 mapoff; + u64 logoff; + u64 info2off; + /* Pointers to other in-memory structures for this arena */ + struct free_entry *freelist; + u32 *rtt; + struct aligned_lock *map_locks; + struct nd_btt *nd_btt; + struct list_head list; + struct dentry *debugfs_dir; + /* Arena flags */ + u32 flags; + struct mutex err_lock; + int log_index[2]; +}; + +struct badblocks; + +/** + * struct btt - handle for a BTT instance + * @btt_disk: Pointer to the gendisk for BTT device + * @arena_list: Head of the list of arenas + * @debugfs_dir: Debugfs dentry + * @nd_btt: Parent nd_btt struct + * @nlba: Number of logical blocks exposed to the upper layers + * after removing the amount of space needed by metadata + * @rawsize: Total size in bytes of the available backing device + * @lbasize: LBA size as requested and presented to upper layers. + * This is sector_size + size of any metadata. 
+ * @sector_size: The Linux sector size - 512 or 4096 + * @lanes: Per-lane spinlocks + * @init_lock: Mutex used for the BTT initialization + * @init_state: Flag describing the initialization state for the BTT + * @num_arenas: Number of arenas in the BTT instance + * @phys_bb: Pointer to the namespace's badblocks structure + */ +struct btt { + struct gendisk *btt_disk; + struct list_head arena_list; + struct dentry *debugfs_dir; + struct nd_btt *nd_btt; + u64 nlba; + unsigned long long rawsize; + u32 lbasize; + u32 sector_size; + struct nd_region *nd_region; + struct mutex init_lock; + int init_state; + int num_arenas; + struct badblocks *phys_bb; +}; + +bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super); +int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns, + struct btt_sb *btt_sb); + +#endif diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c new file mode 100644 index 0000000000..fabbb31f2c --- /dev/null +++ b/drivers/nvdimm/btt_devs.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + */ +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "btt.h" +#include "nd.h" + +static void nd_btt_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_btt *nd_btt = to_nd_btt(dev); + + dev_dbg(dev, "trace\n"); + nd_detach_ndns(&nd_btt->dev, &nd_btt->ndns); + ida_simple_remove(&nd_region->btt_ida, nd_btt->id); + kfree(nd_btt->uuid); + kfree(nd_btt); +} + +struct nd_btt *to_nd_btt(struct device *dev) +{ + struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev); + + WARN_ON(!is_nd_btt(dev)); + return nd_btt; +} +EXPORT_SYMBOL(to_nd_btt); + +static const unsigned long btt_lbasize_supported[] = { 512, 520, 528, + 4096, 4104, 4160, 4224, 0 }; + +static ssize_t sector_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + + return nd_size_select_show(nd_btt->lbasize, btt_lbasize_supported, buf); +} + +static ssize_t sector_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_size_select_store(dev, buf, &nd_btt->lbasize, + btt_lbasize_supported); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(sector_size); + +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + + if (nd_btt->uuid) + return sprintf(buf, "%pUb\n", nd_btt->uuid); + return sprintf(buf, "\n"); +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + device_unlock(dev); + + return rc ? 
rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t namespace_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + rc = sprintf(buf, "%s\n", nd_btt->ndns + ? dev_name(&nd_btt->ndns->dev) : ""); + nvdimm_bus_unlock(dev); + return rc; +} + +static ssize_t namespace_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_namespace_store(dev, &nd_btt->ndns, buf, len); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(namespace); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + if (dev->driver) + rc = sprintf(buf, "%llu\n", nd_btt->size); + else { + /* no size to convey if the btt instance is disabled */ + rc = -ENXIO; + } + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(size); + +static ssize_t log_zero_flags_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Y\n"); +} +static DEVICE_ATTR_RO(log_zero_flags); + +static struct attribute *nd_btt_attributes[] = { + &dev_attr_sector_size.attr, + &dev_attr_namespace.attr, + &dev_attr_uuid.attr, + &dev_attr_size.attr, + &dev_attr_log_zero_flags.attr, + NULL, +}; + +static struct attribute_group nd_btt_attribute_group = { + .attrs = nd_btt_attributes, +}; + +static const struct attribute_group *nd_btt_attribute_groups[] = { + &nd_btt_attribute_group, + &nd_device_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +static const struct device_type nd_btt_device_type = { + .name = "nd_btt", + .release = nd_btt_release, + .groups = nd_btt_attribute_groups, +}; + +bool is_nd_btt(struct device *dev) +{ + return dev->type == &nd_btt_device_type; +} +EXPORT_SYMBOL(is_nd_btt); + +static struct lock_class_key nvdimm_btt_key; + +static struct device *__nd_btt_create(struct nd_region *nd_region, + unsigned long lbasize, uuid_t *uuid, + struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt; + struct device *dev; + + nd_btt = kzalloc(sizeof(*nd_btt), GFP_KERNEL); + if (!nd_btt) + return NULL; + + nd_btt->id = ida_simple_get(&nd_region->btt_ida, 0, 0, GFP_KERNEL); + if (nd_btt->id < 0) + goto out_nd_btt; + + nd_btt->lbasize = lbasize; + if (uuid) { + uuid = kmemdup(uuid, 16, GFP_KERNEL); + if (!uuid) + goto out_put_id; + } + nd_btt->uuid = uuid; + dev = &nd_btt->dev; + dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id); + dev->parent = &nd_region->dev; + dev->type = &nd_btt_device_type; + device_initialize(&nd_btt->dev); + lockdep_set_class(&nd_btt->dev.mutex, &nvdimm_btt_key); + if (ndns && !__nd_attach_ndns(&nd_btt->dev, ndns, &nd_btt->ndns)) { + dev_dbg(&ndns->dev, "failed, already claimed by %s\n", + dev_name(ndns->claim)); + put_device(dev); + return NULL; + } + return dev; + +out_put_id: + ida_simple_remove(&nd_region->btt_ida, nd_btt->id); + +out_nd_btt: + kfree(nd_btt); + return NULL; +} + +struct device *nd_btt_create(struct nd_region *nd_region) +{ + struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL); + + nd_device_register(dev); + return dev; +} + +/** + * nd_btt_arena_is_valid - check if the metadata layout is valid + * @nd_btt: device with BTT geometry and 
backing device info + * @super: pointer to the arena's info block being tested + * + * Check consistency of the btt info block with itself by validating + * the checksum, and with the parent namespace by verifying the + * parent_uuid contained in the info block with the one supplied in. + * + * Returns: + * false for an invalid info block, true for a valid one + */ +bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super) +{ + const uuid_t *ns_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev); + uuid_t parent_uuid; + u64 checksum; + + if (memcmp(super->signature, BTT_SIG, BTT_SIG_LEN) != 0) + return false; + + import_uuid(&parent_uuid, super->parent_uuid); + if (!uuid_is_null(&parent_uuid)) + if (!uuid_equal(&parent_uuid, ns_uuid)) + return false; + + checksum = le64_to_cpu(super->checksum); + super->checksum = 0; + if (checksum != nd_sb_checksum((struct nd_gen_sb *) super)) + return false; + super->checksum = cpu_to_le64(checksum); + + /* TODO: figure out action for this */ + if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0) + dev_info(&nd_btt->dev, "Found arena with an error flag\n"); + + return true; +} +EXPORT_SYMBOL(nd_btt_arena_is_valid); + +int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns, + struct btt_sb *btt_sb) +{ + if (ndns->claim_class == NVDIMM_CCLASS_BTT2) { + /* Probe/setup for BTT v2.0 */ + nd_btt->initial_offset = 0; + nd_btt->version_major = 2; + nd_btt->version_minor = 0; + if (nvdimm_read_bytes(ndns, 0, btt_sb, sizeof(*btt_sb), 0)) + return -ENXIO; + if (!nd_btt_arena_is_valid(nd_btt, btt_sb)) + return -ENODEV; + if ((le16_to_cpu(btt_sb->version_major) != 2) || + (le16_to_cpu(btt_sb->version_minor) != 0)) + return -ENODEV; + } else { + /* + * Probe/setup for BTT v1.1 (NVDIMM_CCLASS_NONE or + * NVDIMM_CCLASS_BTT) + */ + nd_btt->initial_offset = SZ_4K; + nd_btt->version_major = 1; + nd_btt->version_minor = 1; + if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0)) + return -ENXIO; + if (!nd_btt_arena_is_valid(nd_btt, btt_sb)) + return -ENODEV; + if ((le16_to_cpu(btt_sb->version_major) != 1) || + (le16_to_cpu(btt_sb->version_minor) != 1)) + return -ENODEV; + } + return 0; +} +EXPORT_SYMBOL(nd_btt_version); + +static int __nd_btt_probe(struct nd_btt *nd_btt, + struct nd_namespace_common *ndns, struct btt_sb *btt_sb) +{ + int rc; + + if (!btt_sb || !ndns || !nd_btt) + return -ENODEV; + + if (nvdimm_namespace_capacity(ndns) < SZ_16M) + return -ENXIO; + + rc = nd_btt_version(nd_btt, ndns, btt_sb); + if (rc < 0) + return rc; + + nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize); + nd_btt->uuid = kmemdup(&btt_sb->uuid, sizeof(uuid_t), GFP_KERNEL); + if (!nd_btt->uuid) + return -ENOMEM; + + nd_device_register(&nd_btt->dev); + + return 0; +} + +int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns) +{ + int rc; + struct device *btt_dev; + struct btt_sb *btt_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + case NVDIMM_CCLASS_BTT: + case NVDIMM_CCLASS_BTT2: + break; + default: + return -ENODEV; + } + + nvdimm_bus_lock(&ndns->dev); + btt_dev = __nd_btt_create(nd_region, 0, NULL, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!btt_dev) + return -ENOMEM; + btt_sb = devm_kzalloc(dev, sizeof(*btt_sb), GFP_KERNEL); + rc = __nd_btt_probe(to_nd_btt(btt_dev), ndns, btt_sb); + dev_dbg(dev, "btt: %s\n", rc == 0 ? 
dev_name(btt_dev) : "<none>"); + if (rc < 0) { + struct nd_btt *nd_btt = to_nd_btt(btt_dev); + + nd_detach_ndns(btt_dev, &nd_btt->ndns); + put_device(btt_dev); + } + + return rc; +} +EXPORT_SYMBOL(nd_btt_probe); diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c new file mode 100644 index 0000000000..5852fe2905 --- /dev/null +++ b/drivers/nvdimm/bus.c @@ -0,0 +1,1355 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/libnvdimm.h> +#include <linux/sched/mm.h> +#include <linux/vmalloc.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/fcntl.h> +#include <linux/async.h> +#include <linux/ndctl.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" +#include "pfn.h" + +int nvdimm_major; +static int nvdimm_bus_major; +static struct class *nd_class; +static DEFINE_IDA(nd_ida); + +static int to_nd_device_type(const struct device *dev) +{ + if (is_nvdimm(dev)) + return ND_DEVICE_DIMM; + else if (is_memory(dev)) + return ND_DEVICE_REGION_PMEM; + else if (is_nd_dax(dev)) + return ND_DEVICE_DAX_PMEM; + else if (is_nd_region(dev->parent)) + return nd_region_to_nstype(to_nd_region(dev->parent)); + + return 0; +} + +static int nvdimm_bus_uevent(const struct device *dev, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT, + to_nd_device_type(dev)); +} + +static struct module *to_bus_provider(struct device *dev) +{ + /* pin bus providers while regions are enabled */ + if (is_nd_region(dev)) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + return nvdimm_bus->nd_desc->module; + } + return NULL; +} + +static void nvdimm_bus_probe_start(struct nvdimm_bus *nvdimm_bus) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + nvdimm_bus->probe_active++; + nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +static void nvdimm_bus_probe_end(struct nvdimm_bus *nvdimm_bus) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + if (--nvdimm_bus->probe_active == 0) + wake_up(&nvdimm_bus->wait); + nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +static int nvdimm_bus_probe(struct device *dev) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct module *provider = to_bus_provider(dev); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + int rc; + + if (!try_module_get(provider)) + return -ENXIO; + + dev_dbg(&nvdimm_bus->dev, "START: %s.probe(%s)\n", + dev->driver->name, dev_name(dev)); + + nvdimm_bus_probe_start(nvdimm_bus); + rc = nd_drv->probe(dev); + if ((rc == 0 || rc == -EOPNOTSUPP) && + dev->parent && is_nd_region(dev->parent)) + nd_region_advance_seeds(to_nd_region(dev->parent), dev); + nvdimm_bus_probe_end(nvdimm_bus); + + dev_dbg(&nvdimm_bus->dev, "END: %s.probe(%s) = %d\n", dev->driver->name, + dev_name(dev), rc); + + if (rc != 0) + module_put(provider); + return rc; +} + +static void nvdimm_bus_remove(struct device *dev) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct module *provider = to_bus_provider(dev); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (nd_drv->remove) + nd_drv->remove(dev); + + dev_dbg(&nvdimm_bus->dev, "%s.remove(%s)\n", dev->driver->name, + dev_name(dev)); + module_put(provider); +} + +static void nvdimm_bus_shutdown(struct device *dev) +{ + 
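+	/*
+	 * ->shutdown() is optional for nd drivers; only forward it when a
+	 * driver is bound and actually implements the hook.
+	 */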
struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nd_device_driver *nd_drv = NULL; + + if (dev->driver) + nd_drv = to_nd_device_driver(dev->driver); + + if (nd_drv && nd_drv->shutdown) { + nd_drv->shutdown(dev); + dev_dbg(&nvdimm_bus->dev, "%s.shutdown(%s)\n", + dev->driver->name, dev_name(dev)); + } +} + +void nd_device_notify(struct device *dev, enum nvdimm_event event) +{ + device_lock(dev); + if (dev->driver) { + struct nd_device_driver *nd_drv; + + nd_drv = to_nd_device_driver(dev->driver); + if (nd_drv->notify) + nd_drv->notify(dev, event); + } + device_unlock(dev); +} +EXPORT_SYMBOL(nd_device_notify); + +void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + + if (!nvdimm_bus) + return; + + /* caller is responsible for holding a reference on the device */ + nd_device_notify(&nd_region->dev, event); +} +EXPORT_SYMBOL_GPL(nvdimm_region_notify); + +struct clear_badblocks_context { + resource_size_t phys, cleared; +}; + +static int nvdimm_clear_badblocks_region(struct device *dev, void *data) +{ + struct clear_badblocks_context *ctx = data; + struct nd_region *nd_region; + resource_size_t ndr_end; + sector_t sector; + + /* make sure device is a region */ + if (!is_memory(dev)) + return 0; + + nd_region = to_nd_region(dev); + ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1; + + /* make sure we are in the region */ + if (ctx->phys < nd_region->ndr_start || + (ctx->phys + ctx->cleared - 1) > ndr_end) + return 0; + + sector = (ctx->phys - nd_region->ndr_start) / 512; + badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512); + + if (nd_region->bb_state) + sysfs_notify_dirent(nd_region->bb_state); + + return 0; +} + +static void nvdimm_clear_badblocks_regions(struct nvdimm_bus *nvdimm_bus, + phys_addr_t phys, u64 cleared) +{ + struct clear_badblocks_context ctx = { + .phys = phys, + .cleared = cleared, + }; + + device_for_each_child(&nvdimm_bus->dev, &ctx, + nvdimm_clear_badblocks_region); +} + +static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus, + phys_addr_t phys, u64 cleared) +{ + if (cleared > 0) + badrange_forget(&nvdimm_bus->badrange, phys, cleared); + + if (cleared > 0 && cleared / 512) + nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared); +} + +long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, + unsigned int len) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc; + struct nd_cmd_clear_error clear_err; + struct nd_cmd_ars_cap ars_cap; + u32 clear_err_unit, mask; + unsigned int noio_flag; + int cmd_rc, rc; + + if (!nvdimm_bus) + return -ENXIO; + + nd_desc = nvdimm_bus->nd_desc; + /* + * if ndctl does not exist, it's PMEM_LEGACY and + * we want to just pretend everything is handled. 
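+	 * Returning 'len' reports the entire requested range as cleared
+	 * to the caller.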
+ */ + if (!nd_desc->ndctl) + return len; + + memset(&ars_cap, 0, sizeof(ars_cap)); + ars_cap.address = phys; + ars_cap.length = len; + noio_flag = memalloc_noio_save(); + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, &ars_cap, + sizeof(ars_cap), &cmd_rc); + memalloc_noio_restore(noio_flag); + if (rc < 0) + return rc; + if (cmd_rc < 0) + return cmd_rc; + clear_err_unit = ars_cap.clear_err_unit; + if (!clear_err_unit || !is_power_of_2(clear_err_unit)) + return -ENXIO; + + mask = clear_err_unit - 1; + if ((phys | len) & mask) + return -ENXIO; + memset(&clear_err, 0, sizeof(clear_err)); + clear_err.address = phys; + clear_err.length = len; + noio_flag = memalloc_noio_save(); + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_CLEAR_ERROR, &clear_err, + sizeof(clear_err), &cmd_rc); + memalloc_noio_restore(noio_flag); + if (rc < 0) + return rc; + if (cmd_rc < 0) + return cmd_rc; + + nvdimm_account_cleared_poison(nvdimm_bus, phys, clear_err.cleared); + + return clear_err.cleared; +} +EXPORT_SYMBOL_GPL(nvdimm_clear_poison); + +static int nvdimm_bus_match(struct device *dev, struct device_driver *drv); + +static struct bus_type nvdimm_bus_type = { + .name = "nd", + .uevent = nvdimm_bus_uevent, + .match = nvdimm_bus_match, + .probe = nvdimm_bus_probe, + .remove = nvdimm_bus_remove, + .shutdown = nvdimm_bus_shutdown, +}; + +static void nvdimm_bus_release(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus; + + nvdimm_bus = container_of(dev, struct nvdimm_bus, dev); + ida_simple_remove(&nd_ida, nvdimm_bus->id); + kfree(nvdimm_bus); +} + +static const struct device_type nvdimm_bus_dev_type = { + .release = nvdimm_bus_release, + .groups = nvdimm_bus_attribute_groups, +}; + +bool is_nvdimm_bus(struct device *dev) +{ + return dev->type == &nvdimm_bus_dev_type; +} + +struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev) +{ + struct device *dev; + + for (dev = nd_dev; dev; dev = dev->parent) + if (is_nvdimm_bus(dev)) + break; + dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n"); + if (dev) + return to_nvdimm_bus(dev); + return NULL; +} + +struct nvdimm_bus *to_nvdimm_bus(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus; + + nvdimm_bus = container_of(dev, struct nvdimm_bus, dev); + WARN_ON(!is_nvdimm_bus(dev)); + return nvdimm_bus; +} +EXPORT_SYMBOL_GPL(to_nvdimm_bus); + +struct nvdimm_bus *nvdimm_to_bus(struct nvdimm *nvdimm) +{ + return to_nvdimm_bus(nvdimm->dev.parent); +} +EXPORT_SYMBOL_GPL(nvdimm_to_bus); + +static struct lock_class_key nvdimm_bus_key; + +struct nvdimm_bus *nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nd_desc) +{ + struct nvdimm_bus *nvdimm_bus; + int rc; + + nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL); + if (!nvdimm_bus) + return NULL; + INIT_LIST_HEAD(&nvdimm_bus->list); + INIT_LIST_HEAD(&nvdimm_bus->mapping_list); + init_waitqueue_head(&nvdimm_bus->wait); + nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); + if (nvdimm_bus->id < 0) { + kfree(nvdimm_bus); + return NULL; + } + mutex_init(&nvdimm_bus->reconfig_mutex); + badrange_init(&nvdimm_bus->badrange); + nvdimm_bus->nd_desc = nd_desc; + nvdimm_bus->dev.parent = parent; + nvdimm_bus->dev.type = &nvdimm_bus_dev_type; + nvdimm_bus->dev.groups = nd_desc->attr_groups; + nvdimm_bus->dev.bus = &nvdimm_bus_type; + nvdimm_bus->dev.of_node = nd_desc->of_node; + device_initialize(&nvdimm_bus->dev); + lockdep_set_class(&nvdimm_bus->dev.mutex, &nvdimm_bus_key); + device_set_pm_not_required(&nvdimm_bus->dev); + rc = dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id); 
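+	/*
+	 * From device_initialize() onward, failures are unwound with
+	 * put_device() (see the err label below), not kfree().
+	 */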
+ if (rc) + goto err; + + rc = device_add(&nvdimm_bus->dev); + if (rc) { + dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc); + goto err; + } + + return nvdimm_bus; + err: + put_device(&nvdimm_bus->dev); + return NULL; +} +EXPORT_SYMBOL_GPL(nvdimm_bus_register); + +void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) +{ + if (!nvdimm_bus) + return; + device_unregister(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_bus_unregister); + +static int child_unregister(struct device *dev, void *data) +{ + /* + * the singular ndctl class device per bus needs to be + * "device_destroy"ed, so skip it here + * + * i.e. remove classless children + */ + if (dev->class) + return 0; + + if (is_nvdimm(dev)) + nvdimm_delete(to_nvdimm(dev)); + else + nd_device_unregister(dev, ND_SYNC); + + return 0; +} + +static void free_badrange_list(struct list_head *badrange_list) +{ + struct badrange_entry *bre, *next; + + list_for_each_entry_safe(bre, next, badrange_list, list) { + list_del(&bre->list); + kfree(bre); + } + list_del_init(badrange_list); +} + +static void nd_bus_remove(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + + mutex_lock(&nvdimm_bus_list_mutex); + list_del_init(&nvdimm_bus->list); + mutex_unlock(&nvdimm_bus_list_mutex); + + wait_event(nvdimm_bus->wait, + atomic_read(&nvdimm_bus->ioctl_active) == 0); + + nd_synchronize(); + device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); + + spin_lock(&nvdimm_bus->badrange.lock); + free_badrange_list(&nvdimm_bus->badrange.list); + spin_unlock(&nvdimm_bus->badrange.lock); + + nvdimm_bus_destroy_ndctl(nvdimm_bus); +} + +static int nd_bus_probe(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + int rc; + + rc = nvdimm_bus_create_ndctl(nvdimm_bus); + if (rc) + return rc; + + mutex_lock(&nvdimm_bus_list_mutex); + list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list); + mutex_unlock(&nvdimm_bus_list_mutex); + + /* enable bus provider attributes to look up their local context */ + dev_set_drvdata(dev, nvdimm_bus->nd_desc); + + return 0; +} + +static struct nd_device_driver nd_bus_driver = { + .probe = nd_bus_probe, + .remove = nd_bus_remove, + .drv = { + .name = "nd_bus", + .suppress_bind_attrs = true, + .bus = &nvdimm_bus_type, + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + }, +}; + +static int nvdimm_bus_match(struct device *dev, struct device_driver *drv) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(drv); + + if (is_nvdimm_bus(dev) && nd_drv == &nd_bus_driver) + return true; + + return !!test_bit(to_nd_device_type(dev), &nd_drv->type); +} + +static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain); + +void nd_synchronize(void) +{ + async_synchronize_full_domain(&nd_async_domain); +} +EXPORT_SYMBOL_GPL(nd_synchronize); + +static void nd_async_device_register(void *d, async_cookie_t cookie) +{ + struct device *dev = d; + + if (device_add(dev) != 0) { + dev_err(dev, "%s: failed\n", __func__); + put_device(dev); + } + put_device(dev); + if (dev->parent) + put_device(dev->parent); +} + +static void nd_async_device_unregister(void *d, async_cookie_t cookie) +{ + struct device *dev = d; + + /* flush bus operations before delete */ + nvdimm_bus_lock(dev); + nvdimm_bus_unlock(dev); + + device_unregister(dev); + put_device(dev); +} + +static void __nd_device_register(struct device *dev, bool sync) +{ + if (!dev) + return; + + /* + * Ensure that region devices always have their NUMA node set as + * early as possible. 
This way we are able to make certain that + * any memory associated with the creation and the creation + * itself of the region is associated with the correct node. + */ + if (is_nd_region(dev)) + set_dev_node(dev, to_nd_region(dev)->numa_node); + + dev->bus = &nvdimm_bus_type; + device_set_pm_not_required(dev); + if (dev->parent) { + get_device(dev->parent); + if (dev_to_node(dev) == NUMA_NO_NODE) + set_dev_node(dev, dev_to_node(dev->parent)); + } + get_device(dev); + + if (sync) + nd_async_device_register(dev, 0); + else + async_schedule_dev_domain(nd_async_device_register, dev, + &nd_async_domain); +} + +void nd_device_register(struct device *dev) +{ + __nd_device_register(dev, false); +} +EXPORT_SYMBOL(nd_device_register); + +void nd_device_register_sync(struct device *dev) +{ + __nd_device_register(dev, true); +} + +void nd_device_unregister(struct device *dev, enum nd_async_mode mode) +{ + bool killed; + + switch (mode) { + case ND_ASYNC: + /* + * In the async case this is being triggered with the + * device lock held and the unregistration work needs to + * be moved out of line iff this is thread has won the + * race to schedule the deletion. + */ + if (!kill_device(dev)) + return; + + get_device(dev); + async_schedule_domain(nd_async_device_unregister, dev, + &nd_async_domain); + break; + case ND_SYNC: + /* + * In the sync case the device is being unregistered due + * to a state change of the parent. Claim the kill state + * to synchronize against other unregistration requests, + * or otherwise let the async path handle it if the + * unregistration was already queued. + */ + device_lock(dev); + killed = kill_device(dev); + device_unlock(dev); + + if (!killed) + return; + + nd_synchronize(); + device_unregister(dev); + break; + } +} +EXPORT_SYMBOL(nd_device_unregister); + +/** + * __nd_driver_register() - register a region or a namespace driver + * @nd_drv: driver to register + * @owner: automatically set by nd_driver_register() macro + * @mod_name: automatically set by nd_driver_register() macro + */ +int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner, + const char *mod_name) +{ + struct device_driver *drv = &nd_drv->drv; + + if (!nd_drv->type) { + pr_debug("driver type bitmask not set (%ps)\n", + __builtin_return_address(0)); + return -EINVAL; + } + + if (!nd_drv->probe) { + pr_debug("%s ->probe() must be specified\n", mod_name); + return -EINVAL; + } + + drv->bus = &nvdimm_bus_type; + drv->owner = owner; + drv->mod_name = mod_name; + + return driver_register(drv); +} +EXPORT_SYMBOL(__nd_driver_register); + +void nvdimm_check_and_set_ro(struct gendisk *disk) +{ + struct device *dev = disk_to_dev(disk)->parent; + struct nd_region *nd_region = to_nd_region(dev->parent); + int disk_ro = get_disk_ro(disk); + + /* catch the disk up with the region ro state */ + if (disk_ro == nd_region->ro) + return; + + dev_info(dev, "%s read-%s, marking %s read-%s\n", + dev_name(&nd_region->dev), nd_region->ro ? "only" : "write", + disk->disk_name, nd_region->ro ? 
"only" : "write"); + set_disk_ro(disk, nd_region->ro); +} +EXPORT_SYMBOL(nvdimm_check_and_set_ro); + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, ND_DEVICE_MODALIAS_FMT "\n", + to_nd_device_type(dev)); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", dev->type->name); +} +static DEVICE_ATTR_RO(devtype); + +static struct attribute *nd_device_attributes[] = { + &dev_attr_modalias.attr, + &dev_attr_devtype.attr, + NULL, +}; + +/* + * nd_device_attribute_group - generic attributes for all devices on an nd bus + */ +const struct attribute_group nd_device_attribute_group = { + .attrs = nd_device_attributes, +}; + +static ssize_t numa_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", dev_to_node(dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static int nvdimm_dev_to_target_node(struct device *dev) +{ + struct device *parent = dev->parent; + struct nd_region *nd_region = NULL; + + if (is_nd_region(dev)) + nd_region = to_nd_region(dev); + else if (parent && is_nd_region(parent)) + nd_region = to_nd_region(parent); + + if (!nd_region) + return NUMA_NO_NODE; + return nd_region->target_node; +} + +static ssize_t target_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", nvdimm_dev_to_target_node(dev)); +} +static DEVICE_ATTR_RO(target_node); + +static struct attribute *nd_numa_attributes[] = { + &dev_attr_numa_node.attr, + &dev_attr_target_node.attr, + NULL, +}; + +static umode_t nd_numa_attr_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + + if (!IS_ENABLED(CONFIG_NUMA)) + return 0; + + if (a == &dev_attr_target_node.attr && + nvdimm_dev_to_target_node(dev) == NUMA_NO_NODE) + return 0; + + return a->mode; +} + +/* + * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus + */ +const struct attribute_group nd_numa_attribute_group = { + .attrs = nd_numa_attributes, + .is_visible = nd_numa_attr_visible, +}; + +static void ndctl_release(struct device *dev) +{ + kfree(dev); +} + +static struct lock_class_key nvdimm_ndctl_key; + +int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus) +{ + dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id); + struct device *dev; + int rc; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &nvdimm_ndctl_key); + device_set_pm_not_required(dev); + dev->class = nd_class; + dev->parent = &nvdimm_bus->dev; + dev->devt = devt; + dev->release = ndctl_release; + rc = dev_set_name(dev, "ndctl%d", nvdimm_bus->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) { + dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %d\n", + nvdimm_bus->id, rc); + goto err; + } + return 0; + +err: + put_device(dev); + return rc; +} + +void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus) +{ + device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id)); +} + +static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = { + [ND_CMD_IMPLEMENTED] = { }, + [ND_CMD_SMART] = { + .out_num = 2, + .out_sizes = { 4, 128, }, + }, + [ND_CMD_SMART_THRESHOLD] = { + .out_num = 2, + .out_sizes = { 4, 8, }, + }, + [ND_CMD_DIMM_FLAGS] = { + .out_num = 2, + .out_sizes = { 4, 4 }, + }, + [ND_CMD_GET_CONFIG_SIZE] = { + 
.out_num = 3, + .out_sizes = { 4, 4, 4, }, + }, + [ND_CMD_GET_CONFIG_DATA] = { + .in_num = 2, + .in_sizes = { 4, 4, }, + .out_num = 2, + .out_sizes = { 4, UINT_MAX, }, + }, + [ND_CMD_SET_CONFIG_DATA] = { + .in_num = 3, + .in_sizes = { 4, 4, UINT_MAX, }, + .out_num = 1, + .out_sizes = { 4, }, + }, + [ND_CMD_VENDOR] = { + .in_num = 3, + .in_sizes = { 4, 4, UINT_MAX, }, + .out_num = 3, + .out_sizes = { 4, 4, UINT_MAX, }, + }, + [ND_CMD_CALL] = { + .in_num = 2, + .in_sizes = { sizeof(struct nd_cmd_pkg), UINT_MAX, }, + .out_num = 1, + .out_sizes = { UINT_MAX, }, + }, +}; + +const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd) +{ + if (cmd < ARRAY_SIZE(__nd_cmd_dimm_descs)) + return &__nd_cmd_dimm_descs[cmd]; + return NULL; +} +EXPORT_SYMBOL_GPL(nd_cmd_dimm_desc); + +static const struct nd_cmd_desc __nd_cmd_bus_descs[] = { + [ND_CMD_IMPLEMENTED] = { }, + [ND_CMD_ARS_CAP] = { + .in_num = 2, + .in_sizes = { 8, 8, }, + .out_num = 4, + .out_sizes = { 4, 4, 4, 4, }, + }, + [ND_CMD_ARS_START] = { + .in_num = 5, + .in_sizes = { 8, 8, 2, 1, 5, }, + .out_num = 2, + .out_sizes = { 4, 4, }, + }, + [ND_CMD_ARS_STATUS] = { + .out_num = 3, + .out_sizes = { 4, 4, UINT_MAX, }, + }, + [ND_CMD_CLEAR_ERROR] = { + .in_num = 2, + .in_sizes = { 8, 8, }, + .out_num = 3, + .out_sizes = { 4, 4, 8, }, + }, + [ND_CMD_CALL] = { + .in_num = 2, + .in_sizes = { sizeof(struct nd_cmd_pkg), UINT_MAX, }, + .out_num = 1, + .out_sizes = { UINT_MAX, }, + }, +}; + +const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd) +{ + if (cmd < ARRAY_SIZE(__nd_cmd_bus_descs)) + return &__nd_cmd_bus_descs[cmd]; + return NULL; +} +EXPORT_SYMBOL_GPL(nd_cmd_bus_desc); + +u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, void *buf) +{ + if (idx >= desc->in_num) + return UINT_MAX; + + if (desc->in_sizes[idx] < UINT_MAX) + return desc->in_sizes[idx]; + + if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && idx == 2) { + struct nd_cmd_set_config_hdr *hdr = buf; + + return hdr->in_length; + } else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) { + struct nd_cmd_vendor_hdr *hdr = buf; + + return hdr->in_length; + } else if (cmd == ND_CMD_CALL) { + struct nd_cmd_pkg *pkg = buf; + + return pkg->nd_size_in; + } + + return UINT_MAX; +} +EXPORT_SYMBOL_GPL(nd_cmd_in_size); + +u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, const u32 *in_field, + const u32 *out_field, unsigned long remainder) +{ + if (idx >= desc->out_num) + return UINT_MAX; + + if (desc->out_sizes[idx] < UINT_MAX) + return desc->out_sizes[idx]; + + if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && idx == 1) + return in_field[1]; + else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) + return out_field[1]; + else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 2) { + /* + * Per table 9-276 ARS Data in ACPI 6.1, out_field[1] is + * "Size of Output Buffer in bytes, including this + * field." + */ + if (out_field[1] < 4) + return 0; + /* + * ACPI 6.1 is ambiguous if 'status' is included in the + * output size. If we encounter an output size that + * overshoots the remainder by 4 bytes, assume it was + * including 'status'. 
+ */ + if (out_field[1] - 4 == remainder) + return remainder; + return out_field[1] - 8; + } else if (cmd == ND_CMD_CALL) { + struct nd_cmd_pkg *pkg = (struct nd_cmd_pkg *) in_field; + + return pkg->nd_size_out; + } + + + return UINT_MAX; +} +EXPORT_SYMBOL_GPL(nd_cmd_out_size); + +void wait_nvdimm_bus_probe_idle(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + do { + if (nvdimm_bus->probe_active == 0) + break; + nvdimm_bus_unlock(dev); + device_unlock(dev); + wait_event(nvdimm_bus->wait, + nvdimm_bus->probe_active == 0); + device_lock(dev); + nvdimm_bus_lock(dev); + } while (true); +} + +static int nd_pmem_forget_poison_check(struct device *dev, void *data) +{ + struct nd_cmd_clear_error *clear_err = + (struct nd_cmd_clear_error *)data; + struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL; + struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL; + struct nd_dax *nd_dax = is_nd_dax(dev) ? to_nd_dax(dev) : NULL; + struct nd_namespace_common *ndns = NULL; + struct nd_namespace_io *nsio; + resource_size_t offset = 0, end_trunc = 0, start, end, pstart, pend; + + if (nd_dax || !dev->driver) + return 0; + + start = clear_err->address; + end = clear_err->address + clear_err->cleared - 1; + + if (nd_btt || nd_pfn || nd_dax) { + if (nd_btt) + ndns = nd_btt->ndns; + else if (nd_pfn) + ndns = nd_pfn->ndns; + else if (nd_dax) + ndns = nd_dax->nd_pfn.ndns; + + if (!ndns) + return 0; + } else + ndns = to_ndns(dev); + + nsio = to_nd_namespace_io(&ndns->dev); + pstart = nsio->res.start + offset; + pend = nsio->res.end - end_trunc; + + if ((pstart >= start) && (pend <= end)) + return -EBUSY; + + return 0; + +} + +static int nd_ns_forget_poison_check(struct device *dev, void *data) +{ + return device_for_each_child(dev, data, nd_pmem_forget_poison_check); +} + +/* set_config requires an idle interleave set */ +static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus, + struct nvdimm *nvdimm, unsigned int cmd, void *data) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + /* ask the bus provider if it would like to block this request */ + if (nd_desc->clear_to_send) { + int rc = nd_desc->clear_to_send(nd_desc, nvdimm, cmd, data); + + if (rc) + return rc; + } + + /* require clear error to go through the pmem driver */ + if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR) + return device_for_each_child(&nvdimm_bus->dev, data, + nd_ns_forget_poison_check); + + if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA) + return 0; + + /* prevent label manipulation while the kernel owns label updates */ + wait_nvdimm_bus_probe_idle(&nvdimm_bus->dev); + if (atomic_read(&nvdimm->busy)) + return -EBUSY; + return 0; +} + +static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, + int read_only, unsigned int ioctl_cmd, unsigned long arg) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + const struct nd_cmd_desc *desc = NULL; + unsigned int cmd = _IOC_NR(ioctl_cmd); + struct device *dev = &nvdimm_bus->dev; + void __user *p = (void __user *) arg; + char *out_env = NULL, *in_env = NULL; + const char *cmd_name, *dimm_name; + u32 in_len = 0, out_len = 0; + unsigned int func = cmd; + unsigned long cmd_mask; + struct nd_cmd_pkg pkg; + int rc, i, cmd_rc; + void *buf = NULL; + u64 buf_len = 0; + + if (nvdimm) { + desc = nd_cmd_dimm_desc(cmd); + cmd_name = nvdimm_cmd_name(cmd); + cmd_mask = nvdimm->cmd_mask; + dimm_name = dev_name(&nvdimm->dev); + } else { + desc = nd_cmd_bus_desc(cmd); + cmd_name = nvdimm_bus_cmd_name(cmd); + 
cmd_mask = nd_desc->cmd_mask; + dimm_name = "bus"; + } + + /* Validate command family support against bus declared support */ + if (cmd == ND_CMD_CALL) { + unsigned long *mask; + + if (copy_from_user(&pkg, p, sizeof(pkg))) + return -EFAULT; + + if (nvdimm) { + if (pkg.nd_family > NVDIMM_FAMILY_MAX) + return -EINVAL; + mask = &nd_desc->dimm_family_mask; + } else { + if (pkg.nd_family > NVDIMM_BUS_FAMILY_MAX) + return -EINVAL; + mask = &nd_desc->bus_family_mask; + } + + if (!test_bit(pkg.nd_family, mask)) + return -EINVAL; + } + + if (!desc || + (desc->out_num + desc->in_num == 0) || + cmd > ND_CMD_CALL || + !test_bit(cmd, &cmd_mask)) + return -ENOTTY; + + /* fail write commands (when read-only) */ + if (read_only) + switch (cmd) { + case ND_CMD_VENDOR: + case ND_CMD_SET_CONFIG_DATA: + case ND_CMD_ARS_START: + case ND_CMD_CLEAR_ERROR: + case ND_CMD_CALL: + dev_dbg(dev, "'%s' command while read-only.\n", + nvdimm ? nvdimm_cmd_name(cmd) + : nvdimm_bus_cmd_name(cmd)); + return -EPERM; + default: + break; + } + + /* process an input envelope */ + in_env = kzalloc(ND_CMD_MAX_ENVELOPE, GFP_KERNEL); + if (!in_env) + return -ENOMEM; + for (i = 0; i < desc->in_num; i++) { + u32 in_size, copy; + + in_size = nd_cmd_in_size(nvdimm, cmd, desc, i, in_env); + if (in_size == UINT_MAX) { + dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + rc = -ENXIO; + goto out; + } + if (in_len < ND_CMD_MAX_ENVELOPE) + copy = min_t(u32, ND_CMD_MAX_ENVELOPE - in_len, in_size); + else + copy = 0; + if (copy && copy_from_user(&in_env[in_len], p + in_len, copy)) { + rc = -EFAULT; + goto out; + } + in_len += in_size; + } + + if (cmd == ND_CMD_CALL) { + func = pkg.nd_command; + dev_dbg(dev, "%s, idx: %llu, in: %u, out: %u, len %llu\n", + dimm_name, pkg.nd_command, + in_len, out_len, buf_len); + } + + /* process an output envelope */ + out_env = kzalloc(ND_CMD_MAX_ENVELOPE, GFP_KERNEL); + if (!out_env) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < desc->out_num; i++) { + u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, + (u32 *) in_env, (u32 *) out_env, 0); + u32 copy; + + if (out_size == UINT_MAX) { + dev_dbg(dev, "%s unknown output size cmd: %s field: %d\n", + dimm_name, cmd_name, i); + rc = -EFAULT; + goto out; + } + if (out_len < ND_CMD_MAX_ENVELOPE) + copy = min_t(u32, ND_CMD_MAX_ENVELOPE - out_len, out_size); + else + copy = 0; + if (copy && copy_from_user(&out_env[out_len], + p + in_len + out_len, copy)) { + rc = -EFAULT; + goto out; + } + out_len += out_size; + } + + buf_len = (u64) out_len + (u64) in_len; + if (buf_len > ND_IOCTL_MAX_BUFLEN) { + dev_dbg(dev, "%s cmd: %s buf_len: %llu > %d\n", dimm_name, + cmd_name, buf_len, ND_IOCTL_MAX_BUFLEN); + rc = -EINVAL; + goto out; + } + + buf = vmalloc(buf_len); + if (!buf) { + rc = -ENOMEM; + goto out; + } + + if (copy_from_user(buf, p, buf_len)) { + rc = -EFAULT; + goto out; + } + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, func, buf); + if (rc) + goto out_unlock; + + rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, &cmd_rc); + if (rc < 0) + goto out_unlock; + + if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) { + struct nd_cmd_clear_error *clear_err = buf; + + nvdimm_account_cleared_poison(nvdimm_bus, clear_err->address, + clear_err->cleared); + } + + if (copy_to_user(p, buf, buf_len)) + rc = -EFAULT; + +out_unlock: + nvdimm_bus_unlock(dev); + device_unlock(dev); +out: + kfree(in_env); + kfree(out_env); + vfree(buf); + return rc; +} + +enum 
nd_ioctl_mode { + BUS_IOCTL, + DIMM_IOCTL, +}; + +static int match_dimm(struct device *dev, void *data) +{ + long id = (long) data; + + if (is_nvdimm(dev)) { + struct nvdimm *nvdimm = to_nvdimm(dev); + + return nvdimm->id == id; + } + + return 0; +} + +static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + enum nd_ioctl_mode mode) + +{ + struct nvdimm_bus *nvdimm_bus, *found = NULL; + long id = (long) file->private_data; + struct nvdimm *nvdimm = NULL; + int rc, ro; + + ro = ((file->f_flags & O_ACCMODE) == O_RDONLY); + mutex_lock(&nvdimm_bus_list_mutex); + list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) { + if (mode == DIMM_IOCTL) { + struct device *dev; + + dev = device_find_child(&nvdimm_bus->dev, + file->private_data, match_dimm); + if (!dev) + continue; + nvdimm = to_nvdimm(dev); + found = nvdimm_bus; + } else if (nvdimm_bus->id == id) { + found = nvdimm_bus; + } + + if (found) { + atomic_inc(&nvdimm_bus->ioctl_active); + break; + } + } + mutex_unlock(&nvdimm_bus_list_mutex); + + if (!found) + return -ENXIO; + + nvdimm_bus = found; + rc = __nd_ioctl(nvdimm_bus, nvdimm, ro, cmd, arg); + + if (nvdimm) + put_device(&nvdimm->dev); + if (atomic_dec_and_test(&nvdimm_bus->ioctl_active)) + wake_up(&nvdimm_bus->wait); + + return rc; +} + +static long bus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return nd_ioctl(file, cmd, arg, BUS_IOCTL); +} + +static long dimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return nd_ioctl(file, cmd, arg, DIMM_IOCTL); +} + +static int nd_open(struct inode *inode, struct file *file) +{ + long minor = iminor(inode); + + file->private_data = (void *) minor; + return 0; +} + +static const struct file_operations nvdimm_bus_fops = { + .owner = THIS_MODULE, + .open = nd_open, + .unlocked_ioctl = bus_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +static const struct file_operations nvdimm_fops = { + .owner = THIS_MODULE, + .open = nd_open, + .unlocked_ioctl = dimm_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +int __init nvdimm_bus_init(void) +{ + int rc; + + rc = bus_register(&nvdimm_bus_type); + if (rc) + return rc; + + rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops); + if (rc < 0) + goto err_bus_chrdev; + nvdimm_bus_major = rc; + + rc = register_chrdev(0, "dimmctl", &nvdimm_fops); + if (rc < 0) + goto err_dimm_chrdev; + nvdimm_major = rc; + + nd_class = class_create("nd"); + if (IS_ERR(nd_class)) { + rc = PTR_ERR(nd_class); + goto err_class; + } + + rc = driver_register(&nd_bus_driver.drv); + if (rc) + goto err_nd_bus; + + return 0; + + err_nd_bus: + class_destroy(nd_class); + err_class: + unregister_chrdev(nvdimm_major, "dimmctl"); + err_dimm_chrdev: + unregister_chrdev(nvdimm_bus_major, "ndctl"); + err_bus_chrdev: + bus_unregister(&nvdimm_bus_type); + + return rc; +} + +void nvdimm_bus_exit(void) +{ + driver_unregister(&nd_bus_driver.drv); + class_destroy(nd_class); + unregister_chrdev(nvdimm_bus_major, "ndctl"); + unregister_chrdev(nvdimm_major, "dimmctl"); + bus_unregister(&nvdimm_bus_type); + ida_destroy(&nd_ida); +} diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c new file mode 100644 index 0000000000..030dbde6b0 --- /dev/null +++ b/drivers/nvdimm/claim.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/badblocks.h> +#include "nd-core.h" +#include "pmem.h" +#include "pfn.h" +#include "btt.h" +#include "nd.h" + +void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns) +{ + struct nd_namespace_common *ndns = *_ndns; + struct nvdimm_bus *nvdimm_bus; + + if (!ndns) + return; + + nvdimm_bus = walk_to_nvdimm_bus(&ndns->dev); + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + dev_WARN_ONCE(dev, ndns->claim != dev, "%s: invalid claim\n", __func__); + ndns->claim = NULL; + *_ndns = NULL; + put_device(&ndns->dev); +} + +void nd_detach_ndns(struct device *dev, + struct nd_namespace_common **_ndns) +{ + struct nd_namespace_common *ndns = *_ndns; + + if (!ndns) + return; + get_device(&ndns->dev); + nvdimm_bus_lock(&ndns->dev); + __nd_detach_ndns(dev, _ndns); + nvdimm_bus_unlock(&ndns->dev); + put_device(&ndns->dev); +} + +bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&attach->dev); + + if (attach->claim) + return false; + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + dev_WARN_ONCE(dev, *_ndns, "%s: invalid claim\n", __func__); + attach->claim = dev; + *_ndns = attach; + get_device(&attach->dev); + return true; +} + +bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns) +{ + bool claimed; + + nvdimm_bus_lock(&attach->dev); + claimed = __nd_attach_ndns(dev, attach, _ndns); + nvdimm_bus_unlock(&attach->dev); + return claimed; +} + +static int namespace_match(struct device *dev, void *data) +{ + char *name = data; + + return strcmp(name, dev_name(dev)) == 0; +} + +static bool is_idle(struct device *dev, struct nd_namespace_common *ndns) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct device *seed = NULL; + + if (is_nd_btt(dev)) + seed = nd_region->btt_seed; + else if (is_nd_pfn(dev)) + seed = nd_region->pfn_seed; + else if (is_nd_dax(dev)) + seed = nd_region->dax_seed; + + if (seed == dev || ndns || dev->driver) + return false; + return true; +} + +struct nd_pfn *to_nd_pfn_safe(struct device *dev) +{ + /* + * pfn device attributes are re-used by dax device instances, so we + * need to be careful to correct device-to-nd_pfn conversion. 
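+	 * A dax device embeds a struct nd_pfn, so it is reached via its
+	 * nd_dax container below.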
+ */ + if (is_nd_pfn(dev)) + return to_nd_pfn(dev); + + if (is_nd_dax(dev)) { + struct nd_dax *nd_dax = to_nd_dax(dev); + + return &nd_dax->nd_pfn; + } + + WARN_ON(1); + return NULL; +} + +static void nd_detach_and_reset(struct device *dev, + struct nd_namespace_common **_ndns) +{ + /* detach the namespace and destroy / reset the device */ + __nd_detach_ndns(dev, _ndns); + if (is_idle(dev, *_ndns)) { + nd_device_unregister(dev, ND_ASYNC); + } else if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + nd_btt->lbasize = 0; + kfree(nd_btt->uuid); + nd_btt->uuid = NULL; + } else if (is_nd_pfn(dev) || is_nd_dax(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + + kfree(nd_pfn->uuid); + nd_pfn->uuid = NULL; + nd_pfn->mode = PFN_MODE_NONE; + } +} + +ssize_t nd_namespace_store(struct device *dev, + struct nd_namespace_common **_ndns, const char *buf, + size_t len) +{ + struct nd_namespace_common *ndns; + struct device *found; + char *name; + + if (dev->driver) { + dev_dbg(dev, "namespace already active\n"); + return -EBUSY; + } + + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; + strim(name); + + if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0) + /* pass */; + else { + len = -EINVAL; + goto out; + } + + ndns = *_ndns; + if (strcmp(name, "") == 0) { + nd_detach_and_reset(dev, _ndns); + goto out; + } else if (ndns) { + dev_dbg(dev, "namespace already set to: %s\n", + dev_name(&ndns->dev)); + len = -EBUSY; + goto out; + } + + found = device_find_child(dev->parent, name, namespace_match); + if (!found) { + dev_dbg(dev, "'%s' not found under %s\n", name, + dev_name(dev->parent)); + len = -ENODEV; + goto out; + } + + ndns = to_ndns(found); + + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + break; + case NVDIMM_CCLASS_BTT: + case NVDIMM_CCLASS_BTT2: + if (!is_nd_btt(dev)) { + len = -EBUSY; + goto out_attach; + } + break; + case NVDIMM_CCLASS_PFN: + if (!is_nd_pfn(dev)) { + len = -EBUSY; + goto out_attach; + } + break; + case NVDIMM_CCLASS_DAX: + if (!is_nd_dax(dev)) { + len = -EBUSY; + goto out_attach; + } + break; + default: + len = -EBUSY; + goto out_attach; + break; + } + + if (__nvdimm_namespace_capacity(ndns) < SZ_16M) { + dev_dbg(dev, "%s too small to host\n", name); + len = -ENXIO; + goto out_attach; + } + + WARN_ON_ONCE(!is_nvdimm_bus_locked(dev)); + if (!__nd_attach_ndns(dev, ndns, _ndns)) { + dev_dbg(dev, "%s already claimed\n", + dev_name(&ndns->dev)); + len = -EBUSY; + } + + out_attach: + put_device(&ndns->dev); /* from device_find_child */ + out: + kfree(name); + return len; +} + +/* + * nd_sb_checksum: compute checksum for a generic info block + * + * Returns a fletcher64 checksum of everything in the given info block + * except the last field (since that's where the checksum lives). 
+ */ +u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb) +{ + u64 sum; + __le64 sum_save; + + BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K); + BUILD_BUG_ON(sizeof(struct nd_pfn_sb) != SZ_4K); + BUILD_BUG_ON(sizeof(struct nd_gen_sb) != SZ_4K); + + sum_save = nd_gen_sb->checksum; + nd_gen_sb->checksum = 0; + sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1); + nd_gen_sb->checksum = sum_save; + return sum; +} +EXPORT_SYMBOL(nd_sb_checksum); + +static int nsio_rw_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size, int rw, + unsigned long flags) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); + sector_t sector = offset >> 9; + int rc = 0, ret = 0; + + if (unlikely(!size)) + return 0; + + if (unlikely(offset + size > nsio->size)) { + dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); + return -EFAULT; + } + + if (rw == READ) { + if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) + return -EIO; + if (copy_mc_to_kernel(buf, nsio->addr + offset, size) != 0) + return -EIO; + return 0; + } + + if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) { + if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512) + && !(flags & NVDIMM_IO_ATOMIC)) { + long cleared; + + might_sleep(); + cleared = nvdimm_clear_poison(&ndns->dev, + nsio->res.start + offset, size); + if (cleared < size) + rc = -EIO; + if (cleared > 0 && cleared / 512) { + cleared /= 512; + badblocks_clear(&nsio->bb, sector, cleared); + } + arch_invalidate_pmem(nsio->addr + offset, size); + } else + rc = -EIO; + } + + memcpy_flushcache(nsio->addr + offset, buf, size); + ret = nvdimm_flush(to_nd_region(ndns->dev.parent), NULL); + if (ret) + rc = ret; + + return rc; +} + +int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio, + resource_size_t size) +{ + struct nd_namespace_common *ndns = &nsio->common; + struct range range = { + .start = nsio->res.start, + .end = nsio->res.end, + }; + + nsio->size = size; + if (!devm_request_mem_region(dev, range.start, size, + dev_name(&ndns->dev))) { + dev_warn(dev, "could not reserve region %pR\n", &nsio->res); + return -EBUSY; + } + + ndns->rw_bytes = nsio_rw_bytes; + if (devm_init_badblocks(dev, &nsio->bb)) + return -ENOMEM; + nvdimm_badblocks_populate(to_nd_region(ndns->dev.parent), &nsio->bb, + &range); + + nsio->addr = devm_memremap(dev, range.start, size, ARCH_MEMREMAP_PMEM); + + return PTR_ERR_OR_ZERO(nsio->addr); +} + +void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio) +{ + struct resource *res = &nsio->res; + + devm_memunmap(dev, nsio->addr); + devm_exit_badblocks(dev, &nsio->bb); + devm_release_mem_region(dev, res->start, nsio->size); +} diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c new file mode 100644 index 0000000000..d91799b71d --- /dev/null +++ b/drivers/nvdimm/core.c @@ -0,0 +1,576 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#include <linux/libnvdimm.h> +#include <linux/suspend.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/blk-integrity.h> +#include <linux/device.h> +#include <linux/ctype.h> +#include <linux/ndctl.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/io.h> +#include "nd-core.h" +#include "nd.h" + +LIST_HEAD(nvdimm_bus_list); +DEFINE_MUTEX(nvdimm_bus_list_mutex); + +void nvdimm_bus_lock(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return; + mutex_lock(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(nvdimm_bus_lock); + +void nvdimm_bus_unlock(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return; + mutex_unlock(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(nvdimm_bus_unlock); + +bool is_nvdimm_bus_locked(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + return mutex_is_locked(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(is_nvdimm_bus_locked); + +struct nvdimm_map { + struct nvdimm_bus *nvdimm_bus; + struct list_head list; + resource_size_t offset; + unsigned long flags; + size_t size; + union { + void *mem; + void __iomem *iomem; + }; + struct kref kref; +}; + +static struct nvdimm_map *find_nvdimm_map(struct device *dev, + resource_size_t offset) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nvdimm_map *nvdimm_map; + + list_for_each_entry(nvdimm_map, &nvdimm_bus->mapping_list, list) + if (nvdimm_map->offset == offset) + return nvdimm_map; + return NULL; +} + +static struct nvdimm_map *alloc_nvdimm_map(struct device *dev, + resource_size_t offset, size_t size, unsigned long flags) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nvdimm_map *nvdimm_map; + + nvdimm_map = kzalloc(sizeof(*nvdimm_map), GFP_KERNEL); + if (!nvdimm_map) + return NULL; + + INIT_LIST_HEAD(&nvdimm_map->list); + nvdimm_map->nvdimm_bus = nvdimm_bus; + nvdimm_map->offset = offset; + nvdimm_map->flags = flags; + nvdimm_map->size = size; + kref_init(&nvdimm_map->kref); + + if (!request_mem_region(offset, size, dev_name(&nvdimm_bus->dev))) { + dev_err(&nvdimm_bus->dev, "failed to request %pa + %zd for %s\n", + &offset, size, dev_name(dev)); + goto err_request_region; + } + + if (flags) + nvdimm_map->mem = memremap(offset, size, flags); + else + nvdimm_map->iomem = ioremap(offset, size); + + if (!nvdimm_map->mem) + goto err_map; + + dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev), "%s: bus unlocked!", + __func__); + list_add(&nvdimm_map->list, &nvdimm_bus->mapping_list); + + return nvdimm_map; + + err_map: + release_mem_region(offset, size); + err_request_region: + kfree(nvdimm_map); + return NULL; +} + +static void nvdimm_map_release(struct kref *kref) +{ + struct nvdimm_bus *nvdimm_bus; + struct nvdimm_map *nvdimm_map; + + nvdimm_map = container_of(kref, struct nvdimm_map, kref); + nvdimm_bus = nvdimm_map->nvdimm_bus; + + dev_dbg(&nvdimm_bus->dev, "%pa\n", &nvdimm_map->offset); + list_del(&nvdimm_map->list); + if (nvdimm_map->flags) + memunmap(nvdimm_map->mem); + else + iounmap(nvdimm_map->iomem); + release_mem_region(nvdimm_map->offset, nvdimm_map->size); + kfree(nvdimm_map); +} + +static void nvdimm_map_put(void *data) +{ + struct nvdimm_map *nvdimm_map = data; + struct nvdimm_bus *nvdimm_bus = nvdimm_map->nvdimm_bus; + + nvdimm_bus_lock(&nvdimm_bus->dev); + kref_put(&nvdimm_map->kref, nvdimm_map_release); + 
nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +/** + * devm_nvdimm_memremap - map a resource that is shared across regions + * @dev: device that will own a reference to the shared mapping + * @offset: physical base address of the mapping + * @size: mapping size + * @flags: memremap flags, or, if zero, perform an ioremap instead + */ +void *devm_nvdimm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + struct nvdimm_map *nvdimm_map; + + nvdimm_bus_lock(dev); + nvdimm_map = find_nvdimm_map(dev, offset); + if (!nvdimm_map) + nvdimm_map = alloc_nvdimm_map(dev, offset, size, flags); + else + kref_get(&nvdimm_map->kref); + nvdimm_bus_unlock(dev); + + if (!nvdimm_map) + return NULL; + + if (devm_add_action_or_reset(dev, nvdimm_map_put, nvdimm_map)) + return NULL; + + return nvdimm_map->mem; +} +EXPORT_SYMBOL_GPL(devm_nvdimm_memremap); + +u64 nd_fletcher64(void *addr, size_t len, bool le) +{ + u32 *buf = addr; + u32 lo32 = 0; + u64 hi32 = 0; + int i; + + for (i = 0; i < len / sizeof(u32); i++) { + lo32 += le ? le32_to_cpu((__le32) buf[i]) : buf[i]; + hi32 += lo32; + } + + return hi32 << 32 | lo32; +} +EXPORT_SYMBOL_GPL(nd_fletcher64); + +struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus) +{ + /* struct nvdimm_bus definition is private to libnvdimm */ + return nvdimm_bus->nd_desc; +} +EXPORT_SYMBOL_GPL(to_nd_desc); + +struct device *to_nvdimm_bus_dev(struct nvdimm_bus *nvdimm_bus) +{ + /* struct nvdimm_bus definition is private to libnvdimm */ + return &nvdimm_bus->dev; +} +EXPORT_SYMBOL_GPL(to_nvdimm_bus_dev); + +/** + * nd_uuid_store: common implementation for writing 'uuid' sysfs attributes + * @dev: container device for the uuid property + * @uuid_out: uuid buffer to replace + * @buf: raw sysfs buffer to parse + * + * Enforce that uuids can only be changed while the device is disabled + * (driver detached) + * LOCKING: expects device_lock() is held on entry + */ +int nd_uuid_store(struct device *dev, uuid_t **uuid_out, const char *buf, + size_t len) +{ + uuid_t uuid; + int rc; + + if (dev->driver) + return -EBUSY; + + rc = uuid_parse(buf, &uuid); + if (rc) + return rc; + + kfree(*uuid_out); + *uuid_out = kmemdup(&uuid, sizeof(uuid), GFP_KERNEL); + if (!(*uuid_out)) + return -ENOMEM; + + return 0; +} + +ssize_t nd_size_select_show(unsigned long current_size, + const unsigned long *supported, char *buf) +{ + ssize_t len = 0; + int i; + + for (i = 0; supported[i]; i++) + if (current_size == supported[i]) + len += sprintf(buf + len, "[%ld] ", supported[i]); + else + len += sprintf(buf + len, "%ld ", supported[i]); + len += sprintf(buf + len, "\n"); + return len; +} + +ssize_t nd_size_select_store(struct device *dev, const char *buf, + unsigned long *current_size, const unsigned long *supported) +{ + unsigned long lbasize; + int rc, i; + + if (dev->driver) + return -EBUSY; + + rc = kstrtoul(buf, 0, &lbasize); + if (rc) + return rc; + + for (i = 0; supported[i]; i++) + if (lbasize == supported[i]) + break; + + if (supported[i]) { + *current_size = lbasize; + return 0; + } else { + return -EINVAL; + } +} + +static ssize_t commands_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int cmd, len = 0; + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + for_each_set_bit(cmd, &nd_desc->cmd_mask, BITS_PER_LONG) + len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd)); + len += sprintf(buf + len, "\n"); + return len; +} +static DEVICE_ATTR_RO(commands); + 
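Since nd_fletcher64() above is the checksum primitive reused by the BTT/PFN info blocks (via nd_sb_checksum()) and by the namespace index blocks later in label.c, here is a userspace copy of the same rolling-sum arithmetic that can be compiled and experimented with on its own. It omits the 'le' byte-swap path and assumes a little-endian host; it is a sketch, not a drop-in replacement:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* running 32-bit word sum plus a sum-of-sums, packed into one u64 */
static uint64_t fletcher64(const void *addr, size_t len)
{
        const uint32_t *buf = addr;
        uint32_t lo32 = 0;
        uint64_t hi32 = 0;
        size_t i;

        for (i = 0; i < len / sizeof(uint32_t); i++) {
                lo32 += buf[i];
                hi32 += lo32;
        }
        return hi32 << 32 | lo32;
}

int main(void)
{
        uint32_t block[1024];   /* a 4K "info block" */

        memset(block, 0xab, sizeof(block));
        printf("checksum: %#llx\n",
               (unsigned long long)fletcher64(block, sizeof(block)));
        return 0;
}

Only the low 32 bits of the second sum survive the final shift, which matches the in-kernel return expression.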
+static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + struct device *parent = nvdimm_bus->dev.parent; + + if (nd_desc->provider_name) + return nd_desc->provider_name; + else if (parent) + return dev_name(parent); + else + return "unknown"; +} + +static ssize_t provider_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + + return sprintf(buf, "%s\n", nvdimm_bus_provider(nvdimm_bus)); +} +static DEVICE_ATTR_RO(provider); + +static int flush_namespaces(struct device *dev, void *data) +{ + device_lock(dev); + device_unlock(dev); + return 0; +} + +static int flush_regions_dimms(struct device *dev, void *data) +{ + device_lock(dev); + device_unlock(dev); + device_for_each_child(dev, NULL, flush_namespaces); + return 0; +} + +static ssize_t wait_probe_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + int rc; + + if (nd_desc->flush_probe) { + rc = nd_desc->flush_probe(nd_desc); + if (rc) + return rc; + } + nd_synchronize(); + device_for_each_child(dev, NULL, flush_regions_dimms); + return sprintf(buf, "1\n"); +} +static DEVICE_ATTR_RO(wait_probe); + +static struct attribute *nvdimm_bus_attributes[] = { + &dev_attr_commands.attr, + &dev_attr_wait_probe.attr, + &dev_attr_provider.attr, + NULL, +}; + +static const struct attribute_group nvdimm_bus_attribute_group = { + .attrs = nvdimm_bus_attributes, +}; + +static ssize_t capability_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_capability cap; + + if (!nd_desc->fw_ops) + return -EOPNOTSUPP; + + cap = nd_desc->fw_ops->capability(nd_desc); + + switch (cap) { + case NVDIMM_FWA_CAP_QUIESCE: + return sprintf(buf, "quiesce\n"); + case NVDIMM_FWA_CAP_LIVE: + return sprintf(buf, "live\n"); + default: + return -EOPNOTSUPP; + } +} + +static DEVICE_ATTR_RO(capability); + +static ssize_t activate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_capability cap; + enum nvdimm_fwa_state state; + + if (!nd_desc->fw_ops) + return -EOPNOTSUPP; + + cap = nd_desc->fw_ops->capability(nd_desc); + state = nd_desc->fw_ops->activate_state(nd_desc); + + if (cap < NVDIMM_FWA_CAP_QUIESCE) + return -EOPNOTSUPP; + + switch (state) { + case NVDIMM_FWA_IDLE: + return sprintf(buf, "idle\n"); + case NVDIMM_FWA_BUSY: + return sprintf(buf, "busy\n"); + case NVDIMM_FWA_ARMED: + return sprintf(buf, "armed\n"); + case NVDIMM_FWA_ARM_OVERFLOW: + return sprintf(buf, "overflow\n"); + default: + return -ENXIO; + } +} + +static int exec_firmware_activate(void *data) +{ + struct nvdimm_bus_descriptor *nd_desc = data; + + return nd_desc->fw_ops->activate(nd_desc); +} + +static ssize_t activate_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_state state; + bool quiesce; + ssize_t rc; + + if (!nd_desc->fw_ops) + return -EOPNOTSUPP; + + if (sysfs_streq(buf, "live")) + quiesce = false; + else if (sysfs_streq(buf, 
"quiesce")) + quiesce = true; + else + return -EINVAL; + + state = nd_desc->fw_ops->activate_state(nd_desc); + + switch (state) { + case NVDIMM_FWA_BUSY: + rc = -EBUSY; + break; + case NVDIMM_FWA_ARMED: + case NVDIMM_FWA_ARM_OVERFLOW: + if (quiesce) + rc = hibernate_quiet_exec(exec_firmware_activate, nd_desc); + else + rc = nd_desc->fw_ops->activate(nd_desc); + break; + case NVDIMM_FWA_IDLE: + default: + rc = -ENXIO; + } + + if (rc == 0) + rc = len; + return rc; +} + +static DEVICE_ATTR_ADMIN_RW(activate); + +static umode_t nvdimm_bus_firmware_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_capability cap; + + /* + * Both 'activate' and 'capability' disappear when no ops + * detected, or a negative capability is indicated. + */ + if (!nd_desc->fw_ops) + return 0; + + cap = nd_desc->fw_ops->capability(nd_desc); + if (cap < NVDIMM_FWA_CAP_QUIESCE) + return 0; + + return a->mode; +} +static struct attribute *nvdimm_bus_firmware_attributes[] = { + &dev_attr_activate.attr, + &dev_attr_capability.attr, + NULL, +}; + +static const struct attribute_group nvdimm_bus_firmware_attribute_group = { + .name = "firmware", + .attrs = nvdimm_bus_firmware_attributes, + .is_visible = nvdimm_bus_firmware_visible, +}; + +const struct attribute_group *nvdimm_bus_attribute_groups[] = { + &nvdimm_bus_attribute_group, + &nvdimm_bus_firmware_attribute_group, + NULL, +}; + +int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +{ + return badrange_add(&nvdimm_bus->badrange, addr, length); +} +EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) +{ + struct blk_integrity bi; + + if (meta_size == 0) + return 0; + + memset(&bi, 0, sizeof(bi)); + + bi.tuple_size = meta_size; + bi.tag_size = meta_size; + + blk_integrity_register(disk, &bi); + blk_queue_max_integrity_segments(disk->queue, 1); + + return 0; +} +EXPORT_SYMBOL(nd_integrity_init); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) +{ + return 0; +} +EXPORT_SYMBOL(nd_integrity_init); + +#endif + +static __init int libnvdimm_init(void) +{ + int rc; + + rc = nvdimm_bus_init(); + if (rc) + return rc; + rc = nvdimm_init(); + if (rc) + goto err_dimm; + rc = nd_region_init(); + if (rc) + goto err_region; + + nd_label_init(); + + return 0; + err_region: + nvdimm_exit(); + err_dimm: + nvdimm_bus_exit(); + return rc; +} + +static __exit void libnvdimm_exit(void) +{ + WARN_ON(!list_empty(&nvdimm_bus_list)); + nd_region_exit(); + nvdimm_exit(); + nvdimm_bus_exit(); + nvdimm_devs_exit(); +} + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +subsys_initcall(libnvdimm_init); +module_exit(libnvdimm_exit); diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c new file mode 100644 index 0000000000..3bd61f2457 --- /dev/null +++ b/drivers/nvdimm/dax_devs.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2016 Intel Corporation. All rights reserved. 
+ */ +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "pfn.h" +#include "nd.h" + +static void nd_dax_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_dax *nd_dax = to_nd_dax(dev); + struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; + + dev_dbg(dev, "trace\n"); + nd_detach_ndns(dev, &nd_pfn->ndns); + ida_simple_remove(&nd_region->dax_ida, nd_pfn->id); + kfree(nd_pfn->uuid); + kfree(nd_dax); +} + +struct nd_dax *to_nd_dax(struct device *dev) +{ + struct nd_dax *nd_dax = container_of(dev, struct nd_dax, nd_pfn.dev); + + WARN_ON(!is_nd_dax(dev)); + return nd_dax; +} +EXPORT_SYMBOL(to_nd_dax); + +static const struct device_type nd_dax_device_type = { + .name = "nd_dax", + .release = nd_dax_release, + .groups = nd_pfn_attribute_groups, +}; + +bool is_nd_dax(const struct device *dev) +{ + return dev ? dev->type == &nd_dax_device_type : false; +} +EXPORT_SYMBOL(is_nd_dax); + +static struct nd_dax *nd_dax_alloc(struct nd_region *nd_region) +{ + struct nd_pfn *nd_pfn; + struct nd_dax *nd_dax; + struct device *dev; + + nd_dax = kzalloc(sizeof(*nd_dax), GFP_KERNEL); + if (!nd_dax) + return NULL; + + nd_pfn = &nd_dax->nd_pfn; + nd_pfn->id = ida_simple_get(&nd_region->dax_ida, 0, 0, GFP_KERNEL); + if (nd_pfn->id < 0) { + kfree(nd_dax); + return NULL; + } + + dev = &nd_pfn->dev; + dev_set_name(dev, "dax%d.%d", nd_region->id, nd_pfn->id); + dev->type = &nd_dax_device_type; + dev->parent = &nd_region->dev; + + return nd_dax; +} + +struct device *nd_dax_create(struct nd_region *nd_region) +{ + struct device *dev = NULL; + struct nd_dax *nd_dax; + + if (!is_memory(&nd_region->dev)) + return NULL; + + nd_dax = nd_dax_alloc(nd_region); + if (nd_dax) + dev = nd_pfn_devinit(&nd_dax->nd_pfn, NULL); + nd_device_register(dev); + return dev; +} + +int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns) +{ + int rc; + struct nd_dax *nd_dax; + struct device *dax_dev; + struct nd_pfn *nd_pfn; + struct nd_pfn_sb *pfn_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + case NVDIMM_CCLASS_DAX: + break; + default: + return -ENODEV; + } + + nvdimm_bus_lock(&ndns->dev); + nd_dax = nd_dax_alloc(nd_region); + nd_pfn = &nd_dax->nd_pfn; + dax_dev = nd_pfn_devinit(nd_pfn, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!dax_dev) + return -ENOMEM; + pfn_sb = devm_kmalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); + nd_pfn->pfn_sb = pfn_sb; + rc = nd_pfn_validate(nd_pfn, DAX_SIG); + dev_dbg(dev, "dax: %s\n", rc == 0 ? dev_name(dax_dev) : "<none>"); + if (rc < 0) { + nd_detach_ndns(dax_dev, &nd_pfn->ndns); + put_device(dax_dev); + } else + nd_device_register(dax_dev); + + return rc; +} +EXPORT_SYMBOL(nd_dax_probe); diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c new file mode 100644 index 0000000000..91d9163ee3 --- /dev/null +++ b/drivers/nvdimm/dimm.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
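to_nd_dax() in the file above recovers the wrapping nd_dax from an embedded device pointer with container_of(), which only works because nd_pfn.dev is embedded by value rather than pointed to. A self-contained userspace illustration of that pointer arithmetic follows; the demo_* structs are invented, and the conversion is done in two hops here purely for readability (the kernel does it in one):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_device { const char *name; };

struct demo_pfn {
        int id;
        struct demo_device dev;         /* embedded, not a pointer */
};

struct demo_dax {
        struct demo_pfn nd_pfn;         /* embedded as well */
};

static struct demo_dax *to_demo_dax(struct demo_device *dev)
{
        struct demo_pfn *pfn = container_of(dev, struct demo_pfn, dev);

        return container_of(pfn, struct demo_dax, nd_pfn);
}

int main(void)
{
        struct demo_dax dax = {
                .nd_pfn = { .id = 7, .dev = { .name = "dax0.7" } },
        };
        struct demo_device *dev = &dax.nd_pfn.dev;

        printf("recovered id %d for %s\n",
               to_demo_dax(dev)->nd_pfn.id, dev->name);
        return 0;
}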
+ */ +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/nd.h> +#include "label.h" +#include "nd.h" + +static int nvdimm_probe(struct device *dev) +{ + struct nvdimm_drvdata *ndd; + int rc; + + rc = nvdimm_security_setup_events(dev); + if (rc < 0) { + dev_err(dev, "security event setup failed: %d\n", rc); + return rc; + } + + rc = nvdimm_check_config_data(dev); + if (rc) { + /* not required for non-aliased nvdimm, ex. NVDIMM-N */ + if (rc == -ENOTTY) + rc = 0; + return rc; + } + + /* + * The locked status bit reflects explicit status codes from the + * label reading commands, revalidate it each time the driver is + * activated and re-reads the label area. + */ + nvdimm_clear_locked(dev); + + ndd = kzalloc(sizeof(*ndd), GFP_KERNEL); + if (!ndd) + return -ENOMEM; + + dev_set_drvdata(dev, ndd); + ndd->dpa.name = dev_name(dev); + ndd->ns_current = -1; + ndd->ns_next = -1; + ndd->dpa.start = 0; + ndd->dpa.end = -1; + ndd->dev = dev; + get_device(dev); + kref_init(&ndd->kref); + + /* + * Attempt to unlock, if the DIMM supports security commands, + * otherwise the locked indication is determined by explicit + * status codes from the label reading commands. + */ + rc = nvdimm_security_unlock(dev); + if (rc < 0) + dev_dbg(dev, "failed to unlock dimm: %d\n", rc); + + + /* + * EACCES failures reading the namespace label-area-properties + * are interpreted as the DIMM capacity being locked but the + * namespace labels themselves being accessible. + */ + rc = nvdimm_init_nsarea(ndd); + if (rc == -EACCES) { + /* + * See nvdimm_namespace_common_probe() where we fail to + * allow namespaces to probe while the DIMM is locked, + * but we do allow for namespace enumeration. + */ + nvdimm_set_locked(dev); + rc = 0; + } + if (rc) + goto err; + + /* + * EACCES failures reading the namespace label-data are + * interpreted as the label area being locked in addition to the + * DIMM capacity. We fail the dimm probe to prevent regions from + * attempting to parse the label area. + */ + rc = nd_label_data_init(ndd); + if (rc == -EACCES) + nvdimm_set_locked(dev); + if (rc) + goto err; + + dev_dbg(dev, "config data size: %d\n", ndd->nsarea.config_size); + + nvdimm_bus_lock(dev); + if (ndd->ns_current >= 0) { + rc = nd_label_reserve_dpa(ndd); + if (rc == 0) + nvdimm_set_labeling(dev); + } + nvdimm_bus_unlock(dev); + + if (rc) + goto err; + + return 0; + + err: + put_ndd(ndd); + return rc; +} + +static void nvdimm_remove(struct device *dev) +{ + struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + + nvdimm_bus_lock(dev); + dev_set_drvdata(dev, NULL); + nvdimm_bus_unlock(dev); + put_ndd(ndd); +} + +static struct nd_device_driver nvdimm_driver = { + .probe = nvdimm_probe, + .remove = nvdimm_remove, + .drv = { + .name = "nvdimm", + }, + .type = ND_DRIVER_DIMM, +}; + +int __init nvdimm_init(void) +{ + return nd_driver_register(&nvdimm_driver); +} + +void nvdimm_exit(void) +{ + driver_unregister(&nvdimm_driver.drv); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DIMM); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c new file mode 100644 index 0000000000..1273873582 --- /dev/null +++ b/drivers/nvdimm/dimm_devs.c @@ -0,0 +1,881 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
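nvdimm_probe() in the file above pins its driver data with a kref taken at probe time; later users pair get_ndd()/put_ndd(), and the final put runs nvdimm_drvdata_release() (defined in dimm_devs.c below) to free the label data and drop the device reference. As a rough analogy only, a plain userspace refcount with the same shape; demo_drvdata and its helpers are made up and nothing here uses the real kref API:

#include <stdio.h>
#include <stdlib.h>

struct demo_drvdata {
        int   refcount;
        char *label_data;
};

static void demo_get(struct demo_drvdata *ndd)
{
        ndd->refcount++;
}

static void demo_put(struct demo_drvdata *ndd)
{
        if (--ndd->refcount)
                return;
        /* last reference gone: mirror the release callback */
        free(ndd->label_data);
        free(ndd);
        puts("released");
}

int main(void)
{
        struct demo_drvdata *ndd = calloc(1, sizeof(*ndd));

        if (!ndd)
                return 1;
        ndd->refcount = 1;              /* probe's initial reference */
        ndd->label_data = malloc(4096);

        demo_get(ndd);                  /* a second user appears... */
        demo_put(ndd);                  /* ...and drops its reference */
        demo_put(ndd);                  /* remove/teardown frees it */
        return 0;
}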
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/moduleparam.h> +#include <linux/vmalloc.h> +#include <linux/device.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "label.h" +#include "pmem.h" +#include "nd.h" + +static DEFINE_IDA(dimm_ida); + +/* + * Retrieve bus and dimm handle and return if this bus supports + * get_config_data commands + */ +int nvdimm_check_config_data(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + if (!nvdimm->cmd_mask || + !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) { + if (test_bit(NDD_LABELING, &nvdimm->flags)) + return -ENXIO; + else + return -ENOTTY; + } + + return 0; +} + +static int validate_dimm(struct nvdimm_drvdata *ndd) +{ + int rc; + + if (!ndd) + return -EINVAL; + + rc = nvdimm_check_config_data(ndd->dev); + if (rc) + dev_dbg(ndd->dev, "%ps: %s error: %d\n", + __builtin_return_address(0), __func__, rc); + return rc; +} + +/** + * nvdimm_init_nsarea - determine the geometry of a dimm's namespace area + * @nvdimm: dimm to initialize + */ +int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd) +{ + struct nd_cmd_get_config_size *cmd = &ndd->nsarea; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc; + int rc = validate_dimm(ndd); + int cmd_rc = 0; + + if (rc) + return rc; + + if (cmd->config_size) + return 0; /* already valid */ + + memset(cmd, 0, sizeof(*cmd)); + nd_desc = nvdimm_bus->nd_desc; + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), &cmd_rc); + if (rc < 0) + return rc; + return cmd_rc; +} + +int nvdimm_get_config_data(struct nvdimm_drvdata *ndd, void *buf, + size_t offset, size_t len) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + int rc = validate_dimm(ndd), cmd_rc = 0; + struct nd_cmd_get_config_data_hdr *cmd; + size_t max_cmd_size, buf_offset; + + if (rc) + return rc; + + if (offset + len > ndd->nsarea.config_size) + return -ENXIO; + + max_cmd_size = min_t(u32, len, ndd->nsarea.max_xfer); + cmd = kvzalloc(max_cmd_size + sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + for (buf_offset = 0; len; + len -= cmd->in_length, buf_offset += cmd->in_length) { + size_t cmd_size; + + cmd->in_offset = offset + buf_offset; + cmd->in_length = min(max_cmd_size, len); + + cmd_size = sizeof(*cmd) + cmd->in_length; + + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_DATA, cmd, cmd_size, &cmd_rc); + if (rc < 0) + break; + if (cmd_rc < 0) { + rc = cmd_rc; + break; + } + + /* out_buf should be valid, copy it into our output buffer */ + memcpy(buf + buf_offset, cmd->out_buf, cmd->in_length); + } + kvfree(cmd); + + return rc; +} + +int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, + void *buf, size_t len) +{ + size_t max_cmd_size, buf_offset; + struct nd_cmd_set_config_hdr *cmd; + int rc = validate_dimm(ndd), cmd_rc = 0; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + if (rc) + return rc; + + if (offset + len > ndd->nsarea.config_size) + return -ENXIO; + + max_cmd_size = min_t(u32, len, ndd->nsarea.max_xfer); + cmd = kvzalloc(max_cmd_size + sizeof(*cmd) + sizeof(u32), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + for (buf_offset = 0; len; len -= cmd->in_length, + buf_offset += cmd->in_length) { + 
size_t cmd_size; + + cmd->in_offset = offset + buf_offset; + cmd->in_length = min(max_cmd_size, len); + memcpy(cmd->in_buf, buf + buf_offset, cmd->in_length); + + /* status is output in the last 4-bytes of the command buffer */ + cmd_size = sizeof(*cmd) + cmd->in_length + sizeof(u32); + + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_SET_CONFIG_DATA, cmd, cmd_size, &cmd_rc); + if (rc < 0) + break; + if (cmd_rc < 0) { + rc = cmd_rc; + break; + } + } + kvfree(cmd); + + return rc; +} + +void nvdimm_set_labeling(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + set_bit(NDD_LABELING, &nvdimm->flags); +} + +void nvdimm_set_locked(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + set_bit(NDD_LOCKED, &nvdimm->flags); +} + +void nvdimm_clear_locked(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + clear_bit(NDD_LOCKED, &nvdimm->flags); +} + +static void nvdimm_release(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + ida_simple_remove(&dimm_ida, nvdimm->id); + kfree(nvdimm); +} + +struct nvdimm *to_nvdimm(struct device *dev) +{ + struct nvdimm *nvdimm = container_of(dev, struct nvdimm, dev); + + WARN_ON(!is_nvdimm(dev)); + return nvdimm; +} +EXPORT_SYMBOL_GPL(to_nvdimm); + +struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping) +{ + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev)); + + return dev_get_drvdata(&nvdimm->dev); +} +EXPORT_SYMBOL(to_ndd); + +void nvdimm_drvdata_release(struct kref *kref) +{ + struct nvdimm_drvdata *ndd = container_of(kref, typeof(*ndd), kref); + struct device *dev = ndd->dev; + struct resource *res, *_r; + + dev_dbg(dev, "trace\n"); + nvdimm_bus_lock(dev); + for_each_dpa_resource_safe(ndd, res, _r) + nvdimm_free_dpa(ndd, res); + nvdimm_bus_unlock(dev); + + kvfree(ndd->data); + kfree(ndd); + put_device(dev); +} + +void get_ndd(struct nvdimm_drvdata *ndd) +{ + kref_get(&ndd->kref); +} + +void put_ndd(struct nvdimm_drvdata *ndd) +{ + if (ndd) + kref_put(&ndd->kref, nvdimm_drvdata_release); +} + +const char *nvdimm_name(struct nvdimm *nvdimm) +{ + return dev_name(&nvdimm->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_name); + +struct kobject *nvdimm_kobj(struct nvdimm *nvdimm) +{ + return &nvdimm->dev.kobj; +} +EXPORT_SYMBOL_GPL(nvdimm_kobj); + +unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm) +{ + return nvdimm->cmd_mask; +} +EXPORT_SYMBOL_GPL(nvdimm_cmd_mask); + +void *nvdimm_provider_data(struct nvdimm *nvdimm) +{ + if (nvdimm) + return nvdimm->provider_data; + return NULL; +} +EXPORT_SYMBOL_GPL(nvdimm_provider_data); + +static ssize_t commands_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + int cmd, len = 0; + + if (!nvdimm->cmd_mask) + return sprintf(buf, "\n"); + + for_each_set_bit(cmd, &nvdimm->cmd_mask, BITS_PER_LONG) + len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd)); + len += sprintf(buf + len, "\n"); + return len; +} +static DEVICE_ATTR_RO(commands); + +static ssize_t flags_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + return sprintf(buf, "%s%s\n", + test_bit(NDD_LABELING, &nvdimm->flags) ? "label " : "", + test_bit(NDD_LOCKED, &nvdimm->flags) ? 
"lock " : ""); +} +static DEVICE_ATTR_RO(flags); + +static ssize_t state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + /* + * The state may be in the process of changing, userspace should + * quiesce probing if it wants a static answer + */ + nvdimm_bus_lock(dev); + nvdimm_bus_unlock(dev); + return sprintf(buf, "%s\n", atomic_read(&nvdimm->busy) + ? "active" : "idle"); +} +static DEVICE_ATTR_RO(state); + +static ssize_t __available_slots_show(struct nvdimm_drvdata *ndd, char *buf) +{ + struct device *dev; + ssize_t rc; + u32 nfree; + + if (!ndd) + return -ENXIO; + + dev = ndd->dev; + nvdimm_bus_lock(dev); + nfree = nd_label_nfree(ndd); + if (nfree - 1 > nfree) { + dev_WARN_ONCE(dev, 1, "we ate our last label?\n"); + nfree = 0; + } else + nfree--; + rc = sprintf(buf, "%d\n", nfree); + nvdimm_bus_unlock(dev); + return rc; +} + +static ssize_t available_slots_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t rc; + + device_lock(dev); + rc = __available_slots_show(dev_get_drvdata(dev), buf); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(available_slots); + +static ssize_t security_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + /* + * For the test version we need to poll the "hardware" in order + * to get the updated status for unlock testing. + */ + if (IS_ENABLED(CONFIG_NVDIMM_SECURITY_TEST)) + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + + if (test_bit(NVDIMM_SECURITY_OVERWRITE, &nvdimm->sec.flags)) + return sprintf(buf, "overwrite\n"); + if (test_bit(NVDIMM_SECURITY_DISABLED, &nvdimm->sec.flags)) + return sprintf(buf, "disabled\n"); + if (test_bit(NVDIMM_SECURITY_UNLOCKED, &nvdimm->sec.flags)) + return sprintf(buf, "unlocked\n"); + if (test_bit(NVDIMM_SECURITY_LOCKED, &nvdimm->sec.flags)) + return sprintf(buf, "locked\n"); + return -ENOTTY; +} + +static ssize_t frozen_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + return sprintf(buf, "%d\n", test_bit(NVDIMM_SECURITY_FROZEN, + &nvdimm->sec.flags)); +} +static DEVICE_ATTR_RO(frozen); + +static ssize_t security_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) + +{ + ssize_t rc; + + /* + * Require all userspace triggered security management to be + * done while probing is idle and the DIMM is not in active use + * in any region. + */ + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = nvdimm_security_store(dev, buf, len); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(security); + +static struct attribute *nvdimm_attributes[] = { + &dev_attr_state.attr, + &dev_attr_flags.attr, + &dev_attr_commands.attr, + &dev_attr_available_slots.attr, + &dev_attr_security.attr, + &dev_attr_frozen.attr, + NULL, +}; + +static umode_t nvdimm_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nvdimm *nvdimm = to_nvdimm(dev); + + if (a != &dev_attr_security.attr && a != &dev_attr_frozen.attr) + return a->mode; + if (!nvdimm->sec.flags) + return 0; + + if (a == &dev_attr_security.attr) { + /* Are there any state mutation ops (make writable)? 
*/ + if (nvdimm->sec.ops->freeze || nvdimm->sec.ops->disable + || nvdimm->sec.ops->change_key + || nvdimm->sec.ops->erase + || nvdimm->sec.ops->overwrite) + return a->mode; + return 0444; + } + + if (nvdimm->sec.ops->freeze) + return a->mode; + return 0; +} + +static const struct attribute_group nvdimm_attribute_group = { + .attrs = nvdimm_attributes, + .is_visible = nvdimm_visible, +}; + +static ssize_t result_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_result result; + + if (!nvdimm->fw_ops) + return -EOPNOTSUPP; + + nvdimm_bus_lock(dev); + result = nvdimm->fw_ops->activate_result(nvdimm); + nvdimm_bus_unlock(dev); + + switch (result) { + case NVDIMM_FWA_RESULT_NONE: + return sprintf(buf, "none\n"); + case NVDIMM_FWA_RESULT_SUCCESS: + return sprintf(buf, "success\n"); + case NVDIMM_FWA_RESULT_FAIL: + return sprintf(buf, "fail\n"); + case NVDIMM_FWA_RESULT_NOTSTAGED: + return sprintf(buf, "not_staged\n"); + case NVDIMM_FWA_RESULT_NEEDRESET: + return sprintf(buf, "need_reset\n"); + default: + return -ENXIO; + } +} +static DEVICE_ATTR_ADMIN_RO(result); + +static ssize_t activate_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_state state; + + if (!nvdimm->fw_ops) + return -EOPNOTSUPP; + + nvdimm_bus_lock(dev); + state = nvdimm->fw_ops->activate_state(nvdimm); + nvdimm_bus_unlock(dev); + + switch (state) { + case NVDIMM_FWA_IDLE: + return sprintf(buf, "idle\n"); + case NVDIMM_FWA_BUSY: + return sprintf(buf, "busy\n"); + case NVDIMM_FWA_ARMED: + return sprintf(buf, "armed\n"); + default: + return -ENXIO; + } +} + +static ssize_t activate_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_trigger arg; + int rc; + + if (!nvdimm->fw_ops) + return -EOPNOTSUPP; + + if (sysfs_streq(buf, "arm")) + arg = NVDIMM_FWA_ARM; + else if (sysfs_streq(buf, "disarm")) + arg = NVDIMM_FWA_DISARM; + else + return -EINVAL; + + nvdimm_bus_lock(dev); + rc = nvdimm->fw_ops->arm(nvdimm, arg); + nvdimm_bus_unlock(dev); + + if (rc < 0) + return rc; + return len; +} +static DEVICE_ATTR_ADMIN_RW(activate); + +static struct attribute *nvdimm_firmware_attributes[] = { + &dev_attr_activate.attr, + &dev_attr_result.attr, + NULL, +}; + +static umode_t nvdimm_firmware_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_capability cap; + + if (!nd_desc->fw_ops) + return 0; + if (!nvdimm->fw_ops) + return 0; + + nvdimm_bus_lock(dev); + cap = nd_desc->fw_ops->capability(nd_desc); + nvdimm_bus_unlock(dev); + + if (cap < NVDIMM_FWA_CAP_QUIESCE) + return 0; + + return a->mode; +} + +static const struct attribute_group nvdimm_firmware_attribute_group = { + .name = "firmware", + .attrs = nvdimm_firmware_attributes, + .is_visible = nvdimm_firmware_visible, +}; + +static const struct attribute_group *nvdimm_attribute_groups[] = { + &nd_device_attribute_group, + &nvdimm_attribute_group, + &nvdimm_firmware_attribute_group, + NULL, +}; + +static const struct device_type nvdimm_device_type = { + .name = "nvdimm", + .release = nvdimm_release, + .groups = nvdimm_attribute_groups, +}; + +bool is_nvdimm(const struct device *dev) +{ + 
return dev->type == &nvdimm_device_type; +} + +static struct lock_class_key nvdimm_key; + +struct nvdimm *__nvdimm_create(struct nvdimm_bus *nvdimm_bus, + void *provider_data, const struct attribute_group **groups, + unsigned long flags, unsigned long cmd_mask, int num_flush, + struct resource *flush_wpq, const char *dimm_id, + const struct nvdimm_security_ops *sec_ops, + const struct nvdimm_fw_ops *fw_ops) +{ + struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL); + struct device *dev; + + if (!nvdimm) + return NULL; + + nvdimm->id = ida_simple_get(&dimm_ida, 0, 0, GFP_KERNEL); + if (nvdimm->id < 0) { + kfree(nvdimm); + return NULL; + } + + nvdimm->dimm_id = dimm_id; + nvdimm->provider_data = provider_data; + nvdimm->flags = flags; + nvdimm->cmd_mask = cmd_mask; + nvdimm->num_flush = num_flush; + nvdimm->flush_wpq = flush_wpq; + atomic_set(&nvdimm->busy, 0); + dev = &nvdimm->dev; + dev_set_name(dev, "nmem%d", nvdimm->id); + dev->parent = &nvdimm_bus->dev; + dev->type = &nvdimm_device_type; + dev->devt = MKDEV(nvdimm_major, nvdimm->id); + dev->groups = groups; + nvdimm->sec.ops = sec_ops; + nvdimm->fw_ops = fw_ops; + nvdimm->sec.overwrite_tmo = 0; + INIT_DELAYED_WORK(&nvdimm->dwork, nvdimm_security_overwrite_query); + /* + * Security state must be initialized before device_add() for + * attribute visibility. + */ + /* get security state and extended (master) state */ + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + nvdimm->sec.ext_flags = nvdimm_security_flags(nvdimm, NVDIMM_MASTER); + device_initialize(dev); + lockdep_set_class(&dev->mutex, &nvdimm_key); + if (test_bit(NDD_REGISTER_SYNC, &flags)) + nd_device_register_sync(dev); + else + nd_device_register(dev); + + return nvdimm; +} +EXPORT_SYMBOL_GPL(__nvdimm_create); + +void nvdimm_delete(struct nvdimm *nvdimm) +{ + struct device *dev = &nvdimm->dev; + bool dev_put = false; + + /* We are shutting down. Make state frozen artificially. 
*/ + nvdimm_bus_lock(dev); + set_bit(NVDIMM_SECURITY_FROZEN, &nvdimm->sec.flags); + if (test_and_clear_bit(NDD_WORK_PENDING, &nvdimm->flags)) + dev_put = true; + nvdimm_bus_unlock(dev); + cancel_delayed_work_sync(&nvdimm->dwork); + if (dev_put) + put_device(dev); + nd_device_unregister(dev, ND_SYNC); +} +EXPORT_SYMBOL_GPL(nvdimm_delete); + +static void shutdown_security_notify(void *data) +{ + struct nvdimm *nvdimm = data; + + sysfs_put(nvdimm->sec.overwrite_state); +} + +int nvdimm_security_setup_events(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + if (!nvdimm->sec.flags || !nvdimm->sec.ops + || !nvdimm->sec.ops->overwrite) + return 0; + nvdimm->sec.overwrite_state = sysfs_get_dirent(dev->kobj.sd, "security"); + if (!nvdimm->sec.overwrite_state) + return -ENOMEM; + + return devm_add_action_or_reset(dev, shutdown_security_notify, nvdimm); +} +EXPORT_SYMBOL_GPL(nvdimm_security_setup_events); + +int nvdimm_in_overwrite(struct nvdimm *nvdimm) +{ + return test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags); +} +EXPORT_SYMBOL_GPL(nvdimm_in_overwrite); + +int nvdimm_security_freeze(struct nvdimm *nvdimm) +{ + int rc; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev)); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->freeze) + return -EOPNOTSUPP; + + if (!nvdimm->sec.flags) + return -EIO; + + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { + dev_warn(&nvdimm->dev, "Overwrite operation in progress.\n"); + return -EBUSY; + } + + rc = nvdimm->sec.ops->freeze(nvdimm); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + + return rc; +} + +static unsigned long dpa_align(struct nd_region *nd_region) +{ + struct device *dev = &nd_region->dev; + + if (dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev), + "bus lock required for capacity provision\n")) + return 0; + if (dev_WARN_ONCE(dev, !nd_region->ndr_mappings || nd_region->align + % nd_region->ndr_mappings, + "invalid region align %#lx mappings: %d\n", + nd_region->align, nd_region->ndr_mappings)) + return 0; + return nd_region->align / nd_region->ndr_mappings; +} + +/** + * nd_pmem_max_contiguous_dpa - For the given dimm+region, return the max + * contiguous unallocated dpa range. 
+ * @nd_region: constrain available space check to this reference region + * @nd_mapping: container of dpa-resource-root + labels + */ +resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nvdimm_bus *nvdimm_bus; + resource_size_t max = 0; + struct resource *res; + unsigned long align; + + /* if a dimm is disabled the available capacity is zero */ + if (!ndd) + return 0; + + align = dpa_align(nd_region); + if (!align) + return 0; + + nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + if (__reserve_free_pmem(&nd_region->dev, nd_mapping->nvdimm)) + return 0; + for_each_dpa_resource(ndd, res) { + resource_size_t start, end; + + if (strcmp(res->name, "pmem-reserve") != 0) + continue; + /* trim free space relative to current alignment setting */ + start = ALIGN(res->start, align); + end = ALIGN_DOWN(res->end + 1, align) - 1; + if (end < start) + continue; + if (end - start + 1 > max) + max = end - start + 1; + } + release_free_pmem(nvdimm_bus, nd_mapping); + return max; +} + +/** + * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa + * @nd_mapping: container of dpa-resource-root + labels + * @nd_region: constrain available space check to this reference region + * + * Validate that a PMEM label, if present, aligns with the start of an + * interleave set. + */ +resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t map_start, map_end, busy = 0; + struct resource *res; + unsigned long align; + + if (!ndd) + return 0; + + align = dpa_align(nd_region); + if (!align) + return 0; + + map_start = nd_mapping->start; + map_end = map_start + nd_mapping->size - 1; + for_each_dpa_resource(ndd, res) { + resource_size_t start, end; + + start = ALIGN_DOWN(res->start, align); + end = ALIGN(res->end + 1, align) - 1; + if (start >= map_start && start < map_end) { + if (end > map_end) { + nd_dbg_dpa(nd_region, ndd, res, + "misaligned to iset\n"); + return 0; + } + busy += end - start + 1; + } else if (end >= map_start && end <= map_end) { + busy += end - start + 1; + } else if (map_start > start && map_start < end) { + /* total eclipse of the mapping */ + busy += nd_mapping->size; + } + } + + if (busy < nd_mapping->size) + return ALIGN_DOWN(nd_mapping->size - busy, align); + return 0; +} + +void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res) +{ + WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev)); + kfree(res->name); + __release_region(&ndd->dpa, res->start, resource_size(res)); +} + +struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, resource_size_t start, + resource_size_t n) +{ + char *name = kmemdup(label_id, sizeof(*label_id), GFP_KERNEL); + struct resource *res; + + if (!name) + return NULL; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev)); + res = __request_region(&ndd->dpa, start, n, name, 0); + if (!res) + kfree(name); + return res; +} + +/** + * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id + * @nvdimm: container of dpa-resource-root + labels + * @label_id: dpa resource name of the form pmem-<human readable uuid> + */ +resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id) +{ + resource_size_t allocated = 0; + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id->id) == 0) + allocated += 
resource_size(res); + + return allocated; +} + +static int count_dimms(struct device *dev, void *c) +{ + int *count = c; + + if (is_nvdimm(dev)) + (*count)++; + return 0; +} + +int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count) +{ + int count = 0; + /* Flush any possible dimm registration failures */ + nd_synchronize(); + + device_for_each_child(&nvdimm_bus->dev, &count, count_dimms); + dev_dbg(&nvdimm_bus->dev, "count: %d\n", count); + if (count != dimm_count) + return -ENXIO; + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count); + +void __exit nvdimm_devs_exit(void) +{ + ida_destroy(&dimm_ida); +} diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c new file mode 100644 index 0000000000..4cd18be9d0 --- /dev/null +++ b/drivers/nvdimm/e820.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2015, Christoph Hellwig. + * Copyright (c) 2015, Intel Corporation. + */ +#include <linux/platform_device.h> +#include <linux/memory_hotplug.h> +#include <linux/libnvdimm.h> +#include <linux/module.h> +#include <linux/numa.h> + +static int e820_pmem_remove(struct platform_device *pdev) +{ + struct nvdimm_bus *nvdimm_bus = platform_get_drvdata(pdev); + + nvdimm_bus_unregister(nvdimm_bus); + return 0; +} + +static int e820_register_one(struct resource *res, void *data) +{ + struct nd_region_desc ndr_desc; + struct nvdimm_bus *nvdimm_bus = data; + int nid = phys_to_target_node(res->start); + + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.res = res; + ndr_desc.numa_node = numa_map_to_online_node(nid); + ndr_desc.target_node = nid; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + return -ENXIO; + return 0; +} + +static int e820_pmem_probe(struct platform_device *pdev) +{ + static struct nvdimm_bus_descriptor nd_desc; + struct device *dev = &pdev->dev; + struct nvdimm_bus *nvdimm_bus; + int rc = -ENXIO; + + nd_desc.provider_name = "e820"; + nd_desc.module = THIS_MODULE; + nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); + if (!nvdimm_bus) + goto err; + platform_set_drvdata(pdev, nvdimm_bus); + + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one); + if (rc) + goto err; + return 0; +err: + nvdimm_bus_unregister(nvdimm_bus); + dev_err(dev, "failed to register legacy persistent memory ranges\n"); + return rc; +} + +static struct platform_driver e820_pmem_driver = { + .probe = e820_pmem_probe, + .remove = e820_pmem_remove, + .driver = { + .name = "e820_pmem", + }, +}; + +module_platform_driver(e820_pmem_driver); + +MODULE_ALIAS("platform:e820_pmem*"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c new file mode 100644 index 0000000000..082253a3a9 --- /dev/null +++ b/drivers/nvdimm/label.c @@ -0,0 +1,1120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
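The DPA accounting in the functions above keeps trimming candidate ranges inward to the per-mapping alignment: the start is rounded up with ALIGN(), the inclusive end is derived by rounding the exclusive end down with ALIGN_DOWN(), and a range that collapses (end < start) contributes nothing. A standalone illustration of that trimming, with ALIGN/ALIGN_DOWN re-implemented locally and the numbers picked arbitrarily:

#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)    (((x) + (a) - 1) & ~((uint64_t)(a) - 1))
#define ALIGN_DOWN(x, a)  ((x) & ~((uint64_t)(a) - 1))

/* usable bytes of the inclusive range [start, end] once trimmed to 'align' */
static uint64_t trimmed_size(uint64_t start, uint64_t end, uint64_t align)
{
        uint64_t s = ALIGN_UP(start, align);
        uint64_t e = ALIGN_DOWN(end + 1, align) - 1;

        return (e < s) ? 0 : e - s + 1;
}

int main(void)
{
        /* 12K of free space, unaligned at both ends: 8K survives at 4K align */
        printf("%llu\n",
               (unsigned long long)trimmed_size(0x1400, 0x43ff, 0x1000));
        /* a sliver that spans no full 4K unit contributes nothing */
        printf("%llu\n",
               (unsigned long long)trimmed_size(0x1400, 0x1fff, 0x1000));
        return 0;
}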
+ */ +#include <linux/device.h> +#include <linux/ndctl.h> +#include <linux/uuid.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "label.h" +#include "nd.h" + +static guid_t nvdimm_btt_guid; +static guid_t nvdimm_btt2_guid; +static guid_t nvdimm_pfn_guid; +static guid_t nvdimm_dax_guid; + +static uuid_t nvdimm_btt_uuid; +static uuid_t nvdimm_btt2_uuid; +static uuid_t nvdimm_pfn_uuid; +static uuid_t nvdimm_dax_uuid; + +static uuid_t cxl_region_uuid; +static uuid_t cxl_namespace_uuid; + +static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0"; + +static u32 best_seq(u32 a, u32 b) +{ + a &= NSINDEX_SEQ_MASK; + b &= NSINDEX_SEQ_MASK; + + if (a == 0 || a == b) + return b; + else if (b == 0) + return a; + else if (nd_inc_seq(a) == b) + return b; + else + return a; +} + +unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd) +{ + return ndd->nslabel_size; +} + +static size_t __sizeof_namespace_index(u32 nslot) +{ + return ALIGN(sizeof(struct nd_namespace_index) + DIV_ROUND_UP(nslot, 8), + NSINDEX_ALIGN); +} + +static int __nvdimm_num_label_slots(struct nvdimm_drvdata *ndd, + size_t index_size) +{ + return (ndd->nsarea.config_size - index_size * 2) / + sizeof_namespace_label(ndd); +} + +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd) +{ + u32 tmp_nslot, n; + + tmp_nslot = ndd->nsarea.config_size / sizeof_namespace_label(ndd); + n = __sizeof_namespace_index(tmp_nslot) / NSINDEX_ALIGN; + + return __nvdimm_num_label_slots(ndd, NSINDEX_ALIGN * n); +} + +size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) +{ + u32 nslot, space, size; + + /* + * Per UEFI 2.7, the minimum size of the Label Storage Area is large + * enough to hold 2 index blocks and 2 labels. The minimum index + * block size is 256 bytes. The label size is 128 for namespaces + * prior to version 1.2 and at minimum 256 for version 1.2 and later. + */ + nslot = nvdimm_num_label_slots(ndd); + space = ndd->nsarea.config_size - nslot * sizeof_namespace_label(ndd); + size = __sizeof_namespace_index(nslot) * 2; + if (size <= space && nslot >= 2) + return size / 2; + + dev_err(ndd->dev, "label area (%d) too small to host (%d byte) labels\n", + ndd->nsarea.config_size, sizeof_namespace_label(ndd)); + return 0; +} + +static int __nd_label_validate(struct nvdimm_drvdata *ndd) +{ + /* + * On media label format consists of two index blocks followed + * by an array of labels. None of these structures are ever + * updated in place. A sequence number tracks the current + * active index and the next one to write, while labels are + * written to free slots. + * + * +------------+ + * | | + * | nsindex0 | + * | | + * +------------+ + * | | + * | nsindex1 | + * | | + * +------------+ + * | label0 | + * +------------+ + * | label1 | + * +------------+ + * | | + * ....nslot... 
+ * | | + * +------------+ + * | labelN | + * +------------+ + */ + struct nd_namespace_index *nsindex[] = { + to_namespace_index(ndd, 0), + to_namespace_index(ndd, 1), + }; + const int num_index = ARRAY_SIZE(nsindex); + struct device *dev = ndd->dev; + bool valid[2] = { 0 }; + int i, num_valid = 0; + u32 seq; + + for (i = 0; i < num_index; i++) { + u32 nslot; + u8 sig[NSINDEX_SIG_LEN]; + u64 sum_save, sum, size; + unsigned int version, labelsize; + + memcpy(sig, nsindex[i]->sig, NSINDEX_SIG_LEN); + if (memcmp(sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN) != 0) { + dev_dbg(dev, "nsindex%d signature invalid\n", i); + continue; + } + + /* label sizes larger than 128 arrived with v1.2 */ + version = __le16_to_cpu(nsindex[i]->major) * 100 + + __le16_to_cpu(nsindex[i]->minor); + if (version >= 102) + labelsize = 1 << (7 + nsindex[i]->labelsize); + else + labelsize = 128; + + if (labelsize != sizeof_namespace_label(ndd)) { + dev_dbg(dev, "nsindex%d labelsize %d invalid\n", + i, nsindex[i]->labelsize); + continue; + } + + sum_save = __le64_to_cpu(nsindex[i]->checksum); + nsindex[i]->checksum = __cpu_to_le64(0); + sum = nd_fletcher64(nsindex[i], sizeof_namespace_index(ndd), 1); + nsindex[i]->checksum = __cpu_to_le64(sum_save); + if (sum != sum_save) { + dev_dbg(dev, "nsindex%d checksum invalid\n", i); + continue; + } + + seq = __le32_to_cpu(nsindex[i]->seq); + if ((seq & NSINDEX_SEQ_MASK) == 0) { + dev_dbg(dev, "nsindex%d sequence: %#x invalid\n", i, seq); + continue; + } + + /* sanity check the index against expected values */ + if (__le64_to_cpu(nsindex[i]->myoff) + != i * sizeof_namespace_index(ndd)) { + dev_dbg(dev, "nsindex%d myoff: %#llx invalid\n", + i, (unsigned long long) + __le64_to_cpu(nsindex[i]->myoff)); + continue; + } + if (__le64_to_cpu(nsindex[i]->otheroff) + != (!i) * sizeof_namespace_index(ndd)) { + dev_dbg(dev, "nsindex%d otheroff: %#llx invalid\n", + i, (unsigned long long) + __le64_to_cpu(nsindex[i]->otheroff)); + continue; + } + if (__le64_to_cpu(nsindex[i]->labeloff) + != 2 * sizeof_namespace_index(ndd)) { + dev_dbg(dev, "nsindex%d labeloff: %#llx invalid\n", + i, (unsigned long long) + __le64_to_cpu(nsindex[i]->labeloff)); + continue; + } + + size = __le64_to_cpu(nsindex[i]->mysize); + if (size > sizeof_namespace_index(ndd) + || size < sizeof(struct nd_namespace_index)) { + dev_dbg(dev, "nsindex%d mysize: %#llx invalid\n", i, size); + continue; + } + + nslot = __le32_to_cpu(nsindex[i]->nslot); + if (nslot * sizeof_namespace_label(ndd) + + 2 * sizeof_namespace_index(ndd) + > ndd->nsarea.config_size) { + dev_dbg(dev, "nsindex%d nslot: %u invalid, config_size: %#x\n", + i, nslot, ndd->nsarea.config_size); + continue; + } + valid[i] = true; + num_valid++; + } + + switch (num_valid) { + case 0: + break; + case 1: + for (i = 0; i < num_index; i++) + if (valid[i]) + return i; + /* can't have num_valid > 0 but valid[] = { false, false } */ + WARN_ON(1); + break; + default: + /* pick the best index... */ + seq = best_seq(__le32_to_cpu(nsindex[0]->seq), + __le32_to_cpu(nsindex[1]->seq)); + if (seq == (__le32_to_cpu(nsindex[1]->seq) & NSINDEX_SEQ_MASK)) + return 1; + else + return 0; + break; + } + + return -1; +} + +static int nd_label_validate(struct nvdimm_drvdata *ndd) +{ + /* + * In order to probe for and validate namespace index blocks we + * need to know the size of the labels, and we can't trust the + * size of the labels until we validate the index blocks. 
+ * Resolve this dependency loop by probing for known label + * sizes, but default to v1.2 256-byte namespace labels if + * discovery fails. + */ + int label_size[] = { 128, 256 }; + int i, rc; + + for (i = 0; i < ARRAY_SIZE(label_size); i++) { + ndd->nslabel_size = label_size[i]; + rc = __nd_label_validate(ndd); + if (rc >= 0) + return rc; + } + + return -1; +} + +static void nd_label_copy(struct nvdimm_drvdata *ndd, + struct nd_namespace_index *dst, + struct nd_namespace_index *src) +{ + /* just exit if either destination or source is NULL */ + if (!dst || !src) + return; + + memcpy(dst, src, sizeof_namespace_index(ndd)); +} + +static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd) +{ + void *base = to_namespace_index(ndd, 0); + + return base + 2 * sizeof_namespace_index(ndd); +} + +static int to_slot(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + unsigned long label, base; + + label = (unsigned long) nd_label; + base = (unsigned long) nd_label_base(ndd); + + return (label - base) / sizeof_namespace_label(ndd); +} + +static struct nd_namespace_label *to_label(struct nvdimm_drvdata *ndd, int slot) +{ + unsigned long label, base; + + base = (unsigned long) nd_label_base(ndd); + label = base + sizeof_namespace_label(ndd) * slot; + + return (struct nd_namespace_label *) label; +} + +#define for_each_clear_bit_le(bit, addr, size) \ + for ((bit) = find_next_zero_bit_le((addr), (size), 0); \ + (bit) < (size); \ + (bit) = find_next_zero_bit_le((addr), (size), (bit) + 1)) + +/** + * preamble_index - common variable initialization for nd_label_* routines + * @ndd: dimm container for the relevant label set + * @idx: namespace_index index + * @nsindex_out: on return set to the currently active namespace index + * @free: on return set to the free label bitmap in the index + * @nslot: on return set to the number of slots in the label space + */ +static bool preamble_index(struct nvdimm_drvdata *ndd, int idx, + struct nd_namespace_index **nsindex_out, + unsigned long **free, u32 *nslot) +{ + struct nd_namespace_index *nsindex; + + nsindex = to_namespace_index(ndd, idx); + if (nsindex == NULL) + return false; + + *free = (unsigned long *) nsindex->free; + *nslot = __le32_to_cpu(nsindex->nslot); + *nsindex_out = nsindex; + + return true; +} + +char *nd_label_gen_id(struct nd_label_id *label_id, const uuid_t *uuid, + u32 flags) +{ + if (!label_id || !uuid) + return NULL; + snprintf(label_id->id, ND_LABEL_ID_SIZE, "pmem-%pUb", uuid); + return label_id->id; +} + +static bool preamble_current(struct nvdimm_drvdata *ndd, + struct nd_namespace_index **nsindex, + unsigned long **free, u32 *nslot) +{ + return preamble_index(ndd, ndd->ns_current, nsindex, + free, nslot); +} + +static bool preamble_next(struct nvdimm_drvdata *ndd, + struct nd_namespace_index **nsindex, + unsigned long **free, u32 *nslot) +{ + return preamble_index(ndd, ndd->ns_next, nsindex, + free, nslot); +} + +static bool nsl_validate_checksum(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + u64 sum, sum_save; + + if (!ndd->cxl && !efi_namespace_label_has(ndd, checksum)) + return true; + + sum_save = nsl_get_checksum(ndd, nd_label); + nsl_set_checksum(ndd, nd_label, 0); + sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1); + nsl_set_checksum(ndd, nd_label, sum_save); + return sum == sum_save; +} + +static void nsl_calculate_checksum(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + u64 sum; + + if (!ndd->cxl && !efi_namespace_label_has(ndd, 
checksum)) + return; + nsl_set_checksum(ndd, nd_label, 0); + sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1); + nsl_set_checksum(ndd, nd_label, sum); +} + +static bool slot_valid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u32 slot) +{ + bool valid; + + /* check that we are written where we expect to be written */ + if (slot != nsl_get_slot(ndd, nd_label)) + return false; + valid = nsl_validate_checksum(ndd, nd_label); + if (!valid) + dev_dbg(ndd->dev, "fail checksum. slot: %d\n", slot); + return valid; +} + +int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return 0; /* no label, nothing to reserve */ + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + struct nd_region *nd_region = NULL; + struct nd_label_id label_id; + struct resource *res; + uuid_t label_uuid; + u32 flags; + + nd_label = to_label(ndd, slot); + + if (!slot_valid(ndd, nd_label, slot)) + continue; + + nsl_get_uuid(ndd, nd_label, &label_uuid); + flags = nsl_get_flags(ndd, nd_label); + nd_label_gen_id(&label_id, &label_uuid, flags); + res = nvdimm_allocate_dpa(ndd, &label_id, + nsl_get_dpa(ndd, nd_label), + nsl_get_rawsize(ndd, nd_label)); + nd_dbg_dpa(nd_region, ndd, res, "reserve\n"); + if (!res) + return -EBUSY; + } + + return 0; +} + +int nd_label_data_init(struct nvdimm_drvdata *ndd) +{ + size_t config_size, read_size, max_xfer, offset; + struct nd_namespace_index *nsindex; + unsigned int i; + int rc = 0; + u32 nslot; + + if (ndd->data) + return 0; + + if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0) { + dev_dbg(ndd->dev, "failed to init config data area: (%u:%u)\n", + ndd->nsarea.max_xfer, ndd->nsarea.config_size); + return -ENXIO; + } + + /* + * We need to determine the maximum index area as this is the section + * we must read and validate before we can start processing labels. + * + * If the area is too small to contain the two indexes and 2 labels + * then we abort. + * + * Start at a label size of 128 as this should result in the largest + * possible namespace index size. + */ + ndd->nslabel_size = 128; + read_size = sizeof_namespace_index(ndd) * 2; + if (!read_size) + return -ENXIO; + + /* Allocate config data */ + config_size = ndd->nsarea.config_size; + ndd->data = kvzalloc(config_size, GFP_KERNEL); + if (!ndd->data) + return -ENOMEM; + + /* + * We want to guarantee as few reads as possible while conserving + * memory. To do that we figure out how much unused space will be left + * in the last read, divide that by the total number of reads it is + * going to take given our maximum transfer size, and then reduce our + * maximum transfer size based on that result. 
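That trimming step is easier to see with concrete numbers. The unused tail of the final transfer is (max_xfer - 1) - (config_size - 1) % max_xfer; spreading it over the DIV_ROUND_UP(config_size, max_xfer) transfers and shaving the quotient off max_xfer shrinks each read without increasing the read count. A userspace walk-through of the arithmetic, with the sizes picked arbitrarily for illustration:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        size_t config_size = 131072;    /* e.g. a 128K label area    */
        size_t max_xfer = 4500;         /* an awkward transfer limit */
        size_t nreads = DIV_ROUND_UP(config_size, max_xfer);
        size_t waste = (max_xfer - 1) - (config_size - 1) % max_xfer;

        printf("before: %zu reads of up to %zu bytes, %zu wasted in the last\n",
               nreads, max_xfer, waste);

        /* the reduction applied by nd_label_data_init() */
        max_xfer -= waste / nreads;

        printf("after:  %zu reads of up to %zu bytes, %zu wasted in the last\n",
               DIV_ROUND_UP(config_size, max_xfer), max_xfer,
               (max_xfer - 1) - (config_size - 1) % max_xfer);
        return 0;
}

With these inputs the cap drops from 4500 to 4370 bytes and the wasted tail of the last read shrinks from 3928 to 28 bytes, while the number of reads stays at 30.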
+ */ + max_xfer = min_t(size_t, ndd->nsarea.max_xfer, config_size); + if (read_size < max_xfer) { + /* trim waste */ + max_xfer -= ((max_xfer - 1) - (config_size - 1) % max_xfer) / + DIV_ROUND_UP(config_size, max_xfer); + /* make certain we read indexes in exactly 1 read */ + if (max_xfer < read_size) + max_xfer = read_size; + } + + /* Make our initial read size a multiple of max_xfer size */ + read_size = min(DIV_ROUND_UP(read_size, max_xfer) * max_xfer, + config_size); + + /* Read the index data */ + rc = nvdimm_get_config_data(ndd, ndd->data, 0, read_size); + if (rc) + goto out_err; + + /* Validate index data, if not valid assume all labels are invalid */ + ndd->ns_current = nd_label_validate(ndd); + if (ndd->ns_current < 0) + return 0; + + /* Record our index values */ + ndd->ns_next = nd_label_next_nsindex(ndd->ns_current); + + /* Copy "current" index on top of the "next" index */ + nsindex = to_current_namespace_index(ndd); + nd_label_copy(ndd, to_next_namespace_index(ndd), nsindex); + + /* Determine starting offset for label data */ + offset = __le64_to_cpu(nsindex->labeloff); + nslot = __le32_to_cpu(nsindex->nslot); + + /* Loop through the free list pulling in any active labels */ + for (i = 0; i < nslot; i++, offset += ndd->nslabel_size) { + size_t label_read_size; + + /* zero out the unused labels */ + if (test_bit_le(i, nsindex->free)) { + memset(ndd->data + offset, 0, ndd->nslabel_size); + continue; + } + + /* if we already read past here then just continue */ + if (offset + ndd->nslabel_size <= read_size) + continue; + + /* if we haven't read in a while reset our read_size offset */ + if (read_size < offset) + read_size = offset; + + /* determine how much more will be read after this next call. */ + label_read_size = offset + ndd->nslabel_size - read_size; + label_read_size = DIV_ROUND_UP(label_read_size, max_xfer) * + max_xfer; + + /* truncate last read if needed */ + if (read_size + label_read_size > config_size) + label_read_size = config_size - read_size; + + /* Read the label data */ + rc = nvdimm_get_config_data(ndd, ndd->data + read_size, + read_size, label_read_size); + if (rc) + goto out_err; + + /* push read_size to next read offset */ + read_size += label_read_size; + } + + dev_dbg(ndd->dev, "len: %zu rc: %d\n", offset, rc); +out_err: + return rc; +} + +int nd_label_active_count(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + int count = 0; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return 0; + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + + nd_label = to_label(ndd, slot); + + if (!slot_valid(ndd, nd_label, slot)) { + u32 label_slot = nsl_get_slot(ndd, nd_label); + u64 size = nsl_get_rawsize(ndd, nd_label); + u64 dpa = nsl_get_dpa(ndd, nd_label); + + dev_dbg(ndd->dev, + "slot%d invalid slot: %d dpa: %llx size: %llx\n", + slot, label_slot, dpa, size); + continue; + } + count++; + } + return count; +} + +struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return NULL; + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + + nd_label = to_label(ndd, slot); + if (!slot_valid(ndd, nd_label, slot)) + continue; + + if (n-- == 0) + return to_label(ndd, slot); + } + + return NULL; +} + +u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd) +{ + struct 
nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return UINT_MAX; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + slot = find_next_bit_le(free, nslot, 0); + if (slot == nslot) + return UINT_MAX; + + clear_bit_le(slot, free); + + return slot; +} + +bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return false; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + if (slot < nslot) + return !test_and_set_bit_le(slot, free); + return false; +} + +u32 nd_label_nfree(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return nvdimm_num_label_slots(ndd); + + return bitmap_weight(free, nslot); +} + +static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq, + unsigned long flags) +{ + struct nd_namespace_index *nsindex; + unsigned long offset; + u64 checksum; + u32 nslot; + int rc; + + nsindex = to_namespace_index(ndd, index); + if (flags & ND_NSINDEX_INIT) + nslot = nvdimm_num_label_slots(ndd); + else + nslot = __le32_to_cpu(nsindex->nslot); + + memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN); + memset(&nsindex->flags, 0, 3); + nsindex->labelsize = sizeof_namespace_label(ndd) >> 8; + nsindex->seq = __cpu_to_le32(seq); + offset = (unsigned long) nsindex + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->myoff = __cpu_to_le64(offset); + nsindex->mysize = __cpu_to_le64(sizeof_namespace_index(ndd)); + offset = (unsigned long) to_namespace_index(ndd, + nd_label_next_nsindex(index)) + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->otheroff = __cpu_to_le64(offset); + offset = (unsigned long) nd_label_base(ndd) + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->labeloff = __cpu_to_le64(offset); + nsindex->nslot = __cpu_to_le32(nslot); + nsindex->major = __cpu_to_le16(1); + if (sizeof_namespace_label(ndd) < 256) + nsindex->minor = __cpu_to_le16(1); + else + nsindex->minor = __cpu_to_le16(2); + nsindex->checksum = __cpu_to_le64(0); + if (flags & ND_NSINDEX_INIT) { + unsigned long *free = (unsigned long *) nsindex->free; + u32 nfree = ALIGN(nslot, BITS_PER_LONG); + int last_bits, i; + + memset(nsindex->free, 0xff, nfree / 8); + for (i = 0, last_bits = nfree - nslot; i < last_bits; i++) + clear_bit_le(nslot + i, free); + } + checksum = nd_fletcher64(nsindex, sizeof_namespace_index(ndd), 1); + nsindex->checksum = __cpu_to_le64(checksum); + rc = nvdimm_set_config_data(ndd, __le64_to_cpu(nsindex->myoff), + nsindex, sizeof_namespace_index(ndd)); + if (rc < 0) + return rc; + + if (flags & ND_NSINDEX_INIT) + return 0; + + /* copy the index we just wrote to the new 'next' */ + WARN_ON(index != ndd->ns_next); + nd_label_copy(ndd, to_current_namespace_index(ndd), nsindex); + ndd->ns_current = nd_label_next_nsindex(ndd->ns_current); + ndd->ns_next = nd_label_next_nsindex(ndd->ns_next); + WARN_ON(ndd->ns_current == ndd->ns_next); + + return 0; +} + +static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + return (unsigned long) nd_label + - (unsigned long) to_namespace_index(ndd, 0); +} + +static enum nvdimm_claim_class guid_to_nvdimm_cclass(guid_t *guid) +{ + if (guid_equal(guid, &nvdimm_btt_guid)) + return NVDIMM_CCLASS_BTT; + else if 
(guid_equal(guid, &nvdimm_btt2_guid)) + return NVDIMM_CCLASS_BTT2; + else if (guid_equal(guid, &nvdimm_pfn_guid)) + return NVDIMM_CCLASS_PFN; + else if (guid_equal(guid, &nvdimm_dax_guid)) + return NVDIMM_CCLASS_DAX; + else if (guid_equal(guid, &guid_null)) + return NVDIMM_CCLASS_NONE; + + return NVDIMM_CCLASS_UNKNOWN; +} + +/* CXL labels store UUIDs instead of GUIDs for the same data */ +static enum nvdimm_claim_class uuid_to_nvdimm_cclass(uuid_t *uuid) +{ + if (uuid_equal(uuid, &nvdimm_btt_uuid)) + return NVDIMM_CCLASS_BTT; + else if (uuid_equal(uuid, &nvdimm_btt2_uuid)) + return NVDIMM_CCLASS_BTT2; + else if (uuid_equal(uuid, &nvdimm_pfn_uuid)) + return NVDIMM_CCLASS_PFN; + else if (uuid_equal(uuid, &nvdimm_dax_uuid)) + return NVDIMM_CCLASS_DAX; + else if (uuid_equal(uuid, &uuid_null)) + return NVDIMM_CCLASS_NONE; + + return NVDIMM_CCLASS_UNKNOWN; +} + +static const guid_t *to_abstraction_guid(enum nvdimm_claim_class claim_class, + guid_t *target) +{ + if (claim_class == NVDIMM_CCLASS_BTT) + return &nvdimm_btt_guid; + else if (claim_class == NVDIMM_CCLASS_BTT2) + return &nvdimm_btt2_guid; + else if (claim_class == NVDIMM_CCLASS_PFN) + return &nvdimm_pfn_guid; + else if (claim_class == NVDIMM_CCLASS_DAX) + return &nvdimm_dax_guid; + else if (claim_class == NVDIMM_CCLASS_UNKNOWN) { + /* + * If we're modifying a namespace for which we don't + * know the claim_class, don't touch the existing guid. + */ + return target; + } else + return &guid_null; +} + +/* CXL labels store UUIDs instead of GUIDs for the same data */ +static const uuid_t *to_abstraction_uuid(enum nvdimm_claim_class claim_class, + uuid_t *target) +{ + if (claim_class == NVDIMM_CCLASS_BTT) + return &nvdimm_btt_uuid; + else if (claim_class == NVDIMM_CCLASS_BTT2) + return &nvdimm_btt2_uuid; + else if (claim_class == NVDIMM_CCLASS_PFN) + return &nvdimm_pfn_uuid; + else if (claim_class == NVDIMM_CCLASS_DAX) + return &nvdimm_dax_uuid; + else if (claim_class == NVDIMM_CCLASS_UNKNOWN) { + /* + * If we're modifying a namespace for which we don't + * know the claim_class, don't touch the existing uuid. 
+ */ + return target; + } else + return &uuid_null; +} + +static void reap_victim(struct nd_mapping *nd_mapping, + struct nd_label_ent *victim) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + u32 slot = to_slot(ndd, victim->label); + + dev_dbg(ndd->dev, "free: %d\n", slot); + nd_label_free_slot(ndd, slot); + victim->label = NULL; +} + +static void nsl_set_type_guid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, guid_t *guid) +{ + if (efi_namespace_label_has(ndd, type_guid)) + guid_copy(&nd_label->efi.type_guid, guid); +} + +bool nsl_validate_type_guid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, guid_t *guid) +{ + if (ndd->cxl || !efi_namespace_label_has(ndd, type_guid)) + return true; + if (!guid_equal(&nd_label->efi.type_guid, guid)) { + dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n", guid, + &nd_label->efi.type_guid); + return false; + } + return true; +} + +static void nsl_set_claim_class(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + enum nvdimm_claim_class claim_class) +{ + if (ndd->cxl) { + uuid_t uuid; + + import_uuid(&uuid, nd_label->cxl.abstraction_uuid); + export_uuid(nd_label->cxl.abstraction_uuid, + to_abstraction_uuid(claim_class, &uuid)); + return; + } + + if (!efi_namespace_label_has(ndd, abstraction_guid)) + return; + guid_copy(&nd_label->efi.abstraction_guid, + to_abstraction_guid(claim_class, + &nd_label->efi.abstraction_guid)); +} + +enum nvdimm_claim_class nsl_get_claim_class(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) { + uuid_t uuid; + + import_uuid(&uuid, nd_label->cxl.abstraction_uuid); + return uuid_to_nvdimm_cclass(&uuid); + } + if (!efi_namespace_label_has(ndd, abstraction_guid)) + return NVDIMM_CCLASS_NONE; + return guid_to_nvdimm_cclass(&nd_label->efi.abstraction_guid); +} + +static int __pmem_label_update(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm, + int pos, unsigned long flags) +{ + struct nd_namespace_common *ndns = &nspm->nsio.common; + struct nd_interleave_set *nd_set = nd_region->nd_set; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + struct nd_label_ent *label_ent; + struct nd_label_id label_id; + struct resource *res; + unsigned long *free; + u32 nslot, slot; + size_t offset; + u64 cookie; + int rc; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return -ENXIO; + + cookie = nd_region_interleave_set_cookie(nd_region, nsindex); + nd_label_gen_id(&label_id, nspm->uuid, 0); + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + break; + + if (!res) { + WARN_ON_ONCE(1); + return -ENXIO; + } + + /* allocate and write the label to the staging (next) index */ + slot = nd_label_alloc_slot(ndd); + if (slot == UINT_MAX) + return -ENXIO; + dev_dbg(ndd->dev, "allocated: %d\n", slot); + + nd_label = to_label(ndd, slot); + memset(nd_label, 0, sizeof_namespace_label(ndd)); + nsl_set_uuid(ndd, nd_label, nspm->uuid); + nsl_set_name(ndd, nd_label, nspm->alt_name); + nsl_set_flags(ndd, nd_label, flags); + nsl_set_nlabel(ndd, nd_label, nd_region->ndr_mappings); + nsl_set_nrange(ndd, nd_label, 1); + nsl_set_position(ndd, nd_label, pos); + nsl_set_isetcookie(ndd, nd_label, cookie); + nsl_set_rawsize(ndd, nd_label, resource_size(res)); + nsl_set_lbasize(ndd, nd_label, nspm->lbasize); + nsl_set_dpa(ndd, nd_label, res->start); + nsl_set_slot(ndd, nd_label, slot); + nsl_set_type_guid(ndd, nd_label, 
&nd_set->type_guid); + nsl_set_claim_class(ndd, nd_label, ndns->claim_class); + nsl_calculate_checksum(ndd, nd_label); + nd_dbg_dpa(nd_region, ndd, res, "\n"); + + /* update label */ + offset = nd_label_offset(ndd, nd_label); + rc = nvdimm_set_config_data(ndd, offset, nd_label, + sizeof_namespace_label(ndd)); + if (rc < 0) + return rc; + + /* Garbage collect the previous label */ + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + if (!label_ent->label) + continue; + if (test_and_clear_bit(ND_LABEL_REAP, &label_ent->flags) || + nsl_uuid_equal(ndd, label_ent->label, nspm->uuid)) + reap_victim(nd_mapping, label_ent); + } + + /* update index */ + rc = nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); + if (rc == 0) { + list_for_each_entry(label_ent, &nd_mapping->labels, list) + if (!label_ent->label) { + label_ent->label = nd_label; + nd_label = NULL; + break; + } + dev_WARN_ONCE(&nspm->nsio.common.dev, nd_label, + "failed to track label: %d\n", + to_slot(ndd, nd_label)); + if (nd_label) + rc = -ENXIO; + } + mutex_unlock(&nd_mapping->lock); + + return rc; +} + +static int init_labels(struct nd_mapping *nd_mapping, int num_labels) +{ + int i, old_num_labels = 0; + struct nd_label_ent *label_ent; + struct nd_namespace_index *nsindex; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) + old_num_labels++; + mutex_unlock(&nd_mapping->lock); + + /* + * We need to preserve all the old labels for the mapping so + * they can be garbage collected after writing the new labels. + */ + for (i = old_num_labels; i < num_labels; i++) { + label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL); + if (!label_ent) + return -ENOMEM; + mutex_lock(&nd_mapping->lock); + list_add_tail(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); + } + + if (ndd->ns_current == -1 || ndd->ns_next == -1) + /* pass */; + else + return max(num_labels, old_num_labels); + + nsindex = to_namespace_index(ndd, 0); + memset(nsindex, 0, ndd->nsarea.config_size); + for (i = 0; i < 2; i++) { + int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT); + + if (rc) + return rc; + } + ndd->ns_next = 1; + ndd->ns_current = 0; + + return max(num_labels, old_num_labels); +} + +static int del_labels(struct nd_mapping *nd_mapping, uuid_t *uuid) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_ent *label_ent, *e; + struct nd_namespace_index *nsindex; + unsigned long *free; + LIST_HEAD(list); + u32 nslot, slot; + int active = 0; + + if (!uuid) + return 0; + + /* no index || no labels == nothing to delete */ + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return 0; + + mutex_lock(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + + if (!nd_label) + continue; + active++; + if (!nsl_uuid_equal(ndd, nd_label, uuid)) + continue; + active--; + slot = to_slot(ndd, nd_label); + nd_label_free_slot(ndd, slot); + dev_dbg(ndd->dev, "free: %d\n", slot); + list_move_tail(&label_ent->list, &list); + label_ent->label = NULL; + } + list_splice_tail_init(&list, &nd_mapping->labels); + + if (active == 0) { + nd_mapping_free_labels(nd_mapping); + dev_dbg(ndd->dev, "no more active labels\n"); + } + mutex_unlock(&nd_mapping->lock); + + return nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); +} + +int 
nd_pmem_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size) +{ + int i, rc; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + int count = 0; + + if (size == 0) { + rc = del_labels(nd_mapping, nspm->uuid); + if (rc) + return rc; + continue; + } + + for_each_dpa_resource(ndd, res) + if (strncmp(res->name, "pmem", 4) == 0) + count++; + WARN_ON_ONCE(!count); + + rc = init_labels(nd_mapping, count); + if (rc < 0) + return rc; + + rc = __pmem_label_update(nd_region, nd_mapping, nspm, i, + NSLABEL_FLAG_UPDATING); + if (rc) + return rc; + } + + if (size == 0) + return 0; + + /* Clear the UPDATING flag per UEFI 2.7 expectations */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + rc = __pmem_label_update(nd_region, nd_mapping, nspm, i, 0); + if (rc) + return rc; + } + + return 0; +} + +int __init nd_label_init(void) +{ + WARN_ON(guid_parse(NVDIMM_BTT_GUID, &nvdimm_btt_guid)); + WARN_ON(guid_parse(NVDIMM_BTT2_GUID, &nvdimm_btt2_guid)); + WARN_ON(guid_parse(NVDIMM_PFN_GUID, &nvdimm_pfn_guid)); + WARN_ON(guid_parse(NVDIMM_DAX_GUID, &nvdimm_dax_guid)); + + WARN_ON(uuid_parse(NVDIMM_BTT_GUID, &nvdimm_btt_uuid)); + WARN_ON(uuid_parse(NVDIMM_BTT2_GUID, &nvdimm_btt2_uuid)); + WARN_ON(uuid_parse(NVDIMM_PFN_GUID, &nvdimm_pfn_uuid)); + WARN_ON(uuid_parse(NVDIMM_DAX_GUID, &nvdimm_dax_uuid)); + + WARN_ON(uuid_parse(CXL_REGION_UUID, &cxl_region_uuid)); + WARN_ON(uuid_parse(CXL_NAMESPACE_UUID, &cxl_namespace_uuid)); + + return 0; +} diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h new file mode 100644 index 0000000000..0650fb4b98 --- /dev/null +++ b/drivers/nvdimm/label.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + */ +#ifndef __LABEL_H__ +#define __LABEL_H__ + +#include <linux/ndctl.h> +#include <linux/sizes.h> +#include <linux/uuid.h> +#include <linux/io.h> + +enum { + NSINDEX_SIG_LEN = 16, + NSINDEX_ALIGN = 256, + NSINDEX_SEQ_MASK = 0x3, + NSLABEL_UUID_LEN = 16, + NSLABEL_NAME_LEN = 64, + NSLABEL_FLAG_ROLABEL = 0x1, /* read-only label */ + NSLABEL_FLAG_LOCAL = 0x2, /* DIMM-local namespace */ + NSLABEL_FLAG_BTT = 0x4, /* namespace contains a BTT */ + NSLABEL_FLAG_UPDATING = 0x8, /* label being updated */ + BTT_ALIGN = 4096, /* all btt structures */ + BTTINFO_SIG_LEN = 16, + BTTINFO_UUID_LEN = 16, + BTTINFO_FLAG_ERROR = 0x1, /* error state (read-only) */ + BTTINFO_MAJOR_VERSION = 1, + ND_LABEL_MIN_SIZE = 256 * 4, /* see sizeof_namespace_index() */ + ND_LABEL_ID_SIZE = 50, + ND_NSINDEX_INIT = 0x1, +}; + +/** + * struct nd_namespace_index - label set superblock + * @sig: NAMESPACE_INDEX\0 + * @flags: placeholder + * @labelsize: log2 size (v1 labels 128 bytes v2 labels 256 bytes) + * @seq: sequence number for this index + * @myoff: offset of this index in label area + * @mysize: size of this index struct + * @otheroff: offset of other index + * @labeloff: offset of first label slot + * @nslot: total number of label slots + * @major: label area major version + * @minor: label area minor version + * @checksum: fletcher64 of all fields + * @free: bitmap, nlabel bits + * + * The size of free[] is rounded up so the total struct size is a + * multiple of NSINDEX_ALIGN bytes. Any bits this allocates beyond + * nlabel bits must be zero. 
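+ *
+ * For instance (illustrative numbers only): the fixed fields above
+ * occupy 72 bytes, so an index covering 1024 label slots carries a
+ * 128-byte free bitmap and the 200-byte total rounds up to 256 bytes
+ * (NSINDEX_ALIGN).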
+ */ +struct nd_namespace_index { + u8 sig[NSINDEX_SIG_LEN]; + u8 flags[3]; + u8 labelsize; + __le32 seq; + __le64 myoff; + __le64 mysize; + __le64 otheroff; + __le64 labeloff; + __le32 nslot; + __le16 major; + __le16 minor; + __le64 checksum; + u8 free[]; +}; + +/** + * struct cxl_region_label - CXL 2.0 Table 211 + * @type: uuid identifying this label format (region) + * @uuid: uuid for the region this label describes + * @flags: NSLABEL_FLAG_UPDATING (all other flags reserved) + * @nlabel: 1 per interleave-way in the region + * @position: this label's position in the set + * @dpa: start address in device-local capacity for this label + * @rawsize: size of this label's contribution to region + * @hpa: mandatory system physical address to map this region + * @slot: slot id of this label in label area + * @ig: interleave granularity (1 << @ig) * 256 bytes + * @align: alignment in SZ_256M blocks + * @reserved: reserved + * @checksum: fletcher64 sum of this label + */ +struct cxl_region_label { + u8 type[NSLABEL_UUID_LEN]; + u8 uuid[NSLABEL_UUID_LEN]; + __le32 flags; + __le16 nlabel; + __le16 position; + __le64 dpa; + __le64 rawsize; + __le64 hpa; + __le32 slot; + __le32 ig; + __le32 align; + u8 reserved[0xac]; + __le64 checksum; +}; + +/** + * struct nvdimm_efi_label - namespace superblock + * @uuid: UUID per RFC 4122 + * @name: optional name (NULL-terminated) + * @flags: see NSLABEL_FLAG_* + * @nlabel: num labels to describe this ns + * @position: labels position in set + * @isetcookie: interleave set cookie + * @lbasize: LBA size in bytes or 0 for pmem + * @dpa: DPA of NVM range on this DIMM + * @rawsize: size of namespace + * @slot: slot of this label in label area + * @align: physical address alignment of the namespace + * @reserved: reserved + * @type_guid: copy of struct acpi_nfit_system_address.range_guid + * @abstraction_guid: personality id (btt, btt2, fsdax, devdax....) + * @reserved2: reserved + * @checksum: fletcher64 sum of this object + */ +struct nvdimm_efi_label { + u8 uuid[NSLABEL_UUID_LEN]; + u8 name[NSLABEL_NAME_LEN]; + __le32 flags; + __le16 nlabel; + __le16 position; + __le64 isetcookie; + __le64 lbasize; + __le64 dpa; + __le64 rawsize; + __le32 slot; + /* + * Accessing fields past this point should be gated by a + * efi_namespace_label_has() check. 
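+ * (i.e. the fields below only exist in the larger 256-byte v1.2
+ * label layout and are absent from 128-byte v1.1 labels.)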
+ */ + u8 align; + u8 reserved[3]; + guid_t type_guid; + guid_t abstraction_guid; + u8 reserved2[88]; + __le64 checksum; +}; + +/** + * struct nvdimm_cxl_label - CXL 2.0 Table 212 + * @type: uuid identifying this label format (namespace) + * @uuid: uuid for the namespace this label describes + * @name: friendly name for the namespace + * @flags: NSLABEL_FLAG_UPDATING (all other flags reserved) + * @nrange: discontiguous namespace support + * @position: this label's position in the set + * @dpa: start address in device-local capacity for this label + * @rawsize: size of this label's contribution to namespace + * @slot: slot id of this label in label area + * @align: alignment in SZ_256M blocks + * @region_uuid: host interleave set identifier + * @abstraction_uuid: personality driver for this namespace + * @lbasize: address geometry for disk-like personalities + * @reserved: reserved + * @checksum: fletcher64 sum of this label + */ +struct nvdimm_cxl_label { + u8 type[NSLABEL_UUID_LEN]; + u8 uuid[NSLABEL_UUID_LEN]; + u8 name[NSLABEL_NAME_LEN]; + __le32 flags; + __le16 nrange; + __le16 position; + __le64 dpa; + __le64 rawsize; + __le32 slot; + __le32 align; + u8 region_uuid[16]; + u8 abstraction_uuid[16]; + __le16 lbasize; + u8 reserved[0x56]; + __le64 checksum; +}; + +struct nd_namespace_label { + union { + struct nvdimm_cxl_label cxl; + struct nvdimm_efi_label efi; + }; +}; + +#define NVDIMM_BTT_GUID "8aed63a2-29a2-4c66-8b12-f05d15d3922a" +#define NVDIMM_BTT2_GUID "18633bfc-1735-4217-8ac9-17239282d3f8" +#define NVDIMM_PFN_GUID "266400ba-fb9f-4677-bcb0-968f11d0d225" +#define NVDIMM_DAX_GUID "97a86d9c-3cdd-4eda-986f-5068b4f80088" + +#define CXL_REGION_UUID "529d7c61-da07-47c4-a93f-ecdf2c06f444" +#define CXL_NAMESPACE_UUID "68bb2c0a-5a77-4937-9f85-3caf41a0f93c" + +/** + * struct nd_label_id - identifier string for dpa allocation + * @id: "pmem-<namespace uuid>" + */ +struct nd_label_id { + char id[ND_LABEL_ID_SIZE]; +}; + +/* + * If the 'best' index is invalid, so is the 'next' index. Otherwise, + * the next index is MOD(index+1, 2) + */ +static inline int nd_label_next_nsindex(int index) +{ + if (index < 0) + return -1; + + return (index + 1) % 2; +} + +struct nvdimm_drvdata; +int nd_label_data_init(struct nvdimm_drvdata *ndd); +size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd); +int nd_label_active_count(struct nvdimm_drvdata *ndd); +struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n); +u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd); +bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot); +u32 nd_label_nfree(struct nvdimm_drvdata *ndd); +struct nd_region; +struct nd_namespace_pmem; +int nd_pmem_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size); +#endif /* __LABEL_H__ */ diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c new file mode 100644 index 0000000000..07177eadc5 --- /dev/null +++ b/drivers/nvdimm/namespace_devs.c @@ -0,0 +1,2230 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#include <linux/kstrtox.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/sort.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "pmem.h" +#include "pfn.h" +#include "nd.h" + +static void namespace_io_release(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + kfree(nsio); +} + +static void namespace_pmem_release(struct device *dev) +{ + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + + if (nspm->id >= 0) + ida_simple_remove(&nd_region->ns_ida, nspm->id); + kfree(nspm->alt_name); + kfree(nspm->uuid); + kfree(nspm); +} + +static bool is_namespace_pmem(const struct device *dev); +static bool is_namespace_io(const struct device *dev); + +static int is_uuid_busy(struct device *dev, void *data) +{ + uuid_t *uuid1 = data, *uuid2 = NULL; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid2 = nspm->uuid; + } else if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + uuid2 = nd_btt->uuid; + } else if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + uuid2 = nd_pfn->uuid; + } + + if (uuid2 && uuid_equal(uuid1, uuid2)) + return -EBUSY; + + return 0; +} + +static int is_namespace_uuid_busy(struct device *dev, void *data) +{ + if (is_nd_region(dev)) + return device_for_each_child(dev, data, is_uuid_busy); + return 0; +} + +/** + * nd_is_uuid_unique - verify that no other namespace has @uuid + * @dev: any device on a nvdimm_bus + * @uuid: uuid to check + */ +bool nd_is_uuid_unique(struct device *dev, uuid_t *uuid) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev)); + if (device_for_each_child(&nvdimm_bus->dev, uuid, + is_namespace_uuid_busy) != 0) + return false; + return true; +} + +bool pmem_should_map_pages(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_namespace_common *ndns = to_ndns(dev); + struct nd_namespace_io *nsio; + + if (!IS_ENABLED(CONFIG_ZONE_DEVICE)) + return false; + + if (!test_bit(ND_REGION_PAGEMAP, &nd_region->flags)) + return false; + + if (is_nd_pfn(dev) || is_nd_btt(dev)) + return false; + + if (ndns->force_raw) + return false; + + nsio = to_nd_namespace_io(dev); + if (region_intersects(nsio->res.start, resource_size(&nsio->res), + IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE) == REGION_MIXED) + return false; + + return ARCH_MEMREMAP_PMEM == MEMREMAP_WB; +} +EXPORT_SYMBOL(pmem_should_map_pages); + +unsigned int pmem_sector_size(struct nd_namespace_common *ndns) +{ + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + if (nspm->lbasize == 0 || nspm->lbasize == 512) + /* default */; + else if (nspm->lbasize == 4096) + return 4096; + else + dev_WARN(&ndns->dev, "unsupported sector size: %ld\n", + nspm->lbasize); + } + + /* + * There is no namespace label (is_namespace_io()), or the label + * indicates the default sector size. 
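+ * (An unsupported label-advertised size hits the dev_WARN above and
+ * likewise falls back to 512.)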
+ */ + return 512; +} +EXPORT_SYMBOL(pmem_sector_size); + +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name) +{ + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + const char *suffix = NULL; + + if (ndns->claim && is_nd_btt(ndns->claim)) + suffix = "s"; + + if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) { + int nsidx = 0; + + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + nsidx = nspm->id; + } + + if (nsidx) + sprintf(name, "pmem%d.%d%s", nd_region->id, nsidx, + suffix ? suffix : ""); + else + sprintf(name, "pmem%d%s", nd_region->id, + suffix ? suffix : ""); + } else { + return NULL; + } + + return name; +} +EXPORT_SYMBOL(nvdimm_namespace_disk_name); + +const uuid_t *nd_dev_to_uuid(struct device *dev) +{ + if (dev && is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return nspm->uuid; + } + return &uuid_null; +} +EXPORT_SYMBOL(nd_dev_to_uuid); + +static ssize_t nstype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + + return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region)); +} +static DEVICE_ATTR_RO(nstype); + +static ssize_t __alt_name_store(struct device *dev, const char *buf, + const size_t len) +{ + char *input, *pos, *alt_name, **ns_altname; + ssize_t rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_altname = &nspm->alt_name; + } else + return -ENXIO; + + if (dev->driver || to_ndns(dev)->claim) + return -EBUSY; + + input = kstrndup(buf, len, GFP_KERNEL); + if (!input) + return -ENOMEM; + + pos = strim(input); + if (strlen(pos) + 1 > NSLABEL_NAME_LEN) { + rc = -EINVAL; + goto out; + } + + alt_name = kzalloc(NSLABEL_NAME_LEN, GFP_KERNEL); + if (!alt_name) { + rc = -ENOMEM; + goto out; + } + kfree(*ns_altname); + *ns_altname = alt_name; + sprintf(*ns_altname, "%s", pos); + rc = len; + +out: + kfree(input); + return rc; +} + +static int nd_namespace_label_update(struct nd_region *nd_region, + struct device *dev) +{ + dev_WARN_ONCE(dev, dev->driver || to_ndns(dev)->claim, + "namespace must be idle during label update\n"); + if (dev->driver || to_ndns(dev)->claim) + return 0; + + /* + * Only allow label writes that will result in a valid namespace + * or deletion of an existing namespace. + */ + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + resource_size_t size = resource_size(&nspm->nsio.res); + + if (size == 0 && nspm->uuid) + /* delete allocation */; + else if (!nspm->uuid) + return 0; + + return nd_pmem_namespace_label_update(nd_region, nspm, size); + } else + return -ENXIO; +} + +static ssize_t alt_name_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __alt_name_store(dev, buf, len); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + dev_dbg(dev, "%s(%zd)\n", rc < 0 ? "fail " : "", rc); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? 
rc : len; +} + +static ssize_t alt_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + char *ns_altname; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_altname = nspm->alt_name; + } else + return -ENXIO; + + return sprintf(buf, "%s\n", ns_altname ? ns_altname : ""); +} +static DEVICE_ATTR_RW(alt_name); + +static int scan_free(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id, + resource_size_t n) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + int rc = 0; + + while (n) { + struct resource *res, *last; + + last = NULL; + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id->id) == 0) + last = res; + res = last; + if (!res) + return 0; + + if (n >= resource_size(res)) { + n -= resource_size(res); + nd_dbg_dpa(nd_region, ndd, res, "delete %d\n", rc); + nvdimm_free_dpa(ndd, res); + /* retry with last resource deleted */ + continue; + } + + rc = adjust_resource(res, res->start, resource_size(res) - n); + if (rc == 0) + res->flags |= DPA_RESOURCE_ADJUSTED; + nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc); + break; + } + + return rc; +} + +/** + * shrink_dpa_allocation - for each dimm in region free n bytes for label_id + * @nd_region: the set of dimms to reclaim @n bytes from + * @label_id: unique identifier for the namespace consuming this dpa range + * @n: number of bytes per-dimm to release + * + * Assumes resources are ordered. Starting from the end try to + * adjust_resource() the allocation to @n, but if @n is larger than the + * allocation delete it and find the 'new' last allocation in the label + * set. + */ +static int shrink_dpa_allocation(struct nd_region *nd_region, + struct nd_label_id *label_id, resource_size_t n) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + int rc; + + rc = scan_free(nd_region, nd_mapping, label_id, n); + if (rc) + return rc; + } + + return 0; +} + +static resource_size_t init_dpa_allocation(struct nd_label_id *label_id, + struct nd_region *nd_region, struct nd_mapping *nd_mapping, + resource_size_t n) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + int rc = 0; + + /* first resource allocation for this label-id or dimm */ + res = nvdimm_allocate_dpa(ndd, label_id, nd_mapping->start, n); + if (!res) + rc = -EBUSY; + + nd_dbg_dpa(nd_region, ndd, res, "init %d\n", rc); + return rc ? n : 0; +} + + +/** + * space_valid() - validate free dpa space against constraints + * @nd_region: hosting region of the free space + * @ndd: dimm device data for debug + * @label_id: namespace id to allocate space + * @prev: potential allocation that precedes free space + * @next: allocation that follows the given free space range + * @exist: first allocation with same id in the mapping + * @n: range that must satisfied for pmem allocations + * @valid: free space range to validate + * + * BLK-space is valid as long as it does not precede a PMEM + * allocation in a given region. PMEM-space must be contiguous + * and adjacent to an existing allocation (if one + * exists). If reserving PMEM any space is valid. 
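+ *
+ * Illustrative example (values invented for this note): in a region
+ * interleaved across 4 DIMMs with a 16M region alignment, each
+ * per-DIMM free range is first trimmed to 16M / 4 = 4M boundaries
+ * before the contiguity checks above are applied.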
+ */ +static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, struct resource *prev, + struct resource *next, struct resource *exist, + resource_size_t n, struct resource *valid) +{ + bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; + unsigned long align; + + align = nd_region->align / nd_region->ndr_mappings; + valid->start = ALIGN(valid->start, align); + valid->end = ALIGN_DOWN(valid->end + 1, align) - 1; + + if (valid->start >= valid->end) + goto invalid; + + if (is_reserve) + return; + + /* allocation needs to be contiguous, so this is all or nothing */ + if (resource_size(valid) < n) + goto invalid; + + /* we've got all the space we need and no existing allocation */ + if (!exist) + return; + + /* allocation needs to be contiguous with the existing namespace */ + if (valid->start == exist->end + 1 + || valid->end == exist->start - 1) + return; + + invalid: + /* truncate @valid size to 0 */ + valid->end = valid->start - 1; +} + +enum alloc_loc { + ALLOC_ERR = 0, ALLOC_BEFORE, ALLOC_MID, ALLOC_AFTER, +}; + +static resource_size_t scan_allocate(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id, + resource_size_t n) +{ + resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res, *exist = NULL, valid; + const resource_size_t to_allocate = n; + int first; + + for_each_dpa_resource(ndd, res) + if (strcmp(label_id->id, res->name) == 0) + exist = res; + + valid.start = nd_mapping->start; + valid.end = mapping_end; + valid.name = "free space"; + retry: + first = 0; + for_each_dpa_resource(ndd, res) { + struct resource *next = res->sibling, *new_res = NULL; + resource_size_t allocate, available = 0; + enum alloc_loc loc = ALLOC_ERR; + const char *action; + int rc = 0; + + /* ignore resources outside this nd_mapping */ + if (res->start > mapping_end) + continue; + if (res->end < nd_mapping->start) + continue; + + /* space at the beginning of the mapping */ + if (!first++ && res->start > nd_mapping->start) { + valid.start = nd_mapping->start; + valid.end = res->start - 1; + space_valid(nd_region, ndd, label_id, NULL, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) + loc = ALLOC_BEFORE; + } + + /* space between allocations */ + if (!loc && next) { + valid.start = res->start + resource_size(res); + valid.end = min(mapping_end, next->start - 1); + space_valid(nd_region, ndd, label_id, res, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) + loc = ALLOC_MID; + } + + /* space at the end of the mapping */ + if (!loc && !next) { + valid.start = res->start + resource_size(res); + valid.end = mapping_end; + space_valid(nd_region, ndd, label_id, res, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) + loc = ALLOC_AFTER; + } + + if (!loc || !available) + continue; + allocate = min(available, n); + switch (loc) { + case ALLOC_BEFORE: + if (strcmp(res->name, label_id->id) == 0) { + /* adjust current resource up */ + rc = adjust_resource(res, res->start - allocate, + resource_size(res) + allocate); + action = "cur grow up"; + } else + action = "allocate"; + break; + case ALLOC_MID: + if (strcmp(next->name, label_id->id) == 0) { + /* adjust next resource up */ + rc = adjust_resource(next, next->start + - allocate, resource_size(next) + + allocate); + new_res = next; + action = "next grow up"; + 
} else if (strcmp(res->name, label_id->id) == 0) { + action = "grow down"; + } else + action = "allocate"; + break; + case ALLOC_AFTER: + if (strcmp(res->name, label_id->id) == 0) + action = "grow down"; + else + action = "allocate"; + break; + default: + return n; + } + + if (strcmp(action, "allocate") == 0) { + new_res = nvdimm_allocate_dpa(ndd, label_id, + valid.start, allocate); + if (!new_res) + rc = -EBUSY; + } else if (strcmp(action, "grow down") == 0) { + /* adjust current resource down */ + rc = adjust_resource(res, res->start, resource_size(res) + + allocate); + if (rc == 0) + res->flags |= DPA_RESOURCE_ADJUSTED; + } + + if (!new_res) + new_res = res; + + nd_dbg_dpa(nd_region, ndd, new_res, "%s(%d) %d\n", + action, loc, rc); + + if (rc) + return n; + + n -= allocate; + if (n) { + /* + * Retry scan with newly inserted resources. + * For example, if we did an ALLOC_BEFORE + * insertion there may also have been space + * available for an ALLOC_AFTER insertion, so we + * need to check this same resource again + */ + goto retry; + } else + return 0; + } + + if (n == to_allocate) + return init_dpa_allocation(label_id, nd_region, nd_mapping, n); + return n; +} + +static int merge_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + if (strncmp("pmem", label_id->id, 4) == 0) + return 0; + retry: + for_each_dpa_resource(ndd, res) { + int rc; + struct resource *next = res->sibling; + resource_size_t end = res->start + resource_size(res); + + if (!next || strcmp(res->name, label_id->id) != 0 + || strcmp(next->name, label_id->id) != 0 + || end != next->start) + continue; + end += resource_size(next); + nvdimm_free_dpa(ndd, next); + rc = adjust_resource(res, res->start, end - res->start); + nd_dbg_dpa(nd_region, ndd, res, "merge %d\n", rc); + if (rc) + return rc; + res->flags |= DPA_RESOURCE_ADJUSTED; + goto retry; + } + + return 0; +} + +int __reserve_free_pmem(struct device *dev, void *data) +{ + struct nvdimm *nvdimm = data; + struct nd_region *nd_region; + struct nd_label_id label_id; + int i; + + if (!is_memory(dev)) + return 0; + + nd_region = to_nd_region(dev); + if (nd_region->ndr_mappings == 0) + return 0; + + memset(&label_id, 0, sizeof(label_id)); + strcat(label_id.id, "pmem-reserve"); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + resource_size_t n, rem = 0; + + if (nd_mapping->nvdimm != nvdimm) + continue; + + n = nd_pmem_available_dpa(nd_region, nd_mapping); + if (n == 0) + return 0; + rem = scan_allocate(nd_region, nd_mapping, &label_id, n); + dev_WARN_ONCE(&nd_region->dev, rem, + "pmem reserve underrun: %#llx of %#llx bytes\n", + (unsigned long long) n - rem, + (unsigned long long) n); + return rem ? -ENXIO : 0; + } + + return 0; +} + +void release_free_pmem(struct nvdimm_bus *nvdimm_bus, + struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res, *_res; + + for_each_dpa_resource_safe(ndd, res, _res) + if (strcmp(res->name, "pmem-reserve") == 0) + nvdimm_free_dpa(ndd, res); +} + +/** + * grow_dpa_allocation - for each dimm allocate n bytes for @label_id + * @nd_region: the set of dimms to allocate @n more bytes from + * @label_id: unique identifier for the namespace consuming this dpa range + * @n: number of bytes per-dimm to add to the existing allocation + * + * Assumes resources are ordered. 
For BLK regions, first consume + * BLK-only available DPA free space, then consume PMEM-aliased DPA + * space starting at the highest DPA. For PMEM regions start + * allocations from the start of an interleave set and end at the first + * BLK allocation or the end of the interleave set, whichever comes + * first. + */ +static int grow_dpa_allocation(struct nd_region *nd_region, + struct nd_label_id *label_id, resource_size_t n) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + resource_size_t rem = n; + int rc; + + rem = scan_allocate(nd_region, nd_mapping, label_id, rem); + dev_WARN_ONCE(&nd_region->dev, rem, + "allocation underrun: %#llx of %#llx bytes\n", + (unsigned long long) n - rem, + (unsigned long long) n); + if (rem) + return -ENXIO; + + rc = merge_dpa(nd_region, nd_mapping, label_id); + if (rc) + return rc; + } + + return 0; +} + +static void nd_namespace_pmem_set_resource(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size) +{ + struct resource *res = &nspm->nsio.res; + resource_size_t offset = 0; + + if (size && !nspm->uuid) { + WARN_ON_ONCE(1); + size = 0; + } + + if (size && nspm->uuid) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_id label_id; + struct resource *res; + + if (!ndd) { + size = 0; + goto out; + } + + nd_label_gen_id(&label_id, nspm->uuid, 0); + + /* calculate a spa offset from the dpa allocation offset */ + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) { + offset = (res->start - nd_mapping->start) + * nd_region->ndr_mappings; + goto out; + } + + WARN_ON_ONCE(1); + size = 0; + } + + out: + res->start = nd_region->ndr_start + offset; + res->end = res->start + size - 1; +} + +static bool uuid_not_set(const uuid_t *uuid, struct device *dev, + const char *where) +{ + if (!uuid) { + dev_dbg(dev, "%s: uuid not set\n", where); + return true; + } + return false; +} + +static ssize_t __size_store(struct device *dev, unsigned long long val) +{ + resource_size_t allocated = 0, available = 0; + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_namespace_common *ndns = to_ndns(dev); + struct nd_mapping *nd_mapping; + struct nvdimm_drvdata *ndd; + struct nd_label_id label_id; + u32 flags = 0, remainder; + int rc, i, id = -1; + uuid_t *uuid = NULL; + + if (dev->driver || ndns->claim) + return -EBUSY; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + id = nspm->id; + } + + /* + * We need a uuid for the allocation-label and dimm(s) on which + * to store the label. + */ + if (uuid_not_set(uuid, dev, __func__)) + return -ENXIO; + if (nd_region->ndr_mappings == 0) { + dev_dbg(dev, "not associated with dimm(s)\n"); + return -ENXIO; + } + + div_u64_rem(val, nd_region->align, &remainder); + if (remainder) { + dev_dbg(dev, "%llu is not %ldK aligned\n", val, + nd_region->align / SZ_1K); + return -EINVAL; + } + + nd_label_gen_id(&label_id, uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + ndd = to_ndd(nd_mapping); + + /* + * All dimms in an interleave set, need to be enabled + * for the size to be changed. 
+ */ + if (!ndd) + return -ENXIO; + + allocated += nvdimm_allocated_dpa(ndd, &label_id); + } + available = nd_region_allocatable_dpa(nd_region); + + if (val > available + allocated) + return -ENOSPC; + + if (val == allocated) + return 0; + + val = div_u64(val, nd_region->ndr_mappings); + allocated = div_u64(allocated, nd_region->ndr_mappings); + if (val < allocated) + rc = shrink_dpa_allocation(nd_region, &label_id, + allocated - val); + else + rc = grow_dpa_allocation(nd_region, &label_id, val - allocated); + + if (rc) + return rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + nd_namespace_pmem_set_resource(nd_region, nspm, + val * nd_region->ndr_mappings); + } + + /* + * Try to delete the namespace if we deleted all of its + * allocation, this is not the seed or 0th device for the + * region, and it is not actively claimed by a btt, pfn, or dax + * instance. + */ + if (val == 0 && id != 0 && nd_region->ns_seed != dev && !ndns->claim) + nd_device_unregister(dev, ND_ASYNC); + + return rc; +} + +static ssize_t size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + unsigned long long val; + int rc; + + rc = kstrtoull(buf, 0, &val); + if (rc) + return rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __size_store(dev, val); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + + /* setting size zero == 'delete namespace' */ + if (rc == 0 && val == 0 && is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + kfree(nspm->uuid); + nspm->uuid = NULL; + } + + dev_dbg(dev, "%llx %s (%d)\n", val, rc < 0 ? "fail" : "success", rc); + + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? 
rc : len; +} + +resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns) +{ + struct device *dev = &ndns->dev; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return resource_size(&nspm->nsio.res); + } else if (is_namespace_io(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + return resource_size(&nsio->res); + } else + WARN_ONCE(1, "unknown namespace type\n"); + return 0; +} + +resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns) +{ + resource_size_t size; + + nvdimm_bus_lock(&ndns->dev); + size = __nvdimm_namespace_capacity(ndns); + nvdimm_bus_unlock(&ndns->dev); + + return size; +} +EXPORT_SYMBOL(nvdimm_namespace_capacity); + +bool nvdimm_namespace_locked(struct nd_namespace_common *ndns) +{ + int i; + bool locked = false; + struct device *dev = &ndns->dev; + struct nd_region *nd_region = to_nd_region(dev->parent); + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (test_bit(NDD_LOCKED, &nvdimm->flags)) { + dev_dbg(dev, "%s locked\n", nvdimm_name(nvdimm)); + locked = true; + } + } + return locked; +} +EXPORT_SYMBOL(nvdimm_namespace_locked); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long) + nvdimm_namespace_capacity(to_ndns(dev))); +} +static DEVICE_ATTR(size, 0444, size_show, size_store); + +static uuid_t *namespace_to_uuid(struct device *dev) +{ + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return nspm->uuid; + } + return ERR_PTR(-ENXIO); +} + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + uuid_t *uuid = namespace_to_uuid(dev); + + if (IS_ERR(uuid)) + return PTR_ERR(uuid); + if (uuid) + return sprintf(buf, "%pUb\n", uuid); + return sprintf(buf, "\n"); +} + +/** + * namespace_update_uuid - check for a unique uuid and whether we're "renaming" + * @nd_region: parent region so we can updates all dimms in the set + * @dev: namespace type for generating label_id + * @new_uuid: incoming uuid + * @old_uuid: reference to the uuid storage location in the namespace object + */ +static int namespace_update_uuid(struct nd_region *nd_region, + struct device *dev, uuid_t *new_uuid, + uuid_t **old_uuid) +{ + struct nd_label_id old_label_id; + struct nd_label_id new_label_id; + int i; + + if (!nd_is_uuid_unique(dev, new_uuid)) + return -EINVAL; + + if (*old_uuid == NULL) + goto out; + + /* + * If we've already written a label with this uuid, then it's + * too late to rename because we can't reliably update the uuid + * without losing the old namespace. Userspace must delete this + * namespace to abandon the old uuid. + */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + /* + * This check by itself is sufficient because old_uuid + * would be NULL above if this uuid did not exist in the + * currently written set. + * + * FIXME: can we delete uuid with zero dpa allocated? 
+ */ + if (list_empty(&nd_mapping->labels)) + return -EBUSY; + } + + nd_label_gen_id(&old_label_id, *old_uuid, 0); + nd_label_gen_id(&new_label_id, new_uuid, 0); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_ent *label_ent; + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, old_label_id.id) == 0) + sprintf((void *) res->name, "%s", + new_label_id.id); + + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + struct nd_label_id label_id; + uuid_t uuid; + + if (!nd_label) + continue; + nsl_get_uuid(ndd, nd_label, &uuid); + nd_label_gen_id(&label_id, &uuid, + nsl_get_flags(ndd, nd_label)); + if (strcmp(old_label_id.id, label_id.id) == 0) + set_bit(ND_LABEL_REAP, &label_ent->flags); + } + mutex_unlock(&nd_mapping->lock); + } + kfree(*old_uuid); + out: + *old_uuid = new_uuid; + return 0; +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + uuid_t *uuid = NULL; + uuid_t **ns_uuid; + ssize_t rc = 0; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_uuid = &nspm->uuid; + } else + return -ENXIO; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + if (to_ndns(dev)->claim) + rc = -EBUSY; + if (rc >= 0) + rc = nd_uuid_store(dev, &uuid, buf, len); + if (rc >= 0) + rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + else + kfree(uuid); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? 
rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct resource *res; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + res = &nspm->nsio.res; + } else if (is_namespace_io(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + res = &nsio->res; + } else + return -ENXIO; + + /* no address to convey if the namespace has no allocation */ + if (resource_size(res) == 0) + return -ENXIO; + return sprintf(buf, "%#llx\n", (unsigned long long) res->start); +} +static DEVICE_ATTR_ADMIN_RO(resource); + +static const unsigned long pmem_lbasize_supported[] = { 512, 4096, 0 }; + +static ssize_t sector_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return nd_size_select_show(nspm->lbasize, + pmem_lbasize_supported, buf); + } + return -ENXIO; +} + +static ssize_t sector_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + const unsigned long *supported; + unsigned long *lbasize; + ssize_t rc = 0; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + lbasize = &nspm->lbasize; + supported = pmem_lbasize_supported; + } else + return -ENXIO; + + device_lock(dev); + nvdimm_bus_lock(dev); + if (to_ndns(dev)->claim) + rc = -EBUSY; + if (rc >= 0) + rc = nd_size_select_store(dev, buf, lbasize, supported); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + dev_dbg(dev, "result: %zd %s: %s%s", rc, rc < 0 ? "tried" : "wrote", + buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(sector_size); + +static ssize_t dpa_extents_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_label_id label_id; + uuid_t *uuid = NULL; + int count = 0, i; + u32 flags = 0; + + nvdimm_bus_lock(dev); + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + flags = 0; + } + + if (!uuid) + goto out; + + nd_label_gen_id(&label_id, uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + count++; + } + out: + nvdimm_bus_unlock(dev); + + return sprintf(buf, "%d\n", count); +} +static DEVICE_ATTR_RO(dpa_extents); + +static int btt_claim_class(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + int i, loop_bitmask = 0; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_index *nsindex; + + /* + * If any of the DIMMs do not support labels the only + * possible BTT format is v1. 
+ */ + if (!ndd) { + loop_bitmask = 0; + break; + } + + nsindex = to_namespace_index(ndd, ndd->ns_current); + if (nsindex == NULL) + loop_bitmask |= 1; + else { + /* check whether existing labels are v1.1 or v1.2 */ + if (__le16_to_cpu(nsindex->major) == 1 + && __le16_to_cpu(nsindex->minor) == 1) + loop_bitmask |= 2; + else + loop_bitmask |= 4; + } + } + /* + * If nsindex is null loop_bitmask's bit 0 will be set, and if an index + * block is found, a v1.1 label for any mapping will set bit 1, and a + * v1.2 label will set bit 2. + * + * At the end of the loop, at most one of the three bits must be set. + * If multiple bits were set, it means the different mappings disagree + * about their labels, and this must be cleaned up first. + * + * If all the label index blocks are found to agree, nsindex of NULL + * implies labels haven't been initialized yet, and when they will, + * they will be of the 1.2 format, so we can assume BTT2.0 + * + * If 1.1 labels are found, we enforce BTT1.1, and if 1.2 labels are + * found, we enforce BTT2.0 + * + * If the loop was never entered, default to BTT1.1 (legacy namespaces) + */ + switch (loop_bitmask) { + case 0: + case 2: + return NVDIMM_CCLASS_BTT; + case 1: + case 4: + return NVDIMM_CCLASS_BTT2; + default: + return -ENXIO; + } +} + +static ssize_t holder_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + ssize_t rc; + + device_lock(dev); + rc = sprintf(buf, "%s\n", ndns->claim ? dev_name(ndns->claim) : ""); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(holder); + +static int __holder_class_store(struct device *dev, const char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + + if (dev->driver || ndns->claim) + return -EBUSY; + + if (sysfs_streq(buf, "btt")) { + int rc = btt_claim_class(dev); + + if (rc < NVDIMM_CCLASS_NONE) + return rc; + ndns->claim_class = rc; + } else if (sysfs_streq(buf, "pfn")) + ndns->claim_class = NVDIMM_CCLASS_PFN; + else if (sysfs_streq(buf, "dax")) + ndns->claim_class = NVDIMM_CCLASS_DAX; + else if (sysfs_streq(buf, "")) + ndns->claim_class = NVDIMM_CCLASS_NONE; + else + return -EINVAL; + + return 0; +} + +static ssize_t holder_class_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + int rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __holder_class_store(dev, buf); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + dev_dbg(dev, "%s(%d)\n", rc < 0 ? "fail " : "", rc); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? 
rc : len; +} + +static ssize_t holder_class_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + ssize_t rc; + + device_lock(dev); + if (ndns->claim_class == NVDIMM_CCLASS_NONE) + rc = sprintf(buf, "\n"); + else if ((ndns->claim_class == NVDIMM_CCLASS_BTT) || + (ndns->claim_class == NVDIMM_CCLASS_BTT2)) + rc = sprintf(buf, "btt\n"); + else if (ndns->claim_class == NVDIMM_CCLASS_PFN) + rc = sprintf(buf, "pfn\n"); + else if (ndns->claim_class == NVDIMM_CCLASS_DAX) + rc = sprintf(buf, "dax\n"); + else + rc = sprintf(buf, "<unknown>\n"); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(holder_class); + +static ssize_t mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + struct device *claim; + char *mode; + ssize_t rc; + + device_lock(dev); + claim = ndns->claim; + if (claim && is_nd_btt(claim)) + mode = "safe"; + else if (claim && is_nd_pfn(claim)) + mode = "memory"; + else if (claim && is_nd_dax(claim)) + mode = "dax"; + else if (!claim && pmem_should_map_pages(dev)) + mode = "memory"; + else + mode = "raw"; + rc = sprintf(buf, "%s\n", mode); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(mode); + +static ssize_t force_raw_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool force_raw; + int rc = kstrtobool(buf, &force_raw); + + if (rc) + return rc; + + to_ndns(dev)->force_raw = force_raw; + return len; +} + +static ssize_t force_raw_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", to_ndns(dev)->force_raw); +} +static DEVICE_ATTR_RW(force_raw); + +static struct attribute *nd_namespace_attributes[] = { + &dev_attr_nstype.attr, + &dev_attr_size.attr, + &dev_attr_mode.attr, + &dev_attr_uuid.attr, + &dev_attr_holder.attr, + &dev_attr_resource.attr, + &dev_attr_alt_name.attr, + &dev_attr_force_raw.attr, + &dev_attr_sector_size.attr, + &dev_attr_dpa_extents.attr, + &dev_attr_holder_class.attr, + NULL, +}; + +static umode_t namespace_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + + if (is_namespace_pmem(dev)) { + if (a == &dev_attr_size.attr) + return 0644; + + return a->mode; + } + + /* base is_namespace_io() attributes */ + if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr || + a == &dev_attr_holder.attr || a == &dev_attr_holder_class.attr || + a == &dev_attr_force_raw.attr || a == &dev_attr_mode.attr || + a == &dev_attr_resource.attr) + return a->mode; + + return 0; +} + +static struct attribute_group nd_namespace_attribute_group = { + .attrs = nd_namespace_attributes, + .is_visible = namespace_visible, +}; + +static const struct attribute_group *nd_namespace_attribute_groups[] = { + &nd_device_attribute_group, + &nd_namespace_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +static const struct device_type namespace_io_device_type = { + .name = "nd_namespace_io", + .release = namespace_io_release, + .groups = nd_namespace_attribute_groups, +}; + +static const struct device_type namespace_pmem_device_type = { + .name = "nd_namespace_pmem", + .release = namespace_pmem_release, + .groups = nd_namespace_attribute_groups, +}; + +static bool is_namespace_pmem(const struct device *dev) +{ + return dev ? dev->type == &namespace_pmem_device_type : false; +} + +static bool is_namespace_io(const struct device *dev) +{ + return dev ? 
dev->type == &namespace_io_device_type : false; +} + +struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev) +{ + struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL; + struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL; + struct nd_dax *nd_dax = is_nd_dax(dev) ? to_nd_dax(dev) : NULL; + struct nd_namespace_common *ndns = NULL; + resource_size_t size; + + if (nd_btt || nd_pfn || nd_dax) { + if (nd_btt) + ndns = nd_btt->ndns; + else if (nd_pfn) + ndns = nd_pfn->ndns; + else if (nd_dax) + ndns = nd_dax->nd_pfn.ndns; + + if (!ndns) + return ERR_PTR(-ENODEV); + + /* + * Flush any in-progess probes / removals in the driver + * for the raw personality of this namespace. + */ + device_lock(&ndns->dev); + device_unlock(&ndns->dev); + if (ndns->dev.driver) { + dev_dbg(&ndns->dev, "is active, can't bind %s\n", + dev_name(dev)); + return ERR_PTR(-EBUSY); + } + if (dev_WARN_ONCE(&ndns->dev, ndns->claim != dev, + "host (%s) vs claim (%s) mismatch\n", + dev_name(dev), + dev_name(ndns->claim))) + return ERR_PTR(-ENXIO); + } else { + ndns = to_ndns(dev); + if (ndns->claim) { + dev_dbg(dev, "claimed by %s, failing probe\n", + dev_name(ndns->claim)); + + return ERR_PTR(-ENXIO); + } + } + + if (nvdimm_namespace_locked(ndns)) + return ERR_PTR(-EACCES); + + size = nvdimm_namespace_capacity(ndns); + if (size < ND_MIN_NAMESPACE_SIZE) { + dev_dbg(&ndns->dev, "%pa, too small must be at least %#x\n", + &size, ND_MIN_NAMESPACE_SIZE); + return ERR_PTR(-ENODEV); + } + + /* + * Note, alignment validation for fsdax and devdax mode + * namespaces happens in nd_pfn_validate() where infoblock + * padding parameters can be applied. + */ + if (pmem_should_map_pages(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + struct resource *res = &nsio->res; + + if (!IS_ALIGNED(res->start | (res->end + 1), + memremap_compat_align())) { + dev_err(&ndns->dev, "%pr misaligned, unable to map\n", res); + return ERR_PTR(-EOPNOTSUPP); + } + } + + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + if (uuid_not_set(nspm->uuid, &ndns->dev, __func__)) + return ERR_PTR(-ENODEV); + } + + return ndns; +} +EXPORT_SYMBOL(nvdimm_namespace_common_probe); + +int devm_namespace_enable(struct device *dev, struct nd_namespace_common *ndns, + resource_size_t size) +{ + return devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev), size); +} +EXPORT_SYMBOL_GPL(devm_namespace_enable); + +void devm_namespace_disable(struct device *dev, struct nd_namespace_common *ndns) +{ + devm_nsio_disable(dev, to_nd_namespace_io(&ndns->dev)); +} +EXPORT_SYMBOL_GPL(devm_namespace_disable); + +static struct device **create_namespace_io(struct nd_region *nd_region) +{ + struct nd_namespace_io *nsio; + struct device *dev, **devs; + struct resource *res; + + nsio = kzalloc(sizeof(*nsio), GFP_KERNEL); + if (!nsio) + return NULL; + + devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL); + if (!devs) { + kfree(nsio); + return NULL; + } + + dev = &nsio->common.dev; + dev->type = &namespace_io_device_type; + dev->parent = &nd_region->dev; + res = &nsio->res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + res->start = nd_region->ndr_start; + res->end = res->start + nd_region->ndr_size - 1; + + devs[0] = dev; + return devs; +} + +static bool has_uuid_at_pos(struct nd_region *nd_region, const uuid_t *uuid, + u64 cookie, u16 pos) +{ + struct nd_namespace_label *found = NULL; + int i; + + for (i = 0; i < 
nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_interleave_set *nd_set = nd_region->nd_set; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_ent *label_ent; + bool found_uuid = false; + + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + u16 position; + + if (!nd_label) + continue; + position = nsl_get_position(ndd, nd_label); + + if (!nsl_validate_isetcookie(ndd, nd_label, cookie)) + continue; + + if (!nsl_uuid_equal(ndd, nd_label, uuid)) + continue; + + if (!nsl_validate_type_guid(ndd, nd_label, + &nd_set->type_guid)) + continue; + + if (found_uuid) { + dev_dbg(ndd->dev, "duplicate entry for uuid\n"); + return false; + } + found_uuid = true; + if (!nsl_validate_nlabel(nd_region, ndd, nd_label)) + continue; + if (position != pos) + continue; + found = nd_label; + break; + } + if (found) + break; + } + return found != NULL; +} + +static int select_pmem_id(struct nd_region *nd_region, const uuid_t *pmem_id) +{ + int i; + + if (!pmem_id) + return -ENODEV; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label = NULL; + u64 hw_start, hw_end, pmem_start, pmem_end; + struct nd_label_ent *label_ent; + + lockdep_assert_held(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + nd_label = label_ent->label; + if (!nd_label) + continue; + if (nsl_uuid_equal(ndd, nd_label, pmem_id)) + break; + nd_label = NULL; + } + + if (!nd_label) { + WARN_ON(1); + return -EINVAL; + } + + /* + * Check that this label is compliant with the dpa + * range published in NFIT + */ + hw_start = nd_mapping->start; + hw_end = hw_start + nd_mapping->size; + pmem_start = nsl_get_dpa(ndd, nd_label); + pmem_end = pmem_start + nsl_get_rawsize(ndd, nd_label); + if (pmem_start >= hw_start && pmem_start < hw_end + && pmem_end <= hw_end && pmem_end > hw_start) + /* pass */; + else { + dev_dbg(&nd_region->dev, "%s invalid label for %pUb\n", + dev_name(ndd->dev), + nsl_uuid_raw(ndd, nd_label)); + return -EINVAL; + } + + /* move recently validated label to the front of the list */ + list_move(&label_ent->list, &nd_mapping->labels); + } + return 0; +} + +/** + * create_namespace_pmem - validate interleave set labelling, retrieve label0 + * @nd_region: region with mappings to validate + * @nspm: target namespace to create + * @nd_label: target pmem namespace label to evaluate + */ +static struct device *create_namespace_pmem(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, + struct nd_namespace_label *nd_label) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_index *nsindex = + to_namespace_index(ndd, ndd->ns_current); + u64 cookie = nd_region_interleave_set_cookie(nd_region, nsindex); + u64 altcookie = nd_region_interleave_set_altcookie(nd_region); + struct nd_label_ent *label_ent; + struct nd_namespace_pmem *nspm; + resource_size_t size = 0; + struct resource *res; + struct device *dev; + uuid_t uuid; + int rc = 0; + u16 i; + + if (cookie == 0) { + dev_dbg(&nd_region->dev, "invalid interleave-set-cookie\n"); + return ERR_PTR(-ENXIO); + } + + if (!nsl_validate_isetcookie(ndd, nd_label, cookie)) { + dev_dbg(&nd_region->dev, "invalid cookie in label: %pUb\n", + nsl_uuid_raw(ndd, nd_label)); + if (!nsl_validate_isetcookie(ndd, nd_label, altcookie)) + return ERR_PTR(-EAGAIN); + + 
dev_dbg(&nd_region->dev, "valid altcookie in label: %pUb\n", + nsl_uuid_raw(ndd, nd_label)); + } + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return ERR_PTR(-ENOMEM); + + nspm->id = -1; + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + nsl_get_uuid(ndd, nd_label, &uuid); + if (has_uuid_at_pos(nd_region, &uuid, cookie, i)) + continue; + if (has_uuid_at_pos(nd_region, &uuid, altcookie, i)) + continue; + break; + } + + if (i < nd_region->ndr_mappings) { + struct nvdimm *nvdimm = nd_region->mapping[i].nvdimm; + + /* + * Give up if we don't find an instance of a uuid at each + * position (from 0 to nd_region->ndr_mappings - 1), or if we + * find a dimm with two instances of the same uuid. + */ + dev_err(&nd_region->dev, "%s missing label for %pUb\n", + nvdimm_name(nvdimm), nsl_uuid_raw(ndd, nd_label)); + rc = -EINVAL; + goto err; + } + + /* + * Fix up each mapping's 'labels' to have the validated pmem label for + * that position at labels[0], and NULL at labels[1]. In the process, + * check that the namespace aligns with interleave-set. + */ + nsl_get_uuid(ndd, nd_label, &uuid); + rc = select_pmem_id(nd_region, &uuid); + if (rc) + goto err; + + /* Calculate total size and populate namespace properties from label0 */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_namespace_label *label0; + struct nvdimm_drvdata *ndd; + + nd_mapping = &nd_region->mapping[i]; + label_ent = list_first_entry_or_null(&nd_mapping->labels, + typeof(*label_ent), list); + label0 = label_ent ? label_ent->label : NULL; + + if (!label0) { + WARN_ON(1); + continue; + } + + ndd = to_ndd(nd_mapping); + size += nsl_get_rawsize(ndd, label0); + if (nsl_get_position(ndd, label0) != 0) + continue; + WARN_ON(nspm->alt_name || nspm->uuid); + nspm->alt_name = kmemdup(nsl_ref_name(ndd, label0), + NSLABEL_NAME_LEN, GFP_KERNEL); + nsl_get_uuid(ndd, label0, &uuid); + nspm->uuid = kmemdup(&uuid, sizeof(uuid_t), GFP_KERNEL); + nspm->lbasize = nsl_get_lbasize(ndd, label0); + nspm->nsio.common.claim_class = + nsl_get_claim_class(ndd, label0); + } + + if (!nspm->alt_name || !nspm->uuid) { + rc = -ENOMEM; + goto err; + } + + nd_namespace_pmem_set_resource(nd_region, nspm, size); + + return dev; + err: + namespace_pmem_release(dev); + switch (rc) { + case -EINVAL: + dev_dbg(&nd_region->dev, "invalid label(s)\n"); + break; + case -ENODEV: + dev_dbg(&nd_region->dev, "label not found\n"); + break; + default: + dev_dbg(&nd_region->dev, "unexpected err: %d\n", rc); + break; + } + return ERR_PTR(rc); +} + +static struct device *nd_namespace_pmem_create(struct nd_region *nd_region) +{ + struct nd_namespace_pmem *nspm; + struct resource *res; + struct device *dev; + + if (!is_memory(&nd_region->dev)) + return NULL; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return NULL; + + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + + nspm->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); + if (nspm->id < 0) { + kfree(nspm); + return NULL; + } + dev_set_name(dev, "namespace%d.%d", nd_region->id, nspm->id); + nd_namespace_pmem_set_resource(nd_region, nspm, 0); + + return dev; +} + +static struct lock_class_key nvdimm_namespace_key; + +void 
nd_region_create_ns_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + + if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_IO) + return; + + nd_region->ns_seed = nd_namespace_pmem_create(nd_region); + + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->ns_seed) + dev_err(&nd_region->dev, "failed to create namespace\n"); + else { + device_initialize(nd_region->ns_seed); + lockdep_set_class(&nd_region->ns_seed->mutex, + &nvdimm_namespace_key); + nd_device_register(nd_region->ns_seed); + } +} + +void nd_region_create_dax_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->dax_seed = nd_dax_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->dax_seed) + dev_err(&nd_region->dev, "failed to create dax namespace\n"); +} + +void nd_region_create_pfn_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->pfn_seed = nd_pfn_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->pfn_seed) + dev_err(&nd_region->dev, "failed to create pfn namespace\n"); +} + +void nd_region_create_btt_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->btt_seed = nd_btt_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->btt_seed) + dev_err(&nd_region->dev, "failed to create btt namespace\n"); +} + +static int add_namespace_resource(struct nd_region *nd_region, + struct nd_namespace_label *nd_label, struct device **devs, + int count) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + int i; + + for (i = 0; i < count; i++) { + uuid_t *uuid = namespace_to_uuid(devs[i]); + + if (IS_ERR(uuid)) { + WARN_ON(1); + continue; + } + + if (!nsl_uuid_equal(ndd, nd_label, uuid)) + continue; + dev_err(&nd_region->dev, + "error: conflicting extents for uuid: %pUb\n", uuid); + return -ENXIO; + } + + return i; +} + +static int cmp_dpa(const void *a, const void *b) +{ + const struct device *dev_a = *(const struct device **) a; + const struct device *dev_b = *(const struct device **) b; + struct nd_namespace_pmem *nspm_a, *nspm_b; + + if (is_namespace_io(dev_a)) + return 0; + + nspm_a = to_nd_namespace_pmem(dev_a); + nspm_b = to_nd_namespace_pmem(dev_b); + + return memcmp(&nspm_a->nsio.res.start, &nspm_b->nsio.res.start, + sizeof(resource_size_t)); +} + +static struct device **scan_labels(struct nd_region *nd_region) +{ + int i, count = 0; + struct device *dev, **devs = NULL; + struct nd_label_ent *label_ent, *e; + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t map_end = nd_mapping->start + nd_mapping->size - 1; + + /* "safe" because create_namespace_pmem() might list_move() label_ent */ + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + struct device **__devs; + + if (!nd_label) + continue; + + /* skip labels that describe extents outside of the region */ + if (nsl_get_dpa(ndd, nd_label) < nd_mapping->start || + nsl_get_dpa(ndd, nd_label) > 
map_end) + continue; + + i = add_namespace_resource(nd_region, nd_label, devs, count); + if (i < 0) + goto err; + if (i < count) + continue; + __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL); + if (!__devs) + goto err; + memcpy(__devs, devs, sizeof(dev) * count); + kfree(devs); + devs = __devs; + + dev = create_namespace_pmem(nd_region, nd_mapping, nd_label); + if (IS_ERR(dev)) { + switch (PTR_ERR(dev)) { + case -EAGAIN: + /* skip invalid labels */ + continue; + case -ENODEV: + /* fallthrough to seed creation */ + break; + default: + goto err; + } + } else + devs[count++] = dev; + + } + + dev_dbg(&nd_region->dev, "discovered %d namespace%s\n", count, + count == 1 ? "" : "s"); + + if (count == 0) { + struct nd_namespace_pmem *nspm; + + /* Publish a zero-sized namespace for userspace to configure. */ + nd_mapping_free_labels(nd_mapping); + + devs = kcalloc(2, sizeof(dev), GFP_KERNEL); + if (!devs) + goto err; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + goto err; + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + nd_namespace_pmem_set_resource(nd_region, nspm, 0); + dev->parent = &nd_region->dev; + devs[count++] = dev; + } else if (is_memory(&nd_region->dev)) { + /* clean unselected labels */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct list_head *l, *e; + LIST_HEAD(list); + int j; + + nd_mapping = &nd_region->mapping[i]; + if (list_empty(&nd_mapping->labels)) { + WARN_ON(1); + continue; + } + + j = count; + list_for_each_safe(l, e, &nd_mapping->labels) { + if (!j--) + break; + list_move_tail(l, &list); + } + nd_mapping_free_labels(nd_mapping); + list_splice_init(&list, &nd_mapping->labels); + } + } + + if (count > 1) + sort(devs, count, sizeof(struct device *), cmp_dpa, NULL); + + return devs; + + err: + if (devs) { + for (i = 0; devs[i]; i++) + namespace_pmem_release(devs[i]); + kfree(devs); + } + return NULL; +} + +static struct device **create_namespaces(struct nd_region *nd_region) +{ + struct nd_mapping *nd_mapping; + struct device **devs; + int i; + + if (nd_region->ndr_mappings == 0) + return NULL; + + /* lock down all mappings while we scan labels */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + mutex_lock_nested(&nd_mapping->lock, i); + } + + devs = scan_labels(nd_region); + + for (i = 0; i < nd_region->ndr_mappings; i++) { + int reverse = nd_region->ndr_mappings - 1 - i; + + nd_mapping = &nd_region->mapping[reverse]; + mutex_unlock(&nd_mapping->lock); + } + + return devs; +} + +static void deactivate_labels(void *region) +{ + struct nd_region *nd_region = region; + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = nd_mapping->ndd; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + mutex_lock(&nd_mapping->lock); + nd_mapping_free_labels(nd_mapping); + mutex_unlock(&nd_mapping->lock); + + put_ndd(ndd); + nd_mapping->ndd = NULL; + if (ndd) + atomic_dec(&nvdimm->busy); + } +} + +static int init_active_labels(struct nd_region *nd_region) +{ + int i, rc = 0; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_label_ent *label_ent; + int count, j; + + /* + * If the dimm is disabled then we may need to prevent + * the region from being activated. 
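+		 * A locked DIMM may have unreadable label data, and a DIMM
+		 * that requires labels (NDD_LABELING) cannot have its DPA
+		 * partitioning trusted without them; either case fails the
+		 * probe below.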
+ */ + if (!ndd) { + if (test_bit(NDD_LOCKED, &nvdimm->flags)) + /* fail, label data may be unreadable */; + else if (test_bit(NDD_LABELING, &nvdimm->flags)) + /* fail, labels needed to disambiguate dpa */; + else + continue; + + dev_err(&nd_region->dev, "%s: is %s, failing probe\n", + dev_name(&nd_mapping->nvdimm->dev), + test_bit(NDD_LOCKED, &nvdimm->flags) + ? "locked" : "disabled"); + rc = -ENXIO; + goto out; + } + nd_mapping->ndd = ndd; + atomic_inc(&nvdimm->busy); + get_ndd(ndd); + + count = nd_label_active_count(ndd); + dev_dbg(ndd->dev, "count: %d\n", count); + if (!count) + continue; + for (j = 0; j < count; j++) { + struct nd_namespace_label *label; + + label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL); + if (!label_ent) + break; + label = nd_label_active(ndd, j); + label_ent->label = label; + + mutex_lock(&nd_mapping->lock); + list_add_tail(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); + } + + if (j < count) + break; + } + + if (i < nd_region->ndr_mappings) + rc = -ENOMEM; + +out: + if (rc) { + deactivate_labels(nd_region); + return rc; + } + + return devm_add_action_or_reset(&nd_region->dev, deactivate_labels, + nd_region); +} + +int nd_region_register_namespaces(struct nd_region *nd_region, int *err) +{ + struct device **devs = NULL; + int i, rc = 0, type; + + *err = 0; + nvdimm_bus_lock(&nd_region->dev); + rc = init_active_labels(nd_region); + if (rc) { + nvdimm_bus_unlock(&nd_region->dev); + return rc; + } + + type = nd_region_to_nstype(nd_region); + switch (type) { + case ND_DEVICE_NAMESPACE_IO: + devs = create_namespace_io(nd_region); + break; + case ND_DEVICE_NAMESPACE_PMEM: + devs = create_namespaces(nd_region); + break; + default: + break; + } + nvdimm_bus_unlock(&nd_region->dev); + + if (!devs) + return -ENODEV; + + for (i = 0; devs[i]; i++) { + struct device *dev = devs[i]; + int id; + + if (type == ND_DEVICE_NAMESPACE_PMEM) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(dev); + id = ida_simple_get(&nd_region->ns_ida, 0, 0, + GFP_KERNEL); + nspm->id = id; + } else + id = i; + + if (id < 0) + break; + dev_set_name(dev, "namespace%d.%d", nd_region->id, id); + device_initialize(dev); + lockdep_set_class(&dev->mutex, &nvdimm_namespace_key); + nd_device_register(dev); + } + if (i) + nd_region->ns_seed = devs[0]; + + if (devs[i]) { + int j; + + for (j = i; devs[j]; j++) { + struct device *dev = devs[j]; + + device_initialize(dev); + put_device(dev); + } + *err = j - i; + /* + * All of the namespaces we tried to register failed, so + * fail region activation. + */ + if (*err == 0) + rc = -ENODEV; + } + kfree(devs); + + if (rc == -ENODEV) + return rc; + + return i; +} diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h new file mode 100644 index 0000000000..86976a9e8a --- /dev/null +++ b/drivers/nvdimm/nd-core.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#ifndef __ND_CORE_H__ +#define __ND_CORE_H__ +#include <linux/libnvdimm.h> +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/mutex.h> +#include <linux/nd.h> +#include "nd.h" + +extern struct list_head nvdimm_bus_list; +extern struct mutex nvdimm_bus_list_mutex; +extern int nvdimm_major; +extern struct workqueue_struct *nvdimm_wq; + +struct nvdimm_bus { + struct nvdimm_bus_descriptor *nd_desc; + wait_queue_head_t wait; + struct list_head list; + struct device dev; + int id, probe_active; + atomic_t ioctl_active; + struct list_head mapping_list; + struct mutex reconfig_mutex; + struct badrange badrange; +}; + +struct nvdimm { + unsigned long flags; + void *provider_data; + unsigned long cmd_mask; + struct device dev; + atomic_t busy; + int id, num_flush; + struct resource *flush_wpq; + const char *dimm_id; + struct { + const struct nvdimm_security_ops *ops; + unsigned long flags; + unsigned long ext_flags; + unsigned int overwrite_tmo; + struct kernfs_node *overwrite_state; + } sec; + struct delayed_work dwork; + const struct nvdimm_fw_ops *fw_ops; +}; + +static inline unsigned long nvdimm_security_flags( + struct nvdimm *nvdimm, enum nvdimm_passphrase_type ptype) +{ + u64 flags; + const u64 state_flags = 1UL << NVDIMM_SECURITY_DISABLED + | 1UL << NVDIMM_SECURITY_LOCKED + | 1UL << NVDIMM_SECURITY_UNLOCKED + | 1UL << NVDIMM_SECURITY_OVERWRITE; + + if (!nvdimm->sec.ops) + return 0; + + flags = nvdimm->sec.ops->get_flags(nvdimm, ptype); + /* disabled, locked, unlocked, and overwrite are mutually exclusive */ + dev_WARN_ONCE(&nvdimm->dev, hweight64(flags & state_flags) > 1, + "reported invalid security state: %#llx\n", + (unsigned long long) flags); + return flags; +} +int nvdimm_security_freeze(struct nvdimm *nvdimm); +#if IS_ENABLED(CONFIG_NVDIMM_KEYS) +ssize_t nvdimm_security_store(struct device *dev, const char *buf, size_t len); +void nvdimm_security_overwrite_query(struct work_struct *work); +#else +static inline ssize_t nvdimm_security_store(struct device *dev, + const char *buf, size_t len) +{ + return -EOPNOTSUPP; +} +static inline void nvdimm_security_overwrite_query(struct work_struct *work) +{ +} +#endif + +bool is_nvdimm(const struct device *dev); +bool is_nd_pmem(const struct device *dev); +bool is_nd_volatile(const struct device *dev); +static inline bool is_nd_region(const struct device *dev) +{ + return is_nd_pmem(dev) || is_nd_volatile(dev); +} +static inline bool is_memory(const struct device *dev) +{ + return is_nd_pmem(dev) || is_nd_volatile(dev); +} +struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); +int __init nvdimm_bus_init(void); +void nvdimm_bus_exit(void); +void nvdimm_devs_exit(void); +struct nd_region; +void nd_region_advance_seeds(struct nd_region *nd_region, struct device *dev); +void nd_region_create_ns_seed(struct nd_region *nd_region); +void nd_region_create_btt_seed(struct nd_region *nd_region); +void nd_region_create_pfn_seed(struct nd_region *nd_region); +void nd_region_create_dax_seed(struct nd_region *nd_region); +int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); +void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); +void nd_synchronize(void); +void nd_device_register(struct device *dev); +void nd_device_register_sync(struct device *dev); +struct nd_label_id; +char *nd_label_gen_id(struct nd_label_id *label_id, const uuid_t *uuid, + u32 flags); +bool nd_is_uuid_unique(struct device *dev, uuid_t *uuid); +struct nd_region; +struct nvdimm_drvdata; +struct nd_mapping; +void nd_mapping_free_labels(struct 
nd_mapping *nd_mapping); + +int __reserve_free_pmem(struct device *dev, void *data); +void release_free_pmem(struct nvdimm_bus *nvdimm_bus, + struct nd_mapping *nd_mapping); + +resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping); +resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region); +resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping); +resource_size_t nd_region_available_dpa(struct nd_region *nd_region); +int nd_region_conflict(struct nd_region *nd_region, resource_size_t start, + resource_size_t size); +resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id); +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd); +void get_ndd(struct nvdimm_drvdata *ndd); +resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns); +void nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns); +void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns); +bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns); +bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns); +ssize_t nd_namespace_store(struct device *dev, + struct nd_namespace_common **_ndns, const char *buf, + size_t len); +struct nd_pfn *to_nd_pfn_safe(struct device *dev); +bool is_nvdimm_bus(struct device *dev); + +#if IS_ENABLED(CONFIG_ND_CLAIM) +int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio, + resource_size_t size); +void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio); +#else +static inline int devm_nsio_enable(struct device *dev, + struct nd_namespace_io *nsio, resource_size_t size) +{ + return -ENXIO; +} + +static inline void devm_nsio_disable(struct device *dev, + struct nd_namespace_io *nsio) +{ +} +#endif +#endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h new file mode 100644 index 0000000000..e8b9d27dbb --- /dev/null +++ b/drivers/nvdimm/nd.h @@ -0,0 +1,680 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#ifndef __ND_H__ +#define __ND_H__ +#include <linux/libnvdimm.h> +#include <linux/badblocks.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/ndctl.h> +#include <linux/types.h> +#include <linux/nd.h> +#include "label.h" + +enum { + /* + * Limits the maximum number of block apertures a dimm can + * support and is an input to the geometry/on-disk-format of a + * BTT instance + */ + ND_MAX_LANES = 256, + INT_LBASIZE_ALIGNMENT = 64, + NVDIMM_IO_ATOMIC = 1, +}; + +struct nvdimm_drvdata { + struct device *dev; + int nslabel_size; + struct nd_cmd_get_config_size nsarea; + void *data; + bool cxl; + int ns_current, ns_next; + struct resource dpa; + struct kref kref; +}; + +static inline const u8 *nsl_ref_name(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return nd_label->cxl.name; + return nd_label->efi.name; +} + +static inline u8 *nsl_get_name(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u8 *name) +{ + if (ndd->cxl) + return memcpy(name, nd_label->cxl.name, NSLABEL_NAME_LEN); + return memcpy(name, nd_label->efi.name, NSLABEL_NAME_LEN); +} + +static inline u8 *nsl_set_name(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u8 *name) +{ + if (!name) + return NULL; + if (ndd->cxl) + return memcpy(nd_label->cxl.name, name, NSLABEL_NAME_LEN); + return memcpy(nd_label->efi.name, name, NSLABEL_NAME_LEN); +} + +static inline u32 nsl_get_slot(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le32_to_cpu(nd_label->cxl.slot); + return __le32_to_cpu(nd_label->efi.slot); +} + +static inline void nsl_set_slot(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u32 slot) +{ + if (ndd->cxl) + nd_label->cxl.slot = __cpu_to_le32(slot); + else + nd_label->efi.slot = __cpu_to_le32(slot); +} + +static inline u64 nsl_get_checksum(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le64_to_cpu(nd_label->cxl.checksum); + return __le64_to_cpu(nd_label->efi.checksum); +} + +static inline void nsl_set_checksum(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u64 checksum) +{ + if (ndd->cxl) + nd_label->cxl.checksum = __cpu_to_le64(checksum); + else + nd_label->efi.checksum = __cpu_to_le64(checksum); +} + +static inline u32 nsl_get_flags(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le32_to_cpu(nd_label->cxl.flags); + return __le32_to_cpu(nd_label->efi.flags); +} + +static inline void nsl_set_flags(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u32 flags) +{ + if (ndd->cxl) + nd_label->cxl.flags = __cpu_to_le32(flags); + else + nd_label->efi.flags = __cpu_to_le32(flags); +} + +static inline u64 nsl_get_dpa(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le64_to_cpu(nd_label->cxl.dpa); + return __le64_to_cpu(nd_label->efi.dpa); +} + +static inline void nsl_set_dpa(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u64 dpa) +{ + if (ndd->cxl) + nd_label->cxl.dpa = __cpu_to_le64(dpa); + else + nd_label->efi.dpa = __cpu_to_le64(dpa); +} + +static inline u64 nsl_get_rawsize(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le64_to_cpu(nd_label->cxl.rawsize); + return __le64_to_cpu(nd_label->efi.rawsize); +} + +static inline void nsl_set_rawsize(struct nvdimm_drvdata *ndd, + struct 
nd_namespace_label *nd_label, + u64 rawsize) +{ + if (ndd->cxl) + nd_label->cxl.rawsize = __cpu_to_le64(rawsize); + else + nd_label->efi.rawsize = __cpu_to_le64(rawsize); +} + +static inline u64 nsl_get_isetcookie(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + /* WARN future refactor attempts that break this assumption */ + if (dev_WARN_ONCE(ndd->dev, ndd->cxl, + "CXL labels do not use the isetcookie concept\n")) + return 0; + return __le64_to_cpu(nd_label->efi.isetcookie); +} + +static inline void nsl_set_isetcookie(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u64 isetcookie) +{ + if (!ndd->cxl) + nd_label->efi.isetcookie = __cpu_to_le64(isetcookie); +} + +static inline bool nsl_validate_isetcookie(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u64 cookie) +{ + /* + * Let the EFI and CXL validation comingle, where fields that + * don't matter to CXL always validate. + */ + if (ndd->cxl) + return true; + return cookie == __le64_to_cpu(nd_label->efi.isetcookie); +} + +static inline u16 nsl_get_position(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le16_to_cpu(nd_label->cxl.position); + return __le16_to_cpu(nd_label->efi.position); +} + +static inline void nsl_set_position(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u16 position) +{ + if (ndd->cxl) + nd_label->cxl.position = __cpu_to_le16(position); + else + nd_label->efi.position = __cpu_to_le16(position); +} + +static inline u16 nsl_get_nlabel(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return 0; + return __le16_to_cpu(nd_label->efi.nlabel); +} + +static inline void nsl_set_nlabel(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u16 nlabel) +{ + if (!ndd->cxl) + nd_label->efi.nlabel = __cpu_to_le16(nlabel); +} + +static inline u16 nsl_get_nrange(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le16_to_cpu(nd_label->cxl.nrange); + return 1; +} + +static inline void nsl_set_nrange(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u16 nrange) +{ + if (ndd->cxl) + nd_label->cxl.nrange = __cpu_to_le16(nrange); +} + +static inline u64 nsl_get_lbasize(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + /* + * Yes, for some reason the EFI labels convey a massive 64-bit + * lbasize, that got fixed for CXL. 
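+	 * (CXL labels carry lbasize in a 16-bit field, hence the __le16
+	 * conversion below versus __le64 for EFI labels.)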
+ */ + if (ndd->cxl) + return __le16_to_cpu(nd_label->cxl.lbasize); + return __le64_to_cpu(nd_label->efi.lbasize); +} + +static inline void nsl_set_lbasize(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u64 lbasize) +{ + if (ndd->cxl) + nd_label->cxl.lbasize = __cpu_to_le16(lbasize); + else + nd_label->efi.lbasize = __cpu_to_le64(lbasize); +} + +static inline const uuid_t *nsl_get_uuid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + uuid_t *uuid) +{ + if (ndd->cxl) + import_uuid(uuid, nd_label->cxl.uuid); + else + import_uuid(uuid, nd_label->efi.uuid); + return uuid; +} + +static inline const uuid_t *nsl_set_uuid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + const uuid_t *uuid) +{ + if (ndd->cxl) + export_uuid(nd_label->cxl.uuid, uuid); + else + export_uuid(nd_label->efi.uuid, uuid); + return uuid; +} + +static inline bool nsl_uuid_equal(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + const uuid_t *uuid) +{ + uuid_t tmp; + + if (ndd->cxl) + import_uuid(&tmp, nd_label->cxl.uuid); + else + import_uuid(&tmp, nd_label->efi.uuid); + return uuid_equal(&tmp, uuid); +} + +static inline const u8 *nsl_uuid_raw(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return nd_label->cxl.uuid; + return nd_label->efi.uuid; +} + +bool nsl_validate_type_guid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, guid_t *guid); +enum nvdimm_claim_class nsl_get_claim_class(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label); + +struct nd_region_data { + int ns_count; + int ns_active; + unsigned int hints_shift; + void __iomem *flush_wpq[]; +}; + +static inline void __iomem *ndrd_get_flush_wpq(struct nd_region_data *ndrd, + int dimm, int hint) +{ + unsigned int num = 1 << ndrd->hints_shift; + unsigned int mask = num - 1; + + return ndrd->flush_wpq[dimm * num + (hint & mask)]; +} + +static inline void ndrd_set_flush_wpq(struct nd_region_data *ndrd, int dimm, + int hint, void __iomem *flush) +{ + unsigned int num = 1 << ndrd->hints_shift; + unsigned int mask = num - 1; + + ndrd->flush_wpq[dimm * num + (hint & mask)] = flush; +} + +static inline struct nd_namespace_index *to_namespace_index( + struct nvdimm_drvdata *ndd, int i) +{ + if (i < 0) + return NULL; + + return ndd->data + sizeof_namespace_index(ndd) * i; +} + +static inline struct nd_namespace_index *to_current_namespace_index( + struct nvdimm_drvdata *ndd) +{ + return to_namespace_index(ndd, ndd->ns_current); +} + +static inline struct nd_namespace_index *to_next_namespace_index( + struct nvdimm_drvdata *ndd) +{ + return to_namespace_index(ndd, ndd->ns_next); +} + +unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd); + +#define efi_namespace_label_has(ndd, field) \ + (!ndd->cxl && offsetof(struct nvdimm_efi_label, field) \ + < sizeof_namespace_label(ndd)) + +#define nd_dbg_dpa(r, d, res, fmt, arg...) \ + dev_dbg((r) ? &(r)->dev : (d)->dev, "%s: %.13s: %#llx @ %#llx " fmt, \ + (r) ? dev_name((d)->dev) : "", res ? res->name : "null", \ + (unsigned long long) (res ? resource_size(res) : 0), \ + (unsigned long long) (res ? res->start : 0), ##arg) + +#define for_each_dpa_resource(ndd, res) \ + for (res = (ndd)->dpa.child; res; res = res->sibling) + +#define for_each_dpa_resource_safe(ndd, res, next) \ + for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \ + res; res = next, next = next ? 
next->sibling : NULL) + +struct nd_percpu_lane { + int count; + spinlock_t lock; +}; + +enum nd_label_flags { + ND_LABEL_REAP, +}; +struct nd_label_ent { + struct list_head list; + unsigned long flags; + struct nd_namespace_label *label; +}; + +enum nd_mapping_lock_class { + ND_MAPPING_CLASS0, + ND_MAPPING_UUID_SCAN, +}; + +struct nd_mapping { + struct nvdimm *nvdimm; + u64 start; + u64 size; + int position; + struct list_head labels; + struct mutex lock; + /* + * @ndd is for private use at region enable / disable time for + * get_ndd() + put_ndd(), all other nd_mapping to ndd + * conversions use to_ndd() which respects enabled state of the + * nvdimm. + */ + struct nvdimm_drvdata *ndd; +}; + +struct nd_region { + struct device dev; + struct ida ns_ida; + struct ida btt_ida; + struct ida pfn_ida; + struct ida dax_ida; + unsigned long flags; + struct device *ns_seed; + struct device *btt_seed; + struct device *pfn_seed; + struct device *dax_seed; + unsigned long align; + u16 ndr_mappings; + u64 ndr_size; + u64 ndr_start; + int id, num_lanes, ro, numa_node, target_node; + void *provider_data; + struct kernfs_node *bb_state; + struct badblocks bb; + struct nd_interleave_set *nd_set; + struct nd_percpu_lane __percpu *lane; + int (*flush)(struct nd_region *nd_region, struct bio *bio); + struct nd_mapping mapping[]; +}; + +static inline bool nsl_validate_nlabel(struct nd_region *nd_region, + struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return true; + return nsl_get_nlabel(ndd, nd_label) == nd_region->ndr_mappings; +} + +/* + * Lookup next in the repeating sequence of 01, 10, and 11. + */ +static inline unsigned nd_inc_seq(unsigned seq) +{ + static const unsigned next[] = { 0, 2, 3, 1 }; + + return next[seq & 3]; +} + +struct btt; +struct nd_btt { + struct device dev; + struct nd_namespace_common *ndns; + struct btt *btt; + unsigned long lbasize; + u64 size; + uuid_t *uuid; + int id; + int initial_offset; + u16 version_major; + u16 version_minor; +}; + +enum nd_pfn_mode { + PFN_MODE_NONE, + PFN_MODE_RAM, + PFN_MODE_PMEM, +}; + +struct nd_pfn { + int id; + uuid_t *uuid; + struct device dev; + unsigned long align; + unsigned long npfns; + enum nd_pfn_mode mode; + struct nd_pfn_sb *pfn_sb; + struct nd_namespace_common *ndns; +}; + +struct nd_dax { + struct nd_pfn nd_pfn; +}; + +static inline u32 nd_info_block_reserve(void) +{ + return ALIGN(SZ_8K, PAGE_SIZE); +} + +enum nd_async_mode { + ND_SYNC, + ND_ASYNC, +}; + +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size); +void wait_nvdimm_bus_probe_idle(struct device *dev); +void nd_device_register(struct device *dev); +void nd_device_unregister(struct device *dev, enum nd_async_mode mode); +void nd_device_notify(struct device *dev, enum nvdimm_event event); +int nd_uuid_store(struct device *dev, uuid_t **uuid_out, const char *buf, + size_t len); +ssize_t nd_size_select_show(unsigned long current_size, + const unsigned long *supported, char *buf); +ssize_t nd_size_select_store(struct device *dev, const char *buf, + unsigned long *current_size, const unsigned long *supported); +int __init nvdimm_init(void); +int __init nd_region_init(void); +int __init nd_label_init(void); +void nvdimm_exit(void); +void nd_region_exit(void); +struct nvdimm; +extern const struct attribute_group nd_device_attribute_group; +extern const struct attribute_group nd_numa_attribute_group; +extern const struct attribute_group *nvdimm_bus_attribute_groups[]; +struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping); 
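For orientation, the nsl_* helpers defined earlier in this header give format-neutral access to namespace label fields, so callers do not need to know whether a DIMM carries EFI-style or CXL-style labels. A minimal sketch of the call pattern, using a hypothetical example_dump_label() helper that is not part of the driver:

static void example_dump_label(struct nvdimm_drvdata *ndd,
			       struct nd_namespace_label *nd_label)
{
	uuid_t uuid;

	/* illustrative only: each accessor branches on ndd->cxl internally */
	nsl_get_uuid(ndd, nd_label, &uuid);
	dev_dbg(ndd->dev, "label %pUb: dpa %#llx size %#llx position %d\n",
		&uuid,
		(unsigned long long)nsl_get_dpa(ndd, nd_label),
		(unsigned long long)nsl_get_rawsize(ndd, nd_label),
		nsl_get_position(ndd, nd_label));
}

The same pattern appears in namespace_devs.c above, for example in select_pmem_id() and create_namespace_pmem().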
+int nvdimm_check_config_data(struct device *dev); +int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); +int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); +int nvdimm_get_config_data(struct nvdimm_drvdata *ndd, void *buf, + size_t offset, size_t len); +int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, + void *buf, size_t len); +long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, + unsigned int len); +void nvdimm_set_labeling(struct device *dev); +void nvdimm_set_locked(struct device *dev); +void nvdimm_clear_locked(struct device *dev); +int nvdimm_security_setup_events(struct device *dev); +#if IS_ENABLED(CONFIG_NVDIMM_KEYS) +int nvdimm_security_unlock(struct device *dev); +#else +static inline int nvdimm_security_unlock(struct device *dev) +{ + return 0; +} +#endif +struct nd_btt *to_nd_btt(struct device *dev); + +struct nd_gen_sb { + char reserved[SZ_4K - 8]; + __le64 checksum; +}; + +u64 nd_sb_checksum(struct nd_gen_sb *sb); +#if IS_ENABLED(CONFIG_BTT) +int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns); +bool is_nd_btt(struct device *dev); +struct device *nd_btt_create(struct nd_region *nd_region); +#else +static inline int nd_btt_probe(struct device *dev, + struct nd_namespace_common *ndns) +{ + return -ENODEV; +} + +static inline bool is_nd_btt(struct device *dev) +{ + return false; +} + +static inline struct device *nd_btt_create(struct nd_region *nd_region) +{ + return NULL; +} +#endif + +struct nd_pfn *to_nd_pfn(struct device *dev); +#if IS_ENABLED(CONFIG_NVDIMM_PFN) + +#define MAX_NVDIMM_ALIGN 4 + +int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns); +bool is_nd_pfn(struct device *dev); +struct device *nd_pfn_create(struct nd_region *nd_region); +struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn, + struct nd_namespace_common *ndns); +int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig); +extern const struct attribute_group *nd_pfn_attribute_groups[]; +#else +static inline int nd_pfn_probe(struct device *dev, + struct nd_namespace_common *ndns) +{ + return -ENODEV; +} + +static inline bool is_nd_pfn(struct device *dev) +{ + return false; +} + +static inline struct device *nd_pfn_create(struct nd_region *nd_region) +{ + return NULL; +} + +static inline int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) +{ + return -ENODEV; +} +#endif + +struct nd_dax *to_nd_dax(struct device *dev); +#if IS_ENABLED(CONFIG_NVDIMM_DAX) +int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns); +bool is_nd_dax(const struct device *dev); +struct device *nd_dax_create(struct nd_region *nd_region); +#else +static inline int nd_dax_probe(struct device *dev, + struct nd_namespace_common *ndns) +{ + return -ENODEV; +} + +static inline bool is_nd_dax(const struct device *dev) +{ + return false; +} + +static inline struct device *nd_dax_create(struct nd_region *nd_region) +{ + return NULL; +} +#endif + +int nd_region_to_nstype(struct nd_region *nd_region); +int nd_region_register_namespaces(struct nd_region *nd_region, int *err); +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region, + struct nd_namespace_index *nsindex); +u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region); +void nvdimm_bus_lock(struct device *dev); +void nvdimm_bus_unlock(struct device *dev); +bool is_nvdimm_bus_locked(struct device *dev); +void nvdimm_check_and_set_ro(struct gendisk *disk); +void nvdimm_drvdata_release(struct kref *kref); +void put_ndd(struct nvdimm_drvdata *ndd); +int 
nd_label_reserve_dpa(struct nvdimm_drvdata *ndd); +void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res); +struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, resource_size_t start, + resource_size_t n); +resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns); +bool nvdimm_namespace_locked(struct nd_namespace_common *ndns); +struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev); +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); +int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt); +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name); +unsigned int pmem_sector_size(struct nd_namespace_common *ndns); +struct range; +void nvdimm_badblocks_populate(struct nd_region *nd_region, + struct badblocks *bb, const struct range *range); +int devm_namespace_enable(struct device *dev, struct nd_namespace_common *ndns, + resource_size_t size); +void devm_namespace_disable(struct device *dev, + struct nd_namespace_common *ndns); +#if IS_ENABLED(CONFIG_ND_CLAIM) +/* max struct page size independent of kernel config */ +#define MAX_STRUCT_PAGE_SIZE 64 +int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap); +#else +static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, + struct dev_pagemap *pgmap) +{ + return -ENXIO; +} +#endif +int nd_region_activate(struct nd_region *nd_region); +static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, + unsigned int len) +{ + if (bb->count) { + sector_t first_bad; + int num_bad; + + return !!badblocks_check(bb, sector, len / 512, &first_bad, + &num_bad); + } + + return false; +} +const uuid_t *nd_dev_to_uuid(struct device *dev); +bool pmem_should_map_pages(struct device *dev); +#endif /* __ND_H__ */ diff --git a/drivers/nvdimm/nd_perf.c b/drivers/nvdimm/nd_perf.c new file mode 100644 index 0000000000..2b6dc80d8f --- /dev/null +++ b/drivers/nvdimm/nd_perf.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * nd_perf.c: NVDIMM Device Performance Monitoring Unit support + * + * Perf interface to expose nvdimm performance stats. + * + * Copyright (C) 2021 IBM Corporation + */ + +#define pr_fmt(fmt) "nvdimm_pmu: " fmt + +#include <linux/nd.h> +#include <linux/platform_device.h> + +#define EVENT(_name, _code) enum{_name = _code} + +/* + * NVDIMM Events codes. 
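+ * Event selection is encoded in config bits 0-4 of the perf event
+ * (see the 'event' format attribute defined below).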
+ */ + +/* Controller Reset Count */ +EVENT(CTL_RES_CNT, 0x1); +/* Controller Reset Elapsed Time */ +EVENT(CTL_RES_TM, 0x2); +/* Power-on Seconds */ +EVENT(POWERON_SECS, 0x3); +/* Life Remaining */ +EVENT(MEM_LIFE, 0x4); +/* Critical Resource Utilization */ +EVENT(CRI_RES_UTIL, 0x5); +/* Host Load Count */ +EVENT(HOST_L_CNT, 0x6); +/* Host Store Count */ +EVENT(HOST_S_CNT, 0x7); +/* Host Store Duration */ +EVENT(HOST_S_DUR, 0x8); +/* Host Load Duration */ +EVENT(HOST_L_DUR, 0x9); +/* Media Read Count */ +EVENT(MED_R_CNT, 0xa); +/* Media Write Count */ +EVENT(MED_W_CNT, 0xb); +/* Media Read Duration */ +EVENT(MED_R_DUR, 0xc); +/* Media Write Duration */ +EVENT(MED_W_DUR, 0xd); +/* Cache Read Hit Count */ +EVENT(CACHE_RH_CNT, 0xe); +/* Cache Write Hit Count */ +EVENT(CACHE_WH_CNT, 0xf); +/* Fast Write Count */ +EVENT(FAST_W_CNT, 0x10); + +NVDIMM_EVENT_ATTR(ctl_res_cnt, CTL_RES_CNT); +NVDIMM_EVENT_ATTR(ctl_res_tm, CTL_RES_TM); +NVDIMM_EVENT_ATTR(poweron_secs, POWERON_SECS); +NVDIMM_EVENT_ATTR(mem_life, MEM_LIFE); +NVDIMM_EVENT_ATTR(cri_res_util, CRI_RES_UTIL); +NVDIMM_EVENT_ATTR(host_l_cnt, HOST_L_CNT); +NVDIMM_EVENT_ATTR(host_s_cnt, HOST_S_CNT); +NVDIMM_EVENT_ATTR(host_s_dur, HOST_S_DUR); +NVDIMM_EVENT_ATTR(host_l_dur, HOST_L_DUR); +NVDIMM_EVENT_ATTR(med_r_cnt, MED_R_CNT); +NVDIMM_EVENT_ATTR(med_w_cnt, MED_W_CNT); +NVDIMM_EVENT_ATTR(med_r_dur, MED_R_DUR); +NVDIMM_EVENT_ATTR(med_w_dur, MED_W_DUR); +NVDIMM_EVENT_ATTR(cache_rh_cnt, CACHE_RH_CNT); +NVDIMM_EVENT_ATTR(cache_wh_cnt, CACHE_WH_CNT); +NVDIMM_EVENT_ATTR(fast_w_cnt, FAST_W_CNT); + +static struct attribute *nvdimm_events_attr[] = { + NVDIMM_EVENT_PTR(CTL_RES_CNT), + NVDIMM_EVENT_PTR(CTL_RES_TM), + NVDIMM_EVENT_PTR(POWERON_SECS), + NVDIMM_EVENT_PTR(MEM_LIFE), + NVDIMM_EVENT_PTR(CRI_RES_UTIL), + NVDIMM_EVENT_PTR(HOST_L_CNT), + NVDIMM_EVENT_PTR(HOST_S_CNT), + NVDIMM_EVENT_PTR(HOST_S_DUR), + NVDIMM_EVENT_PTR(HOST_L_DUR), + NVDIMM_EVENT_PTR(MED_R_CNT), + NVDIMM_EVENT_PTR(MED_W_CNT), + NVDIMM_EVENT_PTR(MED_R_DUR), + NVDIMM_EVENT_PTR(MED_W_DUR), + NVDIMM_EVENT_PTR(CACHE_RH_CNT), + NVDIMM_EVENT_PTR(CACHE_WH_CNT), + NVDIMM_EVENT_PTR(FAST_W_CNT), + NULL +}; + +static struct attribute_group nvdimm_pmu_events_group = { + .name = "events", + .attrs = nvdimm_events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-4"); + +static struct attribute *nvdimm_pmu_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group nvdimm_pmu_format_group = { + .name = "format", + .attrs = nvdimm_pmu_format_attr, +}; + +ssize_t nvdimm_events_sysfs_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + + return sprintf(page, "event=0x%02llx\n", pmu_attr->id); +} + +static ssize_t nvdimm_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct nvdimm_pmu *nd_pmu; + + nd_pmu = container_of(pmu, struct nvdimm_pmu, pmu); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(nd_pmu->cpu)); +} + +static int nvdimm_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node) +{ + struct nvdimm_pmu *nd_pmu; + u32 target; + int nodeid; + const struct cpumask *cpumask; + + nd_pmu = hlist_entry_safe(node, struct nvdimm_pmu, node); + + /* Clear it, incase given cpu is set in nd_pmu->arch_cpumask */ + cpumask_test_and_clear_cpu(cpu, &nd_pmu->arch_cpumask); + + /* + * If given cpu is not same as current designated cpu for + * counter access, just 
return. + */ + if (cpu != nd_pmu->cpu) + return 0; + + /* Check for any active cpu in nd_pmu->arch_cpumask */ + target = cpumask_any(&nd_pmu->arch_cpumask); + + /* + * Incase we don't have any active cpu in nd_pmu->arch_cpumask, + * check in given cpu's numa node list. + */ + if (target >= nr_cpu_ids) { + nodeid = cpu_to_node(cpu); + cpumask = cpumask_of_node(nodeid); + target = cpumask_any_but(cpumask, cpu); + } + nd_pmu->cpu = target; + + /* Migrate nvdimm pmu events to the new target cpu if valid */ + if (target >= 0 && target < nr_cpu_ids) + perf_pmu_migrate_context(&nd_pmu->pmu, cpu, target); + + return 0; +} + +static int nvdimm_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct nvdimm_pmu *nd_pmu; + + nd_pmu = hlist_entry_safe(node, struct nvdimm_pmu, node); + + if (nd_pmu->cpu >= nr_cpu_ids) + nd_pmu->cpu = cpu; + + return 0; +} + +static int create_cpumask_attr_group(struct nvdimm_pmu *nd_pmu) +{ + struct perf_pmu_events_attr *pmu_events_attr; + struct attribute **attrs_group; + struct attribute_group *nvdimm_pmu_cpumask_group; + + pmu_events_attr = kzalloc(sizeof(*pmu_events_attr), GFP_KERNEL); + if (!pmu_events_attr) + return -ENOMEM; + + attrs_group = kzalloc(2 * sizeof(struct attribute *), GFP_KERNEL); + if (!attrs_group) { + kfree(pmu_events_attr); + return -ENOMEM; + } + + /* Allocate memory for cpumask attribute group */ + nvdimm_pmu_cpumask_group = kzalloc(sizeof(*nvdimm_pmu_cpumask_group), GFP_KERNEL); + if (!nvdimm_pmu_cpumask_group) { + kfree(pmu_events_attr); + kfree(attrs_group); + return -ENOMEM; + } + + sysfs_attr_init(&pmu_events_attr->attr.attr); + pmu_events_attr->attr.attr.name = "cpumask"; + pmu_events_attr->attr.attr.mode = 0444; + pmu_events_attr->attr.show = nvdimm_pmu_cpumask_show; + attrs_group[0] = &pmu_events_attr->attr.attr; + attrs_group[1] = NULL; + + nvdimm_pmu_cpumask_group->attrs = attrs_group; + nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR] = nvdimm_pmu_cpumask_group; + return 0; +} + +static int nvdimm_pmu_cpu_hotplug_init(struct nvdimm_pmu *nd_pmu) +{ + int nodeid, rc; + const struct cpumask *cpumask; + + /* + * Incase of cpu hotplug feature, arch specific code + * can provide required cpumask which can be used + * to get designatd cpu for counter access. + * Check for any active cpu in nd_pmu->arch_cpumask. + */ + if (!cpumask_empty(&nd_pmu->arch_cpumask)) { + nd_pmu->cpu = cpumask_any(&nd_pmu->arch_cpumask); + } else { + /* pick active cpu from the cpumask of device numa node. 
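+		 * (i.e. the NUMA node of the platform device that
+		 * registered this PMU, see dev_to_node() below)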
*/ + nodeid = dev_to_node(nd_pmu->dev); + cpumask = cpumask_of_node(nodeid); + nd_pmu->cpu = cpumask_any(cpumask); + } + + rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/nvdimm:online", + nvdimm_pmu_cpu_online, nvdimm_pmu_cpu_offline); + + if (rc < 0) + return rc; + + nd_pmu->cpuhp_state = rc; + + /* Register the pmu instance for cpu hotplug */ + rc = cpuhp_state_add_instance_nocalls(nd_pmu->cpuhp_state, &nd_pmu->node); + if (rc) { + cpuhp_remove_multi_state(nd_pmu->cpuhp_state); + return rc; + } + + /* Create cpumask attribute group */ + rc = create_cpumask_attr_group(nd_pmu); + if (rc) { + cpuhp_state_remove_instance_nocalls(nd_pmu->cpuhp_state, &nd_pmu->node); + cpuhp_remove_multi_state(nd_pmu->cpuhp_state); + return rc; + } + + return 0; +} + +static void nvdimm_pmu_free_hotplug_memory(struct nvdimm_pmu *nd_pmu) +{ + cpuhp_state_remove_instance_nocalls(nd_pmu->cpuhp_state, &nd_pmu->node); + cpuhp_remove_multi_state(nd_pmu->cpuhp_state); + + if (nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR]) + kfree(nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR]->attrs); + kfree(nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR]); +} + +int register_nvdimm_pmu(struct nvdimm_pmu *nd_pmu, struct platform_device *pdev) +{ + int rc; + + if (!nd_pmu || !pdev) + return -EINVAL; + + /* event functions like add/del/read/event_init and pmu name should not be NULL */ + if (WARN_ON_ONCE(!(nd_pmu->pmu.event_init && nd_pmu->pmu.add && + nd_pmu->pmu.del && nd_pmu->pmu.read && nd_pmu->pmu.name))) + return -EINVAL; + + nd_pmu->pmu.attr_groups = kzalloc((NVDIMM_PMU_NULL_ATTR + 1) * + sizeof(struct attribute_group *), GFP_KERNEL); + if (!nd_pmu->pmu.attr_groups) + return -ENOMEM; + + /* + * Add platform_device->dev pointer to nvdimm_pmu to access + * device data in events functions. + */ + nd_pmu->dev = &pdev->dev; + + /* Fill attribute groups for the nvdimm pmu device */ + nd_pmu->pmu.attr_groups[NVDIMM_PMU_FORMAT_ATTR] = &nvdimm_pmu_format_group; + nd_pmu->pmu.attr_groups[NVDIMM_PMU_EVENT_ATTR] = &nvdimm_pmu_events_group; + nd_pmu->pmu.attr_groups[NVDIMM_PMU_NULL_ATTR] = NULL; + + /* Fill attribute group for cpumask */ + rc = nvdimm_pmu_cpu_hotplug_init(nd_pmu); + if (rc) { + pr_info("cpu hotplug feature failed for device: %s\n", nd_pmu->pmu.name); + kfree(nd_pmu->pmu.attr_groups); + return rc; + } + + rc = perf_pmu_register(&nd_pmu->pmu, nd_pmu->pmu.name, -1); + if (rc) { + nvdimm_pmu_free_hotplug_memory(nd_pmu); + kfree(nd_pmu->pmu.attr_groups); + return rc; + } + + pr_info("%s NVDIMM performance monitor support registered\n", + nd_pmu->pmu.name); + + return 0; +} +EXPORT_SYMBOL_GPL(register_nvdimm_pmu); + +void unregister_nvdimm_pmu(struct nvdimm_pmu *nd_pmu) +{ + perf_pmu_unregister(&nd_pmu->pmu); + nvdimm_pmu_free_hotplug_memory(nd_pmu); + kfree(nd_pmu->pmu.attr_groups); + kfree(nd_pmu); +} +EXPORT_SYMBOL_GPL(unregister_nvdimm_pmu); diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c new file mode 100644 index 0000000000..1f8c667c6f --- /dev/null +++ b/drivers/nvdimm/nd_virtio.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio_pmem.c: Virtio pmem Driver + * + * Discovers persistent memory range information + * from host and provides a virtio based flushing + * interface. 
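+ * The flush path is exposed through async_pmem_flush(), exported
+ * below for use as the nd_region 'flush' callback.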
+ */ +#include "virtio_pmem.h" +#include "nd.h" + + /* The interrupt handler */ +void virtio_pmem_host_ack(struct virtqueue *vq) +{ + struct virtio_pmem *vpmem = vq->vdev->priv; + struct virtio_pmem_request *req_data, *req_buf; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vpmem->pmem_lock, flags); + while ((req_data = virtqueue_get_buf(vq, &len)) != NULL) { + req_data->done = true; + wake_up(&req_data->host_acked); + + if (!list_empty(&vpmem->req_list)) { + req_buf = list_first_entry(&vpmem->req_list, + struct virtio_pmem_request, list); + req_buf->wq_buf_avail = true; + wake_up(&req_buf->wq_buf); + list_del(&req_buf->list); + } + } + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); +} +EXPORT_SYMBOL_GPL(virtio_pmem_host_ack); + + /* The request submission function */ +static int virtio_pmem_flush(struct nd_region *nd_region) +{ + struct virtio_device *vdev = nd_region->provider_data; + struct virtio_pmem *vpmem = vdev->priv; + struct virtio_pmem_request *req_data; + struct scatterlist *sgs[2], sg, ret; + unsigned long flags; + int err, err1; + + might_sleep(); + req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); + if (!req_data) + return -ENOMEM; + + req_data->done = false; + init_waitqueue_head(&req_data->host_acked); + init_waitqueue_head(&req_data->wq_buf); + INIT_LIST_HEAD(&req_data->list); + req_data->req.type = cpu_to_le32(VIRTIO_PMEM_REQ_TYPE_FLUSH); + sg_init_one(&sg, &req_data->req, sizeof(req_data->req)); + sgs[0] = &sg; + sg_init_one(&ret, &req_data->resp.ret, sizeof(req_data->resp)); + sgs[1] = &ret; + + spin_lock_irqsave(&vpmem->pmem_lock, flags); + /* + * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual + * queue does not have free descriptor. We add the request + * to req_list and wait for host_ack to wake us up when free + * slots are available. + */ + while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req_data, + GFP_ATOMIC)) == -ENOSPC) { + + dev_info(&vdev->dev, "failed to send command to virtio pmem device, no free slots in the virtqueue\n"); + req_data->wq_buf_avail = false; + list_add_tail(&req_data->list, &vpmem->req_list); + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); + + /* A host response results in "host_ack" getting called */ + wait_event(req_data->wq_buf, req_data->wq_buf_avail); + spin_lock_irqsave(&vpmem->pmem_lock, flags); + } + err1 = virtqueue_kick(vpmem->req_vq); + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); + /* + * virtqueue_add_sgs failed with error different than -ENOSPC, we can't + * do anything about that. + */ + if (err || !err1) { + dev_info(&vdev->dev, "failed to send command to virtio pmem device\n"); + err = -EIO; + } else { + /* A host repsonse results in "host_ack" getting called */ + wait_event(req_data->host_acked, req_data->done); + err = le32_to_cpu(req_data->resp.ret); + } + + kfree(req_data); + return err; +}; + +/* The asynchronous flush callback function */ +int async_pmem_flush(struct nd_region *nd_region, struct bio *bio) +{ + /* + * Create child bio for asynchronous flush and chain with + * parent bio. Otherwise directly call nd_region flush. 
+ */ + if (bio && bio->bi_iter.bi_sector != -1) { + struct bio *child = bio_alloc(bio->bi_bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, + GFP_ATOMIC); + + if (!child) + return -ENOMEM; + bio_clone_blkg_association(child, bio); + child->bi_iter.bi_sector = -1; + bio_chain(child, bio); + submit_bio(child); + return 0; + } + if (virtio_pmem_flush(nd_region)) + return -EIO; + + return 0; +}; +EXPORT_SYMBOL_GPL(async_pmem_flush); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c new file mode 100644 index 0000000000..d3fca0ab62 --- /dev/null +++ b/drivers/nvdimm/of_pmem.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#define pr_fmt(fmt) "of_pmem: " fmt + +#include <linux/of.h> +#include <linux/libnvdimm.h> +#include <linux/module.h> +#include <linux/ioport.h> +#include <linux/platform_device.h> +#include <linux/slab.h> + +struct of_pmem_private { + struct nvdimm_bus_descriptor bus_desc; + struct nvdimm_bus *bus; +}; + +static int of_pmem_region_probe(struct platform_device *pdev) +{ + struct of_pmem_private *priv; + struct device_node *np; + struct nvdimm_bus *bus; + bool is_volatile; + int i; + + np = dev_of_node(&pdev->dev); + if (!np) + return -ENXIO; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->bus_desc.provider_name = devm_kstrdup(&pdev->dev, pdev->name, + GFP_KERNEL); + if (!priv->bus_desc.provider_name) { + kfree(priv); + return -ENOMEM; + } + + priv->bus_desc.module = THIS_MODULE; + priv->bus_desc.of_node = np; + + priv->bus = bus = nvdimm_bus_register(&pdev->dev, &priv->bus_desc); + if (!bus) { + kfree(priv); + return -ENODEV; + } + platform_set_drvdata(pdev, priv); + + is_volatile = !!of_find_property(np, "volatile", NULL); + dev_dbg(&pdev->dev, "Registering %s regions from %pOF\n", + is_volatile ? "volatile" : "non-volatile", np); + + for (i = 0; i < pdev->num_resources; i++) { + struct nd_region_desc ndr_desc; + struct nd_region *region; + + /* + * NB: libnvdimm copies the data from ndr_desc into it's own + * structures so passing a stack pointer is fine. 
+ */ + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.numa_node = dev_to_node(&pdev->dev); + ndr_desc.target_node = ndr_desc.numa_node; + ndr_desc.res = &pdev->resource[i]; + ndr_desc.of_node = np; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + + if (is_volatile) + region = nvdimm_volatile_region_create(bus, &ndr_desc); + else { + set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags); + region = nvdimm_pmem_region_create(bus, &ndr_desc); + } + + if (!region) + dev_warn(&pdev->dev, "Unable to register region %pR from %pOF\n", + ndr_desc.res, np); + else + dev_dbg(&pdev->dev, "Registered region %pR from %pOF\n", + ndr_desc.res, np); + } + + return 0; +} + +static int of_pmem_region_remove(struct platform_device *pdev) +{ + struct of_pmem_private *priv = platform_get_drvdata(pdev); + + nvdimm_bus_unregister(priv->bus); + kfree(priv); + + return 0; +} + +static const struct of_device_id of_pmem_region_match[] = { + { .compatible = "pmem-region" }, + { .compatible = "pmem-region-v2" }, + { }, +}; + +static struct platform_driver of_pmem_region_driver = { + .probe = of_pmem_region_probe, + .remove = of_pmem_region_remove, + .driver = { + .name = "of_pmem", + .of_match_table = of_pmem_region_match, + }, +}; + +module_platform_driver(of_pmem_region_driver); +MODULE_DEVICE_TABLE(of, of_pmem_region_match); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("IBM Corporation"); diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h new file mode 100644 index 0000000000..37cb1b8a2a --- /dev/null +++ b/drivers/nvdimm/pfn.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2014-2015, Intel Corporation. + */ + +#ifndef __NVDIMM_PFN_H +#define __NVDIMM_PFN_H + +#include <linux/types.h> +#include <linux/mmzone.h> + +#define PFN_SIG_LEN 16 +#define PFN_SIG "NVDIMM_PFN_INFO\0" +#define DAX_SIG "NVDIMM_DAX_INFO\0" + +struct nd_pfn_sb { + u8 signature[PFN_SIG_LEN]; + u8 uuid[16]; + u8 parent_uuid[16]; + __le32 flags; + __le16 version_major; + __le16 version_minor; + __le64 dataoff; /* relative to namespace_base + start_pad */ + __le64 npfns; + __le32 mode; + /* minor-version-1 additions for section alignment */ + /** + * @start_pad: Deprecated attribute to pad start-misaligned namespaces + * + * start_pad is deprecated because the original definition did + * not comprehend that dataoff is relative to the base address + * of the namespace not the start_pad adjusted base. The result + * is that the dax path is broken, but the block-I/O path is + * not. The kernel will no longer create namespaces using start + * padding, but it still supports block-I/O for legacy + * configurations mainly to allow a backup, reconfigure the + * namespace, and restore flow to repair dax operation. + */ + __le32 start_pad; + __le32 end_trunc; + /* minor-version-2 record the base alignment of the mapping */ + __le32 align; + /* minor-version-3 guarantee the padding and flags are zero */ + /* minor-version-4 record the page size and struct page size */ + __le32 page_size; + __le16 page_struct_size; + u8 padding[3994]; + __le64 checksum; +}; + +#endif /* __NVDIMM_PFN_H */ diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c new file mode 100644 index 0000000000..0d08e21a1c --- /dev/null +++ b/drivers/nvdimm/pfn_devs.c @@ -0,0 +1,869 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2016 Intel Corporation. All rights reserved. 
+ */ +#include <linux/memremap.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "pfn.h" +#include "nd.h" + +static const bool page_struct_override = IS_ENABLED(CONFIG_NVDIMM_KMSAN); + +static void nd_pfn_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + dev_dbg(dev, "trace\n"); + nd_detach_ndns(&nd_pfn->dev, &nd_pfn->ndns); + ida_simple_remove(&nd_region->pfn_ida, nd_pfn->id); + kfree(nd_pfn->uuid); + kfree(nd_pfn); +} + +struct nd_pfn *to_nd_pfn(struct device *dev) +{ + struct nd_pfn *nd_pfn = container_of(dev, struct nd_pfn, dev); + + WARN_ON(!is_nd_pfn(dev)); + return nd_pfn; +} +EXPORT_SYMBOL(to_nd_pfn); + +static ssize_t mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + + switch (nd_pfn->mode) { + case PFN_MODE_RAM: + return sprintf(buf, "ram\n"); + case PFN_MODE_PMEM: + return sprintf(buf, "pmem\n"); + default: + return sprintf(buf, "none\n"); + } +} + +static ssize_t mode_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc = 0; + + device_lock(dev); + nvdimm_bus_lock(dev); + if (dev->driver) + rc = -EBUSY; + else { + size_t n = len - 1; + + if (strncmp(buf, "pmem\n", n) == 0 + || strncmp(buf, "pmem", n) == 0) { + nd_pfn->mode = PFN_MODE_PMEM; + } else if (strncmp(buf, "ram\n", n) == 0 + || strncmp(buf, "ram", n) == 0) + nd_pfn->mode = PFN_MODE_RAM; + else if (strncmp(buf, "none\n", n) == 0 + || strncmp(buf, "none", n) == 0) + nd_pfn->mode = PFN_MODE_NONE; + else + rc = -EINVAL; + } + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(mode); + +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + + return sprintf(buf, "%ld\n", nd_pfn->align); +} + +static unsigned long *nd_pfn_supported_alignments(unsigned long *alignments) +{ + + alignments[0] = PAGE_SIZE; + + if (has_transparent_hugepage()) { + alignments[1] = HPAGE_PMD_SIZE; + if (has_transparent_pud_hugepage()) + alignments[2] = HPAGE_PUD_SIZE; + } + + return alignments; +} + +/* + * Use pmd mapping if supported as default alignment + */ +static unsigned long nd_pfn_default_alignment(void) +{ + + if (has_transparent_hugepage()) + return HPAGE_PMD_SIZE; + return PAGE_SIZE; +} + +static ssize_t align_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_size_select_store(dev, buf, &nd_pfn->align, + nd_pfn_supported_alignments(aligns)); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? 
rc : len; +} +static DEVICE_ATTR_RW(align); + +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + + if (nd_pfn->uuid) + return sprintf(buf, "%pUb\n", nd_pfn->uuid); + return sprintf(buf, "\n"); +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc; + + device_lock(dev); + rc = nd_uuid_store(dev, &nd_pfn->uuid, buf, len); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t namespace_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + rc = sprintf(buf, "%s\n", nd_pfn->ndns + ? dev_name(&nd_pfn->ndns->dev) : ""); + nvdimm_bus_unlock(dev); + return rc; +} + +static ssize_t namespace_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_namespace_store(dev, &nd_pfn->ndns, buf, len); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(namespace); + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc; + + device_lock(dev); + if (dev->driver) { + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + u64 offset = __le64_to_cpu(pfn_sb->dataoff); + struct nd_namespace_common *ndns = nd_pfn->ndns; + u32 start_pad = __le32_to_cpu(pfn_sb->start_pad); + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + + rc = sprintf(buf, "%#llx\n", (unsigned long long) nsio->res.start + + start_pad + offset); + } else { + /* no address to convey if the pfn instance is disabled */ + rc = -ENXIO; + } + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_ADMIN_RO(resource); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc; + + device_lock(dev); + if (dev->driver) { + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + u64 offset = __le64_to_cpu(pfn_sb->dataoff); + struct nd_namespace_common *ndns = nd_pfn->ndns; + u32 start_pad = __le32_to_cpu(pfn_sb->start_pad); + u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc); + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + + rc = sprintf(buf, "%llu\n", (unsigned long long) + resource_size(&nsio->res) - start_pad + - end_trunc - offset); + } else { + /* no size to convey if the pfn instance is disabled */ + rc = -ENXIO; + } + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(size); + +static ssize_t supported_alignments_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + + return nd_size_select_show(0, + nd_pfn_supported_alignments(aligns), buf); +} +static DEVICE_ATTR_RO(supported_alignments); + +static struct attribute *nd_pfn_attributes[] = { + &dev_attr_mode.attr, + &dev_attr_namespace.attr, + &dev_attr_uuid.attr, + &dev_attr_align.attr, + &dev_attr_resource.attr, + &dev_attr_size.attr, + &dev_attr_supported_alignments.attr, + NULL, +}; + 
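/*
 * A minimal worked sketch (hypothetical numbers) of the arithmetic used by
 * resource_show() and size_show() above: 'resource' is the physical address
 * where namespace data starts once the reserved metadata (info block and,
 * in pmem mode, the page map) is skipped, and 'size' is what remains of the
 * namespace after padding, truncation and the data offset are subtracted.
 *
 *   res.start = 0x100000000            namespace base address
 *   res size  = 0x40000000             1 GiB namespace
 *   start_pad = 0                      new kernels no longer create padding
 *   end_trunc = 0x200000               2 MiB trimmed for alignment
 *   dataoff   = 0x1000000              16 MiB reserved ahead of the data
 *
 *   resource = res.start + start_pad + dataoff
 *            = 0x101000000
 *   size     = res size - start_pad - end_trunc - dataoff
 *            = 0x40000000 - 0x200000 - 0x1000000 = 0x3ee00000
 */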
+static struct attribute_group nd_pfn_attribute_group = { + .attrs = nd_pfn_attributes, +}; + +const struct attribute_group *nd_pfn_attribute_groups[] = { + &nd_pfn_attribute_group, + &nd_device_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +static const struct device_type nd_pfn_device_type = { + .name = "nd_pfn", + .release = nd_pfn_release, + .groups = nd_pfn_attribute_groups, +}; + +bool is_nd_pfn(struct device *dev) +{ + return dev ? dev->type == &nd_pfn_device_type : false; +} +EXPORT_SYMBOL(is_nd_pfn); + +static struct lock_class_key nvdimm_pfn_key; + +struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn, + struct nd_namespace_common *ndns) +{ + struct device *dev; + + if (!nd_pfn) + return NULL; + + nd_pfn->mode = PFN_MODE_NONE; + nd_pfn->align = nd_pfn_default_alignment(); + dev = &nd_pfn->dev; + device_initialize(&nd_pfn->dev); + lockdep_set_class(&nd_pfn->dev.mutex, &nvdimm_pfn_key); + if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) { + dev_dbg(&ndns->dev, "failed, already claimed by %s\n", + dev_name(ndns->claim)); + put_device(dev); + return NULL; + } + return dev; +} + +static struct nd_pfn *nd_pfn_alloc(struct nd_region *nd_region) +{ + struct nd_pfn *nd_pfn; + struct device *dev; + + nd_pfn = kzalloc(sizeof(*nd_pfn), GFP_KERNEL); + if (!nd_pfn) + return NULL; + + nd_pfn->id = ida_simple_get(&nd_region->pfn_ida, 0, 0, GFP_KERNEL); + if (nd_pfn->id < 0) { + kfree(nd_pfn); + return NULL; + } + + dev = &nd_pfn->dev; + dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id); + dev->type = &nd_pfn_device_type; + dev->parent = &nd_region->dev; + + return nd_pfn; +} + +struct device *nd_pfn_create(struct nd_region *nd_region) +{ + struct nd_pfn *nd_pfn; + struct device *dev; + + if (!is_memory(&nd_region->dev)) + return NULL; + + nd_pfn = nd_pfn_alloc(nd_region); + dev = nd_pfn_devinit(nd_pfn, NULL); + + nd_device_register(dev); + return dev; +} + +/* + * nd_pfn_clear_memmap_errors() clears any errors in the volatile memmap + * space associated with the namespace. If the memmap is set to DRAM, then + * this is a no-op. Since the memmap area is freshly initialized during + * probe, we have an opportunity to clear any badblocks in this area. + */ +static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) +{ + struct nd_region *nd_region = to_nd_region(nd_pfn->dev.parent); + struct nd_namespace_common *ndns = nd_pfn->ndns; + void *zero_page = page_address(ZERO_PAGE(0)); + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + int num_bad, meta_num, rc, bb_present; + sector_t first_bad, meta_start; + struct nd_namespace_io *nsio; + + if (nd_pfn->mode != PFN_MODE_PMEM) + return 0; + + nsio = to_nd_namespace_io(&ndns->dev); + meta_start = (SZ_4K + sizeof(*pfn_sb)) >> 9; + meta_num = (le64_to_cpu(pfn_sb->dataoff) >> 9) - meta_start; + + /* + * re-enable the namespace with correct size so that we can access + * the device memmap area. 
+ */ + devm_namespace_disable(&nd_pfn->dev, ndns); + rc = devm_namespace_enable(&nd_pfn->dev, ndns, le64_to_cpu(pfn_sb->dataoff)); + if (rc) + return rc; + + do { + unsigned long zero_len; + u64 nsoff; + + bb_present = badblocks_check(&nd_region->bb, meta_start, + meta_num, &first_bad, &num_bad); + if (bb_present) { + dev_dbg(&nd_pfn->dev, "meta: %x badblocks at %llx\n", + num_bad, first_bad); + nsoff = ALIGN_DOWN((nd_region->ndr_start + + (first_bad << 9)) - nsio->res.start, + PAGE_SIZE); + zero_len = ALIGN(num_bad << 9, PAGE_SIZE); + while (zero_len) { + unsigned long chunk = min(zero_len, PAGE_SIZE); + + rc = nvdimm_write_bytes(ndns, nsoff, zero_page, + chunk, 0); + if (rc) + break; + + zero_len -= chunk; + nsoff += chunk; + } + if (rc) { + dev_err(&nd_pfn->dev, + "error clearing %x badblocks at %llx\n", + num_bad, first_bad); + return rc; + } + } + } while (bb_present); + + return 0; +} + +static bool nd_supported_alignment(unsigned long align) +{ + int i; + unsigned long supported[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + + if (align == 0) + return false; + + nd_pfn_supported_alignments(supported); + for (i = 0; supported[i]; i++) + if (align == supported[i]) + return true; + return false; +} + +/** + * nd_pfn_validate - read and validate info-block + * @nd_pfn: fsdax namespace runtime state / properties + * @sig: 'devdax' or 'fsdax' signature + * + * Upon return the info-block buffer contents (->pfn_sb) are + * indeterminate when validation fails, and a coherent info-block + * otherwise. + */ +int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) +{ + u64 checksum, offset; + struct resource *res; + enum nd_pfn_mode mode; + resource_size_t res_size; + struct nd_namespace_io *nsio; + unsigned long align, start_pad, end_trunc; + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + struct nd_namespace_common *ndns = nd_pfn->ndns; + const uuid_t *parent_uuid = nd_dev_to_uuid(&ndns->dev); + + if (!pfn_sb || !ndns) + return -ENODEV; + + if (!is_memory(nd_pfn->dev.parent)) + return -ENODEV; + + if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0)) + return -ENXIO; + + if (memcmp(pfn_sb->signature, sig, PFN_SIG_LEN) != 0) + return -ENODEV; + + checksum = le64_to_cpu(pfn_sb->checksum); + pfn_sb->checksum = 0; + if (checksum != nd_sb_checksum((struct nd_gen_sb *) pfn_sb)) + return -ENODEV; + pfn_sb->checksum = cpu_to_le64(checksum); + + if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0) + return -ENODEV; + + if (__le16_to_cpu(pfn_sb->version_minor) < 1) { + pfn_sb->start_pad = 0; + pfn_sb->end_trunc = 0; + } + + if (__le16_to_cpu(pfn_sb->version_minor) < 2) + pfn_sb->align = 0; + + if (__le16_to_cpu(pfn_sb->version_minor) < 4) { + pfn_sb->page_struct_size = cpu_to_le16(64); + pfn_sb->page_size = cpu_to_le32(PAGE_SIZE); + } + + switch (le32_to_cpu(pfn_sb->mode)) { + case PFN_MODE_RAM: + case PFN_MODE_PMEM: + break; + default: + return -ENXIO; + } + + align = le32_to_cpu(pfn_sb->align); + offset = le64_to_cpu(pfn_sb->dataoff); + start_pad = le32_to_cpu(pfn_sb->start_pad); + end_trunc = le32_to_cpu(pfn_sb->end_trunc); + if (align == 0) + align = 1UL << ilog2(offset); + mode = le32_to_cpu(pfn_sb->mode); + + if ((le32_to_cpu(pfn_sb->page_size) > PAGE_SIZE) && + (mode == PFN_MODE_PMEM)) { + dev_err(&nd_pfn->dev, + "init failed, page size mismatch %d\n", + le32_to_cpu(pfn_sb->page_size)); + return -EOPNOTSUPP; + } + + if ((le16_to_cpu(pfn_sb->page_struct_size) < sizeof(struct page)) && + (mode == PFN_MODE_PMEM)) { + dev_err(&nd_pfn->dev, + "init failed, struct page size mismatch %d\n", + 
le16_to_cpu(pfn_sb->page_struct_size)); + return -EOPNOTSUPP; + } + + /* + * Check whether the we support the alignment. For Dax if the + * superblock alignment is not matching, we won't initialize + * the device. + */ + if (!nd_supported_alignment(align) && + !memcmp(pfn_sb->signature, DAX_SIG, PFN_SIG_LEN)) { + dev_err(&nd_pfn->dev, "init failed, alignment mismatch: " + "%ld:%ld\n", nd_pfn->align, align); + return -EOPNOTSUPP; + } + + if (!nd_pfn->uuid) { + /* + * When probing a namepace via nd_pfn_probe() the uuid + * is NULL (see: nd_pfn_devinit()) we init settings from + * pfn_sb + */ + nd_pfn->uuid = kmemdup(pfn_sb->uuid, 16, GFP_KERNEL); + if (!nd_pfn->uuid) + return -ENOMEM; + nd_pfn->align = align; + nd_pfn->mode = mode; + } else { + /* + * When probing a pfn / dax instance we validate the + * live settings against the pfn_sb + */ + if (memcmp(nd_pfn->uuid, pfn_sb->uuid, 16) != 0) + return -ENODEV; + + /* + * If the uuid validates, but other settings mismatch + * return EINVAL because userspace has managed to change + * the configuration without specifying new + * identification. + */ + if (nd_pfn->align != align || nd_pfn->mode != mode) { + dev_err(&nd_pfn->dev, + "init failed, settings mismatch\n"); + dev_dbg(&nd_pfn->dev, "align: %lx:%lx mode: %d:%d\n", + nd_pfn->align, align, nd_pfn->mode, + mode); + return -EOPNOTSUPP; + } + } + + if (align > nvdimm_namespace_capacity(ndns)) { + dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n", + align, nvdimm_namespace_capacity(ndns)); + return -EOPNOTSUPP; + } + + /* + * These warnings are verbose because they can only trigger in + * the case where the physical address alignment of the + * namespace has changed since the pfn superblock was + * established. + */ + nsio = to_nd_namespace_io(&ndns->dev); + res = &nsio->res; + res_size = resource_size(res); + if (offset >= res_size) { + dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n", + dev_name(&ndns->dev)); + return -EOPNOTSUPP; + } + + if ((align && !IS_ALIGNED(res->start + offset + start_pad, align)) + || !IS_ALIGNED(offset, PAGE_SIZE)) { + dev_err(&nd_pfn->dev, + "bad offset: %#llx dax disabled align: %#lx\n", + offset, align); + return -EOPNOTSUPP; + } + + if (!IS_ALIGNED(res->start + start_pad, memremap_compat_align())) { + dev_err(&nd_pfn->dev, "resource start misaligned\n"); + return -EOPNOTSUPP; + } + + if (!IS_ALIGNED(res->end + 1 - end_trunc, memremap_compat_align())) { + dev_err(&nd_pfn->dev, "resource end misaligned\n"); + return -EOPNOTSUPP; + } + + if (offset >= (res_size - start_pad - end_trunc)) { + dev_err(&nd_pfn->dev, "bad offset with small namespace\n"); + return -EOPNOTSUPP; + } + return 0; +} +EXPORT_SYMBOL(nd_pfn_validate); + +int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns) +{ + int rc; + struct nd_pfn *nd_pfn; + struct device *pfn_dev; + struct nd_pfn_sb *pfn_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + case NVDIMM_CCLASS_PFN: + break; + default: + return -ENODEV; + } + + nvdimm_bus_lock(&ndns->dev); + nd_pfn = nd_pfn_alloc(nd_region); + pfn_dev = nd_pfn_devinit(nd_pfn, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!pfn_dev) + return -ENOMEM; + pfn_sb = devm_kmalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); + nd_pfn = to_nd_pfn(pfn_dev); + nd_pfn->pfn_sb = pfn_sb; + rc = nd_pfn_validate(nd_pfn, PFN_SIG); + dev_dbg(dev, "pfn: %s\n", rc == 0 ? 
dev_name(pfn_dev) : "<none>"); + if (rc < 0) { + nd_detach_ndns(pfn_dev, &nd_pfn->ndns); + put_device(pfn_dev); + } else + nd_device_register(pfn_dev); + + return rc; +} +EXPORT_SYMBOL(nd_pfn_probe); + +/* + * We hotplug memory at sub-section granularity, pad the reserved area + * from the previous section base to the namespace base address. + */ +static unsigned long init_altmap_base(resource_size_t base) +{ + unsigned long base_pfn = PHYS_PFN(base); + + return SUBSECTION_ALIGN_DOWN(base_pfn); +} + +static unsigned long init_altmap_reserve(resource_size_t base) +{ + unsigned long reserve = nd_info_block_reserve() >> PAGE_SHIFT; + unsigned long base_pfn = PHYS_PFN(base); + + reserve += base_pfn - SUBSECTION_ALIGN_DOWN(base_pfn); + return reserve; +} + +static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) +{ + struct range *range = &pgmap->range; + struct vmem_altmap *altmap = &pgmap->altmap; + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + u64 offset = le64_to_cpu(pfn_sb->dataoff); + u32 start_pad = __le32_to_cpu(pfn_sb->start_pad); + u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc); + u32 reserve = nd_info_block_reserve(); + struct nd_namespace_common *ndns = nd_pfn->ndns; + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + resource_size_t base = nsio->res.start + start_pad; + resource_size_t end = nsio->res.end - end_trunc; + struct vmem_altmap __altmap = { + .base_pfn = init_altmap_base(base), + .reserve = init_altmap_reserve(base), + .end_pfn = PHYS_PFN(end), + }; + + *range = (struct range) { + .start = nsio->res.start + start_pad, + .end = nsio->res.end - end_trunc, + }; + pgmap->nr_range = 1; + if (nd_pfn->mode == PFN_MODE_RAM) { + if (offset < reserve) + return -EINVAL; + nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); + } else if (nd_pfn->mode == PFN_MODE_PMEM) { + nd_pfn->npfns = PHYS_PFN((range_len(range) - offset)); + if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) + dev_info(&nd_pfn->dev, + "number of pfns truncated from %lld to %ld\n", + le64_to_cpu(nd_pfn->pfn_sb->npfns), + nd_pfn->npfns); + memcpy(altmap, &__altmap, sizeof(*altmap)); + altmap->free = PHYS_PFN(offset - reserve); + altmap->alloc = 0; + pgmap->flags |= PGMAP_ALTMAP_VALID; + } else + return -ENXIO; + + return 0; +} + +static int nd_pfn_init(struct nd_pfn *nd_pfn) +{ + struct nd_namespace_common *ndns = nd_pfn->ndns; + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + resource_size_t start, size; + struct nd_region *nd_region; + unsigned long npfns, align; + u32 end_trunc; + struct nd_pfn_sb *pfn_sb; + phys_addr_t offset; + const char *sig; + u64 checksum; + int rc; + + pfn_sb = devm_kmalloc(&nd_pfn->dev, sizeof(*pfn_sb), GFP_KERNEL); + if (!pfn_sb) + return -ENOMEM; + + nd_pfn->pfn_sb = pfn_sb; + if (is_nd_dax(&nd_pfn->dev)) + sig = DAX_SIG; + else + sig = PFN_SIG; + + rc = nd_pfn_validate(nd_pfn, sig); + if (rc == 0) + return nd_pfn_clear_memmap_errors(nd_pfn); + if (rc != -ENODEV) + return rc; + + /* no info block, do init */; + memset(pfn_sb, 0, sizeof(*pfn_sb)); + + nd_region = to_nd_region(nd_pfn->dev.parent); + if (nd_region->ro) { + dev_info(&nd_pfn->dev, + "%s is read-only, unable to init metadata\n", + dev_name(&nd_region->dev)); + return -ENXIO; + } + + start = nsio->res.start; + size = resource_size(&nsio->res); + npfns = PHYS_PFN(size - SZ_8K); + align = max(nd_pfn->align, memremap_compat_align()); + + /* + * When @start is misaligned fail namespace creation. See + * the 'struct nd_pfn_sb' commentary on why ->start_pad is not + * an option. 
+ */ + if (!IS_ALIGNED(start, memremap_compat_align())) { + dev_err(&nd_pfn->dev, "%s: start %pa misaligned to %#lx\n", + dev_name(&ndns->dev), &start, + memremap_compat_align()); + return -EINVAL; + } + end_trunc = start + size - ALIGN_DOWN(start + size, align); + if (nd_pfn->mode == PFN_MODE_PMEM) { + unsigned long page_map_size = MAX_STRUCT_PAGE_SIZE * npfns; + + /* + * The altmap should be padded out to the block size used + * when populating the vmemmap. This *should* be equal to + * PMD_SIZE for most architectures. + * + * Also make sure size of struct page is less than + * MAX_STRUCT_PAGE_SIZE. The goal here is compatibility in the + * face of production kernel configurations that reduce the + * 'struct page' size below MAX_STRUCT_PAGE_SIZE. For debug + * kernel configurations that increase the 'struct page' size + * above MAX_STRUCT_PAGE_SIZE, the page_struct_override allows + * for continuing with the capacity that will be wasted when + * reverting to a production kernel configuration. Otherwise, + * those configurations are blocked by default. + */ + if (sizeof(struct page) > MAX_STRUCT_PAGE_SIZE) { + if (page_struct_override) + page_map_size = sizeof(struct page) * npfns; + else { + dev_err(&nd_pfn->dev, + "Memory debug options prevent using pmem for the page map\n"); + return -EINVAL; + } + } + offset = ALIGN(start + SZ_8K + page_map_size, align) - start; + } else if (nd_pfn->mode == PFN_MODE_RAM) + offset = ALIGN(start + SZ_8K, align) - start; + else + return -ENXIO; + + if (offset >= (size - end_trunc)) { + /* This results in zero size devices */ + dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n", + dev_name(&ndns->dev)); + return -ENXIO; + } + + npfns = PHYS_PFN(size - offset - end_trunc); + pfn_sb->mode = cpu_to_le32(nd_pfn->mode); + pfn_sb->dataoff = cpu_to_le64(offset); + pfn_sb->npfns = cpu_to_le64(npfns); + memcpy(pfn_sb->signature, sig, PFN_SIG_LEN); + memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); + memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); + pfn_sb->version_major = cpu_to_le16(1); + pfn_sb->version_minor = cpu_to_le16(4); + pfn_sb->end_trunc = cpu_to_le32(end_trunc); + pfn_sb->align = cpu_to_le32(nd_pfn->align); + if (sizeof(struct page) > MAX_STRUCT_PAGE_SIZE && page_struct_override) + pfn_sb->page_struct_size = cpu_to_le16(sizeof(struct page)); + else + pfn_sb->page_struct_size = cpu_to_le16(MAX_STRUCT_PAGE_SIZE); + pfn_sb->page_size = cpu_to_le32(PAGE_SIZE); + checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); + pfn_sb->checksum = cpu_to_le64(checksum); + + rc = nd_pfn_clear_memmap_errors(nd_pfn); + if (rc) + return rc; + + return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0); +} + +/* + * Determine the effective resource range and vmem_altmap from an nd_pfn + * instance. + */ +int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) +{ + int rc; + + if (!nd_pfn->uuid || !nd_pfn->ndns) + return -ENODEV; + + rc = nd_pfn_init(nd_pfn); + if (rc) + return rc; + + /* we need a valid pfn_sb before we can init a dev_pagemap */ + return __nvdimm_setup_pfn(nd_pfn, pgmap); +} +EXPORT_SYMBOL_GPL(nvdimm_setup_pfn); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c new file mode 100644 index 0000000000..4e8fdcb3f1 --- /dev/null +++ b/drivers/nvdimm/pmem.c @@ -0,0 +1,768 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Persistent Memory Driver + * + * Copyright (c) 2014-2015, Intel Corporation. + * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>. 
+ * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>. + */ + +#include <linux/blkdev.h> +#include <linux/pagemap.h> +#include <linux/hdreg.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/set_memory.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/badblocks.h> +#include <linux/memremap.h> +#include <linux/kstrtox.h> +#include <linux/vmalloc.h> +#include <linux/blk-mq.h> +#include <linux/pfn_t.h> +#include <linux/slab.h> +#include <linux/uio.h> +#include <linux/dax.h> +#include <linux/nd.h> +#include <linux/mm.h> +#include <asm/cacheflush.h> +#include "pmem.h" +#include "btt.h" +#include "pfn.h" +#include "nd.h" + +static struct device *to_dev(struct pmem_device *pmem) +{ + /* + * nvdimm bus services need a 'dev' parameter, and we record the device + * at init in bb.dev. + */ + return pmem->bb.dev; +} + +static struct nd_region *to_region(struct pmem_device *pmem) +{ + return to_nd_region(to_dev(pmem)->parent); +} + +static phys_addr_t pmem_to_phys(struct pmem_device *pmem, phys_addr_t offset) +{ + return pmem->phys_addr + offset; +} + +static sector_t to_sect(struct pmem_device *pmem, phys_addr_t offset) +{ + return (offset - pmem->data_offset) >> SECTOR_SHIFT; +} + +static phys_addr_t to_offset(struct pmem_device *pmem, sector_t sector) +{ + return (sector << SECTOR_SHIFT) + pmem->data_offset; +} + +static void pmem_mkpage_present(struct pmem_device *pmem, phys_addr_t offset, + unsigned int len) +{ + phys_addr_t phys = pmem_to_phys(pmem, offset); + unsigned long pfn_start, pfn_end, pfn; + + /* only pmem in the linear map supports HWPoison */ + if (is_vmalloc_addr(pmem->virt_addr)) + return; + + pfn_start = PHYS_PFN(phys); + pfn_end = pfn_start + PHYS_PFN(len); + for (pfn = pfn_start; pfn < pfn_end; pfn++) { + struct page *page = pfn_to_page(pfn); + + /* + * Note, no need to hold a get_dev_pagemap() reference + * here since we're in the driver I/O path and + * outstanding I/O requests pin the dev_pagemap. 
+ */ + if (test_and_clear_pmem_poison(page)) + clear_mce_nospec(pfn); + } +} + +static void pmem_clear_bb(struct pmem_device *pmem, sector_t sector, long blks) +{ + if (blks == 0) + return; + badblocks_clear(&pmem->bb, sector, blks); + if (pmem->bb_state) + sysfs_notify_dirent(pmem->bb_state); +} + +static long __pmem_clear_poison(struct pmem_device *pmem, + phys_addr_t offset, unsigned int len) +{ + phys_addr_t phys = pmem_to_phys(pmem, offset); + long cleared = nvdimm_clear_poison(to_dev(pmem), phys, len); + + if (cleared > 0) { + pmem_mkpage_present(pmem, offset, cleared); + arch_invalidate_pmem(pmem->virt_addr + offset, len); + } + return cleared; +} + +static blk_status_t pmem_clear_poison(struct pmem_device *pmem, + phys_addr_t offset, unsigned int len) +{ + long cleared = __pmem_clear_poison(pmem, offset, len); + + if (cleared < 0) + return BLK_STS_IOERR; + + pmem_clear_bb(pmem, to_sect(pmem, offset), cleared >> SECTOR_SHIFT); + if (cleared < len) + return BLK_STS_IOERR; + return BLK_STS_OK; +} + +static void write_pmem(void *pmem_addr, struct page *page, + unsigned int off, unsigned int len) +{ + unsigned int chunk; + void *mem; + + while (len) { + mem = kmap_atomic(page); + chunk = min_t(unsigned int, len, PAGE_SIZE - off); + memcpy_flushcache(pmem_addr, mem + off, chunk); + kunmap_atomic(mem); + len -= chunk; + off = 0; + page++; + pmem_addr += chunk; + } +} + +static blk_status_t read_pmem(struct page *page, unsigned int off, + void *pmem_addr, unsigned int len) +{ + unsigned int chunk; + unsigned long rem; + void *mem; + + while (len) { + mem = kmap_atomic(page); + chunk = min_t(unsigned int, len, PAGE_SIZE - off); + rem = copy_mc_to_kernel(mem + off, pmem_addr, chunk); + kunmap_atomic(mem); + if (rem) + return BLK_STS_IOERR; + len -= chunk; + off = 0; + page++; + pmem_addr += chunk; + } + return BLK_STS_OK; +} + +static blk_status_t pmem_do_read(struct pmem_device *pmem, + struct page *page, unsigned int page_off, + sector_t sector, unsigned int len) +{ + blk_status_t rc; + phys_addr_t pmem_off = to_offset(pmem, sector); + void *pmem_addr = pmem->virt_addr + pmem_off; + + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) + return BLK_STS_IOERR; + + rc = read_pmem(page, page_off, pmem_addr, len); + flush_dcache_page(page); + return rc; +} + +static blk_status_t pmem_do_write(struct pmem_device *pmem, + struct page *page, unsigned int page_off, + sector_t sector, unsigned int len) +{ + phys_addr_t pmem_off = to_offset(pmem, sector); + void *pmem_addr = pmem->virt_addr + pmem_off; + + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) { + blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len); + + if (rc != BLK_STS_OK) + return rc; + } + + flush_dcache_page(page); + write_pmem(pmem_addr, page, page_off, len); + + return BLK_STS_OK; +} + +static void pmem_submit_bio(struct bio *bio) +{ + int ret = 0; + blk_status_t rc = 0; + bool do_acct; + unsigned long start; + struct bio_vec bvec; + struct bvec_iter iter; + struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data; + struct nd_region *nd_region = to_region(pmem); + + if (bio->bi_opf & REQ_PREFLUSH) + ret = nvdimm_flush(nd_region, bio); + + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); + if (do_acct) + start = bio_start_io_acct(bio); + bio_for_each_segment(bvec, bio, iter) { + if (op_is_write(bio_op(bio))) + rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset, + iter.bi_sector, bvec.bv_len); + else + rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset, + iter.bi_sector, bvec.bv_len); + if (rc) { + 
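/*
 * The first failing segment latches the bio status and ends the loop;
 * I/O accounting and any REQ_FUA flush below still run before
 * bio_endio() completes the bio.
 */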
bio->bi_status = rc; + break; + } + } + if (do_acct) + bio_end_io_acct(bio, start); + + if (bio->bi_opf & REQ_FUA) + ret = nvdimm_flush(nd_region, bio); + + if (ret) + bio->bi_status = errno_to_blk_status(ret); + + bio_endio(bio); +} + +/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ +__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) +{ + resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; + sector_t sector = PFN_PHYS(pgoff) >> SECTOR_SHIFT; + unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT; + struct badblocks *bb = &pmem->bb; + sector_t first_bad; + int num_bad; + + if (kaddr) + *kaddr = pmem->virt_addr + offset; + if (pfn) + *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); + + if (bb->count && + badblocks_check(bb, sector, num, &first_bad, &num_bad)) { + long actual_nr; + + if (mode != DAX_RECOVERY_WRITE) + return -EHWPOISON; + + /* + * Set the recovery stride is set to kernel page size because + * the underlying driver and firmware clear poison functions + * don't appear to handle large chunk(such as 2MiB) reliably. + */ + actual_nr = PHYS_PFN( + PAGE_ALIGN((first_bad - sector) << SECTOR_SHIFT)); + dev_dbg(pmem->bb.dev, "start sector(%llu), nr_pages(%ld), first_bad(%llu), actual_nr(%ld)\n", + sector, nr_pages, first_bad, actual_nr); + if (actual_nr) + return actual_nr; + return 1; + } + + /* + * If badblocks are present but not in the range, limit known good range + * to the requested range. + */ + if (bb->count) + return nr_pages; + return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); +} + +static const struct block_device_operations pmem_fops = { + .owner = THIS_MODULE, + .submit_bio = pmem_submit_bio, +}; + +static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + size_t nr_pages) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + + return blk_status_to_errno(pmem_do_write(pmem, ZERO_PAGE(0), 0, + PFN_PHYS(pgoff) >> SECTOR_SHIFT, + PAGE_SIZE)); +} + +static long pmem_dax_direct_access(struct dax_device *dax_dev, + pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, + void **kaddr, pfn_t *pfn) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + + return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn); +} + +/* + * The recovery write thread started out as a normal pwrite thread and + * when the filesystem was told about potential media error in the + * range, filesystem turns the normal pwrite to a dax_recovery_write. + * + * The recovery write consists of clearing media poison, clearing page + * HWPoison bit, reenable page-wide read-write permission, flush the + * caches and finally write. A competing pread thread will be held + * off during the recovery process since data read back might not be + * valid, and this is achieved by clearing the badblock records after + * the recovery write is complete. Competing recovery write threads + * are already serialized by writer lock held by dax_iomap_rw(). 
+ */ +static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + size_t olen, len, off; + phys_addr_t pmem_off; + struct device *dev = pmem->bb.dev; + long cleared; + + off = offset_in_page(addr); + len = PFN_PHYS(PFN_UP(off + bytes)); + if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len)) + return _copy_from_iter_flushcache(addr, bytes, i); + + /* + * Not page-aligned range cannot be recovered. This should not + * happen unless something else went wrong. + */ + if (off || !PAGE_ALIGNED(bytes)) { + dev_dbg(dev, "Found poison, but addr(%p) or bytes(%#zx) not page aligned\n", + addr, bytes); + return 0; + } + + pmem_off = PFN_PHYS(pgoff) + pmem->data_offset; + cleared = __pmem_clear_poison(pmem, pmem_off, len); + if (cleared > 0 && cleared < len) { + dev_dbg(dev, "poison cleared only %ld out of %zu bytes\n", + cleared, len); + return 0; + } + if (cleared < 0) { + dev_dbg(dev, "poison clear failed: %ld\n", cleared); + return 0; + } + + olen = _copy_from_iter_flushcache(addr, bytes, i); + pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT); + + return olen; +} + +static const struct dax_operations pmem_dax_ops = { + .direct_access = pmem_dax_direct_access, + .zero_page_range = pmem_dax_zero_page_range, + .recovery_write = pmem_recovery_write, +}; + +static ssize_t write_cache_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pmem_device *pmem = dev_to_disk(dev)->private_data; + + return sprintf(buf, "%d\n", !!dax_write_cache_enabled(pmem->dax_dev)); +} + +static ssize_t write_cache_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct pmem_device *pmem = dev_to_disk(dev)->private_data; + bool write_cache; + int rc; + + rc = kstrtobool(buf, &write_cache); + if (rc) + return rc; + dax_write_cache(pmem->dax_dev, write_cache); + return len; +} +static DEVICE_ATTR_RW(write_cache); + +static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n) +{ +#ifndef CONFIG_ARCH_HAS_PMEM_API + if (a == &dev_attr_write_cache.attr) + return 0; +#endif + return a->mode; +} + +static struct attribute *dax_attributes[] = { + &dev_attr_write_cache.attr, + NULL, +}; + +static const struct attribute_group dax_attribute_group = { + .name = "dax", + .attrs = dax_attributes, + .is_visible = dax_visible, +}; + +static const struct attribute_group *pmem_attribute_groups[] = { + &dax_attribute_group, + NULL, +}; + +static void pmem_release_disk(void *__pmem) +{ + struct pmem_device *pmem = __pmem; + + dax_remove_host(pmem->disk); + kill_dax(pmem->dax_dev); + put_dax(pmem->dax_dev); + del_gendisk(pmem->disk); + + put_disk(pmem->disk); +} + +static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap, + unsigned long pfn, unsigned long nr_pages, int mf_flags) +{ + struct pmem_device *pmem = + container_of(pgmap, struct pmem_device, pgmap); + u64 offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset; + u64 len = nr_pages << PAGE_SHIFT; + + return dax_holder_notify_failure(pmem->dax_dev, offset, len, mf_flags); +} + +static const struct dev_pagemap_ops fsdax_pagemap_ops = { + .memory_failure = pmem_pagemap_memory_failure, +}; + +static int pmem_attach_disk(struct device *dev, + struct nd_namespace_common *ndns) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + int nid = dev_to_node(dev), fua; 
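/*
 * A rough map of the three attach paths chosen further down in this
 * function, based only on the branches visible below:
 *
 *   is_nd_pfn(dev):             a pfn/dax info block was parsed, so the
 *                               range is mapped with devm_memremap_pages()
 *                               and data_offset/pfn_pad come from pfn_sb;
 *   pmem_should_map_pages(dev): no info block, but the whole resource is
 *                               still mapped with struct page coverage;
 *   otherwise:                  plain devm_memremap(), no struct pages, so
 *                               PFN_MAP (and thus QUEUE_FLAG_DAX) stays off.
 */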
+ struct resource *res = &nsio->res; + struct range bb_range; + struct nd_pfn *nd_pfn = NULL; + struct dax_device *dax_dev; + struct nd_pfn_sb *pfn_sb; + struct pmem_device *pmem; + struct request_queue *q; + struct gendisk *disk; + void *addr; + int rc; + + pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); + if (!pmem) + return -ENOMEM; + + rc = devm_namespace_enable(dev, ndns, nd_info_block_reserve()); + if (rc) + return rc; + + /* while nsio_rw_bytes is active, parse a pfn info block if present */ + if (is_nd_pfn(dev)) { + nd_pfn = to_nd_pfn(dev); + rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap); + if (rc) + return rc; + } + + /* we're attaching a block device, disable raw namespace access */ + devm_namespace_disable(dev, ndns); + + dev_set_drvdata(dev, pmem); + pmem->phys_addr = res->start; + pmem->size = resource_size(res); + fua = nvdimm_has_flush(nd_region); + if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) || fua < 0) { + dev_warn(dev, "unable to guarantee persistence of writes\n"); + fua = 0; + } + + if (!devm_request_mem_region(dev, res->start, resource_size(res), + dev_name(&ndns->dev))) { + dev_warn(dev, "could not reserve region %pR\n", res); + return -EBUSY; + } + + disk = blk_alloc_disk(nid); + if (!disk) + return -ENOMEM; + q = disk->queue; + + pmem->disk = disk; + pmem->pgmap.owner = pmem; + pmem->pfn_flags = PFN_DEV; + if (is_nd_pfn(dev)) { + pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; + addr = devm_memremap_pages(dev, &pmem->pgmap); + pfn_sb = nd_pfn->pfn_sb; + pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); + pmem->pfn_pad = resource_size(res) - + range_len(&pmem->pgmap.range); + pmem->pfn_flags |= PFN_MAP; + bb_range = pmem->pgmap.range; + bb_range.start += pmem->data_offset; + } else if (pmem_should_map_pages(dev)) { + pmem->pgmap.range.start = res->start; + pmem->pgmap.range.end = res->end; + pmem->pgmap.nr_range = 1; + pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; + addr = devm_memremap_pages(dev, &pmem->pgmap); + pmem->pfn_flags |= PFN_MAP; + bb_range = pmem->pgmap.range; + } else { + addr = devm_memremap(dev, pmem->phys_addr, + pmem->size, ARCH_MEMREMAP_PMEM); + bb_range.start = res->start; + bb_range.end = res->end; + } + + if (IS_ERR(addr)) { + rc = PTR_ERR(addr); + goto out; + } + pmem->virt_addr = addr; + + blk_queue_write_cache(q, true, fua); + blk_queue_physical_block_size(q, PAGE_SIZE); + blk_queue_logical_block_size(q, pmem_sector_size(ndns)); + blk_queue_max_hw_sectors(q, UINT_MAX); + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); + if (pmem->pfn_flags & PFN_MAP) + blk_queue_flag_set(QUEUE_FLAG_DAX, q); + + disk->fops = &pmem_fops; + disk->private_data = pmem; + nvdimm_namespace_disk_name(ndns, disk->disk_name); + set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) + / 512); + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range); + disk->bb = &pmem->bb; + + dax_dev = alloc_dax(pmem, &pmem_dax_ops); + if (IS_ERR(dax_dev)) { + rc = PTR_ERR(dax_dev); + goto out; + } + set_dax_nocache(dax_dev); + set_dax_nomc(dax_dev); + if (is_nvdimm_sync(nd_region)) + set_dax_synchronous(dax_dev); + rc = dax_add_host(dax_dev, disk); + if (rc) + goto out_cleanup_dax; + dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); + pmem->dax_dev = dax_dev; + + rc = device_add_disk(dev, disk, pmem_attribute_groups); + if (rc) + goto out_remove_host; + if 
(devm_add_action_or_reset(dev, pmem_release_disk, pmem)) + return -ENOMEM; + + nvdimm_check_and_set_ro(disk); + + pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd, + "badblocks"); + if (!pmem->bb_state) + dev_warn(dev, "'badblocks' notification disabled\n"); + return 0; + +out_remove_host: + dax_remove_host(pmem->disk); +out_cleanup_dax: + kill_dax(pmem->dax_dev); + put_dax(pmem->dax_dev); +out: + put_disk(pmem->disk); + return rc; +} + +static int nd_pmem_probe(struct device *dev) +{ + int ret; + struct nd_namespace_common *ndns; + + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); + + if (is_nd_btt(dev)) + return nvdimm_namespace_attach_btt(ndns); + + if (is_nd_pfn(dev)) + return pmem_attach_disk(dev, ndns); + + ret = devm_namespace_enable(dev, ndns, nd_info_block_reserve()); + if (ret) + return ret; + + ret = nd_btt_probe(dev, ndns); + if (ret == 0) + return -ENXIO; + + /* + * We have two failure conditions here, there is no + * info reserver block or we found a valid info reserve block + * but failed to initialize the pfn superblock. + * + * For the first case consider namespace as a raw pmem namespace + * and attach a disk. + * + * For the latter, consider this a success and advance the namespace + * seed. + */ + ret = nd_pfn_probe(dev, ndns); + if (ret == 0) + return -ENXIO; + else if (ret == -EOPNOTSUPP) + return ret; + + ret = nd_dax_probe(dev, ndns); + if (ret == 0) + return -ENXIO; + else if (ret == -EOPNOTSUPP) + return ret; + + /* probe complete, attach handles namespace enabling */ + devm_namespace_disable(dev, ndns); + + return pmem_attach_disk(dev, ndns); +} + +static void nd_pmem_remove(struct device *dev) +{ + struct pmem_device *pmem = dev_get_drvdata(dev); + + if (is_nd_btt(dev)) + nvdimm_namespace_detach_btt(to_nd_btt(dev)); + else { + /* + * Note, this assumes device_lock() context to not + * race nd_pmem_notify() + */ + sysfs_put(pmem->bb_state); + pmem->bb_state = NULL; + } + nvdimm_flush(to_nd_region(dev->parent), NULL); +} + +static void nd_pmem_shutdown(struct device *dev) +{ + nvdimm_flush(to_nd_region(dev->parent), NULL); +} + +static void pmem_revalidate_poison(struct device *dev) +{ + struct nd_region *nd_region; + resource_size_t offset = 0, end_trunc = 0; + struct nd_namespace_common *ndns; + struct nd_namespace_io *nsio; + struct badblocks *bb; + struct range range; + struct kernfs_node *bb_state; + + if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + ndns = nd_btt->ndns; + nd_region = to_nd_region(ndns->dev.parent); + nsio = to_nd_namespace_io(&ndns->dev); + bb = &nsio->bb; + bb_state = NULL; + } else { + struct pmem_device *pmem = dev_get_drvdata(dev); + + nd_region = to_region(pmem); + bb = &pmem->bb; + bb_state = pmem->bb_state; + + if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + + ndns = nd_pfn->ndns; + offset = pmem->data_offset + + __le32_to_cpu(pfn_sb->start_pad); + end_trunc = __le32_to_cpu(pfn_sb->end_trunc); + } else { + ndns = to_ndns(dev); + } + + nsio = to_nd_namespace_io(&ndns->dev); + } + + range.start = nsio->res.start + offset; + range.end = nsio->res.end - end_trunc; + nvdimm_badblocks_populate(nd_region, bb, &range); + if (bb_state) + sysfs_notify_dirent(bb_state); +} + +static void pmem_revalidate_region(struct device *dev) +{ + struct pmem_device *pmem; + + if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + struct btt *btt = nd_btt->btt; + + nvdimm_check_and_set_ro(btt->btt_disk); + return; + 
} + + pmem = dev_get_drvdata(dev); + nvdimm_check_and_set_ro(pmem->disk); +} + +static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) +{ + switch (event) { + case NVDIMM_REVALIDATE_POISON: + pmem_revalidate_poison(dev); + break; + case NVDIMM_REVALIDATE_REGION: + pmem_revalidate_region(dev); + break; + default: + dev_WARN_ONCE(dev, 1, "notify: unknown event: %d\n", event); + break; + } +} + +MODULE_ALIAS("pmem"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM); +static struct nd_device_driver nd_pmem_driver = { + .probe = nd_pmem_probe, + .remove = nd_pmem_remove, + .notify = nd_pmem_notify, + .shutdown = nd_pmem_shutdown, + .drv = { + .name = "nd_pmem", + }, + .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM, +}; + +module_nd_driver(nd_pmem_driver); + +MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h new file mode 100644 index 0000000000..392b0b38ac --- /dev/null +++ b/drivers/nvdimm/pmem.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NVDIMM_PMEM_H__ +#define __NVDIMM_PMEM_H__ +#include <linux/page-flags.h> +#include <linux/badblocks.h> +#include <linux/memremap.h> +#include <linux/types.h> +#include <linux/pfn_t.h> +#include <linux/fs.h> + +enum dax_access_mode; + +/* this definition is in it's own header for tools/testing/nvdimm to consume */ +struct pmem_device { + /* One contiguous memory region per device */ + phys_addr_t phys_addr; + /* when non-zero this device is hosting a 'pfn' instance */ + phys_addr_t data_offset; + u64 pfn_flags; + void *virt_addr; + /* immutable base size of the namespace */ + size_t size; + /* trim size when namespace capacity has been section aligned */ + u32 pfn_pad; + struct kernfs_node *bb_state; + struct badblocks bb; + struct dax_device *dax_dev; + struct gendisk *disk; + struct dev_pagemap pgmap; +}; + +long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn); + +#ifdef CONFIG_MEMORY_FAILURE +static inline bool test_and_clear_pmem_poison(struct page *page) +{ + return TestClearPageHWPoison(page); +} +#else +static inline bool test_and_clear_pmem_poison(struct page *page) +{ + return false; +} +#endif +#endif /* __NVDIMM_PMEM_H__ */ diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c new file mode 100644 index 0000000000..88dc062af5 --- /dev/null +++ b/drivers/nvdimm/region.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#include <linux/memregion.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" + +static int nd_region_probe(struct device *dev) +{ + int err, rc; + static unsigned long once; + struct nd_region_data *ndrd; + struct nd_region *nd_region = to_nd_region(dev); + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + nd_region->ndr_size - 1, + }; + + if (nd_region->num_lanes > num_online_cpus() + && nd_region->num_lanes < num_possible_cpus() + && !test_and_set_bit(0, &once)) { + dev_dbg(dev, "online cpus (%d) < concurrent i/o lanes (%d) < possible cpus (%d)\n", + num_online_cpus(), nd_region->num_lanes, + num_possible_cpus()); + dev_dbg(dev, "setting nr_cpus=%d may yield better libnvdimm device performance\n", + nd_region->num_lanes); + } + + rc = nd_region_activate(nd_region); + if (rc) + return rc; + + if (devm_init_badblocks(dev, &nd_region->bb)) + return -ENODEV; + nd_region->bb_state = + sysfs_get_dirent(nd_region->dev.kobj.sd, "badblocks"); + if (!nd_region->bb_state) + dev_warn(dev, "'badblocks' notification disabled\n"); + nvdimm_badblocks_populate(nd_region, &nd_region->bb, &range); + + rc = nd_region_register_namespaces(nd_region, &err); + if (rc < 0) + return rc; + + ndrd = dev_get_drvdata(dev); + ndrd->ns_active = rc; + ndrd->ns_count = rc + err; + + if (rc && err && rc == err) + return -ENODEV; + + nd_region->btt_seed = nd_btt_create(nd_region); + nd_region->pfn_seed = nd_pfn_create(nd_region); + nd_region->dax_seed = nd_dax_create(nd_region); + if (err == 0) + return 0; + + /* + * Given multiple namespaces per region, we do not want to + * disable all the successfully registered peer namespaces upon + * a single registration failure. If userspace is missing a + * namespace that it expects it can disable/re-enable the region + * to retry discovery after correcting the failure. + * <regionX>/namespaces returns the current + * "<async-registered>/<total>" namespace count. + */ + dev_err(dev, "failed to register %d namespace%s, continuing...\n", + err, err == 1 ? "" : "s"); + return 0; +} + +static int child_unregister(struct device *dev, void *data) +{ + nd_device_unregister(dev, ND_SYNC); + return 0; +} + +static void nd_region_remove(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + + device_for_each_child(dev, NULL, child_unregister); + + /* flush attribute readers and disable */ + nvdimm_bus_lock(dev); + nd_region->ns_seed = NULL; + nd_region->btt_seed = NULL; + nd_region->pfn_seed = NULL; + nd_region->dax_seed = NULL; + dev_set_drvdata(dev, NULL); + nvdimm_bus_unlock(dev); + + /* + * Note, this assumes device_lock() context to not race + * nd_region_notify() + */ + sysfs_put(nd_region->bb_state); + nd_region->bb_state = NULL; + + /* + * Try to flush caches here since a disabled region may be subject to + * secure erase while disabled, and previous dirty data should not be + * written back to a new instance of the region. This only matters on + * bare metal where security commands are available, so silent failure + * here is ok. 
+ */ + if (cpu_cache_has_invalidate_memregion()) + cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY); +} + +static int child_notify(struct device *dev, void *data) +{ + nd_device_notify(dev, *(enum nvdimm_event *) data); + return 0; +} + +static void nd_region_notify(struct device *dev, enum nvdimm_event event) +{ + if (event == NVDIMM_REVALIDATE_POISON) { + struct nd_region *nd_region = to_nd_region(dev); + + if (is_memory(&nd_region->dev)) { + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + + nd_region->ndr_size - 1, + }; + + nvdimm_badblocks_populate(nd_region, + &nd_region->bb, &range); + if (nd_region->bb_state) + sysfs_notify_dirent(nd_region->bb_state); + } + } + device_for_each_child(dev, &event, child_notify); +} + +static struct nd_device_driver nd_region_driver = { + .probe = nd_region_probe, + .remove = nd_region_remove, + .notify = nd_region_notify, + .drv = { + .name = "nd_region", + }, + .type = ND_DRIVER_REGION_BLK | ND_DRIVER_REGION_PMEM, +}; + +int __init nd_region_init(void) +{ + return nd_driver_register(&nd_region_driver); +} + +void nd_region_exit(void) +{ + driver_unregister(&nd_region_driver.drv); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c new file mode 100644 index 0000000000..e2f1fb9970 --- /dev/null +++ b/drivers/nvdimm/region_devs.c @@ -0,0 +1,1273 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + */ +#include <linux/scatterlist.h> +#include <linux/memregion.h> +#include <linux/highmem.h> +#include <linux/kstrtox.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/hash.h> +#include <linux/sort.h> +#include <linux/io.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" + +/* + * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is + * irrelevant. + */ +#include <linux/io-64-nonatomic-hi-lo.h> + +static DEFINE_PER_CPU(int, flush_idx); + +static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm, + struct nd_region_data *ndrd) +{ + int i, j; + + dev_dbg(dev, "%s: map %d flush address%s\n", nvdimm_name(nvdimm), + nvdimm->num_flush, nvdimm->num_flush == 1 ? 
"" : "es"); + for (i = 0; i < (1 << ndrd->hints_shift); i++) { + struct resource *res = &nvdimm->flush_wpq[i]; + unsigned long pfn = PHYS_PFN(res->start); + void __iomem *flush_page; + + /* check if flush hints share a page */ + for (j = 0; j < i; j++) { + struct resource *res_j = &nvdimm->flush_wpq[j]; + unsigned long pfn_j = PHYS_PFN(res_j->start); + + if (pfn == pfn_j) + break; + } + + if (j < i) + flush_page = (void __iomem *) ((unsigned long) + ndrd_get_flush_wpq(ndrd, dimm, j) + & PAGE_MASK); + else + flush_page = devm_nvdimm_ioremap(dev, + PFN_PHYS(pfn), PAGE_SIZE); + if (!flush_page) + return -ENXIO; + ndrd_set_flush_wpq(ndrd, dimm, i, flush_page + + (res->start & ~PAGE_MASK)); + } + + return 0; +} + +static int nd_region_invalidate_memregion(struct nd_region *nd_region) +{ + int i, incoherent = 0; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (test_bit(NDD_INCOHERENT, &nvdimm->flags)) { + incoherent++; + break; + } + } + + if (!incoherent) + return 0; + + if (!cpu_cache_has_invalidate_memregion()) { + if (IS_ENABLED(CONFIG_NVDIMM_SECURITY_TEST)) { + dev_warn( + &nd_region->dev, + "Bypassing cpu_cache_invalidate_memergion() for testing!\n"); + goto out; + } else { + dev_err(&nd_region->dev, + "Failed to synchronize CPU cache state\n"); + return -ENXIO; + } + } + + cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY); +out: + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + clear_bit(NDD_INCOHERENT, &nvdimm->flags); + } + + return 0; +} + +int nd_region_activate(struct nd_region *nd_region) +{ + int i, j, rc, num_flush = 0; + struct nd_region_data *ndrd; + struct device *dev = &nd_region->dev; + size_t flush_data_size = sizeof(void *); + + nvdimm_bus_lock(&nd_region->dev); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { + nvdimm_bus_unlock(&nd_region->dev); + return -EBUSY; + } + + /* at least one null hint slot per-dimm for the "no-hint" case */ + flush_data_size += sizeof(void *); + num_flush = min_not_zero(num_flush, nvdimm->num_flush); + if (!nvdimm->num_flush) + continue; + flush_data_size += nvdimm->num_flush * sizeof(void *); + } + nvdimm_bus_unlock(&nd_region->dev); + + rc = nd_region_invalidate_memregion(nd_region); + if (rc) + return rc; + + ndrd = devm_kzalloc(dev, sizeof(*ndrd) + flush_data_size, GFP_KERNEL); + if (!ndrd) + return -ENOMEM; + dev_set_drvdata(dev, ndrd); + + if (!num_flush) + return 0; + + ndrd->hints_shift = ilog2(num_flush); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + int rc = nvdimm_map_flush(&nd_region->dev, nvdimm, i, ndrd); + + if (rc) + return rc; + } + + /* + * Clear out entries that are duplicates. This should prevent the + * extra flushings. 
+ */ + for (i = 0; i < nd_region->ndr_mappings - 1; i++) { + /* ignore if NULL already */ + if (!ndrd_get_flush_wpq(ndrd, i, 0)) + continue; + + for (j = i + 1; j < nd_region->ndr_mappings; j++) + if (ndrd_get_flush_wpq(ndrd, i, 0) == + ndrd_get_flush_wpq(ndrd, j, 0)) + ndrd_set_flush_wpq(ndrd, j, 0, NULL); + } + + return 0; +} + +static void nd_region_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + u16 i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + put_device(&nvdimm->dev); + } + free_percpu(nd_region->lane); + if (!test_bit(ND_REGION_CXL, &nd_region->flags)) + memregion_free(nd_region->id); + kfree(nd_region); +} + +struct nd_region *to_nd_region(struct device *dev) +{ + struct nd_region *nd_region = container_of(dev, struct nd_region, dev); + + WARN_ON(dev->type->release != nd_region_release); + return nd_region; +} +EXPORT_SYMBOL_GPL(to_nd_region); + +struct device *nd_region_dev(struct nd_region *nd_region) +{ + if (!nd_region) + return NULL; + return &nd_region->dev; +} +EXPORT_SYMBOL_GPL(nd_region_dev); + +void *nd_region_provider_data(struct nd_region *nd_region) +{ + return nd_region->provider_data; +} +EXPORT_SYMBOL_GPL(nd_region_provider_data); + +/** + * nd_region_to_nstype() - region to an integer namespace type + * @nd_region: region-device to interrogate + * + * This is the 'nstype' attribute of a region as well, an input to the + * MODALIAS for namespace devices, and bit number for a nvdimm_bus to match + * namespace devices with namespace drivers. + */ +int nd_region_to_nstype(struct nd_region *nd_region) +{ + if (is_memory(&nd_region->dev)) { + u16 i, label; + + for (i = 0, label = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (test_bit(NDD_LABELING, &nvdimm->flags)) + label++; + } + if (label) + return ND_DEVICE_NAMESPACE_PMEM; + else + return ND_DEVICE_NAMESPACE_IO; + } + + return 0; +} +EXPORT_SYMBOL(nd_region_to_nstype); + +static unsigned long long region_size(struct nd_region *nd_region) +{ + if (is_memory(&nd_region->dev)) { + return nd_region->ndr_size; + } else if (nd_region->ndr_mappings == 1) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + + return nd_mapping->size; + } + + return 0; +} + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%llu\n", region_size(nd_region)); +} +static DEVICE_ATTR_RO(size); + +static ssize_t deep_flush_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + /* + * NOTE: in the nvdimm_has_flush() error case this attribute is + * not visible. 
+ */ + return sprintf(buf, "%d\n", nvdimm_has_flush(nd_region)); +} + +static ssize_t deep_flush_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + bool flush; + int rc = kstrtobool(buf, &flush); + struct nd_region *nd_region = to_nd_region(dev); + + if (rc) + return rc; + if (!flush) + return -EINVAL; + rc = nvdimm_flush(nd_region, NULL); + if (rc) + return rc; + + return len; +} +static DEVICE_ATTR_RW(deep_flush); + +static ssize_t mappings_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region->ndr_mappings); +} +static DEVICE_ATTR_RO(mappings); + +static ssize_t nstype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region)); +} +static DEVICE_ATTR_RO(nstype); + +static ssize_t set_cookie_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nd_interleave_set *nd_set = nd_region->nd_set; + ssize_t rc = 0; + + if (is_memory(dev) && nd_set) + /* pass, should be precluded by region_visible */; + else + return -ENXIO; + + /* + * The cookie to show depends on which specification of the + * labels we are using. If there are not labels then default to + * the v1.1 namespace label cookie definition. To read all this + * data we need to wait for probing to settle. + */ + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + if (nd_region->ndr_mappings) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + + if (ndd) { + struct nd_namespace_index *nsindex; + + nsindex = to_namespace_index(ndd, ndd->ns_current); + rc = sprintf(buf, "%#llx\n", + nd_region_interleave_set_cookie(nd_region, + nsindex)); + } + } + nvdimm_bus_unlock(dev); + device_unlock(dev); + + if (rc) + return rc; + return sprintf(buf, "%#llx\n", nd_set->cookie1); +} +static DEVICE_ATTR_RO(set_cookie); + +resource_size_t nd_region_available_dpa(struct nd_region *nd_region) +{ + resource_size_t available; + int i; + + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + + available = 0; + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + + /* if a dimm is disabled the available capacity is zero */ + if (!ndd) + return 0; + + available += nd_pmem_available_dpa(nd_region, nd_mapping); + } + + return available; +} + +resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region) +{ + resource_size_t avail = 0; + int i; + + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + avail = min_not_zero(avail, nd_pmem_max_contiguous_dpa( + nd_region, nd_mapping)); + } + return avail * nd_region->ndr_mappings; +} + +static ssize_t available_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long long available = 0; + + /* + * Flush in-flight updates and grab a snapshot of the available + * size. Of course, this value is potentially invalidated the + * memory nvdimm_bus_lock() is dropped, but that's userspace's + * problem to not race itself. 
+ */ + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + available = nd_region_available_dpa(nd_region); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return sprintf(buf, "%llu\n", available); +} +static DEVICE_ATTR_RO(available_size); + +static ssize_t max_available_extent_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long long available = 0; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + available = nd_region_allocatable_dpa(nd_region); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return sprintf(buf, "%llu\n", available); +} +static DEVICE_ATTR_RO(max_available_extent); + +static ssize_t init_namespaces_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region_data *ndrd = dev_get_drvdata(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (ndrd) + rc = sprintf(buf, "%d/%d\n", ndrd->ns_active, ndrd->ns_count); + else + rc = -ENXIO; + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(init_namespaces); + +static ssize_t namespace_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->ns_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->ns_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + return rc; +} +static DEVICE_ATTR_RO(namespace_seed); + +static ssize_t btt_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->btt_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->btt_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(btt_seed); + +static ssize_t pfn_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->pfn_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->pfn_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(pfn_seed); + +static ssize_t dax_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->dax_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->dax_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(dax_seed); + +static ssize_t read_only_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region->ro); +} + +static int revalidate_read_only(struct device *dev, void *data) +{ + nd_device_notify(dev, NVDIMM_REVALIDATE_REGION); + return 0; +} + +static ssize_t read_only_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool ro; + int rc = kstrtobool(buf, &ro); + struct nd_region *nd_region = to_nd_region(dev); + + if (rc) + return rc; + + nd_region->ro = ro; + device_for_each_child(dev, NULL, revalidate_read_only); + return len; +} +static DEVICE_ATTR_RW(read_only); + +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + 
return sprintf(buf, "%#lx\n", nd_region->align); +} + +static ssize_t align_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long val, dpa; + u32 mappings, remainder; + int rc; + + rc = kstrtoul(buf, 0, &val); + if (rc) + return rc; + + /* + * Ensure space-align is evenly divisible by the region + * interleave-width because the kernel typically has no facility + * to determine which DIMM(s), dimm-physical-addresses, would + * contribute to the tail capacity in system-physical-address + * space for the namespace. + */ + mappings = max_t(u32, 1, nd_region->ndr_mappings); + dpa = div_u64_rem(val, mappings, &remainder); + if (!is_power_of_2(dpa) || dpa < PAGE_SIZE + || val > region_size(nd_region) || remainder) + return -EINVAL; + + /* + * Given that space allocation consults this value multiple + * times ensure it does not change for the duration of the + * allocation. + */ + nvdimm_bus_lock(dev); + nd_region->align = val; + nvdimm_bus_unlock(dev); + + return len; +} +static DEVICE_ATTR_RW(align); + +static ssize_t region_badblocks_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + device_lock(dev); + if (dev->driver) + rc = badblocks_show(&nd_region->bb, buf, 0); + else + rc = -ENXIO; + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR(badblocks, 0444, region_badblocks_show, NULL); + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%#llx\n", nd_region->ndr_start); +} +static DEVICE_ATTR_ADMIN_RO(resource); + +static ssize_t persistence_domain_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + if (test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags)) + return sprintf(buf, "cpu_cache\n"); + else if (test_bit(ND_REGION_PERSIST_MEMCTRL, &nd_region->flags)) + return sprintf(buf, "memory_controller\n"); + else + return sprintf(buf, "\n"); +} +static DEVICE_ATTR_RO(persistence_domain); + +static struct attribute *nd_region_attributes[] = { + &dev_attr_size.attr, + &dev_attr_align.attr, + &dev_attr_nstype.attr, + &dev_attr_mappings.attr, + &dev_attr_btt_seed.attr, + &dev_attr_pfn_seed.attr, + &dev_attr_dax_seed.attr, + &dev_attr_deep_flush.attr, + &dev_attr_read_only.attr, + &dev_attr_set_cookie.attr, + &dev_attr_available_size.attr, + &dev_attr_max_available_extent.attr, + &dev_attr_namespace_seed.attr, + &dev_attr_init_namespaces.attr, + &dev_attr_badblocks.attr, + &dev_attr_resource.attr, + &dev_attr_persistence_domain.attr, + NULL, +}; + +static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nd_region *nd_region = to_nd_region(dev); + struct nd_interleave_set *nd_set = nd_region->nd_set; + int type = nd_region_to_nstype(nd_region); + + if (!is_memory(dev) && a == &dev_attr_pfn_seed.attr) + return 0; + + if (!is_memory(dev) && a == &dev_attr_dax_seed.attr) + return 0; + + if (!is_memory(dev) && a == &dev_attr_badblocks.attr) + return 0; + + if (a == &dev_attr_resource.attr && !is_memory(dev)) + return 0; + + if (a == &dev_attr_deep_flush.attr) { + int has_flush = nvdimm_has_flush(nd_region); + + if (has_flush == 1) + return a->mode; + else if (has_flush == 0) + return 0444; + else + return 0; + } + + if 
(a == &dev_attr_persistence_domain.attr) { + if ((nd_region->flags & (BIT(ND_REGION_PERSIST_CACHE) + | BIT(ND_REGION_PERSIST_MEMCTRL))) == 0) + return 0; + return a->mode; + } + + if (a == &dev_attr_align.attr) + return a->mode; + + if (a != &dev_attr_set_cookie.attr + && a != &dev_attr_available_size.attr) + return a->mode; + + if (type == ND_DEVICE_NAMESPACE_PMEM && + a == &dev_attr_available_size.attr) + return a->mode; + else if (is_memory(dev) && nd_set) + return a->mode; + + return 0; +} + +static ssize_t mappingN(struct device *dev, char *buf, int n) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nd_mapping *nd_mapping; + struct nvdimm *nvdimm; + + if (n >= nd_region->ndr_mappings) + return -ENXIO; + nd_mapping = &nd_region->mapping[n]; + nvdimm = nd_mapping->nvdimm; + + return sprintf(buf, "%s,%llu,%llu,%d\n", dev_name(&nvdimm->dev), + nd_mapping->start, nd_mapping->size, + nd_mapping->position); +} + +#define REGION_MAPPING(idx) \ +static ssize_t mapping##idx##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + return mappingN(dev, buf, idx); \ +} \ +static DEVICE_ATTR_RO(mapping##idx) + +/* + * 32 should be enough for a while, even in the presence of socket + * interleave a 32-way interleave set is a degenerate case. + */ +REGION_MAPPING(0); +REGION_MAPPING(1); +REGION_MAPPING(2); +REGION_MAPPING(3); +REGION_MAPPING(4); +REGION_MAPPING(5); +REGION_MAPPING(6); +REGION_MAPPING(7); +REGION_MAPPING(8); +REGION_MAPPING(9); +REGION_MAPPING(10); +REGION_MAPPING(11); +REGION_MAPPING(12); +REGION_MAPPING(13); +REGION_MAPPING(14); +REGION_MAPPING(15); +REGION_MAPPING(16); +REGION_MAPPING(17); +REGION_MAPPING(18); +REGION_MAPPING(19); +REGION_MAPPING(20); +REGION_MAPPING(21); +REGION_MAPPING(22); +REGION_MAPPING(23); +REGION_MAPPING(24); +REGION_MAPPING(25); +REGION_MAPPING(26); +REGION_MAPPING(27); +REGION_MAPPING(28); +REGION_MAPPING(29); +REGION_MAPPING(30); +REGION_MAPPING(31); + +static umode_t mapping_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nd_region *nd_region = to_nd_region(dev); + + if (n < nd_region->ndr_mappings) + return a->mode; + return 0; +} + +static struct attribute *mapping_attributes[] = { + &dev_attr_mapping0.attr, + &dev_attr_mapping1.attr, + &dev_attr_mapping2.attr, + &dev_attr_mapping3.attr, + &dev_attr_mapping4.attr, + &dev_attr_mapping5.attr, + &dev_attr_mapping6.attr, + &dev_attr_mapping7.attr, + &dev_attr_mapping8.attr, + &dev_attr_mapping9.attr, + &dev_attr_mapping10.attr, + &dev_attr_mapping11.attr, + &dev_attr_mapping12.attr, + &dev_attr_mapping13.attr, + &dev_attr_mapping14.attr, + &dev_attr_mapping15.attr, + &dev_attr_mapping16.attr, + &dev_attr_mapping17.attr, + &dev_attr_mapping18.attr, + &dev_attr_mapping19.attr, + &dev_attr_mapping20.attr, + &dev_attr_mapping21.attr, + &dev_attr_mapping22.attr, + &dev_attr_mapping23.attr, + &dev_attr_mapping24.attr, + &dev_attr_mapping25.attr, + &dev_attr_mapping26.attr, + &dev_attr_mapping27.attr, + &dev_attr_mapping28.attr, + &dev_attr_mapping29.attr, + &dev_attr_mapping30.attr, + &dev_attr_mapping31.attr, + NULL, +}; + +static const struct attribute_group nd_mapping_attribute_group = { + .is_visible = mapping_visible, + .attrs = mapping_attributes, +}; + +static const struct attribute_group nd_region_attribute_group = { + .attrs = nd_region_attributes, + .is_visible = region_visible, +}; + +static const struct attribute_group *nd_region_attribute_groups[] = { + 
&nd_device_attribute_group, + &nd_region_attribute_group, + &nd_numa_attribute_group, + &nd_mapping_attribute_group, + NULL, +}; + +static const struct device_type nd_pmem_device_type = { + .name = "nd_pmem", + .release = nd_region_release, + .groups = nd_region_attribute_groups, +}; + +static const struct device_type nd_volatile_device_type = { + .name = "nd_volatile", + .release = nd_region_release, + .groups = nd_region_attribute_groups, +}; + +bool is_nd_pmem(const struct device *dev) +{ + return dev ? dev->type == &nd_pmem_device_type : false; +} + +bool is_nd_volatile(const struct device *dev) +{ + return dev ? dev->type == &nd_volatile_device_type : false; +} + +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region, + struct nd_namespace_index *nsindex) +{ + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (!nd_set) + return 0; + + if (nsindex && __le16_to_cpu(nsindex->major) == 1 + && __le16_to_cpu(nsindex->minor) == 1) + return nd_set->cookie1; + return nd_set->cookie2; +} + +u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region) +{ + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (nd_set) + return nd_set->altcookie; + return 0; +} + +void nd_mapping_free_labels(struct nd_mapping *nd_mapping) +{ + struct nd_label_ent *label_ent, *e; + + lockdep_assert_held(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + list_del(&label_ent->list); + kfree(label_ent); + } +} + +/* + * When a namespace is activated create new seeds for the next + * namespace, or namespace-personality to be configured. + */ +void nd_region_advance_seeds(struct nd_region *nd_region, struct device *dev) +{ + nvdimm_bus_lock(dev); + if (nd_region->ns_seed == dev) { + nd_region_create_ns_seed(nd_region); + } else if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + if (nd_region->btt_seed == dev) + nd_region_create_btt_seed(nd_region); + if (nd_region->ns_seed == &nd_btt->ndns->dev) + nd_region_create_ns_seed(nd_region); + } else if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + if (nd_region->pfn_seed == dev) + nd_region_create_pfn_seed(nd_region); + if (nd_region->ns_seed == &nd_pfn->ndns->dev) + nd_region_create_ns_seed(nd_region); + } else if (is_nd_dax(dev)) { + struct nd_dax *nd_dax = to_nd_dax(dev); + + if (nd_region->dax_seed == dev) + nd_region_create_dax_seed(nd_region); + if (nd_region->ns_seed == &nd_dax->nd_pfn.ndns->dev) + nd_region_create_ns_seed(nd_region); + } + nvdimm_bus_unlock(dev); +} + +/** + * nd_region_acquire_lane - allocate and lock a lane + * @nd_region: region id and number of lanes possible + * + * A lane correlates to a BLK-data-window and/or a log slot in the BTT. + * We optimize for the common case where there are 256 lanes, one + * per-cpu. For larger systems we need to lock to share lanes. For now + * this implementation assumes the cost of maintaining an allocator for + * free lanes is on the order of the lock hold time, so it implements a + * static lane = cpu % num_lanes mapping. + * + * In the case of a BTT instance on top of a BLK namespace a lane may be + * acquired recursively. We lock on the first instance. + * + * In the case of a BTT instance on top of PMEM, we only acquire a lane + * for the BTT metadata updates. 
+ */ +unsigned int nd_region_acquire_lane(struct nd_region *nd_region) +{ + unsigned int cpu, lane; + + migrate_disable(); + cpu = smp_processor_id(); + if (nd_region->num_lanes < nr_cpu_ids) { + struct nd_percpu_lane *ndl_lock, *ndl_count; + + lane = cpu % nd_region->num_lanes; + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (ndl_count->count++ == 0) + spin_lock(&ndl_lock->lock); + } else + lane = cpu; + + return lane; +} +EXPORT_SYMBOL(nd_region_acquire_lane); + +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane) +{ + if (nd_region->num_lanes < nr_cpu_ids) { + unsigned int cpu = smp_processor_id(); + struct nd_percpu_lane *ndl_lock, *ndl_count; + + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (--ndl_count->count == 0) + spin_unlock(&ndl_lock->lock); + } + migrate_enable(); +} +EXPORT_SYMBOL(nd_region_release_lane); + +/* + * PowerPC requires this alignment for memremap_pages(). All other archs + * should be ok with SUBSECTION_SIZE (see memremap_compat_align()). + */ +#define MEMREMAP_COMPAT_ALIGN_MAX SZ_16M + +static unsigned long default_align(struct nd_region *nd_region) +{ + unsigned long align; + u32 remainder; + int mappings; + + align = MEMREMAP_COMPAT_ALIGN_MAX; + if (nd_region->ndr_size < MEMREMAP_COMPAT_ALIGN_MAX) + align = PAGE_SIZE; + + mappings = max_t(u16, 1, nd_region->ndr_mappings); + div_u64_rem(align, mappings, &remainder); + if (remainder) + align *= mappings; + + return align; +} + +static struct lock_class_key nvdimm_region_key; + +static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc, + const struct device_type *dev_type, const char *caller) +{ + struct nd_region *nd_region; + struct device *dev; + unsigned int i; + int ro = 0; + + for (i = 0; i < ndr_desc->num_mappings; i++) { + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; + struct nvdimm *nvdimm = mapping->nvdimm; + + if ((mapping->start | mapping->size) % PAGE_SIZE) { + dev_err(&nvdimm_bus->dev, + "%s: %s mapping%d is not %ld aligned\n", + caller, dev_name(&nvdimm->dev), i, PAGE_SIZE); + return NULL; + } + + if (test_bit(NDD_UNARMED, &nvdimm->flags)) + ro = 1; + + } + + nd_region = + kzalloc(struct_size(nd_region, mapping, ndr_desc->num_mappings), + GFP_KERNEL); + + if (!nd_region) + return NULL; + /* CXL pre-assigns memregion ids before creating nvdimm regions */ + if (test_bit(ND_REGION_CXL, &ndr_desc->flags)) { + nd_region->id = ndr_desc->memregion; + } else { + nd_region->id = memregion_alloc(GFP_KERNEL); + if (nd_region->id < 0) + goto err_id; + } + + nd_region->lane = alloc_percpu(struct nd_percpu_lane); + if (!nd_region->lane) + goto err_percpu; + + for (i = 0; i < nr_cpu_ids; i++) { + struct nd_percpu_lane *ndl; + + ndl = per_cpu_ptr(nd_region->lane, i); + spin_lock_init(&ndl->lock); + ndl->count = 0; + } + + for (i = 0; i < ndr_desc->num_mappings; i++) { + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; + struct nvdimm *nvdimm = mapping->nvdimm; + + nd_region->mapping[i].nvdimm = nvdimm; + nd_region->mapping[i].start = mapping->start; + nd_region->mapping[i].size = mapping->size; + nd_region->mapping[i].position = mapping->position; + INIT_LIST_HEAD(&nd_region->mapping[i].labels); + mutex_init(&nd_region->mapping[i].lock); + + get_device(&nvdimm->dev); + } + nd_region->ndr_mappings = ndr_desc->num_mappings; + nd_region->provider_data = ndr_desc->provider_data; + nd_region->nd_set = ndr_desc->nd_set; 
+ nd_region->num_lanes = ndr_desc->num_lanes; + nd_region->flags = ndr_desc->flags; + nd_region->ro = ro; + nd_region->numa_node = ndr_desc->numa_node; + nd_region->target_node = ndr_desc->target_node; + ida_init(&nd_region->ns_ida); + ida_init(&nd_region->btt_ida); + ida_init(&nd_region->pfn_ida); + ida_init(&nd_region->dax_ida); + dev = &nd_region->dev; + dev_set_name(dev, "region%d", nd_region->id); + dev->parent = &nvdimm_bus->dev; + dev->type = dev_type; + dev->groups = ndr_desc->attr_groups; + dev->of_node = ndr_desc->of_node; + nd_region->ndr_size = resource_size(ndr_desc->res); + nd_region->ndr_start = ndr_desc->res->start; + nd_region->align = default_align(nd_region); + if (ndr_desc->flush) + nd_region->flush = ndr_desc->flush; + else + nd_region->flush = NULL; + + device_initialize(dev); + lockdep_set_class(&dev->mutex, &nvdimm_region_key); + nd_device_register(dev); + + return nd_region; + +err_percpu: + if (!test_bit(ND_REGION_CXL, &ndr_desc->flags)) + memregion_free(nd_region->id); +err_id: + kfree(nd_region); + return NULL; +} + +struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + ndr_desc->num_lanes = ND_MAX_LANES; + return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create); + +struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + ndr_desc->num_lanes = ND_MAX_LANES; + return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create); + +void nvdimm_region_delete(struct nd_region *nd_region) +{ + if (nd_region) + nd_device_unregister(&nd_region->dev, ND_SYNC); +} +EXPORT_SYMBOL_GPL(nvdimm_region_delete); + +int nvdimm_flush(struct nd_region *nd_region, struct bio *bio) +{ + int rc = 0; + + if (!nd_region->flush) + rc = generic_nvdimm_flush(nd_region); + else { + if (nd_region->flush(nd_region, bio)) + rc = -EIO; + } + + return rc; +} +/** + * generic_nvdimm_flush() - flush any posted write queues between the cpu and pmem media + * @nd_region: interleaved pmem region + */ +int generic_nvdimm_flush(struct nd_region *nd_region) +{ + struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev); + int i, idx; + + /* + * Try to encourage some diversity in flush hint addresses + * across cpus assuming a limited number of flush hints. + */ + idx = this_cpu_read(flush_idx); + idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8)); + + /* + * The pmem_wmb() is needed to 'sfence' all + * previous writes such that they are architecturally visible for + * the platform buffer flush. Note that we've already arranged for pmem + * writes to avoid the cache via memcpy_flushcache(). The final + * wmb() ensures ordering for the NVDIMM flush write. 
+ */ + pmem_wmb(); + for (i = 0; i < nd_region->ndr_mappings; i++) + if (ndrd_get_flush_wpq(ndrd, i, 0)) + writeq(1, ndrd_get_flush_wpq(ndrd, i, idx)); + wmb(); + + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_flush); + +/** + * nvdimm_has_flush - determine write flushing requirements + * @nd_region: interleaved pmem region + * + * Returns 1 if writes require flushing + * Returns 0 if writes do not require flushing + * Returns -ENXIO if flushing capability can not be determined + */ +int nvdimm_has_flush(struct nd_region *nd_region) +{ + int i; + + /* no nvdimm or pmem api == flushing capability unknown */ + if (nd_region->ndr_mappings == 0 + || !IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)) + return -ENXIO; + + /* Test if an explicit flush function is defined */ + if (test_bit(ND_REGION_ASYNC, &nd_region->flags) && nd_region->flush) + return 1; + + /* Test if any flush hints for the region are available */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + /* flush hints present / available */ + if (nvdimm->num_flush) + return 1; + } + + /* + * The platform defines dimm devices without hints nor explicit flush, + * assume platform persistence mechanism like ADR + */ + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_has_flush); + +int nvdimm_has_cache(struct nd_region *nd_region) +{ + return is_nd_pmem(&nd_region->dev) && + !test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags); +} +EXPORT_SYMBOL_GPL(nvdimm_has_cache); + +bool is_nvdimm_sync(struct nd_region *nd_region) +{ + if (is_nd_volatile(&nd_region->dev)) + return true; + + return is_nd_pmem(&nd_region->dev) && + !test_bit(ND_REGION_ASYNC, &nd_region->flags); +} +EXPORT_SYMBOL_GPL(is_nvdimm_sync); + +struct conflict_context { + struct nd_region *nd_region; + resource_size_t start, size; +}; + +static int region_conflict(struct device *dev, void *data) +{ + struct nd_region *nd_region; + struct conflict_context *ctx = data; + resource_size_t res_end, region_end, region_start; + + if (!is_memory(dev)) + return 0; + + nd_region = to_nd_region(dev); + if (nd_region == ctx->nd_region) + return 0; + + res_end = ctx->start + ctx->size; + region_start = nd_region->ndr_start; + region_end = region_start + nd_region->ndr_size; + if (ctx->start >= region_start && ctx->start < region_end) + return -EBUSY; + if (res_end > region_start && res_end <= region_end) + return -EBUSY; + return 0; +} + +int nd_region_conflict(struct nd_region *nd_region, resource_size_t start, + resource_size_t size) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + struct conflict_context ctx = { + .nd_region = nd_region, + .start = start, + .size = size, + }; + + return device_for_each_child(&nvdimm_bus->dev, &ctx, region_conflict); +} + +MODULE_IMPORT_NS(DEVMEM); diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c new file mode 100644 index 0000000000..a03e3c45f2 --- /dev/null +++ b/drivers/nvdimm/security.c @@ -0,0 +1,582 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2018 Intel Corporation. All rights reserved. 
*/ + +#include <linux/module.h> +#include <linux/device.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/cred.h> +#include <linux/key.h> +#include <linux/key-type.h> +#include <keys/user-type.h> +#include <keys/encrypted-type.h> +#include "nd-core.h" +#include "nd.h" + +#define NVDIMM_BASE_KEY 0 +#define NVDIMM_NEW_KEY 1 + +static bool key_revalidate = true; +module_param(key_revalidate, bool, 0444); +MODULE_PARM_DESC(key_revalidate, "Require key validation at init."); + +static const char zero_key[NVDIMM_PASSPHRASE_LEN]; + +static void *key_data(struct key *key) +{ + struct encrypted_key_payload *epayload = dereference_key_locked(key); + + lockdep_assert_held_read(&key->sem); + + return epayload->decrypted_data; +} + +static void nvdimm_put_key(struct key *key) +{ + if (!key) + return; + + up_read(&key->sem); + key_put(key); +} + +/* + * Retrieve kernel key for DIMM and request from user space if + * necessary. Returns a key held for read and must be put by + * nvdimm_put_key() before the usage goes out of scope. + */ +static struct key *nvdimm_request_key(struct nvdimm *nvdimm) +{ + struct key *key = NULL; + static const char NVDIMM_PREFIX[] = "nvdimm:"; + char desc[NVDIMM_KEY_DESC_LEN + sizeof(NVDIMM_PREFIX)]; + struct device *dev = &nvdimm->dev; + + sprintf(desc, "%s%s", NVDIMM_PREFIX, nvdimm->dimm_id); + key = request_key(&key_type_encrypted, desc, ""); + if (IS_ERR(key)) { + if (PTR_ERR(key) == -ENOKEY) + dev_dbg(dev, "request_key() found no key\n"); + else + dev_dbg(dev, "request_key() upcall failed\n"); + key = NULL; + } else { + struct encrypted_key_payload *epayload; + + down_read(&key->sem); + epayload = dereference_key_locked(key); + if (epayload->decrypted_datalen != NVDIMM_PASSPHRASE_LEN) { + up_read(&key->sem); + key_put(key); + key = NULL; + } + } + + return key; +} + +static const void *nvdimm_get_key_payload(struct nvdimm *nvdimm, + struct key **key) +{ + *key = nvdimm_request_key(nvdimm); + if (!*key) + return zero_key; + + return key_data(*key); +} + +static struct key *nvdimm_lookup_user_key(struct nvdimm *nvdimm, + key_serial_t id, int subclass) +{ + key_ref_t keyref; + struct key *key; + struct encrypted_key_payload *epayload; + struct device *dev = &nvdimm->dev; + + keyref = lookup_user_key(id, 0, KEY_NEED_SEARCH); + if (IS_ERR(keyref)) + return NULL; + + key = key_ref_to_ptr(keyref); + if (key->type != &key_type_encrypted) { + key_put(key); + return NULL; + } + + dev_dbg(dev, "%s: key found: %#x\n", __func__, key_serial(key)); + + down_read_nested(&key->sem, subclass); + epayload = dereference_key_locked(key); + if (epayload->decrypted_datalen != NVDIMM_PASSPHRASE_LEN) { + up_read(&key->sem); + key_put(key); + key = NULL; + } + return key; +} + +static const void *nvdimm_get_user_key_payload(struct nvdimm *nvdimm, + key_serial_t id, int subclass, struct key **key) +{ + *key = NULL; + if (id == 0) { + if (subclass == NVDIMM_BASE_KEY) + return zero_key; + else + return NULL; + } + + *key = nvdimm_lookup_user_key(nvdimm, id, subclass); + if (!*key) + return NULL; + + return key_data(*key); +} + + +static int nvdimm_key_revalidate(struct nvdimm *nvdimm) +{ + struct key *key; + int rc; + const void *data; + + if (!nvdimm->sec.ops->change_key) + return -EOPNOTSUPP; + + data = nvdimm_get_key_payload(nvdimm, &key); + + /* + * Send the same key to the hardware as new and old key to + * verify that the key is good. 
+ */ + rc = nvdimm->sec.ops->change_key(nvdimm, data, data, NVDIMM_USER); + if (rc < 0) { + nvdimm_put_key(key); + return rc; + } + + nvdimm_put_key(key); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + return 0; +} + +static int __nvdimm_security_unlock(struct nvdimm *nvdimm) +{ + struct device *dev = &nvdimm->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct key *key; + const void *data; + int rc; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->unlock + || !nvdimm->sec.flags) + return -EIO; + + /* cxl_test needs this to pre-populate the security state */ + if (IS_ENABLED(CONFIG_NVDIMM_SECURITY_TEST)) + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + + /* No need to go further if security is disabled */ + if (test_bit(NVDIMM_SECURITY_DISABLED, &nvdimm->sec.flags)) + return 0; + + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { + dev_dbg(dev, "Security operation in progress.\n"); + return -EBUSY; + } + + /* + * If the pre-OS has unlocked the DIMM, attempt to send the key + * from request_key() to the hardware for verification. Failure + * to revalidate the key against the hardware results in a + * freeze of the security configuration. I.e. if the OS does not + * have the key, security is being managed pre-OS. + */ + if (test_bit(NVDIMM_SECURITY_UNLOCKED, &nvdimm->sec.flags)) { + if (!key_revalidate) + return 0; + + return nvdimm_key_revalidate(nvdimm); + } else + data = nvdimm_get_key_payload(nvdimm, &key); + + rc = nvdimm->sec.ops->unlock(nvdimm, data); + dev_dbg(dev, "key: %d unlock: %s\n", key_serial(key), + rc == 0 ? "success" : "fail"); + if (rc == 0) + set_bit(NDD_INCOHERENT, &nvdimm->flags); + + nvdimm_put_key(key); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + return rc; +} + +int nvdimm_security_unlock(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + int rc; + + nvdimm_bus_lock(dev); + rc = __nvdimm_security_unlock(nvdimm); + nvdimm_bus_unlock(dev); + return rc; +} + +static int check_security_state(struct nvdimm *nvdimm) +{ + struct device *dev = &nvdimm->dev; + + if (test_bit(NVDIMM_SECURITY_FROZEN, &nvdimm->sec.flags)) { + dev_dbg(dev, "Incorrect security state: %#lx\n", + nvdimm->sec.flags); + return -EIO; + } + + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { + dev_dbg(dev, "Security operation in progress.\n"); + return -EBUSY; + } + + return 0; +} + +static int security_disable(struct nvdimm *nvdimm, unsigned int keyid, + enum nvdimm_passphrase_type pass_type) +{ + struct device *dev = &nvdimm->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct key *key; + int rc; + const void *data; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + if (!nvdimm->sec.ops || !nvdimm->sec.flags) + return -EOPNOTSUPP; + + if (pass_type == NVDIMM_USER && !nvdimm->sec.ops->disable) + return -EOPNOTSUPP; + + if (pass_type == NVDIMM_MASTER && !nvdimm->sec.ops->disable_master) + return -EOPNOTSUPP; + + rc = check_security_state(nvdimm); + if (rc) + return rc; + + data = nvdimm_get_user_key_payload(nvdimm, keyid, + NVDIMM_BASE_KEY, &key); + if (!data) + return -ENOKEY; + + if (pass_type == NVDIMM_MASTER) { + rc = nvdimm->sec.ops->disable_master(nvdimm, data); + dev_dbg(dev, "key: %d disable_master: %s\n", key_serial(key), + rc == 0 ? 
"success" : "fail"); + } else { + rc = nvdimm->sec.ops->disable(nvdimm, data); + dev_dbg(dev, "key: %d disable: %s\n", key_serial(key), + rc == 0 ? "success" : "fail"); + } + + nvdimm_put_key(key); + if (pass_type == NVDIMM_MASTER) + nvdimm->sec.ext_flags = nvdimm_security_flags(nvdimm, NVDIMM_MASTER); + else + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + return rc; +} + +static int security_update(struct nvdimm *nvdimm, unsigned int keyid, + unsigned int new_keyid, + enum nvdimm_passphrase_type pass_type) +{ + struct device *dev = &nvdimm->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct key *key, *newkey; + int rc; + const void *data, *newdata; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->change_key + || !nvdimm->sec.flags) + return -EOPNOTSUPP; + + rc = check_security_state(nvdimm); + if (rc) + return rc; + + data = nvdimm_get_user_key_payload(nvdimm, keyid, + NVDIMM_BASE_KEY, &key); + if (!data) + return -ENOKEY; + + newdata = nvdimm_get_user_key_payload(nvdimm, new_keyid, + NVDIMM_NEW_KEY, &newkey); + if (!newdata) { + nvdimm_put_key(key); + return -ENOKEY; + } + + rc = nvdimm->sec.ops->change_key(nvdimm, data, newdata, pass_type); + dev_dbg(dev, "key: %d %d update%s: %s\n", + key_serial(key), key_serial(newkey), + pass_type == NVDIMM_MASTER ? "(master)" : "(user)", + rc == 0 ? "success" : "fail"); + + nvdimm_put_key(newkey); + nvdimm_put_key(key); + if (pass_type == NVDIMM_MASTER) + nvdimm->sec.ext_flags = nvdimm_security_flags(nvdimm, + NVDIMM_MASTER); + else + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, + NVDIMM_USER); + return rc; +} + +static int security_erase(struct nvdimm *nvdimm, unsigned int keyid, + enum nvdimm_passphrase_type pass_type) +{ + struct device *dev = &nvdimm->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct key *key = NULL; + int rc; + const void *data; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->erase + || !nvdimm->sec.flags) + return -EOPNOTSUPP; + + rc = check_security_state(nvdimm); + if (rc) + return rc; + + if (!test_bit(NVDIMM_SECURITY_UNLOCKED, &nvdimm->sec.ext_flags) + && pass_type == NVDIMM_MASTER) { + dev_dbg(dev, + "Attempt to secure erase in wrong master state.\n"); + return -EOPNOTSUPP; + } + + data = nvdimm_get_user_key_payload(nvdimm, keyid, + NVDIMM_BASE_KEY, &key); + if (!data) + return -ENOKEY; + + rc = nvdimm->sec.ops->erase(nvdimm, data, pass_type); + if (rc == 0) + set_bit(NDD_INCOHERENT, &nvdimm->flags); + dev_dbg(dev, "key: %d erase%s: %s\n", key_serial(key), + pass_type == NVDIMM_MASTER ? "(master)" : "(user)", + rc == 0 ? 
"success" : "fail"); + + nvdimm_put_key(key); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + return rc; +} + +static int security_overwrite(struct nvdimm *nvdimm, unsigned int keyid) +{ + struct device *dev = &nvdimm->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct key *key = NULL; + int rc; + const void *data; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->overwrite + || !nvdimm->sec.flags) + return -EOPNOTSUPP; + + rc = check_security_state(nvdimm); + if (rc) + return rc; + + data = nvdimm_get_user_key_payload(nvdimm, keyid, + NVDIMM_BASE_KEY, &key); + if (!data) + return -ENOKEY; + + rc = nvdimm->sec.ops->overwrite(nvdimm, data); + if (rc == 0) + set_bit(NDD_INCOHERENT, &nvdimm->flags); + dev_dbg(dev, "key: %d overwrite submission: %s\n", key_serial(key), + rc == 0 ? "success" : "fail"); + + nvdimm_put_key(key); + if (rc == 0) { + set_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags); + set_bit(NDD_WORK_PENDING, &nvdimm->flags); + set_bit(NVDIMM_SECURITY_OVERWRITE, &nvdimm->sec.flags); + /* + * Make sure we don't lose device while doing overwrite + * query. + */ + get_device(dev); + queue_delayed_work(system_wq, &nvdimm->dwork, 0); + } + + return rc; +} + +static void __nvdimm_security_overwrite_query(struct nvdimm *nvdimm) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev); + int rc; + unsigned int tmo; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + /* + * Abort and release device if we no longer have the overwrite + * flag set. It means the work has been canceled. + */ + if (!test_bit(NDD_WORK_PENDING, &nvdimm->flags)) + return; + + tmo = nvdimm->sec.overwrite_tmo; + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->query_overwrite + || !nvdimm->sec.flags) + return; + + rc = nvdimm->sec.ops->query_overwrite(nvdimm); + if (rc == -EBUSY) { + + /* setup delayed work again */ + tmo += 10; + queue_delayed_work(system_wq, &nvdimm->dwork, tmo * HZ); + nvdimm->sec.overwrite_tmo = min(15U * 60U, tmo); + return; + } + + if (rc < 0) + dev_dbg(&nvdimm->dev, "overwrite failed\n"); + else + dev_dbg(&nvdimm->dev, "overwrite completed\n"); + + /* + * Mark the overwrite work done and update dimm security flags, + * then send a sysfs event notification to wake up userspace + * poll threads to picked up the changed state. 
+ */ + nvdimm->sec.overwrite_tmo = 0; + clear_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags); + clear_bit(NDD_WORK_PENDING, &nvdimm->flags); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + nvdimm->sec.ext_flags = nvdimm_security_flags(nvdimm, NVDIMM_MASTER); + if (nvdimm->sec.overwrite_state) + sysfs_notify_dirent(nvdimm->sec.overwrite_state); + put_device(&nvdimm->dev); +} + +void nvdimm_security_overwrite_query(struct work_struct *work) +{ + struct nvdimm *nvdimm = + container_of(work, typeof(*nvdimm), dwork.work); + + nvdimm_bus_lock(&nvdimm->dev); + __nvdimm_security_overwrite_query(nvdimm); + nvdimm_bus_unlock(&nvdimm->dev); +} + +#define OPS \ + C( OP_FREEZE, "freeze", 1), \ + C( OP_DISABLE, "disable", 2), \ + C( OP_DISABLE_MASTER, "disable_master", 2), \ + C( OP_UPDATE, "update", 3), \ + C( OP_ERASE, "erase", 2), \ + C( OP_OVERWRITE, "overwrite", 2), \ + C( OP_MASTER_UPDATE, "master_update", 3), \ + C( OP_MASTER_ERASE, "master_erase", 2) +#undef C +#define C(a, b, c) a +enum nvdimmsec_op_ids { OPS }; +#undef C +#define C(a, b, c) { b, c } +static struct { + const char *name; + int args; +} ops[] = { OPS }; +#undef C + +#define SEC_CMD_SIZE 32 +#define KEY_ID_SIZE 10 + +ssize_t nvdimm_security_store(struct device *dev, const char *buf, size_t len) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + ssize_t rc; + char cmd[SEC_CMD_SIZE+1], keystr[KEY_ID_SIZE+1], + nkeystr[KEY_ID_SIZE+1]; + unsigned int key, newkey; + int i; + + rc = sscanf(buf, "%"__stringify(SEC_CMD_SIZE)"s" + " %"__stringify(KEY_ID_SIZE)"s" + " %"__stringify(KEY_ID_SIZE)"s", + cmd, keystr, nkeystr); + if (rc < 1) + return -EINVAL; + for (i = 0; i < ARRAY_SIZE(ops); i++) + if (sysfs_streq(cmd, ops[i].name)) + break; + if (i >= ARRAY_SIZE(ops)) + return -EINVAL; + if (ops[i].args > 1) + rc = kstrtouint(keystr, 0, &key); + if (rc >= 0 && ops[i].args > 2) + rc = kstrtouint(nkeystr, 0, &newkey); + if (rc < 0) + return rc; + + if (i == OP_FREEZE) { + dev_dbg(dev, "freeze\n"); + rc = nvdimm_security_freeze(nvdimm); + } else if (i == OP_DISABLE) { + dev_dbg(dev, "disable %u\n", key); + rc = security_disable(nvdimm, key, NVDIMM_USER); + } else if (i == OP_DISABLE_MASTER) { + dev_dbg(dev, "disable_master %u\n", key); + rc = security_disable(nvdimm, key, NVDIMM_MASTER); + } else if (i == OP_UPDATE || i == OP_MASTER_UPDATE) { + dev_dbg(dev, "%s %u %u\n", ops[i].name, key, newkey); + rc = security_update(nvdimm, key, newkey, i == OP_UPDATE + ? NVDIMM_USER : NVDIMM_MASTER); + } else if (i == OP_ERASE || i == OP_MASTER_ERASE) { + dev_dbg(dev, "%s %u\n", ops[i].name, key); + if (atomic_read(&nvdimm->busy)) { + dev_dbg(dev, "Unable to secure erase while DIMM active.\n"); + return -EBUSY; + } + rc = security_erase(nvdimm, key, i == OP_ERASE + ? NVDIMM_USER : NVDIMM_MASTER); + } else if (i == OP_OVERWRITE) { + dev_dbg(dev, "overwrite %u\n", key); + if (atomic_read(&nvdimm->busy)) { + dev_dbg(dev, "Unable to overwrite while DIMM active.\n"); + return -EBUSY; + } + rc = security_overwrite(nvdimm, key); + } else + return -EINVAL; + + if (rc == 0) + rc = len; + return rc; +} diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c new file mode 100644 index 0000000000..a92eb172f0 --- /dev/null +++ b/drivers/nvdimm/virtio_pmem.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio_pmem.c: Virtio pmem Driver + * + * Discovers persistent memory range information + * from host and registers the virtual pmem device + * with libnvdimm core. 
+ */ +#include "virtio_pmem.h" +#include "nd.h" + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + + /* Initialize virt queue */ +static int init_vq(struct virtio_pmem *vpmem) +{ + /* single vq */ + vpmem->req_vq = virtio_find_single_vq(vpmem->vdev, + virtio_pmem_host_ack, "flush_queue"); + if (IS_ERR(vpmem->req_vq)) + return PTR_ERR(vpmem->req_vq); + + spin_lock_init(&vpmem->pmem_lock); + INIT_LIST_HEAD(&vpmem->req_list); + + return 0; +}; + +static int virtio_pmem_probe(struct virtio_device *vdev) +{ + struct nd_region_desc ndr_desc = {}; + struct nd_region *nd_region; + struct virtio_pmem *vpmem; + struct resource res; + int err = 0; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), GFP_KERNEL); + if (!vpmem) { + err = -ENOMEM; + goto out_err; + } + + vpmem->vdev = vdev; + vdev->priv = vpmem; + err = init_vq(vpmem); + if (err) { + dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n"); + goto out_err; + } + + virtio_cread_le(vpmem->vdev, struct virtio_pmem_config, + start, &vpmem->start); + virtio_cread_le(vpmem->vdev, struct virtio_pmem_config, + size, &vpmem->size); + + res.start = vpmem->start; + res.end = vpmem->start + vpmem->size - 1; + vpmem->nd_desc.provider_name = "virtio-pmem"; + vpmem->nd_desc.module = THIS_MODULE; + + vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev, + &vpmem->nd_desc); + if (!vpmem->nvdimm_bus) { + dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n"); + err = -ENXIO; + goto out_vq; + } + + dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus); + + ndr_desc.res = &res; + + ndr_desc.numa_node = memory_add_physaddr_to_nid(res.start); + ndr_desc.target_node = phys_to_target_node(res.start); + if (ndr_desc.target_node == NUMA_NO_NODE) { + ndr_desc.target_node = ndr_desc.numa_node; + dev_dbg(&vdev->dev, "changing target node from %d to %d", + NUMA_NO_NODE, ndr_desc.target_node); + } + + ndr_desc.flush = async_pmem_flush; + ndr_desc.provider_data = vdev; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + set_bit(ND_REGION_ASYNC, &ndr_desc.flags); + /* + * The NVDIMM region could be available before the + * virtio_device_ready() that is called by + * virtio_dev_probe(), so we set device ready here. 
+ */ + virtio_device_ready(vdev); + nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc); + if (!nd_region) { + dev_err(&vdev->dev, "failed to create nvdimm region\n"); + err = -ENXIO; + goto out_nd; + } + return 0; +out_nd: + virtio_reset_device(vdev); + nvdimm_bus_unregister(vpmem->nvdimm_bus); +out_vq: + vdev->config->del_vqs(vdev); +out_err: + return err; +} + +static void virtio_pmem_remove(struct virtio_device *vdev) +{ + struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev); + + nvdimm_bus_unregister(nvdimm_bus); + vdev->config->del_vqs(vdev); + virtio_reset_device(vdev); +} + +static struct virtio_driver virtio_pmem_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtio_pmem_probe, + .remove = virtio_pmem_remove, +}; + +module_virtio_driver(virtio_pmem_driver); +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio pmem driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h new file mode 100644 index 0000000000..0dddefe594 --- /dev/null +++ b/drivers/nvdimm/virtio_pmem.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * virtio_pmem.h: virtio pmem Driver + * + * Discovers persistent memory range information + * from host and provides a virtio based flushing + * interface. + **/ + +#ifndef _LINUX_VIRTIO_PMEM_H +#define _LINUX_VIRTIO_PMEM_H + +#include <linux/module.h> +#include <uapi/linux/virtio_pmem.h> +#include <linux/libnvdimm.h> +#include <linux/spinlock.h> + +struct virtio_pmem_request { + struct virtio_pmem_req req; + struct virtio_pmem_resp resp; + + /* Wait queue to process deferred work after ack from host */ + wait_queue_head_t host_acked; + bool done; + + /* Wait queue to process deferred work after virt queue buffer avail */ + wait_queue_head_t wq_buf; + bool wq_buf_avail; + struct list_head list; +}; + +struct virtio_pmem { + struct virtio_device *vdev; + + /* Virtio pmem request queue */ + struct virtqueue *req_vq; + + /* nvdimm bus registers virtio pmem device */ + struct nvdimm_bus *nvdimm_bus; + struct nvdimm_bus_descriptor nd_desc; + + /* List to store deferred work if virtqueue is full */ + struct list_head req_list; + + /* Synchronize virtqueue data */ + spinlock_t pmem_lock; + + /* Memory region information */ + __u64 start; + __u64 size; +}; + +void virtio_pmem_host_ack(struct virtqueue *vq); +int async_pmem_flush(struct nd_region *nd_region, struct bio *bio); +#endif |