Diffstat (limited to 'drivers/nvdimm')
30 files changed, 15143 insertions, 0 deletions
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
new file mode 100644
index 000000000..e4c20f0cb
--- /dev/null
+++ b/drivers/nvdimm/Kconfig
@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig LIBNVDIMM
+	tristate "NVDIMM (Non-Volatile Memory Device) Support"
+	depends on PHYS_ADDR_T_64BIT
+	depends on HAS_IOMEM
+	depends on BLK_DEV
+	select MEMREGION
+	help
+	  Generic support for non-volatile memory devices including
+	  ACPI-6-NFIT defined resources. On platforms that define an
+	  NFIT, or otherwise can discover NVDIMM resources, a libnvdimm
+	  bus is registered to advertise PMEM (persistent memory)
+	  namespaces (/dev/pmemX). A PMEM namespace refers to a
+	  memory resource that may span multiple DIMMs and support DAX
+	  (see CONFIG_DAX).
+
+if LIBNVDIMM
+
+config BLK_DEV_PMEM
+	tristate "PMEM: Persistent memory block device support"
+	default LIBNVDIMM
+	select DAX
+	select ND_BTT if BTT
+	select ND_PFN if NVDIMM_PFN
+	help
+	  Memory ranges for PMEM are described by either an NFIT
+	  (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a
+	  non-standard OEM-specific E820 memory type (type-12, see
+	  CONFIG_X86_PMEM_LEGACY), or it is manually specified by the
+	  'memmap=nn[KMG]!ss[KMG]' kernel command line (see
+	  Documentation/admin-guide/kernel-parameters.rst). This driver converts
+	  these persistent memory ranges into block devices that are
+	  capable of DAX (direct-access) file system mappings. See
+	  Documentation/driver-api/nvdimm/nvdimm.rst for more details.
+
+	  Say Y if you want to use an NVDIMM
+
+config ND_CLAIM
+	bool
+
+config ND_BTT
+	tristate
+
+config BTT
+	bool "BTT: Block Translation Table (atomic sector updates)"
+	default y if LIBNVDIMM
+	select ND_CLAIM
+	help
+	  The Block Translation Table (BTT) provides atomic sector
+	  update semantics for persistent memory devices, so that
+	  applications that rely on sector writes not being torn (a
+	  guarantee that typical disks provide) can continue to do so.
+	  The BTT manifests itself as an alternate personality for an
+	  NVDIMM namespace, i.e. a namespace can be in raw mode pmemX,
+	  or 'sectored' mode.
+
+	  Select Y if unsure
+
+config ND_PFN
+	tristate
+
+config NVDIMM_PFN
+	bool "PFN: Map persistent (device) memory"
+	default LIBNVDIMM
+	depends on ZONE_DEVICE
+	select ND_CLAIM
+	help
+	  Map persistent memory, i.e. advertise it to the memory
+	  management sub-system. By default persistent memory does
+	  not support direct I/O, RDMA, or any other usage that
+	  requires a 'struct page' to mediate an I/O request. This
+	  driver allocates and initializes the infrastructure needed
+	  to support those use cases.
+
+	  Select Y if unsure
+
+config NVDIMM_DAX
+	bool "NVDIMM DAX: Raw access to persistent memory"
+	default LIBNVDIMM
+	depends on NVDIMM_PFN
+	help
+	  Support raw device dax access to a persistent memory
+	  namespace. For environments that want to hard partition
+	  persistent memory, this capability provides a mechanism to
+	  sub-divide a namespace into character devices that can only be
+	  accessed via DAX (mmap(2)).
+
+	  Select Y if unsure
+
+config OF_PMEM
+	tristate "Device-tree support for persistent memory regions"
+	depends on OF
+	default LIBNVDIMM
+	help
+	  Allows regions of persistent memory to be described in the
+	  device-tree.
+
+	  Select Y if unsure.
+
+config NVDIMM_KEYS
+	def_bool y
+	depends on ENCRYPTED_KEYS
+	depends on (LIBNVDIMM=ENCRYPTED_KEYS) || LIBNVDIMM=m
+
+config NVDIMM_KMSAN
+	bool
+	depends on KMSAN
+	help
+	  KMSAN, and other memory debug facilities, increase the size of
+	  'struct page' to contain extra metadata. This collides with
+	  the NVDIMM capability to store a potentially
+	  larger-than-"System RAM" size 'struct page' array in a
+	  reservation of persistent memory rather than limited /
+	  precious DRAM. However, that reservation needs to persist for
+	  the life of the given NVDIMM namespace. If you are using KMSAN
+	  to debug an issue unrelated to NVDIMMs or DAX then say N to this
+	  option. Otherwise, say Y but understand that any namespaces
+	  (with the page array stored in pmem) created with this build of
+	  the kernel will permanently reserve and strand excess
+	  capacity compared to the CONFIG_KMSAN=n case.
+
+	  Select N if unsure.
+
+config NVDIMM_TEST_BUILD
+	tristate "Build the unit test core"
+	depends on m
+	depends on COMPILE_TEST && X86_64
+	default m if COMPILE_TEST
+	help
+	  Build the core of the unit test infrastructure. The result of
+	  this build is non-functional for unit test execution, but it
+	  otherwise helps catch build errors induced by changes to the
+	  core devm_memremap_pages() implementation and other
+	  infrastructure.
+
+endif
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
new file mode 100644
index 000000000..ba0296dca
--- /dev/null
+++ b/drivers/nvdimm/Makefile
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
+obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
+obj-$(CONFIG_ND_BTT) += nd_btt.o
+obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
+obj-$(CONFIG_OF_PMEM) += of_pmem.o
+obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o
+
+nd_pmem-y := pmem.o
+
+nd_btt-y := btt.o
+
+nd_e820-y := e820.o
+
+libnvdimm-y := core.o
+libnvdimm-y += bus.o
+libnvdimm-y += dimm_devs.o
+libnvdimm-$(CONFIG_PERF_EVENTS) += nd_perf.o
+libnvdimm-y += dimm.o
+libnvdimm-y += region_devs.o
+libnvdimm-y += region.o
+libnvdimm-y += namespace_devs.o
+libnvdimm-y += label.o
+libnvdimm-y += badrange.o
+libnvdimm-$(CONFIG_ND_CLAIM) += claim.o
+libnvdimm-$(CONFIG_BTT) += btt_devs.o
+libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o
+libnvdimm-$(CONFIG_NVDIMM_DAX) += dax_devs.o
+libnvdimm-$(CONFIG_NVDIMM_KEYS) += security.o
+
+TOOLS := ../../tools
+TEST_SRC := $(TOOLS)/testing/nvdimm/test
+obj-$(CONFIG_NVDIMM_TEST_BUILD) += $(TEST_SRC)/iomap.o
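As a concrete illustration of the 'memmap=nn[KMG]!ss[KMG]' option mentioned in the BLK_DEV_PMEM help above (values are an example, not taken from this patch): booting with

	memmap=4G!12G

marks 4G of address space starting at physical offset 12G as E820 type-12 persistent memory, which this driver then surfaces as a DAX-capable block device such as /dev/pmem0.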
diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c
new file mode 100644
index 000000000..aaf6e215a
--- /dev/null
+++ b/drivers/nvdimm/badrange.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ */
+#include <linux/libnvdimm.h>
+#include <linux/badblocks.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/ctype.h>
+#include <linux/ndctl.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include "nd-core.h"
+#include "nd.h"
+
+void badrange_init(struct badrange *badrange)
+{
+	INIT_LIST_HEAD(&badrange->list);
+	spin_lock_init(&badrange->lock);
+}
+EXPORT_SYMBOL_GPL(badrange_init);
+
+static void append_badrange_entry(struct badrange *badrange,
+		struct badrange_entry *bre, u64 addr, u64 length)
+{
+	lockdep_assert_held(&badrange->lock);
+	bre->start = addr;
+	bre->length = length;
+	list_add_tail(&bre->list, &badrange->list);
+}
+
+static int alloc_and_append_badrange_entry(struct badrange *badrange,
+		u64 addr, u64 length, gfp_t flags)
+{
+	struct badrange_entry *bre;
+
+	bre = kzalloc(sizeof(*bre), flags);
+	if (!bre)
+		return -ENOMEM;
+
+	append_badrange_entry(badrange, bre, addr, length);
+	return 0;
+}
+
+static int add_badrange(struct badrange *badrange, u64 addr, u64 length)
+{
+	struct badrange_entry *bre, *bre_new;
+
+	spin_unlock(&badrange->lock);
+	bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL);
+	spin_lock(&badrange->lock);
+
+	if (list_empty(&badrange->list)) {
+		if (!bre_new)
+			return -ENOMEM;
+		append_badrange_entry(badrange, bre_new, addr, length);
+		return 0;
+	}
+
+	/*
+	 * There is a chance this is a duplicate, check for those first.
+	 * This will be the common case as ARS_STATUS returns all known
+	 * errors in the SPA space, and we can't query it per region
+	 */
+	list_for_each_entry(bre, &badrange->list, list)
+		if (bre->start == addr) {
+			/* If length has changed, update this list entry */
+			if (bre->length != length)
+				bre->length = length;
+			kfree(bre_new);
+			return 0;
+		}
+
+	/*
+	 * If not a duplicate or a simple length update, add the entry as is,
+	 * as any overlapping ranges will get resolved when the list is consumed
+	 * and converted to badblocks
+	 */
+	if (!bre_new)
+		return -ENOMEM;
+	append_badrange_entry(badrange, bre_new, addr, length);
+
+	return 0;
+}
+
+int badrange_add(struct badrange *badrange, u64 addr, u64 length)
+{
+	int rc;
+
+	spin_lock(&badrange->lock);
+	rc = add_badrange(badrange, addr, length);
+	spin_unlock(&badrange->lock);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(badrange_add);
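/*
 * Illustrative sketch, not part of the patch: badrange_init(),
 * badrange_add() and badrange_forget() are the whole exported surface of
 * this file. A minimal (hypothetical) caller, in the spirit of libnvdimm's
 * bus code which keeps a struct badrange in struct nvdimm_bus and feeds it
 * from firmware scrub results such as ARS:
 */
static int example_track_media_errors(struct badrange *badrange)
{
	int rc;

	badrange_init(badrange);		/* once, at bus creation */

	/* record a 64KB poisoned span at SPA 0x200000 */
	rc = badrange_add(badrange, 0x200000, 0x10000);
	if (rc)
		return rc;

	/* a later successful write clears 4KB in the middle of that span */
	badrange_forget(badrange, 0x204000, 0x1000);
	return 0;
}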
+
+void badrange_forget(struct badrange *badrange, phys_addr_t start,
+		unsigned int len)
+{
+	struct list_head *badrange_list = &badrange->list;
+	u64 clr_end = start + len - 1;
+	struct badrange_entry *bre, *next;
+
+	spin_lock(&badrange->lock);
+
+	/*
+	 * [start, clr_end] is the badrange interval being cleared.
+	 * [bre->start, bre_end] is the badrange_list entry we're comparing
+	 * the above interval against. The badrange list entry may need
+	 * to be modified (update either start or length), deleted, or
+	 * split into two based on the overlap characteristics
+	 */
+
+	list_for_each_entry_safe(bre, next, badrange_list, list) {
+		u64 bre_end = bre->start + bre->length - 1;
+
+		/* Skip intervals with no intersection */
+		if (bre_end < start)
+			continue;
+		if (bre->start > clr_end)
+			continue;
+		/* Delete completely overlapped badrange entries */
+		if ((bre->start >= start) && (bre_end <= clr_end)) {
+			list_del(&bre->list);
+			kfree(bre);
+			continue;
+		}
+		/* Adjust start point of partially cleared entries */
+		if ((start <= bre->start) && (clr_end > bre->start)) {
+			bre->length -= clr_end - bre->start + 1;
+			bre->start = clr_end + 1;
+			continue;
+		}
+		/* Adjust bre->length for partial clearing at the tail end */
+		if ((bre->start < start) && (bre_end <= clr_end)) {
+			/* bre->start remains the same */
+			bre->length = start - bre->start;
+			continue;
+		}
+		/*
+		 * If clearing in the middle of an entry, we split it into
+		 * two by modifying the current entry to represent one half of
+		 * the split, and adding a new entry for the second half.
+		 */
+		if ((bre->start < start) && (bre_end > clr_end)) {
+			u64 new_start = clr_end + 1;
+			u64 new_len = bre_end - new_start + 1;
+
+			/* Add new entry covering the right half */
+			alloc_and_append_badrange_entry(badrange, new_start,
+					new_len, GFP_NOWAIT);
+			/* Adjust this entry to cover the left half */
+			bre->length = start - bre->start;
+			continue;
+		}
+	}
+	spin_unlock(&badrange->lock);
+}
+EXPORT_SYMBOL_GPL(badrange_forget);
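/*
 * Illustrative worked example, not part of the patch: the four overlap
 * cases above, applied to an existing entry covering [0x1000, 0x1fff]
 * (start = 0x1000, length = 0x1000), with invented clear ranges:
 *
 *   clear [0x0800, 0x2fff]: entry fully covered -> entry deleted
 *   clear [0x0800, 0x13ff]: head overlap        -> entry becomes [0x1400, 0x1fff]
 *   clear [0x1c00, 0x2fff]: tail overlap        -> entry becomes [0x1000, 0x1bff]
 *   clear [0x1400, 0x17ff]: interior clear      -> entry split into
 *                           [0x1000, 0x13ff] and [0x1800, 0x1fff]
 */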
+
+static void set_badblock(struct badblocks *bb, sector_t s, int num)
+{
+	dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n",
+			(u64) s * 512, (u64) num * 512);
+	/* this isn't an error as the hardware will still throw an exception */
+	if (badblocks_set(bb, s, num, 1))
+		dev_info_once(bb->dev, "%s: failed for sector %llx\n",
+				__func__, (u64) s);
+}
+
+/**
+ * __add_badblock_range() - Convert a physical address range to bad sectors
+ * @bb:		badblocks instance to populate
+ * @ns_offset:	namespace offset where the error range begins (in bytes)
+ * @len:	number of bytes of badrange to be added
+ *
+ * This assumes that the range provided with (ns_offset, len) is within
+ * the bounds of physical addresses for this namespace, i.e. lies in the
+ * interval [ns_start, ns_start + ns_size)
+ */
+static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
+{
+	const unsigned int sector_size = 512;
+	sector_t start_sector, end_sector;
+	u64 num_sectors;
+	u32 rem;
+
+	start_sector = div_u64(ns_offset, sector_size);
+	end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
+	if (rem)
+		end_sector++;
+	num_sectors = end_sector - start_sector;
+
+	if (unlikely(num_sectors > (u64)INT_MAX)) {
+		u64 remaining = num_sectors;
+		sector_t s = start_sector;
+
+		while (remaining) {
+			int done = min_t(u64, remaining, INT_MAX);
+
+			set_badblock(bb, s, done);
+			remaining -= done;
+			s += done;
+		}
+	} else
+		set_badblock(bb, start_sector, num_sectors);
+}
+
+static void badblocks_populate(struct badrange *badrange,
+		struct badblocks *bb, const struct range *range)
+{
+	struct badrange_entry *bre;
+
+	if (list_empty(&badrange->list))
+		return;
+
+	list_for_each_entry(bre, &badrange->list, list) {
+		u64 bre_end = bre->start + bre->length - 1;
+
+		/* Discard intervals with no intersection */
+		if (bre_end < range->start)
+			continue;
+		if (bre->start > range->end)
+			continue;
+		/* Deal with any overlap after start of the namespace */
+		if (bre->start >= range->start) {
+			u64 start = bre->start;
+			u64 len;
+
+			if (bre_end <= range->end)
+				len = bre->length;
+			else
+				len = range->start + range_len(range)
+					- bre->start;
+			__add_badblock_range(bb, start - range->start, len);
+			continue;
+		}
+		/*
+		 * Deal with overlap for badrange starting before
+		 * the namespace.
+		 */
+		if (bre->start < range->start) {
+			u64 len;
+
+			if (bre_end < range->end)
+				len = bre->start + bre->length - range->start;
+			else
+				len = range_len(range);
+			__add_badblock_range(bb, 0, len);
+		}
+	}
+}
+
+/**
+ * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks
+ * @nd_region: parent region of the range to interrogate
+ * @bb: badblocks instance to populate
+ * @range: resource range to consider
+ *
+ * The badrange list generated during bus initialization may contain
+ * multiple, possibly overlapping physical address ranges. Compare each
+ * of these ranges to the resource range currently being initialized,
+ * and add badblocks entries for all matching sub-ranges.
+ */
+void nvdimm_badblocks_populate(struct nd_region *nd_region,
+		struct badblocks *bb, const struct range *range)
+{
+	struct nvdimm_bus *nvdimm_bus;
+
+	if (!is_memory(&nd_region->dev)) {
+		dev_WARN_ONCE(&nd_region->dev, 1,
+				"%s only valid for pmem regions\n", __func__);
+		return;
+	}
+	nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	badblocks_populate(&nvdimm_bus->badrange, bb, range);
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
new file mode 100644
index 000000000..0297b7882
--- /dev/null
+++ b/drivers/nvdimm/btt.c
@@ -0,0 +1,1739 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Block Translation Table
+ * Copyright (c) 2014-2015, Intel Corporation.
+ */ +#include <linux/highmem.h> +#include <linux/debugfs.h> +#include <linux/blkdev.h> +#include <linux/pagemap.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/hdreg.h> +#include <linux/sizes.h> +#include <linux/ndctl.h> +#include <linux/fs.h> +#include <linux/nd.h> +#include <linux/backing-dev.h> +#include "btt.h" +#include "nd.h" + +enum log_ent_request { + LOG_NEW_ENT = 0, + LOG_OLD_ENT +}; + +static struct device *to_dev(struct arena_info *arena) +{ + return &arena->nd_btt->dev; +} + +static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset) +{ + return offset + nd_btt->initial_offset; +} + +static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n, unsigned long flags) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets may be shifted from the base of the device */ + offset = adjust_initial_offset(nd_btt, offset); + return nvdimm_read_bytes(ndns, offset, buf, n, flags); +} + +static int arena_write_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n, unsigned long flags) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets may be shifted from the base of the device */ + offset = adjust_initial_offset(nd_btt, offset); + return nvdimm_write_bytes(ndns, offset, buf, n, flags); +} + +static int btt_info_write(struct arena_info *arena, struct btt_sb *super) +{ + int ret; + + /* + * infooff and info2off should always be at least 512B aligned. + * We rely on that to make sure rw_bytes does error clearing + * correctly, so make sure that is the case. + */ + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512), + "arena->infooff: %#llx is unaligned\n", arena->infooff); + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512), + "arena->info2off: %#llx is unaligned\n", arena->info2off); + + ret = arena_write_bytes(arena, arena->info2off, super, + sizeof(struct btt_sb), 0); + if (ret) + return ret; + + return arena_write_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb), 0); +} + +static int btt_info_read(struct arena_info *arena, struct btt_sb *super) +{ + return arena_read_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb), 0); +} + +/* + * 'raw' version of btt_map write + * Assumptions: + * mapping is in little-endian + * mapping contains 'E' and 'Z' flags as desired + */ +static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping, + unsigned long flags) +{ + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + if (unlikely(lba >= arena->external_nlba)) + dev_err_ratelimited(to_dev(arena), + "%s: lba %#x out of range (max: %#x)\n", + __func__, lba, arena->external_nlba); + return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags); +} + +static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, + u32 z_flag, u32 e_flag, unsigned long rwb_flags) +{ + u32 ze; + __le32 mapping_le; + + /* + * This 'mapping' is supposed to be just the LBA mapping, without + * any flags set, so strip the flag bits. 
+ */ + mapping = ent_lba(mapping); + + ze = (z_flag << 1) + e_flag; + switch (ze) { + case 0: + /* + * We want to set neither of the Z or E flags, and + * in the actual layout, this means setting the bit + * positions of both to '1' to indicate a 'normal' + * map entry + */ + mapping |= MAP_ENT_NORMAL; + break; + case 1: + mapping |= (1 << MAP_ERR_SHIFT); + break; + case 2: + mapping |= (1 << MAP_TRIM_SHIFT); + break; + default: + /* + * The case where Z and E are both sent in as '1' could be + * construed as a valid 'normal' case, but we decide not to, + * to avoid confusion + */ + dev_err_ratelimited(to_dev(arena), + "Invalid use of Z and E flags\n"); + return -EIO; + } + + mapping_le = cpu_to_le32(mapping); + return __btt_map_write(arena, lba, mapping_le, rwb_flags); +} + +static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, + int *trim, int *error, unsigned long rwb_flags) +{ + int ret; + __le32 in; + u32 raw_mapping, postmap, ze, z_flag, e_flag; + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + if (unlikely(lba >= arena->external_nlba)) + dev_err_ratelimited(to_dev(arena), + "%s: lba %#x out of range (max: %#x)\n", + __func__, lba, arena->external_nlba); + + ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags); + if (ret) + return ret; + + raw_mapping = le32_to_cpu(in); + + z_flag = ent_z_flag(raw_mapping); + e_flag = ent_e_flag(raw_mapping); + ze = (z_flag << 1) + e_flag; + postmap = ent_lba(raw_mapping); + + /* Reuse the {z,e}_flag variables for *trim and *error */ + z_flag = 0; + e_flag = 0; + + switch (ze) { + case 0: + /* Initial state. Return postmap = premap */ + *mapping = lba; + break; + case 1: + *mapping = postmap; + e_flag = 1; + break; + case 2: + *mapping = postmap; + z_flag = 1; + break; + case 3: + *mapping = postmap; + break; + default: + return -EIO; + } + + if (trim) + *trim = z_flag; + if (error) + *error = e_flag; + + return ret; +} + +static int btt_log_group_read(struct arena_info *arena, u32 lane, + struct log_group *log) +{ + return arena_read_bytes(arena, + arena->logoff + (lane * LOG_GRP_SIZE), log, + LOG_GRP_SIZE, 0); +} + +static struct dentry *debugfs_root; + +static void arena_debugfs_init(struct arena_info *a, struct dentry *parent, + int idx) +{ + char dirname[32]; + struct dentry *d; + + /* If for some reason, parent bttN was not created, exit */ + if (!parent) + return; + + snprintf(dirname, 32, "arena%d", idx); + d = debugfs_create_dir(dirname, parent); + if (IS_ERR_OR_NULL(d)) + return; + a->debugfs_dir = d; + + debugfs_create_x64("size", S_IRUGO, d, &a->size); + debugfs_create_x64("external_lba_start", S_IRUGO, d, + &a->external_lba_start); + debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba); + debugfs_create_u32("internal_lbasize", S_IRUGO, d, + &a->internal_lbasize); + debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba); + debugfs_create_u32("external_lbasize", S_IRUGO, d, + &a->external_lbasize); + debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree); + debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major); + debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor); + debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff); + debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff); + debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff); + debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff); + debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff); + debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off); + debugfs_create_x32("flags", S_IRUGO, d, 
&a->flags); + debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]); + debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]); +} + +static void btt_debugfs_init(struct btt *btt) +{ + int i = 0; + struct arena_info *arena; + + btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev), + debugfs_root); + if (IS_ERR_OR_NULL(btt->debugfs_dir)) + return; + + list_for_each_entry(arena, &btt->arena_list, list) { + arena_debugfs_init(arena, btt->debugfs_dir, i); + i++; + } +} + +static u32 log_seq(struct log_group *log, int log_idx) +{ + return le32_to_cpu(log->ent[log_idx].seq); +} + +/* + * This function accepts two log entries, and uses the + * sequence number to find the 'older' entry. + * It also updates the sequence number in this old entry to + * make it the 'new' one if the mark_flag is set. + * Finally, it returns which of the entries was the older one. + * + * TODO The logic feels a bit kludge-y. make it better.. + */ +static int btt_log_get_old(struct arena_info *a, struct log_group *log) +{ + int idx0 = a->log_index[0]; + int idx1 = a->log_index[1]; + int old; + + /* + * the first ever time this is seen, the entry goes into [0] + * the next time, the following logic works out to put this + * (next) entry into [1] + */ + if (log_seq(log, idx0) == 0) { + log->ent[idx0].seq = cpu_to_le32(1); + return 0; + } + + if (log_seq(log, idx0) == log_seq(log, idx1)) + return -EINVAL; + if (log_seq(log, idx0) + log_seq(log, idx1) > 5) + return -EINVAL; + + if (log_seq(log, idx0) < log_seq(log, idx1)) { + if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1) + old = 0; + else + old = 1; + } else { + if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1) + old = 1; + else + old = 0; + } + + return old; +} + +/* + * This function copies the desired (old/new) log entry into ent if + * it is not NULL. It returns the sub-slot number (0 or 1) + * where the desired log entry was found. Negative return values + * indicate errors. + */ +static int btt_log_read(struct arena_info *arena, u32 lane, + struct log_entry *ent, int old_flag) +{ + int ret; + int old_ent, ret_ent; + struct log_group log; + + ret = btt_log_group_read(arena, lane, &log); + if (ret) + return -EIO; + + old_ent = btt_log_get_old(arena, &log); + if (old_ent < 0 || old_ent > 1) { + dev_err(to_dev(arena), + "log corruption (%d): lane %d seq [%d, %d]\n", + old_ent, lane, log.ent[arena->log_index[0]].seq, + log.ent[arena->log_index[1]].seq); + /* TODO set error state? */ + return -EIO; + } + + ret_ent = (old_flag ? 
old_ent : (1 - old_ent)); + + if (ent != NULL) + memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE); + + return ret_ent; +} + +/* + * This function commits a log entry to media + * It does _not_ prepare the freelist entry for the next write + * btt_flog_write is the wrapper for updating the freelist elements + */ +static int __btt_log_write(struct arena_info *arena, u32 lane, + u32 sub, struct log_entry *ent, unsigned long flags) +{ + int ret; + u32 group_slot = arena->log_index[sub]; + unsigned int log_half = LOG_ENT_SIZE / 2; + void *src = ent; + u64 ns_off; + + ns_off = arena->logoff + (lane * LOG_GRP_SIZE) + + (group_slot * LOG_ENT_SIZE); + /* split the 16B write into atomic, durable halves */ + ret = arena_write_bytes(arena, ns_off, src, log_half, flags); + if (ret) + return ret; + + ns_off += log_half; + src += log_half; + return arena_write_bytes(arena, ns_off, src, log_half, flags); +} + +static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, + struct log_entry *ent) +{ + int ret; + + ret = __btt_log_write(arena, lane, sub, ent, NVDIMM_IO_ATOMIC); + if (ret) + return ret; + + /* prepare the next free entry */ + arena->freelist[lane].sub = 1 - arena->freelist[lane].sub; + if (++(arena->freelist[lane].seq) == 4) + arena->freelist[lane].seq = 1; + if (ent_e_flag(le32_to_cpu(ent->old_map))) + arena->freelist[lane].has_err = 1; + arena->freelist[lane].block = ent_lba(le32_to_cpu(ent->old_map)); + + return ret; +} + +/* + * This function initializes the BTT map to the initial state, which is + * all-zeroes, and indicates an identity mapping + */ +static int btt_map_init(struct arena_info *arena) +{ + int ret = -EINVAL; + void *zerobuf; + size_t offset = 0; + size_t chunk_size = SZ_2M; + size_t mapsize = arena->logoff - arena->mapoff; + + zerobuf = kzalloc(chunk_size, GFP_KERNEL); + if (!zerobuf) + return -ENOMEM; + + /* + * mapoff should always be at least 512B aligned. We rely on that to + * make sure rw_bytes does error clearing correctly, so make sure that + * is the case. + */ + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512), + "arena->mapoff: %#llx is unaligned\n", arena->mapoff); + + while (mapsize) { + size_t size = min(mapsize, chunk_size); + + dev_WARN_ONCE(to_dev(arena), size < 512, + "chunk size: %#zx is unaligned\n", size); + ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf, + size, 0); + if (ret) + goto free; + + offset += size; + mapsize -= size; + cond_resched(); + } + + free: + kfree(zerobuf); + return ret; +} + +/* + * This function initializes the BTT log with 'fake' entries pointing + * to the initial reserved set of blocks as being free + */ +static int btt_log_init(struct arena_info *arena) +{ + size_t logsize = arena->info2off - arena->logoff; + size_t chunk_size = SZ_4K, offset = 0; + struct log_entry ent; + void *zerobuf; + int ret; + u32 i; + + zerobuf = kzalloc(chunk_size, GFP_KERNEL); + if (!zerobuf) + return -ENOMEM; + /* + * logoff should always be at least 512B aligned. We rely on that to + * make sure rw_bytes does error clearing correctly, so make sure that + * is the case. 
+ */ + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512), + "arena->logoff: %#llx is unaligned\n", arena->logoff); + + while (logsize) { + size_t size = min(logsize, chunk_size); + + dev_WARN_ONCE(to_dev(arena), size < 512, + "chunk size: %#zx is unaligned\n", size); + ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf, + size, 0); + if (ret) + goto free; + + offset += size; + logsize -= size; + cond_resched(); + } + + for (i = 0; i < arena->nfree; i++) { + ent.lba = cpu_to_le32(i); + ent.old_map = cpu_to_le32(arena->external_nlba + i); + ent.new_map = cpu_to_le32(arena->external_nlba + i); + ent.seq = cpu_to_le32(LOG_SEQ_INIT); + ret = __btt_log_write(arena, i, 0, &ent, 0); + if (ret) + goto free; + } + + free: + kfree(zerobuf); + return ret; +} + +static u64 to_namespace_offset(struct arena_info *arena, u64 lba) +{ + return arena->dataoff + ((u64)lba * arena->internal_lbasize); +} + +static int arena_clear_freelist_error(struct arena_info *arena, u32 lane) +{ + int ret = 0; + + if (arena->freelist[lane].has_err) { + void *zero_page = page_address(ZERO_PAGE(0)); + u32 lba = arena->freelist[lane].block; + u64 nsoff = to_namespace_offset(arena, lba); + unsigned long len = arena->sector_size; + + mutex_lock(&arena->err_lock); + + while (len) { + unsigned long chunk = min(len, PAGE_SIZE); + + ret = arena_write_bytes(arena, nsoff, zero_page, + chunk, 0); + if (ret) + break; + len -= chunk; + nsoff += chunk; + if (len == 0) + arena->freelist[lane].has_err = 0; + } + mutex_unlock(&arena->err_lock); + } + return ret; +} + +static int btt_freelist_init(struct arena_info *arena) +{ + int new, ret; + struct log_entry log_new; + u32 i, map_entry, log_oldmap, log_newmap; + + arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry), + GFP_KERNEL); + if (!arena->freelist) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) { + new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT); + if (new < 0) + return new; + + /* old and new map entries with any flags stripped out */ + log_oldmap = ent_lba(le32_to_cpu(log_new.old_map)); + log_newmap = ent_lba(le32_to_cpu(log_new.new_map)); + + /* sub points to the next one to be overwritten */ + arena->freelist[i].sub = 1 - new; + arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq)); + arena->freelist[i].block = log_oldmap; + + /* + * FIXME: if error clearing fails during init, we want to make + * the BTT read-only + */ + if (ent_e_flag(le32_to_cpu(log_new.old_map)) && + !ent_normal(le32_to_cpu(log_new.old_map))) { + arena->freelist[i].has_err = 1; + ret = arena_clear_freelist_error(arena, i); + if (ret) + dev_err_ratelimited(to_dev(arena), + "Unable to clear known errors\n"); + } + + /* This implies a newly created or untouched flog entry */ + if (log_oldmap == log_newmap) + continue; + + /* Check if map recovery is needed */ + ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry, + NULL, NULL, 0); + if (ret) + return ret; + + /* + * The map_entry from btt_read_map is stripped of any flag bits, + * so use the stripped out versions from the log as well for + * testing whether recovery is needed. For restoration, use the + * 'raw' version of the log entries as that captured what we + * were going to write originally. + */ + if ((log_newmap != map_entry) && (log_oldmap == map_entry)) { + /* + * Last transaction wrote the flog, but wasn't able + * to complete the map write. So fix up the map. 
+ */ + ret = btt_map_write(arena, le32_to_cpu(log_new.lba), + le32_to_cpu(log_new.new_map), 0, 0, 0); + if (ret) + return ret; + } + } + + return 0; +} + +static bool ent_is_padding(struct log_entry *ent) +{ + return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0) + && (ent->seq == 0); +} + +/* + * Detecting valid log indices: We read a log group (see the comments in btt.h + * for a description of a 'log_group' and its 'slots'), and iterate over its + * four slots. We expect that a padding slot will be all-zeroes, and use this + * to detect a padding slot vs. an actual entry. + * + * If a log_group is in the initial state, i.e. hasn't been used since the + * creation of this BTT layout, it will have three of the four slots with + * zeroes. We skip over these log_groups for the detection of log_index. If + * all log_groups are in the initial state (i.e. the BTT has never been + * written to), it is safe to assume the 'new format' of log entries in slots + * (0, 1). + */ +static int log_set_indices(struct arena_info *arena) +{ + bool idx_set = false, initial_state = true; + int ret, log_index[2] = {-1, -1}; + u32 i, j, next_idx = 0; + struct log_group log; + u32 pad_count = 0; + + for (i = 0; i < arena->nfree; i++) { + ret = btt_log_group_read(arena, i, &log); + if (ret < 0) + return ret; + + for (j = 0; j < 4; j++) { + if (!idx_set) { + if (ent_is_padding(&log.ent[j])) { + pad_count++; + continue; + } else { + /* Skip if index has been recorded */ + if ((next_idx == 1) && + (j == log_index[0])) + continue; + /* valid entry, record index */ + log_index[next_idx] = j; + next_idx++; + } + if (next_idx == 2) { + /* two valid entries found */ + idx_set = true; + } else if (next_idx > 2) { + /* too many valid indices */ + return -ENXIO; + } + } else { + /* + * once the indices have been set, just verify + * that all subsequent log groups are either in + * their initial state or follow the same + * indices. + */ + if (j == log_index[0]) { + /* entry must be 'valid' */ + if (ent_is_padding(&log.ent[j])) + return -ENXIO; + } else if (j == log_index[1]) { + ; + /* + * log_index[1] can be padding if the + * lane never got used and it is still + * in the initial state (three 'padding' + * entries) + */ + } else { + /* entry must be invalid (padding) */ + if (!ent_is_padding(&log.ent[j])) + return -ENXIO; + } + } + } + /* + * If any of the log_groups have more than one valid, + * non-padding entry, then the we are no longer in the + * initial_state + */ + if (pad_count < 3) + initial_state = false; + pad_count = 0; + } + + if (!initial_state && !idx_set) + return -ENXIO; + + /* + * If all the entries in the log were in the initial state, + * assume new padding scheme + */ + if (initial_state) + log_index[1] = 1; + + /* + * Only allow the known permutations of log/padding indices, + * i.e. 
(0, 1), and (0, 2) + */ + if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2))) + ; /* known index possibilities */ + else { + dev_err(to_dev(arena), "Found an unknown padding scheme\n"); + return -ENXIO; + } + + arena->log_index[0] = log_index[0]; + arena->log_index[1] = log_index[1]; + dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]); + dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]); + return 0; +} + +static int btt_rtt_init(struct arena_info *arena) +{ + arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL); + if (arena->rtt == NULL) + return -ENOMEM; + + return 0; +} + +static int btt_maplocks_init(struct arena_info *arena) +{ + u32 i; + + arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock), + GFP_KERNEL); + if (!arena->map_locks) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) + spin_lock_init(&arena->map_locks[i].lock); + + return 0; +} + +static struct arena_info *alloc_arena(struct btt *btt, size_t size, + size_t start, size_t arena_off) +{ + struct arena_info *arena; + u64 logsize, mapsize, datasize; + u64 available = size; + + arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL); + if (!arena) + return NULL; + arena->nd_btt = btt->nd_btt; + arena->sector_size = btt->sector_size; + mutex_init(&arena->err_lock); + + if (!size) + return arena; + + arena->size = size; + arena->external_lba_start = start; + arena->external_lbasize = btt->lbasize; + arena->internal_lbasize = roundup(arena->external_lbasize, + INT_LBASIZE_ALIGNMENT); + arena->nfree = BTT_DEFAULT_NFREE; + arena->version_major = btt->nd_btt->version_major; + arena->version_minor = btt->nd_btt->version_minor; + + if (available % BTT_PG_SIZE) + available -= (available % BTT_PG_SIZE); + + /* Two pages are reserved for the super block and its copy */ + available -= 2 * BTT_PG_SIZE; + + /* The log takes a fixed amount of space based on nfree */ + logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE); + available -= logsize; + + /* Calculate optimal split between map and data area */ + arena->internal_nlba = div_u64(available - BTT_PG_SIZE, + arena->internal_lbasize + MAP_ENT_SIZE); + arena->external_nlba = arena->internal_nlba - arena->nfree; + + mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE); + datasize = available - mapsize; + + /* 'Absolute' values, relative to start of storage space */ + arena->infooff = arena_off; + arena->dataoff = arena->infooff + BTT_PG_SIZE; + arena->mapoff = arena->dataoff + datasize; + arena->logoff = arena->mapoff + mapsize; + arena->info2off = arena->logoff + logsize; + + /* Default log indices are (0,1) */ + arena->log_index[0] = 0; + arena->log_index[1] = 1; + return arena; +} + +static void free_arenas(struct btt *btt) +{ + struct arena_info *arena, *next; + + list_for_each_entry_safe(arena, next, &btt->arena_list, list) { + list_del(&arena->list); + kfree(arena->rtt); + kfree(arena->map_locks); + kfree(arena->freelist); + debugfs_remove_recursive(arena->debugfs_dir); + kfree(arena); + } +} + +/* + * This function reads an existing valid btt superblock and + * populates the corresponding arena_info struct + */ +static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super, + u64 arena_off) +{ + arena->internal_nlba = le32_to_cpu(super->internal_nlba); + arena->internal_lbasize = le32_to_cpu(super->internal_lbasize); + arena->external_nlba = le32_to_cpu(super->external_nlba); + arena->external_lbasize = le32_to_cpu(super->external_lbasize); + arena->nfree = 
le32_to_cpu(super->nfree); + arena->version_major = le16_to_cpu(super->version_major); + arena->version_minor = le16_to_cpu(super->version_minor); + + arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off + + le64_to_cpu(super->nextoff)); + arena->infooff = arena_off; + arena->dataoff = arena_off + le64_to_cpu(super->dataoff); + arena->mapoff = arena_off + le64_to_cpu(super->mapoff); + arena->logoff = arena_off + le64_to_cpu(super->logoff); + arena->info2off = arena_off + le64_to_cpu(super->info2off); + + arena->size = (le64_to_cpu(super->nextoff) > 0) + ? (le64_to_cpu(super->nextoff)) + : (arena->info2off - arena->infooff + BTT_PG_SIZE); + + arena->flags = le32_to_cpu(super->flags); +} + +static int discover_arenas(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + struct btt_sb *super; + size_t remaining = btt->rawsize; + u64 cur_nlba = 0; + size_t cur_off = 0; + int num_arenas = 0; + + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return -ENOMEM; + + while (remaining) { + /* Alloc memory for arena */ + arena = alloc_arena(btt, 0, 0, 0); + if (!arena) { + ret = -ENOMEM; + goto out_super; + } + + arena->infooff = cur_off; + ret = btt_info_read(arena, super); + if (ret) + goto out; + + if (!nd_btt_arena_is_valid(btt->nd_btt, super)) { + if (remaining == btt->rawsize) { + btt->init_state = INIT_NOTFOUND; + dev_info(to_dev(arena), "No existing arenas\n"); + goto out; + } else { + dev_err(to_dev(arena), + "Found corrupted metadata!\n"); + ret = -ENODEV; + goto out; + } + } + + arena->external_lba_start = cur_nlba; + parse_arena_meta(arena, super, cur_off); + + ret = log_set_indices(arena); + if (ret) { + dev_err(to_dev(arena), + "Unable to deduce log/padding indices\n"); + goto out; + } + + ret = btt_freelist_init(arena); + if (ret) + goto out; + + ret = btt_rtt_init(arena); + if (ret) + goto out; + + ret = btt_maplocks_init(arena); + if (ret) + goto out; + + list_add_tail(&arena->list, &btt->arena_list); + + remaining -= arena->size; + cur_off += arena->size; + cur_nlba += arena->external_nlba; + num_arenas++; + + if (arena->nextoff == 0) + break; + } + btt->num_arenas = num_arenas; + btt->nlba = cur_nlba; + btt->init_state = INIT_READY; + + kfree(super); + return ret; + + out: + kfree(arena); + free_arenas(btt); + out_super: + kfree(super); + return ret; +} + +static int create_arenas(struct btt *btt) +{ + size_t remaining = btt->rawsize; + size_t cur_off = 0; + + while (remaining) { + struct arena_info *arena; + size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining); + + remaining -= arena_size; + if (arena_size < ARENA_MIN_SIZE) + break; + + arena = alloc_arena(btt, arena_size, btt->nlba, cur_off); + if (!arena) { + free_arenas(btt); + return -ENOMEM; + } + btt->nlba += arena->external_nlba; + if (remaining >= ARENA_MIN_SIZE) + arena->nextoff = arena->size; + else + arena->nextoff = 0; + cur_off += arena_size; + list_add_tail(&arena->list, &btt->arena_list); + } + + return 0; +} + +/* + * This function completes arena initialization by writing + * all the metadata. + * It is only called for an uninitialized arena when a write + * to that arena occurs for the first time. 
+ */ +static int btt_arena_write_layout(struct arena_info *arena) +{ + int ret; + u64 sum; + struct btt_sb *super; + struct nd_btt *nd_btt = arena->nd_btt; + const uuid_t *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev); + + ret = btt_map_init(arena); + if (ret) + return ret; + + ret = btt_log_init(arena); + if (ret) + return ret; + + super = kzalloc(sizeof(struct btt_sb), GFP_NOIO); + if (!super) + return -ENOMEM; + + strncpy(super->signature, BTT_SIG, BTT_SIG_LEN); + export_uuid(super->uuid, nd_btt->uuid); + export_uuid(super->parent_uuid, parent_uuid); + super->flags = cpu_to_le32(arena->flags); + super->version_major = cpu_to_le16(arena->version_major); + super->version_minor = cpu_to_le16(arena->version_minor); + super->external_lbasize = cpu_to_le32(arena->external_lbasize); + super->external_nlba = cpu_to_le32(arena->external_nlba); + super->internal_lbasize = cpu_to_le32(arena->internal_lbasize); + super->internal_nlba = cpu_to_le32(arena->internal_nlba); + super->nfree = cpu_to_le32(arena->nfree); + super->infosize = cpu_to_le32(sizeof(struct btt_sb)); + super->nextoff = cpu_to_le64(arena->nextoff); + /* + * Subtract arena->infooff (arena start) so numbers are relative + * to 'this' arena + */ + super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff); + super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff); + super->logoff = cpu_to_le64(arena->logoff - arena->infooff); + super->info2off = cpu_to_le64(arena->info2off - arena->infooff); + + super->flags = 0; + sum = nd_sb_checksum((struct nd_gen_sb *) super); + super->checksum = cpu_to_le64(sum); + + ret = btt_info_write(arena, super); + + kfree(super); + return ret; +} + +/* + * This function completes the initialization for the BTT namespace + * such that it is ready to accept IOs + */ +static int btt_meta_init(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + + mutex_lock(&btt->init_lock); + list_for_each_entry(arena, &btt->arena_list, list) { + ret = btt_arena_write_layout(arena); + if (ret) + goto unlock; + + ret = btt_freelist_init(arena); + if (ret) + goto unlock; + + ret = btt_rtt_init(arena); + if (ret) + goto unlock; + + ret = btt_maplocks_init(arena); + if (ret) + goto unlock; + } + + btt->init_state = INIT_READY; + + unlock: + mutex_unlock(&btt->init_lock); + return ret; +} + +static u32 btt_meta_size(struct btt *btt) +{ + return btt->lbasize - btt->sector_size; +} + +/* + * This function calculates the arena in which the given LBA lies + * by doing a linear walk. This is acceptable since we expect only + * a few arenas. If we have backing devices that get much larger, + * we can construct a balanced binary tree of arenas at init time + * so that this range search becomes faster. 
+ */ +static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap, + struct arena_info **arena) +{ + struct arena_info *arena_list; + __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size); + + list_for_each_entry(arena_list, &btt->arena_list, list) { + if (lba < arena_list->external_nlba) { + *arena = arena_list; + *premap = lba; + return 0; + } + lba -= arena_list->external_nlba; + } + + return -EIO; +} + +/* + * The following (lock_map, unlock_map) are mostly just to improve + * readability, since they index into an array of locks + */ +static void lock_map(struct arena_info *arena, u32 premap) + __acquires(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_lock(&arena->map_locks[idx].lock); +} + +static void unlock_map(struct arena_info *arena, u32 premap) + __releases(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_unlock(&arena->map_locks[idx].lock); +} + +static int btt_data_read(struct arena_info *arena, struct page *page, + unsigned int off, u32 lba, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_read_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC); + kunmap_atomic(mem); + + return ret; +} + +static int btt_data_write(struct arena_info *arena, u32 lba, + struct page *page, unsigned int off, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_write_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC); + kunmap_atomic(mem); + + return ret; +} + +static void zero_fill_data(struct page *page, unsigned int off, u32 len) +{ + void *mem = kmap_atomic(page); + + memset(mem + off, 0, len); + kunmap_atomic(mem); +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip, + struct arena_info *arena, u32 postmap, int rw) +{ + unsigned int len = btt_meta_size(btt); + u64 meta_nsoff; + int ret = 0; + + if (bip == NULL) + return 0; + + meta_nsoff = to_namespace_offset(arena, postmap) + btt->sector_size; + + while (len) { + unsigned int cur_len; + struct bio_vec bv; + void *mem; + + bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + /* + * The 'bv' obtained from bvec_iter_bvec has its .bv_len and + * .bv_offset already adjusted for iter->bi_bvec_done, and we + * can use those directly + */ + + cur_len = min(len, bv.bv_len); + mem = bvec_kmap_local(&bv); + if (rw) + ret = arena_write_bytes(arena, meta_nsoff, mem, cur_len, + NVDIMM_IO_ATOMIC); + else + ret = arena_read_bytes(arena, meta_nsoff, mem, cur_len, + NVDIMM_IO_ATOMIC); + + kunmap_local(mem); + if (ret) + return ret; + + len -= cur_len; + meta_nsoff += cur_len; + if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len)) + return -EIO; + } + + return ret; +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip, + struct arena_info *arena, u32 postmap, int rw) +{ + return 0; +} +#endif + +static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, + struct page *page, unsigned int off, sector_t sector, + unsigned int len) +{ + int ret = 0; + int t_flag, e_flag; + struct arena_info *arena = NULL; + u32 lane = 0, premap, postmap; + + while (len) { + u32 cur_len; + + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + + cur_len = 
min(btt->sector_size, len); + + ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag, + NVDIMM_IO_ATOMIC); + if (ret) + goto out_lane; + + /* + * We loop to make sure that the post map LBA didn't change + * from under us between writing the RTT and doing the actual + * read. + */ + while (1) { + u32 new_map; + int new_t, new_e; + + if (t_flag) { + zero_fill_data(page, off, cur_len); + goto out_lane; + } + + if (e_flag) { + ret = -EIO; + goto out_lane; + } + + arena->rtt[lane] = RTT_VALID | postmap; + /* + * Barrier to make sure this write is not reordered + * to do the verification map_read before the RTT store + */ + barrier(); + + ret = btt_map_read(arena, premap, &new_map, &new_t, + &new_e, NVDIMM_IO_ATOMIC); + if (ret) + goto out_rtt; + + if ((postmap == new_map) && (t_flag == new_t) && + (e_flag == new_e)) + break; + + postmap = new_map; + t_flag = new_t; + e_flag = new_e; + } + + ret = btt_data_read(arena, page, off, postmap, cur_len); + if (ret) { + /* Media error - set the e_flag */ + if (btt_map_write(arena, premap, postmap, 0, 1, NVDIMM_IO_ATOMIC)) + dev_warn_ratelimited(to_dev(arena), + "Error persistently tracking bad blocks at %#x\n", + premap); + goto out_rtt; + } + + if (bip) { + ret = btt_rw_integrity(btt, bip, arena, postmap, READ); + if (ret) + goto out_rtt; + } + + arena->rtt[lane] = RTT_INVALID; + nd_region_release_lane(btt->nd_region, lane); + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_rtt: + arena->rtt[lane] = RTT_INVALID; + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +/* + * Normally, arena_{read,write}_bytes will take care of the initial offset + * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem, + * we need the final, raw namespace offset here + */ +static bool btt_is_badblock(struct btt *btt, struct arena_info *arena, + u32 postmap) +{ + u64 nsoff = adjust_initial_offset(arena->nd_btt, + to_namespace_offset(arena, postmap)); + sector_t phys_sector = nsoff >> 9; + + return is_bad_pmem(btt->phys_bb, phys_sector, arena->internal_lbasize); +} + +static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, + sector_t sector, struct page *page, unsigned int off, + unsigned int len) +{ + int ret = 0; + struct arena_info *arena = NULL; + u32 premap = 0, old_postmap, new_postmap, lane = 0, i; + struct log_entry log; + int sub; + + while (len) { + u32 cur_len; + int e_flag; + + retry: + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + cur_len = min(btt->sector_size, len); + + if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) { + ret = -EIO; + goto out_lane; + } + + if (btt_is_badblock(btt, arena, arena->freelist[lane].block)) + arena->freelist[lane].has_err = 1; + + if (mutex_is_locked(&arena->err_lock) + || arena->freelist[lane].has_err) { + nd_region_release_lane(btt->nd_region, lane); + + ret = arena_clear_freelist_error(arena, lane); + if (ret) + return ret; + + /* OK to acquire a different lane/free block */ + goto retry; + } + + new_postmap = arena->freelist[lane].block; + + /* Wait if the new block is being read from */ + for (i = 0; i < arena->nfree; i++) + while (arena->rtt[i] == (RTT_VALID | new_postmap)) + cpu_relax(); + + + if (new_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_lane; + } + + ret = btt_data_write(arena, new_postmap, page, off, cur_len); + if (ret) + goto out_lane; + + if (bip) { + ret = btt_rw_integrity(btt, 
bip, arena, new_postmap, + WRITE); + if (ret) + goto out_lane; + } + + lock_map(arena, premap); + ret = btt_map_read(arena, premap, &old_postmap, NULL, &e_flag, + NVDIMM_IO_ATOMIC); + if (ret) + goto out_map; + if (old_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_map; + } + if (e_flag) + set_e_flag(old_postmap); + + log.lba = cpu_to_le32(premap); + log.old_map = cpu_to_le32(old_postmap); + log.new_map = cpu_to_le32(new_postmap); + log.seq = cpu_to_le32(arena->freelist[lane].seq); + sub = arena->freelist[lane].sub; + ret = btt_flog_write(arena, lane, sub, &log); + if (ret) + goto out_map; + + ret = btt_map_write(arena, premap, new_postmap, 0, 0, + NVDIMM_IO_ATOMIC); + if (ret) + goto out_map; + + unlock_map(arena, premap); + nd_region_release_lane(btt->nd_region, lane); + + if (e_flag) { + ret = arena_clear_freelist_error(arena, lane); + if (ret) + return ret; + } + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_map: + unlock_map(arena, premap); + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, + struct page *page, unsigned int len, unsigned int off, + enum req_op op, sector_t sector) +{ + int ret; + + if (!op_is_write(op)) { + ret = btt_read_pg(btt, bip, page, off, sector, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + ret = btt_write_pg(btt, bip, sector, page, off, len); + } + + return ret; +} + +static void btt_submit_bio(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct btt *btt = bio->bi_bdev->bd_disk->private_data; + struct bvec_iter iter; + unsigned long start; + struct bio_vec bvec; + int err = 0; + bool do_acct; + + if (!bio_integrity_prep(bio)) + return; + + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); + if (do_acct) + start = bio_start_io_acct(bio); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + + if (len > PAGE_SIZE || len < btt->sector_size || + len % btt->sector_size) { + dev_err_ratelimited(&btt->nd_btt->dev, + "unaligned bio segment (len: %d)\n", len); + bio->bi_status = BLK_STS_IOERR; + break; + } + + err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, + bio_op(bio), iter.bi_sector); + if (err) { + dev_err(&btt->nd_btt->dev, + "io error in %s sector %lld, len %d,\n", + (op_is_write(bio_op(bio))) ? 
"WRITE" : + "READ", + (unsigned long long) iter.bi_sector, len); + bio->bi_status = errno_to_blk_status(err); + break; + } + } + if (do_acct) + bio_end_io_acct(bio, start); + + bio_endio(bio); +} + +static int btt_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, enum req_op op) +{ + struct btt *btt = bdev->bd_disk->private_data; + int rc; + + rc = btt_do_bvec(btt, NULL, page, thp_size(page), 0, op, sector); + if (rc == 0) + page_endio(page, op_is_write(op), 0); + + return rc; +} + + +static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + +static const struct block_device_operations btt_fops = { + .owner = THIS_MODULE, + .submit_bio = btt_submit_bio, + .rw_page = btt_rw_page, + .getgeo = btt_getgeo, +}; + +static int btt_blk_init(struct btt *btt) +{ + struct nd_btt *nd_btt = btt->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + int rc = -ENOMEM; + + btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE); + if (!btt->btt_disk) + return -ENOMEM; + + nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); + btt->btt_disk->first_minor = 0; + btt->btt_disk->fops = &btt_fops; + btt->btt_disk->private_data = btt; + + blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size); + blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX); + blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); + + if (btt_meta_size(btt)) { + rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt)); + if (rc) + goto out_cleanup_disk; + } + + set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); + rc = device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL); + if (rc) + goto out_cleanup_disk; + + btt->nd_btt->size = btt->nlba * (u64)btt->sector_size; + nvdimm_check_and_set_ro(btt->btt_disk); + + return 0; + +out_cleanup_disk: + put_disk(btt->btt_disk); + return rc; +} + +static void btt_blk_cleanup(struct btt *btt) +{ + del_gendisk(btt->btt_disk); + put_disk(btt->btt_disk); +} + +/** + * btt_init - initialize a block translation table for the given device + * @nd_btt: device with BTT geometry and backing device info + * @rawsize: raw size in bytes of the backing device + * @lbasize: lba size of the backing device + * @uuid: A uuid for the backing device - this is stored on media + * @maxlane: maximum number of parallel requests the device can handle + * + * Initialize a Block Translation Table on a backing device to provide + * single sector power fail atomicity. + * + * Context: + * Might sleep. + * + * Returns: + * Pointer to a new struct btt on success, NULL on failure. + */ +static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, + u32 lbasize, uuid_t *uuid, + struct nd_region *nd_region) +{ + int ret; + struct btt *btt; + struct nd_namespace_io *nsio; + struct device *dev = &nd_btt->dev; + + btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL); + if (!btt) + return NULL; + + btt->nd_btt = nd_btt; + btt->rawsize = rawsize; + btt->lbasize = lbasize; + btt->sector_size = ((lbasize >= 4096) ? 
4096 : 512); + INIT_LIST_HEAD(&btt->arena_list); + mutex_init(&btt->init_lock); + btt->nd_region = nd_region; + nsio = to_nd_namespace_io(&nd_btt->ndns->dev); + btt->phys_bb = &nsio->bb; + + ret = discover_arenas(btt); + if (ret) { + dev_err(dev, "init: error in arena_discover: %d\n", ret); + return NULL; + } + + if (btt->init_state != INIT_READY && nd_region->ro) { + dev_warn(dev, "%s is read-only, unable to init btt metadata\n", + dev_name(&nd_region->dev)); + return NULL; + } else if (btt->init_state != INIT_READY) { + btt->num_arenas = (rawsize / ARENA_MAX_SIZE) + + ((rawsize % ARENA_MAX_SIZE) ? 1 : 0); + dev_dbg(dev, "init: %d arenas for %llu rawsize\n", + btt->num_arenas, rawsize); + + ret = create_arenas(btt); + if (ret) { + dev_info(dev, "init: create_arenas: %d\n", ret); + return NULL; + } + + ret = btt_meta_init(btt); + if (ret) { + dev_err(dev, "init: error in meta_init: %d\n", ret); + return NULL; + } + } + + ret = btt_blk_init(btt); + if (ret) { + dev_err(dev, "init: error in blk_init: %d\n", ret); + return NULL; + } + + btt_debugfs_init(btt); + + return btt; +} + +/** + * btt_fini - de-initialize a BTT + * @btt: the BTT handle that was generated by btt_init + * + * De-initialize a Block Translation Table on device removal + * + * Context: + * Might sleep. + */ +static void btt_fini(struct btt *btt) +{ + if (btt) { + btt_blk_cleanup(btt); + free_arenas(btt); + debugfs_remove_recursive(btt->debugfs_dir); + } +} + +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt = to_nd_btt(ndns->claim); + struct nd_region *nd_region; + struct btt_sb *btt_sb; + struct btt *btt; + size_t size, rawsize; + int rc; + + if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) { + dev_dbg(&nd_btt->dev, "incomplete btt configuration\n"); + return -ENODEV; + } + + btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL); + if (!btt_sb) + return -ENOMEM; + + size = nvdimm_namespace_capacity(ndns); + rc = devm_namespace_enable(&nd_btt->dev, ndns, size); + if (rc) + return rc; + + /* + * If this returns < 0, that is ok as it just means there wasn't + * an existing BTT, and we're creating a new one. 
We still need to + * call this as we need the version dependent fields in nd_btt to be + * set correctly based on the holder class + */ + nd_btt_version(nd_btt, ndns, btt_sb); + + rawsize = size - nd_btt->initial_offset; + if (rawsize < ARENA_MIN_SIZE) { + dev_dbg(&nd_btt->dev, "%s must be at least %ld bytes\n", + dev_name(&ndns->dev), + ARENA_MIN_SIZE + nd_btt->initial_offset); + return -ENXIO; + } + nd_region = to_nd_region(nd_btt->dev.parent); + btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid, + nd_region); + if (!btt) + return -ENOMEM; + nd_btt->btt = btt; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_attach_btt); + +int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt) +{ + struct btt *btt = nd_btt->btt; + + btt_fini(btt); + nd_btt->btt = NULL; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_detach_btt); + +static int __init nd_btt_init(void) +{ + int rc = 0; + + debugfs_root = debugfs_create_dir("btt", NULL); + if (IS_ERR_OR_NULL(debugfs_root)) + rc = -ENXIO; + + return rc; +} + +static void __exit nd_btt_exit(void) +{ + debugfs_remove_recursive(debugfs_root); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT); +MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>"); +MODULE_LICENSE("GPL v2"); +module_init(nd_btt_init); +module_exit(nd_btt_exit); diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h new file mode 100644 index 000000000..0c76c0333 --- /dev/null +++ b/drivers/nvdimm/btt.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Block Translation Table library + * Copyright (c) 2014-2015, Intel Corporation. + */ + +#ifndef _LINUX_BTT_H +#define _LINUX_BTT_H + +#include <linux/types.h> + +#define BTT_SIG_LEN 16 +#define BTT_SIG "BTT_ARENA_INFO\0" +#define MAP_ENT_SIZE 4 +#define MAP_TRIM_SHIFT 31 +#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT) +#define MAP_ERR_SHIFT 30 +#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT) +#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT))) +#define MAP_ENT_NORMAL 0xC0000000 +#define LOG_GRP_SIZE sizeof(struct log_group) +#define LOG_ENT_SIZE sizeof(struct log_entry) +#define ARENA_MIN_SIZE (1UL << 24) /* 16 MB */ +#define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */ +#define RTT_VALID (1UL << 31) +#define RTT_INVALID 0 +#define BTT_PG_SIZE 4096 +#define BTT_DEFAULT_NFREE ND_MAX_LANES +#define LOG_SEQ_INIT 1 + +#define IB_FLAG_ERROR 0x00000001 +#define IB_FLAG_ERROR_MASK 0x00000001 + +#define ent_lba(ent) (ent & MAP_LBA_MASK) +#define ent_e_flag(ent) (!!(ent & MAP_ERR_MASK)) +#define ent_z_flag(ent) (!!(ent & MAP_TRIM_MASK)) +#define set_e_flag(ent) (ent |= MAP_ERR_MASK) +/* 'normal' is both e and z flags set */ +#define ent_normal(ent) (ent_e_flag(ent) && ent_z_flag(ent)) + +enum btt_init_state { + INIT_UNCHECKED = 0, + INIT_NOTFOUND, + INIT_READY +}; + +/* + * A log group represents one log 'lane', and consists of four log entries. + * Two of the four entries are valid entries, and the remaining two are + * padding. Due to an old bug in the padding location, we need to perform a + * test to determine the padding scheme being used, and use that scheme + * thereafter. + * + * In kernels prior to 4.15, 'log group' would have actual log entries at + * indices (0, 2) and padding at indices (1, 3), where as the correct/updated + * format has log entries at indices (0, 1) and padding at indices (2, 3). 
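+ *
+ * A rough sketch of the detection idea (the real code is more
+ * defensive and cross-checks every lane before settling on a pair):
+ *
+ *	if (le32_to_cpu(log->ent[1].seq) != 0) {
+ *		log_index[0] = 0;	live entry: new format
+ *		log_index[1] = 1;
+ *	} else {
+ *		log_index[0] = 0;	padding: old format
+ *		log_index[1] = 2;
+ *	}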
+ * + * Old (pre 4.15) format: + * +-----------------+-----------------+ + * | ent[0] | ent[1] | + * | 16B | 16B | + * | lba/old/new/seq | pad | + * +-----------------------------------+ + * | ent[2] | ent[3] | + * | 16B | 16B | + * | lba/old/new/seq | pad | + * +-----------------+-----------------+ + * + * New format: + * +-----------------+-----------------+ + * | ent[0] | ent[1] | + * | 16B | 16B | + * | lba/old/new/seq | lba/old/new/seq | + * +-----------------------------------+ + * | ent[2] | ent[3] | + * | 16B | 16B | + * | pad | pad | + * +-----------------+-----------------+ + * + * We detect during start-up which format is in use, and set + * arena->log_index[(0, 1)] with the detected format. + */ + +struct log_entry { + __le32 lba; + __le32 old_map; + __le32 new_map; + __le32 seq; +}; + +struct log_group { + struct log_entry ent[4]; +}; + +struct btt_sb { + u8 signature[BTT_SIG_LEN]; + u8 uuid[16]; + u8 parent_uuid[16]; + __le32 flags; + __le16 version_major; + __le16 version_minor; + __le32 external_lbasize; + __le32 external_nlba; + __le32 internal_lbasize; + __le32 internal_nlba; + __le32 nfree; + __le32 infosize; + __le64 nextoff; + __le64 dataoff; + __le64 mapoff; + __le64 logoff; + __le64 info2off; + u8 padding[3968]; + __le64 checksum; +}; + +struct free_entry { + u32 block; + u8 sub; + u8 seq; + u8 has_err; +}; + +struct aligned_lock { + union { + spinlock_t lock; + u8 cacheline_padding[L1_CACHE_BYTES]; + }; +}; + +/** + * struct arena_info - handle for an arena + * @size: Size in bytes this arena occupies on the raw device. + * This includes arena metadata. + * @external_lba_start: The first external LBA in this arena. + * @internal_nlba: Number of internal blocks available in the arena + * including nfree reserved blocks + * @internal_lbasize: Internal and external lba sizes may be different as + * we can round up 'odd' external lbasizes such as 520B + * to be aligned. + * @external_nlba: Number of blocks contributed by the arena to the number + * reported to upper layers. (internal_nlba - nfree) + * @external_lbasize: LBA size as exposed to upper layers. + * @nfree: A reserve number of 'free' blocks that is used to + * handle incoming writes. + * @version_major: Metadata layout version major. + * @version_minor: Metadata layout version minor. + * @sector_size: The Linux sector size - 512 or 4096 + * @nextoff: Offset in bytes to the start of the next arena. + * @infooff: Offset in bytes to the info block of this arena. + * @dataoff: Offset in bytes to the data area of this arena. + * @mapoff: Offset in bytes to the map area of this arena. + * @logoff: Offset in bytes to the log area of this arena. + * @info2off: Offset in bytes to the backup info block of this arena. + * @freelist: Pointer to in-memory list of free blocks + * @rtt: Pointer to in-memory "Read Tracking Table" + * @map_locks: Spinlocks protecting concurrent map writes + * @nd_btt: Pointer to parent nd_btt structure. + * @list: List head for list of arenas + * @debugfs_dir: Debugfs dentry + * @flags: Arena flags - may signify error states. + * @err_lock: Mutex for synchronizing error clearing. + * @log_index: Indices of the valid log entries in a log_group + * + * arena_info is a per-arena handle. Once an arena is narrowed down for an + * IO, this struct is passed around for the duration of the IO. 
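+ *
+ * For example, a 1 TiB namespace is carved into two arenas of
+ * ARENA_MAX_SIZE (512 GiB) each; btt_init() rounds up, so any
+ * remainder smaller than ARENA_MAX_SIZE becomes one additional,
+ * smaller arena.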
+ */ +struct arena_info { + u64 size; /* Total bytes for this arena */ + u64 external_lba_start; + u32 internal_nlba; + u32 internal_lbasize; + u32 external_nlba; + u32 external_lbasize; + u32 nfree; + u16 version_major; + u16 version_minor; + u32 sector_size; + /* Byte offsets to the different on-media structures */ + u64 nextoff; + u64 infooff; + u64 dataoff; + u64 mapoff; + u64 logoff; + u64 info2off; + /* Pointers to other in-memory structures for this arena */ + struct free_entry *freelist; + u32 *rtt; + struct aligned_lock *map_locks; + struct nd_btt *nd_btt; + struct list_head list; + struct dentry *debugfs_dir; + /* Arena flags */ + u32 flags; + struct mutex err_lock; + int log_index[2]; +}; + +struct badblocks; + +/** + * struct btt - handle for a BTT instance + * @btt_disk: Pointer to the gendisk for BTT device + * @arena_list: Head of the list of arenas + * @debugfs_dir: Debugfs dentry + * @nd_btt: Parent nd_btt struct + * @nlba: Number of logical blocks exposed to the upper layers + * after removing the amount of space needed by metadata + * @rawsize: Total size in bytes of the available backing device + * @lbasize: LBA size as requested and presented to upper layers. + * This is sector_size + size of any metadata. + * @sector_size: The Linux sector size - 512 or 4096 + * @lanes: Per-lane spinlocks + * @init_lock: Mutex used for the BTT initialization + * @init_state: Flag describing the initialization state for the BTT + * @num_arenas: Number of arenas in the BTT instance + * @phys_bb: Pointer to the namespace's badblocks structure + */ +struct btt { + struct gendisk *btt_disk; + struct list_head arena_list; + struct dentry *debugfs_dir; + struct nd_btt *nd_btt; + u64 nlba; + unsigned long long rawsize; + u32 lbasize; + u32 sector_size; + struct nd_region *nd_region; + struct mutex init_lock; + int init_state; + int num_arenas; + struct badblocks *phys_bb; +}; + +bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super); +int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns, + struct btt_sb *btt_sb); + +#endif diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c new file mode 100644 index 000000000..fabbb31f2 --- /dev/null +++ b/drivers/nvdimm/btt_devs.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "btt.h" +#include "nd.h" + +static void nd_btt_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_btt *nd_btt = to_nd_btt(dev); + + dev_dbg(dev, "trace\n"); + nd_detach_ndns(&nd_btt->dev, &nd_btt->ndns); + ida_simple_remove(&nd_region->btt_ida, nd_btt->id); + kfree(nd_btt->uuid); + kfree(nd_btt); +} + +struct nd_btt *to_nd_btt(struct device *dev) +{ + struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev); + + WARN_ON(!is_nd_btt(dev)); + return nd_btt; +} +EXPORT_SYMBOL(to_nd_btt); + +static const unsigned long btt_lbasize_supported[] = { 512, 520, 528, + 4096, 4104, 4160, 4224, 0 }; + +static ssize_t sector_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + + return nd_size_select_show(nd_btt->lbasize, btt_lbasize_supported, buf); +} + +static ssize_t sector_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_size_select_store(dev, buf, &nd_btt->lbasize, + btt_lbasize_supported); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(sector_size); + +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + + if (nd_btt->uuid) + return sprintf(buf, "%pUb\n", nd_btt->uuid); + return sprintf(buf, "\n"); +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t namespace_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + rc = sprintf(buf, "%s\n", nd_btt->ndns + ? dev_name(&nd_btt->ndns->dev) : ""); + nvdimm_bus_unlock(dev); + return rc; +} + +static ssize_t namespace_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_namespace_store(dev, &nd_btt->ndns, buf, len); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? 
"" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(namespace); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + if (dev->driver) + rc = sprintf(buf, "%llu\n", nd_btt->size); + else { + /* no size to convey if the btt instance is disabled */ + rc = -ENXIO; + } + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(size); + +static ssize_t log_zero_flags_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Y\n"); +} +static DEVICE_ATTR_RO(log_zero_flags); + +static struct attribute *nd_btt_attributes[] = { + &dev_attr_sector_size.attr, + &dev_attr_namespace.attr, + &dev_attr_uuid.attr, + &dev_attr_size.attr, + &dev_attr_log_zero_flags.attr, + NULL, +}; + +static struct attribute_group nd_btt_attribute_group = { + .attrs = nd_btt_attributes, +}; + +static const struct attribute_group *nd_btt_attribute_groups[] = { + &nd_btt_attribute_group, + &nd_device_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +static const struct device_type nd_btt_device_type = { + .name = "nd_btt", + .release = nd_btt_release, + .groups = nd_btt_attribute_groups, +}; + +bool is_nd_btt(struct device *dev) +{ + return dev->type == &nd_btt_device_type; +} +EXPORT_SYMBOL(is_nd_btt); + +static struct lock_class_key nvdimm_btt_key; + +static struct device *__nd_btt_create(struct nd_region *nd_region, + unsigned long lbasize, uuid_t *uuid, + struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt; + struct device *dev; + + nd_btt = kzalloc(sizeof(*nd_btt), GFP_KERNEL); + if (!nd_btt) + return NULL; + + nd_btt->id = ida_simple_get(&nd_region->btt_ida, 0, 0, GFP_KERNEL); + if (nd_btt->id < 0) + goto out_nd_btt; + + nd_btt->lbasize = lbasize; + if (uuid) { + uuid = kmemdup(uuid, 16, GFP_KERNEL); + if (!uuid) + goto out_put_id; + } + nd_btt->uuid = uuid; + dev = &nd_btt->dev; + dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id); + dev->parent = &nd_region->dev; + dev->type = &nd_btt_device_type; + device_initialize(&nd_btt->dev); + lockdep_set_class(&nd_btt->dev.mutex, &nvdimm_btt_key); + if (ndns && !__nd_attach_ndns(&nd_btt->dev, ndns, &nd_btt->ndns)) { + dev_dbg(&ndns->dev, "failed, already claimed by %s\n", + dev_name(ndns->claim)); + put_device(dev); + return NULL; + } + return dev; + +out_put_id: + ida_simple_remove(&nd_region->btt_ida, nd_btt->id); + +out_nd_btt: + kfree(nd_btt); + return NULL; +} + +struct device *nd_btt_create(struct nd_region *nd_region) +{ + struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL); + + nd_device_register(dev); + return dev; +} + +/** + * nd_btt_arena_is_valid - check if the metadata layout is valid + * @nd_btt: device with BTT geometry and backing device info + * @super: pointer to the arena's info block being tested + * + * Check consistency of the btt info block with itself by validating + * the checksum, and with the parent namespace by verifying the + * parent_uuid contained in the info block with the one supplied in. 
+ * + * Returns: + * false for an invalid info block, true for a valid one + */ +bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super) +{ + const uuid_t *ns_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev); + uuid_t parent_uuid; + u64 checksum; + + if (memcmp(super->signature, BTT_SIG, BTT_SIG_LEN) != 0) + return false; + + import_uuid(&parent_uuid, super->parent_uuid); + if (!uuid_is_null(&parent_uuid)) + if (!uuid_equal(&parent_uuid, ns_uuid)) + return false; + + checksum = le64_to_cpu(super->checksum); + super->checksum = 0; + if (checksum != nd_sb_checksum((struct nd_gen_sb *) super)) + return false; + super->checksum = cpu_to_le64(checksum); + + /* TODO: figure out action for this */ + if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0) + dev_info(&nd_btt->dev, "Found arena with an error flag\n"); + + return true; +} +EXPORT_SYMBOL(nd_btt_arena_is_valid); + +int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns, + struct btt_sb *btt_sb) +{ + if (ndns->claim_class == NVDIMM_CCLASS_BTT2) { + /* Probe/setup for BTT v2.0 */ + nd_btt->initial_offset = 0; + nd_btt->version_major = 2; + nd_btt->version_minor = 0; + if (nvdimm_read_bytes(ndns, 0, btt_sb, sizeof(*btt_sb), 0)) + return -ENXIO; + if (!nd_btt_arena_is_valid(nd_btt, btt_sb)) + return -ENODEV; + if ((le16_to_cpu(btt_sb->version_major) != 2) || + (le16_to_cpu(btt_sb->version_minor) != 0)) + return -ENODEV; + } else { + /* + * Probe/setup for BTT v1.1 (NVDIMM_CCLASS_NONE or + * NVDIMM_CCLASS_BTT) + */ + nd_btt->initial_offset = SZ_4K; + nd_btt->version_major = 1; + nd_btt->version_minor = 1; + if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0)) + return -ENXIO; + if (!nd_btt_arena_is_valid(nd_btt, btt_sb)) + return -ENODEV; + if ((le16_to_cpu(btt_sb->version_major) != 1) || + (le16_to_cpu(btt_sb->version_minor) != 1)) + return -ENODEV; + } + return 0; +} +EXPORT_SYMBOL(nd_btt_version); + +static int __nd_btt_probe(struct nd_btt *nd_btt, + struct nd_namespace_common *ndns, struct btt_sb *btt_sb) +{ + int rc; + + if (!btt_sb || !ndns || !nd_btt) + return -ENODEV; + + if (nvdimm_namespace_capacity(ndns) < SZ_16M) + return -ENXIO; + + rc = nd_btt_version(nd_btt, ndns, btt_sb); + if (rc < 0) + return rc; + + nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize); + nd_btt->uuid = kmemdup(&btt_sb->uuid, sizeof(uuid_t), GFP_KERNEL); + if (!nd_btt->uuid) + return -ENOMEM; + + nd_device_register(&nd_btt->dev); + + return 0; +} + +int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns) +{ + int rc; + struct device *btt_dev; + struct btt_sb *btt_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + case NVDIMM_CCLASS_BTT: + case NVDIMM_CCLASS_BTT2: + break; + default: + return -ENODEV; + } + + nvdimm_bus_lock(&ndns->dev); + btt_dev = __nd_btt_create(nd_region, 0, NULL, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!btt_dev) + return -ENOMEM; + btt_sb = devm_kzalloc(dev, sizeof(*btt_sb), GFP_KERNEL); + rc = __nd_btt_probe(to_nd_btt(btt_dev), ndns, btt_sb); + dev_dbg(dev, "btt: %s\n", rc == 0 ? 
dev_name(btt_dev) : "<none>"); + if (rc < 0) { + struct nd_btt *nd_btt = to_nd_btt(btt_dev); + + nd_detach_ndns(btt_dev, &nd_btt->ndns); + put_device(btt_dev); + } + + return rc; +} +EXPORT_SYMBOL(nd_btt_probe); diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c new file mode 100644 index 000000000..5ad490569 --- /dev/null +++ b/drivers/nvdimm/bus.c @@ -0,0 +1,1355 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/libnvdimm.h> +#include <linux/sched/mm.h> +#include <linux/vmalloc.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/fcntl.h> +#include <linux/async.h> +#include <linux/ndctl.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" +#include "pfn.h" + +int nvdimm_major; +static int nvdimm_bus_major; +struct class *nd_class; +static DEFINE_IDA(nd_ida); + +static int to_nd_device_type(struct device *dev) +{ + if (is_nvdimm(dev)) + return ND_DEVICE_DIMM; + else if (is_memory(dev)) + return ND_DEVICE_REGION_PMEM; + else if (is_nd_dax(dev)) + return ND_DEVICE_DAX_PMEM; + else if (is_nd_region(dev->parent)) + return nd_region_to_nstype(to_nd_region(dev->parent)); + + return 0; +} + +static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT, + to_nd_device_type(dev)); +} + +static struct module *to_bus_provider(struct device *dev) +{ + /* pin bus providers while regions are enabled */ + if (is_nd_region(dev)) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + return nvdimm_bus->nd_desc->module; + } + return NULL; +} + +static void nvdimm_bus_probe_start(struct nvdimm_bus *nvdimm_bus) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + nvdimm_bus->probe_active++; + nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +static void nvdimm_bus_probe_end(struct nvdimm_bus *nvdimm_bus) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + if (--nvdimm_bus->probe_active == 0) + wake_up(&nvdimm_bus->wait); + nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +static int nvdimm_bus_probe(struct device *dev) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct module *provider = to_bus_provider(dev); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + int rc; + + if (!try_module_get(provider)) + return -ENXIO; + + dev_dbg(&nvdimm_bus->dev, "START: %s.probe(%s)\n", + dev->driver->name, dev_name(dev)); + + nvdimm_bus_probe_start(nvdimm_bus); + rc = nd_drv->probe(dev); + if ((rc == 0 || rc == -EOPNOTSUPP) && + dev->parent && is_nd_region(dev->parent)) + nd_region_advance_seeds(to_nd_region(dev->parent), dev); + nvdimm_bus_probe_end(nvdimm_bus); + + dev_dbg(&nvdimm_bus->dev, "END: %s.probe(%s) = %d\n", dev->driver->name, + dev_name(dev), rc); + + if (rc != 0) + module_put(provider); + return rc; +} + +static void nvdimm_bus_remove(struct device *dev) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct module *provider = to_bus_provider(dev); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (nd_drv->remove) + nd_drv->remove(dev); + + dev_dbg(&nvdimm_bus->dev, "%s.remove(%s)\n", dev->driver->name, + dev_name(dev)); + module_put(provider); +} + +static void nvdimm_bus_shutdown(struct device *dev) +{ + struct nvdimm_bus 
*nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nd_device_driver *nd_drv = NULL; + + if (dev->driver) + nd_drv = to_nd_device_driver(dev->driver); + + if (nd_drv && nd_drv->shutdown) { + nd_drv->shutdown(dev); + dev_dbg(&nvdimm_bus->dev, "%s.shutdown(%s)\n", + dev->driver->name, dev_name(dev)); + } +} + +void nd_device_notify(struct device *dev, enum nvdimm_event event) +{ + device_lock(dev); + if (dev->driver) { + struct nd_device_driver *nd_drv; + + nd_drv = to_nd_device_driver(dev->driver); + if (nd_drv->notify) + nd_drv->notify(dev, event); + } + device_unlock(dev); +} +EXPORT_SYMBOL(nd_device_notify); + +void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + + if (!nvdimm_bus) + return; + + /* caller is responsible for holding a reference on the device */ + nd_device_notify(&nd_region->dev, event); +} +EXPORT_SYMBOL_GPL(nvdimm_region_notify); + +struct clear_badblocks_context { + resource_size_t phys, cleared; +}; + +static int nvdimm_clear_badblocks_region(struct device *dev, void *data) +{ + struct clear_badblocks_context *ctx = data; + struct nd_region *nd_region; + resource_size_t ndr_end; + sector_t sector; + + /* make sure device is a region */ + if (!is_memory(dev)) + return 0; + + nd_region = to_nd_region(dev); + ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1; + + /* make sure we are in the region */ + if (ctx->phys < nd_region->ndr_start || + (ctx->phys + ctx->cleared - 1) > ndr_end) + return 0; + + sector = (ctx->phys - nd_region->ndr_start) / 512; + badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512); + + if (nd_region->bb_state) + sysfs_notify_dirent(nd_region->bb_state); + + return 0; +} + +static void nvdimm_clear_badblocks_regions(struct nvdimm_bus *nvdimm_bus, + phys_addr_t phys, u64 cleared) +{ + struct clear_badblocks_context ctx = { + .phys = phys, + .cleared = cleared, + }; + + device_for_each_child(&nvdimm_bus->dev, &ctx, + nvdimm_clear_badblocks_region); +} + +static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus, + phys_addr_t phys, u64 cleared) +{ + if (cleared > 0) + badrange_forget(&nvdimm_bus->badrange, phys, cleared); + + if (cleared > 0 && cleared / 512) + nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared); +} + +long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, + unsigned int len) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc; + struct nd_cmd_clear_error clear_err; + struct nd_cmd_ars_cap ars_cap; + u32 clear_err_unit, mask; + unsigned int noio_flag; + int cmd_rc, rc; + + if (!nvdimm_bus) + return -ENXIO; + + nd_desc = nvdimm_bus->nd_desc; + /* + * if ndctl does not exist, it's PMEM_LEGACY and + * we want to just pretend everything is handled. 
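+ * (hence the early 'return len' below, which reports the whole
+ * range as successfully cleared without calling into firmware).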
+ */ + if (!nd_desc->ndctl) + return len; + + memset(&ars_cap, 0, sizeof(ars_cap)); + ars_cap.address = phys; + ars_cap.length = len; + noio_flag = memalloc_noio_save(); + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, &ars_cap, + sizeof(ars_cap), &cmd_rc); + memalloc_noio_restore(noio_flag); + if (rc < 0) + return rc; + if (cmd_rc < 0) + return cmd_rc; + clear_err_unit = ars_cap.clear_err_unit; + if (!clear_err_unit || !is_power_of_2(clear_err_unit)) + return -ENXIO; + + mask = clear_err_unit - 1; + if ((phys | len) & mask) + return -ENXIO; + memset(&clear_err, 0, sizeof(clear_err)); + clear_err.address = phys; + clear_err.length = len; + noio_flag = memalloc_noio_save(); + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_CLEAR_ERROR, &clear_err, + sizeof(clear_err), &cmd_rc); + memalloc_noio_restore(noio_flag); + if (rc < 0) + return rc; + if (cmd_rc < 0) + return cmd_rc; + + nvdimm_account_cleared_poison(nvdimm_bus, phys, clear_err.cleared); + + return clear_err.cleared; +} +EXPORT_SYMBOL_GPL(nvdimm_clear_poison); + +static int nvdimm_bus_match(struct device *dev, struct device_driver *drv); + +static struct bus_type nvdimm_bus_type = { + .name = "nd", + .uevent = nvdimm_bus_uevent, + .match = nvdimm_bus_match, + .probe = nvdimm_bus_probe, + .remove = nvdimm_bus_remove, + .shutdown = nvdimm_bus_shutdown, +}; + +static void nvdimm_bus_release(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus; + + nvdimm_bus = container_of(dev, struct nvdimm_bus, dev); + ida_simple_remove(&nd_ida, nvdimm_bus->id); + kfree(nvdimm_bus); +} + +static const struct device_type nvdimm_bus_dev_type = { + .release = nvdimm_bus_release, + .groups = nvdimm_bus_attribute_groups, +}; + +bool is_nvdimm_bus(struct device *dev) +{ + return dev->type == &nvdimm_bus_dev_type; +} + +struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev) +{ + struct device *dev; + + for (dev = nd_dev; dev; dev = dev->parent) + if (is_nvdimm_bus(dev)) + break; + dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n"); + if (dev) + return to_nvdimm_bus(dev); + return NULL; +} + +struct nvdimm_bus *to_nvdimm_bus(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus; + + nvdimm_bus = container_of(dev, struct nvdimm_bus, dev); + WARN_ON(!is_nvdimm_bus(dev)); + return nvdimm_bus; +} +EXPORT_SYMBOL_GPL(to_nvdimm_bus); + +struct nvdimm_bus *nvdimm_to_bus(struct nvdimm *nvdimm) +{ + return to_nvdimm_bus(nvdimm->dev.parent); +} +EXPORT_SYMBOL_GPL(nvdimm_to_bus); + +static struct lock_class_key nvdimm_bus_key; + +struct nvdimm_bus *nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nd_desc) +{ + struct nvdimm_bus *nvdimm_bus; + int rc; + + nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL); + if (!nvdimm_bus) + return NULL; + INIT_LIST_HEAD(&nvdimm_bus->list); + INIT_LIST_HEAD(&nvdimm_bus->mapping_list); + init_waitqueue_head(&nvdimm_bus->wait); + nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); + if (nvdimm_bus->id < 0) { + kfree(nvdimm_bus); + return NULL; + } + mutex_init(&nvdimm_bus->reconfig_mutex); + badrange_init(&nvdimm_bus->badrange); + nvdimm_bus->nd_desc = nd_desc; + nvdimm_bus->dev.parent = parent; + nvdimm_bus->dev.type = &nvdimm_bus_dev_type; + nvdimm_bus->dev.groups = nd_desc->attr_groups; + nvdimm_bus->dev.bus = &nvdimm_bus_type; + nvdimm_bus->dev.of_node = nd_desc->of_node; + device_initialize(&nvdimm_bus->dev); + lockdep_set_class(&nvdimm_bus->dev.mutex, &nvdimm_bus_key); + device_set_pm_not_required(&nvdimm_bus->dev); + rc = dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id); 
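+	/* the bus device surfaces as ndbus<id>, e.g. /sys/bus/nd/devices/ndbus0 */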
+ if (rc) + goto err; + + rc = device_add(&nvdimm_bus->dev); + if (rc) { + dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc); + goto err; + } + + return nvdimm_bus; + err: + put_device(&nvdimm_bus->dev); + return NULL; +} +EXPORT_SYMBOL_GPL(nvdimm_bus_register); + +void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) +{ + if (!nvdimm_bus) + return; + device_unregister(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_bus_unregister); + +static int child_unregister(struct device *dev, void *data) +{ + /* + * the singular ndctl class device per bus needs to be + * "device_destroy"ed, so skip it here + * + * i.e. remove classless children + */ + if (dev->class) + return 0; + + if (is_nvdimm(dev)) + nvdimm_delete(to_nvdimm(dev)); + else + nd_device_unregister(dev, ND_SYNC); + + return 0; +} + +static void free_badrange_list(struct list_head *badrange_list) +{ + struct badrange_entry *bre, *next; + + list_for_each_entry_safe(bre, next, badrange_list, list) { + list_del(&bre->list); + kfree(bre); + } + list_del_init(badrange_list); +} + +static void nd_bus_remove(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + + mutex_lock(&nvdimm_bus_list_mutex); + list_del_init(&nvdimm_bus->list); + mutex_unlock(&nvdimm_bus_list_mutex); + + wait_event(nvdimm_bus->wait, + atomic_read(&nvdimm_bus->ioctl_active) == 0); + + nd_synchronize(); + device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); + + spin_lock(&nvdimm_bus->badrange.lock); + free_badrange_list(&nvdimm_bus->badrange.list); + spin_unlock(&nvdimm_bus->badrange.lock); + + nvdimm_bus_destroy_ndctl(nvdimm_bus); +} + +static int nd_bus_probe(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + int rc; + + rc = nvdimm_bus_create_ndctl(nvdimm_bus); + if (rc) + return rc; + + mutex_lock(&nvdimm_bus_list_mutex); + list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list); + mutex_unlock(&nvdimm_bus_list_mutex); + + /* enable bus provider attributes to look up their local context */ + dev_set_drvdata(dev, nvdimm_bus->nd_desc); + + return 0; +} + +static struct nd_device_driver nd_bus_driver = { + .probe = nd_bus_probe, + .remove = nd_bus_remove, + .drv = { + .name = "nd_bus", + .suppress_bind_attrs = true, + .bus = &nvdimm_bus_type, + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + }, +}; + +static int nvdimm_bus_match(struct device *dev, struct device_driver *drv) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(drv); + + if (is_nvdimm_bus(dev) && nd_drv == &nd_bus_driver) + return true; + + return !!test_bit(to_nd_device_type(dev), &nd_drv->type); +} + +static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain); + +void nd_synchronize(void) +{ + async_synchronize_full_domain(&nd_async_domain); +} +EXPORT_SYMBOL_GPL(nd_synchronize); + +static void nd_async_device_register(void *d, async_cookie_t cookie) +{ + struct device *dev = d; + + if (device_add(dev) != 0) { + dev_err(dev, "%s: failed\n", __func__); + put_device(dev); + } + put_device(dev); + if (dev->parent) + put_device(dev->parent); +} + +static void nd_async_device_unregister(void *d, async_cookie_t cookie) +{ + struct device *dev = d; + + /* flush bus operations before delete */ + nvdimm_bus_lock(dev); + nvdimm_bus_unlock(dev); + + device_unregister(dev); + put_device(dev); +} + +static void __nd_device_register(struct device *dev, bool sync) +{ + if (!dev) + return; + + /* + * Ensure that region devices always have their NUMA node set as + * early as possible. 
This way we are able to make certain that + * any memory associated with the creation and the creation + * itself of the region is associated with the correct node. + */ + if (is_nd_region(dev)) + set_dev_node(dev, to_nd_region(dev)->numa_node); + + dev->bus = &nvdimm_bus_type; + device_set_pm_not_required(dev); + if (dev->parent) { + get_device(dev->parent); + if (dev_to_node(dev) == NUMA_NO_NODE) + set_dev_node(dev, dev_to_node(dev->parent)); + } + get_device(dev); + + if (sync) + nd_async_device_register(dev, 0); + else + async_schedule_dev_domain(nd_async_device_register, dev, + &nd_async_domain); +} + +void nd_device_register(struct device *dev) +{ + __nd_device_register(dev, false); +} +EXPORT_SYMBOL(nd_device_register); + +void nd_device_register_sync(struct device *dev) +{ + __nd_device_register(dev, true); +} + +void nd_device_unregister(struct device *dev, enum nd_async_mode mode) +{ + bool killed; + + switch (mode) { + case ND_ASYNC: + /* + * In the async case this is being triggered with the + * device lock held and the unregistration work needs to + * be moved out of line iff this is thread has won the + * race to schedule the deletion. + */ + if (!kill_device(dev)) + return; + + get_device(dev); + async_schedule_domain(nd_async_device_unregister, dev, + &nd_async_domain); + break; + case ND_SYNC: + /* + * In the sync case the device is being unregistered due + * to a state change of the parent. Claim the kill state + * to synchronize against other unregistration requests, + * or otherwise let the async path handle it if the + * unregistration was already queued. + */ + device_lock(dev); + killed = kill_device(dev); + device_unlock(dev); + + if (!killed) + return; + + nd_synchronize(); + device_unregister(dev); + break; + } +} +EXPORT_SYMBOL(nd_device_unregister); + +/** + * __nd_driver_register() - register a region or a namespace driver + * @nd_drv: driver to register + * @owner: automatically set by nd_driver_register() macro + * @mod_name: automatically set by nd_driver_register() macro + */ +int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner, + const char *mod_name) +{ + struct device_driver *drv = &nd_drv->drv; + + if (!nd_drv->type) { + pr_debug("driver type bitmask not set (%ps)\n", + __builtin_return_address(0)); + return -EINVAL; + } + + if (!nd_drv->probe) { + pr_debug("%s ->probe() must be specified\n", mod_name); + return -EINVAL; + } + + drv->bus = &nvdimm_bus_type; + drv->owner = owner; + drv->mod_name = mod_name; + + return driver_register(drv); +} +EXPORT_SYMBOL(__nd_driver_register); + +void nvdimm_check_and_set_ro(struct gendisk *disk) +{ + struct device *dev = disk_to_dev(disk)->parent; + struct nd_region *nd_region = to_nd_region(dev->parent); + int disk_ro = get_disk_ro(disk); + + /* catch the disk up with the region ro state */ + if (disk_ro == nd_region->ro) + return; + + dev_info(dev, "%s read-%s, marking %s read-%s\n", + dev_name(&nd_region->dev), nd_region->ro ? "only" : "write", + disk->disk_name, nd_region->ro ? 
"only" : "write"); + set_disk_ro(disk, nd_region->ro); +} +EXPORT_SYMBOL(nvdimm_check_and_set_ro); + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, ND_DEVICE_MODALIAS_FMT "\n", + to_nd_device_type(dev)); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", dev->type->name); +} +static DEVICE_ATTR_RO(devtype); + +static struct attribute *nd_device_attributes[] = { + &dev_attr_modalias.attr, + &dev_attr_devtype.attr, + NULL, +}; + +/* + * nd_device_attribute_group - generic attributes for all devices on an nd bus + */ +const struct attribute_group nd_device_attribute_group = { + .attrs = nd_device_attributes, +}; + +static ssize_t numa_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", dev_to_node(dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static int nvdimm_dev_to_target_node(struct device *dev) +{ + struct device *parent = dev->parent; + struct nd_region *nd_region = NULL; + + if (is_nd_region(dev)) + nd_region = to_nd_region(dev); + else if (parent && is_nd_region(parent)) + nd_region = to_nd_region(parent); + + if (!nd_region) + return NUMA_NO_NODE; + return nd_region->target_node; +} + +static ssize_t target_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", nvdimm_dev_to_target_node(dev)); +} +static DEVICE_ATTR_RO(target_node); + +static struct attribute *nd_numa_attributes[] = { + &dev_attr_numa_node.attr, + &dev_attr_target_node.attr, + NULL, +}; + +static umode_t nd_numa_attr_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + + if (!IS_ENABLED(CONFIG_NUMA)) + return 0; + + if (a == &dev_attr_target_node.attr && + nvdimm_dev_to_target_node(dev) == NUMA_NO_NODE) + return 0; + + return a->mode; +} + +/* + * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus + */ +const struct attribute_group nd_numa_attribute_group = { + .attrs = nd_numa_attributes, + .is_visible = nd_numa_attr_visible, +}; + +static void ndctl_release(struct device *dev) +{ + kfree(dev); +} + +static struct lock_class_key nvdimm_ndctl_key; + +int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus) +{ + dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id); + struct device *dev; + int rc; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &nvdimm_ndctl_key); + device_set_pm_not_required(dev); + dev->class = nd_class; + dev->parent = &nvdimm_bus->dev; + dev->devt = devt; + dev->release = ndctl_release; + rc = dev_set_name(dev, "ndctl%d", nvdimm_bus->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) { + dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %d\n", + nvdimm_bus->id, rc); + goto err; + } + return 0; + +err: + put_device(dev); + return rc; +} + +void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus) +{ + device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id)); +} + +static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = { + [ND_CMD_IMPLEMENTED] = { }, + [ND_CMD_SMART] = { + .out_num = 2, + .out_sizes = { 4, 128, }, + }, + [ND_CMD_SMART_THRESHOLD] = { + .out_num = 2, + .out_sizes = { 4, 8, }, + }, + [ND_CMD_DIMM_FLAGS] = { + .out_num = 2, + .out_sizes = { 4, 4 }, + }, + [ND_CMD_GET_CONFIG_SIZE] = { + 
.out_num = 3, + .out_sizes = { 4, 4, 4, }, + }, + [ND_CMD_GET_CONFIG_DATA] = { + .in_num = 2, + .in_sizes = { 4, 4, }, + .out_num = 2, + .out_sizes = { 4, UINT_MAX, }, + }, + [ND_CMD_SET_CONFIG_DATA] = { + .in_num = 3, + .in_sizes = { 4, 4, UINT_MAX, }, + .out_num = 1, + .out_sizes = { 4, }, + }, + [ND_CMD_VENDOR] = { + .in_num = 3, + .in_sizes = { 4, 4, UINT_MAX, }, + .out_num = 3, + .out_sizes = { 4, 4, UINT_MAX, }, + }, + [ND_CMD_CALL] = { + .in_num = 2, + .in_sizes = { sizeof(struct nd_cmd_pkg), UINT_MAX, }, + .out_num = 1, + .out_sizes = { UINT_MAX, }, + }, +}; + +const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd) +{ + if (cmd < ARRAY_SIZE(__nd_cmd_dimm_descs)) + return &__nd_cmd_dimm_descs[cmd]; + return NULL; +} +EXPORT_SYMBOL_GPL(nd_cmd_dimm_desc); + +static const struct nd_cmd_desc __nd_cmd_bus_descs[] = { + [ND_CMD_IMPLEMENTED] = { }, + [ND_CMD_ARS_CAP] = { + .in_num = 2, + .in_sizes = { 8, 8, }, + .out_num = 4, + .out_sizes = { 4, 4, 4, 4, }, + }, + [ND_CMD_ARS_START] = { + .in_num = 5, + .in_sizes = { 8, 8, 2, 1, 5, }, + .out_num = 2, + .out_sizes = { 4, 4, }, + }, + [ND_CMD_ARS_STATUS] = { + .out_num = 3, + .out_sizes = { 4, 4, UINT_MAX, }, + }, + [ND_CMD_CLEAR_ERROR] = { + .in_num = 2, + .in_sizes = { 8, 8, }, + .out_num = 3, + .out_sizes = { 4, 4, 8, }, + }, + [ND_CMD_CALL] = { + .in_num = 2, + .in_sizes = { sizeof(struct nd_cmd_pkg), UINT_MAX, }, + .out_num = 1, + .out_sizes = { UINT_MAX, }, + }, +}; + +const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd) +{ + if (cmd < ARRAY_SIZE(__nd_cmd_bus_descs)) + return &__nd_cmd_bus_descs[cmd]; + return NULL; +} +EXPORT_SYMBOL_GPL(nd_cmd_bus_desc); + +u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, void *buf) +{ + if (idx >= desc->in_num) + return UINT_MAX; + + if (desc->in_sizes[idx] < UINT_MAX) + return desc->in_sizes[idx]; + + if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && idx == 2) { + struct nd_cmd_set_config_hdr *hdr = buf; + + return hdr->in_length; + } else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) { + struct nd_cmd_vendor_hdr *hdr = buf; + + return hdr->in_length; + } else if (cmd == ND_CMD_CALL) { + struct nd_cmd_pkg *pkg = buf; + + return pkg->nd_size_in; + } + + return UINT_MAX; +} +EXPORT_SYMBOL_GPL(nd_cmd_in_size); + +u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, const u32 *in_field, + const u32 *out_field, unsigned long remainder) +{ + if (idx >= desc->out_num) + return UINT_MAX; + + if (desc->out_sizes[idx] < UINT_MAX) + return desc->out_sizes[idx]; + + if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && idx == 1) + return in_field[1]; + else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) + return out_field[1]; + else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 2) { + /* + * Per table 9-276 ARS Data in ACPI 6.1, out_field[1] is + * "Size of Output Buffer in bytes, including this + * field." + */ + if (out_field[1] < 4) + return 0; + /* + * ACPI 6.1 is ambiguous if 'status' is included in the + * output size. If we encounter an output size that + * overshoots the remainder by 4 bytes, assume it was + * including 'status'. 
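+ * (ARS status and ND_CMD_CALL are the only bus commands whose
+ * output size is advertised as UINT_MAX in the descriptor table
+ * and must instead be taken from the response itself).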
+ */ + if (out_field[1] - 4 == remainder) + return remainder; + return out_field[1] - 8; + } else if (cmd == ND_CMD_CALL) { + struct nd_cmd_pkg *pkg = (struct nd_cmd_pkg *) in_field; + + return pkg->nd_size_out; + } + + + return UINT_MAX; +} +EXPORT_SYMBOL_GPL(nd_cmd_out_size); + +void wait_nvdimm_bus_probe_idle(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + do { + if (nvdimm_bus->probe_active == 0) + break; + nvdimm_bus_unlock(dev); + device_unlock(dev); + wait_event(nvdimm_bus->wait, + nvdimm_bus->probe_active == 0); + device_lock(dev); + nvdimm_bus_lock(dev); + } while (true); +} + +static int nd_pmem_forget_poison_check(struct device *dev, void *data) +{ + struct nd_cmd_clear_error *clear_err = + (struct nd_cmd_clear_error *)data; + struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL; + struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL; + struct nd_dax *nd_dax = is_nd_dax(dev) ? to_nd_dax(dev) : NULL; + struct nd_namespace_common *ndns = NULL; + struct nd_namespace_io *nsio; + resource_size_t offset = 0, end_trunc = 0, start, end, pstart, pend; + + if (nd_dax || !dev->driver) + return 0; + + start = clear_err->address; + end = clear_err->address + clear_err->cleared - 1; + + if (nd_btt || nd_pfn || nd_dax) { + if (nd_btt) + ndns = nd_btt->ndns; + else if (nd_pfn) + ndns = nd_pfn->ndns; + else if (nd_dax) + ndns = nd_dax->nd_pfn.ndns; + + if (!ndns) + return 0; + } else + ndns = to_ndns(dev); + + nsio = to_nd_namespace_io(&ndns->dev); + pstart = nsio->res.start + offset; + pend = nsio->res.end - end_trunc; + + if ((pstart >= start) && (pend <= end)) + return -EBUSY; + + return 0; + +} + +static int nd_ns_forget_poison_check(struct device *dev, void *data) +{ + return device_for_each_child(dev, data, nd_pmem_forget_poison_check); +} + +/* set_config requires an idle interleave set */ +static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus, + struct nvdimm *nvdimm, unsigned int cmd, void *data) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + /* ask the bus provider if it would like to block this request */ + if (nd_desc->clear_to_send) { + int rc = nd_desc->clear_to_send(nd_desc, nvdimm, cmd, data); + + if (rc) + return rc; + } + + /* require clear error to go through the pmem driver */ + if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR) + return device_for_each_child(&nvdimm_bus->dev, data, + nd_ns_forget_poison_check); + + if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA) + return 0; + + /* prevent label manipulation while the kernel owns label updates */ + wait_nvdimm_bus_probe_idle(&nvdimm_bus->dev); + if (atomic_read(&nvdimm->busy)) + return -EBUSY; + return 0; +} + +static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, + int read_only, unsigned int ioctl_cmd, unsigned long arg) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + const struct nd_cmd_desc *desc = NULL; + unsigned int cmd = _IOC_NR(ioctl_cmd); + struct device *dev = &nvdimm_bus->dev; + void __user *p = (void __user *) arg; + char *out_env = NULL, *in_env = NULL; + const char *cmd_name, *dimm_name; + u32 in_len = 0, out_len = 0; + unsigned int func = cmd; + unsigned long cmd_mask; + struct nd_cmd_pkg pkg; + int rc, i, cmd_rc; + void *buf = NULL; + u64 buf_len = 0; + + if (nvdimm) { + desc = nd_cmd_dimm_desc(cmd); + cmd_name = nvdimm_cmd_name(cmd); + cmd_mask = nvdimm->cmd_mask; + dimm_name = dev_name(&nvdimm->dev); + } else { + desc = nd_cmd_bus_desc(cmd); + cmd_name = nvdimm_bus_cmd_name(cmd); + 
cmd_mask = nd_desc->cmd_mask; + dimm_name = "bus"; + } + + /* Validate command family support against bus declared support */ + if (cmd == ND_CMD_CALL) { + unsigned long *mask; + + if (copy_from_user(&pkg, p, sizeof(pkg))) + return -EFAULT; + + if (nvdimm) { + if (pkg.nd_family > NVDIMM_FAMILY_MAX) + return -EINVAL; + mask = &nd_desc->dimm_family_mask; + } else { + if (pkg.nd_family > NVDIMM_BUS_FAMILY_MAX) + return -EINVAL; + mask = &nd_desc->bus_family_mask; + } + + if (!test_bit(pkg.nd_family, mask)) + return -EINVAL; + } + + if (!desc || + (desc->out_num + desc->in_num == 0) || + cmd > ND_CMD_CALL || + !test_bit(cmd, &cmd_mask)) + return -ENOTTY; + + /* fail write commands (when read-only) */ + if (read_only) + switch (cmd) { + case ND_CMD_VENDOR: + case ND_CMD_SET_CONFIG_DATA: + case ND_CMD_ARS_START: + case ND_CMD_CLEAR_ERROR: + case ND_CMD_CALL: + dev_dbg(dev, "'%s' command while read-only.\n", + nvdimm ? nvdimm_cmd_name(cmd) + : nvdimm_bus_cmd_name(cmd)); + return -EPERM; + default: + break; + } + + /* process an input envelope */ + in_env = kzalloc(ND_CMD_MAX_ENVELOPE, GFP_KERNEL); + if (!in_env) + return -ENOMEM; + for (i = 0; i < desc->in_num; i++) { + u32 in_size, copy; + + in_size = nd_cmd_in_size(nvdimm, cmd, desc, i, in_env); + if (in_size == UINT_MAX) { + dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + rc = -ENXIO; + goto out; + } + if (in_len < ND_CMD_MAX_ENVELOPE) + copy = min_t(u32, ND_CMD_MAX_ENVELOPE - in_len, in_size); + else + copy = 0; + if (copy && copy_from_user(&in_env[in_len], p + in_len, copy)) { + rc = -EFAULT; + goto out; + } + in_len += in_size; + } + + if (cmd == ND_CMD_CALL) { + func = pkg.nd_command; + dev_dbg(dev, "%s, idx: %llu, in: %u, out: %u, len %llu\n", + dimm_name, pkg.nd_command, + in_len, out_len, buf_len); + } + + /* process an output envelope */ + out_env = kzalloc(ND_CMD_MAX_ENVELOPE, GFP_KERNEL); + if (!out_env) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < desc->out_num; i++) { + u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, + (u32 *) in_env, (u32 *) out_env, 0); + u32 copy; + + if (out_size == UINT_MAX) { + dev_dbg(dev, "%s unknown output size cmd: %s field: %d\n", + dimm_name, cmd_name, i); + rc = -EFAULT; + goto out; + } + if (out_len < ND_CMD_MAX_ENVELOPE) + copy = min_t(u32, ND_CMD_MAX_ENVELOPE - out_len, out_size); + else + copy = 0; + if (copy && copy_from_user(&out_env[out_len], + p + in_len + out_len, copy)) { + rc = -EFAULT; + goto out; + } + out_len += out_size; + } + + buf_len = (u64) out_len + (u64) in_len; + if (buf_len > ND_IOCTL_MAX_BUFLEN) { + dev_dbg(dev, "%s cmd: %s buf_len: %llu > %d\n", dimm_name, + cmd_name, buf_len, ND_IOCTL_MAX_BUFLEN); + rc = -EINVAL; + goto out; + } + + buf = vmalloc(buf_len); + if (!buf) { + rc = -ENOMEM; + goto out; + } + + if (copy_from_user(buf, p, buf_len)) { + rc = -EFAULT; + goto out; + } + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, func, buf); + if (rc) + goto out_unlock; + + rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, &cmd_rc); + if (rc < 0) + goto out_unlock; + + if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) { + struct nd_cmd_clear_error *clear_err = buf; + + nvdimm_account_cleared_poison(nvdimm_bus, clear_err->address, + clear_err->cleared); + } + + if (copy_to_user(p, buf, buf_len)) + rc = -EFAULT; + +out_unlock: + nvdimm_bus_unlock(dev); + device_unlock(dev); +out: + kfree(in_env); + kfree(out_env); + vfree(buf); + return rc; +} + +enum 
nd_ioctl_mode { + BUS_IOCTL, + DIMM_IOCTL, +}; + +static int match_dimm(struct device *dev, void *data) +{ + long id = (long) data; + + if (is_nvdimm(dev)) { + struct nvdimm *nvdimm = to_nvdimm(dev); + + return nvdimm->id == id; + } + + return 0; +} + +static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg, + enum nd_ioctl_mode mode) + +{ + struct nvdimm_bus *nvdimm_bus, *found = NULL; + long id = (long) file->private_data; + struct nvdimm *nvdimm = NULL; + int rc, ro; + + ro = ((file->f_flags & O_ACCMODE) == O_RDONLY); + mutex_lock(&nvdimm_bus_list_mutex); + list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) { + if (mode == DIMM_IOCTL) { + struct device *dev; + + dev = device_find_child(&nvdimm_bus->dev, + file->private_data, match_dimm); + if (!dev) + continue; + nvdimm = to_nvdimm(dev); + found = nvdimm_bus; + } else if (nvdimm_bus->id == id) { + found = nvdimm_bus; + } + + if (found) { + atomic_inc(&nvdimm_bus->ioctl_active); + break; + } + } + mutex_unlock(&nvdimm_bus_list_mutex); + + if (!found) + return -ENXIO; + + nvdimm_bus = found; + rc = __nd_ioctl(nvdimm_bus, nvdimm, ro, cmd, arg); + + if (nvdimm) + put_device(&nvdimm->dev); + if (atomic_dec_and_test(&nvdimm_bus->ioctl_active)) + wake_up(&nvdimm_bus->wait); + + return rc; +} + +static long bus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return nd_ioctl(file, cmd, arg, BUS_IOCTL); +} + +static long dimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return nd_ioctl(file, cmd, arg, DIMM_IOCTL); +} + +static int nd_open(struct inode *inode, struct file *file) +{ + long minor = iminor(inode); + + file->private_data = (void *) minor; + return 0; +} + +static const struct file_operations nvdimm_bus_fops = { + .owner = THIS_MODULE, + .open = nd_open, + .unlocked_ioctl = bus_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +static const struct file_operations nvdimm_fops = { + .owner = THIS_MODULE, + .open = nd_open, + .unlocked_ioctl = dimm_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +int __init nvdimm_bus_init(void) +{ + int rc; + + rc = bus_register(&nvdimm_bus_type); + if (rc) + return rc; + + rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops); + if (rc < 0) + goto err_bus_chrdev; + nvdimm_bus_major = rc; + + rc = register_chrdev(0, "dimmctl", &nvdimm_fops); + if (rc < 0) + goto err_dimm_chrdev; + nvdimm_major = rc; + + nd_class = class_create(THIS_MODULE, "nd"); + if (IS_ERR(nd_class)) { + rc = PTR_ERR(nd_class); + goto err_class; + } + + rc = driver_register(&nd_bus_driver.drv); + if (rc) + goto err_nd_bus; + + return 0; + + err_nd_bus: + class_destroy(nd_class); + err_class: + unregister_chrdev(nvdimm_major, "dimmctl"); + err_dimm_chrdev: + unregister_chrdev(nvdimm_bus_major, "ndctl"); + err_bus_chrdev: + bus_unregister(&nvdimm_bus_type); + + return rc; +} + +void nvdimm_bus_exit(void) +{ + driver_unregister(&nd_bus_driver.drv); + class_destroy(nd_class); + unregister_chrdev(nvdimm_bus_major, "ndctl"); + unregister_chrdev(nvdimm_major, "dimmctl"); + bus_unregister(&nvdimm_bus_type); + ida_destroy(&nd_ida); +} diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c new file mode 100644 index 000000000..030dbde6b --- /dev/null +++ b/drivers/nvdimm/claim.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/badblocks.h> +#include "nd-core.h" +#include "pmem.h" +#include "pfn.h" +#include "btt.h" +#include "nd.h" + +void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns) +{ + struct nd_namespace_common *ndns = *_ndns; + struct nvdimm_bus *nvdimm_bus; + + if (!ndns) + return; + + nvdimm_bus = walk_to_nvdimm_bus(&ndns->dev); + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + dev_WARN_ONCE(dev, ndns->claim != dev, "%s: invalid claim\n", __func__); + ndns->claim = NULL; + *_ndns = NULL; + put_device(&ndns->dev); +} + +void nd_detach_ndns(struct device *dev, + struct nd_namespace_common **_ndns) +{ + struct nd_namespace_common *ndns = *_ndns; + + if (!ndns) + return; + get_device(&ndns->dev); + nvdimm_bus_lock(&ndns->dev); + __nd_detach_ndns(dev, _ndns); + nvdimm_bus_unlock(&ndns->dev); + put_device(&ndns->dev); +} + +bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&attach->dev); + + if (attach->claim) + return false; + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + dev_WARN_ONCE(dev, *_ndns, "%s: invalid claim\n", __func__); + attach->claim = dev; + *_ndns = attach; + get_device(&attach->dev); + return true; +} + +bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns) +{ + bool claimed; + + nvdimm_bus_lock(&attach->dev); + claimed = __nd_attach_ndns(dev, attach, _ndns); + nvdimm_bus_unlock(&attach->dev); + return claimed; +} + +static int namespace_match(struct device *dev, void *data) +{ + char *name = data; + + return strcmp(name, dev_name(dev)) == 0; +} + +static bool is_idle(struct device *dev, struct nd_namespace_common *ndns) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct device *seed = NULL; + + if (is_nd_btt(dev)) + seed = nd_region->btt_seed; + else if (is_nd_pfn(dev)) + seed = nd_region->pfn_seed; + else if (is_nd_dax(dev)) + seed = nd_region->dax_seed; + + if (seed == dev || ndns || dev->driver) + return false; + return true; +} + +struct nd_pfn *to_nd_pfn_safe(struct device *dev) +{ + /* + * pfn device attributes are re-used by dax device instances, so we + * need to be careful to correct device-to-nd_pfn conversion. 
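+ * (a dax device embeds its nd_pfn inside struct nd_dax, so the
+ * conversion below goes through to_nd_dax() first).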
+ */ + if (is_nd_pfn(dev)) + return to_nd_pfn(dev); + + if (is_nd_dax(dev)) { + struct nd_dax *nd_dax = to_nd_dax(dev); + + return &nd_dax->nd_pfn; + } + + WARN_ON(1); + return NULL; +} + +static void nd_detach_and_reset(struct device *dev, + struct nd_namespace_common **_ndns) +{ + /* detach the namespace and destroy / reset the device */ + __nd_detach_ndns(dev, _ndns); + if (is_idle(dev, *_ndns)) { + nd_device_unregister(dev, ND_ASYNC); + } else if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + nd_btt->lbasize = 0; + kfree(nd_btt->uuid); + nd_btt->uuid = NULL; + } else if (is_nd_pfn(dev) || is_nd_dax(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + + kfree(nd_pfn->uuid); + nd_pfn->uuid = NULL; + nd_pfn->mode = PFN_MODE_NONE; + } +} + +ssize_t nd_namespace_store(struct device *dev, + struct nd_namespace_common **_ndns, const char *buf, + size_t len) +{ + struct nd_namespace_common *ndns; + struct device *found; + char *name; + + if (dev->driver) { + dev_dbg(dev, "namespace already active\n"); + return -EBUSY; + } + + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; + strim(name); + + if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0) + /* pass */; + else { + len = -EINVAL; + goto out; + } + + ndns = *_ndns; + if (strcmp(name, "") == 0) { + nd_detach_and_reset(dev, _ndns); + goto out; + } else if (ndns) { + dev_dbg(dev, "namespace already set to: %s\n", + dev_name(&ndns->dev)); + len = -EBUSY; + goto out; + } + + found = device_find_child(dev->parent, name, namespace_match); + if (!found) { + dev_dbg(dev, "'%s' not found under %s\n", name, + dev_name(dev->parent)); + len = -ENODEV; + goto out; + } + + ndns = to_ndns(found); + + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + break; + case NVDIMM_CCLASS_BTT: + case NVDIMM_CCLASS_BTT2: + if (!is_nd_btt(dev)) { + len = -EBUSY; + goto out_attach; + } + break; + case NVDIMM_CCLASS_PFN: + if (!is_nd_pfn(dev)) { + len = -EBUSY; + goto out_attach; + } + break; + case NVDIMM_CCLASS_DAX: + if (!is_nd_dax(dev)) { + len = -EBUSY; + goto out_attach; + } + break; + default: + len = -EBUSY; + goto out_attach; + break; + } + + if (__nvdimm_namespace_capacity(ndns) < SZ_16M) { + dev_dbg(dev, "%s too small to host\n", name); + len = -ENXIO; + goto out_attach; + } + + WARN_ON_ONCE(!is_nvdimm_bus_locked(dev)); + if (!__nd_attach_ndns(dev, ndns, _ndns)) { + dev_dbg(dev, "%s already claimed\n", + dev_name(&ndns->dev)); + len = -EBUSY; + } + + out_attach: + put_device(&ndns->dev); /* from device_find_child */ + out: + kfree(name); + return len; +} + +/* + * nd_sb_checksum: compute checksum for a generic info block + * + * Returns a fletcher64 checksum of everything in the given info block + * except the last field (since that's where the checksum lives). 
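+ *
+ * Typical caller pattern (as in nd_btt_arena_is_valid()): zero the
+ * checksum field, recompute, compare, then restore:
+ *
+ *	checksum = le64_to_cpu(super->checksum);
+ *	super->checksum = 0;
+ *	if (checksum != nd_sb_checksum((struct nd_gen_sb *) super))
+ *		return false;
+ *	super->checksum = cpu_to_le64(checksum);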
+ */ +u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb) +{ + u64 sum; + __le64 sum_save; + + BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K); + BUILD_BUG_ON(sizeof(struct nd_pfn_sb) != SZ_4K); + BUILD_BUG_ON(sizeof(struct nd_gen_sb) != SZ_4K); + + sum_save = nd_gen_sb->checksum; + nd_gen_sb->checksum = 0; + sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1); + nd_gen_sb->checksum = sum_save; + return sum; +} +EXPORT_SYMBOL(nd_sb_checksum); + +static int nsio_rw_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size, int rw, + unsigned long flags) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); + sector_t sector = offset >> 9; + int rc = 0, ret = 0; + + if (unlikely(!size)) + return 0; + + if (unlikely(offset + size > nsio->size)) { + dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); + return -EFAULT; + } + + if (rw == READ) { + if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) + return -EIO; + if (copy_mc_to_kernel(buf, nsio->addr + offset, size) != 0) + return -EIO; + return 0; + } + + if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) { + if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512) + && !(flags & NVDIMM_IO_ATOMIC)) { + long cleared; + + might_sleep(); + cleared = nvdimm_clear_poison(&ndns->dev, + nsio->res.start + offset, size); + if (cleared < size) + rc = -EIO; + if (cleared > 0 && cleared / 512) { + cleared /= 512; + badblocks_clear(&nsio->bb, sector, cleared); + } + arch_invalidate_pmem(nsio->addr + offset, size); + } else + rc = -EIO; + } + + memcpy_flushcache(nsio->addr + offset, buf, size); + ret = nvdimm_flush(to_nd_region(ndns->dev.parent), NULL); + if (ret) + rc = ret; + + return rc; +} + +int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio, + resource_size_t size) +{ + struct nd_namespace_common *ndns = &nsio->common; + struct range range = { + .start = nsio->res.start, + .end = nsio->res.end, + }; + + nsio->size = size; + if (!devm_request_mem_region(dev, range.start, size, + dev_name(&ndns->dev))) { + dev_warn(dev, "could not reserve region %pR\n", &nsio->res); + return -EBUSY; + } + + ndns->rw_bytes = nsio_rw_bytes; + if (devm_init_badblocks(dev, &nsio->bb)) + return -ENOMEM; + nvdimm_badblocks_populate(to_nd_region(ndns->dev.parent), &nsio->bb, + &range); + + nsio->addr = devm_memremap(dev, range.start, size, ARCH_MEMREMAP_PMEM); + + return PTR_ERR_OR_ZERO(nsio->addr); +} + +void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio) +{ + struct resource *res = &nsio->res; + + devm_memunmap(dev, nsio->addr); + devm_exit_badblocks(dev, &nsio->bb); + devm_release_mem_region(dev, res->start, nsio->size); +} diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c new file mode 100644 index 000000000..d91799b71 --- /dev/null +++ b/drivers/nvdimm/core.c @@ -0,0 +1,576 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#include <linux/libnvdimm.h> +#include <linux/suspend.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/blk-integrity.h> +#include <linux/device.h> +#include <linux/ctype.h> +#include <linux/ndctl.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/io.h> +#include "nd-core.h" +#include "nd.h" + +LIST_HEAD(nvdimm_bus_list); +DEFINE_MUTEX(nvdimm_bus_list_mutex); + +void nvdimm_bus_lock(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return; + mutex_lock(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(nvdimm_bus_lock); + +void nvdimm_bus_unlock(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return; + mutex_unlock(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(nvdimm_bus_unlock); + +bool is_nvdimm_bus_locked(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + return mutex_is_locked(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(is_nvdimm_bus_locked); + +struct nvdimm_map { + struct nvdimm_bus *nvdimm_bus; + struct list_head list; + resource_size_t offset; + unsigned long flags; + size_t size; + union { + void *mem; + void __iomem *iomem; + }; + struct kref kref; +}; + +static struct nvdimm_map *find_nvdimm_map(struct device *dev, + resource_size_t offset) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nvdimm_map *nvdimm_map; + + list_for_each_entry(nvdimm_map, &nvdimm_bus->mapping_list, list) + if (nvdimm_map->offset == offset) + return nvdimm_map; + return NULL; +} + +static struct nvdimm_map *alloc_nvdimm_map(struct device *dev, + resource_size_t offset, size_t size, unsigned long flags) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nvdimm_map *nvdimm_map; + + nvdimm_map = kzalloc(sizeof(*nvdimm_map), GFP_KERNEL); + if (!nvdimm_map) + return NULL; + + INIT_LIST_HEAD(&nvdimm_map->list); + nvdimm_map->nvdimm_bus = nvdimm_bus; + nvdimm_map->offset = offset; + nvdimm_map->flags = flags; + nvdimm_map->size = size; + kref_init(&nvdimm_map->kref); + + if (!request_mem_region(offset, size, dev_name(&nvdimm_bus->dev))) { + dev_err(&nvdimm_bus->dev, "failed to request %pa + %zd for %s\n", + &offset, size, dev_name(dev)); + goto err_request_region; + } + + if (flags) + nvdimm_map->mem = memremap(offset, size, flags); + else + nvdimm_map->iomem = ioremap(offset, size); + + if (!nvdimm_map->mem) + goto err_map; + + dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev), "%s: bus unlocked!", + __func__); + list_add(&nvdimm_map->list, &nvdimm_bus->mapping_list); + + return nvdimm_map; + + err_map: + release_mem_region(offset, size); + err_request_region: + kfree(nvdimm_map); + return NULL; +} + +static void nvdimm_map_release(struct kref *kref) +{ + struct nvdimm_bus *nvdimm_bus; + struct nvdimm_map *nvdimm_map; + + nvdimm_map = container_of(kref, struct nvdimm_map, kref); + nvdimm_bus = nvdimm_map->nvdimm_bus; + + dev_dbg(&nvdimm_bus->dev, "%pa\n", &nvdimm_map->offset); + list_del(&nvdimm_map->list); + if (nvdimm_map->flags) + memunmap(nvdimm_map->mem); + else + iounmap(nvdimm_map->iomem); + release_mem_region(nvdimm_map->offset, nvdimm_map->size); + kfree(nvdimm_map); +} + +static void nvdimm_map_put(void *data) +{ + struct nvdimm_map *nvdimm_map = data; + struct nvdimm_bus *nvdimm_bus = nvdimm_map->nvdimm_bus; + + nvdimm_bus_lock(&nvdimm_bus->dev); + kref_put(&nvdimm_map->kref, nvdimm_map_release); + 
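+	/*
+	 * A final put runs nvdimm_map_release() synchronously, still under
+	 * the bus lock, so a concurrent devm_nvdimm_memremap() cannot look
+	 * up a mapping that is mid-teardown.
+	 */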
nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +/** + * devm_nvdimm_memremap - map a resource that is shared across regions + * @dev: device that will own a reference to the shared mapping + * @offset: physical base address of the mapping + * @size: mapping size + * @flags: memremap flags, or, if zero, perform an ioremap instead + */ +void *devm_nvdimm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + struct nvdimm_map *nvdimm_map; + + nvdimm_bus_lock(dev); + nvdimm_map = find_nvdimm_map(dev, offset); + if (!nvdimm_map) + nvdimm_map = alloc_nvdimm_map(dev, offset, size, flags); + else + kref_get(&nvdimm_map->kref); + nvdimm_bus_unlock(dev); + + if (!nvdimm_map) + return NULL; + + if (devm_add_action_or_reset(dev, nvdimm_map_put, nvdimm_map)) + return NULL; + + return nvdimm_map->mem; +} +EXPORT_SYMBOL_GPL(devm_nvdimm_memremap); + +u64 nd_fletcher64(void *addr, size_t len, bool le) +{ + u32 *buf = addr; + u32 lo32 = 0; + u64 hi32 = 0; + int i; + + for (i = 0; i < len / sizeof(u32); i++) { + lo32 += le ? le32_to_cpu((__le32) buf[i]) : buf[i]; + hi32 += lo32; + } + + return hi32 << 32 | lo32; +} +EXPORT_SYMBOL_GPL(nd_fletcher64); + +struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus) +{ + /* struct nvdimm_bus definition is private to libnvdimm */ + return nvdimm_bus->nd_desc; +} +EXPORT_SYMBOL_GPL(to_nd_desc); + +struct device *to_nvdimm_bus_dev(struct nvdimm_bus *nvdimm_bus) +{ + /* struct nvdimm_bus definition is private to libnvdimm */ + return &nvdimm_bus->dev; +} +EXPORT_SYMBOL_GPL(to_nvdimm_bus_dev); + +/** + * nd_uuid_store: common implementation for writing 'uuid' sysfs attributes + * @dev: container device for the uuid property + * @uuid_out: uuid buffer to replace + * @buf: raw sysfs buffer to parse + * + * Enforce that uuids can only be changed while the device is disabled + * (driver detached) + * LOCKING: expects device_lock() is held on entry + */ +int nd_uuid_store(struct device *dev, uuid_t **uuid_out, const char *buf, + size_t len) +{ + uuid_t uuid; + int rc; + + if (dev->driver) + return -EBUSY; + + rc = uuid_parse(buf, &uuid); + if (rc) + return rc; + + kfree(*uuid_out); + *uuid_out = kmemdup(&uuid, sizeof(uuid), GFP_KERNEL); + if (!(*uuid_out)) + return -ENOMEM; + + return 0; +} + +ssize_t nd_size_select_show(unsigned long current_size, + const unsigned long *supported, char *buf) +{ + ssize_t len = 0; + int i; + + for (i = 0; supported[i]; i++) + if (current_size == supported[i]) + len += sprintf(buf + len, "[%ld] ", supported[i]); + else + len += sprintf(buf + len, "%ld ", supported[i]); + len += sprintf(buf + len, "\n"); + return len; +} + +ssize_t nd_size_select_store(struct device *dev, const char *buf, + unsigned long *current_size, const unsigned long *supported) +{ + unsigned long lbasize; + int rc, i; + + if (dev->driver) + return -EBUSY; + + rc = kstrtoul(buf, 0, &lbasize); + if (rc) + return rc; + + for (i = 0; supported[i]; i++) + if (lbasize == supported[i]) + break; + + if (supported[i]) { + *current_size = lbasize; + return 0; + } else { + return -EINVAL; + } +} + +static ssize_t commands_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int cmd, len = 0; + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + for_each_set_bit(cmd, &nd_desc->cmd_mask, BITS_PER_LONG) + len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd)); + len += sprintf(buf + len, "\n"); + return len; +} +static DEVICE_ATTR_RO(commands); + 
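+/*
+ * Illustration (not part of this driver): the bus attributes defined in
+ * this file appear under /sys/bus/nd/devices/ndbusN.  A minimal
+ * user-space reader, assuming a first bus named "ndbus0", could be:
+ *
+ *	#include <stdio.h>
+ *
+ *	int main(void)
+ *	{
+ *		FILE *f = fopen("/sys/bus/nd/devices/ndbus0/commands", "r");
+ *		char line[128];
+ *
+ *		if (f && fgets(line, sizeof(line), f))
+ *			printf("bus commands: %s", line);
+ *		if (f)
+ *			fclose(f);
+ *		return 0;
+ *	}
+ */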
+static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + struct device *parent = nvdimm_bus->dev.parent; + + if (nd_desc->provider_name) + return nd_desc->provider_name; + else if (parent) + return dev_name(parent); + else + return "unknown"; +} + +static ssize_t provider_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + + return sprintf(buf, "%s\n", nvdimm_bus_provider(nvdimm_bus)); +} +static DEVICE_ATTR_RO(provider); + +static int flush_namespaces(struct device *dev, void *data) +{ + device_lock(dev); + device_unlock(dev); + return 0; +} + +static int flush_regions_dimms(struct device *dev, void *data) +{ + device_lock(dev); + device_unlock(dev); + device_for_each_child(dev, NULL, flush_namespaces); + return 0; +} + +static ssize_t wait_probe_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + int rc; + + if (nd_desc->flush_probe) { + rc = nd_desc->flush_probe(nd_desc); + if (rc) + return rc; + } + nd_synchronize(); + device_for_each_child(dev, NULL, flush_regions_dimms); + return sprintf(buf, "1\n"); +} +static DEVICE_ATTR_RO(wait_probe); + +static struct attribute *nvdimm_bus_attributes[] = { + &dev_attr_commands.attr, + &dev_attr_wait_probe.attr, + &dev_attr_provider.attr, + NULL, +}; + +static const struct attribute_group nvdimm_bus_attribute_group = { + .attrs = nvdimm_bus_attributes, +}; + +static ssize_t capability_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_capability cap; + + if (!nd_desc->fw_ops) + return -EOPNOTSUPP; + + cap = nd_desc->fw_ops->capability(nd_desc); + + switch (cap) { + case NVDIMM_FWA_CAP_QUIESCE: + return sprintf(buf, "quiesce\n"); + case NVDIMM_FWA_CAP_LIVE: + return sprintf(buf, "live\n"); + default: + return -EOPNOTSUPP; + } +} + +static DEVICE_ATTR_RO(capability); + +static ssize_t activate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_capability cap; + enum nvdimm_fwa_state state; + + if (!nd_desc->fw_ops) + return -EOPNOTSUPP; + + cap = nd_desc->fw_ops->capability(nd_desc); + state = nd_desc->fw_ops->activate_state(nd_desc); + + if (cap < NVDIMM_FWA_CAP_QUIESCE) + return -EOPNOTSUPP; + + switch (state) { + case NVDIMM_FWA_IDLE: + return sprintf(buf, "idle\n"); + case NVDIMM_FWA_BUSY: + return sprintf(buf, "busy\n"); + case NVDIMM_FWA_ARMED: + return sprintf(buf, "armed\n"); + case NVDIMM_FWA_ARM_OVERFLOW: + return sprintf(buf, "overflow\n"); + default: + return -ENXIO; + } +} + +static int exec_firmware_activate(void *data) +{ + struct nvdimm_bus_descriptor *nd_desc = data; + + return nd_desc->fw_ops->activate(nd_desc); +} + +static ssize_t activate_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_state state; + bool quiesce; + ssize_t rc; + + if (!nd_desc->fw_ops) + return -EOPNOTSUPP; + + if (sysfs_streq(buf, "live")) + quiesce = false; + else if (sysfs_streq(buf, 
"quiesce")) + quiesce = true; + else + return -EINVAL; + + state = nd_desc->fw_ops->activate_state(nd_desc); + + switch (state) { + case NVDIMM_FWA_BUSY: + rc = -EBUSY; + break; + case NVDIMM_FWA_ARMED: + case NVDIMM_FWA_ARM_OVERFLOW: + if (quiesce) + rc = hibernate_quiet_exec(exec_firmware_activate, nd_desc); + else + rc = nd_desc->fw_ops->activate(nd_desc); + break; + case NVDIMM_FWA_IDLE: + default: + rc = -ENXIO; + } + + if (rc == 0) + rc = len; + return rc; +} + +static DEVICE_ATTR_ADMIN_RW(activate); + +static umode_t nvdimm_bus_firmware_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + enum nvdimm_fwa_capability cap; + + /* + * Both 'activate' and 'capability' disappear when no ops + * detected, or a negative capability is indicated. + */ + if (!nd_desc->fw_ops) + return 0; + + cap = nd_desc->fw_ops->capability(nd_desc); + if (cap < NVDIMM_FWA_CAP_QUIESCE) + return 0; + + return a->mode; +} +static struct attribute *nvdimm_bus_firmware_attributes[] = { + &dev_attr_activate.attr, + &dev_attr_capability.attr, + NULL, +}; + +static const struct attribute_group nvdimm_bus_firmware_attribute_group = { + .name = "firmware", + .attrs = nvdimm_bus_firmware_attributes, + .is_visible = nvdimm_bus_firmware_visible, +}; + +const struct attribute_group *nvdimm_bus_attribute_groups[] = { + &nvdimm_bus_attribute_group, + &nvdimm_bus_firmware_attribute_group, + NULL, +}; + +int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +{ + return badrange_add(&nvdimm_bus->badrange, addr, length); +} +EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) +{ + struct blk_integrity bi; + + if (meta_size == 0) + return 0; + + memset(&bi, 0, sizeof(bi)); + + bi.tuple_size = meta_size; + bi.tag_size = meta_size; + + blk_integrity_register(disk, &bi); + blk_queue_max_integrity_segments(disk->queue, 1); + + return 0; +} +EXPORT_SYMBOL(nd_integrity_init); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) +{ + return 0; +} +EXPORT_SYMBOL(nd_integrity_init); + +#endif + +static __init int libnvdimm_init(void) +{ + int rc; + + rc = nvdimm_bus_init(); + if (rc) + return rc; + rc = nvdimm_init(); + if (rc) + goto err_dimm; + rc = nd_region_init(); + if (rc) + goto err_region; + + nd_label_init(); + + return 0; + err_region: + nvdimm_exit(); + err_dimm: + nvdimm_bus_exit(); + return rc; +} + +static __exit void libnvdimm_exit(void) +{ + WARN_ON(!list_empty(&nvdimm_bus_list)); + nd_region_exit(); + nvdimm_exit(); + nvdimm_bus_exit(); + nvdimm_devs_exit(); +} + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +subsys_initcall(libnvdimm_init); +module_exit(libnvdimm_exit); diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c new file mode 100644 index 000000000..7f4a9d28b --- /dev/null +++ b/drivers/nvdimm/dax_devs.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2016 Intel Corporation. All rights reserved. 
+ */ +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "pfn.h" +#include "nd.h" + +static void nd_dax_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_dax *nd_dax = to_nd_dax(dev); + struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; + + dev_dbg(dev, "trace\n"); + nd_detach_ndns(dev, &nd_pfn->ndns); + ida_simple_remove(&nd_region->dax_ida, nd_pfn->id); + kfree(nd_pfn->uuid); + kfree(nd_dax); +} + +struct nd_dax *to_nd_dax(struct device *dev) +{ + struct nd_dax *nd_dax = container_of(dev, struct nd_dax, nd_pfn.dev); + + WARN_ON(!is_nd_dax(dev)); + return nd_dax; +} +EXPORT_SYMBOL(to_nd_dax); + +static const struct device_type nd_dax_device_type = { + .name = "nd_dax", + .release = nd_dax_release, + .groups = nd_pfn_attribute_groups, +}; + +bool is_nd_dax(struct device *dev) +{ + return dev ? dev->type == &nd_dax_device_type : false; +} +EXPORT_SYMBOL(is_nd_dax); + +static struct nd_dax *nd_dax_alloc(struct nd_region *nd_region) +{ + struct nd_pfn *nd_pfn; + struct nd_dax *nd_dax; + struct device *dev; + + nd_dax = kzalloc(sizeof(*nd_dax), GFP_KERNEL); + if (!nd_dax) + return NULL; + + nd_pfn = &nd_dax->nd_pfn; + nd_pfn->id = ida_simple_get(&nd_region->dax_ida, 0, 0, GFP_KERNEL); + if (nd_pfn->id < 0) { + kfree(nd_dax); + return NULL; + } + + dev = &nd_pfn->dev; + dev_set_name(dev, "dax%d.%d", nd_region->id, nd_pfn->id); + dev->type = &nd_dax_device_type; + dev->parent = &nd_region->dev; + + return nd_dax; +} + +struct device *nd_dax_create(struct nd_region *nd_region) +{ + struct device *dev = NULL; + struct nd_dax *nd_dax; + + if (!is_memory(&nd_region->dev)) + return NULL; + + nd_dax = nd_dax_alloc(nd_region); + if (nd_dax) + dev = nd_pfn_devinit(&nd_dax->nd_pfn, NULL); + nd_device_register(dev); + return dev; +} + +int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns) +{ + int rc; + struct nd_dax *nd_dax; + struct device *dax_dev; + struct nd_pfn *nd_pfn; + struct nd_pfn_sb *pfn_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + case NVDIMM_CCLASS_DAX: + break; + default: + return -ENODEV; + } + + nvdimm_bus_lock(&ndns->dev); + nd_dax = nd_dax_alloc(nd_region); + nd_pfn = &nd_dax->nd_pfn; + dax_dev = nd_pfn_devinit(nd_pfn, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!dax_dev) + return -ENOMEM; + pfn_sb = devm_kmalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); + nd_pfn->pfn_sb = pfn_sb; + rc = nd_pfn_validate(nd_pfn, DAX_SIG); + dev_dbg(dev, "dax: %s\n", rc == 0 ? dev_name(dax_dev) : "<none>"); + if (rc < 0) { + nd_detach_ndns(dax_dev, &nd_pfn->ndns); + put_device(dax_dev); + } else + nd_device_register(dax_dev); + + return rc; +} +EXPORT_SYMBOL(nd_dax_probe); diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c new file mode 100644 index 000000000..91d9163ee --- /dev/null +++ b/drivers/nvdimm/dimm.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/nd.h> +#include "label.h" +#include "nd.h" + +static int nvdimm_probe(struct device *dev) +{ + struct nvdimm_drvdata *ndd; + int rc; + + rc = nvdimm_security_setup_events(dev); + if (rc < 0) { + dev_err(dev, "security event setup failed: %d\n", rc); + return rc; + } + + rc = nvdimm_check_config_data(dev); + if (rc) { + /* not required for non-aliased nvdimm, ex. NVDIMM-N */ + if (rc == -ENOTTY) + rc = 0; + return rc; + } + + /* + * The locked status bit reflects explicit status codes from the + * label reading commands, revalidate it each time the driver is + * activated and re-reads the label area. + */ + nvdimm_clear_locked(dev); + + ndd = kzalloc(sizeof(*ndd), GFP_KERNEL); + if (!ndd) + return -ENOMEM; + + dev_set_drvdata(dev, ndd); + ndd->dpa.name = dev_name(dev); + ndd->ns_current = -1; + ndd->ns_next = -1; + ndd->dpa.start = 0; + ndd->dpa.end = -1; + ndd->dev = dev; + get_device(dev); + kref_init(&ndd->kref); + + /* + * Attempt to unlock, if the DIMM supports security commands, + * otherwise the locked indication is determined by explicit + * status codes from the label reading commands. + */ + rc = nvdimm_security_unlock(dev); + if (rc < 0) + dev_dbg(dev, "failed to unlock dimm: %d\n", rc); + + + /* + * EACCES failures reading the namespace label-area-properties + * are interpreted as the DIMM capacity being locked but the + * namespace labels themselves being accessible. + */ + rc = nvdimm_init_nsarea(ndd); + if (rc == -EACCES) { + /* + * See nvdimm_namespace_common_probe() where we fail to + * allow namespaces to probe while the DIMM is locked, + * but we do allow for namespace enumeration. + */ + nvdimm_set_locked(dev); + rc = 0; + } + if (rc) + goto err; + + /* + * EACCES failures reading the namespace label-data are + * interpreted as the label area being locked in addition to the + * DIMM capacity. We fail the dimm probe to prevent regions from + * attempting to parse the label area. + */ + rc = nd_label_data_init(ndd); + if (rc == -EACCES) + nvdimm_set_locked(dev); + if (rc) + goto err; + + dev_dbg(dev, "config data size: %d\n", ndd->nsarea.config_size); + + nvdimm_bus_lock(dev); + if (ndd->ns_current >= 0) { + rc = nd_label_reserve_dpa(ndd); + if (rc == 0) + nvdimm_set_labeling(dev); + } + nvdimm_bus_unlock(dev); + + if (rc) + goto err; + + return 0; + + err: + put_ndd(ndd); + return rc; +} + +static void nvdimm_remove(struct device *dev) +{ + struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + + nvdimm_bus_lock(dev); + dev_set_drvdata(dev, NULL); + nvdimm_bus_unlock(dev); + put_ndd(ndd); +} + +static struct nd_device_driver nvdimm_driver = { + .probe = nvdimm_probe, + .remove = nvdimm_remove, + .drv = { + .name = "nvdimm", + }, + .type = ND_DRIVER_DIMM, +}; + +int __init nvdimm_init(void) +{ + return nd_driver_register(&nvdimm_driver); +} + +void nvdimm_exit(void) +{ + driver_unregister(&nvdimm_driver.drv); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DIMM); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c new file mode 100644 index 000000000..1634e3c34 --- /dev/null +++ b/drivers/nvdimm/dimm_devs.c @@ -0,0 +1,874 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/moduleparam.h> +#include <linux/vmalloc.h> +#include <linux/device.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "label.h" +#include "pmem.h" +#include "nd.h" + +static DEFINE_IDA(dimm_ida); + +/* + * Retrieve bus and dimm handle and return if this bus supports + * get_config_data commands + */ +int nvdimm_check_config_data(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + if (!nvdimm->cmd_mask || + !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) { + if (test_bit(NDD_LABELING, &nvdimm->flags)) + return -ENXIO; + else + return -ENOTTY; + } + + return 0; +} + +static int validate_dimm(struct nvdimm_drvdata *ndd) +{ + int rc; + + if (!ndd) + return -EINVAL; + + rc = nvdimm_check_config_data(ndd->dev); + if (rc) + dev_dbg(ndd->dev, "%ps: %s error: %d\n", + __builtin_return_address(0), __func__, rc); + return rc; +} + +/** + * nvdimm_init_nsarea - determine the geometry of a dimm's namespace area + * @nvdimm: dimm to initialize + */ +int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd) +{ + struct nd_cmd_get_config_size *cmd = &ndd->nsarea; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc; + int rc = validate_dimm(ndd); + int cmd_rc = 0; + + if (rc) + return rc; + + if (cmd->config_size) + return 0; /* already valid */ + + memset(cmd, 0, sizeof(*cmd)); + nd_desc = nvdimm_bus->nd_desc; + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), &cmd_rc); + if (rc < 0) + return rc; + return cmd_rc; +} + +int nvdimm_get_config_data(struct nvdimm_drvdata *ndd, void *buf, + size_t offset, size_t len) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + int rc = validate_dimm(ndd), cmd_rc = 0; + struct nd_cmd_get_config_data_hdr *cmd; + size_t max_cmd_size, buf_offset; + + if (rc) + return rc; + + if (offset + len > ndd->nsarea.config_size) + return -ENXIO; + + max_cmd_size = min_t(u32, len, ndd->nsarea.max_xfer); + cmd = kvzalloc(max_cmd_size + sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + for (buf_offset = 0; len; + len -= cmd->in_length, buf_offset += cmd->in_length) { + size_t cmd_size; + + cmd->in_offset = offset + buf_offset; + cmd->in_length = min(max_cmd_size, len); + + cmd_size = sizeof(*cmd) + cmd->in_length; + + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_DATA, cmd, cmd_size, &cmd_rc); + if (rc < 0) + break; + if (cmd_rc < 0) { + rc = cmd_rc; + break; + } + + /* out_buf should be valid, copy it into our output buffer */ + memcpy(buf + buf_offset, cmd->out_buf, cmd->in_length); + } + kvfree(cmd); + + return rc; +} + +int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, + void *buf, size_t len) +{ + size_t max_cmd_size, buf_offset; + struct nd_cmd_set_config_hdr *cmd; + int rc = validate_dimm(ndd), cmd_rc = 0; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + if (rc) + return rc; + + if (offset + len > ndd->nsarea.config_size) + return -ENXIO; + + max_cmd_size = min_t(u32, len, ndd->nsarea.max_xfer); + cmd = kvzalloc(max_cmd_size + sizeof(*cmd) + sizeof(u32), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + for (buf_offset = 0; len; len -= cmd->in_length, + buf_offset += cmd->in_length) { + 
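+		/*
+		 * Chunk the write by the DIMM's max_xfer; the final
+		 * iteration sends whatever remainder of 'len' is left.
+		 */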
size_t cmd_size; + + cmd->in_offset = offset + buf_offset; + cmd->in_length = min(max_cmd_size, len); + memcpy(cmd->in_buf, buf + buf_offset, cmd->in_length); + + /* status is output in the last 4-bytes of the command buffer */ + cmd_size = sizeof(*cmd) + cmd->in_length + sizeof(u32); + + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_SET_CONFIG_DATA, cmd, cmd_size, &cmd_rc); + if (rc < 0) + break; + if (cmd_rc < 0) { + rc = cmd_rc; + break; + } + } + kvfree(cmd); + + return rc; +} + +void nvdimm_set_labeling(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + set_bit(NDD_LABELING, &nvdimm->flags); +} + +void nvdimm_set_locked(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + set_bit(NDD_LOCKED, &nvdimm->flags); +} + +void nvdimm_clear_locked(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + clear_bit(NDD_LOCKED, &nvdimm->flags); +} + +static void nvdimm_release(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + ida_simple_remove(&dimm_ida, nvdimm->id); + kfree(nvdimm); +} + +struct nvdimm *to_nvdimm(struct device *dev) +{ + struct nvdimm *nvdimm = container_of(dev, struct nvdimm, dev); + + WARN_ON(!is_nvdimm(dev)); + return nvdimm; +} +EXPORT_SYMBOL_GPL(to_nvdimm); + +struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping) +{ + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev)); + + return dev_get_drvdata(&nvdimm->dev); +} +EXPORT_SYMBOL(to_ndd); + +void nvdimm_drvdata_release(struct kref *kref) +{ + struct nvdimm_drvdata *ndd = container_of(kref, typeof(*ndd), kref); + struct device *dev = ndd->dev; + struct resource *res, *_r; + + dev_dbg(dev, "trace\n"); + nvdimm_bus_lock(dev); + for_each_dpa_resource_safe(ndd, res, _r) + nvdimm_free_dpa(ndd, res); + nvdimm_bus_unlock(dev); + + kvfree(ndd->data); + kfree(ndd); + put_device(dev); +} + +void get_ndd(struct nvdimm_drvdata *ndd) +{ + kref_get(&ndd->kref); +} + +void put_ndd(struct nvdimm_drvdata *ndd) +{ + if (ndd) + kref_put(&ndd->kref, nvdimm_drvdata_release); +} + +const char *nvdimm_name(struct nvdimm *nvdimm) +{ + return dev_name(&nvdimm->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_name); + +struct kobject *nvdimm_kobj(struct nvdimm *nvdimm) +{ + return &nvdimm->dev.kobj; +} +EXPORT_SYMBOL_GPL(nvdimm_kobj); + +unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm) +{ + return nvdimm->cmd_mask; +} +EXPORT_SYMBOL_GPL(nvdimm_cmd_mask); + +void *nvdimm_provider_data(struct nvdimm *nvdimm) +{ + if (nvdimm) + return nvdimm->provider_data; + return NULL; +} +EXPORT_SYMBOL_GPL(nvdimm_provider_data); + +static ssize_t commands_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + int cmd, len = 0; + + if (!nvdimm->cmd_mask) + return sprintf(buf, "\n"); + + for_each_set_bit(cmd, &nvdimm->cmd_mask, BITS_PER_LONG) + len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd)); + len += sprintf(buf + len, "\n"); + return len; +} +static DEVICE_ATTR_RO(commands); + +static ssize_t flags_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + return sprintf(buf, "%s%s\n", + test_bit(NDD_LABELING, &nvdimm->flags) ? "label " : "", + test_bit(NDD_LOCKED, &nvdimm->flags) ? 
"lock " : ""); +} +static DEVICE_ATTR_RO(flags); + +static ssize_t state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + /* + * The state may be in the process of changing, userspace should + * quiesce probing if it wants a static answer + */ + nvdimm_bus_lock(dev); + nvdimm_bus_unlock(dev); + return sprintf(buf, "%s\n", atomic_read(&nvdimm->busy) + ? "active" : "idle"); +} +static DEVICE_ATTR_RO(state); + +static ssize_t __available_slots_show(struct nvdimm_drvdata *ndd, char *buf) +{ + struct device *dev; + ssize_t rc; + u32 nfree; + + if (!ndd) + return -ENXIO; + + dev = ndd->dev; + nvdimm_bus_lock(dev); + nfree = nd_label_nfree(ndd); + if (nfree - 1 > nfree) { + dev_WARN_ONCE(dev, 1, "we ate our last label?\n"); + nfree = 0; + } else + nfree--; + rc = sprintf(buf, "%d\n", nfree); + nvdimm_bus_unlock(dev); + return rc; +} + +static ssize_t available_slots_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t rc; + + device_lock(dev); + rc = __available_slots_show(dev_get_drvdata(dev), buf); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(available_slots); + +__weak ssize_t security_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + if (test_bit(NVDIMM_SECURITY_OVERWRITE, &nvdimm->sec.flags)) + return sprintf(buf, "overwrite\n"); + if (test_bit(NVDIMM_SECURITY_DISABLED, &nvdimm->sec.flags)) + return sprintf(buf, "disabled\n"); + if (test_bit(NVDIMM_SECURITY_UNLOCKED, &nvdimm->sec.flags)) + return sprintf(buf, "unlocked\n"); + if (test_bit(NVDIMM_SECURITY_LOCKED, &nvdimm->sec.flags)) + return sprintf(buf, "locked\n"); + return -ENOTTY; +} + +static ssize_t frozen_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + return sprintf(buf, "%d\n", test_bit(NVDIMM_SECURITY_FROZEN, + &nvdimm->sec.flags)); +} +static DEVICE_ATTR_RO(frozen); + +static ssize_t security_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) + +{ + ssize_t rc; + + /* + * Require all userspace triggered security management to be + * done while probing is idle and the DIMM is not in active use + * in any region. + */ + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = nvdimm_security_store(dev, buf, len); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(security); + +static struct attribute *nvdimm_attributes[] = { + &dev_attr_state.attr, + &dev_attr_flags.attr, + &dev_attr_commands.attr, + &dev_attr_available_slots.attr, + &dev_attr_security.attr, + &dev_attr_frozen.attr, + NULL, +}; + +static umode_t nvdimm_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nvdimm *nvdimm = to_nvdimm(dev); + + if (a != &dev_attr_security.attr && a != &dev_attr_frozen.attr) + return a->mode; + if (!nvdimm->sec.flags) + return 0; + + if (a == &dev_attr_security.attr) { + /* Are there any state mutation ops (make writable)? 
*/ + if (nvdimm->sec.ops->freeze || nvdimm->sec.ops->disable + || nvdimm->sec.ops->change_key + || nvdimm->sec.ops->erase + || nvdimm->sec.ops->overwrite) + return a->mode; + return 0444; + } + + if (nvdimm->sec.ops->freeze) + return a->mode; + return 0; +} + +static const struct attribute_group nvdimm_attribute_group = { + .attrs = nvdimm_attributes, + .is_visible = nvdimm_visible, +}; + +static ssize_t result_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_result result; + + if (!nvdimm->fw_ops) + return -EOPNOTSUPP; + + nvdimm_bus_lock(dev); + result = nvdimm->fw_ops->activate_result(nvdimm); + nvdimm_bus_unlock(dev); + + switch (result) { + case NVDIMM_FWA_RESULT_NONE: + return sprintf(buf, "none\n"); + case NVDIMM_FWA_RESULT_SUCCESS: + return sprintf(buf, "success\n"); + case NVDIMM_FWA_RESULT_FAIL: + return sprintf(buf, "fail\n"); + case NVDIMM_FWA_RESULT_NOTSTAGED: + return sprintf(buf, "not_staged\n"); + case NVDIMM_FWA_RESULT_NEEDRESET: + return sprintf(buf, "need_reset\n"); + default: + return -ENXIO; + } +} +static DEVICE_ATTR_ADMIN_RO(result); + +static ssize_t activate_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_state state; + + if (!nvdimm->fw_ops) + return -EOPNOTSUPP; + + nvdimm_bus_lock(dev); + state = nvdimm->fw_ops->activate_state(nvdimm); + nvdimm_bus_unlock(dev); + + switch (state) { + case NVDIMM_FWA_IDLE: + return sprintf(buf, "idle\n"); + case NVDIMM_FWA_BUSY: + return sprintf(buf, "busy\n"); + case NVDIMM_FWA_ARMED: + return sprintf(buf, "armed\n"); + default: + return -ENXIO; + } +} + +static ssize_t activate_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_trigger arg; + int rc; + + if (!nvdimm->fw_ops) + return -EOPNOTSUPP; + + if (sysfs_streq(buf, "arm")) + arg = NVDIMM_FWA_ARM; + else if (sysfs_streq(buf, "disarm")) + arg = NVDIMM_FWA_DISARM; + else + return -EINVAL; + + nvdimm_bus_lock(dev); + rc = nvdimm->fw_ops->arm(nvdimm, arg); + nvdimm_bus_unlock(dev); + + if (rc < 0) + return rc; + return len; +} +static DEVICE_ATTR_ADMIN_RW(activate); + +static struct attribute *nvdimm_firmware_attributes[] = { + &dev_attr_activate.attr, + &dev_attr_result.attr, + NULL, +}; + +static umode_t nvdimm_firmware_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + struct nvdimm *nvdimm = to_nvdimm(dev); + enum nvdimm_fwa_capability cap; + + if (!nd_desc->fw_ops) + return 0; + if (!nvdimm->fw_ops) + return 0; + + nvdimm_bus_lock(dev); + cap = nd_desc->fw_ops->capability(nd_desc); + nvdimm_bus_unlock(dev); + + if (cap < NVDIMM_FWA_CAP_QUIESCE) + return 0; + + return a->mode; +} + +static const struct attribute_group nvdimm_firmware_attribute_group = { + .name = "firmware", + .attrs = nvdimm_firmware_attributes, + .is_visible = nvdimm_firmware_visible, +}; + +static const struct attribute_group *nvdimm_attribute_groups[] = { + &nd_device_attribute_group, + &nvdimm_attribute_group, + &nvdimm_firmware_attribute_group, + NULL, +}; + +static const struct device_type nvdimm_device_type = { + .name = "nvdimm", + .release = nvdimm_release, + .groups = nvdimm_attribute_groups, +}; + +bool is_nvdimm(struct device *dev) +{ + 
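+	/* DIMM devices are distinguished solely by their device_type */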
return dev->type == &nvdimm_device_type; +} + +static struct lock_class_key nvdimm_key; + +struct nvdimm *__nvdimm_create(struct nvdimm_bus *nvdimm_bus, + void *provider_data, const struct attribute_group **groups, + unsigned long flags, unsigned long cmd_mask, int num_flush, + struct resource *flush_wpq, const char *dimm_id, + const struct nvdimm_security_ops *sec_ops, + const struct nvdimm_fw_ops *fw_ops) +{ + struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL); + struct device *dev; + + if (!nvdimm) + return NULL; + + nvdimm->id = ida_simple_get(&dimm_ida, 0, 0, GFP_KERNEL); + if (nvdimm->id < 0) { + kfree(nvdimm); + return NULL; + } + + nvdimm->dimm_id = dimm_id; + nvdimm->provider_data = provider_data; + nvdimm->flags = flags; + nvdimm->cmd_mask = cmd_mask; + nvdimm->num_flush = num_flush; + nvdimm->flush_wpq = flush_wpq; + atomic_set(&nvdimm->busy, 0); + dev = &nvdimm->dev; + dev_set_name(dev, "nmem%d", nvdimm->id); + dev->parent = &nvdimm_bus->dev; + dev->type = &nvdimm_device_type; + dev->devt = MKDEV(nvdimm_major, nvdimm->id); + dev->groups = groups; + nvdimm->sec.ops = sec_ops; + nvdimm->fw_ops = fw_ops; + nvdimm->sec.overwrite_tmo = 0; + INIT_DELAYED_WORK(&nvdimm->dwork, nvdimm_security_overwrite_query); + /* + * Security state must be initialized before device_add() for + * attribute visibility. + */ + /* get security state and extended (master) state */ + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + nvdimm->sec.ext_flags = nvdimm_security_flags(nvdimm, NVDIMM_MASTER); + device_initialize(dev); + lockdep_set_class(&dev->mutex, &nvdimm_key); + if (test_bit(NDD_REGISTER_SYNC, &flags)) + nd_device_register_sync(dev); + else + nd_device_register(dev); + + return nvdimm; +} +EXPORT_SYMBOL_GPL(__nvdimm_create); + +void nvdimm_delete(struct nvdimm *nvdimm) +{ + struct device *dev = &nvdimm->dev; + bool dev_put = false; + + /* We are shutting down. Make state frozen artificially. 
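+	 * This stops new security operations from starting while the
+	 * delayed overwrite work is cancelled and the device torn down.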
*/ + nvdimm_bus_lock(dev); + set_bit(NVDIMM_SECURITY_FROZEN, &nvdimm->sec.flags); + if (test_and_clear_bit(NDD_WORK_PENDING, &nvdimm->flags)) + dev_put = true; + nvdimm_bus_unlock(dev); + cancel_delayed_work_sync(&nvdimm->dwork); + if (dev_put) + put_device(dev); + nd_device_unregister(dev, ND_SYNC); +} +EXPORT_SYMBOL_GPL(nvdimm_delete); + +static void shutdown_security_notify(void *data) +{ + struct nvdimm *nvdimm = data; + + sysfs_put(nvdimm->sec.overwrite_state); +} + +int nvdimm_security_setup_events(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + if (!nvdimm->sec.flags || !nvdimm->sec.ops + || !nvdimm->sec.ops->overwrite) + return 0; + nvdimm->sec.overwrite_state = sysfs_get_dirent(dev->kobj.sd, "security"); + if (!nvdimm->sec.overwrite_state) + return -ENOMEM; + + return devm_add_action_or_reset(dev, shutdown_security_notify, nvdimm); +} +EXPORT_SYMBOL_GPL(nvdimm_security_setup_events); + +int nvdimm_in_overwrite(struct nvdimm *nvdimm) +{ + return test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags); +} +EXPORT_SYMBOL_GPL(nvdimm_in_overwrite); + +int nvdimm_security_freeze(struct nvdimm *nvdimm) +{ + int rc; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev)); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->freeze) + return -EOPNOTSUPP; + + if (!nvdimm->sec.flags) + return -EIO; + + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { + dev_warn(&nvdimm->dev, "Overwrite operation in progress.\n"); + return -EBUSY; + } + + rc = nvdimm->sec.ops->freeze(nvdimm); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + + return rc; +} + +static unsigned long dpa_align(struct nd_region *nd_region) +{ + struct device *dev = &nd_region->dev; + + if (dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev), + "bus lock required for capacity provision\n")) + return 0; + if (dev_WARN_ONCE(dev, !nd_region->ndr_mappings || nd_region->align + % nd_region->ndr_mappings, + "invalid region align %#lx mappings: %d\n", + nd_region->align, nd_region->ndr_mappings)) + return 0; + return nd_region->align / nd_region->ndr_mappings; +} + +/** + * nd_pmem_max_contiguous_dpa - For the given dimm+region, return the max + * contiguous unallocated dpa range. 
+ * @nd_region: constrain available space check to this reference region + * @nd_mapping: container of dpa-resource-root + labels + */ +resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nvdimm_bus *nvdimm_bus; + resource_size_t max = 0; + struct resource *res; + unsigned long align; + + /* if a dimm is disabled the available capacity is zero */ + if (!ndd) + return 0; + + align = dpa_align(nd_region); + if (!align) + return 0; + + nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + if (__reserve_free_pmem(&nd_region->dev, nd_mapping->nvdimm)) + return 0; + for_each_dpa_resource(ndd, res) { + resource_size_t start, end; + + if (strcmp(res->name, "pmem-reserve") != 0) + continue; + /* trim free space relative to current alignment setting */ + start = ALIGN(res->start, align); + end = ALIGN_DOWN(res->end + 1, align) - 1; + if (end < start) + continue; + if (end - start + 1 > max) + max = end - start + 1; + } + release_free_pmem(nvdimm_bus, nd_mapping); + return max; +} + +/** + * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa + * @nd_mapping: container of dpa-resource-root + labels + * @nd_region: constrain available space check to this reference region + * + * Validate that a PMEM label, if present, aligns with the start of an + * interleave set. + */ +resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t map_start, map_end, busy = 0; + struct resource *res; + unsigned long align; + + if (!ndd) + return 0; + + align = dpa_align(nd_region); + if (!align) + return 0; + + map_start = nd_mapping->start; + map_end = map_start + nd_mapping->size - 1; + for_each_dpa_resource(ndd, res) { + resource_size_t start, end; + + start = ALIGN_DOWN(res->start, align); + end = ALIGN(res->end + 1, align) - 1; + if (start >= map_start && start < map_end) { + if (end > map_end) { + nd_dbg_dpa(nd_region, ndd, res, + "misaligned to iset\n"); + return 0; + } + busy += end - start + 1; + } else if (end >= map_start && end <= map_end) { + busy += end - start + 1; + } else if (map_start > start && map_start < end) { + /* total eclipse of the mapping */ + busy += nd_mapping->size; + } + } + + if (busy < nd_mapping->size) + return ALIGN_DOWN(nd_mapping->size - busy, align); + return 0; +} + +void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res) +{ + WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev)); + kfree(res->name); + __release_region(&ndd->dpa, res->start, resource_size(res)); +} + +struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, resource_size_t start, + resource_size_t n) +{ + char *name = kmemdup(label_id, sizeof(*label_id), GFP_KERNEL); + struct resource *res; + + if (!name) + return NULL; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev)); + res = __request_region(&ndd->dpa, start, n, name, 0); + if (!res) + kfree(name); + return res; +} + +/** + * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id + * @nvdimm: container of dpa-resource-root + labels + * @label_id: dpa resource name of the form pmem-<human readable uuid> + */ +resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id) +{ + resource_size_t allocated = 0; + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id->id) == 0) + allocated += 
resource_size(res); + + return allocated; +} + +static int count_dimms(struct device *dev, void *c) +{ + int *count = c; + + if (is_nvdimm(dev)) + (*count)++; + return 0; +} + +int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count) +{ + int count = 0; + /* Flush any possible dimm registration failures */ + nd_synchronize(); + + device_for_each_child(&nvdimm_bus->dev, &count, count_dimms); + dev_dbg(&nvdimm_bus->dev, "count: %d\n", count); + if (count != dimm_count) + return -ENXIO; + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count); + +void __exit nvdimm_devs_exit(void) +{ + ida_destroy(&dimm_ida); +} diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c new file mode 100644 index 000000000..4cd18be9d --- /dev/null +++ b/drivers/nvdimm/e820.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2015, Christoph Hellwig. + * Copyright (c) 2015, Intel Corporation. + */ +#include <linux/platform_device.h> +#include <linux/memory_hotplug.h> +#include <linux/libnvdimm.h> +#include <linux/module.h> +#include <linux/numa.h> + +static int e820_pmem_remove(struct platform_device *pdev) +{ + struct nvdimm_bus *nvdimm_bus = platform_get_drvdata(pdev); + + nvdimm_bus_unregister(nvdimm_bus); + return 0; +} + +static int e820_register_one(struct resource *res, void *data) +{ + struct nd_region_desc ndr_desc; + struct nvdimm_bus *nvdimm_bus = data; + int nid = phys_to_target_node(res->start); + + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.res = res; + ndr_desc.numa_node = numa_map_to_online_node(nid); + ndr_desc.target_node = nid; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + return -ENXIO; + return 0; +} + +static int e820_pmem_probe(struct platform_device *pdev) +{ + static struct nvdimm_bus_descriptor nd_desc; + struct device *dev = &pdev->dev; + struct nvdimm_bus *nvdimm_bus; + int rc = -ENXIO; + + nd_desc.provider_name = "e820"; + nd_desc.module = THIS_MODULE; + nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); + if (!nvdimm_bus) + goto err; + platform_set_drvdata(pdev, nvdimm_bus); + + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one); + if (rc) + goto err; + return 0; +err: + nvdimm_bus_unregister(nvdimm_bus); + dev_err(dev, "failed to register legacy persistent memory ranges\n"); + return rc; +} + +static struct platform_driver e820_pmem_driver = { + .probe = e820_pmem_probe, + .remove = e820_pmem_remove, + .driver = { + .name = "e820_pmem", + }, +}; + +module_platform_driver(e820_pmem_driver); + +MODULE_ALIAS("platform:e820_pmem*"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c new file mode 100644 index 000000000..082253a3a --- /dev/null +++ b/drivers/nvdimm/label.c @@ -0,0 +1,1120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#include <linux/device.h> +#include <linux/ndctl.h> +#include <linux/uuid.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "label.h" +#include "nd.h" + +static guid_t nvdimm_btt_guid; +static guid_t nvdimm_btt2_guid; +static guid_t nvdimm_pfn_guid; +static guid_t nvdimm_dax_guid; + +static uuid_t nvdimm_btt_uuid; +static uuid_t nvdimm_btt2_uuid; +static uuid_t nvdimm_pfn_uuid; +static uuid_t nvdimm_dax_uuid; + +static uuid_t cxl_region_uuid; +static uuid_t cxl_namespace_uuid; + +static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0"; + +static u32 best_seq(u32 a, u32 b) +{ + a &= NSINDEX_SEQ_MASK; + b &= NSINDEX_SEQ_MASK; + + if (a == 0 || a == b) + return b; + else if (b == 0) + return a; + else if (nd_inc_seq(a) == b) + return b; + else + return a; +} + +unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd) +{ + return ndd->nslabel_size; +} + +static size_t __sizeof_namespace_index(u32 nslot) +{ + return ALIGN(sizeof(struct nd_namespace_index) + DIV_ROUND_UP(nslot, 8), + NSINDEX_ALIGN); +} + +static int __nvdimm_num_label_slots(struct nvdimm_drvdata *ndd, + size_t index_size) +{ + return (ndd->nsarea.config_size - index_size * 2) / + sizeof_namespace_label(ndd); +} + +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd) +{ + u32 tmp_nslot, n; + + tmp_nslot = ndd->nsarea.config_size / sizeof_namespace_label(ndd); + n = __sizeof_namespace_index(tmp_nslot) / NSINDEX_ALIGN; + + return __nvdimm_num_label_slots(ndd, NSINDEX_ALIGN * n); +} + +size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) +{ + u32 nslot, space, size; + + /* + * Per UEFI 2.7, the minimum size of the Label Storage Area is large + * enough to hold 2 index blocks and 2 labels. The minimum index + * block size is 256 bytes. The label size is 128 for namespaces + * prior to version 1.2 and at minimum 256 for version 1.2 and later. + */ + nslot = nvdimm_num_label_slots(ndd); + space = ndd->nsarea.config_size - nslot * sizeof_namespace_label(ndd); + size = __sizeof_namespace_index(nslot) * 2; + if (size <= space && nslot >= 2) + return size / 2; + + dev_err(ndd->dev, "label area (%d) too small to host (%d byte) labels\n", + ndd->nsarea.config_size, sizeof_namespace_label(ndd)); + return 0; +} + +static int __nd_label_validate(struct nvdimm_drvdata *ndd) +{ + /* + * On media label format consists of two index blocks followed + * by an array of labels. None of these structures are ever + * updated in place. A sequence number tracks the current + * active index and the next one to write, while labels are + * written to free slots. + * + * +------------+ + * | | + * | nsindex0 | + * | | + * +------------+ + * | | + * | nsindex1 | + * | | + * +------------+ + * | label0 | + * +------------+ + * | label1 | + * +------------+ + * | | + * ....nslot... 
+ * | | + * +------------+ + * | labelN | + * +------------+ + */ + struct nd_namespace_index *nsindex[] = { + to_namespace_index(ndd, 0), + to_namespace_index(ndd, 1), + }; + const int num_index = ARRAY_SIZE(nsindex); + struct device *dev = ndd->dev; + bool valid[2] = { 0 }; + int i, num_valid = 0; + u32 seq; + + for (i = 0; i < num_index; i++) { + u32 nslot; + u8 sig[NSINDEX_SIG_LEN]; + u64 sum_save, sum, size; + unsigned int version, labelsize; + + memcpy(sig, nsindex[i]->sig, NSINDEX_SIG_LEN); + if (memcmp(sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN) != 0) { + dev_dbg(dev, "nsindex%d signature invalid\n", i); + continue; + } + + /* label sizes larger than 128 arrived with v1.2 */ + version = __le16_to_cpu(nsindex[i]->major) * 100 + + __le16_to_cpu(nsindex[i]->minor); + if (version >= 102) + labelsize = 1 << (7 + nsindex[i]->labelsize); + else + labelsize = 128; + + if (labelsize != sizeof_namespace_label(ndd)) { + dev_dbg(dev, "nsindex%d labelsize %d invalid\n", + i, nsindex[i]->labelsize); + continue; + } + + sum_save = __le64_to_cpu(nsindex[i]->checksum); + nsindex[i]->checksum = __cpu_to_le64(0); + sum = nd_fletcher64(nsindex[i], sizeof_namespace_index(ndd), 1); + nsindex[i]->checksum = __cpu_to_le64(sum_save); + if (sum != sum_save) { + dev_dbg(dev, "nsindex%d checksum invalid\n", i); + continue; + } + + seq = __le32_to_cpu(nsindex[i]->seq); + if ((seq & NSINDEX_SEQ_MASK) == 0) { + dev_dbg(dev, "nsindex%d sequence: %#x invalid\n", i, seq); + continue; + } + + /* sanity check the index against expected values */ + if (__le64_to_cpu(nsindex[i]->myoff) + != i * sizeof_namespace_index(ndd)) { + dev_dbg(dev, "nsindex%d myoff: %#llx invalid\n", + i, (unsigned long long) + __le64_to_cpu(nsindex[i]->myoff)); + continue; + } + if (__le64_to_cpu(nsindex[i]->otheroff) + != (!i) * sizeof_namespace_index(ndd)) { + dev_dbg(dev, "nsindex%d otheroff: %#llx invalid\n", + i, (unsigned long long) + __le64_to_cpu(nsindex[i]->otheroff)); + continue; + } + if (__le64_to_cpu(nsindex[i]->labeloff) + != 2 * sizeof_namespace_index(ndd)) { + dev_dbg(dev, "nsindex%d labeloff: %#llx invalid\n", + i, (unsigned long long) + __le64_to_cpu(nsindex[i]->labeloff)); + continue; + } + + size = __le64_to_cpu(nsindex[i]->mysize); + if (size > sizeof_namespace_index(ndd) + || size < sizeof(struct nd_namespace_index)) { + dev_dbg(dev, "nsindex%d mysize: %#llx invalid\n", i, size); + continue; + } + + nslot = __le32_to_cpu(nsindex[i]->nslot); + if (nslot * sizeof_namespace_label(ndd) + + 2 * sizeof_namespace_index(ndd) + > ndd->nsarea.config_size) { + dev_dbg(dev, "nsindex%d nslot: %u invalid, config_size: %#x\n", + i, nslot, ndd->nsarea.config_size); + continue; + } + valid[i] = true; + num_valid++; + } + + switch (num_valid) { + case 0: + break; + case 1: + for (i = 0; i < num_index; i++) + if (valid[i]) + return i; + /* can't have num_valid > 0 but valid[] = { false, false } */ + WARN_ON(1); + break; + default: + /* pick the best index... */ + seq = best_seq(__le32_to_cpu(nsindex[0]->seq), + __le32_to_cpu(nsindex[1]->seq)); + if (seq == (__le32_to_cpu(nsindex[1]->seq) & NSINDEX_SEQ_MASK)) + return 1; + else + return 0; + break; + } + + return -1; +} + +static int nd_label_validate(struct nvdimm_drvdata *ndd) +{ + /* + * In order to probe for and validate namespace index blocks we + * need to know the size of the labels, and we can't trust the + * size of the labels until we validate the index blocks. 
+ * Resolve this dependency loop by probing for known label + * sizes, but default to v1.2 256-byte namespace labels if + * discovery fails. + */ + int label_size[] = { 128, 256 }; + int i, rc; + + for (i = 0; i < ARRAY_SIZE(label_size); i++) { + ndd->nslabel_size = label_size[i]; + rc = __nd_label_validate(ndd); + if (rc >= 0) + return rc; + } + + return -1; +} + +static void nd_label_copy(struct nvdimm_drvdata *ndd, + struct nd_namespace_index *dst, + struct nd_namespace_index *src) +{ + /* just exit if either destination or source is NULL */ + if (!dst || !src) + return; + + memcpy(dst, src, sizeof_namespace_index(ndd)); +} + +static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd) +{ + void *base = to_namespace_index(ndd, 0); + + return base + 2 * sizeof_namespace_index(ndd); +} + +static int to_slot(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + unsigned long label, base; + + label = (unsigned long) nd_label; + base = (unsigned long) nd_label_base(ndd); + + return (label - base) / sizeof_namespace_label(ndd); +} + +static struct nd_namespace_label *to_label(struct nvdimm_drvdata *ndd, int slot) +{ + unsigned long label, base; + + base = (unsigned long) nd_label_base(ndd); + label = base + sizeof_namespace_label(ndd) * slot; + + return (struct nd_namespace_label *) label; +} + +#define for_each_clear_bit_le(bit, addr, size) \ + for ((bit) = find_next_zero_bit_le((addr), (size), 0); \ + (bit) < (size); \ + (bit) = find_next_zero_bit_le((addr), (size), (bit) + 1)) + +/** + * preamble_index - common variable initialization for nd_label_* routines + * @ndd: dimm container for the relevant label set + * @idx: namespace_index index + * @nsindex_out: on return set to the currently active namespace index + * @free: on return set to the free label bitmap in the index + * @nslot: on return set to the number of slots in the label space + */ +static bool preamble_index(struct nvdimm_drvdata *ndd, int idx, + struct nd_namespace_index **nsindex_out, + unsigned long **free, u32 *nslot) +{ + struct nd_namespace_index *nsindex; + + nsindex = to_namespace_index(ndd, idx); + if (nsindex == NULL) + return false; + + *free = (unsigned long *) nsindex->free; + *nslot = __le32_to_cpu(nsindex->nslot); + *nsindex_out = nsindex; + + return true; +} + +char *nd_label_gen_id(struct nd_label_id *label_id, const uuid_t *uuid, + u32 flags) +{ + if (!label_id || !uuid) + return NULL; + snprintf(label_id->id, ND_LABEL_ID_SIZE, "pmem-%pUb", uuid); + return label_id->id; +} + +static bool preamble_current(struct nvdimm_drvdata *ndd, + struct nd_namespace_index **nsindex, + unsigned long **free, u32 *nslot) +{ + return preamble_index(ndd, ndd->ns_current, nsindex, + free, nslot); +} + +static bool preamble_next(struct nvdimm_drvdata *ndd, + struct nd_namespace_index **nsindex, + unsigned long **free, u32 *nslot) +{ + return preamble_index(ndd, ndd->ns_next, nsindex, + free, nslot); +} + +static bool nsl_validate_checksum(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + u64 sum, sum_save; + + if (!ndd->cxl && !efi_namespace_label_has(ndd, checksum)) + return true; + + sum_save = nsl_get_checksum(ndd, nd_label); + nsl_set_checksum(ndd, nd_label, 0); + sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1); + nsl_set_checksum(ndd, nd_label, sum_save); + return sum == sum_save; +} + +static void nsl_calculate_checksum(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + u64 sum; + + if (!ndd->cxl && !efi_namespace_label_has(ndd, 
checksum)) + return; + nsl_set_checksum(ndd, nd_label, 0); + sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1); + nsl_set_checksum(ndd, nd_label, sum); +} + +static bool slot_valid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u32 slot) +{ + bool valid; + + /* check that we are written where we expect to be written */ + if (slot != nsl_get_slot(ndd, nd_label)) + return false; + valid = nsl_validate_checksum(ndd, nd_label); + if (!valid) + dev_dbg(ndd->dev, "fail checksum. slot: %d\n", slot); + return valid; +} + +int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return 0; /* no label, nothing to reserve */ + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + struct nd_region *nd_region = NULL; + struct nd_label_id label_id; + struct resource *res; + uuid_t label_uuid; + u32 flags; + + nd_label = to_label(ndd, slot); + + if (!slot_valid(ndd, nd_label, slot)) + continue; + + nsl_get_uuid(ndd, nd_label, &label_uuid); + flags = nsl_get_flags(ndd, nd_label); + nd_label_gen_id(&label_id, &label_uuid, flags); + res = nvdimm_allocate_dpa(ndd, &label_id, + nsl_get_dpa(ndd, nd_label), + nsl_get_rawsize(ndd, nd_label)); + nd_dbg_dpa(nd_region, ndd, res, "reserve\n"); + if (!res) + return -EBUSY; + } + + return 0; +} + +int nd_label_data_init(struct nvdimm_drvdata *ndd) +{ + size_t config_size, read_size, max_xfer, offset; + struct nd_namespace_index *nsindex; + unsigned int i; + int rc = 0; + u32 nslot; + + if (ndd->data) + return 0; + + if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0) { + dev_dbg(ndd->dev, "failed to init config data area: (%u:%u)\n", + ndd->nsarea.max_xfer, ndd->nsarea.config_size); + return -ENXIO; + } + + /* + * We need to determine the maximum index area as this is the section + * we must read and validate before we can start processing labels. + * + * If the area is too small to contain the two indexes and 2 labels + * then we abort. + * + * Start at a label size of 128 as this should result in the largest + * possible namespace index size. + */ + ndd->nslabel_size = 128; + read_size = sizeof_namespace_index(ndd) * 2; + if (!read_size) + return -ENXIO; + + /* Allocate config data */ + config_size = ndd->nsarea.config_size; + ndd->data = kvzalloc(config_size, GFP_KERNEL); + if (!ndd->data) + return -ENOMEM; + + /* + * We want to guarantee as few reads as possible while conserving + * memory. To do that we figure out how much unused space will be left + * in the last read, divide that by the total number of reads it is + * going to take given our maximum transfer size, and then reduce our + * maximum transfer size based on that result. 
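+	 *
+	 * For example, with illustrative numbers: a 130048 byte (127K)
+	 * config area and an 8192 byte max_xfer needs 16 reads, with
+	 * 1024 bytes wasted in the last one; spreading that waste over
+	 * the 16 reads trims max_xfer to 8128 bytes, and 16 reads of
+	 * 8128 bytes then cover the area exactly.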
+ */ + max_xfer = min_t(size_t, ndd->nsarea.max_xfer, config_size); + if (read_size < max_xfer) { + /* trim waste */ + max_xfer -= ((max_xfer - 1) - (config_size - 1) % max_xfer) / + DIV_ROUND_UP(config_size, max_xfer); + /* make certain we read indexes in exactly 1 read */ + if (max_xfer < read_size) + max_xfer = read_size; + } + + /* Make our initial read size a multiple of max_xfer size */ + read_size = min(DIV_ROUND_UP(read_size, max_xfer) * max_xfer, + config_size); + + /* Read the index data */ + rc = nvdimm_get_config_data(ndd, ndd->data, 0, read_size); + if (rc) + goto out_err; + + /* Validate index data, if not valid assume all labels are invalid */ + ndd->ns_current = nd_label_validate(ndd); + if (ndd->ns_current < 0) + return 0; + + /* Record our index values */ + ndd->ns_next = nd_label_next_nsindex(ndd->ns_current); + + /* Copy "current" index on top of the "next" index */ + nsindex = to_current_namespace_index(ndd); + nd_label_copy(ndd, to_next_namespace_index(ndd), nsindex); + + /* Determine starting offset for label data */ + offset = __le64_to_cpu(nsindex->labeloff); + nslot = __le32_to_cpu(nsindex->nslot); + + /* Loop through the free list pulling in any active labels */ + for (i = 0; i < nslot; i++, offset += ndd->nslabel_size) { + size_t label_read_size; + + /* zero out the unused labels */ + if (test_bit_le(i, nsindex->free)) { + memset(ndd->data + offset, 0, ndd->nslabel_size); + continue; + } + + /* if we already read past here then just continue */ + if (offset + ndd->nslabel_size <= read_size) + continue; + + /* if we haven't read in a while reset our read_size offset */ + if (read_size < offset) + read_size = offset; + + /* determine how much more will be read after this next call. */ + label_read_size = offset + ndd->nslabel_size - read_size; + label_read_size = DIV_ROUND_UP(label_read_size, max_xfer) * + max_xfer; + + /* truncate last read if needed */ + if (read_size + label_read_size > config_size) + label_read_size = config_size - read_size; + + /* Read the label data */ + rc = nvdimm_get_config_data(ndd, ndd->data + read_size, + read_size, label_read_size); + if (rc) + goto out_err; + + /* push read_size to next read offset */ + read_size += label_read_size; + } + + dev_dbg(ndd->dev, "len: %zu rc: %d\n", offset, rc); +out_err: + return rc; +} + +int nd_label_active_count(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + int count = 0; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return 0; + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + + nd_label = to_label(ndd, slot); + + if (!slot_valid(ndd, nd_label, slot)) { + u32 label_slot = nsl_get_slot(ndd, nd_label); + u64 size = nsl_get_rawsize(ndd, nd_label); + u64 dpa = nsl_get_dpa(ndd, nd_label); + + dev_dbg(ndd->dev, + "slot%d invalid slot: %d dpa: %llx size: %llx\n", + slot, label_slot, dpa, size); + continue; + } + count++; + } + return count; +} + +struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return NULL; + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + + nd_label = to_label(ndd, slot); + if (!slot_valid(ndd, nd_label, slot)) + continue; + + if (n-- == 0) + return to_label(ndd, slot); + } + + return NULL; +} + +u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd) +{ + struct 
nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return UINT_MAX; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + slot = find_next_bit_le(free, nslot, 0); + if (slot == nslot) + return UINT_MAX; + + clear_bit_le(slot, free); + + return slot; +} + +bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return false; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + if (slot < nslot) + return !test_and_set_bit_le(slot, free); + return false; +} + +u32 nd_label_nfree(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return nvdimm_num_label_slots(ndd); + + return bitmap_weight(free, nslot); +} + +static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq, + unsigned long flags) +{ + struct nd_namespace_index *nsindex; + unsigned long offset; + u64 checksum; + u32 nslot; + int rc; + + nsindex = to_namespace_index(ndd, index); + if (flags & ND_NSINDEX_INIT) + nslot = nvdimm_num_label_slots(ndd); + else + nslot = __le32_to_cpu(nsindex->nslot); + + memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN); + memset(&nsindex->flags, 0, 3); + nsindex->labelsize = sizeof_namespace_label(ndd) >> 8; + nsindex->seq = __cpu_to_le32(seq); + offset = (unsigned long) nsindex + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->myoff = __cpu_to_le64(offset); + nsindex->mysize = __cpu_to_le64(sizeof_namespace_index(ndd)); + offset = (unsigned long) to_namespace_index(ndd, + nd_label_next_nsindex(index)) + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->otheroff = __cpu_to_le64(offset); + offset = (unsigned long) nd_label_base(ndd) + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->labeloff = __cpu_to_le64(offset); + nsindex->nslot = __cpu_to_le32(nslot); + nsindex->major = __cpu_to_le16(1); + if (sizeof_namespace_label(ndd) < 256) + nsindex->minor = __cpu_to_le16(1); + else + nsindex->minor = __cpu_to_le16(2); + nsindex->checksum = __cpu_to_le64(0); + if (flags & ND_NSINDEX_INIT) { + unsigned long *free = (unsigned long *) nsindex->free; + u32 nfree = ALIGN(nslot, BITS_PER_LONG); + int last_bits, i; + + memset(nsindex->free, 0xff, nfree / 8); + for (i = 0, last_bits = nfree - nslot; i < last_bits; i++) + clear_bit_le(nslot + i, free); + } + checksum = nd_fletcher64(nsindex, sizeof_namespace_index(ndd), 1); + nsindex->checksum = __cpu_to_le64(checksum); + rc = nvdimm_set_config_data(ndd, __le64_to_cpu(nsindex->myoff), + nsindex, sizeof_namespace_index(ndd)); + if (rc < 0) + return rc; + + if (flags & ND_NSINDEX_INIT) + return 0; + + /* copy the index we just wrote to the new 'next' */ + WARN_ON(index != ndd->ns_next); + nd_label_copy(ndd, to_current_namespace_index(ndd), nsindex); + ndd->ns_current = nd_label_next_nsindex(ndd->ns_current); + ndd->ns_next = nd_label_next_nsindex(ndd->ns_next); + WARN_ON(ndd->ns_current == ndd->ns_next); + + return 0; +} + +static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + return (unsigned long) nd_label + - (unsigned long) to_namespace_index(ndd, 0); +} + +static enum nvdimm_claim_class guid_to_nvdimm_cclass(guid_t *guid) +{ + if (guid_equal(guid, &nvdimm_btt_guid)) + return NVDIMM_CCLASS_BTT; + else if 
(guid_equal(guid, &nvdimm_btt2_guid)) + return NVDIMM_CCLASS_BTT2; + else if (guid_equal(guid, &nvdimm_pfn_guid)) + return NVDIMM_CCLASS_PFN; + else if (guid_equal(guid, &nvdimm_dax_guid)) + return NVDIMM_CCLASS_DAX; + else if (guid_equal(guid, &guid_null)) + return NVDIMM_CCLASS_NONE; + + return NVDIMM_CCLASS_UNKNOWN; +} + +/* CXL labels store UUIDs instead of GUIDs for the same data */ +static enum nvdimm_claim_class uuid_to_nvdimm_cclass(uuid_t *uuid) +{ + if (uuid_equal(uuid, &nvdimm_btt_uuid)) + return NVDIMM_CCLASS_BTT; + else if (uuid_equal(uuid, &nvdimm_btt2_uuid)) + return NVDIMM_CCLASS_BTT2; + else if (uuid_equal(uuid, &nvdimm_pfn_uuid)) + return NVDIMM_CCLASS_PFN; + else if (uuid_equal(uuid, &nvdimm_dax_uuid)) + return NVDIMM_CCLASS_DAX; + else if (uuid_equal(uuid, &uuid_null)) + return NVDIMM_CCLASS_NONE; + + return NVDIMM_CCLASS_UNKNOWN; +} + +static const guid_t *to_abstraction_guid(enum nvdimm_claim_class claim_class, + guid_t *target) +{ + if (claim_class == NVDIMM_CCLASS_BTT) + return &nvdimm_btt_guid; + else if (claim_class == NVDIMM_CCLASS_BTT2) + return &nvdimm_btt2_guid; + else if (claim_class == NVDIMM_CCLASS_PFN) + return &nvdimm_pfn_guid; + else if (claim_class == NVDIMM_CCLASS_DAX) + return &nvdimm_dax_guid; + else if (claim_class == NVDIMM_CCLASS_UNKNOWN) { + /* + * If we're modifying a namespace for which we don't + * know the claim_class, don't touch the existing guid. + */ + return target; + } else + return &guid_null; +} + +/* CXL labels store UUIDs instead of GUIDs for the same data */ +static const uuid_t *to_abstraction_uuid(enum nvdimm_claim_class claim_class, + uuid_t *target) +{ + if (claim_class == NVDIMM_CCLASS_BTT) + return &nvdimm_btt_uuid; + else if (claim_class == NVDIMM_CCLASS_BTT2) + return &nvdimm_btt2_uuid; + else if (claim_class == NVDIMM_CCLASS_PFN) + return &nvdimm_pfn_uuid; + else if (claim_class == NVDIMM_CCLASS_DAX) + return &nvdimm_dax_uuid; + else if (claim_class == NVDIMM_CCLASS_UNKNOWN) { + /* + * If we're modifying a namespace for which we don't + * know the claim_class, don't touch the existing uuid. 
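+		 * (Returning 'target' simply re-exports the uuid
+		 * already present in the label, making the update a
+		 * no-op.)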
+ */ + return target; + } else + return &uuid_null; +} + +static void reap_victim(struct nd_mapping *nd_mapping, + struct nd_label_ent *victim) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + u32 slot = to_slot(ndd, victim->label); + + dev_dbg(ndd->dev, "free: %d\n", slot); + nd_label_free_slot(ndd, slot); + victim->label = NULL; +} + +static void nsl_set_type_guid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, guid_t *guid) +{ + if (efi_namespace_label_has(ndd, type_guid)) + guid_copy(&nd_label->efi.type_guid, guid); +} + +bool nsl_validate_type_guid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, guid_t *guid) +{ + if (ndd->cxl || !efi_namespace_label_has(ndd, type_guid)) + return true; + if (!guid_equal(&nd_label->efi.type_guid, guid)) { + dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n", guid, + &nd_label->efi.type_guid); + return false; + } + return true; +} + +static void nsl_set_claim_class(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + enum nvdimm_claim_class claim_class) +{ + if (ndd->cxl) { + uuid_t uuid; + + import_uuid(&uuid, nd_label->cxl.abstraction_uuid); + export_uuid(nd_label->cxl.abstraction_uuid, + to_abstraction_uuid(claim_class, &uuid)); + return; + } + + if (!efi_namespace_label_has(ndd, abstraction_guid)) + return; + guid_copy(&nd_label->efi.abstraction_guid, + to_abstraction_guid(claim_class, + &nd_label->efi.abstraction_guid)); +} + +enum nvdimm_claim_class nsl_get_claim_class(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) { + uuid_t uuid; + + import_uuid(&uuid, nd_label->cxl.abstraction_uuid); + return uuid_to_nvdimm_cclass(&uuid); + } + if (!efi_namespace_label_has(ndd, abstraction_guid)) + return NVDIMM_CCLASS_NONE; + return guid_to_nvdimm_cclass(&nd_label->efi.abstraction_guid); +} + +static int __pmem_label_update(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm, + int pos, unsigned long flags) +{ + struct nd_namespace_common *ndns = &nspm->nsio.common; + struct nd_interleave_set *nd_set = nd_region->nd_set; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + struct nd_label_ent *label_ent; + struct nd_label_id label_id; + struct resource *res; + unsigned long *free; + u32 nslot, slot; + size_t offset; + u64 cookie; + int rc; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return -ENXIO; + + cookie = nd_region_interleave_set_cookie(nd_region, nsindex); + nd_label_gen_id(&label_id, nspm->uuid, 0); + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + break; + + if (!res) { + WARN_ON_ONCE(1); + return -ENXIO; + } + + /* allocate and write the label to the staging (next) index */ + slot = nd_label_alloc_slot(ndd); + if (slot == UINT_MAX) + return -ENXIO; + dev_dbg(ndd->dev, "allocated: %d\n", slot); + + nd_label = to_label(ndd, slot); + memset(nd_label, 0, sizeof_namespace_label(ndd)); + nsl_set_uuid(ndd, nd_label, nspm->uuid); + nsl_set_name(ndd, nd_label, nspm->alt_name); + nsl_set_flags(ndd, nd_label, flags); + nsl_set_nlabel(ndd, nd_label, nd_region->ndr_mappings); + nsl_set_nrange(ndd, nd_label, 1); + nsl_set_position(ndd, nd_label, pos); + nsl_set_isetcookie(ndd, nd_label, cookie); + nsl_set_rawsize(ndd, nd_label, resource_size(res)); + nsl_set_lbasize(ndd, nd_label, nspm->lbasize); + nsl_set_dpa(ndd, nd_label, res->start); + nsl_set_slot(ndd, nd_label, slot); + nsl_set_type_guid(ndd, nd_label, 
&nd_set->type_guid); + nsl_set_claim_class(ndd, nd_label, ndns->claim_class); + nsl_calculate_checksum(ndd, nd_label); + nd_dbg_dpa(nd_region, ndd, res, "\n"); + + /* update label */ + offset = nd_label_offset(ndd, nd_label); + rc = nvdimm_set_config_data(ndd, offset, nd_label, + sizeof_namespace_label(ndd)); + if (rc < 0) + return rc; + + /* Garbage collect the previous label */ + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + if (!label_ent->label) + continue; + if (test_and_clear_bit(ND_LABEL_REAP, &label_ent->flags) || + nsl_uuid_equal(ndd, label_ent->label, nspm->uuid)) + reap_victim(nd_mapping, label_ent); + } + + /* update index */ + rc = nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); + if (rc == 0) { + list_for_each_entry(label_ent, &nd_mapping->labels, list) + if (!label_ent->label) { + label_ent->label = nd_label; + nd_label = NULL; + break; + } + dev_WARN_ONCE(&nspm->nsio.common.dev, nd_label, + "failed to track label: %d\n", + to_slot(ndd, nd_label)); + if (nd_label) + rc = -ENXIO; + } + mutex_unlock(&nd_mapping->lock); + + return rc; +} + +static int init_labels(struct nd_mapping *nd_mapping, int num_labels) +{ + int i, old_num_labels = 0; + struct nd_label_ent *label_ent; + struct nd_namespace_index *nsindex; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) + old_num_labels++; + mutex_unlock(&nd_mapping->lock); + + /* + * We need to preserve all the old labels for the mapping so + * they can be garbage collected after writing the new labels. + */ + for (i = old_num_labels; i < num_labels; i++) { + label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL); + if (!label_ent) + return -ENOMEM; + mutex_lock(&nd_mapping->lock); + list_add_tail(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); + } + + if (ndd->ns_current == -1 || ndd->ns_next == -1) + /* pass */; + else + return max(num_labels, old_num_labels); + + nsindex = to_namespace_index(ndd, 0); + memset(nsindex, 0, ndd->nsarea.config_size); + for (i = 0; i < 2; i++) { + int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT); + + if (rc) + return rc; + } + ndd->ns_next = 1; + ndd->ns_current = 0; + + return max(num_labels, old_num_labels); +} + +static int del_labels(struct nd_mapping *nd_mapping, uuid_t *uuid) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_ent *label_ent, *e; + struct nd_namespace_index *nsindex; + unsigned long *free; + LIST_HEAD(list); + u32 nslot, slot; + int active = 0; + + if (!uuid) + return 0; + + /* no index || no labels == nothing to delete */ + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return 0; + + mutex_lock(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + + if (!nd_label) + continue; + active++; + if (!nsl_uuid_equal(ndd, nd_label, uuid)) + continue; + active--; + slot = to_slot(ndd, nd_label); + nd_label_free_slot(ndd, slot); + dev_dbg(ndd->dev, "free: %d\n", slot); + list_move_tail(&label_ent->list, &list); + label_ent->label = NULL; + } + list_splice_tail_init(&list, &nd_mapping->labels); + + if (active == 0) { + nd_mapping_free_labels(nd_mapping); + dev_dbg(ndd->dev, "no more active labels\n"); + } + mutex_unlock(&nd_mapping->lock); + + return nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); +} + +int 
nd_pmem_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size) +{ + int i, rc; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + int count = 0; + + if (size == 0) { + rc = del_labels(nd_mapping, nspm->uuid); + if (rc) + return rc; + continue; + } + + for_each_dpa_resource(ndd, res) + if (strncmp(res->name, "pmem", 4) == 0) + count++; + WARN_ON_ONCE(!count); + + rc = init_labels(nd_mapping, count); + if (rc < 0) + return rc; + + rc = __pmem_label_update(nd_region, nd_mapping, nspm, i, + NSLABEL_FLAG_UPDATING); + if (rc) + return rc; + } + + if (size == 0) + return 0; + + /* Clear the UPDATING flag per UEFI 2.7 expectations */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + rc = __pmem_label_update(nd_region, nd_mapping, nspm, i, 0); + if (rc) + return rc; + } + + return 0; +} + +int __init nd_label_init(void) +{ + WARN_ON(guid_parse(NVDIMM_BTT_GUID, &nvdimm_btt_guid)); + WARN_ON(guid_parse(NVDIMM_BTT2_GUID, &nvdimm_btt2_guid)); + WARN_ON(guid_parse(NVDIMM_PFN_GUID, &nvdimm_pfn_guid)); + WARN_ON(guid_parse(NVDIMM_DAX_GUID, &nvdimm_dax_guid)); + + WARN_ON(uuid_parse(NVDIMM_BTT_GUID, &nvdimm_btt_uuid)); + WARN_ON(uuid_parse(NVDIMM_BTT2_GUID, &nvdimm_btt2_uuid)); + WARN_ON(uuid_parse(NVDIMM_PFN_GUID, &nvdimm_pfn_uuid)); + WARN_ON(uuid_parse(NVDIMM_DAX_GUID, &nvdimm_dax_uuid)); + + WARN_ON(uuid_parse(CXL_REGION_UUID, &cxl_region_uuid)); + WARN_ON(uuid_parse(CXL_NAMESPACE_UUID, &cxl_namespace_uuid)); + + return 0; +} diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h new file mode 100644 index 000000000..0650fb4b9 --- /dev/null +++ b/drivers/nvdimm/label.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + */ +#ifndef __LABEL_H__ +#define __LABEL_H__ + +#include <linux/ndctl.h> +#include <linux/sizes.h> +#include <linux/uuid.h> +#include <linux/io.h> + +enum { + NSINDEX_SIG_LEN = 16, + NSINDEX_ALIGN = 256, + NSINDEX_SEQ_MASK = 0x3, + NSLABEL_UUID_LEN = 16, + NSLABEL_NAME_LEN = 64, + NSLABEL_FLAG_ROLABEL = 0x1, /* read-only label */ + NSLABEL_FLAG_LOCAL = 0x2, /* DIMM-local namespace */ + NSLABEL_FLAG_BTT = 0x4, /* namespace contains a BTT */ + NSLABEL_FLAG_UPDATING = 0x8, /* label being updated */ + BTT_ALIGN = 4096, /* all btt structures */ + BTTINFO_SIG_LEN = 16, + BTTINFO_UUID_LEN = 16, + BTTINFO_FLAG_ERROR = 0x1, /* error state (read-only) */ + BTTINFO_MAJOR_VERSION = 1, + ND_LABEL_MIN_SIZE = 256 * 4, /* see sizeof_namespace_index() */ + ND_LABEL_ID_SIZE = 50, + ND_NSINDEX_INIT = 0x1, +}; + +/** + * struct nd_namespace_index - label set superblock + * @sig: NAMESPACE_INDEX\0 + * @flags: placeholder + * @labelsize: log2 size (v1 labels 128 bytes v2 labels 256 bytes) + * @seq: sequence number for this index + * @myoff: offset of this index in label area + * @mysize: size of this index struct + * @otheroff: offset of other index + * @labeloff: offset of first label slot + * @nslot: total number of label slots + * @major: label area major version + * @minor: label area minor version + * @checksum: fletcher64 of all fields + * @free: bitmap, nlabel bits + * + * The size of free[] is rounded up so the total struct size is a + * multiple of NSINDEX_ALIGN bytes. Any bits this allocates beyond + * nlabel bits must be zero. 
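+ *
+ * The fixed fields above total 72 bytes, so after NSINDEX_ALIGN
+ * rounding the smallest possible index block is 256 bytes; see
+ * sizeof_namespace_index() for the complete calculation.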
+ */ +struct nd_namespace_index { + u8 sig[NSINDEX_SIG_LEN]; + u8 flags[3]; + u8 labelsize; + __le32 seq; + __le64 myoff; + __le64 mysize; + __le64 otheroff; + __le64 labeloff; + __le32 nslot; + __le16 major; + __le16 minor; + __le64 checksum; + u8 free[]; +}; + +/** + * struct cxl_region_label - CXL 2.0 Table 211 + * @type: uuid identifying this label format (region) + * @uuid: uuid for the region this label describes + * @flags: NSLABEL_FLAG_UPDATING (all other flags reserved) + * @nlabel: 1 per interleave-way in the region + * @position: this label's position in the set + * @dpa: start address in device-local capacity for this label + * @rawsize: size of this label's contribution to region + * @hpa: mandatory system physical address to map this region + * @slot: slot id of this label in label area + * @ig: interleave granularity (1 << @ig) * 256 bytes + * @align: alignment in SZ_256M blocks + * @reserved: reserved + * @checksum: fletcher64 sum of this label + */ +struct cxl_region_label { + u8 type[NSLABEL_UUID_LEN]; + u8 uuid[NSLABEL_UUID_LEN]; + __le32 flags; + __le16 nlabel; + __le16 position; + __le64 dpa; + __le64 rawsize; + __le64 hpa; + __le32 slot; + __le32 ig; + __le32 align; + u8 reserved[0xac]; + __le64 checksum; +}; + +/** + * struct nvdimm_efi_label - namespace superblock + * @uuid: UUID per RFC 4122 + * @name: optional name (NULL-terminated) + * @flags: see NSLABEL_FLAG_* + * @nlabel: num labels to describe this ns + * @position: labels position in set + * @isetcookie: interleave set cookie + * @lbasize: LBA size in bytes or 0 for pmem + * @dpa: DPA of NVM range on this DIMM + * @rawsize: size of namespace + * @slot: slot of this label in label area + * @align: physical address alignment of the namespace + * @reserved: reserved + * @type_guid: copy of struct acpi_nfit_system_address.range_guid + * @abstraction_guid: personality id (btt, btt2, fsdax, devdax....) + * @reserved2: reserved + * @checksum: fletcher64 sum of this object + */ +struct nvdimm_efi_label { + u8 uuid[NSLABEL_UUID_LEN]; + u8 name[NSLABEL_NAME_LEN]; + __le32 flags; + __le16 nlabel; + __le16 position; + __le64 isetcookie; + __le64 lbasize; + __le64 dpa; + __le64 rawsize; + __le32 slot; + /* + * Accessing fields past this point should be gated by a + * efi_namespace_label_has() check. 
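+	 * (These trailing fields arrived with the v1.2 256-byte label
+	 * format; a v1.1 128-byte label ends at @slot plus reserved
+	 * padding.)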
+ */ + u8 align; + u8 reserved[3]; + guid_t type_guid; + guid_t abstraction_guid; + u8 reserved2[88]; + __le64 checksum; +}; + +/** + * struct nvdimm_cxl_label - CXL 2.0 Table 212 + * @type: uuid identifying this label format (namespace) + * @uuid: uuid for the namespace this label describes + * @name: friendly name for the namespace + * @flags: NSLABEL_FLAG_UPDATING (all other flags reserved) + * @nrange: discontiguous namespace support + * @position: this label's position in the set + * @dpa: start address in device-local capacity for this label + * @rawsize: size of this label's contribution to namespace + * @slot: slot id of this label in label area + * @align: alignment in SZ_256M blocks + * @region_uuid: host interleave set identifier + * @abstraction_uuid: personality driver for this namespace + * @lbasize: address geometry for disk-like personalities + * @reserved: reserved + * @checksum: fletcher64 sum of this label + */ +struct nvdimm_cxl_label { + u8 type[NSLABEL_UUID_LEN]; + u8 uuid[NSLABEL_UUID_LEN]; + u8 name[NSLABEL_NAME_LEN]; + __le32 flags; + __le16 nrange; + __le16 position; + __le64 dpa; + __le64 rawsize; + __le32 slot; + __le32 align; + u8 region_uuid[16]; + u8 abstraction_uuid[16]; + __le16 lbasize; + u8 reserved[0x56]; + __le64 checksum; +}; + +struct nd_namespace_label { + union { + struct nvdimm_cxl_label cxl; + struct nvdimm_efi_label efi; + }; +}; + +#define NVDIMM_BTT_GUID "8aed63a2-29a2-4c66-8b12-f05d15d3922a" +#define NVDIMM_BTT2_GUID "18633bfc-1735-4217-8ac9-17239282d3f8" +#define NVDIMM_PFN_GUID "266400ba-fb9f-4677-bcb0-968f11d0d225" +#define NVDIMM_DAX_GUID "97a86d9c-3cdd-4eda-986f-5068b4f80088" + +#define CXL_REGION_UUID "529d7c61-da07-47c4-a93f-ecdf2c06f444" +#define CXL_NAMESPACE_UUID "68bb2c0a-5a77-4937-9f85-3caf41a0f93c" + +/** + * struct nd_label_id - identifier string for dpa allocation + * @id: "pmem-<namespace uuid>" + */ +struct nd_label_id { + char id[ND_LABEL_ID_SIZE]; +}; + +/* + * If the 'best' index is invalid, so is the 'next' index. Otherwise, + * the next index is MOD(index+1, 2) + */ +static inline int nd_label_next_nsindex(int index) +{ + if (index < 0) + return -1; + + return (index + 1) % 2; +} + +struct nvdimm_drvdata; +int nd_label_data_init(struct nvdimm_drvdata *ndd); +size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd); +int nd_label_active_count(struct nvdimm_drvdata *ndd); +struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n); +u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd); +bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot); +u32 nd_label_nfree(struct nvdimm_drvdata *ndd); +struct nd_region; +struct nd_namespace_pmem; +int nd_pmem_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size); +#endif /* __LABEL_H__ */ diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c new file mode 100644 index 000000000..c60ec0b37 --- /dev/null +++ b/drivers/nvdimm/namespace_devs.c @@ -0,0 +1,2229 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#include <linux/module.h> +#include <linux/device.h> +#include <linux/sort.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "pmem.h" +#include "pfn.h" +#include "nd.h" + +static void namespace_io_release(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + kfree(nsio); +} + +static void namespace_pmem_release(struct device *dev) +{ + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + + if (nspm->id >= 0) + ida_simple_remove(&nd_region->ns_ida, nspm->id); + kfree(nspm->alt_name); + kfree(nspm->uuid); + kfree(nspm); +} + +static bool is_namespace_pmem(const struct device *dev); +static bool is_namespace_io(const struct device *dev); + +static int is_uuid_busy(struct device *dev, void *data) +{ + uuid_t *uuid1 = data, *uuid2 = NULL; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid2 = nspm->uuid; + } else if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + uuid2 = nd_btt->uuid; + } else if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + uuid2 = nd_pfn->uuid; + } + + if (uuid2 && uuid_equal(uuid1, uuid2)) + return -EBUSY; + + return 0; +} + +static int is_namespace_uuid_busy(struct device *dev, void *data) +{ + if (is_nd_region(dev)) + return device_for_each_child(dev, data, is_uuid_busy); + return 0; +} + +/** + * nd_is_uuid_unique - verify that no other namespace has @uuid + * @dev: any device on a nvdimm_bus + * @uuid: uuid to check + */ +bool nd_is_uuid_unique(struct device *dev, uuid_t *uuid) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev)); + if (device_for_each_child(&nvdimm_bus->dev, uuid, + is_namespace_uuid_busy) != 0) + return false; + return true; +} + +bool pmem_should_map_pages(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_namespace_common *ndns = to_ndns(dev); + struct nd_namespace_io *nsio; + + if (!IS_ENABLED(CONFIG_ZONE_DEVICE)) + return false; + + if (!test_bit(ND_REGION_PAGEMAP, &nd_region->flags)) + return false; + + if (is_nd_pfn(dev) || is_nd_btt(dev)) + return false; + + if (ndns->force_raw) + return false; + + nsio = to_nd_namespace_io(dev); + if (region_intersects(nsio->res.start, resource_size(&nsio->res), + IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE) == REGION_MIXED) + return false; + + return ARCH_MEMREMAP_PMEM == MEMREMAP_WB; +} +EXPORT_SYMBOL(pmem_should_map_pages); + +unsigned int pmem_sector_size(struct nd_namespace_common *ndns) +{ + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + if (nspm->lbasize == 0 || nspm->lbasize == 512) + /* default */; + else if (nspm->lbasize == 4096) + return 4096; + else + dev_WARN(&ndns->dev, "unsupported sector size: %ld\n", + nspm->lbasize); + } + + /* + * There is no namespace label (is_namespace_io()), or the label + * indicates the default sector size. 
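+	 *
+	 * Net effect of the checks above: an lbasize of 0 or 512
+	 * yields 512-byte sectors, 4096 yields 4K sectors, and any
+	 * other value warns and falls back to 512.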
+ */ + return 512; +} +EXPORT_SYMBOL(pmem_sector_size); + +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name) +{ + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + const char *suffix = NULL; + + if (ndns->claim && is_nd_btt(ndns->claim)) + suffix = "s"; + + if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) { + int nsidx = 0; + + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + nsidx = nspm->id; + } + + if (nsidx) + sprintf(name, "pmem%d.%d%s", nd_region->id, nsidx, + suffix ? suffix : ""); + else + sprintf(name, "pmem%d%s", nd_region->id, + suffix ? suffix : ""); + } else { + return NULL; + } + + return name; +} +EXPORT_SYMBOL(nvdimm_namespace_disk_name); + +const uuid_t *nd_dev_to_uuid(struct device *dev) +{ + if (dev && is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return nspm->uuid; + } + return &uuid_null; +} +EXPORT_SYMBOL(nd_dev_to_uuid); + +static ssize_t nstype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + + return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region)); +} +static DEVICE_ATTR_RO(nstype); + +static ssize_t __alt_name_store(struct device *dev, const char *buf, + const size_t len) +{ + char *input, *pos, *alt_name, **ns_altname; + ssize_t rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_altname = &nspm->alt_name; + } else + return -ENXIO; + + if (dev->driver || to_ndns(dev)->claim) + return -EBUSY; + + input = kstrndup(buf, len, GFP_KERNEL); + if (!input) + return -ENOMEM; + + pos = strim(input); + if (strlen(pos) + 1 > NSLABEL_NAME_LEN) { + rc = -EINVAL; + goto out; + } + + alt_name = kzalloc(NSLABEL_NAME_LEN, GFP_KERNEL); + if (!alt_name) { + rc = -ENOMEM; + goto out; + } + kfree(*ns_altname); + *ns_altname = alt_name; + sprintf(*ns_altname, "%s", pos); + rc = len; + +out: + kfree(input); + return rc; +} + +static int nd_namespace_label_update(struct nd_region *nd_region, + struct device *dev) +{ + dev_WARN_ONCE(dev, dev->driver || to_ndns(dev)->claim, + "namespace must be idle during label update\n"); + if (dev->driver || to_ndns(dev)->claim) + return 0; + + /* + * Only allow label writes that will result in a valid namespace + * or deletion of an existing namespace. + */ + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + resource_size_t size = resource_size(&nspm->nsio.res); + + if (size == 0 && nspm->uuid) + /* delete allocation */; + else if (!nspm->uuid) + return 0; + + return nd_pmem_namespace_label_update(nd_region, nspm, size); + } else + return -ENXIO; +} + +static ssize_t alt_name_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __alt_name_store(dev, buf, len); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + dev_dbg(dev, "%s(%zd)\n", rc < 0 ? "fail " : "", rc); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? 
rc : len; +} + +static ssize_t alt_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + char *ns_altname; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_altname = nspm->alt_name; + } else + return -ENXIO; + + return sprintf(buf, "%s\n", ns_altname ? ns_altname : ""); +} +static DEVICE_ATTR_RW(alt_name); + +static int scan_free(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id, + resource_size_t n) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + int rc = 0; + + while (n) { + struct resource *res, *last; + + last = NULL; + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id->id) == 0) + last = res; + res = last; + if (!res) + return 0; + + if (n >= resource_size(res)) { + n -= resource_size(res); + nd_dbg_dpa(nd_region, ndd, res, "delete %d\n", rc); + nvdimm_free_dpa(ndd, res); + /* retry with last resource deleted */ + continue; + } + + rc = adjust_resource(res, res->start, resource_size(res) - n); + if (rc == 0) + res->flags |= DPA_RESOURCE_ADJUSTED; + nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc); + break; + } + + return rc; +} + +/** + * shrink_dpa_allocation - for each dimm in region free n bytes for label_id + * @nd_region: the set of dimms to reclaim @n bytes from + * @label_id: unique identifier for the namespace consuming this dpa range + * @n: number of bytes per-dimm to release + * + * Assumes resources are ordered. Starting from the end try to + * adjust_resource() the allocation to @n, but if @n is larger than the + * allocation delete it and find the 'new' last allocation in the label + * set. + */ +static int shrink_dpa_allocation(struct nd_region *nd_region, + struct nd_label_id *label_id, resource_size_t n) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + int rc; + + rc = scan_free(nd_region, nd_mapping, label_id, n); + if (rc) + return rc; + } + + return 0; +} + +static resource_size_t init_dpa_allocation(struct nd_label_id *label_id, + struct nd_region *nd_region, struct nd_mapping *nd_mapping, + resource_size_t n) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + int rc = 0; + + /* first resource allocation for this label-id or dimm */ + res = nvdimm_allocate_dpa(ndd, label_id, nd_mapping->start, n); + if (!res) + rc = -EBUSY; + + nd_dbg_dpa(nd_region, ndd, res, "init %d\n", rc); + return rc ? n : 0; +} + + +/** + * space_valid() - validate free dpa space against constraints + * @nd_region: hosting region of the free space + * @ndd: dimm device data for debug + * @label_id: namespace id to allocate space + * @prev: potential allocation that precedes free space + * @next: allocation that follows the given free space range + * @exist: first allocation with same id in the mapping + * @n: range that must satisfied for pmem allocations + * @valid: free space range to validate + * + * BLK-space is valid as long as it does not precede a PMEM + * allocation in a given region. PMEM-space must be contiguous + * and adjacent to an existing allocation (if one + * exists). If reserving PMEM any space is valid. 
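+ *
+ * The per-mapping alignment applied below is the region alignment
+ * divided by the number of mappings: with illustrative numbers, a
+ * 2M-aligned region interleaved across 4 DIMMs needs each DIMM's
+ * contribution to be 512K aligned.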
+ */ +static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, struct resource *prev, + struct resource *next, struct resource *exist, + resource_size_t n, struct resource *valid) +{ + bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; + unsigned long align; + + align = nd_region->align / nd_region->ndr_mappings; + valid->start = ALIGN(valid->start, align); + valid->end = ALIGN_DOWN(valid->end + 1, align) - 1; + + if (valid->start >= valid->end) + goto invalid; + + if (is_reserve) + return; + + /* allocation needs to be contiguous, so this is all or nothing */ + if (resource_size(valid) < n) + goto invalid; + + /* we've got all the space we need and no existing allocation */ + if (!exist) + return; + + /* allocation needs to be contiguous with the existing namespace */ + if (valid->start == exist->end + 1 + || valid->end == exist->start - 1) + return; + + invalid: + /* truncate @valid size to 0 */ + valid->end = valid->start - 1; +} + +enum alloc_loc { + ALLOC_ERR = 0, ALLOC_BEFORE, ALLOC_MID, ALLOC_AFTER, +}; + +static resource_size_t scan_allocate(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id, + resource_size_t n) +{ + resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res, *exist = NULL, valid; + const resource_size_t to_allocate = n; + int first; + + for_each_dpa_resource(ndd, res) + if (strcmp(label_id->id, res->name) == 0) + exist = res; + + valid.start = nd_mapping->start; + valid.end = mapping_end; + valid.name = "free space"; + retry: + first = 0; + for_each_dpa_resource(ndd, res) { + struct resource *next = res->sibling, *new_res = NULL; + resource_size_t allocate, available = 0; + enum alloc_loc loc = ALLOC_ERR; + const char *action; + int rc = 0; + + /* ignore resources outside this nd_mapping */ + if (res->start > mapping_end) + continue; + if (res->end < nd_mapping->start) + continue; + + /* space at the beginning of the mapping */ + if (!first++ && res->start > nd_mapping->start) { + valid.start = nd_mapping->start; + valid.end = res->start - 1; + space_valid(nd_region, ndd, label_id, NULL, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) + loc = ALLOC_BEFORE; + } + + /* space between allocations */ + if (!loc && next) { + valid.start = res->start + resource_size(res); + valid.end = min(mapping_end, next->start - 1); + space_valid(nd_region, ndd, label_id, res, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) + loc = ALLOC_MID; + } + + /* space at the end of the mapping */ + if (!loc && !next) { + valid.start = res->start + resource_size(res); + valid.end = mapping_end; + space_valid(nd_region, ndd, label_id, res, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) + loc = ALLOC_AFTER; + } + + if (!loc || !available) + continue; + allocate = min(available, n); + switch (loc) { + case ALLOC_BEFORE: + if (strcmp(res->name, label_id->id) == 0) { + /* adjust current resource up */ + rc = adjust_resource(res, res->start - allocate, + resource_size(res) + allocate); + action = "cur grow up"; + } else + action = "allocate"; + break; + case ALLOC_MID: + if (strcmp(next->name, label_id->id) == 0) { + /* adjust next resource up */ + rc = adjust_resource(next, next->start + - allocate, resource_size(next) + + allocate); + new_res = next; + action = "next grow up"; + 
} else if (strcmp(res->name, label_id->id) == 0) { + action = "grow down"; + } else + action = "allocate"; + break; + case ALLOC_AFTER: + if (strcmp(res->name, label_id->id) == 0) + action = "grow down"; + else + action = "allocate"; + break; + default: + return n; + } + + if (strcmp(action, "allocate") == 0) { + new_res = nvdimm_allocate_dpa(ndd, label_id, + valid.start, allocate); + if (!new_res) + rc = -EBUSY; + } else if (strcmp(action, "grow down") == 0) { + /* adjust current resource down */ + rc = adjust_resource(res, res->start, resource_size(res) + + allocate); + if (rc == 0) + res->flags |= DPA_RESOURCE_ADJUSTED; + } + + if (!new_res) + new_res = res; + + nd_dbg_dpa(nd_region, ndd, new_res, "%s(%d) %d\n", + action, loc, rc); + + if (rc) + return n; + + n -= allocate; + if (n) { + /* + * Retry scan with newly inserted resources. + * For example, if we did an ALLOC_BEFORE + * insertion there may also have been space + * available for an ALLOC_AFTER insertion, so we + * need to check this same resource again + */ + goto retry; + } else + return 0; + } + + if (n == to_allocate) + return init_dpa_allocation(label_id, nd_region, nd_mapping, n); + return n; +} + +static int merge_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + if (strncmp("pmem", label_id->id, 4) == 0) + return 0; + retry: + for_each_dpa_resource(ndd, res) { + int rc; + struct resource *next = res->sibling; + resource_size_t end = res->start + resource_size(res); + + if (!next || strcmp(res->name, label_id->id) != 0 + || strcmp(next->name, label_id->id) != 0 + || end != next->start) + continue; + end += resource_size(next); + nvdimm_free_dpa(ndd, next); + rc = adjust_resource(res, res->start, end - res->start); + nd_dbg_dpa(nd_region, ndd, res, "merge %d\n", rc); + if (rc) + return rc; + res->flags |= DPA_RESOURCE_ADJUSTED; + goto retry; + } + + return 0; +} + +int __reserve_free_pmem(struct device *dev, void *data) +{ + struct nvdimm *nvdimm = data; + struct nd_region *nd_region; + struct nd_label_id label_id; + int i; + + if (!is_memory(dev)) + return 0; + + nd_region = to_nd_region(dev); + if (nd_region->ndr_mappings == 0) + return 0; + + memset(&label_id, 0, sizeof(label_id)); + strcat(label_id.id, "pmem-reserve"); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + resource_size_t n, rem = 0; + + if (nd_mapping->nvdimm != nvdimm) + continue; + + n = nd_pmem_available_dpa(nd_region, nd_mapping); + if (n == 0) + return 0; + rem = scan_allocate(nd_region, nd_mapping, &label_id, n); + dev_WARN_ONCE(&nd_region->dev, rem, + "pmem reserve underrun: %#llx of %#llx bytes\n", + (unsigned long long) n - rem, + (unsigned long long) n); + return rem ? -ENXIO : 0; + } + + return 0; +} + +void release_free_pmem(struct nvdimm_bus *nvdimm_bus, + struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res, *_res; + + for_each_dpa_resource_safe(ndd, res, _res) + if (strcmp(res->name, "pmem-reserve") == 0) + nvdimm_free_dpa(ndd, res); +} + +/** + * grow_dpa_allocation - for each dimm allocate n bytes for @label_id + * @nd_region: the set of dimms to allocate @n more bytes from + * @label_id: unique identifier for the namespace consuming this dpa range + * @n: number of bytes per-dimm to add to the existing allocation + * + * Assumes resources are ordered. 
For BLK regions, first consume + * BLK-only available DPA free space, then consume PMEM-aliased DPA + * space starting at the highest DPA. For PMEM regions start + * allocations from the start of an interleave set and end at the first + * BLK allocation or the end of the interleave set, whichever comes + * first. + */ +static int grow_dpa_allocation(struct nd_region *nd_region, + struct nd_label_id *label_id, resource_size_t n) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + resource_size_t rem = n; + int rc; + + rem = scan_allocate(nd_region, nd_mapping, label_id, rem); + dev_WARN_ONCE(&nd_region->dev, rem, + "allocation underrun: %#llx of %#llx bytes\n", + (unsigned long long) n - rem, + (unsigned long long) n); + if (rem) + return -ENXIO; + + rc = merge_dpa(nd_region, nd_mapping, label_id); + if (rc) + return rc; + } + + return 0; +} + +static void nd_namespace_pmem_set_resource(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size) +{ + struct resource *res = &nspm->nsio.res; + resource_size_t offset = 0; + + if (size && !nspm->uuid) { + WARN_ON_ONCE(1); + size = 0; + } + + if (size && nspm->uuid) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_id label_id; + struct resource *res; + + if (!ndd) { + size = 0; + goto out; + } + + nd_label_gen_id(&label_id, nspm->uuid, 0); + + /* calculate a spa offset from the dpa allocation offset */ + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) { + offset = (res->start - nd_mapping->start) + * nd_region->ndr_mappings; + goto out; + } + + WARN_ON_ONCE(1); + size = 0; + } + + out: + res->start = nd_region->ndr_start + offset; + res->end = res->start + size - 1; +} + +static bool uuid_not_set(const uuid_t *uuid, struct device *dev, + const char *where) +{ + if (!uuid) { + dev_dbg(dev, "%s: uuid not set\n", where); + return true; + } + return false; +} + +static ssize_t __size_store(struct device *dev, unsigned long long val) +{ + resource_size_t allocated = 0, available = 0; + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_namespace_common *ndns = to_ndns(dev); + struct nd_mapping *nd_mapping; + struct nvdimm_drvdata *ndd; + struct nd_label_id label_id; + u32 flags = 0, remainder; + int rc, i, id = -1; + uuid_t *uuid = NULL; + + if (dev->driver || ndns->claim) + return -EBUSY; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + id = nspm->id; + } + + /* + * We need a uuid for the allocation-label and dimm(s) on which + * to store the label. + */ + if (uuid_not_set(uuid, dev, __func__)) + return -ENXIO; + if (nd_region->ndr_mappings == 0) { + dev_dbg(dev, "not associated with dimm(s)\n"); + return -ENXIO; + } + + div_u64_rem(val, nd_region->align, &remainder); + if (remainder) { + dev_dbg(dev, "%llu is not %ldK aligned\n", val, + nd_region->align / SZ_1K); + return -EINVAL; + } + + nd_label_gen_id(&label_id, uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + ndd = to_ndd(nd_mapping); + + /* + * All dimms in an interleave set, need to be enabled + * for the size to be changed. 
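+		 * (to_ndd() returns NULL for a disabled DIMM, in which
+		 * case the allocated / available accounting below would
+		 * be incomplete.)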
+ */ + if (!ndd) + return -ENXIO; + + allocated += nvdimm_allocated_dpa(ndd, &label_id); + } + available = nd_region_allocatable_dpa(nd_region); + + if (val > available + allocated) + return -ENOSPC; + + if (val == allocated) + return 0; + + val = div_u64(val, nd_region->ndr_mappings); + allocated = div_u64(allocated, nd_region->ndr_mappings); + if (val < allocated) + rc = shrink_dpa_allocation(nd_region, &label_id, + allocated - val); + else + rc = grow_dpa_allocation(nd_region, &label_id, val - allocated); + + if (rc) + return rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + nd_namespace_pmem_set_resource(nd_region, nspm, + val * nd_region->ndr_mappings); + } + + /* + * Try to delete the namespace if we deleted all of its + * allocation, this is not the seed or 0th device for the + * region, and it is not actively claimed by a btt, pfn, or dax + * instance. + */ + if (val == 0 && id != 0 && nd_region->ns_seed != dev && !ndns->claim) + nd_device_unregister(dev, ND_ASYNC); + + return rc; +} + +static ssize_t size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + unsigned long long val; + int rc; + + rc = kstrtoull(buf, 0, &val); + if (rc) + return rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __size_store(dev, val); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + + /* setting size zero == 'delete namespace' */ + if (rc == 0 && val == 0 && is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + kfree(nspm->uuid); + nspm->uuid = NULL; + } + + dev_dbg(dev, "%llx %s (%d)\n", val, rc < 0 ? "fail" : "success", rc); + + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? 
rc : len; +} + +resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns) +{ + struct device *dev = &ndns->dev; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return resource_size(&nspm->nsio.res); + } else if (is_namespace_io(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + return resource_size(&nsio->res); + } else + WARN_ONCE(1, "unknown namespace type\n"); + return 0; +} + +resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns) +{ + resource_size_t size; + + nvdimm_bus_lock(&ndns->dev); + size = __nvdimm_namespace_capacity(ndns); + nvdimm_bus_unlock(&ndns->dev); + + return size; +} +EXPORT_SYMBOL(nvdimm_namespace_capacity); + +bool nvdimm_namespace_locked(struct nd_namespace_common *ndns) +{ + int i; + bool locked = false; + struct device *dev = &ndns->dev; + struct nd_region *nd_region = to_nd_region(dev->parent); + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (test_bit(NDD_LOCKED, &nvdimm->flags)) { + dev_dbg(dev, "%s locked\n", nvdimm_name(nvdimm)); + locked = true; + } + } + return locked; +} +EXPORT_SYMBOL(nvdimm_namespace_locked); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long) + nvdimm_namespace_capacity(to_ndns(dev))); +} +static DEVICE_ATTR(size, 0444, size_show, size_store); + +static uuid_t *namespace_to_uuid(struct device *dev) +{ + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return nspm->uuid; + } + return ERR_PTR(-ENXIO); +} + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + uuid_t *uuid = namespace_to_uuid(dev); + + if (IS_ERR(uuid)) + return PTR_ERR(uuid); + if (uuid) + return sprintf(buf, "%pUb\n", uuid); + return sprintf(buf, "\n"); +} + +/** + * namespace_update_uuid - check for a unique uuid and whether we're "renaming" + * @nd_region: parent region so we can updates all dimms in the set + * @dev: namespace type for generating label_id + * @new_uuid: incoming uuid + * @old_uuid: reference to the uuid storage location in the namespace object + */ +static int namespace_update_uuid(struct nd_region *nd_region, + struct device *dev, uuid_t *new_uuid, + uuid_t **old_uuid) +{ + struct nd_label_id old_label_id; + struct nd_label_id new_label_id; + int i; + + if (!nd_is_uuid_unique(dev, new_uuid)) + return -EINVAL; + + if (*old_uuid == NULL) + goto out; + + /* + * If we've already written a label with this uuid, then it's + * too late to rename because we can't reliably update the uuid + * without losing the old namespace. Userspace must delete this + * namespace to abandon the old uuid. + */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + /* + * This check by itself is sufficient because old_uuid + * would be NULL above if this uuid did not exist in the + * currently written set. + * + * FIXME: can we delete uuid with zero dpa allocated? 
+ */ + if (list_empty(&nd_mapping->labels)) + return -EBUSY; + } + + nd_label_gen_id(&old_label_id, *old_uuid, 0); + nd_label_gen_id(&new_label_id, new_uuid, 0); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_ent *label_ent; + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, old_label_id.id) == 0) + sprintf((void *) res->name, "%s", + new_label_id.id); + + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + struct nd_label_id label_id; + uuid_t uuid; + + if (!nd_label) + continue; + nsl_get_uuid(ndd, nd_label, &uuid); + nd_label_gen_id(&label_id, &uuid, + nsl_get_flags(ndd, nd_label)); + if (strcmp(old_label_id.id, label_id.id) == 0) + set_bit(ND_LABEL_REAP, &label_ent->flags); + } + mutex_unlock(&nd_mapping->lock); + } + kfree(*old_uuid); + out: + *old_uuid = new_uuid; + return 0; +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + uuid_t *uuid = NULL; + uuid_t **ns_uuid; + ssize_t rc = 0; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_uuid = &nspm->uuid; + } else + return -ENXIO; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + if (to_ndns(dev)->claim) + rc = -EBUSY; + if (rc >= 0) + rc = nd_uuid_store(dev, &uuid, buf, len); + if (rc >= 0) + rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + else + kfree(uuid); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? 
rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct resource *res; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + res = &nspm->nsio.res; + } else if (is_namespace_io(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + res = &nsio->res; + } else + return -ENXIO; + + /* no address to convey if the namespace has no allocation */ + if (resource_size(res) == 0) + return -ENXIO; + return sprintf(buf, "%#llx\n", (unsigned long long) res->start); +} +static DEVICE_ATTR_ADMIN_RO(resource); + +static const unsigned long pmem_lbasize_supported[] = { 512, 4096, 0 }; + +static ssize_t sector_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return nd_size_select_show(nspm->lbasize, + pmem_lbasize_supported, buf); + } + return -ENXIO; +} + +static ssize_t sector_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + const unsigned long *supported; + unsigned long *lbasize; + ssize_t rc = 0; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + lbasize = &nspm->lbasize; + supported = pmem_lbasize_supported; + } else + return -ENXIO; + + device_lock(dev); + nvdimm_bus_lock(dev); + if (to_ndns(dev)->claim) + rc = -EBUSY; + if (rc >= 0) + rc = nd_size_select_store(dev, buf, lbasize, supported); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + dev_dbg(dev, "result: %zd %s: %s%s", rc, rc < 0 ? "tried" : "wrote", + buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(sector_size); + +static ssize_t dpa_extents_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_label_id label_id; + uuid_t *uuid = NULL; + int count = 0, i; + u32 flags = 0; + + nvdimm_bus_lock(dev); + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + flags = 0; + } + + if (!uuid) + goto out; + + nd_label_gen_id(&label_id, uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + count++; + } + out: + nvdimm_bus_unlock(dev); + + return sprintf(buf, "%d\n", count); +} +static DEVICE_ATTR_RO(dpa_extents); + +static int btt_claim_class(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + int i, loop_bitmask = 0; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_index *nsindex; + + /* + * If any of the DIMMs do not support labels the only + * possible BTT format is v1. 
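+		 * (The BTT2 personality is recorded via the label's
+		 * abstraction_guid, which v1.1 labels lack.)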
+ */
+ if (!ndd) {
+ loop_bitmask = 0;
+ break;
+ }
+
+ nsindex = to_namespace_index(ndd, ndd->ns_current);
+ if (nsindex == NULL)
+ loop_bitmask |= 1;
+ else {
+ /* check whether existing labels are v1.1 or v1.2 */
+ if (__le16_to_cpu(nsindex->major) == 1
+ && __le16_to_cpu(nsindex->minor) == 1)
+ loop_bitmask |= 2;
+ else
+ loop_bitmask |= 4;
+ }
+ }
+ /*
+ * If nsindex is NULL, loop_bitmask's bit 0 will be set, and if an index
+ * block is found, a v1.1 label for any mapping will set bit 1, and a
+ * v1.2 label will set bit 2.
+ *
+ * At the end of the loop, at most one of the three bits should be set.
+ * If multiple bits were set, it means the different mappings disagree
+ * about their labels, and this must be cleaned up first.
+ *
+ * If all the label index blocks agree, a NULL nsindex implies the
+ * labels haven't been initialized yet; when they are, they will use
+ * the 1.2 format, so we can assume BTT2.0.
+ *
+ * If 1.1 labels are found, we enforce BTT1.1, and if 1.2 labels are
+ * found, we enforce BTT2.0.
+ *
+ * If the loop was never entered, default to BTT1.1 (legacy namespaces).
+ */
+ switch (loop_bitmask) {
+ case 0:
+ case 2:
+ return NVDIMM_CCLASS_BTT;
+ case 1:
+ case 4:
+ return NVDIMM_CCLASS_BTT2;
+ default:
+ return -ENXIO;
+ }
+}
+
+static ssize_t holder_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_namespace_common *ndns = to_ndns(dev);
+ ssize_t rc;
+
+ device_lock(dev);
+ rc = sprintf(buf, "%s\n", ndns->claim ? dev_name(ndns->claim) : "");
+ device_unlock(dev);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(holder);
+
+static int __holder_class_store(struct device *dev, const char *buf)
+{
+ struct nd_namespace_common *ndns = to_ndns(dev);
+
+ if (dev->driver || ndns->claim)
+ return -EBUSY;
+
+ if (sysfs_streq(buf, "btt")) {
+ int rc = btt_claim_class(dev);
+
+ if (rc < NVDIMM_CCLASS_NONE)
+ return rc;
+ ndns->claim_class = rc;
+ } else if (sysfs_streq(buf, "pfn"))
+ ndns->claim_class = NVDIMM_CCLASS_PFN;
+ else if (sysfs_streq(buf, "dax"))
+ ndns->claim_class = NVDIMM_CCLASS_DAX;
+ else if (sysfs_streq(buf, ""))
+ ndns->claim_class = NVDIMM_CCLASS_NONE;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t holder_class_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct nd_region *nd_region = to_nd_region(dev->parent);
+ int rc;
+
+ device_lock(dev);
+ nvdimm_bus_lock(dev);
+ wait_nvdimm_bus_probe_idle(dev);
+ rc = __holder_class_store(dev, buf);
+ if (rc >= 0)
+ rc = nd_namespace_label_update(nd_region, dev);
+ dev_dbg(dev, "%s(%d)\n", rc < 0 ? "fail " : "", rc);
+ nvdimm_bus_unlock(dev);
+ device_unlock(dev);
+
+ return rc < 0 ?
rc : len; +} + +static ssize_t holder_class_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + ssize_t rc; + + device_lock(dev); + if (ndns->claim_class == NVDIMM_CCLASS_NONE) + rc = sprintf(buf, "\n"); + else if ((ndns->claim_class == NVDIMM_CCLASS_BTT) || + (ndns->claim_class == NVDIMM_CCLASS_BTT2)) + rc = sprintf(buf, "btt\n"); + else if (ndns->claim_class == NVDIMM_CCLASS_PFN) + rc = sprintf(buf, "pfn\n"); + else if (ndns->claim_class == NVDIMM_CCLASS_DAX) + rc = sprintf(buf, "dax\n"); + else + rc = sprintf(buf, "<unknown>\n"); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(holder_class); + +static ssize_t mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + struct device *claim; + char *mode; + ssize_t rc; + + device_lock(dev); + claim = ndns->claim; + if (claim && is_nd_btt(claim)) + mode = "safe"; + else if (claim && is_nd_pfn(claim)) + mode = "memory"; + else if (claim && is_nd_dax(claim)) + mode = "dax"; + else if (!claim && pmem_should_map_pages(dev)) + mode = "memory"; + else + mode = "raw"; + rc = sprintf(buf, "%s\n", mode); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(mode); + +static ssize_t force_raw_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool force_raw; + int rc = strtobool(buf, &force_raw); + + if (rc) + return rc; + + to_ndns(dev)->force_raw = force_raw; + return len; +} + +static ssize_t force_raw_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", to_ndns(dev)->force_raw); +} +static DEVICE_ATTR_RW(force_raw); + +static struct attribute *nd_namespace_attributes[] = { + &dev_attr_nstype.attr, + &dev_attr_size.attr, + &dev_attr_mode.attr, + &dev_attr_uuid.attr, + &dev_attr_holder.attr, + &dev_attr_resource.attr, + &dev_attr_alt_name.attr, + &dev_attr_force_raw.attr, + &dev_attr_sector_size.attr, + &dev_attr_dpa_extents.attr, + &dev_attr_holder_class.attr, + NULL, +}; + +static umode_t namespace_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + + if (is_namespace_pmem(dev)) { + if (a == &dev_attr_size.attr) + return 0644; + + return a->mode; + } + + /* base is_namespace_io() attributes */ + if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr || + a == &dev_attr_holder.attr || a == &dev_attr_holder_class.attr || + a == &dev_attr_force_raw.attr || a == &dev_attr_mode.attr || + a == &dev_attr_resource.attr) + return a->mode; + + return 0; +} + +static struct attribute_group nd_namespace_attribute_group = { + .attrs = nd_namespace_attributes, + .is_visible = namespace_visible, +}; + +static const struct attribute_group *nd_namespace_attribute_groups[] = { + &nd_device_attribute_group, + &nd_namespace_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +static const struct device_type namespace_io_device_type = { + .name = "nd_namespace_io", + .release = namespace_io_release, + .groups = nd_namespace_attribute_groups, +}; + +static const struct device_type namespace_pmem_device_type = { + .name = "nd_namespace_pmem", + .release = namespace_pmem_release, + .groups = nd_namespace_attribute_groups, +}; + +static bool is_namespace_pmem(const struct device *dev) +{ + return dev ? dev->type == &namespace_pmem_device_type : false; +} + +static bool is_namespace_io(const struct device *dev) +{ + return dev ? 
dev->type == &namespace_io_device_type : false;
+}
+
+struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
+{
+ struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
+ struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL;
+ struct nd_dax *nd_dax = is_nd_dax(dev) ? to_nd_dax(dev) : NULL;
+ struct nd_namespace_common *ndns = NULL;
+ resource_size_t size;
+
+ if (nd_btt || nd_pfn || nd_dax) {
+ if (nd_btt)
+ ndns = nd_btt->ndns;
+ else if (nd_pfn)
+ ndns = nd_pfn->ndns;
+ else if (nd_dax)
+ ndns = nd_dax->nd_pfn.ndns;
+
+ if (!ndns)
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * Flush any in-progress probes / removals in the driver
+ * for the raw personality of this namespace.
+ */
+ device_lock(&ndns->dev);
+ device_unlock(&ndns->dev);
+ if (ndns->dev.driver) {
+ dev_dbg(&ndns->dev, "is active, can't bind %s\n",
+ dev_name(dev));
+ return ERR_PTR(-EBUSY);
+ }
+ if (dev_WARN_ONCE(&ndns->dev, ndns->claim != dev,
+ "host (%s) vs claim (%s) mismatch\n",
+ dev_name(dev),
+ dev_name(ndns->claim)))
+ return ERR_PTR(-ENXIO);
+ } else {
+ ndns = to_ndns(dev);
+ if (ndns->claim) {
+ dev_dbg(dev, "claimed by %s, failing probe\n",
+ dev_name(ndns->claim));
+
+ return ERR_PTR(-ENXIO);
+ }
+ }
+
+ if (nvdimm_namespace_locked(ndns))
+ return ERR_PTR(-EACCES);
+
+ size = nvdimm_namespace_capacity(ndns);
+ if (size < ND_MIN_NAMESPACE_SIZE) {
+ dev_dbg(&ndns->dev, "%pa, too small must be at least %#x\n",
+ &size, ND_MIN_NAMESPACE_SIZE);
+ return ERR_PTR(-ENODEV);
+ }
+
+ /*
+ * Note, alignment validation for fsdax and devdax mode
+ * namespaces happens in nd_pfn_validate() where infoblock
+ * padding parameters can be applied.
+ */
+ if (pmem_should_map_pages(dev)) {
+ struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+ struct resource *res = &nsio->res;
+
+ if (!IS_ALIGNED(res->start | (res->end + 1),
+ memremap_compat_align())) {
+ dev_err(&ndns->dev, "%pr misaligned, unable to map\n", res);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ }
+
+ if (is_namespace_pmem(&ndns->dev)) {
+ struct nd_namespace_pmem *nspm;
+
+ nspm = to_nd_namespace_pmem(&ndns->dev);
+ if (uuid_not_set(nspm->uuid, &ndns->dev, __func__))
+ return ERR_PTR(-ENODEV);
+ }
+
+ return ndns;
+}
+EXPORT_SYMBOL(nvdimm_namespace_common_probe);
+
+int devm_namespace_enable(struct device *dev, struct nd_namespace_common *ndns,
+ resource_size_t size)
+{
+ return devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev), size);
+}
+EXPORT_SYMBOL_GPL(devm_namespace_enable);
+
+void devm_namespace_disable(struct device *dev, struct nd_namespace_common *ndns)
+{
+ devm_nsio_disable(dev, to_nd_namespace_io(&ndns->dev));
+}
+EXPORT_SYMBOL_GPL(devm_namespace_disable);
+
+static struct device **create_namespace_io(struct nd_region *nd_region)
+{
+ struct nd_namespace_io *nsio;
+ struct device *dev, **devs;
+ struct resource *res;
+
+ nsio = kzalloc(sizeof(*nsio), GFP_KERNEL);
+ if (!nsio)
+ return NULL;
+
+ devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL);
+ if (!devs) {
+ kfree(nsio);
+ return NULL;
+ }
+
+ dev = &nsio->common.dev;
+ dev->type = &namespace_io_device_type;
+ dev->parent = &nd_region->dev;
+ res = &nsio->res;
+ res->name = dev_name(&nd_region->dev);
+ res->flags = IORESOURCE_MEM;
+ res->start = nd_region->ndr_start;
+ res->end = res->start + nd_region->ndr_size - 1;
+
+ devs[0] = dev;
+ return devs;
+}
+
+static bool has_uuid_at_pos(struct nd_region *nd_region, const uuid_t *uuid,
+ u64 cookie, u16 pos)
+{
+ struct nd_namespace_label *found = NULL;
+ int i;
+
+ for (i = 0; i <
nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nd_interleave_set *nd_set = nd_region->nd_set;
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_ent *label_ent;
+ bool found_uuid = false;
+
+ list_for_each_entry(label_ent, &nd_mapping->labels, list) {
+ struct nd_namespace_label *nd_label = label_ent->label;
+ u16 position;
+
+ if (!nd_label)
+ continue;
+ position = nsl_get_position(ndd, nd_label);
+
+ if (!nsl_validate_isetcookie(ndd, nd_label, cookie))
+ continue;
+
+ if (!nsl_uuid_equal(ndd, nd_label, uuid))
+ continue;
+
+ if (!nsl_validate_type_guid(ndd, nd_label,
+ &nd_set->type_guid))
+ continue;
+
+ if (found_uuid) {
+ dev_dbg(ndd->dev, "duplicate entry for uuid\n");
+ return false;
+ }
+ found_uuid = true;
+ if (!nsl_validate_nlabel(nd_region, ndd, nd_label))
+ continue;
+ if (position != pos)
+ continue;
+ found = nd_label;
+ break;
+ }
+ if (found)
+ break;
+ }
+ return found != NULL;
+}
+
+static int select_pmem_id(struct nd_region *nd_region, const uuid_t *pmem_id)
+{
+ int i;
+
+ if (!pmem_id)
+ return -ENODEV;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_label *nd_label = NULL;
+ u64 hw_start, hw_end, pmem_start, pmem_end;
+ struct nd_label_ent *label_ent;
+
+ lockdep_assert_held(&nd_mapping->lock);
+ list_for_each_entry(label_ent, &nd_mapping->labels, list) {
+ nd_label = label_ent->label;
+ if (!nd_label)
+ continue;
+ if (nsl_uuid_equal(ndd, nd_label, pmem_id))
+ break;
+ nd_label = NULL;
+ }
+
+ if (!nd_label) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ /*
+ * Check that this label is compliant with the dpa
+ * range published in NFIT
+ */
+ hw_start = nd_mapping->start;
+ hw_end = hw_start + nd_mapping->size;
+ pmem_start = nsl_get_dpa(ndd, nd_label);
+ pmem_end = pmem_start + nsl_get_rawsize(ndd, nd_label);
+ if (pmem_start >= hw_start && pmem_start < hw_end
+ && pmem_end <= hw_end && pmem_end > hw_start)
+ /* pass */;
+ else {
+ dev_dbg(&nd_region->dev, "%s invalid label for %pUb\n",
+ dev_name(ndd->dev),
+ nsl_uuid_raw(ndd, nd_label));
+ return -EINVAL;
+ }
+
+ /* move recently validated label to the front of the list */
+ list_move(&label_ent->list, &nd_mapping->labels);
+ }
+ return 0;
+}
+
+/**
+ * create_namespace_pmem - validate interleave set labelling, retrieve label0
+ * @nd_region: region with mappings to validate
+ * @nd_mapping: container of dpa-resource-root + labels
+ * @nd_label: target pmem namespace label to evaluate
+ */
+static struct device *create_namespace_pmem(struct nd_region *nd_region,
+ struct nd_mapping *nd_mapping,
+ struct nd_namespace_label *nd_label)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_namespace_index *nsindex =
+ to_namespace_index(ndd, ndd->ns_current);
+ u64 cookie = nd_region_interleave_set_cookie(nd_region, nsindex);
+ u64 altcookie = nd_region_interleave_set_altcookie(nd_region);
+ struct nd_label_ent *label_ent;
+ struct nd_namespace_pmem *nspm;
+ resource_size_t size = 0;
+ struct resource *res;
+ struct device *dev;
+ uuid_t uuid;
+ int rc = 0;
+ u16 i;
+
+ if (cookie == 0) {
+ dev_dbg(&nd_region->dev, "invalid interleave-set-cookie\n");
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (!nsl_validate_isetcookie(ndd, nd_label, cookie)) {
+ dev_dbg(&nd_region->dev, "invalid cookie in label: %pUb\n",
+ nsl_uuid_raw(ndd, nd_label));
+ if (!nsl_validate_isetcookie(ndd, nd_label, altcookie))
+ return ERR_PTR(-EAGAIN);
+
+ 
dev_dbg(&nd_region->dev, "valid altcookie in label: %pUb\n", + nsl_uuid_raw(ndd, nd_label)); + } + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return ERR_PTR(-ENOMEM); + + nspm->id = -1; + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + nsl_get_uuid(ndd, nd_label, &uuid); + if (has_uuid_at_pos(nd_region, &uuid, cookie, i)) + continue; + if (has_uuid_at_pos(nd_region, &uuid, altcookie, i)) + continue; + break; + } + + if (i < nd_region->ndr_mappings) { + struct nvdimm *nvdimm = nd_region->mapping[i].nvdimm; + + /* + * Give up if we don't find an instance of a uuid at each + * position (from 0 to nd_region->ndr_mappings - 1), or if we + * find a dimm with two instances of the same uuid. + */ + dev_err(&nd_region->dev, "%s missing label for %pUb\n", + nvdimm_name(nvdimm), nsl_uuid_raw(ndd, nd_label)); + rc = -EINVAL; + goto err; + } + + /* + * Fix up each mapping's 'labels' to have the validated pmem label for + * that position at labels[0], and NULL at labels[1]. In the process, + * check that the namespace aligns with interleave-set. + */ + nsl_get_uuid(ndd, nd_label, &uuid); + rc = select_pmem_id(nd_region, &uuid); + if (rc) + goto err; + + /* Calculate total size and populate namespace properties from label0 */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_namespace_label *label0; + struct nvdimm_drvdata *ndd; + + nd_mapping = &nd_region->mapping[i]; + label_ent = list_first_entry_or_null(&nd_mapping->labels, + typeof(*label_ent), list); + label0 = label_ent ? label_ent->label : NULL; + + if (!label0) { + WARN_ON(1); + continue; + } + + ndd = to_ndd(nd_mapping); + size += nsl_get_rawsize(ndd, label0); + if (nsl_get_position(ndd, label0) != 0) + continue; + WARN_ON(nspm->alt_name || nspm->uuid); + nspm->alt_name = kmemdup(nsl_ref_name(ndd, label0), + NSLABEL_NAME_LEN, GFP_KERNEL); + nsl_get_uuid(ndd, label0, &uuid); + nspm->uuid = kmemdup(&uuid, sizeof(uuid_t), GFP_KERNEL); + nspm->lbasize = nsl_get_lbasize(ndd, label0); + nspm->nsio.common.claim_class = + nsl_get_claim_class(ndd, label0); + } + + if (!nspm->alt_name || !nspm->uuid) { + rc = -ENOMEM; + goto err; + } + + nd_namespace_pmem_set_resource(nd_region, nspm, size); + + return dev; + err: + namespace_pmem_release(dev); + switch (rc) { + case -EINVAL: + dev_dbg(&nd_region->dev, "invalid label(s)\n"); + break; + case -ENODEV: + dev_dbg(&nd_region->dev, "label not found\n"); + break; + default: + dev_dbg(&nd_region->dev, "unexpected err: %d\n", rc); + break; + } + return ERR_PTR(rc); +} + +static struct device *nd_namespace_pmem_create(struct nd_region *nd_region) +{ + struct nd_namespace_pmem *nspm; + struct resource *res; + struct device *dev; + + if (!is_memory(&nd_region->dev)) + return NULL; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return NULL; + + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + + nspm->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); + if (nspm->id < 0) { + kfree(nspm); + return NULL; + } + dev_set_name(dev, "namespace%d.%d", nd_region->id, nspm->id); + nd_namespace_pmem_set_resource(nd_region, nspm, 0); + + return dev; +} + +static struct lock_class_key nvdimm_namespace_key; + +void 
nd_region_create_ns_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + + if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_IO) + return; + + nd_region->ns_seed = nd_namespace_pmem_create(nd_region); + + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->ns_seed) + dev_err(&nd_region->dev, "failed to create namespace\n"); + else { + device_initialize(nd_region->ns_seed); + lockdep_set_class(&nd_region->ns_seed->mutex, + &nvdimm_namespace_key); + nd_device_register(nd_region->ns_seed); + } +} + +void nd_region_create_dax_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->dax_seed = nd_dax_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->dax_seed) + dev_err(&nd_region->dev, "failed to create dax namespace\n"); +} + +void nd_region_create_pfn_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->pfn_seed = nd_pfn_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->pfn_seed) + dev_err(&nd_region->dev, "failed to create pfn namespace\n"); +} + +void nd_region_create_btt_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->btt_seed = nd_btt_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->btt_seed) + dev_err(&nd_region->dev, "failed to create btt namespace\n"); +} + +static int add_namespace_resource(struct nd_region *nd_region, + struct nd_namespace_label *nd_label, struct device **devs, + int count) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + int i; + + for (i = 0; i < count; i++) { + uuid_t *uuid = namespace_to_uuid(devs[i]); + + if (IS_ERR(uuid)) { + WARN_ON(1); + continue; + } + + if (!nsl_uuid_equal(ndd, nd_label, uuid)) + continue; + dev_err(&nd_region->dev, + "error: conflicting extents for uuid: %pUb\n", uuid); + return -ENXIO; + } + + return i; +} + +static int cmp_dpa(const void *a, const void *b) +{ + const struct device *dev_a = *(const struct device **) a; + const struct device *dev_b = *(const struct device **) b; + struct nd_namespace_pmem *nspm_a, *nspm_b; + + if (is_namespace_io(dev_a)) + return 0; + + nspm_a = to_nd_namespace_pmem(dev_a); + nspm_b = to_nd_namespace_pmem(dev_b); + + return memcmp(&nspm_a->nsio.res.start, &nspm_b->nsio.res.start, + sizeof(resource_size_t)); +} + +static struct device **scan_labels(struct nd_region *nd_region) +{ + int i, count = 0; + struct device *dev, **devs = NULL; + struct nd_label_ent *label_ent, *e; + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t map_end = nd_mapping->start + nd_mapping->size - 1; + + /* "safe" because create_namespace_pmem() might list_move() label_ent */ + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + struct device **__devs; + + if (!nd_label) + continue; + + /* skip labels that describe extents outside of the region */ + if (nsl_get_dpa(ndd, nd_label) < nd_mapping->start || + nsl_get_dpa(ndd, nd_label) > 
map_end) + continue; + + i = add_namespace_resource(nd_region, nd_label, devs, count); + if (i < 0) + goto err; + if (i < count) + continue; + __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL); + if (!__devs) + goto err; + memcpy(__devs, devs, sizeof(dev) * count); + kfree(devs); + devs = __devs; + + dev = create_namespace_pmem(nd_region, nd_mapping, nd_label); + if (IS_ERR(dev)) { + switch (PTR_ERR(dev)) { + case -EAGAIN: + /* skip invalid labels */ + continue; + case -ENODEV: + /* fallthrough to seed creation */ + break; + default: + goto err; + } + } else + devs[count++] = dev; + + } + + dev_dbg(&nd_region->dev, "discovered %d namespace%s\n", count, + count == 1 ? "" : "s"); + + if (count == 0) { + struct nd_namespace_pmem *nspm; + + /* Publish a zero-sized namespace for userspace to configure. */ + nd_mapping_free_labels(nd_mapping); + + devs = kcalloc(2, sizeof(dev), GFP_KERNEL); + if (!devs) + goto err; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + goto err; + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + nd_namespace_pmem_set_resource(nd_region, nspm, 0); + dev->parent = &nd_region->dev; + devs[count++] = dev; + } else if (is_memory(&nd_region->dev)) { + /* clean unselected labels */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct list_head *l, *e; + LIST_HEAD(list); + int j; + + nd_mapping = &nd_region->mapping[i]; + if (list_empty(&nd_mapping->labels)) { + WARN_ON(1); + continue; + } + + j = count; + list_for_each_safe(l, e, &nd_mapping->labels) { + if (!j--) + break; + list_move_tail(l, &list); + } + nd_mapping_free_labels(nd_mapping); + list_splice_init(&list, &nd_mapping->labels); + } + } + + if (count > 1) + sort(devs, count, sizeof(struct device *), cmp_dpa, NULL); + + return devs; + + err: + if (devs) { + for (i = 0; devs[i]; i++) + namespace_pmem_release(devs[i]); + kfree(devs); + } + return NULL; +} + +static struct device **create_namespaces(struct nd_region *nd_region) +{ + struct nd_mapping *nd_mapping; + struct device **devs; + int i; + + if (nd_region->ndr_mappings == 0) + return NULL; + + /* lock down all mappings while we scan labels */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + mutex_lock_nested(&nd_mapping->lock, i); + } + + devs = scan_labels(nd_region); + + for (i = 0; i < nd_region->ndr_mappings; i++) { + int reverse = nd_region->ndr_mappings - 1 - i; + + nd_mapping = &nd_region->mapping[reverse]; + mutex_unlock(&nd_mapping->lock); + } + + return devs; +} + +static void deactivate_labels(void *region) +{ + struct nd_region *nd_region = region; + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = nd_mapping->ndd; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + mutex_lock(&nd_mapping->lock); + nd_mapping_free_labels(nd_mapping); + mutex_unlock(&nd_mapping->lock); + + put_ndd(ndd); + nd_mapping->ndd = NULL; + if (ndd) + atomic_dec(&nvdimm->busy); + } +} + +static int init_active_labels(struct nd_region *nd_region) +{ + int i, rc = 0; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_label_ent *label_ent; + int count, j; + + /* + * If the dimm is disabled then we may need to prevent + * the region from being activated. 
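+ * Per the checks below, this is only fatal when label data
+ * actually matters: a locked DIMM may have unreadable label
+ * data, and a DIMM with labels enabled needs them to
+ * disambiguate DPA ownership.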
+ */ + if (!ndd) { + if (test_bit(NDD_LOCKED, &nvdimm->flags)) + /* fail, label data may be unreadable */; + else if (test_bit(NDD_LABELING, &nvdimm->flags)) + /* fail, labels needed to disambiguate dpa */; + else + continue; + + dev_err(&nd_region->dev, "%s: is %s, failing probe\n", + dev_name(&nd_mapping->nvdimm->dev), + test_bit(NDD_LOCKED, &nvdimm->flags) + ? "locked" : "disabled"); + rc = -ENXIO; + goto out; + } + nd_mapping->ndd = ndd; + atomic_inc(&nvdimm->busy); + get_ndd(ndd); + + count = nd_label_active_count(ndd); + dev_dbg(ndd->dev, "count: %d\n", count); + if (!count) + continue; + for (j = 0; j < count; j++) { + struct nd_namespace_label *label; + + label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL); + if (!label_ent) + break; + label = nd_label_active(ndd, j); + label_ent->label = label; + + mutex_lock(&nd_mapping->lock); + list_add_tail(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); + } + + if (j < count) + break; + } + + if (i < nd_region->ndr_mappings) + rc = -ENOMEM; + +out: + if (rc) { + deactivate_labels(nd_region); + return rc; + } + + return devm_add_action_or_reset(&nd_region->dev, deactivate_labels, + nd_region); +} + +int nd_region_register_namespaces(struct nd_region *nd_region, int *err) +{ + struct device **devs = NULL; + int i, rc = 0, type; + + *err = 0; + nvdimm_bus_lock(&nd_region->dev); + rc = init_active_labels(nd_region); + if (rc) { + nvdimm_bus_unlock(&nd_region->dev); + return rc; + } + + type = nd_region_to_nstype(nd_region); + switch (type) { + case ND_DEVICE_NAMESPACE_IO: + devs = create_namespace_io(nd_region); + break; + case ND_DEVICE_NAMESPACE_PMEM: + devs = create_namespaces(nd_region); + break; + default: + break; + } + nvdimm_bus_unlock(&nd_region->dev); + + if (!devs) + return -ENODEV; + + for (i = 0; devs[i]; i++) { + struct device *dev = devs[i]; + int id; + + if (type == ND_DEVICE_NAMESPACE_PMEM) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(dev); + id = ida_simple_get(&nd_region->ns_ida, 0, 0, + GFP_KERNEL); + nspm->id = id; + } else + id = i; + + if (id < 0) + break; + dev_set_name(dev, "namespace%d.%d", nd_region->id, id); + device_initialize(dev); + lockdep_set_class(&dev->mutex, &nvdimm_namespace_key); + nd_device_register(dev); + } + if (i) + nd_region->ns_seed = devs[0]; + + if (devs[i]) { + int j; + + for (j = i; devs[j]; j++) { + struct device *dev = devs[j]; + + device_initialize(dev); + put_device(dev); + } + *err = j - i; + /* + * All of the namespaces we tried to register failed, so + * fail region activation. + */ + if (*err == 0) + rc = -ENODEV; + } + kfree(devs); + + if (rc == -ENODEV) + return rc; + + return i; +} diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h new file mode 100644 index 000000000..845408f10 --- /dev/null +++ b/drivers/nvdimm/nd-core.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#ifndef __ND_CORE_H__ +#define __ND_CORE_H__ +#include <linux/libnvdimm.h> +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/mutex.h> +#include <linux/nd.h> +#include "nd.h" + +extern struct list_head nvdimm_bus_list; +extern struct mutex nvdimm_bus_list_mutex; +extern int nvdimm_major; +extern struct workqueue_struct *nvdimm_wq; + +struct nvdimm_bus { + struct nvdimm_bus_descriptor *nd_desc; + wait_queue_head_t wait; + struct list_head list; + struct device dev; + int id, probe_active; + atomic_t ioctl_active; + struct list_head mapping_list; + struct mutex reconfig_mutex; + struct badrange badrange; +}; + +struct nvdimm { + unsigned long flags; + void *provider_data; + unsigned long cmd_mask; + struct device dev; + atomic_t busy; + int id, num_flush; + struct resource *flush_wpq; + const char *dimm_id; + struct { + const struct nvdimm_security_ops *ops; + unsigned long flags; + unsigned long ext_flags; + unsigned int overwrite_tmo; + struct kernfs_node *overwrite_state; + } sec; + struct delayed_work dwork; + const struct nvdimm_fw_ops *fw_ops; +}; + +static inline unsigned long nvdimm_security_flags( + struct nvdimm *nvdimm, enum nvdimm_passphrase_type ptype) +{ + u64 flags; + const u64 state_flags = 1UL << NVDIMM_SECURITY_DISABLED + | 1UL << NVDIMM_SECURITY_LOCKED + | 1UL << NVDIMM_SECURITY_UNLOCKED + | 1UL << NVDIMM_SECURITY_OVERWRITE; + + if (!nvdimm->sec.ops) + return 0; + + flags = nvdimm->sec.ops->get_flags(nvdimm, ptype); + /* disabled, locked, unlocked, and overwrite are mutually exclusive */ + dev_WARN_ONCE(&nvdimm->dev, hweight64(flags & state_flags) > 1, + "reported invalid security state: %#llx\n", + (unsigned long long) flags); + return flags; +} +int nvdimm_security_freeze(struct nvdimm *nvdimm); +#if IS_ENABLED(CONFIG_NVDIMM_KEYS) +ssize_t nvdimm_security_store(struct device *dev, const char *buf, size_t len); +void nvdimm_security_overwrite_query(struct work_struct *work); +#else +static inline ssize_t nvdimm_security_store(struct device *dev, + const char *buf, size_t len) +{ + return -EOPNOTSUPP; +} +static inline void nvdimm_security_overwrite_query(struct work_struct *work) +{ +} +#endif + +bool is_nvdimm(struct device *dev); +bool is_nd_pmem(struct device *dev); +bool is_nd_volatile(struct device *dev); +static inline bool is_nd_region(struct device *dev) +{ + return is_nd_pmem(dev) || is_nd_volatile(dev); +} +static inline bool is_memory(struct device *dev) +{ + return is_nd_pmem(dev) || is_nd_volatile(dev); +} +struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); +int __init nvdimm_bus_init(void); +void nvdimm_bus_exit(void); +void nvdimm_devs_exit(void); +struct nd_region; +void nd_region_advance_seeds(struct nd_region *nd_region, struct device *dev); +void nd_region_create_ns_seed(struct nd_region *nd_region); +void nd_region_create_btt_seed(struct nd_region *nd_region); +void nd_region_create_pfn_seed(struct nd_region *nd_region); +void nd_region_create_dax_seed(struct nd_region *nd_region); +int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); +void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); +void nd_synchronize(void); +void nd_device_register(struct device *dev); +void nd_device_register_sync(struct device *dev); +struct nd_label_id; +char *nd_label_gen_id(struct nd_label_id *label_id, const uuid_t *uuid, + u32 flags); +bool nd_is_uuid_unique(struct device *dev, uuid_t *uuid); +struct nd_region; +struct nvdimm_drvdata; +struct nd_mapping; +void nd_mapping_free_labels(struct nd_mapping *nd_mapping); + 
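+/*
+ * Helpers that temporarily reserve all free DPA in a region's
+ * mappings (e.g. while sizing a prospective allocation);
+ * release_free_pmem() drops those scratch reservations.
+ */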
+int __reserve_free_pmem(struct device *dev, void *data); +void release_free_pmem(struct nvdimm_bus *nvdimm_bus, + struct nd_mapping *nd_mapping); + +resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping); +resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region); +resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping); +resource_size_t nd_region_available_dpa(struct nd_region *nd_region); +int nd_region_conflict(struct nd_region *nd_region, resource_size_t start, + resource_size_t size); +resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id); +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd); +void get_ndd(struct nvdimm_drvdata *ndd); +resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns); +void nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns); +void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns); +bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns); +bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns); +ssize_t nd_namespace_store(struct device *dev, + struct nd_namespace_common **_ndns, const char *buf, + size_t len); +struct nd_pfn *to_nd_pfn_safe(struct device *dev); +bool is_nvdimm_bus(struct device *dev); + +#if IS_ENABLED(CONFIG_ND_CLAIM) +int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio, + resource_size_t size); +void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio); +#else +static inline int devm_nsio_enable(struct device *dev, + struct nd_namespace_io *nsio, resource_size_t size) +{ + return -ENXIO; +} + +static inline void devm_nsio_disable(struct device *dev, + struct nd_namespace_io *nsio) +{ +} +#endif +#endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h new file mode 100644 index 000000000..ec5219680 --- /dev/null +++ b/drivers/nvdimm/nd.h @@ -0,0 +1,680 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ */ +#ifndef __ND_H__ +#define __ND_H__ +#include <linux/libnvdimm.h> +#include <linux/badblocks.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/ndctl.h> +#include <linux/types.h> +#include <linux/nd.h> +#include "label.h" + +enum { + /* + * Limits the maximum number of block apertures a dimm can + * support and is an input to the geometry/on-disk-format of a + * BTT instance + */ + ND_MAX_LANES = 256, + INT_LBASIZE_ALIGNMENT = 64, + NVDIMM_IO_ATOMIC = 1, +}; + +struct nvdimm_drvdata { + struct device *dev; + int nslabel_size; + struct nd_cmd_get_config_size nsarea; + void *data; + bool cxl; + int ns_current, ns_next; + struct resource dpa; + struct kref kref; +}; + +static inline const u8 *nsl_ref_name(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return nd_label->cxl.name; + return nd_label->efi.name; +} + +static inline u8 *nsl_get_name(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u8 *name) +{ + if (ndd->cxl) + return memcpy(name, nd_label->cxl.name, NSLABEL_NAME_LEN); + return memcpy(name, nd_label->efi.name, NSLABEL_NAME_LEN); +} + +static inline u8 *nsl_set_name(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u8 *name) +{ + if (!name) + return NULL; + if (ndd->cxl) + return memcpy(nd_label->cxl.name, name, NSLABEL_NAME_LEN); + return memcpy(nd_label->efi.name, name, NSLABEL_NAME_LEN); +} + +static inline u32 nsl_get_slot(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le32_to_cpu(nd_label->cxl.slot); + return __le32_to_cpu(nd_label->efi.slot); +} + +static inline void nsl_set_slot(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u32 slot) +{ + if (ndd->cxl) + nd_label->cxl.slot = __cpu_to_le32(slot); + else + nd_label->efi.slot = __cpu_to_le32(slot); +} + +static inline u64 nsl_get_checksum(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le64_to_cpu(nd_label->cxl.checksum); + return __le64_to_cpu(nd_label->efi.checksum); +} + +static inline void nsl_set_checksum(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u64 checksum) +{ + if (ndd->cxl) + nd_label->cxl.checksum = __cpu_to_le64(checksum); + else + nd_label->efi.checksum = __cpu_to_le64(checksum); +} + +static inline u32 nsl_get_flags(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le32_to_cpu(nd_label->cxl.flags); + return __le32_to_cpu(nd_label->efi.flags); +} + +static inline void nsl_set_flags(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u32 flags) +{ + if (ndd->cxl) + nd_label->cxl.flags = __cpu_to_le32(flags); + else + nd_label->efi.flags = __cpu_to_le32(flags); +} + +static inline u64 nsl_get_dpa(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le64_to_cpu(nd_label->cxl.dpa); + return __le64_to_cpu(nd_label->efi.dpa); +} + +static inline void nsl_set_dpa(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u64 dpa) +{ + if (ndd->cxl) + nd_label->cxl.dpa = __cpu_to_le64(dpa); + else + nd_label->efi.dpa = __cpu_to_le64(dpa); +} + +static inline u64 nsl_get_rawsize(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le64_to_cpu(nd_label->cxl.rawsize); + return __le64_to_cpu(nd_label->efi.rawsize); +} + +static inline void nsl_set_rawsize(struct nvdimm_drvdata *ndd, + struct 
nd_namespace_label *nd_label, + u64 rawsize) +{ + if (ndd->cxl) + nd_label->cxl.rawsize = __cpu_to_le64(rawsize); + else + nd_label->efi.rawsize = __cpu_to_le64(rawsize); +} + +static inline u64 nsl_get_isetcookie(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + /* WARN future refactor attempts that break this assumption */ + if (dev_WARN_ONCE(ndd->dev, ndd->cxl, + "CXL labels do not use the isetcookie concept\n")) + return 0; + return __le64_to_cpu(nd_label->efi.isetcookie); +} + +static inline void nsl_set_isetcookie(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u64 isetcookie) +{ + if (!ndd->cxl) + nd_label->efi.isetcookie = __cpu_to_le64(isetcookie); +} + +static inline bool nsl_validate_isetcookie(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u64 cookie) +{ + /* + * Let the EFI and CXL validation comingle, where fields that + * don't matter to CXL always validate. + */ + if (ndd->cxl) + return true; + return cookie == __le64_to_cpu(nd_label->efi.isetcookie); +} + +static inline u16 nsl_get_position(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le16_to_cpu(nd_label->cxl.position); + return __le16_to_cpu(nd_label->efi.position); +} + +static inline void nsl_set_position(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u16 position) +{ + if (ndd->cxl) + nd_label->cxl.position = __cpu_to_le16(position); + else + nd_label->efi.position = __cpu_to_le16(position); +} + +static inline u16 nsl_get_nlabel(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return 0; + return __le16_to_cpu(nd_label->efi.nlabel); +} + +static inline void nsl_set_nlabel(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u16 nlabel) +{ + if (!ndd->cxl) + nd_label->efi.nlabel = __cpu_to_le16(nlabel); +} + +static inline u16 nsl_get_nrange(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return __le16_to_cpu(nd_label->cxl.nrange); + return 1; +} + +static inline void nsl_set_nrange(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u16 nrange) +{ + if (ndd->cxl) + nd_label->cxl.nrange = __cpu_to_le16(nrange); +} + +static inline u64 nsl_get_lbasize(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + /* + * Yes, for some reason the EFI labels convey a massive 64-bit + * lbasize, that got fixed for CXL. 
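+ * (CXL stores lbasize in a 16-bit field, hence the __le16
+ * conversions below.)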
+ */ + if (ndd->cxl) + return __le16_to_cpu(nd_label->cxl.lbasize); + return __le64_to_cpu(nd_label->efi.lbasize); +} + +static inline void nsl_set_lbasize(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + u64 lbasize) +{ + if (ndd->cxl) + nd_label->cxl.lbasize = __cpu_to_le16(lbasize); + else + nd_label->efi.lbasize = __cpu_to_le64(lbasize); +} + +static inline const uuid_t *nsl_get_uuid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + uuid_t *uuid) +{ + if (ndd->cxl) + import_uuid(uuid, nd_label->cxl.uuid); + else + import_uuid(uuid, nd_label->efi.uuid); + return uuid; +} + +static inline const uuid_t *nsl_set_uuid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + const uuid_t *uuid) +{ + if (ndd->cxl) + export_uuid(nd_label->cxl.uuid, uuid); + else + export_uuid(nd_label->efi.uuid, uuid); + return uuid; +} + +static inline bool nsl_uuid_equal(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, + const uuid_t *uuid) +{ + uuid_t tmp; + + if (ndd->cxl) + import_uuid(&tmp, nd_label->cxl.uuid); + else + import_uuid(&tmp, nd_label->efi.uuid); + return uuid_equal(&tmp, uuid); +} + +static inline const u8 *nsl_uuid_raw(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return nd_label->cxl.uuid; + return nd_label->efi.uuid; +} + +bool nsl_validate_type_guid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, guid_t *guid); +enum nvdimm_claim_class nsl_get_claim_class(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label); + +struct nd_region_data { + int ns_count; + int ns_active; + unsigned int hints_shift; + void __iomem *flush_wpq[]; +}; + +static inline void __iomem *ndrd_get_flush_wpq(struct nd_region_data *ndrd, + int dimm, int hint) +{ + unsigned int num = 1 << ndrd->hints_shift; + unsigned int mask = num - 1; + + return ndrd->flush_wpq[dimm * num + (hint & mask)]; +} + +static inline void ndrd_set_flush_wpq(struct nd_region_data *ndrd, int dimm, + int hint, void __iomem *flush) +{ + unsigned int num = 1 << ndrd->hints_shift; + unsigned int mask = num - 1; + + ndrd->flush_wpq[dimm * num + (hint & mask)] = flush; +} + +static inline struct nd_namespace_index *to_namespace_index( + struct nvdimm_drvdata *ndd, int i) +{ + if (i < 0) + return NULL; + + return ndd->data + sizeof_namespace_index(ndd) * i; +} + +static inline struct nd_namespace_index *to_current_namespace_index( + struct nvdimm_drvdata *ndd) +{ + return to_namespace_index(ndd, ndd->ns_current); +} + +static inline struct nd_namespace_index *to_next_namespace_index( + struct nvdimm_drvdata *ndd) +{ + return to_namespace_index(ndd, ndd->ns_next); +} + +unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd); + +#define efi_namespace_label_has(ndd, field) \ + (!ndd->cxl && offsetof(struct nvdimm_efi_label, field) \ + < sizeof_namespace_label(ndd)) + +#define nd_dbg_dpa(r, d, res, fmt, arg...) \ + dev_dbg((r) ? &(r)->dev : (d)->dev, "%s: %.13s: %#llx @ %#llx " fmt, \ + (r) ? dev_name((d)->dev) : "", res ? res->name : "null", \ + (unsigned long long) (res ? resource_size(res) : 0), \ + (unsigned long long) (res ? res->start : 0), ##arg) + +#define for_each_dpa_resource(ndd, res) \ + for (res = (ndd)->dpa.child; res; res = res->sibling) + +#define for_each_dpa_resource_safe(ndd, res, next) \ + for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \ + res; res = next, next = next ? 
next->sibling : NULL) + +struct nd_percpu_lane { + int count; + spinlock_t lock; +}; + +enum nd_label_flags { + ND_LABEL_REAP, +}; +struct nd_label_ent { + struct list_head list; + unsigned long flags; + struct nd_namespace_label *label; +}; + +enum nd_mapping_lock_class { + ND_MAPPING_CLASS0, + ND_MAPPING_UUID_SCAN, +}; + +struct nd_mapping { + struct nvdimm *nvdimm; + u64 start; + u64 size; + int position; + struct list_head labels; + struct mutex lock; + /* + * @ndd is for private use at region enable / disable time for + * get_ndd() + put_ndd(), all other nd_mapping to ndd + * conversions use to_ndd() which respects enabled state of the + * nvdimm. + */ + struct nvdimm_drvdata *ndd; +}; + +struct nd_region { + struct device dev; + struct ida ns_ida; + struct ida btt_ida; + struct ida pfn_ida; + struct ida dax_ida; + unsigned long flags; + struct device *ns_seed; + struct device *btt_seed; + struct device *pfn_seed; + struct device *dax_seed; + unsigned long align; + u16 ndr_mappings; + u64 ndr_size; + u64 ndr_start; + int id, num_lanes, ro, numa_node, target_node; + void *provider_data; + struct kernfs_node *bb_state; + struct badblocks bb; + struct nd_interleave_set *nd_set; + struct nd_percpu_lane __percpu *lane; + int (*flush)(struct nd_region *nd_region, struct bio *bio); + struct nd_mapping mapping[]; +}; + +static inline bool nsl_validate_nlabel(struct nd_region *nd_region, + struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + if (ndd->cxl) + return true; + return nsl_get_nlabel(ndd, nd_label) == nd_region->ndr_mappings; +} + +/* + * Lookup next in the repeating sequence of 01, 10, and 11. + */ +static inline unsigned nd_inc_seq(unsigned seq) +{ + static const unsigned next[] = { 0, 2, 3, 1 }; + + return next[seq & 3]; +} + +struct btt; +struct nd_btt { + struct device dev; + struct nd_namespace_common *ndns; + struct btt *btt; + unsigned long lbasize; + u64 size; + uuid_t *uuid; + int id; + int initial_offset; + u16 version_major; + u16 version_minor; +}; + +enum nd_pfn_mode { + PFN_MODE_NONE, + PFN_MODE_RAM, + PFN_MODE_PMEM, +}; + +struct nd_pfn { + int id; + uuid_t *uuid; + struct device dev; + unsigned long align; + unsigned long npfns; + enum nd_pfn_mode mode; + struct nd_pfn_sb *pfn_sb; + struct nd_namespace_common *ndns; +}; + +struct nd_dax { + struct nd_pfn nd_pfn; +}; + +static inline u32 nd_info_block_reserve(void) +{ + return ALIGN(SZ_8K, PAGE_SIZE); +} + +enum nd_async_mode { + ND_SYNC, + ND_ASYNC, +}; + +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size); +void wait_nvdimm_bus_probe_idle(struct device *dev); +void nd_device_register(struct device *dev); +void nd_device_unregister(struct device *dev, enum nd_async_mode mode); +void nd_device_notify(struct device *dev, enum nvdimm_event event); +int nd_uuid_store(struct device *dev, uuid_t **uuid_out, const char *buf, + size_t len); +ssize_t nd_size_select_show(unsigned long current_size, + const unsigned long *supported, char *buf); +ssize_t nd_size_select_store(struct device *dev, const char *buf, + unsigned long *current_size, const unsigned long *supported); +int __init nvdimm_init(void); +int __init nd_region_init(void); +int __init nd_label_init(void); +void nvdimm_exit(void); +void nd_region_exit(void); +struct nvdimm; +extern const struct attribute_group nd_device_attribute_group; +extern const struct attribute_group nd_numa_attribute_group; +extern const struct attribute_group *nvdimm_bus_attribute_groups[]; +struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping); 
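+/*
+ * DIMM config-data (label area) access: nvdimm_init_nsarea()
+ * discovers the area size that the get/set helpers below rely on.
+ */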
+int nvdimm_check_config_data(struct device *dev); +int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); +int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); +int nvdimm_get_config_data(struct nvdimm_drvdata *ndd, void *buf, + size_t offset, size_t len); +int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, + void *buf, size_t len); +long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, + unsigned int len); +void nvdimm_set_labeling(struct device *dev); +void nvdimm_set_locked(struct device *dev); +void nvdimm_clear_locked(struct device *dev); +int nvdimm_security_setup_events(struct device *dev); +#if IS_ENABLED(CONFIG_NVDIMM_KEYS) +int nvdimm_security_unlock(struct device *dev); +#else +static inline int nvdimm_security_unlock(struct device *dev) +{ + return 0; +} +#endif +struct nd_btt *to_nd_btt(struct device *dev); + +struct nd_gen_sb { + char reserved[SZ_4K - 8]; + __le64 checksum; +}; + +u64 nd_sb_checksum(struct nd_gen_sb *sb); +#if IS_ENABLED(CONFIG_BTT) +int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns); +bool is_nd_btt(struct device *dev); +struct device *nd_btt_create(struct nd_region *nd_region); +#else +static inline int nd_btt_probe(struct device *dev, + struct nd_namespace_common *ndns) +{ + return -ENODEV; +} + +static inline bool is_nd_btt(struct device *dev) +{ + return false; +} + +static inline struct device *nd_btt_create(struct nd_region *nd_region) +{ + return NULL; +} +#endif + +struct nd_pfn *to_nd_pfn(struct device *dev); +#if IS_ENABLED(CONFIG_NVDIMM_PFN) + +#define MAX_NVDIMM_ALIGN 4 + +int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns); +bool is_nd_pfn(struct device *dev); +struct device *nd_pfn_create(struct nd_region *nd_region); +struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn, + struct nd_namespace_common *ndns); +int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig); +extern const struct attribute_group *nd_pfn_attribute_groups[]; +#else +static inline int nd_pfn_probe(struct device *dev, + struct nd_namespace_common *ndns) +{ + return -ENODEV; +} + +static inline bool is_nd_pfn(struct device *dev) +{ + return false; +} + +static inline struct device *nd_pfn_create(struct nd_region *nd_region) +{ + return NULL; +} + +static inline int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) +{ + return -ENODEV; +} +#endif + +struct nd_dax *to_nd_dax(struct device *dev); +#if IS_ENABLED(CONFIG_NVDIMM_DAX) +int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns); +bool is_nd_dax(struct device *dev); +struct device *nd_dax_create(struct nd_region *nd_region); +#else +static inline int nd_dax_probe(struct device *dev, + struct nd_namespace_common *ndns) +{ + return -ENODEV; +} + +static inline bool is_nd_dax(struct device *dev) +{ + return false; +} + +static inline struct device *nd_dax_create(struct nd_region *nd_region) +{ + return NULL; +} +#endif + +int nd_region_to_nstype(struct nd_region *nd_region); +int nd_region_register_namespaces(struct nd_region *nd_region, int *err); +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region, + struct nd_namespace_index *nsindex); +u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region); +void nvdimm_bus_lock(struct device *dev); +void nvdimm_bus_unlock(struct device *dev); +bool is_nvdimm_bus_locked(struct device *dev); +void nvdimm_check_and_set_ro(struct gendisk *disk); +void nvdimm_drvdata_release(struct kref *kref); +void put_ndd(struct nvdimm_drvdata *ndd); +int nd_label_reserve_dpa(struct 
nvdimm_drvdata *ndd); +void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res); +struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, resource_size_t start, + resource_size_t n); +resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns); +bool nvdimm_namespace_locked(struct nd_namespace_common *ndns); +struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev); +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); +int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt); +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name); +unsigned int pmem_sector_size(struct nd_namespace_common *ndns); +struct range; +void nvdimm_badblocks_populate(struct nd_region *nd_region, + struct badblocks *bb, const struct range *range); +int devm_namespace_enable(struct device *dev, struct nd_namespace_common *ndns, + resource_size_t size); +void devm_namespace_disable(struct device *dev, + struct nd_namespace_common *ndns); +#if IS_ENABLED(CONFIG_ND_CLAIM) +/* max struct page size independent of kernel config */ +#define MAX_STRUCT_PAGE_SIZE 64 +int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap); +#else +static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, + struct dev_pagemap *pgmap) +{ + return -ENXIO; +} +#endif +int nd_region_activate(struct nd_region *nd_region); +static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, + unsigned int len) +{ + if (bb->count) { + sector_t first_bad; + int num_bad; + + return !!badblocks_check(bb, sector, len / 512, &first_bad, + &num_bad); + } + + return false; +} +const uuid_t *nd_dev_to_uuid(struct device *dev); +bool pmem_should_map_pages(struct device *dev); +#endif /* __ND_H__ */ diff --git a/drivers/nvdimm/nd_perf.c b/drivers/nvdimm/nd_perf.c new file mode 100644 index 000000000..2b6dc80d8 --- /dev/null +++ b/drivers/nvdimm/nd_perf.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * nd_perf.c: NVDIMM Device Performance Monitoring Unit support + * + * Perf interface to expose nvdimm performance stats. + * + * Copyright (C) 2021 IBM Corporation + */ + +#define pr_fmt(fmt) "nvdimm_pmu: " fmt + +#include <linux/nd.h> +#include <linux/platform_device.h> + +#define EVENT(_name, _code) enum{_name = _code} + +/* + * NVDIMM Events codes. 
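+ * Each EVENT() invocation below defines an anonymous enum
+ * constant, e.g. EVENT(CTL_RES_CNT, 0x1) expands to:
+ * enum { CTL_RES_CNT = 0x1 };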
+ */
+
+/* Controller Reset Count */
+EVENT(CTL_RES_CNT, 0x1);
+/* Controller Reset Elapsed Time */
+EVENT(CTL_RES_TM, 0x2);
+/* Power-on Seconds */
+EVENT(POWERON_SECS, 0x3);
+/* Life Remaining */
+EVENT(MEM_LIFE, 0x4);
+/* Critical Resource Utilization */
+EVENT(CRI_RES_UTIL, 0x5);
+/* Host Load Count */
+EVENT(HOST_L_CNT, 0x6);
+/* Host Store Count */
+EVENT(HOST_S_CNT, 0x7);
+/* Host Store Duration */
+EVENT(HOST_S_DUR, 0x8);
+/* Host Load Duration */
+EVENT(HOST_L_DUR, 0x9);
+/* Media Read Count */
+EVENT(MED_R_CNT, 0xa);
+/* Media Write Count */
+EVENT(MED_W_CNT, 0xb);
+/* Media Read Duration */
+EVENT(MED_R_DUR, 0xc);
+/* Media Write Duration */
+EVENT(MED_W_DUR, 0xd);
+/* Cache Read Hit Count */
+EVENT(CACHE_RH_CNT, 0xe);
+/* Cache Write Hit Count */
+EVENT(CACHE_WH_CNT, 0xf);
+/* Fast Write Count */
+EVENT(FAST_W_CNT, 0x10);
+
+NVDIMM_EVENT_ATTR(ctl_res_cnt, CTL_RES_CNT);
+NVDIMM_EVENT_ATTR(ctl_res_tm, CTL_RES_TM);
+NVDIMM_EVENT_ATTR(poweron_secs, POWERON_SECS);
+NVDIMM_EVENT_ATTR(mem_life, MEM_LIFE);
+NVDIMM_EVENT_ATTR(cri_res_util, CRI_RES_UTIL);
+NVDIMM_EVENT_ATTR(host_l_cnt, HOST_L_CNT);
+NVDIMM_EVENT_ATTR(host_s_cnt, HOST_S_CNT);
+NVDIMM_EVENT_ATTR(host_s_dur, HOST_S_DUR);
+NVDIMM_EVENT_ATTR(host_l_dur, HOST_L_DUR);
+NVDIMM_EVENT_ATTR(med_r_cnt, MED_R_CNT);
+NVDIMM_EVENT_ATTR(med_w_cnt, MED_W_CNT);
+NVDIMM_EVENT_ATTR(med_r_dur, MED_R_DUR);
+NVDIMM_EVENT_ATTR(med_w_dur, MED_W_DUR);
+NVDIMM_EVENT_ATTR(cache_rh_cnt, CACHE_RH_CNT);
+NVDIMM_EVENT_ATTR(cache_wh_cnt, CACHE_WH_CNT);
+NVDIMM_EVENT_ATTR(fast_w_cnt, FAST_W_CNT);
+
+static struct attribute *nvdimm_events_attr[] = {
+ NVDIMM_EVENT_PTR(CTL_RES_CNT),
+ NVDIMM_EVENT_PTR(CTL_RES_TM),
+ NVDIMM_EVENT_PTR(POWERON_SECS),
+ NVDIMM_EVENT_PTR(MEM_LIFE),
+ NVDIMM_EVENT_PTR(CRI_RES_UTIL),
+ NVDIMM_EVENT_PTR(HOST_L_CNT),
+ NVDIMM_EVENT_PTR(HOST_S_CNT),
+ NVDIMM_EVENT_PTR(HOST_S_DUR),
+ NVDIMM_EVENT_PTR(HOST_L_DUR),
+ NVDIMM_EVENT_PTR(MED_R_CNT),
+ NVDIMM_EVENT_PTR(MED_W_CNT),
+ NVDIMM_EVENT_PTR(MED_R_DUR),
+ NVDIMM_EVENT_PTR(MED_W_DUR),
+ NVDIMM_EVENT_PTR(CACHE_RH_CNT),
+ NVDIMM_EVENT_PTR(CACHE_WH_CNT),
+ NVDIMM_EVENT_PTR(FAST_W_CNT),
+ NULL
+};
+
+static struct attribute_group nvdimm_pmu_events_group = {
+ .name = "events",
+ .attrs = nvdimm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-4");
+
+static struct attribute *nvdimm_pmu_format_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group nvdimm_pmu_format_group = {
+ .name = "format",
+ .attrs = nvdimm_pmu_format_attr,
+};
+
+ssize_t nvdimm_events_sysfs_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr;
+
+ pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+
+ return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+static ssize_t nvdimm_pmu_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct nvdimm_pmu *nd_pmu;
+
+ nd_pmu = container_of(pmu, struct nvdimm_pmu, pmu);
+
+ return cpumap_print_to_pagebuf(true, buf, cpumask_of(nd_pmu->cpu));
+}
+
+static int nvdimm_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+ struct nvdimm_pmu *nd_pmu;
+ u32 target;
+ int nodeid;
+ const struct cpumask *cpumask;
+
+ nd_pmu = hlist_entry_safe(node, struct nvdimm_pmu, node);
+
+ /* Clear it, in case the given cpu is set in nd_pmu->arch_cpumask */
+ cpumask_test_and_clear_cpu(cpu, &nd_pmu->arch_cpumask);
+
+ /*
+ * If the given cpu is not the same as the current designated cpu for
+ * counter access, just
return.
+ */
+ if (cpu != nd_pmu->cpu)
+ return 0;
+
+ /* Check for any active cpu in nd_pmu->arch_cpumask */
+ target = cpumask_any(&nd_pmu->arch_cpumask);
+
+ /*
+ * In case we don't have any active cpu in nd_pmu->arch_cpumask,
+ * check in the given cpu's numa node list.
+ */
+ if (target >= nr_cpu_ids) {
+ nodeid = cpu_to_node(cpu);
+ cpumask = cpumask_of_node(nodeid);
+ target = cpumask_any_but(cpumask, cpu);
+ }
+ nd_pmu->cpu = target;
+
+ /* Migrate nvdimm pmu events to the new target cpu if valid */
+ if (target >= 0 && target < nr_cpu_ids)
+ perf_pmu_migrate_context(&nd_pmu->pmu, cpu, target);
+
+ return 0;
+}
+
+static int nvdimm_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+ struct nvdimm_pmu *nd_pmu;
+
+ nd_pmu = hlist_entry_safe(node, struct nvdimm_pmu, node);
+
+ if (nd_pmu->cpu >= nr_cpu_ids)
+ nd_pmu->cpu = cpu;
+
+ return 0;
+}
+
+static int create_cpumask_attr_group(struct nvdimm_pmu *nd_pmu)
+{
+ struct perf_pmu_events_attr *pmu_events_attr;
+ struct attribute **attrs_group;
+ struct attribute_group *nvdimm_pmu_cpumask_group;
+
+ pmu_events_attr = kzalloc(sizeof(*pmu_events_attr), GFP_KERNEL);
+ if (!pmu_events_attr)
+ return -ENOMEM;
+
+ attrs_group = kzalloc(2 * sizeof(struct attribute *), GFP_KERNEL);
+ if (!attrs_group) {
+ kfree(pmu_events_attr);
+ return -ENOMEM;
+ }
+
+ /* Allocate memory for cpumask attribute group */
+ nvdimm_pmu_cpumask_group = kzalloc(sizeof(*nvdimm_pmu_cpumask_group), GFP_KERNEL);
+ if (!nvdimm_pmu_cpumask_group) {
+ kfree(pmu_events_attr);
+ kfree(attrs_group);
+ return -ENOMEM;
+ }
+
+ sysfs_attr_init(&pmu_events_attr->attr.attr);
+ pmu_events_attr->attr.attr.name = "cpumask";
+ pmu_events_attr->attr.attr.mode = 0444;
+ pmu_events_attr->attr.show = nvdimm_pmu_cpumask_show;
+ attrs_group[0] = &pmu_events_attr->attr.attr;
+ attrs_group[1] = NULL;
+
+ nvdimm_pmu_cpumask_group->attrs = attrs_group;
+ nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR] = nvdimm_pmu_cpumask_group;
+ return 0;
+}
+
+static int nvdimm_pmu_cpu_hotplug_init(struct nvdimm_pmu *nd_pmu)
+{
+ int nodeid, rc;
+ const struct cpumask *cpumask;
+
+ /*
+ * In case of cpu hotplug, arch specific code can provide the
+ * required cpumask which can be used to get the designated cpu
+ * for counter access.
+ * Check for any active cpu in nd_pmu->arch_cpumask.
+ */
+ if (!cpumask_empty(&nd_pmu->arch_cpumask)) {
+ nd_pmu->cpu = cpumask_any(&nd_pmu->arch_cpumask);
+ } else {
+ /* pick active cpu from the cpumask of device numa node.
*/ + nodeid = dev_to_node(nd_pmu->dev); + cpumask = cpumask_of_node(nodeid); + nd_pmu->cpu = cpumask_any(cpumask); + } + + rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/nvdimm:online", + nvdimm_pmu_cpu_online, nvdimm_pmu_cpu_offline); + + if (rc < 0) + return rc; + + nd_pmu->cpuhp_state = rc; + + /* Register the pmu instance for cpu hotplug */ + rc = cpuhp_state_add_instance_nocalls(nd_pmu->cpuhp_state, &nd_pmu->node); + if (rc) { + cpuhp_remove_multi_state(nd_pmu->cpuhp_state); + return rc; + } + + /* Create cpumask attribute group */ + rc = create_cpumask_attr_group(nd_pmu); + if (rc) { + cpuhp_state_remove_instance_nocalls(nd_pmu->cpuhp_state, &nd_pmu->node); + cpuhp_remove_multi_state(nd_pmu->cpuhp_state); + return rc; + } + + return 0; +} + +static void nvdimm_pmu_free_hotplug_memory(struct nvdimm_pmu *nd_pmu) +{ + cpuhp_state_remove_instance_nocalls(nd_pmu->cpuhp_state, &nd_pmu->node); + cpuhp_remove_multi_state(nd_pmu->cpuhp_state); + + if (nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR]) + kfree(nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR]->attrs); + kfree(nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR]); +} + +int register_nvdimm_pmu(struct nvdimm_pmu *nd_pmu, struct platform_device *pdev) +{ + int rc; + + if (!nd_pmu || !pdev) + return -EINVAL; + + /* event functions like add/del/read/event_init and pmu name should not be NULL */ + if (WARN_ON_ONCE(!(nd_pmu->pmu.event_init && nd_pmu->pmu.add && + nd_pmu->pmu.del && nd_pmu->pmu.read && nd_pmu->pmu.name))) + return -EINVAL; + + nd_pmu->pmu.attr_groups = kzalloc((NVDIMM_PMU_NULL_ATTR + 1) * + sizeof(struct attribute_group *), GFP_KERNEL); + if (!nd_pmu->pmu.attr_groups) + return -ENOMEM; + + /* + * Add platform_device->dev pointer to nvdimm_pmu to access + * device data in events functions. + */ + nd_pmu->dev = &pdev->dev; + + /* Fill attribute groups for the nvdimm pmu device */ + nd_pmu->pmu.attr_groups[NVDIMM_PMU_FORMAT_ATTR] = &nvdimm_pmu_format_group; + nd_pmu->pmu.attr_groups[NVDIMM_PMU_EVENT_ATTR] = &nvdimm_pmu_events_group; + nd_pmu->pmu.attr_groups[NVDIMM_PMU_NULL_ATTR] = NULL; + + /* Fill attribute group for cpumask */ + rc = nvdimm_pmu_cpu_hotplug_init(nd_pmu); + if (rc) { + pr_info("cpu hotplug feature failed for device: %s\n", nd_pmu->pmu.name); + kfree(nd_pmu->pmu.attr_groups); + return rc; + } + + rc = perf_pmu_register(&nd_pmu->pmu, nd_pmu->pmu.name, -1); + if (rc) { + nvdimm_pmu_free_hotplug_memory(nd_pmu); + kfree(nd_pmu->pmu.attr_groups); + return rc; + } + + pr_info("%s NVDIMM performance monitor support registered\n", + nd_pmu->pmu.name); + + return 0; +} +EXPORT_SYMBOL_GPL(register_nvdimm_pmu); + +void unregister_nvdimm_pmu(struct nvdimm_pmu *nd_pmu) +{ + perf_pmu_unregister(&nd_pmu->pmu); + nvdimm_pmu_free_hotplug_memory(nd_pmu); + kfree(nd_pmu->pmu.attr_groups); + kfree(nd_pmu); +} +EXPORT_SYMBOL_GPL(unregister_nvdimm_pmu); diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c new file mode 100644 index 000000000..c6a648fd8 --- /dev/null +++ b/drivers/nvdimm/nd_virtio.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio_pmem.c: Virtio pmem Driver + * + * Discovers persistent memory range information + * from host and provides a virtio based flushing + * interface. 
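+ *
+ * (Aside on the registration API that precedes this file: a minimal
+ * sketch of a platform-driver consumer of register_nvdimm_pmu(),
+ * with hypothetical foo_ callbacks, might look like:
+ *
+ *	static int foo_pmu_attach(struct platform_device *pdev)
+ *	{
+ *		struct nvdimm_pmu *nd_pmu;
+ *		int rc;
+ *
+ *		nd_pmu = kzalloc(sizeof(*nd_pmu), GFP_KERNEL);
+ *		if (!nd_pmu)
+ *			return -ENOMEM;
+ *		nd_pmu->pmu.task_ctx_nr = perf_invalid_context;
+ *		nd_pmu->pmu.module = THIS_MODULE;
+ *		nd_pmu->pmu.name = "nvdimm_foo";
+ *		nd_pmu->pmu.event_init = foo_event_init;
+ *		nd_pmu->pmu.add = foo_event_add;
+ *		nd_pmu->pmu.del = foo_event_del;
+ *		nd_pmu->pmu.read = foo_event_read;
+ *		rc = register_nvdimm_pmu(nd_pmu, pdev);
+ *		if (rc)
+ *			kfree(nd_pmu);
+ *		return rc;
+ *	}
+ *
+ * A typical foo_event_init additionally pins event->cpu to
+ * nd_pmu->cpu so counting follows the designated-CPU scheme kept by
+ * the hotplug callbacks above. On teardown, unregister_nvdimm_pmu()
+ * both unregisters the pmu and frees nd_pmu, so the caller frees
+ * only on registration failure.)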
+ */ +#include "virtio_pmem.h" +#include "nd.h" + + /* The interrupt handler */ +void virtio_pmem_host_ack(struct virtqueue *vq) +{ + struct virtio_pmem *vpmem = vq->vdev->priv; + struct virtio_pmem_request *req_data, *req_buf; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vpmem->pmem_lock, flags); + while ((req_data = virtqueue_get_buf(vq, &len)) != NULL) { + req_data->done = true; + wake_up(&req_data->host_acked); + + if (!list_empty(&vpmem->req_list)) { + req_buf = list_first_entry(&vpmem->req_list, + struct virtio_pmem_request, list); + req_buf->wq_buf_avail = true; + wake_up(&req_buf->wq_buf); + list_del(&req_buf->list); + } + } + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); +} +EXPORT_SYMBOL_GPL(virtio_pmem_host_ack); + + /* The request submission function */ +static int virtio_pmem_flush(struct nd_region *nd_region) +{ + struct virtio_device *vdev = nd_region->provider_data; + struct virtio_pmem *vpmem = vdev->priv; + struct virtio_pmem_request *req_data; + struct scatterlist *sgs[2], sg, ret; + unsigned long flags; + int err, err1; + + might_sleep(); + req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); + if (!req_data) + return -ENOMEM; + + req_data->done = false; + init_waitqueue_head(&req_data->host_acked); + init_waitqueue_head(&req_data->wq_buf); + INIT_LIST_HEAD(&req_data->list); + req_data->req.type = cpu_to_le32(VIRTIO_PMEM_REQ_TYPE_FLUSH); + sg_init_one(&sg, &req_data->req, sizeof(req_data->req)); + sgs[0] = &sg; + sg_init_one(&ret, &req_data->resp.ret, sizeof(req_data->resp)); + sgs[1] = &ret; + + spin_lock_irqsave(&vpmem->pmem_lock, flags); + /* + * If virtqueue_add_sgs returns -ENOSPC then req_vq virtual + * queue does not have free descriptor. We add the request + * to req_list and wait for host_ack to wake us up when free + * slots are available. + */ + while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req_data, + GFP_ATOMIC)) == -ENOSPC) { + + dev_info(&vdev->dev, "failed to send command to virtio pmem device, no free slots in the virtqueue\n"); + req_data->wq_buf_avail = false; + list_add_tail(&req_data->list, &vpmem->req_list); + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); + + /* A host response results in "host_ack" getting called */ + wait_event(req_data->wq_buf, req_data->wq_buf_avail); + spin_lock_irqsave(&vpmem->pmem_lock, flags); + } + err1 = virtqueue_kick(vpmem->req_vq); + spin_unlock_irqrestore(&vpmem->pmem_lock, flags); + /* + * virtqueue_add_sgs failed with error different than -ENOSPC, we can't + * do anything about that. + */ + if (err || !err1) { + dev_info(&vdev->dev, "failed to send command to virtio pmem device\n"); + err = -EIO; + } else { + /* A host repsonse results in "host_ack" getting called */ + wait_event(req_data->host_acked, req_data->done); + err = le32_to_cpu(req_data->resp.ret); + } + + kfree(req_data); + return err; +}; + +/* The asynchronous flush callback function */ +int async_pmem_flush(struct nd_region *nd_region, struct bio *bio) +{ + /* + * Create child bio for asynchronous flush and chain with + * parent bio. Otherwise directly call nd_region flush. 
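+ *
+ * For reference, the exchange driven by virtio_pmem_flush() above is
+ * a single request/response pair; the wire format (per the
+ * virtio-pmem specification and virtio_pmem.h, shown here as a
+ * sketch) is just:
+ *
+ *	struct virtio_pmem_req  { __le32 type; };  // FLUSH
+ *	struct virtio_pmem_resp { __le32 ret;  };  // 0 on success
+ *
+ * which is why virtqueue_add_sgs() is called with one out sg (the
+ * request) and one in sg (the response).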
+ */ + if (bio && bio->bi_iter.bi_sector != -1) { + struct bio *child = bio_alloc(bio->bi_bdev, 0, REQ_PREFLUSH, + GFP_ATOMIC); + + if (!child) + return -ENOMEM; + bio_clone_blkg_association(child, bio); + child->bi_iter.bi_sector = -1; + bio_chain(child, bio); + submit_bio(child); + return 0; + } + if (virtio_pmem_flush(nd_region)) + return -EIO; + + return 0; +}; +EXPORT_SYMBOL_GPL(async_pmem_flush); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c new file mode 100644 index 000000000..0243789ba --- /dev/null +++ b/drivers/nvdimm/of_pmem.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#define pr_fmt(fmt) "of_pmem: " fmt + +#include <linux/of_platform.h> +#include <linux/of_address.h> +#include <linux/libnvdimm.h> +#include <linux/module.h> +#include <linux/ioport.h> +#include <linux/slab.h> + +struct of_pmem_private { + struct nvdimm_bus_descriptor bus_desc; + struct nvdimm_bus *bus; +}; + +static int of_pmem_region_probe(struct platform_device *pdev) +{ + struct of_pmem_private *priv; + struct device_node *np; + struct nvdimm_bus *bus; + bool is_volatile; + int i; + + np = dev_of_node(&pdev->dev); + if (!np) + return -ENXIO; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->bus_desc.provider_name = devm_kstrdup(&pdev->dev, pdev->name, + GFP_KERNEL); + if (!priv->bus_desc.provider_name) { + kfree(priv); + return -ENOMEM; + } + + priv->bus_desc.module = THIS_MODULE; + priv->bus_desc.of_node = np; + + priv->bus = bus = nvdimm_bus_register(&pdev->dev, &priv->bus_desc); + if (!bus) { + kfree(priv); + return -ENODEV; + } + platform_set_drvdata(pdev, priv); + + is_volatile = !!of_find_property(np, "volatile", NULL); + dev_dbg(&pdev->dev, "Registering %s regions from %pOF\n", + is_volatile ? "volatile" : "non-volatile", np); + + for (i = 0; i < pdev->num_resources; i++) { + struct nd_region_desc ndr_desc; + struct nd_region *region; + + /* + * NB: libnvdimm copies the data from ndr_desc into it's own + * structures so passing a stack pointer is fine. 
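+ *
+ * An example of the device-tree node this probe matches (a sketch;
+ * cell counts depend on the parent bus, and the "volatile" property
+ * drives the is_volatile branch above). Each reg entry becomes one
+ * platform resource and hence one region in the loop that follows:
+ *
+ *	pmem@100000000 {
+ *		compatible = "pmem-region";
+ *		reg = <0x1 0x00000000 0x0 0x40000000>;  // 1G at 4G
+ *	};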
+ */ + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.numa_node = dev_to_node(&pdev->dev); + ndr_desc.target_node = ndr_desc.numa_node; + ndr_desc.res = &pdev->resource[i]; + ndr_desc.of_node = np; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + + if (is_volatile) + region = nvdimm_volatile_region_create(bus, &ndr_desc); + else { + set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags); + region = nvdimm_pmem_region_create(bus, &ndr_desc); + } + + if (!region) + dev_warn(&pdev->dev, "Unable to register region %pR from %pOF\n", + ndr_desc.res, np); + else + dev_dbg(&pdev->dev, "Registered region %pR from %pOF\n", + ndr_desc.res, np); + } + + return 0; +} + +static int of_pmem_region_remove(struct platform_device *pdev) +{ + struct of_pmem_private *priv = platform_get_drvdata(pdev); + + nvdimm_bus_unregister(priv->bus); + kfree(priv); + + return 0; +} + +static const struct of_device_id of_pmem_region_match[] = { + { .compatible = "pmem-region" }, + { .compatible = "pmem-region-v2" }, + { }, +}; + +static struct platform_driver of_pmem_region_driver = { + .probe = of_pmem_region_probe, + .remove = of_pmem_region_remove, + .driver = { + .name = "of_pmem", + .of_match_table = of_pmem_region_match, + }, +}; + +module_platform_driver(of_pmem_region_driver); +MODULE_DEVICE_TABLE(of, of_pmem_region_match); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("IBM Corporation"); diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h new file mode 100644 index 000000000..37cb1b8a2 --- /dev/null +++ b/drivers/nvdimm/pfn.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2014-2015, Intel Corporation. + */ + +#ifndef __NVDIMM_PFN_H +#define __NVDIMM_PFN_H + +#include <linux/types.h> +#include <linux/mmzone.h> + +#define PFN_SIG_LEN 16 +#define PFN_SIG "NVDIMM_PFN_INFO\0" +#define DAX_SIG "NVDIMM_DAX_INFO\0" + +struct nd_pfn_sb { + u8 signature[PFN_SIG_LEN]; + u8 uuid[16]; + u8 parent_uuid[16]; + __le32 flags; + __le16 version_major; + __le16 version_minor; + __le64 dataoff; /* relative to namespace_base + start_pad */ + __le64 npfns; + __le32 mode; + /* minor-version-1 additions for section alignment */ + /** + * @start_pad: Deprecated attribute to pad start-misaligned namespaces + * + * start_pad is deprecated because the original definition did + * not comprehend that dataoff is relative to the base address + * of the namespace not the start_pad adjusted base. The result + * is that the dax path is broken, but the block-I/O path is + * not. The kernel will no longer create namespaces using start + * padding, but it still supports block-I/O for legacy + * configurations mainly to allow a backup, reconfigure the + * namespace, and restore flow to repair dax operation. + */ + __le32 start_pad; + __le32 end_trunc; + /* minor-version-2 record the base alignment of the mapping */ + __le32 align; + /* minor-version-3 guarantee the padding and flags are zero */ + /* minor-version-4 record the page size and struct page size */ + __le32 page_size; + __le16 page_struct_size; + u8 padding[3994]; + __le64 checksum; +}; + +#endif /* __NVDIMM_PFN_H */ diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c new file mode 100644 index 000000000..af7d93015 --- /dev/null +++ b/drivers/nvdimm/pfn_devs.c @@ -0,0 +1,863 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2016 Intel Corporation. All rights reserved. 
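+ *
+ * (Aside on struct nd_pfn_sb in pfn.h above: the padding is sized so
+ * the info block is exactly one 4K page -- 48 bytes of signature and
+ * uuids, 46 bytes of fixed fields, 3994 bytes of padding and an
+ * 8-byte checksum sum to 4096 -- so a compile-time check such as
+ *
+ *	static_assert(sizeof(struct nd_pfn_sb) == 4096);
+ *
+ * would hold; the sketch assumes static_assert from build_bug.h.)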
+ */ +#include <linux/memremap.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "pfn.h" +#include "nd.h" + +static const bool page_struct_override = IS_ENABLED(CONFIG_NVDIMM_KMSAN); + +static void nd_pfn_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + dev_dbg(dev, "trace\n"); + nd_detach_ndns(&nd_pfn->dev, &nd_pfn->ndns); + ida_simple_remove(&nd_region->pfn_ida, nd_pfn->id); + kfree(nd_pfn->uuid); + kfree(nd_pfn); +} + +struct nd_pfn *to_nd_pfn(struct device *dev) +{ + struct nd_pfn *nd_pfn = container_of(dev, struct nd_pfn, dev); + + WARN_ON(!is_nd_pfn(dev)); + return nd_pfn; +} +EXPORT_SYMBOL(to_nd_pfn); + +static ssize_t mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + + switch (nd_pfn->mode) { + case PFN_MODE_RAM: + return sprintf(buf, "ram\n"); + case PFN_MODE_PMEM: + return sprintf(buf, "pmem\n"); + default: + return sprintf(buf, "none\n"); + } +} + +static ssize_t mode_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc = 0; + + device_lock(dev); + nvdimm_bus_lock(dev); + if (dev->driver) + rc = -EBUSY; + else { + size_t n = len - 1; + + if (strncmp(buf, "pmem\n", n) == 0 + || strncmp(buf, "pmem", n) == 0) { + nd_pfn->mode = PFN_MODE_PMEM; + } else if (strncmp(buf, "ram\n", n) == 0 + || strncmp(buf, "ram", n) == 0) + nd_pfn->mode = PFN_MODE_RAM; + else if (strncmp(buf, "none\n", n) == 0 + || strncmp(buf, "none", n) == 0) + nd_pfn->mode = PFN_MODE_NONE; + else + rc = -EINVAL; + } + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(mode); + +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + + return sprintf(buf, "%ld\n", nd_pfn->align); +} + +static unsigned long *nd_pfn_supported_alignments(unsigned long *alignments) +{ + + alignments[0] = PAGE_SIZE; + + if (has_transparent_hugepage()) { + alignments[1] = HPAGE_PMD_SIZE; + if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + alignments[2] = HPAGE_PUD_SIZE; + } + + return alignments; +} + +/* + * Use pmd mapping if supported as default alignment + */ +static unsigned long nd_pfn_default_alignment(void) +{ + + if (has_transparent_hugepage()) + return HPAGE_PMD_SIZE; + return PAGE_SIZE; +} + +static ssize_t align_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_size_select_store(dev, buf, &nd_pfn->align, + nd_pfn_supported_alignments(aligns)); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? 
rc : len; +} +static DEVICE_ATTR_RW(align); + +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + + if (nd_pfn->uuid) + return sprintf(buf, "%pUb\n", nd_pfn->uuid); + return sprintf(buf, "\n"); +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc; + + device_lock(dev); + rc = nd_uuid_store(dev, &nd_pfn->uuid, buf, len); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t namespace_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + rc = sprintf(buf, "%s\n", nd_pfn->ndns + ? dev_name(&nd_pfn->ndns->dev) : ""); + nvdimm_bus_unlock(dev); + return rc; +} + +static ssize_t namespace_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_namespace_store(dev, &nd_pfn->ndns, buf, len); + dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(namespace); + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc; + + device_lock(dev); + if (dev->driver) { + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + u64 offset = __le64_to_cpu(pfn_sb->dataoff); + struct nd_namespace_common *ndns = nd_pfn->ndns; + u32 start_pad = __le32_to_cpu(pfn_sb->start_pad); + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + + rc = sprintf(buf, "%#llx\n", (unsigned long long) nsio->res.start + + start_pad + offset); + } else { + /* no address to convey if the pfn instance is disabled */ + rc = -ENXIO; + } + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_ADMIN_RO(resource); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev); + ssize_t rc; + + device_lock(dev); + if (dev->driver) { + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + u64 offset = __le64_to_cpu(pfn_sb->dataoff); + struct nd_namespace_common *ndns = nd_pfn->ndns; + u32 start_pad = __le32_to_cpu(pfn_sb->start_pad); + u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc); + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + + rc = sprintf(buf, "%llu\n", (unsigned long long) + resource_size(&nsio->res) - start_pad + - end_trunc - offset); + } else { + /* no size to convey if the pfn instance is disabled */ + rc = -ENXIO; + } + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(size); + +static ssize_t supported_alignments_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + + return nd_size_select_show(0, + nd_pfn_supported_alignments(aligns), buf); +} +static DEVICE_ATTR_RO(supported_alignments); + +static struct attribute *nd_pfn_attributes[] = { + &dev_attr_mode.attr, + &dev_attr_namespace.attr, + &dev_attr_uuid.attr, + &dev_attr_align.attr, + &dev_attr_resource.attr, + &dev_attr_size.attr, + &dev_attr_supported_alignments.attr, + NULL, +}; + 
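+
+/*
+ * Aside: the paired "pmem\n"/"pmem" strncmp() checks in mode_store()
+ * above are essentially what sysfs_streq() encapsulates; an
+ * equivalent parse (a sketch, the example_ name is hypothetical)
+ * would be:
+ *
+ *	static enum nd_pfn_mode example_parse_mode(const char *buf)
+ *	{
+ *		if (sysfs_streq(buf, "pmem"))
+ *			return PFN_MODE_PMEM;
+ *		if (sysfs_streq(buf, "ram"))
+ *			return PFN_MODE_RAM;
+ *		return PFN_MODE_NONE;
+ *	}
+ */
+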
+static struct attribute_group nd_pfn_attribute_group = { + .attrs = nd_pfn_attributes, +}; + +const struct attribute_group *nd_pfn_attribute_groups[] = { + &nd_pfn_attribute_group, + &nd_device_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +static const struct device_type nd_pfn_device_type = { + .name = "nd_pfn", + .release = nd_pfn_release, + .groups = nd_pfn_attribute_groups, +}; + +bool is_nd_pfn(struct device *dev) +{ + return dev ? dev->type == &nd_pfn_device_type : false; +} +EXPORT_SYMBOL(is_nd_pfn); + +static struct lock_class_key nvdimm_pfn_key; + +struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn, + struct nd_namespace_common *ndns) +{ + struct device *dev; + + if (!nd_pfn) + return NULL; + + nd_pfn->mode = PFN_MODE_NONE; + nd_pfn->align = nd_pfn_default_alignment(); + dev = &nd_pfn->dev; + device_initialize(&nd_pfn->dev); + lockdep_set_class(&nd_pfn->dev.mutex, &nvdimm_pfn_key); + if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) { + dev_dbg(&ndns->dev, "failed, already claimed by %s\n", + dev_name(ndns->claim)); + put_device(dev); + return NULL; + } + return dev; +} + +static struct nd_pfn *nd_pfn_alloc(struct nd_region *nd_region) +{ + struct nd_pfn *nd_pfn; + struct device *dev; + + nd_pfn = kzalloc(sizeof(*nd_pfn), GFP_KERNEL); + if (!nd_pfn) + return NULL; + + nd_pfn->id = ida_simple_get(&nd_region->pfn_ida, 0, 0, GFP_KERNEL); + if (nd_pfn->id < 0) { + kfree(nd_pfn); + return NULL; + } + + dev = &nd_pfn->dev; + dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id); + dev->type = &nd_pfn_device_type; + dev->parent = &nd_region->dev; + + return nd_pfn; +} + +struct device *nd_pfn_create(struct nd_region *nd_region) +{ + struct nd_pfn *nd_pfn; + struct device *dev; + + if (!is_memory(&nd_region->dev)) + return NULL; + + nd_pfn = nd_pfn_alloc(nd_region); + dev = nd_pfn_devinit(nd_pfn, NULL); + + nd_device_register(dev); + return dev; +} + +/* + * nd_pfn_clear_memmap_errors() clears any errors in the volatile memmap + * space associated with the namespace. If the memmap is set to DRAM, then + * this is a no-op. Since the memmap area is freshly initialized during + * probe, we have an opportunity to clear any badblocks in this area. + */ +static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) +{ + struct nd_region *nd_region = to_nd_region(nd_pfn->dev.parent); + struct nd_namespace_common *ndns = nd_pfn->ndns; + void *zero_page = page_address(ZERO_PAGE(0)); + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + int num_bad, meta_num, rc, bb_present; + sector_t first_bad, meta_start; + struct nd_namespace_io *nsio; + + if (nd_pfn->mode != PFN_MODE_PMEM) + return 0; + + nsio = to_nd_namespace_io(&ndns->dev); + meta_start = (SZ_4K + sizeof(*pfn_sb)) >> 9; + meta_num = (le64_to_cpu(pfn_sb->dataoff) >> 9) - meta_start; + + /* + * re-enable the namespace with correct size so that we can access + * the device memmap area. 
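+ *
+ * (Naming aside: nd_pfn_alloc() above pairs the region id with a
+ * per-region ida, so the first pfn seed under region3 appears as
+ * pfn3.0 and exposes the attributes declared earlier, e.g.
+ *
+ *	/sys/bus/nd/devices/pfn3.0/{mode,align,uuid,namespace,size}
+ *
+ * assuming the usual sysfs mount.)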
+ */ + devm_namespace_disable(&nd_pfn->dev, ndns); + rc = devm_namespace_enable(&nd_pfn->dev, ndns, le64_to_cpu(pfn_sb->dataoff)); + if (rc) + return rc; + + do { + unsigned long zero_len; + u64 nsoff; + + bb_present = badblocks_check(&nd_region->bb, meta_start, + meta_num, &first_bad, &num_bad); + if (bb_present) { + dev_dbg(&nd_pfn->dev, "meta: %x badblocks at %llx\n", + num_bad, first_bad); + nsoff = ALIGN_DOWN((nd_region->ndr_start + + (first_bad << 9)) - nsio->res.start, + PAGE_SIZE); + zero_len = ALIGN(num_bad << 9, PAGE_SIZE); + while (zero_len) { + unsigned long chunk = min(zero_len, PAGE_SIZE); + + rc = nvdimm_write_bytes(ndns, nsoff, zero_page, + chunk, 0); + if (rc) + break; + + zero_len -= chunk; + nsoff += chunk; + } + if (rc) { + dev_err(&nd_pfn->dev, + "error clearing %x badblocks at %llx\n", + num_bad, first_bad); + return rc; + } + } + } while (bb_present); + + return 0; +} + +static bool nd_supported_alignment(unsigned long align) +{ + int i; + unsigned long supported[MAX_NVDIMM_ALIGN] = { [0] = 0, }; + + if (align == 0) + return false; + + nd_pfn_supported_alignments(supported); + for (i = 0; supported[i]; i++) + if (align == supported[i]) + return true; + return false; +} + +/** + * nd_pfn_validate - read and validate info-block + * @nd_pfn: fsdax namespace runtime state / properties + * @sig: 'devdax' or 'fsdax' signature + * + * Upon return the info-block buffer contents (->pfn_sb) are + * indeterminate when validation fails, and a coherent info-block + * otherwise. + */ +int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) +{ + u64 checksum, offset; + struct resource *res; + enum nd_pfn_mode mode; + struct nd_namespace_io *nsio; + unsigned long align, start_pad; + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + struct nd_namespace_common *ndns = nd_pfn->ndns; + const uuid_t *parent_uuid = nd_dev_to_uuid(&ndns->dev); + + if (!pfn_sb || !ndns) + return -ENODEV; + + if (!is_memory(nd_pfn->dev.parent)) + return -ENODEV; + + if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0)) + return -ENXIO; + + if (memcmp(pfn_sb->signature, sig, PFN_SIG_LEN) != 0) + return -ENODEV; + + checksum = le64_to_cpu(pfn_sb->checksum); + pfn_sb->checksum = 0; + if (checksum != nd_sb_checksum((struct nd_gen_sb *) pfn_sb)) + return -ENODEV; + pfn_sb->checksum = cpu_to_le64(checksum); + + if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0) + return -ENODEV; + + if (__le16_to_cpu(pfn_sb->version_minor) < 1) { + pfn_sb->start_pad = 0; + pfn_sb->end_trunc = 0; + } + + if (__le16_to_cpu(pfn_sb->version_minor) < 2) + pfn_sb->align = 0; + + if (__le16_to_cpu(pfn_sb->version_minor) < 4) { + pfn_sb->page_struct_size = cpu_to_le16(64); + pfn_sb->page_size = cpu_to_le32(PAGE_SIZE); + } + + switch (le32_to_cpu(pfn_sb->mode)) { + case PFN_MODE_RAM: + case PFN_MODE_PMEM: + break; + default: + return -ENXIO; + } + + align = le32_to_cpu(pfn_sb->align); + offset = le64_to_cpu(pfn_sb->dataoff); + start_pad = le32_to_cpu(pfn_sb->start_pad); + if (align == 0) + align = 1UL << ilog2(offset); + mode = le32_to_cpu(pfn_sb->mode); + + if ((le32_to_cpu(pfn_sb->page_size) > PAGE_SIZE) && + (mode == PFN_MODE_PMEM)) { + dev_err(&nd_pfn->dev, + "init failed, page size mismatch %d\n", + le32_to_cpu(pfn_sb->page_size)); + return -EOPNOTSUPP; + } + + if ((le16_to_cpu(pfn_sb->page_struct_size) < sizeof(struct page)) && + (mode == PFN_MODE_PMEM)) { + dev_err(&nd_pfn->dev, + "init failed, struct page size mismatch %d\n", + le16_to_cpu(pfn_sb->page_struct_size)); + return -EOPNOTSUPP; + } + + /* + * Check whether 
the we support the alignment. For Dax if the + * superblock alignment is not matching, we won't initialize + * the device. + */ + if (!nd_supported_alignment(align) && + !memcmp(pfn_sb->signature, DAX_SIG, PFN_SIG_LEN)) { + dev_err(&nd_pfn->dev, "init failed, alignment mismatch: " + "%ld:%ld\n", nd_pfn->align, align); + return -EOPNOTSUPP; + } + + if (!nd_pfn->uuid) { + /* + * When probing a namepace via nd_pfn_probe() the uuid + * is NULL (see: nd_pfn_devinit()) we init settings from + * pfn_sb + */ + nd_pfn->uuid = kmemdup(pfn_sb->uuid, 16, GFP_KERNEL); + if (!nd_pfn->uuid) + return -ENOMEM; + nd_pfn->align = align; + nd_pfn->mode = mode; + } else { + /* + * When probing a pfn / dax instance we validate the + * live settings against the pfn_sb + */ + if (memcmp(nd_pfn->uuid, pfn_sb->uuid, 16) != 0) + return -ENODEV; + + /* + * If the uuid validates, but other settings mismatch + * return EINVAL because userspace has managed to change + * the configuration without specifying new + * identification. + */ + if (nd_pfn->align != align || nd_pfn->mode != mode) { + dev_err(&nd_pfn->dev, + "init failed, settings mismatch\n"); + dev_dbg(&nd_pfn->dev, "align: %lx:%lx mode: %d:%d\n", + nd_pfn->align, align, nd_pfn->mode, + mode); + return -EOPNOTSUPP; + } + } + + if (align > nvdimm_namespace_capacity(ndns)) { + dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n", + align, nvdimm_namespace_capacity(ndns)); + return -EOPNOTSUPP; + } + + /* + * These warnings are verbose because they can only trigger in + * the case where the physical address alignment of the + * namespace has changed since the pfn superblock was + * established. + */ + nsio = to_nd_namespace_io(&ndns->dev); + res = &nsio->res; + if (offset >= resource_size(res)) { + dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n", + dev_name(&ndns->dev)); + return -EOPNOTSUPP; + } + + if ((align && !IS_ALIGNED(res->start + offset + start_pad, align)) + || !IS_ALIGNED(offset, PAGE_SIZE)) { + dev_err(&nd_pfn->dev, + "bad offset: %#llx dax disabled align: %#lx\n", + offset, align); + return -EOPNOTSUPP; + } + + if (!IS_ALIGNED(res->start + le32_to_cpu(pfn_sb->start_pad), + memremap_compat_align())) { + dev_err(&nd_pfn->dev, "resource start misaligned\n"); + return -EOPNOTSUPP; + } + + if (!IS_ALIGNED(res->end + 1 - le32_to_cpu(pfn_sb->end_trunc), + memremap_compat_align())) { + dev_err(&nd_pfn->dev, "resource end misaligned\n"); + return -EOPNOTSUPP; + } + + return 0; +} +EXPORT_SYMBOL(nd_pfn_validate); + +int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns) +{ + int rc; + struct nd_pfn *nd_pfn; + struct device *pfn_dev; + struct nd_pfn_sb *pfn_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + case NVDIMM_CCLASS_PFN: + break; + default: + return -ENODEV; + } + + nvdimm_bus_lock(&ndns->dev); + nd_pfn = nd_pfn_alloc(nd_region); + pfn_dev = nd_pfn_devinit(nd_pfn, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!pfn_dev) + return -ENOMEM; + pfn_sb = devm_kmalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); + nd_pfn = to_nd_pfn(pfn_dev); + nd_pfn->pfn_sb = pfn_sb; + rc = nd_pfn_validate(nd_pfn, PFN_SIG); + dev_dbg(dev, "pfn: %s\n", rc == 0 ? 
dev_name(pfn_dev) : "<none>"); + if (rc < 0) { + nd_detach_ndns(pfn_dev, &nd_pfn->ndns); + put_device(pfn_dev); + } else + nd_device_register(pfn_dev); + + return rc; +} +EXPORT_SYMBOL(nd_pfn_probe); + +/* + * We hotplug memory at sub-section granularity, pad the reserved area + * from the previous section base to the namespace base address. + */ +static unsigned long init_altmap_base(resource_size_t base) +{ + unsigned long base_pfn = PHYS_PFN(base); + + return SUBSECTION_ALIGN_DOWN(base_pfn); +} + +static unsigned long init_altmap_reserve(resource_size_t base) +{ + unsigned long reserve = nd_info_block_reserve() >> PAGE_SHIFT; + unsigned long base_pfn = PHYS_PFN(base); + + reserve += base_pfn - SUBSECTION_ALIGN_DOWN(base_pfn); + return reserve; +} + +static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) +{ + struct range *range = &pgmap->range; + struct vmem_altmap *altmap = &pgmap->altmap; + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + u64 offset = le64_to_cpu(pfn_sb->dataoff); + u32 start_pad = __le32_to_cpu(pfn_sb->start_pad); + u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc); + u32 reserve = nd_info_block_reserve(); + struct nd_namespace_common *ndns = nd_pfn->ndns; + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + resource_size_t base = nsio->res.start + start_pad; + resource_size_t end = nsio->res.end - end_trunc; + struct vmem_altmap __altmap = { + .base_pfn = init_altmap_base(base), + .reserve = init_altmap_reserve(base), + .end_pfn = PHYS_PFN(end), + }; + + *range = (struct range) { + .start = nsio->res.start + start_pad, + .end = nsio->res.end - end_trunc, + }; + pgmap->nr_range = 1; + if (nd_pfn->mode == PFN_MODE_RAM) { + if (offset < reserve) + return -EINVAL; + nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); + } else if (nd_pfn->mode == PFN_MODE_PMEM) { + nd_pfn->npfns = PHYS_PFN((range_len(range) - offset)); + if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) + dev_info(&nd_pfn->dev, + "number of pfns truncated from %lld to %ld\n", + le64_to_cpu(nd_pfn->pfn_sb->npfns), + nd_pfn->npfns); + memcpy(altmap, &__altmap, sizeof(*altmap)); + altmap->free = PHYS_PFN(offset - reserve); + altmap->alloc = 0; + pgmap->flags |= PGMAP_ALTMAP_VALID; + } else + return -ENXIO; + + return 0; +} + +static int nd_pfn_init(struct nd_pfn *nd_pfn) +{ + struct nd_namespace_common *ndns = nd_pfn->ndns; + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + resource_size_t start, size; + struct nd_region *nd_region; + unsigned long npfns, align; + u32 end_trunc; + struct nd_pfn_sb *pfn_sb; + phys_addr_t offset; + const char *sig; + u64 checksum; + int rc; + + pfn_sb = devm_kmalloc(&nd_pfn->dev, sizeof(*pfn_sb), GFP_KERNEL); + if (!pfn_sb) + return -ENOMEM; + + nd_pfn->pfn_sb = pfn_sb; + if (is_nd_dax(&nd_pfn->dev)) + sig = DAX_SIG; + else + sig = PFN_SIG; + + rc = nd_pfn_validate(nd_pfn, sig); + if (rc == 0) + return nd_pfn_clear_memmap_errors(nd_pfn); + if (rc != -ENODEV) + return rc; + + /* no info block, do init */; + memset(pfn_sb, 0, sizeof(*pfn_sb)); + + nd_region = to_nd_region(nd_pfn->dev.parent); + if (nd_region->ro) { + dev_info(&nd_pfn->dev, + "%s is read-only, unable to init metadata\n", + dev_name(&nd_region->dev)); + return -ENXIO; + } + + start = nsio->res.start; + size = resource_size(&nsio->res); + npfns = PHYS_PFN(size - SZ_8K); + align = max(nd_pfn->align, memremap_compat_align()); + + /* + * When @start is misaligned fail namespace creation. See + * the 'struct nd_pfn_sb' commentary on why ->start_pad is not + * an option. 
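+ *
+ * (Worked example for init_altmap_base()/init_altmap_reserve()
+ * above, assuming 4K pages and 2M subsections: a namespace based at
+ * 0x100001000 gives base_pfn = 0x100001, which rounds down to the
+ * subsection boundary 0x100000, so
+ *
+ *	reserve = (SZ_8K >> PAGE_SHIFT) + (0x100001 - 0x100000) = 3
+ *
+ * pages; in PFN_MODE_PMEM the pages between that reserve and
+ * dataoff are then handed to the altmap as 'free' backing for the
+ * struct page array.)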
+ */ + if (!IS_ALIGNED(start, memremap_compat_align())) { + dev_err(&nd_pfn->dev, "%s: start %pa misaligned to %#lx\n", + dev_name(&ndns->dev), &start, + memremap_compat_align()); + return -EINVAL; + } + end_trunc = start + size - ALIGN_DOWN(start + size, align); + if (nd_pfn->mode == PFN_MODE_PMEM) { + unsigned long page_map_size = MAX_STRUCT_PAGE_SIZE * npfns; + + /* + * The altmap should be padded out to the block size used + * when populating the vmemmap. This *should* be equal to + * PMD_SIZE for most architectures. + * + * Also make sure size of struct page is less than + * MAX_STRUCT_PAGE_SIZE. The goal here is compatibility in the + * face of production kernel configurations that reduce the + * 'struct page' size below MAX_STRUCT_PAGE_SIZE. For debug + * kernel configurations that increase the 'struct page' size + * above MAX_STRUCT_PAGE_SIZE, the page_struct_override allows + * for continuing with the capacity that will be wasted when + * reverting to a production kernel configuration. Otherwise, + * those configurations are blocked by default. + */ + if (sizeof(struct page) > MAX_STRUCT_PAGE_SIZE) { + if (page_struct_override) + page_map_size = sizeof(struct page) * npfns; + else { + dev_err(&nd_pfn->dev, + "Memory debug options prevent using pmem for the page map\n"); + return -EINVAL; + } + } + offset = ALIGN(start + SZ_8K + page_map_size, align) - start; + } else if (nd_pfn->mode == PFN_MODE_RAM) + offset = ALIGN(start + SZ_8K, align) - start; + else + return -ENXIO; + + if (offset >= size) { + dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n", + dev_name(&ndns->dev)); + return -ENXIO; + } + + npfns = PHYS_PFN(size - offset - end_trunc); + pfn_sb->mode = cpu_to_le32(nd_pfn->mode); + pfn_sb->dataoff = cpu_to_le64(offset); + pfn_sb->npfns = cpu_to_le64(npfns); + memcpy(pfn_sb->signature, sig, PFN_SIG_LEN); + memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); + memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); + pfn_sb->version_major = cpu_to_le16(1); + pfn_sb->version_minor = cpu_to_le16(4); + pfn_sb->end_trunc = cpu_to_le32(end_trunc); + pfn_sb->align = cpu_to_le32(nd_pfn->align); + if (sizeof(struct page) > MAX_STRUCT_PAGE_SIZE && page_struct_override) + pfn_sb->page_struct_size = cpu_to_le16(sizeof(struct page)); + else + pfn_sb->page_struct_size = cpu_to_le16(MAX_STRUCT_PAGE_SIZE); + pfn_sb->page_size = cpu_to_le32(PAGE_SIZE); + checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); + pfn_sb->checksum = cpu_to_le64(checksum); + + rc = nd_pfn_clear_memmap_errors(nd_pfn); + if (rc) + return rc; + + return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0); +} + +/* + * Determine the effective resource range and vmem_altmap from an nd_pfn + * instance. + */ +int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) +{ + int rc; + + if (!nd_pfn->uuid || !nd_pfn->ndns) + return -ENODEV; + + rc = nd_pfn_init(nd_pfn); + if (rc) + return rc; + + /* we need a valid pfn_sb before we can init a dev_pagemap */ + return __nvdimm_setup_pfn(nd_pfn, pgmap); +} +EXPORT_SYMBOL_GPL(nvdimm_setup_pfn); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c new file mode 100644 index 000000000..96e6e9a5f --- /dev/null +++ b/drivers/nvdimm/pmem.c @@ -0,0 +1,789 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Persistent Memory Driver + * + * Copyright (c) 2014-2015, Intel Corporation. + * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>. + * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>. 
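+ *
+ * (Sizing aside for nd_pfn_init() above, with illustrative numbers:
+ * assuming 4K pages, a 2M-aligned start, a 2M align and
+ * MAX_STRUCT_PAGE_SIZE of 64, a 16G namespace has npfns of roughly
+ * 4194302, so page_map_size is about 256M and
+ *
+ *	offset = ALIGN(start + SZ_8K + 256M, 2M) - start = 258M
+ *
+ * i.e. storing the page array on pmem itself costs about 16M per 1G
+ * of namespace capacity.)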
+ */ + +#include <linux/blkdev.h> +#include <linux/pagemap.h> +#include <linux/hdreg.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/set_memory.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/badblocks.h> +#include <linux/memremap.h> +#include <linux/vmalloc.h> +#include <linux/blk-mq.h> +#include <linux/pfn_t.h> +#include <linux/slab.h> +#include <linux/uio.h> +#include <linux/dax.h> +#include <linux/nd.h> +#include <linux/mm.h> +#include <asm/cacheflush.h> +#include "pmem.h" +#include "btt.h" +#include "pfn.h" +#include "nd.h" + +static struct device *to_dev(struct pmem_device *pmem) +{ + /* + * nvdimm bus services need a 'dev' parameter, and we record the device + * at init in bb.dev. + */ + return pmem->bb.dev; +} + +static struct nd_region *to_region(struct pmem_device *pmem) +{ + return to_nd_region(to_dev(pmem)->parent); +} + +static phys_addr_t pmem_to_phys(struct pmem_device *pmem, phys_addr_t offset) +{ + return pmem->phys_addr + offset; +} + +static sector_t to_sect(struct pmem_device *pmem, phys_addr_t offset) +{ + return (offset - pmem->data_offset) >> SECTOR_SHIFT; +} + +static phys_addr_t to_offset(struct pmem_device *pmem, sector_t sector) +{ + return (sector << SECTOR_SHIFT) + pmem->data_offset; +} + +static void pmem_mkpage_present(struct pmem_device *pmem, phys_addr_t offset, + unsigned int len) +{ + phys_addr_t phys = pmem_to_phys(pmem, offset); + unsigned long pfn_start, pfn_end, pfn; + + /* only pmem in the linear map supports HWPoison */ + if (is_vmalloc_addr(pmem->virt_addr)) + return; + + pfn_start = PHYS_PFN(phys); + pfn_end = pfn_start + PHYS_PFN(len); + for (pfn = pfn_start; pfn < pfn_end; pfn++) { + struct page *page = pfn_to_page(pfn); + + /* + * Note, no need to hold a get_dev_pagemap() reference + * here since we're in the driver I/O path and + * outstanding I/O requests pin the dev_pagemap. 
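+ *
+ * (Translation aside for the helpers above, with illustrative
+ * numbers: on a pfn-mode namespace with data_offset = 2M, sector 0
+ * maps to device offset 2M via to_offset(), pmem_to_phys() then
+ * yields phys_addr + 2M, and to_sect() inverts the mapping, keeping
+ * the 512-byte-sector badblocks bookkeeping in sync with the linear
+ * pmem mapping.)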
+ */ + if (test_and_clear_pmem_poison(page)) + clear_mce_nospec(pfn); + } +} + +static void pmem_clear_bb(struct pmem_device *pmem, sector_t sector, long blks) +{ + if (blks == 0) + return; + badblocks_clear(&pmem->bb, sector, blks); + if (pmem->bb_state) + sysfs_notify_dirent(pmem->bb_state); +} + +static long __pmem_clear_poison(struct pmem_device *pmem, + phys_addr_t offset, unsigned int len) +{ + phys_addr_t phys = pmem_to_phys(pmem, offset); + long cleared = nvdimm_clear_poison(to_dev(pmem), phys, len); + + if (cleared > 0) { + pmem_mkpage_present(pmem, offset, cleared); + arch_invalidate_pmem(pmem->virt_addr + offset, len); + } + return cleared; +} + +static blk_status_t pmem_clear_poison(struct pmem_device *pmem, + phys_addr_t offset, unsigned int len) +{ + long cleared = __pmem_clear_poison(pmem, offset, len); + + if (cleared < 0) + return BLK_STS_IOERR; + + pmem_clear_bb(pmem, to_sect(pmem, offset), cleared >> SECTOR_SHIFT); + if (cleared < len) + return BLK_STS_IOERR; + return BLK_STS_OK; +} + +static void write_pmem(void *pmem_addr, struct page *page, + unsigned int off, unsigned int len) +{ + unsigned int chunk; + void *mem; + + while (len) { + mem = kmap_atomic(page); + chunk = min_t(unsigned int, len, PAGE_SIZE - off); + memcpy_flushcache(pmem_addr, mem + off, chunk); + kunmap_atomic(mem); + len -= chunk; + off = 0; + page++; + pmem_addr += chunk; + } +} + +static blk_status_t read_pmem(struct page *page, unsigned int off, + void *pmem_addr, unsigned int len) +{ + unsigned int chunk; + unsigned long rem; + void *mem; + + while (len) { + mem = kmap_atomic(page); + chunk = min_t(unsigned int, len, PAGE_SIZE - off); + rem = copy_mc_to_kernel(mem + off, pmem_addr, chunk); + kunmap_atomic(mem); + if (rem) + return BLK_STS_IOERR; + len -= chunk; + off = 0; + page++; + pmem_addr += chunk; + } + return BLK_STS_OK; +} + +static blk_status_t pmem_do_read(struct pmem_device *pmem, + struct page *page, unsigned int page_off, + sector_t sector, unsigned int len) +{ + blk_status_t rc; + phys_addr_t pmem_off = to_offset(pmem, sector); + void *pmem_addr = pmem->virt_addr + pmem_off; + + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) + return BLK_STS_IOERR; + + rc = read_pmem(page, page_off, pmem_addr, len); + flush_dcache_page(page); + return rc; +} + +static blk_status_t pmem_do_write(struct pmem_device *pmem, + struct page *page, unsigned int page_off, + sector_t sector, unsigned int len) +{ + phys_addr_t pmem_off = to_offset(pmem, sector); + void *pmem_addr = pmem->virt_addr + pmem_off; + + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) { + blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len); + + if (rc != BLK_STS_OK) + return rc; + } + + flush_dcache_page(page); + write_pmem(pmem_addr, page, page_off, len); + + return BLK_STS_OK; +} + +static void pmem_submit_bio(struct bio *bio) +{ + int ret = 0; + blk_status_t rc = 0; + bool do_acct; + unsigned long start; + struct bio_vec bvec; + struct bvec_iter iter; + struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data; + struct nd_region *nd_region = to_region(pmem); + + if (bio->bi_opf & REQ_PREFLUSH) + ret = nvdimm_flush(nd_region, bio); + + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); + if (do_acct) + start = bio_start_io_acct(bio); + bio_for_each_segment(bvec, bio, iter) { + if (op_is_write(bio_op(bio))) + rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset, + iter.bi_sector, bvec.bv_len); + else + rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset, + iter.bi_sector, bvec.bv_len); + if (rc) { + 
bio->bi_status = rc; + break; + } + } + if (do_acct) + bio_end_io_acct(bio, start); + + if (bio->bi_opf & REQ_FUA) + ret = nvdimm_flush(nd_region, bio); + + if (ret) + bio->bi_status = errno_to_blk_status(ret); + + bio_endio(bio); +} + +static int pmem_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, enum req_op op) +{ + struct pmem_device *pmem = bdev->bd_disk->private_data; + blk_status_t rc; + + if (op_is_write(op)) + rc = pmem_do_write(pmem, page, 0, sector, thp_size(page)); + else + rc = pmem_do_read(pmem, page, 0, sector, thp_size(page)); + /* + * The ->rw_page interface is subtle and tricky. The core + * retries on any error, so we can only invoke page_endio() in + * the successful completion case. Otherwise, we'll see crashes + * caused by double completion. + */ + if (rc == 0) + page_endio(page, op_is_write(op), 0); + + return blk_status_to_errno(rc); +} + +/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ +__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) +{ + resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; + sector_t sector = PFN_PHYS(pgoff) >> SECTOR_SHIFT; + unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT; + struct badblocks *bb = &pmem->bb; + sector_t first_bad; + int num_bad; + + if (kaddr) + *kaddr = pmem->virt_addr + offset; + if (pfn) + *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); + + if (bb->count && + badblocks_check(bb, sector, num, &first_bad, &num_bad)) { + long actual_nr; + + if (mode != DAX_RECOVERY_WRITE) + return -EIO; + + /* + * Set the recovery stride is set to kernel page size because + * the underlying driver and firmware clear poison functions + * don't appear to handle large chunk(such as 2MiB) reliably. + */ + actual_nr = PHYS_PFN( + PAGE_ALIGN((first_bad - sector) << SECTOR_SHIFT)); + dev_dbg(pmem->bb.dev, "start sector(%llu), nr_pages(%ld), first_bad(%llu), actual_nr(%ld)\n", + sector, nr_pages, first_bad, actual_nr); + if (actual_nr) + return actual_nr; + return 1; + } + + /* + * If badblocks are present but not in the range, limit known good range + * to the requested range. + */ + if (bb->count) + return nr_pages; + return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); +} + +static const struct block_device_operations pmem_fops = { + .owner = THIS_MODULE, + .submit_bio = pmem_submit_bio, + .rw_page = pmem_rw_page, +}; + +static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + size_t nr_pages) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + + return blk_status_to_errno(pmem_do_write(pmem, ZERO_PAGE(0), 0, + PFN_PHYS(pgoff) >> SECTOR_SHIFT, + PAGE_SIZE)); +} + +static long pmem_dax_direct_access(struct dax_device *dax_dev, + pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, + void **kaddr, pfn_t *pfn) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + + return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn); +} + +/* + * The recovery write thread started out as a normal pwrite thread and + * when the filesystem was told about potential media error in the + * range, filesystem turns the normal pwrite to a dax_recovery_write. + * + * The recovery write consists of clearing media poison, clearing page + * HWPoison bit, reenable page-wide read-write permission, flush the + * caches and finally write. 
A competing pread thread will be held + * off during the recovery process since data read back might not be + * valid, and this is achieved by clearing the badblock records after + * the recovery write is complete. Competing recovery write threads + * are already serialized by writer lock held by dax_iomap_rw(). + */ +static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + size_t olen, len, off; + phys_addr_t pmem_off; + struct device *dev = pmem->bb.dev; + long cleared; + + off = offset_in_page(addr); + len = PFN_PHYS(PFN_UP(off + bytes)); + if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len)) + return _copy_from_iter_flushcache(addr, bytes, i); + + /* + * Not page-aligned range cannot be recovered. This should not + * happen unless something else went wrong. + */ + if (off || !PAGE_ALIGNED(bytes)) { + dev_dbg(dev, "Found poison, but addr(%p) or bytes(%#zx) not page aligned\n", + addr, bytes); + return 0; + } + + pmem_off = PFN_PHYS(pgoff) + pmem->data_offset; + cleared = __pmem_clear_poison(pmem, pmem_off, len); + if (cleared > 0 && cleared < len) { + dev_dbg(dev, "poison cleared only %ld out of %zu bytes\n", + cleared, len); + return 0; + } + if (cleared < 0) { + dev_dbg(dev, "poison clear failed: %ld\n", cleared); + return 0; + } + + olen = _copy_from_iter_flushcache(addr, bytes, i); + pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT); + + return olen; +} + +static const struct dax_operations pmem_dax_ops = { + .direct_access = pmem_dax_direct_access, + .zero_page_range = pmem_dax_zero_page_range, + .recovery_write = pmem_recovery_write, +}; + +static ssize_t write_cache_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pmem_device *pmem = dev_to_disk(dev)->private_data; + + return sprintf(buf, "%d\n", !!dax_write_cache_enabled(pmem->dax_dev)); +} + +static ssize_t write_cache_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct pmem_device *pmem = dev_to_disk(dev)->private_data; + bool write_cache; + int rc; + + rc = strtobool(buf, &write_cache); + if (rc) + return rc; + dax_write_cache(pmem->dax_dev, write_cache); + return len; +} +static DEVICE_ATTR_RW(write_cache); + +static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n) +{ +#ifndef CONFIG_ARCH_HAS_PMEM_API + if (a == &dev_attr_write_cache.attr) + return 0; +#endif + return a->mode; +} + +static struct attribute *dax_attributes[] = { + &dev_attr_write_cache.attr, + NULL, +}; + +static const struct attribute_group dax_attribute_group = { + .name = "dax", + .attrs = dax_attributes, + .is_visible = dax_visible, +}; + +static const struct attribute_group *pmem_attribute_groups[] = { + &dax_attribute_group, + NULL, +}; + +static void pmem_release_disk(void *__pmem) +{ + struct pmem_device *pmem = __pmem; + + dax_remove_host(pmem->disk); + kill_dax(pmem->dax_dev); + put_dax(pmem->dax_dev); + del_gendisk(pmem->disk); + + put_disk(pmem->disk); +} + +static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap, + unsigned long pfn, unsigned long nr_pages, int mf_flags) +{ + struct pmem_device *pmem = + container_of(pgmap, struct pmem_device, pgmap); + u64 offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset; + u64 len = nr_pages << PAGE_SHIFT; + + return dax_holder_notify_failure(pmem->dax_dev, offset, len, mf_flags); +} + +static const struct dev_pagemap_ops 
fsdax_pagemap_ops = { + .memory_failure = pmem_pagemap_memory_failure, +}; + +static int pmem_attach_disk(struct device *dev, + struct nd_namespace_common *ndns) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + int nid = dev_to_node(dev), fua; + struct resource *res = &nsio->res; + struct range bb_range; + struct nd_pfn *nd_pfn = NULL; + struct dax_device *dax_dev; + struct nd_pfn_sb *pfn_sb; + struct pmem_device *pmem; + struct request_queue *q; + struct gendisk *disk; + void *addr; + int rc; + + pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); + if (!pmem) + return -ENOMEM; + + rc = devm_namespace_enable(dev, ndns, nd_info_block_reserve()); + if (rc) + return rc; + + /* while nsio_rw_bytes is active, parse a pfn info block if present */ + if (is_nd_pfn(dev)) { + nd_pfn = to_nd_pfn(dev); + rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap); + if (rc) + return rc; + } + + /* we're attaching a block device, disable raw namespace access */ + devm_namespace_disable(dev, ndns); + + dev_set_drvdata(dev, pmem); + pmem->phys_addr = res->start; + pmem->size = resource_size(res); + fua = nvdimm_has_flush(nd_region); + if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) || fua < 0) { + dev_warn(dev, "unable to guarantee persistence of writes\n"); + fua = 0; + } + + if (!devm_request_mem_region(dev, res->start, resource_size(res), + dev_name(&ndns->dev))) { + dev_warn(dev, "could not reserve region %pR\n", res); + return -EBUSY; + } + + disk = blk_alloc_disk(nid); + if (!disk) + return -ENOMEM; + q = disk->queue; + + pmem->disk = disk; + pmem->pgmap.owner = pmem; + pmem->pfn_flags = PFN_DEV; + if (is_nd_pfn(dev)) { + pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; + addr = devm_memremap_pages(dev, &pmem->pgmap); + pfn_sb = nd_pfn->pfn_sb; + pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); + pmem->pfn_pad = resource_size(res) - + range_len(&pmem->pgmap.range); + pmem->pfn_flags |= PFN_MAP; + bb_range = pmem->pgmap.range; + bb_range.start += pmem->data_offset; + } else if (pmem_should_map_pages(dev)) { + pmem->pgmap.range.start = res->start; + pmem->pgmap.range.end = res->end; + pmem->pgmap.nr_range = 1; + pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; + addr = devm_memremap_pages(dev, &pmem->pgmap); + pmem->pfn_flags |= PFN_MAP; + bb_range = pmem->pgmap.range; + } else { + addr = devm_memremap(dev, pmem->phys_addr, + pmem->size, ARCH_MEMREMAP_PMEM); + bb_range.start = res->start; + bb_range.end = res->end; + } + + if (IS_ERR(addr)) { + rc = PTR_ERR(addr); + goto out; + } + pmem->virt_addr = addr; + + blk_queue_write_cache(q, true, fua); + blk_queue_physical_block_size(q, PAGE_SIZE); + blk_queue_logical_block_size(q, pmem_sector_size(ndns)); + blk_queue_max_hw_sectors(q, UINT_MAX); + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + if (pmem->pfn_flags & PFN_MAP) + blk_queue_flag_set(QUEUE_FLAG_DAX, q); + + disk->fops = &pmem_fops; + disk->private_data = pmem; + nvdimm_namespace_disk_name(ndns, disk->disk_name); + set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) + / 512); + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range); + disk->bb = &pmem->bb; + + dax_dev = alloc_dax(pmem, &pmem_dax_ops); + if (IS_ERR(dax_dev)) { + rc = PTR_ERR(dax_dev); + goto out; + } + set_dax_nocache(dax_dev); + set_dax_nomc(dax_dev); + if (is_nvdimm_sync(nd_region)) + set_dax_synchronous(dax_dev); + rc 
= dax_add_host(dax_dev, disk); + if (rc) + goto out_cleanup_dax; + dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); + pmem->dax_dev = dax_dev; + + rc = device_add_disk(dev, disk, pmem_attribute_groups); + if (rc) + goto out_remove_host; + if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) + return -ENOMEM; + + nvdimm_check_and_set_ro(disk); + + pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd, + "badblocks"); + if (!pmem->bb_state) + dev_warn(dev, "'badblocks' notification disabled\n"); + return 0; + +out_remove_host: + dax_remove_host(pmem->disk); +out_cleanup_dax: + kill_dax(pmem->dax_dev); + put_dax(pmem->dax_dev); +out: + put_disk(pmem->disk); + return rc; +} + +static int nd_pmem_probe(struct device *dev) +{ + int ret; + struct nd_namespace_common *ndns; + + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); + + if (is_nd_btt(dev)) + return nvdimm_namespace_attach_btt(ndns); + + if (is_nd_pfn(dev)) + return pmem_attach_disk(dev, ndns); + + ret = devm_namespace_enable(dev, ndns, nd_info_block_reserve()); + if (ret) + return ret; + + ret = nd_btt_probe(dev, ndns); + if (ret == 0) + return -ENXIO; + + /* + * We have two failure conditions here, there is no + * info reserver block or we found a valid info reserve block + * but failed to initialize the pfn superblock. + * + * For the first case consider namespace as a raw pmem namespace + * and attach a disk. + * + * For the latter, consider this a success and advance the namespace + * seed. + */ + ret = nd_pfn_probe(dev, ndns); + if (ret == 0) + return -ENXIO; + else if (ret == -EOPNOTSUPP) + return ret; + + ret = nd_dax_probe(dev, ndns); + if (ret == 0) + return -ENXIO; + else if (ret == -EOPNOTSUPP) + return ret; + + /* probe complete, attach handles namespace enabling */ + devm_namespace_disable(dev, ndns); + + return pmem_attach_disk(dev, ndns); +} + +static void nd_pmem_remove(struct device *dev) +{ + struct pmem_device *pmem = dev_get_drvdata(dev); + + if (is_nd_btt(dev)) + nvdimm_namespace_detach_btt(to_nd_btt(dev)); + else { + /* + * Note, this assumes device_lock() context to not + * race nd_pmem_notify() + */ + sysfs_put(pmem->bb_state); + pmem->bb_state = NULL; + } + nvdimm_flush(to_nd_region(dev->parent), NULL); +} + +static void nd_pmem_shutdown(struct device *dev) +{ + nvdimm_flush(to_nd_region(dev->parent), NULL); +} + +static void pmem_revalidate_poison(struct device *dev) +{ + struct nd_region *nd_region; + resource_size_t offset = 0, end_trunc = 0; + struct nd_namespace_common *ndns; + struct nd_namespace_io *nsio; + struct badblocks *bb; + struct range range; + struct kernfs_node *bb_state; + + if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + ndns = nd_btt->ndns; + nd_region = to_nd_region(ndns->dev.parent); + nsio = to_nd_namespace_io(&ndns->dev); + bb = &nsio->bb; + bb_state = NULL; + } else { + struct pmem_device *pmem = dev_get_drvdata(dev); + + nd_region = to_region(pmem); + bb = &pmem->bb; + bb_state = pmem->bb_state; + + if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + + ndns = nd_pfn->ndns; + offset = pmem->data_offset + + __le32_to_cpu(pfn_sb->start_pad); + end_trunc = __le32_to_cpu(pfn_sb->end_trunc); + } else { + ndns = to_ndns(dev); + } + + nsio = to_nd_namespace_io(&ndns->dev); + } + + range.start = nsio->res.start + offset; + range.end = nsio->res.end - end_trunc; + nvdimm_badblocks_populate(nd_region, bb, &range); + if (bb_state) + sysfs_notify_dirent(bb_state); 
+} + +static void pmem_revalidate_region(struct device *dev) +{ + struct pmem_device *pmem; + + if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + struct btt *btt = nd_btt->btt; + + nvdimm_check_and_set_ro(btt->btt_disk); + return; + } + + pmem = dev_get_drvdata(dev); + nvdimm_check_and_set_ro(pmem->disk); +} + +static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) +{ + switch (event) { + case NVDIMM_REVALIDATE_POISON: + pmem_revalidate_poison(dev); + break; + case NVDIMM_REVALIDATE_REGION: + pmem_revalidate_region(dev); + break; + default: + dev_WARN_ONCE(dev, 1, "notify: unknown event: %d\n", event); + break; + } +} + +MODULE_ALIAS("pmem"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM); +static struct nd_device_driver nd_pmem_driver = { + .probe = nd_pmem_probe, + .remove = nd_pmem_remove, + .notify = nd_pmem_notify, + .shutdown = nd_pmem_shutdown, + .drv = { + .name = "nd_pmem", + }, + .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM, +}; + +module_nd_driver(nd_pmem_driver); + +MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h new file mode 100644 index 000000000..392b0b38a --- /dev/null +++ b/drivers/nvdimm/pmem.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NVDIMM_PMEM_H__ +#define __NVDIMM_PMEM_H__ +#include <linux/page-flags.h> +#include <linux/badblocks.h> +#include <linux/memremap.h> +#include <linux/types.h> +#include <linux/pfn_t.h> +#include <linux/fs.h> + +enum dax_access_mode; + +/* this definition is in it's own header for tools/testing/nvdimm to consume */ +struct pmem_device { + /* One contiguous memory region per device */ + phys_addr_t phys_addr; + /* when non-zero this device is hosting a 'pfn' instance */ + phys_addr_t data_offset; + u64 pfn_flags; + void *virt_addr; + /* immutable base size of the namespace */ + size_t size; + /* trim size when namespace capacity has been section aligned */ + u32 pfn_pad; + struct kernfs_node *bb_state; + struct badblocks bb; + struct dax_device *dax_dev; + struct gendisk *disk; + struct dev_pagemap pgmap; +}; + +long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn); + +#ifdef CONFIG_MEMORY_FAILURE +static inline bool test_and_clear_pmem_poison(struct page *page) +{ + return TestClearPageHWPoison(page); +} +#else +static inline bool test_and_clear_pmem_poison(struct page *page) +{ + return false; +} +#endif +#endif /* __NVDIMM_PMEM_H__ */ diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c new file mode 100644 index 000000000..390123d29 --- /dev/null +++ b/drivers/nvdimm/region.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
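+ *
+ * (Aside: nd_pmem_driver above and nd_region_driver below share the
+ * general shape every driver on the nd bus takes; a minimal sketch
+ * with hypothetical example_ names:
+ *
+ *	static struct nd_device_driver example_nd_driver = {
+ *		.probe = example_nd_probe,
+ *		.drv = { .name = "nd_example" },
+ *		.type = ND_DRIVER_NAMESPACE_IO,
+ *	};
+ *	module_nd_driver(example_nd_driver);
+ *
+ * where .type selects which nd device types the probe may claim.)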
+ */ +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" + +static int nd_region_probe(struct device *dev) +{ + int err, rc; + static unsigned long once; + struct nd_region_data *ndrd; + struct nd_region *nd_region = to_nd_region(dev); + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + nd_region->ndr_size - 1, + }; + + if (nd_region->num_lanes > num_online_cpus() + && nd_region->num_lanes < num_possible_cpus() + && !test_and_set_bit(0, &once)) { + dev_dbg(dev, "online cpus (%d) < concurrent i/o lanes (%d) < possible cpus (%d)\n", + num_online_cpus(), nd_region->num_lanes, + num_possible_cpus()); + dev_dbg(dev, "setting nr_cpus=%d may yield better libnvdimm device performance\n", + nd_region->num_lanes); + } + + rc = nd_region_activate(nd_region); + if (rc) + return rc; + + if (devm_init_badblocks(dev, &nd_region->bb)) + return -ENODEV; + nd_region->bb_state = + sysfs_get_dirent(nd_region->dev.kobj.sd, "badblocks"); + if (!nd_region->bb_state) + dev_warn(dev, "'badblocks' notification disabled\n"); + nvdimm_badblocks_populate(nd_region, &nd_region->bb, &range); + + rc = nd_region_register_namespaces(nd_region, &err); + if (rc < 0) + return rc; + + ndrd = dev_get_drvdata(dev); + ndrd->ns_active = rc; + ndrd->ns_count = rc + err; + + if (rc && err && rc == err) + return -ENODEV; + + nd_region->btt_seed = nd_btt_create(nd_region); + nd_region->pfn_seed = nd_pfn_create(nd_region); + nd_region->dax_seed = nd_dax_create(nd_region); + if (err == 0) + return 0; + + /* + * Given multiple namespaces per region, we do not want to + * disable all the successfully registered peer namespaces upon + * a single registration failure. If userspace is missing a + * namespace that it expects it can disable/re-enable the region + * to retry discovery after correcting the failure. + * <regionX>/namespaces returns the current + * "<async-registered>/<total>" namespace count. + */ + dev_err(dev, "failed to register %d namespace%s, continuing...\n", + err, err == 1 ? 
"" : "s"); + return 0; +} + +static int child_unregister(struct device *dev, void *data) +{ + nd_device_unregister(dev, ND_SYNC); + return 0; +} + +static void nd_region_remove(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + + device_for_each_child(dev, NULL, child_unregister); + + /* flush attribute readers and disable */ + nvdimm_bus_lock(dev); + nd_region->ns_seed = NULL; + nd_region->btt_seed = NULL; + nd_region->pfn_seed = NULL; + nd_region->dax_seed = NULL; + dev_set_drvdata(dev, NULL); + nvdimm_bus_unlock(dev); + + /* + * Note, this assumes device_lock() context to not race + * nd_region_notify() + */ + sysfs_put(nd_region->bb_state); + nd_region->bb_state = NULL; +} + +static int child_notify(struct device *dev, void *data) +{ + nd_device_notify(dev, *(enum nvdimm_event *) data); + return 0; +} + +static void nd_region_notify(struct device *dev, enum nvdimm_event event) +{ + if (event == NVDIMM_REVALIDATE_POISON) { + struct nd_region *nd_region = to_nd_region(dev); + + if (is_memory(&nd_region->dev)) { + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + + nd_region->ndr_size - 1, + }; + + nvdimm_badblocks_populate(nd_region, + &nd_region->bb, &range); + if (nd_region->bb_state) + sysfs_notify_dirent(nd_region->bb_state); + } + } + device_for_each_child(dev, &event, child_notify); +} + +static struct nd_device_driver nd_region_driver = { + .probe = nd_region_probe, + .remove = nd_region_remove, + .notify = nd_region_notify, + .drv = { + .name = "nd_region", + }, + .type = ND_DRIVER_REGION_BLK | ND_DRIVER_REGION_PMEM, +}; + +int __init nd_region_init(void) +{ + return nd_driver_register(&nd_region_driver); +} + +void nd_region_exit(void) +{ + driver_unregister(&nd_region_driver.drv); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c new file mode 100644 index 000000000..7995f93db --- /dev/null +++ b/drivers/nvdimm/region_devs.c @@ -0,0 +1,1224 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + */ +#include <linux/scatterlist.h> +#include <linux/memregion.h> +#include <linux/highmem.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/hash.h> +#include <linux/sort.h> +#include <linux/io.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" + +/* + * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is + * irrelevant. + */ +#include <linux/io-64-nonatomic-hi-lo.h> + +static DEFINE_PER_CPU(int, flush_idx); + +static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm, + struct nd_region_data *ndrd) +{ + int i, j; + + dev_dbg(dev, "%s: map %d flush address%s\n", nvdimm_name(nvdimm), + nvdimm->num_flush, nvdimm->num_flush == 1 ? 
"" : "es"); + for (i = 0; i < (1 << ndrd->hints_shift); i++) { + struct resource *res = &nvdimm->flush_wpq[i]; + unsigned long pfn = PHYS_PFN(res->start); + void __iomem *flush_page; + + /* check if flush hints share a page */ + for (j = 0; j < i; j++) { + struct resource *res_j = &nvdimm->flush_wpq[j]; + unsigned long pfn_j = PHYS_PFN(res_j->start); + + if (pfn == pfn_j) + break; + } + + if (j < i) + flush_page = (void __iomem *) ((unsigned long) + ndrd_get_flush_wpq(ndrd, dimm, j) + & PAGE_MASK); + else + flush_page = devm_nvdimm_ioremap(dev, + PFN_PHYS(pfn), PAGE_SIZE); + if (!flush_page) + return -ENXIO; + ndrd_set_flush_wpq(ndrd, dimm, i, flush_page + + (res->start & ~PAGE_MASK)); + } + + return 0; +} + +int nd_region_activate(struct nd_region *nd_region) +{ + int i, j, num_flush = 0; + struct nd_region_data *ndrd; + struct device *dev = &nd_region->dev; + size_t flush_data_size = sizeof(void *); + + nvdimm_bus_lock(&nd_region->dev); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { + nvdimm_bus_unlock(&nd_region->dev); + return -EBUSY; + } + + /* at least one null hint slot per-dimm for the "no-hint" case */ + flush_data_size += sizeof(void *); + num_flush = min_not_zero(num_flush, nvdimm->num_flush); + if (!nvdimm->num_flush) + continue; + flush_data_size += nvdimm->num_flush * sizeof(void *); + } + nvdimm_bus_unlock(&nd_region->dev); + + ndrd = devm_kzalloc(dev, sizeof(*ndrd) + flush_data_size, GFP_KERNEL); + if (!ndrd) + return -ENOMEM; + dev_set_drvdata(dev, ndrd); + + if (!num_flush) + return 0; + + ndrd->hints_shift = ilog2(num_flush); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + int rc = nvdimm_map_flush(&nd_region->dev, nvdimm, i, ndrd); + + if (rc) + return rc; + } + + /* + * Clear out entries that are duplicates. This should prevent the + * extra flushings. 
+ */ + for (i = 0; i < nd_region->ndr_mappings - 1; i++) { + /* ignore if NULL already */ + if (!ndrd_get_flush_wpq(ndrd, i, 0)) + continue; + + for (j = i + 1; j < nd_region->ndr_mappings; j++) + if (ndrd_get_flush_wpq(ndrd, i, 0) == + ndrd_get_flush_wpq(ndrd, j, 0)) + ndrd_set_flush_wpq(ndrd, j, 0, NULL); + } + + return 0; +} + +static void nd_region_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + u16 i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + put_device(&nvdimm->dev); + } + free_percpu(nd_region->lane); + if (!test_bit(ND_REGION_CXL, &nd_region->flags)) + memregion_free(nd_region->id); + kfree(nd_region); +} + +struct nd_region *to_nd_region(struct device *dev) +{ + struct nd_region *nd_region = container_of(dev, struct nd_region, dev); + + WARN_ON(dev->type->release != nd_region_release); + return nd_region; +} +EXPORT_SYMBOL_GPL(to_nd_region); + +struct device *nd_region_dev(struct nd_region *nd_region) +{ + if (!nd_region) + return NULL; + return &nd_region->dev; +} +EXPORT_SYMBOL_GPL(nd_region_dev); + +void *nd_region_provider_data(struct nd_region *nd_region) +{ + return nd_region->provider_data; +} +EXPORT_SYMBOL_GPL(nd_region_provider_data); + +/** + * nd_region_to_nstype() - region to an integer namespace type + * @nd_region: region-device to interrogate + * + * This is the 'nstype' attribute of a region as well, an input to the + * MODALIAS for namespace devices, and bit number for a nvdimm_bus to match + * namespace devices with namespace drivers. + */ +int nd_region_to_nstype(struct nd_region *nd_region) +{ + if (is_memory(&nd_region->dev)) { + u16 i, label; + + for (i = 0, label = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (test_bit(NDD_LABELING, &nvdimm->flags)) + label++; + } + if (label) + return ND_DEVICE_NAMESPACE_PMEM; + else + return ND_DEVICE_NAMESPACE_IO; + } + + return 0; +} +EXPORT_SYMBOL(nd_region_to_nstype); + +static unsigned long long region_size(struct nd_region *nd_region) +{ + if (is_memory(&nd_region->dev)) { + return nd_region->ndr_size; + } else if (nd_region->ndr_mappings == 1) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + + return nd_mapping->size; + } + + return 0; +} + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%llu\n", region_size(nd_region)); +} +static DEVICE_ATTR_RO(size); + +static ssize_t deep_flush_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + /* + * NOTE: in the nvdimm_has_flush() error case this attribute is + * not visible. 
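+	 * nvdimm_has_flush() returns 1 (flush hints or an explicit
+	 * flush callback present), 0 (rely on platform persistence,
+	 * e.g. ADR), or -ENXIO; region_visible() keys the attribute
+	 * mode off the same result: read-write, read-only, or hidden,
+	 * respectively.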
+ */
+	return sprintf(buf, "%d\n", nvdimm_has_flush(nd_region));
+}
+
+static ssize_t deep_flush_store(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t len)
+{
+	bool flush;
+	int rc = strtobool(buf, &flush);
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	if (rc)
+		return rc;
+	if (!flush)
+		return -EINVAL;
+	rc = nvdimm_flush(nd_region, NULL);
+	if (rc)
+		return rc;
+
+	return len;
+}
+static DEVICE_ATTR_RW(deep_flush);
+
+static ssize_t mappings_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	return sprintf(buf, "%d\n", nd_region->ndr_mappings);
+}
+static DEVICE_ATTR_RO(mappings);
+
+static ssize_t nstype_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region));
+}
+static DEVICE_ATTR_RO(nstype);
+
+static ssize_t set_cookie_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	struct nd_interleave_set *nd_set = nd_region->nd_set;
+	ssize_t rc = 0;
+
+	if (is_memory(dev) && nd_set)
+		/* pass, should be precluded by region_visible */;
+	else
+		return -ENXIO;
+
+	/*
+	 * The cookie to show depends on which specification of the
+	 * labels we are using. If there are no labels, then default to
+	 * the v1.1 namespace label cookie definition. To read all this
+	 * data we need to wait for probing to settle.
+	 */
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
+	wait_nvdimm_bus_probe_idle(dev);
+	if (nd_region->ndr_mappings) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+
+		if (ndd) {
+			struct nd_namespace_index *nsindex;
+
+			nsindex = to_namespace_index(ndd, ndd->ns_current);
+			rc = sprintf(buf, "%#llx\n",
+					nd_region_interleave_set_cookie(nd_region,
+						nsindex));
+		}
+	}
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
+
+	if (rc)
+		return rc;
+	return sprintf(buf, "%#llx\n", nd_set->cookie1);
+}
+static DEVICE_ATTR_RO(set_cookie);
+
+resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
+{
+	resource_size_t available;
+	int i;
+
+	WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+
+	available = 0;
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+
+		/* if a dimm is disabled the available capacity is zero */
+		if (!ndd)
+			return 0;
+
+		available += nd_pmem_available_dpa(nd_region, nd_mapping);
+	}
+
+	return available;
+}
+
+resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region)
+{
+	resource_size_t avail = 0;
+	int i;
+
+	WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+
+		avail = min_not_zero(avail, nd_pmem_max_contiguous_dpa(
+				nd_region, nd_mapping));
+	}
+	return avail * nd_region->ndr_mappings;
+}
+
+static ssize_t available_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	unsigned long long available = 0;
+
+	/*
+	 * Flush in-flight updates and grab a snapshot of the available
+	 * size. Of course, this value is potentially invalidated the
+	 * moment nvdimm_bus_lock() is dropped, but that's userspace's
+	 * problem to not race itself.
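+	 * For example, a racing namespace allocation can consume the
+	 * capacity reported here before the caller acts on it.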
+ */ + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + available = nd_region_available_dpa(nd_region); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return sprintf(buf, "%llu\n", available); +} +static DEVICE_ATTR_RO(available_size); + +static ssize_t max_available_extent_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long long available = 0; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + available = nd_region_allocatable_dpa(nd_region); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return sprintf(buf, "%llu\n", available); +} +static DEVICE_ATTR_RO(max_available_extent); + +static ssize_t init_namespaces_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region_data *ndrd = dev_get_drvdata(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (ndrd) + rc = sprintf(buf, "%d/%d\n", ndrd->ns_active, ndrd->ns_count); + else + rc = -ENXIO; + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(init_namespaces); + +static ssize_t namespace_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->ns_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->ns_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + return rc; +} +static DEVICE_ATTR_RO(namespace_seed); + +static ssize_t btt_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->btt_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->btt_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(btt_seed); + +static ssize_t pfn_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->pfn_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->pfn_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(pfn_seed); + +static ssize_t dax_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->dax_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->dax_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(dax_seed); + +static ssize_t read_only_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region->ro); +} + +static int revalidate_read_only(struct device *dev, void *data) +{ + nd_device_notify(dev, NVDIMM_REVALIDATE_REGION); + return 0; +} + +static ssize_t read_only_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool ro; + int rc = strtobool(buf, &ro); + struct nd_region *nd_region = to_nd_region(dev); + + if (rc) + return rc; + + nd_region->ro = ro; + device_for_each_child(dev, NULL, revalidate_read_only); + return len; +} +static DEVICE_ATTR_RW(read_only); + +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + 
return sprintf(buf, "%#lx\n", nd_region->align); +} + +static ssize_t align_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long val, dpa; + u32 mappings, remainder; + int rc; + + rc = kstrtoul(buf, 0, &val); + if (rc) + return rc; + + /* + * Ensure space-align is evenly divisible by the region + * interleave-width because the kernel typically has no facility + * to determine which DIMM(s), dimm-physical-addresses, would + * contribute to the tail capacity in system-physical-address + * space for the namespace. + */ + mappings = max_t(u32, 1, nd_region->ndr_mappings); + dpa = div_u64_rem(val, mappings, &remainder); + if (!is_power_of_2(dpa) || dpa < PAGE_SIZE + || val > region_size(nd_region) || remainder) + return -EINVAL; + + /* + * Given that space allocation consults this value multiple + * times ensure it does not change for the duration of the + * allocation. + */ + nvdimm_bus_lock(dev); + nd_region->align = val; + nvdimm_bus_unlock(dev); + + return len; +} +static DEVICE_ATTR_RW(align); + +static ssize_t region_badblocks_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + device_lock(dev); + if (dev->driver) + rc = badblocks_show(&nd_region->bb, buf, 0); + else + rc = -ENXIO; + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR(badblocks, 0444, region_badblocks_show, NULL); + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%#llx\n", nd_region->ndr_start); +} +static DEVICE_ATTR_ADMIN_RO(resource); + +static ssize_t persistence_domain_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + if (test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags)) + return sprintf(buf, "cpu_cache\n"); + else if (test_bit(ND_REGION_PERSIST_MEMCTRL, &nd_region->flags)) + return sprintf(buf, "memory_controller\n"); + else + return sprintf(buf, "\n"); +} +static DEVICE_ATTR_RO(persistence_domain); + +static struct attribute *nd_region_attributes[] = { + &dev_attr_size.attr, + &dev_attr_align.attr, + &dev_attr_nstype.attr, + &dev_attr_mappings.attr, + &dev_attr_btt_seed.attr, + &dev_attr_pfn_seed.attr, + &dev_attr_dax_seed.attr, + &dev_attr_deep_flush.attr, + &dev_attr_read_only.attr, + &dev_attr_set_cookie.attr, + &dev_attr_available_size.attr, + &dev_attr_max_available_extent.attr, + &dev_attr_namespace_seed.attr, + &dev_attr_init_namespaces.attr, + &dev_attr_badblocks.attr, + &dev_attr_resource.attr, + &dev_attr_persistence_domain.attr, + NULL, +}; + +static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nd_region *nd_region = to_nd_region(dev); + struct nd_interleave_set *nd_set = nd_region->nd_set; + int type = nd_region_to_nstype(nd_region); + + if (!is_memory(dev) && a == &dev_attr_pfn_seed.attr) + return 0; + + if (!is_memory(dev) && a == &dev_attr_dax_seed.attr) + return 0; + + if (!is_memory(dev) && a == &dev_attr_badblocks.attr) + return 0; + + if (a == &dev_attr_resource.attr && !is_memory(dev)) + return 0; + + if (a == &dev_attr_deep_flush.attr) { + int has_flush = nvdimm_has_flush(nd_region); + + if (has_flush == 1) + return a->mode; + else if (has_flush == 0) + return 0444; + else + return 0; + } + + if 
(a == &dev_attr_persistence_domain.attr) { + if ((nd_region->flags & (BIT(ND_REGION_PERSIST_CACHE) + | BIT(ND_REGION_PERSIST_MEMCTRL))) == 0) + return 0; + return a->mode; + } + + if (a == &dev_attr_align.attr) + return a->mode; + + if (a != &dev_attr_set_cookie.attr + && a != &dev_attr_available_size.attr) + return a->mode; + + if (type == ND_DEVICE_NAMESPACE_PMEM && + a == &dev_attr_available_size.attr) + return a->mode; + else if (is_memory(dev) && nd_set) + return a->mode; + + return 0; +} + +static ssize_t mappingN(struct device *dev, char *buf, int n) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nd_mapping *nd_mapping; + struct nvdimm *nvdimm; + + if (n >= nd_region->ndr_mappings) + return -ENXIO; + nd_mapping = &nd_region->mapping[n]; + nvdimm = nd_mapping->nvdimm; + + return sprintf(buf, "%s,%llu,%llu,%d\n", dev_name(&nvdimm->dev), + nd_mapping->start, nd_mapping->size, + nd_mapping->position); +} + +#define REGION_MAPPING(idx) \ +static ssize_t mapping##idx##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + return mappingN(dev, buf, idx); \ +} \ +static DEVICE_ATTR_RO(mapping##idx) + +/* + * 32 should be enough for a while, even in the presence of socket + * interleave a 32-way interleave set is a degenerate case. + */ +REGION_MAPPING(0); +REGION_MAPPING(1); +REGION_MAPPING(2); +REGION_MAPPING(3); +REGION_MAPPING(4); +REGION_MAPPING(5); +REGION_MAPPING(6); +REGION_MAPPING(7); +REGION_MAPPING(8); +REGION_MAPPING(9); +REGION_MAPPING(10); +REGION_MAPPING(11); +REGION_MAPPING(12); +REGION_MAPPING(13); +REGION_MAPPING(14); +REGION_MAPPING(15); +REGION_MAPPING(16); +REGION_MAPPING(17); +REGION_MAPPING(18); +REGION_MAPPING(19); +REGION_MAPPING(20); +REGION_MAPPING(21); +REGION_MAPPING(22); +REGION_MAPPING(23); +REGION_MAPPING(24); +REGION_MAPPING(25); +REGION_MAPPING(26); +REGION_MAPPING(27); +REGION_MAPPING(28); +REGION_MAPPING(29); +REGION_MAPPING(30); +REGION_MAPPING(31); + +static umode_t mapping_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nd_region *nd_region = to_nd_region(dev); + + if (n < nd_region->ndr_mappings) + return a->mode; + return 0; +} + +static struct attribute *mapping_attributes[] = { + &dev_attr_mapping0.attr, + &dev_attr_mapping1.attr, + &dev_attr_mapping2.attr, + &dev_attr_mapping3.attr, + &dev_attr_mapping4.attr, + &dev_attr_mapping5.attr, + &dev_attr_mapping6.attr, + &dev_attr_mapping7.attr, + &dev_attr_mapping8.attr, + &dev_attr_mapping9.attr, + &dev_attr_mapping10.attr, + &dev_attr_mapping11.attr, + &dev_attr_mapping12.attr, + &dev_attr_mapping13.attr, + &dev_attr_mapping14.attr, + &dev_attr_mapping15.attr, + &dev_attr_mapping16.attr, + &dev_attr_mapping17.attr, + &dev_attr_mapping18.attr, + &dev_attr_mapping19.attr, + &dev_attr_mapping20.attr, + &dev_attr_mapping21.attr, + &dev_attr_mapping22.attr, + &dev_attr_mapping23.attr, + &dev_attr_mapping24.attr, + &dev_attr_mapping25.attr, + &dev_attr_mapping26.attr, + &dev_attr_mapping27.attr, + &dev_attr_mapping28.attr, + &dev_attr_mapping29.attr, + &dev_attr_mapping30.attr, + &dev_attr_mapping31.attr, + NULL, +}; + +static const struct attribute_group nd_mapping_attribute_group = { + .is_visible = mapping_visible, + .attrs = mapping_attributes, +}; + +static const struct attribute_group nd_region_attribute_group = { + .attrs = nd_region_attributes, + .is_visible = region_visible, +}; + +static const struct attribute_group *nd_region_attribute_groups[] = { + 
&nd_device_attribute_group, + &nd_region_attribute_group, + &nd_numa_attribute_group, + &nd_mapping_attribute_group, + NULL, +}; + +static const struct device_type nd_pmem_device_type = { + .name = "nd_pmem", + .release = nd_region_release, + .groups = nd_region_attribute_groups, +}; + +static const struct device_type nd_volatile_device_type = { + .name = "nd_volatile", + .release = nd_region_release, + .groups = nd_region_attribute_groups, +}; + +bool is_nd_pmem(struct device *dev) +{ + return dev ? dev->type == &nd_pmem_device_type : false; +} + +bool is_nd_volatile(struct device *dev) +{ + return dev ? dev->type == &nd_volatile_device_type : false; +} + +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region, + struct nd_namespace_index *nsindex) +{ + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (!nd_set) + return 0; + + if (nsindex && __le16_to_cpu(nsindex->major) == 1 + && __le16_to_cpu(nsindex->minor) == 1) + return nd_set->cookie1; + return nd_set->cookie2; +} + +u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region) +{ + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (nd_set) + return nd_set->altcookie; + return 0; +} + +void nd_mapping_free_labels(struct nd_mapping *nd_mapping) +{ + struct nd_label_ent *label_ent, *e; + + lockdep_assert_held(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + list_del(&label_ent->list); + kfree(label_ent); + } +} + +/* + * When a namespace is activated create new seeds for the next + * namespace, or namespace-personality to be configured. + */ +void nd_region_advance_seeds(struct nd_region *nd_region, struct device *dev) +{ + nvdimm_bus_lock(dev); + if (nd_region->ns_seed == dev) { + nd_region_create_ns_seed(nd_region); + } else if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + if (nd_region->btt_seed == dev) + nd_region_create_btt_seed(nd_region); + if (nd_region->ns_seed == &nd_btt->ndns->dev) + nd_region_create_ns_seed(nd_region); + } else if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + if (nd_region->pfn_seed == dev) + nd_region_create_pfn_seed(nd_region); + if (nd_region->ns_seed == &nd_pfn->ndns->dev) + nd_region_create_ns_seed(nd_region); + } else if (is_nd_dax(dev)) { + struct nd_dax *nd_dax = to_nd_dax(dev); + + if (nd_region->dax_seed == dev) + nd_region_create_dax_seed(nd_region); + if (nd_region->ns_seed == &nd_dax->nd_pfn.ndns->dev) + nd_region_create_ns_seed(nd_region); + } + nvdimm_bus_unlock(dev); +} + +/** + * nd_region_acquire_lane - allocate and lock a lane + * @nd_region: region id and number of lanes possible + * + * A lane correlates to a BLK-data-window and/or a log slot in the BTT. + * We optimize for the common case where there are 256 lanes, one + * per-cpu. For larger systems we need to lock to share lanes. For now + * this implementation assumes the cost of maintaining an allocator for + * free lanes is on the order of the lock hold time, so it implements a + * static lane = cpu % num_lanes mapping. + * + * In the case of a BTT instance on top of a BLK namespace a lane may be + * acquired recursively. We lock on the first instance. + * + * In the case of a BTT instance on top of PMEM, we only acquire a lane + * for the BTT metadata updates. 
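+ *
+ * Illustrative call pattern (the lane is held across the I/O):
+ *
+ *	lane = nd_region_acquire_lane(nd_region);
+ *	...submit I/O through the per-lane resources...
+ *	nd_region_release_lane(nd_region, lane);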
+ */ +unsigned int nd_region_acquire_lane(struct nd_region *nd_region) +{ + unsigned int cpu, lane; + + migrate_disable(); + cpu = smp_processor_id(); + if (nd_region->num_lanes < nr_cpu_ids) { + struct nd_percpu_lane *ndl_lock, *ndl_count; + + lane = cpu % nd_region->num_lanes; + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (ndl_count->count++ == 0) + spin_lock(&ndl_lock->lock); + } else + lane = cpu; + + return lane; +} +EXPORT_SYMBOL(nd_region_acquire_lane); + +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane) +{ + if (nd_region->num_lanes < nr_cpu_ids) { + unsigned int cpu = smp_processor_id(); + struct nd_percpu_lane *ndl_lock, *ndl_count; + + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (--ndl_count->count == 0) + spin_unlock(&ndl_lock->lock); + } + migrate_enable(); +} +EXPORT_SYMBOL(nd_region_release_lane); + +/* + * PowerPC requires this alignment for memremap_pages(). All other archs + * should be ok with SUBSECTION_SIZE (see memremap_compat_align()). + */ +#define MEMREMAP_COMPAT_ALIGN_MAX SZ_16M + +static unsigned long default_align(struct nd_region *nd_region) +{ + unsigned long align; + u32 remainder; + int mappings; + + align = MEMREMAP_COMPAT_ALIGN_MAX; + if (nd_region->ndr_size < MEMREMAP_COMPAT_ALIGN_MAX) + align = PAGE_SIZE; + + mappings = max_t(u16, 1, nd_region->ndr_mappings); + div_u64_rem(align, mappings, &remainder); + if (remainder) + align *= mappings; + + return align; +} + +static struct lock_class_key nvdimm_region_key; + +static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc, + const struct device_type *dev_type, const char *caller) +{ + struct nd_region *nd_region; + struct device *dev; + unsigned int i; + int ro = 0; + + for (i = 0; i < ndr_desc->num_mappings; i++) { + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; + struct nvdimm *nvdimm = mapping->nvdimm; + + if ((mapping->start | mapping->size) % PAGE_SIZE) { + dev_err(&nvdimm_bus->dev, + "%s: %s mapping%d is not %ld aligned\n", + caller, dev_name(&nvdimm->dev), i, PAGE_SIZE); + return NULL; + } + + if (test_bit(NDD_UNARMED, &nvdimm->flags)) + ro = 1; + + } + + nd_region = + kzalloc(struct_size(nd_region, mapping, ndr_desc->num_mappings), + GFP_KERNEL); + + if (!nd_region) + return NULL; + /* CXL pre-assigns memregion ids before creating nvdimm regions */ + if (test_bit(ND_REGION_CXL, &ndr_desc->flags)) { + nd_region->id = ndr_desc->memregion; + } else { + nd_region->id = memregion_alloc(GFP_KERNEL); + if (nd_region->id < 0) + goto err_id; + } + + nd_region->lane = alloc_percpu(struct nd_percpu_lane); + if (!nd_region->lane) + goto err_percpu; + + for (i = 0; i < nr_cpu_ids; i++) { + struct nd_percpu_lane *ndl; + + ndl = per_cpu_ptr(nd_region->lane, i); + spin_lock_init(&ndl->lock); + ndl->count = 0; + } + + for (i = 0; i < ndr_desc->num_mappings; i++) { + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; + struct nvdimm *nvdimm = mapping->nvdimm; + + nd_region->mapping[i].nvdimm = nvdimm; + nd_region->mapping[i].start = mapping->start; + nd_region->mapping[i].size = mapping->size; + nd_region->mapping[i].position = mapping->position; + INIT_LIST_HEAD(&nd_region->mapping[i].labels); + mutex_init(&nd_region->mapping[i].lock); + + get_device(&nvdimm->dev); + } + nd_region->ndr_mappings = ndr_desc->num_mappings; + nd_region->provider_data = ndr_desc->provider_data; + nd_region->nd_set = ndr_desc->nd_set; 
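+	/*
+	 * Copy the remaining descriptor parameters and initialize the
+	 * ida allocators that name this region's child namespace, btt,
+	 * pfn, and dax devices.
+	 */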
+ nd_region->num_lanes = ndr_desc->num_lanes; + nd_region->flags = ndr_desc->flags; + nd_region->ro = ro; + nd_region->numa_node = ndr_desc->numa_node; + nd_region->target_node = ndr_desc->target_node; + ida_init(&nd_region->ns_ida); + ida_init(&nd_region->btt_ida); + ida_init(&nd_region->pfn_ida); + ida_init(&nd_region->dax_ida); + dev = &nd_region->dev; + dev_set_name(dev, "region%d", nd_region->id); + dev->parent = &nvdimm_bus->dev; + dev->type = dev_type; + dev->groups = ndr_desc->attr_groups; + dev->of_node = ndr_desc->of_node; + nd_region->ndr_size = resource_size(ndr_desc->res); + nd_region->ndr_start = ndr_desc->res->start; + nd_region->align = default_align(nd_region); + if (ndr_desc->flush) + nd_region->flush = ndr_desc->flush; + else + nd_region->flush = NULL; + + device_initialize(dev); + lockdep_set_class(&dev->mutex, &nvdimm_region_key); + nd_device_register(dev); + + return nd_region; + +err_percpu: + if (!test_bit(ND_REGION_CXL, &ndr_desc->flags)) + memregion_free(nd_region->id); +err_id: + kfree(nd_region); + return NULL; +} + +struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + ndr_desc->num_lanes = ND_MAX_LANES; + return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create); + +struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + ndr_desc->num_lanes = ND_MAX_LANES; + return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create); + +void nvdimm_region_delete(struct nd_region *nd_region) +{ + if (nd_region) + nd_device_unregister(&nd_region->dev, ND_SYNC); +} +EXPORT_SYMBOL_GPL(nvdimm_region_delete); + +int nvdimm_flush(struct nd_region *nd_region, struct bio *bio) +{ + int rc = 0; + + if (!nd_region->flush) + rc = generic_nvdimm_flush(nd_region); + else { + if (nd_region->flush(nd_region, bio)) + rc = -EIO; + } + + return rc; +} +/** + * generic_nvdimm_flush() - flush any posted write queues between the cpu and pmem media + * @nd_region: interleaved pmem region + */ +int generic_nvdimm_flush(struct nd_region *nd_region) +{ + struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev); + int i, idx; + + /* + * Try to encourage some diversity in flush hint addresses + * across cpus assuming a limited number of flush hints. + */ + idx = this_cpu_read(flush_idx); + idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8)); + + /* + * The pmem_wmb() is needed to 'sfence' all + * previous writes such that they are architecturally visible for + * the platform buffer flush. Note that we've already arranged for pmem + * writes to avoid the cache via memcpy_flushcache(). The final + * wmb() ensures ordering for the NVDIMM flush write. 
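+	 *
+	 * The value written to a flush hint address is irrelevant; any
+	 * write to the hint page triggers the DIMM's
+	 * write-pending-queue flush, hence the arbitrary writeq(1, ...)
+	 * below.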
+ */
+	pmem_wmb();
+	for (i = 0; i < nd_region->ndr_mappings; i++)
+		if (ndrd_get_flush_wpq(ndrd, i, 0))
+			writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
+	wmb();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvdimm_flush);
+
+/**
+ * nvdimm_has_flush - determine write flushing requirements
+ * @nd_region: interleaved pmem region
+ *
+ * Returns 1 if writes require flushing
+ * Returns 0 if writes do not require flushing
+ * Returns -ENXIO if flushing capability cannot be determined
+ */
+int nvdimm_has_flush(struct nd_region *nd_region)
+{
+	int i;
+
+	/* no nvdimm or pmem api == flushing capability unknown */
+	if (nd_region->ndr_mappings == 0
+			|| !IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API))
+		return -ENXIO;
+
+	/* Test if an explicit flush function is defined */
+	if (test_bit(ND_REGION_ASYNC, &nd_region->flags) && nd_region->flush)
+		return 1;
+
+	/* Test if any flush hints for the region are available */
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+		/* flush hints present / available */
+		if (nvdimm->num_flush)
+			return 1;
+	}
+
+	/*
+	 * The platform defines dimm devices without flush hints or an
+	 * explicit flush callback; assume a platform persistence
+	 * mechanism like ADR.
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvdimm_has_flush);
+
+int nvdimm_has_cache(struct nd_region *nd_region)
+{
+	return is_nd_pmem(&nd_region->dev) &&
+		!test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags);
+}
+EXPORT_SYMBOL_GPL(nvdimm_has_cache);
+
+bool is_nvdimm_sync(struct nd_region *nd_region)
+{
+	if (is_nd_volatile(&nd_region->dev))
+		return true;
+
+	return is_nd_pmem(&nd_region->dev) &&
+		!test_bit(ND_REGION_ASYNC, &nd_region->flags);
+}
+EXPORT_SYMBOL_GPL(is_nvdimm_sync);
+
+struct conflict_context {
+	struct nd_region *nd_region;
+	resource_size_t start, size;
+};
+
+static int region_conflict(struct device *dev, void *data)
+{
+	struct nd_region *nd_region;
+	struct conflict_context *ctx = data;
+	resource_size_t res_end, region_end, region_start;
+
+	if (!is_memory(dev))
+		return 0;
+
+	nd_region = to_nd_region(dev);
+	if (nd_region == ctx->nd_region)
+		return 0;
+
+	res_end = ctx->start + ctx->size;
+	region_start = nd_region->ndr_start;
+	region_end = region_start + nd_region->ndr_size;
+	if (ctx->start >= region_start && ctx->start < region_end)
+		return -EBUSY;
+	if (res_end > region_start && res_end <= region_end)
+		return -EBUSY;
+	return 0;
+}
+
+int nd_region_conflict(struct nd_region *nd_region, resource_size_t start,
+		resource_size_t size)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+	struct conflict_context ctx = {
+		.nd_region = nd_region,
+		.start = start,
+		.size = size,
+	};
+
+	return device_for_each_child(&nvdimm_bus->dev, &ctx, region_conflict);
+}
diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c
new file mode 100644
index 000000000..8aefb60c4
--- /dev/null
+++ b/drivers/nvdimm/security.c
@@ -0,0 +1,553 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2018 Intel Corporation. All rights reserved.
*/ + +#include <linux/module.h> +#include <linux/device.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/cred.h> +#include <linux/key.h> +#include <linux/key-type.h> +#include <keys/user-type.h> +#include <keys/encrypted-type.h> +#include "nd-core.h" +#include "nd.h" + +#define NVDIMM_BASE_KEY 0 +#define NVDIMM_NEW_KEY 1 + +static bool key_revalidate = true; +module_param(key_revalidate, bool, 0444); +MODULE_PARM_DESC(key_revalidate, "Require key validation at init."); + +static const char zero_key[NVDIMM_PASSPHRASE_LEN]; + +static void *key_data(struct key *key) +{ + struct encrypted_key_payload *epayload = dereference_key_locked(key); + + lockdep_assert_held_read(&key->sem); + + return epayload->decrypted_data; +} + +static void nvdimm_put_key(struct key *key) +{ + if (!key) + return; + + up_read(&key->sem); + key_put(key); +} + +/* + * Retrieve kernel key for DIMM and request from user space if + * necessary. Returns a key held for read and must be put by + * nvdimm_put_key() before the usage goes out of scope. + */ +static struct key *nvdimm_request_key(struct nvdimm *nvdimm) +{ + struct key *key = NULL; + static const char NVDIMM_PREFIX[] = "nvdimm:"; + char desc[NVDIMM_KEY_DESC_LEN + sizeof(NVDIMM_PREFIX)]; + struct device *dev = &nvdimm->dev; + + sprintf(desc, "%s%s", NVDIMM_PREFIX, nvdimm->dimm_id); + key = request_key(&key_type_encrypted, desc, ""); + if (IS_ERR(key)) { + if (PTR_ERR(key) == -ENOKEY) + dev_dbg(dev, "request_key() found no key\n"); + else + dev_dbg(dev, "request_key() upcall failed\n"); + key = NULL; + } else { + struct encrypted_key_payload *epayload; + + down_read(&key->sem); + epayload = dereference_key_locked(key); + if (epayload->decrypted_datalen != NVDIMM_PASSPHRASE_LEN) { + up_read(&key->sem); + key_put(key); + key = NULL; + } + } + + return key; +} + +static const void *nvdimm_get_key_payload(struct nvdimm *nvdimm, + struct key **key) +{ + *key = nvdimm_request_key(nvdimm); + if (!*key) + return zero_key; + + return key_data(*key); +} + +static struct key *nvdimm_lookup_user_key(struct nvdimm *nvdimm, + key_serial_t id, int subclass) +{ + key_ref_t keyref; + struct key *key; + struct encrypted_key_payload *epayload; + struct device *dev = &nvdimm->dev; + + keyref = lookup_user_key(id, 0, KEY_NEED_SEARCH); + if (IS_ERR(keyref)) + return NULL; + + key = key_ref_to_ptr(keyref); + if (key->type != &key_type_encrypted) { + key_put(key); + return NULL; + } + + dev_dbg(dev, "%s: key found: %#x\n", __func__, key_serial(key)); + + down_read_nested(&key->sem, subclass); + epayload = dereference_key_locked(key); + if (epayload->decrypted_datalen != NVDIMM_PASSPHRASE_LEN) { + up_read(&key->sem); + key_put(key); + key = NULL; + } + return key; +} + +static const void *nvdimm_get_user_key_payload(struct nvdimm *nvdimm, + key_serial_t id, int subclass, struct key **key) +{ + *key = NULL; + if (id == 0) { + if (subclass == NVDIMM_BASE_KEY) + return zero_key; + else + return NULL; + } + + *key = nvdimm_lookup_user_key(nvdimm, id, subclass); + if (!*key) + return NULL; + + return key_data(*key); +} + + +static int nvdimm_key_revalidate(struct nvdimm *nvdimm) +{ + struct key *key; + int rc; + const void *data; + + if (!nvdimm->sec.ops->change_key) + return -EOPNOTSUPP; + + data = nvdimm_get_key_payload(nvdimm, &key); + + /* + * Send the same key to the hardware as new and old key to + * verify that the key is good. 
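+	 * A successful "change" of the key to itself is a no-op on the
+	 * DIMM, but proves that the kernel's copy of the passphrase
+	 * matches the hardware's.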
+ */ + rc = nvdimm->sec.ops->change_key(nvdimm, data, data, NVDIMM_USER); + if (rc < 0) { + nvdimm_put_key(key); + return rc; + } + + nvdimm_put_key(key); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + return 0; +} + +static int __nvdimm_security_unlock(struct nvdimm *nvdimm) +{ + struct device *dev = &nvdimm->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct key *key; + const void *data; + int rc; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->unlock + || !nvdimm->sec.flags) + return -EIO; + + /* No need to go further if security is disabled */ + if (test_bit(NVDIMM_SECURITY_DISABLED, &nvdimm->sec.flags)) + return 0; + + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { + dev_dbg(dev, "Security operation in progress.\n"); + return -EBUSY; + } + + /* + * If the pre-OS has unlocked the DIMM, attempt to send the key + * from request_key() to the hardware for verification. Failure + * to revalidate the key against the hardware results in a + * freeze of the security configuration. I.e. if the OS does not + * have the key, security is being managed pre-OS. + */ + if (test_bit(NVDIMM_SECURITY_UNLOCKED, &nvdimm->sec.flags)) { + if (!key_revalidate) + return 0; + + return nvdimm_key_revalidate(nvdimm); + } else + data = nvdimm_get_key_payload(nvdimm, &key); + + rc = nvdimm->sec.ops->unlock(nvdimm, data); + dev_dbg(dev, "key: %d unlock: %s\n", key_serial(key), + rc == 0 ? "success" : "fail"); + + nvdimm_put_key(key); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + return rc; +} + +int nvdimm_security_unlock(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + int rc; + + nvdimm_bus_lock(dev); + rc = __nvdimm_security_unlock(nvdimm); + nvdimm_bus_unlock(dev); + return rc; +} + +static int check_security_state(struct nvdimm *nvdimm) +{ + struct device *dev = &nvdimm->dev; + + if (test_bit(NVDIMM_SECURITY_FROZEN, &nvdimm->sec.flags)) { + dev_dbg(dev, "Incorrect security state: %#lx\n", + nvdimm->sec.flags); + return -EIO; + } + + if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) { + dev_dbg(dev, "Security operation in progress.\n"); + return -EBUSY; + } + + return 0; +} + +static int security_disable(struct nvdimm *nvdimm, unsigned int keyid) +{ + struct device *dev = &nvdimm->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct key *key; + int rc; + const void *data; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->disable + || !nvdimm->sec.flags) + return -EOPNOTSUPP; + + rc = check_security_state(nvdimm); + if (rc) + return rc; + + data = nvdimm_get_user_key_payload(nvdimm, keyid, + NVDIMM_BASE_KEY, &key); + if (!data) + return -ENOKEY; + + rc = nvdimm->sec.ops->disable(nvdimm, data); + dev_dbg(dev, "key: %d disable: %s\n", key_serial(key), + rc == 0 ? 
"success" : "fail"); + + nvdimm_put_key(key); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + return rc; +} + +static int security_update(struct nvdimm *nvdimm, unsigned int keyid, + unsigned int new_keyid, + enum nvdimm_passphrase_type pass_type) +{ + struct device *dev = &nvdimm->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct key *key, *newkey; + int rc; + const void *data, *newdata; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->change_key + || !nvdimm->sec.flags) + return -EOPNOTSUPP; + + rc = check_security_state(nvdimm); + if (rc) + return rc; + + data = nvdimm_get_user_key_payload(nvdimm, keyid, + NVDIMM_BASE_KEY, &key); + if (!data) + return -ENOKEY; + + newdata = nvdimm_get_user_key_payload(nvdimm, new_keyid, + NVDIMM_NEW_KEY, &newkey); + if (!newdata) { + nvdimm_put_key(key); + return -ENOKEY; + } + + rc = nvdimm->sec.ops->change_key(nvdimm, data, newdata, pass_type); + dev_dbg(dev, "key: %d %d update%s: %s\n", + key_serial(key), key_serial(newkey), + pass_type == NVDIMM_MASTER ? "(master)" : "(user)", + rc == 0 ? "success" : "fail"); + + nvdimm_put_key(newkey); + nvdimm_put_key(key); + if (pass_type == NVDIMM_MASTER) + nvdimm->sec.ext_flags = nvdimm_security_flags(nvdimm, + NVDIMM_MASTER); + else + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, + NVDIMM_USER); + return rc; +} + +static int security_erase(struct nvdimm *nvdimm, unsigned int keyid, + enum nvdimm_passphrase_type pass_type) +{ + struct device *dev = &nvdimm->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct key *key = NULL; + int rc; + const void *data; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->erase + || !nvdimm->sec.flags) + return -EOPNOTSUPP; + + rc = check_security_state(nvdimm); + if (rc) + return rc; + + if (!test_bit(NVDIMM_SECURITY_UNLOCKED, &nvdimm->sec.ext_flags) + && pass_type == NVDIMM_MASTER) { + dev_dbg(dev, + "Attempt to secure erase in wrong master state.\n"); + return -EOPNOTSUPP; + } + + data = nvdimm_get_user_key_payload(nvdimm, keyid, + NVDIMM_BASE_KEY, &key); + if (!data) + return -ENOKEY; + + rc = nvdimm->sec.ops->erase(nvdimm, data, pass_type); + dev_dbg(dev, "key: %d erase%s: %s\n", key_serial(key), + pass_type == NVDIMM_MASTER ? "(master)" : "(user)", + rc == 0 ? "success" : "fail"); + + nvdimm_put_key(key); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + return rc; +} + +static int security_overwrite(struct nvdimm *nvdimm, unsigned int keyid) +{ + struct device *dev = &nvdimm->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + struct key *key = NULL; + int rc; + const void *data; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->overwrite + || !nvdimm->sec.flags) + return -EOPNOTSUPP; + + rc = check_security_state(nvdimm); + if (rc) + return rc; + + data = nvdimm_get_user_key_payload(nvdimm, keyid, + NVDIMM_BASE_KEY, &key); + if (!data) + return -ENOKEY; + + rc = nvdimm->sec.ops->overwrite(nvdimm, data); + dev_dbg(dev, "key: %d overwrite submission: %s\n", key_serial(key), + rc == 0 ? 
"success" : "fail"); + + nvdimm_put_key(key); + if (rc == 0) { + set_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags); + set_bit(NDD_WORK_PENDING, &nvdimm->flags); + set_bit(NVDIMM_SECURITY_OVERWRITE, &nvdimm->sec.flags); + /* + * Make sure we don't lose device while doing overwrite + * query. + */ + get_device(dev); + queue_delayed_work(system_wq, &nvdimm->dwork, 0); + } + + return rc; +} + +static void __nvdimm_security_overwrite_query(struct nvdimm *nvdimm) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev); + int rc; + unsigned int tmo; + + /* The bus lock should be held at the top level of the call stack */ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + + /* + * Abort and release device if we no longer have the overwrite + * flag set. It means the work has been canceled. + */ + if (!test_bit(NDD_WORK_PENDING, &nvdimm->flags)) + return; + + tmo = nvdimm->sec.overwrite_tmo; + + if (!nvdimm->sec.ops || !nvdimm->sec.ops->query_overwrite + || !nvdimm->sec.flags) + return; + + rc = nvdimm->sec.ops->query_overwrite(nvdimm); + if (rc == -EBUSY) { + + /* setup delayed work again */ + tmo += 10; + queue_delayed_work(system_wq, &nvdimm->dwork, tmo * HZ); + nvdimm->sec.overwrite_tmo = min(15U * 60U, tmo); + return; + } + + if (rc < 0) + dev_dbg(&nvdimm->dev, "overwrite failed\n"); + else + dev_dbg(&nvdimm->dev, "overwrite completed\n"); + + /* + * Mark the overwrite work done and update dimm security flags, + * then send a sysfs event notification to wake up userspace + * poll threads to picked up the changed state. + */ + nvdimm->sec.overwrite_tmo = 0; + clear_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags); + clear_bit(NDD_WORK_PENDING, &nvdimm->flags); + nvdimm->sec.flags = nvdimm_security_flags(nvdimm, NVDIMM_USER); + nvdimm->sec.ext_flags = nvdimm_security_flags(nvdimm, NVDIMM_MASTER); + if (nvdimm->sec.overwrite_state) + sysfs_notify_dirent(nvdimm->sec.overwrite_state); + put_device(&nvdimm->dev); +} + +void nvdimm_security_overwrite_query(struct work_struct *work) +{ + struct nvdimm *nvdimm = + container_of(work, typeof(*nvdimm), dwork.work); + + nvdimm_bus_lock(&nvdimm->dev); + __nvdimm_security_overwrite_query(nvdimm); + nvdimm_bus_unlock(&nvdimm->dev); +} + +#define OPS \ + C( OP_FREEZE, "freeze", 1), \ + C( OP_DISABLE, "disable", 2), \ + C( OP_UPDATE, "update", 3), \ + C( OP_ERASE, "erase", 2), \ + C( OP_OVERWRITE, "overwrite", 2), \ + C( OP_MASTER_UPDATE, "master_update", 3), \ + C( OP_MASTER_ERASE, "master_erase", 2) +#undef C +#define C(a, b, c) a +enum nvdimmsec_op_ids { OPS }; +#undef C +#define C(a, b, c) { b, c } +static struct { + const char *name; + int args; +} ops[] = { OPS }; +#undef C + +#define SEC_CMD_SIZE 32 +#define KEY_ID_SIZE 10 + +ssize_t nvdimm_security_store(struct device *dev, const char *buf, size_t len) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + ssize_t rc; + char cmd[SEC_CMD_SIZE+1], keystr[KEY_ID_SIZE+1], + nkeystr[KEY_ID_SIZE+1]; + unsigned int key, newkey; + int i; + + rc = sscanf(buf, "%"__stringify(SEC_CMD_SIZE)"s" + " %"__stringify(KEY_ID_SIZE)"s" + " %"__stringify(KEY_ID_SIZE)"s", + cmd, keystr, nkeystr); + if (rc < 1) + return -EINVAL; + for (i = 0; i < ARRAY_SIZE(ops); i++) + if (sysfs_streq(cmd, ops[i].name)) + break; + if (i >= ARRAY_SIZE(ops)) + return -EINVAL; + if (ops[i].args > 1) + rc = kstrtouint(keystr, 0, &key); + if (rc >= 0 && ops[i].args > 2) + rc = kstrtouint(nkeystr, 0, &newkey); + if (rc < 0) + return rc; + + if (i == OP_FREEZE) { + dev_dbg(dev, "freeze\n"); + rc = nvdimm_security_freeze(nvdimm); + } else if (i == 
OP_DISABLE) { + dev_dbg(dev, "disable %u\n", key); + rc = security_disable(nvdimm, key); + } else if (i == OP_UPDATE || i == OP_MASTER_UPDATE) { + dev_dbg(dev, "%s %u %u\n", ops[i].name, key, newkey); + rc = security_update(nvdimm, key, newkey, i == OP_UPDATE + ? NVDIMM_USER : NVDIMM_MASTER); + } else if (i == OP_ERASE || i == OP_MASTER_ERASE) { + dev_dbg(dev, "%s %u\n", ops[i].name, key); + if (atomic_read(&nvdimm->busy)) { + dev_dbg(dev, "Unable to secure erase while DIMM active.\n"); + return -EBUSY; + } + rc = security_erase(nvdimm, key, i == OP_ERASE + ? NVDIMM_USER : NVDIMM_MASTER); + } else if (i == OP_OVERWRITE) { + dev_dbg(dev, "overwrite %u\n", key); + if (atomic_read(&nvdimm->busy)) { + dev_dbg(dev, "Unable to overwrite while DIMM active.\n"); + return -EBUSY; + } + rc = security_overwrite(nvdimm, key); + } else + return -EINVAL; + + if (rc == 0) + rc = len; + return rc; +} diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c new file mode 100644 index 000000000..20da455d2 --- /dev/null +++ b/drivers/nvdimm/virtio_pmem.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio_pmem.c: Virtio pmem Driver + * + * Discovers persistent memory range information + * from host and registers the virtual pmem device + * with libnvdimm core. + */ +#include "virtio_pmem.h" +#include "nd.h" + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + + /* Initialize virt queue */ +static int init_vq(struct virtio_pmem *vpmem) +{ + /* single vq */ + vpmem->req_vq = virtio_find_single_vq(vpmem->vdev, + virtio_pmem_host_ack, "flush_queue"); + if (IS_ERR(vpmem->req_vq)) + return PTR_ERR(vpmem->req_vq); + + spin_lock_init(&vpmem->pmem_lock); + INIT_LIST_HEAD(&vpmem->req_list); + + return 0; +}; + +static int virtio_pmem_probe(struct virtio_device *vdev) +{ + struct nd_region_desc ndr_desc = {}; + int nid = dev_to_node(&vdev->dev); + struct nd_region *nd_region; + struct virtio_pmem *vpmem; + struct resource res; + int err = 0; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), GFP_KERNEL); + if (!vpmem) { + err = -ENOMEM; + goto out_err; + } + + vpmem->vdev = vdev; + vdev->priv = vpmem; + err = init_vq(vpmem); + if (err) { + dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n"); + goto out_err; + } + + virtio_cread_le(vpmem->vdev, struct virtio_pmem_config, + start, &vpmem->start); + virtio_cread_le(vpmem->vdev, struct virtio_pmem_config, + size, &vpmem->size); + + res.start = vpmem->start; + res.end = vpmem->start + vpmem->size - 1; + vpmem->nd_desc.provider_name = "virtio-pmem"; + vpmem->nd_desc.module = THIS_MODULE; + + vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev, + &vpmem->nd_desc); + if (!vpmem->nvdimm_bus) { + dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n"); + err = -ENXIO; + goto out_vq; + } + + dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus); + + ndr_desc.res = &res; + ndr_desc.numa_node = nid; + ndr_desc.flush = async_pmem_flush; + ndr_desc.provider_data = vdev; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + set_bit(ND_REGION_ASYNC, &ndr_desc.flags); + /* + * The NVDIMM region could be available before the + * virtio_device_ready() that is called by + * virtio_dev_probe(), so we set device ready here. 
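+	 * Otherwise the region registration below could trigger an
+	 * async_pmem_flush() request before the core has marked the
+	 * device ready.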
+ */ + virtio_device_ready(vdev); + nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc); + if (!nd_region) { + dev_err(&vdev->dev, "failed to create nvdimm region\n"); + err = -ENXIO; + goto out_nd; + } + return 0; +out_nd: + virtio_reset_device(vdev); + nvdimm_bus_unregister(vpmem->nvdimm_bus); +out_vq: + vdev->config->del_vqs(vdev); +out_err: + return err; +} + +static void virtio_pmem_remove(struct virtio_device *vdev) +{ + struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev); + + nvdimm_bus_unregister(nvdimm_bus); + vdev->config->del_vqs(vdev); + virtio_reset_device(vdev); +} + +static struct virtio_driver virtio_pmem_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtio_pmem_probe, + .remove = virtio_pmem_remove, +}; + +module_virtio_driver(virtio_pmem_driver); +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio pmem driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h new file mode 100644 index 000000000..0dddefe59 --- /dev/null +++ b/drivers/nvdimm/virtio_pmem.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * virtio_pmem.h: virtio pmem Driver + * + * Discovers persistent memory range information + * from host and provides a virtio based flushing + * interface. + **/ + +#ifndef _LINUX_VIRTIO_PMEM_H +#define _LINUX_VIRTIO_PMEM_H + +#include <linux/module.h> +#include <uapi/linux/virtio_pmem.h> +#include <linux/libnvdimm.h> +#include <linux/spinlock.h> + +struct virtio_pmem_request { + struct virtio_pmem_req req; + struct virtio_pmem_resp resp; + + /* Wait queue to process deferred work after ack from host */ + wait_queue_head_t host_acked; + bool done; + + /* Wait queue to process deferred work after virt queue buffer avail */ + wait_queue_head_t wq_buf; + bool wq_buf_avail; + struct list_head list; +}; + +struct virtio_pmem { + struct virtio_device *vdev; + + /* Virtio pmem request queue */ + struct virtqueue *req_vq; + + /* nvdimm bus registers virtio pmem device */ + struct nvdimm_bus *nvdimm_bus; + struct nvdimm_bus_descriptor nd_desc; + + /* List to store deferred work if virtqueue is full */ + struct list_head req_list; + + /* Synchronize virtqueue data */ + spinlock_t pmem_lock; + + /* Memory region information */ + __u64 start; + __u64 size; +}; + +void virtio_pmem_host_ack(struct virtqueue *vq); +int async_pmem_flush(struct nd_region *nd_region, struct bio *bio); +#endif |