From 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 12:05:51 +0200 Subject: Adding upstream version 5.10.209. Signed-off-by: Daniel Baumann --- drivers/dax/Kconfig | 82 +++ drivers/dax/Makefile | 11 + drivers/dax/bus.c | 1482 +++++++++++++++++++++++++++++++++++++++++++++ drivers/dax/bus.h | 67 ++ drivers/dax/dax-private.h | 110 ++++ drivers/dax/device.c | 481 +++++++++++++++ drivers/dax/hmem/Makefile | 6 + drivers/dax/hmem/device.c | 101 +++ drivers/dax/hmem/hmem.c | 65 ++ drivers/dax/kmem.c | 228 +++++++ drivers/dax/pmem/Makefile | 8 + drivers/dax/pmem/compat.c | 73 +++ drivers/dax/pmem/core.c | 79 +++ drivers/dax/pmem/pmem.c | 40 ++ drivers/dax/super.c | 765 +++++++++++++++++++++++ 15 files changed, 3598 insertions(+) create mode 100644 drivers/dax/Kconfig create mode 100644 drivers/dax/Makefile create mode 100644 drivers/dax/bus.c create mode 100644 drivers/dax/bus.h create mode 100644 drivers/dax/dax-private.h create mode 100644 drivers/dax/device.c create mode 100644 drivers/dax/hmem/Makefile create mode 100644 drivers/dax/hmem/device.c create mode 100644 drivers/dax/hmem/hmem.c create mode 100644 drivers/dax/kmem.c create mode 100644 drivers/dax/pmem/Makefile create mode 100644 drivers/dax/pmem/compat.c create mode 100644 drivers/dax/pmem/core.c create mode 100644 drivers/dax/pmem/pmem.c create mode 100644 drivers/dax/super.c (limited to 'drivers/dax') diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig new file mode 100644 index 000000000..d2834c2cf --- /dev/null +++ b/drivers/dax/Kconfig @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: GPL-2.0-only +config DAX_DRIVER + select DAX + bool + +menuconfig DAX + tristate "DAX: direct access to differentiated memory" + select SRCU + default m if NVDIMM_DAX + +if DAX + +config DEV_DAX + tristate "Device DAX: direct access mapping device" + depends on TRANSPARENT_HUGEPAGE + help + Support raw access to differentiated (persistence, bandwidth, + latency...) memory via an mmap(2) capable character + device. Platform firmware or a device driver may identify a + platform memory resource that is differentiated from the + baseline memory pool. Mappings of a /dev/daxX.Y device impose + restrictions that make the mapping behavior deterministic. + +config DEV_DAX_PMEM + tristate "PMEM DAX: direct access to persistent memory" + depends on LIBNVDIMM && NVDIMM_DAX && DEV_DAX + default DEV_DAX + help + Support raw access to persistent memory. Note that this + driver consumes memory ranges allocated and exported by the + libnvdimm sub-system. + + Say M if unsure + +config DEV_DAX_HMEM + tristate "HMEM DAX: direct access to 'specific purpose' memory" + depends on EFI_SOFT_RESERVE + select NUMA_KEEP_MEMINFO if (NUMA && X86) + default DEV_DAX + help + EFI 2.8 platforms, and others, may advertise 'specific purpose' + memory. For example, a high bandwidth memory pool. The + indication from platform firmware is meant to reserve the + memory from typical usage by default. This driver creates + device-dax instances for these memory ranges, and that also + enables the possibility to assign them to the DEV_DAX_KMEM + driver to override the reservation and add them to kernel + "System RAM" pool. + + Say M if unsure. + +config DEV_DAX_HMEM_DEVICES + depends on DEV_DAX_HMEM && DAX=y + def_bool y + +config DEV_DAX_KMEM + tristate "KMEM DAX: volatile-use of persistent memory" + default DEV_DAX + depends on DEV_DAX + depends on MEMORY_HOTPLUG # for add_memory() and friends + help + Support access to persistent, or other performance + differentiated memory as if it were System RAM. This allows + easier use of persistent memory by unmodified applications, or + adds core kernel memory services to heterogeneous memory types + (HMEM) marked "reserved" by platform firmware. + + To use this feature, a DAX device must be unbound from the + device_dax driver and bound to this kmem driver on each boot. + + Say N if unsure. + +config DEV_DAX_PMEM_COMPAT + tristate "PMEM DAX: support the deprecated /sys/class/dax interface" + depends on m && DEV_DAX_PMEM=m + default DEV_DAX_PMEM + help + Older versions of the libdaxctl library expect to find all + device-dax instances under /sys/class/dax. If libdaxctl in + your distribution is older than v58 say M, otherwise say N. + +endif diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile new file mode 100644 index 000000000..9d4ba672d --- /dev/null +++ b/drivers/dax/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DAX) += dax.o +obj-$(CONFIG_DEV_DAX) += device_dax.o +obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o + +dax-y := super.o +dax-y += bus.o +device_dax-y := device.o + +obj-y += pmem/ +obj-y += hmem/ diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c new file mode 100644 index 000000000..0541b7e4d --- /dev/null +++ b/drivers/dax/bus.c @@ -0,0 +1,1482 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2017-2018 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include +#include +#include +#include +#include "dax-private.h" +#include "bus.h" + +static struct class *dax_class; + +static DEFINE_MUTEX(dax_bus_lock); + +#define DAX_NAME_LEN 30 +struct dax_id { + struct list_head list; + char dev_name[DAX_NAME_LEN]; +}; + +static int dax_bus_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + /* + * We only ever expect to handle device-dax instances, i.e. the + * @type argument to MODULE_ALIAS_DAX_DEVICE() is always zero + */ + return add_uevent_var(env, "MODALIAS=" DAX_DEVICE_MODALIAS_FMT, 0); +} + +static struct dax_device_driver *to_dax_drv(struct device_driver *drv) +{ + return container_of(drv, struct dax_device_driver, drv); +} + +static struct dax_id *__dax_match_id(struct dax_device_driver *dax_drv, + const char *dev_name) +{ + struct dax_id *dax_id; + + lockdep_assert_held(&dax_bus_lock); + + list_for_each_entry(dax_id, &dax_drv->ids, list) + if (sysfs_streq(dax_id->dev_name, dev_name)) + return dax_id; + return NULL; +} + +static int dax_match_id(struct dax_device_driver *dax_drv, struct device *dev) +{ + int match; + + mutex_lock(&dax_bus_lock); + match = !!__dax_match_id(dax_drv, dev_name(dev)); + mutex_unlock(&dax_bus_lock); + + return match; +} + +enum id_action { + ID_REMOVE, + ID_ADD, +}; + +static ssize_t do_id_store(struct device_driver *drv, const char *buf, + size_t count, enum id_action action) +{ + struct dax_device_driver *dax_drv = to_dax_drv(drv); + unsigned int region_id, id; + char devname[DAX_NAME_LEN]; + struct dax_id *dax_id; + ssize_t rc = count; + int fields; + + fields = sscanf(buf, "dax%d.%d", ®ion_id, &id); + if (fields != 2) + return -EINVAL; + sprintf(devname, "dax%d.%d", region_id, id); + if (!sysfs_streq(buf, devname)) + return -EINVAL; + + mutex_lock(&dax_bus_lock); + dax_id = __dax_match_id(dax_drv, buf); + if (!dax_id) { + if (action == ID_ADD) { + dax_id = kzalloc(sizeof(*dax_id), GFP_KERNEL); + if (dax_id) { + strncpy(dax_id->dev_name, buf, DAX_NAME_LEN); + list_add(&dax_id->list, &dax_drv->ids); + } else + rc = -ENOMEM; + } else + /* nothing to remove */; + } else if (action == ID_REMOVE) { + list_del(&dax_id->list); + kfree(dax_id); + } else + /* dax_id already added */; + mutex_unlock(&dax_bus_lock); + + if (rc < 0) + return rc; + if (action == ID_ADD) + rc = driver_attach(drv); + if (rc) + return rc; + return count; +} + +static ssize_t new_id_store(struct device_driver *drv, const char *buf, + size_t count) +{ + return do_id_store(drv, buf, count, ID_ADD); +} +static DRIVER_ATTR_WO(new_id); + +static ssize_t remove_id_store(struct device_driver *drv, const char *buf, + size_t count) +{ + return do_id_store(drv, buf, count, ID_REMOVE); +} +static DRIVER_ATTR_WO(remove_id); + +static struct attribute *dax_drv_attrs[] = { + &driver_attr_new_id.attr, + &driver_attr_remove_id.attr, + NULL, +}; +ATTRIBUTE_GROUPS(dax_drv); + +static int dax_bus_match(struct device *dev, struct device_driver *drv); + +static bool is_static(struct dax_region *dax_region) +{ + return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0; +} + +static u64 dev_dax_size(struct dev_dax *dev_dax) +{ + u64 size = 0; + int i; + + device_lock_assert(&dev_dax->dev); + + for (i = 0; i < dev_dax->nr_range; i++) + size += range_len(&dev_dax->ranges[i].range); + + return size; +} + +static int dax_bus_probe(struct device *dev) +{ + struct dax_device_driver *dax_drv = to_dax_drv(dev->driver); + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + int rc; + + if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0) + return -ENXIO; + + rc = dax_drv->probe(dev_dax); + + if (rc || is_static(dax_region)) + return rc; + + /* + * Track new seed creation only after successful probe of the + * previous seed. + */ + if (dax_region->seed == dev) + dax_region->seed = NULL; + + return 0; +} + +static int dax_bus_remove(struct device *dev) +{ + struct dax_device_driver *dax_drv = to_dax_drv(dev->driver); + struct dev_dax *dev_dax = to_dev_dax(dev); + + return dax_drv->remove(dev_dax); +} + +static struct bus_type dax_bus_type = { + .name = "dax", + .uevent = dax_bus_uevent, + .match = dax_bus_match, + .probe = dax_bus_probe, + .remove = dax_bus_remove, + .drv_groups = dax_drv_groups, +}; + +static int dax_bus_match(struct device *dev, struct device_driver *drv) +{ + struct dax_device_driver *dax_drv = to_dax_drv(drv); + + /* + * All but the 'device-dax' driver, which has 'match_always' + * set, requires an exact id match. + */ + if (dax_drv->match_always) + return 1; + + return dax_match_id(dax_drv, dev); +} + +/* + * Rely on the fact that drvdata is set before the attributes are + * registered, and that the attributes are unregistered before drvdata + * is cleared to assume that drvdata is always valid. + */ +static ssize_t id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", dax_region->id); +} +static DEVICE_ATTR_RO(id); + +static ssize_t region_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + + return sprintf(buf, "%llu\n", (unsigned long long) + resource_size(&dax_region->res)); +} +static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, + region_size_show, NULL); + +static ssize_t region_align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", dax_region->align); +} +static struct device_attribute dev_attr_region_align = + __ATTR(align, 0400, region_align_show, NULL); + +#define for_each_dax_region_resource(dax_region, res) \ + for (res = (dax_region)->res.child; res; res = res->sibling) + +static unsigned long long dax_region_avail_size(struct dax_region *dax_region) +{ + resource_size_t size = resource_size(&dax_region->res); + struct resource *res; + + device_lock_assert(dax_region->dev); + + for_each_dax_region_resource(dax_region, res) + size -= resource_size(res); + return size; +} + +static ssize_t available_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + unsigned long long size; + + device_lock(dev); + size = dax_region_avail_size(dax_region); + device_unlock(dev); + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(available_size); + +static ssize_t seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct device *seed; + ssize_t rc; + + if (is_static(dax_region)) + return -EINVAL; + + device_lock(dev); + seed = dax_region->seed; + rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : ""); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(seed); + +static ssize_t create_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct device *youngest; + ssize_t rc; + + if (is_static(dax_region)) + return -EINVAL; + + device_lock(dev); + youngest = dax_region->youngest; + rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : ""); + device_unlock(dev); + + return rc; +} + +static ssize_t create_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + unsigned long long avail; + ssize_t rc; + int val; + + if (is_static(dax_region)) + return -EINVAL; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + if (val != 1) + return -EINVAL; + + device_lock(dev); + avail = dax_region_avail_size(dax_region); + if (avail == 0) + rc = -ENOSPC; + else { + struct dev_dax_data data = { + .dax_region = dax_region, + .size = 0, + .id = -1, + }; + struct dev_dax *dev_dax = devm_create_dev_dax(&data); + + if (IS_ERR(dev_dax)) + rc = PTR_ERR(dev_dax); + else { + /* + * In support of crafting multiple new devices + * simultaneously multiple seeds can be created, + * but only the first one that has not been + * successfully bound is tracked as the region + * seed. + */ + if (!dax_region->seed) + dax_region->seed = &dev_dax->dev; + dax_region->youngest = &dev_dax->dev; + rc = len; + } + } + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(create); + +void kill_dev_dax(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + + kill_dax(dax_dev); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); +} +EXPORT_SYMBOL_GPL(kill_dev_dax); + +static void trim_dev_dax_range(struct dev_dax *dev_dax) +{ + int i = dev_dax->nr_range - 1; + struct range *range = &dev_dax->ranges[i].range; + struct dax_region *dax_region = dev_dax->region; + + device_lock_assert(dax_region->dev); + dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i, + (unsigned long long)range->start, + (unsigned long long)range->end); + + __release_region(&dax_region->res, range->start, range_len(range)); + if (--dev_dax->nr_range == 0) { + kfree(dev_dax->ranges); + dev_dax->ranges = NULL; + } +} + +static void free_dev_dax_ranges(struct dev_dax *dev_dax) +{ + while (dev_dax->nr_range) + trim_dev_dax_range(dev_dax); +} + +static void unregister_dev_dax(void *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + dev_dbg(dev, "%s\n", __func__); + + kill_dev_dax(dev_dax); + device_del(dev); + free_dev_dax_ranges(dev_dax); + put_device(dev); +} + +static void dax_region_free(struct kref *kref) +{ + struct dax_region *dax_region; + + dax_region = container_of(kref, struct dax_region, kref); + kfree(dax_region); +} + +void dax_region_put(struct dax_region *dax_region) +{ + kref_put(&dax_region->kref, dax_region_free); +} +EXPORT_SYMBOL_GPL(dax_region_put); + +/* a return value >= 0 indicates this invocation invalidated the id */ +static int __free_dev_dax_id(struct dev_dax *dev_dax) +{ + struct device *dev = &dev_dax->dev; + struct dax_region *dax_region; + int rc = dev_dax->id; + + device_lock_assert(dev); + + if (!dev_dax->dyn_id || dev_dax->id < 0) + return -1; + dax_region = dev_dax->region; + ida_free(&dax_region->ida, dev_dax->id); + dax_region_put(dax_region); + dev_dax->id = -1; + return rc; +} + +static int free_dev_dax_id(struct dev_dax *dev_dax) +{ + struct device *dev = &dev_dax->dev; + int rc; + + device_lock(dev); + rc = __free_dev_dax_id(dev_dax); + device_unlock(dev); + return rc; +} + +static int alloc_dev_dax_id(struct dev_dax *dev_dax) +{ + struct dax_region *dax_region = dev_dax->region; + int id; + + id = ida_alloc(&dax_region->ida, GFP_KERNEL); + if (id < 0) + return id; + kref_get(&dax_region->kref); + dev_dax->dyn_id = true; + dev_dax->id = id; + return id; +} + +static ssize_t delete_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct dev_dax *dev_dax; + struct device *victim; + bool do_del = false; + int rc; + + if (is_static(dax_region)) + return -EINVAL; + + victim = device_find_child_by_name(dax_region->dev, buf); + if (!victim) + return -ENXIO; + + device_lock(dev); + device_lock(victim); + dev_dax = to_dev_dax(victim); + if (victim->driver || dev_dax_size(dev_dax)) + rc = -EBUSY; + else { + /* + * Invalidate the device so it does not become active + * again, but always preserve device-id-0 so that + * /sys/bus/dax/ is guaranteed to be populated while any + * dax_region is registered. + */ + if (dev_dax->id > 0) { + do_del = __free_dev_dax_id(dev_dax) >= 0; + rc = len; + if (dax_region->seed == victim) + dax_region->seed = NULL; + if (dax_region->youngest == victim) + dax_region->youngest = NULL; + } else + rc = -EBUSY; + } + device_unlock(victim); + + /* won the race to invalidate the device, clean it up */ + if (do_del) + devm_release_action(dev, unregister_dev_dax, victim); + device_unlock(dev); + put_device(victim); + + return rc; +} +static DEVICE_ATTR_WO(delete); + +static umode_t dax_region_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct dax_region *dax_region = dev_get_drvdata(dev); + + if (is_static(dax_region)) + if (a == &dev_attr_available_size.attr + || a == &dev_attr_create.attr + || a == &dev_attr_seed.attr + || a == &dev_attr_delete.attr) + return 0; + return a->mode; +} + +static struct attribute *dax_region_attributes[] = { + &dev_attr_available_size.attr, + &dev_attr_region_size.attr, + &dev_attr_region_align.attr, + &dev_attr_create.attr, + &dev_attr_seed.attr, + &dev_attr_delete.attr, + &dev_attr_id.attr, + NULL, +}; + +static const struct attribute_group dax_region_attribute_group = { + .name = "dax_region", + .attrs = dax_region_attributes, + .is_visible = dax_region_visible, +}; + +static const struct attribute_group *dax_region_attribute_groups[] = { + &dax_region_attribute_group, + NULL, +}; + +static void dax_region_unregister(void *region) +{ + struct dax_region *dax_region = region; + + sysfs_remove_groups(&dax_region->dev->kobj, + dax_region_attribute_groups); + dax_region_put(dax_region); +} + +struct dax_region *alloc_dax_region(struct device *parent, int region_id, + struct range *range, int target_node, unsigned int align, + unsigned long flags) +{ + struct dax_region *dax_region; + + /* + * The DAX core assumes that it can store its private data in + * parent->driver_data. This WARN is a reminder / safeguard for + * developers of device-dax drivers. + */ + if (dev_get_drvdata(parent)) { + dev_WARN(parent, "dax core failed to setup private data\n"); + return NULL; + } + + if (!IS_ALIGNED(range->start, align) + || !IS_ALIGNED(range_len(range), align)) + return NULL; + + dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); + if (!dax_region) + return NULL; + + dev_set_drvdata(parent, dax_region); + kref_init(&dax_region->kref); + dax_region->id = region_id; + dax_region->align = align; + dax_region->dev = parent; + dax_region->target_node = target_node; + ida_init(&dax_region->ida); + dax_region->res = (struct resource) { + .start = range->start, + .end = range->end, + .flags = IORESOURCE_MEM | flags, + }; + + if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { + kfree(dax_region); + return NULL; + } + + kref_get(&dax_region->kref); + if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) + return NULL; + return dax_region; +} +EXPORT_SYMBOL_GPL(alloc_dax_region); + +static void dax_mapping_release(struct device *dev) +{ + struct dax_mapping *mapping = to_dax_mapping(dev); + struct device *parent = dev->parent; + struct dev_dax *dev_dax = to_dev_dax(parent); + + ida_free(&dev_dax->ida, mapping->id); + kfree(mapping); + put_device(parent); +} + +static void unregister_dax_mapping(void *data) +{ + struct device *dev = data; + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + struct dax_region *dax_region = dev_dax->region; + + dev_dbg(dev, "%s\n", __func__); + + device_lock_assert(dax_region->dev); + + dev_dax->ranges[mapping->range_id].mapping = NULL; + mapping->range_id = -1; + + device_del(dev); + put_device(dev); +} + +static struct dev_dax_range *get_dax_range(struct device *dev) +{ + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + struct dax_region *dax_region = dev_dax->region; + + device_lock(dax_region->dev); + if (mapping->range_id < 0) { + device_unlock(dax_region->dev); + return NULL; + } + + return &dev_dax->ranges[mapping->range_id]; +} + +static void put_dax_range(struct dev_dax_range *dax_range) +{ + struct dax_mapping *mapping = dax_range->mapping; + struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent); + struct dax_region *dax_region = dev_dax->region; + + device_unlock(dax_region->dev); +} + +static ssize_t start_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#llx\n", dax_range->range.start); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(start, 0400, start_show, NULL); + +static ssize_t end_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#llx\n", dax_range->range.end); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(end, 0400, end_show, NULL); + +static ssize_t pgoff_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#lx\n", dax_range->pgoff); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(page_offset, 0400, pgoff_show, NULL); + +static struct attribute *dax_mapping_attributes[] = { + &dev_attr_start.attr, + &dev_attr_end.attr, + &dev_attr_page_offset.attr, + NULL, +}; + +static const struct attribute_group dax_mapping_attribute_group = { + .attrs = dax_mapping_attributes, +}; + +static const struct attribute_group *dax_mapping_attribute_groups[] = { + &dax_mapping_attribute_group, + NULL, +}; + +static struct device_type dax_mapping_type = { + .release = dax_mapping_release, + .groups = dax_mapping_attribute_groups, +}; + +static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id) +{ + struct dax_region *dax_region = dev_dax->region; + struct dax_mapping *mapping; + struct device *dev; + int rc; + + device_lock_assert(dax_region->dev); + + if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver, + "region disabled\n")) + return -ENXIO; + + mapping = kzalloc(sizeof(*mapping), GFP_KERNEL); + if (!mapping) + return -ENOMEM; + mapping->range_id = range_id; + mapping->id = ida_alloc(&dev_dax->ida, GFP_KERNEL); + if (mapping->id < 0) { + kfree(mapping); + return -ENOMEM; + } + dev_dax->ranges[range_id].mapping = mapping; + dev = &mapping->dev; + device_initialize(dev); + dev->parent = &dev_dax->dev; + get_device(dev->parent); + dev->type = &dax_mapping_type; + dev_set_name(dev, "mapping%d", mapping->id); + rc = device_add(dev); + if (rc) { + put_device(dev); + return rc; + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_mapping, + dev); + if (rc) + return rc; + return 0; +} + +static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, + resource_size_t size) +{ + struct dax_region *dax_region = dev_dax->region; + struct resource *res = &dax_region->res; + struct device *dev = &dev_dax->dev; + struct dev_dax_range *ranges; + unsigned long pgoff = 0; + struct resource *alloc; + int i, rc; + + device_lock_assert(dax_region->dev); + + /* handle the seed alloc special case */ + if (!size) { + if (dev_WARN_ONCE(dev, dev_dax->nr_range, + "0-size allocation must be first\n")) + return -EBUSY; + /* nr_range == 0 is elsewhere special cased as 0-size device */ + return 0; + } + + ranges = krealloc(dev_dax->ranges, sizeof(*ranges) + * (dev_dax->nr_range + 1), GFP_KERNEL); + if (!ranges) + return -ENOMEM; + + alloc = __request_region(res, start, size, dev_name(dev), 0); + if (!alloc) { + /* + * If this was an empty set of ranges nothing else + * will release @ranges, so do it now. + */ + if (!dev_dax->nr_range) { + kfree(ranges); + ranges = NULL; + } + dev_dax->ranges = ranges; + return -ENOMEM; + } + + for (i = 0; i < dev_dax->nr_range; i++) + pgoff += PHYS_PFN(range_len(&ranges[i].range)); + dev_dax->ranges = ranges; + ranges[dev_dax->nr_range++] = (struct dev_dax_range) { + .pgoff = pgoff, + .range = { + .start = alloc->start, + .end = alloc->end, + }, + }; + + dev_dbg(dev, "alloc range[%d]: %pa:%pa\n", dev_dax->nr_range - 1, + &alloc->start, &alloc->end); + /* + * A dev_dax instance must be registered before mapping device + * children can be added. Defer to devm_create_dev_dax() to add + * the initial mapping device. + */ + if (!device_is_registered(&dev_dax->dev)) + return 0; + + rc = devm_register_dax_mapping(dev_dax, dev_dax->nr_range - 1); + if (rc) + trim_dev_dax_range(dev_dax); + + return rc; +} + +static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, resource_size_t size) +{ + int last_range = dev_dax->nr_range - 1; + struct dev_dax_range *dax_range = &dev_dax->ranges[last_range]; + struct dax_region *dax_region = dev_dax->region; + bool is_shrink = resource_size(res) > size; + struct range *range = &dax_range->range; + struct device *dev = &dev_dax->dev; + int rc; + + device_lock_assert(dax_region->dev); + + if (dev_WARN_ONCE(dev, !size, "deletion is handled by dev_dax_shrink\n")) + return -EINVAL; + + rc = adjust_resource(res, range->start, size); + if (rc) + return rc; + + *range = (struct range) { + .start = range->start, + .end = range->start + size - 1, + }; + + dev_dbg(dev, "%s range[%d]: %#llx:%#llx\n", is_shrink ? "shrink" : "extend", + last_range, (unsigned long long) range->start, + (unsigned long long) range->end); + + return 0; +} + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + unsigned long long size; + + device_lock(dev); + size = dev_dax_size(dev_dax); + device_unlock(dev); + + return sprintf(buf, "%llu\n", size); +} + +static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size) +{ + /* + * The minimum mapping granularity for a device instance is a + * single subsection, unless the arch says otherwise. + */ + return IS_ALIGNED(size, max_t(unsigned long, dev_dax->align, memremap_compat_align())); +} + +static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size) +{ + resource_size_t to_shrink = dev_dax_size(dev_dax) - size; + struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; + int i; + + for (i = dev_dax->nr_range - 1; i >= 0; i--) { + struct range *range = &dev_dax->ranges[i].range; + struct dax_mapping *mapping = dev_dax->ranges[i].mapping; + struct resource *adjust = NULL, *res; + resource_size_t shrink; + + shrink = min_t(u64, to_shrink, range_len(range)); + if (shrink >= range_len(range)) { + devm_release_action(dax_region->dev, + unregister_dax_mapping, &mapping->dev); + trim_dev_dax_range(dev_dax); + to_shrink -= shrink; + if (!to_shrink) + break; + continue; + } + + for_each_dax_region_resource(dax_region, res) + if (strcmp(res->name, dev_name(dev)) == 0 + && res->start == range->start) { + adjust = res; + break; + } + + if (dev_WARN_ONCE(dev, !adjust || i != dev_dax->nr_range - 1, + "failed to find matching resource\n")) + return -ENXIO; + return adjust_dev_dax_range(dev_dax, adjust, range_len(range) + - shrink); + } + return 0; +} + +/* + * Only allow adjustments that preserve the relative pgoff of existing + * allocations. I.e. the dev_dax->ranges array is ordered by increasing pgoff. + */ +static bool adjust_ok(struct dev_dax *dev_dax, struct resource *res) +{ + struct dev_dax_range *last; + int i; + + if (dev_dax->nr_range == 0) + return false; + if (strcmp(res->name, dev_name(&dev_dax->dev)) != 0) + return false; + last = &dev_dax->ranges[dev_dax->nr_range - 1]; + if (last->range.start != res->start || last->range.end != res->end) + return false; + for (i = 0; i < dev_dax->nr_range - 1; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + + if (dax_range->pgoff > last->pgoff) + return false; + } + + return true; +} + +static ssize_t dev_dax_resize(struct dax_region *dax_region, + struct dev_dax *dev_dax, resource_size_t size) +{ + resource_size_t avail = dax_region_avail_size(dax_region), to_alloc; + resource_size_t dev_size = dev_dax_size(dev_dax); + struct resource *region_res = &dax_region->res; + struct device *dev = &dev_dax->dev; + struct resource *res, *first; + resource_size_t alloc = 0; + int rc; + + if (dev->driver) + return -EBUSY; + if (size == dev_size) + return 0; + if (size > dev_size && size - dev_size > avail) + return -ENOSPC; + if (size < dev_size) + return dev_dax_shrink(dev_dax, size); + + to_alloc = size - dev_size; + if (dev_WARN_ONCE(dev, !alloc_is_aligned(dev_dax, to_alloc), + "resize of %pa misaligned\n", &to_alloc)) + return -ENXIO; + + /* + * Expand the device into the unused portion of the region. This + * may involve adjusting the end of an existing resource, or + * allocating a new resource. + */ +retry: + first = region_res->child; + if (!first) + return alloc_dev_dax_range(dev_dax, dax_region->res.start, to_alloc); + + rc = -ENOSPC; + for (res = first; res; res = res->sibling) { + struct resource *next = res->sibling; + + /* space at the beginning of the region */ + if (res == first && res->start > dax_region->res.start) { + alloc = min(res->start - dax_region->res.start, to_alloc); + rc = alloc_dev_dax_range(dev_dax, dax_region->res.start, alloc); + break; + } + + alloc = 0; + /* space between allocations */ + if (next && next->start > res->end + 1) + alloc = min(next->start - (res->end + 1), to_alloc); + + /* space at the end of the region */ + if (!alloc && !next && res->end < region_res->end) + alloc = min(region_res->end - res->end, to_alloc); + + if (!alloc) + continue; + + if (adjust_ok(dev_dax, res)) { + rc = adjust_dev_dax_range(dev_dax, res, resource_size(res) + alloc); + break; + } + rc = alloc_dev_dax_range(dev_dax, res->end + 1, alloc); + break; + } + if (rc) + return rc; + to_alloc -= alloc; + if (to_alloc) + goto retry; + return 0; +} + +static ssize_t size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + ssize_t rc; + unsigned long long val; + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + + rc = kstrtoull(buf, 0, &val); + if (rc) + return rc; + + if (!alloc_is_aligned(dev_dax, val)) { + dev_dbg(dev, "%s: size: %lld misaligned\n", __func__, val); + return -EINVAL; + } + + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return -ENXIO; + } + device_lock(dev); + rc = dev_dax_resize(dax_region, dev_dax, val); + device_unlock(dev); + device_unlock(dax_region->dev); + + return rc == 0 ? len : rc; +} +static DEVICE_ATTR_RW(size); + +static ssize_t range_parse(const char *opt, size_t len, struct range *range) +{ + unsigned long long addr = 0; + char *start, *end, *str; + ssize_t rc = -EINVAL; + + str = kstrdup(opt, GFP_KERNEL); + if (!str) + return rc; + + end = str; + start = strsep(&end, "-"); + if (!start || !end) + goto err; + + rc = kstrtoull(start, 16, &addr); + if (rc) + goto err; + range->start = addr; + + rc = kstrtoull(end, 16, &addr); + if (rc) + goto err; + range->end = addr; + +err: + kfree(str); + return rc; +} + +static ssize_t mapping_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + size_t to_alloc; + struct range r; + ssize_t rc; + + rc = range_parse(buf, len, &r); + if (rc) + return rc; + + rc = -ENXIO; + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return rc; + } + device_lock(dev); + + to_alloc = range_len(&r); + if (alloc_is_aligned(dev_dax, to_alloc)) + rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc); + device_unlock(dev); + device_unlock(dax_region->dev); + + return rc == 0 ? len : rc; +} +static DEVICE_ATTR_WO(mapping); + +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + return sprintf(buf, "%d\n", dev_dax->align); +} + +static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax) +{ + resource_size_t dev_size = dev_dax_size(dev_dax); + struct device *dev = &dev_dax->dev; + int i; + + if (dev_size > 0 && !alloc_is_aligned(dev_dax, dev_size)) { + dev_dbg(dev, "%s: align %u invalid for size %pa\n", + __func__, dev_dax->align, &dev_size); + return -EINVAL; + } + + for (i = 0; i < dev_dax->nr_range; i++) { + size_t len = range_len(&dev_dax->ranges[i].range); + + if (!alloc_is_aligned(dev_dax, len)) { + dev_dbg(dev, "%s: align %u invalid for range %d\n", + __func__, dev_dax->align, i); + return -EINVAL; + } + } + + return 0; +} + +static ssize_t align_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + unsigned long val, align_save; + ssize_t rc; + + rc = kstrtoul(buf, 0, &val); + if (rc) + return -ENXIO; + + if (!dax_align_valid(val)) + return -EINVAL; + + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return -ENXIO; + } + + device_lock(dev); + if (dev->driver) { + rc = -EBUSY; + goto out_unlock; + } + + align_save = dev_dax->align; + dev_dax->align = val; + rc = dev_dax_validate_align(dev_dax); + if (rc) + dev_dax->align = align_save; +out_unlock: + device_unlock(dev); + device_unlock(dax_region->dev); + return rc == 0 ? len : rc; +} +static DEVICE_ATTR_RW(align); + +static int dev_dax_target_node(struct dev_dax *dev_dax) +{ + struct dax_region *dax_region = dev_dax->region; + + return dax_region->target_node; +} + +static ssize_t target_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax)); +} +static DEVICE_ATTR_RO(target_node); + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + unsigned long long start; + + if (dev_dax->nr_range < 1) + start = dax_region->res.start; + else + start = dev_dax->ranges[0].range.start; + + return sprintf(buf, "%#llx\n", start); +} +static DEVICE_ATTR(resource, 0400, resource_show, NULL); + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + /* + * We only ever expect to handle device-dax instances, i.e. the + * @type argument to MODULE_ALIAS_DAX_DEVICE() is always zero + */ + return sprintf(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t numa_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", dev_to_node(dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + + if (a == &dev_attr_target_node.attr && dev_dax_target_node(dev_dax) < 0) + return 0; + if (a == &dev_attr_numa_node.attr && !IS_ENABLED(CONFIG_NUMA)) + return 0; + if (a == &dev_attr_mapping.attr && is_static(dax_region)) + return 0; + if ((a == &dev_attr_align.attr || + a == &dev_attr_size.attr) && is_static(dax_region)) + return 0444; + return a->mode; +} + +static struct attribute *dev_dax_attributes[] = { + &dev_attr_modalias.attr, + &dev_attr_size.attr, + &dev_attr_mapping.attr, + &dev_attr_target_node.attr, + &dev_attr_align.attr, + &dev_attr_resource.attr, + &dev_attr_numa_node.attr, + NULL, +}; + +static const struct attribute_group dev_dax_attribute_group = { + .attrs = dev_dax_attributes, + .is_visible = dev_dax_visible, +}; + +static const struct attribute_group *dax_attribute_groups[] = { + &dev_dax_attribute_group, + NULL, +}; + +static void dev_dax_release(struct device *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_device *dax_dev = dev_dax->dax_dev; + + put_dax(dax_dev); + free_dev_dax_id(dev_dax); + kfree(dev_dax->pgmap); + kfree(dev_dax); +} + +static const struct device_type dev_dax_type = { + .release = dev_dax_release, + .groups = dax_attribute_groups, +}; + +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) +{ + struct dax_region *dax_region = data->dax_region; + struct device *parent = dax_region->dev; + struct dax_device *dax_dev; + struct dev_dax *dev_dax; + struct inode *inode; + struct device *dev; + int rc; + + dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL); + if (!dev_dax) + return ERR_PTR(-ENOMEM); + + dev_dax->region = dax_region; + if (is_static(dax_region)) { + if (dev_WARN_ONCE(parent, data->id < 0, + "dynamic id specified to static region\n")) { + rc = -EINVAL; + goto err_id; + } + + dev_dax->id = data->id; + } else { + if (dev_WARN_ONCE(parent, data->id >= 0, + "static id specified to dynamic region\n")) { + rc = -EINVAL; + goto err_id; + } + + rc = alloc_dev_dax_id(dev_dax); + if (rc < 0) + goto err_id; + } + + dev = &dev_dax->dev; + device_initialize(dev); + dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); + + rc = alloc_dev_dax_range(dev_dax, dax_region->res.start, data->size); + if (rc) + goto err_range; + + if (data->pgmap) { + dev_WARN_ONCE(parent, !is_static(dax_region), + "custom dev_pagemap requires a static dax_region\n"); + + dev_dax->pgmap = kmemdup(data->pgmap, + sizeof(struct dev_pagemap), GFP_KERNEL); + if (!dev_dax->pgmap) { + rc = -ENOMEM; + goto err_pgmap; + } + } + + /* + * No 'host' or dax_operations since there is no access to this + * device outside of mmap of the resulting character device. + */ + dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC); + if (IS_ERR(dax_dev)) { + rc = PTR_ERR(dax_dev); + goto err_alloc_dax; + } + + /* a device_dax instance is dead while the driver is not attached */ + kill_dax(dax_dev); + + dev_dax->dax_dev = dax_dev; + dev_dax->target_node = dax_region->target_node; + dev_dax->align = dax_region->align; + ida_init(&dev_dax->ida); + + inode = dax_inode(dax_dev); + dev->devt = inode->i_rdev; + if (data->subsys == DEV_DAX_BUS) + dev->bus = &dax_bus_type; + else + dev->class = dax_class; + dev->parent = parent; + dev->type = &dev_dax_type; + + rc = device_add(dev); + if (rc) { + kill_dev_dax(dev_dax); + put_device(dev); + return ERR_PTR(rc); + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); + if (rc) + return ERR_PTR(rc); + + /* register mapping device for the initial allocation range */ + if (dev_dax->nr_range && range_len(&dev_dax->ranges[0].range)) { + rc = devm_register_dax_mapping(dev_dax, 0); + if (rc) + return ERR_PTR(rc); + } + + return dev_dax; + +err_alloc_dax: + kfree(dev_dax->pgmap); +err_pgmap: + free_dev_dax_ranges(dev_dax); +err_range: + free_dev_dax_id(dev_dax); +err_id: + kfree(dev_dax); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(devm_create_dev_dax); + +static int match_always_count; + +int __dax_driver_register(struct dax_device_driver *dax_drv, + struct module *module, const char *mod_name) +{ + struct device_driver *drv = &dax_drv->drv; + int rc = 0; + + INIT_LIST_HEAD(&dax_drv->ids); + drv->owner = module; + drv->name = mod_name; + drv->mod_name = mod_name; + drv->bus = &dax_bus_type; + + /* there can only be one default driver */ + mutex_lock(&dax_bus_lock); + match_always_count += dax_drv->match_always; + if (match_always_count > 1) { + match_always_count--; + WARN_ON(1); + rc = -EINVAL; + } + mutex_unlock(&dax_bus_lock); + if (rc) + return rc; + return driver_register(drv); +} +EXPORT_SYMBOL_GPL(__dax_driver_register); + +void dax_driver_unregister(struct dax_device_driver *dax_drv) +{ + struct device_driver *drv = &dax_drv->drv; + struct dax_id *dax_id, *_id; + + mutex_lock(&dax_bus_lock); + match_always_count -= dax_drv->match_always; + list_for_each_entry_safe(dax_id, _id, &dax_drv->ids, list) { + list_del(&dax_id->list); + kfree(dax_id); + } + mutex_unlock(&dax_bus_lock); + driver_unregister(drv); +} +EXPORT_SYMBOL_GPL(dax_driver_unregister); + +int __init dax_bus_init(void) +{ + int rc; + + if (IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT)) { + dax_class = class_create(THIS_MODULE, "dax"); + if (IS_ERR(dax_class)) + return PTR_ERR(dax_class); + } + + rc = bus_register(&dax_bus_type); + if (rc) + class_destroy(dax_class); + return rc; +} + +void __exit dax_bus_exit(void) +{ + bus_unregister(&dax_bus_type); + class_destroy(dax_class); +} diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h new file mode 100644 index 000000000..72b92f955 --- /dev/null +++ b/drivers/dax/bus.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */ +#ifndef __DAX_BUS_H__ +#define __DAX_BUS_H__ +#include +#include + +struct dev_dax; +struct resource; +struct dax_device; +struct dax_region; +void dax_region_put(struct dax_region *dax_region); + +#define IORESOURCE_DAX_STATIC (1UL << 0) +struct dax_region *alloc_dax_region(struct device *parent, int region_id, + struct range *range, int target_node, unsigned int align, + unsigned long flags); + +enum dev_dax_subsys { + DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */ + DEV_DAX_CLASS, +}; + +struct dev_dax_data { + struct dax_region *dax_region; + struct dev_pagemap *pgmap; + enum dev_dax_subsys subsys; + resource_size_t size; + int id; +}; + +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data); + +/* to be deleted when DEV_DAX_CLASS is removed */ +struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys); + +struct dax_device_driver { + struct device_driver drv; + struct list_head ids; + int match_always; + int (*probe)(struct dev_dax *dev); + int (*remove)(struct dev_dax *dev); +}; + +int __dax_driver_register(struct dax_device_driver *dax_drv, + struct module *module, const char *mod_name); +#define dax_driver_register(driver) \ + __dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) +void dax_driver_unregister(struct dax_device_driver *dax_drv); +void kill_dev_dax(struct dev_dax *dev_dax); + +#if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) +int dev_dax_probe(struct dev_dax *dev_dax); +#endif + +/* + * While run_dax() is potentially a generic operation that could be + * defined in include/linux/dax.h we don't want to grow any users + * outside of drivers/dax/ + */ +void run_dax(struct dax_device *dax_dev); + +#define MODULE_ALIAS_DAX_DEVICE(type) \ + MODULE_ALIAS("dax:t" __stringify(type) "*") +#define DAX_DEVICE_MODALIAS_FMT "dax:t%d" + +#endif /* __DAX_BUS_H__ */ diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h new file mode 100644 index 000000000..afcada6fd --- /dev/null +++ b/drivers/dax/dax-private.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright(c) 2016 Intel Corporation. All rights reserved. + */ +#ifndef __DAX_PRIVATE_H__ +#define __DAX_PRIVATE_H__ + +#include +#include +#include + +/* private routines between core files */ +struct dax_device; +struct dax_device *inode_dax(struct inode *inode); +struct inode *dax_inode(struct dax_device *dax_dev); +int dax_bus_init(void); +void dax_bus_exit(void); + +/** + * struct dax_region - mapping infrastructure for dax devices + * @id: kernel-wide unique region for a memory range + * @target_node: effective numa node if this memory range is onlined + * @kref: to pin while other agents have a need to do lookups + * @dev: parent device backing this region + * @align: allocation and mapping alignment for child dax devices + * @ida: instance id allocator + * @res: resource tree to track instance allocations + * @seed: allow userspace to find the first unbound seed device + * @youngest: allow userspace to find the most recently created device + */ +struct dax_region { + int id; + int target_node; + struct kref kref; + struct device *dev; + unsigned int align; + struct ida ida; + struct resource res; + struct device *seed; + struct device *youngest; +}; + +struct dax_mapping { + struct device dev; + int range_id; + int id; +}; + +/** + * struct dev_dax - instance data for a subdivision of a dax region, and + * data while the device is activated in the driver. + * @region - parent region + * @dax_dev - core dax functionality + * @target_node: effective numa node if dev_dax memory range is onlined + * @dyn_id: is this a dynamic or statically created instance + * @id: ida allocated id when the dax_region is not static + * @ida: mapping id allocator + * @dev - device core + * @pgmap - pgmap for memmap setup / lifetime (driver owned) + * @nr_range: size of @ranges + * @ranges: resource-span + pgoff tuples for the instance + */ +struct dev_dax { + struct dax_region *region; + struct dax_device *dax_dev; + unsigned int align; + int target_node; + bool dyn_id; + int id; + struct ida ida; + struct device dev; + struct dev_pagemap *pgmap; + int nr_range; + struct dev_dax_range { + unsigned long pgoff; + struct range range; + struct dax_mapping *mapping; + } *ranges; +}; + +static inline struct dev_dax *to_dev_dax(struct device *dev) +{ + return container_of(dev, struct dev_dax, dev); +} + +static inline struct dax_mapping *to_dax_mapping(struct device *dev) +{ + return container_of(dev, struct dax_mapping, dev); +} + +phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline bool dax_align_valid(unsigned long align) +{ + if (align == PUD_SIZE && IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + return true; + if (align == PMD_SIZE && has_transparent_hugepage()) + return true; + if (align == PAGE_SIZE) + return true; + return false; +} +#else +static inline bool dax_align_valid(unsigned long align) +{ + return align == PAGE_SIZE; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif diff --git a/drivers/dax/device.c b/drivers/dax/device.c new file mode 100644 index 000000000..25e0b84a4 --- /dev/null +++ b/drivers/dax/device.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-2018 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dax-private.h" +#include "bus.h" + +static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, + const char *func) +{ + struct device *dev = &dev_dax->dev; + unsigned long mask; + + if (!dax_alive(dev_dax->dax_dev)) + return -ENXIO; + + /* prevent private mappings from being established */ + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { + dev_info_ratelimited(dev, + "%s: %s: fail, attempted private mapping\n", + current->comm, func); + return -EINVAL; + } + + mask = dev_dax->align - 1; + if (vma->vm_start & mask || vma->vm_end & mask) { + dev_info_ratelimited(dev, + "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", + current->comm, func, vma->vm_start, vma->vm_end, + mask); + return -EINVAL; + } + + if (!vma_is_dax(vma)) { + dev_info_ratelimited(dev, + "%s: %s: fail, vma is not DAX capable\n", + current->comm, func); + return -EINVAL; + } + + return 0; +} + +/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ +__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, + unsigned long size) +{ + int i; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + unsigned long long pgoff_end; + phys_addr_t phys; + + pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1; + if (pgoff < dax_range->pgoff || pgoff > pgoff_end) + continue; + phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; + if (phys + size - 1 <= range->end) + return phys; + break; + } + return -1; +} + +static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, + struct vm_fault *vmf, pfn_t *pfn) +{ + struct device *dev = &dev_dax->dev; + phys_addr_t phys; + unsigned int fault_size = PAGE_SIZE; + + if (check_vma(dev_dax, vmf->vma, __func__)) + return VM_FAULT_SIGBUS; + + if (dev_dax->align > PAGE_SIZE) { + dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", + dev_dax->align, fault_size); + return VM_FAULT_SIGBUS; + } + + if (fault_size != dev_dax->align) + return VM_FAULT_SIGBUS; + + phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); + if (phys == -1) { + dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", vmf->pgoff); + return VM_FAULT_SIGBUS; + } + + *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + + return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); +} + +static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, + struct vm_fault *vmf, pfn_t *pfn) +{ + unsigned long pmd_addr = vmf->address & PMD_MASK; + struct device *dev = &dev_dax->dev; + phys_addr_t phys; + pgoff_t pgoff; + unsigned int fault_size = PMD_SIZE; + + if (check_vma(dev_dax, vmf->vma, __func__)) + return VM_FAULT_SIGBUS; + + if (dev_dax->align > PMD_SIZE) { + dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", + dev_dax->align, fault_size); + return VM_FAULT_SIGBUS; + } + + if (fault_size < dev_dax->align) + return VM_FAULT_SIGBUS; + else if (fault_size > dev_dax->align) + return VM_FAULT_FALLBACK; + + /* if we are outside of the VMA */ + if (pmd_addr < vmf->vma->vm_start || + (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + + pgoff = linear_page_index(vmf->vma, pmd_addr); + phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); + if (phys == -1) { + dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", pgoff); + return VM_FAULT_SIGBUS; + } + + *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + + return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, + struct vm_fault *vmf, pfn_t *pfn) +{ + unsigned long pud_addr = vmf->address & PUD_MASK; + struct device *dev = &dev_dax->dev; + phys_addr_t phys; + pgoff_t pgoff; + unsigned int fault_size = PUD_SIZE; + + + if (check_vma(dev_dax, vmf->vma, __func__)) + return VM_FAULT_SIGBUS; + + if (dev_dax->align > PUD_SIZE) { + dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", + dev_dax->align, fault_size); + return VM_FAULT_SIGBUS; + } + + if (fault_size < dev_dax->align) + return VM_FAULT_SIGBUS; + else if (fault_size > dev_dax->align) + return VM_FAULT_FALLBACK; + + /* if we are outside of the VMA */ + if (pud_addr < vmf->vma->vm_start || + (pud_addr + PUD_SIZE) > vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + + pgoff = linear_page_index(vmf->vma, pud_addr); + phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); + if (phys == -1) { + dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", pgoff); + return VM_FAULT_SIGBUS; + } + + *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + + return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); +} +#else +static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, + struct vm_fault *vmf, pfn_t *pfn) +{ + return VM_FAULT_FALLBACK; +} +#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + +static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + struct file *filp = vmf->vma->vm_file; + unsigned long fault_size; + vm_fault_t rc = VM_FAULT_SIGBUS; + int id; + pfn_t pfn; + struct dev_dax *dev_dax = filp->private_data; + + dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, + (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", + vmf->vma->vm_start, vmf->vma->vm_end, pe_size); + + id = dax_read_lock(); + switch (pe_size) { + case PE_SIZE_PTE: + fault_size = PAGE_SIZE; + rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn); + break; + case PE_SIZE_PMD: + fault_size = PMD_SIZE; + rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn); + break; + case PE_SIZE_PUD: + fault_size = PUD_SIZE; + rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn); + break; + default: + rc = VM_FAULT_SIGBUS; + } + + if (rc == VM_FAULT_NOPAGE) { + unsigned long i; + pgoff_t pgoff; + + /* + * In the device-dax case the only possibility for a + * VM_FAULT_NOPAGE result is when device-dax capacity is + * mapped. No need to consider the zero page, or racing + * conflicting mappings. + */ + pgoff = linear_page_index(vmf->vma, vmf->address + & ~(fault_size - 1)); + for (i = 0; i < fault_size / PAGE_SIZE; i++) { + struct page *page; + + page = pfn_to_page(pfn_t_to_pfn(pfn) + i); + if (page->mapping) + continue; + page->mapping = filp->f_mapping; + page->index = pgoff + i; + } + } + dax_read_unlock(id); + + return rc; +} + +static vm_fault_t dev_dax_fault(struct vm_fault *vmf) +{ + return dev_dax_huge_fault(vmf, PE_SIZE_PTE); +} + +static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr) +{ + struct file *filp = vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; + + if (!IS_ALIGNED(addr, dev_dax->align)) + return -EINVAL; + return 0; +} + +static unsigned long dev_dax_pagesize(struct vm_area_struct *vma) +{ + struct file *filp = vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; + + return dev_dax->align; +} + +static const struct vm_operations_struct dax_vm_ops = { + .fault = dev_dax_fault, + .huge_fault = dev_dax_huge_fault, + .split = dev_dax_split, + .pagesize = dev_dax_pagesize, +}; + +static int dax_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct dev_dax *dev_dax = filp->private_data; + int rc, id; + + dev_dbg(&dev_dax->dev, "trace\n"); + + /* + * We lock to check dax_dev liveness and will re-check at + * fault time. + */ + id = dax_read_lock(); + rc = check_vma(dev_dax, vma, __func__); + dax_read_unlock(id); + if (rc) + return rc; + + vma->vm_ops = &dax_vm_ops; + vma->vm_flags |= VM_HUGEPAGE; + return 0; +} + +/* return an unmapped area aligned to the dax region specified alignment */ +static unsigned long dax_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long off, off_end, off_align, len_align, addr_align, align; + struct dev_dax *dev_dax = filp ? filp->private_data : NULL; + + if (!dev_dax || addr) + goto out; + + align = dev_dax->align; + off = pgoff << PAGE_SHIFT; + off_end = off + len; + off_align = round_up(off, align); + + if ((off_end <= off_align) || ((off_end - off_align) < align)) + goto out; + + len_align = len + align; + if ((off + len_align) < off) + goto out; + + addr_align = current->mm->get_unmapped_area(filp, addr, len_align, + pgoff, flags); + if (!IS_ERR_VALUE(addr_align)) { + addr_align += (off - addr_align) & (align - 1); + return addr_align; + } + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +static const struct address_space_operations dev_dax_aops = { + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; + +static int dax_open(struct inode *inode, struct file *filp) +{ + struct dax_device *dax_dev = inode_dax(inode); + struct inode *__dax_inode = dax_inode(dax_dev); + struct dev_dax *dev_dax = dax_get_private(dax_dev); + + dev_dbg(&dev_dax->dev, "trace\n"); + inode->i_mapping = __dax_inode->i_mapping; + inode->i_mapping->host = __dax_inode; + inode->i_mapping->a_ops = &dev_dax_aops; + filp->f_mapping = inode->i_mapping; + filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + filp->f_sb_err = file_sample_sb_err(filp); + filp->private_data = dev_dax; + inode->i_flags = S_DAX; + + return 0; +} + +static int dax_release(struct inode *inode, struct file *filp) +{ + struct dev_dax *dev_dax = filp->private_data; + + dev_dbg(&dev_dax->dev, "trace\n"); + return 0; +} + +static const struct file_operations dax_fops = { + .llseek = noop_llseek, + .owner = THIS_MODULE, + .open = dax_open, + .release = dax_release, + .get_unmapped_area = dax_get_unmapped_area, + .mmap = dax_mmap, + .mmap_supported_flags = MAP_SYNC, +}; + +static void dev_dax_cdev_del(void *cdev) +{ + cdev_del(cdev); +} + +static void dev_dax_kill(void *dev_dax) +{ + kill_dev_dax(dev_dax); +} + +int dev_dax_probe(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct device *dev = &dev_dax->dev; + struct dev_pagemap *pgmap; + struct inode *inode; + struct cdev *cdev; + void *addr; + int rc, i; + + pgmap = dev_dax->pgmap; + if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1, + "static pgmap / multi-range device conflict\n")) + return -EINVAL; + + if (!pgmap) { + pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range) + * (dev_dax->nr_range - 1), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + pgmap->nr_range = dev_dax->nr_range; + } + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + if (!devm_request_mem_region(dev, range->start, + range_len(range), dev_name(dev))) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", + i, range->start, range->end); + return -EBUSY; + } + /* don't update the range for static pgmap */ + if (!dev_dax->pgmap) + pgmap->ranges[i] = *range; + } + + pgmap->type = MEMORY_DEVICE_GENERIC; + addr = devm_memremap_pages(dev, pgmap); + if (IS_ERR(addr)) + return PTR_ERR(addr); + + inode = dax_inode(dax_dev); + cdev = inode->i_cdev; + cdev_init(cdev, &dax_fops); + if (dev->class) { + /* for the CONFIG_DEV_DAX_PMEM_COMPAT case */ + cdev->owner = dev->parent->driver->owner; + } else + cdev->owner = dev->driver->owner; + cdev_set_parent(cdev, &dev->kobj); + rc = cdev_add(cdev, dev->devt, 1); + if (rc) + return rc; + + rc = devm_add_action_or_reset(dev, dev_dax_cdev_del, cdev); + if (rc) + return rc; + + run_dax(dax_dev); + return devm_add_action_or_reset(dev, dev_dax_kill, dev_dax); +} +EXPORT_SYMBOL_GPL(dev_dax_probe); + +static int dev_dax_remove(struct dev_dax *dev_dax) +{ + /* all probe actions are unwound by devm */ + return 0; +} + +static struct dax_device_driver device_dax_driver = { + .probe = dev_dax_probe, + .remove = dev_dax_remove, + .match_always = 1, +}; + +static int __init dax_init(void) +{ + return dax_driver_register(&device_dax_driver); +} + +static void __exit dax_exit(void) +{ + dax_driver_unregister(&device_dax_driver); +} + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +module_init(dax_init); +module_exit(dax_exit); +MODULE_ALIAS_DAX_DEVICE(0); diff --git a/drivers/dax/hmem/Makefile b/drivers/dax/hmem/Makefile new file mode 100644 index 000000000..57377b4c3 --- /dev/null +++ b/drivers/dax/hmem/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o +obj-$(CONFIG_DEV_DAX_HMEM_DEVICES) += device_hmem.o + +device_hmem-y := device.o +dax_hmem-y := hmem.o diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c new file mode 100644 index 000000000..acf31cc1d --- /dev/null +++ b/drivers/dax/hmem/device.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +static bool nohmem; +module_param_named(disable, nohmem, bool, 0444); + +void hmem_register_device(int target_nid, struct resource *r) +{ + /* define a clean / non-busy resource for the platform device */ + struct resource res = { + .start = r->start, + .end = r->end, + .flags = IORESOURCE_MEM, + .desc = IORES_DESC_SOFT_RESERVED, + }; + struct platform_device *pdev; + struct memregion_info info; + int rc, id; + + if (nohmem) + return; + + rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + if (rc != REGION_INTERSECTS) + return; + + id = memregion_alloc(GFP_KERNEL); + if (id < 0) { + pr_err("memregion allocation failure for %pr\n", &res); + return; + } + + pdev = platform_device_alloc("hmem", id); + if (!pdev) { + pr_err("hmem device allocation failure for %pr\n", &res); + goto out_pdev; + } + + pdev->dev.numa_node = numa_map_to_online_node(target_nid); + info = (struct memregion_info) { + .target_node = target_nid, + }; + rc = platform_device_add_data(pdev, &info, sizeof(info)); + if (rc < 0) { + pr_err("hmem memregion_info allocation failure for %pr\n", &res); + goto out_pdev; + } + + rc = platform_device_add_resources(pdev, &res, 1); + if (rc < 0) { + pr_err("hmem resource allocation failure for %pr\n", &res); + goto out_resource; + } + + rc = platform_device_add(pdev); + if (rc < 0) { + dev_err(&pdev->dev, "device add failed for %pr\n", &res); + goto out_resource; + } + + return; + +out_resource: + put_device(&pdev->dev); +out_pdev: + memregion_free(id); +} + +static __init int hmem_register_one(struct resource *res, void *data) +{ + /* + * If the resource is not a top-level resource it was already + * assigned to a device by the HMAT parsing. + */ + if (res->parent != &iomem_resource) { + pr_info("HMEM: skip %pr, already claimed\n", res); + return 0; + } + + hmem_register_device(phys_to_target_node(res->start), res); + + return 0; +} + +static __init int hmem_init(void) +{ + walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED, + IORESOURCE_MEM, 0, -1, NULL, hmem_register_one); + return 0; +} + +/* + * As this is a fallback for address ranges unclaimed by the ACPI HMAT + * parsing it must be at an initcall level greater than hmat_init(). + */ +late_initcall(hmem_init); diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c new file mode 100644 index 000000000..1bf040dbc --- /dev/null +++ b/drivers/dax/hmem/hmem.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "../bus.h" + +static bool region_idle; +module_param_named(region_idle, region_idle, bool, 0644); + +static int dax_hmem_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct dax_region *dax_region; + struct memregion_info *mri; + struct dev_dax_data data; + struct dev_dax *dev_dax; + struct resource *res; + struct range range; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENOMEM; + + mri = dev->platform_data; + range.start = res->start; + range.end = res->end; + dax_region = alloc_dax_region(dev, pdev->id, &range, mri->target_node, + PMD_SIZE, 0); + if (!dax_region) + return -ENOMEM; + + data = (struct dev_dax_data) { + .dax_region = dax_region, + .id = -1, + .size = region_idle ? 0 : resource_size(res), + }; + dev_dax = devm_create_dev_dax(&data); + if (IS_ERR(dev_dax)) + return PTR_ERR(dev_dax); + + /* child dev_dax instances now own the lifetime of the dax_region */ + dax_region_put(dax_region); + return 0; +} + +static int dax_hmem_remove(struct platform_device *pdev) +{ + /* devm handles teardown */ + return 0; +} + +static struct platform_driver dax_hmem_driver = { + .probe = dax_hmem_probe, + .remove = dax_hmem_remove, + .driver = { + .name = "hmem", + }, +}; + +module_platform_driver(dax_hmem_driver); + +MODULE_ALIAS("platform:hmem*"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c new file mode 100644 index 000000000..27d669f8b --- /dev/null +++ b/drivers/dax/kmem.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dax-private.h" +#include "bus.h" + +/* Memory resource name used for add_memory_driver_managed(). */ +static const char *kmem_name; +/* Set if any memory will remain added when the driver will be unloaded. */ +static bool any_hotremove_failed; + +static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r) +{ + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + + /* memory-block align the hotplug range */ + r->start = ALIGN(range->start, memory_block_size_bytes()); + r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1; + if (r->start >= r->end) { + r->start = range->start; + r->end = range->end; + return -ENOSPC; + } + return 0; +} + +struct dax_kmem_data { + const char *res_name; + struct resource *res[]; +}; + +static int dev_dax_kmem_probe(struct dev_dax *dev_dax) +{ + struct device *dev = &dev_dax->dev; + struct dax_kmem_data *data; + int rc = -ENOMEM; + int i, mapped = 0; + int numa_node; + + /* + * Ensure good NUMA information for the persistent memory. + * Without this check, there is a risk that slow memory + * could be mixed in a node with faster memory, causing + * unavoidable performance issues. + */ + numa_node = dev_dax->target_node; + if (numa_node < 0) { + dev_warn(dev, "rejecting DAX region with invalid node: %d\n", + numa_node); + return -EINVAL; + } + + data = kzalloc(sizeof(*data) + sizeof(struct resource *) * dev_dax->nr_range, GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->res_name = kstrdup(dev_name(dev), GFP_KERNEL); + if (!data->res_name) + goto err_res_name; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct resource *res; + struct range range; + + rc = dax_kmem_range(dev_dax, i, &range); + if (rc) { + dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n", + i, range.start, range.end); + continue; + } + + /* Region is permanently reserved if hotremove fails. */ + res = request_mem_region(range.start, range_len(&range), data->res_name); + if (!res) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n", + i, range.start, range.end); + /* + * Once some memory has been onlined we can't + * assume that it can be un-onlined safely. + */ + if (mapped) + continue; + rc = -EBUSY; + goto err_request_mem; + } + data->res[i] = res; + + /* + * Set flags appropriate for System RAM. Leave ..._BUSY clear + * so that add_memory() can add a child resource. Do not + * inherit flags from the parent since it may set new flags + * unknown to us that will break add_memory() below. + */ + res->flags = IORESOURCE_SYSTEM_RAM; + + /* + * Ensure that future kexec'd kernels will not treat + * this as RAM automatically. + */ + rc = add_memory_driver_managed(numa_node, range.start, + range_len(&range), kmem_name, MHP_NONE); + + if (rc) { + dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", + i, range.start, range.end); + remove_resource(res); + kfree(res); + data->res[i] = NULL; + if (mapped) + continue; + goto err_request_mem; + } + mapped++; + } + + dev_set_drvdata(dev, data); + + return 0; + +err_request_mem: + kfree(data->res_name); +err_res_name: + kfree(data); + return rc; +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +static int dev_dax_kmem_remove(struct dev_dax *dev_dax) +{ + int i, success = 0; + struct device *dev = &dev_dax->dev; + struct dax_kmem_data *data = dev_get_drvdata(dev); + + /* + * We have one shot for removing memory, if some memory blocks were not + * offline prior to calling this function remove_memory() will fail, and + * there is no way to hotremove this memory until reboot because device + * unbind will succeed even if we return failure. + */ + for (i = 0; i < dev_dax->nr_range; i++) { + struct range range; + int rc; + + rc = dax_kmem_range(dev_dax, i, &range); + if (rc) + continue; + + rc = remove_memory(dev_dax->target_node, range.start, + range_len(&range)); + if (rc == 0) { + remove_resource(data->res[i]); + kfree(data->res[i]); + data->res[i] = NULL; + success++; + continue; + } + any_hotremove_failed = true; + dev_err(dev, + "mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n", + i, range.start, range.end); + } + + if (success >= dev_dax->nr_range) { + kfree(data->res_name); + kfree(data); + dev_set_drvdata(dev, NULL); + } + + return 0; +} +#else +static int dev_dax_kmem_remove(struct dev_dax *dev_dax) +{ + /* + * Without hotremove purposely leak the request_mem_region() for the + * device-dax range and return '0' to ->remove() attempts. The removal + * of the device from the driver always succeeds, but the region is + * permanently pinned as reserved by the unreleased + * request_mem_region(). + */ + any_hotremove_failed = true; + return 0; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +static struct dax_device_driver device_dax_kmem_driver = { + .probe = dev_dax_kmem_probe, + .remove = dev_dax_kmem_remove, +}; + +static int __init dax_kmem_init(void) +{ + int rc; + + /* Resource name is permanently allocated if any hotremove fails. */ + kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL); + if (!kmem_name) + return -ENOMEM; + + rc = dax_driver_register(&device_dax_kmem_driver); + if (rc) + kfree_const(kmem_name); + return rc; +} + +static void __exit dax_kmem_exit(void) +{ + dax_driver_unregister(&device_dax_kmem_driver); + if (!any_hotremove_failed) + kfree_const(kmem_name); +} + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +module_init(dax_kmem_init); +module_exit(dax_kmem_exit); +MODULE_ALIAS_DAX_DEVICE(0); diff --git a/drivers/dax/pmem/Makefile b/drivers/dax/pmem/Makefile new file mode 100644 index 000000000..010269f61 --- /dev/null +++ b/drivers/dax/pmem/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o +obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem_core.o +obj-$(CONFIG_DEV_DAX_PMEM_COMPAT) += dax_pmem_compat.o + +dax_pmem-y := pmem.o +dax_pmem_core-y := core.o +dax_pmem_compat-y := compat.o diff --git a/drivers/dax/pmem/compat.c b/drivers/dax/pmem/compat.c new file mode 100644 index 000000000..863c114fd --- /dev/null +++ b/drivers/dax/pmem/compat.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include +#include +#include "../bus.h" + +/* we need the private definitions to implement compat suport */ +#include "../dax-private.h" + +static int dax_pmem_compat_probe(struct device *dev) +{ + struct dev_dax *dev_dax = __dax_pmem_probe(dev, DEV_DAX_CLASS); + int rc; + + if (IS_ERR(dev_dax)) + return PTR_ERR(dev_dax); + + if (!devres_open_group(&dev_dax->dev, dev_dax, GFP_KERNEL)) + return -ENOMEM; + + device_lock(&dev_dax->dev); + rc = dev_dax_probe(dev_dax); + device_unlock(&dev_dax->dev); + + devres_close_group(&dev_dax->dev, dev_dax); + if (rc) + devres_release_group(&dev_dax->dev, dev_dax); + + return rc; +} + +static int dax_pmem_compat_release(struct device *dev, void *data) +{ + device_lock(dev); + devres_release_group(dev, to_dev_dax(dev)); + device_unlock(dev); + + return 0; +} + +static int dax_pmem_compat_remove(struct device *dev) +{ + device_for_each_child(dev, NULL, dax_pmem_compat_release); + return 0; +} + +static struct nd_device_driver dax_pmem_compat_driver = { + .probe = dax_pmem_compat_probe, + .remove = dax_pmem_compat_remove, + .drv = { + .name = "dax_pmem_compat", + }, + .type = ND_DRIVER_DAX_PMEM, +}; + +static int __init dax_pmem_compat_init(void) +{ + return nd_driver_register(&dax_pmem_compat_driver); +} +module_init(dax_pmem_compat_init); + +static void __exit dax_pmem_compat_exit(void) +{ + driver_unregister(&dax_pmem_compat_driver.drv); +} +module_exit(dax_pmem_compat_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c new file mode 100644 index 000000000..62b26bfce --- /dev/null +++ b/drivers/dax/pmem/core.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include "../../nvdimm/pfn.h" +#include "../../nvdimm/nd.h" +#include "../bus.h" + +struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) +{ + struct range range; + int rc, id, region_id; + resource_size_t offset; + struct nd_pfn_sb *pfn_sb; + struct dev_dax *dev_dax; + struct dev_dax_data data; + struct nd_namespace_io *nsio; + struct dax_region *dax_region; + struct dev_pagemap pgmap = { }; + struct nd_namespace_common *ndns; + struct nd_dax *nd_dax = to_nd_dax(dev); + struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; + struct nd_region *nd_region = to_nd_region(dev->parent); + + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return ERR_CAST(ndns); + + /* parse the 'pfn' info block via ->rw_bytes */ + rc = devm_namespace_enable(dev, ndns, nd_info_block_reserve()); + if (rc) + return ERR_PTR(rc); + rc = nvdimm_setup_pfn(nd_pfn, &pgmap); + if (rc) + return ERR_PTR(rc); + devm_namespace_disable(dev, ndns); + + /* reserve the metadata area, device-dax will reserve the data */ + pfn_sb = nd_pfn->pfn_sb; + offset = le64_to_cpu(pfn_sb->dataoff); + nsio = to_nd_namespace_io(&ndns->dev); + if (!devm_request_mem_region(dev, nsio->res.start, offset, + dev_name(&ndns->dev))) { + dev_warn(dev, "could not reserve metadata\n"); + return ERR_PTR(-EBUSY); + } + + rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", ®ion_id, &id); + if (rc != 2) + return ERR_PTR(-EINVAL); + + /* adjust the dax_region range to the start of data */ + range = pgmap.range; + range.start += offset, + dax_region = alloc_dax_region(dev, region_id, &range, + nd_region->target_node, le32_to_cpu(pfn_sb->align), + IORESOURCE_DAX_STATIC); + if (!dax_region) + return ERR_PTR(-ENOMEM); + + data = (struct dev_dax_data) { + .dax_region = dax_region, + .id = id, + .pgmap = &pgmap, + .subsys = subsys, + .size = range_len(&range), + }; + dev_dax = devm_create_dev_dax(&data); + + /* child dev_dax instances now own the lifetime of the dax_region */ + dax_region_put(dax_region); + + return dev_dax; +} +EXPORT_SYMBOL_GPL(__dax_pmem_probe); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/dax/pmem/pmem.c b/drivers/dax/pmem/pmem.c new file mode 100644 index 000000000..0ae4238a0 --- /dev/null +++ b/drivers/dax/pmem/pmem.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include +#include +#include "../bus.h" + +static int dax_pmem_probe(struct device *dev) +{ + return PTR_ERR_OR_ZERO(__dax_pmem_probe(dev, DEV_DAX_BUS)); +} + +static struct nd_device_driver dax_pmem_driver = { + .probe = dax_pmem_probe, + .drv = { + .name = "dax_pmem", + }, + .type = ND_DRIVER_DAX_PMEM, +}; + +static int __init dax_pmem_init(void) +{ + return nd_driver_register(&dax_pmem_driver); +} +module_init(dax_pmem_init); + +static void __exit dax_pmem_exit(void) +{ + driver_unregister(&dax_pmem_driver.drv); +} +module_exit(dax_pmem_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +#if !IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) +/* For compat builds, don't load this module by default */ +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM); +#endif diff --git a/drivers/dax/super.c b/drivers/dax/super.c new file mode 100644 index 000000000..260a247c6 --- /dev/null +++ b/drivers/dax/super.c @@ -0,0 +1,765 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dax-private.h" + +static dev_t dax_devt; +DEFINE_STATIC_SRCU(dax_srcu); +static struct vfsmount *dax_mnt; +static DEFINE_IDA(dax_minor_ida); +static struct kmem_cache *dax_cache __read_mostly; +static struct super_block *dax_superblock __read_mostly; + +#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) +static struct hlist_head dax_host_list[DAX_HASH_SIZE]; +static DEFINE_SPINLOCK(dax_host_lock); + +int dax_read_lock(void) +{ + return srcu_read_lock(&dax_srcu); +} +EXPORT_SYMBOL_GPL(dax_read_lock); + +void dax_read_unlock(int id) +{ + srcu_read_unlock(&dax_srcu, id); +} +EXPORT_SYMBOL_GPL(dax_read_unlock); + +#ifdef CONFIG_BLOCK +#include + +int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, + pgoff_t *pgoff) +{ + sector_t start_sect = bdev ? get_start_sect(bdev) : 0; + phys_addr_t phys_off = (start_sect + sector) * 512; + + if (pgoff) + *pgoff = PHYS_PFN(phys_off); + if (phys_off % PAGE_SIZE || size % PAGE_SIZE) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL(bdev_dax_pgoff); + +#if IS_ENABLED(CONFIG_FS_DAX) +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) +{ + if (!blk_queue_dax(bdev->bd_disk->queue)) + return NULL; + return dax_get_by_host(bdev->bd_disk->disk_name); +} +EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); +#endif + +bool __generic_fsdax_supported(struct dax_device *dax_dev, + struct block_device *bdev, int blocksize, sector_t start, + sector_t sectors) +{ + bool dax_enabled = false; + pgoff_t pgoff, pgoff_end; + char buf[BDEVNAME_SIZE]; + void *kaddr, *end_kaddr; + pfn_t pfn, end_pfn; + sector_t last_page; + long len, len2; + int err, id; + + if (blocksize != PAGE_SIZE) { + pr_info("%s: error: unsupported blocksize for dax\n", + bdevname(bdev, buf)); + return false; + } + + if (!dax_dev) { + pr_debug("%s: error: dax unsupported by block device\n", + bdevname(bdev, buf)); + return false; + } + + err = bdev_dax_pgoff(bdev, start, PAGE_SIZE, &pgoff); + if (err) { + pr_info("%s: error: unaligned partition for dax\n", + bdevname(bdev, buf)); + return false; + } + + last_page = PFN_DOWN((start + sectors - 1) * 512) * PAGE_SIZE / 512; + err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end); + if (err) { + pr_info("%s: error: unaligned partition for dax\n", + bdevname(bdev, buf)); + return false; + } + + id = dax_read_lock(); + len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); + len2 = dax_direct_access(dax_dev, pgoff_end, 1, &end_kaddr, &end_pfn); + + if (len < 1 || len2 < 1) { + pr_info("%s: error: dax access failed (%ld)\n", + bdevname(bdev, buf), len < 1 ? len : len2); + dax_read_unlock(id); + return false; + } + + if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { + /* + * An arch that has enabled the pmem api should also + * have its drivers support pfn_t_devmap() + * + * This is a developer warning and should not trigger in + * production. dax_flush() will crash since it depends + * on being able to do (page_address(pfn_to_page())). + */ + WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); + dax_enabled = true; + } else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) { + struct dev_pagemap *pgmap, *end_pgmap; + + pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL); + end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL); + if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX + && pfn_t_to_page(pfn)->pgmap == pgmap + && pfn_t_to_page(end_pfn)->pgmap == pgmap + && pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr)) + && pfn_t_to_pfn(end_pfn) == PHYS_PFN(__pa(end_kaddr))) + dax_enabled = true; + put_dev_pagemap(pgmap); + put_dev_pagemap(end_pgmap); + + } + dax_read_unlock(id); + + if (!dax_enabled) { + pr_info("%s: error: dax support not enabled\n", + bdevname(bdev, buf)); + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(__generic_fsdax_supported); + +/** + * __bdev_dax_supported() - Check if the device supports dax for filesystem + * @bdev: block device to check + * @blocksize: The block size of the device + * + * This is a library function for filesystems to check if the block device + * can be mounted with dax option. + * + * Return: true if supported, false if unsupported + */ +bool __bdev_dax_supported(struct block_device *bdev, int blocksize) +{ + struct dax_device *dax_dev; + struct request_queue *q; + char buf[BDEVNAME_SIZE]; + bool ret; + int id; + + q = bdev_get_queue(bdev); + if (!q || !blk_queue_dax(q)) { + pr_debug("%s: error: request queue doesn't support dax\n", + bdevname(bdev, buf)); + return false; + } + + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev) { + pr_debug("%s: error: device does not support dax\n", + bdevname(bdev, buf)); + return false; + } + + id = dax_read_lock(); + ret = dax_supported(dax_dev, bdev, blocksize, 0, + i_size_read(bdev->bd_inode) / 512); + dax_read_unlock(id); + + put_dax(dax_dev); + + return ret; +} +EXPORT_SYMBOL_GPL(__bdev_dax_supported); +#endif + +enum dax_device_flags { + /* !alive + rcu grace period == no new operations / mappings */ + DAXDEV_ALIVE, + /* gate whether dax_flush() calls the low level flush routine */ + DAXDEV_WRITE_CACHE, + /* flag to check if device supports synchronous flush */ + DAXDEV_SYNC, +}; + +/** + * struct dax_device - anchor object for dax services + * @inode: core vfs + * @cdev: optional character interface for "device dax" + * @host: optional name for lookups where the device path is not available + * @private: dax driver private data + * @flags: state and boolean properties + */ +struct dax_device { + struct hlist_node list; + struct inode inode; + struct cdev cdev; + const char *host; + void *private; + unsigned long flags; + const struct dax_operations *ops; +}; + +static ssize_t write_cache_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); + ssize_t rc; + + WARN_ON_ONCE(!dax_dev); + if (!dax_dev) + return -ENXIO; + + rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev)); + put_dax(dax_dev); + return rc; +} + +static ssize_t write_cache_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool write_cache; + int rc = strtobool(buf, &write_cache); + struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); + + WARN_ON_ONCE(!dax_dev); + if (!dax_dev) + return -ENXIO; + + if (rc) + len = rc; + else + dax_write_cache(dax_dev, write_cache); + + put_dax(dax_dev); + return len; +} +static DEVICE_ATTR_RW(write_cache); + +static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); + + WARN_ON_ONCE(!dax_dev); + if (!dax_dev) + return 0; + +#ifndef CONFIG_ARCH_HAS_PMEM_API + if (a == &dev_attr_write_cache.attr) + return 0; +#endif + return a->mode; +} + +static struct attribute *dax_attributes[] = { + &dev_attr_write_cache.attr, + NULL, +}; + +struct attribute_group dax_attribute_group = { + .name = "dax", + .attrs = dax_attributes, + .is_visible = dax_visible, +}; +EXPORT_SYMBOL_GPL(dax_attribute_group); + +/** + * dax_direct_access() - translate a device pgoff to an absolute pfn + * @dax_dev: a dax_device instance representing the logical memory range + * @pgoff: offset in pages from the start of the device to translate + * @nr_pages: number of consecutive pages caller can handle relative to @pfn + * @kaddr: output parameter that returns a virtual address mapping of pfn + * @pfn: output parameter that returns an absolute pfn translation of @pgoff + * + * Return: negative errno if an error occurs, otherwise the number of + * pages accessible at the device relative @pgoff. + */ +long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, + void **kaddr, pfn_t *pfn) +{ + long avail; + + if (!dax_dev) + return -EOPNOTSUPP; + + if (!dax_alive(dax_dev)) + return -ENXIO; + + if (nr_pages < 0) + return nr_pages; + + avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, + kaddr, pfn); + if (!avail) + return -ERANGE; + return min(avail, nr_pages); +} +EXPORT_SYMBOL_GPL(dax_direct_access); + +bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, + int blocksize, sector_t start, sector_t len) +{ + if (!dax_dev) + return false; + + if (!dax_alive(dax_dev)) + return false; + + return dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, start, len); +} +EXPORT_SYMBOL_GPL(dax_supported); + +size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + if (!dax_alive(dax_dev)) + return 0; + + return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i); +} +EXPORT_SYMBOL_GPL(dax_copy_from_iter); + +size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + if (!dax_alive(dax_dev)) + return 0; + + return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i); +} +EXPORT_SYMBOL_GPL(dax_copy_to_iter); + +int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + size_t nr_pages) +{ + if (!dax_alive(dax_dev)) + return -ENXIO; + /* + * There are no callers that want to zero more than one page as of now. + * Once users are there, this check can be removed after the + * device mapper code has been updated to split ranges across targets. + */ + if (nr_pages != 1) + return -EIO; + + return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages); +} +EXPORT_SYMBOL_GPL(dax_zero_page_range); + +#ifdef CONFIG_ARCH_HAS_PMEM_API +void arch_wb_cache_pmem(void *addr, size_t size); +void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) +{ + if (unlikely(!dax_write_cache_enabled(dax_dev))) + return; + + arch_wb_cache_pmem(addr, size); +} +#else +void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) +{ +} +#endif +EXPORT_SYMBOL_GPL(dax_flush); + +void dax_write_cache(struct dax_device *dax_dev, bool wc) +{ + if (wc) + set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); + else + clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(dax_write_cache); + +bool dax_write_cache_enabled(struct dax_device *dax_dev) +{ + return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(dax_write_cache_enabled); + +bool __dax_synchronous(struct dax_device *dax_dev) +{ + return test_bit(DAXDEV_SYNC, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(__dax_synchronous); + +void __set_dax_synchronous(struct dax_device *dax_dev) +{ + set_bit(DAXDEV_SYNC, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(__set_dax_synchronous); + +bool dax_alive(struct dax_device *dax_dev) +{ + lockdep_assert_held(&dax_srcu); + return test_bit(DAXDEV_ALIVE, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(dax_alive); + +static int dax_host_hash(const char *host) +{ + return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; +} + +/* + * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring + * that any fault handlers or operations that might have seen + * dax_alive(), have completed. Any operations that start after + * synchronize_srcu() has run will abort upon seeing !dax_alive(). + */ +void kill_dax(struct dax_device *dax_dev) +{ + if (!dax_dev) + return; + + clear_bit(DAXDEV_ALIVE, &dax_dev->flags); + + synchronize_srcu(&dax_srcu); + + spin_lock(&dax_host_lock); + hlist_del_init(&dax_dev->list); + spin_unlock(&dax_host_lock); +} +EXPORT_SYMBOL_GPL(kill_dax); + +void run_dax(struct dax_device *dax_dev) +{ + set_bit(DAXDEV_ALIVE, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(run_dax); + +static struct inode *dax_alloc_inode(struct super_block *sb) +{ + struct dax_device *dax_dev; + struct inode *inode; + + dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); + if (!dax_dev) + return NULL; + + inode = &dax_dev->inode; + inode->i_rdev = 0; + return inode; +} + +static struct dax_device *to_dax_dev(struct inode *inode) +{ + return container_of(inode, struct dax_device, inode); +} + +static void dax_free_inode(struct inode *inode) +{ + struct dax_device *dax_dev = to_dax_dev(inode); + kfree(dax_dev->host); + dax_dev->host = NULL; + if (inode->i_rdev) + ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); + kmem_cache_free(dax_cache, dax_dev); +} + +static void dax_destroy_inode(struct inode *inode) +{ + struct dax_device *dax_dev = to_dax_dev(inode); + WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags), + "kill_dax() must be called before final iput()\n"); +} + +static const struct super_operations dax_sops = { + .statfs = simple_statfs, + .alloc_inode = dax_alloc_inode, + .destroy_inode = dax_destroy_inode, + .free_inode = dax_free_inode, + .drop_inode = generic_delete_inode, +}; + +static int dax_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx = init_pseudo(fc, DAXFS_MAGIC); + if (!ctx) + return -ENOMEM; + ctx->ops = &dax_sops; + return 0; +} + +static struct file_system_type dax_fs_type = { + .name = "dax", + .init_fs_context = dax_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int dax_test(struct inode *inode, void *data) +{ + dev_t devt = *(dev_t *) data; + + return inode->i_rdev == devt; +} + +static int dax_set(struct inode *inode, void *data) +{ + dev_t devt = *(dev_t *) data; + + inode->i_rdev = devt; + return 0; +} + +static struct dax_device *dax_dev_get(dev_t devt) +{ + struct dax_device *dax_dev; + struct inode *inode; + + inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), + dax_test, dax_set, &devt); + + if (!inode) + return NULL; + + dax_dev = to_dax_dev(inode); + if (inode->i_state & I_NEW) { + set_bit(DAXDEV_ALIVE, &dax_dev->flags); + inode->i_cdev = &dax_dev->cdev; + inode->i_mode = S_IFCHR; + inode->i_flags = S_DAX; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + unlock_new_inode(inode); + } + + return dax_dev; +} + +static void dax_add_host(struct dax_device *dax_dev, const char *host) +{ + int hash; + + /* + * Unconditionally init dax_dev since it's coming from a + * non-zeroed slab cache + */ + INIT_HLIST_NODE(&dax_dev->list); + dax_dev->host = host; + if (!host) + return; + + hash = dax_host_hash(host); + spin_lock(&dax_host_lock); + hlist_add_head(&dax_dev->list, &dax_host_list[hash]); + spin_unlock(&dax_host_lock); +} + +struct dax_device *alloc_dax(void *private, const char *__host, + const struct dax_operations *ops, unsigned long flags) +{ + struct dax_device *dax_dev; + const char *host; + dev_t devt; + int minor; + + if (ops && !ops->zero_page_range) { + pr_debug("%s: error: device does not provide dax" + " operation zero_page_range()\n", + __host ? __host : "Unknown"); + return ERR_PTR(-EINVAL); + } + + host = kstrdup(__host, GFP_KERNEL); + if (__host && !host) + return ERR_PTR(-ENOMEM); + + minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL); + if (minor < 0) + goto err_minor; + + devt = MKDEV(MAJOR(dax_devt), minor); + dax_dev = dax_dev_get(devt); + if (!dax_dev) + goto err_dev; + + dax_add_host(dax_dev, host); + dax_dev->ops = ops; + dax_dev->private = private; + if (flags & DAXDEV_F_SYNC) + set_dax_synchronous(dax_dev); + + return dax_dev; + + err_dev: + ida_simple_remove(&dax_minor_ida, minor); + err_minor: + kfree(host); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(alloc_dax); + +void put_dax(struct dax_device *dax_dev) +{ + if (!dax_dev) + return; + iput(&dax_dev->inode); +} +EXPORT_SYMBOL_GPL(put_dax); + +/** + * dax_get_by_host() - temporary lookup mechanism for filesystem-dax + * @host: alternate name for the device registered by a dax driver + */ +struct dax_device *dax_get_by_host(const char *host) +{ + struct dax_device *dax_dev, *found = NULL; + int hash, id; + + if (!host) + return NULL; + + hash = dax_host_hash(host); + + id = dax_read_lock(); + spin_lock(&dax_host_lock); + hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { + if (!dax_alive(dax_dev) + || strcmp(host, dax_dev->host) != 0) + continue; + + if (igrab(&dax_dev->inode)) + found = dax_dev; + break; + } + spin_unlock(&dax_host_lock); + dax_read_unlock(id); + + return found; +} +EXPORT_SYMBOL_GPL(dax_get_by_host); + +/** + * inode_dax: convert a public inode into its dax_dev + * @inode: An inode with i_cdev pointing to a dax_dev + * + * Note this is not equivalent to to_dax_dev() which is for private + * internal use where we know the inode filesystem type == dax_fs_type. + */ +struct dax_device *inode_dax(struct inode *inode) +{ + struct cdev *cdev = inode->i_cdev; + + return container_of(cdev, struct dax_device, cdev); +} +EXPORT_SYMBOL_GPL(inode_dax); + +struct inode *dax_inode(struct dax_device *dax_dev) +{ + return &dax_dev->inode; +} +EXPORT_SYMBOL_GPL(dax_inode); + +void *dax_get_private(struct dax_device *dax_dev) +{ + if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags)) + return NULL; + return dax_dev->private; +} +EXPORT_SYMBOL_GPL(dax_get_private); + +static void init_once(void *_dax_dev) +{ + struct dax_device *dax_dev = _dax_dev; + struct inode *inode = &dax_dev->inode; + + memset(dax_dev, 0, sizeof(*dax_dev)); + inode_init_once(inode); +} + +static int dax_fs_init(void) +{ + int rc; + + dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + init_once); + if (!dax_cache) + return -ENOMEM; + + dax_mnt = kern_mount(&dax_fs_type); + if (IS_ERR(dax_mnt)) { + rc = PTR_ERR(dax_mnt); + goto err_mount; + } + dax_superblock = dax_mnt->mnt_sb; + + return 0; + + err_mount: + kmem_cache_destroy(dax_cache); + + return rc; +} + +static void dax_fs_exit(void) +{ + kern_unmount(dax_mnt); + rcu_barrier(); + kmem_cache_destroy(dax_cache); +} + +static int __init dax_core_init(void) +{ + int rc; + + rc = dax_fs_init(); + if (rc) + return rc; + + rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax"); + if (rc) + goto err_chrdev; + + rc = dax_bus_init(); + if (rc) + goto err_bus; + return 0; + +err_bus: + unregister_chrdev_region(dax_devt, MINORMASK+1); +err_chrdev: + dax_fs_exit(); + return 0; +} + +static void __exit dax_core_exit(void) +{ + dax_bus_exit(); + unregister_chrdev_region(dax_devt, MINORMASK+1); + ida_destroy(&dax_minor_ida); + dax_fs_exit(); +} + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +subsys_initcall(dax_core_init); +module_exit(dax_core_exit); -- cgit v1.2.3