author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-07 18:49:45 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-07 18:49:45 +0000
commit     2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree       848558de17fb3008cdf4d861b01ac7781903ce39 /drivers/cxl
parent     Initial commit. (diff)
Adding upstream version 6.1.76. (upstream/6.1.76)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat
-rw-r--r-- | drivers/cxl/Kconfig | 114
-rw-r--r-- | drivers/cxl/Makefile | 13
-rw-r--r-- | drivers/cxl/acpi.c | 539
-rw-r--r-- | drivers/cxl/core/Makefile | 13
-rw-r--r-- | drivers/cxl/core/core.h | 63
-rw-r--r-- | drivers/cxl/core/hdm.c | 865
-rw-r--r-- | drivers/cxl/core/mbox.c | 862
-rw-r--r-- | drivers/cxl/core/memdev.c | 386
-rw-r--r-- | drivers/cxl/core/pci.c | 702
-rw-r--r-- | drivers/cxl/core/pmem.c | 285
-rw-r--r-- | drivers/cxl/core/port.c | 1894
-rw-r--r-- | drivers/cxl/core/region.c | 2006
-rw-r--r-- | drivers/cxl/core/regs.c | 304
-rw-r--r-- | drivers/cxl/core/suspend.c | 24
-rw-r--r-- | drivers/cxl/cxl.h | 665
-rw-r--r-- | drivers/cxl/cxlmem.h | 419
-rw-r--r-- | drivers/cxl/cxlpci.h | 94
-rw-r--r-- | drivers/cxl/mem.c | 168
-rw-r--r-- | drivers/cxl/pci.c | 525
-rw-r--r-- | drivers/cxl/pmem.c | 662
-rw-r--r-- | drivers/cxl/port.c | 143
21 files changed, 10746 insertions, 0 deletions
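Before the diff itself, one point of orientation: drivers/cxl/core/mbox.c below backs the CXL_MEM_QUERY_COMMANDS and CXL_MEM_SEND_COMMAND ioctls exposed on each memdev character device. The following is an editorial sketch, not part of the commit, showing how userspace might exercise the query path; it assumes the UAPI definitions from include/uapi/linux/cxl_mem.h elsewhere in the tree and an already-enumerated /dev/cxl/mem0.

/* Editorial illustration (not part of the commit): ask the kernel how many
 * mailbox commands it exposes for a CXL memory device. Passing
 * n_commands == 0 makes cxl_query_cmd() return only the total count. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/cxl_mem.h>

int main(void)
{
	struct cxl_mem_query_commands query = { .n_commands = 0 };
	int fd = open("/dev/cxl/mem0", O_RDWR);

	if (fd < 0) {
		perror("open /dev/cxl/mem0");
		return 1;
	}

	if (ioctl(fd, CXL_MEM_QUERY_COMMANDS, &query) < 0) {
		perror("CXL_MEM_QUERY_COMMANDS");
		close(fd);
		return 1;
	}

	printf("%u mailbox commands supported\n", query.n_commands);
	close(fd);
	return 0;
}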
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig new file mode 100644 index 000000000..768ced3d6 --- /dev/null +++ b/drivers/cxl/Kconfig @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: GPL-2.0-only +menuconfig CXL_BUS + tristate "CXL (Compute Express Link) Devices Support" + depends on PCI + select PCI_DOE + help + CXL is a bus that is electrically compatible with PCI Express, but + layers three protocols on that signalling (CXL.io, CXL.cache, and + CXL.mem). The CXL.cache protocol allows devices to hold cachelines + locally, the CXL.mem protocol allows devices to be fully coherent + memory targets, the CXL.io protocol is equivalent to PCI Express. + Say 'y' to enable support for the configuration and management of + devices supporting these protocols. + +if CXL_BUS + +config CXL_PCI + tristate "PCI manageability" + default CXL_BUS + help + The CXL specification defines a "CXL memory device" sub-class in the + PCI "memory controller" base class of devices. Device's identified by + this class code provide support for volatile and / or persistent + memory to be mapped into the system address map (Host-managed Device + Memory (HDM)). + + Say 'y/m' to enable a driver that will attach to CXL memory expander + devices enumerated by the memory device class code for configuration + and management primarily via the mailbox interface. See Chapter 2.3 + Type 3 CXL Device in the CXL 2.0 specification for more details. + + If unsure say 'm'. + +config CXL_MEM_RAW_COMMANDS + bool "RAW Command Interface for Memory Devices" + depends on CXL_PCI + help + Enable CXL RAW command interface. + + The CXL driver ioctl interface may assign a kernel ioctl command + number for each specification defined opcode. At any given point in + time the number of opcodes that the specification defines and a device + may implement may exceed the kernel's set of associated ioctl function + numbers. The mismatch is either by omission, specification is too new, + or by design. When prototyping new hardware, or developing / debugging + the driver it is useful to be able to submit any possible command to + the hardware, even commands that may crash the kernel due to their + potential impact to memory currently in use by the kernel. + + If developing CXL hardware or the driver say Y, otherwise say N. + +config CXL_ACPI + tristate "CXL ACPI: Platform Support" + depends on ACPI + default CXL_BUS + select ACPI_TABLE_LIB + help + Enable support for host managed device memory (HDM) resources + published by a platform's ACPI CXL memory layout description. See + Chapter 9.14.1 CXL Early Discovery Table (CEDT) in the CXL 2.0 + specification, and CXL Fixed Memory Window Structures (CEDT.CFMWS) + (https://www.computeexpresslink.org/spec-landing). The CXL core + consumes these resource to publish the root of a cxl_port decode + hierarchy to map regions that represent System RAM, or Persistent + Memory regions to be managed by LIBNVDIMM. + + If unsure say 'm'. + +config CXL_PMEM + tristate "CXL PMEM: Persistent Memory Support" + depends on LIBNVDIMM + default CXL_BUS + help + In addition to typical memory resources a platform may also advertise + support for persistent memory attached via CXL. This support is + managed via a bridge driver from CXL to the LIBNVDIMM system + subsystem. Say 'y/m' to enable support for enumerating and + provisioning the persistent memory capacity of CXL memory expanders. + + If unsure say 'm'. 
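# Editorial illustration, not part of this Kconfig: with the options in this
# file left at their defaults on a PCI/ACPI system that already enables
# LIBNVDIMM and SPARSEMEM, the resulting .config fragment would plausibly
# look like the following (CXL_BUS=m pulls the per-driver modules in, and the
# bool symbols promote the 'm' default to 'y').
CONFIG_CXL_BUS=m
CONFIG_CXL_PCI=m
# CONFIG_CXL_MEM_RAW_COMMANDS is not set
CONFIG_CXL_ACPI=m
CONFIG_CXL_PMEM=m
CONFIG_CXL_MEM=m
CONFIG_CXL_PORT=m
CONFIG_CXL_SUSPEND=y
CONFIG_CXL_REGION=y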
+ +config CXL_MEM + tristate "CXL: Memory Expansion" + depends on CXL_PCI + default CXL_BUS + help + The CXL.mem protocol allows a device to act as a provider of "System + RAM" and/or "Persistent Memory" that is fully coherent as if the + memory were attached to the typical CPU memory controller. This is + known as HDM "Host-managed Device Memory". + + Say 'y/m' to enable a driver that will attach to CXL.mem devices for + memory expansion and control of HDM. See Chapter 9.13 in the CXL 2.0 + specification for a detailed description of HDM. + + If unsure say 'm'. + +config CXL_PORT + default CXL_BUS + tristate + +config CXL_SUSPEND + def_bool y + depends on SUSPEND && CXL_MEM + +config CXL_REGION + bool + default CXL_BUS + # For MAX_PHYSMEM_BITS + depends on SPARSEMEM + select MEMREGION + select GET_FREE_REGION + +endif diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile new file mode 100644 index 000000000..a78270794 --- /dev/null +++ b/drivers/cxl/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += core/ +obj-$(CONFIG_CXL_PCI) += cxl_pci.o +obj-$(CONFIG_CXL_MEM) += cxl_mem.o +obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o +obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o +obj-$(CONFIG_CXL_PORT) += cxl_port.o + +cxl_mem-y := mem.o +cxl_pci-y := pci.o +cxl_acpi-y := acpi.o +cxl_pmem-y := pmem.o +cxl_port-y := port.o diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c new file mode 100644 index 000000000..dd610556a --- /dev/null +++ b/drivers/cxl/acpi.c @@ -0,0 +1,539 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2021 Intel Corporation. All rights reserved. */ +#include <linux/platform_device.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/pci.h> +#include "cxlpci.h" +#include "cxl.h" + +static unsigned long cfmws_to_decoder_flags(int restrictions) +{ + unsigned long flags = CXL_DECODER_F_ENABLE; + + if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE2) + flags |= CXL_DECODER_F_TYPE2; + if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE3) + flags |= CXL_DECODER_F_TYPE3; + if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_VOLATILE) + flags |= CXL_DECODER_F_RAM; + if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_PMEM) + flags |= CXL_DECODER_F_PMEM; + if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_FIXED) + flags |= CXL_DECODER_F_LOCK; + + return flags; +} + +static int cxl_acpi_cfmws_verify(struct device *dev, + struct acpi_cedt_cfmws *cfmws) +{ + int rc, expected_len; + unsigned int ways; + + if (cfmws->interleave_arithmetic != ACPI_CEDT_CFMWS_ARITHMETIC_MODULO) { + dev_err(dev, "CFMWS Unsupported Interleave Arithmetic\n"); + return -EINVAL; + } + + if (!IS_ALIGNED(cfmws->base_hpa, SZ_256M)) { + dev_err(dev, "CFMWS Base HPA not 256MB aligned\n"); + return -EINVAL; + } + + if (!IS_ALIGNED(cfmws->window_size, SZ_256M)) { + dev_err(dev, "CFMWS Window Size not 256MB aligned\n"); + return -EINVAL; + } + + rc = cxl_to_ways(cfmws->interleave_ways, &ways); + if (rc) { + dev_err(dev, "CFMWS Interleave Ways (%d) invalid\n", + cfmws->interleave_ways); + return -EINVAL; + } + + expected_len = struct_size(cfmws, interleave_targets, ways); + + if (cfmws->header.length < expected_len) { + dev_err(dev, "CFMWS length %d less than expected %d\n", + cfmws->header.length, expected_len); + return -EINVAL; + } + + if (cfmws->header.length > expected_len) + dev_dbg(dev, "CFMWS length %d greater than expected %d\n", + cfmws->header.length, expected_len); + + return 0; +} + +struct cxl_cfmws_context { + struct device *dev; + struct cxl_port 
*root_port; + struct resource *cxl_res; + int id; +}; + +static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, + const unsigned long end) +{ + int target_map[CXL_DECODER_MAX_INTERLEAVE]; + struct cxl_cfmws_context *ctx = arg; + struct cxl_port *root_port = ctx->root_port; + struct resource *cxl_res = ctx->cxl_res; + struct cxl_root_decoder *cxlrd; + struct device *dev = ctx->dev; + struct acpi_cedt_cfmws *cfmws; + struct cxl_decoder *cxld; + unsigned int ways, i, ig; + struct resource *res; + int rc; + + cfmws = (struct acpi_cedt_cfmws *) header; + + rc = cxl_acpi_cfmws_verify(dev, cfmws); + if (rc) { + dev_err(dev, "CFMWS range %#llx-%#llx not registered\n", + cfmws->base_hpa, + cfmws->base_hpa + cfmws->window_size - 1); + return 0; + } + + rc = cxl_to_ways(cfmws->interleave_ways, &ways); + if (rc) + return rc; + rc = cxl_to_granularity(cfmws->granularity, &ig); + if (rc) + return rc; + for (i = 0; i < ways; i++) + target_map[i] = cfmws->interleave_targets[i]; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + res->name = kasprintf(GFP_KERNEL, "CXL Window %d", ctx->id++); + if (!res->name) + goto err_name; + + res->start = cfmws->base_hpa; + res->end = cfmws->base_hpa + cfmws->window_size - 1; + res->flags = IORESOURCE_MEM; + + /* add to the local resource tracking to establish a sort order */ + rc = insert_resource(cxl_res, res); + if (rc) + goto err_insert; + + cxlrd = cxl_root_decoder_alloc(root_port, ways); + if (IS_ERR(cxlrd)) + return 0; + + cxld = &cxlrd->cxlsd.cxld; + cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions); + cxld->target_type = CXL_DECODER_EXPANDER; + cxld->hpa_range = (struct range) { + .start = res->start, + .end = res->end, + }; + cxld->interleave_ways = ways; + /* + * Minimize the x1 granularity to advertise support for any + * valid region granularity + */ + if (ways == 1) + ig = CXL_DECODER_MIN_GRANULARITY; + cxld->interleave_granularity = ig; + + rc = cxl_decoder_add(cxld, target_map); + if (rc) + put_device(&cxld->dev); + else + rc = cxl_decoder_autoremove(dev, cxld); + if (rc) { + dev_err(dev, "Failed to add decode range: %pr", res); + return rc; + } + dev_dbg(dev, "add: %s node: %d range [%#llx - %#llx]\n", + dev_name(&cxld->dev), + phys_to_target_node(cxld->hpa_range.start), + cxld->hpa_range.start, cxld->hpa_range.end); + + return 0; + +err_insert: + kfree(res->name); +err_name: + kfree(res); + return -ENOMEM; +} + +__mock struct acpi_device *to_cxl_host_bridge(struct device *host, + struct device *dev) +{ + struct acpi_device *adev = to_acpi_device(dev); + + if (!acpi_pci_find_root(adev->handle)) + return NULL; + + if (strcmp(acpi_device_hid(adev), "ACPI0016") == 0) + return adev; + return NULL; +} + +/* + * A host bridge is a dport to a CFMWS decode and it is a uport to the + * dport (PCIe Root Ports) in the host bridge. 
+ */ +static int add_host_bridge_uport(struct device *match, void *arg) +{ + struct cxl_port *root_port = arg; + struct device *host = root_port->dev.parent; + struct acpi_device *bridge = to_cxl_host_bridge(host, match); + struct acpi_pci_root *pci_root; + struct cxl_dport *dport; + struct cxl_port *port; + int rc; + + if (!bridge) + return 0; + + dport = cxl_find_dport_by_dev(root_port, match); + if (!dport) { + dev_dbg(host, "host bridge expected and not found\n"); + return 0; + } + + /* + * Note that this lookup already succeeded in + * to_cxl_host_bridge(), so no need to check for failure here + */ + pci_root = acpi_pci_find_root(bridge->handle); + rc = devm_cxl_register_pci_bus(host, match, pci_root->bus); + if (rc) + return rc; + + port = devm_cxl_add_port(host, match, dport->component_reg_phys, dport); + if (IS_ERR(port)) + return PTR_ERR(port); + + return 0; +} + +struct cxl_chbs_context { + struct device *dev; + unsigned long long uid; + resource_size_t chbcr; +}; + +static int cxl_get_chbcr(union acpi_subtable_headers *header, void *arg, + const unsigned long end) +{ + struct cxl_chbs_context *ctx = arg; + struct acpi_cedt_chbs *chbs; + + if (ctx->chbcr) + return 0; + + chbs = (struct acpi_cedt_chbs *) header; + + if (ctx->uid != chbs->uid) + return 0; + ctx->chbcr = chbs->base; + + return 0; +} + +static int add_host_bridge_dport(struct device *match, void *arg) +{ + acpi_status status; + unsigned long long uid; + struct cxl_dport *dport; + struct cxl_chbs_context ctx; + struct cxl_port *root_port = arg; + struct device *host = root_port->dev.parent; + struct acpi_device *bridge = to_cxl_host_bridge(host, match); + + if (!bridge) + return 0; + + status = acpi_evaluate_integer(bridge->handle, METHOD_NAME__UID, NULL, + &uid); + if (status != AE_OK) { + dev_err(host, "unable to retrieve _UID of %s\n", + dev_name(match)); + return -ENODEV; + } + + ctx = (struct cxl_chbs_context) { + .dev = host, + .uid = uid, + }; + acpi_table_parse_cedt(ACPI_CEDT_TYPE_CHBS, cxl_get_chbcr, &ctx); + + if (ctx.chbcr == 0) { + dev_warn(host, "No CHBS found for Host Bridge: %s\n", + dev_name(match)); + return 0; + } + + dport = devm_cxl_add_dport(root_port, match, uid, ctx.chbcr); + if (IS_ERR(dport)) { + dev_err(host, "failed to add downstream port: %s\n", + dev_name(match)); + return PTR_ERR(dport); + } + dev_dbg(host, "add dport%llu: %s\n", uid, dev_name(match)); + return 0; +} + +static int add_root_nvdimm_bridge(struct device *match, void *data) +{ + struct cxl_decoder *cxld; + struct cxl_port *root_port = data; + struct cxl_nvdimm_bridge *cxl_nvb; + struct device *host = root_port->dev.parent; + + if (!is_root_decoder(match)) + return 0; + + cxld = to_cxl_decoder(match); + if (!(cxld->flags & CXL_DECODER_F_PMEM)) + return 0; + + cxl_nvb = devm_cxl_add_nvdimm_bridge(host, root_port); + if (IS_ERR(cxl_nvb)) { + dev_dbg(host, "failed to register pmem\n"); + return PTR_ERR(cxl_nvb); + } + dev_dbg(host, "%s: add: %s\n", dev_name(&root_port->dev), + dev_name(&cxl_nvb->dev)); + return 1; +} + +static struct lock_class_key cxl_root_key; + +static void cxl_acpi_lock_reset_class(void *dev) +{ + device_lock_reset_class(dev); +} + +static void del_cxl_resource(struct resource *res) +{ + kfree(res->name); + kfree(res); +} + +static void cxl_set_public_resource(struct resource *priv, struct resource *pub) +{ + priv->desc = (unsigned long) pub; +} + +static struct resource *cxl_get_public_resource(struct resource *priv) +{ + return (struct resource *) priv->desc; +} + +static void remove_cxl_resources(void 
*data) +{ + struct resource *res, *next, *cxl = data; + + for (res = cxl->child; res; res = next) { + struct resource *victim = cxl_get_public_resource(res); + + next = res->sibling; + remove_resource(res); + + if (victim) { + remove_resource(victim); + kfree(victim); + } + + del_cxl_resource(res); + } +} + +/** + * add_cxl_resources() - reflect CXL fixed memory windows in iomem_resource + * @cxl_res: A standalone resource tree where each CXL window is a sibling + * + * Walk each CXL window in @cxl_res and add it to iomem_resource potentially + * expanding its boundaries to ensure that any conflicting resources become + * children. If a window is expanded it may then conflict with a another window + * entry and require the window to be truncated or trimmed. Consider this + * situation: + * + * |-- "CXL Window 0" --||----- "CXL Window 1" -----| + * |--------------- "System RAM" -------------| + * + * ...where platform firmware has established as System RAM resource across 2 + * windows, but has left some portion of window 1 for dynamic CXL region + * provisioning. In this case "Window 0" will span the entirety of the "System + * RAM" span, and "CXL Window 1" is truncated to the remaining tail past the end + * of that "System RAM" resource. + */ +static int add_cxl_resources(struct resource *cxl_res) +{ + struct resource *res, *new, *next; + + for (res = cxl_res->child; res; res = next) { + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + new->name = res->name; + new->start = res->start; + new->end = res->end; + new->flags = IORESOURCE_MEM; + new->desc = IORES_DESC_CXL; + + /* + * Record the public resource in the private cxl_res tree for + * later removal. + */ + cxl_set_public_resource(res, new); + + insert_resource_expand_to_fit(&iomem_resource, new); + + next = res->sibling; + while (next && resource_overlaps(new, next)) { + if (resource_contains(new, next)) { + struct resource *_next = next->sibling; + + remove_resource(next); + del_cxl_resource(next); + next = _next; + } else + next->start = new->end + 1; + } + } + return 0; +} + +static int pair_cxl_resource(struct device *dev, void *data) +{ + struct resource *cxl_res = data; + struct resource *p; + + if (!is_root_decoder(dev)) + return 0; + + for (p = cxl_res->child; p; p = p->sibling) { + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct resource res = { + .start = cxld->hpa_range.start, + .end = cxld->hpa_range.end, + .flags = IORESOURCE_MEM, + }; + + if (resource_contains(p, &res)) { + cxlrd->res = cxl_get_public_resource(p); + break; + } + } + + return 0; +} + +static int cxl_acpi_probe(struct platform_device *pdev) +{ + int rc; + struct resource *cxl_res; + struct cxl_port *root_port; + struct device *host = &pdev->dev; + struct acpi_device *adev = ACPI_COMPANION(host); + struct cxl_cfmws_context ctx; + + device_lock_set_class(&pdev->dev, &cxl_root_key); + rc = devm_add_action_or_reset(&pdev->dev, cxl_acpi_lock_reset_class, + &pdev->dev); + if (rc) + return rc; + + cxl_res = devm_kzalloc(host, sizeof(*cxl_res), GFP_KERNEL); + if (!cxl_res) + return -ENOMEM; + cxl_res->name = "CXL mem"; + cxl_res->start = 0; + cxl_res->end = -1; + cxl_res->flags = IORESOURCE_MEM; + + root_port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL); + if (IS_ERR(root_port)) + return PTR_ERR(root_port); + + rc = bus_for_each_dev(adev->dev.bus, NULL, root_port, + add_host_bridge_dport); + if (rc < 0) + return rc; + + rc = devm_add_action_or_reset(host, 
remove_cxl_resources, cxl_res); + if (rc) + return rc; + + ctx = (struct cxl_cfmws_context) { + .dev = host, + .root_port = root_port, + .cxl_res = cxl_res, + }; + rc = acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx); + if (rc < 0) + return -ENXIO; + + rc = add_cxl_resources(cxl_res); + if (rc) + return rc; + + /* + * Populate the root decoders with their related iomem resource, + * if present + */ + device_for_each_child(&root_port->dev, cxl_res, pair_cxl_resource); + + /* + * Root level scanned with host-bridge as dports, now scan host-bridges + * for their role as CXL uports to their CXL-capable PCIe Root Ports. + */ + rc = bus_for_each_dev(adev->dev.bus, NULL, root_port, + add_host_bridge_uport); + if (rc < 0) + return rc; + + if (IS_ENABLED(CONFIG_CXL_PMEM)) + rc = device_for_each_child(&root_port->dev, root_port, + add_root_nvdimm_bridge); + if (rc < 0) + return rc; + + /* In case PCI is scanned before ACPI re-trigger memdev attach */ + return cxl_bus_rescan(); +} + +static const struct acpi_device_id cxl_acpi_ids[] = { + { "ACPI0017" }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, cxl_acpi_ids); + +static const struct platform_device_id cxl_test_ids[] = { + { "cxl_acpi" }, + { }, +}; +MODULE_DEVICE_TABLE(platform, cxl_test_ids); + +static struct platform_driver cxl_acpi_driver = { + .probe = cxl_acpi_probe, + .driver = { + .name = KBUILD_MODNAME, + .acpi_match_table = cxl_acpi_ids, + }, + .id_table = cxl_test_ids, +}; + +module_platform_driver(cxl_acpi_driver); +MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(CXL); +MODULE_IMPORT_NS(ACPI); diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile new file mode 100644 index 000000000..79c7257f4 --- /dev/null +++ b/drivers/cxl/core/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_CXL_BUS) += cxl_core.o +obj-$(CONFIG_CXL_SUSPEND) += suspend.o + +ccflags-y += -I$(srctree)/drivers/cxl +cxl_core-y := port.o +cxl_core-y += pmem.o +cxl_core-y += regs.o +cxl_core-y += memdev.o +cxl_core-y += mbox.o +cxl_core-y += pci.o +cxl_core-y += hdm.o +cxl_core-$(CONFIG_CXL_REGION) += region.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h new file mode 100644 index 000000000..cbee2340f --- /dev/null +++ b/drivers/cxl/core/core.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2020 Intel Corporation. 
*/ + +#ifndef __CXL_CORE_H__ +#define __CXL_CORE_H__ + +extern const struct device_type cxl_nvdimm_bridge_type; +extern const struct device_type cxl_nvdimm_type; + +extern struct attribute_group cxl_base_attribute_group; + +#ifdef CONFIG_CXL_REGION +extern struct device_attribute dev_attr_create_pmem_region; +extern struct device_attribute dev_attr_delete_region; +extern struct device_attribute dev_attr_region; +extern const struct device_type cxl_pmem_region_type; +extern const struct device_type cxl_region_type; +void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); +#define CXL_REGION_ATTR(x) (&dev_attr_##x.attr) +#define CXL_REGION_TYPE(x) (&cxl_region_type) +#define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr), +#define CXL_PMEM_REGION_TYPE(x) (&cxl_pmem_region_type) +int cxl_region_init(void); +void cxl_region_exit(void); +#else +static inline void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled) +{ +} +static inline int cxl_region_init(void) +{ + return 0; +} +static inline void cxl_region_exit(void) +{ +} +#define CXL_REGION_ATTR(x) NULL +#define CXL_REGION_TYPE(x) NULL +#define SET_CXL_REGION_ATTR(x) +#define CXL_PMEM_REGION_TYPE(x) NULL +#endif + +struct cxl_send_command; +struct cxl_mem_query_commands; +int cxl_query_cmd(struct cxl_memdev *cxlmd, + struct cxl_mem_query_commands __user *q); +int cxl_send_cmd(struct cxl_memdev *cxlmd, struct cxl_send_command __user *s); +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, + resource_size_t length); + +struct dentry *cxl_debugfs_create_dir(const char *dir); +int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, + enum cxl_decoder_mode mode); +int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size); +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); +resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled); +resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled); +extern struct rw_semaphore cxl_dpa_rwsem; + +int cxl_memdev_init(void); +void cxl_memdev_exit(void); +void cxl_mbox_init(void); + +#endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c new file mode 100644 index 000000000..8c1db4e1b --- /dev/null +++ b/drivers/cxl/core/hdm.c @@ -0,0 +1,865 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include <linux/io-64-nonatomic-hi-lo.h> +#include <linux/seq_file.h> +#include <linux/device.h> +#include <linux/delay.h> + +#include "cxlmem.h" +#include "core.h" + +/** + * DOC: cxl core hdm + * + * Compute Express Link Host Managed Device Memory, starting with the + * CXL 2.0 specification, is managed by an array of HDM Decoder register + * instances per CXL port and per CXL endpoint. Define common helpers + * for enumerating these registers and capabilities. + */ + +DECLARE_RWSEM(cxl_dpa_rwsem); + +static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, + int *target_map) +{ + int rc; + + rc = cxl_decoder_add_locked(cxld, target_map); + if (rc) { + put_device(&cxld->dev); + dev_err(&port->dev, "Failed to add decoder\n"); + return rc; + } + + rc = cxl_decoder_autoremove(&port->dev, cxld); + if (rc) + return rc; + + dev_dbg(&cxld->dev, "Added to port %s\n", dev_name(&port->dev)); + + return 0; +} + +/* + * Per the CXL specification (8.2.5.12 CXL HDM Decoder Capability Structure) + * single ported host-bridges need not publish a decoder capability when a + * passthrough decode can be assumed, i.e. 
all transactions that the uport sees + * are claimed and passed to the single dport. Disable the range until the first + * CXL region is enumerated / activated. + */ +int devm_cxl_add_passthrough_decoder(struct cxl_port *port) +{ + struct cxl_switch_decoder *cxlsd; + struct cxl_dport *dport = NULL; + int single_port_map[1]; + unsigned long index; + + cxlsd = cxl_switch_decoder_alloc(port, 1); + if (IS_ERR(cxlsd)) + return PTR_ERR(cxlsd); + + device_lock_assert(&port->dev); + + xa_for_each(&port->dports, index, dport) + break; + single_port_map[0] = dport->port_id; + + return add_hdm_decoder(port, &cxlsd->cxld, single_port_map); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_passthrough_decoder, CXL); + +static void parse_hdm_decoder_caps(struct cxl_hdm *cxlhdm) +{ + u32 hdm_cap; + + hdm_cap = readl(cxlhdm->regs.hdm_decoder + CXL_HDM_DECODER_CAP_OFFSET); + cxlhdm->decoder_count = cxl_hdm_decoder_count(hdm_cap); + cxlhdm->target_count = + FIELD_GET(CXL_HDM_DECODER_TARGET_COUNT_MASK, hdm_cap); + if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_11_8, hdm_cap)) + cxlhdm->interleave_mask |= GENMASK(11, 8); + if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_14_12, hdm_cap)) + cxlhdm->interleave_mask |= GENMASK(14, 12); +} + +static void __iomem *map_hdm_decoder_regs(struct cxl_port *port, + void __iomem *crb) +{ + struct cxl_component_reg_map map; + + cxl_probe_component_regs(&port->dev, crb, &map); + if (!map.hdm_decoder.valid) { + dev_err(&port->dev, "HDM decoder registers invalid\n"); + return IOMEM_ERR_PTR(-ENXIO); + } + + return crb + map.hdm_decoder.offset; +} + +/** + * devm_cxl_setup_hdm - map HDM decoder component registers + * @port: cxl_port to map + */ +struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port) +{ + struct device *dev = &port->dev; + void __iomem *crb, *hdm; + struct cxl_hdm *cxlhdm; + + cxlhdm = devm_kzalloc(dev, sizeof(*cxlhdm), GFP_KERNEL); + if (!cxlhdm) + return ERR_PTR(-ENOMEM); + + cxlhdm->port = port; + crb = devm_cxl_iomap_block(dev, port->component_reg_phys, + CXL_COMPONENT_REG_BLOCK_SIZE); + if (!crb) { + dev_err(dev, "No component registers mapped\n"); + return ERR_PTR(-ENXIO); + } + + hdm = map_hdm_decoder_regs(port, crb); + if (IS_ERR(hdm)) + return ERR_CAST(hdm); + cxlhdm->regs.hdm_decoder = hdm; + + parse_hdm_decoder_caps(cxlhdm); + if (cxlhdm->decoder_count == 0) { + dev_err(dev, "Spec violation. 
Caps invalid\n"); + return ERR_PTR(-ENXIO); + } + + dev_set_drvdata(dev, cxlhdm); + + return cxlhdm; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_hdm, CXL); + +static void __cxl_dpa_debug(struct seq_file *file, struct resource *r, int depth) +{ + unsigned long long start = r->start, end = r->end; + + seq_printf(file, "%*s%08llx-%08llx : %s\n", depth * 2, "", start, end, + r->name); +} + +void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds) +{ + struct resource *p1, *p2; + + down_read(&cxl_dpa_rwsem); + for (p1 = cxlds->dpa_res.child; p1; p1 = p1->sibling) { + __cxl_dpa_debug(file, p1, 0); + for (p2 = p1->child; p2; p2 = p2->sibling) + __cxl_dpa_debug(file, p2, 1); + } + up_read(&cxl_dpa_rwsem); +} +EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, CXL); + +/* + * Must be called in a context that synchronizes against this decoder's + * port ->remove() callback (like an endpoint decoder sysfs attribute) + */ +static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct resource *res = cxled->dpa_res; + resource_size_t skip_start; + + lockdep_assert_held_write(&cxl_dpa_rwsem); + + /* save @skip_start, before @res is released */ + skip_start = res->start - cxled->skip; + __release_region(&cxlds->dpa_res, res->start, resource_size(res)); + if (cxled->skip) + __release_region(&cxlds->dpa_res, skip_start, cxled->skip); + cxled->skip = 0; + cxled->dpa_res = NULL; + put_device(&cxled->cxld.dev); + port->hdm_end--; +} + +static void cxl_dpa_release(void *cxled) +{ + down_write(&cxl_dpa_rwsem); + __cxl_dpa_release(cxled); + up_write(&cxl_dpa_rwsem); +} + +/* + * Must be called from context that will not race port device + * unregistration, like decoder sysfs attribute methods + */ +static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = cxled_to_port(cxled); + + lockdep_assert_held_write(&cxl_dpa_rwsem); + devm_remove_action(&port->dev, cxl_dpa_release, cxled); + __cxl_dpa_release(cxled); +} + +static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, + resource_size_t base, resource_size_t len, + resource_size_t skipped) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &port->dev; + struct resource *res; + + lockdep_assert_held_write(&cxl_dpa_rwsem); + + if (!len) { + dev_warn(dev, "decoder%d.%d: empty reservation attempted\n", + port->id, cxled->cxld.id); + return -EINVAL; + } + + if (cxled->dpa_res) { + dev_dbg(dev, "decoder%d.%d: existing allocation %pr assigned\n", + port->id, cxled->cxld.id, cxled->dpa_res); + return -EBUSY; + } + + if (port->hdm_end + 1 != cxled->cxld.id) { + /* + * Assumes alloc and commit order is always in hardware instance + * order per expectations from 8.2.5.12.20 Committing Decoder + * Programming that enforce decoder[m] committed before + * decoder[m+1] commit start. 
+ */ + dev_dbg(dev, "decoder%d.%d: expected decoder%d.%d\n", port->id, + cxled->cxld.id, port->id, port->hdm_end + 1); + return -EBUSY; + } + + if (skipped) { + res = __request_region(&cxlds->dpa_res, base - skipped, skipped, + dev_name(&cxled->cxld.dev), 0); + if (!res) { + dev_dbg(dev, + "decoder%d.%d: failed to reserve skipped space\n", + port->id, cxled->cxld.id); + return -EBUSY; + } + } + res = __request_region(&cxlds->dpa_res, base, len, + dev_name(&cxled->cxld.dev), 0); + if (!res) { + dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n", + port->id, cxled->cxld.id); + if (skipped) + __release_region(&cxlds->dpa_res, base - skipped, + skipped); + return -EBUSY; + } + cxled->dpa_res = res; + cxled->skip = skipped; + + if (resource_contains(&cxlds->pmem_res, res)) + cxled->mode = CXL_DECODER_PMEM; + else if (resource_contains(&cxlds->ram_res, res)) + cxled->mode = CXL_DECODER_RAM; + else { + dev_dbg(dev, "decoder%d.%d: %pr mixed\n", port->id, + cxled->cxld.id, cxled->dpa_res); + cxled->mode = CXL_DECODER_MIXED; + } + + port->hdm_end++; + get_device(&cxled->cxld.dev); + return 0; +} + +int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, + resource_size_t base, resource_size_t len, + resource_size_t skipped) +{ + struct cxl_port *port = cxled_to_port(cxled); + int rc; + + down_write(&cxl_dpa_rwsem); + rc = __cxl_dpa_reserve(cxled, base, len, skipped); + up_write(&cxl_dpa_rwsem); + + if (rc) + return rc; + + return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_dpa_reserve, CXL); + +resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled) +{ + resource_size_t size = 0; + + down_read(&cxl_dpa_rwsem); + if (cxled->dpa_res) + size = resource_size(cxled->dpa_res); + up_read(&cxl_dpa_rwsem); + + return size; +} + +resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled) +{ + resource_size_t base = -1; + + down_read(&cxl_dpa_rwsem); + if (cxled->dpa_res) + base = cxled->dpa_res->start; + up_read(&cxl_dpa_rwsem); + + return base; +} + +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = cxled_to_port(cxled); + struct device *dev = &cxled->cxld.dev; + int rc; + + down_write(&cxl_dpa_rwsem); + if (!cxled->dpa_res) { + rc = 0; + goto out; + } + if (cxled->cxld.region) { + dev_dbg(dev, "decoder assigned to: %s\n", + dev_name(&cxled->cxld.region->dev)); + rc = -EBUSY; + goto out; + } + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + dev_dbg(dev, "decoder enabled\n"); + rc = -EBUSY; + goto out; + } + if (cxled->cxld.id != port->hdm_end) { + dev_dbg(dev, "expected decoder%d.%d\n", port->id, + port->hdm_end); + rc = -EBUSY; + goto out; + } + devm_cxl_dpa_release(cxled); + rc = 0; +out: + up_write(&cxl_dpa_rwsem); + return rc; +} + +int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, + enum cxl_decoder_mode mode) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &cxled->cxld.dev; + int rc; + + switch (mode) { + case CXL_DECODER_RAM: + case CXL_DECODER_PMEM: + break; + default: + dev_dbg(dev, "unsupported mode: %d\n", mode); + return -EINVAL; + } + + down_write(&cxl_dpa_rwsem); + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + rc = -EBUSY; + goto out; + } + + /* + * Only allow modes that are supported by the current partition + * configuration + */ + if (mode == CXL_DECODER_PMEM && !resource_size(&cxlds->pmem_res)) { + dev_dbg(dev, "no available pmem capacity\n"); + rc = -ENXIO; + goto out; + } + if 
(mode == CXL_DECODER_RAM && !resource_size(&cxlds->ram_res)) { + dev_dbg(dev, "no available ram capacity\n"); + rc = -ENXIO; + goto out; + } + + cxled->mode = mode; + rc = 0; +out: + up_write(&cxl_dpa_rwsem); + + return rc; +} + +int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + resource_size_t free_ram_start, free_pmem_start; + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &cxled->cxld.dev; + resource_size_t start, avail, skip; + struct resource *p, *last; + int rc; + + down_write(&cxl_dpa_rwsem); + if (cxled->cxld.region) { + dev_dbg(dev, "decoder attached to %s\n", + dev_name(&cxled->cxld.region->dev)); + rc = -EBUSY; + goto out; + } + + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + dev_dbg(dev, "decoder enabled\n"); + rc = -EBUSY; + goto out; + } + + for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling) + last = p; + if (last) + free_ram_start = last->end + 1; + else + free_ram_start = cxlds->ram_res.start; + + for (p = cxlds->pmem_res.child, last = NULL; p; p = p->sibling) + last = p; + if (last) + free_pmem_start = last->end + 1; + else + free_pmem_start = cxlds->pmem_res.start; + + if (cxled->mode == CXL_DECODER_RAM) { + start = free_ram_start; + avail = cxlds->ram_res.end - start + 1; + skip = 0; + } else if (cxled->mode == CXL_DECODER_PMEM) { + resource_size_t skip_start, skip_end; + + start = free_pmem_start; + avail = cxlds->pmem_res.end - start + 1; + skip_start = free_ram_start; + + /* + * If some pmem is already allocated, then that allocation + * already handled the skip. + */ + if (cxlds->pmem_res.child && + skip_start == cxlds->pmem_res.child->start) + skip_end = skip_start - 1; + else + skip_end = start - 1; + skip = skip_end - skip_start + 1; + } else { + dev_dbg(dev, "mode not set\n"); + rc = -EINVAL; + goto out; + } + + if (size > avail) { + dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size, + cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem", + &avail); + rc = -ENOSPC; + goto out; + } + + rc = __cxl_dpa_reserve(cxled, start, size, skip); +out: + up_write(&cxl_dpa_rwsem); + + if (rc) + return rc; + + return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); +} + +static void cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl) +{ + u16 eig; + u8 eiw; + + /* + * Input validation ensures these warns never fire, but otherwise + * suppress unititalized variable usage warnings. 
+ */ + if (WARN_ONCE(ways_to_cxl(cxld->interleave_ways, &eiw), + "invalid interleave_ways: %d\n", cxld->interleave_ways)) + return; + if (WARN_ONCE(granularity_to_cxl(cxld->interleave_granularity, &eig), + "invalid interleave_granularity: %d\n", + cxld->interleave_granularity)) + return; + + u32p_replace_bits(ctrl, eig, CXL_HDM_DECODER0_CTRL_IG_MASK); + u32p_replace_bits(ctrl, eiw, CXL_HDM_DECODER0_CTRL_IW_MASK); + *ctrl |= CXL_HDM_DECODER0_CTRL_COMMIT; +} + +static void cxld_set_type(struct cxl_decoder *cxld, u32 *ctrl) +{ + u32p_replace_bits(ctrl, !!(cxld->target_type == 3), + CXL_HDM_DECODER0_CTRL_TYPE); +} + +static int cxlsd_set_targets(struct cxl_switch_decoder *cxlsd, u64 *tgt) +{ + struct cxl_dport **t = &cxlsd->target[0]; + int ways = cxlsd->cxld.interleave_ways; + + if (dev_WARN_ONCE(&cxlsd->cxld.dev, + ways > 8 || ways > cxlsd->nr_targets, + "ways: %d overflows targets: %d\n", ways, + cxlsd->nr_targets)) + return -ENXIO; + + *tgt = FIELD_PREP(GENMASK(7, 0), t[0]->port_id); + if (ways > 1) + *tgt |= FIELD_PREP(GENMASK(15, 8), t[1]->port_id); + if (ways > 2) + *tgt |= FIELD_PREP(GENMASK(23, 16), t[2]->port_id); + if (ways > 3) + *tgt |= FIELD_PREP(GENMASK(31, 24), t[3]->port_id); + if (ways > 4) + *tgt |= FIELD_PREP(GENMASK_ULL(39, 32), t[4]->port_id); + if (ways > 5) + *tgt |= FIELD_PREP(GENMASK_ULL(47, 40), t[5]->port_id); + if (ways > 6) + *tgt |= FIELD_PREP(GENMASK_ULL(55, 48), t[6]->port_id); + if (ways > 7) + *tgt |= FIELD_PREP(GENMASK_ULL(63, 56), t[7]->port_id); + + return 0; +} + +/* + * Per CXL 2.0 8.2.5.12.20 Committing Decoder Programming, hardware must set + * committed or error within 10ms, but just be generous with 20ms to account for + * clock skew and other marginal behavior + */ +#define COMMIT_TIMEOUT_MS 20 +static int cxld_await_commit(void __iomem *hdm, int id) +{ + u32 ctrl; + int i; + + for (i = 0; i < COMMIT_TIMEOUT_MS; i++) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMIT_ERROR, ctrl)) { + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + return -EIO; + } + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl)) + return 0; + fsleep(1000); + } + + return -ETIMEDOUT; +} + +static int cxl_decoder_commit(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + int id = cxld->id, rc; + u64 base, size; + u32 ctrl; + + if (cxld->flags & CXL_DECODER_F_ENABLE) + return 0; + + if (port->commit_end + 1 != id) { + dev_dbg(&port->dev, + "%s: out of order commit, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end + 1); + return -EBUSY; + } + + down_read(&cxl_dpa_rwsem); + /* common decoder settings */ + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id)); + cxld_set_interleave(cxld, &ctrl); + cxld_set_type(cxld, &ctrl); + base = cxld->hpa_range.start; + size = range_len(&cxld->hpa_range); + + writel(upper_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id)); + writel(lower_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id)); + writel(upper_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id)); + writel(lower_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id)); + + if (is_switch_decoder(&cxld->dev)) { + struct cxl_switch_decoder *cxlsd = + to_cxl_switch_decoder(&cxld->dev); + void __iomem *tl_hi = hdm + CXL_HDM_DECODER0_TL_HIGH(id); + void __iomem *tl_lo = hdm + 
CXL_HDM_DECODER0_TL_LOW(id); + u64 targets; + + rc = cxlsd_set_targets(cxlsd, &targets); + if (rc) { + dev_dbg(&port->dev, "%s: target configuration error\n", + dev_name(&cxld->dev)); + goto err; + } + + writel(upper_32_bits(targets), tl_hi); + writel(lower_32_bits(targets), tl_lo); + } else { + struct cxl_endpoint_decoder *cxled = + to_cxl_endpoint_decoder(&cxld->dev); + void __iomem *sk_hi = hdm + CXL_HDM_DECODER0_SKIP_HIGH(id); + void __iomem *sk_lo = hdm + CXL_HDM_DECODER0_SKIP_LOW(id); + + writel(upper_32_bits(cxled->skip), sk_hi); + writel(lower_32_bits(cxled->skip), sk_lo); + } + + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + up_read(&cxl_dpa_rwsem); + + port->commit_end++; + rc = cxld_await_commit(hdm, cxld->id); +err: + if (rc) { + dev_dbg(&port->dev, "%s: error %d committing decoder\n", + dev_name(&cxld->dev), rc); + cxld->reset(cxld); + return rc; + } + cxld->flags |= CXL_DECODER_F_ENABLE; + + return 0; +} + +static int cxl_decoder_reset(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + int id = cxld->id; + u32 ctrl; + + if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0) + return 0; + + if (port->commit_end != id) { + dev_dbg(&port->dev, + "%s: out of order reset, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end); + return -EBUSY; + } + + down_read(&cxl_dpa_rwsem); + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + + writel(0, hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id)); + up_read(&cxl_dpa_rwsem); + + port->commit_end--; + cxld->flags &= ~CXL_DECODER_F_ENABLE; + + return 0; +} + +static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, + int *target_map, void __iomem *hdm, int which, + u64 *dpa_base) +{ + struct cxl_endpoint_decoder *cxled = NULL; + u64 size, base, skip, dpa_size; + bool committed; + u32 remainder; + int i, rc; + u32 ctrl; + union { + u64 value; + unsigned char target_id[8]; + } target_list; + + if (is_endpoint_decoder(&cxld->dev)) + cxled = to_cxl_endpoint_decoder(&cxld->dev); + + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which)); + base = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(which)); + size = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(which)); + committed = !!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED); + cxld->commit = cxl_decoder_commit; + cxld->reset = cxl_decoder_reset; + + if (!committed) + size = 0; + if (base == U64_MAX || size == U64_MAX) { + dev_warn(&port->dev, "decoder%d.%d: Invalid resource range\n", + port->id, cxld->id); + return -ENXIO; + } + + cxld->hpa_range = (struct range) { + .start = base, + .end = base + size - 1, + }; + + /* decoders are enabled if committed */ + if (committed) { + cxld->flags |= CXL_DECODER_F_ENABLE; + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) + cxld->flags |= CXL_DECODER_F_LOCK; + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl)) + cxld->target_type = CXL_DECODER_EXPANDER; + else + cxld->target_type = CXL_DECODER_ACCELERATOR; + if (cxld->id != port->commit_end + 1) { + dev_warn(&port->dev, + "decoder%d.%d: Committed out of order\n", + port->id, cxld->id); + return -ENXIO; + } + + if (size == 0) { + dev_warn(&port->dev, + 
"decoder%d.%d: Committed with zero size\n", + port->id, cxld->id); + return -ENXIO; + } + port->commit_end = cxld->id; + } else { + /* unless / until type-2 drivers arrive, assume type-3 */ + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl) == 0) { + ctrl |= CXL_HDM_DECODER0_CTRL_TYPE; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which)); + } + cxld->target_type = CXL_DECODER_EXPANDER; + } + rc = cxl_to_ways(FIELD_GET(CXL_HDM_DECODER0_CTRL_IW_MASK, ctrl), + &cxld->interleave_ways); + if (rc) { + dev_warn(&port->dev, + "decoder%d.%d: Invalid interleave ways (ctrl: %#x)\n", + port->id, cxld->id, ctrl); + return rc; + } + rc = cxl_to_granularity(FIELD_GET(CXL_HDM_DECODER0_CTRL_IG_MASK, ctrl), + &cxld->interleave_granularity); + if (rc) + return rc; + + if (!cxled) { + target_list.value = + ioread64_hi_lo(hdm + CXL_HDM_DECODER0_TL_LOW(which)); + for (i = 0; i < cxld->interleave_ways; i++) + target_map[i] = target_list.target_id[i]; + + return 0; + } + + if (!committed) + return 0; + + dpa_size = div_u64_rem(size, cxld->interleave_ways, &remainder); + if (remainder) { + dev_err(&port->dev, + "decoder%d.%d: invalid committed configuration size: %#llx ways: %d\n", + port->id, cxld->id, size, cxld->interleave_ways); + return -ENXIO; + } + skip = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_SKIP_LOW(which)); + rc = devm_cxl_dpa_reserve(cxled, *dpa_base + skip, dpa_size, skip); + if (rc) { + dev_err(&port->dev, + "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)", + port->id, cxld->id, *dpa_base, + *dpa_base + dpa_size + skip - 1, rc); + return rc; + } + *dpa_base += dpa_size + skip; + return 0; +} + +/** + * devm_cxl_enumerate_decoders - add decoder objects per HDM register set + * @cxlhdm: Structure to populate with HDM capabilities + */ +int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) +{ + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + struct cxl_port *port = cxlhdm->port; + int i, committed; + u64 dpa_base = 0; + u32 ctrl; + + /* + * Since the register resource was recently claimed via request_region() + * be careful about trusting the "not-committed" status until the commit + * timeout has elapsed. The commit timeout is 10ms (CXL 2.0 + * 8.2.5.12.20), but double it to be tolerant of any clock skew between + * host and target. 
+ */ + for (i = 0, committed = 0; i < cxlhdm->decoder_count; i++) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + committed++; + } + + /* ensure that future checks of committed can be trusted */ + if (committed != cxlhdm->decoder_count) + msleep(20); + + for (i = 0; i < cxlhdm->decoder_count; i++) { + int target_map[CXL_DECODER_MAX_INTERLEAVE] = { 0 }; + int rc, target_count = cxlhdm->target_count; + struct cxl_decoder *cxld; + + if (is_cxl_endpoint(port)) { + struct cxl_endpoint_decoder *cxled; + + cxled = cxl_endpoint_decoder_alloc(port); + if (IS_ERR(cxled)) { + dev_warn(&port->dev, + "Failed to allocate the decoder\n"); + return PTR_ERR(cxled); + } + cxld = &cxled->cxld; + } else { + struct cxl_switch_decoder *cxlsd; + + cxlsd = cxl_switch_decoder_alloc(port, target_count); + if (IS_ERR(cxlsd)) { + dev_warn(&port->dev, + "Failed to allocate the decoder\n"); + return PTR_ERR(cxlsd); + } + cxld = &cxlsd->cxld; + } + + rc = init_hdm_decoder(port, cxld, target_map, hdm, i, &dpa_base); + if (rc) { + put_device(&cxld->dev); + return rc; + } + rc = add_hdm_decoder(port, cxld, target_map); + if (rc) { + dev_warn(&port->dev, + "Failed to add decoder to port\n"); + return rc; + } + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_decoders, CXL); diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c new file mode 100644 index 000000000..0c90f1387 --- /dev/null +++ b/drivers/cxl/core/mbox.c @@ -0,0 +1,862 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. All rights reserved. */ +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/security.h> +#include <linux/debugfs.h> +#include <linux/mutex.h> +#include <cxlmem.h> +#include <cxl.h> + +#include "core.h" + +static bool cxl_raw_allow_all; + +/** + * DOC: cxl mbox + * + * Core implementation of the CXL 2.0 Type-3 Memory Device Mailbox. The + * implementation is used by the cxl_pci driver to initialize the device + * and implement the cxl_mem.h IOCTL UAPI. It also implements the + * backend of the cxl_pmem_ctl() transport for LIBNVDIMM. + */ + +#define cxl_for_each_cmd(cmd) \ + for ((cmd) = &cxl_mem_commands[0]; \ + ((cmd) - cxl_mem_commands) < ARRAY_SIZE(cxl_mem_commands); (cmd)++) + +#define CXL_CMD(_id, sin, sout, _flags) \ + [CXL_MEM_COMMAND_ID_##_id] = { \ + .info = { \ + .id = CXL_MEM_COMMAND_ID_##_id, \ + .size_in = sin, \ + .size_out = sout, \ + }, \ + .opcode = CXL_MBOX_OP_##_id, \ + .flags = _flags, \ + } + +#define CXL_VARIABLE_PAYLOAD ~0U +/* + * This table defines the supported mailbox commands for the driver. This table + * is made up of a UAPI structure. Non-negative values as parameters in the + * table will be validated against the user's input. For example, if size_in is + * 0, and the user passed in 1, it is an error. 
+ */ +static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = { + CXL_CMD(IDENTIFY, 0, 0x43, CXL_CMD_FLAG_FORCE_ENABLE), +#ifdef CONFIG_CXL_MEM_RAW_COMMANDS + CXL_CMD(RAW, CXL_VARIABLE_PAYLOAD, CXL_VARIABLE_PAYLOAD, 0), +#endif + CXL_CMD(GET_SUPPORTED_LOGS, 0, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE), + CXL_CMD(GET_FW_INFO, 0, 0x50, 0), + CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0), + CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0), + CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0), + CXL_CMD(GET_LOG, 0x18, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE), + CXL_CMD(SET_PARTITION_INFO, 0x0a, 0, 0), + CXL_CMD(SET_LSA, CXL_VARIABLE_PAYLOAD, 0, 0), + CXL_CMD(GET_ALERT_CONFIG, 0, 0x10, 0), + CXL_CMD(SET_ALERT_CONFIG, 0xc, 0, 0), + CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0), + CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0), + CXL_CMD(GET_POISON, 0x10, CXL_VARIABLE_PAYLOAD, 0), + CXL_CMD(INJECT_POISON, 0x8, 0, 0), + CXL_CMD(CLEAR_POISON, 0x48, 0, 0), + CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0), + CXL_CMD(SCAN_MEDIA, 0x11, 0, 0), + CXL_CMD(GET_SCAN_MEDIA, 0, CXL_VARIABLE_PAYLOAD, 0), +}; + +/* + * Commands that RAW doesn't permit. The rationale for each: + * + * CXL_MBOX_OP_ACTIVATE_FW: Firmware activation requires adjustment / + * coordination of transaction timeout values at the root bridge level. + * + * CXL_MBOX_OP_SET_PARTITION_INFO: The device memory map may change live + * and needs to be coordinated with HDM updates. + * + * CXL_MBOX_OP_SET_LSA: The label storage area may be cached by the + * driver and any writes from userspace invalidates those contents. + * + * CXL_MBOX_OP_SET_SHUTDOWN_STATE: Set shutdown state assumes no writes + * to the device after it is marked clean, userspace can not make that + * assertion. + * + * CXL_MBOX_OP_[GET_]SCAN_MEDIA: The kernel provides a native error list that + * is kept up to date with patrol notifications and error management. + */ +static u16 cxl_disabled_raw_commands[] = { + CXL_MBOX_OP_ACTIVATE_FW, + CXL_MBOX_OP_SET_PARTITION_INFO, + CXL_MBOX_OP_SET_LSA, + CXL_MBOX_OP_SET_SHUTDOWN_STATE, + CXL_MBOX_OP_SCAN_MEDIA, + CXL_MBOX_OP_GET_SCAN_MEDIA, +}; + +/* + * Command sets that RAW doesn't permit. All opcodes in this set are + * disabled because they pass plain text security payloads over the + * user/kernel boundary. This functionality is intended to be wrapped + * behind the keys ABI which allows for encrypted payloads in the UAPI + */ +static u8 security_command_sets[] = { + 0x44, /* Sanitize */ + 0x45, /* Persistent Memory Data-at-rest Security */ + 0x46, /* Security Passthrough */ +}; + +static bool cxl_is_security_command(u16 opcode) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(security_command_sets); i++) + if (security_command_sets[i] == (opcode >> 8)) + return true; + return false; +} + +static struct cxl_mem_command *cxl_mem_find_command(u16 opcode) +{ + struct cxl_mem_command *c; + + cxl_for_each_cmd(c) + if (c->opcode == opcode) + return c; + + return NULL; +} + +static const char *cxl_mem_opcode_to_name(u16 opcode) +{ + struct cxl_mem_command *c; + + c = cxl_mem_find_command(opcode); + if (!c) + return NULL; + + return cxl_command_names[c->info.id].name; +} + +/** + * cxl_mbox_send_cmd() - Send a mailbox command to a device. + * @cxlds: The device data for the operation + * @opcode: Opcode for the mailbox command. + * @in: The input payload for the mailbox command. + * @in_size: The length of the input payload + * @out: Caller allocated buffer for the output. + * @out_size: Expected size of output. + * + * Context: Any context. 
+ * Return: + * * %>=0 - Number of bytes returned in @out. + * * %-E2BIG - Payload is too large for hardware. + * * %-EBUSY - Couldn't acquire exclusive mailbox access. + * * %-EFAULT - Hardware error occurred. + * * %-ENXIO - Command completed, but device reported an error. + * * %-EIO - Unexpected output size. + * + * Mailbox commands may execute successfully yet the device itself reported an + * error. While this distinction can be useful for commands from userspace, the + * kernel will only be able to use results when both are successful. + */ +int cxl_mbox_send_cmd(struct cxl_dev_state *cxlds, u16 opcode, void *in, + size_t in_size, void *out, size_t out_size) +{ + const struct cxl_mem_command *cmd = cxl_mem_find_command(opcode); + struct cxl_mbox_cmd mbox_cmd = { + .opcode = opcode, + .payload_in = in, + .size_in = in_size, + .size_out = out_size, + .payload_out = out, + }; + int rc; + + if (in_size > cxlds->payload_size || out_size > cxlds->payload_size) + return -E2BIG; + + rc = cxlds->mbox_send(cxlds, &mbox_cmd); + if (rc) + return rc; + + if (mbox_cmd.return_code != CXL_MBOX_CMD_RC_SUCCESS) + return cxl_mbox_cmd_rc2errno(&mbox_cmd); + + /* + * Variable sized commands can't be validated and so it's up to the + * caller to do that if they wish. + */ + if (cmd->info.size_out != CXL_VARIABLE_PAYLOAD) { + if (mbox_cmd.size_out != out_size) + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_mbox_send_cmd, CXL); + +static bool cxl_mem_raw_command_allowed(u16 opcode) +{ + int i; + + if (!IS_ENABLED(CONFIG_CXL_MEM_RAW_COMMANDS)) + return false; + + if (security_locked_down(LOCKDOWN_PCI_ACCESS)) + return false; + + if (cxl_raw_allow_all) + return true; + + if (cxl_is_security_command(opcode)) + return false; + + for (i = 0; i < ARRAY_SIZE(cxl_disabled_raw_commands); i++) + if (cxl_disabled_raw_commands[i] == opcode) + return false; + + return true; +} + +/** + * cxl_payload_from_user_allowed() - Check contents of in_payload. + * @opcode: The mailbox command opcode. + * @payload_in: Pointer to the input payload passed in from user space. + * + * Return: + * * true - payload_in passes check for @opcode. + * * false - payload_in contains invalid or unsupported values. + * + * The driver may inspect payload contents before sending a mailbox + * command from user space to the device. The intent is to reject + * commands with input payloads that are known to be unsafe. This + * check is not intended to replace the users careful selection of + * mailbox command parameters and makes no guarantee that the user + * command will succeed, nor that it is appropriate. + * + * The specific checks are determined by the opcode. 
+ */ +static bool cxl_payload_from_user_allowed(u16 opcode, void *payload_in) +{ + switch (opcode) { + case CXL_MBOX_OP_SET_PARTITION_INFO: { + struct cxl_mbox_set_partition_info *pi = payload_in; + + if (pi->flags & CXL_SET_PARTITION_IMMEDIATE_FLAG) + return false; + break; + } + default: + break; + } + return true; +} + +static int cxl_mbox_cmd_ctor(struct cxl_mbox_cmd *mbox, + struct cxl_dev_state *cxlds, u16 opcode, + size_t in_size, size_t out_size, u64 in_payload) +{ + *mbox = (struct cxl_mbox_cmd) { + .opcode = opcode, + .size_in = in_size, + }; + + if (in_size) { + mbox->payload_in = vmemdup_user(u64_to_user_ptr(in_payload), + in_size); + if (IS_ERR(mbox->payload_in)) + return PTR_ERR(mbox->payload_in); + + if (!cxl_payload_from_user_allowed(opcode, mbox->payload_in)) { + dev_dbg(cxlds->dev, "%s: input payload not allowed\n", + cxl_mem_opcode_to_name(opcode)); + kvfree(mbox->payload_in); + return -EBUSY; + } + } + + /* Prepare to handle a full payload for variable sized output */ + if (out_size == CXL_VARIABLE_PAYLOAD) + mbox->size_out = cxlds->payload_size; + else + mbox->size_out = out_size; + + if (mbox->size_out) { + mbox->payload_out = kvzalloc(mbox->size_out, GFP_KERNEL); + if (!mbox->payload_out) { + kvfree(mbox->payload_in); + return -ENOMEM; + } + } + return 0; +} + +static void cxl_mbox_cmd_dtor(struct cxl_mbox_cmd *mbox) +{ + kvfree(mbox->payload_in); + kvfree(mbox->payload_out); +} + +static int cxl_to_mem_cmd_raw(struct cxl_mem_command *mem_cmd, + const struct cxl_send_command *send_cmd, + struct cxl_dev_state *cxlds) +{ + if (send_cmd->raw.rsvd) + return -EINVAL; + + /* + * Unlike supported commands, the output size of RAW commands + * gets passed along without further checking, so it must be + * validated here. + */ + if (send_cmd->out.size > cxlds->payload_size) + return -EINVAL; + + if (!cxl_mem_raw_command_allowed(send_cmd->raw.opcode)) + return -EPERM; + + dev_WARN_ONCE(cxlds->dev, true, "raw command path used\n"); + + *mem_cmd = (struct cxl_mem_command) { + .info = { + .id = CXL_MEM_COMMAND_ID_RAW, + .size_in = send_cmd->in.size, + .size_out = send_cmd->out.size, + }, + .opcode = send_cmd->raw.opcode + }; + + return 0; +} + +static int cxl_to_mem_cmd(struct cxl_mem_command *mem_cmd, + const struct cxl_send_command *send_cmd, + struct cxl_dev_state *cxlds) +{ + struct cxl_mem_command *c = &cxl_mem_commands[send_cmd->id]; + const struct cxl_command_info *info = &c->info; + + if (send_cmd->flags & ~CXL_MEM_COMMAND_FLAG_MASK) + return -EINVAL; + + if (send_cmd->rsvd) + return -EINVAL; + + if (send_cmd->in.rsvd || send_cmd->out.rsvd) + return -EINVAL; + + /* Check that the command is enabled for hardware */ + if (!test_bit(info->id, cxlds->enabled_cmds)) + return -ENOTTY; + + /* Check that the command is not claimed for exclusive kernel use */ + if (test_bit(info->id, cxlds->exclusive_cmds)) + return -EBUSY; + + /* Check the input buffer is the expected size */ + if ((info->size_in != CXL_VARIABLE_PAYLOAD) && + (info->size_in != send_cmd->in.size)) + return -ENOMEM; + + /* Check the output buffer is at least large enough */ + if ((info->size_out != CXL_VARIABLE_PAYLOAD) && + (send_cmd->out.size < info->size_out)) + return -ENOMEM; + + *mem_cmd = (struct cxl_mem_command) { + .info = { + .id = info->id, + .flags = info->flags, + .size_in = send_cmd->in.size, + .size_out = send_cmd->out.size, + }, + .opcode = c->opcode + }; + + return 0; +} + +/** + * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND. 
+ * @mbox_cmd: Sanitized and populated &struct cxl_mbox_cmd.
+ * @cxlds: The device data for the operation
+ * @send_cmd: &struct cxl_send_command copied in from userspace.
+ *
+ * Return:
+ * * %0 - @mbox_cmd is ready to send.
+ * * %-ENOTTY - Invalid command specified.
+ * * %-EINVAL - Reserved fields or invalid values were used.
+ * * %-ENOMEM - Input or output buffer wasn't sized properly.
+ * * %-EPERM - Attempted to use a protected command.
+ * * %-EBUSY - Kernel has claimed exclusive access to this opcode
+ *
+ * The result of this command is a fully validated command in @mbox_cmd that is
+ * safe to send to the hardware.
+ */
+static int cxl_validate_cmd_from_user(struct cxl_mbox_cmd *mbox_cmd,
+ struct cxl_dev_state *cxlds,
+ const struct cxl_send_command *send_cmd)
+{
+ struct cxl_mem_command mem_cmd;
+ int rc;
+
+ if (send_cmd->id == 0 || send_cmd->id >= CXL_MEM_COMMAND_ID_MAX)
+ return -ENOTTY;
+
+ /*
+ * The user can never specify an input payload larger than what hardware
+ * supports, but output can be arbitrarily large (simply write out as
+ * much data as the hardware provides).
+ */
+ if (send_cmd->in.size > cxlds->payload_size)
+ return -EINVAL;
+
+ /* Sanitize and construct a cxl_mem_command */
+ if (send_cmd->id == CXL_MEM_COMMAND_ID_RAW)
+ rc = cxl_to_mem_cmd_raw(&mem_cmd, send_cmd, cxlds);
+ else
+ rc = cxl_to_mem_cmd(&mem_cmd, send_cmd, cxlds);
+
+ if (rc)
+ return rc;
+
+ /* Sanitize and construct a cxl_mbox_cmd */
+ return cxl_mbox_cmd_ctor(mbox_cmd, cxlds, mem_cmd.opcode,
+ mem_cmd.info.size_in, mem_cmd.info.size_out,
+ send_cmd->in.payload);
+}
+
+int cxl_query_cmd(struct cxl_memdev *cxlmd,
+ struct cxl_mem_query_commands __user *q)
+{
+ struct device *dev = &cxlmd->dev;
+ struct cxl_mem_command *cmd;
+ u32 n_commands;
+ int j = 0;
+
+ dev_dbg(dev, "Query IOCTL\n");
+
+ if (get_user(n_commands, &q->n_commands))
+ return -EFAULT;
+
+ /* returns the total number if 0 elements are requested. */
+ if (n_commands == 0)
+ return put_user(ARRAY_SIZE(cxl_mem_commands), &q->n_commands);
+
+ /*
+ * otherwise, return min(n_commands, total commands) cxl_command_info
+ * structures.
+ */
+ cxl_for_each_cmd(cmd) {
+ const struct cxl_command_info *info = &cmd->info;
+
+ if (copy_to_user(&q->commands[j++], info, sizeof(*info)))
+ return -EFAULT;
+
+ if (j == n_commands)
+ break;
+ }
+
+ return 0;
+}
+
+/**
+ * handle_mailbox_cmd_from_user() - Dispatch a mailbox command for userspace.
+ * @cxlds: The device data for the operation
+ * @mbox_cmd: The validated mailbox command.
+ * @out_payload: Pointer to userspace's output payload.
+ * @size_out: (Input) Max payload size to copy out.
+ * (Output) Payload size hardware generated.
+ * @retval: Hardware generated return code from the operation.
+ *
+ * Return:
+ * * %0 - Mailbox transaction succeeded. This implies the mailbox
+ * protocol completed successfully, not that the operation itself
+ * was successful.
+ * * %-ENOMEM - Couldn't allocate a bounce buffer.
+ * * %-EFAULT - Something happened with copy_to/from_user.
+ * * %-EINTR - Mailbox acquisition interrupted.
+ * * %-EXXX - Transaction level failures.
+ *
+ * Dispatches a mailbox command on behalf of a userspace request.
+ * The output payload is copied to userspace.
+ *
+ * See cxl_send_cmd().
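+ *
+ * The bounce buffers allocated by cxl_mbox_cmd_ctor() are released here via
+ * cxl_mbox_cmd_dtor() whether the transaction succeeds or fails.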
+ */ +static int handle_mailbox_cmd_from_user(struct cxl_dev_state *cxlds, + struct cxl_mbox_cmd *mbox_cmd, + u64 out_payload, s32 *size_out, + u32 *retval) +{ + struct device *dev = cxlds->dev; + int rc; + + dev_dbg(dev, + "Submitting %s command for user\n" + "\topcode: %x\n" + "\tsize: %zx\n", + cxl_mem_opcode_to_name(mbox_cmd->opcode), + mbox_cmd->opcode, mbox_cmd->size_in); + + rc = cxlds->mbox_send(cxlds, mbox_cmd); + if (rc) + goto out; + + /* + * @size_out contains the max size that's allowed to be written back out + * to userspace. While the payload may have written more output than + * this it will have to be ignored. + */ + if (mbox_cmd->size_out) { + dev_WARN_ONCE(dev, mbox_cmd->size_out > *size_out, + "Invalid return size\n"); + if (copy_to_user(u64_to_user_ptr(out_payload), + mbox_cmd->payload_out, mbox_cmd->size_out)) { + rc = -EFAULT; + goto out; + } + } + + *size_out = mbox_cmd->size_out; + *retval = mbox_cmd->return_code; + +out: + cxl_mbox_cmd_dtor(mbox_cmd); + return rc; +} + +int cxl_send_cmd(struct cxl_memdev *cxlmd, struct cxl_send_command __user *s) +{ + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &cxlmd->dev; + struct cxl_send_command send; + struct cxl_mbox_cmd mbox_cmd; + int rc; + + dev_dbg(dev, "Send IOCTL\n"); + + if (copy_from_user(&send, s, sizeof(send))) + return -EFAULT; + + rc = cxl_validate_cmd_from_user(&mbox_cmd, cxlmd->cxlds, &send); + if (rc) + return rc; + + rc = handle_mailbox_cmd_from_user(cxlds, &mbox_cmd, send.out.payload, + &send.out.size, &send.retval); + if (rc) + return rc; + + if (copy_to_user(s, &send, sizeof(send))) + return -EFAULT; + + return 0; +} + +static int cxl_xfer_log(struct cxl_dev_state *cxlds, uuid_t *uuid, u32 size, u8 *out) +{ + u32 remaining = size; + u32 offset = 0; + + while (remaining) { + u32 xfer_size = min_t(u32, remaining, cxlds->payload_size); + struct cxl_mbox_get_log log = { + .uuid = *uuid, + .offset = cpu_to_le32(offset), + .length = cpu_to_le32(xfer_size) + }; + int rc; + + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_LOG, &log, sizeof(log), + out, xfer_size); + if (rc < 0) + return rc; + + out += xfer_size; + remaining -= xfer_size; + offset += xfer_size; + } + + return 0; +} + +/** + * cxl_walk_cel() - Walk through the Command Effects Log. + * @cxlds: The device data for the operation + * @size: Length of the Command Effects Log. + * @cel: CEL + * + * Iterate over each entry in the CEL and determine if the driver supports the + * command. If so, the command is enabled for the device and can be used later. 
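+ * Opcodes found in the CEL that the driver does not recognize are logged and
+ * skipped rather than treated as an error.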
+ */
+static void cxl_walk_cel(struct cxl_dev_state *cxlds, size_t size, u8 *cel)
+{
+ struct cxl_cel_entry *cel_entry;
+ const int cel_entries = size / sizeof(*cel_entry);
+ int i;
+
+ cel_entry = (struct cxl_cel_entry *) cel;
+
+ for (i = 0; i < cel_entries; i++) {
+ u16 opcode = le16_to_cpu(cel_entry[i].opcode);
+ struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
+
+ if (!cmd) {
+ dev_dbg(cxlds->dev,
+ "Opcode 0x%04x unsupported by driver", opcode);
+ continue;
+ }
+
+ set_bit(cmd->info.id, cxlds->enabled_cmds);
+ }
+}
+
+static struct cxl_mbox_get_supported_logs *cxl_get_gsl(struct cxl_dev_state *cxlds)
+{
+ struct cxl_mbox_get_supported_logs *ret;
+ int rc;
+
+ ret = kvmalloc(cxlds->payload_size, GFP_KERNEL);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_SUPPORTED_LOGS, NULL, 0, ret,
+ cxlds->payload_size);
+ if (rc < 0) {
+ kvfree(ret);
+ return ERR_PTR(rc);
+ }
+
+ return ret;
+}
+
+enum {
+ CEL_UUID,
+ VENDOR_DEBUG_UUID,
+};
+
+/* See CXL 2.0 Table 170. Get Log Input Payload */
+static const uuid_t log_uuid[] = {
+ [CEL_UUID] = DEFINE_CXL_CEL_UUID,
+ [VENDOR_DEBUG_UUID] = DEFINE_CXL_VENDOR_DEBUG_UUID,
+};
+
+/**
+ * cxl_enumerate_cmds() - Enumerate commands for a device.
+ * @cxlds: The device data for the operation
+ *
+ * Returns 0 if enumeration completed successfully.
+ *
+ * CXL devices have optional support for certain commands. This function will
+ * determine the set of supported commands for the hardware and update the
+ * enabled_cmds bitmap in the @cxlds.
+ */
+int cxl_enumerate_cmds(struct cxl_dev_state *cxlds)
+{
+ struct cxl_mbox_get_supported_logs *gsl;
+ struct device *dev = cxlds->dev;
+ struct cxl_mem_command *cmd;
+ int i, rc;
+
+ gsl = cxl_get_gsl(cxlds);
+ if (IS_ERR(gsl))
+ return PTR_ERR(gsl);
+
+ rc = -ENOENT;
+ for (i = 0; i < le16_to_cpu(gsl->entries); i++) {
+ u32 size = le32_to_cpu(gsl->entry[i].size);
+ uuid_t uuid = gsl->entry[i].uuid;
+ u8 *log;
+
+ dev_dbg(dev, "Found LOG type %pU of size %d", &uuid, size);
+
+ if (!uuid_equal(&uuid, &log_uuid[CEL_UUID]))
+ continue;
+
+ log = kvmalloc(size, GFP_KERNEL);
+ if (!log) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = cxl_xfer_log(cxlds, &uuid, size, log);
+ if (rc) {
+ kvfree(log);
+ goto out;
+ }
+
+ cxl_walk_cel(cxlds, size, log);
+ kvfree(log);
+
+ /* In case CEL was bogus, enable some default commands. */
+ cxl_for_each_cmd(cmd)
+ if (cmd->flags & CXL_CMD_FLAG_FORCE_ENABLE)
+ set_bit(cmd->info.id, cxlds->enabled_cmds);
+
+ /* Found the required CEL */
+ rc = 0;
+ }
+
+out:
+ kvfree(gsl);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
+
+/**
+ * cxl_mem_get_partition_info() - Get partition info
+ * @cxlds: The device data for the operation
+ *
+ * Retrieve the current partition info for the device specified. The active
+ * values are the current capacity in bytes. If not 0, the 'next' values are
+ * the pending values, in bytes, which take effect on next cold reset.
+ *
+ * Return: 0 if no error; otherwise the result of the mailbox command.
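+ *
+ * The device reports capacities in units of CXL_CAPACITY_MULTIPLIER; the
+ * values cached in @cxlds are converted to bytes.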
+ *
+ * See CXL 2.0 8.2.9.5.2.1 Get Partition Info
+ */
+static int cxl_mem_get_partition_info(struct cxl_dev_state *cxlds)
+{
+ struct cxl_mbox_get_partition_info pi;
+ int rc;
+
+ rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_PARTITION_INFO, NULL, 0,
+ &pi, sizeof(pi));
+
+ if (rc)
+ return rc;
+
+ cxlds->active_volatile_bytes =
+ le64_to_cpu(pi.active_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
+ cxlds->active_persistent_bytes =
+ le64_to_cpu(pi.active_persistent_cap) * CXL_CAPACITY_MULTIPLIER;
+ cxlds->next_volatile_bytes =
+ le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
+ cxlds->next_persistent_bytes =
+ le64_to_cpu(pi.next_persistent_cap) * CXL_CAPACITY_MULTIPLIER;
+
+ return 0;
+}
+
+/**
+ * cxl_dev_state_identify() - Send the IDENTIFY command to the device.
+ * @cxlds: The device data for the operation
+ *
+ * Return: 0 if identify was executed successfully.
+ *
+ * This will dispatch the identify command to the device and on success populate
+ * structures to be exported to sysfs.
+ */
+int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
+{
+ /* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
+ struct cxl_mbox_identify id;
+ int rc;
+
+ rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
+ sizeof(id));
+ if (rc < 0)
+ return rc;
+
+ cxlds->total_bytes =
+ le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
+ cxlds->volatile_only_bytes =
+ le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
+ cxlds->persistent_only_bytes =
+ le64_to_cpu(id.persistent_capacity) * CXL_CAPACITY_MULTIPLIER;
+ cxlds->partition_align_bytes =
+ le64_to_cpu(id.partition_align) * CXL_CAPACITY_MULTIPLIER;
+
+ cxlds->lsa_size = le32_to_cpu(id.lsa_size);
+ memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
+
+static int add_dpa_res(struct device *dev, struct resource *parent,
+ struct resource *res, resource_size_t start,
+ resource_size_t size, const char *type)
+{
+ int rc;
+
+ res->name = type;
+ res->start = start;
+ res->end = start + size - 1;
+ res->flags = IORESOURCE_MEM;
+ if (resource_size(res) == 0) {
+ dev_dbg(dev, "DPA(%s): no capacity\n", res->name);
+ return 0;
+ }
+ rc = request_resource(parent, res);
+ if (rc) {
+ dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name,
+ res, rc);
+ return rc;
+ }
+
+ dev_dbg(dev, "DPA(%s): %pr\n", res->name, res);
+
+ return 0;
+}
+
+int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
+{
+ struct device *dev = cxlds->dev;
+ int rc;
+
+ cxlds->dpa_res =
+ (struct resource)DEFINE_RES_MEM(0, cxlds->total_bytes);
+
+ if (cxlds->partition_align_bytes == 0) {
+ rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
+ cxlds->volatile_only_bytes, "ram");
+ if (rc)
+ return rc;
+ return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
+ cxlds->volatile_only_bytes,
+ cxlds->persistent_only_bytes, "pmem");
+ }
+
+ rc = cxl_mem_get_partition_info(cxlds);
+ if (rc) {
+ dev_err(dev, "Failed to query partition information\n");
+ return rc;
+ }
+
+ rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
+ cxlds->active_volatile_bytes, "ram");
+ if (rc)
+ return rc;
+ return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
+ cxlds->active_volatile_bytes,
+ cxlds->active_persistent_bytes, "pmem");
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
+
+struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
+{
+ struct cxl_dev_state *cxlds;
+
+ cxlds = devm_kzalloc(dev, sizeof(*cxlds),
GFP_KERNEL); + if (!cxlds) { + dev_err(dev, "No memory available\n"); + return ERR_PTR(-ENOMEM); + } + + mutex_init(&cxlds->mbox_mutex); + cxlds->dev = dev; + + return cxlds; +} +EXPORT_SYMBOL_NS_GPL(cxl_dev_state_create, CXL); + +void __init cxl_mbox_init(void) +{ + struct dentry *mbox_debugfs; + + mbox_debugfs = cxl_debugfs_create_dir("mbox"); + debugfs_create_bool("raw_allow_all", 0600, mbox_debugfs, + &cxl_raw_allow_all); +} diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c new file mode 100644 index 000000000..03cf99cce --- /dev/null +++ b/drivers/cxl/core/memdev.c @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. */ + +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/pci.h> +#include <cxlmem.h> +#include "core.h" + +static DECLARE_RWSEM(cxl_memdev_rwsem); + +/* + * An entire PCI topology full of devices should be enough for any + * config + */ +#define CXL_MEM_MAX_DEVS 65536 + +static int cxl_mem_major; +static DEFINE_IDA(cxl_memdev_ida); + +static void cxl_memdev_release(struct device *dev) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + + ida_free(&cxl_memdev_ida, cxlmd->id); + kfree(cxlmd); +} + +static char *cxl_memdev_devnode(struct device *dev, umode_t *mode, kuid_t *uid, + kgid_t *gid) +{ + return kasprintf(GFP_KERNEL, "cxl/%s", dev_name(dev)); +} + +static ssize_t firmware_version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + return sysfs_emit(buf, "%.16s\n", cxlds->firmware_version); +} +static DEVICE_ATTR_RO(firmware_version); + +static ssize_t payload_max_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + return sysfs_emit(buf, "%zu\n", cxlds->payload_size); +} +static DEVICE_ATTR_RO(payload_max); + +static ssize_t label_storage_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + return sysfs_emit(buf, "%zu\n", cxlds->lsa_size); +} +static DEVICE_ATTR_RO(label_storage_size); + +static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + unsigned long long len = resource_size(&cxlds->ram_res); + + return sysfs_emit(buf, "%#llx\n", len); +} + +static struct device_attribute dev_attr_ram_size = + __ATTR(size, 0444, ram_size_show, NULL); + +static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + unsigned long long len = resource_size(&cxlds->pmem_res); + + return sysfs_emit(buf, "%#llx\n", len); +} + +static struct device_attribute dev_attr_pmem_size = + __ATTR(size, 0444, pmem_size_show, NULL); + +static ssize_t serial_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + return sysfs_emit(buf, "%#llx\n", cxlds->serial); +} +static DEVICE_ATTR_RO(serial); + +static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", dev_to_node(dev)); +} +static 
DEVICE_ATTR_RO(numa_node); + +static struct attribute *cxl_memdev_attributes[] = { + &dev_attr_serial.attr, + &dev_attr_firmware_version.attr, + &dev_attr_payload_max.attr, + &dev_attr_label_storage_size.attr, + &dev_attr_numa_node.attr, + NULL, +}; + +static struct attribute *cxl_memdev_pmem_attributes[] = { + &dev_attr_pmem_size.attr, + NULL, +}; + +static struct attribute *cxl_memdev_ram_attributes[] = { + &dev_attr_ram_size.attr, + NULL, +}; + +static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + if (!IS_ENABLED(CONFIG_NUMA) && a == &dev_attr_numa_node.attr) + return 0; + return a->mode; +} + +static struct attribute_group cxl_memdev_attribute_group = { + .attrs = cxl_memdev_attributes, + .is_visible = cxl_memdev_visible, +}; + +static struct attribute_group cxl_memdev_ram_attribute_group = { + .name = "ram", + .attrs = cxl_memdev_ram_attributes, +}; + +static struct attribute_group cxl_memdev_pmem_attribute_group = { + .name = "pmem", + .attrs = cxl_memdev_pmem_attributes, +}; + +static const struct attribute_group *cxl_memdev_attribute_groups[] = { + &cxl_memdev_attribute_group, + &cxl_memdev_ram_attribute_group, + &cxl_memdev_pmem_attribute_group, + NULL, +}; + +static const struct device_type cxl_memdev_type = { + .name = "cxl_memdev", + .release = cxl_memdev_release, + .devnode = cxl_memdev_devnode, + .groups = cxl_memdev_attribute_groups, +}; + +bool is_cxl_memdev(struct device *dev) +{ + return dev->type == &cxl_memdev_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, CXL); + +/** + * set_exclusive_cxl_commands() - atomically disable user cxl commands + * @cxlds: The device state to operate on + * @cmds: bitmap of commands to mark exclusive + * + * Grab the cxl_memdev_rwsem in write mode to flush in-flight + * invocations of the ioctl path and then disable future execution of + * commands with the command ids set in @cmds. 
+ */ +void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds) +{ + down_write(&cxl_memdev_rwsem); + bitmap_or(cxlds->exclusive_cmds, cxlds->exclusive_cmds, cmds, + CXL_MEM_COMMAND_ID_MAX); + up_write(&cxl_memdev_rwsem); +} +EXPORT_SYMBOL_NS_GPL(set_exclusive_cxl_commands, CXL); + +/** + * clear_exclusive_cxl_commands() - atomically enable user cxl commands + * @cxlds: The device state to modify + * @cmds: bitmap of commands to mark available for userspace + */ +void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds) +{ + down_write(&cxl_memdev_rwsem); + bitmap_andnot(cxlds->exclusive_cmds, cxlds->exclusive_cmds, cmds, + CXL_MEM_COMMAND_ID_MAX); + up_write(&cxl_memdev_rwsem); +} +EXPORT_SYMBOL_NS_GPL(clear_exclusive_cxl_commands, CXL); + +static void cxl_memdev_shutdown(struct device *dev) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + + down_write(&cxl_memdev_rwsem); + cxlmd->cxlds = NULL; + up_write(&cxl_memdev_rwsem); +} + +static void cxl_memdev_unregister(void *_cxlmd) +{ + struct cxl_memdev *cxlmd = _cxlmd; + struct device *dev = &cxlmd->dev; + + cdev_device_del(&cxlmd->cdev, dev); + cxl_memdev_shutdown(dev); + put_device(dev); +} + +static void detach_memdev(struct work_struct *work) +{ + struct cxl_memdev *cxlmd; + + cxlmd = container_of(work, typeof(*cxlmd), detach_work); + device_release_driver(&cxlmd->dev); + put_device(&cxlmd->dev); +} + +static struct lock_class_key cxl_memdev_key; + +static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, + const struct file_operations *fops) +{ + struct cxl_memdev *cxlmd; + struct device *dev; + struct cdev *cdev; + int rc; + + cxlmd = kzalloc(sizeof(*cxlmd), GFP_KERNEL); + if (!cxlmd) + return ERR_PTR(-ENOMEM); + + rc = ida_alloc_range(&cxl_memdev_ida, 0, CXL_MEM_MAX_DEVS, GFP_KERNEL); + if (rc < 0) + goto err; + cxlmd->id = rc; + + dev = &cxlmd->dev; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_memdev_key); + dev->parent = cxlds->dev; + dev->bus = &cxl_bus_type; + dev->devt = MKDEV(cxl_mem_major, cxlmd->id); + dev->type = &cxl_memdev_type; + device_set_pm_not_required(dev); + INIT_WORK(&cxlmd->detach_work, detach_memdev); + + cdev = &cxlmd->cdev; + cdev_init(cdev, fops); + return cxlmd; + +err: + kfree(cxlmd); + return ERR_PTR(rc); +} + +static long __cxl_memdev_ioctl(struct cxl_memdev *cxlmd, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case CXL_MEM_QUERY_COMMANDS: + return cxl_query_cmd(cxlmd, (void __user *)arg); + case CXL_MEM_SEND_COMMAND: + return cxl_send_cmd(cxlmd, (void __user *)arg); + default: + return -ENOTTY; + } +} + +static long cxl_memdev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct cxl_memdev *cxlmd = file->private_data; + int rc = -ENXIO; + + down_read(&cxl_memdev_rwsem); + if (cxlmd->cxlds) + rc = __cxl_memdev_ioctl(cxlmd, cmd, arg); + up_read(&cxl_memdev_rwsem); + + return rc; +} + +static int cxl_memdev_open(struct inode *inode, struct file *file) +{ + struct cxl_memdev *cxlmd = + container_of(inode->i_cdev, typeof(*cxlmd), cdev); + + get_device(&cxlmd->dev); + file->private_data = cxlmd; + + return 0; +} + +static int cxl_memdev_release_file(struct inode *inode, struct file *file) +{ + struct cxl_memdev *cxlmd = + container_of(inode->i_cdev, typeof(*cxlmd), cdev); + + put_device(&cxlmd->dev); + + return 0; +} + +static const struct file_operations cxl_memdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = cxl_memdev_ioctl, + .open = cxl_memdev_open, + .release = 
cxl_memdev_release_file, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds) +{ + struct cxl_memdev *cxlmd; + struct device *dev; + struct cdev *cdev; + int rc; + + cxlmd = cxl_memdev_alloc(cxlds, &cxl_memdev_fops); + if (IS_ERR(cxlmd)) + return cxlmd; + + dev = &cxlmd->dev; + rc = dev_set_name(dev, "mem%d", cxlmd->id); + if (rc) + goto err; + + /* + * Activate ioctl operations, no cxl_memdev_rwsem manipulation + * needed as this is ordered with cdev_add() publishing the device. + */ + cxlmd->cxlds = cxlds; + + cdev = &cxlmd->cdev; + rc = cdev_device_add(cdev, dev); + if (rc) + goto err; + + rc = devm_add_action_or_reset(cxlds->dev, cxl_memdev_unregister, cxlmd); + if (rc) + return ERR_PTR(rc); + return cxlmd; + +err: + /* + * The cdev was briefly live, shutdown any ioctl operations that + * saw that state. + */ + cxl_memdev_shutdown(dev); + put_device(dev); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, CXL); + +__init int cxl_memdev_init(void) +{ + dev_t devt; + int rc; + + rc = alloc_chrdev_region(&devt, 0, CXL_MEM_MAX_DEVS, "cxl"); + if (rc) + return rc; + + cxl_mem_major = MAJOR(devt); + + return 0; +} + +void cxl_memdev_exit(void) +{ + unregister_chrdev_region(MKDEV(cxl_mem_major, 0), CXL_MEM_MAX_DEVS); +} diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c new file mode 100644 index 000000000..003a44132 --- /dev/null +++ b/drivers/cxl/core/pci.c @@ -0,0 +1,702 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2021 Intel Corporation. All rights reserved. */ +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/device.h> +#include <linux/delay.h> +#include <linux/pci.h> +#include <linux/pci-doe.h> +#include <cxlpci.h> +#include <cxlmem.h> +#include <cxl.h> +#include "core.h" + +/** + * DOC: cxl core pci + * + * Compute Express Link protocols are layered on top of PCIe. CXL core provides + * a set of helpers for CXL interactions which occur via PCIe. + */ + +static unsigned short media_ready_timeout = 60; +module_param(media_ready_timeout, ushort, 0644); +MODULE_PARM_DESC(media_ready_timeout, "seconds to wait for media ready"); + +struct cxl_walk_context { + struct pci_bus *bus; + struct cxl_port *port; + int type; + int error; + int count; +}; + +static int match_add_dports(struct pci_dev *pdev, void *data) +{ + struct cxl_walk_context *ctx = data; + struct cxl_port *port = ctx->port; + int type = pci_pcie_type(pdev); + struct cxl_register_map map; + struct cxl_dport *dport; + u32 lnkcap, port_num; + int rc; + + if (pdev->bus != ctx->bus) + return 0; + if (!pci_is_pcie(pdev)) + return 0; + if (type != ctx->type) + return 0; + if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP, + &lnkcap)) + return 0; + + rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); + if (rc) + dev_dbg(&port->dev, "failed to find component registers\n"); + + port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); + dport = devm_cxl_add_dport(port, &pdev->dev, port_num, + cxl_regmap_to_base(pdev, &map)); + if (IS_ERR(dport)) { + ctx->error = PTR_ERR(dport); + return PTR_ERR(dport); + } + ctx->count++; + + dev_dbg(&port->dev, "add dport%d: %s\n", port_num, dev_name(&pdev->dev)); + + return 0; +} + +/** + * devm_cxl_port_enumerate_dports - enumerate downstream ports of the upstream port + * @port: cxl_port whose ->uport is the upstream of dports to be enumerated + * + * Returns a positive number of dports enumerated or a negative error + * code. 
+ */ +int devm_cxl_port_enumerate_dports(struct cxl_port *port) +{ + struct pci_bus *bus = cxl_port_to_pci_bus(port); + struct cxl_walk_context ctx; + int type; + + if (!bus) + return -ENXIO; + + if (pci_is_root_bus(bus)) + type = PCI_EXP_TYPE_ROOT_PORT; + else + type = PCI_EXP_TYPE_DOWNSTREAM; + + ctx = (struct cxl_walk_context) { + .port = port, + .bus = bus, + .type = type, + }; + pci_walk_bus(bus, match_add_dports, &ctx); + + if (ctx.count == 0) + return -ENODEV; + if (ctx.error) + return ctx.error; + return ctx.count; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_port_enumerate_dports, CXL); + +static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + bool valid = false; + int rc, i; + u32 temp; + + if (id > CXL_DVSEC_RANGE_MAX) + return -EINVAL; + + /* Check MEM INFO VALID bit first, give up after 1s */ + i = 1; + do { + rc = pci_read_config_dword(pdev, + d + CXL_DVSEC_RANGE_SIZE_LOW(id), + &temp); + if (rc) + return rc; + + valid = FIELD_GET(CXL_DVSEC_MEM_INFO_VALID, temp); + if (valid) + break; + msleep(1000); + } while (i--); + + if (!valid) { + dev_err(&pdev->dev, + "Timeout awaiting memory range %d valid after 1s.\n", + id); + return -ETIMEDOUT; + } + + return 0; +} + +static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + bool active = false; + int rc, i; + u32 temp; + + if (id > CXL_DVSEC_RANGE_MAX) + return -EINVAL; + + /* Check MEM ACTIVE bit, up to 60s timeout by default */ + for (i = media_ready_timeout; i; i--) { + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(id), &temp); + if (rc) + return rc; + + active = FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp); + if (active) + break; + msleep(1000); + } + + if (!active) { + dev_err(&pdev->dev, + "timeout awaiting memory active after %d seconds\n", + media_ready_timeout); + return -ETIMEDOUT; + } + + return 0; +} + +/* + * Wait up to @media_ready_timeout for the device to report memory + * active. + */ +int cxl_await_media_ready(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + int rc, i, hdm_count; + u64 md_status; + u16 cap; + + rc = pci_read_config_word(pdev, + d + CXL_DVSEC_CAP_OFFSET, &cap); + if (rc) + return rc; + + hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + for (i = 0; i < hdm_count; i++) { + rc = cxl_dvsec_mem_range_valid(cxlds, i); + if (rc) + return rc; + } + + for (i = 0; i < hdm_count; i++) { + rc = cxl_dvsec_mem_range_active(cxlds, i); + if (rc) + return rc; + } + + md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); + if (!CXLMDEV_READY(md_status)) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_await_media_ready, CXL); + +static int wait_for_valid(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec, rc; + u32 val; + + /* + * Memory_Info_Valid: When set, indicates that the CXL Range 1 Size high + * and Size Low registers are valid. Must be set within 1 second of + * deassertion of reset to CXL device. Likely it is already set by the + * time this runs, but otherwise give a 1.5 second timeout in case of + * clock skew. 
+ */ + rc = pci_read_config_dword(pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &val); + if (rc) + return rc; + + if (val & CXL_DVSEC_MEM_INFO_VALID) + return 0; + + msleep(1500); + + rc = pci_read_config_dword(pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &val); + if (rc) + return rc; + + if (val & CXL_DVSEC_MEM_INFO_VALID) + return 0; + + return -ETIMEDOUT; +} + +static int cxl_set_mem_enable(struct cxl_dev_state *cxlds, u16 val) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + u16 ctrl; + int rc; + + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + if (rc < 0) + return rc; + + if ((ctrl & CXL_DVSEC_MEM_ENABLE) == val) + return 1; + ctrl &= ~CXL_DVSEC_MEM_ENABLE; + ctrl |= val; + + rc = pci_write_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, ctrl); + if (rc < 0) + return rc; + + return 0; +} + +static void clear_mem_enable(void *cxlds) +{ + cxl_set_mem_enable(cxlds, 0); +} + +static int devm_cxl_enable_mem(struct device *host, struct cxl_dev_state *cxlds) +{ + int rc; + + rc = cxl_set_mem_enable(cxlds, CXL_DVSEC_MEM_ENABLE); + if (rc < 0) + return rc; + if (rc > 0) + return 0; + return devm_add_action_or_reset(host, clear_mem_enable, cxlds); +} + +static bool range_contains(struct range *r1, struct range *r2) +{ + return r1->start <= r2->start && r1->end >= r2->end; +} + +/* require dvsec ranges to be covered by a locked platform window */ +static int dvsec_range_allowed(struct device *dev, void *arg) +{ + struct range *dev_range = arg; + struct cxl_decoder *cxld; + + if (!is_root_decoder(dev)) + return 0; + + cxld = to_cxl_decoder(dev); + + if (!(cxld->flags & CXL_DECODER_F_LOCK)) + return 0; + if (!(cxld->flags & CXL_DECODER_F_RAM)) + return 0; + + return range_contains(&cxld->hpa_range, dev_range); +} + +static void disable_hdm(void *_cxlhdm) +{ + u32 global_ctrl; + struct cxl_hdm *cxlhdm = _cxlhdm; + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + + global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET); + writel(global_ctrl & ~CXL_HDM_DECODER_ENABLE, + hdm + CXL_HDM_DECODER_CTRL_OFFSET); +} + +static int devm_cxl_enable_hdm(struct device *host, struct cxl_hdm *cxlhdm) +{ + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + u32 global_ctrl; + + global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET); + writel(global_ctrl | CXL_HDM_DECODER_ENABLE, + hdm + CXL_HDM_DECODER_CTRL_OFFSET); + + return devm_add_action_or_reset(host, disable_hdm, cxlhdm); +} + +static bool __cxl_hdm_decode_init(struct cxl_dev_state *cxlds, + struct cxl_hdm *cxlhdm, + struct cxl_endpoint_dvsec_info *info) +{ + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + struct cxl_port *port = cxlhdm->port; + struct device *dev = cxlds->dev; + struct cxl_port *root; + int i, rc, allowed; + u32 global_ctrl; + + global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET); + + /* + * If the HDM Decoder Capability is already enabled then assume + * that some other agent like platform firmware set it up. 
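+ * In that case only CXL.mem needs to be enabled at the DVSEC level before
+ * reporting success.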
+ */ + if (global_ctrl & CXL_HDM_DECODER_ENABLE) { + rc = devm_cxl_enable_mem(&port->dev, cxlds); + if (rc) + return false; + return true; + } + + root = to_cxl_port(port->dev.parent); + while (!is_cxl_root(root) && is_cxl_port(root->dev.parent)) + root = to_cxl_port(root->dev.parent); + if (!is_cxl_root(root)) { + dev_err(dev, "Failed to acquire root port for HDM enable\n"); + return false; + } + + for (i = 0, allowed = 0; info->mem_enabled && i < info->ranges; i++) { + struct device *cxld_dev; + + cxld_dev = device_find_child(&root->dev, &info->dvsec_range[i], + dvsec_range_allowed); + if (!cxld_dev) { + dev_dbg(dev, "DVSEC Range%d denied by platform\n", i); + continue; + } + dev_dbg(dev, "DVSEC Range%d allowed by platform\n", i); + put_device(cxld_dev); + allowed++; + } + + if (!allowed) { + cxl_set_mem_enable(cxlds, 0); + info->mem_enabled = 0; + } + + /* + * Per CXL 2.0 Section 8.1.3.8.3 and 8.1.3.8.4 DVSEC CXL Range 1 Base + * [High,Low] when HDM operation is enabled the range register values + * are ignored by the device, but the spec also recommends matching the + * DVSEC Range 1,2 to HDM Decoder Range 0,1. So, non-zero info->ranges + * are expected even though Linux does not require or maintain that + * match. If at least one DVSEC range is enabled and allowed, skip HDM + * Decoder Capability Enable. + */ + if (info->mem_enabled) + return false; + + rc = devm_cxl_enable_hdm(&port->dev, cxlhdm); + if (rc) + return false; + + rc = devm_cxl_enable_mem(&port->dev, cxlds); + if (rc) + return false; + + return true; +} + +/** + * cxl_hdm_decode_init() - Setup HDM decoding for the endpoint + * @cxlds: Device state + * @cxlhdm: Mapped HDM decoder Capability + * + * Try to enable the endpoint's HDM Decoder Capability + */ +int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + struct cxl_endpoint_dvsec_info info = { 0 }; + int hdm_count, rc, i, ranges = 0; + struct device *dev = &pdev->dev; + int d = cxlds->cxl_dvsec; + u16 cap, ctrl; + + if (!d) { + dev_dbg(dev, "No DVSEC Capability\n"); + return -ENXIO; + } + + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap); + if (rc) + return rc; + + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + if (rc) + return rc; + + if (!(cap & CXL_DVSEC_MEM_CAPABLE)) { + dev_dbg(dev, "Not MEM Capable\n"); + return -ENXIO; + } + + /* + * It is not allowed by spec for MEM.capable to be set and have 0 legacy + * HDM decoders (values > 2 are also undefined as of CXL 2.0). As this + * driver is for a spec defined class code which must be CXL.mem + * capable, there is no point in continuing to enable CXL.mem. + */ + hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + if (!hdm_count || hdm_count > 2) + return -EINVAL; + + rc = wait_for_valid(cxlds); + if (rc) { + dev_dbg(dev, "Failure awaiting MEM_INFO_VALID (%d)\n", rc); + return rc; + } + + /* + * The current DVSEC values are moot if the memory capability is + * disabled, and they will remain moot after the HDM Decoder + * capability is enabled. 
+ */ + info.mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl); + if (!info.mem_enabled) + goto hdm_init; + + for (i = 0; i < hdm_count; i++) { + u64 base, size; + u32 temp; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp); + if (rc) + return rc; + + size = (u64)temp << 32; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp); + if (rc) + return rc; + + size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp); + if (rc) + return rc; + + base = (u64)temp << 32; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp); + if (rc) + return rc; + + base |= temp & CXL_DVSEC_MEM_BASE_LOW_MASK; + + info.dvsec_range[i] = (struct range) { + .start = base, + .end = base + size - 1 + }; + + if (size) + ranges++; + } + + info.ranges = ranges; + + /* + * If DVSEC ranges are being used instead of HDM decoder registers there + * is no use in trying to manage those. + */ +hdm_init: + if (!__cxl_hdm_decode_init(cxlds, cxlhdm, &info)) { + dev_err(dev, + "Legacy range registers configuration prevents HDM operation.\n"); + return -EBUSY; + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, CXL); + +#define CXL_DOE_TABLE_ACCESS_REQ_CODE 0x000000ff +#define CXL_DOE_TABLE_ACCESS_REQ_CODE_READ 0 +#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE 0x0000ff00 +#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA 0 +#define CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE 0xffff0000 +#define CXL_DOE_TABLE_ACCESS_LAST_ENTRY 0xffff +#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2 + +static struct pci_doe_mb *find_cdat_doe(struct device *uport) +{ + struct cxl_memdev *cxlmd; + struct cxl_dev_state *cxlds; + unsigned long index; + void *entry; + + cxlmd = to_cxl_memdev(uport); + cxlds = cxlmd->cxlds; + + xa_for_each(&cxlds->doe_mbs, index, entry) { + struct pci_doe_mb *cur = entry; + + if (pci_doe_supports_prot(cur, PCI_DVSEC_VENDOR_ID_CXL, + CXL_DOE_PROTOCOL_TABLE_ACCESS)) + return cur; + } + + return NULL; +} + +#define CDAT_DOE_REQ(entry_handle) cpu_to_le32 \ + (FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE, \ + CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) | \ + FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE, \ + CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) | \ + FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, (entry_handle))) + +static void cxl_doe_task_complete(struct pci_doe_task *task) +{ + complete(task->private); +} + +struct cdat_doe_task { + __le32 request_pl; + __le32 response_pl[32]; + struct completion c; + struct pci_doe_task task; +}; + +#define DECLARE_CDAT_DOE_TASK(req, cdt) \ +struct cdat_doe_task cdt = { \ + .c = COMPLETION_INITIALIZER_ONSTACK(cdt.c), \ + .request_pl = req, \ + .task = { \ + .prot.vid = PCI_DVSEC_VENDOR_ID_CXL, \ + .prot.type = CXL_DOE_PROTOCOL_TABLE_ACCESS, \ + .request_pl = &cdt.request_pl, \ + .request_pl_sz = sizeof(cdt.request_pl), \ + .response_pl = cdt.response_pl, \ + .response_pl_sz = sizeof(cdt.response_pl), \ + .complete = cxl_doe_task_complete, \ + .private = &cdt.c, \ + } \ +} + +static int cxl_cdat_get_length(struct device *dev, + struct pci_doe_mb *cdat_doe, + size_t *length) +{ + DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(0), t); + int rc; + + rc = pci_doe_submit_task(cdat_doe, &t.task); + if (rc < 0) { + dev_err(dev, "DOE submit failed: %d", rc); + return rc; + } + wait_for_completion(&t.c); + if (t.task.rv < 2 * sizeof(__le32)) + return -EIO; + + *length = le32_to_cpu(t.response_pl[1]); + dev_dbg(dev, "CDAT length %zu\n", *length); + + return 0; +} + +static int 
cxl_cdat_read_table(struct device *dev, + struct pci_doe_mb *cdat_doe, + struct cxl_cdat *cdat) +{ + size_t length = cdat->length; + __le32 *data = cdat->table; + int entry_handle = 0; + + do { + DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(entry_handle), t); + struct cdat_entry_header *entry; + size_t entry_dw; + int rc; + + rc = pci_doe_submit_task(cdat_doe, &t.task); + if (rc < 0) { + dev_err(dev, "DOE submit failed: %d", rc); + return rc; + } + wait_for_completion(&t.c); + + /* 1 DW Table Access Response Header + CDAT entry */ + entry = (struct cdat_entry_header *)(t.response_pl + 1); + if ((entry_handle == 0 && + t.task.rv != sizeof(__le32) + sizeof(struct cdat_header)) || + (entry_handle > 0 && + (t.task.rv < sizeof(__le32) + sizeof(*entry) || + t.task.rv != sizeof(__le32) + le16_to_cpu(entry->length)))) + return -EIO; + + /* Get the CXL table access header entry handle */ + entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, + le32_to_cpu(t.response_pl[0])); + entry_dw = t.task.rv / sizeof(__le32); + /* Skip Header */ + entry_dw -= 1; + entry_dw = min(length / sizeof(__le32), entry_dw); + /* Prevent length < 1 DW from causing a buffer overflow */ + if (entry_dw) { + memcpy(data, entry, entry_dw * sizeof(__le32)); + length -= entry_dw * sizeof(__le32); + data += entry_dw; + } + } while (entry_handle != CXL_DOE_TABLE_ACCESS_LAST_ENTRY); + + /* Length in CDAT header may exceed concatenation of CDAT entries */ + cdat->length -= length; + + return 0; +} + +/** + * read_cdat_data - Read the CDAT data on this port + * @port: Port to read data from + * + * This call will sleep waiting for responses from the DOE mailbox. + */ +void read_cdat_data(struct cxl_port *port) +{ + struct pci_doe_mb *cdat_doe; + struct device *dev = &port->dev; + struct device *uport = port->uport; + size_t cdat_length; + int rc; + + cdat_doe = find_cdat_doe(uport); + if (!cdat_doe) { + dev_dbg(dev, "No CDAT mailbox\n"); + return; + } + + port->cdat_available = true; + + if (cxl_cdat_get_length(dev, cdat_doe, &cdat_length)) { + dev_dbg(dev, "No CDAT length\n"); + return; + } + + port->cdat.table = devm_kzalloc(dev, cdat_length, GFP_KERNEL); + if (!port->cdat.table) + return; + + port->cdat.length = cdat_length; + rc = cxl_cdat_read_table(dev, cdat_doe, &port->cdat); + if (rc) { + /* Don't leave table data allocated on error */ + devm_kfree(dev, port->cdat.table); + port->cdat.table = NULL; + port->cdat.length = 0; + dev_err(dev, "CDAT data read error\n"); + } +} +EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c new file mode 100644 index 000000000..36aa5070d --- /dev/null +++ b/drivers/cxl/core/pmem.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. */ +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <cxlmem.h> +#include <cxl.h> +#include "core.h" + +/** + * DOC: cxl pmem + * + * The core CXL PMEM infrastructure supports persistent memory + * provisioning and serves as a bridge to the LIBNVDIMM subsystem. A CXL + * 'bridge' device is added at the root of a CXL device topology if + * platform firmware advertises at least one persistent memory capable + * CXL window. That root-level bridge corresponds to a LIBNVDIMM 'bus' + * device. Then for each cxl_memdev in the CXL device topology a bridge + * device is added to host a LIBNVDIMM dimm object. 
When these bridges
+ * are registered, native LIBNVDIMM uapis are translated to CXL
+ * operations, for example, namespace label access commands.
+ */
+
+static DEFINE_IDA(cxl_nvdimm_bridge_ida);
+
+static void cxl_nvdimm_bridge_release(struct device *dev)
+{
+ struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev);
+
+ ida_free(&cxl_nvdimm_bridge_ida, cxl_nvb->id);
+ kfree(cxl_nvb);
+}
+
+static const struct attribute_group *cxl_nvdimm_bridge_attribute_groups[] = {
+ &cxl_base_attribute_group,
+ NULL,
+};
+
+const struct device_type cxl_nvdimm_bridge_type = {
+ .name = "cxl_nvdimm_bridge",
+ .release = cxl_nvdimm_bridge_release,
+ .groups = cxl_nvdimm_bridge_attribute_groups,
+};
+
+struct cxl_nvdimm_bridge *to_cxl_nvdimm_bridge(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, dev->type != &cxl_nvdimm_bridge_type,
+ "not a cxl_nvdimm_bridge device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_nvdimm_bridge, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm_bridge, CXL);
+
+bool is_cxl_nvdimm_bridge(struct device *dev)
+{
+ return dev->type == &cxl_nvdimm_bridge_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_nvdimm_bridge, CXL);
+
+static int match_nvdimm_bridge(struct device *dev, void *data)
+{
+ return is_cxl_nvdimm_bridge(dev);
+}
+
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *start)
+{
+ struct cxl_port *port = find_cxl_root(start);
+ struct device *dev;
+
+ if (!port)
+ return NULL;
+
+ dev = device_find_child(&port->dev, NULL, match_nvdimm_bridge);
+ put_device(&port->dev);
+
+ if (!dev)
+ return NULL;
+
+ return to_cxl_nvdimm_bridge(dev);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_find_nvdimm_bridge, CXL);
+
+static struct lock_class_key cxl_nvdimm_bridge_key;
+
+static struct cxl_nvdimm_bridge *cxl_nvdimm_bridge_alloc(struct cxl_port *port)
+{
+ struct cxl_nvdimm_bridge *cxl_nvb;
+ struct device *dev;
+ int rc;
+
+ cxl_nvb = kzalloc(sizeof(*cxl_nvb), GFP_KERNEL);
+ if (!cxl_nvb)
+ return ERR_PTR(-ENOMEM);
+
+ rc = ida_alloc(&cxl_nvdimm_bridge_ida, GFP_KERNEL);
+ if (rc < 0)
+ goto err;
+ cxl_nvb->id = rc;
+
+ dev = &cxl_nvb->dev;
+ cxl_nvb->port = port;
+ cxl_nvb->state = CXL_NVB_NEW;
+ device_initialize(dev);
+ lockdep_set_class(&dev->mutex, &cxl_nvdimm_bridge_key);
+ device_set_pm_not_required(dev);
+ dev->parent = &port->dev;
+ dev->bus = &cxl_bus_type;
+ dev->type = &cxl_nvdimm_bridge_type;
+
+ return cxl_nvb;
+
+err:
+ kfree(cxl_nvb);
+ return ERR_PTR(rc);
+}
+
+static void unregister_nvb(void *_cxl_nvb)
+{
+ struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb;
+ bool flush;
+
+ /*
+ * If the bridge was ever activated then there might be in-flight state
+ * work to flush. Once the state has been changed to 'dead' then no new
+ * work can be queued by user-triggered bind.
+ */
+ device_lock(&cxl_nvb->dev);
+ flush = cxl_nvb->state != CXL_NVB_NEW;
+ cxl_nvb->state = CXL_NVB_DEAD;
+ device_unlock(&cxl_nvb->dev);
+
+ /*
+ * Even though the device core will trigger device_release_driver()
+ * before the unregister, it does not know about the fact that
+ * cxl_nvdimm_bridge_driver defers ->remove() work. So, do the driver
+ * release now and flush it before tearing down the nvdimm device
+ * hierarchy.
+ */ + device_release_driver(&cxl_nvb->dev); + if (flush) + flush_work(&cxl_nvb->state_work); + device_unregister(&cxl_nvb->dev); +} + +/** + * devm_cxl_add_nvdimm_bridge() - add the root of a LIBNVDIMM topology + * @host: platform firmware root device + * @port: CXL port at the root of a CXL topology + * + * Return: bridge device that can host cxl_nvdimm objects + */ +struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host, + struct cxl_port *port) +{ + struct cxl_nvdimm_bridge *cxl_nvb; + struct device *dev; + int rc; + + if (!IS_ENABLED(CONFIG_CXL_PMEM)) + return ERR_PTR(-ENXIO); + + cxl_nvb = cxl_nvdimm_bridge_alloc(port); + if (IS_ERR(cxl_nvb)) + return cxl_nvb; + + dev = &cxl_nvb->dev; + rc = dev_set_name(dev, "nvdimm-bridge%d", cxl_nvb->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + rc = devm_add_action_or_reset(host, unregister_nvb, cxl_nvb); + if (rc) + return ERR_PTR(rc); + + return cxl_nvb; + +err: + put_device(dev); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm_bridge, CXL); + +static void cxl_nvdimm_release(struct device *dev) +{ + struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev); + + xa_destroy(&cxl_nvd->pmem_regions); + kfree(cxl_nvd); +} + +static const struct attribute_group *cxl_nvdimm_attribute_groups[] = { + &cxl_base_attribute_group, + NULL, +}; + +const struct device_type cxl_nvdimm_type = { + .name = "cxl_nvdimm", + .release = cxl_nvdimm_release, + .groups = cxl_nvdimm_attribute_groups, +}; + +bool is_cxl_nvdimm(struct device *dev) +{ + return dev->type == &cxl_nvdimm_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_nvdimm, CXL); + +struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_cxl_nvdimm(dev), + "not a cxl_nvdimm device\n")) + return NULL; + return container_of(dev, struct cxl_nvdimm, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm, CXL); + +static struct lock_class_key cxl_nvdimm_key; + +static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_memdev *cxlmd) +{ + struct cxl_nvdimm *cxl_nvd; + struct device *dev; + + cxl_nvd = kzalloc(sizeof(*cxl_nvd), GFP_KERNEL); + if (!cxl_nvd) + return ERR_PTR(-ENOMEM); + + dev = &cxl_nvd->dev; + cxl_nvd->cxlmd = cxlmd; + xa_init(&cxl_nvd->pmem_regions); + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_nvdimm_key); + device_set_pm_not_required(dev); + dev->parent = &cxlmd->dev; + dev->bus = &cxl_bus_type; + dev->type = &cxl_nvdimm_type; + + return cxl_nvd; +} + +static void cxl_nvd_unregister(void *dev) +{ + device_unregister(dev); +} + +/** + * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm + * @host: same host as @cxlmd + * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations + * + * Return: 0 on success negative error code on failure. 
+ */ +int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd) +{ + struct cxl_nvdimm *cxl_nvd; + struct device *dev; + int rc; + + cxl_nvd = cxl_nvdimm_alloc(cxlmd); + if (IS_ERR(cxl_nvd)) + return PTR_ERR(cxl_nvd); + + dev = &cxl_nvd->dev; + rc = dev_set_name(dev, "pmem%d", cxlmd->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + dev_dbg(host, "%s: register %s\n", dev_name(dev->parent), + dev_name(dev)); + + return devm_add_action_or_reset(host, cxl_nvd_unregister, dev); + +err: + put_device(dev); + return rc; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm, CXL); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c new file mode 100644 index 000000000..1f1483a9e --- /dev/null +++ b/drivers/cxl/core/port.c @@ -0,0 +1,1894 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. All rights reserved. */ +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/memregion.h> +#include <linux/workqueue.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <cxlmem.h> +#include <cxlpci.h> +#include <cxl.h> +#include "core.h" + +/** + * DOC: cxl core + * + * The CXL core provides a set of interfaces that can be consumed by CXL aware + * drivers. The interfaces allow for creation, modification, and destruction of + * regions, memory devices, ports, and decoders. CXL aware drivers must register + * with the CXL core via these interfaces in order to be able to participate in + * cross-device interleave coordination. The CXL core also establishes and + * maintains the bridge to the nvdimm subsystem. + * + * CXL core introduces sysfs hierarchy to control the devices that are + * instantiated by the core. 
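+ * For example, the port objects registered below appear under
+ * /sys/bus/cxl/devices with names such as root0, port1, and endpoint2,
+ * alongside the decoder and memdev objects that attach to them.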
+ */ + +static DEFINE_IDA(cxl_port_ida); +static DEFINE_XARRAY(cxl_root_buses); + +static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%s\n", dev->type->name); +} +static DEVICE_ATTR_RO(devtype); + +static int cxl_device_id(struct device *dev) +{ + if (dev->type == &cxl_nvdimm_bridge_type) + return CXL_DEVICE_NVDIMM_BRIDGE; + if (dev->type == &cxl_nvdimm_type) + return CXL_DEVICE_NVDIMM; + if (dev->type == CXL_PMEM_REGION_TYPE()) + return CXL_DEVICE_PMEM_REGION; + if (is_cxl_port(dev)) { + if (is_cxl_root(to_cxl_port(dev))) + return CXL_DEVICE_ROOT; + return CXL_DEVICE_PORT; + } + if (is_cxl_memdev(dev)) + return CXL_DEVICE_MEMORY_EXPANDER; + if (dev->type == CXL_REGION_TYPE()) + return CXL_DEVICE_REGION; + return 0; +} + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, CXL_MODALIAS_FMT "\n", cxl_device_id(dev)); +} +static DEVICE_ATTR_RO(modalias); + +static struct attribute *cxl_base_attributes[] = { + &dev_attr_devtype.attr, + &dev_attr_modalias.attr, + NULL, +}; + +struct attribute_group cxl_base_attribute_group = { + .attrs = cxl_base_attributes, +}; + +static ssize_t start_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%#llx\n", cxld->hpa_range.start); +} +static DEVICE_ATTR_ADMIN_RO(start); + +static ssize_t size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%#llx\n", range_len(&cxld->hpa_range)); +} +static DEVICE_ATTR_RO(size); + +#define CXL_DECODER_FLAG_ATTR(name, flag) \ +static ssize_t name##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct cxl_decoder *cxld = to_cxl_decoder(dev); \ + \ + return sysfs_emit(buf, "%s\n", \ + (cxld->flags & (flag)) ? "1" : "0"); \ +} \ +static DEVICE_ATTR_RO(name) + +CXL_DECODER_FLAG_ATTR(cap_pmem, CXL_DECODER_F_PMEM); +CXL_DECODER_FLAG_ATTR(cap_ram, CXL_DECODER_F_RAM); +CXL_DECODER_FLAG_ATTR(cap_type2, CXL_DECODER_F_TYPE2); +CXL_DECODER_FLAG_ATTR(cap_type3, CXL_DECODER_F_TYPE3); +CXL_DECODER_FLAG_ATTR(locked, CXL_DECODER_F_LOCK); + +static ssize_t target_type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + switch (cxld->target_type) { + case CXL_DECODER_ACCELERATOR: + return sysfs_emit(buf, "accelerator\n"); + case CXL_DECODER_EXPANDER: + return sysfs_emit(buf, "expander\n"); + } + return -ENXIO; +} +static DEVICE_ATTR_RO(target_type); + +static ssize_t emit_target_list(struct cxl_switch_decoder *cxlsd, char *buf) +{ + struct cxl_decoder *cxld = &cxlsd->cxld; + ssize_t offset = 0; + int i, rc = 0; + + for (i = 0; i < cxld->interleave_ways; i++) { + struct cxl_dport *dport = cxlsd->target[i]; + struct cxl_dport *next = NULL; + + if (!dport) + break; + + if (i + 1 < cxld->interleave_ways) + next = cxlsd->target[i + 1]; + rc = sysfs_emit_at(buf, offset, "%d%s", dport->port_id, + next ? 
"," : ""); + if (rc < 0) + return rc; + offset += rc; + } + + return offset; +} + +static ssize_t target_list_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); + ssize_t offset; + unsigned int seq; + int rc; + + do { + seq = read_seqbegin(&cxlsd->target_lock); + rc = emit_target_list(cxlsd, buf); + } while (read_seqretry(&cxlsd->target_lock, seq)); + + if (rc < 0) + return rc; + offset = rc; + + rc = sysfs_emit_at(buf, offset, "\n"); + if (rc < 0) + return rc; + + return offset + rc; +} +static DEVICE_ATTR_RO(target_list); + +static ssize_t mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + + switch (cxled->mode) { + case CXL_DECODER_RAM: + return sysfs_emit(buf, "ram\n"); + case CXL_DECODER_PMEM: + return sysfs_emit(buf, "pmem\n"); + case CXL_DECODER_NONE: + return sysfs_emit(buf, "none\n"); + case CXL_DECODER_MIXED: + default: + return sysfs_emit(buf, "mixed\n"); + } +} + +static ssize_t mode_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + enum cxl_decoder_mode mode; + ssize_t rc; + + if (sysfs_streq(buf, "pmem")) + mode = CXL_DECODER_PMEM; + else if (sysfs_streq(buf, "ram")) + mode = CXL_DECODER_RAM; + else + return -EINVAL; + + rc = cxl_dpa_set_mode(cxled, mode); + if (rc) + return rc; + + return len; +} +static DEVICE_ATTR_RW(mode); + +static ssize_t dpa_resource_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + u64 base = cxl_dpa_resource_start(cxled); + + return sysfs_emit(buf, "%#llx\n", base); +} +static DEVICE_ATTR_RO(dpa_resource); + +static ssize_t dpa_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + resource_size_t size = cxl_dpa_size(cxled); + + return sysfs_emit(buf, "%pa\n", &size); +} + +static ssize_t dpa_size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + unsigned long long size; + ssize_t rc; + + rc = kstrtoull(buf, 0, &size); + if (rc) + return rc; + + if (!IS_ALIGNED(size, SZ_256M)) + return -EINVAL; + + rc = cxl_dpa_free(cxled); + if (rc) + return rc; + + if (size == 0) + return len; + + rc = cxl_dpa_alloc(cxled, size); + if (rc) + return rc; + + return len; +} +static DEVICE_ATTR_RW(dpa_size); + +static ssize_t interleave_granularity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%d\n", cxld->interleave_granularity); +} + +static DEVICE_ATTR_RO(interleave_granularity); + +static ssize_t interleave_ways_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%d\n", cxld->interleave_ways); +} + +static DEVICE_ATTR_RO(interleave_ways); + +static struct attribute *cxl_decoder_base_attrs[] = { + &dev_attr_start.attr, + &dev_attr_size.attr, + &dev_attr_locked.attr, + &dev_attr_interleave_granularity.attr, + &dev_attr_interleave_ways.attr, + NULL, +}; + +static struct attribute_group cxl_decoder_base_attribute_group = { + .attrs = cxl_decoder_base_attrs, +}; + +static struct attribute 
*cxl_decoder_root_attrs[] = { + &dev_attr_cap_pmem.attr, + &dev_attr_cap_ram.attr, + &dev_attr_cap_type2.attr, + &dev_attr_cap_type3.attr, + &dev_attr_target_list.attr, + SET_CXL_REGION_ATTR(create_pmem_region) + SET_CXL_REGION_ATTR(delete_region) + NULL, +}; + +static bool can_create_pmem(struct cxl_root_decoder *cxlrd) +{ + unsigned long flags = CXL_DECODER_F_TYPE3 | CXL_DECODER_F_PMEM; + + return (cxlrd->cxlsd.cxld.flags & flags) == flags; +} + +static umode_t cxl_root_decoder_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + if (a == CXL_REGION_ATTR(create_pmem_region) && !can_create_pmem(cxlrd)) + return 0; + + if (a == CXL_REGION_ATTR(delete_region) && !can_create_pmem(cxlrd)) + return 0; + + return a->mode; +} + +static struct attribute_group cxl_decoder_root_attribute_group = { + .attrs = cxl_decoder_root_attrs, + .is_visible = cxl_root_decoder_visible, +}; + +static const struct attribute_group *cxl_decoder_root_attribute_groups[] = { + &cxl_decoder_root_attribute_group, + &cxl_decoder_base_attribute_group, + &cxl_base_attribute_group, + NULL, +}; + +static struct attribute *cxl_decoder_switch_attrs[] = { + &dev_attr_target_type.attr, + &dev_attr_target_list.attr, + SET_CXL_REGION_ATTR(region) + NULL, +}; + +static struct attribute_group cxl_decoder_switch_attribute_group = { + .attrs = cxl_decoder_switch_attrs, +}; + +static const struct attribute_group *cxl_decoder_switch_attribute_groups[] = { + &cxl_decoder_switch_attribute_group, + &cxl_decoder_base_attribute_group, + &cxl_base_attribute_group, + NULL, +}; + +static struct attribute *cxl_decoder_endpoint_attrs[] = { + &dev_attr_target_type.attr, + &dev_attr_mode.attr, + &dev_attr_dpa_size.attr, + &dev_attr_dpa_resource.attr, + SET_CXL_REGION_ATTR(region) + NULL, +}; + +static struct attribute_group cxl_decoder_endpoint_attribute_group = { + .attrs = cxl_decoder_endpoint_attrs, +}; + +static const struct attribute_group *cxl_decoder_endpoint_attribute_groups[] = { + &cxl_decoder_base_attribute_group, + &cxl_decoder_endpoint_attribute_group, + &cxl_base_attribute_group, + NULL, +}; + +static void __cxl_decoder_release(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + + ida_free(&port->decoder_ida, cxld->id); + put_device(&port->dev); +} + +static void cxl_endpoint_decoder_release(struct device *dev) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + + __cxl_decoder_release(&cxled->cxld); + kfree(cxled); +} + +static void cxl_switch_decoder_release(struct device *dev) +{ + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); + + __cxl_decoder_release(&cxlsd->cxld); + kfree(cxlsd); +} + +struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_root_decoder(dev), + "not a cxl_root_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_root_decoder, cxlsd.cxld.dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_root_decoder, CXL); + +static void cxl_root_decoder_release(struct device *dev) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + if (atomic_read(&cxlrd->region_id) >= 0) + memregion_free(atomic_read(&cxlrd->region_id)); + __cxl_decoder_release(&cxlrd->cxlsd.cxld); + kfree(cxlrd); +} + +static const struct device_type cxl_decoder_endpoint_type = { + .name = "cxl_decoder_endpoint", + .release = cxl_endpoint_decoder_release, + .groups = cxl_decoder_endpoint_attribute_groups, +}; + 
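+/*
+ * These decoder device_types are what is_endpoint_decoder(),
+ * is_switch_decoder(), and is_root_decoder() below key off of to classify a
+ * decoder before converting it with container_of().
+ */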
+static const struct device_type cxl_decoder_switch_type = { + .name = "cxl_decoder_switch", + .release = cxl_switch_decoder_release, + .groups = cxl_decoder_switch_attribute_groups, +}; + +static const struct device_type cxl_decoder_root_type = { + .name = "cxl_decoder_root", + .release = cxl_root_decoder_release, + .groups = cxl_decoder_root_attribute_groups, +}; + +bool is_endpoint_decoder(struct device *dev) +{ + return dev->type == &cxl_decoder_endpoint_type; +} + +bool is_root_decoder(struct device *dev) +{ + return dev->type == &cxl_decoder_root_type; +} +EXPORT_SYMBOL_NS_GPL(is_root_decoder, CXL); + +bool is_switch_decoder(struct device *dev) +{ + return is_root_decoder(dev) || dev->type == &cxl_decoder_switch_type; +} +EXPORT_SYMBOL_NS_GPL(is_switch_decoder, CXL); + +struct cxl_decoder *to_cxl_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, + !is_switch_decoder(dev) && !is_endpoint_decoder(dev), + "not a cxl_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_decoder, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_decoder, CXL); + +struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_endpoint_decoder(dev), + "not a cxl_endpoint_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_endpoint_decoder, cxld.dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_endpoint_decoder, CXL); + +struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_switch_decoder(dev), + "not a cxl_switch_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_switch_decoder, cxld.dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_switch_decoder, CXL); + +static void cxl_ep_release(struct cxl_ep *ep) +{ + put_device(ep->ep); + kfree(ep); +} + +static void cxl_ep_remove(struct cxl_port *port, struct cxl_ep *ep) +{ + if (!ep) + return; + xa_erase(&port->endpoints, (unsigned long) ep->ep); + cxl_ep_release(ep); +} + +static void cxl_port_release(struct device *dev) +{ + struct cxl_port *port = to_cxl_port(dev); + unsigned long index; + struct cxl_ep *ep; + + xa_for_each(&port->endpoints, index, ep) + cxl_ep_remove(port, ep); + xa_destroy(&port->endpoints); + xa_destroy(&port->dports); + xa_destroy(&port->regions); + ida_free(&cxl_port_ida, port->id); + kfree(port); +} + +static const struct attribute_group *cxl_port_attribute_groups[] = { + &cxl_base_attribute_group, + NULL, +}; + +static const struct device_type cxl_port_type = { + .name = "cxl_port", + .release = cxl_port_release, + .groups = cxl_port_attribute_groups, +}; + +bool is_cxl_port(struct device *dev) +{ + return dev->type == &cxl_port_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_port, CXL); + +struct cxl_port *to_cxl_port(struct device *dev) +{ + if (dev_WARN_ONCE(dev, dev->type != &cxl_port_type, + "not a cxl_port device\n")) + return NULL; + return container_of(dev, struct cxl_port, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_port, CXL); + +static void unregister_port(void *_port) +{ + struct cxl_port *port = _port; + struct cxl_port *parent; + struct device *lock_dev; + + if (is_cxl_root(port)) + parent = NULL; + else + parent = to_cxl_port(port->dev.parent); + + /* + * CXL root ports and the first level of ports are unregistered + * under the platform firmware device lock; all other ports are + * unregistered while holding their parent port lock.
+ */ + if (!parent) + lock_dev = port->uport; + else if (is_cxl_root(parent)) + lock_dev = parent->uport; + else + lock_dev = &parent->dev; + + device_lock_assert(lock_dev); + port->dead = true; + device_unregister(&port->dev); +} + +static void cxl_unlink_uport(void *_port) +{ + struct cxl_port *port = _port; + + sysfs_remove_link(&port->dev.kobj, "uport"); +} + +static int devm_cxl_link_uport(struct device *host, struct cxl_port *port) +{ + int rc; + + rc = sysfs_create_link(&port->dev.kobj, &port->uport->kobj, "uport"); + if (rc) + return rc; + return devm_add_action_or_reset(host, cxl_unlink_uport, port); +} + +static struct lock_class_key cxl_port_key; + +static struct cxl_port *cxl_port_alloc(struct device *uport, + resource_size_t component_reg_phys, + struct cxl_dport *parent_dport) +{ + struct cxl_port *port; + struct device *dev; + int rc; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + rc = ida_alloc(&cxl_port_ida, GFP_KERNEL); + if (rc < 0) + goto err; + port->id = rc; + port->uport = uport; + + /* + * The top-level cxl_port "cxl_root" does not have a cxl_port as + * its parent and it does not have any corresponding component + * registers as its decode is described by a fixed platform + * description. + */ + dev = &port->dev; + if (parent_dport) { + struct cxl_port *parent_port = parent_dport->port; + struct cxl_port *iter; + + dev->parent = &parent_port->dev; + port->depth = parent_port->depth + 1; + port->parent_dport = parent_dport; + + /* + * walk to the host bridge, or the first ancestor that knows + * the host bridge + */ + iter = port; + while (!iter->host_bridge && + !is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + if (iter->host_bridge) + port->host_bridge = iter->host_bridge; + else + port->host_bridge = iter->uport; + dev_dbg(uport, "host-bridge: %s\n", dev_name(port->host_bridge)); + } else + dev->parent = uport; + + port->component_reg_phys = component_reg_phys; + ida_init(&port->decoder_ida); + port->hdm_end = -1; + port->commit_end = -1; + xa_init(&port->dports); + xa_init(&port->endpoints); + xa_init(&port->regions); + + device_initialize(dev); + lockdep_set_class_and_subclass(&dev->mutex, &cxl_port_key, port->depth); + device_set_pm_not_required(dev); + dev->bus = &cxl_bus_type; + dev->type = &cxl_port_type; + + return port; + +err: + kfree(port); + return ERR_PTR(rc); +} + +static struct cxl_port *__devm_cxl_add_port(struct device *host, + struct device *uport, + resource_size_t component_reg_phys, + struct cxl_dport *parent_dport) +{ + struct cxl_port *port; + struct device *dev; + int rc; + + port = cxl_port_alloc(uport, component_reg_phys, parent_dport); + if (IS_ERR(port)) + return port; + + dev = &port->dev; + if (is_cxl_memdev(uport)) + rc = dev_set_name(dev, "endpoint%d", port->id); + else if (parent_dport) + rc = dev_set_name(dev, "port%d", port->id); + else + rc = dev_set_name(dev, "root%d", port->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + rc = devm_add_action_or_reset(host, unregister_port, port); + if (rc) + return ERR_PTR(rc); + + rc = devm_cxl_link_uport(host, port); + if (rc) + return ERR_PTR(rc); + + return port; + +err: + put_device(dev); + return ERR_PTR(rc); +} + +/** + * devm_cxl_add_port - register a cxl_port in CXL memory decode hierarchy + * @host: host device for devm operations + * @uport: "physical" device implementing this upstream port + * @component_reg_phys: (optional) for configurable cxl_port instances + * 
@parent_dport: next hop up in the CXL memory decode hierarchy + */ +struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, + resource_size_t component_reg_phys, + struct cxl_dport *parent_dport) +{ + struct cxl_port *port, *parent_port; + + port = __devm_cxl_add_port(host, uport, component_reg_phys, + parent_dport); + + parent_port = parent_dport ? parent_dport->port : NULL; + if (IS_ERR(port)) { + dev_dbg(uport, "Failed to add%s%s%s: %ld\n", + parent_port ? " port to " : "", + parent_port ? dev_name(&parent_port->dev) : "", + parent_port ? "" : " root port", + PTR_ERR(port)); + } else { + dev_dbg(uport, "%s added%s%s%s\n", + dev_name(&port->dev), + parent_port ? " to " : "", + parent_port ? dev_name(&parent_port->dev) : "", + parent_port ? "" : " (root port)"); + } + + return port; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_port, CXL); + +struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port) +{ + /* There is no pci_bus associated with a CXL platform-root port */ + if (is_cxl_root(port)) + return NULL; + + if (dev_is_pci(port->uport)) { + struct pci_dev *pdev = to_pci_dev(port->uport); + + return pdev->subordinate; + } + + return xa_load(&cxl_root_buses, (unsigned long)port->uport); +} +EXPORT_SYMBOL_NS_GPL(cxl_port_to_pci_bus, CXL); + +static void unregister_pci_bus(void *uport) +{ + xa_erase(&cxl_root_buses, (unsigned long)uport); +} + +int devm_cxl_register_pci_bus(struct device *host, struct device *uport, + struct pci_bus *bus) +{ + int rc; + + if (dev_is_pci(uport)) + return -EINVAL; + + rc = xa_insert(&cxl_root_buses, (unsigned long)uport, bus, GFP_KERNEL); + if (rc) + return rc; + return devm_add_action_or_reset(host, unregister_pci_bus, uport); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_register_pci_bus, CXL); + +static bool dev_is_cxl_root_child(struct device *dev) +{ + struct cxl_port *port, *parent; + + if (!is_cxl_port(dev)) + return false; + + port = to_cxl_port(dev); + if (is_cxl_root(port)) + return false; + + parent = to_cxl_port(port->dev.parent); + if (is_cxl_root(parent)) + return true; + + return false; +} + +/* Find a 2nd level CXL port that has a dport that is an ancestor of @match */ +static int match_root_child(struct device *dev, const void *match) +{ + const struct device *iter = NULL; + struct cxl_dport *dport; + struct cxl_port *port; + + if (!dev_is_cxl_root_child(dev)) + return 0; + + port = to_cxl_port(dev); + iter = match; + while (iter) { + dport = cxl_find_dport_by_dev(port, iter); + if (dport) + break; + iter = iter->parent; + } + + return !!iter; +} + +struct cxl_port *find_cxl_root(struct device *dev) +{ + struct device *port_dev; + struct cxl_port *root; + + port_dev = bus_find_device(&cxl_bus_type, NULL, dev, match_root_child); + if (!port_dev) + return NULL; + + root = to_cxl_port(port_dev->parent); + get_device(&root->dev); + put_device(port_dev); + return root; +} +EXPORT_SYMBOL_NS_GPL(find_cxl_root, CXL); + +static struct cxl_dport *find_dport(struct cxl_port *port, int id) +{ + struct cxl_dport *dport; + unsigned long index; + + device_lock_assert(&port->dev); + xa_for_each(&port->dports, index, dport) + if (dport->port_id == id) + return dport; + return NULL; +} + +static int add_dport(struct cxl_port *port, struct cxl_dport *new) +{ + struct cxl_dport *dup; + int rc; + + device_lock_assert(&port->dev); + dup = find_dport(port, new->port_id); + if (dup) { + dev_err(&port->dev, + "unable to add dport%d-%s non-unique port id (%s)\n", + new->port_id, dev_name(new->dport), + dev_name(dup->dport)); + return -EBUSY; + } + + rc = 
xa_insert(&port->dports, (unsigned long)new->dport, new, + GFP_KERNEL); + if (rc) + return rc; + + port->nr_dports++; + return 0; +} + +/* + * Since root-level CXL dports cannot be enumerated by PCI they are not + * enumerated by the common port driver that acquires the port lock over + * dport add/remove. Instead, root dports are manually added by a + * platform driver and cond_cxl_root_lock() is used to take the missing + * port lock in that case. + */ +static void cond_cxl_root_lock(struct cxl_port *port) +{ + if (is_cxl_root(port)) + device_lock(&port->dev); +} + +static void cond_cxl_root_unlock(struct cxl_port *port) +{ + if (is_cxl_root(port)) + device_unlock(&port->dev); +} + +static void cxl_dport_remove(void *data) +{ + struct cxl_dport *dport = data; + struct cxl_port *port = dport->port; + + xa_erase(&port->dports, (unsigned long) dport->dport); + put_device(dport->dport); +} + +static void cxl_dport_unlink(void *data) +{ + struct cxl_dport *dport = data; + struct cxl_port *port = dport->port; + char link_name[CXL_TARGET_STRLEN]; + + sprintf(link_name, "dport%d", dport->port_id); + sysfs_remove_link(&port->dev.kobj, link_name); +} + +/** + * devm_cxl_add_dport - append downstream port data to a cxl_port + * @port: the cxl_port that references this dport + * @dport_dev: firmware or PCI device representing the dport + * @port_id: identifier for this dport in a decoder's target list + * @component_reg_phys: optional location of CXL component registers + * + * Note that dports are appended to the devm release actions of + * either the port's host (for root ports) or the port itself (for + * switch ports) + */ +struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port, + struct device *dport_dev, int port_id, + resource_size_t component_reg_phys) +{ + char link_name[CXL_TARGET_STRLEN]; + struct cxl_dport *dport; + struct device *host; + int rc; + + if (is_cxl_root(port)) + host = port->uport; + else + host = &port->dev; + + if (!host->driver) { + dev_WARN_ONCE(&port->dev, 1, "dport:%s bad devm context\n", + dev_name(dport_dev)); + return ERR_PTR(-ENXIO); + } + + if (snprintf(link_name, CXL_TARGET_STRLEN, "dport%d", port_id) >= + CXL_TARGET_STRLEN) + return ERR_PTR(-EINVAL); + + dport = devm_kzalloc(host, sizeof(*dport), GFP_KERNEL); + if (!dport) + return ERR_PTR(-ENOMEM); + + dport->dport = dport_dev; + dport->port_id = port_id; + dport->component_reg_phys = component_reg_phys; + dport->port = port; + + cond_cxl_root_lock(port); + rc = add_dport(port, dport); + cond_cxl_root_unlock(port); + if (rc) + return ERR_PTR(rc); + + get_device(dport_dev); + rc = devm_add_action_or_reset(host, cxl_dport_remove, dport); + if (rc) + return ERR_PTR(rc); + + rc = sysfs_create_link(&port->dev.kobj, &dport_dev->kobj, link_name); + if (rc) + return ERR_PTR(rc); + + rc = devm_add_action_or_reset(host, cxl_dport_unlink, dport); + if (rc) + return ERR_PTR(rc); + + return dport; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport, CXL); + +static int add_ep(struct cxl_ep *new) +{ + struct cxl_port *port = new->dport->port; + int rc; + + device_lock(&port->dev); + if (port->dead) { + device_unlock(&port->dev); + return -ENXIO; + } + rc = xa_insert(&port->endpoints, (unsigned long)new->ep, new, + GFP_KERNEL); + device_unlock(&port->dev); + + return rc; +} + +/** + * cxl_add_ep - register an endpoint's interest in a port + * @dport: the dport that routes to @ep_dev + * @ep_dev: device representing the endpoint + * + * Intermediate CXL ports are scanned based on the arrival of endpoints.
+ * When those endpoints depart the port can be destroyed once all + * endpoints that care about that port have been removed. + */ +static int cxl_add_ep(struct cxl_dport *dport, struct device *ep_dev) +{ + struct cxl_ep *ep; + int rc; + + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (!ep) + return -ENOMEM; + + ep->ep = get_device(ep_dev); + ep->dport = dport; + + rc = add_ep(ep); + if (rc) + cxl_ep_release(ep); + return rc; +} + +struct cxl_find_port_ctx { + const struct device *dport_dev; + const struct cxl_port *parent_port; + struct cxl_dport **dport; +}; + +static int match_port_by_dport(struct device *dev, const void *data) +{ + const struct cxl_find_port_ctx *ctx = data; + struct cxl_dport *dport; + struct cxl_port *port; + + if (!is_cxl_port(dev)) + return 0; + if (ctx->parent_port && dev->parent != &ctx->parent_port->dev) + return 0; + + port = to_cxl_port(dev); + dport = cxl_find_dport_by_dev(port, ctx->dport_dev); + if (ctx->dport) + *ctx->dport = dport; + return dport != NULL; +} + +static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx) +{ + struct device *dev; + + if (!ctx->dport_dev) + return NULL; + + dev = bus_find_device(&cxl_bus_type, NULL, ctx, match_port_by_dport); + if (dev) + return to_cxl_port(dev); + return NULL; +} + +static struct cxl_port *find_cxl_port(struct device *dport_dev, + struct cxl_dport **dport) +{ + struct cxl_find_port_ctx ctx = { + .dport_dev = dport_dev, + .dport = dport, + }; + struct cxl_port *port; + + port = __find_cxl_port(&ctx); + return port; +} + +static struct cxl_port *find_cxl_port_at(struct cxl_port *parent_port, + struct device *dport_dev, + struct cxl_dport **dport) +{ + struct cxl_find_port_ctx ctx = { + .dport_dev = dport_dev, + .parent_port = parent_port, + .dport = dport, + }; + struct cxl_port *port; + + port = __find_cxl_port(&ctx); + return port; +} + +/* + * All users of grandparent() are using it to walk PCIe-like switch port + * hierarchy. A PCIe switch is comprised of a bridge device representing the + * upstream switch port and N bridges representing downstream switch ports. When + * bridges stack, the grand-parent of a downstream switch port is another + * downstream switch port in the immediate ancestor switch. + */ +static struct device *grandparent(struct device *dev) +{ + if (dev && dev->parent) + return dev->parent->parent; + return NULL; +} + +static void delete_endpoint(void *data) +{ + struct cxl_memdev *cxlmd = data; + struct cxl_port *endpoint = dev_get_drvdata(&cxlmd->dev); + struct cxl_port *parent_port; + struct device *parent; + + parent_port = cxl_mem_find_port(cxlmd, NULL); + if (!parent_port) + goto out; + parent = &parent_port->dev; + + device_lock(parent); + if (parent->driver && !endpoint->dead) { + devm_release_action(parent, cxl_unlink_uport, endpoint); + devm_release_action(parent, unregister_port, endpoint); + } + device_unlock(parent); + put_device(parent); +out: + put_device(&endpoint->dev); +} + +int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint) +{ + struct device *dev = &cxlmd->dev; + + get_device(&endpoint->dev); + dev_set_drvdata(dev, endpoint); + return devm_add_action_or_reset(dev, delete_endpoint, cxlmd); +} +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_autoremove, CXL); + +/* + * The natural end of life of a non-root 'cxl_port' is when its parent port goes + * through a ->remove() event ("top-down" unregistration). The unnatural trigger + * for a port to be unregistered is when all memdevs beneath that port have gone + * through ->remove().
This "bottom-up" removal selectively removes individual + * child ports manually. This depends on devm_cxl_add_port() to not change its + * devm action registration order, and for dports to have already been + * destroyed by reap_dports(). + */ +static void delete_switch_port(struct cxl_port *port) +{ + devm_release_action(port->dev.parent, cxl_unlink_uport, port); + devm_release_action(port->dev.parent, unregister_port, port); +} + +static void reap_dports(struct cxl_port *port) +{ + struct cxl_dport *dport; + unsigned long index; + + device_lock_assert(&port->dev); + + xa_for_each(&port->dports, index, dport) { + devm_release_action(&port->dev, cxl_dport_unlink, dport); + devm_release_action(&port->dev, cxl_dport_remove, dport); + devm_kfree(&port->dev, dport); + } +} + +static void cxl_detach_ep(void *data) +{ + struct cxl_memdev *cxlmd = data; + struct device *iter; + + for (iter = &cxlmd->dev; iter; iter = grandparent(iter)) { + struct device *dport_dev = grandparent(iter); + struct cxl_port *port, *parent_port; + struct cxl_ep *ep; + bool died = false; + + if (!dport_dev) + break; + + port = find_cxl_port(dport_dev, NULL); + if (!port) + continue; + + if (is_cxl_root(port)) { + put_device(&port->dev); + continue; + } + + parent_port = to_cxl_port(port->dev.parent); + device_lock(&parent_port->dev); + if (!parent_port->dev.driver) { + /* + * The bottom-up race to delete the port lost to a + * top-down port disable, give up here, because the + * parent_port ->remove() will have cleaned up all + * descendants. + */ + device_unlock(&parent_port->dev); + put_device(&port->dev); + continue; + } + + device_lock(&port->dev); + ep = cxl_ep_load(port, cxlmd); + dev_dbg(&cxlmd->dev, "disconnect %s from %s\n", + ep ? dev_name(ep->ep) : "", dev_name(&port->dev)); + cxl_ep_remove(port, ep); + if (ep && !port->dead && xa_empty(&port->endpoints) && + !is_cxl_root(parent_port)) { + /* + * This was the last ep attached to a dynamically + * enumerated port. Block new cxl_add_ep() and garbage + * collect the port. + */ + died = true; + port->dead = true; + reap_dports(port); + } + device_unlock(&port->dev); + + if (died) { + dev_dbg(&cxlmd->dev, "delete %s\n", + dev_name(&port->dev)); + delete_switch_port(port); + } + put_device(&port->dev); + device_unlock(&parent_port->dev); + } +} + +static resource_size_t find_component_registers(struct device *dev) +{ + struct cxl_register_map map; + struct pci_dev *pdev; + + /* + * Theoretically, CXL component registers can be hosted on a + * non-PCI device; in practice, only cxl_test hits this case. + */ + if (!dev_is_pci(dev)) + return CXL_RESOURCE_NONE; + + pdev = to_pci_dev(dev); + + cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); + return cxl_regmap_to_base(pdev, &map); +} + +static int add_port_attach_ep(struct cxl_memdev *cxlmd, + struct device *uport_dev, + struct device *dport_dev) +{ + struct device *dparent = grandparent(dport_dev); + struct cxl_port *port, *parent_port = NULL; + struct cxl_dport *dport, *parent_dport; + resource_size_t component_reg_phys; + int rc; + + if (!dparent) { + /* + * The iteration reached the topology root without finding the + * CXL-root 'cxl_port' on a previous iteration, fail for now to + * be re-probed after platform driver attaches.
+ */ + dev_dbg(&cxlmd->dev, "%s is a root dport\n", + dev_name(dport_dev)); + return -ENXIO; + } + + parent_port = find_cxl_port(dparent, &parent_dport); + if (!parent_port) { + /* iterate to create this parent_port */ + return -EAGAIN; + } + + device_lock(&parent_port->dev); + if (!parent_port->dev.driver) { + dev_warn(&cxlmd->dev, + "port %s:%s disabled, failed to enumerate CXL.mem\n", + dev_name(&parent_port->dev), dev_name(uport_dev)); + port = ERR_PTR(-ENXIO); + goto out; + } + + port = find_cxl_port_at(parent_port, dport_dev, &dport); + if (!port) { + component_reg_phys = find_component_registers(uport_dev); + port = devm_cxl_add_port(&parent_port->dev, uport_dev, + component_reg_phys, parent_dport); + /* retry find to pick up the new dport information */ + if (!IS_ERR(port)) + port = find_cxl_port_at(parent_port, dport_dev, &dport); + } +out: + device_unlock(&parent_port->dev); + + if (IS_ERR(port)) + rc = PTR_ERR(port); + else { + dev_dbg(&cxlmd->dev, "add to new port %s:%s\n", + dev_name(&port->dev), dev_name(port->uport)); + rc = cxl_add_ep(dport, &cxlmd->dev); + if (rc == -EBUSY) { + /* + * "can't" happen, but this error code means + * something to the caller, so translate it. + */ + rc = -ENXIO; + } + put_device(&port->dev); + } + + put_device(&parent_port->dev); + return rc; +} + +int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd) +{ + struct device *dev = &cxlmd->dev; + struct device *iter; + int rc; + + rc = devm_add_action_or_reset(&cxlmd->dev, cxl_detach_ep, cxlmd); + if (rc) + return rc; + + /* + * Scan for and add all cxl_ports in this device's ancestry. + * Repeat until no more ports are added. Abort if a port add + * attempt fails. + */ +retry: + for (iter = dev; iter; iter = grandparent(iter)) { + struct device *dport_dev = grandparent(iter); + struct device *uport_dev; + struct cxl_dport *dport; + struct cxl_port *port; + + if (!dport_dev) + return 0; + + uport_dev = dport_dev->parent; + if (!uport_dev) { + dev_warn(dev, "at %s no parent for dport: %s\n", + dev_name(iter), dev_name(dport_dev)); + return -ENXIO; + } + + dev_dbg(dev, "scan: iter: %s dport_dev: %s parent: %s\n", + dev_name(iter), dev_name(dport_dev), + dev_name(uport_dev)); + port = find_cxl_port(dport_dev, &dport); + if (port) { + dev_dbg(&cxlmd->dev, + "found already registered port %s:%s\n", + dev_name(&port->dev), dev_name(port->uport)); + rc = cxl_add_ep(dport, &cxlmd->dev); + + /* + * If the endpoint already exists in the port's list, + * that's ok, it was added on a previous pass. + * Otherwise, retry in add_port_attach_ep() after taking + * the parent_port lock as the current port may be being + * reaped. + */ + if (rc && rc != -EBUSY) { + put_device(&port->dev); + return rc; + } + + /* Any more ports to add between this one and the root? 
*/ + if (!dev_is_cxl_root_child(&port->dev)) { + put_device(&port->dev); + continue; + } + + put_device(&port->dev); + return 0; + } + + rc = add_port_attach_ep(cxlmd, uport_dev, dport_dev); + /* port missing, try to add parent */ + if (rc == -EAGAIN) + continue; + /* failed to add ep or port */ + if (rc) + return rc; + /* port added, new descendants possible, start over */ + goto retry; + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_ports, CXL); + +struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd, + struct cxl_dport **dport) +{ + return find_cxl_port(grandparent(&cxlmd->dev), dport); +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_find_port, CXL); + +static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, + struct cxl_port *port, int *target_map) +{ + int i, rc = 0; + + if (!target_map) + return 0; + + device_lock_assert(&port->dev); + + if (xa_empty(&port->dports)) + return -EINVAL; + + write_seqlock(&cxlsd->target_lock); + for (i = 0; i < cxlsd->cxld.interleave_ways; i++) { + struct cxl_dport *dport = find_dport(port, target_map[i]); + + if (!dport) { + rc = -ENXIO; + break; + } + cxlsd->target[i] = dport; + } + write_sequnlock(&cxlsd->target_lock); + + return rc; +} + +static struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos) +{ + struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; + struct cxl_decoder *cxld = &cxlsd->cxld; + int iw; + + iw = cxld->interleave_ways; + if (dev_WARN_ONCE(&cxld->dev, iw != cxlsd->nr_targets, + "misconfigured root decoder\n")) + return NULL; + + return cxlrd->cxlsd.target[pos % iw]; +} + +static struct lock_class_key cxl_decoder_key; + +/** + * cxl_decoder_init - Common decoder setup / initialization + * @port: owning port of this decoder + * @cxld: common decoder properties to initialize + * + * A port may contain one or more decoders. Each of those decoders + * enables some address space for CXL.mem utilization. A decoder is + * expected to be configured by the caller before registering via + * cxl_decoder_add() + */ +static int cxl_decoder_init(struct cxl_port *port, struct cxl_decoder *cxld) +{ + struct device *dev; + int rc; + + rc = ida_alloc(&port->decoder_ida, GFP_KERNEL); + if (rc < 0) + return rc; + + /* need parent to stick around to release the id */ + get_device(&port->dev); + cxld->id = rc; + + dev = &cxld->dev; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_decoder_key); + device_set_pm_not_required(dev); + dev->parent = &port->dev; + dev->bus = &cxl_bus_type; + + /* Pre initialize an "empty" decoder */ + cxld->interleave_ways = 1; + cxld->interleave_granularity = PAGE_SIZE; + cxld->target_type = CXL_DECODER_EXPANDER; + cxld->hpa_range = (struct range) { + .start = 0, + .end = -1, + }; + + return 0; +} + +static int cxl_switch_decoder_init(struct cxl_port *port, + struct cxl_switch_decoder *cxlsd, + int nr_targets) +{ + if (nr_targets > CXL_DECODER_MAX_INTERLEAVE) + return -EINVAL; + + cxlsd->nr_targets = nr_targets; + seqlock_init(&cxlsd->target_lock); + return cxl_decoder_init(port, &cxlsd->cxld); +} + +/** + * cxl_root_decoder_alloc - Allocate a root level decoder + * @port: owning CXL root of this decoder + * @nr_targets: static number of downstream targets + * + * Return: A new cxl decoder to be registered by cxl_decoder_add(). A + * 'CXL root' decoder is one that decodes from a top-level / static platform + * firmware description of CXL resources into a CXL standard decode + * topology.
+ */ +struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets) +{ + struct cxl_root_decoder *cxlrd; + struct cxl_switch_decoder *cxlsd; + struct cxl_decoder *cxld; + int rc; + + if (!is_cxl_root(port)) + return ERR_PTR(-EINVAL); + + cxlrd = kzalloc(struct_size(cxlrd, cxlsd.target, nr_targets), + GFP_KERNEL); + if (!cxlrd) + return ERR_PTR(-ENOMEM); + + cxlsd = &cxlrd->cxlsd; + rc = cxl_switch_decoder_init(port, cxlsd, nr_targets); + if (rc) { + kfree(cxlrd); + return ERR_PTR(rc); + } + + cxlrd->calc_hb = cxl_hb_modulo; + + cxld = &cxlsd->cxld; + cxld->dev.type = &cxl_decoder_root_type; + /* + * cxl_root_decoder_release() special cases negative ids to + * detect memregion_alloc() failures. + */ + atomic_set(&cxlrd->region_id, -1); + rc = memregion_alloc(GFP_KERNEL); + if (rc < 0) { + put_device(&cxld->dev); + return ERR_PTR(rc); + } + + atomic_set(&cxlrd->region_id, rc); + return cxlrd; +} +EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, CXL); + +/** + * cxl_switch_decoder_alloc - Allocate a switch level decoder + * @port: owning CXL switch port of this decoder + * @nr_targets: max number of dynamically addressable downstream targets + * + * Return: A new cxl decoder to be registered by cxl_decoder_add(). A + * 'switch' decoder is any decoder that can be enumerated by PCIe + * topology and the HDM Decoder Capability. This includes the decoders + * that sit between Switch Upstream Ports / Switch Downstream Ports and + * Host Bridges / Root Ports. + */ +struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets) +{ + struct cxl_switch_decoder *cxlsd; + struct cxl_decoder *cxld; + int rc; + + if (is_cxl_root(port) || is_cxl_endpoint(port)) + return ERR_PTR(-EINVAL); + + cxlsd = kzalloc(struct_size(cxlsd, target, nr_targets), GFP_KERNEL); + if (!cxlsd) + return ERR_PTR(-ENOMEM); + + rc = cxl_switch_decoder_init(port, cxlsd, nr_targets); + if (rc) { + kfree(cxlsd); + return ERR_PTR(rc); + } + + cxld = &cxlsd->cxld; + cxld->dev.type = &cxl_decoder_switch_type; + return cxlsd; +} +EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, CXL); + +/** + * cxl_endpoint_decoder_alloc - Allocate an endpoint decoder + * @port: owning port of this decoder + * + * Return: A new cxl decoder to be registered by cxl_decoder_add() + */ +struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_decoder *cxld; + int rc; + + if (!is_cxl_endpoint(port)) + return ERR_PTR(-EINVAL); + + cxled = kzalloc(sizeof(*cxled), GFP_KERNEL); + if (!cxled) + return ERR_PTR(-ENOMEM); + + cxled->pos = -1; + cxld = &cxled->cxld; + rc = cxl_decoder_init(port, cxld); + if (rc) { + kfree(cxled); + return ERR_PTR(rc); + } + + cxld->dev.type = &cxl_decoder_endpoint_type; + return cxled; +} +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_alloc, CXL); + +/** + * cxl_decoder_add_locked - Add a decoder with targets + * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc() + * @target_map: A list of downstream ports that this decoder can direct memory + * traffic to. These numbers should correspond with the port number + * in the PCIe Link Capabilities structure. + * + * Certain types of decoders may not have any targets. The main example of this + * is an endpoint device. A more awkward example is a hostbridge whose root + * ports get hot added (technically possible, though unlikely). + * + * This is the locked variant of cxl_decoder_add(). + * + * Context: Process context. 
Expects the device lock of the port that owns the + * @cxld to be held. + * + * Return: Negative error code if the decoder wasn't properly configured; else + * returns 0. + */ +int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map) +{ + struct cxl_port *port; + struct device *dev; + int rc; + + if (WARN_ON_ONCE(!cxld)) + return -EINVAL; + + if (WARN_ON_ONCE(IS_ERR(cxld))) + return PTR_ERR(cxld); + + if (cxld->interleave_ways < 1) + return -EINVAL; + + dev = &cxld->dev; + + port = to_cxl_port(cxld->dev.parent); + if (!is_endpoint_decoder(dev)) { + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); + + rc = decoder_populate_targets(cxlsd, port, target_map); + if (rc && (cxld->flags & CXL_DECODER_F_ENABLE)) { + dev_err(&port->dev, + "Failed to populate active decoder targets\n"); + return rc; + } + } + + rc = dev_set_name(dev, "decoder%d.%d", port->id, cxld->id); + if (rc) + return rc; + + return device_add(dev); +} +EXPORT_SYMBOL_NS_GPL(cxl_decoder_add_locked, CXL); + +/** + * cxl_decoder_add - Add a decoder with targets + * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc() + * @target_map: A list of downstream ports that this decoder can direct memory + * traffic to. These numbers should correspond with the port number + * in the PCIe Link Capabilities structure. + * + * This is the unlocked variant of cxl_decoder_add_locked(). + * See cxl_decoder_add_locked(). + * + * Context: Process context. Takes and releases the device lock of the port that + * owns the @cxld. + */ +int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map) +{ + struct cxl_port *port; + int rc; + + if (WARN_ON_ONCE(!cxld)) + return -EINVAL; + + if (WARN_ON_ONCE(IS_ERR(cxld))) + return PTR_ERR(cxld); + + port = to_cxl_port(cxld->dev.parent); + + device_lock(&port->dev); + rc = cxl_decoder_add_locked(cxld, target_map); + device_unlock(&port->dev); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_decoder_add, CXL); + +static void cxld_unregister(void *dev) +{ + struct cxl_endpoint_decoder *cxled; + + if (is_endpoint_decoder(dev)) { + cxled = to_cxl_endpoint_decoder(dev); + cxl_decoder_kill_region(cxled); + } + + device_unregister(dev); +} + +int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld) +{ + return devm_add_action_or_reset(host, cxld_unregister, &cxld->dev); +} +EXPORT_SYMBOL_NS_GPL(cxl_decoder_autoremove, CXL); + +/** + * __cxl_driver_register - register a driver for the cxl bus + * @cxl_drv: cxl driver structure to attach + * @owner: owning module/driver + * @modname: KBUILD_MODNAME for parent driver + */ +int __cxl_driver_register(struct cxl_driver *cxl_drv, struct module *owner, + const char *modname) +{ + if (!cxl_drv->probe) { + pr_debug("%s ->probe() must be specified\n", modname); + return -EINVAL; + } + + if (!cxl_drv->name) { + pr_debug("%s ->name must be specified\n", modname); + return -EINVAL; + } + + if (!cxl_drv->id) { + pr_debug("%s ->id must be specified\n", modname); + return -EINVAL; + } + + cxl_drv->drv.bus = &cxl_bus_type; + cxl_drv->drv.owner = owner; + cxl_drv->drv.mod_name = modname; + cxl_drv->drv.name = cxl_drv->name; + + return driver_register(&cxl_drv->drv); +} +EXPORT_SYMBOL_NS_GPL(__cxl_driver_register, CXL); + +void cxl_driver_unregister(struct cxl_driver *cxl_drv) +{ + driver_unregister(&cxl_drv->drv); +} +EXPORT_SYMBOL_NS_GPL(cxl_driver_unregister, CXL); + +static int cxl_bus_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "MODALIAS=" CXL_MODALIAS_FMT, + cxl_device_id(dev)); +} + 
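To illustrate how __cxl_driver_register() and the bus callbacks that follow are exercised, here is a hedged skeleton of a module registering a cxl_driver. It assumes the cxl_driver_register() / module_cxl_driver() convenience wrappers and the CXL_DEVICE_PORT id declared in drivers/cxl/cxl.h; all example_* names are made up, and in-tree this id is already claimed by drivers/cxl/port.c:

#include <linux/module.h>
#include <linux/device.h>
#include <cxl.h>	/* drivers/cxl include path assumed */

static int example_port_probe(struct device *dev)
{
	struct cxl_port *port = to_cxl_port(dev);

	/* invoked via cxl_bus_probe() once cxl_bus_match() matches ->id */
	dev_dbg(dev, "bound to port of %s\n", dev_name(port->uport));
	return 0;
}

static struct cxl_driver example_port_driver = {
	.name = "example_cxl_port",
	.probe = example_port_probe,
	.id = CXL_DEVICE_PORT,
};

module_cxl_driver(example_port_driver);
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS(CXL);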
+static int cxl_bus_match(struct device *dev, struct device_driver *drv) +{ + return cxl_device_id(dev) == to_cxl_drv(drv)->id; +} + +static int cxl_bus_probe(struct device *dev) +{ + int rc; + + rc = to_cxl_drv(dev->driver)->probe(dev); + dev_dbg(dev, "probe: %d\n", rc); + return rc; +} + +static void cxl_bus_remove(struct device *dev) +{ + struct cxl_driver *cxl_drv = to_cxl_drv(dev->driver); + + if (cxl_drv->remove) + cxl_drv->remove(dev); +} + +static struct workqueue_struct *cxl_bus_wq; + +int cxl_bus_rescan(void) +{ + return bus_rescan_devices(&cxl_bus_type); +} +EXPORT_SYMBOL_NS_GPL(cxl_bus_rescan, CXL); + +bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) +{ + return queue_work(cxl_bus_wq, &cxlmd->detach_work); +} +EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL); + +/* for user tooling to ensure port disable work has completed */ +static ssize_t flush_store(struct bus_type *bus, const char *buf, size_t count) +{ + if (sysfs_streq(buf, "1")) { + flush_workqueue(cxl_bus_wq); + return count; + } + + return -EINVAL; +} + +static BUS_ATTR_WO(flush); + +static struct attribute *cxl_bus_attributes[] = { + &bus_attr_flush.attr, + NULL, +}; + +static struct attribute_group cxl_bus_attribute_group = { + .attrs = cxl_bus_attributes, +}; + +static const struct attribute_group *cxl_bus_attribute_groups[] = { + &cxl_bus_attribute_group, + NULL, +}; + +struct bus_type cxl_bus_type = { + .name = "cxl", + .uevent = cxl_bus_uevent, + .match = cxl_bus_match, + .probe = cxl_bus_probe, + .remove = cxl_bus_remove, + .bus_groups = cxl_bus_attribute_groups, +}; +EXPORT_SYMBOL_NS_GPL(cxl_bus_type, CXL); + +static struct dentry *cxl_debugfs; + +struct dentry *cxl_debugfs_create_dir(const char *dir) +{ + return debugfs_create_dir(dir, cxl_debugfs); +} +EXPORT_SYMBOL_NS_GPL(cxl_debugfs_create_dir, CXL); + +static __init int cxl_core_init(void) +{ + int rc; + + cxl_debugfs = debugfs_create_dir("cxl", NULL); + + cxl_mbox_init(); + + rc = cxl_memdev_init(); + if (rc) + return rc; + + cxl_bus_wq = alloc_ordered_workqueue("cxl_port", 0); + if (!cxl_bus_wq) { + rc = -ENOMEM; + goto err_wq; + } + + rc = bus_register(&cxl_bus_type); + if (rc) + goto err_bus; + + rc = cxl_region_init(); + if (rc) + goto err_region; + + return 0; + +err_region: + bus_unregister(&cxl_bus_type); +err_bus: + destroy_workqueue(cxl_bus_wq); +err_wq: + cxl_memdev_exit(); + return rc; +} + +static void cxl_core_exit(void) +{ + cxl_region_exit(); + bus_unregister(&cxl_bus_type); + destroy_workqueue(cxl_bus_wq); + cxl_memdev_exit(); + debugfs_remove_recursive(cxl_debugfs); +} + +module_init(cxl_core_init); +module_exit(cxl_core_exit); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c new file mode 100644 index 000000000..5b7d848a6 --- /dev/null +++ b/drivers/cxl/core/region.c @@ -0,0 +1,2006 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include <linux/memregion.h> +#include <linux/genalloc.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/idr.h> +#include <cxlmem.h> +#include <cxl.h> +#include "core.h" + +/** + * DOC: cxl core region + * + * CXL Regions represent mapped memory capacity in system physical address + * space. 
Whereas the CXL Root Decoders identify the bounds of potential CXL + * Memory ranges, Regions represent the active mapped capacity by the HDM + * Decoder Capability structures throughout the Host Bridges, Switches, and + * Endpoints in the topology. + * + * Region configuration has ordering constraints. UUID may be set at any time + * but is only visible for persistent regions. + * 1. Interleave granularity + * 2. Interleave size + * 3. Decoder targets + */ + +/* + * All changes to the interleave configuration occur with this lock held + * for write. + */ +static DECLARE_RWSEM(cxl_region_rwsem); + +static struct cxl_region *to_cxl_region(struct device *dev); + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%pUb\n", &p->uuid); + up_read(&cxl_region_rwsem); + + return rc; +} + +static int is_dup(struct device *match, void *data) +{ + struct cxl_region_params *p; + struct cxl_region *cxlr; + uuid_t *uuid = data; + + if (!is_cxl_region(match)) + return 0; + + lockdep_assert_held(&cxl_region_rwsem); + cxlr = to_cxl_region(match); + p = &cxlr->params; + + if (uuid_equal(&p->uuid, uuid)) { + dev_dbg(match, "already has uuid: %pUb\n", uuid); + return -EBUSY; + } + + return 0; +} + +static ssize_t uuid_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + uuid_t temp; + ssize_t rc; + + if (len != UUID_STRING_LEN + 1) + return -EINVAL; + + rc = uuid_parse(buf, &temp); + if (rc) + return rc; + + if (uuid_is_null(&temp)) + return -EINVAL; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + if (uuid_equal(&p->uuid, &temp)) + goto out; + + rc = -EBUSY; + if (p->state >= CXL_CONFIG_ACTIVE) + goto out; + + rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup); + if (rc < 0) + goto out; + + uuid_copy(&p->uuid, &temp); +out: + up_write(&cxl_region_rwsem); + + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(uuid); + +static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port, + struct cxl_region *cxlr) +{ + return xa_load(&port->regions, (unsigned long)cxlr); +} + +static int cxl_region_decode_reset(struct cxl_region *cxlr, int count) +{ + struct cxl_region_params *p = &cxlr->params; + int i; + + for (i = count - 1; i >= 0; i--) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *iter = cxled_to_port(cxled); + struct cxl_ep *ep; + int rc = 0; + + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + + for (ep = cxl_ep_load(iter, cxlmd); iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + struct cxl_region_ref *cxl_rr; + struct cxl_decoder *cxld; + + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + if (cxld->reset) + rc = cxld->reset(cxld); + if (rc) + return rc; + } + + rc = cxled->cxld.reset(&cxled->cxld); + if (rc) + return rc; + } + + return 0; +} + +static int cxl_region_decode_commit(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + int i, rc = 0; + + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct 
cxl_region_ref *cxl_rr; + struct cxl_decoder *cxld; + struct cxl_port *iter; + struct cxl_ep *ep; + + /* commit bottom up */ + for (iter = cxled_to_port(cxled); !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + if (cxld->commit) + rc = cxld->commit(cxld); + if (rc) + break; + } + + if (rc) { + /* programming @iter failed, teardown */ + for (ep = cxl_ep_load(iter, cxlmd); ep && iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + if (cxld->reset) + cxld->reset(cxld); + } + + cxled->cxld.reset(&cxled->cxld); + goto err; + } + } + + return 0; + +err: + /* undo the targets that were successfully committed */ + cxl_region_decode_reset(cxlr, i); + return rc; +} + +static ssize_t commit_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + bool commit; + ssize_t rc; + + rc = kstrtobool(buf, &commit); + if (rc) + return rc; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + /* Already in the requested state? */ + if (commit && p->state >= CXL_CONFIG_COMMIT) + goto out; + if (!commit && p->state < CXL_CONFIG_COMMIT) + goto out; + + /* Not ready to commit? */ + if (commit && p->state < CXL_CONFIG_ACTIVE) { + rc = -ENXIO; + goto out; + } + + if (commit) + rc = cxl_region_decode_commit(cxlr); + else { + p->state = CXL_CONFIG_RESET_PENDING; + up_write(&cxl_region_rwsem); + device_release_driver(&cxlr->dev); + down_write(&cxl_region_rwsem); + + /* + * The lock was dropped, so need to revalidate that the reset is + * still pending. + */ + if (p->state == CXL_CONFIG_RESET_PENDING) + rc = cxl_region_decode_reset(cxlr, p->interleave_ways); + } + + if (rc) + goto out; + + if (commit) + p->state = CXL_CONFIG_COMMIT; + else if (p->state == CXL_CONFIG_RESET_PENDING) + p->state = CXL_CONFIG_ACTIVE; + +out: + up_write(&cxl_region_rwsem); + + if (rc) + return rc; + return len; +} + +static ssize_t commit_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT); + up_read(&cxl_region_rwsem); + + return rc; +} +static DEVICE_ATTR_RW(commit); + +static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_region *cxlr = to_cxl_region(dev); + + if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_DECODER_PMEM) + return 0; + return a->mode; +} + +static ssize_t interleave_ways_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%d\n", p->interleave_ways); + up_read(&cxl_region_rwsem); + + return rc; +} + +static const struct attribute_group *get_cxl_region_target_group(void); + +static ssize_t interleave_ways_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region *cxlr = 
to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + unsigned int val, save; + int rc; + u8 iw; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + rc = ways_to_cxl(val, &iw); + if (rc) + return rc; + + /* + * Even for x3, x6, and x12 interleaves the region interleave must be a + * power of 2 multiple of the host bridge interleave. + */ + if (!is_power_of_2(val / cxld->interleave_ways) || + (val % cxld->interleave_ways)) { + dev_dbg(&cxlr->dev, "invalid interleave: %d\n", val); + return -EINVAL; + } + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { + rc = -EBUSY; + goto out; + } + + save = p->interleave_ways; + p->interleave_ways = val; + rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); + if (rc) + p->interleave_ways = save; +out: + up_write(&cxl_region_rwsem); + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(interleave_ways); + +static ssize_t interleave_granularity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%d\n", p->interleave_granularity); + up_read(&cxl_region_rwsem); + + return rc; +} + +static ssize_t interleave_granularity_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + int rc, val; + u16 ig; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + + rc = granularity_to_cxl(val, &ig); + if (rc) + return rc; + + /* + * When the host-bridge is interleaved, disallow region granularity != + * root granularity. Regions with a granularity less than the root + * interleave result in needing multiple endpoints to support a single + * slot in the interleave (possible to support in the future). Regions + * with a granularity greater than the root interleave result in invalid + * DPA translations (invalid to support). + */ + if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity) + return -EINVAL; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { + rc = -EBUSY; + goto out; + } + + p->interleave_granularity = val; +out: + up_write(&cxl_region_rwsem); + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(interleave_granularity); + +static ssize_t resource_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + u64 resource = -1ULL; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + if (p->res) + resource = p->res->start; + rc = sysfs_emit(buf, "%#llx\n", resource); + up_read(&cxl_region_rwsem); + + return rc; +} +static DEVICE_ATTR_RO(resource); + +static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = &cxlr->params; + struct resource *res; + u64 remainder = 0; + + lockdep_assert_held_write(&cxl_region_rwsem); + + /* Nothing to do...
*/ + if (p->res && resource_size(p->res) == size) + return 0; + + /* To change size the old size must be freed first */ + if (p->res) + return -EBUSY; + + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) + return -EBUSY; + + /* ways, granularity and uuid (if PMEM) need to be set before HPA */ + if (!p->interleave_ways || !p->interleave_granularity || + (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&p->uuid))) + return -ENXIO; + + div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder); + if (remainder) + return -EINVAL; + + res = alloc_free_mem_region(cxlrd->res, size, SZ_256M, + dev_name(&cxlr->dev)); + if (IS_ERR(res)) { + dev_dbg(&cxlr->dev, "failed to allocate HPA: %ld\n", + PTR_ERR(res)); + return PTR_ERR(res); + } + + p->res = res; + p->state = CXL_CONFIG_INTERLEAVE_ACTIVE; + + return 0; +} + +static void cxl_region_iomem_release(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + + if (device_is_registered(&cxlr->dev)) + lockdep_assert_held_write(&cxl_region_rwsem); + if (p->res) { + remove_resource(p->res); + kfree(p->res); + p->res = NULL; + } +} + +static int free_hpa(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + + lockdep_assert_held_write(&cxl_region_rwsem); + + if (!p->res) + return 0; + + if (p->state >= CXL_CONFIG_ACTIVE) + return -EBUSY; + + cxl_region_iomem_release(cxlr); + p->state = CXL_CONFIG_IDLE; + return 0; +} + +static ssize_t size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + u64 val; + int rc; + + rc = kstrtou64(buf, 0, &val); + if (rc) + return rc; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + if (val) + rc = alloc_hpa(cxlr, val); + else + rc = free_hpa(cxlr); + up_write(&cxl_region_rwsem); + + if (rc) + return rc; + + return len; +} + +static ssize_t size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + u64 size = 0; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + if (p->res) + size = resource_size(p->res); + rc = sysfs_emit(buf, "%#llx\n", size); + up_read(&cxl_region_rwsem); + + return rc; +} +static DEVICE_ATTR_RW(size); + +static struct attribute *cxl_region_attrs[] = { + &dev_attr_uuid.attr, + &dev_attr_commit.attr, + &dev_attr_interleave_ways.attr, + &dev_attr_interleave_granularity.attr, + &dev_attr_resource.attr, + &dev_attr_size.attr, + NULL, +}; + +static const struct attribute_group cxl_region_group = { + .attrs = cxl_region_attrs, + .is_visible = cxl_region_visible, +}; + +static size_t show_targetN(struct cxl_region *cxlr, char *buf, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + int rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + + if (pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + rc = -ENXIO; + goto out; + } + + cxled = p->targets[pos]; + if (!cxled) + rc = sysfs_emit(buf, "\n"); + else + rc = sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev)); +out: + up_read(&cxl_region_rwsem); + + return rc; +} + +static int match_free_decoder(struct device *dev, void *data) +{ + struct cxl_decoder *cxld; + int *id = data; + + if (!is_switch_decoder(dev)) + return 0; + + cxld = to_cxl_decoder(dev); + + /* enforce ordered allocation */ + if (cxld->id != *id) + 
return 0; + + if (!cxld->region) + return 1; + + (*id)++; + + return 0; +} + +static struct cxl_decoder *cxl_region_find_decoder(struct cxl_port *port, + struct cxl_region *cxlr) +{ + struct device *dev; + int id = 0; + + dev = device_find_child(&port->dev, &id, match_free_decoder); + if (!dev) + return NULL; + /* + * This decoder is pinned registered as long as the endpoint decoder is + * registered, and endpoint decoder unregistration holds the + * cxl_region_rwsem over unregister events, so no need to hold on to + * this extra reference. + */ + put_device(dev); + return to_cxl_decoder(dev); +} + +static struct cxl_region_ref *alloc_region_ref(struct cxl_port *port, + struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_region_ref *cxl_rr, *iter; + unsigned long index; + int rc; + + xa_for_each(&port->regions, index, iter) { + struct cxl_region_params *ip = &iter->region->params; + + if (!ip->res) + continue; + + if (ip->res->start > p->res->start) { + dev_dbg(&cxlr->dev, + "%s: HPA order violation %s:%pr vs %pr\n", + dev_name(&port->dev), + dev_name(&iter->region->dev), ip->res, p->res); + return ERR_PTR(-EBUSY); + } + } + + cxl_rr = kzalloc(sizeof(*cxl_rr), GFP_KERNEL); + if (!cxl_rr) + return ERR_PTR(-ENOMEM); + cxl_rr->port = port; + cxl_rr->region = cxlr; + cxl_rr->nr_targets = 1; + xa_init(&cxl_rr->endpoints); + + rc = xa_insert(&port->regions, (unsigned long)cxlr, cxl_rr, GFP_KERNEL); + if (rc) { + dev_dbg(&cxlr->dev, + "%s: failed to track region reference: %d\n", + dev_name(&port->dev), rc); + kfree(cxl_rr); + return ERR_PTR(rc); + } + + return cxl_rr; +} + +static void cxl_rr_free_decoder(struct cxl_region_ref *cxl_rr) +{ + struct cxl_region *cxlr = cxl_rr->region; + struct cxl_decoder *cxld = cxl_rr->decoder; + + if (!cxld) + return; + + dev_WARN_ONCE(&cxlr->dev, cxld->region != cxlr, "region mismatch\n"); + if (cxld->region == cxlr) { + cxld->region = NULL; + put_device(&cxlr->dev); + } +} + +static void free_region_ref(struct cxl_region_ref *cxl_rr) +{ + struct cxl_port *port = cxl_rr->port; + struct cxl_region *cxlr = cxl_rr->region; + + cxl_rr_free_decoder(cxl_rr); + xa_erase(&port->regions, (unsigned long)cxlr); + xa_destroy(&cxl_rr->endpoints); + kfree(cxl_rr); +} + +static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr, + struct cxl_endpoint_decoder *cxled) +{ + int rc; + struct cxl_port *port = cxl_rr->port; + struct cxl_region *cxlr = cxl_rr->region; + struct cxl_decoder *cxld = cxl_rr->decoder; + struct cxl_ep *ep = cxl_ep_load(port, cxled_to_memdev(cxled)); + + if (ep) { + rc = xa_insert(&cxl_rr->endpoints, (unsigned long)cxled, ep, + GFP_KERNEL); + if (rc) + return rc; + } + cxl_rr->nr_eps++; + + if (!cxld->region) { + cxld->region = cxlr; + get_device(&cxlr->dev); + } + + return 0; +} + +static int cxl_rr_alloc_decoder(struct cxl_port *port, struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, + struct cxl_region_ref *cxl_rr) +{ + struct cxl_decoder *cxld; + + if (port == cxled_to_port(cxled)) + cxld = &cxled->cxld; + else + cxld = cxl_region_find_decoder(port, cxlr); + if (!cxld) { + dev_dbg(&cxlr->dev, "%s: no decoder available\n", + dev_name(&port->dev)); + return -EBUSY; + } + + if (cxld->region) { + dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n", + dev_name(&port->dev), dev_name(&cxld->dev), + dev_name(&cxld->region->dev)); + return -EBUSY; + } + + cxl_rr->decoder = cxld; + return 0; +} + +/** + * cxl_port_attach_region() - track a region's interest in a port by endpoint + * @port: port to add a new region 
reference 'struct cxl_region_ref' + * @cxlr: region to attach to @port + * @cxled: endpoint decoder used to create or further pin a region reference + * @pos: interleave position of @cxled in @cxlr + * + * The attach event is an opportunity to validate CXL decode setup + * constraints and record metadata needed for programming HDM decoders, + * in particular decoder target lists. + * + * The steps are: + * + * - validate that there are no other regions with a higher HPA already + * associated with @port + * - establish a region reference if one is not already present + * + * - additionally allocate a decoder instance that will host @cxlr on + * @port + * + * - pin the region reference by the endpoint + * - account for how many entries in @port's target list are needed to + * cover all of the added endpoints. + */ +static int cxl_port_attach_region(struct cxl_port *port, + struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_ep *ep = cxl_ep_load(port, cxlmd); + struct cxl_region_ref *cxl_rr; + bool nr_targets_inc = false; + struct cxl_decoder *cxld; + unsigned long index; + int rc = -EBUSY; + + lockdep_assert_held_write(&cxl_region_rwsem); + + cxl_rr = cxl_rr_load(port, cxlr); + if (cxl_rr) { + struct cxl_ep *ep_iter; + int found = 0; + + /* + * Walk the existing endpoints that have been attached to + * @cxlr at @port and see if they share the same 'next' port + * in the downstream direction. I.e. endpoints that share common + * upstream switch. + */ + xa_for_each(&cxl_rr->endpoints, index, ep_iter) { + if (ep_iter == ep) + continue; + if (ep_iter->next == ep->next) { + found++; + break; + } + } + + /* + * New target port, or @port is an endpoint port that always + * accounts its own local decode as a target. + */ + if (!found || !ep->next) { + cxl_rr->nr_targets++; + nr_targets_inc = true; + } + } else { + cxl_rr = alloc_region_ref(port, cxlr); + if (IS_ERR(cxl_rr)) { + dev_dbg(&cxlr->dev, + "%s: failed to allocate region reference\n", + dev_name(&port->dev)); + return PTR_ERR(cxl_rr); + } + nr_targets_inc = true; + + rc = cxl_rr_alloc_decoder(port, cxlr, cxled, cxl_rr); + if (rc) + goto out_erase; + } + cxld = cxl_rr->decoder; + + rc = cxl_rr_ep_add(cxl_rr, cxled); + if (rc) { + dev_dbg(&cxlr->dev, + "%s: failed to track endpoint %s:%s reference\n", + dev_name(&port->dev), dev_name(&cxlmd->dev), + dev_name(&cxld->dev)); + goto out_erase; + } + + dev_dbg(&cxlr->dev, + "%s:%s %s add: %s:%s @ %d next: %s nr_eps: %d nr_targets: %d\n", + dev_name(port->uport), dev_name(&port->dev), + dev_name(&cxld->dev), dev_name(&cxlmd->dev), + dev_name(&cxled->cxld.dev), pos, + ep ? ep->next ? 
dev_name(ep->next->uport) : + dev_name(&cxlmd->dev) : + "none", + cxl_rr->nr_eps, cxl_rr->nr_targets); + + return 0; +out_erase: + if (nr_targets_inc) + cxl_rr->nr_targets--; + if (cxl_rr->nr_eps == 0) + free_region_ref(cxl_rr); + return rc; +} + +static void cxl_port_detach_region(struct cxl_port *port, + struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_region_ref *cxl_rr; + struct cxl_ep *ep = NULL; + + lockdep_assert_held_write(&cxl_region_rwsem); + + cxl_rr = cxl_rr_load(port, cxlr); + if (!cxl_rr) + return; + + /* + * Endpoint ports do not carry cxl_ep references, and they + * never target more than one endpoint by definition + */ + if (cxl_rr->decoder == &cxled->cxld) + cxl_rr->nr_eps--; + else + ep = xa_erase(&cxl_rr->endpoints, (unsigned long)cxled); + if (ep) { + struct cxl_ep *ep_iter; + unsigned long index; + int found = 0; + + cxl_rr->nr_eps--; + xa_for_each(&cxl_rr->endpoints, index, ep_iter) { + if (ep_iter->next == ep->next) { + found++; + break; + } + } + if (!found) + cxl_rr->nr_targets--; + } + + if (cxl_rr->nr_eps == 0) + free_region_ref(cxl_rr); +} + +static int check_last_peer(struct cxl_endpoint_decoder *cxled, + struct cxl_ep *ep, struct cxl_region_ref *cxl_rr, + int distance) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_region *cxlr = cxl_rr->region; + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled_peer; + struct cxl_port *port = cxl_rr->port; + struct cxl_memdev *cxlmd_peer; + struct cxl_ep *ep_peer; + int pos = cxled->pos; + + /* + * If this position wants to share a dport with the last endpoint mapped + * then that endpoint, at index 'position - distance', must also be + * mapped by this dport. + */ + if (pos < distance) { + dev_dbg(&cxlr->dev, "%s:%s: cannot host %s:%s at %d\n", + dev_name(port->uport), dev_name(&port->dev), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos); + return -ENXIO; + } + cxled_peer = p->targets[pos - distance]; + cxlmd_peer = cxled_to_memdev(cxled_peer); + ep_peer = cxl_ep_load(port, cxlmd_peer); + if (ep->dport != ep_peer->dport) { + dev_dbg(&cxlr->dev, + "%s:%s: %s:%s pos %d mismatched peer %s:%s\n", + dev_name(port->uport), dev_name(&port->dev), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos, + dev_name(&cxlmd_peer->dev), + dev_name(&cxled_peer->cxld.dev)); + return -ENXIO; + } + + return 0; +} + +static int cxl_port_setup_targets(struct cxl_port *port, + struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos; + struct cxl_port *parent_port = to_cxl_port(port->dev.parent); + struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_ep *ep = cxl_ep_load(port, cxlmd); + struct cxl_region_params *p = &cxlr->params; + struct cxl_decoder *cxld = cxl_rr->decoder; + struct cxl_switch_decoder *cxlsd; + u16 eig, peig; + u8 eiw, peiw; + + /* + * While root level decoders support x3, x6, x12, switch level + * decoders only support powers of 2 up to x16. 
+ */ + if (!is_power_of_2(cxl_rr->nr_targets)) { + dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n", + dev_name(port->uport), dev_name(&port->dev), + cxl_rr->nr_targets); + return -EINVAL; + } + + cxlsd = to_cxl_switch_decoder(&cxld->dev); + if (cxl_rr->nr_targets_set) { + int i, distance; + + /* + * Passthrough decoders impose no distance requirements between + * peers + */ + if (cxl_rr->nr_targets == 1) + distance = 0; + else + distance = p->nr_targets / cxl_rr->nr_targets; + for (i = 0; i < cxl_rr->nr_targets_set; i++) + if (ep->dport == cxlsd->target[i]) { + rc = check_last_peer(cxled, ep, cxl_rr, + distance); + if (rc) + return rc; + goto out_target_set; + } + goto add_target; + } + + if (is_cxl_root(parent_port)) { + /* + * Root decoder IG is always set to value in CFMWS which + * may be different than this region's IG. We can use the + * region's IG here since interleave_granularity_store() + * does not allow interleaved host-bridges with + * root IG != region IG. + */ + parent_ig = p->interleave_granularity; + parent_iw = cxlrd->cxlsd.cxld.interleave_ways; + /* + * For purposes of address bit routing, use power-of-2 math for + * switch ports. + */ + if (!is_power_of_2(parent_iw)) + parent_iw /= 3; + } else { + struct cxl_region_ref *parent_rr; + struct cxl_decoder *parent_cxld; + + parent_rr = cxl_rr_load(parent_port, cxlr); + parent_cxld = parent_rr->decoder; + parent_ig = parent_cxld->interleave_granularity; + parent_iw = parent_cxld->interleave_ways; + } + + rc = granularity_to_cxl(parent_ig, &peig); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n", + dev_name(parent_port->uport), + dev_name(&parent_port->dev), parent_ig); + return rc; + } + + rc = ways_to_cxl(parent_iw, &peiw); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n", + dev_name(parent_port->uport), + dev_name(&parent_port->dev), parent_iw); + return rc; + } + + iw = cxl_rr->nr_targets; + rc = ways_to_cxl(iw, &eiw); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n", + dev_name(port->uport), dev_name(&port->dev), iw); + return rc; + } + + /* + * If @parent_port is masking address bits, pick the next unused address + * bit to route @port's targets. 
+ */ + if (parent_iw > 1 && cxl_rr->nr_targets > 1) { + u32 address_bit = max(peig + peiw, eiw + peig); + + eig = address_bit - eiw + 1; + } else { + eiw = peiw; + eig = peig; + } + + rc = cxl_to_granularity(eig, &ig); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n", + dev_name(port->uport), dev_name(&port->dev), + 256 << eig); + return rc; + } + + cxld->interleave_ways = iw; + cxld->interleave_granularity = ig; + cxld->hpa_range = (struct range) { + .start = p->res->start, + .end = p->res->end, + }; + dev_dbg(&cxlr->dev, "%s:%s iw: %d ig: %d\n", dev_name(port->uport), + dev_name(&port->dev), iw, ig); +add_target: + if (cxl_rr->nr_targets_set == cxl_rr->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s: targets full trying to add %s:%s at %d\n", + dev_name(port->uport), dev_name(&port->dev), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos); + return -ENXIO; + } + cxlsd->target[cxl_rr->nr_targets_set] = ep->dport; + inc = 1; +out_target_set: + cxl_rr->nr_targets_set += inc; + dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n", + dev_name(port->uport), dev_name(&port->dev), + cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos); + + return 0; +} + +static void cxl_port_reset_targets(struct cxl_port *port, + struct cxl_region *cxlr) +{ + struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); + struct cxl_decoder *cxld; + + /* + * After the last endpoint has been detached the entire cxl_rr may now + * be gone. + */ + if (!cxl_rr) + return; + cxl_rr->nr_targets_set = 0; + + cxld = cxl_rr->decoder; + cxld->hpa_range = (struct range) { + .start = 0, + .end = -1, + }; +} + +static void cxl_region_teardown_targets(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + struct cxl_memdev *cxlmd; + struct cxl_port *iter; + struct cxl_ep *ep; + int i; + + for (i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + cxlmd = cxled_to_memdev(cxled); + + iter = cxled_to_port(cxled); + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + + for (ep = cxl_ep_load(iter, cxlmd); iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) + cxl_port_reset_targets(iter, cxlr); + } +} + +static int cxl_region_setup_targets(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + struct cxl_memdev *cxlmd; + struct cxl_port *iter; + struct cxl_ep *ep; + int i, rc; + + for (i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + cxlmd = cxled_to_memdev(cxled); + + iter = cxled_to_port(cxled); + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + + /* + * Descend the topology tree programming targets while + * looking for conflicts. 
+ */ + for (ep = cxl_ep_load(iter, cxlmd); iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + rc = cxl_port_setup_targets(iter, cxlr, cxled); + if (rc) { + cxl_region_teardown_targets(cxlr); + return rc; + } + } + } + + return 0; +} + +static int cxl_region_validate_position(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, + int pos) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_region_params *p = &cxlr->params; + int i; + + if (pos < 0 || pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + return -ENXIO; + } + + if (p->targets[pos] == cxled) + return 0; + + if (p->targets[pos]) { + struct cxl_endpoint_decoder *cxled_target = p->targets[pos]; + struct cxl_memdev *cxlmd_target = cxled_to_memdev(cxled_target); + + dev_dbg(&cxlr->dev, "position %d already assigned to %s:%s\n", + pos, dev_name(&cxlmd_target->dev), + dev_name(&cxled_target->cxld.dev)); + return -EBUSY; + } + + for (i = 0; i < p->interleave_ways; i++) { + struct cxl_endpoint_decoder *cxled_target; + struct cxl_memdev *cxlmd_target; + + cxled_target = p->targets[i]; + if (!cxled_target) + continue; + + cxlmd_target = cxled_to_memdev(cxled_target); + if (cxlmd_target == cxlmd) { + dev_dbg(&cxlr->dev, + "%s already specified at position %d via: %s\n", + dev_name(&cxlmd->dev), pos, + dev_name(&cxled_target->cxld.dev)); + return -EBUSY; + } + } + + return 0; +} + +static int cxl_region_attach_position(struct cxl_region *cxlr, + struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled, + const struct cxl_dport *dport, int pos) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *iter; + int rc; + + if (cxlrd->calc_hb(cxlrd, pos) != dport) { + dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + dev_name(&cxlrd->cxlsd.cxld.dev)); + return -ENXIO; + } + + for (iter = cxled_to_port(cxled); !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + rc = cxl_port_attach_region(iter, cxlr, cxled, pos); + if (rc) + goto err; + } + + return 0; + +err: + for (iter = cxled_to_port(cxled); !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) + cxl_port_detach_region(iter, cxlr, cxled); + return rc; +} + +static int cxl_region_attach(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_region_params *p = &cxlr->params; + struct cxl_port *ep_port, *root_port; + struct cxl_dport *dport; + int rc = -ENXIO; + + if (cxled->mode != cxlr->mode) { + dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n", + dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode); + return -EINVAL; + } + + if (cxled->mode == CXL_DECODER_DEAD) { + dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev)); + return -ENODEV; + } + + /* all full of members, or interleave config not established? 
*/ + if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) { + dev_dbg(&cxlr->dev, "region already active\n"); + return -EBUSY; + } else if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) { + dev_dbg(&cxlr->dev, "interleave config missing\n"); + return -ENXIO; + } + + if (p->nr_targets >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "region already has %d endpoints\n", + p->nr_targets); + return -EINVAL; + } + + ep_port = cxled_to_port(cxled); + root_port = cxlrd_to_port(cxlrd); + dport = cxl_find_dport_by_dev(root_port, ep_port->host_bridge); + if (!dport) { + dev_dbg(&cxlr->dev, "%s:%s invalid target for %s\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + dev_name(cxlr->dev.parent)); + return -ENXIO; + } + + if (cxled->cxld.target_type != cxlr->type) { + dev_dbg(&cxlr->dev, "%s:%s type mismatch: %d vs %d\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + cxled->cxld.target_type, cxlr->type); + return -ENXIO; + } + + if (!cxled->dpa_res) { + dev_dbg(&cxlr->dev, "%s:%s: missing DPA allocation.\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev)); + return -ENXIO; + } + + if (resource_size(cxled->dpa_res) * p->interleave_ways != + resource_size(p->res)) { + dev_dbg(&cxlr->dev, + "%s:%s: decoder-size-%#llx * ways-%d != region-size-%#llx\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + (u64)resource_size(cxled->dpa_res), p->interleave_ways, + (u64)resource_size(p->res)); + return -EINVAL; + } + + rc = cxl_region_validate_position(cxlr, cxled, pos); + if (rc) + return rc; + + rc = cxl_region_attach_position(cxlr, cxlrd, cxled, dport, pos); + if (rc) + return rc; + + p->targets[pos] = cxled; + cxled->pos = pos; + p->nr_targets++; + + if (p->nr_targets == p->interleave_ways) { + rc = cxl_region_setup_targets(cxlr); + if (rc) + return rc; + p->state = CXL_CONFIG_ACTIVE; + } + + cxled->cxld.interleave_ways = p->interleave_ways; + cxled->cxld.interleave_granularity = p->interleave_granularity; + cxled->cxld.hpa_range = (struct range) { + .start = p->res->start, + .end = p->res->end, + }; + + return 0; +} + +static int cxl_region_detach(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *iter, *ep_port = cxled_to_port(cxled); + struct cxl_region *cxlr = cxled->cxld.region; + struct cxl_region_params *p; + int rc = 0; + + lockdep_assert_held_write(&cxl_region_rwsem); + + if (!cxlr) + return 0; + + p = &cxlr->params; + get_device(&cxlr->dev); + + if (p->state > CXL_CONFIG_ACTIVE) { + /* + * TODO: tear down all impacted regions if a device is + * removed out of order + */ + rc = cxl_region_decode_reset(cxlr, p->interleave_ways); + if (rc) + goto out; + p->state = CXL_CONFIG_ACTIVE; + } + + for (iter = ep_port; !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) + cxl_port_detach_region(iter, cxlr, cxled); + + if (cxled->pos < 0 || cxled->pos >= p->interleave_ways || + p->targets[cxled->pos] != cxled) { + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + + dev_WARN_ONCE(&cxlr->dev, 1, "expected %s:%s at position %d\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + cxled->pos); + goto out; + } + + if (p->state == CXL_CONFIG_ACTIVE) { + p->state = CXL_CONFIG_INTERLEAVE_ACTIVE; + cxl_region_teardown_targets(cxlr); + } + p->targets[cxled->pos] = NULL; + p->nr_targets--; + cxled->cxld.hpa_range = (struct range) { + .start = 0, + .end = -1, + }; + + /* notify the region driver that one of its targets has departed */ + up_write(&cxl_region_rwsem); + device_release_driver(&cxlr->dev); + down_write(&cxl_region_rwsem); +out: + put_device(&cxlr->dev); + return rc; +} + 
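The attach path above enforces two pieces of interleave geometry before a target is accepted: the region's HPA span must equal the endpoint decoder's DPA allocation multiplied by the interleave ways, and the resulting ways/granularity pair must be expressible in the HDM decoder encodings (ways_to_cxl() / granularity_to_cxl() in cxl.h, shown later in this diff). The sketch below is not part of the driver; it is a standalone userspace C illustration of those same rules, mirroring the encodings under stated assumptions, with hypothetical example values (4-way interleave, 1024-byte granularity, 256 MiB of DPA per endpoint).

/*
 * Standalone illustration (not part of the upstream source): mirrors the
 * ways/granularity encodings from cxl.h and the size rule checked in
 * cxl_region_attach(). Example values are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

/* Encoding mirrored from ways_to_cxl(): powers of 2, plus 3/6/12-way */
static int ways_to_eiw(unsigned int ways, uint8_t *eiw)
{
	if (ways == 0 || ways > 16)
		return -1;
	if ((ways & (ways - 1)) == 0) {		/* power of two: eiw = log2(ways) */
		*eiw = __builtin_ctz(ways);
		return 0;
	}
	if (ways % 3)
		return -1;
	ways /= 3;
	if (ways & (ways - 1))
		return -1;
	*eiw = __builtin_ctz(ways) + 8;		/* 3-, 6-, 12-way encodings */
	return 0;
}

/* Encoding mirrored from granularity_to_cxl(): 256B..16KB, power of two */
static int granularity_to_eig(unsigned int g, uint16_t *eig)
{
	if (g < 256 || g > 16384 || (g & (g - 1)))
		return -1;
	*eig = __builtin_ctz(g) - 8;		/* eig = log2(g) - 8 */
	return 0;
}

int main(void)
{
	uint64_t endpoint_dpa = 256ULL << 20;	/* hypothetical: 256 MiB per endpoint decoder */
	unsigned int ways = 4, granularity = 1024;
	uint8_t eiw;
	uint16_t eig;

	/* cxl_region_attach() rejects an attach unless region size = DPA size * ways */
	printf("region size: %llu MiB\n",
	       (unsigned long long)((endpoint_dpa * ways) >> 20));

	if (!ways_to_eiw(ways, &eiw) && !granularity_to_eig(granularity, &eig))
		printf("HDM encoding: eiw=%u eig=%u\n", (unsigned)eiw, (unsigned)eig);

	return 0;
}

With the example values this prints a 1024 MiB region size and the encodings eiw=2 / eig=2; geometries that cannot be encoded this way are rejected before any HDM decoder programming is attempted.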
+void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled) +{ + down_write(&cxl_region_rwsem); + cxled->mode = CXL_DECODER_DEAD; + cxl_region_detach(cxled); + up_write(&cxl_region_rwsem); +} + +static int attach_target(struct cxl_region *cxlr, const char *decoder, int pos) +{ + struct device *dev; + int rc; + + dev = bus_find_device_by_name(&cxl_bus_type, NULL, decoder); + if (!dev) + return -ENODEV; + + if (!is_endpoint_decoder(dev)) { + put_device(dev); + return -EINVAL; + } + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + goto out; + down_read(&cxl_dpa_rwsem); + rc = cxl_region_attach(cxlr, to_cxl_endpoint_decoder(dev), pos); + up_read(&cxl_dpa_rwsem); + up_write(&cxl_region_rwsem); +out: + put_device(dev); + return rc; +} + +static int detach_target(struct cxl_region *cxlr, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + int rc; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + if (pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + rc = -ENXIO; + goto out; + } + + if (!p->targets[pos]) { + rc = 0; + goto out; + } + + rc = cxl_region_detach(p->targets[pos]); +out: + up_write(&cxl_region_rwsem); + return rc; +} + +static size_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos, + size_t len) +{ + int rc; + + if (sysfs_streq(buf, "\n")) + rc = detach_target(cxlr, pos); + else + rc = attach_target(cxlr, buf, pos); + + if (rc < 0) + return rc; + return len; +} + +#define TARGET_ATTR_RW(n) \ +static ssize_t target##n##_show( \ + struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + return show_targetN(to_cxl_region(dev), buf, (n)); \ +} \ +static ssize_t target##n##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return store_targetN(to_cxl_region(dev), buf, (n), len); \ +} \ +static DEVICE_ATTR_RW(target##n) + +TARGET_ATTR_RW(0); +TARGET_ATTR_RW(1); +TARGET_ATTR_RW(2); +TARGET_ATTR_RW(3); +TARGET_ATTR_RW(4); +TARGET_ATTR_RW(5); +TARGET_ATTR_RW(6); +TARGET_ATTR_RW(7); +TARGET_ATTR_RW(8); +TARGET_ATTR_RW(9); +TARGET_ATTR_RW(10); +TARGET_ATTR_RW(11); +TARGET_ATTR_RW(12); +TARGET_ATTR_RW(13); +TARGET_ATTR_RW(14); +TARGET_ATTR_RW(15); + +static struct attribute *target_attrs[] = { + &dev_attr_target0.attr, + &dev_attr_target1.attr, + &dev_attr_target2.attr, + &dev_attr_target3.attr, + &dev_attr_target4.attr, + &dev_attr_target5.attr, + &dev_attr_target6.attr, + &dev_attr_target7.attr, + &dev_attr_target8.attr, + &dev_attr_target9.attr, + &dev_attr_target10.attr, + &dev_attr_target11.attr, + &dev_attr_target12.attr, + &dev_attr_target13.attr, + &dev_attr_target14.attr, + &dev_attr_target15.attr, + NULL, +}; + +static umode_t cxl_region_target_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + + if (n < p->interleave_ways) + return a->mode; + return 0; +} + +static const struct attribute_group cxl_region_target_group = { + .attrs = target_attrs, + .is_visible = cxl_region_target_visible, +}; + +static const struct attribute_group *get_cxl_region_target_group(void) +{ + return &cxl_region_target_group; +} + +static const struct attribute_group *region_groups[] = { + &cxl_base_attribute_group, + &cxl_region_group, + &cxl_region_target_group, + NULL, +}; + +static void cxl_region_release(struct device *dev) +{ + struct cxl_root_decoder *cxlrd = 
to_cxl_root_decoder(dev->parent); + struct cxl_region *cxlr = to_cxl_region(dev); + int id = atomic_read(&cxlrd->region_id); + + /* + * Try to reuse the recently idled id rather than the cached + * next id to prevent the region id space from increasing + * unnecessarily. + */ + if (cxlr->id < id) + if (atomic_try_cmpxchg(&cxlrd->region_id, &id, cxlr->id)) { + memregion_free(id); + goto out; + } + + memregion_free(cxlr->id); +out: + put_device(dev->parent); + kfree(cxlr); +} + +const struct device_type cxl_region_type = { + .name = "cxl_region", + .release = cxl_region_release, + .groups = region_groups +}; + +bool is_cxl_region(struct device *dev) +{ + return dev->type == &cxl_region_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_region, CXL); + +static struct cxl_region *to_cxl_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, dev->type != &cxl_region_type, + "not a cxl_region device\n")) + return NULL; + + return container_of(dev, struct cxl_region, dev); +} + +static void unregister_region(void *dev) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + int i; + + device_del(dev); + + /* + * Now that region sysfs is shutdown, the parameter block is now + * read-only, so no need to hold the region rwsem to access the + * region parameters. + */ + for (i = 0; i < p->interleave_ways; i++) + detach_target(cxlr, i); + + cxl_region_iomem_release(cxlr); + put_device(dev); +} + +static struct lock_class_key cxl_region_key; + +static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id) +{ + struct cxl_region *cxlr; + struct device *dev; + + cxlr = kzalloc(sizeof(*cxlr), GFP_KERNEL); + if (!cxlr) { + memregion_free(id); + return ERR_PTR(-ENOMEM); + } + + dev = &cxlr->dev; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_region_key); + dev->parent = &cxlrd->cxlsd.cxld.dev; + /* + * Keep root decoder pinned through cxl_region_release to fixup + * region id allocations + */ + get_device(dev->parent); + device_set_pm_not_required(dev); + dev->bus = &cxl_bus_type; + dev->type = &cxl_region_type; + cxlr->id = id; + + return cxlr; +} + +/** + * devm_cxl_add_region - Adds a region to a decoder + * @cxlrd: root decoder + * @id: memregion id to create, or memregion_free() on failure + * @mode: mode for the endpoint decoders of this region + * @type: select whether this is an expander or accelerator (type-2 or type-3) + * + * This is the second step of region initialization. Regions exist within an + * address space which is mapped by a @cxlrd. + * + * Return: 0 if the region was added to the @cxlrd, else returns negative error + * code. The region will be named "regionZ" where Z is the unique region number. 
+ */ +static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, + int id, + enum cxl_decoder_mode mode, + enum cxl_decoder_type type) +{ + struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent); + struct cxl_region *cxlr; + struct device *dev; + int rc; + + cxlr = cxl_region_alloc(cxlrd, id); + if (IS_ERR(cxlr)) + return cxlr; + cxlr->mode = mode; + cxlr->type = type; + + dev = &cxlr->dev; + rc = dev_set_name(dev, "region%d", id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + rc = devm_add_action_or_reset(port->uport, unregister_region, cxlr); + if (rc) + return ERR_PTR(rc); + + dev_dbg(port->uport, "%s: created %s\n", + dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev)); + return cxlr; + +err: + put_device(dev); + return ERR_PTR(rc); +} + +static ssize_t create_pmem_region_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id)); +} + +static ssize_t create_pmem_region_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + struct cxl_region *cxlr; + int id, rc; + + rc = sscanf(buf, "region%d\n", &id); + if (rc != 1) + return -EINVAL; + + rc = memregion_alloc(GFP_KERNEL); + if (rc < 0) + return rc; + + if (atomic_cmpxchg(&cxlrd->region_id, id, rc) != id) { + memregion_free(rc); + return -EBUSY; + } + + cxlr = devm_cxl_add_region(cxlrd, id, CXL_DECODER_PMEM, + CXL_DECODER_EXPANDER); + if (IS_ERR(cxlr)) + return PTR_ERR(cxlr); + + return len; +} +DEVICE_ATTR_RW(create_pmem_region); + +static ssize_t region_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + + if (cxld->region) + rc = sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev)); + else + rc = sysfs_emit(buf, "\n"); + up_read(&cxl_region_rwsem); + + return rc; +} +DEVICE_ATTR_RO(region); + +static struct cxl_region * +cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name) +{ + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct device *region_dev; + + region_dev = device_find_child_by_name(&cxld->dev, name); + if (!region_dev) + return ERR_PTR(-ENODEV); + + return to_cxl_region(region_dev); +} + +static ssize_t delete_region_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + struct cxl_port *port = to_cxl_port(dev->parent); + struct cxl_region *cxlr; + + cxlr = cxl_find_region_by_name(cxlrd, buf); + if (IS_ERR(cxlr)) + return PTR_ERR(cxlr); + + devm_release_action(port->uport, unregister_region, cxlr); + put_device(&cxlr->dev); + + return len; +} +DEVICE_ATTR_WO(delete_region); + +static void cxl_pmem_region_release(struct device *dev) +{ + struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev); + int i; + + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_memdev *cxlmd = cxlr_pmem->mapping[i].cxlmd; + + put_device(&cxlmd->dev); + } + + kfree(cxlr_pmem); +} + +static const struct attribute_group *cxl_pmem_region_attribute_groups[] = { + &cxl_base_attribute_group, + NULL, +}; + +const struct device_type cxl_pmem_region_type = { + .name = "cxl_pmem_region", + .release = cxl_pmem_region_release, + .groups = 
cxl_pmem_region_attribute_groups, +}; + +bool is_cxl_pmem_region(struct device *dev) +{ + return dev->type == &cxl_pmem_region_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, CXL); + +struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev), + "not a cxl_pmem_region device\n")) + return NULL; + return container_of(dev, struct cxl_pmem_region, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, CXL); + +static struct lock_class_key cxl_pmem_region_key; + +static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_pmem_region *cxlr_pmem; + struct device *dev; + int i; + + down_read(&cxl_region_rwsem); + if (p->state != CXL_CONFIG_COMMIT) { + cxlr_pmem = ERR_PTR(-ENXIO); + goto out; + } + + cxlr_pmem = kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets), + GFP_KERNEL); + if (!cxlr_pmem) { + cxlr_pmem = ERR_PTR(-ENOMEM); + goto out; + } + + cxlr_pmem->hpa_range.start = p->res->start; + cxlr_pmem->hpa_range.end = p->res->end; + + /* Snapshot the region configuration underneath the cxl_region_rwsem */ + cxlr_pmem->nr_mappings = p->nr_targets; + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + + m->cxlmd = cxlmd; + get_device(&cxlmd->dev); + m->start = cxled->dpa_res->start; + m->size = resource_size(cxled->dpa_res); + m->position = i; + } + + dev = &cxlr_pmem->dev; + cxlr_pmem->cxlr = cxlr; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_pmem_region_key); + device_set_pm_not_required(dev); + dev->parent = &cxlr->dev; + dev->bus = &cxl_bus_type; + dev->type = &cxl_pmem_region_type; +out: + up_read(&cxl_region_rwsem); + + return cxlr_pmem; +} + +static void cxlr_pmem_unregister(void *dev) +{ + device_unregister(dev); +} + +/** + * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge + * @cxlr: parent CXL region for this pmem region bridge device + * + * Return: 0 on success negative error code on failure. + */ +static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) +{ + struct cxl_pmem_region *cxlr_pmem; + struct device *dev; + int rc; + + cxlr_pmem = cxl_pmem_region_alloc(cxlr); + if (IS_ERR(cxlr_pmem)) + return PTR_ERR(cxlr_pmem); + + dev = &cxlr_pmem->dev; + rc = dev_set_name(dev, "pmem_region%d", cxlr->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), + dev_name(dev)); + + return devm_add_action_or_reset(&cxlr->dev, cxlr_pmem_unregister, dev); + +err: + put_device(dev); + return rc; +} + +static int cxl_region_probe(struct device *dev) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + int rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) { + dev_dbg(&cxlr->dev, "probe interrupted\n"); + return rc; + } + + if (p->state < CXL_CONFIG_COMMIT) { + dev_dbg(&cxlr->dev, "config state: %d\n", p->state); + rc = -ENXIO; + } + + /* + * From this point on any path that changes the region's state away from + * CXL_CONFIG_COMMIT is also responsible for releasing the driver. 
+ */ + up_read(&cxl_region_rwsem); + + if (rc) + return rc; + + switch (cxlr->mode) { + case CXL_DECODER_PMEM: + return devm_cxl_add_pmem_region(cxlr); + default: + dev_dbg(&cxlr->dev, "unsupported region mode: %d\n", + cxlr->mode); + return -ENXIO; + } +} + +static struct cxl_driver cxl_region_driver = { + .name = "cxl_region", + .probe = cxl_region_probe, + .id = CXL_DEVICE_REGION, +}; + +int cxl_region_init(void) +{ + return cxl_driver_register(&cxl_region_driver); +} + +void cxl_region_exit(void) +{ + cxl_driver_unregister(&cxl_region_driver); +} + +MODULE_IMPORT_NS(CXL); +MODULE_ALIAS_CXL(CXL_DEVICE_REGION); diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c new file mode 100644 index 000000000..39a129c57 --- /dev/null +++ b/drivers/cxl/core/regs.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. */ +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <cxlmem.h> +#include <cxlpci.h> + +/** + * DOC: cxl registers + * + * CXL device capabilities are enumerated by PCI DVSEC (Designated + * Vendor-specific) and / or descriptors provided by platform firmware. + * They can be defined as a set like the device and component registers + * mandated by CXL Section 8.1.12.2 Memory Device PCIe Capabilities and + * Extended Capabilities, or they can be individual capabilities + * appended to bridged and endpoint devices. + * + * Provide common infrastructure for enumerating and mapping these + * discrete capabilities. + */ + +/** + * cxl_probe_component_regs() - Detect CXL Component register blocks + * @dev: Host device of the @base mapping + * @base: Mapping containing the HDM Decoder Capability Header + * @map: Map object describing the register block information found + * + * See CXL 2.0 8.2.4 Component Register Layout and Definition + * See CXL 2.0 8.2.5.5 CXL Device Register Interface + * + * Probe for component register information and return it in map object. + */ +void cxl_probe_component_regs(struct device *dev, void __iomem *base, + struct cxl_component_reg_map *map) +{ + int cap, cap_count; + u32 cap_array; + + *map = (struct cxl_component_reg_map) { 0 }; + + /* + * CXL.cache and CXL.mem registers are at offset 0x1000 as defined in + * CXL 2.0 8.2.4 Table 141. 
+ */ + base += CXL_CM_OFFSET; + + cap_array = readl(base + CXL_CM_CAP_HDR_OFFSET); + + if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, cap_array) != CM_CAP_HDR_CAP_ID) { + dev_err(dev, + "Couldn't locate the CXL.cache and CXL.mem capability array header.\n"); + return; + } + + /* It's assumed that future versions will be backward compatible */ + cap_count = FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_array); + + for (cap = 1; cap <= cap_count; cap++) { + void __iomem *register_block; + u32 hdr; + int decoder_cnt; + u16 cap_id, offset; + u32 length; + + hdr = readl(base + cap * 0x4); + + cap_id = FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, hdr); + offset = FIELD_GET(CXL_CM_CAP_PTR_MASK, hdr); + register_block = base + offset; + + switch (cap_id) { + case CXL_CM_CAP_CAP_ID_HDM: + dev_dbg(dev, "found HDM decoder capability (0x%x)\n", + offset); + + hdr = readl(register_block); + + decoder_cnt = cxl_hdm_decoder_count(hdr); + length = 0x20 * decoder_cnt + 0x10; + + map->hdm_decoder.valid = true; + map->hdm_decoder.offset = CXL_CM_OFFSET + offset; + map->hdm_decoder.size = length; + break; + default: + dev_dbg(dev, "Unknown CM cap ID: %d (0x%x)\n", cap_id, + offset); + break; + } + } +} +EXPORT_SYMBOL_NS_GPL(cxl_probe_component_regs, CXL); + +/** + * cxl_probe_device_regs() - Detect CXL Device register blocks + * @dev: Host device of the @base mapping + * @base: Mapping of CXL 2.0 8.2.8 CXL Device Register Interface + * @map: Map object describing the register block information found + * + * Probe for device register information and return it in map object. + */ +void cxl_probe_device_regs(struct device *dev, void __iomem *base, + struct cxl_device_reg_map *map) +{ + int cap, cap_count; + u64 cap_array; + + *map = (struct cxl_device_reg_map){ 0 }; + + cap_array = readq(base + CXLDEV_CAP_ARRAY_OFFSET); + if (FIELD_GET(CXLDEV_CAP_ARRAY_ID_MASK, cap_array) != + CXLDEV_CAP_ARRAY_CAP_ID) + return; + + cap_count = FIELD_GET(CXLDEV_CAP_ARRAY_COUNT_MASK, cap_array); + + for (cap = 1; cap <= cap_count; cap++) { + u32 offset, length; + u16 cap_id; + + cap_id = FIELD_GET(CXLDEV_CAP_HDR_CAP_ID_MASK, + readl(base + cap * 0x10)); + offset = readl(base + cap * 0x10 + 0x4); + length = readl(base + cap * 0x10 + 0x8); + + switch (cap_id) { + case CXLDEV_CAP_CAP_ID_DEVICE_STATUS: + dev_dbg(dev, "found Status capability (0x%x)\n", offset); + + map->status.valid = true; + map->status.offset = offset; + map->status.size = length; + break; + case CXLDEV_CAP_CAP_ID_PRIMARY_MAILBOX: + dev_dbg(dev, "found Mailbox capability (0x%x)\n", offset); + map->mbox.valid = true; + map->mbox.offset = offset; + map->mbox.size = length; + break; + case CXLDEV_CAP_CAP_ID_SECONDARY_MAILBOX: + dev_dbg(dev, "found Secondary Mailbox capability (0x%x)\n", offset); + break; + case CXLDEV_CAP_CAP_ID_MEMDEV: + dev_dbg(dev, "found Memory Device capability (0x%x)\n", offset); + map->memdev.valid = true; + map->memdev.offset = offset; + map->memdev.size = length; + break; + default: + if (cap_id >= 0x8000) + dev_dbg(dev, "Vendor cap ID: %#x offset: %#x\n", cap_id, offset); + else + dev_dbg(dev, "Unknown cap ID: %#x offset: %#x\n", cap_id, offset); + break; + } + } +} +EXPORT_SYMBOL_NS_GPL(cxl_probe_device_regs, CXL); + +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, + resource_size_t length) +{ + void __iomem *ret_val; + struct resource *res; + + res = devm_request_mem_region(dev, addr, length, dev_name(dev)); + if (!res) { + resource_size_t end = addr + length - 1; + + dev_err(dev, "Failed to request region %pa-%pa\n", &addr, &end); 
+ return NULL; + } + + ret_val = devm_ioremap(dev, addr, length); + if (!ret_val) + dev_err(dev, "Failed to map region %pr\n", res); + + return ret_val; +} + +int cxl_map_component_regs(struct pci_dev *pdev, + struct cxl_component_regs *regs, + struct cxl_register_map *map) +{ + struct device *dev = &pdev->dev; + resource_size_t phys_addr; + resource_size_t length; + + phys_addr = pci_resource_start(pdev, map->barno); + phys_addr += map->block_offset; + + phys_addr += map->component_map.hdm_decoder.offset; + length = map->component_map.hdm_decoder.size; + regs->hdm_decoder = devm_cxl_iomap_block(dev, phys_addr, length); + if (!regs->hdm_decoder) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_map_component_regs, CXL); + +int cxl_map_device_regs(struct pci_dev *pdev, + struct cxl_device_regs *regs, + struct cxl_register_map *map) +{ + struct device *dev = &pdev->dev; + resource_size_t phys_addr; + + phys_addr = pci_resource_start(pdev, map->barno); + phys_addr += map->block_offset; + + if (map->device_map.status.valid) { + resource_size_t addr; + resource_size_t length; + + addr = phys_addr + map->device_map.status.offset; + length = map->device_map.status.size; + regs->status = devm_cxl_iomap_block(dev, addr, length); + if (!regs->status) + return -ENOMEM; + } + + if (map->device_map.mbox.valid) { + resource_size_t addr; + resource_size_t length; + + addr = phys_addr + map->device_map.mbox.offset; + length = map->device_map.mbox.size; + regs->mbox = devm_cxl_iomap_block(dev, addr, length); + if (!regs->mbox) + return -ENOMEM; + } + + if (map->device_map.memdev.valid) { + resource_size_t addr; + resource_size_t length; + + addr = phys_addr + map->device_map.memdev.offset; + length = map->device_map.memdev.size; + regs->memdev = devm_cxl_iomap_block(dev, addr, length); + if (!regs->memdev) + return -ENOMEM; + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, CXL); + +static void cxl_decode_regblock(u32 reg_lo, u32 reg_hi, + struct cxl_register_map *map) +{ + map->block_offset = ((u64)reg_hi << 32) | + (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); + map->barno = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); + map->reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); +} + +/** + * cxl_find_regblock() - Locate register blocks by type + * @pdev: The CXL PCI device to enumerate. + * @type: Register Block Indicator id + * @map: Enumeration output, clobbered on error + * + * Return: 0 if register block enumerated, negative error code otherwise + * + * A CXL DVSEC may point to one or more register blocks, search for them + * by @type. 
+ */ +int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map) +{ + u32 regloc_size, regblocks; + int regloc, i; + + map->block_offset = U64_MAX; + regloc = pci_find_dvsec_capability(pdev, PCI_DVSEC_VENDOR_ID_CXL, + CXL_DVSEC_REG_LOCATOR); + if (!regloc) + return -ENXIO; + + pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, ®loc_size); + regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size); + + regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET; + regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8; + + for (i = 0; i < regblocks; i++, regloc += 8) { + u32 reg_lo, reg_hi; + + pci_read_config_dword(pdev, regloc, ®_lo); + pci_read_config_dword(pdev, regloc + 4, ®_hi); + + cxl_decode_regblock(reg_lo, reg_hi, map); + + if (map->reg_type == type) + return 0; + } + + map->block_offset = U64_MAX; + return -ENODEV; +} +EXPORT_SYMBOL_NS_GPL(cxl_find_regblock, CXL); diff --git a/drivers/cxl/core/suspend.c b/drivers/cxl/core/suspend.c new file mode 100644 index 000000000..a5984d96e --- /dev/null +++ b/drivers/cxl/core/suspend.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include <linux/atomic.h> +#include <linux/export.h> +#include "cxlmem.h" + +static atomic_t mem_active; + +bool cxl_mem_active(void) +{ + return atomic_read(&mem_active) != 0; +} + +void cxl_mem_active_inc(void) +{ + atomic_inc(&mem_active); +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_active_inc, CXL); + +void cxl_mem_active_dec(void) +{ + atomic_dec(&mem_active); +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_active_dec, CXL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h new file mode 100644 index 000000000..7750ccb76 --- /dev/null +++ b/drivers/cxl/cxl.h @@ -0,0 +1,665 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2020 Intel Corporation. */ + +#ifndef __CXL_H__ +#define __CXL_H__ + +#include <linux/libnvdimm.h> +#include <linux/bitfield.h> +#include <linux/bitops.h> +#include <linux/log2.h> +#include <linux/io.h> + +/** + * DOC: cxl objects + * + * The CXL core objects like ports, decoders, and regions are shared + * between the subsystem drivers cxl_acpi, cxl_pci, and core drivers + * (port-driver, region-driver, nvdimm object-drivers... etc). 
+ */ + +/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ +#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K + +/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers*/ +#define CXL_CM_OFFSET 0x1000 +#define CXL_CM_CAP_HDR_OFFSET 0x0 +#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) +#define CM_CAP_HDR_CAP_ID 1 +#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) +#define CM_CAP_HDR_CAP_VERSION 1 +#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) +#define CM_CAP_HDR_CACHE_MEM_VERSION 1 +#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) +#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) + +#define CXL_CM_CAP_CAP_ID_HDM 0x5 +#define CXL_CM_CAP_CAP_HDM_VERSION 1 + +/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ +#define CXL_HDM_DECODER_CAP_OFFSET 0x0 +#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) +#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) +#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) +#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_ENABLE BIT(1) +#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) +#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) +#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) +#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) +#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) +#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) +#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) +#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) +#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) +#define CXL_HDM_DECODER0_CTRL_TYPE BIT(12) +#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) +#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) + +static inline int cxl_hdm_decoder_count(u32 cap_hdr) +{ + int val = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr); + + return val ? val * 2 : 1; +} + +/* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ +static inline int cxl_to_granularity(u16 ig, unsigned int *val) +{ + if (ig > 6) + return -EINVAL; + *val = 256 << ig; + return 0; +} + +/* Encode defined in CXL ECN "3, 6, 12 and 16-way memory Interleaving" */ +static inline int cxl_to_ways(u8 eniw, unsigned int *val) +{ + switch (eniw) { + case 0 ... 4: + *val = 1 << eniw; + break; + case 8 ... 
10: + *val = 3 << (eniw - 8); + break; + default: + return -EINVAL; + } + + return 0; +} + +static inline int granularity_to_cxl(int g, u16 *ig) +{ + if (g > SZ_16K || g < 256 || !is_power_of_2(g)) + return -EINVAL; + *ig = ilog2(g) - 8; + return 0; +} + +static inline int ways_to_cxl(unsigned int ways, u8 *iw) +{ + if (ways > 16) + return -EINVAL; + if (is_power_of_2(ways)) { + *iw = ilog2(ways); + return 0; + } + if (ways % 3) + return -EINVAL; + ways /= 3; + if (!is_power_of_2(ways)) + return -EINVAL; + *iw = ilog2(ways) + 8; + return 0; +} + +/* CXL 2.0 8.2.8.1 Device Capabilities Array Register */ +#define CXLDEV_CAP_ARRAY_OFFSET 0x0 +#define CXLDEV_CAP_ARRAY_CAP_ID 0 +#define CXLDEV_CAP_ARRAY_ID_MASK GENMASK_ULL(15, 0) +#define CXLDEV_CAP_ARRAY_COUNT_MASK GENMASK_ULL(47, 32) +/* CXL 2.0 8.2.8.2 CXL Device Capability Header Register */ +#define CXLDEV_CAP_HDR_CAP_ID_MASK GENMASK(15, 0) +/* CXL 2.0 8.2.8.2.1 CXL Device Capabilities */ +#define CXLDEV_CAP_CAP_ID_DEVICE_STATUS 0x1 +#define CXLDEV_CAP_CAP_ID_PRIMARY_MAILBOX 0x2 +#define CXLDEV_CAP_CAP_ID_SECONDARY_MAILBOX 0x3 +#define CXLDEV_CAP_CAP_ID_MEMDEV 0x4000 + +/* CXL 2.0 8.2.8.4 Mailbox Registers */ +#define CXLDEV_MBOX_CAPS_OFFSET 0x00 +#define CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0) +#define CXLDEV_MBOX_CTRL_OFFSET 0x04 +#define CXLDEV_MBOX_CTRL_DOORBELL BIT(0) +#define CXLDEV_MBOX_CMD_OFFSET 0x08 +#define CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0) +#define CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK GENMASK_ULL(36, 16) +#define CXLDEV_MBOX_STATUS_OFFSET 0x10 +#define CXLDEV_MBOX_STATUS_RET_CODE_MASK GENMASK_ULL(47, 32) +#define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18 +#define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20 + +/* + * Using struct_group() allows for per register-block-type helper routines, + * without requiring block-type agnostic code to include the prefix. 
+ */ +struct cxl_regs { + /* + * Common set of CXL Component register block base pointers + * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure + */ + struct_group_tagged(cxl_component_regs, component, + void __iomem *hdm_decoder; + ); + /* + * Common set of CXL Device register block base pointers + * @status: CXL 2.0 8.2.8.3 Device Status Registers + * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers + * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers + */ + struct_group_tagged(cxl_device_regs, device_regs, + void __iomem *status, *mbox, *memdev; + ); +}; + +struct cxl_reg_map { + bool valid; + unsigned long offset; + unsigned long size; +}; + +struct cxl_component_reg_map { + struct cxl_reg_map hdm_decoder; +}; + +struct cxl_device_reg_map { + struct cxl_reg_map status; + struct cxl_reg_map mbox; + struct cxl_reg_map memdev; +}; + +/** + * struct cxl_register_map - DVSEC harvested register block mapping parameters + * @base: virtual base of the register-block-BAR + @block_offset + * @block_offset: offset to start of register block in @barno + * @reg_type: see enum cxl_regloc_type + * @barno: PCI BAR number containing the register block + * @component_map: cxl_reg_map for component registers + * @device_map: cxl_reg_maps for device registers + */ +struct cxl_register_map { + void __iomem *base; + u64 block_offset; + u8 reg_type; + u8 barno; + union { + struct cxl_component_reg_map component_map; + struct cxl_device_reg_map device_map; + }; +}; + +void cxl_probe_component_regs(struct device *dev, void __iomem *base, + struct cxl_component_reg_map *map); +void cxl_probe_device_regs(struct device *dev, void __iomem *base, + struct cxl_device_reg_map *map); +int cxl_map_component_regs(struct pci_dev *pdev, + struct cxl_component_regs *regs, + struct cxl_register_map *map); +int cxl_map_device_regs(struct pci_dev *pdev, + struct cxl_device_regs *regs, + struct cxl_register_map *map); + +enum cxl_regloc_type; +int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, + resource_size_t length); + +#define CXL_RESOURCE_NONE ((resource_size_t) -1) +#define CXL_TARGET_STRLEN 20 + +/* + * cxl_decoder flags that define the type of memory / devices this + * decoder supports as well as configuration lock status See "CXL 2.0 + * 8.2.5.12.7 CXL HDM Decoder 0 Control Register" for details. 
+ */ +#define CXL_DECODER_F_RAM BIT(0) +#define CXL_DECODER_F_PMEM BIT(1) +#define CXL_DECODER_F_TYPE2 BIT(2) +#define CXL_DECODER_F_TYPE3 BIT(3) +#define CXL_DECODER_F_LOCK BIT(4) +#define CXL_DECODER_F_ENABLE BIT(5) +#define CXL_DECODER_F_MASK GENMASK(5, 0) + +enum cxl_decoder_type { + CXL_DECODER_ACCELERATOR = 2, + CXL_DECODER_EXPANDER = 3, +}; + +/* + * Current specification goes up to 8, double that seems a reasonable + * software max for the foreseeable future + */ +#define CXL_DECODER_MAX_INTERLEAVE 16 + +#define CXL_DECODER_MIN_GRANULARITY 256 + +/** + * struct cxl_decoder - Common CXL HDM Decoder Attributes + * @dev: this decoder's device + * @id: kernel device name id + * @hpa_range: Host physical address range mapped by this decoder + * @interleave_ways: number of cxl_dports in this decode + * @interleave_granularity: data stride per dport + * @target_type: accelerator vs expander (type2 vs type3) selector + * @region: currently assigned region for this decoder + * @flags: memory type capabilities and locking + * @commit: device/decoder-type specific callback to commit settings to hw + * @reset: device/decoder-type specific callback to reset hw settings +*/ +struct cxl_decoder { + struct device dev; + int id; + struct range hpa_range; + int interleave_ways; + int interleave_granularity; + enum cxl_decoder_type target_type; + struct cxl_region *region; + unsigned long flags; + int (*commit)(struct cxl_decoder *cxld); + int (*reset)(struct cxl_decoder *cxld); +}; + +/* + * CXL_DECODER_DEAD prevents endpoints from being reattached to regions + * while cxld_unregister() is running + */ +enum cxl_decoder_mode { + CXL_DECODER_NONE, + CXL_DECODER_RAM, + CXL_DECODER_PMEM, + CXL_DECODER_MIXED, + CXL_DECODER_DEAD, +}; + +/** + * struct cxl_endpoint_decoder - Endpoint / SPA to DPA decoder + * @cxld: base cxl_decoder_object + * @dpa_res: actively claimed DPA span of this decoder + * @skip: offset into @dpa_res where @cxld.hpa_range maps + * @mode: which memory type / access-mode-partition this decoder targets + * @pos: interleave position in @cxld.region + */ +struct cxl_endpoint_decoder { + struct cxl_decoder cxld; + struct resource *dpa_res; + resource_size_t skip; + enum cxl_decoder_mode mode; + int pos; +}; + +/** + * struct cxl_switch_decoder - Switch specific CXL HDM Decoder + * @cxld: base cxl_decoder object + * @target_lock: coordinate coherent reads of the target list + * @nr_targets: number of elements in @target + * @target: active ordered target list in current decoder configuration + * + * The 'switch' decoder type represents the decoder instances of cxl_port's that + * route from the root of a CXL memory decode topology to the endpoints. They + * come in two flavors, root-level decoders, statically defined by platform + * firmware, and mid-level decoders, where interleave-granularity, + * interleave-width, and the target list are mutable. 
+ */ +struct cxl_switch_decoder { + struct cxl_decoder cxld; + seqlock_t target_lock; + int nr_targets; + struct cxl_dport *target[]; +}; + + +/** + * struct cxl_root_decoder - Static platform CXL address decoder + * @res: host / parent resource for region allocations + * @region_id: region id for next region provisioning event + * @calc_hb: which host bridge covers the n'th position by granularity + * @cxlsd: base cxl switch decoder + */ +struct cxl_root_decoder { + struct resource *res; + atomic_t region_id; + struct cxl_dport *(*calc_hb)(struct cxl_root_decoder *cxlrd, int pos); + struct cxl_switch_decoder cxlsd; +}; + +/* + * enum cxl_config_state - State machine for region configuration + * @CXL_CONFIG_IDLE: Any sysfs attribute can be written freely + * @CXL_CONFIG_INTERLEAVE_ACTIVE: region size has been set, no more + * changes to interleave_ways or interleave_granularity + * @CXL_CONFIG_ACTIVE: All targets have been added the region is now + * active + * @CXL_CONFIG_RESET_PENDING: see commit_store() + * @CXL_CONFIG_COMMIT: Soft-config has been committed to hardware + */ +enum cxl_config_state { + CXL_CONFIG_IDLE, + CXL_CONFIG_INTERLEAVE_ACTIVE, + CXL_CONFIG_ACTIVE, + CXL_CONFIG_RESET_PENDING, + CXL_CONFIG_COMMIT, +}; + +/** + * struct cxl_region_params - region settings + * @state: allow the driver to lockdown further parameter changes + * @uuid: unique id for persistent regions + * @interleave_ways: number of endpoints in the region + * @interleave_granularity: capacity each endpoint contributes to a stripe + * @res: allocated iomem capacity for this region + * @targets: active ordered targets in current decoder configuration + * @nr_targets: number of targets + * + * State transitions are protected by the cxl_region_rwsem + */ +struct cxl_region_params { + enum cxl_config_state state; + uuid_t uuid; + int interleave_ways; + int interleave_granularity; + struct resource *res; + struct cxl_endpoint_decoder *targets[CXL_DECODER_MAX_INTERLEAVE]; + int nr_targets; +}; + +/** + * struct cxl_region - CXL region + * @dev: This region's device + * @id: This region's id. 
Id is globally unique across all regions + * @mode: Endpoint decoder allocation / access mode + * @type: Endpoint decoder target type + * @params: active + config params for the region + */ +struct cxl_region { + struct device dev; + int id; + enum cxl_decoder_mode mode; + enum cxl_decoder_type type; + struct cxl_region_params params; +}; + +/** + * enum cxl_nvdimm_brige_state - state machine for managing bus rescans + * @CXL_NVB_NEW: Set at bridge create and after cxl_pmem_wq is destroyed + * @CXL_NVB_DEAD: Set at brige unregistration to preclude async probing + * @CXL_NVB_ONLINE: Target state after successful ->probe() + * @CXL_NVB_OFFLINE: Target state after ->remove() or failed ->probe() + */ +enum cxl_nvdimm_brige_state { + CXL_NVB_NEW, + CXL_NVB_DEAD, + CXL_NVB_ONLINE, + CXL_NVB_OFFLINE, +}; + +struct cxl_nvdimm_bridge { + int id; + struct device dev; + struct cxl_port *port; + struct nvdimm_bus *nvdimm_bus; + struct nvdimm_bus_descriptor nd_desc; + struct work_struct state_work; + enum cxl_nvdimm_brige_state state; +}; + +struct cxl_nvdimm { + struct device dev; + struct cxl_memdev *cxlmd; + struct cxl_nvdimm_bridge *bridge; + struct xarray pmem_regions; +}; + +struct cxl_pmem_region_mapping { + struct cxl_memdev *cxlmd; + struct cxl_nvdimm *cxl_nvd; + u64 start; + u64 size; + int position; +}; + +struct cxl_pmem_region { + struct device dev; + struct cxl_region *cxlr; + struct nd_region *nd_region; + struct cxl_nvdimm_bridge *bridge; + struct range hpa_range; + int nr_mappings; + struct cxl_pmem_region_mapping mapping[]; +}; + +/** + * struct cxl_port - logical collection of upstream port devices and + * downstream port devices to construct a CXL memory + * decode hierarchy. + * @dev: this port's device + * @uport: PCI or platform device implementing the upstream port capability + * @host_bridge: Shortcut to the platform attach point for this port + * @id: id for port device-name + * @dports: cxl_dport instances referenced by decoders + * @endpoints: cxl_ep instances, endpoints that are a descendant of this port + * @regions: cxl_region_ref instances, regions mapped by this port + * @parent_dport: dport that points to this port in the parent + * @decoder_ida: allocator for decoder ids + * @nr_dports: number of entries in @dports + * @hdm_end: track last allocated HDM decoder instance for allocation ordering + * @commit_end: cursor to track highest committed decoder for commit ordering + * @component_reg_phys: component register capability base address (optional) + * @dead: last ep has been removed, force port re-creation + * @depth: How deep this port is relative to the root. depth 0 is the root. 
+ * @cdat: Cached CDAT data + * @cdat_available: Should a CDAT attribute be available in sysfs + */ +struct cxl_port { + struct device dev; + struct device *uport; + struct device *host_bridge; + int id; + struct xarray dports; + struct xarray endpoints; + struct xarray regions; + struct cxl_dport *parent_dport; + struct ida decoder_ida; + int nr_dports; + int hdm_end; + int commit_end; + resource_size_t component_reg_phys; + bool dead; + unsigned int depth; + struct cxl_cdat { + void *table; + size_t length; + } cdat; + bool cdat_available; +}; + +static inline struct cxl_dport * +cxl_find_dport_by_dev(struct cxl_port *port, const struct device *dport_dev) +{ + return xa_load(&port->dports, (unsigned long)dport_dev); +} + +/** + * struct cxl_dport - CXL downstream port + * @dport: PCI bridge or firmware device representing the downstream link + * @port_id: unique hardware identifier for dport in decoder target list + * @component_reg_phys: downstream port component registers + * @port: reference to cxl_port that contains this downstream port + */ +struct cxl_dport { + struct device *dport; + int port_id; + resource_size_t component_reg_phys; + struct cxl_port *port; +}; + +/** + * struct cxl_ep - track an endpoint's interest in a port + * @ep: device that hosts a generic CXL endpoint (expander or accelerator) + * @dport: which dport routes to this endpoint on @port + * @next: cxl switch port across the link attached to @dport NULL if + * attached to an endpoint + */ +struct cxl_ep { + struct device *ep; + struct cxl_dport *dport; + struct cxl_port *next; +}; + +/** + * struct cxl_region_ref - track a region's interest in a port + * @port: point in topology to install this reference + * @decoder: decoder assigned for @region in @port + * @region: region for this reference + * @endpoints: cxl_ep references for region members beneath @port + * @nr_targets_set: track how many targets have been programmed during setup + * @nr_eps: number of endpoints beneath @port + * @nr_targets: number of distinct targets needed to reach @nr_eps + */ +struct cxl_region_ref { + struct cxl_port *port; + struct cxl_decoder *decoder; + struct cxl_region *region; + struct xarray endpoints; + int nr_targets_set; + int nr_eps; + int nr_targets; +}; + +/* + * The platform firmware device hosting the root is also the top of the + * CXL port topology. All other CXL ports have another CXL port as their + * parent and their ->uport / host device is out-of-line of the port + * ancestry. 
+ */ +static inline bool is_cxl_root(struct cxl_port *port) +{ + return port->uport == port->dev.parent; +} + +bool is_cxl_port(struct device *dev); +struct cxl_port *to_cxl_port(struct device *dev); +struct pci_bus; +int devm_cxl_register_pci_bus(struct device *host, struct device *uport, + struct pci_bus *bus); +struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port); +struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, + resource_size_t component_reg_phys, + struct cxl_dport *parent_dport); +struct cxl_port *find_cxl_root(struct device *dev); +int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd); +int cxl_bus_rescan(void); +struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd, + struct cxl_dport **dport); +bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd); + +struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port, + struct device *dport, int port_id, + resource_size_t component_reg_phys); + +struct cxl_decoder *to_cxl_decoder(struct device *dev); +struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev); +struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev); +struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev); +bool is_root_decoder(struct device *dev); +bool is_switch_decoder(struct device *dev); +bool is_endpoint_decoder(struct device *dev); +struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets); +struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets); +int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); +struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port); +int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map); +int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld); +int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint); + +struct cxl_hdm; +struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port); +int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm); +int devm_cxl_add_passthrough_decoder(struct cxl_port *port); + +bool is_cxl_region(struct device *dev); + +extern struct bus_type cxl_bus_type; + +struct cxl_driver { + const char *name; + int (*probe)(struct device *dev); + void (*remove)(struct device *dev); + struct device_driver drv; + int id; +}; + +static inline struct cxl_driver *to_cxl_drv(struct device_driver *drv) +{ + return container_of(drv, struct cxl_driver, drv); +} + +int __cxl_driver_register(struct cxl_driver *cxl_drv, struct module *owner, + const char *modname); +#define cxl_driver_register(x) __cxl_driver_register(x, THIS_MODULE, KBUILD_MODNAME) +void cxl_driver_unregister(struct cxl_driver *cxl_drv); + +#define module_cxl_driver(__cxl_driver) \ + module_driver(__cxl_driver, cxl_driver_register, cxl_driver_unregister) + +#define CXL_DEVICE_NVDIMM_BRIDGE 1 +#define CXL_DEVICE_NVDIMM 2 +#define CXL_DEVICE_PORT 3 +#define CXL_DEVICE_ROOT 4 +#define CXL_DEVICE_MEMORY_EXPANDER 5 +#define CXL_DEVICE_REGION 6 +#define CXL_DEVICE_PMEM_REGION 7 + +#define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*") +#define CXL_MODALIAS_FMT "cxl:t%d" + +struct cxl_nvdimm_bridge *to_cxl_nvdimm_bridge(struct device *dev); +struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host, + struct cxl_port *port); +struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev); +bool is_cxl_nvdimm(struct device *dev); +bool is_cxl_nvdimm_bridge(struct device *dev); +int 
devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd); +struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *dev); + +#ifdef CONFIG_CXL_REGION +bool is_cxl_pmem_region(struct device *dev); +struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev); +#else +static inline bool is_cxl_pmem_region(struct device *dev) +{ + return false; +} +static inline struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) +{ + return NULL; +} +#endif + +/* + * Unit test builds overrides this to __weak, find the 'strong' version + * of these symbols in tools/testing/cxl/. + */ +#ifndef __mock +#define __mock static +#endif + +#endif /* __CXL_H__ */ diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h new file mode 100644 index 000000000..b58a5b782 --- /dev/null +++ b/drivers/cxl/cxlmem.h @@ -0,0 +1,419 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2020-2021 Intel Corporation. */ +#ifndef __CXL_MEM_H__ +#define __CXL_MEM_H__ +#include <uapi/linux/cxl_mem.h> +#include <linux/cdev.h> +#include "cxl.h" + +/* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */ +#define CXLMDEV_STATUS_OFFSET 0x0 +#define CXLMDEV_DEV_FATAL BIT(0) +#define CXLMDEV_FW_HALT BIT(1) +#define CXLMDEV_STATUS_MEDIA_STATUS_MASK GENMASK(3, 2) +#define CXLMDEV_MS_NOT_READY 0 +#define CXLMDEV_MS_READY 1 +#define CXLMDEV_MS_ERROR 2 +#define CXLMDEV_MS_DISABLED 3 +#define CXLMDEV_READY(status) \ + (FIELD_GET(CXLMDEV_STATUS_MEDIA_STATUS_MASK, status) == \ + CXLMDEV_MS_READY) +#define CXLMDEV_MBOX_IF_READY BIT(4) +#define CXLMDEV_RESET_NEEDED_MASK GENMASK(7, 5) +#define CXLMDEV_RESET_NEEDED_NOT 0 +#define CXLMDEV_RESET_NEEDED_COLD 1 +#define CXLMDEV_RESET_NEEDED_WARM 2 +#define CXLMDEV_RESET_NEEDED_HOT 3 +#define CXLMDEV_RESET_NEEDED_CXL 4 +#define CXLMDEV_RESET_NEEDED(status) \ + (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \ + CXLMDEV_RESET_NEEDED_NOT) + +/** + * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device + * @dev: driver core device object + * @cdev: char dev core object for ioctl operations + * @cxlds: The device state backing this device + * @detach_work: active memdev lost a port in its ancestry + * @id: id number of this memdev instance. 
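A hedged sketch of the driver-model API declared above (struct cxl_driver, module_cxl_driver(), the CXL_DEVICE_* ids and MODULE_ALIAS_CXL()): a hypothetical module that binds to port devices and merely logs the bind. All example_ names are invented; the real consumers are the mem.c, port.c and pmem.c drivers added later in this patch.

#include <linux/device.h>
#include <linux/module.h>
#include "cxl.h"

static int example_port_probe(struct device *dev)
{
	dev_dbg(dev, "example driver bound to %s\n", dev_name(dev));
	return 0;
}

static struct cxl_driver example_driver = {
	.name = "cxl_example",
	.probe = example_port_probe,
	.id = CXL_DEVICE_PORT,
};
module_cxl_driver(example_driver);

MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS(CXL);
MODULE_ALIAS_CXL(CXL_DEVICE_PORT);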
+ */ +struct cxl_memdev { + struct device dev; + struct cdev cdev; + struct cxl_dev_state *cxlds; + struct work_struct detach_work; + int id; +}; + +static inline struct cxl_memdev *to_cxl_memdev(struct device *dev) +{ + return container_of(dev, struct cxl_memdev, dev); +} + +static inline struct cxl_port *cxled_to_port(struct cxl_endpoint_decoder *cxled) +{ + return to_cxl_port(cxled->cxld.dev.parent); +} + +static inline struct cxl_port *cxlrd_to_port(struct cxl_root_decoder *cxlrd) +{ + return to_cxl_port(cxlrd->cxlsd.cxld.dev.parent); +} + +static inline struct cxl_memdev * +cxled_to_memdev(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = to_cxl_port(cxled->cxld.dev.parent); + + return to_cxl_memdev(port->uport); +} + +bool is_cxl_memdev(struct device *dev); +static inline bool is_cxl_endpoint(struct cxl_port *port) +{ + return is_cxl_memdev(port->uport); +} + +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds); +int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, + resource_size_t base, resource_size_t len, + resource_size_t skipped); + +static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port, + struct cxl_memdev *cxlmd) +{ + if (!port) + return NULL; + + return xa_load(&port->endpoints, (unsigned long)&cxlmd->dev); +} + +/** + * struct cxl_mbox_cmd - A command to be submitted to hardware. + * @opcode: (input) The command set and command submitted to hardware. + * @payload_in: (input) Pointer to the input payload. + * @payload_out: (output) Pointer to the output payload. Must be allocated by + * the caller. + * @size_in: (input) Number of bytes to load from @payload_in. + * @size_out: (input) Max number of bytes loaded into @payload_out. + * (output) Number of bytes generated by the device. For fixed size + * outputs commands this is always expected to be deterministic. For + * variable sized output commands, it tells the exact number of bytes + * written. + * @return_code: (output) Error code returned from hardware. + * + * This is the primary mechanism used to send commands to the hardware. + * All the fields except @payload_* correspond exactly to the fields described in + * Command Register section of the CXL 2.0 8.2.8.4.5. @payload_in and + * @payload_out are written to, and read from the Command Payload Registers + * defined in CXL 2.0 8.2.8.4.8. 
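To make the container relationships above concrete: an endpoint port's ->uport is the memdev's device, so the memdev (and its device state) can be recovered from the port. A hypothetical helper, assuming only is_cxl_endpoint() and to_cxl_memdev() defined above:

#include "cxlmem.h"

static struct cxl_dev_state *example_port_to_cxlds(struct cxl_port *port)
{
	/* only endpoint ports are hosted by a memdev */
	if (!is_cxl_endpoint(port))
		return NULL;

	return to_cxl_memdev(port->uport)->cxlds;
}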
+ */ +struct cxl_mbox_cmd { + u16 opcode; + void *payload_in; + void *payload_out; + size_t size_in; + size_t size_out; + u16 return_code; +}; + +/* + * Per CXL 2.0 Section 8.2.8.4.5.1 + */ +#define CMD_CMD_RC_TABLE \ + C(SUCCESS, 0, NULL), \ + C(BACKGROUND, -ENXIO, "background cmd started successfully"), \ + C(INPUT, -ENXIO, "cmd input was invalid"), \ + C(UNSUPPORTED, -ENXIO, "cmd is not supported"), \ + C(INTERNAL, -ENXIO, "internal device error"), \ + C(RETRY, -ENXIO, "temporary error, retry once"), \ + C(BUSY, -ENXIO, "ongoing background operation"), \ + C(MEDIADISABLED, -ENXIO, "media access is disabled"), \ + C(FWINPROGRESS, -ENXIO, "one FW package can be transferred at a time"), \ + C(FWOOO, -ENXIO, "FW package content was transferred out of order"), \ + C(FWAUTH, -ENXIO, "FW package authentication failed"), \ + C(FWSLOT, -ENXIO, "FW slot is not supported for requested operation"), \ + C(FWROLLBACK, -ENXIO, "rolled back to the previous active FW"), \ + C(FWRESET, -ENXIO, "FW failed to activate, needs cold reset"), \ + C(HANDLE, -ENXIO, "one or more Event Record Handles were invalid"), \ + C(PADDR, -ENXIO, "physical address specified is invalid"), \ + C(POISONLMT, -ENXIO, "poison injection limit has been reached"), \ + C(MEDIAFAILURE, -ENXIO, "permanent issue with the media"), \ + C(ABORT, -ENXIO, "background cmd was aborted by device"), \ + C(SECURITY, -ENXIO, "not valid in the current security state"), \ + C(PASSPHRASE, -ENXIO, "phrase doesn't match current set passphrase"), \ + C(MBUNSUPPORTED, -ENXIO, "unsupported on the mailbox it was issued on"),\ + C(PAYLOADLEN, -ENXIO, "invalid payload length") + +#undef C +#define C(a, b, c) CXL_MBOX_CMD_RC_##a +enum { CMD_CMD_RC_TABLE }; +#undef C +#define C(a, b, c) { b, c } +struct cxl_mbox_cmd_rc { + int err; + const char *desc; +}; + +static const +struct cxl_mbox_cmd_rc cxl_mbox_cmd_rctable[] ={ CMD_CMD_RC_TABLE }; +#undef C + +static inline const char *cxl_mbox_cmd_rc2str(struct cxl_mbox_cmd *mbox_cmd) +{ + return cxl_mbox_cmd_rctable[mbox_cmd->return_code].desc; +} + +static inline int cxl_mbox_cmd_rc2errno(struct cxl_mbox_cmd *mbox_cmd) +{ + return cxl_mbox_cmd_rctable[mbox_cmd->return_code].err; +} + +/* + * CXL 2.0 - Memory capacity multiplier + * See Section 8.2.9.5 + * + * Volatile, Persistent, and Partition capacities are specified to be in + * multiples of 256MB - define a multiplier to convert to/from bytes. + */ +#define CXL_CAPACITY_MULTIPLIER SZ_256M + +/** + * struct cxl_endpoint_dvsec_info - Cached DVSEC info + * @mem_enabled: cached value of mem_enabled in the DVSEC, PCIE_DEVICE + * @ranges: Number of active HDM ranges this device uses. + * @dvsec_range: cached attributes of the ranges in the DVSEC, PCIE_DEVICE + */ +struct cxl_endpoint_dvsec_info { + bool mem_enabled; + int ranges; + struct range dvsec_range[2]; +}; + +/** + * struct cxl_dev_state - The driver device state + * + * cxl_dev_state represents the CXL driver/device state. It provides an + * interface to mailbox commands as well as some cached data about the device. + * Currently only memory devices are represented. + * + * @dev: The device associated with this CXL state + * @regs: Parsed register blocks + * @cxl_dvsec: Offset to the PCIe device DVSEC + * @payload_size: Size of space for payload + * (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register) + * @lsa_size: Size of Label Storage Area + * (CXL 2.0 8.2.9.5.1.1 Identify Memory Device) + * @mbox_mutex: Mutex to synchronize mailbox access. + * @firmware_version: Firmware version for the memory device. 
+ * @enabled_cmds: Hardware commands found enabled in CEL. + * @exclusive_cmds: Commands that are kernel-internal only + * @dpa_res: Overall DPA resource tree for the device + * @pmem_res: Active Persistent memory capacity configuration + * @ram_res: Active Volatile memory capacity configuration + * @total_bytes: sum of all possible capacities + * @volatile_only_bytes: hard volatile capacity + * @persistent_only_bytes: hard persistent capacity + * @partition_align_bytes: alignment size for partition-able capacity + * @active_volatile_bytes: sum of hard + soft volatile + * @active_persistent_bytes: sum of hard + soft persistent + * @next_volatile_bytes: volatile capacity change pending device reset + * @next_persistent_bytes: persistent capacity change pending device reset + * @component_reg_phys: register base of component registers + * @info: Cached DVSEC information about the device. + * @serial: PCIe Device Serial Number + * @doe_mbs: PCI DOE mailbox array + * @mbox_send: @dev specific transport for transmitting mailbox commands + * + * See section 8.2.9.5.2 Capacity Configuration and Label Storage for + * details on capacity parameters. + */ +struct cxl_dev_state { + struct device *dev; + + struct cxl_regs regs; + int cxl_dvsec; + + size_t payload_size; + size_t lsa_size; + struct mutex mbox_mutex; /* Protects device mailbox and firmware */ + char firmware_version[0x10]; + DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); + DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); + + struct resource dpa_res; + struct resource pmem_res; + struct resource ram_res; + u64 total_bytes; + u64 volatile_only_bytes; + u64 persistent_only_bytes; + u64 partition_align_bytes; + + u64 active_volatile_bytes; + u64 active_persistent_bytes; + u64 next_volatile_bytes; + u64 next_persistent_bytes; + + resource_size_t component_reg_phys; + u64 serial; + + struct xarray doe_mbs; + + int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd); +}; + +enum cxl_opcode { + CXL_MBOX_OP_INVALID = 0x0000, + CXL_MBOX_OP_RAW = CXL_MBOX_OP_INVALID, + CXL_MBOX_OP_GET_FW_INFO = 0x0200, + CXL_MBOX_OP_ACTIVATE_FW = 0x0202, + CXL_MBOX_OP_GET_SUPPORTED_LOGS = 0x0400, + CXL_MBOX_OP_GET_LOG = 0x0401, + CXL_MBOX_OP_IDENTIFY = 0x4000, + CXL_MBOX_OP_GET_PARTITION_INFO = 0x4100, + CXL_MBOX_OP_SET_PARTITION_INFO = 0x4101, + CXL_MBOX_OP_GET_LSA = 0x4102, + CXL_MBOX_OP_SET_LSA = 0x4103, + CXL_MBOX_OP_GET_HEALTH_INFO = 0x4200, + CXL_MBOX_OP_GET_ALERT_CONFIG = 0x4201, + CXL_MBOX_OP_SET_ALERT_CONFIG = 0x4202, + CXL_MBOX_OP_GET_SHUTDOWN_STATE = 0x4203, + CXL_MBOX_OP_SET_SHUTDOWN_STATE = 0x4204, + CXL_MBOX_OP_GET_POISON = 0x4300, + CXL_MBOX_OP_INJECT_POISON = 0x4301, + CXL_MBOX_OP_CLEAR_POISON = 0x4302, + CXL_MBOX_OP_GET_SCAN_MEDIA_CAPS = 0x4303, + CXL_MBOX_OP_SCAN_MEDIA = 0x4304, + CXL_MBOX_OP_GET_SCAN_MEDIA = 0x4305, + CXL_MBOX_OP_MAX = 0x10000 +}; + +#define DEFINE_CXL_CEL_UUID \ + UUID_INIT(0xda9c0b5, 0xbf41, 0x4b78, 0x8f, 0x79, 0x96, 0xb1, 0x62, \ + 0x3b, 0x3f, 0x17) + +#define DEFINE_CXL_VENDOR_DEBUG_UUID \ + UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ + 0x40, 0x3d, 0x86) + +struct cxl_mbox_get_supported_logs { + __le16 entries; + u8 rsvd[6]; + struct cxl_gsl_entry { + uuid_t uuid; + __le32 size; + } __packed entry[]; +} __packed; + +struct cxl_cel_entry { + __le16 opcode; + __le16 effect; +} __packed; + +struct cxl_mbox_get_log { + uuid_t uuid; + __le32 offset; + __le32 length; +} __packed; + +/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */ +struct cxl_mbox_identify { + 
char fw_revision[0x10]; + __le64 total_capacity; + __le64 volatile_capacity; + __le64 persistent_capacity; + __le64 partition_align; + __le16 info_event_log_size; + __le16 warning_event_log_size; + __le16 failure_event_log_size; + __le16 fatal_event_log_size; + __le32 lsa_size; + u8 poison_list_max_mer[3]; + __le16 inject_poison_limit; + u8 poison_caps; + u8 qos_telemetry_caps; +} __packed; + +struct cxl_mbox_get_partition_info { + __le64 active_volatile_cap; + __le64 active_persistent_cap; + __le64 next_volatile_cap; + __le64 next_persistent_cap; +} __packed; + +struct cxl_mbox_get_lsa { + __le32 offset; + __le32 length; +} __packed; + +struct cxl_mbox_set_lsa { + __le32 offset; + __le32 reserved; + u8 data[]; +} __packed; + +struct cxl_mbox_set_partition_info { + __le64 volatile_capacity; + u8 flags; +} __packed; + +#define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) + +/** + * struct cxl_mem_command - Driver representation of a memory device command + * @info: Command information as it exists for the UAPI + * @opcode: The actual bits used for the mailbox protocol + * @flags: Set of flags effecting driver behavior. + * + * * %CXL_CMD_FLAG_FORCE_ENABLE: In cases of error, commands with this flag + * will be enabled by the driver regardless of what hardware may have + * advertised. + * + * The cxl_mem_command is the driver's internal representation of commands that + * are supported by the driver. Some of these commands may not be supported by + * the hardware. The driver will use @info to validate the fields passed in by + * the user then submit the @opcode to the hardware. + * + * See struct cxl_command_info. + */ +struct cxl_mem_command { + struct cxl_command_info info; + enum cxl_opcode opcode; + u32 flags; +#define CXL_CMD_FLAG_NONE 0 +#define CXL_CMD_FLAG_FORCE_ENABLE BIT(0) +}; + +int cxl_mbox_send_cmd(struct cxl_dev_state *cxlds, u16 opcode, void *in, + size_t in_size, void *out, size_t out_size); +int cxl_dev_state_identify(struct cxl_dev_state *cxlds); +int cxl_await_media_ready(struct cxl_dev_state *cxlds); +int cxl_enumerate_cmds(struct cxl_dev_state *cxlds); +int cxl_mem_create_range_info(struct cxl_dev_state *cxlds); +struct cxl_dev_state *cxl_dev_state_create(struct device *dev); +void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); +void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); +#ifdef CONFIG_CXL_SUSPEND +void cxl_mem_active_inc(void); +void cxl_mem_active_dec(void); +#else +static inline void cxl_mem_active_inc(void) +{ +} +static inline void cxl_mem_active_dec(void) +{ +} +#endif + +struct cxl_hdm { + struct cxl_component_regs regs; + unsigned int decoder_count; + unsigned int target_count; + unsigned int interleave_mask; + struct cxl_port *port; +}; + +struct seq_file; +struct dentry *cxl_debugfs_create_dir(const char *dir); +void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds); +#endif /* __CXL_MEM_H__ */ diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h new file mode 100644 index 000000000..d6fafa1f9 --- /dev/null +++ b/drivers/cxl/cxlpci.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2020 Intel Corporation. All rights reserved. */ +#ifndef __CXL_PCI_H__ +#define __CXL_PCI_H__ +#include <linux/pci.h> +#include "cxl.h" + +#define CXL_MEMORY_PROGIF 0x10 + +/* + * See section 8.1 Configuration Space Registers in the CXL 2.0 + * Specification. Names are taken straight from the specification with "CXL" and + * "DVSEC" redundancies removed. 
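As a caller-side illustration of the mailbox wrapper and the Identify payload declared above in cxlmem.h, here is a hedged sketch that issues CXL_MBOX_OP_IDENTIFY and converts the 256MB-granularity capacity field to bytes, roughly what cxl_dev_state_identify() does internally. The example_ helper is hypothetical and omits the error handling a real caller needs.

#include "cxlmem.h"

static int example_total_capacity(struct cxl_dev_state *cxlds, u64 *bytes)
{
	struct cxl_mbox_identify id;
	int rc;

	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0,
			       &id, sizeof(id));
	if (rc < 0)
		return rc;

	/* capacity fields are little-endian multiples of 256MB */
	*bytes = le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
	return 0;
}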
When obvious, abbreviations may be used. + */ +#define PCI_DVSEC_HEADER1_LENGTH_MASK GENMASK(31, 20) +#define PCI_DVSEC_VENDOR_ID_CXL 0x1E98 + +/* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */ +#define CXL_DVSEC_PCIE_DEVICE 0 +#define CXL_DVSEC_CAP_OFFSET 0xA +#define CXL_DVSEC_MEM_CAPABLE BIT(2) +#define CXL_DVSEC_HDM_COUNT_MASK GENMASK(5, 4) +#define CXL_DVSEC_CTRL_OFFSET 0xC +#define CXL_DVSEC_MEM_ENABLE BIT(2) +#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) +#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) +#define CXL_DVSEC_MEM_INFO_VALID BIT(0) +#define CXL_DVSEC_MEM_ACTIVE BIT(1) +#define CXL_DVSEC_MEM_SIZE_LOW_MASK GENMASK(31, 28) +#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) +#define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28) + +#define CXL_DVSEC_RANGE_MAX 2 + +/* CXL 2.0 8.1.4: Non-CXL Function Map DVSEC */ +#define CXL_DVSEC_FUNCTION_MAP 2 + +/* CXL 2.0 8.1.5: CXL 2.0 Extensions DVSEC for Ports */ +#define CXL_DVSEC_PORT_EXTENSIONS 3 + +/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */ +#define CXL_DVSEC_PORT_GPF 4 + +/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */ +#define CXL_DVSEC_DEVICE_GPF 5 + +/* CXL 2.0 8.1.8: PCIe DVSEC for Flex Bus Port */ +#define CXL_DVSEC_PCIE_FLEXBUS_PORT 7 + +/* CXL 2.0 8.1.9: Register Locator DVSEC */ +#define CXL_DVSEC_REG_LOCATOR 8 +#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC +#define CXL_DVSEC_REG_LOCATOR_BIR_MASK GENMASK(2, 0) +#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK GENMASK(15, 8) +#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK GENMASK(31, 16) + +/* Register Block Identifier (RBI) */ +enum cxl_regloc_type { + CXL_REGLOC_RBI_EMPTY = 0, + CXL_REGLOC_RBI_COMPONENT, + CXL_REGLOC_RBI_VIRT, + CXL_REGLOC_RBI_MEMDEV, + CXL_REGLOC_RBI_TYPES +}; + +static inline resource_size_t cxl_regmap_to_base(struct pci_dev *pdev, + struct cxl_register_map *map) +{ + if (map->block_offset == U64_MAX) + return CXL_RESOURCE_NONE; + + return pci_resource_start(pdev, map->barno) + map->block_offset; +} + +struct cdat_header { + __le32 length; + u8 revision; + u8 checksum; + u8 reserved[6]; + __le32 sequence; +} __packed; + +struct cdat_entry_header { + u8 type; + u8 reserved; + __le16 length; +} __packed; + +int devm_cxl_port_enumerate_dports(struct cxl_port *port); +struct cxl_dev_state; +int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm); +void read_cdat_data(struct cxl_port *port); +#endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c new file mode 100644 index 000000000..80263d12a --- /dev/null +++ b/drivers/cxl/mem.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include "cxlmem.h" +#include "cxlpci.h" + +/** + * DOC: cxl mem + * + * CXL memory endpoint devices and switches are CXL capable devices that are + * participating in CXL.mem protocol. Their functionality builds on top of the + * CXL.io protocol that allows enumerating and configuring components via + * standard PCI mechanisms. + * + * The cxl_mem driver owns kicking off the enumeration of this CXL.mem + * capability. With the detection of a CXL capable endpoint, the driver will + * walk up to find the platform specific port it is connected to, and determine + * if there are intervening switches in the path. 
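For orientation, a hedged sketch of how the Register Locator masks defined above in cxlpci.h are meant to be combined, mirroring the decode performed in cxl_core's regs.c (not part of this file). Here lo and hi stand for the low and high dwords of one register-block entry read from config space, and the example_ name is hypothetical.

#include <linux/bitfield.h>
#include "cxlpci.h"

static void example_decode_regblock(u32 lo, u32 hi,
				    struct cxl_register_map *map)
{
	map->barno = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, lo);
	map->reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, lo);
	/* low bits of the offset live in the same dword as BIR and block id */
	map->block_offset = ((u64)hi << 32) |
			    (lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK);
}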
If there are switches, a + * secondary action is to enumerate those (implemented in cxl_core). Finally the + * cxl_mem driver adds the device it is bound to as a CXL endpoint-port for use + * in higher level operations. + */ + +static void enable_suspend(void *data) +{ + cxl_mem_active_dec(); +} + +static void remove_debugfs(void *dentry) +{ + debugfs_remove_recursive(dentry); +} + +static int cxl_mem_dpa_show(struct seq_file *file, void *data) +{ + struct device *dev = file->private; + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + + cxl_dpa_debug(file, cxlmd->cxlds); + + return 0; +} + +static int devm_cxl_add_endpoint(struct cxl_memdev *cxlmd, + struct cxl_dport *parent_dport) +{ + struct cxl_port *parent_port = parent_dport->port; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_port *endpoint, *iter, *down; + int rc; + + /* + * Now that the path to the root is established record all the + * intervening ports in the chain. + */ + for (iter = parent_port, down = NULL; !is_cxl_root(iter); + down = iter, iter = to_cxl_port(iter->dev.parent)) { + struct cxl_ep *ep; + + ep = cxl_ep_load(iter, cxlmd); + ep->next = down; + } + + endpoint = devm_cxl_add_port(&parent_port->dev, &cxlmd->dev, + cxlds->component_reg_phys, parent_dport); + if (IS_ERR(endpoint)) + return PTR_ERR(endpoint); + + rc = cxl_endpoint_autoremove(cxlmd, endpoint); + if (rc) + return rc; + + if (!endpoint->dev.driver) { + dev_err(&cxlmd->dev, "%s failed probe\n", + dev_name(&endpoint->dev)); + return -ENXIO; + } + + return 0; +} + +static int cxl_mem_probe(struct device *dev) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_port *parent_port; + struct cxl_dport *dport; + struct dentry *dentry; + int rc; + + /* + * Someone is trying to reattach this device after it lost its port + * connection (an endpoint port previously registered by this memdev was + * disabled). This racy check is ok because if the port is still gone, + * no harm done, and if the port hierarchy comes back it will re-trigger + * this probe. Port rescan and memdev detach work share the same + * single-threaded workqueue. + */ + if (work_pending(&cxlmd->detach_work)) + return -EBUSY; + + dentry = cxl_debugfs_create_dir(dev_name(dev)); + debugfs_create_devm_seqfile(dev, "dpamem", dentry, cxl_mem_dpa_show); + rc = devm_add_action_or_reset(dev, remove_debugfs, dentry); + if (rc) + return rc; + + rc = devm_cxl_enumerate_ports(cxlmd); + if (rc) + return rc; + + parent_port = cxl_mem_find_port(cxlmd, &dport); + if (!parent_port) { + dev_err(dev, "CXL port topology not found\n"); + return -ENXIO; + } + + device_lock(&parent_port->dev); + if (!parent_port->dev.driver) { + dev_err(dev, "CXL port topology %s not enabled\n", + dev_name(&parent_port->dev)); + rc = -ENXIO; + goto unlock; + } + + rc = devm_cxl_add_endpoint(cxlmd, dport); +unlock: + device_unlock(&parent_port->dev); + put_device(&parent_port->dev); + if (rc) + return rc; + + /* + * The kernel may be operating out of CXL memory on this device, + * there is no spec defined way to determine whether this device + * preserves contents over suspend, and there is no simple way + * to arrange for the suspend image to avoid CXL memory which + * would setup a circular dependency between PCI resume and save + * state restoration. + * + * TODO: support suspend when all the regions this device is + * hosting are locked and covered by the system address map, + * i.e. platform firmware owns restoring the HDM configuration + * that it locked. 
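The chain recorded above by devm_cxl_add_endpoint() can be followed back down toward the endpoint: each ancestor port caches a struct cxl_ep whose ->next member names the next port below it. A hypothetical debug helper, assuming only cxl_ep_load() from cxlmem.h, to make that data structure concrete:

#include "cxlmem.h"

static void example_dump_path(struct cxl_port *ancestor,
			      struct cxl_memdev *cxlmd)
{
	struct cxl_port *iter = ancestor;

	while (iter) {
		struct cxl_ep *ep = cxl_ep_load(iter, cxlmd);

		dev_dbg(&cxlmd->dev, "reached via %s\n", dev_name(&iter->dev));
		iter = ep ? ep->next : NULL;
	}
}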
+ */ + cxl_mem_active_inc(); + return devm_add_action_or_reset(dev, enable_suspend, NULL); +} + +static struct cxl_driver cxl_mem_driver = { + .name = "cxl_mem", + .probe = cxl_mem_probe, + .id = CXL_DEVICE_MEMORY_EXPANDER, +}; + +module_cxl_driver(cxl_mem_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(CXL); +MODULE_ALIAS_CXL(CXL_DEVICE_MEMORY_EXPANDER); +/* + * create_endpoint() wants to validate port driver attach immediately after + * endpoint registration. + */ +MODULE_SOFTDEP("pre: cxl_port"); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c new file mode 100644 index 000000000..faeb5d9d7 --- /dev/null +++ b/drivers/cxl/pci.c @@ -0,0 +1,525 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. All rights reserved. */ +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/moduleparam.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/sizes.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/pci-doe.h> +#include <linux/io.h> +#include "cxlmem.h" +#include "cxlpci.h" +#include "cxl.h" + +/** + * DOC: cxl pci + * + * This implements the PCI exclusive functionality for a CXL device as it is + * defined by the Compute Express Link specification. CXL devices may surface + * certain functionality even if it isn't CXL enabled. While this driver is + * focused around the PCI specific aspects of a CXL device, it binds to the + * specific CXL memory device class code, and therefore the implementation of + * cxl_pci is focused around CXL memory devices. + * + * The driver has several responsibilities, mainly: + * - Create the memX device and register on the CXL bus. + * - Enumerate device's register interface and map them. + * - Registers nvdimm bridge device with cxl_core. + * - Registers a CXL mailbox with cxl_core. + */ + +#define cxl_doorbell_busy(cxlds) \ + (readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) & \ + CXLDEV_MBOX_CTRL_DOORBELL) + +/* CXL 2.0 - 8.2.8.4 */ +#define CXL_MAILBOX_TIMEOUT_MS (2 * HZ) + +/* + * CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to + * dictate how long to wait for the mailbox to become ready. The new + * field allows the device to tell software the amount of time to wait + * before mailbox ready. This field per the spec theoretically allows + * for up to 255 seconds. 255 seconds is unreasonably long, its longer + * than the maximum SATA port link recovery wait. Default to 60 seconds + * until someone builds a CXL device that needs more time in practice. + */ +static unsigned short mbox_ready_timeout = 60; +module_param(mbox_ready_timeout, ushort, 0644); +MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready"); + +static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds) +{ + const unsigned long start = jiffies; + unsigned long end = start; + + while (cxl_doorbell_busy(cxlds)) { + end = jiffies; + + if (time_after(end, start + CXL_MAILBOX_TIMEOUT_MS)) { + /* Check again in case preempted before timeout test */ + if (!cxl_doorbell_busy(cxlds)) + break; + return -ETIMEDOUT; + } + cpu_relax(); + } + + dev_dbg(cxlds->dev, "Doorbell wait took %dms", + jiffies_to_msecs(end) - jiffies_to_msecs(start)); + return 0; +} + +#define cxl_err(dev, status, msg) \ + dev_err_ratelimited(dev, msg ", device state %s%s\n", \ + status & CXLMDEV_DEV_FATAL ? " fatal" : "", \ + status & CXLMDEV_FW_HALT ? 
" firmware-halt" : "") + +#define cxl_cmd_err(dev, cmd, status, msg) \ + dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n", \ + (cmd)->opcode, \ + status & CXLMDEV_DEV_FATAL ? " fatal" : "", \ + status & CXLMDEV_FW_HALT ? " firmware-halt" : "") + +/** + * __cxl_pci_mbox_send_cmd() - Execute a mailbox command + * @cxlds: The device state to communicate with. + * @mbox_cmd: Command to send to the memory device. + * + * Context: Any context. Expects mbox_mutex to be held. + * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success. + * Caller should check the return code in @mbox_cmd to make sure it + * succeeded. + * + * This is a generic form of the CXL mailbox send command thus only using the + * registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory + * devices, and perhaps other types of CXL devices may have further information + * available upon error conditions. Driver facilities wishing to send mailbox + * commands should use the wrapper command. + * + * The CXL spec allows for up to two mailboxes. The intention is for the primary + * mailbox to be OS controlled and the secondary mailbox to be used by system + * firmware. This allows the OS and firmware to communicate with the device and + * not need to coordinate with each other. The driver only uses the primary + * mailbox. + */ +static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds, + struct cxl_mbox_cmd *mbox_cmd) +{ + void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET; + struct device *dev = cxlds->dev; + u64 cmd_reg, status_reg; + size_t out_len; + int rc; + + lockdep_assert_held(&cxlds->mbox_mutex); + + /* + * Here are the steps from 8.2.8.4 of the CXL 2.0 spec. + * 1. Caller reads MB Control Register to verify doorbell is clear + * 2. Caller writes Command Register + * 3. Caller writes Command Payload Registers if input payload is non-empty + * 4. Caller writes MB Control Register to set doorbell + * 5. Caller either polls for doorbell to be clear or waits for interrupt if configured + * 6. Caller reads MB Status Register to fetch Return code + * 7. If command successful, Caller reads Command Register to get Payload Length + * 8. If output payload is non-empty, host reads Command Payload Registers + * + * Hardware is free to do whatever it wants before the doorbell is rung, + * and isn't allowed to change anything after it clears the doorbell. As + * such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can + * also happen in any order (though some orders might not make sense). 
+ */ + + /* #1 */ + if (cxl_doorbell_busy(cxlds)) { + u64 md_status = + readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); + + cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, + "mailbox queue busy"); + return -EBUSY; + } + + cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK, + mbox_cmd->opcode); + if (mbox_cmd->size_in) { + if (WARN_ON(!mbox_cmd->payload_in)) + return -EINVAL; + + cmd_reg |= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, + mbox_cmd->size_in); + memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in); + } + + /* #2, #3 */ + writeq(cmd_reg, cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET); + + /* #4 */ + dev_dbg(dev, "Sending command\n"); + writel(CXLDEV_MBOX_CTRL_DOORBELL, + cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET); + + /* #5 */ + rc = cxl_pci_mbox_wait_for_doorbell(cxlds); + if (rc == -ETIMEDOUT) { + u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); + + cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout"); + return rc; + } + + /* #6 */ + status_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET); + mbox_cmd->return_code = + FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg); + + if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) { + dev_dbg(dev, "Mailbox operation had an error: %s\n", + cxl_mbox_cmd_rc2str(mbox_cmd)); + return 0; /* completed but caller must check return_code */ + } + + /* #7 */ + cmd_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET); + out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg); + + /* #8 */ + if (out_len && mbox_cmd->payload_out) { + /* + * Sanitize the copy. If hardware misbehaves, out_len per the + * spec can actually be greater than the max allowed size (21 + * bits available but spec defined 1M max). The caller also may + * have requested less data than the hardware supplied even + * within spec. + */ + size_t n = min3(mbox_cmd->size_out, cxlds->payload_size, out_len); + + memcpy_fromio(mbox_cmd->payload_out, payload, n); + mbox_cmd->size_out = n; + } else { + mbox_cmd->size_out = 0; + } + + return 0; +} + +static int cxl_pci_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd) +{ + int rc; + + mutex_lock_io(&cxlds->mbox_mutex); + rc = __cxl_pci_mbox_send_cmd(cxlds, cmd); + mutex_unlock(&cxlds->mbox_mutex); + + return rc; +} + +static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds) +{ + const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET); + unsigned long timeout; + u64 md_status; + + timeout = jiffies + mbox_ready_timeout * HZ; + do { + md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); + if (md_status & CXLMDEV_MBOX_IF_READY) + break; + if (msleep_interruptible(100)) + break; + } while (!time_after(jiffies, timeout)); + + if (!(md_status & CXLMDEV_MBOX_IF_READY)) { + cxl_err(cxlds->dev, md_status, + "timeout awaiting mailbox ready"); + return -ETIMEDOUT; + } + + /* + * A command may be in flight from a previous driver instance, + * think kexec, do one doorbell wait so that + * __cxl_pci_mbox_send_cmd() can assume that it is the only + * source for future doorbell busy events. + */ + if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) { + cxl_err(cxlds->dev, md_status, "timeout awaiting mailbox idle"); + return -ETIMEDOUT; + } + + cxlds->mbox_send = cxl_pci_mbox_send; + cxlds->payload_size = + 1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap); + + /* + * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register + * + * If the size is too small, mandatory commands will not work and so + * there's no point in going forward. 
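To connect the send path above with its callers, here is a hedged sketch that fills a struct cxl_mbox_cmd and dispatches it through the mbox_send() hook wired up in cxl_pci_setup_mailbox(). In-kernel users are expected to go through the cxl_mbox_send_cmd() wrapper in cxlmem.h instead; the example_ name and the 0x50-byte scratch buffer are arbitrary.

static int example_get_fw_info(struct cxl_dev_state *cxlds)
{
	u8 out[0x50];	/* arbitrary scratch space for the output payload */
	struct cxl_mbox_cmd cmd = {
		.opcode = CXL_MBOX_OP_GET_FW_INFO,
		.payload_out = out,
		.size_out = sizeof(out),
	};
	int rc;

	rc = cxlds->mbox_send(cxlds, &cmd);
	if (rc)
		return rc;

	/* transport succeeded; translate the device's return code */
	return cxl_mbox_cmd_rc2errno(&cmd);
}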
If the size is too large, there's + * no harm is soft limiting it. + */ + cxlds->payload_size = min_t(size_t, cxlds->payload_size, SZ_1M); + if (cxlds->payload_size < 256) { + dev_err(cxlds->dev, "Mailbox is too small (%zub)", + cxlds->payload_size); + return -ENXIO; + } + + dev_dbg(cxlds->dev, "Mailbox payload sized %zu", + cxlds->payload_size); + + return 0; +} + +static int cxl_map_regblock(struct pci_dev *pdev, struct cxl_register_map *map) +{ + void __iomem *addr; + int bar = map->barno; + struct device *dev = &pdev->dev; + resource_size_t offset = map->block_offset; + + /* Basic sanity check that BAR is big enough */ + if (pci_resource_len(pdev, bar) < offset) { + dev_err(dev, "BAR%d: %pr: too small (offset: %pa)\n", bar, + &pdev->resource[bar], &offset); + return -ENXIO; + } + + addr = pci_iomap(pdev, bar, 0); + if (!addr) { + dev_err(dev, "failed to map registers\n"); + return -ENOMEM; + } + + dev_dbg(dev, "Mapped CXL Memory Device resource bar %u @ %pa\n", + bar, &offset); + + map->base = addr + map->block_offset; + return 0; +} + +static void cxl_unmap_regblock(struct pci_dev *pdev, + struct cxl_register_map *map) +{ + pci_iounmap(pdev, map->base - map->block_offset); + map->base = NULL; +} + +static int cxl_probe_regs(struct pci_dev *pdev, struct cxl_register_map *map) +{ + struct cxl_component_reg_map *comp_map; + struct cxl_device_reg_map *dev_map; + struct device *dev = &pdev->dev; + void __iomem *base = map->base; + + switch (map->reg_type) { + case CXL_REGLOC_RBI_COMPONENT: + comp_map = &map->component_map; + cxl_probe_component_regs(dev, base, comp_map); + if (!comp_map->hdm_decoder.valid) { + dev_err(dev, "HDM decoder registers not found\n"); + return -ENXIO; + } + + dev_dbg(dev, "Set up component registers\n"); + break; + case CXL_REGLOC_RBI_MEMDEV: + dev_map = &map->device_map; + cxl_probe_device_regs(dev, base, dev_map); + if (!dev_map->status.valid || !dev_map->mbox.valid || + !dev_map->memdev.valid) { + dev_err(dev, "registers not found: %s%s%s\n", + !dev_map->status.valid ? "status " : "", + !dev_map->mbox.valid ? "mbox " : "", + !dev_map->memdev.valid ? 
"memdev " : ""); + return -ENXIO; + } + + dev_dbg(dev, "Probing device registers...\n"); + break; + default: + break; + } + + return 0; +} + +static int cxl_map_regs(struct cxl_dev_state *cxlds, struct cxl_register_map *map) +{ + struct device *dev = cxlds->dev; + struct pci_dev *pdev = to_pci_dev(dev); + + switch (map->reg_type) { + case CXL_REGLOC_RBI_COMPONENT: + cxl_map_component_regs(pdev, &cxlds->regs.component, map); + dev_dbg(dev, "Mapping component registers...\n"); + break; + case CXL_REGLOC_RBI_MEMDEV: + cxl_map_device_regs(pdev, &cxlds->regs.device_regs, map); + dev_dbg(dev, "Probing device registers...\n"); + break; + default: + break; + } + + return 0; +} + +static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map) +{ + int rc; + + rc = cxl_find_regblock(pdev, type, map); + if (rc) + return rc; + + rc = cxl_map_regblock(pdev, map); + if (rc) + return rc; + + rc = cxl_probe_regs(pdev, map); + cxl_unmap_regblock(pdev, map); + + return rc; +} + +static void cxl_pci_destroy_doe(void *mbs) +{ + xa_destroy(mbs); +} + +static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds) +{ + struct device *dev = cxlds->dev; + struct pci_dev *pdev = to_pci_dev(dev); + u16 off = 0; + + xa_init(&cxlds->doe_mbs); + if (devm_add_action(&pdev->dev, cxl_pci_destroy_doe, &cxlds->doe_mbs)) { + dev_err(dev, "Failed to create XArray for DOE's\n"); + return; + } + + /* + * Mailbox creation is best effort. Higher layers must determine if + * the lack of a mailbox for their protocol is a device failure or not. + */ + pci_doe_for_each_off(pdev, off) { + struct pci_doe_mb *doe_mb; + + doe_mb = pcim_doe_create_mb(pdev, off); + if (IS_ERR(doe_mb)) { + dev_err(dev, "Failed to create MB object for MB @ %x\n", + off); + continue; + } + + if (xa_insert(&cxlds->doe_mbs, off, doe_mb, GFP_KERNEL)) { + dev_err(dev, "xa_insert failed to insert MB @ %x\n", + off); + continue; + } + + dev_dbg(dev, "Created DOE mailbox @%x\n", off); + } +} + +static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct cxl_register_map map; + struct cxl_memdev *cxlmd; + struct cxl_dev_state *cxlds; + int rc; + + /* + * Double check the anonymous union trickery in struct cxl_regs + * FIXME switch to struct_group() + */ + BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) != + offsetof(struct cxl_regs, device_regs.memdev)); + + rc = pcim_enable_device(pdev); + if (rc) + return rc; + + cxlds = cxl_dev_state_create(&pdev->dev); + if (IS_ERR(cxlds)) + return PTR_ERR(cxlds); + + cxlds->serial = pci_get_dsn(pdev); + cxlds->cxl_dvsec = pci_find_dvsec_capability( + pdev, PCI_DVSEC_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE); + if (!cxlds->cxl_dvsec) + dev_warn(&pdev->dev, + "Device DVSEC not present, skip CXL.mem init\n"); + + rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map); + if (rc) + return rc; + + rc = cxl_map_regs(cxlds, &map); + if (rc) + return rc; + + /* + * If the component registers can't be found, the cxl_pci driver may + * still be useful for management functions so don't return an error. 
+ */ + cxlds->component_reg_phys = CXL_RESOURCE_NONE; + rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT, &map); + if (rc) + dev_warn(&pdev->dev, "No component registers (%d)\n", rc); + + cxlds->component_reg_phys = cxl_regmap_to_base(pdev, &map); + + devm_cxl_pci_create_doe(cxlds); + + rc = cxl_pci_setup_mailbox(cxlds); + if (rc) + return rc; + + rc = cxl_enumerate_cmds(cxlds); + if (rc) + return rc; + + rc = cxl_dev_state_identify(cxlds); + if (rc) + return rc; + + rc = cxl_mem_create_range_info(cxlds); + if (rc) + return rc; + + cxlmd = devm_cxl_add_memdev(cxlds); + if (IS_ERR(cxlmd)) + return PTR_ERR(cxlmd); + + if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) + rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd); + + return rc; +} + +static const struct pci_device_id cxl_mem_pci_tbl[] = { + /* PCI class code for CXL.mem Type-3 Devices */ + { PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)}, + { /* terminate list */ }, +}; +MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl); + +static struct pci_driver cxl_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = cxl_mem_pci_tbl, + .probe = cxl_pci_probe, + .driver = { + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, +}; + +MODULE_LICENSE("GPL v2"); +module_pci_driver(cxl_pci_driver); +MODULE_IMPORT_NS(CXL); diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c new file mode 100644 index 000000000..1fca98488 --- /dev/null +++ b/drivers/cxl/pmem.c @@ -0,0 +1,662 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2021 Intel Corporation. All rights reserved. */ +#include <linux/libnvdimm.h> +#include <asm/unaligned.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/ndctl.h> +#include <linux/async.h> +#include <linux/slab.h> +#include <linux/nd.h> +#include "cxlmem.h" +#include "cxl.h" + +/* + * Ordered workqueue for cxl nvdimm device arrival and departure + * to coordinate bus rescans when a bridge arrives and trigger remove + * operations when the bridge is removed. 
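The DOE mailboxes cached in cxlds->doe_mbs by devm_cxl_pci_create_doe() above are meant to be looked up later by protocol. A hedged sketch of such a lookup, assuming pci_doe_supports_prot() from <linux/pci-doe.h> and the standard xa_for_each() iterator; the example_ helper and its prot_type parameter are illustrative (cxl_core performs this kind of search with the CDAT table-access protocol when reading CDAT data).

#include <linux/pci-doe.h>
#include "cxlmem.h"
#include "cxlpci.h"

static struct pci_doe_mb *example_find_doe(struct cxl_dev_state *cxlds,
					   u8 prot_type)
{
	struct pci_doe_mb *doe_mb;
	unsigned long index;

	xa_for_each(&cxlds->doe_mbs, index, doe_mb)
		if (pci_doe_supports_prot(doe_mb, PCI_DVSEC_VENDOR_ID_CXL,
					  prot_type))
			return doe_mb;

	return NULL;
}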
+ */ +static struct workqueue_struct *cxl_pmem_wq; + +static __read_mostly DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); + +static void clear_exclusive(void *cxlds) +{ + clear_exclusive_cxl_commands(cxlds, exclusive_cmds); +} + +static void unregister_nvdimm(void *nvdimm) +{ + struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm); + struct cxl_nvdimm_bridge *cxl_nvb = cxl_nvd->bridge; + struct cxl_pmem_region *cxlr_pmem; + unsigned long index; + + device_lock(&cxl_nvb->dev); + dev_set_drvdata(&cxl_nvd->dev, NULL); + xa_for_each(&cxl_nvd->pmem_regions, index, cxlr_pmem) { + get_device(&cxlr_pmem->dev); + device_unlock(&cxl_nvb->dev); + + device_release_driver(&cxlr_pmem->dev); + put_device(&cxlr_pmem->dev); + + device_lock(&cxl_nvb->dev); + } + device_unlock(&cxl_nvb->dev); + + nvdimm_delete(nvdimm); + cxl_nvd->bridge = NULL; +} + +static int cxl_nvdimm_probe(struct device *dev) +{ + struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev); + struct cxl_memdev *cxlmd = cxl_nvd->cxlmd; + unsigned long flags = 0, cmd_mask = 0; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_nvdimm_bridge *cxl_nvb; + struct nvdimm *nvdimm; + int rc; + + cxl_nvb = cxl_find_nvdimm_bridge(dev); + if (!cxl_nvb) + return -ENXIO; + + device_lock(&cxl_nvb->dev); + if (!cxl_nvb->nvdimm_bus) { + rc = -ENXIO; + goto out; + } + + set_exclusive_cxl_commands(cxlds, exclusive_cmds); + rc = devm_add_action_or_reset(dev, clear_exclusive, cxlds); + if (rc) + goto out; + + set_bit(NDD_LABELING, &flags); + set_bit(NDD_REGISTER_SYNC, &flags); + set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask); + set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask); + set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask); + nvdimm = nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd, NULL, flags, + cmd_mask, 0, NULL); + if (!nvdimm) { + rc = -ENOMEM; + goto out; + } + + dev_set_drvdata(dev, nvdimm); + cxl_nvd->bridge = cxl_nvb; + rc = devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm); +out: + device_unlock(&cxl_nvb->dev); + put_device(&cxl_nvb->dev); + + return rc; +} + +static struct cxl_driver cxl_nvdimm_driver = { + .name = "cxl_nvdimm", + .probe = cxl_nvdimm_probe, + .id = CXL_DEVICE_NVDIMM, +}; + +static int cxl_pmem_get_config_size(struct cxl_dev_state *cxlds, + struct nd_cmd_get_config_size *cmd, + unsigned int buf_len) +{ + if (sizeof(*cmd) > buf_len) + return -EINVAL; + + *cmd = (struct nd_cmd_get_config_size) { + .config_size = cxlds->lsa_size, + .max_xfer = cxlds->payload_size - sizeof(struct cxl_mbox_set_lsa), + }; + + return 0; +} + +static int cxl_pmem_get_config_data(struct cxl_dev_state *cxlds, + struct nd_cmd_get_config_data_hdr *cmd, + unsigned int buf_len) +{ + struct cxl_mbox_get_lsa get_lsa; + int rc; + + if (sizeof(*cmd) > buf_len) + return -EINVAL; + if (struct_size(cmd, out_buf, cmd->in_length) > buf_len) + return -EINVAL; + + get_lsa = (struct cxl_mbox_get_lsa) { + .offset = cpu_to_le32(cmd->in_offset), + .length = cpu_to_le32(cmd->in_length), + }; + + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_LSA, &get_lsa, + sizeof(get_lsa), cmd->out_buf, cmd->in_length); + cmd->status = 0; + + return rc; +} + +static int cxl_pmem_set_config_data(struct cxl_dev_state *cxlds, + struct nd_cmd_set_config_hdr *cmd, + unsigned int buf_len) +{ + struct cxl_mbox_set_lsa *set_lsa; + int rc; + + if (sizeof(*cmd) > buf_len) + return -EINVAL; + + /* 4-byte status follows the input data in the payload */ + if (size_add(struct_size(cmd, in_buf, cmd->in_length), 4) > buf_len) + return -EINVAL; + + set_lsa = + kvzalloc(struct_size(set_lsa, data, 
cmd->in_length), GFP_KERNEL); + if (!set_lsa) + return -ENOMEM; + + *set_lsa = (struct cxl_mbox_set_lsa) { + .offset = cpu_to_le32(cmd->in_offset), + }; + memcpy(set_lsa->data, cmd->in_buf, cmd->in_length); + + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_SET_LSA, set_lsa, + struct_size(set_lsa, data, cmd->in_length), + NULL, 0); + + /* + * Set "firmware" status (4-packed bytes at the end of the input + * payload. + */ + put_unaligned(0, (u32 *) &cmd->in_buf[cmd->in_length]); + kvfree(set_lsa); + + return rc; +} + +static int cxl_pmem_nvdimm_ctl(struct nvdimm *nvdimm, unsigned int cmd, + void *buf, unsigned int buf_len) +{ + struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm); + unsigned long cmd_mask = nvdimm_cmd_mask(nvdimm); + struct cxl_memdev *cxlmd = cxl_nvd->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + if (!test_bit(cmd, &cmd_mask)) + return -ENOTTY; + + switch (cmd) { + case ND_CMD_GET_CONFIG_SIZE: + return cxl_pmem_get_config_size(cxlds, buf, buf_len); + case ND_CMD_GET_CONFIG_DATA: + return cxl_pmem_get_config_data(cxlds, buf, buf_len); + case ND_CMD_SET_CONFIG_DATA: + return cxl_pmem_set_config_data(cxlds, buf, buf_len); + default: + return -ENOTTY; + } +} + +static int cxl_pmem_ctl(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len, int *cmd_rc) +{ + /* + * No firmware response to translate, let the transport error + * code take precedence. + */ + *cmd_rc = 0; + + if (!nvdimm) + return -ENOTTY; + return cxl_pmem_nvdimm_ctl(nvdimm, cmd, buf, buf_len); +} + +static bool online_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb) +{ + if (cxl_nvb->nvdimm_bus) + return true; + cxl_nvb->nvdimm_bus = + nvdimm_bus_register(&cxl_nvb->dev, &cxl_nvb->nd_desc); + return cxl_nvb->nvdimm_bus != NULL; +} + +static int cxl_nvdimm_release_driver(struct device *dev, void *cxl_nvb) +{ + struct cxl_nvdimm *cxl_nvd; + + if (!is_cxl_nvdimm(dev)) + return 0; + + cxl_nvd = to_cxl_nvdimm(dev); + if (cxl_nvd->bridge != cxl_nvb) + return 0; + + device_release_driver(dev); + return 0; +} + +static int cxl_pmem_region_release_driver(struct device *dev, void *cxl_nvb) +{ + struct cxl_pmem_region *cxlr_pmem; + + if (!is_cxl_pmem_region(dev)) + return 0; + + cxlr_pmem = to_cxl_pmem_region(dev); + if (cxlr_pmem->bridge != cxl_nvb) + return 0; + + device_release_driver(dev); + return 0; +} + +static void offline_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb, + struct nvdimm_bus *nvdimm_bus) +{ + if (!nvdimm_bus) + return; + + /* + * Set the state of cxl_nvdimm devices to unbound / idle before + * nvdimm_bus_unregister() rips the nvdimm objects out from + * underneath them. 
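The ioctl translation above reduces to a small mailbox transaction per label-storage access. A hedged sketch of the read side in isolation, mirroring cxl_pmem_get_config_data(): the example_ helper is hypothetical, and a real caller also has to respect the max_xfer limit reported via ND_CMD_GET_CONFIG_SIZE.

#include "cxlmem.h"

static int example_read_lsa(struct cxl_dev_state *cxlds, u32 offset, u32 len,
			    void *buf)
{
	struct cxl_mbox_get_lsa get_lsa = {
		.offset = cpu_to_le32(offset),
		.length = cpu_to_le32(len),
	};

	/* Get LSA returns exactly the bytes requested on success */
	return cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_LSA, &get_lsa,
				 sizeof(get_lsa), buf, len);
}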
+ */ + bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb, + cxl_pmem_region_release_driver); + bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb, + cxl_nvdimm_release_driver); + nvdimm_bus_unregister(nvdimm_bus); +} + +static void cxl_nvb_update_state(struct work_struct *work) +{ + struct cxl_nvdimm_bridge *cxl_nvb = + container_of(work, typeof(*cxl_nvb), state_work); + struct nvdimm_bus *victim_bus = NULL; + bool release = false, rescan = false; + + device_lock(&cxl_nvb->dev); + switch (cxl_nvb->state) { + case CXL_NVB_ONLINE: + if (!online_nvdimm_bus(cxl_nvb)) { + dev_err(&cxl_nvb->dev, + "failed to establish nvdimm bus\n"); + release = true; + } else + rescan = true; + break; + case CXL_NVB_OFFLINE: + case CXL_NVB_DEAD: + victim_bus = cxl_nvb->nvdimm_bus; + cxl_nvb->nvdimm_bus = NULL; + break; + default: + break; + } + device_unlock(&cxl_nvb->dev); + + if (release) + device_release_driver(&cxl_nvb->dev); + if (rescan) { + int rc = bus_rescan_devices(&cxl_bus_type); + + dev_dbg(&cxl_nvb->dev, "rescan: %d\n", rc); + } + offline_nvdimm_bus(cxl_nvb, victim_bus); + + put_device(&cxl_nvb->dev); +} + +static void cxl_nvdimm_bridge_state_work(struct cxl_nvdimm_bridge *cxl_nvb) +{ + /* + * Take a reference that the workqueue will drop if new work + * gets queued. + */ + get_device(&cxl_nvb->dev); + if (!queue_work(cxl_pmem_wq, &cxl_nvb->state_work)) + put_device(&cxl_nvb->dev); +} + +static void cxl_nvdimm_bridge_remove(struct device *dev) +{ + struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev); + + if (cxl_nvb->state == CXL_NVB_ONLINE) + cxl_nvb->state = CXL_NVB_OFFLINE; + cxl_nvdimm_bridge_state_work(cxl_nvb); +} + +static int cxl_nvdimm_bridge_probe(struct device *dev) +{ + struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev); + + if (cxl_nvb->state == CXL_NVB_DEAD) + return -ENXIO; + + if (cxl_nvb->state == CXL_NVB_NEW) { + cxl_nvb->nd_desc = (struct nvdimm_bus_descriptor) { + .provider_name = "CXL", + .module = THIS_MODULE, + .ndctl = cxl_pmem_ctl, + }; + + INIT_WORK(&cxl_nvb->state_work, cxl_nvb_update_state); + } + + cxl_nvb->state = CXL_NVB_ONLINE; + cxl_nvdimm_bridge_state_work(cxl_nvb); + + return 0; +} + +static struct cxl_driver cxl_nvdimm_bridge_driver = { + .name = "cxl_nvdimm_bridge", + .probe = cxl_nvdimm_bridge_probe, + .remove = cxl_nvdimm_bridge_remove, + .id = CXL_DEVICE_NVDIMM_BRIDGE, +}; + +static int match_cxl_nvdimm(struct device *dev, void *data) +{ + return is_cxl_nvdimm(dev); +} + +static void unregister_nvdimm_region(void *nd_region) +{ + nvdimm_region_delete(nd_region); +} + +static int cxl_nvdimm_add_region(struct cxl_nvdimm *cxl_nvd, + struct cxl_pmem_region *cxlr_pmem) +{ + int rc; + + rc = xa_insert(&cxl_nvd->pmem_regions, (unsigned long)cxlr_pmem, + cxlr_pmem, GFP_KERNEL); + if (rc) + return rc; + + get_device(&cxlr_pmem->dev); + return 0; +} + +static void cxl_nvdimm_del_region(struct cxl_nvdimm *cxl_nvd, + struct cxl_pmem_region *cxlr_pmem) +{ + /* + * It is possible this is called without a corresponding + * cxl_nvdimm_add_region for @cxlr_pmem + */ + cxlr_pmem = xa_erase(&cxl_nvd->pmem_regions, (unsigned long)cxlr_pmem); + if (cxlr_pmem) + put_device(&cxlr_pmem->dev); +} + +static void release_mappings(void *data) +{ + int i; + struct cxl_pmem_region *cxlr_pmem = data; + struct cxl_nvdimm_bridge *cxl_nvb = cxlr_pmem->bridge; + + device_lock(&cxl_nvb->dev); + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + struct cxl_nvdimm *cxl_nvd = m->cxl_nvd; + + cxl_nvdimm_del_region(cxl_nvd, 
cxlr_pmem); + } + device_unlock(&cxl_nvb->dev); +} + +static void cxlr_pmem_remove_resource(void *res) +{ + remove_resource(res); +} + +struct cxl_pmem_region_info { + u64 offset; + u64 serial; +}; + +static int cxl_pmem_region_probe(struct device *dev) +{ + struct nd_mapping_desc mappings[CXL_DECODER_MAX_INTERLEAVE]; + struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev); + struct cxl_region *cxlr = cxlr_pmem->cxlr; + struct cxl_pmem_region_info *info = NULL; + struct cxl_nvdimm_bridge *cxl_nvb; + struct nd_interleave_set *nd_set; + struct nd_region_desc ndr_desc; + struct cxl_nvdimm *cxl_nvd; + struct nvdimm *nvdimm; + struct resource *res; + int rc, i = 0; + + cxl_nvb = cxl_find_nvdimm_bridge(&cxlr_pmem->mapping[0].cxlmd->dev); + if (!cxl_nvb) { + dev_dbg(dev, "bridge not found\n"); + return -ENXIO; + } + cxlr_pmem->bridge = cxl_nvb; + + device_lock(&cxl_nvb->dev); + if (!cxl_nvb->nvdimm_bus) { + dev_dbg(dev, "nvdimm bus not found\n"); + rc = -ENXIO; + goto out_nvb; + } + + memset(&mappings, 0, sizeof(mappings)); + memset(&ndr_desc, 0, sizeof(ndr_desc)); + + res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL); + if (!res) { + rc = -ENOMEM; + goto out_nvb; + } + + res->name = "Persistent Memory"; + res->start = cxlr_pmem->hpa_range.start; + res->end = cxlr_pmem->hpa_range.end; + res->flags = IORESOURCE_MEM; + res->desc = IORES_DESC_PERSISTENT_MEMORY; + + rc = insert_resource(&iomem_resource, res); + if (rc) + goto out_nvb; + + rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res); + if (rc) + goto out_nvb; + + ndr_desc.res = res; + ndr_desc.provider_data = cxlr_pmem; + + ndr_desc.numa_node = memory_add_physaddr_to_nid(res->start); + ndr_desc.target_node = phys_to_target_node(res->start); + if (ndr_desc.target_node == NUMA_NO_NODE) { + ndr_desc.target_node = ndr_desc.numa_node; + dev_dbg(&cxlr->dev, "changing target node from %d to %d", + NUMA_NO_NODE, ndr_desc.target_node); + } + + nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL); + if (!nd_set) { + rc = -ENOMEM; + goto out_nvb; + } + + ndr_desc.memregion = cxlr->id; + set_bit(ND_REGION_CXL, &ndr_desc.flags); + set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags); + + info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto out_nvb; + } + + rc = devm_add_action_or_reset(dev, release_mappings, cxlr_pmem); + if (rc) + goto out_nvd; + + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + struct cxl_memdev *cxlmd = m->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *d; + + d = device_find_child(&cxlmd->dev, NULL, match_cxl_nvdimm); + if (!d) { + dev_dbg(dev, "[%d]: %s: no cxl_nvdimm found\n", i, + dev_name(&cxlmd->dev)); + rc = -ENODEV; + goto out_nvd; + } + + /* safe to drop ref now with bridge lock held */ + put_device(d); + + cxl_nvd = to_cxl_nvdimm(d); + nvdimm = dev_get_drvdata(&cxl_nvd->dev); + if (!nvdimm) { + dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i, + dev_name(&cxlmd->dev)); + rc = -ENODEV; + goto out_nvd; + } + + /* + * Pin the region per nvdimm device as those may be released + * out-of-order with respect to the region, and a single nvdimm + * maybe associated with multiple regions + */ + rc = cxl_nvdimm_add_region(cxl_nvd, cxlr_pmem); + if (rc) + goto out_nvd; + m->cxl_nvd = cxl_nvd; + mappings[i] = (struct nd_mapping_desc) { + .nvdimm = nvdimm, + .start = m->start, + .size = m->size, + .position = i, + }; + info[i].offset = m->start; + info[i].serial = cxlds->serial; + 
} + ndr_desc.num_mappings = cxlr_pmem->nr_mappings; + ndr_desc.mapping = mappings; + + /* + * TODO enable CXL labels which skip the need for 'interleave-set cookie' + */ + nd_set->cookie1 = + nd_fletcher64(info, sizeof(*info) * cxlr_pmem->nr_mappings, 0); + nd_set->cookie2 = nd_set->cookie1; + ndr_desc.nd_set = nd_set; + + cxlr_pmem->nd_region = + nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc); + if (!cxlr_pmem->nd_region) { + rc = -ENOMEM; + goto out_nvd; + } + + rc = devm_add_action_or_reset(dev, unregister_nvdimm_region, + cxlr_pmem->nd_region); +out_nvd: + kfree(info); +out_nvb: + device_unlock(&cxl_nvb->dev); + put_device(&cxl_nvb->dev); + + return rc; +} + +static struct cxl_driver cxl_pmem_region_driver = { + .name = "cxl_pmem_region", + .probe = cxl_pmem_region_probe, + .id = CXL_DEVICE_PMEM_REGION, +}; + +/* + * Return all bridges to the CXL_NVB_NEW state to invalidate any + * ->state_work referring to the now destroyed cxl_pmem_wq. + */ +static int cxl_nvdimm_bridge_reset(struct device *dev, void *data) +{ + struct cxl_nvdimm_bridge *cxl_nvb; + + if (!is_cxl_nvdimm_bridge(dev)) + return 0; + + cxl_nvb = to_cxl_nvdimm_bridge(dev); + device_lock(dev); + cxl_nvb->state = CXL_NVB_NEW; + device_unlock(dev); + + return 0; +} + +static void destroy_cxl_pmem_wq(void) +{ + destroy_workqueue(cxl_pmem_wq); + bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_nvdimm_bridge_reset); +} + +static __init int cxl_pmem_init(void) +{ + int rc; + + set_bit(CXL_MEM_COMMAND_ID_SET_SHUTDOWN_STATE, exclusive_cmds); + set_bit(CXL_MEM_COMMAND_ID_SET_LSA, exclusive_cmds); + + cxl_pmem_wq = alloc_ordered_workqueue("cxl_pmem", 0); + if (!cxl_pmem_wq) + return -ENXIO; + + rc = cxl_driver_register(&cxl_nvdimm_bridge_driver); + if (rc) + goto err_bridge; + + rc = cxl_driver_register(&cxl_nvdimm_driver); + if (rc) + goto err_nvdimm; + + rc = cxl_driver_register(&cxl_pmem_region_driver); + if (rc) + goto err_region; + + return 0; + +err_region: + cxl_driver_unregister(&cxl_nvdimm_driver); +err_nvdimm: + cxl_driver_unregister(&cxl_nvdimm_bridge_driver); +err_bridge: + destroy_cxl_pmem_wq(); + return rc; +} + +static __exit void cxl_pmem_exit(void) +{ + cxl_driver_unregister(&cxl_pmem_region_driver); + cxl_driver_unregister(&cxl_nvdimm_driver); + cxl_driver_unregister(&cxl_nvdimm_bridge_driver); + destroy_cxl_pmem_wq(); +} + +MODULE_LICENSE("GPL v2"); +module_init(cxl_pmem_init); +module_exit(cxl_pmem_exit); +MODULE_IMPORT_NS(CXL); +MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM_BRIDGE); +MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM); +MODULE_ALIAS_CXL(CXL_DEVICE_PMEM_REGION); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c new file mode 100644 index 000000000..5453771bf --- /dev/null +++ b/drivers/cxl/port.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include <linux/device.h> +#include <linux/module.h> +#include <linux/slab.h> + +#include "cxlmem.h" +#include "cxlpci.h" + +/** + * DOC: cxl port + * + * The port driver enumerates dport via PCI and scans for HDM + * (Host-managed-Device-Memory) decoder resources via the + * @component_reg_phys value passed in by the agent that registered the + * port. All descendant ports of a CXL root port (described by platform + * firmware) are managed in this drivers context. Each driver instance + * is responsible for tearing down the driver context of immediate + * descendant ports. The locking for this is validated by + * CONFIG_PROVE_CXL_LOCKING. 
+ * + * The primary service this driver provides is presenting APIs to other + * drivers to utilize the decoders, and indicating to userspace (via bind + * status) the connectivity of the CXL.mem protocol throughout the + * PCIe topology. + */ + +static void schedule_detach(void *cxlmd) +{ + schedule_cxl_memdev_detach(cxlmd); +} + +static int cxl_port_probe(struct device *dev) +{ + struct cxl_port *port = to_cxl_port(dev); + struct cxl_hdm *cxlhdm; + int rc; + + + if (!is_cxl_endpoint(port)) { + rc = devm_cxl_port_enumerate_dports(port); + if (rc < 0) + return rc; + if (rc == 1) + return devm_cxl_add_passthrough_decoder(port); + } + + cxlhdm = devm_cxl_setup_hdm(port); + if (IS_ERR(cxlhdm)) + return PTR_ERR(cxlhdm); + + if (is_cxl_endpoint(port)) { + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + /* Cache the data early to ensure is_visible() works */ + read_cdat_data(port); + + get_device(&cxlmd->dev); + rc = devm_add_action_or_reset(dev, schedule_detach, cxlmd); + if (rc) + return rc; + + rc = cxl_hdm_decode_init(cxlds, cxlhdm); + if (rc) + return rc; + + rc = cxl_await_media_ready(cxlds); + if (rc) { + dev_err(dev, "Media not active (%d)\n", rc); + return rc; + } + } + + rc = devm_cxl_enumerate_decoders(cxlhdm); + if (rc) { + dev_err(dev, "Couldn't enumerate decoders (%d)\n", rc); + return rc; + } + + return 0; +} + +static ssize_t CDAT_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t offset, size_t count) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_port *port = to_cxl_port(dev); + + if (!port->cdat_available) + return -ENXIO; + + if (!port->cdat.table) + return 0; + + return memory_read_from_buffer(buf, count, &offset, + port->cdat.table, + port->cdat.length); +} + +static BIN_ATTR_ADMIN_RO(CDAT, 0); + +static umode_t cxl_port_bin_attr_is_visible(struct kobject *kobj, + struct bin_attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_port *port = to_cxl_port(dev); + + if ((attr == &bin_attr_CDAT) && port->cdat_available) + return attr->attr.mode; + + return 0; +} + +static struct bin_attribute *cxl_cdat_bin_attributes[] = { + &bin_attr_CDAT, + NULL, +}; + +static struct attribute_group cxl_cdat_attribute_group = { + .bin_attrs = cxl_cdat_bin_attributes, + .is_bin_visible = cxl_port_bin_attr_is_visible, +}; + +static const struct attribute_group *cxl_port_attribute_groups[] = { + &cxl_cdat_attribute_group, + NULL, +}; + +static struct cxl_driver cxl_port_driver = { + .name = "cxl_port", + .probe = cxl_port_probe, + .id = CXL_DEVICE_PORT, + .drv = { + .dev_groups = cxl_port_attribute_groups, + }, +}; + +module_cxl_driver(cxl_port_driver); +MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(CXL); +MODULE_ALIAS_CXL(CXL_DEVICE_PORT); |
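The CDAT blob exposed above is cached verbatim by read_cdat_data() and dumped through the CDAT binary attribute; this driver does not interpret it. A hedged sketch of how a consumer could walk the table using the cdat_header / cdat_entry_header layouts from cxlpci.h; the example_ helper is hypothetical and assumes each entry's length field covers the entry header plus its body.

#include <linux/printk.h>
#include "cxlpci.h"

static void example_walk_cdat(void *table, size_t length)
{
	struct cdat_header *hdr = table;
	size_t off = sizeof(*hdr);

	if (length < sizeof(*hdr) || le32_to_cpu(hdr->length) > length)
		return;

	while (off + sizeof(struct cdat_entry_header) <= length) {
		struct cdat_entry_header *ent = table + off;
		u16 len = le16_to_cpu(ent->length);

		if (len < sizeof(*ent) || off + len > length)
			break;

		pr_debug("CDAT entry type %u length %u\n", ent->type, len);
		off += len;
	}
}

On a live system the same bytes surface as a binary sysfs attribute on the port device, for example under /sys/bus/cxl/devices/endpointN/CDAT for an endpoint port (path shown for illustration).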