From ace9429bb58fd418f0c81d4c2835699bddf6bde6 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Thu, 11 Apr 2024 10:27:49 +0200
Subject: Adding upstream version 6.6.15.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 drivers/cxl/Kconfig        |  157 +++
 drivers/cxl/Makefile       |   13 +
 drivers/cxl/acpi.c         |  764 +++++++++++
 drivers/cxl/core/Makefile  |   17 +
 drivers/cxl/core/core.h    |   90 ++
 drivers/cxl/core/hdm.c     | 1036 +++++++++++++++
 drivers/cxl/core/mbox.c    | 1418 +++++++++++++++++++++
 drivers/cxl/core/memdev.c  | 1101 ++++++++++++++++
 drivers/cxl/core/pci.c     |  757 +++++++++++
 drivers/cxl/core/pmem.c    |  290 +++++
 drivers/cxl/core/pmu.c     |   68 +
 drivers/cxl/core/port.c    | 2092 ++++++++++++++++++++++++++++++
 drivers/cxl/core/region.c  | 3006 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/core/regs.c    |  544 ++++++++
 drivers/cxl/core/suspend.c |   24 +
 drivers/cxl/core/trace.c   |   99 ++
 drivers/cxl/core/trace.h   |  709 +++++++++++
 drivers/cxl/cxl.h          |  831 ++++++++++++
 drivers/cxl/cxlmem.h       |  902 +++++++++++++
 drivers/cxl/cxlpci.h       |   96 ++
 drivers/cxl/mem.c          |  262 ++++
 drivers/cxl/pci.c          |  973 ++++++++++++++
 drivers/cxl/pmem.c         |  464 +++++++
 drivers/cxl/pmu.h          |   28 +
 drivers/cxl/port.c         |  207 +++
 drivers/cxl/security.c     |  205 +++
 26 files changed, 16153 insertions(+)
 create mode 100644 drivers/cxl/Kconfig
 create mode 100644 drivers/cxl/Makefile
 create mode 100644 drivers/cxl/acpi.c
 create mode 100644 drivers/cxl/core/Makefile
 create mode 100644 drivers/cxl/core/core.h
 create mode 100644 drivers/cxl/core/hdm.c
 create mode 100644 drivers/cxl/core/mbox.c
 create mode 100644 drivers/cxl/core/memdev.c
 create mode 100644 drivers/cxl/core/pci.c
 create mode 100644 drivers/cxl/core/pmem.c
 create mode 100644 drivers/cxl/core/pmu.c
 create mode 100644 drivers/cxl/core/port.c
 create mode 100644 drivers/cxl/core/region.c
 create mode 100644 drivers/cxl/core/regs.c
 create mode 100644 drivers/cxl/core/suspend.c
 create mode 100644 drivers/cxl/core/trace.c
 create mode 100644 drivers/cxl/core/trace.h
 create mode 100644 drivers/cxl/cxl.h
 create mode 100644 drivers/cxl/cxlmem.h
 create mode 100644 drivers/cxl/cxlpci.h
 create mode 100644 drivers/cxl/mem.c
 create mode 100644 drivers/cxl/pci.c
 create mode 100644 drivers/cxl/pmem.c
 create mode 100644 drivers/cxl/pmu.h
 create mode 100644 drivers/cxl/port.c
 create mode 100644 drivers/cxl/security.c

(limited to 'drivers/cxl')

diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
new file mode 100644
index 0000000000..8ea1d340e4
--- /dev/null
+++ b/drivers/cxl/Kconfig
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig CXL_BUS
+	tristate "CXL (Compute Express Link) Devices Support"
+	depends on PCI
+	select FW_LOADER
+	select FW_UPLOAD
+	select PCI_DOE
+	help
+	  CXL is a bus that is electrically compatible with PCI Express, but
+	  layers three protocols on that signalling (CXL.io, CXL.cache, and
+	  CXL.mem). The CXL.cache protocol allows devices to hold cachelines
+	  locally, the CXL.mem protocol allows devices to be fully coherent
+	  memory targets, the CXL.io protocol is equivalent to PCI Express.
+	  Say 'y' to enable support for the configuration and management of
+	  devices supporting these protocols.
+
+if CXL_BUS
+
+config CXL_PCI
+	tristate "PCI manageability"
+	default CXL_BUS
+	help
+	  The CXL specification defines a "CXL memory device" sub-class in the
+	  PCI "memory controller" base class of devices. Devices identified by
+	  this class code provide support for volatile and / or persistent
+	  memory to be mapped into the system address map (Host-managed Device
+	  Memory (HDM)).
+
+	  Say 'y/m' to enable a driver that will attach to CXL memory expander
+	  devices enumerated by the memory device class code for configuration
+	  and management primarily via the mailbox interface. See Chapter 2.3
+	  Type 3 CXL Device in the CXL 2.0 specification for more details.
+
+	  If unsure say 'm'.
+
+config CXL_MEM_RAW_COMMANDS
+	bool "RAW Command Interface for Memory Devices"
+	depends on CXL_PCI
+	help
+	  Enable CXL RAW command interface.
+
+	  The CXL driver ioctl interface may assign a kernel ioctl command
+	  number for each specification defined opcode. At any given point in
+	  time the number of opcodes that the specification defines and a device
+	  may implement may exceed the kernel's set of associated ioctl function
+	  numbers. The mismatch is either by omission, specification is too new,
+	  or by design. When prototyping new hardware, or developing / debugging
+	  the driver it is useful to be able to submit any possible command to
+	  the hardware, even commands that may crash the kernel due to their
+	  potential impact to memory currently in use by the kernel.
+
+	  If developing CXL hardware or the driver say Y, otherwise say N.
+
+config CXL_ACPI
+	tristate "CXL ACPI: Platform Support"
+	depends on ACPI
+	default CXL_BUS
+	select ACPI_TABLE_LIB
+	help
+	  Enable support for host managed device memory (HDM) resources
+	  published by a platform's ACPI CXL memory layout description.  See
+	  Chapter 9.14.1 CXL Early Discovery Table (CEDT) in the CXL 2.0
+	  specification, and CXL Fixed Memory Window Structures (CEDT.CFMWS)
+	  (https://www.computeexpresslink.org/spec-landing). The CXL core
+	  consumes these resources to publish the root of a cxl_port decode
+	  hierarchy to map regions that represent System RAM, or Persistent
+	  Memory regions to be managed by LIBNVDIMM.
+
+	  If unsure say 'm'.
+
+config CXL_PMEM
+	tristate "CXL PMEM: Persistent Memory Support"
+	depends on LIBNVDIMM
+	default CXL_BUS
+	help
+	  In addition to typical memory resources a platform may also advertise
+	  support for persistent memory attached via CXL. This support is
+	  managed via a bridge driver from CXL to the LIBNVDIMM
+	  subsystem. Say 'y/m' to enable support for enumerating and
+	  provisioning the persistent memory capacity of CXL memory expanders.
+
+	  If unsure say 'm'.
+
+config CXL_MEM
+	tristate "CXL: Memory Expansion"
+	depends on CXL_PCI
+	default CXL_BUS
+	help
+	  The CXL.mem protocol allows a device to act as a provider of "System
+	  RAM" and/or "Persistent Memory" that is fully coherent as if the
+	  memory were attached to the typical CPU memory controller. This is
+	  known as HDM "Host-managed Device Memory".
+
+	  Say 'y/m' to enable a driver that will attach to CXL.mem devices for
+	  memory expansion and control of HDM. See Chapter 9.13 in the CXL 2.0
+	  specification for a detailed description of HDM.
+
+	  If unsure say 'm'.
+
+config CXL_PORT
+	default CXL_BUS
+	tristate
+
+config CXL_SUSPEND
+	def_bool y
+	depends on SUSPEND && CXL_MEM
+
+config CXL_REGION
+	bool "CXL: Region Support"
+	default CXL_BUS
+	# For MAX_PHYSMEM_BITS
+	depends on SPARSEMEM
+	select MEMREGION
+	select GET_FREE_REGION
+	help
+	  Enable the CXL core to enumerate and provision CXL regions. A CXL
+	  region is defined by one or more CXL expanders that decode a given
+	  system-physical address range. For CXL regions established by
+	  platform-firmware this option enables memory error handling to
+	  identify the devices participating in a given interleaved memory
+	  range. Otherwise, platform-firmware managed CXL is enabled by being
+	  placed in the system address map and does not need a driver.
+
+	  If unsure say 'y'
+
+config CXL_REGION_INVALIDATION_TEST
+	bool "CXL: Region Cache Management Bypass (TEST)"
+	depends on CXL_REGION
+	help
+	  CXL Region management and security operations potentially invalidate
+	  the content of CPU caches without notifying those caches to
+	  invalidate the affected cachelines. The CXL Region driver attempts
+	  to invalidate caches when those events occur.  If that invalidation
+	  fails the region will fail to enable.  Reasons for cache
+	  invalidation failure are due to the CPU not providing a cache
+	  invalidation mechanism. For example usage of wbinvd is restricted to
+	  bare metal x86. However, for testing purposes toggling this option
+	  can disable that data integrity safety and proceed with enabling
+	  regions when there might be conflicting contents in the CPU cache.
+
+	  If unsure, or if this kernel is meant for production environments,
+	  say N.
+
+config CXL_PMU
+	tristate "CXL Performance Monitoring Unit"
+	default CXL_BUS
+	depends on PERF_EVENTS
+	help
+	  Support performance monitoring as defined in CXL rev 3.0
+	  section 13.2: Performance Monitoring. CXL components may have
+	  one or more CXL Performance Monitoring Units (CPMUs).
+
+	  Say 'y/m' to enable a driver that will attach to performance
+	  monitoring units and provide standard perf based interfaces.
+
+	  If unsure say 'm'.
+endif
diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile
new file mode 100644
index 0000000000..db321f48ba
--- /dev/null
+++ b/drivers/cxl/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y += core/
+obj-$(CONFIG_CXL_PCI) += cxl_pci.o
+obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o
+obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o
+obj-$(CONFIG_CXL_PORT) += cxl_port.o
+
+cxl_mem-y := mem.o
+cxl_pci-y := pci.o
+cxl_acpi-y := acpi.o
+cxl_pmem-y := pmem.o security.o
+cxl_port-y := port.o
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
new file mode 100644
index 0000000000..40d055560e
--- /dev/null
+++ b/drivers/cxl/acpi.c
@@ -0,0 +1,764 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2021 Intel Corporation. All rights reserved. */
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/pci.h>
+#include <asm/div64.h>
+#include "cxlpci.h"
+#include "cxl.h"
+
+#define CXL_RCRB_SIZE	SZ_8K
+
+struct cxl_cxims_data {
+	int nr_maps;
+	u64 xormaps[] __counted_by(nr_maps);
+};
+
+/*
+ * Find a targets entry (n) in the host bridge interleave list for @hpa
+ * (CXL Specification 3.0 Table 9-22). Returns -1 if ways_to_eiw() rejects @iw.
+ */
+static int cxl_xor_calc_n(u64 hpa, struct cxl_cxims_data *cximsd, int iw,
+			  int ig)
+{
+	int i = 0, n = 0;
+	u8 eiw;
+
+	/* IW 2,4,6,8,12,16: bit i of 'n' is the parity of hpa & xormaps[i] */
+	if (iw != 3) {
+		for (i = 0; i < cximsd->nr_maps; i++)
+			n |= (hweight64(hpa & cximsd->xormaps[i]) & 1) << i;
+	}
+	/* IW 3,6,12: OR (masked hpa) % 3 into 'n' above the xormap bits */
+	if (!is_power_of_2(iw)) {
+		if (ways_to_eiw(iw, &eiw))
+			return -1;
+		hpa &= GENMASK_ULL(51, eiw + ig); /* NOTE(review): @ig as bit index? */
+		n |= do_div(hpa, 3) << i;
+	}
+	return n;
+}
+
+/* Resolve interleave position @pos to a dport using XOR arithmetic */
+static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos)
+{
+	struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd;
+	struct cxl_cxims_data *xormaps = cxlrd->platform_data;
+	struct cxl_decoder *cxld = &cxlsd->cxld;
+	int granularity = cxld->interleave_granularity;
+	int ways = cxld->interleave_ways;
+	u64 hpa;
+	int n;
+
+	/* A root decoder must list exactly one target per interleave way */
+	if (dev_WARN_ONCE(&cxld->dev, ways != cxlsd->nr_targets,
+			  "misconfigured root decoder\n"))
+		return NULL;
+
+	/* Address inside the window that belongs to interleave position @pos */
+	hpa = cxlrd->res->start + pos * granularity;
+
+	/* Entry (n) is 0 for no interleave (ways == 1), no calculation needed */
+	n = ways == 1 ? 0 : cxl_xor_calc_n(hpa, xormaps, ways, granularity);
+	if (n < 0)
+		return NULL;
+
+	return cxlsd->target[n];
+}
+
+struct cxl_cxims_context {
+	struct device *dev;
+	struct cxl_root_decoder *cxlrd;
+};
+
+static int cxl_parse_cxims(union acpi_subtable_headers *header, void *arg,
+			   const unsigned long end)
+{
+	struct acpi_cedt_cxims *cxims = (struct acpi_cedt_cxims *)header;
+	struct cxl_cxims_context *ctx = arg;
+	struct cxl_root_decoder *cxlrd = ctx->cxlrd;
+	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+	struct device *dev = ctx->dev;
+	struct cxl_cxims_data *cximsd;
+	unsigned int hbig, nr_maps;
+	int rc;
+
+	rc = eig_to_granularity(cxims->hbig, &hbig);
+	if (rc)
+		return rc;
+
+	/* Does this CXIMS entry apply to the given CXL Window? */
+	if (hbig != cxld->interleave_granularity)
+		return 0;
+
+	/* IW 1,3 do not use xormaps and skip this parsing entirely */
+	if (is_power_of_2(cxld->interleave_ways))
+		/* 2, 4, 8, 16 way */
+		nr_maps = ilog2(cxld->interleave_ways);
+	else
+		/* 6, 12 way */
+		nr_maps = ilog2(cxld->interleave_ways / 3);
+
+	if (cxims->nr_xormaps < nr_maps) {
+		dev_dbg(dev, "CXIMS nr_xormaps[%d] expected[%d]\n",
+			cxims->nr_xormaps, nr_maps);
+		return -ENXIO;
+	}
+
+	cximsd = devm_kzalloc(dev, struct_size(cximsd, xormaps, nr_maps),
+			      GFP_KERNEL);
+	if (!cximsd)
+		return -ENOMEM;
+	cximsd->nr_maps = nr_maps;
+	memcpy(cximsd->xormaps, cxims->xormap_list,
+	       nr_maps * sizeof(*cximsd->xormaps));
+	cxlrd->platform_data = cximsd;
+
+	return 0;
+}
+
+/* Translate CFMWS restriction bits into the matching CXL_DECODER_F_* flags */
+static unsigned long cfmws_to_decoder_flags(int restrictions)
+{
+	const bool type2 = restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE2;
+	const bool type3 = restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE3;
+	const bool ram = restrictions & ACPI_CEDT_CFMWS_RESTRICT_VOLATILE;
+	const bool pmem = restrictions & ACPI_CEDT_CFMWS_RESTRICT_PMEM;
+	const bool fixed = restrictions & ACPI_CEDT_CFMWS_RESTRICT_FIXED;
+
+	/* CXL_DECODER_F_ENABLE is set unconditionally */
+	return CXL_DECODER_F_ENABLE |
+	       (type2 ? CXL_DECODER_F_TYPE2 : 0) |
+	       (type3 ? CXL_DECODER_F_TYPE3 : 0) |
+	       (ram ? CXL_DECODER_F_RAM : 0) |
+	       (pmem ? CXL_DECODER_F_PMEM : 0) |
+	       (fixed ? CXL_DECODER_F_LOCK : 0);
+}
+
+static int cxl_acpi_cfmws_verify(struct device *dev,
+				 struct acpi_cedt_cfmws *cfmws)
+{
+	int rc, expected_len;
+	unsigned int ways;
+
+	if (cfmws->interleave_arithmetic != ACPI_CEDT_CFMWS_ARITHMETIC_MODULO &&
+	    cfmws->interleave_arithmetic != ACPI_CEDT_CFMWS_ARITHMETIC_XOR) {
+		dev_err(dev, "CFMWS Unknown Interleave Arithmetic: %d\n",
+			cfmws->interleave_arithmetic);
+		return -EINVAL;
+	}
+
+	if (!IS_ALIGNED(cfmws->base_hpa, SZ_256M)) {
+		dev_err(dev, "CFMWS Base HPA not 256MB aligned\n");
+		return -EINVAL;
+	}
+
+	if (!IS_ALIGNED(cfmws->window_size, SZ_256M)) {
+		dev_err(dev, "CFMWS Window Size not 256MB aligned\n");
+		return -EINVAL;
+	}
+
+	rc = eiw_to_ways(cfmws->interleave_ways, &ways);
+	if (rc) {
+		dev_err(dev, "CFMWS Interleave Ways (%d) invalid\n",
+			cfmws->interleave_ways);
+		return -EINVAL;
+	}
+
+	expected_len = struct_size(cfmws, interleave_targets, ways);
+
+	if (cfmws->header.length < expected_len) {
+		dev_err(dev, "CFMWS length %d less than expected %d\n",
+			cfmws->header.length, expected_len);
+		return -EINVAL;
+	}
+
+	if (cfmws->header.length > expected_len)
+		dev_dbg(dev, "CFMWS length %d greater than expected %d\n",
+			cfmws->header.length, expected_len);
+
+	return 0;
+}
+
+/*
+ * Note, @dev must be the first member, see 'struct cxl_chbs_context'
+ * and mock_acpi_table_parse_cedt()
+ */
+struct cxl_cfmws_context {
+	struct device *dev;
+	struct cxl_port *root_port;
+	struct resource *cxl_res;
+	int id;
+};
+
+/*
+ * acpi_table_parse_cedt() callback: publish one CFMWS entry as a "CXL Window"
+ * resource in @ctx->cxl_res plus a root decoder. Returns 0 on success or when
+ * the window is skipped, else the negative errno of the step that failed.
+ */
+static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
+			   const unsigned long end)
+{
+	struct acpi_cedt_cfmws *cfmws = (struct acpi_cedt_cfmws *)header;
+	int target_map[CXL_DECODER_MAX_INTERLEAVE];
+	struct cxl_cfmws_context *ctx = arg;
+	struct cxl_port *root_port = ctx->root_port;
+	struct resource *cxl_res = ctx->cxl_res;
+	struct cxl_cxims_context cxims_ctx;
+	struct cxl_root_decoder *cxlrd;
+	struct device *dev = ctx->dev;
+	cxl_calc_hb_fn cxl_calc_hb;
+	struct cxl_decoder *cxld;
+	unsigned int ways, i, ig;
+	struct resource *res;
+	int rc;
+
+	rc = cxl_acpi_cfmws_verify(dev, cfmws);
+	if (rc) {
+		dev_err(dev, "CFMWS range %#llx-%#llx not registered\n",
+			cfmws->base_hpa,
+			cfmws->base_hpa + cfmws->window_size - 1);
+		return 0;
+	}
+
+	rc = eiw_to_ways(cfmws->interleave_ways, &ways);
+	if (rc)
+		return rc;
+	rc = eig_to_granularity(cfmws->granularity, &ig);
+	if (rc)
+		return rc;
+	for (i = 0; i < ways; i++)
+		target_map[i] = cfmws->interleave_targets[i];
+
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	res->name = kasprintf(GFP_KERNEL, "CXL Window %d", ctx->id++);
+	if (!res->name) {
+		rc = -ENOMEM;
+		goto err_name;
+	}
+
+	res->start = cfmws->base_hpa;
+	res->end = cfmws->base_hpa + cfmws->window_size - 1;
+	res->flags = IORESOURCE_MEM;
+
+	/* add to the local resource tracking to establish a sort order */
+	rc = insert_resource(cxl_res, res);
+	if (rc)
+		goto err_insert;
+
+	if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_MODULO)
+		cxl_calc_hb = cxl_hb_modulo;
+	else
+		cxl_calc_hb = cxl_hb_xor;
+
+	cxlrd = cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb);
+	if (IS_ERR(cxlrd)) {
+		dev_err(dev, "Failed to allocate decoder for %pr\n", res);
+		return 0;
+	}
+
+	cxld = &cxlrd->cxlsd.cxld;
+	cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions);
+	cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+	cxld->hpa_range = (struct range) { .start = res->start, .end = res->end };
+	cxld->interleave_ways = ways;
+	/* Minimize the x1 granularity: advertise any valid region granularity */
+	if (ways == 1)
+		ig = CXL_DECODER_MIN_GRANULARITY;
+	cxld->interleave_granularity = ig;
+
+	if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR &&
+	    ways != 1 && ways != 3) {
+		cxims_ctx = (struct cxl_cxims_context) {
+			.dev = dev,
+			.cxlrd = cxlrd,
+		};
+		rc = acpi_table_parse_cedt(ACPI_CEDT_TYPE_CXIMS,
+					   cxl_parse_cxims, &cxims_ctx);
+		if (rc < 0)
+			goto err_xormap;
+		if (!cxlrd->platform_data) {
+			dev_err(dev, "No CXIMS for HBIG %u\n", ig);
+			rc = -EINVAL;
+			goto err_xormap;
+		}
+	}
+	rc = cxl_decoder_add(cxld, target_map);
+err_xormap:
+	if (rc)
+		put_device(&cxld->dev);
+	else
+		rc = cxl_decoder_autoremove(dev, cxld);
+	if (rc) {
+		dev_err(dev, "Failed to add decode range: %pr\n", res);
+		return rc;
+	}
+	dev_dbg(dev, "add: %s node: %d range [%#llx - %#llx]\n",
+		dev_name(&cxld->dev),
+		phys_to_target_node(cxld->hpa_range.start),
+		cxld->hpa_range.start, cxld->hpa_range.end);
+
+	return 0;
+
+err_insert:
+	kfree(res->name);
+err_name:
+	kfree(res);
+	return rc;
+}
+
+__mock struct acpi_device *to_cxl_host_bridge(struct device *host,
+					      struct device *dev)
+{
+	struct acpi_device *adev = to_acpi_device(dev);
+
+	if (!acpi_pci_find_root(adev->handle))
+		return NULL;
+
+	if (strcmp(acpi_device_hid(adev), "ACPI0016") == 0)
+		return adev;
+	return NULL;
+}
+
+/* Note, @dev is used by mock_acpi_table_parse_cedt() */
+struct cxl_chbs_context {
+	struct device *dev;
+	unsigned long long uid;
+	resource_size_t base;
+	u32 cxl_version;
+};
+
+/*
+ * acpi_table_parse_cedt() callback for CHBS entries: record the version and,
+ * when usable, the register base of the entry whose UID equals @ctx->uid.
+ * Always returns 0.
+ */
+static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg,
+			     const unsigned long end)
+{
+	struct acpi_cedt_chbs *chbs = (struct acpi_cedt_chbs *)header;
+	struct cxl_chbs_context *ctx = arg;
+	bool is_cxl11;
+
+	/* Done once a base is recorded; other bridges' entries are skipped */
+	if (ctx->base != CXL_RESOURCE_NONE || ctx->uid != chbs->uid)
+		return 0;
+
+	/* The version is reported even if the base turns out to be unusable */
+	ctx->cxl_version = chbs->cxl_version;
+	is_cxl11 = chbs->cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11;
+
+	/* Usable: non-zero base and, for CXL 1.1, a length of CXL_RCRB_SIZE */
+	if (chbs->base && (!is_cxl11 || chbs->length == CXL_RCRB_SIZE))
+		ctx->base = chbs->base;
+
+	return 0;
+}
+
+/* Seed @ctx with "not found" defaults, then fill it from the matching CHBS */
+static int cxl_get_chbs(struct device *dev, struct acpi_device *hb,
+			struct cxl_chbs_context *ctx)
+{
+	unsigned long long uid;
+	acpi_status status;
+
+	status = acpi_evaluate_integer(hb->handle, METHOD_NAME__UID, NULL, &uid);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "unable to retrieve _UID\n");
+		return -ENOENT;
+	}
+
+	dev_dbg(dev, "UID found: %lld\n", uid);
+	*ctx = (struct cxl_chbs_context) {
+		.dev = dev,
+		.uid = uid,
+		.base = CXL_RESOURCE_NONE,
+		.cxl_version = UINT_MAX,
+	};
+	/* Parse result is ignored: the defaults above stay if no CHBS matches */
+	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CHBS, cxl_get_chbs_iter, ctx);
+	return 0;
+}
+
+/* bus_for_each_dev() callback: add CXL host bridge @match as a root dport */
+static int add_host_bridge_dport(struct device *match, void *arg)
+{
+	int rc;
+	struct device *bridge;
+	struct cxl_dport *dport;
+	struct cxl_chbs_context ctx;
+	struct acpi_pci_root *pci_root;
+	struct cxl_port *root_port = arg;
+	struct device *host = root_port->dev.parent;
+	struct acpi_device *hb = to_cxl_host_bridge(host, match);
+
+	if (!hb)
+		return 0;
+
+	rc = cxl_get_chbs(match, hb, &ctx);
+	if (rc)
+		return rc;
+
+	/* cxl_version keeps its UINT_MAX default when no CHBS matched the UID */
+	if (ctx.cxl_version == UINT_MAX) {
+		dev_warn(match, "No CHBS found for Host Bridge (UID %lld)\n",
+			 ctx.uid);
+		return 0;
+	}
+
+	/* A CHBS matched, but it provided no usable register base */
+	if (ctx.base == CXL_RESOURCE_NONE) {
+		dev_warn(match, "CHBS invalid for Host Bridge (UID %lld)\n",
+			 ctx.uid);
+		return 0;
+	}
+
+	pci_root = acpi_pci_find_root(hb->handle);
+	bridge = pci_root->bus->bridge;
+
+	/*
+	 * In RCH mode, bind the component regs base to the dport. In
+	 * VH mode it will be bound to the CXL host bridge's port
+	 * object later in add_host_bridge_uport().
+	 */
+	if (ctx.cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11) {
+		dev_dbg(match, "RCRB found for UID %lld: %pa\n", ctx.uid,
+			&ctx.base);
+		dport = devm_cxl_add_rch_dport(root_port, bridge, ctx.uid,
+					       ctx.base);
+	} else {
+		dport = devm_cxl_add_dport(root_port, bridge, ctx.uid,
+					   CXL_RESOURCE_NONE);
+	}
+
+	return PTR_ERR_OR_ZERO(dport);
+}
+
+/*
+ * A host bridge is a dport to a CFMWS decode and it is a uport to the
+ * dport (PCIe Root Ports) in the host bridge.
+ */
+static int add_host_bridge_uport(struct device *match, void *arg)
+{
+	struct cxl_port *root_port = arg;
+	struct device *host = root_port->dev.parent;
+	struct acpi_device *hb = to_cxl_host_bridge(host, match);
+	struct acpi_pci_root *pci_root;
+	struct cxl_dport *dport;
+	struct cxl_port *port;
+	struct device *bridge;
+	struct cxl_chbs_context ctx;
+	resource_size_t component_reg_phys;
+	int rc;
+
+	if (!hb)
+		return 0;
+
+	pci_root = acpi_pci_find_root(hb->handle);
+	bridge = pci_root->bus->bridge;
+	dport = cxl_find_dport_by_dev(root_port, bridge);
+	if (!dport) {
+		dev_dbg(host, "host bridge expected and not found\n");
+		return 0;
+	}
+
+	if (dport->rch) {
+		dev_info(bridge, "host supports CXL (restricted)\n");
+		return 0;
+	}
+
+	rc = cxl_get_chbs(match, hb, &ctx);
+	if (rc)
+		return rc;
+
+	if (ctx.cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11) {
+		dev_warn(bridge,
+			 "CXL CHBS version mismatch, skip port registration\n");
+		return 0;
+	}
+
+	component_reg_phys = ctx.base;
+	if (component_reg_phys != CXL_RESOURCE_NONE)
+		dev_dbg(match, "CHBCR found for UID %lld: %pa\n",
+			ctx.uid, &component_reg_phys);
+
+	rc = devm_cxl_register_pci_bus(host, bridge, pci_root->bus);
+	if (rc)
+		return rc;
+
+	port = devm_cxl_add_port(host, bridge, component_reg_phys, dport);
+	if (IS_ERR(port))
+		return PTR_ERR(port);
+
+	dev_info(bridge, "host supports CXL\n");
+
+	return 0;
+}
+
+/*
+ * device_for_each_child() callback: add an nvdimm bridge for the first root
+ * decoder flagged CXL_DECODER_F_PMEM. Returns 1 once a bridge was added.
+ */
+static int add_root_nvdimm_bridge(struct device *match, void *data)
+{
+	struct cxl_port *root_port = data;
+	struct device *host = root_port->dev.parent;
+	struct cxl_nvdimm_bridge *bridge;
+
+	if (!is_root_decoder(match) ||
+	    !(to_cxl_decoder(match)->flags & CXL_DECODER_F_PMEM))
+		return 0;
+
+	bridge = devm_cxl_add_nvdimm_bridge(host, root_port);
+	if (IS_ERR(bridge)) {
+		dev_dbg(host, "failed to register pmem\n");
+		return PTR_ERR(bridge);
+	}
+	dev_dbg(host, "%s: add: %s\n", dev_name(&root_port->dev),
+		dev_name(&bridge->dev));
+	return 1;
+}
+
+static struct lock_class_key cxl_root_key;
+
+static void cxl_acpi_lock_reset_class(void *dev)
+{
+	device_lock_reset_class(dev);
+}
+
+static void del_cxl_resource(struct resource *res)
+{
+	kfree(res->name);
+	kfree(res);
+}
+
+static void cxl_set_public_resource(struct resource *priv, struct resource *pub)
+{
+	priv->desc = (unsigned long) pub;
+}
+
+static struct resource *cxl_get_public_resource(struct resource *priv)
+{
+	return (struct resource *) priv->desc;
+}
+
+/* devm action: tear down every private window and its public iomem twin */
+static void remove_cxl_resources(void *data)
+{
+	struct resource *root = data;
+	struct resource *win, *following, *pub;
+
+	for (win = root->child; win; win = following) {
+		pub = cxl_get_public_resource(win);
+		following = win->sibling;
+		remove_resource(win);
+		if (pub) {
+			/* name is shared with @win, freed by del_cxl_resource() */
+			remove_resource(pub);
+			kfree(pub);
+		}
+		del_cxl_resource(win);
+	}
+}
+
+/**
+ * add_cxl_resources() - reflect CXL fixed memory windows in iomem_resource
+ * @cxl_res: A standalone resource tree where each CXL window is a sibling
+ *
+ * Walk each CXL window in @cxl_res and add it to iomem_resource potentially
+ * expanding its boundaries to ensure that any conflicting resources become
+ * children. If a window is expanded it may then conflict with another window
+ * entry and require the window to be truncated or trimmed. Consider this
+ * situation:
+ *
+ * |-- "CXL Window 0" --||----- "CXL Window 1" -----|
+ * |--------------- "System RAM" -------------|
+ *
+ * ...where platform firmware has established a System RAM resource across 2
+ * windows, but has left some portion of window 1 for dynamic CXL region
+ * provisioning. In this case "Window 0" will span the entirety of the "System
+ * RAM" span, and "CXL Window 1" is truncated to the remaining tail past the end
+ * of that "System RAM" resource.
+ */
+static int add_cxl_resources(struct resource *cxl_res)
+{
+	struct resource *res, *new, *next;
+
+	for (res = cxl_res->child; res; res = next) {
+		new = kzalloc(sizeof(*new), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		new->name = res->name;
+		new->start = res->start;
+		new->end = res->end;
+		new->flags = IORESOURCE_MEM;
+		new->desc = IORES_DESC_CXL;
+
+		/*
+		 * Record the public resource in the private cxl_res tree for
+		 * later removal.
+		 */
+		cxl_set_public_resource(res, new);
+
+		insert_resource_expand_to_fit(&iomem_resource, new);
+
+		next = res->sibling;
+		while (next && resource_overlaps(new, next)) {
+			if (resource_contains(new, next)) {
+				struct resource *_next = next->sibling;
+
+				remove_resource(next);
+				del_cxl_resource(next);
+				next = _next;
+			} else
+				next->start = new->end + 1;
+		}
+	}
+	return 0;
+}
+
+/* Point a root decoder at the public resource of the window containing it */
+static int pair_cxl_resource(struct device *dev, void *data)
+{
+	struct resource *cxl_res = data;
+	struct cxl_root_decoder *cxlrd;
+	struct resource *win, span;
+
+	if (!is_root_decoder(dev))
+		return 0;
+
+	cxlrd = to_cxl_root_decoder(dev);
+	span = (struct resource) {
+		.start = cxlrd->cxlsd.cxld.hpa_range.start,
+		.end = cxlrd->cxlsd.cxld.hpa_range.end,
+		.flags = IORESOURCE_MEM,
+	};
+
+	for (win = cxl_res->child; win; win = win->sibling) {
+		if (!resource_contains(win, &span))
+			continue;
+		cxlrd->res = cxl_get_public_resource(win);
+		break;
+	}
+	return 0;
+}
+
+static int cxl_acpi_probe(struct platform_device *pdev)
+{
+	int rc;
+	struct resource *cxl_res;
+	struct cxl_port *root_port;
+	struct device *host = &pdev->dev;
+	struct acpi_device *adev = ACPI_COMPANION(host);
+	struct cxl_cfmws_context ctx;
+
+	device_lock_set_class(&pdev->dev, &cxl_root_key);
+	rc = devm_add_action_or_reset(&pdev->dev, cxl_acpi_lock_reset_class,
+				      &pdev->dev);
+	if (rc)
+		return rc;
+
+	cxl_res = devm_kzalloc(host, sizeof(*cxl_res), GFP_KERNEL);
+	if (!cxl_res)
+		return -ENOMEM;
+	cxl_res->name = "CXL mem";
+	cxl_res->start = 0;
+	cxl_res->end = -1;
+	cxl_res->flags = IORESOURCE_MEM;
+
+	root_port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL);
+	if (IS_ERR(root_port))
+		return PTR_ERR(root_port);
+
+	rc = bus_for_each_dev(adev->dev.bus, NULL, root_port,
+			      add_host_bridge_dport);
+	if (rc < 0)
+		return rc;
+
+	rc = devm_add_action_or_reset(host, remove_cxl_resources, cxl_res);
+	if (rc)
+		return rc;
+
+	ctx = (struct cxl_cfmws_context) {
+		.dev = host,
+		.root_port = root_port,
+		.cxl_res = cxl_res,
+	};
+	rc = acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx);
+	if (rc < 0)
+		return -ENXIO;
+
+	rc = add_cxl_resources(cxl_res);
+	if (rc)
+		return rc;
+
+	/*
+	 * Populate the root decoders with their related iomem resource,
+	 * if present
+	 */
+	device_for_each_child(&root_port->dev, cxl_res, pair_cxl_resource);
+
+	/*
+	 * Root level scanned with host-bridge as dports, now scan host-bridges
+	 * for their role as CXL uports to their CXL-capable PCIe Root Ports.
+	 */
+	rc = bus_for_each_dev(adev->dev.bus, NULL, root_port,
+			      add_host_bridge_uport);
+	if (rc < 0)
+		return rc;
+
+	if (IS_ENABLED(CONFIG_CXL_PMEM))
+		rc = device_for_each_child(&root_port->dev, root_port,
+					   add_root_nvdimm_bridge);
+	if (rc < 0)
+		return rc;
+
+	/* In case PCI is scanned before ACPI re-trigger memdev attach */
+	cxl_bus_rescan();
+	return 0;
+}
+
+static const struct acpi_device_id cxl_acpi_ids[] = {
+	{ "ACPI0017" },
+	{ },
+};
+MODULE_DEVICE_TABLE(acpi, cxl_acpi_ids);
+
+static const struct platform_device_id cxl_test_ids[] = {
+	{ "cxl_acpi" },
+	{ },
+};
+MODULE_DEVICE_TABLE(platform, cxl_test_ids);
+
+static struct platform_driver cxl_acpi_driver = {
+	.probe = cxl_acpi_probe,
+	.driver = {
+		.name = KBUILD_MODNAME,
+		.acpi_match_table = cxl_acpi_ids,
+	},
+	.id_table = cxl_test_ids,
+};
+
+static int __init cxl_acpi_init(void)
+{
+	return platform_driver_register(&cxl_acpi_driver);
+}
+
+static void __exit cxl_acpi_exit(void)
+{
+	platform_driver_unregister(&cxl_acpi_driver);
+	cxl_bus_drain();
+}
+
+/* load before dax_hmem sees 'Soft Reserved' CXL ranges */
+subsys_initcall(cxl_acpi_init);
+module_exit(cxl_acpi_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS(CXL);
+MODULE_IMPORT_NS(ACPI);
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
new file mode 100644
index 0000000000..1f66b5d4d9
--- /dev/null
+++ b/drivers/cxl/core/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_CXL_BUS) += cxl_core.o
+obj-$(CONFIG_CXL_SUSPEND) += suspend.o
+
+ccflags-y += -I$(srctree)/drivers/cxl
+CFLAGS_trace.o = -DTRACE_INCLUDE_PATH=. -I$(src)
+
+cxl_core-y := port.o
+cxl_core-y += pmem.o
+cxl_core-y += regs.o
+cxl_core-y += memdev.o
+cxl_core-y += mbox.o
+cxl_core-y += pci.o
+cxl_core-y += hdm.o
+cxl_core-y += pmu.o
+cxl_core-$(CONFIG_TRACING) += trace.o
+cxl_core-$(CONFIG_CXL_REGION) += region.o
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
new file mode 100644
index 0000000000..8e5f3d8431
--- /dev/null
+++ b/drivers/cxl/core/core.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020 Intel Corporation. */
+
+#ifndef __CXL_CORE_H__
+#define __CXL_CORE_H__
+
+extern const struct device_type cxl_nvdimm_bridge_type;
+extern const struct device_type cxl_nvdimm_type;
+extern const struct device_type cxl_pmu_type;
+
+extern struct attribute_group cxl_base_attribute_group;
+
+#ifdef CONFIG_CXL_REGION
+extern struct device_attribute dev_attr_create_pmem_region;
+extern struct device_attribute dev_attr_create_ram_region;
+extern struct device_attribute dev_attr_delete_region;
+extern struct device_attribute dev_attr_region;
+extern const struct device_type cxl_pmem_region_type;
+extern const struct device_type cxl_dax_region_type;
+extern const struct device_type cxl_region_type;
+void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled);
+#define CXL_REGION_ATTR(x) (&dev_attr_##x.attr)
+#define CXL_REGION_TYPE(x) (&cxl_region_type)
+#define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr),
+#define CXL_PMEM_REGION_TYPE(x) (&cxl_pmem_region_type)
+#define CXL_DAX_REGION_TYPE(x) (&cxl_dax_region_type)
+int cxl_region_init(void);
+void cxl_region_exit(void);
+int cxl_get_poison_by_endpoint(struct cxl_port *port);
+#else
+static inline int cxl_get_poison_by_endpoint(struct cxl_port *port)
+{
+	return 0;
+}
+static inline void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled)
+{
+}
+static inline int cxl_region_init(void)
+{
+	return 0;
+}
+static inline void cxl_region_exit(void)
+{
+}
+#define CXL_REGION_ATTR(x) NULL
+#define CXL_REGION_TYPE(x) NULL
+#define SET_CXL_REGION_ATTR(x)
+#define CXL_PMEM_REGION_TYPE(x) NULL
+#define CXL_DAX_REGION_TYPE(x) NULL
+#endif
+
+struct cxl_send_command;
+struct cxl_mem_query_commands;
+int cxl_query_cmd(struct cxl_memdev *cxlmd,
+		  struct cxl_mem_query_commands __user *q);
+int cxl_send_cmd(struct cxl_memdev *cxlmd, struct cxl_send_command __user *s);
+void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
+				   resource_size_t length);
+
+struct dentry *cxl_debugfs_create_dir(const char *dir);
+int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
+		     enum cxl_decoder_mode mode);
+int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size);
+int cxl_dpa_free(struct cxl_endpoint_decoder *cxled);
+resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled);
+resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled);
+
+enum cxl_rcrb {
+	CXL_RCRB_DOWNSTREAM,
+	CXL_RCRB_UPSTREAM,
+};
+struct cxl_rcrb_info;
+resource_size_t __rcrb_to_component(struct device *dev,
+				    struct cxl_rcrb_info *ri,
+				    enum cxl_rcrb which);
+
+extern struct rw_semaphore cxl_dpa_rwsem;
+extern struct rw_semaphore cxl_region_rwsem;
+
+int cxl_memdev_init(void);
+void cxl_memdev_exit(void);
+void cxl_mbox_init(void);
+
+enum cxl_poison_trace_type {
+	CXL_POISON_TRACE_LIST,
+	CXL_POISON_TRACE_INJECT,
+	CXL_POISON_TRACE_CLEAR,
+};
+
+#endif /* __CXL_CORE_H__ */
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
new file mode 100644
index 0000000000..90664659d5
--- /dev/null
+++ b/drivers/cxl/core/hdm.c
@@ -0,0 +1,1036 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/seq_file.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+
+#include "cxlmem.h"
+#include "core.h"
+
+/**
+ * DOC: cxl core hdm
+ *
+ * Compute Express Link Host Managed Device Memory, starting with the
+ * CXL 2.0 specification, is managed by an array of HDM Decoder register
+ * instances per CXL port and per CXL endpoint. Define common helpers
+ * for enumerating these registers and capabilities.
+ */
+
+DECLARE_RWSEM(cxl_dpa_rwsem);
+
+static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
+			   int *target_map)
+{
+	struct device *host = &port->dev;
+	int err = cxl_decoder_add_locked(cxld, target_map);
+
+	if (err) {
+		/* the add failed, so drop the reference held on @cxld */
+		put_device(&cxld->dev);
+		dev_err(host, "Failed to add decoder\n");
+		return err;
+	}
+
+	err = cxl_decoder_autoremove(host, cxld);
+	if (err)
+		return err;
+
+	dev_dbg(&cxld->dev, "Added to port %s\n", dev_name(host));
+	return 0;
+}
+
+/*
+ * Per the CXL specification (8.2.5.12 CXL HDM Decoder Capability Structure)
+ * single ported host-bridges need not publish a decoder capability when a
+ * passthrough decode can be assumed, i.e. all transactions that the uport sees
+ * are claimed and passed to the single dport. Disable the range until the first
+ * CXL region is enumerated / activated.
+ */
+int devm_cxl_add_passthrough_decoder(struct cxl_port *port)
+{
+	struct cxl_switch_decoder *cxlsd;
+	struct cxl_dport *dport = NULL;
+	int single_port_map[1];
+	unsigned long index;
+
+	device_lock_assert(&port->dev);
+	xa_for_each(&port->dports, index, dport)
+		break;	/* only the first dport is of interest */
+	if (!dport)
+		return -ENXIO;	/* nothing to pass decode through to */
+
+	cxlsd = cxl_switch_decoder_alloc(port, 1);
+	if (IS_ERR(cxlsd))
+		return PTR_ERR(cxlsd);
+	single_port_map[0] = dport->port_id;
+
+	return add_hdm_decoder(port, &cxlsd->cxld, single_port_map);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_passthrough_decoder, CXL);
+
+static void parse_hdm_decoder_caps(struct cxl_hdm *cxlhdm)
+{
+	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+	u32 caps = readl(hdm + CXL_HDM_DECODER_CAP_OFFSET);
+
+	/* cache the decoder / target counts and the interleave address bits */
+	cxlhdm->target_count =
+		FIELD_GET(CXL_HDM_DECODER_TARGET_COUNT_MASK, caps);
+	cxlhdm->decoder_count = cxl_hdm_decoder_count(caps);
+	if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_14_12, caps))
+		cxlhdm->interleave_mask |= GENMASK(14, 12);
+	if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_11_8, caps))
+		cxlhdm->interleave_mask |= GENMASK(11, 8);
+}
+
+static int map_hdm_decoder_regs(struct cxl_port *port, void __iomem *crb,
+				struct cxl_component_regs *regs)
+{
+	struct cxl_register_map map = {
+		.max_size = CXL_COMPONENT_REG_BLOCK_SIZE,
+		.resource = port->component_reg_phys,
+		.host = &port->dev,
+		.base = crb,
+	};
+
+	cxl_probe_component_regs(&port->dev, crb, &map.component_map);
+	if (map.component_map.hdm_decoder.valid)
+		return cxl_map_component_regs(&map, regs,
+					      BIT(CXL_CM_CAP_CAP_ID_HDM));
+
+	/* -ENODEV is reserved to mean "no HDM decoder capability" */
+	dev_dbg(&port->dev, "HDM decoder registers not implemented\n");
+	return -ENODEV;
+}
+
+static bool should_emulate_decoders(struct cxl_endpoint_dvsec_info *info)
+{
+	struct cxl_hdm *cxlhdm;
+	void __iomem *hdm;
+	u32 ctrl;
+	int i;
+
+	if (!info)
+		return false;	/* no cached DVSEC info, nothing to emulate */
+
+	cxlhdm = dev_get_drvdata(&info->port->dev);
+	hdm = cxlhdm->regs.hdm_decoder;
+
+	if (!hdm)
+		return true;	/* no HDM decoder registers were mapped */
+
+	/*
+	 * If HDM decoders are present and the driver is in control of
+	 * Mem_Enable, skip DVSEC based emulation.
+	 */
+	if (!info->mem_enabled)
+		return false;
+
+	/*
+	 * Emulation is only valid while no HDM decoder is committed; a
+	 * single committed decoder rules out emulated DVSEC decoders.
+	 */
+	for (i = 0; i < cxlhdm->decoder_count; i++) {
+		ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i));
+		dev_dbg(&info->port->dev,
+			"decoder%d.%d: committed: %ld base: %#x_%.8x size: %#x_%.8x\n",
+			info->port->id, i,
+			FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl),
+			readl(hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i)),
+			readl(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(i)),
+			readl(hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i)),
+			readl(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i)));
+		if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl))
+			return false;
+	}
+
+	return true;	/* Mem_Enable set and no decoder committed */
+}
+
+/**
+ * devm_cxl_setup_hdm - map HDM decoder component registers
+ * @port: cxl_port to map
+ * @info: cached DVSEC range register info
+ */
+struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port,
+				   struct cxl_endpoint_dvsec_info *info)
+{
+	struct cxl_hdm *cxlhdm;
+	void __iomem *crb;
+	int rc;
+
+	cxlhdm = devm_kzalloc(&port->dev, sizeof(*cxlhdm), GFP_KERNEL);
+	if (!cxlhdm)
+		return ERR_PTR(-ENOMEM);
+	cxlhdm->port = port;
+	dev_set_drvdata(&port->dev, cxlhdm);
+
+	crb = ioremap(port->component_reg_phys, CXL_COMPONENT_REG_BLOCK_SIZE);
+	if (!crb) {
+		if (!info || !info->mem_enabled) {
+			dev_err(&port->dev, "No component registers mapped\n");
+			return ERR_PTR(-ENXIO);
+		}
+		cxlhdm->decoder_count = info->ranges;
+		return cxlhdm;
+	}
+
+	rc = map_hdm_decoder_regs(port, crb, &cxlhdm->regs);
+	iounmap(crb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	parse_hdm_decoder_caps(cxlhdm);
+	if (!cxlhdm->decoder_count) {
+		dev_err(&port->dev, "Spec violation. Caps invalid\n");
+		return ERR_PTR(-ENXIO);
+	}
+
+	/*
+	 * With the capability parsed, check whether the DVSEC range
+	 * registers must stand in for the HDM decoders and resize to match.
+	 */
+	if (should_emulate_decoders(info)) {
+		dev_dbg(&port->dev, "Fallback map %d range register%s\n",
+			info->ranges, info->ranges > 1 ? "s" : "");
+		cxlhdm->decoder_count = info->ranges;
+	}
+
+	return cxlhdm;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_hdm, CXL);
+
+static void __cxl_dpa_debug(struct seq_file *file, struct resource *r, int depth)
+{
+	int indent = depth * 2;	/* two columns per nesting level */
+
+	seq_printf(file, "%*s%08llx-%08llx : %s\n", indent, "",
+		   (unsigned long long)r->start, (unsigned long long)r->end, r->name);
+}
+
+void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
+{
+	struct resource *top, *nested;
+
+	down_read(&cxl_dpa_rwsem);	/* dump under the DPA read lock */
+	for (top = cxlds->dpa_res.child; top; top = top->sibling) {
+		__cxl_dpa_debug(file, top, 0);
+		for (nested = top->child; nested; nested = nested->sibling)
+			__cxl_dpa_debug(file, nested, 1);
+	}
+	up_read(&cxl_dpa_rwsem);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, CXL);
+
+/*
+ * Must be called in a context that synchronizes against this decoder's
+ * port ->remove() callback (like an endpoint decoder sysfs attribute)
+ */
+static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_dev_state *cxlds = cxled_to_memdev(cxled)->cxlds;
+	struct cxl_port *port = cxled_to_port(cxled);
+	struct resource *alloc = cxled->dpa_res;
+	resource_size_t skip_len = cxled->skip;
+	resource_size_t skip_base;
+
+	lockdep_assert_held_write(&cxl_dpa_rwsem);
+
+	/* @alloc is gone after the first release, so note @skip_base now */
+	skip_base = alloc->start - skip_len;
+	__release_region(&cxlds->dpa_res, alloc->start, resource_size(alloc));
+	if (skip_len)
+		__release_region(&cxlds->dpa_res, skip_base, skip_len);
+	cxled->dpa_res = NULL;
+	cxled->skip = 0;
+	put_device(&cxled->cxld.dev);
+	port->hdm_end--;
+}
+
+static void cxl_dpa_release(void *cxled)
+{
+	/* devm action callback: release @cxled's DPA under the write lock */
+	guard(rwsem_write)(&cxl_dpa_rwsem);
+	__cxl_dpa_release(cxled);
+}
+
+/*
+ * Must be called from context that will not race port device
+ * unregistration, like decoder sysfs attribute methods
+ */
+static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
+{
+	struct device *host = &cxled_to_port(cxled)->dev;
+
+	lockdep_assert_held_write(&cxl_dpa_rwsem);
+	devm_remove_action(host, cxl_dpa_release, cxled);
+	__cxl_dpa_release(cxled);	/* run the release by hand instead */
+}
+
+static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
+			     resource_size_t base, resource_size_t len,
+			     resource_size_t skipped)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_port *port = cxled_to_port(cxled);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct device *dev = &port->dev;
+	struct resource *res;
+
+	lockdep_assert_held_write(&cxl_dpa_rwsem);
+
+	if (!len) {
+		dev_warn(dev, "decoder%d.%d: empty reservation attempted\n",
+			 port->id, cxled->cxld.id);
+		return -EINVAL;
+	}
+
+	if (cxled->dpa_res) {
+		dev_dbg(dev, "decoder%d.%d: existing allocation %pr assigned\n",
+			port->id, cxled->cxld.id, cxled->dpa_res);
+		return -EBUSY;
+	}
+
+	if (port->hdm_end + 1 != cxled->cxld.id) {
+		/*
+		 * Allocation, like commit, must proceed in hardware instance
+		 * order: per CXL 2.0 8.2.5.12.20 Committing Decoder
+		 * Programming, decoder[m] is committed before decoder[m+1]
+		 * starts to commit.
+		 */
+		dev_dbg(dev, "decoder%d.%d: expected decoder%d.%d\n", port->id,
+			cxled->cxld.id, port->id, port->hdm_end + 1);
+		return -EBUSY;
+	}
+
+	if (skipped) {	/* claim the @skipped bytes that end right at @base */
+		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
+				       dev_name(&cxled->cxld.dev), 0);
+		if (!res) {
+			dev_dbg(dev,
+				"decoder%d.%d: failed to reserve skipped space\n",
+				port->id, cxled->cxld.id);
+			return -EBUSY;
+		}
+	}
+	res = __request_region(&cxlds->dpa_res, base, len,
+			       dev_name(&cxled->cxld.dev), 0);
+	if (!res) {
+		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
+			port->id, cxled->cxld.id);
+		if (skipped)	/* undo the skip reservation made above */
+			__release_region(&cxlds->dpa_res, base - skipped,
+					 skipped);
+		return -EBUSY;
+	}
+	cxled->dpa_res = res;
+	cxled->skip = skipped;
+
+	if (resource_contains(&cxlds->pmem_res, res))	/* mode follows partition */
+		cxled->mode = CXL_DECODER_PMEM;
+	else if (resource_contains(&cxlds->ram_res, res))
+		cxled->mode = CXL_DECODER_RAM;
+	else {
+		dev_dbg(dev, "decoder%d.%d: %pr mixed\n", port->id,
+			cxled->cxld.id, cxled->dpa_res);
+		cxled->mode = CXL_DECODER_MIXED;
+	}
+
+	port->hdm_end++;	/* @cxled is now the last decoder holding DPA */
+	get_device(&cxled->cxld.dev);	/* dropped in __cxl_dpa_release() */
+	return 0;
+}
+
+int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
+				resource_size_t base, resource_size_t len,
+				resource_size_t skipped)
+{
+	struct device *host = &cxled_to_port(cxled)->dev;
+	int err;
+
+	/* the DPA lock covers only the reservation, not the devres setup */
+	down_write(&cxl_dpa_rwsem);
+	err = __cxl_dpa_reserve(cxled, base, len, skipped);
+	up_write(&cxl_dpa_rwsem);
+	if (err)
+		return err;
+
+	return devm_add_action_or_reset(host, cxl_dpa_release, cxled);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_dpa_reserve, CXL);
+
+resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled)
+{
+	resource_size_t bytes;
+
+	/* a decoder without a DPA reservation reports a size of zero */
+	down_read(&cxl_dpa_rwsem);
+	bytes = cxled->dpa_res ? resource_size(cxled->dpa_res) : 0;
+	up_read(&cxl_dpa_rwsem);
+
+	return bytes;
+}
+
+resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled)
+{
+	const struct resource *res;
+
+	lockdep_assert_held(&cxl_dpa_rwsem);
+	res = cxled->dpa_res;
+
+	/* all-ones (-1) signals "no DPA reservation" */
+	return res ? res->start : (resource_size_t)-1;
+}
+
+int cxl_dpa_free(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_port *port = cxled_to_port(cxled);
+	struct device *dev = &cxled->cxld.dev;
+	int rc = -EBUSY;
+
+	down_write(&cxl_dpa_rwsem);
+	/* nothing reserved: freeing is trivially successful */
+	if (!cxled->dpa_res) {
+		rc = 0;
+		goto unlock;
+	}
+	/* a decoder that is bound to a region or enabled keeps its DPA */
+	if (cxled->cxld.region) {
+		dev_dbg(dev, "decoder assigned to: %s\n",
+			dev_name(&cxled->cxld.region->dev));
+		goto unlock;
+	}
+	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
+		dev_dbg(dev, "decoder enabled\n");
+		goto unlock;
+	}
+	/* only the most recently reserved decoder may give up its DPA */
+	if (cxled->cxld.id != port->hdm_end) {
+		dev_dbg(dev, "expected decoder%d.%d\n", port->id,
+			port->hdm_end);
+		goto unlock;
+	}
+	devm_cxl_dpa_release(cxled);
+	rc = 0;
+unlock:
+	up_write(&cxl_dpa_rwsem);
+	return rc;
+}
+
+int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
+		     enum cxl_decoder_mode mode)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct device *dev = &cxled->cxld.dev;
+	int rc;
+
+	/*
+	 * Only ram and pmem are selectable; reject anything else before
+	 * taking the lock.
+	 */
+	if (mode != CXL_DECODER_RAM && mode != CXL_DECODER_PMEM) {
+		dev_dbg(dev, "unsupported mode: %d\n", mode);
+		return -EINVAL;
+	}
+
+	down_write(&cxl_dpa_rwsem);
+	/* the mode of an enabled decoder cannot be changed */
+	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
+		rc = -EBUSY;
+		goto unlock;
+	}
+
+	/*
+	 * Refuse a mode whose partition is empty in the current
+	 * partition configuration.
+	 */
+	rc = -ENXIO;
+	if (mode == CXL_DECODER_PMEM && !resource_size(&cxlds->pmem_res)) {
+		dev_dbg(dev, "no available pmem capacity\n");
+		goto unlock;
+	}
+	if (mode == CXL_DECODER_RAM && !resource_size(&cxlds->ram_res)) {
+		dev_dbg(dev, "no available ram capacity\n");
+		goto unlock;
+	}
+
+	cxled->mode = mode;
+	rc = 0;
+unlock:
+	up_write(&cxl_dpa_rwsem);
+
+	return rc;
+}
+
+int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	resource_size_t free_ram_start, free_pmem_start;
+	struct cxl_port *port = cxled_to_port(cxled);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct device *dev = &cxled->cxld.dev;
+	resource_size_t start, avail, skip;
+	struct resource *p, *last;
+	int rc;
+
+	down_write(&cxl_dpa_rwsem);
+	if (cxled->cxld.region) {
+		dev_dbg(dev, "decoder attached to %s\n",
+			dev_name(&cxled->cxld.region->dev));
+		rc = -EBUSY;
+		goto out;
+	}
+
+	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
+		dev_dbg(dev, "decoder enabled\n");
+		rc = -EBUSY;
+		goto out;
+	}
+
+	for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling)
+		last = p;	/* find the last ram allocation, if any */
+	if (last)
+		free_ram_start = last->end + 1;
+	else
+		free_ram_start = cxlds->ram_res.start;
+
+	for (p = cxlds->pmem_res.child, last = NULL; p; p = p->sibling)
+		last = p;	/* find the last pmem allocation, if any */
+	if (last)
+		free_pmem_start = last->end + 1;
+	else
+		free_pmem_start = cxlds->pmem_res.start;
+
+	if (cxled->mode == CXL_DECODER_RAM) {
+		start = free_ram_start;
+		avail = cxlds->ram_res.end - start + 1;
+		skip = 0;
+	} else if (cxled->mode == CXL_DECODER_PMEM) {
+		resource_size_t skip_start, skip_end;
+
+		start = free_pmem_start;
+		avail = cxlds->pmem_res.end - start + 1;
+		skip_start = free_ram_start;
+
+		/*
+		 * If some pmem is already allocated, then that allocation
+		 * already handled the skip.
+		 */
+		if (cxlds->pmem_res.child &&
+		    skip_start == cxlds->pmem_res.child->start)
+			skip_end = skip_start - 1;
+		else
+			skip_end = start - 1;
+		skip = skip_end - skip_start + 1;
+	} else {
+		dev_dbg(dev, "mode not set\n");
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (size > avail) {
+		/* @size is not a phys_addr_t, so it must not go through %pa */
+		dev_dbg(dev, "%#llx exceeds available %s capacity: %pa\n", size,
+			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
+			&avail);
+		rc = -ENOSPC;
+		goto out;
+	}
+	rc = __cxl_dpa_reserve(cxled, start, size, skip);
+out:
+	up_write(&cxl_dpa_rwsem);
+
+	if (rc)
+		return rc;
+
+	return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled);
+}
+
+static void cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl)
+{
+	u16 eig;
+	u8 eiw;
+
+	/*
+	 * Input validation ensures these warns never fire, but otherwise
+	 * suppress uninitialized variable usage warnings.
+	 */
+	if (WARN_ONCE(ways_to_eiw(cxld->interleave_ways, &eiw),
+		      "invalid interleave_ways: %d\n", cxld->interleave_ways))
+		return;
+	if (WARN_ONCE(granularity_to_eig(cxld->interleave_granularity, &eig),
+		      "invalid interleave_granularity: %d\n",
+		      cxld->interleave_granularity))
+		return;
+
+	u32p_replace_bits(ctrl, eig, CXL_HDM_DECODER0_CTRL_IG_MASK);
+	u32p_replace_bits(ctrl, eiw, CXL_HDM_DECODER0_CTRL_IW_MASK);
+	*ctrl |= CXL_HDM_DECODER0_CTRL_COMMIT;	/* also request the commit */
+}
+
+static void cxld_set_type(struct cxl_decoder *cxld, u32 *ctrl)
+{
+	bool host_only = cxld->target_type == CXL_DECODER_HOSTONLYMEM;
+
+	u32p_replace_bits(ctrl, host_only, CXL_HDM_DECODER0_CTRL_HOSTONLY);
+}
+
+static void cxlsd_set_targets(struct cxl_switch_decoder *cxlsd, u64 *tgt)
+{
+	const int ways = cxlsd->cxld.interleave_ways;
+	u64 list;
+	int pos;
+
+	/*
+	 * The target list packs one 8-bit dport id per interleave position:
+	 * position 0 in bits 7:0, position 1 in bits 15:8, and so on up to
+	 * position 7 in bits 63:56.
+	 */
+
+	/* position 0 is always populated, whatever @ways says */
+	list = (u64)cxlsd->target[0]->port_id & 0xff;
+
+	/* remaining positions are filled only as far as @ways reaches */
+	for (pos = 1; pos < 8 && pos < ways; pos++)
+		list |= ((u64)cxlsd->target[pos]->port_id & 0xff) << (pos * 8);
+
+	*tgt = list;
+}
+
+/*
+ * Per CXL 2.0 8.2.5.12.20 Committing Decoder Programming, hardware must set
+ * committed or error within 10ms, but just be generous with 20ms to account for
+ * clock skew and other marginal behavior
+ */
+#define COMMIT_TIMEOUT_MS 20
+static int cxld_await_commit(void __iomem *hdm, int id)
+{
+	void __iomem *ctrl_reg = hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id);
+	u32 ctrl;
+	int ms;
+
+	/* sample the control register once per millisecond */
+	for (ms = 0; ms < COMMIT_TIMEOUT_MS; ms++) {
+		ctrl = readl(ctrl_reg);
+		if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMIT_ERROR, ctrl)) {
+			writel(ctrl & ~CXL_HDM_DECODER0_CTRL_COMMIT, ctrl_reg);
+			return -EIO;
+		}
+		if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl))
+			return 0;
+		fsleep(1000);
+	}
+	return -ETIMEDOUT;
+}
+
+static int cxl_decoder_commit(struct cxl_decoder *cxld)
+{
+	struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+	struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
+	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+	int id = cxld->id, rc;
+	u64 base, size;
+	u32 ctrl;
+
+	if (cxld->flags & CXL_DECODER_F_ENABLE)
+		return 0;	/* already committed */
+
+	if (cxl_num_decoders_committed(port) != id) {
+		dev_dbg(&port->dev,
+			"%s: out of order commit, expected decoder%d.%d\n",
+			dev_name(&cxld->dev), port->id,
+			cxl_num_decoders_committed(port));
+		return -EBUSY;
+	}
+
+	/*
+	 * For endpoint decoders hosted on CXL memory devices that
+	 * support the sanitize operation, make sure sanitize is not in-flight.
+	 */
+	if (is_endpoint_decoder(&cxld->dev)) {
+		struct cxl_endpoint_decoder *cxled =
+			to_cxl_endpoint_decoder(&cxld->dev);
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_memdev_state *mds =
+			to_cxl_memdev_state(cxlmd->cxlds);
+
+		if (mds && mds->security.sanitize_active) {
+			dev_dbg(&cxlmd->dev,
+				"attempted to commit %s during sanitize\n",
+				dev_name(&cxld->dev));
+			return -EBUSY;
+		}
+	}
+
+	down_read(&cxl_dpa_rwsem);
+	/* common decoder settings */
+	ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id));
+	cxld_set_interleave(cxld, &ctrl);
+	cxld_set_type(cxld, &ctrl);
+	base = cxld->hpa_range.start;
+	size = range_len(&cxld->hpa_range);
+
+	writel(upper_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id));
+	writel(lower_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id));
+	writel(upper_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id));
+	writel(lower_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id));
+
+	if (is_switch_decoder(&cxld->dev)) {
+		struct cxl_switch_decoder *cxlsd =
+			to_cxl_switch_decoder(&cxld->dev);
+		void __iomem *tl_hi = hdm + CXL_HDM_DECODER0_TL_HIGH(id);
+		void __iomem *tl_lo = hdm + CXL_HDM_DECODER0_TL_LOW(id);
+		u64 targets;
+
+		cxlsd_set_targets(cxlsd, &targets);
+		writel(upper_32_bits(targets), tl_hi);
+		writel(lower_32_bits(targets), tl_lo);
+	} else {
+		struct cxl_endpoint_decoder *cxled =
+			to_cxl_endpoint_decoder(&cxld->dev);
+		void __iomem *sk_hi = hdm + CXL_HDM_DECODER0_SKIP_HIGH(id);
+		void __iomem *sk_lo = hdm + CXL_HDM_DECODER0_SKIP_LOW(id);
+
+		writel(upper_32_bits(cxled->skip), sk_hi);
+		writel(lower_32_bits(cxled->skip), sk_lo);
+	}
+
+	writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+	up_read(&cxl_dpa_rwsem);
+
+	port->commit_end++;
+	rc = cxld_await_commit(hdm, cxld->id);
+	if (rc) {
+		dev_dbg(&port->dev, "%s: error %d committing decoder\n",
+			dev_name(&cxld->dev), rc);
+		port->commit_end--; /* ->reset() skips !F_ENABLE decoders */
+		return rc;
+	}
+	cxld->flags |= CXL_DECODER_F_ENABLE;
+
+	return 0;
+}
+
+static int cxl_decoder_reset(struct cxl_decoder *cxld)
+{
+	struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+	struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
+	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+	const int id = cxld->id;
+	u32 ctrl;
+
+	/* a decoder that is not enabled has nothing to tear down */
+	if (!(cxld->flags & CXL_DECODER_F_ENABLE))
+		return 0;
+
+	/* only the last committed decoder of the port may be reset */
+	if (id != port->commit_end) {
+		dev_dbg(&port->dev,
+			"%s: out of order reset, expected decoder%d.%d\n",
+			dev_name(&cxld->dev), port->id, port->commit_end);
+		return -EBUSY;
+	}
+
+	down_read(&cxl_dpa_rwsem);
+	ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+	writel(ctrl & ~CXL_HDM_DECODER0_CTRL_COMMIT,
+	       hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+
+	/* wipe the programmed size and base */
+	writel(0, hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id));
+	writel(0, hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id));
+	writel(0, hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id));
+	writel(0, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id));
+	up_read(&cxl_dpa_rwsem);
+
+	cxld->flags &= ~CXL_DECODER_F_ENABLE;
+	port->commit_end--;
+
+	/* Userspace is now responsible for reconfiguring this decoder */
+	if (is_endpoint_decoder(&cxld->dev))
+		to_cxl_endpoint_decoder(&cxld->dev)->state =
+			CXL_DECODER_STATE_MANUAL;
+
+	return 0;
+}
+
+static int cxl_setup_hdm_decoder_from_dvsec(
+	struct cxl_port *port, struct cxl_decoder *cxld, u64 *dpa_base,
+	int which, struct cxl_endpoint_dvsec_info *info)
+{
+	struct cxl_endpoint_decoder *cxled;
+	u64 len;
+	int rc;
+
+	if (!is_cxl_endpoint(port))
+		return -EOPNOTSUPP; /* endpoint ports only */
+
+	cxled = to_cxl_endpoint_decoder(&cxld->dev);
+	len = range_len(&info->dvsec_range[which]);
+	if (!len)
+		return -ENOENT; /* DVSEC range @which is empty */
+
+	cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+	cxld->commit = NULL;
+	cxld->reset = NULL;
+	cxld->hpa_range = info->dvsec_range[which];
+
+	/*
+	 * Set the emulated decoder as locked pending additional support to
+	 * change the range registers at run time.
+	 */
+	cxld->flags |= CXL_DECODER_F_ENABLE | CXL_DECODER_F_LOCK;
+	port->commit_end = cxld->id;
+
+	rc = devm_cxl_dpa_reserve(cxled, *dpa_base, len, 0);
+	if (rc) {
+		dev_err(&port->dev,
+			"decoder%d.%d: Failed to reserve DPA range %#llx - %#llx (%d)\n",
+			port->id, cxld->id, *dpa_base, *dpa_base + len - 1, rc);
+		return rc;
+	}
+	*dpa_base += len; /* the next range starts where this one ended */
+	cxled->state = CXL_DECODER_STATE_AUTO;
+
+	return 0;
+}
+
+static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
+			    int *target_map, void __iomem *hdm, int which,
+			    u64 *dpa_base, struct cxl_endpoint_dvsec_info *info)
+{
+	struct cxl_endpoint_decoder *cxled = NULL;
+	u64 size, base, skip, dpa_size, lo, hi;
+	bool committed;
+	u32 remainder;
+	int i, rc;
+	u32 ctrl;
+	union {
+		u64 value;
+		unsigned char target_id[8];
+	} target_list;
+
+	if (should_emulate_decoders(info))
+		return cxl_setup_hdm_decoder_from_dvsec(port, cxld, dpa_base,
+							which, info);
+
+	ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which));
+	lo = readl(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(which));
+	hi = readl(hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(which));
+	base = (hi << 32) + lo;
+	lo = readl(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(which));
+	hi = readl(hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(which));
+	size = (hi << 32) + lo;
+	committed = !!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED);
+	cxld->commit = cxl_decoder_commit;
+	cxld->reset = cxl_decoder_reset;
+
+	if (!committed)
+		size = 0;	/* ignore the size of an uncommitted decoder */
+	if (base == U64_MAX || size == U64_MAX) {
+		dev_warn(&port->dev, "decoder%d.%d: Invalid resource range\n",
+			 port->id, cxld->id);
+		return -ENXIO;
+	}
+
+	if (info)
+		cxled = to_cxl_endpoint_decoder(&cxld->dev);
+	cxld->hpa_range = (struct range) {
+		.start = base,
+		.end = base + size - 1,
+	};
+
+	/* decoders are enabled if committed */
+	if (committed) {
+		cxld->flags |= CXL_DECODER_F_ENABLE;
+		if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK)
+			cxld->flags |= CXL_DECODER_F_LOCK;
+		if (FIELD_GET(CXL_HDM_DECODER0_CTRL_HOSTONLY, ctrl))
+			cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+		else
+			cxld->target_type = CXL_DECODER_DEVMEM;
+
+		guard(rwsem_write)(&cxl_region_rwsem);
+		if (cxld->id != cxl_num_decoders_committed(port)) {
+			dev_warn(&port->dev,
+				 "decoder%d.%d: Committed out of order\n",
+				 port->id, cxld->id);
+			return -ENXIO;
+		}
+
+		if (size == 0) {
+			dev_warn(&port->dev,
+				 "decoder%d.%d: Committed with zero size\n",
+				 port->id, cxld->id);
+			return -ENXIO;
+		}
+		port->commit_end = cxld->id;
+	} else {
+		if (cxled) {
+			struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+			struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+			/*
+			 * Default by devtype until a device arrives that needs
+			 * more precision.
+			 */
+			if (cxlds->type == CXL_DEVTYPE_CLASSMEM)
+				cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+			else
+				cxld->target_type = CXL_DECODER_DEVMEM;
+		} else {
+			/* To be overridden by region type at commit time */
+			cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+		}
+
+		if (!FIELD_GET(CXL_HDM_DECODER0_CTRL_HOSTONLY, ctrl) &&
+		    cxld->target_type == CXL_DECODER_HOSTONLYMEM) {
+			ctrl |= CXL_HDM_DECODER0_CTRL_HOSTONLY;
+			writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which));
+		}
+	}
+	rc = eiw_to_ways(FIELD_GET(CXL_HDM_DECODER0_CTRL_IW_MASK, ctrl),
+			  &cxld->interleave_ways);
+	if (rc) {
+		dev_warn(&port->dev,
+			 "decoder%d.%d: Invalid interleave ways (ctrl: %#x)\n",
+			 port->id, cxld->id, ctrl);
+		return rc;
+	}
+	rc = eig_to_granularity(FIELD_GET(CXL_HDM_DECODER0_CTRL_IG_MASK, ctrl),
+				 &cxld->interleave_granularity);
+	if (rc)
+		return rc;
+
+	dev_dbg(&port->dev, "decoder%d.%d: range: %#llx-%#llx iw: %d ig: %d\n",
+		port->id, cxld->id, cxld->hpa_range.start, cxld->hpa_range.end,
+		cxld->interleave_ways, cxld->interleave_granularity);
+
+	if (!cxled) {	/* switch decoder: report the programmed target ids */
+		lo = readl(hdm + CXL_HDM_DECODER0_TL_LOW(which));
+		hi = readl(hdm + CXL_HDM_DECODER0_TL_HIGH(which));
+		target_list.value = (hi << 32) + lo;
+		for (i = 0; i < cxld->interleave_ways; i++)
+			target_map[i] = target_list.target_id[i];
+
+		return 0;
+	}
+
+	if (!committed)
+		return 0;	/* no DPA to account for yet */
+
+	dpa_size = div_u64_rem(size, cxld->interleave_ways, &remainder);
+	if (remainder) {
+		dev_err(&port->dev,
+			"decoder%d.%d: invalid committed configuration size: %#llx ways: %d\n",
+			port->id, cxld->id, size, cxld->interleave_ways);
+		return -ENXIO;
+	}
+	lo = readl(hdm + CXL_HDM_DECODER0_SKIP_LOW(which));
+	hi = readl(hdm + CXL_HDM_DECODER0_SKIP_HIGH(which));
+	skip = (hi << 32) + lo;
+	rc = devm_cxl_dpa_reserve(cxled, *dpa_base + skip, dpa_size, skip);
+	if (rc) {
+		dev_err(&port->dev,
+			"decoder%d.%d: Failed to reserve DPA range %#llx - %#llx (%d)\n",
+			port->id, cxld->id, *dpa_base,
+			*dpa_base + dpa_size + skip - 1, rc);
+		return rc;
+	}
+	*dpa_base += dpa_size + skip;
+
+	cxled->state = CXL_DECODER_STATE_AUTO;
+
+	return 0;
+}
+
+static void cxl_settle_decoders(struct cxl_hdm *cxlhdm)
+{
+	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+	int nr_committed = 0;
+	int idx;
+
+	if (!hdm)
+		return;
+
+	/*
+	 * The register resource was only just claimed via request_region(),
+	 * so a "not-committed" reading cannot be trusted until the commit
+	 * timeout has passed.  That timeout is 10ms (CXL 2.0 8.2.5.12.20);
+	 * it is doubled below to tolerate clock skew between host and
+	 * target.
+	 */
+	for (idx = 0; idx < cxlhdm->decoder_count; idx++) {
+		u32 ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(idx));
+
+		nr_committed += !!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED);
+	}
+
+	/* any uncommitted decoder: wait so later committed checks hold */
+	if (nr_committed != cxlhdm->decoder_count)
+		msleep(20);
+}
+
+/**
+ * devm_cxl_enumerate_decoders - add decoder objects per HDM register set
+ * @cxlhdm: Structure to populate with HDM capabilities
+ * @info: cached DVSEC range register info
+ */
+int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
+				struct cxl_endpoint_dvsec_info *info)
+{
+	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+	struct cxl_port *port = cxlhdm->port;
+	u64 dpa_base = 0;
+	int id, rc;
+
+	cxl_settle_decoders(cxlhdm);
+
+	for (id = 0; id < cxlhdm->decoder_count; id++) {
+		int target_map[CXL_DECODER_MAX_INTERLEAVE] = { 0 };
+		struct cxl_decoder *cxld;
+
+		/*
+		 * Endpoint ports host endpoint decoders; every other port
+		 * type gets a switch decoder sized for the target count
+		 * the HDM capability advertised.
+		 */
+		if (is_cxl_endpoint(port)) {
+			struct cxl_endpoint_decoder *cxled;
+
+			cxled = cxl_endpoint_decoder_alloc(port);
+			cxld = IS_ERR(cxled) ? ERR_CAST(cxled) : &cxled->cxld;
+		} else {
+			struct cxl_switch_decoder *cxlsd;
+
+			cxlsd = cxl_switch_decoder_alloc(port,
+							 cxlhdm->target_count);
+			cxld = IS_ERR(cxlsd) ? ERR_CAST(cxlsd) : &cxlsd->cxld;
+		}
+		if (IS_ERR(cxld)) {
+			dev_warn(&port->dev,
+				 "Failed to allocate decoder%d.%d\n",
+				 port->id, id);
+			return PTR_ERR(cxld);
+		}
+
+		rc = init_hdm_decoder(port, cxld, target_map, hdm, id,
+				      &dpa_base, info);
+		if (rc) {
+			dev_warn(&port->dev,
+				 "Failed to initialize decoder%d.%d\n",
+				 port->id, id);
+			/* never registered, so release it directly */
+			put_device(&cxld->dev);
+			return rc;
+		}
+		rc = add_hdm_decoder(port, cxld, target_map);
+		if (rc) {
+			dev_warn(&port->dev,
+				 "Failed to add decoder%d.%d\n", port->id, id);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_decoders, CXL);
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
new file mode 100644
index 0000000000..b12986b968
--- /dev/null
+++ b/drivers/cxl/core/mbox.c
@@ -0,0 +1,1418 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <linux/security.h>
+#include <linux/debugfs.h>
+#include <linux/ktime.h>
+#include <linux/mutex.h>
+#include <asm/unaligned.h>
+#include <cxlpci.h>
+#include <cxlmem.h>
+#include <cxl.h>
+
+#include "core.h"
+#include "trace.h"
+
+static bool cxl_raw_allow_all; /* true: RAW skips the opcode deny lists */
+
+/**
+ * DOC: cxl mbox
+ *
+ * Core implementation of the CXL 2.0 Type-3 Memory Device Mailbox. The
+ * implementation is used by the cxl_pci driver to initialize the device
+ * and implement the cxl_mem.h IOCTL UAPI. It also implements the
+ * backend of the cxl_pmem_ctl() transport for LIBNVDIMM.
+ */
+
+#define cxl_for_each_cmd(cmd) /* walk every cxl_mem_commands[] slot */         \
+	for ((cmd) = cxl_mem_commands;                                         \
+	     (cmd) < cxl_mem_commands + ARRAY_SIZE(cxl_mem_commands); (cmd)++)
+
+#define CXL_CMD(_id, sin, sout, _flags) /* slot indexed by command id */       \
+	[CXL_MEM_COMMAND_ID_##_id] = {                                         \
+	.info =	{                                                              \
+			.id = CXL_MEM_COMMAND_ID_##_id,                        \
+			.size_in = sin,                                        \
+			.size_out = sout,                                      \
+		},                                                             \
+	.opcode = CXL_MBOX_OP_##_id,                                           \
+	.flags = _flags,                                                       \
+	}
+
+#define CXL_VARIABLE_PAYLOAD	~0U
+/*
+ * Commands the driver exposes to userspace, indexed by UAPI command id. A
+ * size of CXL_VARIABLE_PAYLOAD is not checked; any other size_in must match
+ * the user's input size exactly and any other size_out is the minimum output
+ * buffer size the user must supply (see cxl_to_mem_cmd()).
+ */
+static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
+	CXL_CMD(IDENTIFY, 0, 0x43, CXL_CMD_FLAG_FORCE_ENABLE),
+#ifdef CONFIG_CXL_MEM_RAW_COMMANDS
+	CXL_CMD(RAW, CXL_VARIABLE_PAYLOAD, CXL_VARIABLE_PAYLOAD, 0),
+#endif
+	CXL_CMD(GET_SUPPORTED_LOGS, 0, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
+	CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
+	CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
+	CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0),
+	CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0),
+	CXL_CMD(GET_LOG, 0x18, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
+	CXL_CMD(SET_PARTITION_INFO, 0x0a, 0, 0),
+	CXL_CMD(SET_LSA, CXL_VARIABLE_PAYLOAD, 0, 0),
+	CXL_CMD(GET_ALERT_CONFIG, 0, 0x10, 0),
+	CXL_CMD(SET_ALERT_CONFIG, 0xc, 0, 0),
+	CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0),
+	CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0),
+	CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0),
+};
+
+/*
+ * Commands that RAW doesn't permit. The rationale for each:
+ *
+ * CXL_MBOX_OP_ACTIVATE_FW: Firmware activation requires adjustment /
+ * coordination of transaction timeout values at the root bridge level.
+ *
+ * CXL_MBOX_OP_SET_PARTITION_INFO: The device memory map may change live
+ * and needs to be coordinated with HDM updates.
+ *
+ * CXL_MBOX_OP_SET_LSA: The label storage area may be cached by the
+ * driver and any writes from userspace invalidate those contents.
+ *
+ * CXL_MBOX_OP_SET_SHUTDOWN_STATE: Set shutdown state assumes no writes
+ * to the device after it is marked clean; userspace cannot make that
+ * assertion.
+ *
+ * CXL_MBOX_OP_[GET_]SCAN_MEDIA: The kernel provides a native error list that
+ * is kept up to date with patrol notifications and error management.
+ *
+ * CXL_MBOX_OP_[GET_,INJECT_,CLEAR_]POISON: These commands require kernel
+ * driver orchestration for safety.
+ */
+static u16 cxl_disabled_raw_commands[] = {
+	CXL_MBOX_OP_ACTIVATE_FW,
+	CXL_MBOX_OP_SET_PARTITION_INFO,
+	CXL_MBOX_OP_SET_LSA,
+	CXL_MBOX_OP_SET_SHUTDOWN_STATE,
+	CXL_MBOX_OP_SCAN_MEDIA,
+	CXL_MBOX_OP_GET_SCAN_MEDIA,
+	CXL_MBOX_OP_GET_POISON,
+	CXL_MBOX_OP_INJECT_POISON,
+	CXL_MBOX_OP_CLEAR_POISON,
+};
+
+/*
+ * Command sets that RAW doesn't permit, matched against the upper byte of
+ * the opcode. All opcodes in these sets are disabled because they pass
+ * plain text security payloads over the user/kernel boundary. This
+ * functionality is intended to be wrapped behind the keys ABI, which
+ * allows for encrypted payloads in the UAPI.
+ */
+static u8 security_command_sets[] = {
+	0x44, /* Sanitize */
+	0x45, /* Persistent Memory Data-at-rest Security */
+	0x46, /* Security Passthrough */
+};
+
+static bool cxl_is_security_command(u16 opcode)
+{
+	int idx = ARRAY_SIZE(security_command_sets);
+
+	while (idx--) /* command set == upper opcode byte */
+		if (security_command_sets[idx] == (opcode >> 8))
+			return true;
+	return false;
+}
+
+/* Flag security command @opcode as enabled in security->enabled_cmds. */
+static void cxl_set_security_cmd_enabled(struct cxl_security_state *security,
+					 u16 opcode)
+{
+	int bit;
+
+	/* translate the mailbox opcode into its CXL_SEC_ENABLED_* bit */
+	switch (opcode) {
+	case CXL_MBOX_OP_SANITIZE:
+		bit = CXL_SEC_ENABLED_SANITIZE;
+		break;
+	case CXL_MBOX_OP_SECURE_ERASE:
+		bit = CXL_SEC_ENABLED_SECURE_ERASE;
+		break;
+	case CXL_MBOX_OP_GET_SECURITY_STATE:
+		bit = CXL_SEC_ENABLED_GET_SECURITY_STATE;
+		break;
+	case CXL_MBOX_OP_SET_PASSPHRASE:
+		bit = CXL_SEC_ENABLED_SET_PASSPHRASE;
+		break;
+	case CXL_MBOX_OP_DISABLE_PASSPHRASE:
+		bit = CXL_SEC_ENABLED_DISABLE_PASSPHRASE;
+		break;
+	case CXL_MBOX_OP_UNLOCK:
+		bit = CXL_SEC_ENABLED_UNLOCK;
+		break;
+	case CXL_MBOX_OP_FREEZE_SECURITY:
+		bit = CXL_SEC_ENABLED_FREEZE_SECURITY;
+		break;
+	case CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE:
+		bit = CXL_SEC_ENABLED_PASSPHRASE_SECURE_ERASE;
+		break;
+	default:
+		return; /* not a security opcode the driver tracks */
+	}
+
+	set_bit(bit, security->enabled_cmds);
+}
+
+static bool cxl_is_poison_command(u16 opcode)
+{
+/* poison commands are identified by this value in the upper opcode byte */
+#define CXL_MBOX_OP_POISON_CMDS 0x43
+
+	const unsigned int cmd_set = opcode >> 8;
+
+	return cmd_set == CXL_MBOX_OP_POISON_CMDS;
+}
+
+/* Flag poison command @opcode as enabled; untracked opcodes are ignored. */
+static void cxl_set_poison_cmd_enabled(struct cxl_poison_state *poison,
+				       u16 opcode)
+{
+	/* opcode -> CXL_POISON_ENABLED_* bit in poison->enabled_cmds */
+	static const struct {
+		u16 opcode;
+		int bit;
+	} map[] = {
+		{ CXL_MBOX_OP_GET_POISON, CXL_POISON_ENABLED_LIST },
+		{ CXL_MBOX_OP_INJECT_POISON, CXL_POISON_ENABLED_INJECT },
+		{ CXL_MBOX_OP_CLEAR_POISON, CXL_POISON_ENABLED_CLEAR },
+		{ CXL_MBOX_OP_GET_SCAN_MEDIA_CAPS, CXL_POISON_ENABLED_SCAN_CAPS },
+		{ CXL_MBOX_OP_SCAN_MEDIA, CXL_POISON_ENABLED_SCAN_MEDIA },
+		{ CXL_MBOX_OP_GET_SCAN_MEDIA, CXL_POISON_ENABLED_SCAN_RESULTS },
+	};
+	int i;
+
+	/* each opcode appears once in the map, so stop at the first hit */
+	for (i = 0; i < ARRAY_SIZE(map); i++) {
+		if (map[i].opcode != opcode)
+			continue;
+		set_bit(map[i].bit, poison->enabled_cmds);
+		return;
+	}
+}
+
+static struct cxl_mem_command *cxl_mem_find_command(u16 opcode)
+{
+	struct cxl_mem_command *cmd = NULL;
+	int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(cxl_mem_commands) && !cmd; idx++)
+		if (cxl_mem_commands[idx].opcode == opcode)
+			cmd = &cxl_mem_commands[idx];
+	return cmd;
+}
+
+static const char *cxl_mem_opcode_to_name(u16 opcode)
+{
+	const struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
+
+	/* opcodes absent from the command table have no name to report */
+	if (cmd)
+		return cxl_command_names[cmd->info.id].name;
+
+	return NULL;
+}
+
+/**
+ * cxl_internal_send_cmd() - Kernel internal interface to send a mailbox command
+ * @mds: The driver data for the operation
+ * @mbox_cmd: initialized command to execute
+ *
+ * Context: Any context.
+ * Return:
+ *  * %0	- Success; any requested output met the caller's minimum size.
+ *  * %-E2BIG	- Payload is too large for hardware.
+ *  * %-EBUSY	- Couldn't acquire exclusive mailbox access.
+ *  * %-EFAULT	- Hardware error occurred.
+ *  * %-ENXIO	- Command completed, but device reported an error.
+ *  * %-EIO	- Unexpected output size.
+ *
+ * Mailbox commands may execute successfully yet the device itself reported an
+ * error. While this distinction can be useful for commands from userspace, the
+ * kernel will only be able to use results when both are successful.
+ */
+int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
+			  struct cxl_mbox_cmd *mbox_cmd)
+{
+	/* snapshot the request: mbox_send() may rewrite mbox_cmd->size_out */
+	const size_t req_out = mbox_cmd->size_out;
+	const size_t min_out = mbox_cmd->min_out ? mbox_cmd->min_out : req_out;
+	int rc;
+
+	if (mbox_cmd->size_in > mds->payload_size ||
+	    req_out > mds->payload_size)
+		return -E2BIG;
+
+	rc = mds->mbox_send(mds, mbox_cmd);
+	/*
+	 * EIO is reserved for a payload size mismatch and mbox_send()
+	 * may not return this error.
+	 */
+	if (WARN_ONCE(rc == -EIO, "Bad return code: -EIO"))
+		return -ENXIO;
+	if (rc)
+		return rc;
+
+	switch (mbox_cmd->return_code) {
+	case CXL_MBOX_CMD_RC_SUCCESS:
+	case CXL_MBOX_CMD_RC_BACKGROUND:
+		break;
+	default:
+		return cxl_mbox_cmd_rc2errno(mbox_cmd);
+	}
+
+	/* no output was requested, so there is no size to validate */
+	if (!req_out)
+		return 0;
+
+	/*
+	 * Variable sized output needs to at least satisfy the caller's
+	 * minimum if not the fully requested size.
+	 */
+	return mbox_cmd->size_out < min_out ? -EIO : 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_internal_send_cmd, CXL);
+
+static bool cxl_mem_raw_command_allowed(u16 opcode)
+{
+	const u16 *denied = cxl_disabled_raw_commands;
+	const u16 *end = denied + ARRAY_SIZE(cxl_disabled_raw_commands);
+
+	/* RAW is a build-time opt-in and is refused under PCI lockdown */
+	if (!IS_ENABLED(CONFIG_CXL_MEM_RAW_COMMANDS) ||
+	    security_locked_down(LOCKDOWN_PCI_ACCESS))
+		return false;
+
+	/* the allow-all override bypasses both deny lists below */
+	if (cxl_raw_allow_all)
+		return true;
+
+	if (cxl_is_security_command(opcode))
+		return false;
+
+	for (; denied < end; denied++)
+		if (*denied == opcode)
+			return false;
+	return true;
+}
+
+/**
+ * cxl_payload_from_user_allowed() - Check contents of in_payload.
+ * @opcode: The mailbox command opcode.
+ * @payload_in: Pointer to the input payload passed in from user space.
+ *
+ * Return:
+ *  * true	- payload_in passes check for @opcode.
+ *  * false	- payload_in contains invalid or unsupported values.
+ *
+ * The driver may inspect payload contents before sending a mailbox
+ * command from user space to the device. The intent is to reject
+ * commands with input payloads that are known to be unsafe. This
+ * check is not intended to replace the user's careful selection of
+ * mailbox command parameters and makes no guarantee that the user
+ * command will succeed, nor that it is appropriate.
+ *
+ * The specific checks are determined by the opcode. Currently only Set
+ * Partition Info is inspected: it is rejected when the immediate flag
+ * is set in the payload.
+ */
+static bool cxl_payload_from_user_allowed(u16 opcode, void *payload_in)
+{
+	const struct cxl_mbox_set_partition_info *pi = payload_in;
+
+	/* only Set Partition Info payloads are inspected at present */
+	if (opcode != CXL_MBOX_OP_SET_PARTITION_INFO)
+		return true;
+
+	/* an immediate partition change may not be requested by userspace */
+	if (pi->flags & CXL_SET_PARTITION_IMMEDIATE_FLAG)
+		return false;
+	return true;
+}
+
+/* Build @mbox for a user request: copy in the input, allocate the output. */
+static int cxl_mbox_cmd_ctor(struct cxl_mbox_cmd *mbox,
+			     struct cxl_memdev_state *mds, u16 opcode,
+			     size_t in_size, size_t out_size, u64 in_payload)
+{
+	*mbox = (struct cxl_mbox_cmd) {
+		.opcode = opcode,
+		.size_in = in_size,
+	};
+
+	if (in_size) {
+		void __user *uptr = u64_to_user_ptr(in_payload);
+
+		mbox->payload_in = vmemdup_user(uptr, in_size);
+		if (IS_ERR(mbox->payload_in))
+			return PTR_ERR(mbox->payload_in);
+
+		if (!cxl_payload_from_user_allowed(opcode, mbox->payload_in)) {
+			dev_dbg(mds->cxlds.dev, "%s: input payload not allowed\n",
+				cxl_mem_opcode_to_name(opcode));
+			kvfree(mbox->payload_in);
+			return -EBUSY;
+		}
+	}
+
+	/* Prepare to handle a full payload for variable sized output */
+	mbox->size_out = (out_size == CXL_VARIABLE_PAYLOAD) ?
+				 mds->payload_size : out_size;
+	if (!mbox->size_out)
+		return 0;
+
+	mbox->payload_out = kvzalloc(mbox->size_out, GFP_KERNEL);
+	if (mbox->payload_out)
+		return 0;
+
+	kvfree(mbox->payload_in);
+	return -ENOMEM;
+}
+
+static void cxl_mbox_cmd_dtor(struct cxl_mbox_cmd *mbox)
+{
+	kvfree(mbox->payload_out); /* buffers set up by cxl_mbox_cmd_ctor() */
+	kvfree(mbox->payload_in);
+}
+
+/* Validate a RAW passthrough request and describe it in @mem_cmd. */
+static int cxl_to_mem_cmd_raw(struct cxl_mem_command *mem_cmd,
+			      const struct cxl_send_command *send_cmd,
+			      struct cxl_memdev_state *mds)
+{
+	const u16 opcode = send_cmd->raw.opcode;
+	struct device *dev = mds->cxlds.dev;
+
+	if (send_cmd->raw.rsvd)
+		return -EINVAL;
+
+	/* unlike table commands, RAW output sizes get no other bounds check */
+	if (send_cmd->out.size > mds->payload_size)
+		return -EINVAL;
+
+	if (!cxl_mem_raw_command_allowed(opcode))
+		return -EPERM;
+
+	dev_WARN_ONCE(dev, true, "raw command path used\n");
+
+	*mem_cmd = (struct cxl_mem_command) {
+		.opcode = opcode,
+		.info = {
+			.id = CXL_MEM_COMMAND_ID_RAW,
+			.size_in = send_cmd->in.size,
+			.size_out = send_cmd->out.size,
+		},
+	};
+
+	return 0;
+}
+
+/*
+ * Validate a request for a command from cxl_mem_commands[] and describe it
+ * in @mem_cmd. The caller has already range-checked send_cmd->id.
+ */
+static int cxl_to_mem_cmd(struct cxl_mem_command *mem_cmd,
+			  const struct cxl_send_command *send_cmd,
+			  struct cxl_memdev_state *mds)
+{
+	const struct cxl_mem_command *entry = &cxl_mem_commands[send_cmd->id];
+	const struct cxl_command_info *info = &entry->info;
+
+	/* no unknown flags and no reserved fields may be set */
+	if ((send_cmd->flags & ~CXL_MEM_COMMAND_FLAG_MASK) || send_cmd->rsvd ||
+	    send_cmd->in.rsvd || send_cmd->out.rsvd)
+		return -EINVAL;
+
+	/* Check that the command is enabled for hardware */
+	if (!test_bit(info->id, mds->enabled_cmds))
+		return -ENOTTY;
+
+	/* Check that the command is not claimed for exclusive kernel use */
+	if (test_bit(info->id, mds->exclusive_cmds))
+		return -EBUSY;
+
+	/* a fixed-size input must match the expected size exactly */
+	if (info->size_in != CXL_VARIABLE_PAYLOAD &&
+	    info->size_in != send_cmd->in.size)
+		return -ENOMEM;
+
+	/* a fixed-size output needs a buffer of at least that size */
+	if (info->size_out != CXL_VARIABLE_PAYLOAD &&
+	    send_cmd->out.size < info->size_out)
+		return -ENOMEM;
+
+	*mem_cmd = (struct cxl_mem_command) {
+		.opcode = entry->opcode,
+		.info = {
+			.id = info->id,
+			.flags = info->flags,
+			.size_in = send_cmd->in.size,
+			.size_out = send_cmd->out.size,
+		},
+	};
+
+	return 0;
+}
+
+/**
+ * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND.
+ * @mbox_cmd: Sanitized and populated &struct cxl_mbox_cmd.
+ * @mds: The driver data for the operation
+ * @send_cmd: &struct cxl_send_command copied in from userspace.
+ *
+ * Return:
+ *  * %0	- @mbox_cmd is ready to send.
+ *  * %-ENOTTY	- Invalid command specified.
+ *  * %-EINVAL	- Reserved fields or invalid values were used.
+ *  * %-ENOMEM	- Input or output buffer wasn't sized properly.
+ *  * %-EPERM	- Attempted to use a protected command.
+ *  * %-EBUSY	- Kernel has claimed exclusive access to this opcode
+ *
+ * The result of this command is a fully validated command in @mbox_cmd that is
+ * safe to send to the hardware. On success @mbox_cmd owns payload buffers that
+ * are released with cxl_mbox_cmd_dtor(); errors from copying in the input
+ * payload are returned as well.
+ */
+static int cxl_validate_cmd_from_user(struct cxl_mbox_cmd *mbox_cmd,
+				      struct cxl_memdev_state *mds,
+				      const struct cxl_send_command *send_cmd)
+{
+	struct cxl_mem_command mem_cmd;
+	int rc;
+
+	/* id 0 is rejected, as is any id beyond the command table */
+	if (!send_cmd->id || send_cmd->id >= CXL_MEM_COMMAND_ID_MAX)
+		return -ENOTTY;
+
+	/*
+	 * Input can never exceed what the hardware mailbox holds. Output
+	 * may be arbitrarily large: only what hardware provides is written.
+	 */
+	if (send_cmd->in.size > mds->payload_size)
+		return -EINVAL;
+
+	/* Sanitize and construct a cxl_mem_command */
+	rc = send_cmd->id == CXL_MEM_COMMAND_ID_RAW ?
+		     cxl_to_mem_cmd_raw(&mem_cmd, send_cmd, mds) :
+		     cxl_to_mem_cmd(&mem_cmd, send_cmd, mds);
+	if (rc)
+		return rc;
+
+	/* Sanitize and construct a cxl_mbox_cmd */
+	return cxl_mbox_cmd_ctor(mbox_cmd, mds, mem_cmd.opcode,
+				 mem_cmd.info.size_in, mem_cmd.info.size_out,
+				 send_cmd->in.payload);
+}
+
+/* Query ioctl backend: report the command table, or just its size, to @q. */
+int cxl_query_cmd(struct cxl_memdev *cxlmd,
+		  struct cxl_mem_query_commands __user *q)
+{
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	struct device *dev = &cxlmd->dev;
+	struct cxl_mem_command *cmd;
+	u32 n_commands, copied = 0;
+
+	dev_dbg(dev, "Query IOCTL\n");
+
+	if (get_user(n_commands, &q->n_commands))
+		return -EFAULT;
+
+	/* returns the total number if 0 elements are requested. */
+	if (!n_commands)
+		return put_user(ARRAY_SIZE(cxl_mem_commands), &q->n_commands);
+
+	/*
+	 * otherwise, return min(n_commands, total commands) cxl_command_info
+	 * structures.
+	 */
+	cxl_for_each_cmd(cmd) {
+		struct cxl_command_info info = cmd->info;
+
+		if (test_bit(info.id, mds->enabled_cmds))
+			info.flags |= CXL_MEM_COMMAND_FLAG_ENABLED;
+		if (test_bit(info.id, mds->exclusive_cmds))
+			info.flags |= CXL_MEM_COMMAND_FLAG_EXCLUSIVE;
+
+		if (copy_to_user(&q->commands[copied], &info, sizeof(info)))
+			return -EFAULT;
+
+		if (++copied == n_commands)
+			break;
+	}
+
+	return 0;
+}
+
+/**
+ * handle_mailbox_cmd_from_user() - Dispatch a mailbox command for userspace.
+ * @mds: The driver data for the operation
+ * @mbox_cmd: The validated mailbox command.
+ * @out_payload: Pointer to userspace's output payload.
+ * @size_out: (Input) Max payload size to copy out.
+ *            (Output) Payload size hardware generated.
+ * @retval: Hardware generated return code from the operation.
+ *
+ * Return:
+ *  * %0	- Mailbox transaction succeeded. This implies the mailbox
+ *		  protocol completed successfully not that the operation itself
+ *		  was successful.
+ *  * %-EFAULT	- The output payload could not be copied to userspace.
+ *  * %-EINTR	- Mailbox acquisition interrupted.
+ *  * %-EXXX	- Transaction level failures reported by mds->mbox_send().
+ *
+ * Dispatches a mailbox command on behalf of a userspace request and copies
+ * the output payload to userspace. @size_out and @retval are only written
+ * on success. The payload buffers of @mbox_cmd are always released through
+ * cxl_mbox_cmd_dtor() before returning.
+ *
+ * See cxl_send_cmd().
+ */
+static int handle_mailbox_cmd_from_user(struct cxl_memdev_state *mds,
+					struct cxl_mbox_cmd *mbox_cmd,
+					u64 out_payload, s32 *size_out,
+					u32 *retval)
+{
+	struct device *dev = mds->cxlds.dev;
+	int rc;
+
+	dev_dbg(dev,
+		"Submitting %s command for user\n"
+		"\topcode: %x\n"
+		"\tsize: %zx\n",
+		cxl_mem_opcode_to_name(mbox_cmd->opcode),
+		mbox_cmd->opcode, mbox_cmd->size_in);
+
+	rc = mds->mbox_send(mds, mbox_cmd);
+
+	/*
+	 * @size_out contains the max size that's allowed to be written back out
+	 * to userspace. While the payload may have written more output than
+	 * this it will have to be ignored.
+	 */
+	if (!rc && mbox_cmd->size_out) {
+		dev_WARN_ONCE(dev, mbox_cmd->size_out > *size_out,
+			      "Invalid return size\n");
+		if (copy_to_user(u64_to_user_ptr(out_payload),
+				 mbox_cmd->payload_out, mbox_cmd->size_out))
+			rc = -EFAULT;
+	}
+
+	/* results are reported only when the whole transaction succeeded */
+	if (!rc) {
+		*size_out = mbox_cmd->size_out;
+		*retval = mbox_cmd->return_code;
+	}
+
+	/* the command's payload buffers are released on every path */
+	cxl_mbox_cmd_dtor(mbox_cmd);
+	return rc;
+}
+
+/* Send ioctl backend: validate @s, run it on the mailbox, report back. */
+int cxl_send_cmd(struct cxl_memdev *cxlmd, struct cxl_send_command __user *s)
+{
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	struct cxl_mbox_cmd mbox_cmd;
+	struct cxl_send_command send;
+	int rc;
+
+	dev_dbg(&cxlmd->dev, "Send IOCTL\n");
+
+	if (copy_from_user(&send, s, sizeof(send)))
+		return -EFAULT;
+
+	rc = cxl_validate_cmd_from_user(&mbox_cmd, mds, &send);
+	if (!rc)
+		rc = handle_mailbox_cmd_from_user(mds, &mbox_cmd,
+						  send.out.payload,
+						  &send.out.size, &send.retval);
+	if (rc)
+		return rc;
+
+	/* hand the updated out.size and retval back to the caller */
+	if (copy_to_user(s, &send, sizeof(send)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Read up to *@size bytes of the log identified by @uuid into @out, at most
+ * one mailbox payload per command. A transfer that returns fewer bytes than
+ * requested ends the loop early. On success *@size is updated to the number
+ * of bytes retrieved.
+ */
+static int cxl_xfer_log(struct cxl_memdev_state *mds, uuid_t *uuid,
+			u32 *size, u8 *out)
+{
+	u32 done = 0;
+	u32 left;
+
+	for (left = *size; left; ) {
+		u32 chunk = min_t(u32, left, mds->payload_size);
+		struct cxl_mbox_get_log log = {
+			.uuid = *uuid,
+			.offset = cpu_to_le32(done),
+			.length = cpu_to_le32(chunk),
+		};
+		struct cxl_mbox_cmd mbox_cmd = {
+			.opcode = CXL_MBOX_OP_GET_LOG,
+			.size_in = sizeof(log),
+			.payload_in = &log,
+			.size_out = chunk,
+			.payload_out = out,
+		};
+		int rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+
+		/*
+		 * The output payload length that indicates the number
+		 * of valid bytes can be smaller than the Log buffer
+		 * size.
+		 */
+		if (rc == -EIO && mbox_cmd.size_out < chunk) {
+			done += mbox_cmd.size_out;
+			break;
+		}
+
+		if (rc < 0)
+			return rc;
+
+		out += chunk;
+		left -= chunk;
+		done += chunk;
+	}
+
+	*size = done;
+
+	return 0;
+}
+
+/**
+ * cxl_walk_cel() - Walk through the Command Effects Log.
+ * @mds: The driver data for the operation
+ * @size: Length of the Command Effects Log.
+ * @cel: CEL
+ *
+ * Iterate over each entry in the CEL and determine if the driver supports the
+ * command. If so, the command is enabled for the device and can be used later.
+ * An opcode counts as supported when it is in the command table, in the
+ * poison command set, or in one of the security command sets; the matching
+ * enabled_cmds bitmaps are updated accordingly. A trailing partial entry in
+ * @size is ignored.
+ */
+static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
+{
+	struct cxl_cel_entry *entry = (struct cxl_cel_entry *)cel;
+	const int nr_entries = size / sizeof(*entry);
+	int idx;
+
+	for (idx = 0; idx < nr_entries; idx++, entry++) {
+		const u16 opcode = le16_to_cpu(entry->opcode);
+		struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
+		bool known = cmd != NULL;
+
+		/* table commands are enabled by their UAPI command id */
+		if (cmd)
+			set_bit(cmd->info.id, mds->enabled_cmds);
+
+		if (cxl_is_poison_command(opcode)) {
+			cxl_set_poison_cmd_enabled(&mds->poison, opcode);
+			known = true;
+		}
+
+		if (cxl_is_security_command(opcode)) {
+			cxl_set_security_cmd_enabled(&mds->security, opcode);
+			known = true;
+		}
+
+		dev_dbg(mds->cxlds.dev, "Opcode 0x%04x %s\n", opcode,
+			known ? "enabled" : "unsupported by driver");
+	}
+}
+
+/*
+ * Run Get Supported Logs. Returns a kvmalloc()ed buffer of payload_size
+ * bytes for the caller to kvfree(), or an ERR_PTR() on failure.
+ */
+static struct cxl_mbox_get_supported_logs *cxl_get_gsl(struct cxl_memdev_state *mds)
+{
+	struct cxl_mbox_get_supported_logs *gsl;
+	struct cxl_mbox_cmd mbox_cmd = {
+		.opcode = CXL_MBOX_OP_GET_SUPPORTED_LOGS,
+		.size_out = mds->payload_size,
+		/* At least the record number field must be valid */
+		.min_out = 2,
+	};
+	int rc;
+
+	gsl = kvmalloc(mds->payload_size, GFP_KERNEL);
+	if (!gsl)
+		return ERR_PTR(-ENOMEM);
+
+	mbox_cmd.payload_out = gsl;
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	if (rc >= 0)
+		return gsl;
+	kvfree(gsl);
+	return ERR_PTR(rc);
+}
+
+enum {
+	CEL_UUID, /* Command Effects Log */
+	VENDOR_DEBUG_UUID,
+};
+
+/* Indexed by the enum above; see CXL 2.0 Table 170. Get Log Input Payload */
+static const uuid_t log_uuid[] = {
+	[CEL_UUID] = DEFINE_CXL_CEL_UUID,
+	[VENDOR_DEBUG_UUID] = DEFINE_CXL_VENDOR_DEBUG_UUID,
+};
+
+/**
+ * cxl_enumerate_cmds() - Enumerate commands for a device.
+ * @mds: The driver data for the operation
+ *
+ * Returns 0 if enumerate completed successfully, -ENOENT when the device
+ * lists no Command Effects Log, or the error from fetching the logs.
+ *
+ * CXL devices have optional support for certain commands. This function
+ * walks the CEL to update the enabled_cmds bitmap in the @mds.
+ */
+int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
+{
+	struct cxl_mbox_get_supported_logs *gsl = cxl_get_gsl(mds);
+	struct cxl_mem_command *cmd;
+	int rc = -ENOENT;
+	int idx;
+
+	if (IS_ERR(gsl))
+		return PTR_ERR(gsl);
+
+	for (idx = 0; idx < le16_to_cpu(gsl->entries); idx++) {
+		u32 size = le32_to_cpu(gsl->entry[idx].size);
+		uuid_t uuid = gsl->entry[idx].uuid;
+		u8 *log;
+
+		dev_dbg(mds->cxlds.dev, "Found LOG type %pU of size %d", &uuid,
+			size);
+
+		/* only the Command Effects Log is of interest here */
+		if (!uuid_equal(&uuid, &log_uuid[CEL_UUID]))
+			continue;
+
+		log = kvmalloc(size, GFP_KERNEL);
+		if (!log) {
+			rc = -ENOMEM;
+			break;
+		}
+
+		rc = cxl_xfer_log(mds, &uuid, &size, log);
+		if (rc) {
+			kvfree(log);
+			break;
+		}
+
+		cxl_walk_cel(mds, size, log);
+		kvfree(log);
+
+		/* In case CEL was bogus, enable some default commands. */
+		cxl_for_each_cmd(cmd)
+			if (cmd->flags & CXL_CMD_FLAG_FORCE_ENABLE)
+				set_bit(cmd->info.id, mds->enabled_cmds);
+
+		/* Found the required CEL */
+		rc = 0;
+	}
+
+	kvfree(gsl);
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
+
+/*
+ * General Media Event Record UUID
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+static const uuid_t gen_media_event_uuid =
+	UUID_INIT(0xfbcd0a77, 0xc260, 0x417f,
+		  0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6);
+
+/*
+ * DRAM Event Record UUID
+ * CXL rev 3.0 Section 8.2.9.2.1.2; Table 8-44
+ */
+static const uuid_t dram_event_uuid =
+	UUID_INIT(0x601dcbb3, 0x9c06, 0x4eab,
+		  0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24);
+
+/*
+ * Memory Module Event Record UUID
+ * CXL rev 3.0 Section 8.2.9.2.1.3; Table 8-45
+ */
+static const uuid_t mem_mod_event_uuid =
+	UUID_INIT(0xfe927475, 0xdd59, 0x4339,
+		  0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74);
+
+/* Emit the trace event matching the record's UUID, or a generic one. */
+static void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
+				   enum cxl_event_log_type type,
+				   struct cxl_event_record_raw *record)
+{
+	const uuid_t *uuid = &record->hdr.id;
+
+	if (uuid_equal(uuid, &gen_media_event_uuid)) {
+		trace_cxl_general_media(cxlmd, type,
+					(struct cxl_event_gen_media *)record);
+		return;
+	}
+	if (uuid_equal(uuid, &dram_event_uuid)) {
+		trace_cxl_dram(cxlmd, type, (struct cxl_event_dram *)record);
+		return;
+	}
+	if (uuid_equal(uuid, &mem_mod_event_uuid)) {
+		trace_cxl_memory_module(cxlmd, type,
+					(struct cxl_event_mem_module *)record);
+		return;
+	}
+
+	/* For unknown record types print just the header */
+	trace_cxl_generic_event(cxlmd, type, record);
+}
+
+/*
+ * Clear the records listed in @get_pl from event log @log, batching their
+ * handles into as many commands as the mailbox payload size requires.
+ */
+static int cxl_clear_event_record(struct cxl_memdev_state *mds,
+				  enum cxl_event_log_type log,
+				  struct cxl_get_event_payload *get_pl)
+{
+	struct cxl_mbox_clear_event_payload *payload;
+	u16 total = le16_to_cpu(get_pl->record_count);
+	u8 max_handles = CXL_CLEAR_EVENT_MAX_HANDLES;
+	size_t pl_size = struct_size(payload, handles, max_handles);
+	struct cxl_mbox_cmd mbox_cmd;
+	u16 cnt;
+	int rc = 0;
+	int i = 0;
+
+	/* Payload size may limit the max handles */
+	if (pl_size > mds->payload_size) {
+		max_handles = (mds->payload_size - sizeof(*payload)) /
+			      sizeof(__le16);
+		pl_size = struct_size(payload, handles, max_handles);
+	}
+
+	payload = kvzalloc(pl_size, GFP_KERNEL);
+	if (!payload)
+		return -ENOMEM;
+
+	payload->event_log = log;
+
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_CLEAR_EVENT_RECORD,
+		.payload_in = payload,
+		.size_in = pl_size,
+	};
+
+	/*
+	 * Clear Event Records uses u8 for the handle cnt while Get Event
+	 * Record can return up to 0xffff records.
+	 */
+	for (cnt = 0; cnt < total; cnt++) {
+		payload->handles[i++] = get_pl->records[cnt].hdr.handle;
+		/* i already points past the handle just stored, hence i - 1 */
+		dev_dbg(mds->cxlds.dev, "Event log '%d': Clearing %u\n", log,
+			le16_to_cpu(payload->handles[i - 1]));
+
+		if (i == max_handles) {
+			payload->nr_recs = i;
+			rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+			if (rc)
+				goto free_pl;
+			i = 0;
+		}
+	}
+
+	/* Clear what is left if any */
+	if (i) {
+		payload->nr_recs = i;
+		mbox_cmd.size_in = struct_size(payload, handles, i);
+		rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	}
+
+free_pl:
+	kvfree(payload);
+	return rc;
+}
+
+/* Drain event log @type: trace every record, then clear it on the device. */
+static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
+				    enum cxl_event_log_type type)
+{
+	struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
+	struct device *dev = mds->cxlds.dev;
+	struct cxl_get_event_payload *payload;
+	u8 log_type = type;
+	u16 nr_rec;
+
+	mutex_lock(&mds->event.log_lock);
+	payload = mds->event.buf;
+
+	do {
+		/* rebuilt every pass: sending overwrites size_out */
+		struct cxl_mbox_cmd mbox_cmd = {
+			.opcode = CXL_MBOX_OP_GET_EVENT_RECORD,
+			.payload_in = &log_type,
+			.size_in = sizeof(log_type),
+			.payload_out = payload,
+			.size_out = mds->payload_size,
+			.min_out = struct_size(payload, records, 0),
+		};
+		int rc, i;
+
+		rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+		if (rc) {
+			dev_err_ratelimited(dev,
+				"Event log '%d': Failed to query event records : %d",
+				type, rc);
+			break;
+		}
+
+		nr_rec = le16_to_cpu(payload->record_count);
+		if (!nr_rec)
+			break;
+
+		for (i = 0; i < nr_rec; i++)
+			cxl_event_trace_record(cxlmd, type,
+					       &payload->records[i]);
+
+		if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)
+			trace_cxl_overflow(cxlmd, type, payload);
+
+		rc = cxl_clear_event_record(mds, type, payload);
+		if (rc) {
+			dev_err_ratelimited(dev,
+				"Event log '%d': Failed to clear events : %d",
+				type, rc);
+			break;
+		}
+	} while (nr_rec);
+
+	mutex_unlock(&mds->event.log_lock);
+}
+
+/**
+ * cxl_mem_get_event_records - Get Event Records from the device
+ * @mds: The driver data for the operation
+ * @status: Event Status register value identifying which events are available.
+ *
+ * Retrieve all event records available on the device, report them as trace
+ * events, and clear them. Logs flagged in @status are drained in the order
+ * fatal, failure, warning, informational.
+ *
+ * See CXL rev 3.0 @8.2.9.2.2 Get Event Records, @8.2.9.2.3 Clear Event Records
+ */
+void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status)
+{
+	dev_dbg(mds->cxlds.dev, "Reading event logs: %x\n", status);
+
+	if (status & CXLDEV_EVENT_STATUS_FATAL)
+		cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_FATAL);
+	if (status & CXLDEV_EVENT_STATUS_FAIL)
+		cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_FAIL);
+	if (status & CXLDEV_EVENT_STATUS_WARN)
+		cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_WARN);
+	if (status & CXLDEV_EVENT_STATUS_INFO)
+		cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_INFO);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, CXL);
+
+/**
+ * cxl_mem_get_partition_info - Get partition info
+ * @mds: The driver data for the operation
+ *
+ * Retrieve the current partition info for the device specified.  The active
+ * values are the current capacity in bytes.  If not 0, the 'next' values are
+ * the pending values, in bytes, which take effect on next cold reset.
+ *
+ * Return: 0 if no error: or the result of the mailbox command.
+ *
+ * See CXL @8.2.9.5.2.1 Get Partition Info
+ */
+static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds)
+{
+	struct cxl_mbox_get_partition_info pi;
+	struct cxl_mbox_cmd mbox_cmd = {
+		.opcode = CXL_MBOX_OP_GET_PARTITION_INFO,
+		.size_out = sizeof(pi),
+		.payload_out = &pi,
+	};
+	int rc;
+
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	if (rc)
+		return rc;
+
+	/* capacities arrive in units of CXL_CAPACITY_MULTIPLIER bytes */
+	mds->active_volatile_bytes =
+		le64_to_cpu(pi.active_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
+	mds->active_persistent_bytes =
+		le64_to_cpu(pi.active_persistent_cap) * CXL_CAPACITY_MULTIPLIER;
+	mds->next_volatile_bytes =
+		le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
+	mds->next_persistent_bytes =
+		le64_to_cpu(pi.next_persistent_cap) * CXL_CAPACITY_MULTIPLIER;
+
+	return 0;
+}
+
+/**
+ * cxl_dev_state_identify() - Send the IDENTIFY command to the device.
+ * @mds: The driver data for the operation
+ *
+ * Return: 0 if identify was executed successfully or media not ready.
+ *
+ * On success the capacity, LSA size and firmware version fields of @mds are
+ * populated from the IDENTIFY payload; nothing is sent until media is ready.
+ */
+int cxl_dev_state_identify(struct cxl_memdev_state *mds)
+{
+	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
+	struct cxl_mbox_identify id;
+	struct cxl_mbox_cmd mbox_cmd = {
+		.opcode = CXL_MBOX_OP_IDENTIFY,
+		.size_out = sizeof(id),
+		.payload_out = &id,
+	};
+	int rc;
+
+	/* nothing to identify until the media is ready */
+	if (!mds->cxlds.media_ready)
+		return 0;
+
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	if (rc < 0)
+		return rc;
+
+	mds->lsa_size = le32_to_cpu(id.lsa_size);
+	memcpy(mds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
+
+	/* capacities arrive in units of CXL_CAPACITY_MULTIPLIER bytes */
+	mds->partition_align_bytes =
+		le64_to_cpu(id.partition_align) * CXL_CAPACITY_MULTIPLIER;
+	mds->persistent_only_bytes =
+		le64_to_cpu(id.persistent_capacity) * CXL_CAPACITY_MULTIPLIER;
+	mds->volatile_only_bytes =
+		le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
+	mds->total_bytes =
+		le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
+
+	if (test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds)) {
+		u32 mer = get_unaligned_le24(id.poison_list_max_mer);
+
+		mds->poison.max_errors = min_t(u32, mer, CXL_POISON_LIST_MAX);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
+
+/*
+ * Issue sanitize opcode @cmd after confirming via Get Security State that
+ * no user passphrase is set and, for Secure Erase, the device is not locked.
+ */
+static int __cxl_mem_sanitize(struct cxl_memdev_state *mds, u16 cmd)
+{
+	struct cxl_get_security_output {
+		__le32 flags;
+	} out;
+	struct cxl_mbox_cmd sec_cmd = {
+		.opcode = CXL_MBOX_OP_GET_SECURITY_STATE,
+		.payload_out = &out,
+		.size_out = sizeof(out),
+	};
+	struct cxl_mbox_cmd mbox_cmd = { .opcode = cmd };
+	struct device *dev = mds->cxlds.dev;
+	u32 state;
+	int rc;
+
+	if (!(cmd == CXL_MBOX_OP_SANITIZE || cmd == CXL_MBOX_OP_SECURE_ERASE))
+		return -EINVAL;
+
+	rc = cxl_internal_send_cmd(mds, &sec_cmd);
+	if (rc < 0) {
+		dev_err(dev, "Failed to get security state : %d", rc);
+		return rc;
+	}
+
+	/*
+	 * Prior to using these commands, any security applied to
+	 * the user data areas of the device shall be DISABLED (or
+	 * UNLOCKED for secure erase case).
+	 */
+	state = le32_to_cpu(out.flags);
+	if ((state & CXL_PMEM_SEC_STATE_USER_PASS_SET) ||
+	    (cmd == CXL_MBOX_OP_SECURE_ERASE &&
+	     (state & CXL_PMEM_SEC_STATE_LOCKED)))
+		return -EINVAL;
+
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	if (rc < 0)
+		dev_err(dev, "Failed to sanitize device : %d", rc);
+
+	return rc < 0 ? rc : 0;
+}
+
+
+/**
+ * cxl_mem_sanitize() - Send a sanitization command to the device.
+ * @cxlmd: The device for the operation
+ * @cmd: The specific sanitization command opcode
+ *
+ * Return: 0 if the command was executed successfully, regardless of
+ * whether or not the actual security operation is done in the background,
+ * such as for the Sanitize case.
+ * Error return values can be the result of the mailbox command, -EINVAL
+ * when security requirements are not met or invalid contexts, or -EBUSY
+ * when no endpoint exists or it still has committed decoders.
+ *
+ * Runs under device_lock() and a read hold of cxl_region_rwsem.
+ *
+ * See CXL 3.0 @8.2.9.8.5.1 Sanitize and @8.2.9.8.5.2 Secure Erase.
+ */
+int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
+{
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	struct cxl_port *endpoint;
+	int rc = -EBUSY;
+
+	/* synchronize with cxl_mem_probe() and decoder write operations */
+	device_lock(&cxlmd->dev);
+	endpoint = cxlmd->endpoint;
+	down_read(&cxl_region_rwsem);
+	/*
+	 * Without an endpoint the driver can not be sure that the device
+	 * is unmapped, so one is required and it must have nothing committed.
+	 */
+	if (endpoint && !cxl_num_decoders_committed(endpoint))
+		rc = __cxl_mem_sanitize(mds, cmd);
+	up_read(&cxl_region_rwsem);
+	device_unlock(&cxlmd->dev);
+
+	return rc;
+}
+
+/* Fill in @res for a DPA range and, unless it is empty, track it in @parent */
+static int add_dpa_res(struct device *dev, struct resource *parent,
+		       struct resource *res, resource_size_t start,
+		       resource_size_t size, const char *type)
+{
+	int rc;
+
+	res->name = type;
+	res->start = start;
+	res->end = start + size - 1;
+	res->flags = IORESOURCE_MEM;
+	if (!resource_size(res)) {
+		dev_dbg(dev, "DPA(%s): no capacity\n", type);
+		return 0;
+	}
+
+	rc = request_resource(parent, res);
+	if (rc)
+		dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", type, res,
+			rc);
+	else
+		dev_dbg(dev, "DPA(%s): %pr\n", type, res);
+
+	return rc;
+}
+
+/*
+ * Set up cxlds->dpa_res and register the "ram" and "pmem" ranges beneath it.
+ */
+int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
+{
+	struct cxl_dev_state *cxlds = &mds->cxlds;
+	struct device *dev = cxlds->dev;
+	resource_size_t ram_bytes, pmem_bytes;
+	int rc;
+
+	if (!cxlds->media_ready) {
+		cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
+		cxlds->ram_res = DEFINE_RES_MEM(0, 0);
+		cxlds->pmem_res = DEFINE_RES_MEM(0, 0);
+		return 0;
+	}
+
+	cxlds->dpa_res = (struct resource)DEFINE_RES_MEM(0, mds->total_bytes);
+
+	if (mds->partition_align_bytes) {
+		rc = cxl_mem_get_partition_info(mds);
+		if (rc) {
+			dev_err(dev, "Failed to query partition information\n");
+			return rc;
+		}
+		ram_bytes = mds->active_volatile_bytes;
+		pmem_bytes = mds->active_persistent_bytes;
+	} else {
+		ram_bytes = mds->volatile_only_bytes;
+		pmem_bytes = mds->persistent_only_bytes;
+	}
+
+	/* "pmem" is placed directly after "ram" in DPA space */
+	rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, ram_bytes,
+			 "ram");
+	if (rc)
+		return rc;
+	return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res, ram_bytes,
+			   pmem_bytes, "pmem");
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
+
+/* Send SET_TIMESTAMP carrying the host's current real time in nanoseconds */
+int cxl_set_timestamp(struct cxl_memdev_state *mds)
+{
+	struct cxl_mbox_set_timestamp_in pi;
+	struct cxl_mbox_cmd mbox_cmd = {
+		.opcode = CXL_MBOX_OP_SET_TIMESTAMP,
+		.size_in = sizeof(pi),
+		.payload_in = &pi,
+	};
+	int rc;
+
+	pi.timestamp = cpu_to_le64(ktime_get_real_ns());
+
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	/*
+	 * The command is optional: a device may provide timestamps some other
+	 * way, or report all 0s in timestamp fields, so "unsupported" is not
+	 * treated as a failure.
+	 */
+	if (!rc || mbox_cmd.return_code == CXL_MBOX_CMD_RC_UNSUPPORTED)
+		return 0;
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_set_timestamp, CXL);
+
+/* Fetch poison records for [@offset, @offset + @len) and trace each one */
+int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
+		       struct cxl_region *cxlr)
+{
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	struct cxl_mbox_poison_out *po;
+	struct cxl_mbox_poison_in pi;
+	struct cxl_mbox_cmd mbox_cmd;
+	int nr_records = 0, rc;
+
+	rc = mutex_lock_interruptible(&mds->poison.lock);
+	if (rc)
+		return rc;
+
+	po = mds->poison.list_out;
+	pi.offset = cpu_to_le64(offset);
+	pi.length = cpu_to_le64(len / CXL_POISON_LEN_MULT);
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_GET_POISON,
+		.size_in = sizeof(pi),
+		.payload_in = &pi,
+		.size_out = mds->payload_size,
+		.payload_out = po,
+		.min_out = struct_size(po, record, 0),
+	};
+
+	for (;;) {
+		rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+		if (rc)
+			break;
+
+		for (int i = 0; i < le16_to_cpu(po->count); i++)
+			trace_cxl_poison(cxlmd, cxlr, &po->record[i],
+					 po->flags, po->overflow_ts,
+					 CXL_POISON_TRACE_LIST);
+		/* Protect against an uncleared _FLAG_MORE */
+		nr_records += le16_to_cpu(po->count);
+		if (nr_records >= mds->poison.max_errors) {
+			dev_dbg(&cxlmd->dev, "Max Error Records reached: %d\n",
+				nr_records);
+			break;
+		}
+		if (!(po->flags & CXL_POISON_FLAG_MORE))
+			break;
+	}
+
+	mutex_unlock(&mds->poison.lock);
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison, CXL);
+
+static void free_poison_buf(void *list_out)
+{
+	kvfree(list_out);
+}
+
+/* Allocate the Get Poison List output buffer (guarded by mds->poison.lock) */
+static int cxl_poison_alloc_buf(struct cxl_memdev_state *mds)
+{
+	void *buf = kvmalloc(mds->payload_size, GFP_KERNEL);
+
+	mds->poison.list_out = buf;
+	if (!buf)
+		return -ENOMEM;
+	return devm_add_action_or_reset(mds->cxlds.dev, free_poison_buf, buf);
+}
+
+/* Prepare Get Poison List state; the command is disabled if setup fails */
+int cxl_poison_state_init(struct cxl_memdev_state *mds)
+{
+	int rc;
+
+	if (!test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds))
+		return 0;
+
+	rc = cxl_poison_alloc_buf(mds);
+	if (!rc)
+		mutex_init(&mds->poison.lock);
+	else
+		clear_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds);
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
+
+/* Allocate devm-managed state for a CXL_DEVTYPE_CLASSMEM device on @dev */
+struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
+{
+	struct cxl_memdev_state *mds;
+
+	mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL);
+	if (!mds) {
+		dev_err(dev, "No memory available\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mds->cxlds.dev = dev;
+	mds->cxlds.type = CXL_DEVTYPE_CLASSMEM;
+	mutex_init(&mds->mbox_mutex);
+	mutex_init(&mds->event.log_lock);
+	return mds;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, CXL);
+
+/* Expose the raw_allow_all toggle under the cxl "mbox" debugfs directory */
+void __init cxl_mbox_init(void)
+{
+	struct dentry *dir = cxl_debugfs_create_dir("mbox");
+
+	debugfs_create_bool("raw_allow_all", 0600, dir,
+			    &cxl_raw_allow_all);
+}
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
new file mode 100644
index 0000000000..2f43d368ba
--- /dev/null
+++ b/drivers/cxl/core/memdev.c
@@ -0,0 +1,1101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. */
+
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/firmware.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/pci.h>
+#include <cxlmem.h>
+#include "trace.h"
+#include "core.h"
+
+static DECLARE_RWSEM(cxl_memdev_rwsem);
+
+/*
+ * An entire PCI topology full of devices should be enough for any
+ * config
+ */
+#define CXL_MEM_MAX_DEVS 65536
+
+static int cxl_mem_major;
+static DEFINE_IDA(cxl_memdev_ida);
+
+static void cxl_memdev_release(struct device *dev)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+
+	ida_free(&cxl_memdev_ida, cxlmd->id);	/* id came from cxl_memdev_ida */
+	kfree(cxlmd);
+}
+
+static char *cxl_memdev_devnode(const struct device *dev, umode_t *mode,
+				kuid_t *uid, kgid_t *gid)
+{
+	return kasprintf(GFP_KERNEL, "cxl/%s", dev_name(dev)); /* cxl/<name> */
+}
+
+/* sysfs firmware_version: up to 16 chars of mds->firmware_version */
+static ssize_t firmware_version_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct cxl_memdev_state *mds =
+		to_cxl_memdev_state(to_cxl_memdev(dev)->cxlds);
+
+	if (mds)
+		return sysfs_emit(buf, "%.16s\n", mds->firmware_version);
+	return sysfs_emit(buf, "\n");
+}
+static DEVICE_ATTR_RO(firmware_version);
+
+/* sysfs payload_max: show mds->payload_size, or an empty line if no mds */
+static ssize_t payload_max_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct cxl_memdev_state *mds =
+		to_cxl_memdev_state(to_cxl_memdev(dev)->cxlds);
+
+	if (mds)
+		return sysfs_emit(buf, "%zu\n", mds->payload_size);
+	return sysfs_emit(buf, "\n");
+}
+static DEVICE_ATTR_RO(payload_max);
+
+/* sysfs label_storage_size: show mds->lsa_size, or an empty line if no mds */
+static ssize_t label_storage_size_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct cxl_memdev_state *mds =
+		to_cxl_memdev_state(to_cxl_memdev(dev)->cxlds);
+
+	if (mds)
+		return sysfs_emit(buf, "%zu\n", mds->lsa_size);
+	return sysfs_emit(buf, "\n");
+}
+static DEVICE_ATTR_RO(label_storage_size);
+
+/* sysfs "size" for the ram group: byte length of cxlds->ram_res, in hex */
+static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct cxl_dev_state *cxlds = to_cxl_memdev(dev)->cxlds;
+	unsigned long long len = resource_size(&cxlds->ram_res);
+
+	return sysfs_emit(buf, "%#llx\n", len);
+}
+
+static struct device_attribute dev_attr_ram_size =
+	__ATTR(size, 0444, ram_size_show, NULL);
+
+/* sysfs "size" for the pmem group: byte length of cxlds->pmem_res, in hex */
+static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	struct cxl_dev_state *cxlds = to_cxl_memdev(dev)->cxlds;
+	unsigned long long len = resource_size(&cxlds->pmem_res);
+
+	return sysfs_emit(buf, "%#llx\n", len);
+}
+
+static struct device_attribute dev_attr_pmem_size =
+	__ATTR(size, 0444, pmem_size_show, NULL);
+
+/* sysfs serial: show cxlds->serial in hex */
+static ssize_t serial_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+
+	return sysfs_emit(buf, "%#llx\n", cxlmd->cxlds->serial);
+}
+static DEVICE_ATTR_RO(serial);
+
+static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	return sysfs_emit(buf, "%d\n", dev_to_node(dev)); /* NUMA node of @dev */
+}
+static DEVICE_ATTR_RO(numa_node);
+
+/* sysfs security/state: report "sanitize" if active, else the cached state */
+static ssize_t security_state_show(struct device *dev,
+				   struct device_attribute *attr,
+				   char *buf)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	unsigned long state = mds->security.state;
+	const unsigned long frozen_mask = CXL_PMEM_SEC_STATE_FROZEN |
+					  CXL_PMEM_SEC_STATE_MASTER_PLIMIT |
+					  CXL_PMEM_SEC_STATE_USER_PLIMIT;
+	int rc = 0;
+
+	/* sync with latest submission state */
+	mutex_lock(&mds->mbox_mutex);
+	if (mds->security.sanitize_active)
+		rc = sysfs_emit(buf, "sanitize\n");
+	mutex_unlock(&mds->mbox_mutex);
+	if (rc)
+		return rc;
+
+	if (!(state & CXL_PMEM_SEC_STATE_USER_PASS_SET))
+		return sysfs_emit(buf, "disabled\n");
+	if (state & frozen_mask)
+		return sysfs_emit(buf, "frozen\n");
+	if (state & CXL_PMEM_SEC_STATE_LOCKED)
+		return sysfs_emit(buf, "locked\n");
+	return sysfs_emit(buf, "unlocked\n");
+}
+static struct device_attribute dev_attr_security_state =
+	__ATTR(state, 0444, security_state_show, NULL);
+
+/* sysfs security/sanitize: a true value requests a Sanitize of the device */
+static ssize_t security_sanitize_store(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t len)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	bool requested;
+	ssize_t rc;
+
+	if (kstrtobool(buf, &requested) || !requested)
+		return -EINVAL;
+
+	rc = cxl_mem_sanitize(cxlmd, CXL_MBOX_OP_SANITIZE);
+	if (rc)
+		return rc;
+	return len;
+}
+static struct device_attribute dev_attr_security_sanitize =
+	__ATTR(sanitize, 0200, NULL, security_sanitize_store);
+
+/* sysfs security/erase: a true value requests a Secure Erase of the device */
+static ssize_t security_erase_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t len)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	bool requested;
+	ssize_t rc;
+
+	if (kstrtobool(buf, &requested) || !requested)
+		return -EINVAL;
+
+	rc = cxl_mem_sanitize(cxlmd, CXL_MBOX_OP_SECURE_ERASE);
+	if (rc)
+		return rc;
+	return len;
+}
+static struct device_attribute dev_attr_security_erase =
+	__ATTR(erase, 0200, NULL, security_erase_store);
+
+/* Request the poison list for the pmem range, then for the ram range */
+static int cxl_get_poison_by_memdev(struct cxl_memdev *cxlmd)
+{
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct resource *pmem = &cxlds->pmem_res;
+	struct resource *ram = &cxlds->ram_res;
+	int rc;
+
+	/* CXL 3.0 Spec 8.2.9.8.4.1 Separate pmem and ram poison requests */
+	if (resource_size(pmem)) {
+		rc = cxl_mem_get_poison(cxlmd, pmem->start,
+					resource_size(pmem), NULL);
+		if (rc)
+			return rc;
+	}
+	if (!resource_size(ram))
+		return 0;
+
+	/*
+	 * Invalid Physical Address is not an error for
+	 * volatile addresses. Device support is optional.
+	 */
+	rc = cxl_mem_get_poison(cxlmd, ram->start, resource_size(ram), NULL);
+	if (rc == -EFAULT)
+		rc = 0;
+	return rc;
+}
+
+/* Collect the poison list for @cxlmd, by endpoint if any decoder is committed */
+int cxl_trigger_poison_list(struct cxl_memdev *cxlmd)
+{
+	struct cxl_port *port = cxlmd->endpoint;
+	int rc;
+
+	if (!port || !is_cxl_endpoint(port))
+		return -EINVAL;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	rc = down_read_interruptible(&cxl_dpa_rwsem);
+	if (rc)
+		goto out_region;
+
+	/*
+	 * With committed decoders, regions are mapped: collect poison by
+	 * endpoint. Otherwise no regions map this memdev: collect by memdev.
+	 */
+	if (cxl_num_decoders_committed(port))
+		rc = cxl_get_poison_by_endpoint(port);
+	else
+		rc = cxl_get_poison_by_memdev(cxlmd);
+	up_read(&cxl_dpa_rwsem);
+out_region:
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_trigger_poison_list, CXL);
+
+struct cxl_dpa_to_region_context {
+	struct cxl_region *cxlr;	/* out: region of the matching decoder */
+	u64 dpa;			/* in: device physical address to find */
+};
+
+/* Child iterator: stop at the endpoint decoder whose DPA range has ctx->dpa */
+static int __cxl_dpa_to_region(struct device *dev, void *arg)
+{
+	struct cxl_dpa_to_region_context *ctx = arg;
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_region *cxlr;
+
+	if (!is_endpoint_decoder(dev))
+		return 0;
+	cxled = to_cxl_endpoint_decoder(dev);
+	if (!cxled->dpa_res || !resource_size(cxled->dpa_res))
+		return 0;
+	if (ctx->dpa > cxled->dpa_res->end || ctx->dpa < cxled->dpa_res->start)
+		return 0;
+
+	/* NOTE: a decoder can hold DPA before it is attached to any region */
+	cxlr = cxled->cxld.region;
+	dev_dbg(dev, "dpa:0x%llx mapped in %s:%s\n", ctx->dpa,
+		cxlr ? "region" : "endpoint",
+		dev_name(cxlr ? &cxlr->dev : dev));
+	ctx->cxlr = cxlr;
+	return 1;	/* non-zero ends the device_for_each_child() walk */
+}
+
+/* Look up the region that maps @dpa on this memdev; NULL if there is none */
+static struct cxl_region *cxl_dpa_to_region(struct cxl_memdev *cxlmd, u64 dpa)
+{
+	struct cxl_dpa_to_region_context ctx = { .dpa = dpa };
+	struct cxl_port *port = cxlmd->endpoint;
+
+	if (!port || !is_cxl_endpoint(port))
+		return NULL;
+	if (!cxl_num_decoders_committed(port))
+		return NULL;
+
+	device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region);
+	return ctx.cxlr;
+}
+
+/*
+ * Check that @dpa lies within the device DPA resource and is 64-byte aligned
+ */
+static int cxl_validate_poison_dpa(struct cxl_memdev *cxlmd, u64 dpa)
+{
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct resource *res = &cxlds->dpa_res;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;
+
+	if (!resource_size(res)) {
+		dev_dbg(cxlds->dev, "device has no dpa resource\n");
+	} else if (dpa < res->start || dpa > res->end) {
+		dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n",
+			dpa, res);
+	} else if (!IS_ALIGNED(dpa, 64)) {
+		dev_dbg(cxlds->dev, "dpa:0x%llx is not 64-byte aligned\n", dpa);
+	} else {
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/*
+ * Inject poison at @dpa, warn if a region maps it, and trace the event.
+ */
+int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa)
+{
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	struct cxl_mbox_inject_poison inject;
+	struct cxl_poison_record record = {
+		.address = cpu_to_le64(dpa),
+		.length = cpu_to_le32(1),
+	};
+	struct cxl_mbox_cmd mbox_cmd = {
+		.opcode = CXL_MBOX_OP_INJECT_POISON,
+		.size_in = sizeof(inject),
+		.payload_in = &inject,
+	};
+	struct cxl_region *cxlr;
+	int rc;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	rc = down_read_interruptible(&cxl_dpa_rwsem);
+	if (rc)
+		goto out_region;
+
+	rc = cxl_validate_poison_dpa(cxlmd, dpa);
+	if (rc)
+		goto out;
+
+	inject.address = cpu_to_le64(dpa);
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	if (rc)
+		goto out;
+
+	cxlr = cxl_dpa_to_region(cxlmd, dpa);
+	if (cxlr)
+		dev_warn_once(mds->cxlds.dev,
+			      "poison inject dpa:%#llx region: %s\n", dpa,
+			      dev_name(&cxlr->dev));
+
+	trace_cxl_poison(cxlmd, cxlr, &record, 0, 0, CXL_POISON_TRACE_INJECT);
+out:
+	up_read(&cxl_dpa_rwsem);
+out_region:
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, CXL);
+
+/*
+ * Clear poison at @dpa via the mailbox, warn if a region maps that DPA,
+ * and emit a CXL_POISON_TRACE_CLEAR trace event.
+ */
+int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
+{
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	/*
+	 * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command
+	 * is defined to accept 64 bytes of write-data, along with the
+	 * address to clear. This driver uses zeroes as write-data.
+	 */
+	struct cxl_mbox_clear_poison clear = {
+		.address = cpu_to_le64(dpa)
+	};
+	struct cxl_poison_record record = {
+		.address = cpu_to_le64(dpa),
+		.length = cpu_to_le32(1),
+	};
+	struct cxl_mbox_cmd mbox_cmd = {
+		.opcode = CXL_MBOX_OP_CLEAR_POISON,
+		.size_in = sizeof(clear),
+		.payload_in = &clear,
+	};
+	struct cxl_region *cxlr;
+	int rc;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;
+
+	/* Both rwsems are read-held across validation, the command and lookup */
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	rc = down_read_interruptible(&cxl_dpa_rwsem);
+	if (rc)
+		goto out_region;
+
+	rc = cxl_validate_poison_dpa(cxlmd, dpa);
+	if (rc)
+		goto out;
+
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	if (rc)
+		goto out;
+
+	/* Flag operations that land in a DPA mapped by a region */
+	cxlr = cxl_dpa_to_region(cxlmd, dpa);
+	if (cxlr)
+		dev_warn_once(mds->cxlds.dev,
+			      "poison clear dpa:%#llx region: %s\n", dpa,
+			      dev_name(&cxlr->dev));
+
+	trace_cxl_poison(cxlmd, cxlr, &record, 0, 0, CXL_POISON_TRACE_CLEAR);
+out:
+	up_read(&cxl_dpa_rwsem);
+out_region:
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, CXL);
+
+static struct attribute *cxl_memdev_attributes[] = {
+	&dev_attr_serial.attr,
+	&dev_attr_firmware_version.attr,
+	&dev_attr_payload_max.attr,
+	&dev_attr_label_storage_size.attr,
+	&dev_attr_numa_node.attr,	/* hidden by cxl_memdev_visible() if !NUMA */
+	NULL,
+};
+
+static struct attribute *cxl_memdev_pmem_attributes[] = {
+	&dev_attr_pmem_size.attr,	/* attribute file is named "size" */
+	NULL,
+};
+
+static struct attribute *cxl_memdev_ram_attributes[] = {
+	&dev_attr_ram_size.attr,	/* attribute file is named "size" */
+	NULL,
+};
+
+static struct attribute *cxl_memdev_security_attributes[] = {
+	&dev_attr_security_state.attr,		/* file "state", read-only */
+	&dev_attr_security_sanitize.attr,	/* file "sanitize", write-only */
+	&dev_attr_security_erase.attr,		/* file "erase", write-only */
+	NULL,
+};
+
+static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
+				  int n)
+{
+	if (!IS_ENABLED(CONFIG_NUMA) && a == &dev_attr_numa_node.attr)
+		return 0;	/* hide numa_node on !CONFIG_NUMA kernels */
+	return a->mode;		/* everything else keeps its declared mode */
+}
+
+static struct attribute_group cxl_memdev_attribute_group = {
+	.attrs = cxl_memdev_attributes,		/* unnamed group */
+	.is_visible = cxl_memdev_visible,
+};
+
+static struct attribute_group cxl_memdev_ram_attribute_group = {
+	.name = "ram",				/* holds the ram "size" file */
+	.attrs = cxl_memdev_ram_attributes,
+};
+
+static struct attribute_group cxl_memdev_pmem_attribute_group = {
+	.name = "pmem",				/* holds the pmem "size" file */
+	.attrs = cxl_memdev_pmem_attributes,
+};
+
+/* Hide security/sanitize and security/erase unless the command is enabled */
+static umode_t cxl_memdev_security_visible(struct kobject *kobj,
+					   struct attribute *a, int n)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(kobj_to_dev(kobj));
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	int bit;
+
+	if (a == &dev_attr_security_sanitize.attr)
+		bit = CXL_SEC_ENABLED_SANITIZE;
+	else if (a == &dev_attr_security_erase.attr)
+		bit = CXL_SEC_ENABLED_SECURE_ERASE;
+	else
+		return a->mode;
+
+	return test_bit(bit, mds->security.enabled_cmds) ? a->mode : 0;
+}
+
+static struct attribute_group cxl_memdev_security_attribute_group = {
+	.name = "security",
+	.attrs = cxl_memdev_security_attributes,
+	.is_visible = cxl_memdev_security_visible,	/* per-command gating */
+};
+
+static const struct attribute_group *cxl_memdev_attribute_groups[] = {
+	&cxl_memdev_attribute_group,
+	&cxl_memdev_ram_attribute_group,
+	&cxl_memdev_pmem_attribute_group,
+	&cxl_memdev_security_attribute_group,
+	NULL,
+};
+
+static const struct device_type cxl_memdev_type = {
+	.name = "cxl_memdev",
+	.release = cxl_memdev_release,		/* frees the id and the memdev */
+	.devnode = cxl_memdev_devnode,		/* node name "cxl/<dev name>" */
+	.groups = cxl_memdev_attribute_groups,
+};
+
+bool is_cxl_memdev(const struct device *dev)
+{
+	return dev->type == &cxl_memdev_type;	/* matched by device_type */
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, CXL);
+
+/**
+ * set_exclusive_cxl_commands() - atomically disable user cxl commands
+ * @mds: The device state to operate on
+ * @cmds: bitmap of commands to mark exclusive
+ *
+ * Grab the cxl_memdev_rwsem in write mode to flush in-flight invocations
+ * of the ioctl path, then OR @cmds into @mds->exclusive_cmds so that
+ * future execution of those command ids is disabled.
+ */
+void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
+				unsigned long *cmds)
+{
+	down_write(&cxl_memdev_rwsem);
+	bitmap_or(mds->exclusive_cmds, mds->exclusive_cmds, cmds,
+		  CXL_MEM_COMMAND_ID_MAX);
+	up_write(&cxl_memdev_rwsem);
+}
+EXPORT_SYMBOL_NS_GPL(set_exclusive_cxl_commands, CXL);
+
+/**
+ * clear_exclusive_cxl_commands() - atomically enable user cxl commands
+ * @mds: The device state to modify
+ * @cmds: bitmap of commands to mark available for userspace
+ */
+void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
+				  unsigned long *cmds)
+{
+	down_write(&cxl_memdev_rwsem);	/* the ioctl path takes this for read */
+	bitmap_andnot(mds->exclusive_cmds, mds->exclusive_cmds, cmds,
+		      CXL_MEM_COMMAND_ID_MAX);	/* drop @cmds from the set */
+	up_write(&cxl_memdev_rwsem);
+}
+EXPORT_SYMBOL_NS_GPL(clear_exclusive_cxl_commands, CXL);
+
+static void cxl_memdev_shutdown(struct device *dev)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+
+	down_write(&cxl_memdev_rwsem);	/* waits out ioctls holding it for read */
+	cxlmd->cxlds = NULL;		/* later ioctls return -ENXIO */
+	up_write(&cxl_memdev_rwsem);
+}
+
+static void cxl_memdev_unregister(void *_cxlmd)
+{
+	struct cxl_memdev *cxlmd = _cxlmd;
+	struct device *dev = &cxlmd->dev;
+
+	cdev_device_del(&cxlmd->cdev, dev);
+	cxl_memdev_shutdown(dev);	/* open files now see cxlds == NULL */
+	put_device(dev);
+}
+
+/* Work item: unbind the memdev's driver, then drop a device reference */
+static void detach_memdev(struct work_struct *work)
+{
+	struct cxl_memdev *cxlmd = container_of(work, typeof(*cxlmd), detach_work);
+
+	device_release_driver(&cxlmd->dev);
+	put_device(&cxlmd->dev);
+}
+
+static struct lock_class_key cxl_memdev_key;
+
+/*
+ * Allocate a cxl_memdev, reserve its id, and initialize (but do not add)
+ * the embedded device and character device. Returns ERR_PTR() on failure.
+ */
+static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds,
+					   const struct file_operations *fops)
+{
+	struct cxl_memdev *cxlmd;
+	struct device *dev;
+	int id;
+
+	cxlmd = kzalloc(sizeof(*cxlmd), GFP_KERNEL);
+	if (!cxlmd)
+		return ERR_PTR(-ENOMEM);
+
+	id = ida_alloc_max(&cxl_memdev_ida, CXL_MEM_MAX_DEVS - 1, GFP_KERNEL);
+	if (id < 0) {
+		kfree(cxlmd);
+		return ERR_PTR(id);
+	}
+	cxlmd->id = id;
+	cxlmd->depth = -1;
+
+	dev = &cxlmd->dev;
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_memdev_key);
+	dev->parent = cxlds->dev;
+	dev->bus = &cxl_bus_type;
+	dev->devt = MKDEV(cxl_mem_major, cxlmd->id);
+	dev->type = &cxl_memdev_type;
+	device_set_pm_not_required(dev);
+	INIT_WORK(&cxlmd->detach_work, detach_memdev);
+
+	cdev_init(&cxlmd->cdev, fops);
+	return cxlmd;
+}
+
+/* Dispatch the two supported memdev ioctls; anything else is -ENOTTY */
+static long __cxl_memdev_ioctl(struct cxl_memdev *cxlmd, unsigned int cmd,
+			       unsigned long arg)
+{
+	void __user *uarg = (void __user *)arg;
+
+	if (cmd == CXL_MEM_QUERY_COMMANDS)
+		return cxl_query_cmd(cxlmd, uarg);
+	if (cmd == CXL_MEM_SEND_COMMAND)
+		return cxl_send_cmd(cxlmd, uarg);
+	return -ENOTTY;
+}
+
+static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
+			     unsigned long arg)
+{
+	struct cxl_memdev *cxlmd = file->private_data;
+	struct cxl_dev_state *cxlds;
+	int rc = -ENXIO;	/* result when cxlds is gone or not CLASSMEM */
+
+	down_read(&cxl_memdev_rwsem);	/* cxl_memdev_shutdown() writes under it */
+	cxlds = cxlmd->cxlds;
+	if (cxlds && cxlds->type == CXL_DEVTYPE_CLASSMEM)
+		rc = __cxl_memdev_ioctl(cxlmd, cmd, arg);
+	up_read(&cxl_memdev_rwsem);
+
+	return rc;
+}
+
+/* Take a device reference for the open file and stash the memdev in it */
+static int cxl_memdev_open(struct inode *inode, struct file *file)
+{
+	struct cxl_memdev *cxlmd;
+
+	cxlmd = container_of(inode->i_cdev, struct cxl_memdev, cdev);
+	get_device(&cxlmd->dev);
+	file->private_data = cxlmd;
+	return 0;
+}
+
+/* Drop the device reference held on behalf of the file being closed */
+static int cxl_memdev_release_file(struct inode *inode, struct file *file)
+{
+	struct cxl_memdev *cxlmd;
+
+	cxlmd = container_of(inode->i_cdev, struct cxl_memdev, cdev);
+	put_device(&cxlmd->dev);
+	return 0;
+}
+
+/**
+ * cxl_mem_get_fw_info - Get Firmware info
+ * @mds: The device data for the operation
+ *
+ * Query the device and cache the number of firmware slots and the
+ * current slot in @mds->fw.
+ *
+ * Return: 0 if no error: or the result of the mailbox command.
+ *
+ * See CXL-3.0 8.2.9.3.1 Get FW Info
+ */
+static int cxl_mem_get_fw_info(struct cxl_memdev_state *mds)
+{
+	struct cxl_mbox_get_fw_info info;
+	struct cxl_mbox_cmd mbox_cmd = {
+		.opcode = CXL_MBOX_OP_GET_FW_INFO,
+		.size_out = sizeof(info),
+		.payload_out = &info,
+	};
+	int rc;
+
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	if (rc < 0)
+		return rc;
+
+	/* Remember the slot layout for the firmware upload callbacks */
+	mds->fw.num_slots = info.num_slots;
+	mds->fw.cur_slot = FIELD_GET(CXL_FW_INFO_SLOT_INFO_CUR_MASK,
+				     info.slot_info);
+
+	return 0;
+}
+
+/**
+ * cxl_mem_activate_fw - Activate Firmware
+ * @mds: The device data for the operation
+ * @slot: slot number to activate
+ *
+ * Activate firmware in a given slot for the device specified.
+ *
+ * Return: 0 if no error: or the result of the mailbox command, or -EINVAL
+ * when @slot is 0 or greater than @mds->fw.num_slots.
+ *
+ * See CXL-3.0 8.2.9.3.3 Activate FW
+ */
+static int cxl_mem_activate_fw(struct cxl_memdev_state *mds, int slot)
+{
+	struct cxl_mbox_activate_fw activate;
+	struct cxl_mbox_cmd mbox_cmd = {
+		.opcode = CXL_MBOX_OP_ACTIVATE_FW,
+		.size_in = sizeof(activate),
+		.payload_in = &activate,
+	};
+
+	/* Slot 0 and slots past the device's slot count are rejected */
+	if (!slot || slot > mds->fw.num_slots)
+		return -EINVAL;
+
+	/* Only offline activation supported for now */
+	activate.action = CXL_FW_ACTIVATE_OFFLINE;
+	activate.slot = slot;
+
+	return cxl_internal_send_cmd(mds, &mbox_cmd);
+}
+
+/**
+ * cxl_mem_abort_fw_xfer - Abort an in-progress FW transfer
+ * @mds: The device data for the operation
+ *
+ * Send a Transfer FW command whose action is "abort" to the device.
+ *
+ * Return: 0 if no error, -ENOMEM, or the result of the mailbox command.
+ *
+ * See CXL-3.0 8.2.9.3.2 Transfer FW
+ */
+static int cxl_mem_abort_fw_xfer(struct cxl_memdev_state *mds)
+{
+	struct cxl_mbox_transfer_fw *transfer;
+	struct cxl_mbox_cmd mbox_cmd;
+	int rc;
+
+	/* The payload is sized with zero trailing data elements */
+	transfer = kzalloc(struct_size(transfer, data, 0), GFP_KERNEL);
+	if (!transfer)
+		return -ENOMEM;
+	transfer->action = CXL_FW_TRANSFER_ACTION_ABORT;
+
+	/* Set a 1s poll interval and a total wait time of 30s */
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_TRANSFER_FW,
+		.size_in = sizeof(*transfer),
+		.payload_in = transfer,
+		.poll_interval_ms = 1000,
+		.poll_count = 30,
+	};
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+
+	kfree(transfer);
+	return rc;
+}
+
+static void cxl_fw_cleanup(struct fw_upload *fwl)
+{
+	struct cxl_memdev_state *mds = fwl->dd_handle;
+
+	mds->fw.next_slot = 0;	/* 0 is rejected by cxl_mem_activate_fw() */
+}
+
+/* Abort the in-flight transfer; the upload is reported cancelled regardless */
+static enum fw_upload_err cxl_fw_do_cancel(struct fw_upload *fwl)
+{
+	struct cxl_memdev_state *mds = fwl->dd_handle;
+	struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
+	int rc;
+
+	rc = cxl_mem_abort_fw_xfer(mds);
+	if (rc < 0)
+		dev_err(&cxlmd->dev, "Error aborting FW transfer: %d\n", rc);
+
+	return FW_UPLOAD_ERR_CANCELED;
+}
+
+/* fw_upload ->prepare(): size check, pick transfer mode, refresh slot info */
+static enum fw_upload_err cxl_fw_prepare(struct fw_upload *fwl, const u8 *data,
+					 u32 size)
+{
+	struct cxl_memdev_state *mds = fwl->dd_handle;
+	struct cxl_mbox_transfer_fw *transfer;
+
+	if (!size)
+		return FW_UPLOAD_ERR_INVALID_SIZE;
+
+	/* One-shot only if header plus the whole image fits in a payload */
+	mds->fw.oneshot = struct_size(transfer, data, size) < mds->payload_size;
+	if (cxl_mem_get_fw_info(mds))
+		return FW_UPLOAD_ERR_HW_ERROR;
+
+	/*
+	 * So far no state has been changed, hence no other cleanup is
+	 * necessary. Simply return the cancelled status.
+	 */
+	if (test_and_clear_bit(CXL_FW_CANCEL, mds->fw.state))
+		return FW_UPLOAD_ERR_CANCELED;
+
+	return FW_UPLOAD_ERR_NONE;
+}
+
+static enum fw_upload_err cxl_fw_write(struct fw_upload *fwl, const u8 *data,
+				       u32 offset, u32 size, u32 *written)
+{
+	struct cxl_memdev_state *mds = fwl->dd_handle;
+	struct cxl_dev_state *cxlds = &mds->cxlds;
+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
+	struct cxl_mbox_transfer_fw *transfer;
+	struct cxl_mbox_cmd mbox_cmd;
+	u32 cur_size, remaining;
+	size_t size_in;
+	int rc;
+
+	*written = 0;	/* stays 0 unless the transfer command succeeds */
+
+	/* Offset has to be aligned to 128B (CXL-3.0 8.2.9.3.2 Table 8-57) */
+	if (!IS_ALIGNED(offset, CXL_FW_TRANSFER_ALIGNMENT)) {
+		dev_err(&cxlmd->dev,
+			"misaligned offset for FW transfer slice (%u)\n",
+			offset);
+		return FW_UPLOAD_ERR_RW_ERROR;
+	}
+
+	/*
+	 * Pick transfer size based on mds->payload_size. @size must be 128-byte
+	 * aligned, ->payload_size is a power of 2 starting at 256 bytes, and
+	 * sizeof(*transfer) is 128.  These constraints imply that @cur_size
+	 * will always be 128b aligned.
+	 */
+	cur_size = min_t(size_t, size, mds->payload_size - sizeof(*transfer));
+
+	remaining = size - cur_size;
+	size_in = struct_size(transfer, data, cur_size);
+
+	if (test_and_clear_bit(CXL_FW_CANCEL, mds->fw.state))
+		return cxl_fw_do_cancel(fwl);
+
+	/*
+	 * Slot numbers are 1-indexed, so cur_slot doubles as the 0-indexed
+	 * next slot ('cur_slot - 1 + 1'); modulo handles rollover, +1
+	 * re-indexes. NOTE(review): assumes num_slots != 0 - confirm.
+	 */
+	mds->fw.next_slot = (mds->fw.cur_slot % mds->fw.num_slots) + 1;
+
+	/* Do the transfer via mailbox cmd */
+	transfer = kzalloc(size_in, GFP_KERNEL);
+	if (!transfer)
+		return FW_UPLOAD_ERR_RW_ERROR;
+
+	transfer->offset = cpu_to_le32(offset / CXL_FW_TRANSFER_ALIGNMENT);
+	memcpy(transfer->data, data + offset, cur_size);
+	if (mds->fw.oneshot) {
+		transfer->action = CXL_FW_TRANSFER_ACTION_FULL;
+		transfer->slot = mds->fw.next_slot;
+	} else {
+		if (offset == 0) {
+			transfer->action = CXL_FW_TRANSFER_ACTION_INITIATE;
+		} else if (remaining == 0) {
+			transfer->action = CXL_FW_TRANSFER_ACTION_END;
+			transfer->slot = mds->fw.next_slot;
+		} else {
+			transfer->action = CXL_FW_TRANSFER_ACTION_CONTINUE;
+		}
+	}
+
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_TRANSFER_FW,
+		.size_in = size_in,
+		.payload_in = transfer,
+		.poll_interval_ms = 1000,
+		.poll_count = 30,
+	};
+
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+	if (rc < 0) {
+		rc = FW_UPLOAD_ERR_RW_ERROR;
+		goto out_free;
+	}
+
+	*written = cur_size;
+
+	/* Activate FW if oneshot or if the last slice was written */
+	if (mds->fw.oneshot || remaining == 0) {
+		dev_dbg(&cxlmd->dev, "Activating firmware slot: %d\n",
+			mds->fw.next_slot);
+		rc = cxl_mem_activate_fw(mds, mds->fw.next_slot);
+		if (rc < 0) {
+			dev_err(&cxlmd->dev, "Error activating firmware: %d\n",
+				rc);
+			rc = FW_UPLOAD_ERR_HW_ERROR;
+			goto out_free;
+		}
+	}
+
+	rc = FW_UPLOAD_ERR_NONE;
+
+out_free:
+	kfree(transfer);
+	return rc;
+}
+
+static enum fw_upload_err cxl_fw_poll_complete(struct fw_upload *fwl)
+{
+	struct cxl_memdev_state *memdev_state = fwl->dd_handle;
+
+	/*
+	 * There is nothing to poll for: cxl_internal_send_cmd() handles
+	 * background operations synchronously, so any error was already
+	 * reported and handled during the ->write() call(s). All that is
+	 * left is to honor a pending cancel request, else report success.
+	 */
+	if (!test_and_clear_bit(CXL_FW_CANCEL, memdev_state->fw.state))
+		return FW_UPLOAD_ERR_NONE;
+
+	return cxl_fw_do_cancel(fwl);
+}
+
+static void cxl_fw_cancel(struct fw_upload *fwl)
+{
+	struct cxl_memdev_state *memdev_state = fwl->dd_handle;
+
+	set_bit(CXL_FW_CANCEL, memdev_state->fw.state);	/* consumed by ->write()/->poll_complete() */
+}
+
+static const struct fw_upload_ops cxl_memdev_fw_ops = {	/* passed to firmware_upload_register() */
+	.prepare = cxl_fw_prepare,
+	.write = cxl_fw_write,
+	.poll_complete = cxl_fw_poll_complete,
+	.cancel = cxl_fw_cancel,
+	.cleanup = cxl_fw_cleanup,
+};
+
+static void cxl_remove_fw_upload(void *fwl)
+{
+	firmware_upload_unregister(fwl);	/* devm action: @fwl is the handle from firmware_upload_register() */
+}
+
+int devm_cxl_setup_fw_upload(struct device *host, struct cxl_memdev_state *mds)
+{
+	struct device *memdev_dev = &mds->cxlds.cxlmd->dev;
+	struct fw_upload *uploader;
+
+	/* Skip registration when the Get FW Info command is not enabled */
+	if (!test_bit(CXL_MEM_COMMAND_ID_GET_FW_INFO, mds->enabled_cmds))
+		return 0;
+
+	uploader = firmware_upload_register(THIS_MODULE, memdev_dev,
+				dev_name(memdev_dev), &cxl_memdev_fw_ops, mds);
+	if (IS_ERR(uploader))
+		return PTR_ERR(uploader);
+	return devm_add_action_or_reset(host, cxl_remove_fw_upload, uploader);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_fw_upload, CXL);
+
+static const struct file_operations cxl_memdev_fops = {
+	.owner = THIS_MODULE,
+	.open = cxl_memdev_open,
+	.release = cxl_memdev_release_file,
+	.llseek = noop_llseek,
+	.unlocked_ioctl = cxl_memdev_ioctl,
+	.compat_ioctl = compat_ptr_ioctl,
+};
+
+struct cxl_memdev *devm_cxl_add_memdev(struct device *host,
+				       struct cxl_dev_state *cxlds)
+{
+	struct cxl_memdev *cxlmd;
+	struct device *dev;
+	struct cdev *cdev;
+	int rc;
+
+	cxlmd = cxl_memdev_alloc(cxlds, &cxl_memdev_fops);
+	if (IS_ERR(cxlmd))
+		return cxlmd;	/* propagate the ERR_PTR unchanged */
+
+	dev = &cxlmd->dev;
+	rc = dev_set_name(dev, "mem%d", cxlmd->id);
+	if (rc)
+		goto err;
+
+	/*
+	 * Activate ioctl operations, no cxl_memdev_rwsem manipulation
+	 * needed as this is ordered with cdev_add() publishing the device.
+	 */
+	cxlmd->cxlds = cxlds;
+	cxlds->cxlmd = cxlmd;
+
+	cdev = &cxlmd->cdev;
+	rc = cdev_device_add(cdev, dev);
+	if (rc)
+		goto err;
+
+	rc = devm_add_action_or_reset(host, cxl_memdev_unregister, cxlmd);
+	if (rc)
+		return ERR_PTR(rc);	/* the _or_reset helper runs the action itself on failure */
+	return cxlmd;
+
+err:
+	/*
+	 * The cdev was briefly live, shutdown any ioctl operations that
+	 * saw that state.
+	 */
+	cxl_memdev_shutdown(dev);
+	put_device(dev);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, CXL);
+
+static void sanitize_teardown_notifier(void *data)
+{
+	struct cxl_memdev_state *memdev_state = data;
+	struct kernfs_node *node;
+
+	/*
+	 * Detach the sysfs node under mbox_mutex so that no new irq
+	 * triggered work gets started, then flush whatever is in flight.
+	 */
+	mutex_lock(&memdev_state->mbox_mutex);
+	node = memdev_state->security.sanitize_node;
+	memdev_state->security.sanitize_node = NULL;
+	mutex_unlock(&memdev_state->mbox_mutex);
+
+	cancel_delayed_work_sync(&memdev_state->security.poll_dwork);
+	sysfs_put(node);
+}
+
+int devm_cxl_sanitize_setup_notifier(struct device *host,
+				     struct cxl_memdev *cxlmd)
+{
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	struct kernfs_node *security_dir, *state_node;
+
+	if (!test_bit(CXL_SEC_ENABLED_SANITIZE, mds->security.enabled_cmds))
+		return 0;
+
+	/*
+	 * The expectation is that @cxlmd would not have been created at
+	 * all if either of these sysfs_get_dirent() lookups could fail.
+	 */
+	security_dir = sysfs_get_dirent(cxlmd->dev.kobj.sd, "security");
+	if (!security_dir)
+		return -ENOENT;
+	state_node = sysfs_get_dirent(security_dir, "state");
+	sysfs_put(security_dir);
+	mds->security.sanitize_node = state_node;
+	if (!state_node)
+		return -ENOENT;
+
+	return devm_add_action_or_reset(host, sanitize_teardown_notifier, mds);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_sanitize_setup_notifier, CXL);
+
+__init int cxl_memdev_init(void)
+{
+	dev_t first;
+	int err;
+
+	/* Reserve CXL_MEM_MAX_DEVS char device numbers, remember the major */
+	err = alloc_chrdev_region(&first, 0, CXL_MEM_MAX_DEVS, "cxl");
+	if (err)
+		return err;
+	cxl_mem_major = MAJOR(first);
+
+	return 0;
+}
+
+void cxl_memdev_exit(void)
+{
+	unregister_chrdev_region(MKDEV(cxl_mem_major, 0), CXL_MEM_MAX_DEVS);	/* undoes cxl_memdev_init() */
+}
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
new file mode 100644
index 0000000000..c7a7887ebd
--- /dev/null
+++ b/drivers/cxl/core/pci.c
@@ -0,0 +1,757 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2021 Intel Corporation. All rights reserved. */
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/pci-doe.h>
+#include <cxlpci.h>
+#include <cxlmem.h>
+#include <cxl.h>
+#include "core.h"
+#include "trace.h"
+
+/**
+ * DOC: cxl core pci
+ *
+ * Compute Express Link protocols are layered on top of PCIe. CXL core provides
+ * a set of helpers for CXL interactions which occur via PCIe.
+ */
+
+static unsigned short media_ready_timeout = 60;	/* seconds; used by cxl_dvsec_mem_range_active() */
+module_param(media_ready_timeout, ushort, 0644);
+MODULE_PARM_DESC(media_ready_timeout, "seconds to wait for media ready");
+
+struct cxl_walk_context {	/* state shared with match_add_dports() */
+	struct pci_bus *bus;	/* only devices directly on this bus are considered */
+	struct cxl_port *port;	/* port that receives the new dports */
+	int type;		/* PCIe port type a device must have */
+	int error;		/* errno of a failed devm_cxl_add_dport(), else 0 */
+	int count;		/* number of dports added so far */
+};
+
+static int match_add_dports(struct pci_dev *pdev, void *data)
+{
+	struct cxl_walk_context *walk = data;
+	struct cxl_port *parent = walk->port;
+	int pcie_type = pci_pcie_type(pdev);
+	struct cxl_register_map regmap;
+	struct cxl_dport *new_dport;
+	u32 link_cap, port_id;
+	int rc;
+
+	/* Skip anything that is not a matching PCIe port directly on the bus */
+	if (pdev->bus != walk->bus || !pci_is_pcie(pdev) ||
+	    pcie_type != walk->type)
+		return 0;
+	if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP,
+				  &link_cap))
+		return 0;
+
+	rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &regmap);
+	if (rc)
+		dev_dbg(&parent->dev, "failed to find component registers\n");
+
+	port_id = FIELD_GET(PCI_EXP_LNKCAP_PN, link_cap);
+	new_dport = devm_cxl_add_dport(parent, &pdev->dev, port_id,
+				       regmap.resource);
+	if (IS_ERR(new_dport)) {
+		rc = PTR_ERR(new_dport);
+		walk->error = rc;
+		return rc;
+	}
+	walk->count++;
+
+	return 0;
+}
+
+/**
+ * devm_cxl_port_enumerate_dports - enumerate downstream ports of the upstream port
+ * @port: cxl_port whose ->uport_dev is the upstream of dports to be enumerated
+ *
+ * Returns a positive number of dports enumerated, the error recorded by
+ * match_add_dports(), -ENXIO without a PCI bus or -ENODEV without dports.
+ */
+int devm_cxl_port_enumerate_dports(struct cxl_port *port)
+{
+	struct pci_bus *bus = cxl_port_to_pci_bus(port);
+	struct cxl_walk_context ctx;
+	int type;
+
+	if (!bus)
+		return -ENXIO;
+
+	if (pci_is_root_bus(bus))
+		type = PCI_EXP_TYPE_ROOT_PORT;
+	else
+		type = PCI_EXP_TYPE_DOWNSTREAM;
+
+	ctx = (struct cxl_walk_context) {
+		.port = port,
+		.bus = bus,
+		.type = type,
+	};
+	pci_walk_bus(bus, match_add_dports, &ctx);
+
+	if (ctx.error)	/* checked first so that -ENODEV cannot mask it */
+		return ctx.error;
+	if (ctx.count == 0)
+		return -ENODEV;
+	return ctx.count;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_port_enumerate_dports, CXL);
+
+/* Wait for DVSEC range @id to report MEM INFO VALID: 0, -ETIMEDOUT or error */
+static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id)
+{
+	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+	int d = cxlds->cxl_dvsec;
+	int rc, i;
+	u32 temp;
+
+	/* NOTE(review): '>' lets id == CXL_DVSEC_RANGE_MAX through - confirm */
+	if (id > CXL_DVSEC_RANGE_MAX)
+		return -EINVAL;
+
+	/*
+	 * Check MEM INFO VALID bit first, then once more 1s later. Sleeping
+	 * only between the two polls keeps the wait at the advertised 1s.
+	 */
+	for (i = 0; i < 2; i++) {
+		if (i)
+			msleep(1000);
+
+		rc = pci_read_config_dword(pdev,
+					   d + CXL_DVSEC_RANGE_SIZE_LOW(id),
+					   &temp);
+		if (rc)
+			return rc;
+
+		if (FIELD_GET(CXL_DVSEC_MEM_INFO_VALID, temp))
+			return 0;
+	}
+
+	dev_err(&pdev->dev,
+		"Timeout awaiting memory range %d valid after 1s.\n", id);
+
+	return -ETIMEDOUT;
+}
+
+/* Poll DVSEC range @id for MEM ACTIVE: 0, -ETIMEDOUT or config read error */
+static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id)
+{
+	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+	int timeout = media_ready_timeout;	/* snapshot: the parameter is writable */
+	int d = cxlds->cxl_dvsec;
+	int rc, i;
+	u32 temp;
+
+	if (id > CXL_DVSEC_RANGE_MAX)
+		return -EINVAL;
+
+	/*
+	 * Check MEM ACTIVE bit once per second, up to 60s timeout by default.
+	 * It is read at least once even for a timeout of 0, never slept after.
+	 */
+	for (i = timeout; ; i--) {
+		rc = pci_read_config_dword(
+			pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(id), &temp);
+		if (rc)
+			return rc;
+
+		if (FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp))
+			return 0;
+		if (!i)
+			break;
+		msleep(1000);
+	}
+
+	dev_err(&pdev->dev, "timeout awaiting memory active after %d seconds\n",
+		timeout);
+	return -ETIMEDOUT;
+}
+
+/*
+ * Wait for every DVSEC range to report valid and then active (polling up
+ * to @media_ready_timeout seconds per range), then check memdev status.
+ */
+int cxl_await_media_ready(struct cxl_dev_state *cxlds)
+{
+	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+	int nr_ranges, range, rc;
+	u64 md_status;
+	u16 dvsec_cap;
+
+	rc = pci_read_config_word(pdev, cxlds->cxl_dvsec + CXL_DVSEC_CAP_OFFSET,
+				  &dvsec_cap);
+	if (rc)
+		return rc;
+	nr_ranges = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, dvsec_cap);
+
+	/* All ranges must report valid before any is polled for active */
+	for (range = 0; range < nr_ranges; range++) {
+		rc = cxl_dvsec_mem_range_valid(cxlds, range);
+		if (rc)
+			return rc;
+	}
+
+	for (range = 0; range < nr_ranges; range++) {
+		rc = cxl_dvsec_mem_range_active(cxlds, range);
+		if (rc)
+			return rc;
+	}
+
+	md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+	if (!CXLMDEV_READY(md_status))
+		return -EIO;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_await_media_ready, CXL);
+
+/* 0 once Memory_Info_Valid is set, -ETIMEDOUT if not, else the read error */
+static int wait_for_valid(struct pci_dev *pdev, int d)
+{
+	u32 size_low;
+	int attempt;
+	int rc;
+
+	/*
+	 * Memory_Info_Valid indicates that the CXL Range 1 Size High and Size
+	 * Low registers are valid. The device must set it within 1 second of
+	 * deassertion of reset, so it is likely already set on the first read
+	 * below. If it is not, allow for clock skew by waiting 1.5 seconds
+	 * before a second and final read.
+	 */
+	for (attempt = 0; attempt < 2; attempt++) {
+		if (attempt)
+			msleep(1500);
+
+		rc = pci_read_config_dword(pdev,
+					   d + CXL_DVSEC_RANGE_SIZE_LOW(0),
+					   &size_low);
+		if (rc)
+			return rc;
+
+		if (size_low & CXL_DVSEC_MEM_INFO_VALID)
+			return 0;
+	}
+
+	return -ETIMEDOUT;
+}
+
+/* Set CXL_DVSEC_MEM_ENABLE to @val: 1 if already so, 0 if written, else -errno */
+static int cxl_set_mem_enable(struct cxl_dev_state *cxlds, u16 val)
+{
+	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+	int d = cxlds->cxl_dvsec;
+	u16 ctrl;
+	int rc;
+
+	/* Config accessors report failure as positive PCIBIOS_* codes */
+	rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl);
+	if (rc)
+		return pcibios_err_to_errno(rc);
+
+	if ((ctrl & CXL_DVSEC_MEM_ENABLE) == val)
+		return 1;
+	ctrl &= ~CXL_DVSEC_MEM_ENABLE;
+	ctrl |= val;
+
+	rc = pci_write_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, ctrl);
+
+	return pcibios_err_to_errno(rc);
+}
+
+static void clear_mem_enable(void *cxlds)
+{
+	cxl_set_mem_enable(cxlds, 0);	/* devm teardown action, return value ignored */
+}
+
+static int devm_cxl_enable_mem(struct device *host, struct cxl_dev_state *cxlds)
+{
+	int rc = cxl_set_mem_enable(cxlds, CXL_DVSEC_MEM_ENABLE);
+
+	/* rc > 0: was already enabled, so no teardown action is registered */
+	if (rc > 0)
+		return 0;
+	if (rc < 0)
+		return rc;
+	return devm_add_action_or_reset(host, clear_mem_enable, cxlds);
+}
+
+/* Match a root decoder flagged CXL_DECODER_F_RAM whose HPA range covers @arg */
+static int dvsec_range_allowed(struct device *dev, void *arg)
+{
+	struct range *wanted = arg;
+	struct cxl_decoder *decoder;
+
+	if (!is_root_decoder(dev))
+		return 0;
+
+	decoder = to_cxl_decoder(dev);
+
+	if (decoder->flags & CXL_DECODER_F_RAM)
+		return range_contains(&decoder->hpa_range, wanted);
+
+	return 0;
+}
+
+static void disable_hdm(void *_cxlhdm)
+{
+	struct cxl_hdm *cxlhdm = _cxlhdm;
+	void __iomem *ctrl_reg =
+		cxlhdm->regs.hdm_decoder + CXL_HDM_DECODER_CTRL_OFFSET;
+	u32 ctrl = readl(ctrl_reg);
+
+	/* Read-modify-write that drops only the global enable bit */
+	writel(ctrl & ~CXL_HDM_DECODER_ENABLE, ctrl_reg);
+}
+
+static int devm_cxl_enable_hdm(struct device *host, struct cxl_hdm *cxlhdm)
+{
+	void __iomem *ctrl_reg =
+		cxlhdm->regs.hdm_decoder + CXL_HDM_DECODER_CTRL_OFFSET;
+	u32 ctrl = readl(ctrl_reg);
+
+	/* Set the global enable bit; disable_hdm() clears it on teardown */
+	writel(ctrl | CXL_HDM_DECODER_ENABLE, ctrl_reg);
+
+	return devm_add_action_or_reset(host, disable_hdm, cxlhdm);
+}
+
+int cxl_dvsec_rr_decode(struct device *dev, int d,
+			struct cxl_endpoint_dvsec_info *info)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int hdm_count, rc, i, ranges = 0;
+	u16 cap, ctrl;
+
+	if (!d) {
+		dev_dbg(dev, "No DVSEC Capability\n");
+		return -ENXIO;
+	}
+
+	rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap);
+	if (rc)
+		return rc;
+
+	rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl);
+	if (rc)
+		return rc;
+
+	if (!(cap & CXL_DVSEC_MEM_CAPABLE)) {
+		dev_dbg(dev, "Not MEM Capable\n");
+		return -ENXIO;
+	}
+
+	/*
+	 * It is not allowed by spec for MEM.capable to be set and have 0 legacy
+	 * HDM decoders (values > 2 are also undefined as of CXL 2.0). As this
+	 * driver is for a spec defined class code which must be CXL.mem
+	 * capable, there is no point in continuing to enable CXL.mem.
+	 */
+	hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap);
+	if (!hdm_count || hdm_count > 2)
+		return -EINVAL;
+
+	rc = wait_for_valid(pdev, d);
+	if (rc) {
+		dev_dbg(dev, "Failure awaiting MEM_INFO_VALID (%d)\n", rc);
+		return rc;
+	}
+
+	/*
+	 * The current DVSEC values are moot if the memory capability is
+	 * disabled, and they will remain moot after the HDM Decoder
+	 * capability is enabled.
+	 */
+	info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl);
+	if (!info->mem_enabled)
+		return 0;	/* no range is decoded and @info->ranges is not written */
+
+	for (i = 0; i < hdm_count; i++) {
+		u64 base, size;
+		u32 temp;
+
+		rc = pci_read_config_dword(
+			pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp);
+		if (rc)
+			return rc;
+
+		size = (u64)temp << 32;
+
+		rc = pci_read_config_dword(
+			pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp);
+		if (rc)
+			return rc;
+
+		size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK;
+		if (!size) {
+			info->dvsec_range[i] = (struct range) {
+				.start = 0,
+				.end = CXL_RESOURCE_NONE,
+			};
+			continue;	/* zero-sized range is recorded but not counted */
+		}
+
+		rc = pci_read_config_dword(
+			pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp);
+		if (rc)
+			return rc;
+
+		base = (u64)temp << 32;
+
+		rc = pci_read_config_dword(
+			pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp);
+		if (rc)
+			return rc;
+
+		base |= temp & CXL_DVSEC_MEM_BASE_LOW_MASK;
+
+		info->dvsec_range[i] = (struct range) {
+			.start = base,
+			.end = base + size - 1
+		};
+
+		ranges++;	/* counts only the ranges with a non-zero size */
+	}
+
+	info->ranges = ranges;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dvsec_rr_decode, CXL);
+
+/**
+ * cxl_hdm_decode_init() - Setup HDM decoding for the endpoint
+ * @cxlds: Device state
+ * @cxlhdm: Mapped HDM decoder Capability
+ * @info: Cached DVSEC range registers info
+ *
+ * Try to enable the endpoint's HDM Decoder Capability
+ */
+int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
+			struct cxl_endpoint_dvsec_info *info)
+{
+	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+	struct cxl_port *port = cxlhdm->port;
+	struct device *dev = cxlds->dev;
+	struct cxl_port *root;
+	int i, rc, allowed;
+	u32 global_ctrl = 0;	/* stays 0 when the HDM registers are not mapped */
+
+	if (hdm)
+		global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET);
+
+	/*
+	 * If the HDM Decoder Capability is already enabled then assume
+	 * that some other agent like platform firmware set it up.
+	 */
+	if (global_ctrl & CXL_HDM_DECODER_ENABLE || (!hdm && info->mem_enabled))
+		return devm_cxl_enable_mem(&port->dev, cxlds);
+	else if (!hdm)
+		return -ENODEV;	/* no HDM registers and DVSEC mem not enabled */
+
+	root = to_cxl_port(port->dev.parent);
+	while (!is_cxl_root(root) && is_cxl_port(root->dev.parent))
+		root = to_cxl_port(root->dev.parent);
+	if (!is_cxl_root(root)) {
+		dev_err(dev, "Failed to acquire root port for HDM enable\n");
+		return -ENODEV;
+	}
+
+	for (i = 0, allowed = 0; info->mem_enabled && i < info->ranges; i++) {
+		struct device *cxld_dev;
+
+		cxld_dev = device_find_child(&root->dev, &info->dvsec_range[i],
+					     dvsec_range_allowed);
+		if (!cxld_dev) {
+			dev_dbg(dev, "DVSEC Range%d denied by platform\n", i);
+			continue;
+		}
+		dev_dbg(dev, "DVSEC Range%d allowed by platform\n", i);
+		put_device(cxld_dev);	/* the match was only needed as a yes/no */
+		allowed++;
+	}
+
+	if (!allowed) {
+		cxl_set_mem_enable(cxlds, 0);	/* return value is not checked */
+		info->mem_enabled = 0;
+	}
+
+	/*
+	 * Per CXL 2.0 Section 8.1.3.8.3 and 8.1.3.8.4 DVSEC CXL Range 1 Base
+	 * [High,Low] when HDM operation is enabled the range register values
+	 * are ignored by the device, but the spec also recommends matching the
+	 * DVSEC Range 1,2 to HDM Decoder Range 0,1. So, non-zero info->ranges
+	 * are expected even though Linux does not require or maintain that
+	 * match. If at least one DVSEC range is enabled and allowed, skip HDM
+	 * Decoder Capability Enable.
+	 */
+	if (info->mem_enabled)
+		return 0;
+
+	rc = devm_cxl_enable_hdm(&port->dev, cxlhdm);
+	if (rc)
+		return rc;
+
+	return devm_cxl_enable_mem(&port->dev, cxlds);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, CXL);
+
+#define CXL_DOE_TABLE_ACCESS_REQ_CODE		0x000000ff	/* request DW bits 7:0 */
+#define   CXL_DOE_TABLE_ACCESS_REQ_CODE_READ	0
+#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE		0x0000ff00	/* request DW bits 15:8 */
+#define   CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA	0
+#define CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE	0xffff0000	/* request/response DW bits 31:16 */
+#define CXL_DOE_TABLE_ACCESS_LAST_ENTRY		0xffff	/* handle value that ends the table walk */
+#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2	/* protocol argument handed to pci_doe() */
+
+#define CDAT_DOE_REQ(entry_handle) cpu_to_le32				\
+	(FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,			\
+		    CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |		\
+	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,			\
+		    CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) |		\
+	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, (entry_handle)))
+
+/* Request entry handle 0 over DOE and store response DW 1 as the CDAT length */
+static int cxl_cdat_get_length(struct device *dev, struct pci_doe_mb *cdat_doe,
+			       size_t *length)
+{
+	__le32 request = CDAT_DOE_REQ(0);
+	__le32 response[2];
+	int rc;
+
+	rc = pci_doe(cdat_doe, PCI_DVSEC_VENDOR_ID_CXL,
+		     CXL_DOE_PROTOCOL_TABLE_ACCESS,
+		     &request, sizeof(request),
+		     &response, sizeof(response));
+	if (rc < 0) {
+		dev_err(dev, "DOE failed: %d\n", rc);
+		return rc;
+	}
+	if (rc < sizeof(response))	/* short response: DW 1 is missing */
+		return -EIO;
+
+	*length = le32_to_cpu(response[1]);
+	dev_dbg(dev, "CDAT length %zu\n", *length);
+
+	return 0;
+}
+
+/* Read the CDAT entries into @cdat_table and trim @cdat_length to what was read */
+static int cxl_cdat_read_table(struct device *dev, struct pci_doe_mb *cdat_doe,
+			       void *cdat_table, size_t *cdat_length)
+{
+	size_t length = *cdat_length + sizeof(__le32);
+	__le32 *data = cdat_table;
+	int entry_handle = 0;
+	__le32 saved_dw = 0;
+
+	do {
+		__le32 request = CDAT_DOE_REQ(entry_handle);
+		struct cdat_entry_header *entry;
+		size_t entry_dw;
+		int rc;
+
+		rc = pci_doe(cdat_doe, PCI_DVSEC_VENDOR_ID_CXL,
+			     CXL_DOE_PROTOCOL_TABLE_ACCESS,
+			     &request, sizeof(request),
+			     data, length);
+		if (rc < 0) {
+			dev_err(dev, "DOE failed: %d\n", rc);
+			return rc;
+		}
+
+		/* 1 DW Table Access Response Header + CDAT entry */
+		entry = (struct cdat_entry_header *)(data + 1);
+		if ((entry_handle == 0 &&
+		     rc != sizeof(__le32) + sizeof(struct cdat_header)) ||
+		    (entry_handle > 0 &&
+		     (rc < sizeof(__le32) + sizeof(*entry) ||
+		      rc != sizeof(__le32) + le16_to_cpu(entry->length))))
+			return -EIO;
+
+		/* Get the CXL table access header entry handle */
+		entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE,
+					 le32_to_cpu(data[0]));
+		entry_dw = rc / sizeof(__le32);
+		/* Skip Header */
+		entry_dw -= 1;
+		/*
+		 * Table Access Response Header overwrote the last DW of
+		 * previous entry, so restore that DW
+		 */
+		*data = saved_dw;
+		length -= entry_dw * sizeof(__le32);
+		data += entry_dw;
+		saved_dw = *data;	/* the next response header lands on this DW */
+	} while (entry_handle != CXL_DOE_TABLE_ACCESS_LAST_ENTRY);
+
+	/* Length in CDAT header may exceed concatenation of CDAT entries */
+	*cdat_length -= length - sizeof(__le32);
+
+	return 0;
+}
+
+/**
+ * read_cdat_data - Read the CDAT data on this port
+ * @port: Port to read data from
+ *
+ * This call will sleep waiting for responses from the DOE mailbox.
+ */
+void read_cdat_data(struct cxl_port *port)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
+	struct device *host = cxlmd->dev.parent;
+	struct device *dev = &port->dev;
+	struct pci_doe_mb *cdat_doe;
+	size_t cdat_length;
+	void *cdat_table;
+	int rc;
+
+	if (!dev_is_pci(host))
+		return;
+	cdat_doe = pci_find_doe_mailbox(to_pci_dev(host),
+					PCI_DVSEC_VENDOR_ID_CXL,
+					CXL_DOE_PROTOCOL_TABLE_ACCESS);
+	if (!cdat_doe) {
+		dev_dbg(dev, "No CDAT mailbox\n");
+		return;
+	}
+
+	port->cdat_available = true;	/* set once a mailbox exists, even if the reads below fail */
+
+	if (cxl_cdat_get_length(dev, cdat_doe, &cdat_length)) {
+		dev_dbg(dev, "No CDAT length\n");
+		return;
+	}
+
+	cdat_table = devm_kzalloc(dev, cdat_length + sizeof(__le32),	/* one extra DW in front for the response header */
+				  GFP_KERNEL);
+	if (!cdat_table)
+		return;
+
+	rc = cxl_cdat_read_table(dev, cdat_doe, cdat_table, &cdat_length);
+	if (rc) {
+		/* Don't leave table data allocated on error */
+		devm_kfree(dev, cdat_table);
+		dev_err(dev, "CDAT data read error\n");
+		return;
+	}
+
+	port->cdat.table = cdat_table + sizeof(__le32);	/* skip the leading response header DW */
+	port->cdat.length = cdat_length;
+}
+EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
+
+void cxl_cor_error_detected(struct pci_dev *pdev)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	void __iomem *status_reg;
+	u32 raw;
+
+	if (!cxlds->regs.ras)
+		return;
+
+	status_reg = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
+	raw = readl(status_reg);
+	if (!(raw & CXL_RAS_CORRECTABLE_STATUS_MASK))
+		return;
+	writel(raw & CXL_RAS_CORRECTABLE_STATUS_MASK, status_reg);	/* presumably write-1-to-clear */
+	trace_cxl_aer_correctable_error(cxlds->cxlmd, raw);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
+
+/*
+ * CXL spec rev3.0 8.2.4.16.1
+ *
+ * Copy the RAS header log registers into @log using 32-bit reads. @log
+ * receives CXL_HEADERLOG_SIZE bytes.
+ */
+static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log)
+{
+	int nr_dwords = CXL_HEADERLOG_SIZE / sizeof(u32);
+	void __iomem *base;
+	int i;
+
+	base = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET;
+	for (i = 0; i < nr_dwords; i++)
+		log[i] = readl(base + i * sizeof(u32));
+}
+
+/*
+ * Log the state of the RAS status registers and prepare them to log the
+ * next error status. Returns true if an uncorrectable error was reported.
+ */
+static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
+{
+	void __iomem *ras = cxlds->regs.ras;
+	u32 hl[CXL_HEADERLOG_SIZE_U32];
+	u32 ue_status, first_error;
+	void __iomem *status_reg;
+
+	if (!ras)
+		return false;
+
+	status_reg = ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
+	ue_status = readl(status_reg);
+	if (!(ue_status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
+		return false;
+
+	/*
+	 * With several status bits set, the control register tells which
+	 * error came first and therefore which one the header log describes.
+	 */
+	if (hweight32(ue_status) > 1) {
+		u32 cap_ctrl = readl(ras + CXL_RAS_CAP_CONTROL_OFFSET);
+
+		first_error = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+					    cap_ctrl));
+	} else {
+		first_error = ue_status;
+	}
+	header_log_copy(cxlds, hl);
+	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, ue_status, first_error, hl);
+	writel(ue_status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, status_reg);
+	return true;
+}
+
+pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
+				    pci_channel_state_t state)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
+	struct device *dev = &cxlmd->dev;
+	bool ue;
+
+	/*
+	 * A frozen channel indicates an impending reset which is fatal to
+	 * CXL.mem operation, and will likely crash the system. On the off
+	 * chance the situation is recoverable dump the status of the RAS
+	 * capability registers and bounce the active state of the memdev.
+	 */
+	ue = cxl_report_and_clear(cxlds);	/* true when an uncorrectable error was logged */
+
+	switch (state) {
+	case pci_channel_io_normal:
+		if (ue) {
+			device_release_driver(dev);	/* unbind the memdev before asking for a reset */
+			return PCI_ERS_RESULT_NEED_RESET;
+		}
+		return PCI_ERS_RESULT_CAN_RECOVER;
+	case pci_channel_io_frozen:
+		dev_warn(&pdev->dev,
+			 "%s: frozen state error detected, disable CXL.mem\n",
+			 dev_name(dev));
+		device_release_driver(dev);
+		return PCI_ERS_RESULT_NEED_RESET;
+	case pci_channel_io_perm_failure:
+		dev_warn(&pdev->dev,
+			 "failure state error detected, request disconnect\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+	return PCI_ERS_RESULT_NEED_RESET;	/* any channel state not listed above */
+}
+EXPORT_SYMBOL_NS_GPL(cxl_error_detected, CXL);
diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c
new file mode 100644
index 0000000000..fc94f52403
--- /dev/null
+++ b/drivers/cxl/core/pmem.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. */
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <cxlmem.h>
+#include <cxl.h>
+#include "core.h"
+
+/**
+ * DOC: cxl pmem
+ *
+ * The core CXL PMEM infrastructure supports persistent memory
+ * provisioning and serves as a bridge to the LIBNVDIMM subsystem. A CXL
+ * 'bridge' device is added at the root of a CXL device topology if
+ * platform firmware advertises at least one persistent memory capable
+ * CXL window. That root-level bridge corresponds to a LIBNVDIMM 'bus'
+ * device. Then for each cxl_memdev in the CXL device topology a bridge
+ * device is added to host a LIBNVDIMM dimm object. When these bridges
+ * are registered native LIBNVDIMM uapis are translated to CXL
+ * operations, for example, namespace label access commands.
+ */
+
+static DEFINE_IDA(cxl_nvdimm_bridge_ida);
+
+static void cxl_nvdimm_bridge_release(struct device *dev)
+{
+	struct cxl_nvdimm_bridge *bridge = to_cxl_nvdimm_bridge(dev);
+
+	ida_free(&cxl_nvdimm_bridge_ida, bridge->id);	/* return the id before freeing its holder */
+	kfree(bridge);
+}
+
+static const struct attribute_group *cxl_nvdimm_bridge_attribute_groups[] = {
+	&cxl_base_attribute_group,
+	NULL,	/* terminates the list */
+};
+
+const struct device_type cxl_nvdimm_bridge_type = {
+	.name = "cxl_nvdimm_bridge",
+	.groups = cxl_nvdimm_bridge_attribute_groups,
+	.release = cxl_nvdimm_bridge_release,
+};
+
+struct cxl_nvdimm_bridge *to_cxl_nvdimm_bridge(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, dev->type != &cxl_nvdimm_bridge_type,
+			  "not a cxl_nvdimm_bridge device\n"))
+		return NULL;	/* wrong device type: warn once and hand back NULL */
+	return container_of(dev, struct cxl_nvdimm_bridge, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm_bridge, CXL);
+
+bool is_cxl_nvdimm_bridge(struct device *dev)
+{
+	return dev->type == &cxl_nvdimm_bridge_type;	/* identity check on the device_type */
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_nvdimm_bridge, CXL);
+
+static int match_nvdimm_bridge(struct device *dev, void *data)
+{
+	return is_cxl_nvdimm_bridge(dev);	/* device_find_child() callback, @data is unused */
+}
+
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_memdev *cxlmd)
+{
+	struct cxl_port *root = find_cxl_root(cxlmd->endpoint);
+	struct device *bridge_dev;
+
+	if (!root)
+		return NULL;
+
+	/* The root port reference is dropped as soon as the search is done */
+	bridge_dev = device_find_child(&root->dev, NULL, match_nvdimm_bridge);
+	put_device(&root->dev);
+	if (!bridge_dev)
+		return NULL;
+
+	return to_cxl_nvdimm_bridge(bridge_dev);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_find_nvdimm_bridge, CXL);
+
+static struct lock_class_key cxl_nvdimm_bridge_key;
+
+static struct cxl_nvdimm_bridge *cxl_nvdimm_bridge_alloc(struct cxl_port *port)
+{
+	struct cxl_nvdimm_bridge *bridge;
+	struct device *dev;
+	int id;
+
+	bridge = kzalloc(sizeof(*bridge), GFP_KERNEL);
+	if (!bridge)
+		return ERR_PTR(-ENOMEM);
+
+	id = ida_alloc(&cxl_nvdimm_bridge_ida, GFP_KERNEL);
+	if (id < 0) {
+		kfree(bridge);
+		return ERR_PTR(id);
+	}
+	bridge->id = id;
+	bridge->port = port;
+
+	/*
+	 * Once initialized the device is dropped with put_device(), which
+	 * ends up in cxl_nvdimm_bridge_release().
+	 */
+	dev = &bridge->dev;
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_nvdimm_bridge_key);
+	device_set_pm_not_required(dev);
+	dev->parent = &port->dev;
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_nvdimm_bridge_type;
+	return bridge;
+
+static void unregister_nvb(void *_cxl_nvb)
+{
+	struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb;
+
+	device_unregister(&cxl_nvb->dev);	/* devm action registered by devm_cxl_add_nvdimm_bridge() */
+}
+
+/**
+ * devm_cxl_add_nvdimm_bridge() - add the root of a LIBNVDIMM topology
+ * @host: platform firmware root device
+ * @port: CXL port at the root of a CXL topology
+ *
+ * Return: bridge device that can host cxl_nvdimm objects
+ */
+struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host,
+						     struct cxl_port *port)
+{
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct device *dev;
+	int rc;
+
+	if (!IS_ENABLED(CONFIG_CXL_PMEM))
+		return ERR_PTR(-ENXIO);	/* built without CONFIG_CXL_PMEM */
+
+	cxl_nvb = cxl_nvdimm_bridge_alloc(port);
+	if (IS_ERR(cxl_nvb))
+		return cxl_nvb;
+
+	dev = &cxl_nvb->dev;
+	rc = dev_set_name(dev, "nvdimm-bridge%d", cxl_nvb->id);
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	rc = devm_add_action_or_reset(host, unregister_nvb, cxl_nvb);
+	if (rc)
+		return ERR_PTR(rc);	/* the _or_reset helper runs unregister_nvb itself on failure */
+
+	return cxl_nvb;
+
+err:
+	put_device(dev);	/* ->release returns the id and frees cxl_nvb */
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm_bridge, CXL);
+
+static void cxl_nvdimm_release(struct device *dev)
+{
+	struct cxl_nvdimm *nvdimm = to_cxl_nvdimm(dev);
+
+	kfree(nvdimm);	/* allocated by cxl_nvdimm_alloc() */
+}
+
+static const struct attribute_group *cxl_nvdimm_attribute_groups[] = {
+	&cxl_base_attribute_group,
+	NULL,	/* terminates the list */
+};
+
+const struct device_type cxl_nvdimm_type = {
+	.name = "cxl_nvdimm",
+	.groups = cxl_nvdimm_attribute_groups,
+	.release = cxl_nvdimm_release,
+};
+
+bool is_cxl_nvdimm(struct device *dev)
+{
+	return dev->type == &cxl_nvdimm_type;	/* identity check on the device_type */
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_nvdimm, CXL);
+
+struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_cxl_nvdimm(dev),
+			  "not a cxl_nvdimm device\n"))
+		return NULL;	/* wrong device type: warn once and hand back NULL */
+	return container_of(dev, struct cxl_nvdimm, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm, CXL);
+
+static struct lock_class_key cxl_nvdimm_key;
+
+static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_nvdimm_bridge *cxl_nvb,
+					   struct cxl_memdev *cxlmd)
+{	/* NOTE(review): @cxl_nvb is not referenced in this body */
+	struct cxl_nvdimm *cxl_nvd;
+	struct device *dev;
+
+	cxl_nvd = kzalloc(sizeof(*cxl_nvd), GFP_KERNEL);
+	if (!cxl_nvd)
+		return ERR_PTR(-ENOMEM);
+
+	dev = &cxl_nvd->dev;
+	cxl_nvd->cxlmd = cxlmd;
+	cxlmd->cxl_nvd = cxl_nvd;	/* back-pointer, cleared again by the callers' unwind paths */
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_nvdimm_key);
+	device_set_pm_not_required(dev);
+	dev->parent = &cxlmd->dev;
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_nvdimm_type;
+	/*
+	 * A "%llx" string is 17-bytes vs dimm_id that is max
+	 * NVDIMM_KEY_DESC_LEN
+	 */
+	BUILD_BUG_ON(sizeof(cxl_nvd->dev_id) < 17 ||
+		     sizeof(cxl_nvd->dev_id) > NVDIMM_KEY_DESC_LEN);
+	sprintf(cxl_nvd->dev_id, "%llx", cxlmd->cxlds->serial);	/* size checked at build time above */
+
+	return cxl_nvd;
+}
+
+static void cxlmd_release_nvdimm(void *_cxlmd)
+{
+	struct cxl_memdev *cxlmd = _cxlmd;
+	struct cxl_nvdimm *cxl_nvd = cxlmd->cxl_nvd;
+	struct cxl_nvdimm_bridge *cxl_nvb = cxlmd->cxl_nvb;	/* saved before the pointers are cleared */
+
+	cxl_nvd->cxlmd = NULL;
+	cxlmd->cxl_nvd = NULL;
+	cxlmd->cxl_nvb = NULL;
+	device_unregister(&cxl_nvd->dev);
+	put_device(&cxl_nvb->dev);	/* drops the bridge reference @cxlmd was carrying */
+}
+
+/**
+ * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm
+ * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations
+ *
+ * Return: 0 on success negative error code on failure.
+ */
+int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd)
+{
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct cxl_nvdimm *cxl_nvd;
+	struct device *dev;
+	int rc;
+
+	cxl_nvb = cxl_find_nvdimm_bridge(cxlmd);
+	if (!cxl_nvb)
+		return -ENODEV;	/* no nvdimm bridge found for this memdev */
+
+	cxl_nvd = cxl_nvdimm_alloc(cxl_nvb, cxlmd);
+	if (IS_ERR(cxl_nvd)) {
+		rc = PTR_ERR(cxl_nvd);
+		goto err_alloc;
+	}
+	cxlmd->cxl_nvb = cxl_nvb;
+
+	dev = &cxl_nvd->dev;
+	rc = dev_set_name(dev, "pmem%d", cxlmd->id);
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	dev_dbg(&cxlmd->dev, "register %s\n", dev_name(dev));
+
+	/* @cxlmd carries a reference on @cxl_nvb until cxlmd_release_nvdimm */
+	return devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm, cxlmd);
+
+err:
+	put_device(dev);	/* cxl_nvdimm_release() frees cxl_nvd */
+err_alloc:
+	cxlmd->cxl_nvb = NULL;
+	cxlmd->cxl_nvd = NULL;
+	put_device(&cxl_nvb->dev);	/* drop the reference held on the bridge found above */
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm, CXL);
diff --git a/drivers/cxl/core/pmu.c b/drivers/cxl/core/pmu.c
new file mode 100644
index 0000000000..5d8e06b0ba
--- /dev/null
+++ b/drivers/cxl/core/pmu.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Huawei. All rights reserved. */
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <cxlmem.h>
+#include <pmu.h>
+#include <cxl.h>
+#include "core.h"
+
+/* Release callback wired up through cxl_pmu_type */
+static void cxl_pmu_release(struct device *dev)
+{
+	/* @dev lives inside the cxl_pmu allocation, so free the container */
+	kfree(to_cxl_pmu(dev));
+}
+
+const struct device_type cxl_pmu_type = {
+	.name = "cxl_pmu",
+	.release = cxl_pmu_release,
+};
+
+static void remove_dev(void *dev)
+{
+	device_unregister(dev);
+}
+
+int devm_cxl_pmu_add(struct device *parent, struct cxl_pmu_regs *regs,
+		     int assoc_id, int index, enum cxl_pmu_type type)
+{
+	struct cxl_pmu *pmu;
+	struct device *dev;
+	int rc = -EINVAL; /* result when @type matches no case in the switch */
+
+	pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
+	if (!pmu)
+		return -ENOMEM;
+
+	pmu->assoc_id = assoc_id;
+	pmu->index = index;
+	pmu->type = type;
+	pmu->base = regs->pmu;
+	dev = &pmu->dev;
+	device_initialize(dev);
+	device_set_pm_not_required(dev);
+	dev->parent = parent;
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_pmu_type;
+	switch (pmu->type) {
+	case CXL_PMU_MEMDEV:
+		rc = dev_set_name(dev, "pmu_mem%d.%d", assoc_id, index);
+		break;
+	}
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	return devm_add_action_or_reset(parent, remove_dev, dev);
+
+err:
+	put_device(&pmu->dev);	/* @pmu is freed by cxl_pmu_release() */
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_pmu_add, CXL);
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
new file mode 100644
index 0000000000..c67cc8c9d5
--- /dev/null
+++ b/drivers/cxl/core/port.c
@@ -0,0 +1,2092 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <linux/platform_device.h>
+#include <linux/memregion.h>
+#include <linux/workqueue.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <cxlmem.h>
+#include <cxlpci.h>
+#include <cxl.h>
+#include "core.h"
+
+/**
+ * DOC: cxl core
+ *
+ * The CXL core provides a set of interfaces that can be consumed by CXL aware
+ * drivers. The interfaces allow for creation, modification, and destruction of
+ * regions, memory devices, ports, and decoders. CXL aware drivers must register
+ * with the CXL core via these interfaces in order to be able to participate in
+ * cross-device interleave coordination. The CXL core also establishes and
+ * maintains the bridge to the nvdimm subsystem.
+ *
+ * CXL core introduces sysfs hierarchy to control the devices that are
+ * instantiated by the core.
+ */
+
+/*
+ * All changes to the interleave configuration occur with this lock held
+ * for write.
+ */
+DECLARE_RWSEM(cxl_region_rwsem);
+
+static DEFINE_IDA(cxl_port_ida);
+static DEFINE_XARRAY(cxl_root_buses);
+
+int cxl_num_decoders_committed(struct cxl_port *port)
+{
+	lockdep_assert_held(&cxl_region_rwsem);
+	/* commit_end is initialized to -1, which maps to a count of zero */
+	return 1 + port->commit_end;
+}
+
+static ssize_t devtype_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	return sysfs_emit(buf, "%s\n", dev->type->name);
+}
+static DEVICE_ATTR_RO(devtype);
+
+static int cxl_device_id(const struct device *dev)
+{
+	const struct device_type *type = dev->type;
+
+	if (type == &cxl_nvdimm_bridge_type)
+		return CXL_DEVICE_NVDIMM_BRIDGE;
+	if (type == &cxl_nvdimm_type)
+		return CXL_DEVICE_NVDIMM;
+	if (type == CXL_PMEM_REGION_TYPE())
+		return CXL_DEVICE_PMEM_REGION;
+	if (type == CXL_DAX_REGION_TYPE())
+		return CXL_DEVICE_DAX_REGION;
+	if (is_cxl_port(dev))
+		return is_cxl_root(to_cxl_port(dev)) ? CXL_DEVICE_ROOT :
+						       CXL_DEVICE_PORT;
+	if (is_cxl_memdev(dev))
+		return CXL_DEVICE_MEMORY_EXPANDER;
+	if (type == CXL_REGION_TYPE())
+		return CXL_DEVICE_REGION;
+	if (type == &cxl_pmu_type)
+		return CXL_DEVICE_PMU;
+	return 0;
+}
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	return sysfs_emit(buf, CXL_MODALIAS_FMT "\n", cxl_device_id(dev));
+}
+static DEVICE_ATTR_RO(modalias);
+
+static struct attribute *cxl_base_attributes[] = {
+	&dev_attr_devtype.attr,
+	&dev_attr_modalias.attr,
+	NULL,
+};
+
+struct attribute_group cxl_base_attribute_group = {
+	.attrs = cxl_base_attributes,
+};
+
+static ssize_t start_show(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+	return sysfs_emit(buf, "%#llx\n", cxld->hpa_range.start);
+}
+static DEVICE_ATTR_ADMIN_RO(start);
+
+static ssize_t size_show(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+	return sysfs_emit(buf, "%#llx\n", range_len(&cxld->hpa_range));
+}
+static DEVICE_ATTR_RO(size);
+
+#define CXL_DECODER_FLAG_ATTR(name, flag)                            \
+static ssize_t name##_show(struct device *dev,                       \
+			   struct device_attribute *attr, char *buf) \
+{                                                                    \
+	struct cxl_decoder *cxld = to_cxl_decoder(dev);              \
+                                                                     \
+	return sysfs_emit(buf, "%s\n",                               \
+			  (cxld->flags & (flag)) ? "1" : "0");       \
+}                                                                    \
+static DEVICE_ATTR_RO(name)
+
+CXL_DECODER_FLAG_ATTR(cap_pmem, CXL_DECODER_F_PMEM);
+CXL_DECODER_FLAG_ATTR(cap_ram, CXL_DECODER_F_RAM);
+CXL_DECODER_FLAG_ATTR(cap_type2, CXL_DECODER_F_TYPE2);
+CXL_DECODER_FLAG_ATTR(cap_type3, CXL_DECODER_F_TYPE3);
+CXL_DECODER_FLAG_ATTR(locked, CXL_DECODER_F_LOCK);
+
+static ssize_t target_type_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct cxl_decoder *decoder = to_cxl_decoder(dev);
+
+	if (decoder->target_type == CXL_DECODER_DEVMEM)
+		return sysfs_emit(buf, "accelerator\n");
+	if (decoder->target_type == CXL_DECODER_HOSTONLYMEM)
+		return sysfs_emit(buf, "expander\n");
+
+	/* every other target type value is reported as an error */
+	return -ENXIO;
+}
+static DEVICE_ATTR_RO(target_type);
+
+static ssize_t emit_target_list(struct cxl_switch_decoder *cxlsd, char *buf)
+{
+	struct cxl_decoder *decoder = &cxlsd->cxld;
+	ssize_t written = 0;
+	int idx;
+
+	for (idx = 0; idx < decoder->interleave_ways; idx++) {
+		struct cxl_dport *cur = cxlsd->target[idx];
+		const char *sep = "";
+		int len;
+
+		if (!cur)
+			break;
+		/* separate from the next entry only if that slot is populated */
+		if (idx + 1 < decoder->interleave_ways && cxlsd->target[idx + 1])
+			sep = ",";
+		len = sysfs_emit_at(buf, written, "%d%s", cur->port_id, sep);
+		if (len < 0)
+			return len;
+		written += len;
+	}
+
+	return written;
+}
+
+static ssize_t target_list_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct cxl_switch_decoder *sw_decoder = to_cxl_switch_decoder(dev);
+	int list_len, nl_len;
+
+	/* interleave configuration changes take this lock for write */
+	guard(rwsem_read)(&cxl_region_rwsem);
+	list_len = emit_target_list(sw_decoder, buf);
+	if (list_len < 0)
+		return list_len;
+
+	/* terminate the comma separated list */
+	nl_len = sysfs_emit_at(buf, list_len, "\n");
+	if (nl_len < 0)
+		return nl_len;
+
+	return list_len + nl_len;
+}
+static DEVICE_ATTR_RO(target_list);
+
+static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+
+	return sysfs_emit(buf, "%s\n", cxl_decoder_mode_name(cxled->mode));
+}
+
+static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
+			  const char *buf, size_t len)
+{
+	struct cxl_endpoint_decoder *ep_decoder = to_cxl_endpoint_decoder(dev);
+	enum cxl_decoder_mode requested;
+	ssize_t err;
+
+	/* "pmem" and "ram" are the only spellings accepted from userspace */
+	if (sysfs_streq(buf, "pmem"))
+		requested = CXL_DECODER_PMEM;
+	else if (sysfs_streq(buf, "ram"))
+		requested = CXL_DECODER_RAM;
+	else
+		return -EINVAL;
+
+	err = cxl_dpa_set_mode(ep_decoder, requested);
+	if (err)
+		return err;
+	return len;
+}
+static DEVICE_ATTR_RW(mode);
+
+static ssize_t dpa_resource_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+
+	guard(rwsem_read)(&cxl_dpa_rwsem);
+	return sysfs_emit(buf, "%#llx\n", (u64)cxl_dpa_resource_start(cxled));
+}
+static DEVICE_ATTR_RO(dpa_resource);
+
+static ssize_t dpa_size_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+	resource_size_t size = cxl_dpa_size(cxled);
+
+	return sysfs_emit(buf, "%pa\n", &size);
+}
+
+static ssize_t dpa_size_store(struct device *dev, struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	struct cxl_endpoint_decoder *ep_decoder = to_cxl_endpoint_decoder(dev);
+	unsigned long long bytes;
+	ssize_t err;
+
+	err = kstrtoull(buf, 0, &bytes);
+	if (err)
+		return err;
+
+	if (!IS_ALIGNED(bytes, SZ_256M))
+		return -EINVAL;
+
+	/* the current allocation is always freed before a new size applies */
+	err = cxl_dpa_free(ep_decoder);
+	if (err)
+		return err;
+
+	/* writing zero stops after the free */
+	if (bytes != 0)
+		err = cxl_dpa_alloc(ep_decoder, bytes);
+	if (err)
+		return err;
+
+	return len;
+}
+static DEVICE_ATTR_RW(dpa_size);
+
+static ssize_t interleave_granularity_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+	return sysfs_emit(buf, "%d\n", cxld->interleave_granularity);
+}
+
+static DEVICE_ATTR_RO(interleave_granularity);
+
+static ssize_t interleave_ways_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+	return sysfs_emit(buf, "%d\n", cxld->interleave_ways);
+}
+
+static DEVICE_ATTR_RO(interleave_ways);
+
+static struct attribute *cxl_decoder_base_attrs[] = {
+	&dev_attr_start.attr,
+	&dev_attr_size.attr,
+	&dev_attr_locked.attr,
+	&dev_attr_interleave_granularity.attr,
+	&dev_attr_interleave_ways.attr,
+	NULL,
+};
+
+static struct attribute_group cxl_decoder_base_attribute_group = {
+	.attrs = cxl_decoder_base_attrs,
+};
+
+static struct attribute *cxl_decoder_root_attrs[] = {
+	&dev_attr_cap_pmem.attr,
+	&dev_attr_cap_ram.attr,
+	&dev_attr_cap_type2.attr,
+	&dev_attr_cap_type3.attr,
+	&dev_attr_target_list.attr,
+	SET_CXL_REGION_ATTR(create_pmem_region)
+	SET_CXL_REGION_ATTR(create_ram_region)
+	SET_CXL_REGION_ATTR(delete_region)
+	NULL,
+};
+
+static bool can_create_pmem(struct cxl_root_decoder *cxlrd)
+{
+	const unsigned long required = CXL_DECODER_F_TYPE3 | CXL_DECODER_F_PMEM;
+
+	return (cxlrd->cxlsd.cxld.flags & required) == required; /* all bits */
+}
+
+static bool can_create_ram(struct cxl_root_decoder *cxlrd)
+{
+	const unsigned long required = CXL_DECODER_F_TYPE3 | CXL_DECODER_F_RAM;
+
+	return (cxlrd->cxlsd.cxld.flags & required) == required; /* all bits */
+}
+
+static umode_t cxl_root_decoder_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct cxl_root_decoder *root = to_cxl_root_decoder(kobj_to_dev(kobj));
+	bool hidden;
+
+	if (a == CXL_REGION_ATTR(create_pmem_region))
+		hidden = !can_create_pmem(root);
+	else if (a == CXL_REGION_ATTR(create_ram_region))
+		hidden = !can_create_ram(root);
+	else if (a == CXL_REGION_ATTR(delete_region))
+		hidden = !can_create_pmem(root) && !can_create_ram(root);
+	else
+		hidden = false;
+
+	/* returning a mode of 0 hides the attribute */
+	return hidden ? 0 : a->mode;
+}
+
+static struct attribute_group cxl_decoder_root_attribute_group = {
+	.attrs = cxl_decoder_root_attrs,
+	.is_visible = cxl_root_decoder_visible,
+};
+
+static const struct attribute_group *cxl_decoder_root_attribute_groups[] = {
+	&cxl_decoder_root_attribute_group,
+	&cxl_decoder_base_attribute_group,
+	&cxl_base_attribute_group,
+	NULL,
+};
+
+static struct attribute *cxl_decoder_switch_attrs[] = {
+	&dev_attr_target_type.attr,
+	&dev_attr_target_list.attr,
+	SET_CXL_REGION_ATTR(region)
+	NULL,
+};
+
+static struct attribute_group cxl_decoder_switch_attribute_group = {
+	.attrs = cxl_decoder_switch_attrs,
+};
+
+static const struct attribute_group *cxl_decoder_switch_attribute_groups[] = {
+	&cxl_decoder_switch_attribute_group,
+	&cxl_decoder_base_attribute_group,
+	&cxl_base_attribute_group,
+	NULL,
+};
+
+static struct attribute *cxl_decoder_endpoint_attrs[] = {
+	&dev_attr_target_type.attr,
+	&dev_attr_mode.attr,
+	&dev_attr_dpa_size.attr,
+	&dev_attr_dpa_resource.attr,
+	SET_CXL_REGION_ATTR(region)
+	NULL,
+};
+
+static struct attribute_group cxl_decoder_endpoint_attribute_group = {
+	.attrs = cxl_decoder_endpoint_attrs,
+};
+
+static const struct attribute_group *cxl_decoder_endpoint_attribute_groups[] = {
+	&cxl_decoder_base_attribute_group,
+	&cxl_decoder_endpoint_attribute_group,
+	&cxl_base_attribute_group,
+	NULL,
+};
+
+static void __cxl_decoder_release(struct cxl_decoder *cxld)
+{
+	struct cxl_port *owner = to_cxl_port(cxld->dev.parent);
+
+	ida_free(&owner->decoder_ida, cxld->id);	/* recycle the decoder id */
+	put_device(&owner->dev);	/* drop a reference on the parent port */
+}
+
+static void cxl_endpoint_decoder_release(struct device *dev)
+{
+	struct cxl_endpoint_decoder *ep_decoder = to_cxl_endpoint_decoder(dev);
+
+	__cxl_decoder_release(&ep_decoder->cxld);	/* common teardown first */
+	kfree(ep_decoder);
+}
+
+static void cxl_switch_decoder_release(struct device *dev)
+{
+	struct cxl_switch_decoder *sw_decoder = to_cxl_switch_decoder(dev);
+
+	__cxl_decoder_release(&sw_decoder->cxld);	/* common teardown first */
+	kfree(sw_decoder);
+}
+
+struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_root_decoder(dev),
+			  "not a cxl_root_decoder device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_root_decoder, cxlsd.cxld.dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_root_decoder, CXL);
+
+static void cxl_root_decoder_release(struct device *dev)
+{
+	struct cxl_root_decoder *root = to_cxl_root_decoder(dev);
+
+	if (atomic_read(&root->region_id) >= 0)	/* negative: nothing to free */
+		memregion_free(atomic_read(&root->region_id));
+	__cxl_decoder_release(&root->cxlsd.cxld);
+	kfree(root);
+}
+
+static const struct device_type cxl_decoder_endpoint_type = {
+	.name = "cxl_decoder_endpoint",
+	.release = cxl_endpoint_decoder_release,
+	.groups = cxl_decoder_endpoint_attribute_groups,
+};
+
+static const struct device_type cxl_decoder_switch_type = {
+	.name = "cxl_decoder_switch",
+	.release = cxl_switch_decoder_release,
+	.groups = cxl_decoder_switch_attribute_groups,
+};
+
+static const struct device_type cxl_decoder_root_type = {
+	.name = "cxl_decoder_root",
+	.release = cxl_root_decoder_release,
+	.groups = cxl_decoder_root_attribute_groups,
+};
+
+bool is_endpoint_decoder(struct device *dev)
+{
+	return dev->type == &cxl_decoder_endpoint_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_endpoint_decoder, CXL);
+
+bool is_root_decoder(struct device *dev)
+{
+	return dev->type == &cxl_decoder_root_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_root_decoder, CXL);
+
+bool is_switch_decoder(struct device *dev)
+{
+	return is_root_decoder(dev) || dev->type == &cxl_decoder_switch_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_switch_decoder, CXL);
+
+struct cxl_decoder *to_cxl_decoder(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev,
+			  !is_switch_decoder(dev) && !is_endpoint_decoder(dev),
+			  "not a cxl_decoder device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_decoder, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_decoder, CXL);
+
+struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_endpoint_decoder(dev),
+			  "not a cxl_endpoint_decoder device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_endpoint_decoder, cxld.dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_endpoint_decoder, CXL);
+
+struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_switch_decoder(dev),
+			  "not a cxl_switch_decoder device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_switch_decoder, cxld.dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_switch_decoder, CXL);
+
+static void cxl_ep_release(struct cxl_ep *ep)
+{
+	put_device(ep->ep);
+	kfree(ep);
+}
+
+static void cxl_ep_remove(struct cxl_port *port, struct cxl_ep *ep)
+{
+	if (ep) {	/* a NULL @ep is silently ignored */
+		xa_erase(&port->endpoints, (unsigned long) ep->ep);
+		cxl_ep_release(ep);
+	}
+}
+
+static void cxl_port_release(struct device *dev)
+{
+	struct cxl_port *port = to_cxl_port(dev);
+	struct cxl_ep *entry;
+	unsigned long key;
+
+	xa_for_each(&port->endpoints, key, entry)
+		cxl_ep_remove(port, entry);	/* erases and frees each entry */
+	xa_destroy(&port->endpoints);
+	xa_destroy(&port->dports);
+	xa_destroy(&port->regions);
+	ida_free(&cxl_port_ida, port->id);	/* id taken in cxl_port_alloc() */
+	kfree(port);
+}
+
+static const struct attribute_group *cxl_port_attribute_groups[] = {
+	&cxl_base_attribute_group,
+	NULL,
+};
+
+static const struct device_type cxl_port_type = {
+	.name = "cxl_port",
+	.release = cxl_port_release,
+	.groups = cxl_port_attribute_groups,
+};
+
+bool is_cxl_port(const struct device *dev)
+{
+	return dev->type == &cxl_port_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_port, CXL);
+
+struct cxl_port *to_cxl_port(const struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, dev->type != &cxl_port_type,
+			  "not a cxl_port device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_port, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_port, CXL);
+
+static void unregister_port(void *_port)
+{
+	struct cxl_port *port = _port;
+	struct cxl_port *parent = NULL;
+	struct device *locked;
+
+	if (!is_cxl_root(port))
+		parent = to_cxl_port(port->dev.parent);
+
+	/*
+	 * The CXL root port and the first level of ports below it are
+	 * unregistered under the platform firmware device lock; every
+	 * other port is unregistered while its parent port lock is held.
+	 */
+	if (parent && !is_cxl_root(parent))
+		locked = &parent->dev;
+	else if (parent)
+		locked = parent->uport_dev;
+	else
+		locked = port->uport_dev;
+
+	device_lock_assert(locked);
+
+	/* add_ep() returns -ENXIO for a port that has ->dead set */
+	port->dead = true;
+	device_unregister(&port->dev);
+}
+
+static void cxl_unlink_uport(void *_port)
+{
+	struct cxl_port *port = _port;
+
+	sysfs_remove_link(&port->dev.kobj, "uport");
+}
+
+static int devm_cxl_link_uport(struct device *host, struct cxl_port *port)
+{
+	struct kobject *target = &port->uport_dev->kobj;
+	int err = sysfs_create_link(&port->dev.kobj, target, "uport");
+
+	if (err)
+		return err;
+	/* removal of the link is queued as a devm action on @host */
+	return devm_add_action_or_reset(host, cxl_unlink_uport, port);
+}
+
+static void cxl_unlink_parent_dport(void *_port)
+{
+	struct cxl_port *port = _port;
+
+	sysfs_remove_link(&port->dev.kobj, "parent_dport");
+}
+
+static int devm_cxl_link_parent_dport(struct device *host,
+				      struct cxl_port *port,
+				      struct cxl_dport *parent_dport)
+{
+	int err;
+
+	if (!parent_dport)	/* no parent dport, no link to create */
+		return 0;
+
+	err = sysfs_create_link(&port->dev.kobj,
+				&parent_dport->dport_dev->kobj, "parent_dport");
+	if (err)
+		return err;
+	return devm_add_action_or_reset(host, cxl_unlink_parent_dport, port);
+}
+
+static struct lock_class_key cxl_port_key;
+
+static struct cxl_port *cxl_port_alloc(struct device *uport_dev,
+				       resource_size_t component_reg_phys,
+				       struct cxl_dport *parent_dport)
+{
+	struct cxl_port *port;
+	struct device *dev;
+	int rc;
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (!port)
+		return ERR_PTR(-ENOMEM);
+
+	rc = ida_alloc(&cxl_port_ida, GFP_KERNEL);
+	if (rc < 0)
+		goto err;
+	port->id = rc;	/* handed back via ida_free() in cxl_port_release() */
+	port->uport_dev = uport_dev;
+
+	/*
+	 * The top-level cxl_port "cxl_root" does not have a cxl_port as
+	 * its parent and it does not have any corresponding component
+	 * registers as its decode is described by a fixed platform
+	 * description.
+	 */
+	dev = &port->dev;
+	if (parent_dport) {
+		struct cxl_port *parent_port = parent_dport->port;
+		struct cxl_port *iter;
+
+		dev->parent = &parent_port->dev;
+		port->depth = parent_port->depth + 1; /* lockdep subclass below */
+		port->parent_dport = parent_dport;
+
+		/*
+		 * Walk up from @port until reaching an ancestor that already
+		 * recorded a host bridge, or the port just below the CXL root
+		 */
+		iter = port;
+		while (!iter->host_bridge &&
+		       !is_cxl_root(to_cxl_port(iter->dev.parent)))
+			iter = to_cxl_port(iter->dev.parent);
+		if (iter->host_bridge)
+			port->host_bridge = iter->host_bridge;
+		else if (parent_dport->rch)
+			port->host_bridge = parent_dport->dport_dev;
+		else
+			port->host_bridge = iter->uport_dev;
+		dev_dbg(uport_dev, "host-bridge: %s\n",
+			dev_name(port->host_bridge));
+	} else
+		dev->parent = uport_dev;
+
+	port->component_reg_phys = component_reg_phys;
+	ida_init(&port->decoder_ida);
+	port->hdm_end = -1;
+	port->commit_end = -1;	/* cxl_num_decoders_committed() yields 0 */
+	xa_init(&port->dports);
+	xa_init(&port->endpoints);
+	xa_init(&port->regions);
+
+	device_initialize(dev);
+	lockdep_set_class_and_subclass(&dev->mutex, &cxl_port_key, port->depth);
+	device_set_pm_not_required(dev);
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_port_type;
+
+	return port;
+
+err:
+	kfree(port);	/* device_initialize() not reached, so no put_device() */
+	return ERR_PTR(rc);
+}
+
+static int cxl_setup_comp_regs(struct device *host, struct cxl_register_map *map,
+			       resource_size_t component_reg_phys)
+{
+	struct cxl_register_map probe = {
+		.host = host,
+		.reg_type = CXL_REGLOC_RBI_COMPONENT,
+		.resource = component_reg_phys,
+		.max_size = CXL_COMPONENT_REG_BLOCK_SIZE,
+	};
+
+	if (component_reg_phys == CXL_RESOURCE_NONE)
+		return 0;	/* @map is left untouched in this case */
+	*map = probe;
+	return cxl_setup_regs(map);
+}
+
+static int cxl_port_setup_regs(struct cxl_port *port,
+			resource_size_t component_reg_phys)
+{
+	if (!dev_is_platform(port->uport_dev))
+		return cxl_setup_comp_regs(&port->dev, &port->comp_map,
+					   component_reg_phys);
+	return 0;	/* platform uport: register setup is skipped */
+}
+
+static int cxl_dport_setup_regs(struct device *host, struct cxl_dport *dport,
+				resource_size_t component_reg_phys)
+{
+	struct cxl_register_map *map = &dport->comp_map;
+	int err;
+
+	if (dev_is_platform(dport->dport_dev))
+		return 0;
+
+	/*
+	 * Probe with @dport->dport_dev as the device named in any error
+	 * messages, then overwrite the map's host with @host afterwards,
+	 * because @host is allowed to be NULL.
+	 */
+	err = cxl_setup_comp_regs(dport->dport_dev, map, component_reg_phys);
+	map->host = host;
+	return err;
+}
+
+static struct cxl_port *__devm_cxl_add_port(struct device *host,
+					    struct device *uport_dev,
+					    resource_size_t component_reg_phys,
+					    struct cxl_dport *parent_dport)
+{
+	struct cxl_port *port;
+	struct device *port_dev;
+	int err;
+
+	port = cxl_port_alloc(uport_dev, component_reg_phys, parent_dport);
+	if (IS_ERR(port))
+		return port;
+
+	port_dev = &port->dev;
+	if (is_cxl_memdev(uport_dev))
+		err = dev_set_name(port_dev, "endpoint%d", port->id);
+	else if (parent_dport)
+		err = dev_set_name(port_dev, "port%d", port->id);
+	else
+		err = dev_set_name(port_dev, "root%d", port->id);
+	if (err)
+		goto err_put;
+
+	err = cxl_port_setup_regs(port, component_reg_phys);
+	if (err)
+		goto err_put;
+
+	err = device_add(port_dev);
+	if (err)
+		goto err_put;
+	/* once added, teardown is left to the devm actions queued on @host */
+	err = devm_add_action_or_reset(host, unregister_port, port);
+	if (err)
+		return ERR_PTR(err);
+
+	err = devm_cxl_link_uport(host, port);
+	if (err)
+		return ERR_PTR(err);
+
+	err = devm_cxl_link_parent_dport(host, port, parent_dport);
+	if (err)
+		return ERR_PTR(err);
+
+	return port;
+
+err_put:
+	put_device(port_dev);
+	return ERR_PTR(err);
+}
+
+/**
+ * devm_cxl_add_port - register a cxl_port in the CXL memory decode hierarchy
+ * @host: host device for devm operations
+ * @uport_dev: "physical" device implementing this upstream port
+ * @component_reg_phys: (optional) for configurable cxl_port instances
+ * @parent_dport: next hop up in the CXL memory decode hierarchy
+ */
+struct cxl_port *devm_cxl_add_port(struct device *host,
+				   struct device *uport_dev,
+				   resource_size_t component_reg_phys,
+				   struct cxl_dport *parent_dport)
+{
+	struct cxl_port *parent = parent_dport ? parent_dport->port : NULL;
+	const char *parent_name = parent ? dev_name(&parent->dev) : "";
+	struct cxl_port *port;
+
+	port = __devm_cxl_add_port(host, uport_dev, component_reg_phys,
+				   parent_dport);
+	if (!IS_ERR(port)) {
+		dev_dbg(uport_dev, "%s added%s%s%s\n",
+			dev_name(&port->dev),
+			parent ? " to " : "",
+			parent_name,
+			parent ? "" : " (root port)");
+		return port;
+	}
+
+	dev_dbg(uport_dev, "Failed to add%s%s%s: %ld\n",
+		parent ? " port to " : "",
+		parent_name,
+		parent ? "" : " root port",
+		PTR_ERR(port));
+	return port;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_port, CXL);
+
+struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port)
+{
+	struct device *uport = port->uport_dev;
+
+	/* a CXL platform-root port has no pci_bus associated with it */
+	if (is_cxl_root(port))
+		return NULL;
+
+	/* PCI uports use their subordinate bus, others the registered one */
+	if (dev_is_pci(uport))
+		return to_pci_dev(uport)->subordinate;
+
+	return xa_load(&cxl_root_buses, (unsigned long)uport);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_to_pci_bus, CXL);
+
+static void unregister_pci_bus(void *uport_dev)
+{
+	xa_erase(&cxl_root_buses, (unsigned long)uport_dev);
+}
+
+int devm_cxl_register_pci_bus(struct device *host, struct device *uport_dev,
+			      struct pci_bus *bus)
+{
+	unsigned long key = (unsigned long)uport_dev;
+	int err;
+
+	/* PCI uports are resolved via pdev->subordinate, not this table */
+	if (dev_is_pci(uport_dev))
+		return -EINVAL;
+	err = xa_insert(&cxl_root_buses, key, bus, GFP_KERNEL);
+	if (err)
+		return err;
+	return devm_add_action_or_reset(host, unregister_pci_bus, uport_dev);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_register_pci_bus, CXL);
+
+static bool dev_is_cxl_root_child(struct device *dev)
+{
+	struct cxl_port *port, *parent;
+
+	/* anything that is not a cxl_port is rejected outright */
+	if (!is_cxl_port(dev))
+		return false;
+
+	/* the root port itself does not count as a child */
+	port = to_cxl_port(dev);
+	if (is_cxl_root(port))
+		return false;
+
+	/* what remains qualifies only when its parent is the root port */
+	parent = to_cxl_port(port->dev.parent);
+	return is_cxl_root(parent);
+}
+
+struct cxl_port *find_cxl_root(struct cxl_port *port)
+{
+	struct cxl_port *cur;
+
+	for (cur = port; cur && !is_cxl_root(cur); )
+		cur = to_cxl_port(cur->dev.parent);
+
+	/* a root that was found is returned with an extra device reference */
+	if (cur)
+		get_device(&cur->dev);
+	return cur;
+}
+EXPORT_SYMBOL_NS_GPL(find_cxl_root, CXL);
+
+static struct cxl_dport *find_dport(struct cxl_port *port, int id)
+{
+	struct cxl_dport *candidate;
+	unsigned long key;
+
+	device_lock_assert(&port->dev);
+	xa_for_each(&port->dports, key, candidate)
+		if (candidate->port_id == id)
+			return candidate;	/* first dport with this id */
+	return NULL;
+}
+
+static int add_dport(struct cxl_port *port, struct cxl_dport *dport)
+{
+	unsigned long key = (unsigned long)dport->dport_dev;
+	struct cxl_dport *clash;
+	int err;
+
+	device_lock_assert(&port->dev);
+	clash = find_dport(port, dport->port_id);
+	if (clash) {
+		dev_err(&port->dev,
+			"unable to add dport%d-%s non-unique port id (%s)\n",
+			dport->port_id, dev_name(dport->dport_dev),
+			dev_name(clash->dport_dev));
+		return -EBUSY;
+	}
+
+	/* dports are indexed by the address of their dport_dev */
+	err = xa_insert(&port->dports, key, dport, GFP_KERNEL);
+	if (err)
+		return err;
+	port->nr_dports++;
+	return 0;
+}
+
+/*
+ * Since root-level CXL dports cannot be enumerated by PCI they are not
+ * enumerated by the common port driver that acquires the port lock over
+ * dport add/remove. Instead, root dports are manually added by a
+ * platform driver and cond_cxl_root_lock() is used to take the missing
+ * port lock in that case.
+ */
+static void cond_cxl_root_lock(struct cxl_port *port)
+{
+	if (is_cxl_root(port))
+		device_lock(&port->dev);
+}
+
+static void cond_cxl_root_unlock(struct cxl_port *port)
+{
+	if (is_cxl_root(port))
+		device_unlock(&port->dev);
+}
+
+static void cxl_dport_remove(void *data)
+{
+	struct cxl_dport *dport = data;
+	struct device *dport_dev = dport->dport_dev;
+
+	xa_erase(&dport->port->dports, (unsigned long) dport_dev);
+	put_device(dport_dev);	/* pairs with get_device() at dport add time */
+}
+
+static void cxl_dport_unlink(void *data)
+{
+	struct cxl_dport *dport = data;
+	struct cxl_port *port = dport->port;
+	char link_name[CXL_TARGET_STRLEN];
+
+	snprintf(link_name, sizeof(link_name), "dport%d", dport->port_id); /* bounded */
+	sysfs_remove_link(&port->dev.kobj, link_name);
+}
+
+static struct cxl_dport *
+__devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev,
+		     int port_id, resource_size_t component_reg_phys,
+		     resource_size_t rcrb)
+{
+	char link_name[CXL_TARGET_STRLEN];
+	struct cxl_dport *dport;
+	struct device *host;
+	int rc;
+
+	if (is_cxl_root(port))
+		host = port->uport_dev;
+	else
+		host = &port->dev;
+
+	if (!host->driver) {
+		dev_WARN_ONCE(&port->dev, 1, "dport:%s bad devm context\n",
+			      dev_name(dport_dev));
+		return ERR_PTR(-ENXIO);
+	}
+	/* reject a @port_id whose link name would not fit in link_name[] */
+	if (snprintf(link_name, CXL_TARGET_STRLEN, "dport%d", port_id) >=
+	    CXL_TARGET_STRLEN)
+		return ERR_PTR(-EINVAL);
+
+	dport = devm_kzalloc(host, sizeof(*dport), GFP_KERNEL);
+	if (!dport)
+		return ERR_PTR(-ENOMEM);
+
+	dport->dport_dev = dport_dev;
+	dport->port_id = port_id;
+	dport->port = port;
+
+	if (rcrb == CXL_RESOURCE_NONE) {
+		rc = cxl_dport_setup_regs(&port->dev, dport,
+					  component_reg_phys);
+		if (rc)
+			return ERR_PTR(rc);
+	} else {
+		dport->rcrb.base = rcrb;
+		component_reg_phys = __rcrb_to_component(dport_dev, &dport->rcrb,
+							 CXL_RCRB_DOWNSTREAM);
+		if (component_reg_phys == CXL_RESOURCE_NONE) {
+			dev_warn(dport_dev, "Invalid Component Registers in RCRB\n");
+			return ERR_PTR(-ENXIO);
+		}
+
+		/*
+		 * RCH @dport is not ready to map until associated with its
+		 * memdev
+		 */
+		rc = cxl_dport_setup_regs(NULL, dport, component_reg_phys);
+		if (rc)
+			return ERR_PTR(rc);
+
+		dport->rch = true;
+	}
+
+	if (component_reg_phys != CXL_RESOURCE_NONE)
+		dev_dbg(dport_dev, "Component Registers found for dport: %pa\n",
+			&component_reg_phys);
+	/* add_dport() asserts the port lock; it is only taken here for roots */
+	cond_cxl_root_lock(port);
+	rc = add_dport(port, dport);
+	cond_cxl_root_unlock(port);
+	if (rc)
+		return ERR_PTR(rc);
+	/* this reference is dropped again by cxl_dport_remove() */
+	get_device(dport_dev);
+	rc = devm_add_action_or_reset(host, cxl_dport_remove, dport);
+	if (rc)
+		return ERR_PTR(rc);
+
+	rc = sysfs_create_link(&port->dev.kobj, &dport_dev->kobj, link_name);
+	if (rc)
+		return ERR_PTR(rc);
+
+	rc = devm_add_action_or_reset(host, cxl_dport_unlink, dport);
+	if (rc)
+		return ERR_PTR(rc);
+
+	return dport;
+}
+
+/**
+ * devm_cxl_add_dport - append VH downstream port data to a cxl_port
+ * @port: the cxl_port that references this dport
+ * @dport_dev: firmware or PCI device representing the dport
+ * @port_id: identifier for this dport in a decoder's target list
+ * @component_reg_phys: optional location of CXL component registers
+ *
+ * Note that dports are appended to the devm release actions of
+ * either the port's host (for root ports), or the port itself (for
+ * switch ports)
+ */
+struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port,
+				     struct device *dport_dev, int port_id,
+				     resource_size_t component_reg_phys)
+{
+	struct cxl_dport *added;
+
+	/* CXL_RESOURCE_NONE as the RCRB argument selects the non-RCH path */
+	added = __devm_cxl_add_dport(port, dport_dev, port_id,
+				     component_reg_phys, CXL_RESOURCE_NONE);
+	if (IS_ERR(added)) {
+		dev_dbg(dport_dev, "failed to add dport to %s: %ld\n",
+			dev_name(&port->dev), PTR_ERR(added));
+		return added;
+	}
+
+	dev_dbg(dport_dev, "dport added to %s\n", dev_name(&port->dev));
+	return added;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport, CXL);
+
+/**
+ * devm_cxl_add_rch_dport - append RCH downstream port data to a cxl_port
+ * @port: the cxl_port that references this dport
+ * @dport_dev: firmware or PCI device representing the dport
+ * @port_id: identifier for this dport in a decoder's target list
+ * @rcrb: mandatory location of a Root Complex Register Block
+ *
+ * See CXL 3.0 9.11.8 CXL Devices Attached to an RCH
+ */
+struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port,
+					 struct device *dport_dev, int port_id,
+					 resource_size_t rcrb)
+{
+	struct cxl_dport *added;
+
+	if (rcrb == CXL_RESOURCE_NONE) {
+		dev_dbg(&port->dev, "failed to add RCH dport, missing RCRB\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* component registers are derived from the RCRB by the helper */
+	added = __devm_cxl_add_dport(port, dport_dev, port_id,
+				     CXL_RESOURCE_NONE, rcrb);
+	if (IS_ERR(added)) {
+		dev_dbg(dport_dev, "failed to add RCH dport to %s: %ld\n",
+			dev_name(&port->dev), PTR_ERR(added));
+		return added;
+	}
+
+	dev_dbg(dport_dev, "RCH dport added to %s\n", dev_name(&port->dev));
+	return added;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_rch_dport, CXL);
+
+static int add_ep(struct cxl_ep *new)
+{
+	struct cxl_port *port = new->dport->port;
+	int err = -ENXIO;
+
+	device_lock(&port->dev);
+	/*
+	 * A port marked dead by unregister_port() takes no new endpoints.
+	 */
+	if (!port->dead)
+		err = xa_insert(&port->endpoints, (unsigned long)new->ep, new,
+				GFP_KERNEL);
+	device_unlock(&port->dev);
+
+	return err;
+}
+
+/**
+ * cxl_add_ep - register an endpoint's interest in a port
+ * @dport: the dport that routes to @ep_dev
+ * @ep_dev: device representing the endpoint
+ *
+ * Intermediate CXL ports are scanned based on the arrival of endpoints.
+ * Once every endpoint that cares about a port has been removed again,
+ * that port can be destroyed.
+ */
+static int cxl_add_ep(struct cxl_dport *dport, struct device *ep_dev)
+{
+	struct cxl_ep *entry;
+	int err;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->ep = get_device(ep_dev);
+	entry->dport = dport;
+
+	err = add_ep(entry);
+	if (err)
+		cxl_ep_release(entry);	/* also drops the @ep_dev reference */
+	return err;
+}
+
+/* Search parameters handed to match_port_by_dport() via bus_find_device() */
+struct cxl_find_port_ctx {
+	/* device to look up among each candidate port's dports */
+	const struct device *dport_dev;
+	/* optional: only accept ports whose parent device is this port */
+	const struct cxl_port *parent_port;
+	/* optional: receives the dport lookup result for each port examined */
+	struct cxl_dport **dport;
+};
+
+/*
+ * bus_find_device() callback: match a cxl_port that has ctx->dport_dev as
+ * one of its dports, optionally restricted to children of ctx->parent_port.
+ * When ctx->dport is set it is overwritten with the lookup result (which
+ * may be NULL) for every port that passes the filters.
+ */
+static int match_port_by_dport(struct device *dev, const void *data)
+{
+	const struct cxl_find_port_ctx *ctx = data;
+	struct cxl_dport *found;
+
+	if (!is_cxl_port(dev))
+		return 0;
+
+	if (ctx->parent_port && dev->parent != &ctx->parent_port->dev)
+		return 0;
+
+	found = cxl_find_dport_by_dev(to_cxl_port(dev), ctx->dport_dev);
+	if (ctx->dport)
+		*ctx->dport = found;
+
+	return found ? 1 : 0;
+}
+
+/*
+ * Walk the cxl bus for a port matching @ctx. Returns NULL when no dport
+ * device was supplied or nothing matched. Callers in this file drop the
+ * returned port with put_device().
+ */
+static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx)
+{
+	struct device *match;
+
+	if (!ctx->dport_dev)
+		return NULL;
+
+	match = bus_find_device(&cxl_bus_type, NULL, ctx, match_port_by_dport);
+	if (!match)
+		return NULL;
+
+	return to_cxl_port(match);
+}
+
+/*
+ * Find any registered cxl_port that has @dport_dev as a dport. @dport, when
+ * non-NULL, receives the matching dport.
+ */
+static struct cxl_port *find_cxl_port(struct device *dport_dev,
+				      struct cxl_dport **dport)
+{
+	struct cxl_find_port_ctx ctx = {
+		.dport_dev = dport_dev,
+		.dport = dport,
+	};
+
+	return __find_cxl_port(&ctx);
+}
+
+/*
+ * Same lookup as find_cxl_port(), but only ports that are direct children
+ * of @parent_port are considered.
+ */
+static struct cxl_port *find_cxl_port_at(struct cxl_port *parent_port,
+					 struct device *dport_dev,
+					 struct cxl_dport **dport)
+{
+	struct cxl_find_port_ctx ctx = {
+		.dport_dev = dport_dev,
+		.parent_port = parent_port,
+		.dport = dport,
+	};
+
+	return __find_cxl_port(&ctx);
+}
+
+/*
+ * All users of grandparent() are using it to walk PCIe-like switch port
+ * hierarchy. A PCIe switch is comprised of a bridge device representing the
+ * upstream switch port and N bridges representing downstream switch ports. When
+ * bridges stack the grand-parent of a downstream switch port is another
+ * downstream switch port in the immediate ancestor switch.
+ *
+ * Returns NULL when @dev is NULL or has no parent.
+ */
+static struct device *grandparent(struct device *dev)
+{
+	if (!dev || !dev->parent)
+		return NULL;
+
+	return dev->parent->parent;
+}
+
+/*
+ * Pick the device used as devm host for @endpoint: the uport device of the
+ * parent port when that parent is the CXL root, else the parent port's own
+ * device.
+ */
+static struct device *endpoint_host(struct cxl_port *endpoint)
+{
+	struct cxl_port *parent = to_cxl_port(endpoint->dev.parent);
+
+	return is_cxl_root(parent) ? parent->uport_dev : &parent->dev;
+}
+
+/*
+ * devm action registered by cxl_endpoint_autoremove(): unregister the
+ * endpoint port of a memdev and drop the references that were taken when
+ * the action was registered.
+ */
+static void delete_endpoint(void *data)
+{
+	struct cxl_memdev *cxlmd = data;
+	struct cxl_port *endpoint = cxlmd->endpoint;
+	struct device *host = endpoint_host(endpoint);
+
+	device_lock(host);
+	/*
+	 * Only trigger the host's devm actions manually while the host is
+	 * still bound to a driver and the endpoint has not been marked dead.
+	 */
+	if (host->driver && !endpoint->dead) {
+		devm_release_action(host, cxl_unlink_parent_dport, endpoint);
+		devm_release_action(host, cxl_unlink_uport, endpoint);
+		devm_release_action(host, unregister_port, endpoint);
+	}
+	cxlmd->endpoint = NULL;
+	device_unlock(host);
+	/* pairs with the get_device() calls in cxl_endpoint_autoremove() */
+	put_device(&endpoint->dev);
+	put_device(host);
+}
+
+/**
+ * cxl_endpoint_autoremove - tie an endpoint port's lifetime to its memdev
+ * @cxlmd: memdev that owns @endpoint
+ * @endpoint: endpoint port to remove when @cxlmd's devm actions run
+ *
+ * Takes references on the endpoint and its host device, records the
+ * endpoint and its depth in @cxlmd, and registers delete_endpoint() as a
+ * devm action on the memdev device.
+ *
+ * Return: result of devm_add_action_or_reset().
+ */
+int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint)
+{
+	struct device *host = endpoint_host(endpoint);
+	struct device *dev = &cxlmd->dev;
+
+	/* both references are dropped again in delete_endpoint() */
+	get_device(host);
+	get_device(&endpoint->dev);
+	cxlmd->endpoint = endpoint;
+	cxlmd->depth = endpoint->depth;
+	return devm_add_action_or_reset(dev, delete_endpoint, cxlmd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_endpoint_autoremove, CXL);
+
+/*
+ * The natural end of life of a non-root 'cxl_port' is when its parent port goes
+ * through a ->remove() event ("top-down" unregistration). The unnatural trigger
+ * for a port to be unregistered is when all memdevs beneath that port have gone
+ * through ->remove(). This "bottom-up" removal selectively removes individual
+ * child ports manually. This depends on devm_cxl_add_port() to not change its
+ * devm action registration order, and for dports to have already been
+ * destroyed by reap_dports().
+ */
+static void delete_switch_port(struct cxl_port *port)
+{
+	/* the actions are hosted on the parent device of @port */
+	devm_release_action(port->dev.parent, cxl_unlink_parent_dport, port);
+	devm_release_action(port->dev.parent, cxl_unlink_uport, port);
+	devm_release_action(port->dev.parent, unregister_port, port);
+}
+
+/*
+ * Tear down every dport of @port by running its devm actions early and
+ * freeing the devm-managed dport allocation. Must be called with the
+ * device lock of @port held.
+ */
+static void reap_dports(struct cxl_port *port)
+{
+	struct cxl_dport *dport;
+	unsigned long index;
+
+	device_lock_assert(&port->dev);
+
+	xa_for_each(&port->dports, index, dport) {
+		devm_release_action(&port->dev, cxl_dport_unlink, dport);
+		devm_release_action(&port->dev, cxl_dport_remove, dport);
+		devm_kfree(&port->dev, dport);
+	}
+}
+
+/* Search key for port_has_memdev(): a memdev and the port depth to match */
+struct detach_ctx {
+	/* memdev whose endpoint tracking entry is being looked up */
+	struct cxl_memdev *cxlmd;
+	/* only ports whose ->depth equals this value are considered */
+	int depth;
+};
+
+/*
+ * bus_find_device() callback: match the cxl_port at depth ctx->depth that
+ * has an endpoint entry for ctx->cxlmd.
+ */
+static int port_has_memdev(struct device *dev, const void *data)
+{
+	const struct detach_ctx *ctx = data;
+
+	if (!is_cxl_port(dev))
+		return 0;
+
+	if (to_cxl_port(dev)->depth != ctx->depth)
+		return 0;
+
+	return cxl_ep_load(to_cxl_port(dev), ctx->cxlmd) != NULL;
+}
+
+/*
+ * devm action registered by devm_cxl_enumerate_ports(): remove the memdev's
+ * endpoint entry from each ancestor port, walking from depth
+ * cxlmd->depth - 1 down to depth 1, and delete any port that loses its
+ * last endpoint as a result.
+ */
+static void cxl_detach_ep(void *data)
+{
+	struct cxl_memdev *cxlmd = data;
+
+	for (int i = cxlmd->depth - 1; i >= 1; i--) {
+		struct cxl_port *port, *parent_port;
+		struct detach_ctx ctx = {
+			.cxlmd = cxlmd,
+			.depth = i,
+		};
+		struct device *dev;
+		struct cxl_ep *ep;
+		bool died = false;
+
+		dev = bus_find_device(&cxl_bus_type, NULL, &ctx,
+				      port_has_memdev);
+		if (!dev)
+			continue;
+		port = to_cxl_port(dev);
+
+		/* lock order: parent port first, then the port itself */
+		parent_port = to_cxl_port(port->dev.parent);
+		device_lock(&parent_port->dev);
+		device_lock(&port->dev);
+		ep = cxl_ep_load(port, cxlmd);
+		dev_dbg(&cxlmd->dev, "disconnect %s from %s\n",
+			ep ? dev_name(ep->ep) : "", dev_name(&port->dev));
+		cxl_ep_remove(port, ep);
+		if (ep && !port->dead && xa_empty(&port->endpoints) &&
+		    !is_cxl_root(parent_port) && parent_port->dev.driver) {
+			/*
+			 * This was the last ep attached to a dynamically
+			 * enumerated port. Block new cxl_add_ep() and garbage
+			 * collect the port.
+			 */
+			died = true;
+			port->dead = true;
+			reap_dports(port);
+		}
+		device_unlock(&port->dev);
+
+		/* port deletion runs with only the parent port lock held */
+		if (died) {
+			dev_dbg(&cxlmd->dev, "delete %s\n",
+				dev_name(&port->dev));
+			delete_switch_port(port);
+		}
+		/* drop the reference obtained from bus_find_device() */
+		put_device(&port->dev);
+		device_unlock(&parent_port->dev);
+	}
+}
+
+/*
+ * Look up the component register block of @dev. Returns CXL_RESOURCE_NONE
+ * for non-PCI devices, otherwise whatever cxl_find_regblock() left in
+ * map.resource.
+ *
+ * NOTE(review): the return value of cxl_find_regblock() is not checked;
+ * presumably it initializes map.resource on failure as well - confirm.
+ */
+static resource_size_t find_component_registers(struct device *dev)
+{
+	struct cxl_register_map map;
+
+	/*
+	 * Theoretically, CXL component registers can be hosted on a
+	 * non-PCI device, in practice, only cxl_test hits this case.
+	 */
+	if (dev_is_pci(dev)) {
+		cxl_find_regblock(to_pci_dev(dev), CXL_REGLOC_RBI_COMPONENT,
+				  &map);
+		return map.resource;
+	}
+
+	return CXL_RESOURCE_NONE;
+}
+
+/*
+ * Ensure a cxl_port exists for the switch/bridge identified by @uport_dev /
+ * @dport_dev and register @cxlmd as an endpoint of it.
+ *
+ * Returns 0 on success, -EAGAIN when the parent port does not exist yet
+ * (the caller iterates further up the hierarchy), -ENXIO when the topology
+ * root was reached, the parent port has no driver, the new port does not
+ * list @dport_dev as a dport, or the endpoint is unexpectedly already
+ * registered, or another negative errno from port / endpoint registration.
+ */
+static int add_port_attach_ep(struct cxl_memdev *cxlmd,
+			      struct device *uport_dev,
+			      struct device *dport_dev)
+{
+	struct device *dparent = grandparent(dport_dev);
+	struct cxl_port *port, *parent_port = NULL;
+	struct cxl_dport *dport, *parent_dport;
+	resource_size_t component_reg_phys;
+	int rc;
+
+	if (!dparent) {
+		/*
+		 * The iteration reached the topology root without finding the
+		 * CXL-root 'cxl_port' on a previous iteration, fail for now to
+		 * be re-probed after platform driver attaches.
+		 */
+		dev_dbg(&cxlmd->dev, "%s is a root dport\n",
+			dev_name(dport_dev));
+		return -ENXIO;
+	}
+
+	parent_port = find_cxl_port(dparent, &parent_dport);
+	if (!parent_port) {
+		/* iterate to create this parent_port */
+		return -EAGAIN;
+	}
+
+	device_lock(&parent_port->dev);
+	if (!parent_port->dev.driver) {
+		dev_warn(&cxlmd->dev,
+			 "port %s:%s disabled, failed to enumerate CXL.mem\n",
+			 dev_name(&parent_port->dev), dev_name(uport_dev));
+		port = ERR_PTR(-ENXIO);
+		goto out;
+	}
+
+	port = find_cxl_port_at(parent_port, dport_dev, &dport);
+	if (!port) {
+		component_reg_phys = find_component_registers(uport_dev);
+		port = devm_cxl_add_port(&parent_port->dev, uport_dev,
+					 component_reg_phys, parent_dport);
+		/* retry find to pick up the new dport information */
+		if (!IS_ERR(port)) {
+			port = find_cxl_port_at(parent_port, dport_dev, &dport);
+			/*
+			 * The new port may not list @dport_dev as a dport.
+			 * NULL is not an ERR_PTR(), so convert it to an error
+			 * here rather than dereferencing @port and a stale
+			 * @dport below.
+			 */
+			if (!port)
+				port = ERR_PTR(-ENXIO);
+		}
+	}
+out:
+	device_unlock(&parent_port->dev);
+
+	if (IS_ERR(port))
+		rc = PTR_ERR(port);
+	else {
+		dev_dbg(&cxlmd->dev, "add to new port %s:%s\n",
+			dev_name(&port->dev), dev_name(port->uport_dev));
+		rc = cxl_add_ep(dport, &cxlmd->dev);
+		if (rc == -EBUSY) {
+			/*
+			 * "can't" happen, but this error code means
+			 * something to the caller, so translate it.
+			 */
+			rc = -ENXIO;
+		}
+		/* drop the reference obtained by find_cxl_port_at() */
+		put_device(&port->dev);
+	}
+
+	/* drop the reference obtained by find_cxl_port() */
+	put_device(&parent_port->dev);
+	return rc;
+}
+
+/**
+ * devm_cxl_enumerate_ports - create the cxl_ports in a memdev's ancestry
+ * @cxlmd: memdev whose ancestor ports should be registered
+ *
+ * Walks up from @cxlmd two device levels at a time, attaching @cxlmd as an
+ * endpoint to every port that already exists and creating missing ports
+ * via add_port_attach_ep(). cxl_detach_ep() is registered as a devm action
+ * on the memdev to undo the endpoint registrations.
+ *
+ * Return: 0 on success (or when @cxlmd is an RCD and needs no intermediate
+ * ports), negative errno on failure.
+ */
+int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd)
+{
+	struct device *dev = &cxlmd->dev;
+	struct device *iter;
+	int rc;
+
+	/*
+	 * Skip intermediate port enumeration in the RCH case, there
+	 * are no ports in between a host bridge and an endpoint.
+	 */
+	if (cxlmd->cxlds->rcd)
+		return 0;
+
+	rc = devm_add_action_or_reset(&cxlmd->dev, cxl_detach_ep, cxlmd);
+	if (rc)
+		return rc;
+
+	/*
+	 * Scan for and add all cxl_ports in this device's ancestry.
+	 * Repeat until no more ports are added. Abort if a port add
+	 * attempt fails.
+	 */
+retry:
+	for (iter = dev; iter; iter = grandparent(iter)) {
+		struct device *dport_dev = grandparent(iter);
+		struct device *uport_dev;
+		struct cxl_dport *dport;
+		struct cxl_port *port;
+
+		/* ran out of ancestors: nothing more to scan */
+		if (!dport_dev)
+			return 0;
+
+		uport_dev = dport_dev->parent;
+		if (!uport_dev) {
+			dev_warn(dev, "at %s no parent for dport: %s\n",
+				 dev_name(iter), dev_name(dport_dev));
+			return -ENXIO;
+		}
+
+		dev_dbg(dev, "scan: iter: %s dport_dev: %s parent: %s\n",
+			dev_name(iter), dev_name(dport_dev),
+			dev_name(uport_dev));
+		port = find_cxl_port(dport_dev, &dport);
+		if (port) {
+			dev_dbg(&cxlmd->dev,
+				"found already registered port %s:%s\n",
+				dev_name(&port->dev),
+				dev_name(port->uport_dev));
+			rc = cxl_add_ep(dport, &cxlmd->dev);
+
+			/*
+			 * If the endpoint already exists in the port's list,
+			 * that's ok, it was added on a previous pass.
+			 * Otherwise, retry in add_port_attach_ep() after taking
+			 * the parent_port lock as the current port may be being
+			 * reaped.
+			 */
+			if (rc && rc != -EBUSY) {
+				put_device(&port->dev);
+				return rc;
+			}
+
+			/* Any more ports to add between this one and the root? */
+			if (!dev_is_cxl_root_child(&port->dev)) {
+				put_device(&port->dev);
+				continue;
+			}
+
+			put_device(&port->dev);
+			return 0;
+		}
+
+		rc = add_port_attach_ep(cxlmd, uport_dev, dport_dev);
+		/* port missing, try to add parent */
+		if (rc == -EAGAIN)
+			continue;
+		/* failed to add ep or port */
+		if (rc)
+			return rc;
+		/* port added, new descendants possible, start over */
+		goto retry;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_ports, CXL);
+
+/*
+ * Find the cxl_port that has the parent device of @pdev as a dport.
+ * @dport, when non-NULL, receives the matching dport.
+ */
+struct cxl_port *cxl_pci_find_port(struct pci_dev *pdev,
+				   struct cxl_dport **dport)
+{
+	return find_cxl_port(pdev->dev.parent, dport);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_pci_find_port, CXL);
+
+/*
+ * Find the cxl_port that has the grandparent device of @cxlmd as a dport.
+ * @dport, when non-NULL, receives the matching dport.
+ */
+struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd,
+				   struct cxl_dport **dport)
+{
+	return find_cxl_port(grandparent(&cxlmd->dev), dport);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_find_port, CXL);
+
+/*
+ * Resolve the port ids in @target_map to dports of @port and store them in
+ * cxlsd->target[], one per interleave way. A NULL @target_map is accepted
+ * and leaves the target list untouched.
+ *
+ * Returns 0 on success, -EINVAL when @port has no dports, -ENXIO when an
+ * id in @target_map has no matching dport. The device lock of @port must
+ * be held.
+ */
+static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd,
+				    struct cxl_port *port, int *target_map)
+{
+	int i;
+
+	if (!target_map)
+		return 0;
+
+	device_lock_assert(&port->dev);
+
+	if (xa_empty(&port->dports))
+		return -EINVAL;
+
+	/* scope-based guard for cxl_region_rwsem, taken for write */
+	guard(rwsem_write)(&cxl_region_rwsem);
+	for (i = 0; i < cxlsd->cxld.interleave_ways; i++) {
+		struct cxl_dport *dport = find_dport(port, target_map[i]);
+
+		if (!dport)
+			return -ENXIO;
+		cxlsd->target[i] = dport;
+	}
+
+	return 0;
+}
+
+/*
+ * Select the target dport of root decoder @cxlrd for position @pos by
+ * taking @pos modulo the decoder's interleave ways. Warns once and returns
+ * NULL when interleave ways and the number of targets disagree.
+ */
+struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos)
+{
+	struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd;
+	int ways = cxlsd->cxld.interleave_ways;
+
+	if (dev_WARN_ONCE(&cxlsd->cxld.dev, ways != cxlsd->nr_targets,
+			  "misconfigured root decoder\n"))
+		return NULL;
+
+	return cxlsd->target[pos % ways];
+}
+EXPORT_SYMBOL_NS_GPL(cxl_hb_modulo, CXL);
+
+/* lockdep class assigned to the device mutex of every decoder */
+static struct lock_class_key cxl_decoder_key;
+
+/**
+ * cxl_decoder_init - Common decoder setup / initialization
+ * @port: owning port of this decoder
+ * @cxld: common decoder properties to initialize
+ *
+ * A port may contain one or more decoders. Each of those decoders
+ * enable some address space for CXL.mem utilization. A decoder is
+ * expected to be configured by the caller before registering via
+ * cxl_decoder_add()
+ *
+ * Return: 0 on success, or the negative errno from ida_alloc(). On
+ * success the decoder device has been initialized and a reference on
+ * @port is held.
+ */
+static int cxl_decoder_init(struct cxl_port *port, struct cxl_decoder *cxld)
+{
+	struct device *dev;
+	int rc;
+
+	rc = ida_alloc(&port->decoder_ida, GFP_KERNEL);
+	if (rc < 0)
+		return rc;
+
+	/* need parent to stick around to release the id */
+	get_device(&port->dev);
+	cxld->id = rc;
+
+	dev = &cxld->dev;
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_decoder_key);
+	device_set_pm_not_required(dev);
+	dev->parent = &port->dev;
+	dev->bus = &cxl_bus_type;
+
+	/* Pre initialize an "empty" decoder */
+	cxld->interleave_ways = 1;
+	cxld->interleave_granularity = PAGE_SIZE;
+	cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+	cxld->hpa_range = (struct range) {
+		.start = 0,
+		.end = -1,
+	};
+
+	return 0;
+}
+
+/*
+ * Record @nr_targets in @cxlsd and run the common decoder init. Rejects
+ * target counts above CXL_DECODER_MAX_INTERLEAVE with -EINVAL.
+ */
+static int cxl_switch_decoder_init(struct cxl_port *port,
+				   struct cxl_switch_decoder *cxlsd,
+				   int nr_targets)
+{
+	if (nr_targets <= CXL_DECODER_MAX_INTERLEAVE) {
+		cxlsd->nr_targets = nr_targets;
+		return cxl_decoder_init(port, &cxlsd->cxld);
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * cxl_root_decoder_alloc - Allocate a root level decoder
+ * @port: owning CXL root of this decoder
+ * @nr_targets: static number of downstream targets
+ * @calc_hb: which host bridge covers the n'th position by granularity
+ *
+ * Return: A new cxl decoder to be registered by cxl_decoder_add(). A
+ * 'CXL root' decoder is one that decodes from a top-level / static platform
+ * firmware description of CXL resources into a CXL standard decode
+ * topology. On failure an ERR_PTR() is returned: -EINVAL when @port is not
+ * a CXL root, -ENOMEM on allocation failure, or the error from decoder
+ * init / memregion_alloc().
+ */
+struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
+						unsigned int nr_targets,
+						cxl_calc_hb_fn calc_hb)
+{
+	struct cxl_root_decoder *cxlrd;
+	struct cxl_switch_decoder *cxlsd;
+	struct cxl_decoder *cxld;
+	int rc;
+
+	if (!is_cxl_root(port))
+		return ERR_PTR(-EINVAL);
+
+	cxlrd = kzalloc(struct_size(cxlrd, cxlsd.target, nr_targets),
+			GFP_KERNEL);
+	if (!cxlrd)
+		return ERR_PTR(-ENOMEM);
+
+	cxlsd = &cxlrd->cxlsd;
+	rc = cxl_switch_decoder_init(port, cxlsd, nr_targets);
+	if (rc) {
+		/* init failed, so plain kfree() is used here */
+		kfree(cxlrd);
+		return ERR_PTR(rc);
+	}
+
+	cxlrd->calc_hb = calc_hb;
+	mutex_init(&cxlrd->range_lock);
+
+	cxld = &cxlsd->cxld;
+	cxld->dev.type = &cxl_decoder_root_type;
+	/*
+	 * cxl_root_decoder_release() special cases negative ids to
+	 * detect memregion_alloc() failures.
+	 */
+	atomic_set(&cxlrd->region_id, -1);
+	rc = memregion_alloc(GFP_KERNEL);
+	if (rc < 0) {
+		/* the device is initialized by now, drop it via put_device() */
+		put_device(&cxld->dev);
+		return ERR_PTR(rc);
+	}
+
+	atomic_set(&cxlrd->region_id, rc);
+	return cxlrd;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, CXL);
+
+/**
+ * cxl_switch_decoder_alloc - Allocate a switch level decoder
+ * @port: owning CXL switch port of this decoder
+ * @nr_targets: max number of dynamically addressable downstream targets
+ *
+ * Return: A new cxl decoder to be registered by cxl_decoder_add(). A
+ * 'switch' decoder is any decoder that can be enumerated by PCIe
+ * topology and the HDM Decoder Capability. This includes the decoders
+ * that sit between Switch Upstream Ports / Switch Downstream Ports and
+ * Host Bridges / Root Ports. An ERR_PTR() is returned when @port is a
+ * root or endpoint port, on allocation failure, or when decoder init fails.
+ */
+struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port,
+						    unsigned int nr_targets)
+{
+	struct cxl_switch_decoder *cxlsd;
+	int rc;
+
+	if (is_cxl_root(port) || is_cxl_endpoint(port))
+		return ERR_PTR(-EINVAL);
+
+	cxlsd = kzalloc(struct_size(cxlsd, target, nr_targets), GFP_KERNEL);
+	if (!cxlsd)
+		return ERR_PTR(-ENOMEM);
+
+	rc = cxl_switch_decoder_init(port, cxlsd, nr_targets);
+	if (rc) {
+		kfree(cxlsd);
+		return ERR_PTR(rc);
+	}
+
+	cxlsd->cxld.dev.type = &cxl_decoder_switch_type;
+	return cxlsd;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, CXL);
+
+/**
+ * cxl_endpoint_decoder_alloc - Allocate an endpoint decoder
+ * @port: owning port of this decoder
+ *
+ * Return: A new cxl decoder to be registered by cxl_decoder_add(), or an
+ * ERR_PTR() when @port is not an endpoint port, on allocation failure, or
+ * when decoder init fails.
+ */
+struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
+{
+	struct cxl_endpoint_decoder *cxled;
+	int rc;
+
+	if (!is_cxl_endpoint(port))
+		return ERR_PTR(-EINVAL);
+
+	cxled = kzalloc(sizeof(*cxled), GFP_KERNEL);
+	if (!cxled)
+		return ERR_PTR(-ENOMEM);
+
+	/* start with a position of -1 */
+	cxled->pos = -1;
+	rc = cxl_decoder_init(port, &cxled->cxld);
+	if (rc) {
+		kfree(cxled);
+		return ERR_PTR(rc);
+	}
+
+	cxled->cxld.dev.type = &cxl_decoder_endpoint_type;
+	return cxled;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_alloc, CXL);
+
+/**
+ * cxl_decoder_add_locked - Add a decoder with targets
+ * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc()
+ * @target_map: A list of downstream ports that this decoder can direct memory
+ *              traffic to. These numbers should correspond with the port number
+ *              in the PCIe Link Capabilities structure.
+ *
+ * Certain types of decoders may not have any targets. The main example of this
+ * is an endpoint device. A more awkward example is a hostbridge whose root
+ * ports get hot added (technically possible, though unlikely).
+ *
+ * This is the locked variant of cxl_decoder_add().
+ *
+ * Context: Process context. Expects the device lock of the port that owns the
+ *	    @cxld to be held.
+ *
+ * Return: Negative error code if the decoder wasn't properly configured; else
+ *	   returns 0.
+ */
+int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map)
+{
+	struct cxl_port *port;
+	struct device *dev;
+	int rc;
+
+	if (WARN_ON_ONCE(!cxld))
+		return -EINVAL;
+
+	/* tolerate being handed the ERR_PTR() of a failed allocation */
+	if (WARN_ON_ONCE(IS_ERR(cxld)))
+		return PTR_ERR(cxld);
+
+	if (cxld->interleave_ways < 1)
+		return -EINVAL;
+
+	dev = &cxld->dev;
+
+	port = to_cxl_port(cxld->dev.parent);
+	if (!is_endpoint_decoder(dev)) {
+		struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev);
+
+		rc = decoder_populate_targets(cxlsd, port, target_map);
+		/* a target lookup failure is only fatal for enabled decoders */
+		if (rc && (cxld->flags & CXL_DECODER_F_ENABLE)) {
+			dev_err(&port->dev,
+				"Failed to populate active decoder targets\n");
+			return rc;
+		}
+	}
+
+	rc = dev_set_name(dev, "decoder%d.%d", port->id, cxld->id);
+	if (rc)
+		return rc;
+
+	return device_add(dev);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_decoder_add_locked, CXL);
+
+/**
+ * cxl_decoder_add - Add a decoder with targets
+ * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc()
+ * @target_map: A list of downstream ports that this decoder can direct memory
+ *              traffic to. These numbers should correspond with the port number
+ *              in the PCIe Link Capabilities structure.
+ *
+ * This is the unlocked variant of cxl_decoder_add_locked().
+ * See cxl_decoder_add_locked().
+ *
+ * Context: Process context. Takes and releases the device lock of the port that
+ *	    owns the @cxld.
+ */
+int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map)
+{
+	struct device *owner_dev;
+	int rc;
+
+	if (WARN_ON_ONCE(!cxld))
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(IS_ERR(cxld)))
+		return PTR_ERR(cxld);
+
+	/* the decoder's parent device is the owning port */
+	owner_dev = &to_cxl_port(cxld->dev.parent)->dev;
+
+	device_lock(owner_dev);
+	rc = cxl_decoder_add_locked(cxld, target_map);
+	device_unlock(owner_dev);
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_decoder_add, CXL);
+
+/*
+ * devm action registered by cxl_decoder_autoremove(): unregister a decoder
+ * device. Endpoint decoders are first passed to cxl_decoder_kill_region().
+ */
+static void cxld_unregister(void *dev)
+{
+	if (is_endpoint_decoder(dev))
+		cxl_decoder_kill_region(to_cxl_endpoint_decoder(dev));
+
+	device_unregister(dev);
+}
+
+/*
+ * Arrange for @cxld to be unregistered via cxld_unregister() when the devm
+ * actions of @host run. Returns the devm_add_action_or_reset() result.
+ */
+int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld)
+{
+	return devm_add_action_or_reset(host, cxld_unregister, &cxld->dev);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_decoder_autoremove, CXL);
+
+/**
+ * __cxl_driver_register - register a driver for the cxl bus
+ * @cxl_drv: cxl driver structure to attach
+ * @owner: owning module/driver
+ * @modname: KBUILD_MODNAME for parent driver
+ *
+ * Return: -EINVAL when @cxl_drv lacks a ->probe() callback, a ->name or an
+ * ->id, otherwise the result of driver_register().
+ */
+int __cxl_driver_register(struct cxl_driver *cxl_drv, struct module *owner,
+			  const char *modname)
+{
+	if (!cxl_drv->probe) {
+		pr_debug("%s ->probe() must be specified\n", modname);
+		return -EINVAL;
+	}
+
+	if (!cxl_drv->name) {
+		pr_debug("%s ->name must be specified\n", modname);
+		return -EINVAL;
+	}
+
+	if (!cxl_drv->id) {
+		pr_debug("%s ->id must be specified\n", modname);
+		return -EINVAL;
+	}
+
+	/* fill in the generic driver fields from the cxl driver description */
+	cxl_drv->drv.bus = &cxl_bus_type;
+	cxl_drv->drv.owner = owner;
+	cxl_drv->drv.mod_name = modname;
+	cxl_drv->drv.name = cxl_drv->name;
+
+	return driver_register(&cxl_drv->drv);
+}
+EXPORT_SYMBOL_NS_GPL(__cxl_driver_register, CXL);
+
+/* Unregister a driver previously registered with __cxl_driver_register() */
+void cxl_driver_unregister(struct cxl_driver *cxl_drv)
+{
+	driver_unregister(&cxl_drv->drv);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_driver_unregister, CXL);
+
+/* Bus ->uevent(): publish a MODALIAS derived from the device's cxl id */
+static int cxl_bus_uevent(const struct device *dev, struct kobj_uevent_env *env)
+{
+	return add_uevent_var(env, "MODALIAS=" CXL_MODALIAS_FMT,
+			      cxl_device_id(dev));
+}
+
+/* Bus ->match(): a driver matches when its id equals the device's cxl id */
+static int cxl_bus_match(struct device *dev, struct device_driver *drv)
+{
+	return cxl_device_id(dev) == to_cxl_drv(drv)->id;
+}
+
+/* Bus ->probe(): forward to the cxl driver's ->probe() and log the result */
+static int cxl_bus_probe(struct device *dev)
+{
+	int rc;
+
+	rc = to_cxl_drv(dev->driver)->probe(dev);
+	dev_dbg(dev, "probe: %d\n", rc);
+	return rc;
+}
+
+/* Bus ->remove(): forward to the cxl driver's optional ->remove() */
+static void cxl_bus_remove(struct device *dev)
+{
+	struct cxl_driver *drv = to_cxl_drv(dev->driver);
+
+	if (!drv->remove)
+		return;
+
+	drv->remove(dev);
+}
+
+/* workqueue for cxl bus work; allocated in cxl_core_init() */
+static struct workqueue_struct *cxl_bus_wq;
+
+/* Work handler: rescan the cxl bus and log the result */
+static void cxl_bus_rescan_queue(struct work_struct *w)
+{
+	int rc = bus_rescan_devices(&cxl_bus_type);
+
+	pr_debug("CXL bus rescan result: %d\n", rc);
+}
+
+/* Queue a rescan of the cxl bus on cxl_bus_wq */
+void cxl_bus_rescan(void)
+{
+	/* a single static work item is shared by all callers */
+	static DECLARE_WORK(rescan_work, cxl_bus_rescan_queue);
+
+	queue_work(cxl_bus_wq, &rescan_work);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_bus_rescan, CXL);
+
+/* Drain all work queued on cxl_bus_wq */
+void cxl_bus_drain(void)
+{
+	drain_workqueue(cxl_bus_wq);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_bus_drain, CXL);
+
+/*
+ * Queue the memdev's detach_work on cxl_bus_wq. Returns the queue_work()
+ * result.
+ */
+bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd)
+{
+	return queue_work(cxl_bus_wq, &cxlmd->detach_work);
+}
+EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL);
+
+/*
+ * for user tooling to ensure port disable work has completed
+ *
+ * Writing "1" flushes cxl_bus_wq; any other value is rejected with -EINVAL.
+ */
+static ssize_t flush_store(const struct bus_type *bus, const char *buf, size_t count)
+{
+	if (!sysfs_streq(buf, "1"))
+		return -EINVAL;
+
+	flush_workqueue(cxl_bus_wq);
+	return count;
+}
+
+static BUS_ATTR_WO(flush);
+
+/* bus-level sysfs attributes: currently only the write-only 'flush' */
+static struct attribute *cxl_bus_attributes[] = {
+	&bus_attr_flush.attr,
+	NULL,
+};
+
+static struct attribute_group cxl_bus_attribute_group = {
+	.attrs = cxl_bus_attributes,
+};
+
+/* NULL terminated list referenced by cxl_bus_type.bus_groups */
+static const struct attribute_group *cxl_bus_attribute_groups[] = {
+	&cxl_bus_attribute_group,
+	NULL,
+};
+
+/* The "cxl" bus: wires the callbacks and bus attributes defined above */
+struct bus_type cxl_bus_type = {
+	.name = "cxl",
+	.uevent = cxl_bus_uevent,
+	.match = cxl_bus_match,
+	.probe = cxl_bus_probe,
+	.remove = cxl_bus_remove,
+	.bus_groups = cxl_bus_attribute_groups,
+};
+EXPORT_SYMBOL_NS_GPL(cxl_bus_type, CXL);
+
+/* top-level "cxl" debugfs directory, created in cxl_core_init() */
+static struct dentry *cxl_debugfs;
+
+/* Create debugfs directory @dir beneath the top-level "cxl" directory */
+struct dentry *cxl_debugfs_create_dir(const char *dir)
+{
+	return debugfs_create_dir(dir, cxl_debugfs);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_debugfs_create_dir, CXL);
+
+/*
+ * Core init: create the "cxl" debugfs directory, then bring up the mailbox
+ * and memdev support, the bus workqueue, the cxl bus and region support.
+ * On failure everything that was set up, including the debugfs directory,
+ * is torn down again in reverse order.
+ */
+static __init int cxl_core_init(void)
+{
+	int rc;
+
+	cxl_debugfs = debugfs_create_dir("cxl", NULL);
+
+	cxl_mbox_init();
+
+	rc = cxl_memdev_init();
+	if (rc)
+		goto err_memdev;
+
+	cxl_bus_wq = alloc_ordered_workqueue("cxl_port", 0);
+	if (!cxl_bus_wq) {
+		rc = -ENOMEM;
+		goto err_wq;
+	}
+
+	rc = bus_register(&cxl_bus_type);
+	if (rc)
+		goto err_bus;
+
+	rc = cxl_region_init();
+	if (rc)
+		goto err_region;
+
+	return 0;
+
+err_region:
+	bus_unregister(&cxl_bus_type);
+err_bus:
+	destroy_workqueue(cxl_bus_wq);
+err_wq:
+	cxl_memdev_exit();
+err_memdev:
+	/* do not leave the "cxl" debugfs directory behind on failure */
+	debugfs_remove_recursive(cxl_debugfs);
+	return rc;
+}
+
+/* Core exit: undo cxl_core_init() in reverse order */
+static void cxl_core_exit(void)
+{
+	cxl_region_exit();
+	bus_unregister(&cxl_bus_type);
+	destroy_workqueue(cxl_bus_wq);
+	cxl_memdev_exit();
+	debugfs_remove_recursive(cxl_debugfs);
+}
+
+subsys_initcall(cxl_core_init);
+module_exit(cxl_core_exit);
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
new file mode 100644
index 0000000000..6ebd12f797
--- /dev/null
+++ b/drivers/cxl/core/region.c
@@ -0,0 +1,3006 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/memregion.h>
+#include <linux/genalloc.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/sort.h>
+#include <linux/idr.h>
+#include <cxlmem.h>
+#include <cxl.h>
+#include "core.h"
+
+/**
+ * DOC: cxl core region
+ *
+ * CXL Regions represent mapped memory capacity in system physical address
+ * space. Whereas the CXL Root Decoders identify the bounds of potential CXL
+ * Memory ranges, Regions represent the active mapped capacity by the HDM
+ * Decoder Capability structures throughout the Host Bridges, Switches, and
+ * Endpoints in the topology.
+ *
+ * Region configuration has ordering constraints. UUID may be set at any time
+ * but is only visible for persistent regions.
+ * 1. Interleave granularity
+ * 2. Interleave size
+ * 3. Decoder targets
+ */
+
+static struct cxl_region *to_cxl_region(struct device *dev);
+
+/*
+ * sysfs 'uuid' read: emit the region uuid for pmem regions and an empty
+ * line for every other mode. Runs under cxl_region_rwsem held for read.
+ */
+static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	if (cxlr->mode == CXL_DECODER_PMEM)
+		rc = sysfs_emit(buf, "%pUb\n", &cxlr->params.uuid);
+	else
+		rc = sysfs_emit(buf, "\n");
+
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+
+/*
+ * bus_for_each_dev() callback: return -EBUSY when region @match already
+ * carries the uuid passed in @data, 0 otherwise (including for devices
+ * that are not regions). Expects cxl_region_rwsem to be held.
+ */
+static int is_dup(struct device *match, void *data)
+{
+	const uuid_t *candidate = data;
+	struct cxl_region *cxlr;
+
+	if (!is_cxl_region(match))
+		return 0;
+
+	lockdep_assert_held(&cxl_region_rwsem);
+	cxlr = to_cxl_region(match);
+
+	if (!uuid_equal(&cxlr->params.uuid, candidate))
+		return 0;
+
+	dev_dbg(match, "already has uuid: %pUb\n", candidate);
+	return -EBUSY;
+}
+
+/*
+ * sysfs 'uuid' write: parse a uuid string and assign it to the region.
+ *
+ * Returns @len on success (including when the uuid is unchanged), -EINVAL
+ * for a wrong-length or null uuid, the uuid_parse() error for a malformed
+ * string, -EBUSY when the region state is CXL_CONFIG_ACTIVE or beyond or
+ * another region already uses the uuid.
+ */
+static ssize_t uuid_store(struct device *dev, struct device_attribute *attr,
+			  const char *buf, size_t len)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	uuid_t temp;
+	ssize_t rc;
+
+	/* exactly one character beyond the uuid string is expected */
+	if (len != UUID_STRING_LEN + 1)
+		return -EINVAL;
+
+	rc = uuid_parse(buf, &temp);
+	if (rc)
+		return rc;
+
+	if (uuid_is_null(&temp))
+		return -EINVAL;
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	/* rc is 0 here, so an unchanged uuid reports success */
+	if (uuid_equal(&p->uuid, &temp))
+		goto out;
+
+	rc = -EBUSY;
+	if (p->state >= CXL_CONFIG_ACTIVE)
+		goto out;
+
+	/* is_dup() returns -EBUSY for the first region with the same uuid */
+	rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup);
+	if (rc < 0)
+		goto out;
+
+	uuid_copy(&p->uuid, &temp);
+out:
+	up_write(&cxl_region_rwsem);
+
+	if (rc)
+		return rc;
+	return len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+/*
+ * Look up the region reference that @port holds for @cxlr. The port's
+ * regions xarray is indexed by the region pointer value.
+ */
+static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port,
+					  struct cxl_region *cxlr)
+{
+	return xa_load(&port->regions, (unsigned long)cxlr);
+}
+
+/*
+ * Invalidate CPU caches for CXL memory regions. When the platform cannot
+ * do so this fails with -ENXIO, unless CONFIG_CXL_REGION_INVALIDATION_TEST
+ * is enabled, in which case a one-time warning is logged and 0 is returned.
+ */
+static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
+{
+	if (cpu_cache_has_invalidate_memregion()) {
+		cpu_cache_invalidate_memregion(IORES_DESC_CXL);
+		return 0;
+	}
+
+	if (!IS_ENABLED(CONFIG_CXL_REGION_INVALIDATION_TEST)) {
+		dev_err(&cxlr->dev,
+			"Failed to synchronize CPU cache state\n");
+		return -ENXIO;
+	}
+
+	dev_warn_once(
+		&cxlr->dev,
+		"Bypassing cpu_cache_invalidate_memregion() for testing!\n");
+	return 0;
+}
+
+/*
+ * Reset the decoders of the first @count targets of @cxlr, walking the
+ * targets in reverse order. For each target the decoders of the ports on
+ * the path are reset top-down, followed by the endpoint decoder itself.
+ *
+ * Returns 0 when every decoder was reset, or the first error encountered.
+ * CXL_REGION_F_NEEDS_RESET is set as decoders are reset and only cleared
+ * once the whole walk succeeded.
+ */
+static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	int i, rc = 0;
+
+	/*
+	 * Before region teardown attempt to flush, and if the flush
+	 * fails cancel the region teardown for data consistency
+	 * concerns
+	 */
+	rc = cxl_region_invalidate_memregion(cxlr);
+	if (rc)
+		return rc;
+
+	for (i = count - 1; i >= 0; i--) {
+		struct cxl_endpoint_decoder *cxled = p->targets[i];
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_port *iter = cxled_to_port(cxled);
+		struct cxl_dev_state *cxlds = cxlmd->cxlds;
+		struct cxl_ep *ep;
+
+		/* an RCD has no intermediate ports to reset */
+		if (cxlds->rcd)
+			goto endpoint_reset;
+
+		/* climb to the port directly below the CXL root */
+		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
+			iter = to_cxl_port(iter->dev.parent);
+
+		/* then follow ep->next back down until it yields NULL */
+		for (ep = cxl_ep_load(iter, cxlmd); iter;
+		     iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
+			struct cxl_region_ref *cxl_rr;
+			struct cxl_decoder *cxld;
+
+			cxl_rr = cxl_rr_load(iter, cxlr);
+			cxld = cxl_rr->decoder;
+			if (cxld->reset)
+				rc = cxld->reset(cxld);
+			if (rc)
+				return rc;
+			set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+		}
+
+endpoint_reset:
+		rc = cxled->cxld.reset(&cxled->cxld);
+		if (rc)
+			return rc;
+		set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+	}
+
+	/* all decoders associated with this region have been torn down */
+	clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+
+	return 0;
+}
+
+/*
+ * Commit @cxld through its ->commit() callback. A decoder without that
+ * callback is only acceptable when it is a switch decoder with at most one
+ * target; anything else warns once and fails with -ENXIO.
+ */
+static int commit_decoder(struct cxl_decoder *cxld)
+{
+	struct cxl_switch_decoder *cxlsd;
+	bool commit_required = true;
+
+	if (cxld->commit)
+		return cxld->commit(cxld);
+
+	if (is_switch_decoder(&cxld->dev)) {
+		cxlsd = to_cxl_switch_decoder(&cxld->dev);
+		commit_required = !cxlsd || cxlsd->nr_targets > 1;
+	}
+
+	if (dev_WARN_ONCE(&cxld->dev, commit_required,
+			  "->commit() is required\n"))
+		return -ENXIO;
+	return 0;
+}
+
+/*
+ * Commit the decoders for every target of @cxlr. For each target the
+ * decoders are committed bottom-up from the endpoint port to the port
+ * below the root. On failure the decoders of the failing target are reset
+ * from the failing port downwards, then the previously committed targets
+ * are undone via cxl_region_decode_reset().
+ *
+ * Returns 0 on success or the commit error.
+ */
+static int cxl_region_decode_commit(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	int i, rc = 0;
+
+	for (i = 0; i < p->nr_targets; i++) {
+		struct cxl_endpoint_decoder *cxled = p->targets[i];
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_region_ref *cxl_rr;
+		struct cxl_decoder *cxld;
+		struct cxl_port *iter;
+		struct cxl_ep *ep;
+
+		/* commit bottom up */
+		for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
+		     iter = to_cxl_port(iter->dev.parent)) {
+			cxl_rr = cxl_rr_load(iter, cxlr);
+			cxld = cxl_rr->decoder;
+			rc = commit_decoder(cxld);
+			if (rc)
+				break;
+		}
+
+		if (rc) {
+			/* programming @iter failed, teardown */
+			for (ep = cxl_ep_load(iter, cxlmd); ep && iter;
+			     iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
+				cxl_rr = cxl_rr_load(iter, cxlr);
+				cxld = cxl_rr->decoder;
+				if (cxld->reset)
+					cxld->reset(cxld);
+			}
+
+			/* reset return values are ignored during teardown */
+			cxled->cxld.reset(&cxled->cxld);
+			goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	/* undo the targets that were successfully committed */
+	cxl_region_decode_reset(cxlr, i);
+	return rc;
+}
+
+/*
+ * sysfs 'commit' write: a true value commits the region's decoders, a
+ * false value releases the region driver and resets the decoders.
+ *
+ * Returns @len on success or when the region is already in the requested
+ * state, -ENXIO when asked to commit a region that is not yet
+ * CXL_CONFIG_ACTIVE, or the error from parsing, cache invalidation,
+ * commit or reset.
+ */
+static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t len)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	bool commit;
+	ssize_t rc;
+
+	rc = kstrtobool(buf, &commit);
+	if (rc)
+		return rc;
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	/* Already in the requested state? */
+	if (commit && p->state >= CXL_CONFIG_COMMIT)
+		goto out;
+	if (!commit && p->state < CXL_CONFIG_COMMIT)
+		goto out;
+
+	/* Not ready to commit? */
+	if (commit && p->state < CXL_CONFIG_ACTIVE) {
+		rc = -ENXIO;
+		goto out;
+	}
+
+	/*
+	 * Invalidate caches before region setup to drop any speculative
+	 * consumption of this address space
+	 */
+	rc = cxl_region_invalidate_memregion(cxlr);
+	if (rc)
+		goto out;
+
+	if (commit) {
+		rc = cxl_region_decode_commit(cxlr);
+		if (rc == 0)
+			p->state = CXL_CONFIG_COMMIT;
+	} else {
+		/* the rwsem is dropped around device_release_driver() */
+		p->state = CXL_CONFIG_RESET_PENDING;
+		up_write(&cxl_region_rwsem);
+		device_release_driver(&cxlr->dev);
+		down_write(&cxl_region_rwsem);
+
+		/*
+		 * The lock was dropped, so need to revalidate that the reset is
+		 * still pending.
+		 */
+		if (p->state == CXL_CONFIG_RESET_PENDING) {
+			rc = cxl_region_decode_reset(cxlr, p->interleave_ways);
+			/*
+			 * Revert to committed since there may still be active
+			 * decoders associated with this region, or move forward
+			 * to active to mark the reset successful
+			 */
+			if (rc)
+				p->state = CXL_CONFIG_COMMIT;
+			else
+				p->state = CXL_CONFIG_ACTIVE;
+		}
+	}
+
+out:
+	up_write(&cxl_region_rwsem);
+
+	if (rc)
+		return rc;
+	return len;
+}
+
+/* sysfs 'commit' read: emit 1 once the region has reached COMMIT, else 0. */
+static ssize_t commit_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct cxl_region_params *params = &to_cxl_region(dev)->params;
+	ssize_t ret = down_read_interruptible(&cxl_region_rwsem);
+
+	if (ret)
+		return ret;
+
+	ret = sysfs_emit(buf, "%d\n", params->state >= CXL_CONFIG_COMMIT);
+	up_read(&cxl_region_rwsem);
+
+	return ret;
+}
+static DEVICE_ATTR_RW(commit);
+
+static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
+				  int n)
+{
+	struct cxl_region *region = to_cxl_region(kobj_to_dev(kobj));
+	bool is_pmem = region->mode == CXL_DECODER_PMEM;
+
+	/*
+	 * 'uuid' is exposed read-only outside of pmem mode since tooling
+	 * expects to find it on every region; other modes are as declared.
+	 */
+	if (a != &dev_attr_uuid.attr || is_pmem)
+		return a->mode;
+	return 0444;
+}
+
+/* sysfs read: emit the region's interleave ways under the region rwsem. */
+static ssize_t interleave_ways_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct cxl_region_params *params = &to_cxl_region(dev)->params;
+	ssize_t ret = down_read_interruptible(&cxl_region_rwsem);
+
+	if (ret)
+		return ret;
+
+	ret = sysfs_emit(buf, "%d\n", params->interleave_ways);
+	up_read(&cxl_region_rwsem);
+
+	return ret;
+}
+
+static const struct attribute_group *get_cxl_region_target_group(void);
+
+/* sysfs write: validate and record the region's interleave ways. */
+static ssize_t interleave_ways_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t len)
+{
+	struct cxl_root_decoder *root = to_cxl_root_decoder(dev->parent);
+	struct cxl_decoder *root_cxld = &root->cxlsd.cxld;
+	struct cxl_region *region = to_cxl_region(dev);
+	struct cxl_region_params *params = &region->params;
+	unsigned int ways, prev_ways;
+	u8 eiw;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &ways);
+	if (ret)
+		return ret;
+
+	ret = ways_to_eiw(ways, &eiw);
+	if (ret)
+		return ret;
+
+	/*
+	 * The region interleave must be a power of 2 multiple of the host
+	 * bridge interleave, and that includes x3, x6, and x12 interleaves.
+	 */
+	if ((ways % root_cxld->interleave_ways) ||
+	    !is_power_of_2(ways / root_cxld->interleave_ways)) {
+		dev_dbg(&region->dev, "invalid interleave: %d\n", ways);
+		return -EINVAL;
+	}
+
+	ret = down_write_killable(&cxl_region_rwsem);
+	if (ret)
+		return ret;
+	if (params->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	prev_ways = params->interleave_ways;
+	params->interleave_ways = ways;
+	ret = sysfs_update_group(&region->dev.kobj,
+				 get_cxl_region_target_group());
+	if (ret)
+		params->interleave_ways = prev_ways;
+unlock:
+	up_write(&cxl_region_rwsem);
+	return ret ? ret : len;
+}
+static DEVICE_ATTR_RW(interleave_ways);
+
+/* sysfs read: emit the region's interleave granularity under the rwsem. */
+static ssize_t interleave_granularity_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct cxl_region_params *params = &to_cxl_region(dev)->params;
+	ssize_t ret = down_read_interruptible(&cxl_region_rwsem);
+
+	if (ret)
+		return ret;
+
+	ret = sysfs_emit(buf, "%d\n", params->interleave_granularity);
+	up_read(&cxl_region_rwsem);
+
+	return ret;
+}
+
+/* sysfs write: validate and record the region's interleave granularity. */
+static ssize_t interleave_granularity_store(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t len)
+{
+	struct cxl_root_decoder *root = to_cxl_root_decoder(dev->parent);
+	struct cxl_decoder *root_cxld = &root->cxlsd.cxld;
+	struct cxl_region_params *params = &to_cxl_region(dev)->params;
+	int granularity, ret;
+	u16 eig;
+
+	ret = kstrtoint(buf, 0, &granularity);
+	if (ret)
+		return ret;
+
+	ret = granularity_to_eig(granularity, &eig);
+	if (ret)
+		return ret;
+
+	/*
+	 * An interleaved host-bridge requires the region granularity to equal
+	 * the root granularity. A region granularity below the root
+	 * interleave would need multiple endpoints to back a single slot in
+	 * the interleave (possible to support in the future), while one above
+	 * the root interleave results in invalid DPA translations (invalid to
+	 * support).
+	 */
+	if (root_cxld->interleave_ways > 1 &&
+	    granularity != root_cxld->interleave_granularity)
+		return -EINVAL;
+
+	ret = down_write_killable(&cxl_region_rwsem);
+	if (ret)
+		return ret;
+
+	/* Granularity is frozen once the interleave configuration is active */
+	if (params->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
+		ret = -EBUSY;
+	else
+		params->interleave_granularity = granularity;
+
+	up_write(&cxl_region_rwsem);
+
+	return ret ? ret : len;
+}
+static DEVICE_ATTR_RW(interleave_granularity);
+
+/* sysfs read: emit the region's HPA base, or all-ones when unallocated. */
+static ssize_t resource_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct cxl_region *region = to_cxl_region(dev);
+	struct cxl_region_params *params = &region->params;
+	ssize_t ret = down_read_interruptible(&cxl_region_rwsem);
+	u64 base;
+
+	if (ret)
+		return ret;
+
+	base = params->res ? params->res->start : -1ULL;
+	ret = sysfs_emit(buf, "%#llx\n", base);
+	up_read(&cxl_region_rwsem);
+
+	return ret;
+}
+static DEVICE_ATTR_RO(resource);
+
+/* sysfs read: emit the name of the region's decoder mode. */
+static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	return sysfs_emit(buf, "%s\n",
+			  cxl_decoder_mode_name(to_cxl_region(dev)->mode));
+}
+static DEVICE_ATTR_RO(mode);
+
+/* Reserve @size bytes of HPA for @cxlr from the parent root decoder. */
+static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size)
+{
+	struct cxl_root_decoder *root = to_cxl_root_decoder(cxlr->dev.parent);
+	struct cxl_region_params *params = &cxlr->params;
+	struct resource *hpa;
+	u64 rem = 0;
+
+	lockdep_assert_held_write(&cxl_region_rwsem);
+
+	/* An existing allocation must be freed before it can be resized */
+	if (params->res)
+		return resource_size(params->res) == size ? 0 : -EBUSY;
+
+	/* No new allocation once the interleave configuration is active */
+	if (params->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
+		return -EBUSY;
+
+	/* ways, granularity and uuid (if PMEM) need to be set before HPA */
+	if (!params->interleave_ways || !params->interleave_granularity)
+		return -ENXIO;
+	if (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&params->uuid))
+		return -ENXIO;
+
+	/* The size must be a whole multiple of 256MB per interleave way */
+	div64_u64_rem(size, (u64)SZ_256M * params->interleave_ways, &rem);
+	if (rem)
+		return -EINVAL;
+
+	hpa = alloc_free_mem_region(root->res, size, SZ_256M,
+				    dev_name(&cxlr->dev));
+	if (IS_ERR(hpa)) {
+		dev_dbg(&cxlr->dev, "failed to allocate HPA: %ld\n",
+			PTR_ERR(hpa));
+		return PTR_ERR(hpa);
+	}
+
+	params->res = hpa;
+	params->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
+
+	return 0;
+}
+
+/* Remove (if inserted) and free the region's HPA resource, if it has one. */
+static void cxl_region_iomem_release(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *params = &cxlr->params;
+
+	if (device_is_registered(&cxlr->dev))
+		lockdep_assert_held_write(&cxl_region_rwsem);
+
+	if (!params->res)
+		return;
+
+	/* Autodiscovered regions may never have inserted their resource */
+	if (params->res->parent)
+		remove_resource(params->res);
+	kfree(params->res);
+	params->res = NULL;
+}
+
+/* Release the region's HPA allocation and return the region to idle. */
+static int free_hpa(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *params = &cxlr->params;
+
+	lockdep_assert_held_write(&cxl_region_rwsem);
+
+	if (params->res) {
+		/* The HPA range cannot be dropped once the region is active */
+		if (params->state >= CXL_CONFIG_ACTIVE)
+			return -EBUSY;
+		cxl_region_iomem_release(cxlr);
+		params->state = CXL_CONFIG_IDLE;
+	}
+	return 0;
+}
+
+/*
+ * sysfs 'size' write: a non-zero value allocates that much host physical
+ * address space for the region, while zero frees the current allocation.
+ * Both paths run with cxl_region_rwsem held for write.
+ * Returns @len on success or a negative errno.
+ */
+static ssize_t size_store(struct device *dev, struct device_attribute *attr,
+			  const char *buf, size_t len)
+{
+	struct cxl_region *region = to_cxl_region(dev);
+	u64 size;
+	int ret;
+
+	ret = kstrtou64(buf, 0, &size);
+	if (ret)
+		return ret;
+
+	ret = down_write_killable(&cxl_region_rwsem);
+	if (ret)
+		return ret;
+
+	ret = size ? alloc_hpa(region, size) : free_hpa(region);
+	up_write(&cxl_region_rwsem);
+
+	return ret ? ret : len;
+}
+
+/* sysfs 'size' read: emit the size of the region's HPA range, 0 if none. */
+static ssize_t size_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct cxl_region *region = to_cxl_region(dev);
+	struct cxl_region_params *params = &region->params;
+	ssize_t ret = down_read_interruptible(&cxl_region_rwsem);
+	u64 bytes;
+
+	if (ret)
+		return ret;
+
+	bytes = params->res ? resource_size(params->res) : 0;
+	ret = sysfs_emit(buf, "%#llx\n", bytes);
+	up_read(&cxl_region_rwsem);
+
+	return ret;
+}
+static DEVICE_ATTR_RW(size);
+
+static struct attribute *cxl_region_attrs[] = {
+	&dev_attr_uuid.attr,	/* mode may be overridden by cxl_region_visible() */
+	&dev_attr_commit.attr,
+	&dev_attr_interleave_ways.attr,
+	&dev_attr_interleave_granularity.attr,
+	&dev_attr_resource.attr,
+	&dev_attr_size.attr,
+	&dev_attr_mode.attr,
+	NULL,	/* list terminator */
+};
+
+static const struct attribute_group cxl_region_group = {
+	.attrs = cxl_region_attrs,
+	.is_visible = cxl_region_visible,	/* 'uuid' is 0444 unless pmem */
+};
+
+/*
+ * Emit the name of the endpoint decoder at interleave position @pos, or an
+ * empty line for an unassigned slot. Signed return: byte count or -errno.
+ */
+static ssize_t show_targetN(struct cxl_region *cxlr, char *buf, int pos)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_endpoint_decoder *cxled;
+	int rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	if (pos >= p->interleave_ways) {
+		dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
+			p->interleave_ways);
+		rc = -ENXIO;
+	} else {
+		cxled = p->targets[pos];
+		rc = sysfs_emit(buf, "%s\n",
+				cxled ? dev_name(&cxled->cxld.dev) : "");
+	}
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+
+/* device_find_child() callback: match the free switch decoder with id *@data. */
+static int match_free_decoder(struct device *dev, void *data)
+{
+	struct cxl_decoder *decoder;
+	int *next_id = data;
+
+	if (!is_switch_decoder(dev))
+		return 0;
+
+	/* enforce ordered allocation */
+	decoder = to_cxl_decoder(dev);
+	if (decoder->id != *next_id)
+		return 0;
+
+	if (decoder->region) {
+		/* in use, so the next candidate is the following id */
+		(*next_id)++;
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * device_find_child() callback: match the switch decoder whose HPA range is
+ * identical to the resource of the region parameters passed via @data.
+ */
+static int match_auto_decoder(struct device *dev, void *data)
+{
+	struct cxl_region_params *params = data;
+	struct resource *res = params->res;
+	struct range *hpa;
+
+	if (!is_switch_decoder(dev) || !res)
+		return 0;
+
+	hpa = &to_cxl_decoder(dev)->hpa_range;
+
+	return res->start == hpa->start && res->end == hpa->end;
+}
+
+/* Pick the switch decoder on @port that will host (or already maps) @cxlr. */
+static struct cxl_decoder *cxl_region_find_decoder(struct cxl_port *port,
+						   struct cxl_region *cxlr)
+{
+	struct device *match;
+	int id = 0;
+
+	if (!test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
+		match = device_find_child(&port->dev, &id, match_free_decoder);
+	else
+		match = device_find_child(&port->dev, &cxlr->params,
+					  match_auto_decoder);
+	if (!match)
+		return NULL;
+	/*
+	 * The endpoint decoder's registration keeps this decoder registered,
+	 * and endpoint decoder unregistration holds the cxl_region_rwsem
+	 * over unregister events, so the extra reference can be dropped now.
+	 */
+	put_device(match);
+	return to_cxl_decoder(match);
+}
+
+/*
+ * Track @cxlr's use of @port with a new region reference, refusing when
+ * @port already hosts a region at a higher HPA. Returns it or an ERR_PTR().
+ */
+static struct cxl_region_ref *alloc_region_ref(struct cxl_port *port,
+					       struct cxl_region *cxlr)
+{
+	struct cxl_region_params *params = &cxlr->params;
+	struct cxl_region_ref *ref, *other;
+	unsigned long index;
+	int ret;
+
+	xa_for_each(&port->regions, index, other) {
+		struct resource *other_res = other->region->params.res;
+
+		if (other_res && other_res->start > params->res->start) {
+			dev_dbg(&cxlr->dev,
+				"%s: HPA order violation %s:%pr vs %pr\n",
+				dev_name(&port->dev),
+				dev_name(&other->region->dev), other_res,
+				params->res);
+			return ERR_PTR(-EBUSY);
+		}
+	}
+
+	ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+	if (!ref)
+		return ERR_PTR(-ENOMEM);
+	ref->port = port;
+	ref->region = cxlr;
+	ref->nr_targets = 1;
+	xa_init(&ref->endpoints);
+
+	ret = xa_insert(&port->regions, (unsigned long)cxlr, ref, GFP_KERNEL);
+	if (!ret)
+		return ref;
+
+	dev_dbg(&cxlr->dev, "%s: failed to track region reference: %d\n",
+		dev_name(&port->dev), ret);
+	kfree(ref);
+	return ERR_PTR(ret);
+}
+
+/* Unbind @cxl_rr's decoder from its region, dropping the region reference. */
+static void cxl_rr_free_decoder(struct cxl_region_ref *cxl_rr)
+{
+	struct cxl_decoder *decoder = cxl_rr->decoder;
+	struct cxl_region *cxlr = cxl_rr->region;
+
+	if (!decoder)
+		return;
+	dev_WARN_ONCE(&cxlr->dev, decoder->region != cxlr, "region mismatch\n");
+	if (decoder->region != cxlr)
+		return;
+	decoder->region = NULL;
+	put_device(&cxlr->dev);
+}
+
+/* Release @cxl_rr's decoder, untrack it from its port, and free it. */
+static void free_region_ref(struct cxl_region_ref *cxl_rr)
+{
+	unsigned long key = (unsigned long)cxl_rr->region;
+
+	cxl_rr_free_decoder(cxl_rr);
+	xa_erase(&cxl_rr->port->regions, key);
+	xa_destroy(&cxl_rr->endpoints);
+	kfree(cxl_rr);
+}
+
+/* Track @cxled in @cxl_rr and bind the decoder to the region if unbound. */
+static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr,
+			 struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_ep *ep = cxl_ep_load(cxl_rr->port, cxled_to_memdev(cxled));
+	struct cxl_decoder *decoder = cxl_rr->decoder;
+	struct cxl_region *region = cxl_rr->region;
+
+	if (ep) {
+		int ret = xa_insert(&cxl_rr->endpoints, (unsigned long)cxled,
+				    ep, GFP_KERNEL);
+
+		if (ret)
+			return ret;
+	}
+	cxl_rr->nr_eps++;
+
+	if (decoder->region)
+		return 0;
+
+	decoder->region = region;
+	get_device(&region->dev);
+	return 0;
+}
+
+/* Select the decoder on @port that maps @cxlr and record it in @cxl_rr. */
+static int cxl_rr_alloc_decoder(struct cxl_port *port, struct cxl_region *cxlr,
+				struct cxl_endpoint_decoder *cxled,
+				struct cxl_region_ref *cxl_rr)
+{
+	bool is_endpoint_port = port == cxled_to_port(cxled);
+	struct cxl_decoder *decoder;
+
+	/* An endpoint port uses @cxled itself, switch ports need a lookup */
+	decoder = is_endpoint_port ? &cxled->cxld :
+				     cxl_region_find_decoder(port, cxlr);
+	if (!decoder) {
+		dev_dbg(&cxlr->dev, "%s: no decoder available\n",
+			dev_name(&port->dev));
+		return -EBUSY;
+	}
+
+	if (decoder->region) {
+		dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
+			dev_name(&port->dev), dev_name(&decoder->dev),
+			dev_name(&decoder->region->dev));
+		return -EBUSY;
+	}
+
+	/*
+	 * Backstop the assumption that endpoints already match the region
+	 * type; switch decoders simply take the type of what they map.
+	 */
+	dev_WARN_ONCE(&cxlr->dev,
+		      is_endpoint_port && decoder->target_type != cxlr->type,
+		      "%s:%s mismatch decoder type %d -> %d\n",
+		      dev_name(&cxled_to_memdev(cxled)->dev),
+		      dev_name(&decoder->dev), decoder->target_type,
+		      cxlr->type);
+	decoder->target_type = cxlr->type;
+	cxl_rr->decoder = decoder;
+	return 0;
+}
+
+/**
+ * cxl_port_attach_region() - track a region's interest in a port by endpoint
+ * @port: port to add a new region reference 'struct cxl_region_ref'
+ * @cxlr: region to attach to @port
+ * @cxled: endpoint decoder used to create or further pin a region reference
+ * @pos: interleave position of @cxled in @cxlr (only used for debug output)
+ *
+ * The attach event is an opportunity to validate CXL decode setup
+ * constraints and record metadata needed for programming HDM decoders,
+ * in particular decoder target lists.
+ *
+ * The steps are:
+ *
+ * - validate that no region with a higher HPA is already attached to @port
+ * - establish a region reference if one is not already present
+ *
+ *   - additionally allocate a decoder instance that will host @cxlr on @port
+ *
+ * - pin the region reference by the endpoint
+ * - account for how many entries in @port's target list are needed to
+ *   cover all of the added endpoints.
+ *
+ * Return: 0 on success, or a negative errno on failure.
+ */
+static int cxl_port_attach_region(struct cxl_port *port,
+				  struct cxl_region *cxlr,
+				  struct cxl_endpoint_decoder *cxled, int pos)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
+	struct cxl_region_ref *cxl_rr;
+	bool nr_targets_inc = false;	/* out_erase must undo the count */
+	struct cxl_decoder *cxld;
+	unsigned long index;
+	int rc = -EBUSY;
+
+	lockdep_assert_held_write(&cxl_region_rwsem);
+
+	cxl_rr = cxl_rr_load(port, cxlr);
+	if (cxl_rr) {
+		struct cxl_ep *ep_iter;
+		int found = 0;
+
+		/*
+		 * Walk the existing endpoints that have been attached to
+		 * @cxlr at @port and see if they share the same 'next' port
+		 * in the downstream direction. I.e. endpoints that share common
+		 * upstream switch.
+		 */
+		xa_for_each(&cxl_rr->endpoints, index, ep_iter) {
+			if (ep_iter == ep)
+				continue;
+			if (ep_iter->next == ep->next) {
+				found++;
+				break;
+			}
+		}
+
+		/*
+		 * New target port, or @port is an endpoint port that always
+		 * accounts its own local decode as a target.
+		 */
+		if (!found || !ep->next) {
+			cxl_rr->nr_targets++;
+			nr_targets_inc = true;
+		}
+	} else {
+		cxl_rr = alloc_region_ref(port, cxlr);
+		if (IS_ERR(cxl_rr)) {
+			dev_dbg(&cxlr->dev,
+				"%s: failed to allocate region reference\n",
+				dev_name(&port->dev));
+			return PTR_ERR(cxl_rr);
+		}
+		nr_targets_inc = true;
+
+		rc = cxl_rr_alloc_decoder(port, cxlr, cxled, cxl_rr);
+		if (rc)
+			goto out_erase;
+	}
+	cxld = cxl_rr->decoder;
+
+	rc = cxl_rr_ep_add(cxl_rr, cxled);
+	if (rc) {
+		dev_dbg(&cxlr->dev,
+			"%s: failed to track endpoint %s:%s reference\n",
+			dev_name(&port->dev), dev_name(&cxlmd->dev),
+			dev_name(&cxld->dev));
+		goto out_erase;
+	}
+
+	dev_dbg(&cxlr->dev,
+		"%s:%s %s add: %s:%s @ %d next: %s nr_eps: %d nr_targets: %d\n",
+		dev_name(port->uport_dev), dev_name(&port->dev),
+		dev_name(&cxld->dev), dev_name(&cxlmd->dev),
+		dev_name(&cxled->cxld.dev), pos,
+		ep ? ep->next ? dev_name(ep->next->uport_dev) :
+				      dev_name(&cxlmd->dev) :
+			   "none",
+		cxl_rr->nr_eps, cxl_rr->nr_targets);
+
+	return 0;
+out_erase:
+	if (nr_targets_inc)
+		cxl_rr->nr_targets--;
+	if (cxl_rr->nr_eps == 0)	/* no endpoint pins the reference */
+		free_region_ref(cxl_rr);
+	return rc;
+}
+
+/* Drop @cxled's use of @port for @cxlr, freeing the reference when unused. */
+static void cxl_port_detach_region(struct cxl_port *port,
+				   struct cxl_region *cxlr,
+				   struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_region_ref *ref;
+	struct cxl_ep *ep = NULL;
+
+	lockdep_assert_held_write(&cxl_region_rwsem);
+
+	ref = cxl_rr_load(port, cxlr);
+	if (!ref)
+		return;
+
+	/*
+	 * Endpoint ports carry no cxl_ep references and, by definition,
+	 * never target more than one endpoint
+	 */
+	if (ref->decoder != &cxled->cxld)
+		ep = xa_erase(&ref->endpoints, (unsigned long)cxled);
+	else
+		ref->nr_eps--;
+	if (ep) {
+		struct cxl_ep *peer;
+		unsigned long index;
+		bool next_shared = false;
+
+		ref->nr_eps--;
+		xa_for_each(&ref->endpoints, index, peer) {
+			if (peer->next == ep->next) {
+				next_shared = true;
+				break;
+			}
+		}
+		if (!next_shared)
+			ref->nr_targets--;
+	}
+	if (ref->nr_eps == 0)
+		free_region_ref(ref);
+}
+
+/*
+ * Verify that the endpoint @distance positions before @cxled in the region
+ * is routed through the same dport of @cxl_rr's port as @cxled is.
+ * Returns 0 on a match, -ENXIO otherwise.
+ */
+static int check_last_peer(struct cxl_endpoint_decoder *cxled,
+			   struct cxl_ep *ep, struct cxl_region_ref *cxl_rr,
+			   int distance)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_region *cxlr = cxl_rr->region;
+	struct cxl_port *port = cxl_rr->port;
+	struct cxl_endpoint_decoder *peer;
+	struct cxl_memdev *peer_md;
+	int pos = cxled->pos;
+
+	/*
+	 * Sharing a dport with the previously mapped endpoint requires that
+	 * endpoint, found at index 'position - distance', to be mapped by
+	 * this very dport as well.
+	 */
+	if (pos < distance) {
+		dev_dbg(&cxlr->dev, "%s:%s: cannot host %s:%s at %d\n",
+			dev_name(port->uport_dev), dev_name(&port->dev),
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
+		return -ENXIO;
+	}
+
+	peer = cxlr->params.targets[pos - distance];
+	peer_md = cxled_to_memdev(peer);
+	if (ep->dport == cxl_ep_load(port, peer_md)->dport)
+		return 0;
+
+	dev_dbg(&cxlr->dev, "%s:%s: %s:%s pos %d mismatched peer %s:%s\n",
+		dev_name(port->uport_dev), dev_name(&port->dev),
+		dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos,
+		dev_name(&peer_md->dev), dev_name(&peer->cxld.dev));
+	return -ENXIO;
+}
+
+/*
+ * Configure @port's decoder for @cxlr on behalf of @cxled: compute the
+ * interleave ways / granularity and record @cxled's dport as a target.
+ * For CXL_REGION_F_AUTO regions existing settings are validated instead.
+ */
+static int cxl_port_setup_targets(struct cxl_port *port,
+				  struct cxl_region *cxlr,
+				  struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+	int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos;
+	struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
+	struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_decoder *cxld = cxl_rr->decoder;
+	struct cxl_switch_decoder *cxlsd;
+	u16 eig, peig;
+	u8 eiw, peiw;
+
+	/*
+	 * While root level decoders support x3, x6, x12, switch level
+	 * decoders only support powers of 2 up to x16.
+	 */
+	if (!is_power_of_2(cxl_rr->nr_targets)) {
+		dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n",
+			dev_name(port->uport_dev), dev_name(&port->dev),
+			cxl_rr->nr_targets);
+		return -EINVAL;
+	}
+
+	cxlsd = to_cxl_switch_decoder(&cxld->dev);
+	if (cxl_rr->nr_targets_set) {
+		int i, distance;
+
+		/* Passthrough decoders impose no peer distance requirement */
+		if (cxl_rr->nr_targets == 1)
+			distance = 0;
+		else
+			distance = p->nr_targets / cxl_rr->nr_targets;
+		for (i = 0; i < cxl_rr->nr_targets_set; i++)
+			if (ep->dport == cxlsd->target[i]) {
+				rc = check_last_peer(cxled, ep, cxl_rr,
+						     distance);
+				if (rc)
+					return rc;
+				goto out_target_set;
+			}
+		goto add_target;	/* @ep's dport is not a target yet */
+	}
+
+	if (is_cxl_root(parent_port)) {
+		/*
+		 * Root decoder IG is always set to value in CFMWS which
+		 * may be different than this region's IG.  We can use the
+		 * region's IG here since interleave_granularity_store()
+		 * does not allow interleaved host-bridges with
+		 * root IG != region IG.
+		 */
+		parent_ig = p->interleave_granularity;
+		parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
+		/* Address bit routing in switch ports uses power-of-2 math */
+		if (!is_power_of_2(parent_iw))
+			parent_iw /= 3;
+	} else {
+		struct cxl_region_ref *parent_rr;
+		struct cxl_decoder *parent_cxld;
+
+		parent_rr = cxl_rr_load(parent_port, cxlr);
+		parent_cxld = parent_rr->decoder;
+		parent_ig = parent_cxld->interleave_granularity;
+		parent_iw = parent_cxld->interleave_ways;
+	}
+
+	rc = granularity_to_eig(parent_ig, &peig);
+	if (rc) {
+		dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n",
+			dev_name(parent_port->uport_dev),
+			dev_name(&parent_port->dev), parent_ig);
+		return rc;
+	}
+
+	rc = ways_to_eiw(parent_iw, &peiw);
+	if (rc) {
+		dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n",
+			dev_name(parent_port->uport_dev),
+			dev_name(&parent_port->dev), parent_iw);
+		return rc;
+	}
+
+	iw = cxl_rr->nr_targets;
+	rc = ways_to_eiw(iw, &eiw);
+	if (rc) {
+		dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n",
+			dev_name(port->uport_dev), dev_name(&port->dev), iw);
+		return rc;
+	}
+
+	/*
+	 * Interleave granularity is a multiple of @parent_port granularity.
+	 * Multiplier is the parent port interleave ways.
+	 */
+	rc = granularity_to_eig(parent_ig * parent_iw, &eig);
+	if (rc) {
+		dev_dbg(&cxlr->dev,
+			"%s: invalid granularity calculation (%d * %d)\n",
+			dev_name(&parent_port->dev), parent_ig, parent_iw);
+		return rc;
+	}
+
+	rc = eig_to_granularity(eig, &ig);
+	if (rc) {
+		dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n",
+			dev_name(port->uport_dev), dev_name(&port->dev),
+			256 << eig);
+		return rc;
+	}
+
+	/* The way count must fit the decoder's target list */
+	if (iw > 8 || iw > cxlsd->nr_targets) {
+		dev_dbg(&cxlr->dev,
+			"%s:%s:%s: ways: %d overflows targets: %d\n",
+			dev_name(port->uport_dev), dev_name(&port->dev),
+			dev_name(&cxld->dev), iw, cxlsd->nr_targets);
+		return -ENXIO;
+	}
+
+	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+		if (cxld->interleave_ways != iw ||
+		    cxld->interleave_granularity != ig ||
+		    cxld->hpa_range.start != p->res->start ||
+		    cxld->hpa_range.end != p->res->end ||
+		    ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
+			dev_err(&cxlr->dev,
+				"%s:%s %s expected iw: %d ig: %d %pr\n",
+				dev_name(port->uport_dev), dev_name(&port->dev),
+				__func__, iw, ig, p->res);
+			dev_err(&cxlr->dev,
+				"%s:%s %s got iw: %d ig: %d state: %s %#llx:%#llx\n",
+				dev_name(port->uport_dev), dev_name(&port->dev),
+				__func__, cxld->interleave_ways,
+				cxld->interleave_granularity,
+				(cxld->flags & CXL_DECODER_F_ENABLE) ?
+					"enabled" :
+					"disabled",
+				cxld->hpa_range.start, cxld->hpa_range.end);
+			return -ENXIO;
+		}
+	} else {
+		cxld->interleave_ways = iw;
+		cxld->interleave_granularity = ig;
+		cxld->hpa_range = (struct range) {
+			.start = p->res->start,
+			.end = p->res->end,
+		};
+	}
+	dev_dbg(&cxlr->dev, "%s:%s iw: %d ig: %d\n", dev_name(port->uport_dev),
+		dev_name(&port->dev), iw, ig);
+add_target:
+	if (cxl_rr->nr_targets_set == cxl_rr->nr_targets) {
+		dev_dbg(&cxlr->dev,
+			"%s:%s: targets full trying to add %s:%s at %d\n",
+			dev_name(port->uport_dev), dev_name(&port->dev),
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
+		return -ENXIO;
+	}
+	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+		if (cxlsd->target[cxl_rr->nr_targets_set] != ep->dport) {
+			dev_dbg(&cxlr->dev, "%s:%s: %s expected %s at %d\n",
+				dev_name(port->uport_dev), dev_name(&port->dev),
+				dev_name(&cxlsd->cxld.dev),
+				dev_name(ep->dport->dport_dev),
+				cxl_rr->nr_targets_set);
+			return -ENXIO;
+		}
+	} else
+		cxlsd->target[cxl_rr->nr_targets_set] = ep->dport;
+	inc = 1;	/* a new target slot was filled */
+out_target_set:
+	cxl_rr->nr_targets_set += inc;
+	dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n",
+		dev_name(port->uport_dev), dev_name(&port->dev),
+		cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport_dev),
+		dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
+
+	return 0;
+}
+
+/*
+ * Forget the targets programmed for @cxlr on @port and reset the HPA range
+ * of the decoder that was hosting the region there.
+ */
+static void cxl_port_reset_targets(struct cxl_port *port,
+				   struct cxl_region *cxlr)
+{
+	struct cxl_region_ref *ref = cxl_rr_load(port, cxlr);
+
+	/* The reference may be gone once the last endpoint was detached */
+	if (!ref)
+		return;
+
+	ref->nr_targets_set = 0;
+
+	ref->decoder->hpa_range = (struct range) {
+		.start = 0,
+		.end = -1,
+	};
+}
+
+/* Undo cxl_region_setup_targets() on every port that @cxlr's targets use. */
+static void cxl_region_teardown_targets(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *params = &cxlr->params;
+	int i;
+
+	/* Auto-discovered address space is already active: skip teardown */
+	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
+		return;
+
+	for (i = 0; i < params->nr_targets; i++) {
+		struct cxl_endpoint_decoder *cxled = params->targets[i];
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_dev_state *cxlds = cxlmd->cxlds;
+		struct cxl_port *port;
+		struct cxl_ep *ep;
+
+		if (cxlds->rcd)
+			continue;
+
+		/* Climb to the port directly below the root ... */
+		port = cxled_to_port(cxled);
+		while (!is_cxl_root(to_cxl_port(port->dev.parent)))
+			port = to_cxl_port(port->dev.parent);
+
+		/* ... then reset each port on the way back down */
+		ep = cxl_ep_load(port, cxlmd);
+		while (port) {
+			cxl_port_reset_targets(port, cxlr);
+			port = ep->next;
+			ep = cxl_ep_load(port, cxlmd);
+		}
+	}
+}
+
+/* Program / validate the decoder targets of every port that @cxlr uses. */
+static int cxl_region_setup_targets(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *params = &cxlr->params;
+	int i, ret, nr_rch = 0, nr_vh = 0;
+
+	for (i = 0; i < params->nr_targets; i++) {
+		struct cxl_endpoint_decoder *cxled = params->targets[i];
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_port *port;
+		struct cxl_ep *ep;
+
+		/* validate that all targets agree on topology */
+		if (cxlmd->cxlds->rcd) {
+			nr_rch++;
+			continue;
+		}
+		nr_vh++;
+
+		port = cxled_to_port(cxled);
+		while (!is_cxl_root(to_cxl_port(port->dev.parent)))
+			port = to_cxl_port(port->dev.parent);
+
+		/*
+		 * Walk down the topology from here, programming or
+		 * validating targets and watching for conflicts.
+		 */
+		ep = cxl_ep_load(port, cxlmd);
+		while (port) {
+			ret = cxl_port_setup_targets(port, cxlr, cxled);
+			if (ret)
+				goto err_teardown;
+			port = ep->next;
+			ep = cxl_ep_load(port, cxlmd);
+		}
+	}
+
+	if (nr_rch && nr_vh) {
+		dev_err(&cxlr->dev, "mismatched CXL topologies detected\n");
+		ret = -ENXIO;
+		goto err_teardown;
+	}
+
+	return 0;
+
+err_teardown:
+	cxl_region_teardown_targets(cxlr);
+	return ret;
+}
+
+/*
+ * Check that @cxled may take interleave position @pos within @cxlr.
+ * Returns 0 when the slot is usable, -ENXIO for an out-of-range position,
+ * or -EBUSY when the slot or the decoder's memdev is already in use.
+ */
+static int cxl_region_validate_position(struct cxl_region *cxlr,
+					struct cxl_endpoint_decoder *cxled,
+					int pos)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_region_params *params = &cxlr->params;
+	struct cxl_endpoint_decoder *other;
+	int i;
+
+	if (pos < 0 || pos >= params->interleave_ways) {
+		dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
+			params->interleave_ways);
+		return -ENXIO;
+	}
+
+	/* Nothing to validate if @cxled already occupies @pos */
+	other = params->targets[pos];
+	if (other == cxled)
+		return 0;
+
+	if (other) {
+		dev_dbg(&cxlr->dev, "position %d already assigned to %s:%s\n",
+			pos, dev_name(&cxled_to_memdev(other)->dev),
+			dev_name(&other->cxld.dev));
+		return -EBUSY;
+	}
+
+	/* A memdev may contribute at most one decoder to the region */
+	for (i = 0; i < params->interleave_ways; i++) {
+		other = params->targets[i];
+		if (!other || cxled_to_memdev(other) != cxlmd)
+			continue;
+
+		dev_dbg(&cxlr->dev,
+			"%s already specified at position %d via: %s\n",
+			dev_name(&cxlmd->dev), pos,
+			dev_name(&other->cxld.dev));
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/* Attach @cxlr at @pos to every port between @cxled and the root. */
+static int cxl_region_attach_position(struct cxl_region *cxlr,
+				      struct cxl_root_decoder *cxlrd,
+				      struct cxl_endpoint_decoder *cxled,
+				      const struct cxl_dport *dport, int pos)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_port *port;
+	int ret = 0;
+
+	if (cxlrd->calc_hb(cxlrd, pos) != dport) {
+		dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			dev_name(&cxlrd->cxlsd.cxld.dev));
+		return -ENXIO;
+	}
+
+	for (port = cxled_to_port(cxled); !is_cxl_root(port);
+	     port = to_cxl_port(port->dev.parent)) {
+		ret = cxl_port_attach_region(port, cxlr, cxled, pos);
+		if (ret)
+			break;
+	}
+	if (!ret)
+		return 0;
+	/* Unwind; detach is a no-op on ports without a region reference */
+	for (port = cxled_to_port(cxled); !is_cxl_root(port);
+	     port = to_cxl_port(port->dev.parent))
+		cxl_port_detach_region(port, cxlr, cxled);
+	return ret;
+}
+
+/* Append auto-discovered @cxled to @cxlr's targets at the next free slot. */
+static int cxl_region_attach_auto(struct cxl_region *cxlr,
+				  struct cxl_endpoint_decoder *cxled, int pos)
+{
+	struct cxl_region_params *params = &cxlr->params;
+	const char *name = dev_name(&cxled->cxld.dev);
+
+	if (cxled->state != CXL_DECODER_STATE_AUTO) {
+		dev_err(&cxlr->dev,
+			"%s: unable to add decoder to autodetected region\n",
+			name);
+		return -EINVAL;
+	}
+
+	if (pos >= 0) {
+		dev_dbg(&cxlr->dev, "%s: expected auto position, not %d\n",
+			name, pos);
+		return -EINVAL;
+	}
+
+	if (params->nr_targets >= params->interleave_ways) {
+		dev_err(&cxlr->dev, "%s: no more target slots available\n",
+			name);
+		return -ENXIO;
+	}
+
+	/*
+	 * Record the endpoint decoder in the next free target slot for now.
+	 * Userspace may therefore observe devices at the wrong position
+	 * before the region activates, and has to be careful to understand
+	 * when it might be racing region autodiscovery.
+	 */
+	cxled->pos = params->nr_targets;
+	params->targets[params->nr_targets++] = cxled;
+
+	return 0;
+}
+
+/* Comparator over pointers to endpoint decoders: ascending ->pos order. */
+static int cmp_interleave_pos(const void *a, const void *b)
+{
+	struct cxl_endpoint_decoder * const *lhs = a, * const *rhs = b;
+
+	return (*lhs)->pos - (*rhs)->pos;
+}
+
+/* Return the port of @port's parent dport, or NULL if there is no dport. */
+static struct cxl_port *next_port(struct cxl_port *port)
+{
+	return port->parent_dport ? port->parent_dport->port :
+				    NULL;
+}
+
+/*
+ * device_find_child() callback: a root decoder matches when its HPA range
+ * contains the range at @data, any other switch decoder on an exact match.
+ */
+static int match_switch_decoder_by_range(struct device *dev, void *data)
+{
+	struct range *want = data, *have;
+
+	if (!is_switch_decoder(dev))
+		return 0;
+	have = &to_cxl_switch_decoder(dev)->cxld.hpa_range;
+	if (is_root_decoder(dev))
+		return range_contains(have, want);
+	return have->start == want->start && have->end == want->end;
+}
+
+static int find_pos_and_ways(struct cxl_port *port, struct range *range,
+			     int *pos, int *ways)
+{
+	struct cxl_port *parent = next_port(port);
+	struct cxl_switch_decoder *cxlsd;
+	struct device *match;
+	int i, rc = -ENXIO;
+
+	if (!parent)
+		return -ENXIO;
+
+	match = device_find_child(&parent->dev, range,
+				  match_switch_decoder_by_range);
+	if (!match) {
+		dev_err(port->uport_dev,
+			"failed to find decoder mapping %#llx-%#llx\n",
+			range->start, range->end);
+		return -ENXIO;
+	}
+
+	/* Report the decoder's ways, then look for @port in its targets */
+	cxlsd = to_cxl_switch_decoder(match);
+	*ways = cxlsd->cxld.interleave_ways;
+	for (i = 0; i < *ways; i++) {
+		if (cxlsd->target[i] != port->parent_dport)
+			continue;
+		*pos = i;
+		rc = 0;
+		break;
+	}
+	put_device(match);
+
+	return rc;
+}
+
+/**
+ * cxl_calc_interleave_pos() - calculate an endpoint position in a region
+ * @cxled: endpoint decoder member of given region
+ *
+ * The endpoint position is calculated by traversing the topology from
+ * the endpoint to the root decoder and iteratively applying this
+ * calculation:
+ *
+ *    position = position * parent_ways + parent_pos;
+ *
+ * ...where @position is inferred from switch and root decoder target lists.
+ *
+ * Return: position >= 0 on success
+ *	   -ENXIO on failure
+ */
+static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_port *iter, *port = cxled_to_port(cxled);
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct range *range = &cxled->cxld.hpa_range;
+	int parent_ways = 0, parent_pos = 0, pos = 0;
+	int rc;
+
+	/*
+	 * Example: the expected interleave order of the 4-way region shown
+	 * below is: mem0, mem2, mem1, mem3
+	 *
+	 *		  root_port
+	 *                 /      \
+	 *      host_bridge_0    host_bridge_1
+	 *        |    |           |    |
+	 *       mem0 mem1        mem2 mem3
+	 *
+	 * In the example the calculator will iterate twice. The first iteration
+	 * uses the mem position in the host-bridge and the ways of the host-
+	 * bridge to generate the first, or local, position. The second
+	 * iteration uses the host-bridge position in the root_port and the ways
+	 * of the root_port to refine the position.
+	 *
+	 * A trace of the calculation per endpoint looks like this:
+	 * mem0: pos = 0 * 2 + 0    mem2: pos = 0 * 2 + 0
+	 *       pos = 0 * 2 + 0          pos = 0 * 2 + 1
+	 *       pos: 0                   pos: 1
+	 *
+	 * mem1: pos = 0 * 2 + 1    mem3: pos = 0 * 2 + 1
+	 *       pos = 1 * 2 + 0          pos = 1 * 2 + 1
+	 *       pos: 2                   pos: 3
+	 *
+	 * Note that while this example is simple, the method applies to more
+	 * complex topologies, including those with switches.
+	 */
+
+	/* Walk from the endpoint port upwards, stopping at the CXL root */
+	for (iter = port; iter; iter = next_port(iter)) {
+		if (is_cxl_root(iter))
+			break;
+
+		rc = find_pos_and_ways(iter, range, &parent_pos, &parent_ways);
+		if (rc)
+			return rc;
+
+		pos = pos * parent_ways + parent_pos;
+	}
+
+	dev_dbg(&cxlmd->dev,
+		"decoder:%s parent:%s port:%s range:%#llx-%#llx pos:%d\n",
+		dev_name(&cxled->cxld.dev), dev_name(cxlmd->dev.parent),
+		dev_name(&port->dev), range->start, range->end, pos);
+
+	return pos;
+}
+
+static int cxl_region_sort_targets(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *params = &cxlr->params;
+	int idx, err = 0;
+
+	for (idx = 0; idx < params->nr_targets; idx++) {
+		struct cxl_endpoint_decoder *member = params->targets[idx];
+
+		member->pos = cxl_calc_interleave_pos(member);
+		/*
+		 * A failed calculation is remembered, yet the loop carries
+		 * on assigning ->pos to every member so that follow-on code
+		 * can use targets[member->pos] to find the member's entry.
+		 */
+		if (member->pos < 0)
+			err = -ENXIO;
+	}
+	/* Order the region's target list by interleave position */
+	sort(params->targets, params->nr_targets, sizeof(params->targets[0]),
+	     cmp_interleave_pos, NULL);
+
+	dev_dbg(&cxlr->dev, "region sort %s\n", err ? "failed" : "successful");
+	return err;
+}
+
+static int cxl_region_attach(struct cxl_region *cxlr,
+			     struct cxl_endpoint_decoder *cxled, int pos)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_port *ep_port, *root_port;
+	struct cxl_dport *dport;
+	int rc = -ENXIO;
+
+	if (cxled->mode != cxlr->mode) {
+		dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n",
+			dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode);
+		return -EINVAL;
+	}
+
+	if (cxled->mode == CXL_DECODER_DEAD) {
+		dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev));
+		return -ENODEV;
+	}
+
+	/* members are only accepted in the INTERLEAVE_ACTIVE config state */
+	if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) {
+		dev_dbg(&cxlr->dev, "region already active\n");
+		return -EBUSY;
+	} else if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) {
+		dev_dbg(&cxlr->dev, "interleave config missing\n");
+		return -ENXIO;
+	}
+
+	if (p->nr_targets >= p->interleave_ways) {
+		dev_dbg(&cxlr->dev, "region already has %d endpoints\n",
+			p->nr_targets);
+		return -EINVAL;
+	}
+
+	ep_port = cxled_to_port(cxled);
+	root_port = cxlrd_to_port(cxlrd);
+	dport = cxl_find_dport_by_dev(root_port, ep_port->host_bridge);
+	if (!dport) {
+		dev_dbg(&cxlr->dev, "%s:%s invalid target for %s\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			dev_name(cxlr->dev.parent));
+		return -ENXIO;
+	}
+
+	if (cxled->cxld.target_type != cxlr->type) {
+		dev_dbg(&cxlr->dev, "%s:%s type mismatch: %d vs %d\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			cxled->cxld.target_type, cxlr->type);
+		return -ENXIO;
+	}
+
+	if (!cxled->dpa_res) {
+		dev_dbg(&cxlr->dev, "%s:%s: missing DPA allocation.\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev));
+		return -ENXIO;
+	}
+
+	if (resource_size(cxled->dpa_res) * p->interleave_ways !=
+	    resource_size(p->res)) {
+		dev_dbg(&cxlr->dev,
+			"%s:%s: decoder-size-%#llx * ways-%d != region-size-%#llx\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			(u64)resource_size(cxled->dpa_res), p->interleave_ways,
+			(u64)resource_size(p->res));
+		return -EINVAL;
+	}
+
+	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+		int i;
+
+		rc = cxl_region_attach_auto(cxlr, cxled, pos);
+		if (rc)
+			return rc;
+
+		/* not every member has arrived yet, wait for the rest */
+		if (p->nr_targets < p->interleave_ways)
+			return 0;
+
+		/*
+		 * All targets are here, which implies all PCI enumeration that
+		 * affects this region has been completed. Walk the topology to
+		 * sort the devices into their relative region decode position.
+		 */
+		rc = cxl_region_sort_targets(cxlr);
+		if (rc)
+			return rc;
+
+		for (i = 0; i < p->nr_targets; i++) {
+			cxled = p->targets[i];
+			ep_port = cxled_to_port(cxled);
+			dport = cxl_find_dport_by_dev(root_port,
+						      ep_port->host_bridge);
+			rc = cxl_region_attach_position(cxlr, cxlrd, cxled,
+							dport, i);
+			if (rc)
+				return rc;
+		}
+
+		rc = cxl_region_setup_targets(cxlr);
+		if (rc)
+			return rc;
+
+		/*
+		 * If target setup succeeds in the autodiscovery case
+		 * then the region is already committed.
+		 */
+		p->state = CXL_CONFIG_COMMIT;
+
+		return 0;
+	}
+
+	rc = cxl_region_validate_position(cxlr, cxled, pos);
+	if (rc)
+		return rc;
+
+	rc = cxl_region_attach_position(cxlr, cxlrd, cxled, dport, pos);
+	if (rc)
+		return rc;
+
+	p->targets[pos] = cxled;
+	cxled->pos = pos;
+	p->nr_targets++;
+
+	if (p->nr_targets == p->interleave_ways) {
+		rc = cxl_region_setup_targets(cxlr);
+		if (rc)
+			return rc;
+		p->state = CXL_CONFIG_ACTIVE;
+	}
+
+	cxled->cxld.interleave_ways = p->interleave_ways;
+	cxled->cxld.interleave_granularity = p->interleave_granularity;
+	cxled->cxld.hpa_range = (struct range) {
+		.start = p->res->start,
+		.end = p->res->end,
+	};
+
+	if (p->nr_targets != p->interleave_ways)
+		return 0;
+
+	/*
+	 * Test the auto-discovery position calculator function
+	 * against this successfully created user-defined region.
+	 * A fail message here means that this interleave config
+	 * will fail when presented as CXL_REGION_F_AUTO.
+	 */
+	for (int i = 0; i < p->nr_targets; i++) {
+		struct cxl_endpoint_decoder *cxled = p->targets[i]; /* shadows the parameter */
+		int test_pos;
+
+		test_pos = cxl_calc_interleave_pos(cxled);
+		dev_dbg(&cxled->cxld.dev,
+			"Test cxl_calc_interleave_pos(): %s test_pos:%d cxled->pos:%d\n",
+			(test_pos == cxled->pos) ? "success" : "fail",
+			test_pos, cxled->pos);
+	}
+
+	return 0;
+}
+
+static int cxl_region_detach(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_port *iter, *ep_port = cxled_to_port(cxled);
+	struct cxl_region *cxlr = cxled->cxld.region;
+	struct cxl_region_params *p;
+	int rc = 0;
+
+	lockdep_assert_held_write(&cxl_region_rwsem);
+
+	if (!cxlr)
+		return 0;
+
+	p = &cxlr->params;
+	get_device(&cxlr->dev);
+
+	if (p->state > CXL_CONFIG_ACTIVE) {
+		/*
+		 * TODO: tear down all impacted regions if a device is
+		 * removed out of order
+		 */
+		rc = cxl_region_decode_reset(cxlr, p->interleave_ways);
+		if (rc)
+			goto out;
+		p->state = CXL_CONFIG_ACTIVE;
+	}
+
+	for (iter = ep_port; !is_cxl_root(iter);
+	     iter = to_cxl_port(iter->dev.parent))
+		cxl_port_detach_region(iter, cxlr, cxled);
+
+	if (cxled->pos < 0 || cxled->pos >= p->interleave_ways ||
+	    p->targets[cxled->pos] != cxled) {
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+
+		dev_WARN_ONCE(&cxlr->dev, 1, "expected %s:%s at position %d\n",
+			      dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			      cxled->pos);
+		goto out;
+	}
+
+	if (p->state == CXL_CONFIG_ACTIVE) {
+		p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
+		cxl_region_teardown_targets(cxlr);
+	}
+	p->targets[cxled->pos] = NULL;
+	p->nr_targets--;
+	cxled->cxld.hpa_range = (struct range) {
+		.start = 0,
+		.end = -1,
+	};
+
+	/* tell the region driver a target departed; the rwsem is dropped meanwhile */
+	up_write(&cxl_region_rwsem);
+	device_release_driver(&cxlr->dev);
+	down_write(&cxl_region_rwsem);
+out:
+	put_device(&cxlr->dev);
+	return rc;
+}
+
+void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled)
+{
+	down_write(&cxl_region_rwsem);
+	cxled->mode = CXL_DECODER_DEAD;	/* cxl_region_attach() rejects dead decoders */
+	cxl_region_detach(cxled);	/* asserts the rwsem is held for write */
+	up_write(&cxl_region_rwsem);
+}
+
+static int attach_target(struct cxl_region *cxlr,
+			 struct cxl_endpoint_decoder *cxled, int pos,
+			 unsigned int state)
+{
+	int rc;
+
+	if (state != TASK_INTERRUPTIBLE) {
+		down_write(&cxl_region_rwsem);
+	} else {
+		rc = down_write_killable(&cxl_region_rwsem);
+		if (rc)
+			return rc;
+	}
+	down_read(&cxl_dpa_rwsem);
+	rc = cxl_region_attach(cxlr, cxled, pos);
+	up_read(&cxl_dpa_rwsem);
+	up_write(&cxl_region_rwsem);
+	return rc;
+}
+
+static int detach_target(struct cxl_region *cxlr, int pos)
+{
+	struct cxl_region_params *params = &cxlr->params;
+	int rc = down_write_killable(&cxl_region_rwsem);
+
+	if (rc)
+		return rc;
+
+	if (pos >= params->interleave_ways) {
+		dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
+			params->interleave_ways);
+		rc = -ENXIO;
+		goto unlock;
+	}
+
+	/*
+	 * An empty slot is not an error: rc is still zero from taking
+	 * the lock, so only call down into the detach path when occupied.
+	 */
+	if (params->targets[pos])
+		rc = cxl_region_detach(params->targets[pos]);
+
+unlock:
+	up_write(&cxl_region_rwsem);
+	return rc;
+}
+
+/* Empty write detaches @pos, otherwise attach the decoder named in @buf */
+static ssize_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos,
+			     size_t len)
+{
+	int rc;
+
+	if (sysfs_streq(buf, "\n"))
+		rc = detach_target(cxlr, pos);
+	else {
+		struct device *dev;
+
+		dev = bus_find_device_by_name(&cxl_bus_type, NULL, buf);
+		if (!dev)
+			return -ENODEV;
+
+		if (!is_endpoint_decoder(dev)) {
+			rc = -EINVAL;
+			goto out;
+		}
+
+		rc = attach_target(cxlr, to_cxl_endpoint_decoder(dev), pos,
+				   TASK_INTERRUPTIBLE);
+out:
+		put_device(dev);
+	}
+
+	/* Errors are negative, so hand them back through a signed type */
+	return rc < 0 ? rc : (ssize_t)len;
+}
+
+#define TARGET_ATTR_RW(n)                                              \
+static ssize_t target##n##_show(                                       \
+	struct device *dev, struct device_attribute *attr, char *buf)  \
+{                                                                      \
+	return show_targetN(to_cxl_region(dev), buf, (n));             \
+}                                                                      \
+static ssize_t target##n##_store(struct device *dev,                   \
+				 struct device_attribute *attr,        \
+				 const char *buf, size_t len)          \
+{                                                                      \
+	return store_targetN(to_cxl_region(dev), buf, (n), len);       \
+}                                                                      \
+static DEVICE_ATTR_RW(target##n)
+/* Instantiate the show/store pair and attribute for target0..target15 */
+TARGET_ATTR_RW(0);
+TARGET_ATTR_RW(1);
+TARGET_ATTR_RW(2);
+TARGET_ATTR_RW(3);
+TARGET_ATTR_RW(4);
+TARGET_ATTR_RW(5);
+TARGET_ATTR_RW(6);
+TARGET_ATTR_RW(7);
+TARGET_ATTR_RW(8);
+TARGET_ATTR_RW(9);
+TARGET_ATTR_RW(10);
+TARGET_ATTR_RW(11);
+TARGET_ATTR_RW(12);
+TARGET_ATTR_RW(13);
+TARGET_ATTR_RW(14);
+TARGET_ATTR_RW(15);
+
+static struct attribute *target_attrs[] = {	/* array index N <=> targetN */
+	&dev_attr_target0.attr,
+	&dev_attr_target1.attr,
+	&dev_attr_target2.attr,
+	&dev_attr_target3.attr,
+	&dev_attr_target4.attr,
+	&dev_attr_target5.attr,
+	&dev_attr_target6.attr,
+	&dev_attr_target7.attr,
+	&dev_attr_target8.attr,
+	&dev_attr_target9.attr,
+	&dev_attr_target10.attr,
+	&dev_attr_target11.attr,
+	&dev_attr_target12.attr,
+	&dev_attr_target13.attr,
+	&dev_attr_target14.attr,
+	&dev_attr_target15.attr,
+	NULL,
+};
+
+static umode_t cxl_region_target_visible(struct kobject *kobj,
+					 struct attribute *a, int n)
+{
+	struct cxl_region *cxlr = to_cxl_region(kobj_to_dev(kobj));
+
+	/* Only expose targetN files for N below the interleave ways */
+	if (n >= cxlr->params.interleave_ways)
+		return 0;
+
+	return a->mode;
+}
+
+static const struct attribute_group cxl_region_target_group = {
+	.attrs = target_attrs,
+	.is_visible = cxl_region_target_visible, /* hides slots >= interleave_ways */
+};
+
+static const struct attribute_group *get_cxl_region_target_group(void)
+{
+	return &cxl_region_target_group;	/* the targetN attribute group */
+}
+
+static const struct attribute_group *region_groups[] = {
+	&cxl_base_attribute_group,
+	&cxl_region_group,
+	&cxl_region_target_group,	/* target0..target15 */
+	NULL,
+};
+
+static void cxl_region_release(struct device *dev)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	int next_id = atomic_read(&cxlrd->region_id);
+	int stale_id = cxlr->id;
+
+	/*
+	 * Prefer handing this region's just-idled id back as the cached
+	 * next id, which keeps the region id space from growing needlessly.
+	 * When the swap succeeds, the displaced next id is the one to free;
+	 * otherwise the region's own id is released.
+	 */
+	if (cxlr->id < next_id &&
+	    atomic_try_cmpxchg(&cxlrd->region_id, &next_id, cxlr->id))
+		stale_id = next_id;
+
+	memregion_free(stale_id);
+
+	put_device(dev->parent);
+	kfree(cxlr);
+}
+
+const struct device_type cxl_region_type = {
+	.name = "cxl_region",
+	.release = cxl_region_release,	/* frees the region and a memregion id */
+	.groups = region_groups
+};
+
+bool is_cxl_region(struct device *dev)
+{
+	return dev->type == &cxl_region_type;	/* identified by device type */
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_region, CXL);
+
+static struct cxl_region *to_cxl_region(struct device *dev)
+{
+	/* Refuse, with a one-time warning, to cast a non-region device */
+	if (dev_WARN_ONCE(dev, !is_cxl_region(dev),
+			  "not a cxl_region device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_region, dev);
+}
+
+static void unregister_region(void *dev)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *params = &cxlr->params;
+	int pos;
+
+	device_del(dev);
+
+	/*
+	 * With the region's sysfs interface shut down the parameter block
+	 * can no longer change, so reading it here does not require
+	 * holding the region rwsem.
+	 */
+	for (pos = 0; pos < params->interleave_ways; pos++)
+		detach_target(cxlr, pos);
+
+	cxl_region_iomem_release(cxlr);
+	put_device(dev);
+}
+
+static struct lock_class_key cxl_region_key;
+
+static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id)
+{
+	struct cxl_region *cxlr = kzalloc(sizeof(*cxlr), GFP_KERNEL);
+	struct device *dev;
+
+	if (!cxlr) {
+		memregion_free(id);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	cxlr->id = id;
+
+	dev = &cxlr->dev;
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_region_key);
+	device_set_pm_not_required(dev);
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_region_type;
+	dev->parent = &cxlrd->cxlsd.cxld.dev;
+	/*
+	 * The root decoder stays pinned until cxl_region_release() so that
+	 * the release path can fix up region id allocations
+	 */
+	get_device(dev->parent);
+
+	return cxlr;
+}
+
+/**
+ * devm_cxl_add_region - Adds a region to a decoder
+ * @cxlrd: root decoder
+ * @id: memregion id to create; released via memregion_free() on failure
+ * @mode: mode for the endpoint decoders of this region
+ * @type: select whether this is an expander or accelerator (type-2 or type-3)
+ *
+ * This is the second step of region initialization. Regions exist within an
+ * address space which is mapped by a @cxlrd.
+ *
+ * Return: the new region, named "region<@id>", or an ERR_PTR() on failure.
+ */
+static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
+					      int id,
+					      enum cxl_decoder_mode mode,
+					      enum cxl_decoder_type type)
+{
+	struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
+	struct cxl_region *cxlr;
+	struct device *dev;
+	int rc;
+
+	switch (mode) {
+	case CXL_DECODER_RAM:
+	case CXL_DECODER_PMEM:
+		break;
+	default:
+		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
+		memregion_free(id);
+		return ERR_PTR(-EINVAL);
+	}
+
+	cxlr = cxl_region_alloc(cxlrd, id);
+	if (IS_ERR(cxlr))
+		return cxlr;
+	cxlr->mode = mode;
+	cxlr->type = type;
+
+	dev = &cxlr->dev;
+	rc = dev_set_name(dev, "region%d", id);
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr);
+	if (rc)
+		return ERR_PTR(rc);
+
+	dev_dbg(port->uport_dev, "%s: created %s\n",
+		dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev));
+	return cxlr;
+
+err:
+	put_device(dev);
+	return ERR_PTR(rc);
+}
+
+static ssize_t __create_region_show(struct cxl_root_decoder *cxlrd, char *buf)
+{	/* emits "region<N>" with N being the root decoder's current region_id */
+	return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id));
+}
+
+static ssize_t create_pmem_region_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	return __create_region_show(to_cxl_root_decoder(dev), buf); /* same as ram */
+}
+
+static ssize_t create_ram_region_show(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	return __create_region_show(to_cxl_root_decoder(dev), buf); /* same as pmem */
+}
+
+static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd,
+					  enum cxl_decoder_mode mode, int id)
+{
+	int next_id = memregion_alloc(GFP_KERNEL);
+
+	if (next_id < 0)
+		return ERR_PTR(next_id);
+
+	/* Claim @id by installing the freshly allocated id as the next one */
+	if (atomic_cmpxchg(&cxlrd->region_id, id, next_id) == id)
+		return devm_cxl_add_region(cxlrd, id, mode,
+					   CXL_DECODER_HOSTONLYMEM);
+
+	memregion_free(next_id);
+	return ERR_PTR(-EBUSY);
+}
+
+static ssize_t create_pmem_region_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+	struct cxl_region *cxlr;
+	int id;
+
+	/* The write must spell out the region name, i.e. "region<id>" */
+	if (sscanf(buf, "region%d\n", &id) != 1)
+		return -EINVAL;
+
+	cxlr = __create_region(cxlrd, CXL_DECODER_PMEM, id);
+	if (IS_ERR(cxlr))
+		return PTR_ERR(cxlr);
+
+	return len;
+}
+DEVICE_ATTR_RW(create_pmem_region);
+
+static ssize_t create_ram_region_store(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t len)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+	struct cxl_region *cxlr;
+	int id;
+
+	/* The write must spell out the region name, i.e. "region<id>" */
+	if (sscanf(buf, "region%d\n", &id) != 1)
+		return -EINVAL;
+
+	cxlr = __create_region(cxlrd, CXL_DECODER_RAM, id);
+	if (IS_ERR(cxlr))
+		return PTR_ERR(cxlr);
+
+	return len;
+}
+DEVICE_ATTR_RW(create_ram_region);
+
+static ssize_t region_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct cxl_decoder *cxld = to_cxl_decoder(dev);
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	/* A decoder without a region reads back as an empty line */
+	rc = cxld->region ?
+		sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev)) :
+		sysfs_emit(buf, "\n");
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+DEVICE_ATTR_RO(region);
+
+static struct cxl_region *
+cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name)
+{
+	struct device *parent = &cxlrd->cxlsd.cxld.dev;
+	struct device *match = device_find_child_by_name(parent, name);
+
+	/* A hit is converted as-is; the caller does the put_device() */
+	if (match)
+		return to_cxl_region(match);
+
+	return ERR_PTR(-ENODEV);
+}
+
+static ssize_t delete_region_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t len)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+	struct cxl_port *port = to_cxl_port(dev->parent);
+	struct cxl_region *cxlr;
+
+	cxlr = cxl_find_region_by_name(cxlrd, buf);
+	if (IS_ERR(cxlr))
+		return PTR_ERR(cxlr);
+
+	devm_release_action(port->uport_dev, unregister_region, cxlr);
+	put_device(&cxlr->dev);	/* pairs with the lookup by name above */
+
+	return len;
+}
+DEVICE_ATTR_WO(delete_region);
+
+static void cxl_pmem_region_release(struct device *dev)
+{
+	struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
+	int idx;
+
+	/*
+	 * Drop the memdev reference held by each mapping, then the region
+	 */
+	for (idx = 0; idx < cxlr_pmem->nr_mappings; idx++)
+		put_device(&cxlr_pmem->mapping[idx].cxlmd->dev);
+
+	kfree(cxlr_pmem);
+}
+
+static const struct attribute_group *cxl_pmem_region_attribute_groups[] = {
+	&cxl_base_attribute_group,
+	NULL,	/* terminator */
+};
+
+const struct device_type cxl_pmem_region_type = {
+	.name = "cxl_pmem_region",
+	.release = cxl_pmem_region_release,	/* puts memdevs, frees the object */
+	.groups = cxl_pmem_region_attribute_groups,
+};
+
+bool is_cxl_pmem_region(struct device *dev)
+{
+	return dev->type == &cxl_pmem_region_type; /* identified by device type */
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, CXL);
+
+struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev),
+			  "not a cxl_pmem_region device\n"))
+		return NULL;	/* wrong device type, callers get no object */
+	return container_of(dev, struct cxl_pmem_region, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, CXL);
+
+struct cxl_poison_context {
+	struct cxl_port *port;		/* port whose child decoders are scanned */
+	enum cxl_decoder_mode mode;	/* mode of the decoder at commit_end */
+	u64 offset;			/* first DPA past that decoder's range */
+};
+
+static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd,
+				   struct cxl_poison_context *ctx)
+{
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	u64 offset, length;
+	int rc = 0;
+
+	/*
+	 * Collect poison for the resources left unmapped once the
+	 * committed endpoint decoders have reported theirs.
+	 *
+	 * Since PMEM always follows RAM, what remains to be scanned
+	 * is determined by the mode of the last committed decoder:
+	 *	ram: scan the rest of the ram range, then any pmem range
+	 *	pmem: scan the rest of the pmem range
+	 */
+
+	if (ctx->mode == CXL_DECODER_RAM) {
+		offset = ctx->offset;
+		length = resource_size(&cxlds->ram_res) - offset;
+		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+		if (rc == -EFAULT)
+			rc = 0;
+		if (rc)
+			return rc;
+	}
+	if (ctx->mode == CXL_DECODER_PMEM) {
+		offset = ctx->offset;
+		length = resource_size(&cxlds->dpa_res) - offset;
+		if (!length)
+			return 0;
+	} else if (resource_size(&cxlds->pmem_res)) {
+		offset = cxlds->pmem_res.start;
+		length = resource_size(&cxlds->pmem_res);
+	} else {
+		return 0;
+	}
+
+	return cxl_mem_get_poison(cxlmd, offset, length, NULL);
+}
+
+static int poison_by_decoder(struct device *dev, void *arg)
+{
+	struct cxl_poison_context *ctx = arg;
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_memdev *cxlmd;
+	u64 offset, length;
+	int rc = 0;
+
+	if (!is_endpoint_decoder(dev))
+		return rc;
+
+	cxled = to_cxl_endpoint_decoder(dev);
+	if (!cxled->dpa_res || !resource_size(cxled->dpa_res))
+		return rc;
+
+	/*
+	 * Regions are only created with single mode decoders: pmem or ram.
+	 * Linux does not support mixed mode decoders. This means that
+	 * reading poison per endpoint decoder adheres to the requirement
+	 * that poison reads of pmem and ram must be separated.
+	 * CXL 3.0 Spec 8.2.9.8.4.1
+	 */
+	if (cxled->mode == CXL_DECODER_MIXED) {
+		dev_dbg(dev, "poison list read unsupported in mixed mode\n");
+		return rc;
+	}
+
+	cxlmd = cxled_to_memdev(cxled);
+	if (cxled->skip) {
+		offset = cxled->dpa_res->start - cxled->skip;
+		length = cxled->skip;
+		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+		if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM)
+			rc = 0;
+		if (rc)
+			return rc;
+	}
+
+	offset = cxled->dpa_res->start;
+	length = cxled->dpa_res->end - offset + 1;
+	rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region);
+	if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM)
+		rc = 0;
+	if (rc)
+		return rc;
+
+	/* The decoder at commit_end ends the walk and seeds the unmapped scan */
+	if (cxled->cxld.id == ctx->port->commit_end) {
+		ctx->offset = cxled->dpa_res->end + 1;
+		ctx->mode = cxled->mode;
+		return 1;	/* cxl_get_poison_by_endpoint() checks for this value */
+	}
+
+	return 0;
+}
+
+int cxl_get_poison_by_endpoint(struct cxl_port *port)
+{
+	struct cxl_poison_context ctx = {
+		.port = port,
+	};
+	struct cxl_memdev *cxlmd;
+	int rc;
+
+	rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder);
+	if (rc != 1)
+		return rc;
+
+	/* The decoder at commit_end was seen, scan what lies beyond it */
+	cxlmd = to_cxl_memdev(port->uport_dev);
+	return cxl_get_poison_unmapped(cxlmd, &ctx);
+}
+
+static struct lock_class_key cxl_pmem_region_key;
+
+/* Snapshot a committed region's members into an initialized pmem region */
+static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct cxl_pmem_region *cxlr_pmem;
+	struct device *dev;
+	int i;
+
+	down_read(&cxl_region_rwsem);
+	if (p->state != CXL_CONFIG_COMMIT) {
+		cxlr_pmem = ERR_PTR(-ENXIO);
+		goto out;
+	}
+
+	cxlr_pmem = kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets),
+			    GFP_KERNEL);
+	if (!cxlr_pmem) {
+		cxlr_pmem = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	cxlr_pmem->hpa_range.start = p->res->start;
+	cxlr_pmem->hpa_range.end = p->res->end;
+
+	/* Snapshot the region configuration underneath the cxl_region_rwsem */
+	cxlr_pmem->nr_mappings = p->nr_targets;
+	for (i = 0; i < p->nr_targets; i++) {
+		struct cxl_endpoint_decoder *cxled = p->targets[i];
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+
+		/* Regions never span CXL root devices: one bridge serves all */
+		if (i == 0) {
+			cxl_nvb = cxl_find_nvdimm_bridge(cxlmd);
+			if (!cxl_nvb) {
+				/* No memdev pinned yet, just undo kzalloc() */
+				kfree(cxlr_pmem);
+				cxlr_pmem = ERR_PTR(-ENODEV);
+				goto out;
+			}
+			cxlr->cxl_nvb = cxl_nvb;
+		}
+		m->cxlmd = cxlmd;
+		get_device(&cxlmd->dev);
+		m->start = cxled->dpa_res->start;
+		m->size = resource_size(cxled->dpa_res);
+		m->position = i;
+	}
+
+	dev = &cxlr_pmem->dev;
+	cxlr_pmem->cxlr = cxlr;
+	cxlr->cxlr_pmem = cxlr_pmem;
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_pmem_region_key);
+	device_set_pm_not_required(dev);
+	dev->parent = &cxlr->dev;
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_pmem_region_type;
+out:
+	up_read(&cxl_region_rwsem);
+
+	return cxlr_pmem;
+}
+
+static void cxl_dax_region_release(struct device *dev)
+{
+	struct cxl_dax_region *cxlr_dax = to_cxl_dax_region(dev);
+
+	kfree(cxlr_dax);	/* allocated by cxl_dax_region_alloc() */
+}
+
+static const struct attribute_group *cxl_dax_region_attribute_groups[] = {
+	&cxl_base_attribute_group,
+	NULL,	/* terminator */
+};
+
+const struct device_type cxl_dax_region_type = {
+	.name = "cxl_dax_region",
+	.release = cxl_dax_region_release,	/* frees the containing object */
+	.groups = cxl_dax_region_attribute_groups,
+};
+
+static bool is_cxl_dax_region(struct device *dev)
+{
+	return dev->type == &cxl_dax_region_type; /* identified by device type */
+}
+
+struct cxl_dax_region *to_cxl_dax_region(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_cxl_dax_region(dev),
+			  "not a cxl_dax_region device\n"))
+		return NULL;	/* wrong device type, callers get no object */
+	return container_of(dev, struct cxl_dax_region, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_dax_region, CXL);
+
+static struct lock_class_key cxl_dax_region_key;
+
+static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *params = &cxlr->params;
+	struct cxl_dax_region *cxlr_dax = ERR_PTR(-ENXIO);
+	struct device *dev;
+
+	down_read(&cxl_region_rwsem);
+	if (params->state != CXL_CONFIG_COMMIT)
+		goto unlock;
+
+	cxlr_dax = kzalloc(sizeof(*cxlr_dax), GFP_KERNEL);
+	if (!cxlr_dax) {
+		cxlr_dax = ERR_PTR(-ENOMEM);
+		goto unlock;
+	}
+
+	cxlr_dax->cxlr = cxlr;
+	cxlr_dax->hpa_range = (struct range) {
+		.start = params->res->start,
+		.end = params->res->end,
+	};
+
+	dev = &cxlr_dax->dev;
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
+	device_set_pm_not_required(dev);
+	dev->parent = &cxlr->dev;
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_dax_region_type;
+unlock:
+	up_read(&cxl_region_rwsem);
+
+	return cxlr_dax;
+}
+
+static void cxlr_pmem_unregister(void *_cxlr_pmem)
+{
+	struct cxl_pmem_region *cxlr_pmem = _cxlr_pmem;
+	struct cxl_region *cxlr = cxlr_pmem->cxlr;
+	struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb;
+
+	/*
+	 * Two callers are expected: the bridge in ->remove() context, which
+	 * runs under the device_lock(), or cxlr_release_nvdimm(), which
+	 * cancels the bridge's release action for @cxlr_pmem and runs it
+	 * itself while manually holding the bridge lock.
+	 */
+	device_lock_assert(&cxl_nvb->dev);
+	cxlr->cxlr_pmem = NULL;
+	cxlr_pmem->cxlr = NULL;
+	device_unregister(&cxlr_pmem->dev);
+}
+
+static void cxlr_release_nvdimm(void *_cxlr)
+{
+	struct cxl_region *cxlr = _cxlr;
+	struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb;
+
+	device_lock(&cxl_nvb->dev);
+	if (cxlr->cxlr_pmem)	/* cleared once cxlr_pmem_unregister() has run */
+		devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister,
+				    cxlr->cxlr_pmem);
+	device_unlock(&cxl_nvb->dev);
+	cxlr->cxl_nvb = NULL;
+	put_device(&cxl_nvb->dev);	/* the reference @cxlr carried on the bridge */
+}
+
+/**
+ * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge
+ * @cxlr: parent CXL region for this pmem region bridge device
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int devm_cxl_add_pmem_region(struct cxl_region *cxlr)
+{
+	struct cxl_pmem_region *cxlr_pmem;
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct device *dev;
+	int rc;
+
+	cxlr_pmem = cxl_pmem_region_alloc(cxlr);
+	if (IS_ERR(cxlr_pmem))
+		return PTR_ERR(cxlr_pmem);
+	cxl_nvb = cxlr->cxl_nvb;
+
+	dev = &cxlr_pmem->dev;
+	rc = dev_set_name(dev, "pmem_region%d", cxlr->id);
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
+		dev_name(dev));
+
+	device_lock(&cxl_nvb->dev);
+	if (cxl_nvb->dev.driver)
+		rc = devm_add_action_or_reset(&cxl_nvb->dev,
+					      cxlr_pmem_unregister, cxlr_pmem);
+	else
+		rc = -ENXIO;
+	device_unlock(&cxl_nvb->dev);
+
+	if (rc)
+		goto err_bridge;
+
+	/* @cxlr keeps its @cxl_nvb reference until cxlr_release_nvdimm() runs */
+	return devm_add_action_or_reset(&cxlr->dev, cxlr_release_nvdimm, cxlr);
+
+err:
+	put_device(dev);
+err_bridge:
+	put_device(&cxl_nvb->dev);
+	cxlr->cxl_nvb = NULL;
+	return rc;
+}
+
+static void cxlr_dax_unregister(void *_cxlr_dax)
+{
+	struct cxl_dax_region *cxlr_dax = _cxlr_dax;
+
+	device_unregister(&cxlr_dax->dev);	/* devm action set up at add time */
+}
+
+static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
+{
+	struct cxl_dax_region *cxlr_dax = cxl_dax_region_alloc(cxlr);
+	struct device *dev;
+	int rc;
+
+	if (IS_ERR(cxlr_dax))
+		return PTR_ERR(cxlr_dax);
+	dev = &cxlr_dax->dev;
+
+	/*
+	 * Name and publish the device; on failure the initial reference
+	 * is dropped, which frees the allocation via the release callback.
+	 */
+	rc = dev_set_name(dev, "dax_region%d", cxlr->id);
+	if (!rc)
+		rc = device_add(dev);
+	if (rc) {
+		put_device(dev);
+		return rc;
+	}
+
+	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
+		dev_name(dev));
+
+	return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
+					cxlr_dax);
+}
+
+static int match_root_decoder_by_range(struct device *dev, void *data)
+{
+	struct range *window, *wanted = data;
+
+	/* Only a root decoder whose window covers the range can match */
+	if (!is_root_decoder(dev))
+		return 0;
+
+	window = &to_cxl_root_decoder(dev)->cxlsd.cxld.hpa_range;
+
+	return range_contains(window, wanted);
+}
+
+static int match_region_by_range(struct device *dev, void *data)
+{
+	struct cxl_region_params *p;
+	struct cxl_region *cxlr;
+	struct range *r = data;
+	int rc = 0;
+	/* device_find_child() callback: exact start/end match against @data */
+	if (!is_cxl_region(dev))
+		return 0;
+
+	cxlr = to_cxl_region(dev);
+	p = &cxlr->params;
+	/* The region resource is sampled under cxl_region_rwsem */
+	down_read(&cxl_region_rwsem);
+	if (p->res && p->res->start == r->start && p->res->end == r->end)
+		rc = 1;
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+
+/* Create an auto-assembled (CXL_REGION_F_AUTO) region for @cxled's HPA range */
+static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
+					   struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_port *port = cxlrd_to_port(cxlrd);
+	struct range *hpa = &cxled->cxld.hpa_range;
+	struct cxl_region_params *p;
+	struct cxl_region *cxlr;
+	struct resource *res;
+	int rc;
+	/* Retry region creation for as long as it fails with -EBUSY */
+	do {
+		cxlr = __create_region(cxlrd, cxled->mode,
+				       atomic_read(&cxlrd->region_id));
+	} while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY);
+
+	if (IS_ERR(cxlr)) {
+		dev_err(cxlmd->dev.parent,
+			"%s:%s: %s failed assign region: %ld\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			__func__, PTR_ERR(cxlr));
+		return cxlr;
+	}
+
+	down_write(&cxl_region_rwsem);
+	p = &cxlr->params;
+	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
+		dev_err(cxlmd->dev.parent,
+			"%s:%s: %s autodiscovery interrupted\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			__func__);
+		rc = -EBUSY;
+		goto err;
+	}
+
+	set_bit(CXL_REGION_F_AUTO, &cxlr->flags);
+
+	res = kmalloc(sizeof(*res), GFP_KERNEL);
+	if (!res) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	*res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa),
+				    dev_name(&cxlr->dev));
+	rc = insert_resource(cxlrd->res, res);
+	if (rc) {
+		/*
+		 * Platform-firmware may not have split resources like "System
+		 * RAM" on CXL window boundaries see cxl_region_iomem_release()
+		 */
+		dev_warn(cxlmd->dev.parent,
+			 "%s:%s: %s %s cannot insert resource\n",
+			 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			 __func__, dev_name(&cxlr->dev));
+	}
+	/* Note: @res is published in the params even if the insert failed */
+	p->res = res;
+	p->interleave_ways = cxled->cxld.interleave_ways;
+	p->interleave_granularity = cxled->cxld.interleave_granularity;
+	p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
+
+	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
+	if (rc)
+		goto err;
+
+	dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n",
+		dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__,
+		dev_name(&cxlr->dev), p->res, p->interleave_ways,
+		p->interleave_granularity);
+
+	/* ...to match put_device() in cxl_add_to_region() */
+	get_device(&cxlr->dev);
+	up_write(&cxl_region_rwsem);
+
+	return cxlr;
+
+err:
+	up_write(&cxl_region_rwsem);
+	devm_release_action(port->uport_dev, unregister_region, cxlr);
+	return ERR_PTR(rc);
+}
+
+int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct range *hpa = &cxled->cxld.hpa_range;
+	struct cxl_decoder *cxld = &cxled->cxld;
+	struct device *cxlrd_dev, *region_dev;
+	struct cxl_root_decoder *cxlrd;
+	struct cxl_region_params *p;
+	struct cxl_region *cxlr;
+	bool attach = false;
+	int rc;
+	/* Find the root decoder (CXL window) that contains the decoder's range */
+	cxlrd_dev = device_find_child(&root->dev, &cxld->hpa_range,
+				      match_root_decoder_by_range);
+	if (!cxlrd_dev) {
+		dev_err(cxlmd->dev.parent,
+			"%s:%s no CXL window for range %#llx:%#llx\n",
+			dev_name(&cxlmd->dev), dev_name(&cxld->dev),
+			cxld->hpa_range.start, cxld->hpa_range.end);
+		return -ENXIO;
+	}
+
+	cxlrd = to_cxl_root_decoder(cxlrd_dev);
+
+	/*
+	 * Ensure that if multiple threads race to construct_region() for @hpa
+	 * one does the construction and the others add to that.
+	 */
+	mutex_lock(&cxlrd->range_lock);
+	region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa,
+				       match_region_by_range);
+	if (!region_dev) {
+		cxlr = construct_region(cxlrd, cxled);
+		region_dev = &cxlr->dev;
+	} else
+		cxlr = to_cxl_region(region_dev);
+	mutex_unlock(&cxlrd->range_lock);
+
+	rc = PTR_ERR_OR_ZERO(cxlr);
+	if (rc)
+		goto out;
+	/* NOTE(review): any attach_target() failure is not propagated here */
+	attach_target(cxlr, cxled, -1, TASK_UNINTERRUPTIBLE);
+
+	down_read(&cxl_region_rwsem);
+	p = &cxlr->params;
+	attach = p->state == CXL_CONFIG_COMMIT;
+	up_read(&cxl_region_rwsem);
+
+	if (attach) {
+		/*
+		 * If device_attach() fails the range may still be active via
+		 * the platform-firmware memory map, otherwise the driver for
+		 * regions is local to this file, so driver matching can't fail.
+		 */
+		if (device_attach(&cxlr->dev) < 0)
+			dev_err(&cxlr->dev, "failed to enable, range: %pr\n",
+				p->res);
+	}
+	/* Drop the reference from device_find_child() or construct_region() */
+	put_device(region_dev);
+out:
+	put_device(cxlrd_dev);
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, CXL);
+
+static int is_system_ram(struct resource *res, void *arg)
+{
+	struct cxl_region *cxlr = arg;
+	struct cxl_region_params *p = &cxlr->params;
+	/* walk_iomem_res_desc() callback: log the overlap and report it as 1 */
+	dev_dbg(&cxlr->dev, "%pr has System RAM: %pr\n", p->res, res);
+	return 1;
+}
+
+static int cxl_region_probe(struct device *dev)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	int rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc) {
+		dev_dbg(&cxlr->dev, "probe interrupted\n");
+		return rc;
+	}
+
+	if (p->state < CXL_CONFIG_COMMIT) {
+		dev_dbg(&cxlr->dev, "config state: %d\n", p->state);
+		rc = -ENXIO;
+		goto out;
+	}
+
+	if (test_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags)) {
+		dev_err(&cxlr->dev,
+			"failed to activate, re-commit region and retry\n");
+		rc = -ENXIO;
+		goto out;
+	}
+
+	/*
+	 * From this point on any path that changes the region's state away from
+	 * CXL_CONFIG_COMMIT is also responsible for releasing the driver.
+	 */
+out:
+	up_read(&cxl_region_rwsem);
+
+	if (rc)
+		return rc;
+	/* Hand the committed region to a child device chosen by its mode */
+	switch (cxlr->mode) {
+	case CXL_DECODER_PMEM:
+		return devm_cxl_add_pmem_region(cxlr);
+	case CXL_DECODER_RAM:
+		/*
+		 * The region can not be managed by CXL if any portion of
+		 * it is already online as 'System RAM'
+		 */
+		if (walk_iomem_res_desc(IORES_DESC_NONE,
+					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+					p->res->start, p->res->end, cxlr,
+					is_system_ram) > 0)
+			return 0;
+		return devm_cxl_add_dax_region(cxlr);
+	default:
+		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
+			cxlr->mode);
+		return -ENXIO;
+	}
+}
+
+static struct cxl_driver cxl_region_driver = {
+	.name = "cxl_region",
+	.probe = cxl_region_probe,
+	.id = CXL_DEVICE_REGION, /* same id as the MODULE_ALIAS_CXL() below */
+};
+
+int cxl_region_init(void)
+{
+	return cxl_driver_register(&cxl_region_driver); /* result passed through */
+}
+
+void cxl_region_exit(void)
+{
+	cxl_driver_unregister(&cxl_region_driver); /* undoes cxl_region_init() */
+}
+
+MODULE_IMPORT_NS(CXL);
+MODULE_IMPORT_NS(DEVMEM);
+MODULE_ALIAS_CXL(CXL_DEVICE_REGION);
diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c
new file mode 100644
index 0000000000..e0fbe964f6
--- /dev/null
+++ b/drivers/cxl/core/regs.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. */
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <cxlmem.h>
+#include <cxlpci.h>
+#include <pmu.h>
+
+#include "core.h"
+
+/**
+ * DOC: cxl registers
+ *
+ * CXL device capabilities are enumerated by PCI DVSEC (Designated
+ * Vendor-Specific Extended Capability) and / or descriptors provided by
+ * platform firmware. They can be defined as a set, like the device and
+ * component registers mandated by CXL Section 8.1.12.2 Memory Device
+ * PCIe Capabilities and Extended Capabilities, or they can be individual
+ * capabilities appended to bridge and endpoint devices.
+ *
+ * Provide common infrastructure for enumerating and mapping these
+ * discrete capabilities.
+ */
+
+/**
+ * cxl_probe_component_regs() - Detect CXL Component register blocks
+ * @dev: Host device of the @base mapping
+ * @base: Mapping containing the HDM Decoder Capability Header
+ * @map: Map object describing the register block information found
+ *
+ * See CXL 2.0 8.2.4 Component Register Layout and Definition
+ * See CXL 2.0 8.2.5.5 CXL Device Register Interface
+ *
+ * Probe for component register information and return it in @map.
+ */
+void cxl_probe_component_regs(struct device *dev, void __iomem *base,
+			      struct cxl_component_reg_map *map)
+{
+	int cap, cap_count;
+	u32 cap_array;
+
+	*map = (struct cxl_component_reg_map) { 0 };
+
+	/*
+	 * CXL.cache and CXL.mem registers are at offset 0x1000 as defined in
+	 * CXL 2.0 8.2.4 Table 141.
+	 */
+	base += CXL_CM_OFFSET;
+
+	cap_array = readl(base + CXL_CM_CAP_HDR_OFFSET);
+
+	if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, cap_array) != CM_CAP_HDR_CAP_ID) {
+		dev_err(dev,
+			"Couldn't locate the CXL.cache and CXL.mem capability array header.\n");
+		return;
+	}
+
+	/* It's assumed that future versions will be backward compatible */
+	cap_count = FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_array);
+
+	for (cap = 1; cap <= cap_count; cap++) {
+		void __iomem *register_block;
+		struct cxl_reg_map *rmap;
+		u16 cap_id, offset;
+		u32 length, hdr;
+		/* Capability headers are 32-bit words after the array header */
+		hdr = readl(base + cap * 0x4);
+
+		cap_id = FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, hdr);
+		offset = FIELD_GET(CXL_CM_CAP_PTR_MASK, hdr);
+		register_block = base + offset;
+		hdr = readl(register_block);
+		/* Only the HDM decoder and RAS capabilities are recorded in @map */
+		rmap = NULL;
+		switch (cap_id) {
+		case CXL_CM_CAP_CAP_ID_HDM: {
+			int decoder_cnt;
+
+			dev_dbg(dev, "found HDM decoder capability (0x%x)\n",
+				offset);
+
+			decoder_cnt = cxl_hdm_decoder_count(hdr);
+			length = 0x20 * decoder_cnt + 0x10;
+			rmap = &map->hdm_decoder;
+			break;
+		}
+		case CXL_CM_CAP_CAP_ID_RAS:
+			dev_dbg(dev, "found RAS capability (0x%x)\n",
+				offset);
+			length = CXL_RAS_CAPABILITY_LENGTH;
+			rmap = &map->ras;
+			break;
+		default:
+			dev_dbg(dev, "Unknown CM cap ID: %d (0x%x)\n", cap_id,
+				offset);
+			break;
+		}
+		/* @length is only consumed when a known capability set @rmap */
+		if (!rmap)
+			continue;
+		rmap->valid = true;
+		rmap->id = cap_id;
+		rmap->offset = CXL_CM_OFFSET + offset;
+		rmap->size = length;
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cxl_probe_component_regs, CXL);
+
+/**
+ * cxl_probe_device_regs() - Detect CXL Device register blocks
+ * @dev: Host device of the @base mapping
+ * @base: Mapping of CXL 2.0 8.2.8 CXL Device Register Interface
+ * @map: Map object describing the register block information found
+ *
+ * Probe for device register information and return it in @map.
+ */
+void cxl_probe_device_regs(struct device *dev, void __iomem *base,
+			   struct cxl_device_reg_map *map)
+{
+	int cap, cap_count;
+	u64 cap_array;
+
+	*map = (struct cxl_device_reg_map){ 0 };
+	/* Without a matching capability array id, @map stays zeroed */
+	cap_array = readq(base + CXLDEV_CAP_ARRAY_OFFSET);
+	if (FIELD_GET(CXLDEV_CAP_ARRAY_ID_MASK, cap_array) !=
+	    CXLDEV_CAP_ARRAY_CAP_ID)
+		return;
+
+	cap_count = FIELD_GET(CXLDEV_CAP_ARRAY_COUNT_MASK, cap_array);
+
+	for (cap = 1; cap <= cap_count; cap++) {
+		struct cxl_reg_map *rmap;
+		u32 offset, length;
+		u16 cap_id;
+		/* Each header is a 16-byte entry: id at +0, offset +4, length +8 */
+		cap_id = FIELD_GET(CXLDEV_CAP_HDR_CAP_ID_MASK,
+				   readl(base + cap * 0x10));
+		offset = readl(base + cap * 0x10 + 0x4);
+		length = readl(base + cap * 0x10 + 0x8);
+
+		rmap = NULL;
+		switch (cap_id) {
+		case CXLDEV_CAP_CAP_ID_DEVICE_STATUS:
+			dev_dbg(dev, "found Status capability (0x%x)\n", offset);
+			rmap = &map->status;
+			break;
+		case CXLDEV_CAP_CAP_ID_PRIMARY_MAILBOX:
+			dev_dbg(dev, "found Mailbox capability (0x%x)\n", offset);
+			rmap = &map->mbox;
+			break;
+		case CXLDEV_CAP_CAP_ID_SECONDARY_MAILBOX:
+			dev_dbg(dev, "found Secondary Mailbox capability (0x%x)\n", offset);
+			break; /* logged only; nothing is recorded in @map */
+		case CXLDEV_CAP_CAP_ID_MEMDEV:
+			dev_dbg(dev, "found Memory Device capability (0x%x)\n", offset);
+			rmap = &map->memdev;
+			break;
+		default:
+			if (cap_id >= 0x8000)
+				dev_dbg(dev, "Vendor cap ID: %#x offset: %#x\n", cap_id, offset);
+			else
+				dev_dbg(dev, "Unknown cap ID: %#x offset: %#x\n", cap_id, offset);
+			break;
+		}
+
+		if (!rmap)
+			continue;
+		rmap->valid = true;
+		rmap->id = cap_id;
+		rmap->offset = offset;
+		rmap->size = length;
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cxl_probe_device_regs, CXL);
+
+void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
+				   resource_size_t length)
+{
+	void __iomem *ret_val;
+	struct resource *res;
+	/* Request and map @length bytes at @addr as devres of @dev, or NULL */
+	if (WARN_ON_ONCE(addr == CXL_RESOURCE_NONE))
+		return NULL;
+
+	res = devm_request_mem_region(dev, addr, length, dev_name(dev));
+	if (!res) {
+		resource_size_t end = addr + length - 1;
+
+		dev_err(dev, "Failed to request region %pa-%pa\n", &addr, &end);
+		return NULL;
+	}
+
+	ret_val = devm_ioremap(dev, addr, length);
+	if (!ret_val)
+		dev_err(dev, "Failed to map region %pr\n", res);
+
+	return ret_val;
+}
+
+int cxl_map_component_regs(const struct cxl_register_map *map,
+			   struct cxl_component_regs *regs,
+			   unsigned long map_mask)
+{
+	struct device *host = map->host;
+	struct mapinfo {
+		const struct cxl_reg_map *rmap;
+		void __iomem **addr;
+	} mapinfo[] = {
+		{ &map->component_map.hdm_decoder, &regs->hdm_decoder },
+		{ &map->component_map.ras, &regs->ras },
+	};
+	int i;
+	/* Map each valid capability whose id bit is set in @map_mask */
+	for (i = 0; i < ARRAY_SIZE(mapinfo); i++) {
+		struct mapinfo *mi = &mapinfo[i];
+		resource_size_t phys_addr;
+		resource_size_t length;
+
+		if (!mi->rmap->valid)
+			continue;
+		if (!test_bit(mi->rmap->id, &map_mask))
+			continue;
+		phys_addr = map->resource + mi->rmap->offset;
+		length = mi->rmap->size;
+		*(mi->addr) = devm_cxl_iomap_block(host, phys_addr, length);
+		if (!*(mi->addr))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_map_component_regs, CXL);
+
+int cxl_map_device_regs(const struct cxl_register_map *map,
+			struct cxl_device_regs *regs)
+{
+	struct device *host = map->host;
+	resource_size_t phys_addr = map->resource;
+	struct mapinfo {
+		const struct cxl_reg_map *rmap;
+		void __iomem **addr;
+	} mapinfo[] = {
+		{ &map->device_map.status, &regs->status, },
+		{ &map->device_map.mbox, &regs->mbox, },
+		{ &map->device_map.memdev, &regs->memdev, },
+	};
+	int i;
+	/* Map every capability flagged valid in @map->device_map */
+	for (i = 0; i < ARRAY_SIZE(mapinfo); i++) {
+		struct mapinfo *mi = &mapinfo[i];
+		resource_size_t length;
+		resource_size_t addr;
+
+		if (!mi->rmap->valid)
+			continue;
+
+		addr = phys_addr + mi->rmap->offset;
+		length = mi->rmap->size;
+		*(mi->addr) = devm_cxl_iomap_block(host, addr, length);
+		if (!*(mi->addr))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, CXL);
+
+/* Decode one Register Locator entry into @map; false if it overruns its BAR */
+static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi,
+				struct cxl_register_map *map)
+{
+	int bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo);
+	int type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo);
+	u64 offset = ((u64)reg_hi << 32) |
+		     (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK);
+
+	if (offset > pci_resource_len(pdev, bar)) {
+		dev_warn(&pdev->dev, "BAR%d: %pr: too small (offset: %#llx, type: %d)\n",
+			 bar, &pdev->resource[bar], offset, type);
+		return false;
+	}
+	map->reg_type = type;
+	map->resource = pci_resource_start(pdev, bar) + offset;
+	map->max_size = pci_resource_len(pdev, bar) - offset;
+	return true;
+}
+
+/**
+ * cxl_find_regblock_instance() - Locate a register block by type / index
+ * @pdev: The CXL PCI device to enumerate.
+ * @type: Register Block Indicator id
+ * @map: Enumeration output, clobbered on error
+ * @index: Index into which particular instance of a regblock wanted in the
+ *	   order found in register locator DVSEC.
+ *
+ * Return: 0 if register block enumerated, negative error code otherwise
+ *
+ * A CXL DVSEC may point to one or more register blocks, search for them
+ * by @type and @index.
+ */
+int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type,
+			       struct cxl_register_map *map, int index)
+{
+	u32 regloc_size, regblocks;
+	int instance = 0;
+	int regloc, i;
+
+	*map = (struct cxl_register_map) {
+		.host = &pdev->dev,
+		.resource = CXL_RESOURCE_NONE,
+	};
+
+	regloc = pci_find_dvsec_capability(pdev, PCI_DVSEC_VENDOR_ID_CXL,
+					   CXL_DVSEC_REG_LOCATOR);
+	if (!regloc)
+		return -ENXIO;
+
+	pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, &regloc_size);
+	regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size);
+	/* Entries are 8 bytes each, starting at the first-block offset */
+	regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET;
+	regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8;
+
+	for (i = 0; i < regblocks; i++, regloc += 8) {
+		u32 reg_lo, reg_hi;
+
+		pci_read_config_dword(pdev, regloc, &reg_lo);
+		pci_read_config_dword(pdev, regloc + 4, &reg_hi);
+		/* Skip entries that cxl_decode_regblock() rejects */
+		if (!cxl_decode_regblock(pdev, reg_lo, reg_hi, map))
+			continue;
+
+		if (map->reg_type == type) {
+			if (index == instance)
+				return 0;
+			instance++;
+		}
+	}
+	/* No match: do not leave the last decoded address in @map */
+	map->resource = CXL_RESOURCE_NONE;
+	return -ENODEV;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_find_regblock_instance, CXL);
+
+/**
+ * cxl_find_regblock() - Locate register blocks by type
+ * @pdev: The CXL PCI device to enumerate.
+ * @type: Register Block Indicator id
+ * @map: Enumeration output, clobbered on error
+ *
+ * Return: 0 if register block enumerated, negative error code otherwise
+ *
+ * A CXL DVSEC may point to one or more register blocks, search for them
+ * by @type; the first matching instance (index 0) is returned.
+ */
+int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type,
+		      struct cxl_register_map *map)
+{
+	return cxl_find_regblock_instance(pdev, type, map, 0);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_find_regblock, CXL);
+
+/**
+ * cxl_count_regblock() - Count instances of a given regblock type.
+ * @pdev: The CXL PCI device to enumerate.
+ * @type: Register Block Indicator id
+ *
+ * Some regblocks may be repeated. Count how many instances.
+ *
+ * Return: count of matching regblocks.
+ */
+int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type)
+{
+	struct cxl_register_map map;
+	int rc, count = 0;
+	/* Look up increasing instance indexes until the lookup fails */
+	while (1) {
+		rc = cxl_find_regblock_instance(pdev, type, &map, count);
+		if (rc)
+			return count;
+		count++;
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cxl_count_regblock, CXL);
+
+int cxl_map_pmu_regs(struct pci_dev *pdev, struct cxl_pmu_regs *regs,
+		     struct cxl_register_map *map)
+{
+	struct device *dev = &pdev->dev;
+	resource_size_t phys_addr;
+	/* Map CXL_PMU_REGMAP_SIZE bytes at @map->resource on behalf of @pdev */
+	phys_addr = map->resource;
+	regs->pmu = devm_cxl_iomap_block(dev, phys_addr, CXL_PMU_REGMAP_SIZE);
+	if (!regs->pmu)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_map_pmu_regs, CXL);
+
+static int cxl_map_regblock(struct cxl_register_map *map)
+{
+	struct device *host = map->host;
+	/* Plain ioremap(); the caller undoes it with cxl_unmap_regblock() */
+	map->base = ioremap(map->resource, map->max_size);
+	if (!map->base) {
+		dev_err(host, "failed to map registers\n");
+		return -ENOMEM;
+	}
+
+	dev_dbg(host, "Mapped CXL Memory Device resource %pa\n", &map->resource);
+	return 0;
+}
+
+static void cxl_unmap_regblock(struct cxl_register_map *map)
+{
+	iounmap(map->base);
+	map->base = NULL; /* do not leave a pointer to the torn-down mapping */
+}
+
+static int cxl_probe_regs(struct cxl_register_map *map)
+{
+	struct cxl_component_reg_map *comp_map;
+	struct cxl_device_reg_map *dev_map;
+	struct device *host = map->host;
+	void __iomem *base = map->base;
+	/* Dispatch on the block type; other types are accepted unprobed */
+	switch (map->reg_type) {
+	case CXL_REGLOC_RBI_COMPONENT:
+		comp_map = &map->component_map;
+		cxl_probe_component_regs(host, base, comp_map);
+		dev_dbg(host, "Set up component registers\n");
+		break;
+	case CXL_REGLOC_RBI_MEMDEV:
+		dev_map = &map->device_map;
+		cxl_probe_device_regs(host, base, dev_map);
+		if (!dev_map->status.valid || !dev_map->mbox.valid ||
+		    !dev_map->memdev.valid) {
+			dev_err(host, "registers not found: %s%s%s\n",
+				!dev_map->status.valid ? "status " : "",
+				!dev_map->mbox.valid ? "mbox " : "",
+				!dev_map->memdev.valid ? "memdev " : "");
+			return -ENXIO;
+		}
+
+		dev_dbg(host, "Probing device registers...\n");
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+int cxl_setup_regs(struct cxl_register_map *map)
+{
+	int rc;
+
+	rc = cxl_map_regblock(map);
+	if (rc)
+		return rc;
+	/* The temporary mapping is dropped whether or not probing succeeded */
+	rc = cxl_probe_regs(map);
+	cxl_unmap_regblock(map);
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_setup_regs, CXL);
+
+/* Return the component register base an RCRB points at, or CXL_RESOURCE_NONE */
+resource_size_t __rcrb_to_component(struct device *dev, struct cxl_rcrb_info *ri,
+				    enum cxl_rcrb which)
+{
+	resource_size_t component_reg_phys;
+	resource_size_t rcrb = ri->base;
+	void __iomem *addr;
+	u32 id, bar0, bar1;
+	u16 cmd;
+
+	if (which == CXL_RCRB_UPSTREAM)
+		rcrb += SZ_4K;
+
+	/*
+	 * RCRB's BAR[0..1] point to component block containing CXL
+	 * subsystem component registers. MEMBAR extraction follows
+	 * the PCI Base spec here, esp. 64 bit extraction and memory
+	 * ranges alignment (6.0, 7.5.1.2.1).
+	 */
+	if (!request_mem_region(rcrb, SZ_4K, "CXL RCRB"))
+		return CXL_RESOURCE_NONE;
+	addr = ioremap(rcrb, SZ_4K);
+	if (!addr) {
+		dev_err(dev, "Failed to map RCRB at %pa\n", &rcrb);
+		release_mem_region(rcrb, SZ_4K);
+		return CXL_RESOURCE_NONE;
+	}
+	/* Snapshot the header fields, then drop the temporary mapping */
+	id = readl(addr + PCI_VENDOR_ID);
+	cmd = readw(addr + PCI_COMMAND);
+	bar0 = readl(addr + PCI_BASE_ADDRESS_0);
+	bar1 = readl(addr + PCI_BASE_ADDRESS_1);
+	iounmap(addr);
+	release_mem_region(rcrb, SZ_4K);
+
+	/*
+	 * Sanity check, see CXL 3.0 Figure 9-8 CXL Device that Does Not
+	 * Remap Upstream Port and Component Registers
+	 */
+	if (id == U32_MAX) {
+		if (which == CXL_RCRB_DOWNSTREAM)
+			dev_err(dev, "Failed to access Downstream Port RCRB\n");
+		return CXL_RESOURCE_NONE;
+	}
+	if (!(cmd & PCI_COMMAND_MEMORY))
+		return CXL_RESOURCE_NONE;
+	/* The RCRB is a Memory Window, and the MEM_TYPE_1M bit is obsolete */
+	if (bar0 & (PCI_BASE_ADDRESS_MEM_TYPE_1M | PCI_BASE_ADDRESS_SPACE_IO))
+		return CXL_RESOURCE_NONE;
+
+	component_reg_phys = bar0 & PCI_BASE_ADDRESS_MEM_MASK;
+	if (bar0 & PCI_BASE_ADDRESS_MEM_TYPE_64)
+		component_reg_phys |= ((u64)bar1) << 32;
+
+	if (!component_reg_phys)
+		return CXL_RESOURCE_NONE;
+
+	/* MEMBAR is block size (64k) aligned. */
+	if (!IS_ALIGNED(component_reg_phys, CXL_COMPONENT_REG_BLOCK_SIZE))
+		return CXL_RESOURCE_NONE;
+
+	return component_reg_phys;
+}
+
+resource_size_t cxl_rcd_component_reg_phys(struct device *dev,
+					   struct cxl_dport *dport)
+{
+	if (!dport->rch) /* the RCRB is only consulted when @dport->rch is set */
+		return CXL_RESOURCE_NONE;
+	return __rcrb_to_component(dev, &dport->rcrb, CXL_RCRB_UPSTREAM);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_rcd_component_reg_phys, CXL);
diff --git a/drivers/cxl/core/suspend.c b/drivers/cxl/core/suspend.c
new file mode 100644
index 0000000000..a5984d96ea
--- /dev/null
+++ b/drivers/cxl/core/suspend.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/atomic.h>
+#include <linux/export.h>
+#include "cxlmem.h"
+
+static atomic_t mem_active; /* adjusted by cxl_mem_active_{inc,dec}() */
+
+bool cxl_mem_active(void)
+{
+	return atomic_read(&mem_active) != 0; /* true while the count is nonzero */
+}
+
+void cxl_mem_active_inc(void)
+{
+	atomic_inc(&mem_active); /* pair with cxl_mem_active_dec() */
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_active_inc, CXL);
+
+void cxl_mem_active_dec(void)
+{
+	atomic_dec(&mem_active); /* pair with cxl_mem_active_inc() */
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_active_dec, CXL);
diff --git a/drivers/cxl/core/trace.c b/drivers/cxl/core/trace.c
new file mode 100644
index 0000000000..d0403dc3c8
--- /dev/null
+++ b/drivers/cxl/core/trace.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+
+#include <cxl.h>
+#include "core.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	int gran = p->interleave_granularity;
+	int ways = p->interleave_ways;
+	u64 offset;
+
+	/* Is the hpa within this region at all */
+	if (hpa < p->res->start || hpa > p->res->end) {
+		dev_dbg(&cxlr->dev,
+			"Addr trans fail: hpa 0x%llx not in region\n", hpa);
+		return false;
+	}
+
+	/* Is the hpa in an expected chunk for its pos(-ition) */
+	offset = hpa - p->res->start;
+	offset = do_div(offset, gran * ways); /* do_div() yields the remainder */
+	if ((offset >= pos * gran) && (offset < (pos + 1) * gran))
+		return true;
+
+	dev_dbg(&cxlr->dev,
+		"Addr trans fail: hpa 0x%llx not in expected chunk\n", hpa);
+
+	return false;
+}
+
+static u64 cxl_dpa_to_hpa(u64 dpa,  struct cxl_region *cxlr,
+			  struct cxl_endpoint_decoder *cxled)
+{
+	u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa;
+	struct cxl_region_params *p = &cxlr->params;
+	int pos = cxled->pos;
+	u16 eig = 0;
+	u8 eiw = 0;
+
+	ways_to_eiw(p->interleave_ways, &eiw);
+	granularity_to_eig(p->interleave_granularity, &eig);
+
+	/*
+	 * The device position in the region interleave set was removed
+	 * from the offset at HPA->DPA translation. To reconstruct the
+	 * HPA, place the 'pos' in the offset.
+	 *
+	 * The placement of 'pos' in the HPA is determined by interleave
+	 * ways and granularity and is defined in the CXL Spec 3.0 Section
+	 * 8.2.4.19.13 Implementation Note: Device Decode Logic
+	 */
+
+	/* Remove the dpa base */
+	dpa_offset = dpa - cxl_dpa_resource_start(cxled);
+
+	mask_upper = GENMASK_ULL(51, eig + 8);
+	/* NOTE(review): eiw >= 8 presumably encodes 3/6/12-way sets - confirm */
+	if (eiw < 8) {
+		hpa_offset = (dpa_offset & mask_upper) << eiw;
+		hpa_offset |= pos << (eig + 8);
+	} else {
+		bits_upper = (dpa_offset & mask_upper) >> (eig + 8);
+		bits_upper = bits_upper * 3;
+		hpa_offset = ((bits_upper << (eiw - 8)) + pos) << (eig + 8);
+	}
+
+	/* The lower bits remain unchanged */
+	hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0);
+
+	/* Apply the hpa_offset to the region base address */
+	hpa = hpa_offset + p->res->start;
+	/* ULLONG_MAX flags a result that failed the range/position check */
+	if (!cxl_is_hpa_in_range(hpa, cxlr, cxled->pos))
+		return ULLONG_MAX;
+
+	return hpa;
+}
+
+u64 cxl_trace_hpa(struct cxl_region *cxlr, struct cxl_memdev *cxlmd,
+		  u64 dpa)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_endpoint_decoder *cxled = NULL;
+	/* Find the region target decoder that belongs to @cxlmd */
+	for (int i = 0; i <  p->nr_targets; i++) {
+		cxled = p->targets[i];
+		if (cxlmd == cxled_to_memdev(cxled))
+			break;
+	}
+	if (!cxled || cxlmd != cxled_to_memdev(cxled)) /* no targets or no match */
+		return ULLONG_MAX;
+
+	return cxl_dpa_to_hpa(dpa, cxlr, cxled);
+}
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
new file mode 100644
index 0000000000..a0b5819bc7
--- /dev/null
+++ b/drivers/cxl/core/trace.h
@@ -0,0 +1,709 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cxl
+
+#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _CXL_EVENTS_H
+
+#include <linux/tracepoint.h>
+#include <linux/pci.h>
+#include <asm-generic/unaligned.h>
+
+#include <cxl.h>
+#include <cxlmem.h>
+#include "core.h"
+
+#define CXL_RAS_UC_CACHE_DATA_PARITY	BIT(0)
+#define CXL_RAS_UC_CACHE_ADDR_PARITY	BIT(1)
+#define CXL_RAS_UC_CACHE_BE_PARITY	BIT(2)
+#define CXL_RAS_UC_CACHE_DATA_ECC	BIT(3)
+#define CXL_RAS_UC_MEM_DATA_PARITY	BIT(4)
+#define CXL_RAS_UC_MEM_ADDR_PARITY	BIT(5)
+#define CXL_RAS_UC_MEM_BE_PARITY	BIT(6)
+#define CXL_RAS_UC_MEM_DATA_ECC		BIT(7)
+#define CXL_RAS_UC_REINIT_THRESH	BIT(8)
+#define CXL_RAS_UC_RSVD_ENCODE		BIT(9)
+#define CXL_RAS_UC_POISON		BIT(10)
+#define CXL_RAS_UC_RECV_OVERFLOW	BIT(11)
+#define CXL_RAS_UC_INTERNAL_ERR		BIT(14)
+#define CXL_RAS_UC_IDE_TX_ERR		BIT(15)
+#define CXL_RAS_UC_IDE_RX_ERR		BIT(16)
+/* Bit-to-name table passed to __print_flags() with " | " as the delimiter */
+#define show_uc_errs(status)	__print_flags(status, " | ",		  \
+	{ CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" },	  \
+	{ CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" },	  \
+	{ CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" }, \
+	{ CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" },		  \
+	{ CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" },	  \
+	{ CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error" },	  \
+	{ CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error" },  \
+	{ CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" },		  \
+	{ CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" },		  \
+	{ CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" },	  \
+	{ CXL_RAS_UC_POISON, "Received Poison From Peer" },		  \
+	{ CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" },		  \
+	{ CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" },	  \
+	{ CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" },			  \
+	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
+)
+
+TRACE_EVENT(cxl_aer_uncorrectable_error,
+	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
+	TP_ARGS(cxlmd, status, fe, hl),
+	TP_STRUCT__entry(
+		__string(memdev, dev_name(&cxlmd->dev))
+		__string(host, dev_name(cxlmd->dev.parent))
+		__field(u64, serial)	/* NOTE(review): u64 shown via %lld */
+		__field(u32, status)
+		__field(u32, first_error)	/* the 'fe' argument */
+		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
+	),
+	TP_fast_assign(
+		__assign_str(memdev, dev_name(&cxlmd->dev));
+		__assign_str(host, dev_name(cxlmd->dev.parent));
+		__entry->serial = cxlmd->cxlds->serial;
+		__entry->status = status;
+		__entry->first_error = fe;
+		/*
+		 * Copy CXL_HEADERLOG_SIZE bytes (512B) of header log into the
+		 * record for user space to parse; TP_printk() below omits it.
+		 */
+		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
+	),
+	TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error: '%s'",
+		  __get_str(memdev), __get_str(host), __entry->serial,
+		  show_uc_errs(__entry->status),
+		  show_uc_errs(__entry->first_error)
+	)
+);
+
+#define CXL_RAS_CE_CACHE_DATA_ECC	BIT(0)
+#define CXL_RAS_CE_MEM_DATA_ECC		BIT(1)
+#define CXL_RAS_CE_CRC_THRESH		BIT(2)
+#define CLX_RAS_CE_RETRY_THRESH		BIT(3)	/* sic: "CLX" spelling */
+#define CXL_RAS_CE_CACHE_POISON		BIT(4)
+#define CXL_RAS_CE_MEM_POISON		BIT(5)
+#define CXL_RAS_CE_PHYS_LAYER_ERR	BIT(6)
+/* Bit-to-name table passed to __print_flags() with " | " as the delimiter */
+#define show_ce_errs(status)	__print_flags(status, " | ",			\
+	{ CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" },			\
+	{ CXL_RAS_CE_MEM_DATA_ECC, "Memory Data ECC Error" },			\
+	{ CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" },				\
+	{ CLX_RAS_CE_RETRY_THRESH, "Retry Threshold" },				\
+	{ CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer" },		\
+	{ CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer" },		\
+	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" }	\
+)
+
+TRACE_EVENT(cxl_aer_correctable_error,
+	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
+	TP_ARGS(cxlmd, status),
+	TP_STRUCT__entry(
+		__string(memdev, dev_name(&cxlmd->dev))
+		__string(host, dev_name(cxlmd->dev.parent))
+		__field(u64, serial)	/* NOTE(review): u64 shown via %lld */
+		__field(u32, status)	/* decoded with show_ce_errs() */
+	),
+	TP_fast_assign(
+		__assign_str(memdev, dev_name(&cxlmd->dev));
+		__assign_str(host, dev_name(cxlmd->dev.parent));
+		__entry->serial = cxlmd->cxlds->serial;
+		__entry->status = status;
+	),
+	TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
+		  __get_str(memdev), __get_str(host), __entry->serial,
+		  show_ce_errs(__entry->status)
+	)
+);
+
+#define cxl_event_log_type_str(type)				\
+	__print_symbolic(type,					\
+		{ CXL_EVENT_TYPE_INFO, "Informational" },	\
+		{ CXL_EVENT_TYPE_WARN, "Warning" },		\
+		{ CXL_EVENT_TYPE_FAIL, "Failure" },		\
+		{ CXL_EVENT_TYPE_FATAL, "Fatal" })	/* log type -> name */
+
+TRACE_EVENT(cxl_overflow,
+	/* Records a log's overflow count and first/last overflow timestamps */
+	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+		 struct cxl_get_event_payload *payload),
+
+	TP_ARGS(cxlmd, log, payload),
+
+	TP_STRUCT__entry(
+		__string(memdev, dev_name(&cxlmd->dev))
+		__string(host, dev_name(cxlmd->dev.parent))
+		__field(int, log)
+		__field(u64, serial)
+		__field(u64, first_ts)
+		__field(u64, last_ts)
+		__field(u16, count)
+	),
+	/* The payload fields are little-endian and converted to CPU order */
+	TP_fast_assign(
+		__assign_str(memdev, dev_name(&cxlmd->dev));
+		__assign_str(host, dev_name(cxlmd->dev.parent));
+		__entry->serial = cxlmd->cxlds->serial;
+		__entry->log = log;
+		__entry->count = le16_to_cpu(payload->overflow_err_count);
+		__entry->first_ts = le64_to_cpu(payload->first_overflow_timestamp);
+		__entry->last_ts = le64_to_cpu(payload->last_overflow_timestamp);
+	),
+
+	TP_printk("memdev=%s host=%s serial=%lld: log=%s : %u records from %llu to %llu",
+		__get_str(memdev), __get_str(host), __entry->serial,
+		cxl_event_log_type_str(__entry->log), __entry->count,
+		__entry->first_ts, __entry->last_ts)
+
+);
+
+/*
+ * Common Event Record Format flag bits, named by show_hdr_flags()
+ * CXL 3.0 section 8.2.9.2.1; Table 8-42
+ */
+#define CXL_EVENT_RECORD_FLAG_PERMANENT		BIT(2)
+#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED	BIT(3)
+#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED	BIT(4)
+#define CXL_EVENT_RECORD_FLAG_HW_REPLACE	BIT(5)
+#define show_hdr_flags(flags)	__print_flags(flags, " | ",			   \
+	{ CXL_EVENT_RECORD_FLAG_PERMANENT,	"PERMANENT_CONDITION"		}, \
+	{ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,	"MAINTENANCE_NEEDED"		}, \
+	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"PERFORMANCE_DEGRADED"		}, \
+	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"HARDWARE_REPLACEMENT_NEEDED"	}  \
+)
+
+/*
+ * Define macros for the common header of each CXL event.
+ *
+ * Tracepoints using these macros must do 3 things:
+ *
+ *	1) Add CXL_EVT_TP_entry to TP_STRUCT__entry; it reads 'cxlmd' directly
+ *	2) Use CXL_EVT_TP_fast_assign within TP_fast_assign;
+ *	   pass the memdev, log, and CXL event header
+ *	3) Use CXL_EVT_TP_printk() instead of TP_printk()
+ *
+ * See the cxl_generic_event tracepoint as an example.
+ */
+#define CXL_EVT_TP_entry					\
+	__string(memdev, dev_name(&cxlmd->dev))			\
+	__string(host, dev_name(cxlmd->dev.parent))		\
+	__field(int, log)					\
+	__field_struct(uuid_t, hdr_uuid)			\
+	__field(u64, serial)					\
+	__field(u32, hdr_flags)					\
+	__field(u16, hdr_handle)				\
+	__field(u16, hdr_related_handle)			\
+	__field(u64, hdr_timestamp)				\
+	__field(u8, hdr_length)					\
+	__field(u8, hdr_maint_op_class)
+
+#define CXL_EVT_TP_fast_assign(cxlmd, l, hdr)					\
+	__assign_str(memdev, dev_name(&(cxlmd)->dev));				\
+	__assign_str(host, dev_name((cxlmd)->dev.parent));			\
+	__entry->log = (l);							\
+	__entry->serial = (cxlmd)->cxlds->serial;				\
+	memcpy(&__entry->hdr_uuid, &(hdr).id, sizeof(uuid_t));			\
+	__entry->hdr_length = (hdr).length;					\
+	__entry->hdr_flags = get_unaligned_le24((hdr).flags);			\
+	__entry->hdr_handle = le16_to_cpu((hdr).handle);			\
+	__entry->hdr_related_handle = le16_to_cpu((hdr).related_handle);	\
+	__entry->hdr_timestamp = le64_to_cpu((hdr).timestamp);			\
+	__entry->hdr_maint_op_class = (hdr).maint_op_class /* caller adds ';' */
+
+#define CXL_EVT_TP_printk(fmt, ...) \
+	TP_printk("memdev=%s host=%s serial=%lld log=%s : time=%llu uuid=%pUb "	\
+		"len=%d flags='%s' handle=%x related_handle=%x "		\
+		"maint_op_class=%u : " fmt,					\
+		__get_str(memdev), __get_str(host), __entry->serial,		\
+		cxl_event_log_type_str(__entry->log),				\
+		__entry->hdr_timestamp, &__entry->hdr_uuid, __entry->hdr_length,\
+		show_hdr_flags(__entry->hdr_flags), __entry->hdr_handle,	\
+		__entry->hdr_related_handle, __entry->hdr_maint_op_class,	\
+		##__VA_ARGS__)	/* arguments consumed by the appended 'fmt' */
+
+TRACE_EVENT(cxl_generic_event,
+	/* Common header plus the record's data bytes, printed as raw hex */
+	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+		 struct cxl_event_record_raw *rec),
+
+	TP_ARGS(cxlmd, log, rec),
+
+	TP_STRUCT__entry(
+		CXL_EVT_TP_entry
+		__array(u8, data, CXL_EVENT_RECORD_DATA_LENGTH)
+	),
+
+	TP_fast_assign(
+		CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr);
+		memcpy(__entry->data, &rec->data, CXL_EVENT_RECORD_DATA_LENGTH);
+	),
+
+	CXL_EVT_TP_printk("%s",
+		__print_hex(__entry->data, CXL_EVENT_RECORD_DATA_LENGTH))
+);
+
+/*
+ * Physical Address field masks
+ *
+ * General Media Event Record
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ *
+ * DRAM Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ */
+#define CXL_DPA_FLAGS_MASK			0x3F
+#define CXL_DPA_MASK				(~(u64)CXL_DPA_FLAGS_MASK)
+/* Named flag bits; the u64 cast above makes CXL_DPA_MASK 64 bits wide */
+#define CXL_DPA_VOLATILE			BIT(0)
+#define CXL_DPA_NOT_REPAIRABLE			BIT(1)
+#define show_dpa_flags(flags)	__print_flags(flags, "|",		   \
+	{ CXL_DPA_VOLATILE,			"VOLATILE"		}, \
+	{ CXL_DPA_NOT_REPAIRABLE,		"NOT_REPAIRABLE"	}  \
+)
+
+/*
+ * General Media Event Record - GMER; CXL rev 3.0 Section 8.2.9.2.1.1;
+ * Table 8-43. Decode tables for the descriptor, type, transaction type
+ * and validity flags fields (the cxl_dram tracepoint reuses the first 3).
+ */
+#define CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT		BIT(0)
+#define CXL_GMER_EVT_DESC_THRESHOLD_EVENT		BIT(1)
+#define CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW		BIT(2)
+#define show_event_desc_flags(flags)	__print_flags(flags, "|",		   \
+	{ CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT,		"UNCORRECTABLE_EVENT"	}, \
+	{ CXL_GMER_EVT_DESC_THRESHOLD_EVENT,		"THRESHOLD_EVENT"	}, \
+	{ CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW,	"POISON_LIST_OVERFLOW"	}  \
+)
+/* Memory event type values and their names */
+#define CXL_GMER_MEM_EVT_TYPE_ECC_ERROR			0x00
+#define CXL_GMER_MEM_EVT_TYPE_INV_ADDR			0x01
+#define CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR		0x02
+#define show_mem_event_type(type)	__print_symbolic(type,			\
+	{ CXL_GMER_MEM_EVT_TYPE_ECC_ERROR,		"ECC Error" },		\
+	{ CXL_GMER_MEM_EVT_TYPE_INV_ADDR,		"Invalid Address" },	\
+	{ CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR,	"Data Path Error" }	\
+)
+/* Transaction type values and their names */
+#define CXL_GMER_TRANS_UNKNOWN				0x00
+#define CXL_GMER_TRANS_HOST_READ			0x01
+#define CXL_GMER_TRANS_HOST_WRITE			0x02
+#define CXL_GMER_TRANS_HOST_SCAN_MEDIA			0x03
+#define CXL_GMER_TRANS_HOST_INJECT_POISON		0x04
+#define CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB		0x05
+#define CXL_GMER_TRANS_INTERNAL_MEDIA_MANAGEMENT	0x06
+#define show_trans_type(type)	__print_symbolic(type,					\
+	{ CXL_GMER_TRANS_UNKNOWN,			"Unknown" },			\
+	{ CXL_GMER_TRANS_HOST_READ,			"Host Read" },			\
+	{ CXL_GMER_TRANS_HOST_WRITE,			"Host Write" },			\
+	{ CXL_GMER_TRANS_HOST_SCAN_MEDIA,		"Host Scan Media" },		\
+	{ CXL_GMER_TRANS_HOST_INJECT_POISON,		"Host Inject Poison" },		\
+	{ CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB,		"Internal Media Scrub" },	\
+	{ CXL_GMER_TRANS_INTERNAL_MEDIA_MANAGEMENT,	"Internal Media Management" }	\
+)
+/* Validity flag bits and their names */
+#define CXL_GMER_VALID_CHANNEL				BIT(0)
+#define CXL_GMER_VALID_RANK				BIT(1)
+#define CXL_GMER_VALID_DEVICE				BIT(2)
+#define CXL_GMER_VALID_COMPONENT			BIT(3)
+#define show_valid_flags(flags)	__print_flags(flags, "|",		   \
+	{ CXL_GMER_VALID_CHANNEL,			"CHANNEL"	}, \
+	{ CXL_GMER_VALID_RANK,				"RANK"		}, \
+	{ CXL_GMER_VALID_DEVICE,			"DEVICE"	}, \
+	{ CXL_GMER_VALID_COMPONENT,			"COMPONENT"	}  \
+)
+
+TRACE_EVENT(cxl_general_media,
+	/* Common event header followed by the General Media record fields */
+	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+		 struct cxl_event_gen_media *rec),
+
+	TP_ARGS(cxlmd, log, rec),
+
+	TP_STRUCT__entry(
+		CXL_EVT_TP_entry
+		/* General Media */
+		__field(u64, dpa)
+		__field(u8, descriptor)
+		__field(u8, type)
+		__field(u8, transaction_type)
+		__field(u8, channel)
+		__field(u32, device)
+		__array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
+		__field(u16, validity_flags)
+		/* Following are out of order to pack trace record */
+		__field(u8, rank)
+		__field(u8, dpa_flags)
+	),
+
+	TP_fast_assign(
+		CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr);
+
+		/* General Media: phys_addr holds the flags and the DPA */
+		__entry->dpa = le64_to_cpu(rec->phys_addr);
+		__entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK;
+		/* Mask after flags have been parsed */
+		__entry->dpa &= CXL_DPA_MASK;
+		__entry->descriptor = rec->descriptor;
+		__entry->type = rec->type;
+		__entry->transaction_type = rec->transaction_type;
+		__entry->channel = rec->channel;
+		__entry->rank = rec->rank;
+		__entry->device = get_unaligned_le24(rec->device);
+		memcpy(__entry->comp_id, &rec->component_id,
+			CXL_EVENT_GEN_MED_COMP_ID_SIZE);
+		__entry->validity_flags = get_unaligned_le16(&rec->validity_flags);
+	),
+
+	CXL_EVT_TP_printk("dpa=%llx dpa_flags='%s' " \
+		"descriptor='%s' type='%s' transaction_type='%s' channel=%u rank=%u " \
+		"device=%x comp_id=%s validity_flags='%s'",
+		__entry->dpa, show_dpa_flags(__entry->dpa_flags),
+		show_event_desc_flags(__entry->descriptor),
+		show_mem_event_type(__entry->type),
+		show_trans_type(__entry->transaction_type),
+		__entry->channel, __entry->rank, __entry->device,
+		__print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE),
+		show_valid_flags(__entry->validity_flags)
+	)
+);
+
+/*
+ * DRAM Event Record - DER
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ *
+ * DRAM Event Record defines many fields the same as the General Media Event
+ * Record, so the cxl_dram tracepoint reuses those decoders where it can.
+ * The validity flag bits below have their own CXL_DER_ names and are
+ * printed through show_dram_valid_flags().
+ */
+#define CXL_DER_VALID_CHANNEL				BIT(0)
+#define CXL_DER_VALID_RANK				BIT(1)
+#define CXL_DER_VALID_NIBBLE				BIT(2)
+#define CXL_DER_VALID_BANK_GROUP			BIT(3)
+#define CXL_DER_VALID_BANK				BIT(4)
+#define CXL_DER_VALID_ROW				BIT(5)
+#define CXL_DER_VALID_COLUMN				BIT(6)
+#define CXL_DER_VALID_CORRECTION_MASK			BIT(7)
+#define show_dram_valid_flags(flags)	__print_flags(flags, "|",			   \
+	{ CXL_DER_VALID_CHANNEL,			"CHANNEL"		}, \
+	{ CXL_DER_VALID_RANK,				"RANK"			}, \
+	{ CXL_DER_VALID_NIBBLE,				"NIBBLE"		}, \
+	{ CXL_DER_VALID_BANK_GROUP,			"BANK GROUP"		}, \
+	{ CXL_DER_VALID_BANK,				"BANK"			}, \
+	{ CXL_DER_VALID_ROW,				"ROW"			}, \
+	{ CXL_DER_VALID_COLUMN,				"COLUMN"		}, \
+	{ CXL_DER_VALID_CORRECTION_MASK,		"CORRECTION MASK"	}  \
+)
+
+TRACE_EVENT(cxl_dram,
+	/* Common event header followed by the DRAM record fields */
+	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+		 struct cxl_event_dram *rec),
+
+	TP_ARGS(cxlmd, log, rec),
+
+	TP_STRUCT__entry(
+		CXL_EVT_TP_entry
+		/* DRAM */
+		__field(u64, dpa)
+		__field(u8, descriptor)
+		__field(u8, type)
+		__field(u8, transaction_type)
+		__field(u8, channel)
+		__field(u16, validity_flags)
+		__field(u16, column)	/* Out of order to pack trace record */
+		__field(u32, nibble_mask)
+		__field(u32, row)
+		__array(u8, cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE)
+		__field(u8, rank)	/* Out of order to pack trace record */
+		__field(u8, bank_group)	/* Out of order to pack trace record */
+		__field(u8, bank)	/* Out of order to pack trace record */
+		__field(u8, dpa_flags)	/* Out of order to pack trace record */
+	),
+
+	TP_fast_assign(
+		CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr);
+
+		/* DRAM: split phys_addr into flags first, then mask the DPA */
+		__entry->dpa = le64_to_cpu(rec->phys_addr);
+		__entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK;
+		__entry->dpa &= CXL_DPA_MASK;
+		__entry->descriptor = rec->descriptor;
+		__entry->type = rec->type;
+		__entry->transaction_type = rec->transaction_type;
+		__entry->validity_flags = get_unaligned_le16(rec->validity_flags);
+		__entry->channel = rec->channel;
+		__entry->rank = rec->rank;
+		__entry->nibble_mask = get_unaligned_le24(rec->nibble_mask);
+		__entry->bank_group = rec->bank_group;
+		__entry->bank = rec->bank;
+		__entry->row = get_unaligned_le24(rec->row);
+		__entry->column = get_unaligned_le16(rec->column);
+		memcpy(__entry->cor_mask, &rec->correction_mask,
+			CXL_EVENT_DER_CORRECTION_MASK_SIZE);
+	),
+
+	CXL_EVT_TP_printk("dpa=%llx dpa_flags='%s' descriptor='%s' type='%s' " \
+		"transaction_type='%s' channel=%u rank=%u nibble_mask=%x " \
+		"bank_group=%u bank=%u row=%u column=%u cor_mask=%s " \
+		"validity_flags='%s'",
+		__entry->dpa, show_dpa_flags(__entry->dpa_flags),
+		show_event_desc_flags(__entry->descriptor),
+		show_mem_event_type(__entry->type),
+		show_trans_type(__entry->transaction_type),
+		__entry->channel, __entry->rank, __entry->nibble_mask,
+		__entry->bank_group, __entry->bank,
+		__entry->row, __entry->column,
+		__print_hex(__entry->cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE),
+		show_dram_valid_flags(__entry->validity_flags)
+	)
+);
+
+/*
+ * Memory Module Event Record - MMER
+ *
+ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
+ */
+#define CXL_MMER_HEALTH_STATUS_CHANGE		0x00
+#define CXL_MMER_MEDIA_STATUS_CHANGE		0x01
+#define CXL_MMER_LIFE_USED_CHANGE		0x02
+#define CXL_MMER_TEMP_CHANGE			0x03
+#define CXL_MMER_DATA_PATH_ERROR		0x04
+#define CXL_MMER_LSA_ERROR			0x05
+#define show_dev_evt_type(type)	__print_symbolic(type,			   \
+	{ CXL_MMER_HEALTH_STATUS_CHANGE,	"Health Status Change"	}, \
+	{ CXL_MMER_MEDIA_STATUS_CHANGE,		"Media Status Change"	}, \
+	{ CXL_MMER_LIFE_USED_CHANGE,		"Life Used Change"	}, \
+	{ CXL_MMER_TEMP_CHANGE,			"Temperature Change"	}, \
+	{ CXL_MMER_DATA_PATH_ERROR,		"Data Path Error"	}, \
+	{ CXL_MMER_LSA_ERROR,			"LSA Error"		}  \
+)
+
+/*
+ * Device Health Information - DHI
+ *
+ * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100
+ */
+#define CXL_DHI_HS_MAINTENANCE_NEEDED				BIT(0)
+#define CXL_DHI_HS_PERFORMANCE_DEGRADED				BIT(1)
+#define CXL_DHI_HS_HW_REPLACEMENT_NEEDED			BIT(2)
+#define show_health_status_flags(flags)	__print_flags(flags, "|",	   \
+	{ CXL_DHI_HS_MAINTENANCE_NEEDED,	"MAINTENANCE_NEEDED"	}, \
+	{ CXL_DHI_HS_PERFORMANCE_DEGRADED,	"PERFORMANCE_DEGRADED"	}, \
+	{ CXL_DHI_HS_HW_REPLACEMENT_NEEDED,	"REPLACEMENT_NEEDED"	}  \
+)
+/* Media status values and the names show_media_status() prints for them */
+#define CXL_DHI_MS_NORMAL							0x00
+#define CXL_DHI_MS_NOT_READY							0x01
+#define CXL_DHI_MS_WRITE_PERSISTENCY_LOST					0x02
+#define CXL_DHI_MS_ALL_DATA_LOST						0x03
+#define CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_POWER_LOSS			0x04
+#define CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_SHUTDOWN			0x05
+#define CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_IMMINENT				0x06
+#define CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_POWER_LOSS				0x07
+#define CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_SHUTDOWN				0x08
+#define CXL_DHI_MS_WRITE_ALL_DATA_LOSS_IMMINENT					0x09
+#define show_media_status(ms)	__print_symbolic(ms,			   \
+	{ CXL_DHI_MS_NORMAL,						   \
+		"Normal"						}, \
+	{ CXL_DHI_MS_NOT_READY,						   \
+		"Not Ready"						}, \
+	{ CXL_DHI_MS_WRITE_PERSISTENCY_LOST,				   \
+		"Write Persistency Lost"				}, \
+	{ CXL_DHI_MS_ALL_DATA_LOST,					   \
+		"All Data Lost"						}, \
+	{ CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_POWER_LOSS,		   \
+		"Write Persistency Loss in the Event of Power Loss"	}, \
+	{ CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_SHUTDOWN,		   \
+		"Write Persistency Loss in Event of Shutdown"		}, \
+	{ CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_IMMINENT,			   \
+		"Write Persistency Loss Imminent"			}, \
+	{ CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_POWER_LOSS,		   \
+		"All Data Loss in Event of Power Loss"			}, \
+	{ CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_SHUTDOWN,		   \
+		"All Data loss in the Event of Shutdown"		}, \
+	{ CXL_DHI_MS_WRITE_ALL_DATA_LOSS_IMMINENT,			   \
+		"All Data Loss Imminent"				}  \
+)
+
+#define CXL_DHI_AS_NORMAL		0x0
+#define CXL_DHI_AS_WARNING		0x1
+#define CXL_DHI_AS_CRITICAL		0x2
+#define show_two_bit_status(as) __print_symbolic(as,	   \
+	{ CXL_DHI_AS_NORMAL,		"Normal"	}, \
+	{ CXL_DHI_AS_WARNING,		"Warning"	}, \
+	{ CXL_DHI_AS_CRITICAL,		"Critical"	}  \
+)
+#define show_one_bit_status(as) __print_symbolic(as,	   \
+	{ CXL_DHI_AS_NORMAL,		"Normal"	}, \
+	{ CXL_DHI_AS_WARNING,		"Warning"	}  \
+)
+/* Extract bits [1:0], [3:2], 4 and 5 of 'as'; safe for expression arguments */
+#define CXL_DHI_AS_LIFE_USED(as)			((as) & 0x3)
+#define CXL_DHI_AS_DEV_TEMP(as)				(((as) & 0xC) >> 2)
+#define CXL_DHI_AS_COR_VOL_ERR_CNT(as)			(((as) & 0x10) >> 4)
+#define CXL_DHI_AS_COR_PER_ERR_CNT(as)			(((as) & 0x20) >> 5)
+
+TRACE_EVENT(cxl_memory_module,
+	/* Common event header, the event type, and the device health info */
+	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+		 struct cxl_event_mem_module *rec),
+
+	TP_ARGS(cxlmd, log, rec),
+
+	TP_STRUCT__entry(
+		CXL_EVT_TP_entry
+
+		/* Memory Module Event */
+		__field(u8, event_type)
+
+		/* Device Health Info */
+		__field(u8, health_status)
+		__field(u8, media_status)
+		__field(u8, life_used)
+		__field(u32, dirty_shutdown_cnt)
+		__field(u32, cor_vol_err_cnt)
+		__field(u32, cor_per_err_cnt)
+		__field(s16, device_temp)
+		__field(u8, add_status)	/* split by the CXL_DHI_AS_*() macros */
+	),
+
+	TP_fast_assign(
+		CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr);
+
+		/* Memory Module Event */
+		__entry->event_type = rec->event_type;
+
+		/* Device Health Info; multi-byte fields are unaligned LE */
+		__entry->health_status = rec->info.health_status;
+		__entry->media_status = rec->info.media_status;
+		__entry->life_used = rec->info.life_used;
+		__entry->dirty_shutdown_cnt = get_unaligned_le32(rec->info.dirty_shutdown_cnt);
+		__entry->cor_vol_err_cnt = get_unaligned_le32(rec->info.cor_vol_err_cnt);
+		__entry->cor_per_err_cnt = get_unaligned_le32(rec->info.cor_per_err_cnt);
+		__entry->device_temp = get_unaligned_le16(rec->info.device_temp);
+		__entry->add_status = rec->info.add_status;
+	),
+
+	CXL_EVT_TP_printk("event_type='%s' health_status='%s' media_status='%s' " \
+		"as_life_used=%s as_dev_temp=%s as_cor_vol_err_cnt=%s " \
+		"as_cor_per_err_cnt=%s life_used=%u device_temp=%d " \
+		"dirty_shutdown_cnt=%u cor_vol_err_cnt=%u cor_per_err_cnt=%u",
+		show_dev_evt_type(__entry->event_type),
+		show_health_status_flags(__entry->health_status),
+		show_media_status(__entry->media_status),
+		show_two_bit_status(CXL_DHI_AS_LIFE_USED(__entry->add_status)),
+		show_two_bit_status(CXL_DHI_AS_DEV_TEMP(__entry->add_status)),
+		show_one_bit_status(CXL_DHI_AS_COR_VOL_ERR_CNT(__entry->add_status)),
+		show_one_bit_status(CXL_DHI_AS_COR_PER_ERR_CNT(__entry->add_status)),
+		__entry->life_used, __entry->device_temp,
+		__entry->dirty_shutdown_cnt, __entry->cor_vol_err_cnt,
+		__entry->cor_per_err_cnt
+	)
+);
+
+#define show_poison_trace_type(type)			\
+	__print_symbolic(type,				\
+	{ CXL_POISON_TRACE_LIST,	"List"   },	\
+	{ CXL_POISON_TRACE_INJECT,	"Inject" },	\
+	{ CXL_POISON_TRACE_CLEAR,	"Clear"  })
+
+#define __show_poison_source(source)                          \
+	__print_symbolic(source,                              \
+		{ CXL_POISON_SOURCE_UNKNOWN,   "Unknown"  },  \
+		{ CXL_POISON_SOURCE_EXTERNAL,  "External" },  \
+		{ CXL_POISON_SOURCE_INTERNAL,  "Internal" },  \
+		{ CXL_POISON_SOURCE_INJECTED,  "Injected" },  \
+		{ CXL_POISON_SOURCE_VENDOR,    "Vendor"   })
+/* Values above INJECTED other than VENDOR print as "Reserved" */
+#define show_poison_source(source)			     \
+	((((source) > CXL_POISON_SOURCE_INJECTED) &&	     \
+	 ((source) != CXL_POISON_SOURCE_VENDOR)) ? "Reserved" \
+	 : __show_poison_source(source))
+
+#define show_poison_flags(flags)                             \
+	__print_flags(flags, "|",                            \
+		{ CXL_POISON_FLAG_MORE,      "More"     },   \
+		{ CXL_POISON_FLAG_OVERFLOW,  "Overflow"  },  \
+		{ CXL_POISON_FLAG_SCANNING,  "Scanning"  })
+/* Poison record accessors; arguments are parenthesized for expression use */
+#define __cxl_poison_addr(record)					\
+	(le64_to_cpu((record)->address))
+#define cxl_poison_record_dpa(record)					\
+	(__cxl_poison_addr(record) & CXL_POISON_START_MASK)
+#define cxl_poison_record_source(record)				\
+	(__cxl_poison_addr(record)  & CXL_POISON_SOURCE_MASK)
+#define cxl_poison_record_dpa_length(record)				\
+	(le32_to_cpu((record)->length) * CXL_POISON_LEN_MULT)
+#define cxl_poison_overflow(flags, time)				\
+	((flags) & CXL_POISON_FLAG_OVERFLOW ? le64_to_cpu(time) : 0)
+
+u64 cxl_trace_hpa(struct cxl_region *cxlr, struct cxl_memdev *memdev, u64 dpa);
+
+TRACE_EVENT(cxl_poison,
+	/* One poison record; region name, uuid and hpa are filled if mapped */
+	TP_PROTO(struct cxl_memdev *cxlmd, struct cxl_region *region,
+		 const struct cxl_poison_record *record, u8 flags,
+		 __le64 overflow_ts, enum cxl_poison_trace_type trace_type),
+
+	TP_ARGS(cxlmd, region, record, flags, overflow_ts, trace_type),
+	/* 'region' must be sized from the same string __assign_str() copies */
+	TP_STRUCT__entry(
+		__string(memdev, dev_name(&cxlmd->dev))
+		__string(host, dev_name(cxlmd->dev.parent))
+		__field(u64, serial)
+		__field(u8, trace_type)
+		__string(region, region ? dev_name(&region->dev) : "")
+		__field(u64, overflow_ts)
+		__field(u64, hpa)
+		__field(u64, dpa)
+		__field(u32, dpa_length)
+		__array(char, uuid, 16)
+		__field(u8, source)
+		__field(u8, flags)
+	    ),
+
+	TP_fast_assign(
+		__assign_str(memdev, dev_name(&cxlmd->dev));
+		__assign_str(host, dev_name(cxlmd->dev.parent));
+		__entry->serial = cxlmd->cxlds->serial;
+		__entry->overflow_ts = cxl_poison_overflow(flags, overflow_ts);
+		__entry->dpa = cxl_poison_record_dpa(record);
+		__entry->dpa_length = cxl_poison_record_dpa_length(record);
+		__entry->source = cxl_poison_record_source(record);
+		__entry->trace_type = trace_type;
+		__entry->flags = flags;
+		if (region) {
+			__assign_str(region, dev_name(&region->dev));
+			memcpy(__entry->uuid, &region->params.uuid, 16);
+			__entry->hpa = cxl_trace_hpa(region, cxlmd,
+						     __entry->dpa);
+		} else {
+			__assign_str(region, "");
+			memset(__entry->uuid, 0, 16);
+			__entry->hpa = ULLONG_MAX;
+		}
+	    ),
+
+	TP_printk("memdev=%s host=%s serial=%lld trace_type=%s region=%s "  \
+		"region_uuid=%pU hpa=0x%llx dpa=0x%llx dpa_length=0x%x "    \
+		"source=%s flags=%s overflow_time=%llu",
+		__get_str(memdev),
+		__get_str(host),
+		__entry->serial,
+		show_poison_trace_type(__entry->trace_type),
+		__get_str(region),
+		__entry->uuid,
+		__entry->hpa,
+		__entry->dpa,
+		__entry->dpa_length,
+		show_poison_source(__entry->source),
+		show_poison_flags(__entry->flags),
+		__entry->overflow_ts
+	)
+);
+
+#endif /* _CXL_EVENTS_H */
+
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
new file mode 100644
index 0000000000..de2c250c89
--- /dev/null
+++ b/drivers/cxl/cxl.h
@@ -0,0 +1,831 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020 Intel Corporation. */
+
+#ifndef __CXL_H__
+#define __CXL_H__
+
+#include <linux/libnvdimm.h>
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/log2.h>
+#include <linux/io.h>
+
+/**
+ * DOC: cxl objects
+ *
+ * The CXL core objects like ports, decoders, and regions are shared
+ * between the subsystem drivers cxl_acpi, cxl_pci, and core drivers
+ * (port-driver, region-driver, nvdimm object-drivers... etc).
+ */
+
+/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */
+#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K
+
+/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers */
+#define CXL_CM_OFFSET 0x1000
+#define CXL_CM_CAP_HDR_OFFSET 0x0
+#define   CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0)
+#define     CM_CAP_HDR_CAP_ID 1
+#define   CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16)
+#define     CM_CAP_HDR_CAP_VERSION 1
+#define   CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20)
+#define     CM_CAP_HDR_CACHE_MEM_VERSION 1
+#define   CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24)
+#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20)
+
+#define   CXL_CM_CAP_CAP_ID_RAS 0x2
+#define   CXL_CM_CAP_CAP_ID_HDM 0x5
+#define   CXL_CM_CAP_CAP_HDM_VERSION 1
+
+/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */
+#define CXL_HDM_DECODER_CAP_OFFSET 0x0
+#define   CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0)
+#define   CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4)
+#define   CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8)
+#define   CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9)
+#define CXL_HDM_DECODER_CTRL_OFFSET 0x4
+#define   CXL_HDM_DECODER_ENABLE BIT(1)
+#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10)
+#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14)
+#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18)
+#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c)
+#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20)
+#define   CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0)
+#define   CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4)
+#define   CXL_HDM_DECODER0_CTRL_LOCK BIT(8)
+#define   CXL_HDM_DECODER0_CTRL_COMMIT BIT(9)
+#define   CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10)
+#define   CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11)
+#define   CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12)
+#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24)
+#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28)
+#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i)
+#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i)
+
+/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */
+#define CXL_DECODER_MIN_GRANULARITY 256
+#define CXL_DECODER_MAX_ENCODED_IG 6
+
+static inline int cxl_hdm_decoder_count(u32 cap_hdr)
+{
+	int encoded = FIELD_GET(CXL_HDM_DECODER_COUNT_MASK, cap_hdr);
+
+	return encoded == 0 ? 1 : 2 * encoded; /* field 0 encodes one decoder */
+}
+
+/* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */
+static inline int eig_to_granularity(u16 eig, unsigned int *granularity)
+{
+	/* only encodings up to CXL_DECODER_MAX_ENCODED_IG are accepted */
+	if (eig <= CXL_DECODER_MAX_ENCODED_IG)
+		*granularity = CXL_DECODER_MIN_GRANULARITY << eig;
+	return eig <= CXL_DECODER_MAX_ENCODED_IG ? 0 : -EINVAL;
+}
+
+/* Encode defined in CXL ECN "3, 6, 12 and 16-way memory Interleaving" */
+static inline int eiw_to_ways(u8 eiw, unsigned int *ways)
+{
+	/* encodings 0..4 select the power-of-2 way counts 1, 2, 4, 8, 16 */
+	if (eiw <= 4) {
+		*ways = 1 << eiw;
+		return 0;
+	}
+
+	/* encodings 8..10 select the 3-based way counts 3, 6, 12 */
+	if (eiw >= 8 && eiw <= 10) {
+		*ways = 3 << (eiw - 8);
+		return 0;
+	}
+	return -EINVAL;
+}
+
+static inline int granularity_to_eig(int granularity, u16 *eig)
+{
+	if (granularity < CXL_DECODER_MIN_GRANULARITY ||
+	    granularity > SZ_16K || !is_power_of_2(granularity))
+		return -EINVAL;
+	*eig = ilog2(granularity) - 8; /* ilog2(256) == 8 encodes as 0 */
+	return 0;
+}
+
+static inline int ways_to_eiw(unsigned int ways, u8 *eiw)
+{
+	unsigned int third = ways / 3;
+
+	if (ways > 16)
+		return -EINVAL;
+	if (is_power_of_2(ways)) {
+		*eiw = ilog2(ways);
+		return 0;
+	}
+	/* otherwise only 3 times a power of 2 is valid, encoded from 8 up */
+	if (ways % 3 || !is_power_of_2(third))
+		return -EINVAL;
+	*eiw = ilog2(third) + 8;
+	return 0;
+}
+
+/* RAS Registers CXL 2.0 8.2.5.9 CXL RAS Capability Structure */
+#define CXL_RAS_UNCORRECTABLE_STATUS_OFFSET 0x0
+#define   CXL_RAS_UNCORRECTABLE_STATUS_MASK (GENMASK(16, 14) | GENMASK(11, 0))
+#define CXL_RAS_UNCORRECTABLE_MASK_OFFSET 0x4
+#define   CXL_RAS_UNCORRECTABLE_MASK_MASK (GENMASK(16, 14) | GENMASK(11, 0))
+#define   CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK BIT(8)
+#define CXL_RAS_UNCORRECTABLE_SEVERITY_OFFSET 0x8
+#define   CXL_RAS_UNCORRECTABLE_SEVERITY_MASK (GENMASK(16, 14) | GENMASK(11, 0))
+#define CXL_RAS_CORRECTABLE_STATUS_OFFSET 0xC
+#define   CXL_RAS_CORRECTABLE_STATUS_MASK GENMASK(6, 0)
+#define CXL_RAS_CORRECTABLE_MASK_OFFSET 0x10
+#define   CXL_RAS_CORRECTABLE_MASK_MASK GENMASK(6, 0)
+#define CXL_RAS_CAP_CONTROL_OFFSET 0x14
+#define CXL_RAS_CAP_CONTROL_FE_MASK GENMASK(5, 0)
+#define CXL_RAS_HEADER_LOG_OFFSET 0x18
+#define CXL_RAS_CAPABILITY_LENGTH 0x58
+#define CXL_HEADERLOG_SIZE SZ_512
+#define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(u32)) /* in u32 units */
+
+/* CXL 2.0 8.2.8.1 Device Capabilities Array Register */
+#define CXLDEV_CAP_ARRAY_OFFSET 0x0
+#define   CXLDEV_CAP_ARRAY_CAP_ID 0
+#define   CXLDEV_CAP_ARRAY_ID_MASK GENMASK_ULL(15, 0)
+#define   CXLDEV_CAP_ARRAY_COUNT_MASK GENMASK_ULL(47, 32)
+/* CXL 2.0 8.2.8.2 CXL Device Capability Header Register */
+#define CXLDEV_CAP_HDR_CAP_ID_MASK GENMASK(15, 0)
+/* CXL 2.0 8.2.8.2.1 CXL Device Capabilities */
+#define CXLDEV_CAP_CAP_ID_DEVICE_STATUS 0x1
+#define CXLDEV_CAP_CAP_ID_PRIMARY_MAILBOX 0x2
+#define CXLDEV_CAP_CAP_ID_SECONDARY_MAILBOX 0x3
+#define CXLDEV_CAP_CAP_ID_MEMDEV 0x4000
+
+/* CXL 3.0 8.2.8.3.1 Event Status Register */
+#define CXLDEV_DEV_EVENT_STATUS_OFFSET		0x00
+#define CXLDEV_EVENT_STATUS_INFO		BIT(0)
+#define CXLDEV_EVENT_STATUS_WARN		BIT(1)
+#define CXLDEV_EVENT_STATUS_FAIL		BIT(2)
+#define CXLDEV_EVENT_STATUS_FATAL		BIT(3)
+
+#define CXLDEV_EVENT_STATUS_ALL (CXLDEV_EVENT_STATUS_INFO |	\
+				 CXLDEV_EVENT_STATUS_WARN |	\
+				 CXLDEV_EVENT_STATUS_FAIL |	\
+				 CXLDEV_EVENT_STATUS_FATAL)
+
+/* CXL rev 3.0 section 8.2.9.2.4; Table 8-52 */
+#define CXLDEV_EVENT_INT_MODE_MASK	GENMASK(1, 0)
+#define CXLDEV_EVENT_INT_MSGNUM_MASK	GENMASK(7, 4)
+
+/* CXL 2.0 8.2.8.4 Mailbox Registers */
+#define CXLDEV_MBOX_CAPS_OFFSET 0x00
+#define   CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
+#define   CXLDEV_MBOX_CAP_BG_CMD_IRQ BIT(6)
+#define   CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK GENMASK(10, 7)
+#define CXLDEV_MBOX_CTRL_OFFSET 0x04
+#define   CXLDEV_MBOX_CTRL_DOORBELL BIT(0)
+#define   CXLDEV_MBOX_CTRL_BG_CMD_IRQ BIT(2)
+#define CXLDEV_MBOX_CMD_OFFSET 0x08
+#define   CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
+#define   CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK GENMASK_ULL(36, 16)
+#define CXLDEV_MBOX_STATUS_OFFSET 0x10
+#define   CXLDEV_MBOX_STATUS_BG_CMD BIT(0)
+#define   CXLDEV_MBOX_STATUS_RET_CODE_MASK GENMASK_ULL(47, 32)
+#define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
+#define   CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
+#define   CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK GENMASK_ULL(22, 16)
+#define   CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK GENMASK_ULL(47, 32)
+#define   CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK GENMASK_ULL(63, 48)
+#define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
+
+/*
+ * Using struct_group() allows for per register-block-type helper routines,
+ * without requiring block-type agnostic code to include the prefix.
+ */
+struct cxl_regs {
+	/*
+	 * Common set of CXL Component register block base pointers
+	 * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure
+	 * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure
+	 */
+	struct_group_tagged(cxl_component_regs, component,
+		void __iomem *hdm_decoder;
+		void __iomem *ras;
+	);
+	/*
+	 * Common set of CXL Device register block base pointers
+	 * @status: CXL 2.0 8.2.8.3 Device Status Registers
+	 * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers
+	 * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers
+	 */
+	struct_group_tagged(cxl_device_regs, device_regs,
+		void __iomem *status, *mbox, *memdev;
+	);
+
+	struct_group_tagged(cxl_pmu_regs, pmu_regs,
+		void __iomem *pmu;
+	);
+};
+
+struct cxl_reg_map {
+	bool valid;
+	int id;
+	unsigned long offset;
+	unsigned long size;
+};
+
+struct cxl_component_reg_map {
+	struct cxl_reg_map hdm_decoder;
+	struct cxl_reg_map ras;
+};
+
+struct cxl_device_reg_map {
+	struct cxl_reg_map status;
+	struct cxl_reg_map mbox;
+	struct cxl_reg_map memdev;
+};
+
+struct cxl_pmu_reg_map {
+	struct cxl_reg_map pmu;
+};
+
+/**
+ * struct cxl_register_map - DVSEC harvested register block mapping parameters
+ * @host: device for devm operations and logging
+ * @base: virtual base of the register-block-BAR + @block_offset
+ * @resource: physical resource base of the register block
+ * @max_size: maximum mapping size to perform register search
+ * @reg_type: see enum cxl_regloc_type
+ * @component_map: cxl_reg_map for component registers
+ * @device_map: cxl_reg_maps for device registers
+ * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units
+ */
+struct cxl_register_map {
+	struct device *host;
+	void __iomem *base;
+	resource_size_t resource;
+	resource_size_t max_size;
+	u8 reg_type;
+	union {
+		struct cxl_component_reg_map component_map;
+		struct cxl_device_reg_map device_map;
+		struct cxl_pmu_reg_map pmu_map;
+	};
+};
+
+void cxl_probe_component_regs(struct device *dev, void __iomem *base,
+			      struct cxl_component_reg_map *map);
+void cxl_probe_device_regs(struct device *dev, void __iomem *base,
+			   struct cxl_device_reg_map *map);
+int cxl_map_component_regs(const struct cxl_register_map *map,
+			   struct cxl_component_regs *regs,
+			   unsigned long map_mask);
+int cxl_map_device_regs(const struct cxl_register_map *map,
+			struct cxl_device_regs *regs);
+int cxl_map_pmu_regs(struct pci_dev *pdev, struct cxl_pmu_regs *regs,
+		     struct cxl_register_map *map);
+
+enum cxl_regloc_type;
+int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type);
+int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type,
+			       struct cxl_register_map *map, int index);
+int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type,
+		      struct cxl_register_map *map);
+int cxl_setup_regs(struct cxl_register_map *map);
+struct cxl_dport;
+resource_size_t cxl_rcd_component_reg_phys(struct device *dev,
+					   struct cxl_dport *dport);
+
+#define CXL_RESOURCE_NONE ((resource_size_t) -1)
+#define CXL_TARGET_STRLEN 20
+
+/*
+ * cxl_decoder flags that define the type of memory / devices this
+ * decoder supports as well as configuration lock status. See "CXL 2.0
+ * 8.2.5.12.7 CXL HDM Decoder 0 Control Register" for details.
+ * Additionally indicate whether decoder settings were autodetected or
+ * user customized.
+ */
+#define CXL_DECODER_F_RAM   BIT(0)
+#define CXL_DECODER_F_PMEM  BIT(1)
+#define CXL_DECODER_F_TYPE2 BIT(2)
+#define CXL_DECODER_F_TYPE3 BIT(3)
+#define CXL_DECODER_F_LOCK  BIT(4)
+#define CXL_DECODER_F_ENABLE    BIT(5)
+#define CXL_DECODER_F_MASK  GENMASK(5, 0)
+
+enum cxl_decoder_type {
+	CXL_DECODER_DEVMEM = 2,
+	CXL_DECODER_HOSTONLYMEM = 3,
+};
+
+/*
+ * Current specification goes up to 8, double that seems a reasonable
+ * software max for the foreseeable future
+ */
+#define CXL_DECODER_MAX_INTERLEAVE 16
+
+
+/**
+ * struct cxl_decoder - Common CXL HDM Decoder Attributes
+ * @dev: this decoder's device
+ * @id: kernel device name id
+ * @hpa_range: Host physical address range mapped by this decoder
+ * @interleave_ways: number of cxl_dports in this decode
+ * @interleave_granularity: data stride per dport
+ * @target_type: accelerator vs expander (type2 vs type3) selector
+ * @region: currently assigned region for this decoder
+ * @flags: memory type capabilities and locking
+ * @commit: device/decoder-type specific callback to commit settings to hw
+ * @reset: device/decoder-type specific callback to reset hw settings
+ */
+struct cxl_decoder {
+	struct device dev;
+	int id;
+	struct range hpa_range;
+	int interleave_ways;
+	int interleave_granularity;
+	enum cxl_decoder_type target_type;
+	struct cxl_region *region;
+	unsigned long flags;
+	int (*commit)(struct cxl_decoder *cxld);
+	int (*reset)(struct cxl_decoder *cxld);
+};
+
+/*
+ * CXL_DECODER_DEAD prevents endpoints from being reattached to regions
+ * while cxld_unregister() is running
+ */
+enum cxl_decoder_mode {
+	CXL_DECODER_NONE,
+	CXL_DECODER_RAM,
+	CXL_DECODER_PMEM,
+	CXL_DECODER_MIXED,
+	CXL_DECODER_DEAD,
+};
+
+static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode)
+{
+	switch (mode) {
+	case CXL_DECODER_NONE:
+		return "none";
+	case CXL_DECODER_RAM:
+		return "ram";
+	case CXL_DECODER_PMEM:
+		return "pmem";
+	default: /* CXL_DECODER_MIXED, CXL_DECODER_DEAD, or out of range */
+		return "mixed";
+	}
+}
+
+/*
+ * Track whether this decoder is reserved for region autodiscovery, or
+ * free for userspace provisioning.
+ */
+enum cxl_decoder_state {
+	CXL_DECODER_STATE_MANUAL,
+	CXL_DECODER_STATE_AUTO,
+};
+
+/**
+ * struct cxl_endpoint_decoder - Endpoint / SPA to DPA decoder
+ * @cxld: base cxl_decoder object
+ * @dpa_res: actively claimed DPA span of this decoder
+ * @skip: offset into @dpa_res where @cxld.hpa_range maps
+ * @mode: which memory type / access-mode-partition this decoder targets
+ * @state: autodiscovery state
+ * @pos: interleave position in @cxld.region
+ */
+struct cxl_endpoint_decoder {
+	struct cxl_decoder cxld;
+	struct resource *dpa_res;
+	resource_size_t skip;
+	enum cxl_decoder_mode mode;
+	enum cxl_decoder_state state;
+	int pos;
+};
+
+/**
+ * struct cxl_switch_decoder - Switch specific CXL HDM Decoder
+ * @cxld: base cxl_decoder object
+ * @nr_targets: number of elements in @target
+ * @target: active ordered target list in current decoder configuration
+ *
+ * The 'switch' decoder type represents the decoder instances of cxl_port's that
+ * route from the root of a CXL memory decode topology to the endpoints. They
+ * come in two flavors, root-level decoders, statically defined by platform
+ * firmware, and mid-level decoders, where interleave-granularity,
+ * interleave-width, and the target list are mutable.
+ */
+struct cxl_switch_decoder {
+	struct cxl_decoder cxld;
+	int nr_targets;
+	struct cxl_dport *target[];
+};
+
+struct cxl_root_decoder;
+typedef struct cxl_dport *(*cxl_calc_hb_fn)(struct cxl_root_decoder *cxlrd,
+					    int pos);
+
+/**
+ * struct cxl_root_decoder - Static platform CXL address decoder
+ * @res: host / parent resource for region allocations
+ * @region_id: region id for next region provisioning event
+ * @calc_hb: which host bridge covers the n'th position by granularity
+ * @platform_data: platform specific configuration data
+ * @range_lock: sync region autodiscovery by address range
+ * @cxlsd: base cxl switch decoder
+ */
+struct cxl_root_decoder {
+	struct resource *res;
+	atomic_t region_id;
+	cxl_calc_hb_fn calc_hb;
+	void *platform_data;
+	struct mutex range_lock;
+	struct cxl_switch_decoder cxlsd;
+};
+
+/*
+ * enum cxl_config_state - State machine for region configuration
+ * @CXL_CONFIG_IDLE: Any sysfs attribute can be written freely
+ * @CXL_CONFIG_INTERLEAVE_ACTIVE: region size has been set, no more
+ * changes to interleave_ways or interleave_granularity
+ * @CXL_CONFIG_ACTIVE: All targets have been added, the region is now
+ * active
+ * @CXL_CONFIG_RESET_PENDING: see commit_store()
+ * @CXL_CONFIG_COMMIT: Soft-config has been committed to hardware
+ */
+enum cxl_config_state {
+	CXL_CONFIG_IDLE,
+	CXL_CONFIG_INTERLEAVE_ACTIVE,
+	CXL_CONFIG_ACTIVE,
+	CXL_CONFIG_RESET_PENDING,
+	CXL_CONFIG_COMMIT,
+};
+
+/**
+ * struct cxl_region_params - region settings
+ * @state: allow the driver to lockdown further parameter changes
+ * @uuid: unique id for persistent regions
+ * @interleave_ways: number of endpoints in the region
+ * @interleave_granularity: capacity each endpoint contributes to a stripe
+ * @res: allocated iomem capacity for this region
+ * @targets: active ordered targets in current decoder configuration
+ * @nr_targets: number of targets
+ *
+ * State transitions are protected by the cxl_region_rwsem
+ */
+struct cxl_region_params {
+	enum cxl_config_state state;
+	uuid_t uuid;
+	int interleave_ways;
+	int interleave_granularity;
+	struct resource *res;
+	struct cxl_endpoint_decoder *targets[CXL_DECODER_MAX_INTERLEAVE];
+	int nr_targets;
+};
+
+/*
+ * Indicate whether this region has been assembled by autodetection or
+ * userspace assembly. Prevent endpoint decoders outside of automatic
+ * detection from being added to the region.
+ */
+#define CXL_REGION_F_AUTO 0
+
+/*
+ * Require that a committed region successfully complete a teardown once
+ * any of its associated decoders have been torn down. This maintains
+ * the commit state for the region since there are committed decoders,
+ * but blocks cxl_region_probe().
+ */
+#define CXL_REGION_F_NEEDS_RESET 1
+
+/**
+ * struct cxl_region - CXL region
+ * @dev: This region's device
+ * @id: This region's id. Id is globally unique across all regions
+ * @mode: Endpoint decoder allocation / access mode
+ * @type: Endpoint decoder target type
+ * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown
+ * @cxlr_pmem: (for pmem regions) the cxl_pmem_region object for this region
+ * @flags: Region state flags
+ * @params: active + config params for the region
+ */
+struct cxl_region {
+	struct device dev;
+	int id;
+	enum cxl_decoder_mode mode;
+	enum cxl_decoder_type type;
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct cxl_pmem_region *cxlr_pmem;
+	unsigned long flags;
+	struct cxl_region_params params;
+};
+
+struct cxl_nvdimm_bridge {
+	int id;
+	struct device dev;
+	struct cxl_port *port;
+	struct nvdimm_bus *nvdimm_bus;
+	struct nvdimm_bus_descriptor nd_desc;
+};
+
+#define CXL_DEV_ID_LEN 19
+
+struct cxl_nvdimm {
+	struct device dev;
+	struct cxl_memdev *cxlmd;
+	u8 dev_id[CXL_DEV_ID_LEN]; /* for nvdimm, string of 'serial' */
+};
+
+struct cxl_pmem_region_mapping {
+	struct cxl_memdev *cxlmd;
+	struct cxl_nvdimm *cxl_nvd;
+	u64 start;
+	u64 size;
+	int position;
+};
+
+struct cxl_pmem_region {
+	struct device dev;
+	struct cxl_region *cxlr;
+	struct nd_region *nd_region;
+	struct range hpa_range;
+	int nr_mappings;
+	struct cxl_pmem_region_mapping mapping[];
+};
+
+struct cxl_dax_region {
+	struct device dev;
+	struct cxl_region *cxlr;
+	struct range hpa_range;
+};
+
+/**
+ * struct cxl_port - logical collection of upstream port devices and
+ *		     downstream port devices to construct a CXL memory
+ *		     decode hierarchy.
+ * @dev: this port's device
+ * @uport_dev: PCI or platform device implementing the upstream port capability
+ * @host_bridge: Shortcut to the platform attach point for this port
+ * @id: id for port device-name
+ * @dports: cxl_dport instances referenced by decoders
+ * @endpoints: cxl_ep instances, endpoints that are a descendant of this port
+ * @regions: cxl_region_ref instances, regions mapped by this port
+ * @parent_dport: dport that points to this port in the parent
+ * @decoder_ida: allocator for decoder ids
+ * @comp_map: component register capability mappings
+ * @nr_dports: number of entries in @dports
+ * @hdm_end: track last allocated HDM decoder instance for allocation ordering
+ * @commit_end: cursor to track highest committed decoder for commit ordering
+ * @component_reg_phys: component register capability base address (optional)
+ * @dead: last ep has been removed, force port re-creation
+ * @depth: How deep this port is relative to the root. depth 0 is the root.
+ * @cdat: Cached CDAT data
+ * @cdat_available: Should a CDAT attribute be available in sysfs
+ */
+struct cxl_port {
+	struct device dev;
+	struct device *uport_dev;
+	struct device *host_bridge;
+	int id;
+	struct xarray dports;
+	struct xarray endpoints;
+	struct xarray regions;
+	struct cxl_dport *parent_dport;
+	struct ida decoder_ida;
+	struct cxl_register_map comp_map;
+	int nr_dports;
+	int hdm_end;
+	int commit_end;
+	resource_size_t component_reg_phys;
+	bool dead;
+	unsigned int depth;
+	struct cxl_cdat {
+		void *table;
+		size_t length;
+	} cdat;
+	bool cdat_available;
+};
+
+static inline struct cxl_dport *
+cxl_find_dport_by_dev(struct cxl_port *port, const struct device *dport_dev)
+{
+	return xa_load(&port->dports, (unsigned long)dport_dev); /* index is the dev pointer value */
+}
+
+struct cxl_rcrb_info {
+	resource_size_t base;
+	u16 aer_cap;
+};
+
+/**
+ * struct cxl_dport - CXL downstream port
+ * @dport_dev: PCI bridge or firmware device representing the downstream link
+ * @comp_map: component register capability mappings
+ * @port_id: unique hardware identifier for dport in decoder target list
+ * @rcrb: Data about the Root Complex Register Block layout
+ * @rch: Indicate whether this dport was enumerated in RCH or VH mode
+ * @port: reference to cxl_port that contains this downstream port
+ */
+struct cxl_dport {
+	struct device *dport_dev;
+	struct cxl_register_map comp_map;
+	int port_id;
+	struct cxl_rcrb_info rcrb;
+	bool rch;
+	struct cxl_port *port;
+};
+
+/**
+ * struct cxl_ep - track an endpoint's interest in a port
+ * @ep: device that hosts a generic CXL endpoint (expander or accelerator)
+ * @dport: which dport routes to this endpoint on @port
+ * @next: cxl switch port across the link attached to @dport, NULL if
+ *	  attached to an endpoint
+ */
+struct cxl_ep {
+	struct device *ep;
+	struct cxl_dport *dport;
+	struct cxl_port *next;
+};
+
+/**
+ * struct cxl_region_ref - track a region's interest in a port
+ * @port: point in topology to install this reference
+ * @decoder: decoder assigned for @region in @port
+ * @region: region for this reference
+ * @endpoints: cxl_ep references for region members beneath @port
+ * @nr_targets_set: track how many targets have been programmed during setup
+ * @nr_eps: number of endpoints beneath @port
+ * @nr_targets: number of distinct targets needed to reach @nr_eps
+ */
+struct cxl_region_ref {
+	struct cxl_port *port;
+	struct cxl_decoder *decoder;
+	struct cxl_region *region;
+	struct xarray endpoints;
+	int nr_targets_set;
+	int nr_eps;
+	int nr_targets;
+};
+
+/*
+ * The platform firmware device hosting the root is also the top of the
+ * CXL port topology. All other CXL ports have another CXL port as their
+ * parent and their ->uport_dev / host device is out-of-line of the port
+ * ancestry.
+ */
+static inline bool is_cxl_root(struct cxl_port *port)
+{
+	return port->dev.parent == port->uport_dev; /* root: parent is the uport */
+}
+
+int cxl_num_decoders_committed(struct cxl_port *port);
+bool is_cxl_port(const struct device *dev);
+struct cxl_port *to_cxl_port(const struct device *dev);
+struct pci_bus;
+int devm_cxl_register_pci_bus(struct device *host, struct device *uport_dev,
+			      struct pci_bus *bus);
+struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port);
+struct cxl_port *devm_cxl_add_port(struct device *host,
+				   struct device *uport_dev,
+				   resource_size_t component_reg_phys,
+				   struct cxl_dport *parent_dport);
+struct cxl_port *find_cxl_root(struct cxl_port *port);
+int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd);
+void cxl_bus_rescan(void);
+void cxl_bus_drain(void);
+struct cxl_port *cxl_pci_find_port(struct pci_dev *pdev,
+				   struct cxl_dport **dport);
+struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd,
+				   struct cxl_dport **dport);
+bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd);
+
+struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port,
+				     struct device *dport, int port_id,
+				     resource_size_t component_reg_phys);
+struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port,
+					 struct device *dport_dev, int port_id,
+					 resource_size_t rcrb);
+
+struct cxl_decoder *to_cxl_decoder(struct device *dev);
+struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev);
+struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev);
+struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev);
+bool is_root_decoder(struct device *dev);
+bool is_switch_decoder(struct device *dev);
+bool is_endpoint_decoder(struct device *dev);
+struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
+						unsigned int nr_targets,
+						cxl_calc_hb_fn calc_hb);
+struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos);
+struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port,
+						    unsigned int nr_targets);
+int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map);
+struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port);
+int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map);
+int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld);
+int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint);
+
+/**
+ * struct cxl_endpoint_dvsec_info - Cached DVSEC info
+ * @mem_enabled: cached value of mem_enabled in the DVSEC at init time
+ * @ranges: Number of active HDM ranges this device uses.
+ * @port: endpoint port associated with this info instance
+ * @dvsec_range: cached attributes of the ranges in the DVSEC, PCIE_DEVICE
+ */
+struct cxl_endpoint_dvsec_info {
+	bool mem_enabled;
+	int ranges;
+	struct cxl_port *port;
+	struct range dvsec_range[2];
+};
+
+struct cxl_hdm;
+struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port,
+				   struct cxl_endpoint_dvsec_info *info);
+int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
+				struct cxl_endpoint_dvsec_info *info);
+int devm_cxl_add_passthrough_decoder(struct cxl_port *port);
+int cxl_dvsec_rr_decode(struct device *dev, int dvsec,
+			struct cxl_endpoint_dvsec_info *info);
+
+bool is_cxl_region(struct device *dev);
+
+extern struct bus_type cxl_bus_type;
+
+struct cxl_driver {
+	const char *name;
+	int (*probe)(struct device *dev);
+	void (*remove)(struct device *dev);
+	struct device_driver drv;
+	int id;
+};
+
+static inline struct cxl_driver *to_cxl_drv(struct device_driver *drv)
+{
+	return container_of(drv, struct cxl_driver, drv); /* @drv is embedded in cxl_driver */
+}
+
+int __cxl_driver_register(struct cxl_driver *cxl_drv, struct module *owner,
+			  const char *modname);
+#define cxl_driver_register(x) __cxl_driver_register(x, THIS_MODULE, KBUILD_MODNAME)
+void cxl_driver_unregister(struct cxl_driver *cxl_drv);
+
+#define module_cxl_driver(__cxl_driver) \
+	module_driver(__cxl_driver, cxl_driver_register, cxl_driver_unregister)
+
+#define CXL_DEVICE_NVDIMM_BRIDGE	1
+#define CXL_DEVICE_NVDIMM		2
+#define CXL_DEVICE_PORT			3
+#define CXL_DEVICE_ROOT			4
+#define CXL_DEVICE_MEMORY_EXPANDER	5
+#define CXL_DEVICE_REGION		6
+#define CXL_DEVICE_PMEM_REGION		7
+#define CXL_DEVICE_DAX_REGION		8
+#define CXL_DEVICE_PMU			9
+
+#define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*")
+#define CXL_MODALIAS_FMT "cxl:t%d"
+
+struct cxl_nvdimm_bridge *to_cxl_nvdimm_bridge(struct device *dev);
+struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host,
+						     struct cxl_port *port);
+struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev);
+bool is_cxl_nvdimm(struct device *dev);
+bool is_cxl_nvdimm_bridge(struct device *dev);
+int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd);
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_memdev *cxlmd);
+
+#ifdef CONFIG_CXL_REGION
+bool is_cxl_pmem_region(struct device *dev);
+struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev);
+int cxl_add_to_region(struct cxl_port *root,
+		      struct cxl_endpoint_decoder *cxled);
+struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
+#else
+static inline bool is_cxl_pmem_region(struct device *dev)
+{
+	return false;
+}
+static inline struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
+{
+	return NULL;
+}
+static inline int cxl_add_to_region(struct cxl_port *root,
+				    struct cxl_endpoint_decoder *cxled)
+{
+	return 0;
+}
+static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev)
+{
+	return NULL;
+}
+#endif
+
+/*
+ * Unit test builds override this to __weak, find the 'strong' version
+ * of these symbols in tools/testing/cxl/.
+ */
+#ifndef __mock
+#define __mock static
+#endif
+
+#endif /* __CXL_H__ */
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
new file mode 100644
index 0000000000..6933bc20e7
--- /dev/null
+++ b/drivers/cxl/cxlmem.h
@@ -0,0 +1,902 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020-2021 Intel Corporation. */
+#ifndef __CXL_MEM_H__
+#define __CXL_MEM_H__
+#include <uapi/linux/cxl_mem.h>
+#include <linux/cdev.h>
+#include <linux/uuid.h>
+#include <linux/rcuwait.h>
+#include "cxl.h"
+
+/* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
+#define CXLMDEV_STATUS_OFFSET 0x0
+#define   CXLMDEV_DEV_FATAL BIT(0)
+#define   CXLMDEV_FW_HALT BIT(1)
+#define   CXLMDEV_STATUS_MEDIA_STATUS_MASK GENMASK(3, 2)
+#define     CXLMDEV_MS_NOT_READY 0
+#define     CXLMDEV_MS_READY 1
+#define     CXLMDEV_MS_ERROR 2
+#define     CXLMDEV_MS_DISABLED 3
+#define CXLMDEV_READY(status)                                                  \
+	(FIELD_GET(CXLMDEV_STATUS_MEDIA_STATUS_MASK, status) ==                \
+	 CXLMDEV_MS_READY)
+#define   CXLMDEV_MBOX_IF_READY BIT(4)
+#define   CXLMDEV_RESET_NEEDED_MASK GENMASK(7, 5)
+#define     CXLMDEV_RESET_NEEDED_NOT 0
+#define     CXLMDEV_RESET_NEEDED_COLD 1
+#define     CXLMDEV_RESET_NEEDED_WARM 2
+#define     CXLMDEV_RESET_NEEDED_HOT 3
+#define     CXLMDEV_RESET_NEEDED_CXL 4
+#define CXLMDEV_RESET_NEEDED(status)                                           \
+	(FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) !=                       \
+	 CXLMDEV_RESET_NEEDED_NOT)
+
+/**
+ * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device
+ * @dev: driver core device object
+ * @cdev: char dev core object for ioctl operations
+ * @cxlds: The device state backing this device
+ * @detach_work: active memdev lost a port in its ancestry
+ * @cxl_nvb: coordinate removal of @cxl_nvd if present
+ * @cxl_nvd: optional bridge to an nvdimm if the device supports pmem
+ * @endpoint: connection to the CXL port topology for this memory device
+ * @id: id number of this memdev instance.
+ * @depth: endpoint port depth
+ */
+struct cxl_memdev {
+	struct device dev;
+	struct cdev cdev;
+	struct cxl_dev_state *cxlds;
+	struct work_struct detach_work;
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct cxl_nvdimm *cxl_nvd;
+	struct cxl_port *endpoint;
+	int id;
+	int depth;
+};
+
+static inline struct cxl_memdev *to_cxl_memdev(struct device *dev)
+{
+	return container_of(dev, struct cxl_memdev, dev);
+}
+
+static inline struct cxl_port *cxled_to_port(struct cxl_endpoint_decoder *cxled)
+{
+	return to_cxl_port(cxled->cxld.dev.parent);
+}
+
+static inline struct cxl_port *cxlrd_to_port(struct cxl_root_decoder *cxlrd)
+{
+	return to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
+}
+
+static inline struct cxl_memdev *
+cxled_to_memdev(struct cxl_endpoint_decoder *cxled)
+{
+	struct device *uport_dev = cxled_to_port(cxled)->uport_dev;
+
+	return to_cxl_memdev(uport_dev);	/* memdev embeds this device */
+}
+
+bool is_cxl_memdev(const struct device *dev);
+static inline bool is_cxl_endpoint(struct cxl_port *port)
+{
+	return is_cxl_memdev(port->uport_dev);
+}
+
+struct cxl_memdev *devm_cxl_add_memdev(struct device *host,
+				       struct cxl_dev_state *cxlds);
+int devm_cxl_sanitize_setup_notifier(struct device *host,
+				     struct cxl_memdev *cxlmd);
+struct cxl_memdev_state;
+int devm_cxl_setup_fw_upload(struct device *host, struct cxl_memdev_state *mds);
+int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
+			 resource_size_t base, resource_size_t len,
+			 resource_size_t skipped);
+
+static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
+					 struct cxl_memdev *cxlmd)
+{
+	/* @port->endpoints is indexed by the address of the memdev's device */
+	unsigned long index = (unsigned long)&cxlmd->dev;
+
+	return port ? xa_load(&port->endpoints, index) : NULL;
+}
+
+/**
+ * struct cxl_mbox_cmd - A command to be submitted to hardware.
+ * @opcode: (input) The command set and command submitted to hardware.
+ * @payload_in: (input) Pointer to the input payload.
+ * @payload_out: (output) Pointer to the output payload. Must be allocated by
+ *		 the caller.
+ * @size_in: (input) Number of bytes to load from @payload_in.
+ * @size_out: (input) Max number of bytes loaded into @payload_out.
+ *            (output) Number of bytes generated by the device. For fixed size
+ *            output commands this is always expected to be deterministic. For
+ *            variable sized output commands, it tells the exact number of bytes
+ *            written.
+ * @min_out: (input) internal command output payload size validation
+ * @poll_count: (input) Number of timeouts to attempt.
+ * @poll_interval_ms: (input) Time between mailbox background command polling
+ *                    interval timeouts.
+ * @return_code: (output) Error code returned from hardware.
+ *
+ * This is the primary mechanism used to send commands to the hardware.
+ * All the fields except @payload_* correspond exactly to the fields described in
+ * Command Register section of the CXL 2.0 8.2.8.4.5. @payload_in and
+ * @payload_out are written to, and read from the Command Payload Registers
+ * defined in CXL 2.0 8.2.8.4.8.
+ */
+struct cxl_mbox_cmd {
+	u16 opcode;
+	void *payload_in;
+	void *payload_out;
+	size_t size_in;
+	size_t size_out;
+	size_t min_out;
+	int poll_count;
+	int poll_interval_ms;
+	u16 return_code;
+};
+
+/*
+ * Mailbox return codes, per CXL 3.0 Section 8.2.8.4.5.1: C(name, errno, desc)
+ */
+#define CMD_CMD_RC_TABLE							\
+	C(SUCCESS, 0, NULL),							\
+	C(BACKGROUND, -ENXIO, "background cmd started successfully"),           \
+	C(INPUT, -ENXIO, "cmd input was invalid"),				\
+	C(UNSUPPORTED, -ENXIO, "cmd is not supported"),				\
+	C(INTERNAL, -ENXIO, "internal device error"),				\
+	C(RETRY, -ENXIO, "temporary error, retry once"),			\
+	C(BUSY, -ENXIO, "ongoing background operation"),			\
+	C(MEDIADISABLED, -ENXIO, "media access is disabled"),			\
+	C(FWINPROGRESS, -ENXIO,	"one FW package can be transferred at a time"), \
+	C(FWOOO, -ENXIO, "FW package content was transferred out of order"),    \
+	C(FWAUTH, -ENXIO, "FW package authentication failed"),			\
+	C(FWSLOT, -ENXIO, "FW slot is not supported for requested operation"),  \
+	C(FWROLLBACK, -ENXIO, "rolled back to the previous active FW"),         \
+	C(FWRESET, -ENXIO, "FW failed to activate, needs cold reset"),		\
+	C(HANDLE, -ENXIO, "one or more Event Record Handles were invalid"),     \
+	C(PADDR, -EFAULT, "physical address specified is invalid"),		\
+	C(POISONLMT, -ENXIO, "poison injection limit has been reached"),        \
+	C(MEDIAFAILURE, -ENXIO, "permanent issue with the media"),		\
+	C(ABORT, -ENXIO, "background cmd was aborted by device"),               \
+	C(SECURITY, -ENXIO, "not valid in the current security state"),         \
+	C(PASSPHRASE, -ENXIO, "phrase doesn't match current set passphrase"),   \
+	C(MBUNSUPPORTED, -ENXIO, "unsupported on the mailbox it was issued on"),\
+	C(PAYLOADLEN, -ENXIO, "invalid payload length"),			\
+	C(LOG, -ENXIO, "invalid or unsupported log page"),			\
+	C(INTERRUPTED, -ENXIO, "asynchronous event occurred"),			\
+	C(FEATUREVERSION, -ENXIO, "unsupported feature version"),		\
+	C(FEATURESELVALUE, -ENXIO, "unsupported feature selection value"),	\
+	C(FEATURETRANSFERIP, -ENXIO, "feature transfer in progress"),		\
+	C(FEATURETRANSFEROOO, -ENXIO, "feature transfer out of order"),		\
+	C(RESOURCEEXHAUSTED, -ENXIO, "resources are exhausted"),		\
+	C(EXTLIST, -ENXIO, "invalid Extent List"),				\
+
+#undef C
+#define C(a, b, c) CXL_MBOX_CMD_RC_##a
+enum  { CMD_CMD_RC_TABLE };
+#undef C
+#define C(a, b, c) { b, c }
+struct cxl_mbox_cmd_rc {
+	int err;
+	const char *desc;
+};
+
+static const
+struct cxl_mbox_cmd_rc cxl_mbox_cmd_rctable[] ={ CMD_CMD_RC_TABLE };
+#undef C
+
+static inline const char *cxl_mbox_cmd_rc2str(struct cxl_mbox_cmd *mbox_cmd)
+{
+	return cxl_mbox_cmd_rctable[mbox_cmd->return_code].desc;
+}
+
+static inline int cxl_mbox_cmd_rc2errno(struct cxl_mbox_cmd *mbox_cmd)
+{
+	return cxl_mbox_cmd_rctable[mbox_cmd->return_code].err;
+}
+
+/*
+ * CXL 2.0 - Memory capacity multiplier
+ * See Section 8.2.9.5
+ *
+ * Volatile, Persistent, and Partition capacities are specified to be in
+ * multiples of 256MB - define a multiplier to convert to/from bytes.
+ */
+#define CXL_CAPACITY_MULTIPLIER SZ_256M
+
+/*
+ * Event Interrupt Policy
+ *
+ * CXL rev 3.0 section 8.2.9.2.4; Table 8-52
+ */
+enum cxl_event_int_mode {
+	CXL_INT_NONE		= 0x00,
+	CXL_INT_MSI_MSIX	= 0x01,
+	CXL_INT_FW		= 0x02
+};
+struct cxl_event_interrupt_policy {
+	u8 info_settings;
+	u8 warn_settings;
+	u8 failure_settings;
+	u8 fatal_settings;
+} __packed;
+
+/**
+ * struct cxl_event_state - Event log driver state
+ *
+ * @buf: Buffer to receive event data
+ * @log_lock: Serialize @buf and log use
+ */
+struct cxl_event_state {
+	struct cxl_get_event_payload *buf;
+	struct mutex log_lock;
+};
+
+/* Device enabled poison commands */
+enum poison_cmd_enabled_bits {
+	CXL_POISON_ENABLED_LIST,
+	CXL_POISON_ENABLED_INJECT,
+	CXL_POISON_ENABLED_CLEAR,
+	CXL_POISON_ENABLED_SCAN_CAPS,
+	CXL_POISON_ENABLED_SCAN_MEDIA,
+	CXL_POISON_ENABLED_SCAN_RESULTS,
+	CXL_POISON_ENABLED_MAX
+};
+
+/* Device enabled security commands */
+enum security_cmd_enabled_bits {
+	CXL_SEC_ENABLED_SANITIZE,
+	CXL_SEC_ENABLED_SECURE_ERASE,
+	CXL_SEC_ENABLED_GET_SECURITY_STATE,
+	CXL_SEC_ENABLED_SET_PASSPHRASE,
+	CXL_SEC_ENABLED_DISABLE_PASSPHRASE,
+	CXL_SEC_ENABLED_UNLOCK,
+	CXL_SEC_ENABLED_FREEZE_SECURITY,
+	CXL_SEC_ENABLED_PASSPHRASE_SECURE_ERASE,
+	CXL_SEC_ENABLED_MAX
+};
+
+/**
+ * struct cxl_poison_state - Driver poison state info
+ *
+ * @max_errors: Maximum media error records held in device cache
+ * @enabled_cmds: All poison commands enabled in the CEL
+ * @list_out: The poison list payload returned by device
+ * @lock: Protect reads of the poison list
+ *
+ * Reads of the poison list are synchronized to ensure that a reader
+ * does not get an incomplete list because their request overlapped
+ * (was interrupted or preceded by) another read request of the same
+ * DPA range. CXL Spec 3.0 Section 8.2.9.8.4.1
+ */
+struct cxl_poison_state {
+	u32 max_errors;
+	DECLARE_BITMAP(enabled_cmds, CXL_POISON_ENABLED_MAX);
+	struct cxl_mbox_poison_out *list_out;
+	struct mutex lock;  /* Protect reads of poison list */
+};
+
+/*
+ * Get FW Info
+ * CXL rev 3.0 section 8.2.9.3.1; Table 8-56
+ */
+struct cxl_mbox_get_fw_info {
+	u8 num_slots;
+	u8 slot_info;
+	u8 activation_cap;
+	u8 reserved[13];
+	char slot_1_revision[16];
+	char slot_2_revision[16];
+	char slot_3_revision[16];
+	char slot_4_revision[16];
+} __packed;
+
+#define CXL_FW_INFO_SLOT_INFO_CUR_MASK			GENMASK(2, 0)
+#define CXL_FW_INFO_SLOT_INFO_NEXT_MASK			GENMASK(5, 3)
+#define CXL_FW_INFO_SLOT_INFO_NEXT_SHIFT		3
+#define CXL_FW_INFO_ACTIVATION_CAP_HAS_LIVE_ACTIVATE	BIT(0)
+
+/*
+ * Transfer FW Input Payload
+ * CXL rev 3.0 section 8.2.9.3.2; Table 8-57
+ */
+struct cxl_mbox_transfer_fw {
+	u8 action;
+	u8 slot;
+	u8 reserved[2];
+	__le32 offset;
+	u8 reserved2[0x78];
+	u8 data[];
+} __packed;
+
+#define CXL_FW_TRANSFER_ACTION_FULL	0x0
+#define CXL_FW_TRANSFER_ACTION_INITIATE	0x1
+#define CXL_FW_TRANSFER_ACTION_CONTINUE	0x2
+#define CXL_FW_TRANSFER_ACTION_END	0x3
+#define CXL_FW_TRANSFER_ACTION_ABORT	0x4
+
+/*
+ * CXL rev 3.0 section 8.2.9.3.2 mandates 128-byte alignment for FW packages
+ * and for each part transferred in a Transfer FW command.
+ */
+#define CXL_FW_TRANSFER_ALIGNMENT	128
+
+/*
+ * Activate FW Input Payload
+ * CXL rev 3.0 section 8.2.9.3.3; Table 8-58
+ */
+struct cxl_mbox_activate_fw {
+	u8 action;
+	u8 slot;
+} __packed;
+
+#define CXL_FW_ACTIVATE_ONLINE		0x0
+#define CXL_FW_ACTIVATE_OFFLINE		0x1
+
+/* FW state bits */
+#define CXL_FW_STATE_BITS		32
+#define CXL_FW_CANCEL			0
+
+/**
+ * struct cxl_fw_state - Firmware upload / activation state
+ *
+ * @state: fw_uploader state bitmask
+ * @oneshot: whether the fw upload fits in a single transfer
+ * @num_slots: Number of FW slots available
+ * @cur_slot: Slot number currently active
+ * @next_slot: Slot number for the new firmware
+ */
+struct cxl_fw_state {
+	DECLARE_BITMAP(state, CXL_FW_STATE_BITS);
+	bool oneshot;
+	int num_slots;
+	int cur_slot;
+	int next_slot;
+};
+
+/**
+ * struct cxl_security_state - Device security state
+ *
+ * @state: state of last security operation
+ * @enabled_cmds: All security commands enabled in the CEL
+ * @poll_tmo_secs: polling timeout
+ * @sanitize_active: sanitize completion pending
+ * @poll_dwork: polling work item
+ * @sanitize_node: sanitation sysfs file to notify
+ */
+struct cxl_security_state {
+	unsigned long state;
+	DECLARE_BITMAP(enabled_cmds, CXL_SEC_ENABLED_MAX);
+	int poll_tmo_secs;
+	bool sanitize_active;
+	struct delayed_work poll_dwork;
+	struct kernfs_node *sanitize_node;
+};
+
+/*
+ * enum cxl_devtype - delineate type-2 from a generic type-3 device
+ * @CXL_DEVTYPE_DEVMEM - Vendor specific CXL Type-2 device implementing HDM-D or
+ *			 HDM-DB, no requirement that this device implements a
+ *			 mailbox, or other memory-device-standard manageability
+ *			 flows.
+ * @CXL_DEVTYPE_CLASSMEM - Common class definition of a CXL Type-3 device with
+ *			   HDM-H and class-mandatory memory device registers
+ */
+enum cxl_devtype {
+	CXL_DEVTYPE_DEVMEM,
+	CXL_DEVTYPE_CLASSMEM,
+};
+
+/**
+ * struct cxl_dev_state - The driver device state
+ *
+ * cxl_dev_state represents the CXL driver/device state.  It provides an
+ * interface to mailbox commands as well as some cached data about the device.
+ * Currently only memory devices are represented.
+ *
+ * @dev: The device associated with this CXL state
+ * @cxlmd: The device representing the CXL.mem capabilities of @dev
+ * @regs: Parsed register blocks
+ * @cxl_dvsec: Offset to the PCIe device DVSEC
+ * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH)
+ * @media_ready: Indicate whether the device media is usable
+ * @dpa_res: Overall DPA resource tree for the device
+ * @pmem_res: Active Persistent memory capacity configuration
+ * @ram_res: Active Volatile memory capacity configuration
+ * @component_reg_phys: register base of component registers
+ * @serial: PCIe Device Serial Number
+ * @type: Generic Memory Class device or Vendor Specific Memory device
+ */
+struct cxl_dev_state {
+	struct device *dev;
+	struct cxl_memdev *cxlmd;
+	struct cxl_regs regs;
+	int cxl_dvsec;
+	bool rcd;
+	bool media_ready;
+	struct resource dpa_res;
+	struct resource pmem_res;
+	struct resource ram_res;
+	resource_size_t component_reg_phys;
+	u64 serial;
+	enum cxl_devtype type;
+};
+
+/**
+ * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data
+ *
+ * CXL 8.1.12.1 PCI Header - Class Code Register Memory Device defines
+ * common memory device functionality like the presence of a mailbox and
+ * the functionality related to that like Identify Memory Device and Get
+ * Partition Info
+ * @cxlds: Core driver state common across Type-2 and Type-3 devices
+ * @payload_size: Size of space for payload
+ *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
+ * @lsa_size: Size of Label Storage Area
+ *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
+ * @mbox_mutex: Mutex to synchronize mailbox access.
+ * @firmware_version: Firmware version for the memory device.
+ * @enabled_cmds: Hardware commands found enabled in CEL.
+ * @exclusive_cmds: Commands that are kernel-internal only
+ * @total_bytes: sum of all possible capacities
+ * @volatile_only_bytes: hard volatile capacity
+ * @persistent_only_bytes: hard persistent capacity
+ * @partition_align_bytes: alignment size for partition-able capacity
+ * @active_volatile_bytes: sum of hard + soft volatile
+ * @active_persistent_bytes: sum of hard + soft persistent
+ * @next_volatile_bytes: volatile capacity change pending device reset
+ * @next_persistent_bytes: persistent capacity change pending device reset
+ * @event: event log driver state
+ * @poison: poison driver state info
+ * @security: security driver state info
+ * @fw: firmware upload / activation state
+ * @mbox_send: @dev specific transport for transmitting mailbox commands
+ *
+ * See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for
+ * details on capacity parameters.
+ */
+struct cxl_memdev_state {
+	struct cxl_dev_state cxlds;
+	size_t payload_size;
+	size_t lsa_size;
+	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
+	char firmware_version[0x10];
+	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
+	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
+	u64 total_bytes;
+	u64 volatile_only_bytes;
+	u64 persistent_only_bytes;
+	u64 partition_align_bytes;
+	u64 active_volatile_bytes;
+	u64 active_persistent_bytes;
+	u64 next_volatile_bytes;
+	u64 next_persistent_bytes;
+	struct cxl_event_state event;
+	struct cxl_poison_state poison;
+	struct cxl_security_state security;
+	struct cxl_fw_state fw;
+
+	struct rcuwait mbox_wait;
+	int (*mbox_send)(struct cxl_memdev_state *mds,
+			 struct cxl_mbox_cmd *cmd);
+};
+
+static inline struct cxl_memdev_state *
+to_cxl_memdev_state(struct cxl_dev_state *cxlds)
+{
+	/* Only CXL_DEVTYPE_CLASSMEM state is treated as part of a memdev state */
+	return cxlds->type == CXL_DEVTYPE_CLASSMEM ?
+		container_of(cxlds, struct cxl_memdev_state, cxlds) : NULL;
+}
+
+enum cxl_opcode {
+	CXL_MBOX_OP_INVALID		= 0x0000,
+	CXL_MBOX_OP_RAW			= CXL_MBOX_OP_INVALID,
+	CXL_MBOX_OP_GET_EVENT_RECORD	= 0x0100,
+	CXL_MBOX_OP_CLEAR_EVENT_RECORD	= 0x0101,
+	CXL_MBOX_OP_GET_EVT_INT_POLICY	= 0x0102,
+	CXL_MBOX_OP_SET_EVT_INT_POLICY	= 0x0103,
+	CXL_MBOX_OP_GET_FW_INFO		= 0x0200,
+	CXL_MBOX_OP_TRANSFER_FW		= 0x0201,
+	CXL_MBOX_OP_ACTIVATE_FW		= 0x0202,
+	CXL_MBOX_OP_SET_TIMESTAMP	= 0x0301,
+	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
+	CXL_MBOX_OP_GET_LOG		= 0x0401,
+	CXL_MBOX_OP_IDENTIFY		= 0x4000,
+	CXL_MBOX_OP_GET_PARTITION_INFO	= 0x4100,
+	CXL_MBOX_OP_SET_PARTITION_INFO	= 0x4101,
+	CXL_MBOX_OP_GET_LSA		= 0x4102,
+	CXL_MBOX_OP_SET_LSA		= 0x4103,
+	CXL_MBOX_OP_GET_HEALTH_INFO	= 0x4200,
+	CXL_MBOX_OP_GET_ALERT_CONFIG	= 0x4201,
+	CXL_MBOX_OP_SET_ALERT_CONFIG	= 0x4202,
+	CXL_MBOX_OP_GET_SHUTDOWN_STATE	= 0x4203,
+	CXL_MBOX_OP_SET_SHUTDOWN_STATE	= 0x4204,
+	CXL_MBOX_OP_GET_POISON		= 0x4300,
+	CXL_MBOX_OP_INJECT_POISON	= 0x4301,
+	CXL_MBOX_OP_CLEAR_POISON	= 0x4302,
+	CXL_MBOX_OP_GET_SCAN_MEDIA_CAPS	= 0x4303,
+	CXL_MBOX_OP_SCAN_MEDIA		= 0x4304,
+	CXL_MBOX_OP_GET_SCAN_MEDIA	= 0x4305,
+	CXL_MBOX_OP_SANITIZE		= 0x4400,
+	CXL_MBOX_OP_SECURE_ERASE	= 0x4401,
+	CXL_MBOX_OP_GET_SECURITY_STATE	= 0x4500,
+	CXL_MBOX_OP_SET_PASSPHRASE	= 0x4501,
+	CXL_MBOX_OP_DISABLE_PASSPHRASE	= 0x4502,
+	CXL_MBOX_OP_UNLOCK		= 0x4503,
+	CXL_MBOX_OP_FREEZE_SECURITY	= 0x4504,
+	CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE	= 0x4505,
+	CXL_MBOX_OP_MAX			= 0x10000
+};
+
+#define DEFINE_CXL_CEL_UUID                                                    \
+	UUID_INIT(0xda9c0b5, 0xbf41, 0x4b78, 0x8f, 0x79, 0x96, 0xb1, 0x62,     \
+		  0x3b, 0x3f, 0x17)
+
+#define DEFINE_CXL_VENDOR_DEBUG_UUID                                           \
+	UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19,     \
+		  0x40, 0x3d, 0x86)
+
+struct cxl_mbox_get_supported_logs {
+	__le16 entries;
+	u8 rsvd[6];
+	struct cxl_gsl_entry {
+		uuid_t uuid;
+		__le32 size;
+	} __packed entry[];
+}  __packed;
+
+struct cxl_cel_entry {
+	__le16 opcode;
+	__le16 effect;
+} __packed;
+
+struct cxl_mbox_get_log {
+	uuid_t uuid;
+	__le32 offset;
+	__le32 length;
+} __packed;
+
+/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
+struct cxl_mbox_identify {
+	char fw_revision[0x10];
+	__le64 total_capacity;
+	__le64 volatile_capacity;
+	__le64 persistent_capacity;
+	__le64 partition_align;
+	__le16 info_event_log_size;
+	__le16 warning_event_log_size;
+	__le16 failure_event_log_size;
+	__le16 fatal_event_log_size;
+	__le32 lsa_size;
+	u8 poison_list_max_mer[3];
+	__le16 inject_poison_limit;
+	u8 poison_caps;
+	u8 qos_telemetry_caps;
+} __packed;
+
+/*
+ * Common Event Record Format
+ * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
+ */
+struct cxl_event_record_hdr {
+	uuid_t id;
+	u8 length;
+	u8 flags[3];
+	__le16 handle;
+	__le16 related_handle;
+	__le64 timestamp;
+	u8 maint_op_class;
+	u8 reserved[15];
+} __packed;
+
+#define CXL_EVENT_RECORD_DATA_LENGTH 0x50
+struct cxl_event_record_raw {
+	struct cxl_event_record_hdr hdr;
+	u8 data[CXL_EVENT_RECORD_DATA_LENGTH];
+} __packed;
+
+/*
+ * Get Event Records output payload
+ * CXL rev 3.0 section 8.2.9.2.2; Table 8-50
+ */
+#define CXL_GET_EVENT_FLAG_OVERFLOW		BIT(0)
+#define CXL_GET_EVENT_FLAG_MORE_RECORDS		BIT(1)
+struct cxl_get_event_payload {
+	u8 flags;
+	u8 reserved1;
+	__le16 overflow_err_count;
+	__le64 first_overflow_timestamp;
+	__le64 last_overflow_timestamp;
+	__le16 record_count;
+	u8 reserved2[10];
+	struct cxl_event_record_raw records[];
+} __packed;
+
+/*
+ * CXL rev 3.0 section 8.2.9.2.2; Table 8-49
+ */
+enum cxl_event_log_type {
+	CXL_EVENT_TYPE_INFO = 0x00,
+	CXL_EVENT_TYPE_WARN,
+	CXL_EVENT_TYPE_FAIL,
+	CXL_EVENT_TYPE_FATAL,
+	CXL_EVENT_TYPE_MAX
+};
+
+/*
+ * Clear Event Records input payload
+ * CXL rev 3.0 section 8.2.9.2.3; Table 8-51
+ */
+struct cxl_mbox_clear_event_payload {
+	u8 event_log;		/* enum cxl_event_log_type */
+	u8 clear_flags;
+	u8 nr_recs;
+	u8 reserved[3];
+	__le16 handles[];
+} __packed;
+#define CXL_CLEAR_EVENT_MAX_HANDLES U8_MAX
+
+/*
+ * General Media Event Record
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+#define CXL_EVENT_GEN_MED_COMP_ID_SIZE	0x10
+struct cxl_event_gen_media {
+	struct cxl_event_record_hdr hdr;
+	__le64 phys_addr;
+	u8 descriptor;
+	u8 type;
+	u8 transaction_type;
+	u8 validity_flags[2];
+	u8 channel;
+	u8 rank;
+	u8 device[3];
+	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
+	u8 reserved[46];
+} __packed;
+
+/*
+ * DRAM Event Record - DER
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ */
+#define CXL_EVENT_DER_CORRECTION_MASK_SIZE	0x20
+struct cxl_event_dram {
+	struct cxl_event_record_hdr hdr;
+	__le64 phys_addr;
+	u8 descriptor;
+	u8 type;
+	u8 transaction_type;
+	u8 validity_flags[2];
+	u8 channel;
+	u8 rank;
+	u8 nibble_mask[3];
+	u8 bank_group;
+	u8 bank;
+	u8 row[3];
+	u8 column[2];
+	u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE];
+	u8 reserved[0x17];
+} __packed;
+
+/*
+ * Get Health Info Record
+ * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100
+ */
+struct cxl_get_health_info {
+	u8 health_status;
+	u8 media_status;
+	u8 add_status;
+	u8 life_used;
+	u8 device_temp[2];
+	u8 dirty_shutdown_cnt[4];
+	u8 cor_vol_err_cnt[4];
+	u8 cor_per_err_cnt[4];
+} __packed;
+
+/*
+ * Memory Module Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
+ */
+struct cxl_event_mem_module {
+	struct cxl_event_record_hdr hdr;
+	u8 event_type;
+	struct cxl_get_health_info info;
+	u8 reserved[0x3d];
+} __packed;
+
+struct cxl_mbox_get_partition_info {
+	__le64 active_volatile_cap;
+	__le64 active_persistent_cap;
+	__le64 next_volatile_cap;
+	__le64 next_persistent_cap;
+} __packed;
+
+struct cxl_mbox_get_lsa {
+	__le32 offset;
+	__le32 length;
+} __packed;
+
+struct cxl_mbox_set_lsa {
+	__le32 offset;
+	__le32 reserved;
+	u8 data[];
+} __packed;
+
+struct cxl_mbox_set_partition_info {
+	__le64 volatile_capacity;
+	u8 flags;
+} __packed;
+
+#define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
+
+/* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
+struct cxl_mbox_set_timestamp_in {
+	__le64 timestamp;
+
+} __packed;
+
+/* Get Poison List  CXL 3.0 Spec 8.2.9.8.4.1 */
+struct cxl_mbox_poison_in {
+	__le64 offset;
+	__le64 length;
+} __packed;
+
+struct cxl_mbox_poison_out {
+	u8 flags;
+	u8 rsvd1;
+	__le64 overflow_ts;
+	__le16 count;
+	u8 rsvd2[20];
+	struct cxl_poison_record {
+		__le64 address;
+		__le32 length;
+		__le32 rsvd;
+	} __packed record[];
+} __packed;
+
+/*
+ * Get Poison List address field encodes the starting
+ * address of poison, and the source of the poison.
+ */
+#define CXL_POISON_START_MASK		GENMASK_ULL(63, 6)
+#define CXL_POISON_SOURCE_MASK		GENMASK(2, 0)
+
+/* Get Poison List record length is in units of 64 bytes */
+#define CXL_POISON_LEN_MULT	64
+
+/* Kernel defined maximum for a list of poison errors */
+#define CXL_POISON_LIST_MAX	1024
+
+/* Get Poison List: Payload out flags */
+#define CXL_POISON_FLAG_MORE            BIT(0)
+#define CXL_POISON_FLAG_OVERFLOW        BIT(1)
+#define CXL_POISON_FLAG_SCANNING        BIT(2)
+
+/* Get Poison List: Poison Source */
+#define CXL_POISON_SOURCE_UNKNOWN	0
+#define CXL_POISON_SOURCE_EXTERNAL	1
+#define CXL_POISON_SOURCE_INTERNAL	2
+#define CXL_POISON_SOURCE_INJECTED	3
+#define CXL_POISON_SOURCE_VENDOR	7
+
+/* Inject & Clear Poison  CXL 3.0 Spec 8.2.9.8.4.2/3 */
+struct cxl_mbox_inject_poison {
+	__le64 address;
+};
+
+/* Clear Poison  CXL 3.0 Spec 8.2.9.8.4.3 */
+struct cxl_mbox_clear_poison {
+	__le64 address;
+	u8 write_data[CXL_POISON_LEN_MULT];
+} __packed;
+
+/**
+ * struct cxl_mem_command - Driver representation of a memory device command
+ * @info: Command information as it exists for the UAPI
+ * @opcode: The actual bits used for the mailbox protocol
+ * @flags: Set of flags affecting driver behavior.
+ *
+ *  * %CXL_CMD_FLAG_FORCE_ENABLE: In cases of error, commands with this flag
+ *    will be enabled by the driver regardless of what hardware may have
+ *    advertised.
+ *
+ * The cxl_mem_command is the driver's internal representation of commands that
+ * are supported by the driver. Some of these commands may not be supported by
+ * the hardware. The driver will use @info to validate the fields passed in by
+ * the user then submit the @opcode to the hardware.
+ *
+ * See struct cxl_command_info.
+ */
+struct cxl_mem_command {
+	struct cxl_command_info info;
+	enum cxl_opcode opcode;
+	u32 flags;
+#define CXL_CMD_FLAG_FORCE_ENABLE BIT(0)
+};
+
+#define CXL_PMEM_SEC_STATE_USER_PASS_SET	0x01
+#define CXL_PMEM_SEC_STATE_MASTER_PASS_SET	0x02
+#define CXL_PMEM_SEC_STATE_LOCKED		0x04
+#define CXL_PMEM_SEC_STATE_FROZEN		0x08
+#define CXL_PMEM_SEC_STATE_USER_PLIMIT		0x10
+#define CXL_PMEM_SEC_STATE_MASTER_PLIMIT	0x20
+
+/* set passphrase input payload */
+struct cxl_set_pass {
+	u8 type;
+	u8 reserved[31];
+	/* CXL field using NVDIMM define, same length */
+	u8 old_pass[NVDIMM_PASSPHRASE_LEN];
+	u8 new_pass[NVDIMM_PASSPHRASE_LEN];
+} __packed;
+
+/* disable passphrase input payload */
+struct cxl_disable_pass {
+	u8 type;
+	u8 reserved[31];
+	u8 pass[NVDIMM_PASSPHRASE_LEN];
+} __packed;
+
+/* passphrase secure erase payload */
+struct cxl_pass_erase {
+	u8 type;
+	u8 reserved[31];
+	u8 pass[NVDIMM_PASSPHRASE_LEN];
+} __packed;
+
+enum {
+	CXL_PMEM_SEC_PASS_MASTER = 0,
+	CXL_PMEM_SEC_PASS_USER,
+};
+
+int cxl_internal_send_cmd(struct cxl_memdev_state *mds,
+			  struct cxl_mbox_cmd *cmd);
+int cxl_dev_state_identify(struct cxl_memdev_state *mds);
+int cxl_await_media_ready(struct cxl_dev_state *cxlds);
+int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
+int cxl_mem_create_range_info(struct cxl_memdev_state *mds);
+struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev);
+void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
+				unsigned long *cmds);
+void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
+				  unsigned long *cmds);
+void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status);
+int cxl_set_timestamp(struct cxl_memdev_state *mds);
+int cxl_poison_state_init(struct cxl_memdev_state *mds);
+int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
+		       struct cxl_region *cxlr);
+int cxl_trigger_poison_list(struct cxl_memdev *cxlmd);
+int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa);
+int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa);
+
+#ifdef CONFIG_CXL_SUSPEND
+void cxl_mem_active_inc(void);
+void cxl_mem_active_dec(void);
+#else
+static inline void cxl_mem_active_inc(void)
+{
+}
+static inline void cxl_mem_active_dec(void)
+{
+}
+#endif
+
+int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd);
+
+struct cxl_hdm {
+	struct cxl_component_regs regs;
+	unsigned int decoder_count;
+	unsigned int target_count;
+	unsigned int interleave_mask;
+	struct cxl_port *port;
+};
+
+struct seq_file;
+struct dentry *cxl_debugfs_create_dir(const char *dir);
+void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds);
+#endif /* __CXL_MEM_H__ */
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
new file mode 100644
index 0000000000..0fa4799ea3
--- /dev/null
+++ b/drivers/cxl/cxlpci.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#ifndef __CXL_PCI_H__
+#define __CXL_PCI_H__
+#include <linux/pci.h>
+#include "cxl.h"
+
+#define CXL_MEMORY_PROGIF	0x10
+
+/*
+ * See section 8.1 Configuration Space Registers in the CXL 2.0
+ * Specification. Names are taken straight from the specification with "CXL" and
+ * "DVSEC" redundancies removed. When obvious, abbreviations may be used.
+ */
+#define PCI_DVSEC_HEADER1_LENGTH_MASK	GENMASK(31, 20)
+#define PCI_DVSEC_VENDOR_ID_CXL		0x1E98
+
+/* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device; (i) is the range index */
+#define CXL_DVSEC_PCIE_DEVICE					0
+#define   CXL_DVSEC_CAP_OFFSET		0xA
+#define     CXL_DVSEC_MEM_CAPABLE	BIT(2)
+#define     CXL_DVSEC_HDM_COUNT_MASK	GENMASK(5, 4)
+#define   CXL_DVSEC_CTRL_OFFSET		0xC
+#define     CXL_DVSEC_MEM_ENABLE	BIT(2)
+#define   CXL_DVSEC_RANGE_SIZE_HIGH(i)	(0x18 + ((i) * 0x10))
+#define   CXL_DVSEC_RANGE_SIZE_LOW(i)	(0x1C + ((i) * 0x10))
+#define     CXL_DVSEC_MEM_INFO_VALID	BIT(0)
+#define     CXL_DVSEC_MEM_ACTIVE	BIT(1)
+#define     CXL_DVSEC_MEM_SIZE_LOW_MASK	GENMASK(31, 28)
+#define   CXL_DVSEC_RANGE_BASE_HIGH(i)	(0x20 + ((i) * 0x10))
+#define   CXL_DVSEC_RANGE_BASE_LOW(i)	(0x24 + ((i) * 0x10))
+#define     CXL_DVSEC_MEM_BASE_LOW_MASK	GENMASK(31, 28)
+
+#define CXL_DVSEC_RANGE_MAX		2
+
+/* CXL 2.0 8.1.4: Non-CXL Function Map DVSEC */
+#define CXL_DVSEC_FUNCTION_MAP					2
+
+/* CXL 2.0 8.1.5: CXL 2.0 Extensions DVSEC for Ports */
+#define CXL_DVSEC_PORT_EXTENSIONS				3
+
+/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */
+#define CXL_DVSEC_PORT_GPF					4
+
+/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */
+#define CXL_DVSEC_DEVICE_GPF					5
+
+/* CXL 2.0 8.1.8: PCIe DVSEC for Flex Bus Port */
+#define CXL_DVSEC_PCIE_FLEXBUS_PORT				7
+
+/* CXL 2.0 8.1.9: Register Locator DVSEC */
+#define CXL_DVSEC_REG_LOCATOR					8
+#define   CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET			0xC
+#define     CXL_DVSEC_REG_LOCATOR_BIR_MASK			GENMASK(2, 0)
+#define	    CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK			GENMASK(15, 8)
+#define     CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK		GENMASK(31, 16)
+
+/*
+ * NOTE: Currently all the functions which are enabled for CXL require their
+ * vectors to be in the first 16.  Use this as the default max.
+ */
+#define CXL_PCI_DEFAULT_MAX_VECTORS 16
+
+/* Register Block Identifier (RBI) */
+enum cxl_regloc_type {
+	CXL_REGLOC_RBI_EMPTY = 0,
+	CXL_REGLOC_RBI_COMPONENT,
+	CXL_REGLOC_RBI_VIRT,
+	CXL_REGLOC_RBI_MEMDEV,
+	CXL_REGLOC_RBI_PMU,
+	CXL_REGLOC_RBI_TYPES
+};
+
+struct cdat_header {
+	__le32 length;
+	u8 revision;
+	u8 checksum;
+	u8 reserved[6];
+	__le32 sequence;
+} __packed;
+
+struct cdat_entry_header {
+	u8 type;
+	u8 reserved;
+	__le16 length;
+} __packed;
+
+int devm_cxl_port_enumerate_dports(struct cxl_port *port);
+struct cxl_dev_state;
+int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
+			struct cxl_endpoint_dvsec_info *info);
+void read_cdat_data(struct cxl_port *port);
+void cxl_cor_error_detected(struct pci_dev *pdev);
+pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
+				    pci_channel_state_t state);
+#endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
new file mode 100644
index 0000000000..317c7548e4
--- /dev/null
+++ b/drivers/cxl/mem.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include "cxlmem.h"
+#include "cxlpci.h"
+
+/**
+ * DOC: cxl mem
+ *
+ * CXL memory endpoint devices and switches are CXL capable devices that are
+ * participating in CXL.mem protocol. Their functionality builds on top of the
+ * CXL.io protocol that allows enumerating and configuring components via
+ * standard PCI mechanisms.
+ *
+ * The cxl_mem driver owns kicking off the enumeration of this CXL.mem
+ * capability. With the detection of a CXL capable endpoint, the driver will
+ * walk up to find the platform specific port it is connected to, and determine
+ * if there are intervening switches in the path. If there are switches, a
+ * secondary action is to enumerate those (implemented in cxl_core). Finally the
+ * cxl_mem driver adds the device it is bound to as a CXL endpoint-port for use
+ * in higher level operations.
+ */
+
+static void enable_suspend(void *data)
+{
+	cxl_mem_active_dec();	/* undoes cxl_mem_active_inc() from cxl_mem_probe() */
+}
+
+static void remove_debugfs(void *dentry)
+{
+	debugfs_remove_recursive(dentry);	/* devm action from cxl_mem_probe() */
+}
+
+static int cxl_mem_dpa_show(struct seq_file *file, void *data)
+{
+	/* ->private is the struct device this seqfile was registered for */
+	struct device *memdev_dev = file->private;
+	struct cxl_memdev *memdev = to_cxl_memdev(memdev_dev);
+
+	cxl_dpa_debug(file, memdev->cxlds);
+	return 0;
+}
+
+static int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd,
+				 struct cxl_dport *parent_dport)
+{
+	struct cxl_port *port = parent_dport->port;
+	struct cxl_port *below = NULL;
+	struct cxl_port *endpoint;
+	int rc;
+
+	/*
+	 * Walk from the parent port towards the root and, at every port on
+	 * the way, point this memdev's cxl_ep at the port one level below.
+	 */
+	while (!is_cxl_root(port)) {
+		struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
+
+		ep->next = below;
+		below = port;
+		port = to_cxl_port(port->dev.parent);
+	}
+
+	/* Register the endpoint port beneath @parent_dport */
+	endpoint = devm_cxl_add_port(host, &cxlmd->dev,
+				     cxlmd->cxlds->component_reg_phys,
+				     parent_dport);
+	if (IS_ERR(endpoint))
+		return PTR_ERR(endpoint);
+
+	rc = cxl_endpoint_autoremove(cxlmd, endpoint);
+	if (rc)
+		return rc;
+
+	/* No driver bound to the new port is reported as a failed probe */
+	if (endpoint->dev.driver)
+		return 0;
+
+	dev_err(&cxlmd->dev, "%s failed probe\n", dev_name(&endpoint->dev));
+	return -ENXIO;
+}
+
+static int cxl_debugfs_poison_inject(void *data, u64 dpa)
+{
+	struct cxl_memdev *memdev = data;	/* cookie given to debugfs_create_file() */
+
+	return cxl_inject_poison(memdev, dpa);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_inject_fops, NULL,
+			 cxl_debugfs_poison_inject, "%llx\n");
+
+static int cxl_debugfs_poison_clear(void *data, u64 dpa)
+{
+	struct cxl_memdev *memdev = data;	/* cookie given to debugfs_create_file() */
+
+	return cxl_clear_poison(memdev, dpa);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
+			 cxl_debugfs_poison_clear, "%llx\n");
+
+/* Probe a memdev: set up debugfs, enumerate ports and add its endpoint port */
+static int cxl_mem_probe(struct device *dev)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct device *endpoint_parent;
+	struct cxl_port *parent_port;
+	struct cxl_dport *dport;
+	struct dentry *dentry;
+	int rc;
+
+	if (!cxlds->media_ready)
+		return -EBUSY;
+
+	/*
+	 * Someone is trying to reattach this device after it lost its port
+	 * connection (an endpoint port previously registered by this memdev was
+	 * disabled). This racy check is ok because if the port is still gone,
+	 * no harm done, and if the port hierarchy comes back it will re-trigger
+	 * this probe. Port rescan and memdev detach work share the same
+	 * single-threaded workqueue.
+	 */
+	if (work_pending(&cxlmd->detach_work))
+		return -EBUSY;
+
+	dentry = cxl_debugfs_create_dir(dev_name(dev));
+	debugfs_create_devm_seqfile(dev, "dpamem", dentry, cxl_mem_dpa_show);
+
+	if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds))
+		debugfs_create_file("inject_poison", 0200, dentry, cxlmd,
+				    &cxl_poison_inject_fops);
+	if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds))
+		debugfs_create_file("clear_poison", 0200, dentry, cxlmd,
+				    &cxl_poison_clear_fops);
+
+	rc = devm_add_action_or_reset(dev, remove_debugfs, dentry);
+	if (rc)
+		return rc;
+
+	rc = devm_cxl_enumerate_ports(cxlmd);
+	if (rc)
+		return rc;
+
+	parent_port = cxl_mem_find_port(cxlmd, &dport);
+	if (!parent_port) {
+		dev_err(dev, "CXL port topology not found\n");
+		return -ENXIO;
+	}
+
+	endpoint_parent = dport->rch ? parent_port->uport_dev :
+				       &parent_port->dev;
+
+	device_lock(endpoint_parent);
+	if (!endpoint_parent->driver) {
+		dev_err(dev, "CXL port topology %s not enabled\n",
+			dev_name(endpoint_parent));
+		rc = -ENXIO;
+		goto unlock;
+	}
+
+	rc = devm_cxl_add_endpoint(endpoint_parent, cxlmd, dport);
+unlock:
+	device_unlock(endpoint_parent);
+	put_device(&parent_port->dev);
+	if (rc)
+		return rc;
+
+	/* -ENODEV is tolerated; success must fall through to the suspend hook */
+	if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) {
+		rc = devm_cxl_add_nvdimm(cxlmd);
+		if (rc == -ENODEV)
+			dev_info(dev, "PMEM disabled by platform\n");
+		else if (rc)
+			return rc;
+	}
+
+	/*
+	 * The kernel may be operating out of CXL memory on this device,
+	 * there is no spec defined way to determine whether this device
+	 * preserves contents over suspend, and there is no simple way
+	 * to arrange for the suspend image to avoid CXL memory which
+	 * would setup a circular dependency between PCI resume and save
+	 * state restoration.
+	 *
+	 * TODO: support suspend when all the regions this device is
+	 * hosting are locked and covered by the system address map,
+	 * i.e. platform firmware owns restoring the HDM configuration
+	 * that it locked.
+	 */
+	cxl_mem_active_inc();
+	return devm_add_action_or_reset(dev, enable_suspend, NULL);
+}
+
+static ssize_t trigger_poison_list_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t len)
+{
+	bool requested;
+	int err;
+
+	/* Only input that parses as boolean true triggers the poison read */
+	if (kstrtobool(buf, &requested) || !requested)
+		return -EINVAL;
+
+	err = cxl_trigger_poison_list(to_cxl_memdev(dev));
+	return err ? err : len;
+}
+static DEVICE_ATTR_WO(trigger_poison_list);
+
+static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct cxl_memdev_state *mds;
+
+	/* Only trigger_poison_list depends on an enabled device command; */
+	/* every other attribute keeps its declared mode. */
+	if (a != &dev_attr_trigger_poison_list.attr)
+		return a->mode;
+
+	mds = to_cxl_memdev_state(to_cxl_memdev(kobj_to_dev(kobj))->cxlds);
+	if (!test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds))
+		return 0;
+	return a->mode;
+}
+
+static struct attribute *cxl_mem_attrs[] = {
+	&dev_attr_trigger_poison_list.attr,
+	NULL	/* sentinel */
+};
+
+static struct attribute_group cxl_mem_group = {
+	.attrs = cxl_mem_attrs,
+	.is_visible = cxl_mem_visible,	/* hides trigger_poison_list if unsupported */
+};
+
+__ATTRIBUTE_GROUPS(cxl_mem);	/* provides cxl_mem_groups used by the driver */
+
+static struct cxl_driver cxl_mem_driver = {
+	.name = "cxl_mem",
+	.probe = cxl_mem_probe,
+	.id = CXL_DEVICE_MEMORY_EXPANDER,
+	.drv = {
+		.dev_groups = cxl_mem_groups,
+	},
+};
+
+module_cxl_driver(cxl_mem_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS(CXL);
+MODULE_ALIAS_CXL(CXL_DEVICE_MEMORY_EXPANDER);
+/*
+ * devm_cxl_add_endpoint() wants to validate port driver attach immediately
+ * after endpoint registration.
+ */
+MODULE_SOFTDEP("pre: cxl_port");
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
new file mode 100644
index 0000000000..8bece1e2e2
--- /dev/null
+++ b/drivers/cxl/pci.c
@@ -0,0 +1,973 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/moduleparam.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/sizes.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include <linux/io.h>
+#include "cxlmem.h"
+#include "cxlpci.h"
+#include "cxl.h"
+#include "pmu.h"
+
+/**
+ * DOC: cxl pci
+ *
+ * This implements the PCI exclusive functionality for a CXL device as it is
+ * defined by the Compute Express Link specification. CXL devices may surface
+ * certain functionality even if it isn't CXL enabled. While this driver is
+ * focused around the PCI specific aspects of a CXL device, it binds to the
+ * specific CXL memory device class code, and therefore the implementation of
+ * cxl_pci is focused around CXL memory devices.
+ *
+ * The driver has several responsibilities, mainly:
+ *  - Create the memX device and register on the CXL bus.
+ *  - Enumerate device's register interface and map them.
+ *  - Registers nvdimm bridge device with cxl_core.
+ *  - Registers a CXL mailbox with cxl_core.
+ */
+
+#define cxl_doorbell_busy(cxlds)                                                \
+	(readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) &                  \
+	 CXLDEV_MBOX_CTRL_DOORBELL)
+
+/* CXL 2.0 - 8.2.8.4: doorbell timeout; in jiffies (2s) despite the _MS name */
+#define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)
+
+/*
+ * CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to
+ * dictate how long to wait for the mailbox to become ready. The new
+ * field allows the device to tell software the amount of time to wait
+ * before mailbox ready. This field per the spec theoretically allows
+ * for up to 255 seconds. 255 seconds is unreasonably long, it's longer
+ * than the maximum SATA port link recovery wait. Default to 60 seconds
+ * until someone builds a CXL device that needs more time in practice.
+ */
+static unsigned short mbox_ready_timeout = 60;
+module_param(mbox_ready_timeout, ushort, 0644);
+MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");
+
+static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
+{
+	const unsigned long began = jiffies;
+	unsigned long now = began;
+
+	while (cxl_doorbell_busy(cxlds)) {
+		now = jiffies;
+		if (!time_after(now, began + CXL_MAILBOX_TIMEOUT_MS)) {
+			cpu_relax();
+			continue;
+		}
+		/* Re-read once: we may have been preempted before the test */
+		if (cxl_doorbell_busy(cxlds))
+			return -ETIMEDOUT;
+		break;
+	}
+
+	dev_dbg(cxlds->dev, "Doorbell wait took %dms",
+		jiffies_to_msecs(now) - jiffies_to_msecs(began));
+	return 0;
+}
+
+#define cxl_err(dev, status, msg)                                        \
+	dev_err_ratelimited(dev, msg ", device state %s%s\n",                  \
+			    (status) & CXLMDEV_DEV_FATAL ? " fatal" : "",      \
+			    (status) & CXLMDEV_FW_HALT ? " firmware-halt" : "")
+/* cxl_cmd_err(): same report as cxl_err() plus the opcode of mailbox @cmd */
+#define cxl_cmd_err(dev, cmd, status, msg)                               \
+	dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n",    \
+			    (cmd)->opcode,                                     \
+			    (status) & CXLMDEV_DEV_FATAL ? " fatal" : "",      \
+			    (status) & CXLMDEV_FW_HALT ? " firmware-halt" : "")
+
+struct cxl_dev_id {
+	struct cxl_dev_state *cxlds;	/* filled in by cxl_request_irq() */
+};
+
+static int cxl_request_irq(struct cxl_dev_state *cxlds, int irq,
+			   irq_handler_t handler, irq_handler_t thread_fn)
+{
+	struct device *host = cxlds->dev;
+	struct cxl_dev_id *cookie;
+
+	/* The IRQ cookie must be globally unique and carry @cxlds */
+	cookie = devm_kzalloc(host, sizeof(*cookie), GFP_KERNEL);
+	if (!cookie)
+		return -ENOMEM;
+	cookie->cxlds = cxlds;
+
+	return devm_request_threaded_irq(host, irq, handler, thread_fn,
+					 IRQF_SHARED | IRQF_ONESHOT,
+					 NULL, cookie);
+}
+
+static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
+{
+	u64 st = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
+
+	/* Complete once the PCT field of the background status reads 100 */
+	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, st) == 100;
+}
+
+static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
+{
+	struct cxl_dev_id *cookie = id;
+	struct cxl_dev_state *cxlds = cookie->cxlds;
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+	u16 opcode;
+	u64 bg;
+
+	if (!cxl_mbox_background_complete(cxlds))
+		return IRQ_NONE;
+
+	bg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
+	opcode = FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK, bg);
+	if (opcode != CXL_MBOX_OP_SANITIZE) {
+		/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
+		rcuwait_wake_up(&mds->mbox_wait);
+		return IRQ_HANDLED;
+	}
+	/* Sanitize finished: run the poll worker now rather than on its timer */
+	mutex_lock(&mds->mbox_mutex);
+	if (mds->security.sanitize_node)
+		mod_delayed_work(system_wq, &mds->security.poll_dwork, 0);
+	mutex_unlock(&mds->mbox_mutex);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Sanitization operation polling mode.
+ */
+static void cxl_mbox_sanitize_work(struct work_struct *work)
+{
+	struct cxl_memdev_state *mds =
+		container_of(work, typeof(*mds), security.poll_dwork.work);
+	int next_secs;
+
+	mutex_lock(&mds->mbox_mutex);
+	if (!cxl_mbox_background_complete(&mds->cxlds)) {
+		/* Still running: wait 10s longer; stored value caps at 15min */
+		next_secs = mds->security.poll_tmo_secs + 10;
+		mds->security.poll_tmo_secs = min(15 * 60, next_secs);
+		schedule_delayed_work(&mds->security.poll_dwork, next_secs * HZ);
+		goto unlock;
+	}
+	mds->security.poll_tmo_secs = 0;
+	if (mds->security.sanitize_node)
+		sysfs_notify_dirent(mds->security.sanitize_node);
+	mds->security.sanitize_active = false;
+	dev_dbg(mds->cxlds.dev, "Sanitization operation ended\n");
+unlock:
+	mutex_unlock(&mds->mbox_mutex);
+}
+
+/**
+ * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
+ * @mds: The memory device driver data
+ * @mbox_cmd: Command to send to the memory device.
+ *
+ * Context: Process context, may sleep. Expects mbox_mutex to be held.
+ * Return: 0 once the command completed; the caller must still check the
+ *         return code in @mbox_cmd. -EBUSY if the mailbox or a sanitize is
+ *         busy, -EINVAL on a NULL input payload, -ETIMEDOUT on timeout.
+ *
+ * This is a generic form of the CXL mailbox send command thus only using the
+ * registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory
+ * devices, and perhaps other types of CXL devices may have further information
+ * available upon error conditions. Driver facilities wishing to send mailbox
+ * commands should use the wrapper command.
+ *
+ * The CXL spec allows for up to two mailboxes. The intention is for the primary
+ * mailbox to be OS controlled and the secondary mailbox to be used by system
+ * firmware. This allows the OS and firmware to communicate with the device and
+ * not need to coordinate with each other. The driver only uses the primary
+ * mailbox.
+ */
+static int __cxl_pci_mbox_send_cmd(struct cxl_memdev_state *mds,
+				   struct cxl_mbox_cmd *mbox_cmd)
+{
+	struct cxl_dev_state *cxlds = &mds->cxlds;
+	void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
+	struct device *dev = cxlds->dev;
+	u64 cmd_reg, status_reg;
+	size_t out_len;
+	int rc;
+
+	lockdep_assert_held(&mds->mbox_mutex);
+
+	/*
+	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
+	 *   1. Caller reads MB Control Register to verify doorbell is clear
+	 *   2. Caller writes Command Register
+	 *   3. Caller writes Command Payload Registers if input payload is non-empty
+	 *   4. Caller writes MB Control Register to set doorbell
+	 *   5. Caller either polls for doorbell to be clear or waits for interrupt if configured
+	 *   6. Caller reads MB Status Register to fetch Return code
+	 *   7. If command successful, Caller reads Command Register to get Payload Length
+	 *   8. If output payload is non-empty, host reads Command Payload Registers
+	 *
+	 * Hardware is free to do whatever it wants before the doorbell is rung,
+	 * and isn't allowed to change anything after it clears the doorbell. As
+	 * such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can
+	 * also happen in any order (though some orders might not make sense).
+	 */
+
+	/* #1 */
+	if (cxl_doorbell_busy(cxlds)) {
+		u64 md_status =
+			readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+
+		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
+			    "mailbox queue busy");
+		return -EBUSY;
+	}
+
+	/*
+	 * With sanitize polling, hardware might be done while the poller has
+	 * not caught up yet. Reject new commands until it has. Keep the
+	 * hardware semantics and only allow device health status.
+	 */
+	if (mds->security.poll_tmo_secs > 0) {
+		if (mbox_cmd->opcode != CXL_MBOX_OP_GET_HEALTH_INFO)
+			return -EBUSY;
+	}
+
+	cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
+			     mbox_cmd->opcode);
+	if (mbox_cmd->size_in) {
+		if (WARN_ON(!mbox_cmd->payload_in))
+			return -EINVAL;
+
+		cmd_reg |= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK,
+				      mbox_cmd->size_in);
+		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
+	}
+
+	/* #2, #3 */
+	writeq(cmd_reg, cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
+
+	/* #4 */
+	dev_dbg(dev, "Sending command: 0x%04x\n", mbox_cmd->opcode);
+	writel(CXLDEV_MBOX_CTRL_DOORBELL,
+	       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
+
+	/* #5 */
+	rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
+	if (rc == -ETIMEDOUT) {
+		u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+
+		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
+		return rc;
+	}
+
+	/* #6 */
+	status_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
+	mbox_cmd->return_code =
+		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
+
+	/*
+	 * Handle the background command in a synchronous manner.
+	 *
+	 * All other mailbox commands will serialize/queue on the mbox_mutex,
+	 * which we currently hold. Furthermore this also guarantees that
+	 * cxl_mbox_background_complete() checks are safe amongst each other,
+	 * in that no new bg operation can occur in between.
+	 *
+	 * Background operations are timesliced in accordance with the nature
+	 * of the command. In the event of timeout, the mailbox state is
+	 * indeterminate until the next successful command submission and the
+	 * driver can get back in sync with the hardware state.
+	 */
+	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
+		u64 bg_status_reg;
+		int i, timeout;
+
+		/*
+		 * Sanitization is a special case which monopolizes the device
+		 * and cannot be timesliced. Handle asynchronously instead,
+		 * and allow userspace to poll(2) for completion.
+		 */
+		if (mbox_cmd->opcode == CXL_MBOX_OP_SANITIZE) {
+			if (mds->security.sanitize_active)
+				return -EBUSY;
+
+			/* give first timeout a second */
+			timeout = 1;
+			mds->security.poll_tmo_secs = timeout;
+			mds->security.sanitize_active = true;
+			schedule_delayed_work(&mds->security.poll_dwork,
+					      timeout * HZ);
+			dev_dbg(dev, "Sanitization operation started\n");
+			goto success;
+		}
+
+		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
+			mbox_cmd->opcode);
+
+		timeout = mbox_cmd->poll_interval_ms;
+		for (i = 0; i < mbox_cmd->poll_count; i++) {
+			if (rcuwait_wait_event_timeout(&mds->mbox_wait,
+				       cxl_mbox_background_complete(cxlds),
+				       TASK_UNINTERRUPTIBLE,
+				       msecs_to_jiffies(timeout)) > 0)
+				break;
+		}
+
+		if (!cxl_mbox_background_complete(cxlds)) {
+			dev_err(dev, "timeout waiting for background (%d ms)\n",
+				timeout * mbox_cmd->poll_count);
+			return -ETIMEDOUT;
+		}
+
+		bg_status_reg = readq(cxlds->regs.mbox +
+				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
+		mbox_cmd->return_code =
+			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
+				  bg_status_reg);
+		dev_dbg(dev,
+			"Mailbox background operation (0x%04x) completed\n",
+			mbox_cmd->opcode);
+	}
+
+	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
+		dev_dbg(dev, "Mailbox operation had an error: %s\n",
+			cxl_mbox_cmd_rc2str(mbox_cmd));
+		return 0; /* completed but caller must check return_code */
+	}
+
+success:
+	/* #7 */
+	cmd_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
+	out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);
+
+	/* #8 */
+	if (out_len && mbox_cmd->payload_out) {
+		/*
+		 * Sanitize the copy. If hardware misbehaves, out_len per the
+		 * spec can actually be greater than the max allowed size (21
+		 * bits available but spec defined 1M max). The caller also may
+		 * have requested less data than the hardware supplied even
+		 * within spec.
+		 */
+		size_t n;
+
+		n = min3(mbox_cmd->size_out, mds->payload_size, out_len);
+		memcpy_fromio(mbox_cmd->payload_out, payload, n);
+		mbox_cmd->size_out = n;
+	} else {
+		mbox_cmd->size_out = 0;
+	}
+
+	return 0;
+}
+
+static int cxl_pci_mbox_send(struct cxl_memdev_state *mds,
+			     struct cxl_mbox_cmd *cmd)
+{
+	int ret;
+
+	/* All mailbox traffic is serialized on mbox_mutex */
+	mutex_lock_io(&mds->mbox_mutex);
+	ret = __cxl_pci_mbox_send_cmd(mds, cmd);
+	mutex_unlock(&mds->mbox_mutex);
+	return ret;
+}
+
+static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds)
+{
+	struct cxl_dev_state *cxlds = &mds->cxlds;
+	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
+	struct device *dev = cxlds->dev;
+	unsigned long timeout;
+	int irq, msgnum;
+	u64 md_status;
+	u32 ctrl;
+
+	timeout = jiffies + mbox_ready_timeout * HZ;
+	do {
+		md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+		if (md_status & CXLMDEV_MBOX_IF_READY)
+			break;
+		if (msleep_interruptible(100))
+			break;	/* sleep cut short: stop waiting */
+	} while (!time_after(jiffies, timeout));
+
+	if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
+		cxl_err(dev, md_status, "timeout awaiting mailbox ready");
+		return -ETIMEDOUT;
+	}
+
+	/*
+	 * A command may be in flight from a previous driver instance,
+	 * think kexec, do one doorbell wait so that
+	 * __cxl_pci_mbox_send_cmd() can assume that it is the only
+	 * source for future doorbell busy events.
+	 */
+	if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) {
+		cxl_err(dev, md_status, "timeout awaiting mailbox idle");
+		return -ETIMEDOUT;
+	}
+
+	mds->mbox_send = cxl_pci_mbox_send;
+	mds->payload_size =
+		1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);
+
+	/*
+	 * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
+	 *
+	 * If the size is too small, mandatory commands will not work and so
+	 * there's no point in going forward. If the size is too large, there's
+	 * no harm in soft limiting it.
+	 */
+	mds->payload_size = min_t(size_t, mds->payload_size, SZ_1M);
+	if (mds->payload_size < 256) {
+		dev_err(dev, "Mailbox is too small (%zub)",
+			mds->payload_size);
+		return -ENXIO;
+	}
+
+	dev_dbg(dev, "Mailbox payload sized %zu", mds->payload_size);
+
+	rcuwait_init(&mds->mbox_wait);
+	INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mbox_sanitize_work);
+
+	/* background command interrupts are optional */
+	if (!(cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ))
+		return 0;
+
+	msgnum = FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap);
+	irq = pci_irq_vector(to_pci_dev(cxlds->dev), msgnum);
+	if (irq < 0)
+		return 0;	/* not fatal: the interrupt is optional */
+
+	if (cxl_request_irq(cxlds, irq, NULL, cxl_pci_mbox_irq))
+		return 0;	/* not fatal: the interrupt is optional */
+
+	dev_dbg(cxlds->dev, "Mailbox interrupts enabled\n");
+	/* enable background command mbox irq support */
+	ctrl = readl(cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
+	ctrl |= CXLDEV_MBOX_CTRL_BG_CMD_IRQ;
+	writel(ctrl, cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
+
+	return 0;
+}
+
+/*
+ * Assume that any Root Complex Integrated Endpoint (RCiEP) that emits the
+ * CXL memory expander class code is an RCD
+ */
+static bool is_cxl_restricted(struct pci_dev *pdev)
+{
+	return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END;
+}
+
+static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev,
+				  struct cxl_register_map *map)
+{
+	resource_size_t comp_phys;
+	struct cxl_dport *dport;
+	struct cxl_port *port;
+
+	/* Start from an empty map so early exits leave no stale resource */
+	*map = (struct cxl_register_map) {
+		.host = &pdev->dev,
+		.resource = CXL_RESOURCE_NONE,
+	};
+
+	port = cxl_pci_find_port(pdev, &dport);
+	if (!port)
+		return -EPROBE_DEFER;
+
+	/* The port reference is dropped as soon as the lookup is done */
+	comp_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport);
+	put_device(&port->dev);
+	if (comp_phys == CXL_RESOURCE_NONE)
+		return -ENXIO;
+
+	map->resource = comp_phys;
+	map->reg_type = CXL_REGLOC_RBI_COMPONENT;
+	map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE;
+
+	return 0;
+}
+
+static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
+			      struct cxl_register_map *map)
+{
+	int err;
+
+	err = cxl_find_regblock(pdev, type, map);
+
+	/*
+	 * The Register Locator DVSEC lookup failed. For a restricted device
+	 * (RCH case) the Component Registers may still be reachable through
+	 * the RCRB, so try that before giving up.
+	 */
+	if (err && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev))
+		err = cxl_rcrb_get_comp_regs(pdev, map);
+
+	if (!err)
+		err = cxl_setup_regs(map);
+
+	return err;
+}
+
+static int cxl_pci_ras_unmask(struct pci_dev *pdev)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	void __iomem *mask_reg;
+	u32 before, after;
+	u16 devctl;
+	int rc;
+
+	if (!cxlds->regs.ras) {
+		dev_dbg(&pdev->dev, "No RAS registers.\n");
+		return 0;
+	}
+
+	/* BIOS has PCIe AER error control */
+	if (!pcie_aer_is_native(pdev))
+		return 0;
+
+	rc = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &devctl);
+	if (rc)
+		return rc;
+
+	/* Clear the uncorrectable mask bits only when URRE is set in DEVCTL */
+	if (devctl & PCI_EXP_DEVCTL_URRE) {
+		mask_reg = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_MASK_OFFSET;
+		before = readl(mask_reg);
+		after = before & ~(CXL_RAS_UNCORRECTABLE_MASK_MASK |
+				   CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK);
+		writel(after, mask_reg);
+		dev_dbg(&pdev->dev,
+			"Uncorrectable RAS Errors Mask: %#x -> %#x\n",
+			before, after);
+	}
+
+	/* Likewise clear the correctable mask bits only when CERE is set */
+	if (devctl & PCI_EXP_DEVCTL_CERE) {
+		mask_reg = cxlds->regs.ras + CXL_RAS_CORRECTABLE_MASK_OFFSET;
+		before = readl(mask_reg);
+		after = before & ~CXL_RAS_CORRECTABLE_MASK_MASK;
+		writel(after, mask_reg);
+		dev_dbg(&pdev->dev, "Correctable RAS Errors Mask: %#x -> %#x\n",
+			before, after);
+	}
+
+	return 0;
+}
+
+static void free_event_buf(void *buf)
+{
+	kvfree(buf);	/* pairs with kvmalloc() in cxl_mem_alloc_event_buf() */
+}
+
+/*
+ * There is a single buffer for reading event logs from the mailbox.  All logs
+ * share this buffer protected by the mds->event_log_lock.
+ */
+static int cxl_mem_alloc_event_buf(struct cxl_memdev_state *mds)
+{
+	struct cxl_get_event_payload *payload;
+
+	payload = kvmalloc(mds->payload_size, GFP_KERNEL);
+	if (!payload)
+		return -ENOMEM;
+	/* Publish the buffer, then register its release as a devm action */
+	mds->event.buf = payload;
+	return devm_add_action_or_reset(mds->cxlds.dev, free_event_buf, payload);
+}
+
+static int cxl_alloc_irq_vectors(struct pci_dev *pdev)
+{
+	int count;
+
+	/*
+	 * INTx is deliberately not requested: per CXL 3.0 3.1.1 (CXL.io
+	 * Endpoint) a function taking part in CXL.cache or CXL.mem must not
+	 * generate INTx messages.
+	 *
+	 * No explicit pci_free_irq_vectors() is needed either; even without
+	 * a pcim_* name the allocation is released automatically, see
+	 * pci_setup_msi_context().
+	 */
+	count = pci_alloc_irq_vectors(pdev, 1, CXL_PCI_DEFAULT_MAX_VECTORS,
+				      PCI_IRQ_MSIX | PCI_IRQ_MSI);
+	if (count >= 1)
+		return 0;
+
+	dev_dbg(&pdev->dev, "Failed to alloc irq vectors: %d\n", count);
+	return -ENXIO;
+}
+
+static irqreturn_t cxl_event_thread(int irq, void *id)
+{
+	struct cxl_dev_id *cookie = id;
+	struct cxl_dev_state *cxlds = cookie->cxlds;
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+	void __iomem *evt_status =
+		cxlds->regs.status + CXLDEV_DEV_EVENT_STATUS_OFFSET;
+	u32 pending;
+
+	/* Keep draining until no log known to the driver reports events */
+	for (;;) {
+		/*
+		 * CXL 3.0 8.2.8.3.1: only the low 32 bits carry status, the
+		 * upper 32 bits are reserved and therefore not read.
+		 */
+		pending = readl(evt_status) & CXLDEV_EVENT_STATUS_ALL;
+		if (!pending)
+			return IRQ_HANDLED;
+
+		cxl_mem_get_event_records(mds, pending);
+		cond_resched();
+	}
+}
+
+static int cxl_event_req_irq(struct cxl_dev_state *cxlds, u8 setting)
+{
+	u8 msgnum = FIELD_GET(CXLDEV_EVENT_INT_MSGNUM_MASK, setting);
+	int irq;
+
+	/* Only a log set to MSI/MSI-X mode gets an interrupt requested */
+	if (FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting) != CXL_INT_MSI_MSIX)
+		return -ENXIO;
+
+	irq = pci_irq_vector(to_pci_dev(cxlds->dev), msgnum);
+	if (irq < 0)
+		return irq;
+
+	return cxl_request_irq(cxlds, irq, NULL, cxl_event_thread);
+}
+
+static int cxl_event_get_int_policy(struct cxl_memdev_state *mds,
+				    struct cxl_event_interrupt_policy *policy)
+{
+	int rc;
+	struct cxl_mbox_cmd get_policy = {
+		.opcode = CXL_MBOX_OP_GET_EVT_INT_POLICY,
+		.payload_out = policy,
+		.size_out = sizeof(*policy),
+	};
+
+	/* @policy itself is the output buffer of the mailbox command */
+	rc = cxl_internal_send_cmd(mds, &get_policy);
+	if (rc >= 0)
+		return rc;
+	dev_err(mds->cxlds.dev, "Failed to get event interrupt policy : %d", rc);
+	return rc;
+}
+
+/* Switch every event log to MSI/MSI-X and read back the resulting policy */
+static int cxl_event_config_msgnums(struct cxl_memdev_state *mds,
+				    struct cxl_event_interrupt_policy *policy)
+{
+	struct cxl_mbox_cmd set_policy = {
+		.opcode = CXL_MBOX_OP_SET_EVT_INT_POLICY,
+		.payload_in = policy,
+		.size_in = sizeof(*policy),
+	};
+	int rc;
+
+	/* Request MSI/MSI-X mode for the info, warn, failure and fatal logs */
+	*policy = (struct cxl_event_interrupt_policy) {
+		.info_settings = CXL_INT_MSI_MSIX,
+		.warn_settings = CXL_INT_MSI_MSIX,
+		.failure_settings = CXL_INT_MSI_MSIX,
+		.fatal_settings = CXL_INT_MSI_MSIX,
+	};
+
+	rc = cxl_internal_send_cmd(mds, &set_policy);
+	if (rc < 0) {
+		dev_err(mds->cxlds.dev, "Failed to set event interrupt policy : %d",
+			rc);
+		return rc;
+	}
+
+	/* Retrieve final interrupt settings */
+	return cxl_event_get_int_policy(mds, policy);
+}
+
+static int cxl_event_irqsetup(struct cxl_memdev_state *mds)
+{
+	struct cxl_event_interrupt_policy policy;
+	struct device *dev = mds->cxlds.dev;
+	int err;
+
+	/* Program MSI/MSI-X on all logs, then request one IRQ per log */
+	err = cxl_event_config_msgnums(mds, &policy);
+	if (err)
+		return err;
+
+	err = cxl_event_req_irq(&mds->cxlds, policy.info_settings);
+	if (err) {
+		dev_err(dev, "Failed to get interrupt for event Info log\n");
+		return err;
+	}
+
+	err = cxl_event_req_irq(&mds->cxlds, policy.warn_settings);
+	if (err) {
+		dev_err(dev, "Failed to get interrupt for event Warn log\n");
+		return err;
+	}
+
+	err = cxl_event_req_irq(&mds->cxlds, policy.failure_settings);
+	if (err) {
+		dev_err(dev, "Failed to get interrupt for event Failure log\n");
+		return err;
+	}
+
+	err = cxl_event_req_irq(&mds->cxlds, policy.fatal_settings);
+	if (err) {
+		dev_err(dev, "Failed to get interrupt for event Fatal log\n");
+		return err;
+	}
+	return 0;
+}
+
+static bool cxl_event_int_is_fw(u8 setting)
+{
+	/* True when the mode field of @setting equals CXL_INT_FW */
+	return (u8)FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting) ==
+	       CXL_INT_FW;
+}
+
+static int cxl_event_config(struct pci_host_bridge *host_bridge,
+			    struct cxl_memdev_state *mds)
+{
+	struct cxl_event_interrupt_policy cur;
+	int err;
+
+	/*
+	 * Event records have exactly one consumer.  If BIOS kept control of
+	 * CXL error reporting, leave the logs to it.
+	 */
+	if (!host_bridge->native_cxl_error)
+		return 0;
+
+	err = cxl_mem_alloc_event_buf(mds);
+	if (err)
+		return err;
+
+	err = cxl_event_get_int_policy(mds, &cur);
+	if (err)
+		return err;
+
+	if (cxl_event_int_is_fw(cur.info_settings) ||
+	    cxl_event_int_is_fw(cur.warn_settings) ||
+	    cxl_event_int_is_fw(cur.failure_settings) ||
+	    cxl_event_int_is_fw(cur.fatal_settings)) {
+		dev_err(mds->cxlds.dev,
+			"FW still in control of Event Logs despite _OSC settings\n");
+		return -EBUSY;
+	}
+
+	err = cxl_event_irqsetup(mds);
+	if (err)
+		return err;
+
+	/* Fetch records for every log once the interrupts are in place */
+	cxl_mem_get_event_records(mds, CXLDEV_EVENT_STATUS_ALL);
+	return 0;
+}
+
+static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
+	struct cxl_memdev_state *mds;
+	struct cxl_dev_state *cxlds;
+	struct cxl_register_map map;
+	struct cxl_memdev *cxlmd;
+	int i, rc, pmu_count;
+
+	/*
+	 * Double check the anonymous union trickery in struct cxl_regs
+	 * FIXME switch to struct_group()
+	 */
+	BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
+		     offsetof(struct cxl_regs, device_regs.memdev));
+
+	rc = pcim_enable_device(pdev);
+	if (rc)
+		return rc;
+	pci_set_master(pdev);
+
+	mds = cxl_memdev_state_create(&pdev->dev);
+	if (IS_ERR(mds))
+		return PTR_ERR(mds);
+	cxlds = &mds->cxlds;
+	pci_set_drvdata(pdev, cxlds);	/* read back by the PCI error handlers */
+
+	cxlds->rcd = is_cxl_restricted(pdev);
+	cxlds->serial = pci_get_dsn(pdev);
+	cxlds->cxl_dvsec = pci_find_dvsec_capability(
+		pdev, PCI_DVSEC_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
+	if (!cxlds->cxl_dvsec)
+		dev_warn(&pdev->dev,
+			 "Device DVSEC not present, skip CXL.mem init\n");
+
+	rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
+	if (rc)
+		return rc;
+
+	rc = cxl_map_device_regs(&map, &cxlds->regs.device_regs);
+	if (rc)
+		return rc;
+
+	/*
+	 * If the component registers can't be found, the cxl_pci driver may
+	 * still be useful for management functions so don't return an error.
+	 */
+	cxlds->component_reg_phys = CXL_RESOURCE_NONE;
+	rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT, &map);
+	if (rc)
+		dev_warn(&pdev->dev, "No component registers (%d)\n", rc);
+	else if (!map.component_map.ras.valid)
+		dev_dbg(&pdev->dev, "RAS registers not found\n");
+
+	cxlds->component_reg_phys = map.resource;
+
+	rc = cxl_map_component_regs(&map, &cxlds->regs.component,
+				    BIT(CXL_CM_CAP_CAP_ID_RAS));
+	if (rc)
+		dev_dbg(&pdev->dev, "Failed to map RAS capability.\n");
+
+	rc = cxl_await_media_ready(cxlds);
+	if (rc == 0)
+		cxlds->media_ready = true;
+	else
+		dev_warn(&pdev->dev, "Media not active (%d)\n", rc);
+
+	rc = cxl_alloc_irq_vectors(pdev);
+	if (rc)
+		return rc;
+
+	rc = cxl_pci_setup_mailbox(mds);
+	if (rc)
+		return rc;
+
+	rc = cxl_enumerate_cmds(mds);
+	if (rc)
+		return rc;
+
+	rc = cxl_set_timestamp(mds);
+	if (rc)
+		return rc;
+
+	rc = cxl_poison_state_init(mds);
+	if (rc)
+		return rc;
+
+	rc = cxl_dev_state_identify(mds);
+	if (rc)
+		return rc;
+
+	rc = cxl_mem_create_range_info(mds);
+	if (rc)
+		return rc;
+
+	cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds);
+	if (IS_ERR(cxlmd))
+		return PTR_ERR(cxlmd);
+
+	rc = devm_cxl_setup_fw_upload(&pdev->dev, mds);
+	if (rc)
+		return rc;
+
+	rc = devm_cxl_sanitize_setup_notifier(&pdev->dev, cxlmd);
+	if (rc)
+		return rc;
+
+	pmu_count = cxl_count_regblock(pdev, CXL_REGLOC_RBI_PMU);
+	for (i = 0; i < pmu_count; i++) {	/* a PMU failure only ends this loop */
+		struct cxl_pmu_regs pmu_regs;
+
+		rc = cxl_find_regblock_instance(pdev, CXL_REGLOC_RBI_PMU, &map, i);
+		if (rc) {
+			dev_dbg(&pdev->dev, "Could not find PMU regblock\n");
+			break;
+		}
+
+		rc = cxl_map_pmu_regs(pdev, &pmu_regs, &map);
+		if (rc) {
+			dev_dbg(&pdev->dev, "Could not map PMU regs\n");
+			break;
+		}
+
+		rc = devm_cxl_pmu_add(cxlds->dev, &pmu_regs, cxlmd->id, i, CXL_PMU_MEMDEV);
+		if (rc) {
+			dev_dbg(&pdev->dev, "Could not add PMU instance\n");
+			break;
+		}
+	}
+
+	rc = cxl_event_config(host_bridge, mds);
+	if (rc)
+		return rc;
+
+	/* RAS unmasking is optional: log the failure, do not fail the probe */
+	if (cxl_pci_ras_unmask(pdev))
+		dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");
+
+	pci_save_state(pdev);
+
+	return 0;
+}
+
+static const struct pci_device_id cxl_mem_pci_tbl[] = {
+	/* PCI class code for CXL.mem Type-3 Devices */
+	{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
+	{ /* terminate list */ },
+};
+MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
+
+static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev)
+{
+	struct cxl_dev_state *state = pci_get_drvdata(pdev);
+	struct device *memdev_dev = &state->cxlmd->dev;
+	bool attached;
+
+	dev_info(&pdev->dev, "%s: restart CXL.mem after slot reset\n",
+		 dev_name(memdev_dev));
+	pci_restore_state(pdev);	/* counterpart of pci_save_state() in probe */
+	attached = device_attach(memdev_dev) > 0;
+
+	return attached ? PCI_ERS_RESULT_RECOVERED : PCI_ERS_RESULT_DISCONNECT;
+}
+
+static void cxl_error_resume(struct pci_dev *pdev)
+{
+	struct cxl_dev_state *state = pci_get_drvdata(pdev);
+	struct device *memdev_dev = &state->cxlmd->dev;
+	/* A driver still bound to the memdev is what gets reported as success */
+	const char *outcome = memdev_dev->driver ? "successful" : "failed";
+
+	dev_info(&pdev->dev, "%s: error resume %s\n", dev_name(memdev_dev), outcome);
+}
+
+static const struct pci_error_handlers cxl_error_handlers = {
+	.error_detected	= cxl_error_detected,
+	.slot_reset	= cxl_slot_reset,	/* restores PCI state, re-attaches the memdev */
+	.resume		= cxl_error_resume,	/* only logs whether the memdev is bound */
+	.cor_error_detected	= cxl_cor_error_detected,
+};
+
+static struct pci_driver cxl_pci_driver = {
+	.name			= KBUILD_MODNAME,
+	.id_table		= cxl_mem_pci_tbl,
+	.probe			= cxl_pci_probe,
+	.err_handler		= &cxl_error_handlers,
+	.driver	= {
+		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
+	},
+};
+
+MODULE_LICENSE("GPL v2");
+module_pci_driver(cxl_pci_driver);
+MODULE_IMPORT_NS(CXL);
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
new file mode 100644
index 0000000000..7cb8994f88
--- /dev/null
+++ b/drivers/cxl/pmem.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2021 Intel Corporation. All rights reserved. */
+#include <linux/libnvdimm.h>
+#include <asm/unaligned.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/ndctl.h>
+#include <linux/async.h>
+#include <linux/slab.h>
+#include <linux/nd.h>
+#include "cxlmem.h"
+#include "cxl.h"
+
+extern const struct nvdimm_security_ops *cxl_security_ops;
+
+static __read_mostly DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
+
+static void clear_exclusive(void *mds)
+{
+	clear_exclusive_cxl_commands(mds, exclusive_cmds);	/* devm undo for cxl_nvdimm_probe() */
+}
+
+static void unregister_nvdimm(void *nvdimm)
+{
+	nvdimm_delete(nvdimm);	/* devm action registered by cxl_nvdimm_probe() */
+}
+
+static ssize_t provider_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* Fetch back the cxl_nvdimm that cxl_nvdimm_probe() gave to __nvdimm_create() */
+	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(to_nvdimm(dev));
+
+	return sysfs_emit(buf, "%s\n", dev_name(&cxl_nvd->dev));
+}
+static DEVICE_ATTR_RO(provider);
+
+static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(to_nvdimm(dev));
+	struct cxl_dev_state *cxlds = cxl_nvd->cxlmd->cxlds;
+
+	/* NOTE(review): %lld is signed; confirm the type of ->serial before changing */
+	return sysfs_emit(buf, "%lld\n", cxlds->serial);
+}
+static DEVICE_ATTR_RO(id);
+
+static struct attribute *cxl_dimm_attributes[] = {
+	&dev_attr_id.attr,
+	&dev_attr_provider.attr,
+	NULL
+};
+
+static const struct attribute_group cxl_dimm_attribute_group = {
+	.name = "cxl",
+	.attrs = cxl_dimm_attributes,
+};
+
+static const struct attribute_group *cxl_dimm_attribute_groups[] = {
+	&cxl_dimm_attribute_group,
+	NULL
+};
+
+static int cxl_nvdimm_probe(struct device *dev)
+{
+	struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
+	struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+	struct cxl_nvdimm_bridge *cxl_nvb = cxlmd->cxl_nvb;
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+	unsigned long dimm_flags = BIT(NDD_LABELING) | BIT(NDD_REGISTER_SYNC);
+	/* Only the three config (label) area commands are advertised */
+	unsigned long supported = BIT(ND_CMD_GET_CONFIG_SIZE) |
+				  BIT(ND_CMD_GET_CONFIG_DATA) |
+				  BIT(ND_CMD_SET_CONFIG_DATA);
+	struct nvdimm *nvdimm;
+	int rc;
+
+	/* Claim the exclusive commands; the devm action releases them again */
+	set_exclusive_cxl_commands(mds, exclusive_cmds);
+	rc = devm_add_action_or_reset(dev, clear_exclusive, mds);
+	if (rc)
+		return rc;
+
+	nvdimm = __nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd,
+				 cxl_dimm_attribute_groups, dimm_flags,
+				 supported, 0, NULL, cxl_nvd->dev_id,
+				 cxl_security_ops, NULL);
+	if (!nvdimm)
+		return -ENOMEM;
+
+	dev_set_drvdata(dev, nvdimm);
+	return devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm);
+}
+
+static struct cxl_driver cxl_nvdimm_driver = {
+	.name = "cxl_nvdimm",
+	.probe = cxl_nvdimm_probe,
+	.id = CXL_DEVICE_NVDIMM,
+	.drv = {
+		.suppress_bind_attrs = true,
+	},
+};
+
+static int cxl_pmem_get_config_size(struct cxl_memdev_state *mds,
+				    struct nd_cmd_get_config_size *cmd,
+				    unsigned int buf_len)
+{
+	/* Refuse a caller buffer that cannot hold the whole reply */
+	if (buf_len < sizeof(*cmd))
+		return -EINVAL;
+
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->config_size = mds->lsa_size;
+	/* One mailbox payload minus the Set LSA header that precedes the data */
+	cmd->max_xfer = mds->payload_size - sizeof(struct cxl_mbox_set_lsa);
+
+	return 0;
+}
+
+static int cxl_pmem_get_config_data(struct cxl_memdev_state *mds,
+				    struct nd_cmd_get_config_data_hdr *cmd,
+				    unsigned int buf_len)
+{
+	struct cxl_mbox_get_lsa lsa_req;
+	struct cxl_mbox_cmd mbox;
+	int rc;
+
+	/* The header must fit before in_length is read; then header + data */
+	if (buf_len < sizeof(*cmd) ||
+	    buf_len < struct_size(cmd, out_buf, cmd->in_length))
+		return -EINVAL;
+
+	lsa_req = (struct cxl_mbox_get_lsa) {
+		.offset = cpu_to_le32(cmd->in_offset),
+		.length = cpu_to_le32(cmd->in_length),
+	};
+	mbox = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_GET_LSA,
+		.payload_in = &lsa_req,
+		.size_in = sizeof(lsa_req),
+		.size_out = cmd->in_length,
+		.payload_out = cmd->out_buf,
+	};
+
+	rc = cxl_internal_send_cmd(mds, &mbox);
+	cmd->status = 0;	/* always zero; failures are reported through rc */
+
+	return rc;
+}
+
+static int cxl_pmem_set_config_data(struct cxl_memdev_state *mds,
+				    struct nd_cmd_set_config_hdr *cmd,
+				    unsigned int buf_len)
+{
+	struct cxl_mbox_set_lsa *set_lsa;
+	struct cxl_mbox_cmd mbox_cmd;
+	int rc;
+
+	if (sizeof(*cmd) > buf_len)
+		return -EINVAL;
+
+	/* The buffer must hold header + in_length data bytes + a 4-byte status */
+	if (size_add(struct_size(cmd, in_buf, cmd->in_length), 4) > buf_len)
+		return -EINVAL;
+
+	set_lsa =
+		kvzalloc(struct_size(set_lsa, data, cmd->in_length), GFP_KERNEL);
+	if (!set_lsa)
+		return -ENOMEM;
+
+	*set_lsa = (struct cxl_mbox_set_lsa) {
+		.offset = cpu_to_le32(cmd->in_offset),
+	};
+	memcpy(set_lsa->data, cmd->in_buf, cmd->in_length);
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_SET_LSA,
+		.payload_in = set_lsa,
+		.size_in = struct_size(set_lsa, data, cmd->in_length),
+	};
+
+	rc = cxl_internal_send_cmd(mds, &mbox_cmd);
+
+	/*
+	 * Set "firmware" status (4 packed bytes at the end of the input
+	 * payload) to zero whether or not the mailbox command succeeded.
+	 */
+	put_unaligned(0, (u32 *) &cmd->in_buf[cmd->in_length]);
+	kvfree(set_lsa);
+
+	return rc;
+}
+
+static int cxl_pmem_nvdimm_ctl(struct nvdimm *nvdimm, unsigned int cmd,
+			       void *buf, unsigned int buf_len)
+{
+	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxl_nvd->cxlmd->cxlds);
+	unsigned long supported = nvdimm_cmd_mask(nvdimm);
+
+	/* Commands missing from the nvdimm's command mask are refused up front */
+	if (!test_bit(cmd, &supported))
+		return -ENOTTY;
+
+	switch (cmd) {
+	case ND_CMD_GET_CONFIG_SIZE:
+		return cxl_pmem_get_config_size(mds, buf, buf_len);
+	case ND_CMD_GET_CONFIG_DATA:
+		return cxl_pmem_get_config_data(mds, buf, buf_len);
+	case ND_CMD_SET_CONFIG_DATA:
+		return cxl_pmem_set_config_data(mds, buf, buf_len);
+	default:
+		return -ENOTTY;
+	}
+}
+
+static int cxl_pmem_ctl(struct nvdimm_bus_descriptor *nd_desc,
+			struct nvdimm *nvdimm, unsigned int cmd, void *buf,
+			unsigned int buf_len, int *cmd_rc)
+{
+	/*
+	 * There is no firmware status to translate, so report success there
+	 * and let the return value carry any transport error.
+	 */
+	*cmd_rc = 0;
+
+	/* Calls without an nvdimm are not handled here */
+	return nvdimm ? cxl_pmem_nvdimm_ctl(nvdimm, cmd, buf, buf_len) :
+			-ENOTTY;
+}
+
+static int detach_nvdimm(struct device *dev, void *data)
+{
+	struct cxl_nvdimm *cxl_nvd;
+	struct cxl_memdev *cxlmd;
+	bool owned_by_bridge = false;
+
+	if (!is_cxl_nvdimm(dev))
+		return 0;
+
+	/* Sample the driver binding and the owning bridge under the device lock */
+	device_lock(dev);
+	cxl_nvd = dev->driver ? to_cxl_nvdimm(dev) : NULL;
+	cxlmd = cxl_nvd ? cxl_nvd->cxlmd : NULL;
+	owned_by_bridge = cxlmd && cxlmd->cxl_nvb == data;
+	device_unlock(dev);
+
+	/* The driver is released only after the lock has been dropped */
+	if (owned_by_bridge)
+		device_release_driver(dev);
+	return 0;
+}
+
+static void unregister_nvdimm_bus(void *_cxl_nvb)
+{
+	struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb;
+	struct nvdimm_bus *nvdimm_bus = cxl_nvb->nvdimm_bus;
+
+	bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb, detach_nvdimm);
+	/* Clear the bridge's pointer first, then unregister via the saved local */
+	cxl_nvb->nvdimm_bus = NULL;
+	nvdimm_bus_unregister(nvdimm_bus);
+}
+
+static int cxl_nvdimm_bridge_probe(struct device *dev)
+{
+	struct cxl_nvdimm_bridge *bridge = to_cxl_nvdimm_bridge(dev);
+	struct nvdimm_bus_descriptor *desc = &bridge->nd_desc;
+
+	/* Start from a zeroed descriptor, then fill in what this bus provides */
+	memset(desc, 0, sizeof(*desc));
+	desc->provider_name = "CXL";
+	desc->module = THIS_MODULE;
+	desc->ndctl = cxl_pmem_ctl;
+
+	bridge->nvdimm_bus = nvdimm_bus_register(&bridge->dev, desc);
+	if (!bridge->nvdimm_bus)
+		return -ENOMEM;
+
+	/* unregister_nvdimm_bus() becomes the devm teardown for this bridge */
+	return devm_add_action_or_reset(dev, unregister_nvdimm_bus, bridge);
+}
+
+static struct cxl_driver cxl_nvdimm_bridge_driver = {
+	.name = "cxl_nvdimm_bridge",
+	.probe = cxl_nvdimm_bridge_probe,
+	.id = CXL_DEVICE_NVDIMM_BRIDGE,
+	.drv = {
+		.suppress_bind_attrs = true,
+	},
+};
+
+static void unregister_nvdimm_region(void *nd_region)
+{
+	nvdimm_region_delete(nd_region);	/* devm action registered by cxl_pmem_region_probe() */
+}
+
+static void cxlr_pmem_remove_resource(void *res)
+{
+	remove_resource(res);	/* undoes insert_resource() from cxl_pmem_region_probe() */
+}
+
+struct cxl_pmem_region_info {
+	u64 offset;
+	u64 serial;
+};
+
+static int cxl_pmem_region_probe(struct device *dev)
+{
+	struct nd_mapping_desc mappings[CXL_DECODER_MAX_INTERLEAVE];
+	struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
+	struct cxl_region *cxlr = cxlr_pmem->cxlr;
+	struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb;
+	struct cxl_pmem_region_info *info = NULL;
+	struct nd_interleave_set *nd_set;
+	struct nd_region_desc ndr_desc;
+	struct cxl_nvdimm *cxl_nvd;
+	struct nvdimm *nvdimm;
+	struct resource *res;
+	int rc, i = 0;
+
+	memset(&mappings, 0, sizeof(mappings));
+	memset(&ndr_desc, 0, sizeof(ndr_desc));
+
+	res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	res->name = "Persistent Memory";
+	res->start = cxlr_pmem->hpa_range.start;
+	res->end = cxlr_pmem->hpa_range.end;
+	res->flags = IORESOURCE_MEM;
+	res->desc = IORES_DESC_PERSISTENT_MEMORY;
+
+	rc = insert_resource(&iomem_resource, res);
+	if (rc)
+		return rc;
+
+	rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res);
+	if (rc)
+		return rc;
+
+	ndr_desc.res = res;
+	ndr_desc.provider_data = cxlr_pmem;
+
+	ndr_desc.numa_node = memory_add_physaddr_to_nid(res->start);
+	ndr_desc.target_node = phys_to_target_node(res->start);
+	if (ndr_desc.target_node == NUMA_NO_NODE) {	/* fall back to numa_node */
+		ndr_desc.target_node = ndr_desc.numa_node;
+		dev_dbg(&cxlr->dev, "changing target node from %d to %d\n",
+			NUMA_NO_NODE, ndr_desc.target_node);
+	}
+
+	nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
+	if (!nd_set)
+		return -ENOMEM;
+
+	ndr_desc.memregion = cxlr->id;
+	set_bit(ND_REGION_CXL, &ndr_desc.flags);
+	set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
+
+	info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+		struct cxl_memdev *cxlmd = m->cxlmd;
+		struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+		cxl_nvd = cxlmd->cxl_nvd;
+		nvdimm = dev_get_drvdata(&cxl_nvd->dev);	/* set by cxl_nvdimm_probe() */
+		if (!nvdimm) {
+			dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i,
+				dev_name(&cxlmd->dev));
+			rc = -ENODEV;
+			goto out_nvd;
+		}
+
+		m->cxl_nvd = cxl_nvd;
+		mappings[i] = (struct nd_mapping_desc) {
+			.nvdimm = nvdimm,
+			.start = m->start,
+			.size = m->size,
+			.position = i,
+		};
+		info[i].offset = m->start;
+		info[i].serial = cxlds->serial;
+	}
+	ndr_desc.num_mappings = cxlr_pmem->nr_mappings;
+	ndr_desc.mapping = mappings;
+
+	/*
+	 * TODO enable CXL labels which skip the need for 'interleave-set cookie'
+	 */
+	nd_set->cookie1 =
+		nd_fletcher64(info, sizeof(*info) * cxlr_pmem->nr_mappings, 0);
+	nd_set->cookie2 = nd_set->cookie1;
+	ndr_desc.nd_set = nd_set;
+
+	cxlr_pmem->nd_region =
+		nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc);
+	if (!cxlr_pmem->nd_region) {
+		rc = -ENOMEM;
+		goto out_nvd;
+	}
+
+	rc = devm_add_action_or_reset(dev, unregister_nvdimm_region,
+				      cxlr_pmem->nd_region);
+out_nvd:
+	kfree(info);	/* scratch input for the cookie checksum only */
+
+	return rc;
+}
+
+static struct cxl_driver cxl_pmem_region_driver = {
+	.name = "cxl_pmem_region",
+	.probe = cxl_pmem_region_probe,
+	.id = CXL_DEVICE_PMEM_REGION,
+	.drv = {
+		.suppress_bind_attrs = true,
+	},
+};
+
+static __init int cxl_pmem_init(void)
+{
+	int rc;
+
+	/* Build the set cxl_nvdimm_probe() hands to set_exclusive_cxl_commands() */
+	set_bit(CXL_MEM_COMMAND_ID_SET_SHUTDOWN_STATE, exclusive_cmds);
+	set_bit(CXL_MEM_COMMAND_ID_SET_LSA, exclusive_cmds);
+
+	/* Register the bridge, nvdimm and region drivers, in that order */
+	rc = cxl_driver_register(&cxl_nvdimm_bridge_driver);
+	if (rc)
+		return rc;
+
+	rc = cxl_driver_register(&cxl_nvdimm_driver);
+	if (rc)
+		goto undo_bridge;
+
+	rc = cxl_driver_register(&cxl_pmem_region_driver);
+	if (!rc)
+		return 0;
+
+	/* Unwind in reverse registration order */
+	cxl_driver_unregister(&cxl_nvdimm_driver);
+undo_bridge:
+	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
+	return rc;
+}
+
+static __exit void cxl_pmem_exit(void)
+{
+	cxl_driver_unregister(&cxl_pmem_region_driver);	/* reverse of cxl_pmem_init() order */
+	cxl_driver_unregister(&cxl_nvdimm_driver);
+	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
+}
+
+MODULE_LICENSE("GPL v2");
+module_init(cxl_pmem_init);
+module_exit(cxl_pmem_exit);
+MODULE_IMPORT_NS(CXL);
+MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM_BRIDGE);
+MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM);
+MODULE_ALIAS_CXL(CXL_DEVICE_PMEM_REGION);
diff --git a/drivers/cxl/pmu.h b/drivers/cxl/pmu.h
new file mode 100644
index 0000000000..b1e9bcd9f2
--- /dev/null
+++ b/drivers/cxl/pmu.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright(c) 2023 Huawei
+ * CXL Specification rev 3.0 Section 8.2.7 (CPMU Register Interface)
+ */
+#ifndef CXL_PMU_H
+#define CXL_PMU_H
+#include <linux/device.h>
+
+enum cxl_pmu_type {
+	CXL_PMU_MEMDEV,
+};
+
+#define CXL_PMU_REGMAP_SIZE 0xe00 /* Table 8-32 CXL 3.0 specification */
+struct cxl_pmu {
+	struct device dev;
+	void __iomem *base;
+	int assoc_id;
+	int index;
+	enum cxl_pmu_type type;
+};
+
+#define to_cxl_pmu(dev) container_of(dev, struct cxl_pmu, dev)
+struct cxl_pmu_regs;
+int devm_cxl_pmu_add(struct device *parent, struct cxl_pmu_regs *regs,
+		     int assoc_id, int idx, enum cxl_pmu_type type);
+
+#endif
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
new file mode 100644
index 0000000000..6240e05b95
--- /dev/null
+++ b/drivers/cxl/port.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include "cxlmem.h"
+#include "cxlpci.h"
+
+/**
+ * DOC: cxl port
+ *
+ * The port driver enumerates dport via PCI and scans for HDM
+ * (Host-managed-Device-Memory) decoder resources via the
+ * @component_reg_phys value passed in by the agent that registered the
+ * port. All descendant ports of a CXL root port (described by platform
+ * firmware) are managed in this driver's context. Each driver instance
+ * is responsible for tearing down the driver context of immediate
+ * descendant ports. The locking for this is validated by
+ * CONFIG_PROVE_CXL_LOCKING.
+ *
+ * The primary service this driver provides is presenting APIs to other
+ * drivers to utilize the decoders, and indicating to userspace (via bind
+ * status) the connectivity of the CXL.mem protocol throughout the
+ * PCIe topology.
+ */
+
+static void schedule_detach(void *cxlmd)
+{
+	schedule_cxl_memdev_detach(cxlmd);	/* devm action registered by cxl_endpoint_port_probe() */
+}
+
+static int discover_region(struct device *dev, void *root)
+{
+	struct cxl_endpoint_decoder *decoder;
+	int err;
+
+	if (!is_endpoint_decoder(dev))
+		return 0;
+
+	/* Candidates are enabled decoders that are still in the AUTO state */
+	decoder = to_cxl_endpoint_decoder(dev);
+	if (!(decoder->cxld.flags & CXL_DECODER_F_ENABLE) ||
+	    decoder->state != CXL_DECODER_STATE_AUTO)
+		return 0;
+
+	/*
+	 * Best effort: a decoder that cannot be added to a region is only
+	 * logged, and the zero return below lets the caller's walk over the
+	 * remaining children continue.
+	 */
+	err = cxl_add_to_region(root, decoder);
+	if (err)
+		dev_dbg(dev, "failed to add to region: %#llx-%#llx\n",
+			decoder->cxld.hpa_range.start, decoder->cxld.hpa_range.end);
+
+	return 0;
+}
+
+static int cxl_switch_port_probe(struct cxl_port *port)
+{
+	struct cxl_hdm *hdm;
+	int enum_rc;
+
+	enum_rc = devm_cxl_port_enumerate_dports(port);
+	if (enum_rc < 0)
+		return enum_rc;
+
+	hdm = devm_cxl_setup_hdm(port, NULL);
+	if (!IS_ERR(hdm))
+		return devm_cxl_enumerate_decoders(hdm, NULL);
+
+	if (PTR_ERR(hdm) != -ENODEV) {	/* only -ENODEV may fall back below */
+		dev_err(&port->dev, "Failed to map HDM decoder capability\n");
+		return PTR_ERR(hdm);
+	}
+
+	if (enum_rc != 1) {	/* fallback requires dport enumeration to return 1 */
+		dev_err(&port->dev, "HDM decoder capability not found\n");
+		return -ENXIO;
+	}
+
+	dev_dbg(&port->dev, "Fallback to passthrough decoder\n");
+	return devm_cxl_add_passthrough_decoder(port);
+}
+
+static int cxl_endpoint_port_probe(struct cxl_port *port)
+{
+	struct cxl_endpoint_dvsec_info info = { .port = port };
+	struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct cxl_hdm *cxlhdm;
+	struct cxl_port *root;
+	int rc;
+
+	rc = cxl_dvsec_rr_decode(cxlds->dev, cxlds->cxl_dvsec, &info);
+	if (rc < 0)
+		return rc;
+
+	cxlhdm = devm_cxl_setup_hdm(port, &info);
+	if (IS_ERR(cxlhdm)) {
+		if (PTR_ERR(cxlhdm) == -ENODEV)
+			dev_err(&port->dev, "HDM decoder registers not found\n");
+		return PTR_ERR(cxlhdm);
+	}
+
+	/* Cache the CDAT data early so that is_visible() sees it; result unchecked */
+	read_cdat_data(port);
+
+	get_device(&cxlmd->dev);	/* presumably dropped by the detach action - confirm */
+	rc = devm_add_action_or_reset(&port->dev, schedule_detach, cxlmd);
+	if (rc)
+		return rc;
+
+	rc = cxl_hdm_decode_init(cxlds, cxlhdm, &info);
+	if (rc)
+		return rc;
+
+	rc = devm_cxl_enumerate_decoders(cxlhdm, &info);
+	if (rc)
+		return rc;
+
+	/*
+	 * This can't fail in practice as CXL root exit unregisters all
+	 * descendant ports and that in turn synchronizes with cxl_port_probe()
+	 */
+	root = find_cxl_root(port);
+
+	/*
+	 * Now that all endpoint decoders are successfully enumerated, try to
+	 * assemble regions from committed decoders
+	 */
+	device_for_each_child(&port->dev, root, discover_region);
+	put_device(&root->dev);	/* NOTE(review): root is used unchecked, see above */
+
+	return 0;
+}
+
+static int cxl_port_probe(struct device *dev)
+{
+	struct cxl_port *port = to_cxl_port(dev);
+
+	/* Endpoint ports and switch ports have separate probe paths */
+	return is_cxl_endpoint(port) ? cxl_endpoint_port_probe(port) :
+				       cxl_switch_port_probe(port);
+}
+
+static ssize_t CDAT_read(struct file *filp, struct kobject *kobj,
+			 struct bin_attribute *bin_attr, char *buf,
+			 loff_t offset, size_t count)
+{
+	struct cxl_port *port = to_cxl_port(kobj_to_dev(kobj));
+
+	/* CDAT flagged as unavailable on this port: fail the read */
+	if (!port->cdat_available)
+		return -ENXIO;
+
+	/* Available but no table cached: read back as empty */
+	if (!port->cdat.table)
+		return 0;
+
+	return memory_read_from_buffer(buf, count, &offset, port->cdat.table,
+				       port->cdat.length);
+}
+
+static BIN_ATTR_ADMIN_RO(CDAT, 0);
+
+static umode_t cxl_port_bin_attr_is_visible(struct kobject *kobj,
+					    struct bin_attribute *attr, int i)
+{
+	struct cxl_port *port = to_cxl_port(kobj_to_dev(kobj));
+
+	/* Hide everything except CDAT, and CDAT too when it is unavailable */
+	if (attr != &bin_attr_CDAT || !port->cdat_available)
+		return 0;
+
+	return attr->attr.mode;
+}
+
+static struct bin_attribute *cxl_cdat_bin_attributes[] = {
+	&bin_attr_CDAT,
+	NULL,
+};
+
+static struct attribute_group cxl_cdat_attribute_group = {
+	.bin_attrs = cxl_cdat_bin_attributes,
+	.is_bin_visible = cxl_port_bin_attr_is_visible,
+};
+
+static const struct attribute_group *cxl_port_attribute_groups[] = {
+	&cxl_cdat_attribute_group,
+	NULL,
+};
+
+static struct cxl_driver cxl_port_driver = {
+	.name = "cxl_port",
+	.probe = cxl_port_probe,
+	.id = CXL_DEVICE_PORT,
+	.drv = {
+		.dev_groups = cxl_port_attribute_groups,
+	},
+};
+
+module_cxl_driver(cxl_port_driver);
+MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS(CXL);
+MODULE_ALIAS_CXL(CXL_DEVICE_PORT);
diff --git a/drivers/cxl/security.c b/drivers/cxl/security.c
new file mode 100644
index 0000000000..21856a3f40
--- /dev/null
+++ b/drivers/cxl/security.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/libnvdimm.h>
+#include <asm/unaligned.h>
+#include <linux/module.h>
+#include <linux/async.h>
+#include <linux/slab.h>
+#include <linux/memregion.h>
+#include "cxlmem.h"
+#include "cxl.h"
+
+static unsigned long cxl_pmem_get_security_flags(struct nvdimm *nvdimm,
+						 enum nvdimm_passphrase_type ptype)
+{
+	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+	struct cxl_memdev_state *mds =
+		to_cxl_memdev_state(cxl_nvd->cxlmd->cxlds);
+	struct cxl_get_security_output {
+		__le32 flags;
+	} reply;
+	struct cxl_mbox_cmd query = {
+		.opcode = CXL_MBOX_OP_GET_SECURITY_STATE,
+		.size_out = sizeof(reply),
+		.payload_out = &reply,
+	};
+	unsigned long nd_flags = 0;
+	u32 state;
+
+	/* A failed mailbox command reports no flags at all */
+	if (cxl_internal_send_cmd(mds, &query) < 0)
+		return 0;
+
+	/* Cache the device-reported security state in the memdev state */
+	state = le32_to_cpu(reply.flags);
+	mds->security.state = state;
+
+	if (ptype == NVDIMM_MASTER) {
+		/* Master passphrase set maps to UNLOCKED, otherwise DISABLED */
+		if (state & CXL_PMEM_SEC_STATE_MASTER_PASS_SET)
+			set_bit(NVDIMM_SECURITY_UNLOCKED, &nd_flags);
+		else
+			set_bit(NVDIMM_SECURITY_DISABLED, &nd_flags);
+		if (state & CXL_PMEM_SEC_STATE_MASTER_PLIMIT)
+			set_bit(NVDIMM_SECURITY_FROZEN, &nd_flags);
+		return nd_flags;
+	}
+
+	/* No user passphrase set: DISABLED is the only flag reported */
+	if (!(state & CXL_PMEM_SEC_STATE_USER_PASS_SET)) {
+		set_bit(NVDIMM_SECURITY_DISABLED, &nd_flags);
+		return nd_flags;
+	}
+
+	if (state & (CXL_PMEM_SEC_STATE_FROZEN |
+		     CXL_PMEM_SEC_STATE_USER_PLIMIT))
+		set_bit(NVDIMM_SECURITY_FROZEN, &nd_flags);
+
+	if (state & CXL_PMEM_SEC_STATE_LOCKED)
+		set_bit(NVDIMM_SECURITY_LOCKED, &nd_flags);
+	else
+		set_bit(NVDIMM_SECURITY_UNLOCKED, &nd_flags);
+
+	return nd_flags;
+}
+
+static int cxl_pmem_security_change_key(struct nvdimm *nvdimm,
+					const struct nvdimm_key_data *old_data,
+					const struct nvdimm_key_data *new_data,
+					enum nvdimm_passphrase_type ptype)
+{
+	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+	struct cxl_memdev_state *mds =
+		to_cxl_memdev_state(cxl_nvd->cxlmd->cxlds);
+	struct cxl_set_pass req = {
+		.type = CXL_PMEM_SEC_PASS_USER,
+	};
+	struct cxl_mbox_cmd mbox_cmd = {
+		.opcode = CXL_MBOX_OP_SET_PASSPHRASE,
+		.size_in = sizeof(req),
+		.payload_in = &req,
+	};
+
+	/* Anything other than NVDIMM_MASTER selects the user passphrase */
+	if (ptype == NVDIMM_MASTER)
+		req.type = CXL_PMEM_SEC_PASS_MASTER;
+	/* Old and new passphrases are copied as NVDIMM_PASSPHRASE_LEN bytes each */
+	memcpy(req.old_pass, old_data->data, NVDIMM_PASSPHRASE_LEN);
+	memcpy(req.new_pass, new_data->data, NVDIMM_PASSPHRASE_LEN);
+
+	return cxl_internal_send_cmd(mds, &mbox_cmd);
+}
+
+static int __cxl_pmem_security_disable(struct nvdimm *nvdimm,
+				       const struct nvdimm_key_data *key_data,
+				       enum nvdimm_passphrase_type ptype)
+{
+	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+	struct cxl_memdev_state *mds =
+		to_cxl_memdev_state(cxl_nvd->cxlmd->cxlds);
+	struct cxl_disable_pass req = {
+		.type = CXL_PMEM_SEC_PASS_USER,
+	};
+	struct cxl_mbox_cmd mbox_cmd = {
+		.opcode = CXL_MBOX_OP_DISABLE_PASSPHRASE,
+		.size_in = sizeof(req),
+		.payload_in = &req,
+	};
+
+	/* Anything other than NVDIMM_MASTER selects the user passphrase */
+	if (ptype == NVDIMM_MASTER)
+		req.type = CXL_PMEM_SEC_PASS_MASTER;
+	/* The current passphrase accompanies the disable request */
+	memcpy(req.pass, key_data->data, NVDIMM_PASSPHRASE_LEN);
+
+	return cxl_internal_send_cmd(mds, &mbox_cmd);
+}
+
+static int cxl_pmem_security_disable(struct nvdimm *nvdimm,
+				     const struct nvdimm_key_data *key_data)
+{
+	return __cxl_pmem_security_disable(nvdimm, key_data, NVDIMM_USER);	/* user passphrase */
+}
+
+static int cxl_pmem_security_disable_master(struct nvdimm *nvdimm,
+					    const struct nvdimm_key_data *key_data)
+{
+	return __cxl_pmem_security_disable(nvdimm, key_data, NVDIMM_MASTER);	/* master passphrase */
+}
+
+static int cxl_pmem_security_freeze(struct nvdimm *nvdimm)
+{
+	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxl_nvd->cxlmd->cxlds);
+	/* Opcode only: neither an input nor an output payload is set up */
+	struct cxl_mbox_cmd freeze_cmd = {
+		.opcode = CXL_MBOX_OP_FREEZE_SECURITY,
+	};
+
+	return cxl_internal_send_cmd(mds, &freeze_cmd);
+}
+
+static int cxl_pmem_security_unlock(struct nvdimm *nvdimm,
+				    const struct nvdimm_key_data *key_data)
+{
+	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxl_nvd->cxlmd->cxlds);
+	u8 passphrase[NVDIMM_PASSPHRASE_LEN];
+	struct cxl_mbox_cmd unlock_cmd = {
+		.opcode = CXL_MBOX_OP_UNLOCK,
+		.size_in = sizeof(passphrase),
+		.payload_in = passphrase,
+	};
+	int rc;
+
+	/* The command is sent from a stack copy of the caller's key bytes */
+	memcpy(passphrase, key_data->data, sizeof(passphrase));
+
+	/* Negative errors pass through; any other result is reported as 0 */
+	rc = cxl_internal_send_cmd(mds, &unlock_cmd);
+	if (rc < 0)
+		return rc;
+
+	return 0;
+}
+
+static int cxl_pmem_security_passphrase_erase(struct nvdimm *nvdimm,
+					      const struct nvdimm_key_data *key,
+					      enum nvdimm_passphrase_type ptype)
+{
+	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxl_nvd->cxlmd->cxlds);
+	struct cxl_pass_erase req = {
+		.type = CXL_PMEM_SEC_PASS_USER,
+	};
+	struct cxl_mbox_cmd erase_cmd = {
+		.opcode = CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE,
+		.size_in = sizeof(req),
+		.payload_in = &req,
+	};
+	int rc;
+
+	/* Anything other than NVDIMM_MASTER selects the user passphrase */
+	if (ptype == NVDIMM_MASTER)
+		req.type = CXL_PMEM_SEC_PASS_MASTER;
+	memcpy(req.pass, key->data, NVDIMM_PASSPHRASE_LEN);
+
+	/* Negative errors pass through; any other result is reported as 0 */
+	rc = cxl_internal_send_cmd(mds, &erase_cmd);
+	if (rc < 0)
+		return rc;
+
+	return 0;
+}
+
+static const struct nvdimm_security_ops __cxl_security_ops = {
+	.get_flags = cxl_pmem_get_security_flags,
+	.change_key = cxl_pmem_security_change_key,
+	.disable = cxl_pmem_security_disable,
+	.freeze = cxl_pmem_security_freeze,
+	.unlock = cxl_pmem_security_unlock,
+	.erase = cxl_pmem_security_passphrase_erase,
+	.disable_master = cxl_pmem_security_disable_master,
+};
+
+const struct nvdimm_security_ops *cxl_security_ops = &__cxl_security_ops;
-- 
cgit v1.2.3