Diffstat
-rw-r--r--  drivers/acpi/numa/Kconfig  |  18
-rw-r--r--  drivers/acpi/numa/Makefile |   3
-rw-r--r--  drivers/acpi/numa/hmat.c   | 877
-rw-r--r--  drivers/acpi/numa/srat.c   | 575
4 files changed, 1473 insertions(+), 0 deletions(-)
diff --git a/drivers/acpi/numa/Kconfig b/drivers/acpi/numa/Kconfig new file mode 100644 index 000000000..39b1f34c2 --- /dev/null +++ b/drivers/acpi/numa/Kconfig @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +config ACPI_NUMA + bool "NUMA support" + depends on NUMA + depends on (X86 || IA64 || ARM64 || LOONGARCH) + default y if IA64 || ARM64 + +config ACPI_HMAT + bool "ACPI Heterogeneous Memory Attribute Table Support" + depends on ACPI_NUMA + select HMEM_REPORTING + select MEMREGION + help + If set, this option has the kernel parse and report the + platform's ACPI HMAT (Heterogeneous Memory Attributes Table), + register memory initiators with their targets, and export + performance attributes through the node's sysfs device if + provided. diff --git a/drivers/acpi/numa/Makefile b/drivers/acpi/numa/Makefile new file mode 100644 index 000000000..517a6c689 --- /dev/null +++ b/drivers/acpi/numa/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ACPI_NUMA) += srat.o +obj-$(CONFIG_ACPI_HMAT) += hmat.o diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c new file mode 100644 index 000000000..6cceca64a --- /dev/null +++ b/drivers/acpi/numa/hmat.c @@ -0,0 +1,877 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019, Intel Corporation. + * + * Heterogeneous Memory Attributes Table (HMAT) representation + * + * This program parses and reports the platform's HMAT tables, and registers + * the applicable attributes with the node's interfaces. + */ + +#define pr_fmt(fmt) "acpi/hmat: " fmt + +#include <linux/acpi.h> +#include <linux/bitops.h> +#include <linux/device.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/platform_device.h> +#include <linux/list_sort.h> +#include <linux/memregion.h> +#include <linux/memory.h> +#include <linux/mutex.h> +#include <linux/node.h> +#include <linux/sysfs.h> +#include <linux/dax.h> + +static u8 hmat_revision; +static int hmat_disable __initdata; + +void __init disable_hmat(void) +{ + hmat_disable = 1; +} + +static LIST_HEAD(targets); +static LIST_HEAD(initiators); +static LIST_HEAD(localities); + +static DEFINE_MUTEX(target_lock); + +/* + * The defined enum order is used to prioritize attributes to break ties when + * selecting the best performing node. 
+ */ +enum locality_types { + WRITE_LATENCY, + READ_LATENCY, + WRITE_BANDWIDTH, + READ_BANDWIDTH, +}; + +static struct memory_locality *localities_types[4]; + +struct target_cache { + struct list_head node; + struct node_cache_attrs cache_attrs; +}; + +struct memory_target { + struct list_head node; + unsigned int memory_pxm; + unsigned int processor_pxm; + struct resource memregions; + struct node_hmem_attrs hmem_attrs[2]; + struct list_head caches; + struct node_cache_attrs cache_attrs; + bool registered; +}; + +struct memory_initiator { + struct list_head node; + unsigned int processor_pxm; + bool has_cpu; +}; + +struct memory_locality { + struct list_head node; + struct acpi_hmat_locality *hmat_loc; +}; + +static struct memory_initiator *find_mem_initiator(unsigned int cpu_pxm) +{ + struct memory_initiator *initiator; + + list_for_each_entry(initiator, &initiators, node) + if (initiator->processor_pxm == cpu_pxm) + return initiator; + return NULL; +} + +static struct memory_target *find_mem_target(unsigned int mem_pxm) +{ + struct memory_target *target; + + list_for_each_entry(target, &targets, node) + if (target->memory_pxm == mem_pxm) + return target; + return NULL; +} + +static __init void alloc_memory_initiator(unsigned int cpu_pxm) +{ + struct memory_initiator *initiator; + + if (pxm_to_node(cpu_pxm) == NUMA_NO_NODE) + return; + + initiator = find_mem_initiator(cpu_pxm); + if (initiator) + return; + + initiator = kzalloc(sizeof(*initiator), GFP_KERNEL); + if (!initiator) + return; + + initiator->processor_pxm = cpu_pxm; + initiator->has_cpu = node_state(pxm_to_node(cpu_pxm), N_CPU); + list_add_tail(&initiator->node, &initiators); +} + +static __init void alloc_memory_target(unsigned int mem_pxm, + resource_size_t start, resource_size_t len) +{ + struct memory_target *target; + + target = find_mem_target(mem_pxm); + if (!target) { + target = kzalloc(sizeof(*target), GFP_KERNEL); + if (!target) + return; + target->memory_pxm = mem_pxm; + target->processor_pxm = PXM_INVAL; + target->memregions = (struct resource) { + .name = "ACPI mem", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, + }; + list_add_tail(&target->node, &targets); + INIT_LIST_HEAD(&target->caches); + } + + /* + * There are potentially multiple ranges per PXM, so record each + * in the per-target memregions resource tree. 
+ */ + if (!__request_region(&target->memregions, start, len, "memory target", + IORESOURCE_MEM)) + pr_warn("failed to reserve %#llx - %#llx in pxm: %d\n", + start, start + len, mem_pxm); +} + +static __init const char *hmat_data_type(u8 type) +{ + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + return "Access Latency"; + case ACPI_HMAT_READ_LATENCY: + return "Read Latency"; + case ACPI_HMAT_WRITE_LATENCY: + return "Write Latency"; + case ACPI_HMAT_ACCESS_BANDWIDTH: + return "Access Bandwidth"; + case ACPI_HMAT_READ_BANDWIDTH: + return "Read Bandwidth"; + case ACPI_HMAT_WRITE_BANDWIDTH: + return "Write Bandwidth"; + default: + return "Reserved"; + } +} + +static __init const char *hmat_data_type_suffix(u8 type) +{ + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + case ACPI_HMAT_READ_LATENCY: + case ACPI_HMAT_WRITE_LATENCY: + return " nsec"; + case ACPI_HMAT_ACCESS_BANDWIDTH: + case ACPI_HMAT_READ_BANDWIDTH: + case ACPI_HMAT_WRITE_BANDWIDTH: + return " MB/s"; + default: + return ""; + } +} + +static u32 hmat_normalize(u16 entry, u64 base, u8 type) +{ + u32 value; + + /* + * Check for invalid and overflow values + */ + if (entry == 0xffff || !entry) + return 0; + else if (base > (UINT_MAX / (entry))) + return 0; + + /* + * Divide by the base unit for version 1, convert latency from + * picosenonds to nanoseconds if revision 2. + */ + value = entry * base; + if (hmat_revision == 1) { + if (value < 10) + return 0; + value = DIV_ROUND_UP(value, 10); + } else if (hmat_revision == 2) { + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + case ACPI_HMAT_READ_LATENCY: + case ACPI_HMAT_WRITE_LATENCY: + value = DIV_ROUND_UP(value, 1000); + break; + default: + break; + } + } + return value; +} + +static void hmat_update_target_access(struct memory_target *target, + u8 type, u32 value, int access) +{ + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + target->hmem_attrs[access].read_latency = value; + target->hmem_attrs[access].write_latency = value; + break; + case ACPI_HMAT_READ_LATENCY: + target->hmem_attrs[access].read_latency = value; + break; + case ACPI_HMAT_WRITE_LATENCY: + target->hmem_attrs[access].write_latency = value; + break; + case ACPI_HMAT_ACCESS_BANDWIDTH: + target->hmem_attrs[access].read_bandwidth = value; + target->hmem_attrs[access].write_bandwidth = value; + break; + case ACPI_HMAT_READ_BANDWIDTH: + target->hmem_attrs[access].read_bandwidth = value; + break; + case ACPI_HMAT_WRITE_BANDWIDTH: + target->hmem_attrs[access].write_bandwidth = value; + break; + default: + break; + } +} + +static __init void hmat_add_locality(struct acpi_hmat_locality *hmat_loc) +{ + struct memory_locality *loc; + + loc = kzalloc(sizeof(*loc), GFP_KERNEL); + if (!loc) { + pr_notice_once("Failed to allocate HMAT locality\n"); + return; + } + + loc->hmat_loc = hmat_loc; + list_add_tail(&loc->node, &localities); + + switch (hmat_loc->data_type) { + case ACPI_HMAT_ACCESS_LATENCY: + localities_types[READ_LATENCY] = loc; + localities_types[WRITE_LATENCY] = loc; + break; + case ACPI_HMAT_READ_LATENCY: + localities_types[READ_LATENCY] = loc; + break; + case ACPI_HMAT_WRITE_LATENCY: + localities_types[WRITE_LATENCY] = loc; + break; + case ACPI_HMAT_ACCESS_BANDWIDTH: + localities_types[READ_BANDWIDTH] = loc; + localities_types[WRITE_BANDWIDTH] = loc; + break; + case ACPI_HMAT_READ_BANDWIDTH: + localities_types[READ_BANDWIDTH] = loc; + break; + case ACPI_HMAT_WRITE_BANDWIDTH: + localities_types[WRITE_BANDWIDTH] = loc; + break; + default: + break; + } +} + +static __init int hmat_parse_locality(union 
acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_hmat_locality *hmat_loc = (void *)header; + struct memory_target *target; + unsigned int init, targ, total_size, ipds, tpds; + u32 *inits, *targs, value; + u16 *entries; + u8 type, mem_hier; + + if (hmat_loc->header.length < sizeof(*hmat_loc)) { + pr_notice("Unexpected locality header length: %u\n", + hmat_loc->header.length); + return -EINVAL; + } + + type = hmat_loc->data_type; + mem_hier = hmat_loc->flags & ACPI_HMAT_MEMORY_HIERARCHY; + ipds = hmat_loc->number_of_initiator_Pds; + tpds = hmat_loc->number_of_target_Pds; + total_size = sizeof(*hmat_loc) + sizeof(*entries) * ipds * tpds + + sizeof(*inits) * ipds + sizeof(*targs) * tpds; + if (hmat_loc->header.length < total_size) { + pr_notice("Unexpected locality header length:%u, minimum required:%u\n", + hmat_loc->header.length, total_size); + return -EINVAL; + } + + pr_info("Locality: Flags:%02x Type:%s Initiator Domains:%u Target Domains:%u Base:%lld\n", + hmat_loc->flags, hmat_data_type(type), ipds, tpds, + hmat_loc->entry_base_unit); + + inits = (u32 *)(hmat_loc + 1); + targs = inits + ipds; + entries = (u16 *)(targs + tpds); + for (init = 0; init < ipds; init++) { + alloc_memory_initiator(inits[init]); + for (targ = 0; targ < tpds; targ++) { + value = hmat_normalize(entries[init * tpds + targ], + hmat_loc->entry_base_unit, + type); + pr_info(" Initiator-Target[%u-%u]:%u%s\n", + inits[init], targs[targ], value, + hmat_data_type_suffix(type)); + + if (mem_hier == ACPI_HMAT_MEMORY) { + target = find_mem_target(targs[targ]); + if (target && target->processor_pxm == inits[init]) { + hmat_update_target_access(target, type, value, 0); + /* If the node has a CPU, update access 1 */ + if (node_state(pxm_to_node(inits[init]), N_CPU)) + hmat_update_target_access(target, type, value, 1); + } + } + } + } + + if (mem_hier == ACPI_HMAT_MEMORY) + hmat_add_locality(hmat_loc); + + return 0; +} + +static __init int hmat_parse_cache(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_hmat_cache *cache = (void *)header; + struct memory_target *target; + struct target_cache *tcache; + u32 attrs; + + if (cache->header.length < sizeof(*cache)) { + pr_notice("Unexpected cache header length: %u\n", + cache->header.length); + return -EINVAL; + } + + attrs = cache->cache_attributes; + pr_info("Cache: Domain:%u Size:%llu Attrs:%08x SMBIOS Handles:%d\n", + cache->memory_PD, cache->cache_size, attrs, + cache->number_of_SMBIOShandles); + + target = find_mem_target(cache->memory_PD); + if (!target) + return 0; + + tcache = kzalloc(sizeof(*tcache), GFP_KERNEL); + if (!tcache) { + pr_notice_once("Failed to allocate HMAT cache info\n"); + return 0; + } + + tcache->cache_attrs.size = cache->cache_size; + tcache->cache_attrs.level = (attrs & ACPI_HMAT_CACHE_LEVEL) >> 4; + tcache->cache_attrs.line_size = (attrs & ACPI_HMAT_CACHE_LINE_SIZE) >> 16; + + switch ((attrs & ACPI_HMAT_CACHE_ASSOCIATIVITY) >> 8) { + case ACPI_HMAT_CA_DIRECT_MAPPED: + tcache->cache_attrs.indexing = NODE_CACHE_DIRECT_MAP; + break; + case ACPI_HMAT_CA_COMPLEX_CACHE_INDEXING: + tcache->cache_attrs.indexing = NODE_CACHE_INDEXED; + break; + case ACPI_HMAT_CA_NONE: + default: + tcache->cache_attrs.indexing = NODE_CACHE_OTHER; + break; + } + + switch ((attrs & ACPI_HMAT_WRITE_POLICY) >> 12) { + case ACPI_HMAT_CP_WB: + tcache->cache_attrs.write_policy = NODE_CACHE_WRITE_BACK; + break; + case ACPI_HMAT_CP_WT: + tcache->cache_attrs.write_policy = NODE_CACHE_WRITE_THROUGH; + break; + case 
ACPI_HMAT_CP_NONE: + default: + tcache->cache_attrs.write_policy = NODE_CACHE_WRITE_OTHER; + break; + } + list_add_tail(&tcache->node, &target->caches); + + return 0; +} + +static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_hmat_proximity_domain *p = (void *)header; + struct memory_target *target = NULL; + + if (p->header.length != sizeof(*p)) { + pr_notice("Unexpected address range header length: %u\n", + p->header.length); + return -EINVAL; + } + + if (hmat_revision == 1) + pr_info("Memory (%#llx length %#llx) Flags:%04x Processor Domain:%u Memory Domain:%u\n", + p->reserved3, p->reserved4, p->flags, p->processor_PD, + p->memory_PD); + else + pr_info("Memory Flags:%04x Processor Domain:%u Memory Domain:%u\n", + p->flags, p->processor_PD, p->memory_PD); + + if ((hmat_revision == 1 && p->flags & ACPI_HMAT_MEMORY_PD_VALID) || + hmat_revision > 1) { + target = find_mem_target(p->memory_PD); + if (!target) { + pr_debug("Memory Domain missing from SRAT\n"); + return -EINVAL; + } + } + if (target && p->flags & ACPI_HMAT_PROCESSOR_PD_VALID) { + int p_node = pxm_to_node(p->processor_PD); + + if (p_node == NUMA_NO_NODE) { + pr_debug("Invalid Processor Domain\n"); + return -EINVAL; + } + target->processor_pxm = p->processor_PD; + } + + return 0; +} + +static int __init hmat_parse_subtable(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_hmat_structure *hdr = (void *)header; + + if (!hdr) + return -EINVAL; + + switch (hdr->type) { + case ACPI_HMAT_TYPE_PROXIMITY: + return hmat_parse_proximity_domain(header, end); + case ACPI_HMAT_TYPE_LOCALITY: + return hmat_parse_locality(header, end); + case ACPI_HMAT_TYPE_CACHE: + return hmat_parse_cache(header, end); + default: + return -EINVAL; + } +} + +static __init int srat_parse_mem_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_mem_affinity *ma = (void *)header; + + if (!ma) + return -EINVAL; + if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) + return 0; + alloc_memory_target(ma->proximity_domain, ma->base_address, ma->length); + return 0; +} + +static u32 hmat_initiator_perf(struct memory_target *target, + struct memory_initiator *initiator, + struct acpi_hmat_locality *hmat_loc) +{ + unsigned int ipds, tpds, i, idx = 0, tdx = 0; + u32 *inits, *targs; + u16 *entries; + + ipds = hmat_loc->number_of_initiator_Pds; + tpds = hmat_loc->number_of_target_Pds; + inits = (u32 *)(hmat_loc + 1); + targs = inits + ipds; + entries = (u16 *)(targs + tpds); + + for (i = 0; i < ipds; i++) { + if (inits[i] == initiator->processor_pxm) { + idx = i; + break; + } + } + + if (i == ipds) + return 0; + + for (i = 0; i < tpds; i++) { + if (targs[i] == target->memory_pxm) { + tdx = i; + break; + } + } + if (i == tpds) + return 0; + + return hmat_normalize(entries[idx * tpds + tdx], + hmat_loc->entry_base_unit, + hmat_loc->data_type); +} + +static bool hmat_update_best(u8 type, u32 value, u32 *best) +{ + bool updated = false; + + if (!value) + return false; + + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + case ACPI_HMAT_READ_LATENCY: + case ACPI_HMAT_WRITE_LATENCY: + if (!*best || *best > value) { + *best = value; + updated = true; + } + break; + case ACPI_HMAT_ACCESS_BANDWIDTH: + case ACPI_HMAT_READ_BANDWIDTH: + case ACPI_HMAT_WRITE_BANDWIDTH: + if (!*best || *best < value) { + *best = value; + updated = true; + } + break; + } + + return updated; +} + +static int initiator_cmp(void *priv, const struct list_head *a, + const 
struct list_head *b) +{ + struct memory_initiator *ia; + struct memory_initiator *ib; + + ia = list_entry(a, struct memory_initiator, node); + ib = list_entry(b, struct memory_initiator, node); + + return ia->processor_pxm - ib->processor_pxm; +} + +static int initiators_to_nodemask(unsigned long *p_nodes) +{ + struct memory_initiator *initiator; + + if (list_empty(&initiators)) + return -ENXIO; + + list_for_each_entry(initiator, &initiators, node) + set_bit(initiator->processor_pxm, p_nodes); + + return 0; +} + +static void hmat_register_target_initiators(struct memory_target *target) +{ + static DECLARE_BITMAP(p_nodes, MAX_NUMNODES); + struct memory_initiator *initiator; + unsigned int mem_nid, cpu_nid; + struct memory_locality *loc = NULL; + u32 best = 0; + bool access0done = false; + int i; + + mem_nid = pxm_to_node(target->memory_pxm); + /* + * If the Address Range Structure provides a local processor pxm, link + * only that one. Otherwise, find the best performance attributes and + * register all initiators that match. + */ + if (target->processor_pxm != PXM_INVAL) { + cpu_nid = pxm_to_node(target->processor_pxm); + register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); + access0done = true; + if (node_state(cpu_nid, N_CPU)) { + register_memory_node_under_compute_node(mem_nid, cpu_nid, 1); + return; + } + } + + if (list_empty(&localities)) + return; + + /* + * We need the initiator list sorted so we can use bitmap_clear for + * previously set initiators when we find a better memory accessor. + * We'll also use the sorting to prime the candidate nodes with known + * initiators. + */ + bitmap_zero(p_nodes, MAX_NUMNODES); + list_sort(NULL, &initiators, initiator_cmp); + if (initiators_to_nodemask(p_nodes) < 0) + return; + + if (!access0done) { + for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) { + loc = localities_types[i]; + if (!loc) + continue; + + best = 0; + list_for_each_entry(initiator, &initiators, node) { + u32 value; + + if (!test_bit(initiator->processor_pxm, p_nodes)) + continue; + + value = hmat_initiator_perf(target, initiator, + loc->hmat_loc); + if (hmat_update_best(loc->hmat_loc->data_type, value, &best)) + bitmap_clear(p_nodes, 0, initiator->processor_pxm); + if (value != best) + clear_bit(initiator->processor_pxm, p_nodes); + } + if (best) + hmat_update_target_access(target, loc->hmat_loc->data_type, + best, 0); + } + + for_each_set_bit(i, p_nodes, MAX_NUMNODES) { + cpu_nid = pxm_to_node(i); + register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); + } + } + + /* Access 1 ignores Generic Initiators */ + bitmap_zero(p_nodes, MAX_NUMNODES); + if (initiators_to_nodemask(p_nodes) < 0) + return; + + for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) { + loc = localities_types[i]; + if (!loc) + continue; + + best = 0; + list_for_each_entry(initiator, &initiators, node) { + u32 value; + + if (!initiator->has_cpu) { + clear_bit(initiator->processor_pxm, p_nodes); + continue; + } + if (!test_bit(initiator->processor_pxm, p_nodes)) + continue; + + value = hmat_initiator_perf(target, initiator, loc->hmat_loc); + if (hmat_update_best(loc->hmat_loc->data_type, value, &best)) + bitmap_clear(p_nodes, 0, initiator->processor_pxm); + if (value != best) + clear_bit(initiator->processor_pxm, p_nodes); + } + if (best) + hmat_update_target_access(target, loc->hmat_loc->data_type, best, 1); + } + for_each_set_bit(i, p_nodes, MAX_NUMNODES) { + cpu_nid = pxm_to_node(i); + register_memory_node_under_compute_node(mem_nid, cpu_nid, 1); + } +} + +static void 
hmat_register_target_cache(struct memory_target *target) +{ + unsigned mem_nid = pxm_to_node(target->memory_pxm); + struct target_cache *tcache; + + list_for_each_entry(tcache, &target->caches, node) + node_add_cache(mem_nid, &tcache->cache_attrs); +} + +static void hmat_register_target_perf(struct memory_target *target, int access) +{ + unsigned mem_nid = pxm_to_node(target->memory_pxm); + node_set_perf_attrs(mem_nid, &target->hmem_attrs[access], access); +} + +static void hmat_register_target_devices(struct memory_target *target) +{ + struct resource *res; + + /* + * Do not bother creating devices if no driver is available to + * consume them. + */ + if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM)) + return; + + for (res = target->memregions.child; res; res = res->sibling) { + int target_nid = pxm_to_node(target->memory_pxm); + + hmem_register_device(target_nid, res); + } +} + +static void hmat_register_target(struct memory_target *target) +{ + int nid = pxm_to_node(target->memory_pxm); + + /* + * Devices may belong to either an offline or online + * node, so unconditionally add them. + */ + hmat_register_target_devices(target); + + /* + * Skip offline nodes. This can happen when memory + * marked EFI_MEMORY_SP, "specific purpose", is applied + * to all the memory in a proximity domain leading to + * the node being marked offline / unplugged, or if + * memory-only "hotplug" node is offline. + */ + if (nid == NUMA_NO_NODE || !node_online(nid)) + return; + + mutex_lock(&target_lock); + if (!target->registered) { + hmat_register_target_initiators(target); + hmat_register_target_cache(target); + hmat_register_target_perf(target, 0); + hmat_register_target_perf(target, 1); + target->registered = true; + } + mutex_unlock(&target_lock); +} + +static void hmat_register_targets(void) +{ + struct memory_target *target; + + list_for_each_entry(target, &targets, node) + hmat_register_target(target); +} + +static int hmat_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_target *target; + struct memory_notify *mnb = arg; + int pxm, nid = mnb->status_change_nid; + + if (nid == NUMA_NO_NODE || action != MEM_ONLINE) + return NOTIFY_OK; + + pxm = node_to_pxm(nid); + target = find_mem_target(pxm); + if (!target) + return NOTIFY_OK; + + hmat_register_target(target); + return NOTIFY_OK; +} + +static struct notifier_block hmat_callback_nb = { + .notifier_call = hmat_callback, + .priority = 2, +}; + +static __init void hmat_free_structures(void) +{ + struct memory_target *target, *tnext; + struct memory_locality *loc, *lnext; + struct memory_initiator *initiator, *inext; + struct target_cache *tcache, *cnext; + + list_for_each_entry_safe(target, tnext, &targets, node) { + struct resource *res, *res_next; + + list_for_each_entry_safe(tcache, cnext, &target->caches, node) { + list_del(&tcache->node); + kfree(tcache); + } + + list_del(&target->node); + res = target->memregions.child; + while (res) { + res_next = res->sibling; + __release_region(&target->memregions, res->start, + resource_size(res)); + res = res_next; + } + kfree(target); + } + + list_for_each_entry_safe(initiator, inext, &initiators, node) { + list_del(&initiator->node); + kfree(initiator); + } + + list_for_each_entry_safe(loc, lnext, &localities, node) { + list_del(&loc->node); + kfree(loc); + } +} + +static __init int hmat_init(void) +{ + struct acpi_table_header *tbl; + enum acpi_hmat_type i; + acpi_status status; + + if (srat_disabled() || hmat_disable) + return 0; + + status = acpi_get_table(ACPI_SIG_SRAT, 0, 
&tbl); + if (ACPI_FAILURE(status)) + return 0; + + if (acpi_table_parse_entries(ACPI_SIG_SRAT, + sizeof(struct acpi_table_srat), + ACPI_SRAT_TYPE_MEMORY_AFFINITY, + srat_parse_mem_affinity, 0) < 0) + goto out_put; + acpi_put_table(tbl); + + status = acpi_get_table(ACPI_SIG_HMAT, 0, &tbl); + if (ACPI_FAILURE(status)) + goto out_put; + + hmat_revision = tbl->revision; + switch (hmat_revision) { + case 1: + case 2: + break; + default: + pr_notice("Ignoring: Unknown revision:%d\n", hmat_revision); + goto out_put; + } + + for (i = ACPI_HMAT_TYPE_PROXIMITY; i < ACPI_HMAT_TYPE_RESERVED; i++) { + if (acpi_table_parse_entries(ACPI_SIG_HMAT, + sizeof(struct acpi_table_hmat), i, + hmat_parse_subtable, 0) < 0) { + pr_notice("Ignoring: Invalid table"); + goto out_put; + } + } + hmat_register_targets(); + + /* Keep the table and structures if the notifier may use them */ + if (!register_hotmemory_notifier(&hmat_callback_nb)) + return 0; +out_put: + hmat_free_structures(); + acpi_put_table(tbl); + return 0; +} +device_initcall(hmat_init); diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c new file mode 100644 index 000000000..12f330b0e --- /dev/null +++ b/drivers/acpi/numa/srat.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * acpi_numa.c - ACPI NUMA support + * + * Copyright (C) 2002 Takayoshi Kochi <t-kochi@bq.jp.nec.com> + */ + +#define pr_fmt(fmt) "ACPI: " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/acpi.h> +#include <linux/memblock.h> +#include <linux/numa.h> +#include <linux/nodemask.h> +#include <linux/topology.h> + +static nodemask_t nodes_found_map = NODE_MASK_NONE; + +/* maps to convert between proximity domain and logical node ID */ +static int pxm_to_node_map[MAX_PXM_DOMAINS] + = { [0 ... MAX_PXM_DOMAINS - 1] = NUMA_NO_NODE }; +static int node_to_pxm_map[MAX_NUMNODES] + = { [0 ... MAX_NUMNODES - 1] = PXM_INVAL }; + +unsigned char acpi_srat_revision __initdata; +static int acpi_numa __initdata; + +void __init disable_srat(void) +{ + acpi_numa = -1; +} + +int pxm_to_node(int pxm) +{ + if (pxm < 0 || pxm >= MAX_PXM_DOMAINS || numa_off) + return NUMA_NO_NODE; + return pxm_to_node_map[pxm]; +} +EXPORT_SYMBOL(pxm_to_node); + +int node_to_pxm(int node) +{ + if (node < 0) + return PXM_INVAL; + return node_to_pxm_map[node]; +} + +static void __acpi_map_pxm_to_node(int pxm, int node) +{ + if (pxm_to_node_map[pxm] == NUMA_NO_NODE || node < pxm_to_node_map[pxm]) + pxm_to_node_map[pxm] = node; + if (node_to_pxm_map[node] == PXM_INVAL || pxm < node_to_pxm_map[node]) + node_to_pxm_map[node] = pxm; +} + +int acpi_map_pxm_to_node(int pxm) +{ + int node; + + if (pxm < 0 || pxm >= MAX_PXM_DOMAINS || numa_off) + return NUMA_NO_NODE; + + node = pxm_to_node_map[pxm]; + + if (node == NUMA_NO_NODE) { + if (nodes_weight(nodes_found_map) >= MAX_NUMNODES) + return NUMA_NO_NODE; + node = first_unset_node(nodes_found_map); + __acpi_map_pxm_to_node(pxm, node); + node_set(node, nodes_found_map); + } + + return node; +} +EXPORT_SYMBOL(acpi_map_pxm_to_node); + +static void __init +acpi_table_print_srat_entry(struct acpi_subtable_header *header) +{ + switch (header->type) { + case ACPI_SRAT_TYPE_CPU_AFFINITY: + { + struct acpi_srat_cpu_affinity *p = + (struct acpi_srat_cpu_affinity *)header; + pr_debug("SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n", + p->apic_id, p->local_sapic_eid, + p->proximity_domain_lo, + (p->flags & ACPI_SRAT_CPU_ENABLED) ? 
+ "enabled" : "disabled"); + } + break; + + case ACPI_SRAT_TYPE_MEMORY_AFFINITY: + { + struct acpi_srat_mem_affinity *p = + (struct acpi_srat_mem_affinity *)header; + pr_debug("SRAT Memory (0x%llx length 0x%llx) in proximity domain %d %s%s%s\n", + (unsigned long long)p->base_address, + (unsigned long long)p->length, + p->proximity_domain, + (p->flags & ACPI_SRAT_MEM_ENABLED) ? + "enabled" : "disabled", + (p->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? + " hot-pluggable" : "", + (p->flags & ACPI_SRAT_MEM_NON_VOLATILE) ? + " non-volatile" : ""); + } + break; + + case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: + { + struct acpi_srat_x2apic_cpu_affinity *p = + (struct acpi_srat_x2apic_cpu_affinity *)header; + pr_debug("SRAT Processor (x2apicid[0x%08x]) in proximity domain %d %s\n", + p->apic_id, + p->proximity_domain, + (p->flags & ACPI_SRAT_CPU_ENABLED) ? + "enabled" : "disabled"); + } + break; + + case ACPI_SRAT_TYPE_GICC_AFFINITY: + { + struct acpi_srat_gicc_affinity *p = + (struct acpi_srat_gicc_affinity *)header; + pr_debug("SRAT Processor (acpi id[0x%04x]) in proximity domain %d %s\n", + p->acpi_processor_uid, + p->proximity_domain, + (p->flags & ACPI_SRAT_GICC_ENABLED) ? + "enabled" : "disabled"); + } + break; + + case ACPI_SRAT_TYPE_GENERIC_AFFINITY: + { + struct acpi_srat_generic_affinity *p = + (struct acpi_srat_generic_affinity *)header; + + if (p->device_handle_type == 0) { + /* + * For pci devices this may be the only place they + * are assigned a proximity domain + */ + pr_debug("SRAT Generic Initiator(Seg:%u BDF:%u) in proximity domain %d %s\n", + *(u16 *)(&p->device_handle[0]), + *(u16 *)(&p->device_handle[2]), + p->proximity_domain, + (p->flags & ACPI_SRAT_GENERIC_AFFINITY_ENABLED) ? + "enabled" : "disabled"); + } else { + /* + * In this case we can rely on the device having a + * proximity domain reference + */ + pr_debug("SRAT Generic Initiator(HID=%.8s UID=%.4s) in proximity domain %d %s\n", + (char *)(&p->device_handle[0]), + (char *)(&p->device_handle[8]), + p->proximity_domain, + (p->flags & ACPI_SRAT_GENERIC_AFFINITY_ENABLED) ? + "enabled" : "disabled"); + } + } + break; + default: + pr_warn("Found unsupported SRAT entry (type = 0x%x)\n", + header->type); + break; + } +} + +/* + * A lot of BIOS fill in 10 (= no distance) everywhere. This messes + * up the NUMA heuristics which wants the local node to have a smaller + * distance than the others. + * Do some quick checks here and only use the SLIT if it passes. + */ +static int __init slit_valid(struct acpi_table_slit *slit) +{ + int i, j; + int d = slit->locality_count; + for (i = 0; i < d; i++) { + for (j = 0; j < d; j++) { + u8 val = slit->entry[d*i + j]; + if (i == j) { + if (val != LOCAL_DISTANCE) + return 0; + } else if (val <= LOCAL_DISTANCE) + return 0; + } + } + return 1; +} + +void __init bad_srat(void) +{ + pr_err("SRAT: SRAT not used.\n"); + disable_srat(); +} + +int __init srat_disabled(void) +{ + return acpi_numa < 0; +} + +#if defined(CONFIG_X86) || defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH) +/* + * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for + * I/O localities since SRAT does not list them. I/O localities are + * not supported at this point. 
+ */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + int i, j; + + for (i = 0; i < slit->locality_count; i++) { + const int from_node = pxm_to_node(i); + + if (from_node == NUMA_NO_NODE) + continue; + + for (j = 0; j < slit->locality_count; j++) { + const int to_node = pxm_to_node(j); + + if (to_node == NUMA_NO_NODE) + continue; + + numa_set_distance(from_node, to_node, + slit->entry[slit->locality_count * i + j]); + } + } +} + +/* + * Default callback for parsing of the Proximity Domain <-> Memory + * Area mappings + */ +int __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) +{ + u64 start, end; + u32 hotpluggable; + int node, pxm; + + if (srat_disabled()) + goto out_err; + if (ma->header.length < sizeof(struct acpi_srat_mem_affinity)) { + pr_err("SRAT: Unexpected header length: %d\n", + ma->header.length); + goto out_err_bad_srat; + } + if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) + goto out_err; + hotpluggable = IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && + (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE); + + start = ma->base_address; + end = start + ma->length; + pxm = ma->proximity_domain; + if (acpi_srat_revision <= 1) + pxm &= 0xff; + + node = acpi_map_pxm_to_node(pxm); + if (node == NUMA_NO_NODE) { + pr_err("SRAT: Too many proximity domains.\n"); + goto out_err_bad_srat; + } + + if (numa_add_memblk(node, start, end) < 0) { + pr_err("SRAT: Failed to add memblk to node %u [mem %#010Lx-%#010Lx]\n", + node, (unsigned long long) start, + (unsigned long long) end - 1); + goto out_err_bad_srat; + } + + node_set(node, numa_nodes_parsed); + + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n", + node, pxm, + (unsigned long long) start, (unsigned long long) end - 1, + hotpluggable ? " hotplug" : "", + ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : ""); + + /* Mark hotplug range in memblock. */ + if (hotpluggable && memblock_mark_hotplug(start, ma->length)) + pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", + (unsigned long long)start, (unsigned long long)end - 1); + + max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1)); + + return 0; +out_err_bad_srat: + bad_srat(); +out_err: + return -EINVAL; +} + +static int __init acpi_parse_cfmws(union acpi_subtable_headers *header, + void *arg, const unsigned long table_end) +{ + struct acpi_cedt_cfmws *cfmws; + int *fake_pxm = arg; + u64 start, end; + int node; + + cfmws = (struct acpi_cedt_cfmws *)header; + start = cfmws->base_hpa; + end = cfmws->base_hpa + cfmws->window_size; + + /* + * The SRAT may have already described NUMA details for all, + * or a portion of, this CFMWS HPA range. Extend the memblks + * found for any portion of the window to cover the entire + * window. + */ + if (!numa_fill_memblks(start, end)) + return 0; + + /* No SRAT description. Create a new node. 
*/ + node = acpi_map_pxm_to_node(*fake_pxm); + + if (node == NUMA_NO_NODE) { + pr_err("ACPI NUMA: Too many proximity domains while processing CFMWS.\n"); + return -EINVAL; + } + + if (numa_add_memblk(node, start, end) < 0) { + /* CXL driver must handle the NUMA_NO_NODE case */ + pr_warn("ACPI NUMA: Failed to add memblk for CFMWS node %d [mem %#llx-%#llx]\n", + node, start, end); + } + node_set(node, numa_nodes_parsed); + + /* Set the next available fake_pxm value */ + (*fake_pxm)++; + return 0; +} +#else +static int __init acpi_parse_cfmws(union acpi_subtable_headers *header, + void *arg, const unsigned long table_end) +{ + return 0; +} +#endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ + +static int __init acpi_parse_slit(struct acpi_table_header *table) +{ + struct acpi_table_slit *slit = (struct acpi_table_slit *)table; + + if (!slit_valid(slit)) { + pr_info("SLIT table looks invalid. Not used.\n"); + return -EINVAL; + } + acpi_numa_slit_init(slit); + + return 0; +} + +void __init __weak +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + pr_warn("Found unsupported x2apic [0x%08x] SRAT entry\n", pa->apic_id); +} + +static int __init +acpi_parse_x2apic_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_x2apic_cpu_affinity *processor_affinity; + + processor_affinity = (struct acpi_srat_x2apic_cpu_affinity *)header; + + acpi_table_print_srat_entry(&header->common); + + /* let architecture-dependent part to do it */ + acpi_numa_x2apic_affinity_init(processor_affinity); + + return 0; +} + +static int __init +acpi_parse_processor_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_cpu_affinity *processor_affinity; + + processor_affinity = (struct acpi_srat_cpu_affinity *)header; + + acpi_table_print_srat_entry(&header->common); + + /* let architecture-dependent part to do it */ + acpi_numa_processor_affinity_init(processor_affinity); + + return 0; +} + +static int __init +acpi_parse_gicc_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_gicc_affinity *processor_affinity; + + processor_affinity = (struct acpi_srat_gicc_affinity *)header; + + acpi_table_print_srat_entry(&header->common); + + /* let architecture-dependent part to do it */ + acpi_numa_gicc_affinity_init(processor_affinity); + + return 0; +} + +#if defined(CONFIG_X86) || defined(CONFIG_ARM64) +static int __init +acpi_parse_gi_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_generic_affinity *gi_affinity; + int node; + + gi_affinity = (struct acpi_srat_generic_affinity *)header; + if (!gi_affinity) + return -EINVAL; + acpi_table_print_srat_entry(&header->common); + + if (!(gi_affinity->flags & ACPI_SRAT_GENERIC_AFFINITY_ENABLED)) + return -EINVAL; + + node = acpi_map_pxm_to_node(gi_affinity->proximity_domain); + if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { + pr_err("SRAT: Too many proximity domains.\n"); + return -EINVAL; + } + node_set(node, numa_nodes_parsed); + node_set_state(node, N_GENERIC_INITIATOR); + + return 0; +} +#else +static int __init +acpi_parse_gi_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + return 0; +} +#endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ + +static int __initdata parsed_numa_memblks; + +static int __init +acpi_parse_memory_affinity(union acpi_subtable_headers * header, + const unsigned long end) +{ + struct acpi_srat_mem_affinity 
*memory_affinity; + + memory_affinity = (struct acpi_srat_mem_affinity *)header; + + acpi_table_print_srat_entry(&header->common); + + /* let architecture-dependent part to do it */ + if (!acpi_numa_memory_affinity_init(memory_affinity)) + parsed_numa_memblks++; + return 0; +} + +static int __init acpi_parse_srat(struct acpi_table_header *table) +{ + struct acpi_table_srat *srat = (struct acpi_table_srat *)table; + + acpi_srat_revision = srat->header.revision; + + /* Real work done in acpi_table_parse_srat below. */ + + return 0; +} + +static int __init +acpi_table_parse_srat(enum acpi_srat_type id, + acpi_tbl_entry_handler handler, unsigned int max_entries) +{ + return acpi_table_parse_entries(ACPI_SIG_SRAT, + sizeof(struct acpi_table_srat), id, + handler, max_entries); +} + +int __init acpi_numa_init(void) +{ + int i, fake_pxm, cnt = 0; + + if (acpi_disabled) + return -EINVAL; + + /* + * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= + * SRAT cpu entries could have different order with that in MADT. + * So go over all cpu entries in SRAT to get apicid to node mapping. + */ + + /* SRAT: System Resource Affinity Table */ + if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { + struct acpi_subtable_proc srat_proc[4]; + + memset(srat_proc, 0, sizeof(srat_proc)); + srat_proc[0].id = ACPI_SRAT_TYPE_CPU_AFFINITY; + srat_proc[0].handler = acpi_parse_processor_affinity; + srat_proc[1].id = ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY; + srat_proc[1].handler = acpi_parse_x2apic_affinity; + srat_proc[2].id = ACPI_SRAT_TYPE_GICC_AFFINITY; + srat_proc[2].handler = acpi_parse_gicc_affinity; + srat_proc[3].id = ACPI_SRAT_TYPE_GENERIC_AFFINITY; + srat_proc[3].handler = acpi_parse_gi_affinity; + + acpi_table_parse_entries_array(ACPI_SIG_SRAT, + sizeof(struct acpi_table_srat), + srat_proc, ARRAY_SIZE(srat_proc), 0); + + cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, + acpi_parse_memory_affinity, 0); + } + + /* SLIT: System Locality Information Table */ + acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); + + /* + * CXL Fixed Memory Window Structures (CFMWS) must be parsed + * after the SRAT. Create NUMA Nodes for CXL memory ranges that + * are defined in the CFMWS and not already defined in the SRAT. + * Initialize a fake_pxm as the first available PXM to emulate. + */ + + /* fake_pxm is the next unused PXM value after SRAT parsing */ + for (i = 0, fake_pxm = -1; i < MAX_NUMNODES - 1; i++) { + if (node_to_pxm_map[i] > fake_pxm) + fake_pxm = node_to_pxm_map[i]; + } + fake_pxm++; + acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, acpi_parse_cfmws, + &fake_pxm); + + if (cnt < 0) + return cnt; + else if (!parsed_numa_memblks) + return -ENOENT; + return 0; +} + +static int acpi_get_pxm(acpi_handle h) +{ + unsigned long long pxm; + acpi_status status; + acpi_handle handle; + acpi_handle phandle = h; + + do { + handle = phandle; + status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); + if (ACPI_SUCCESS(status)) + return pxm; + status = acpi_get_parent(handle, &phandle); + } while (ACPI_SUCCESS(status)); + return -1; +} + +int acpi_get_node(acpi_handle handle) +{ + int pxm; + + pxm = acpi_get_pxm(handle); + + return pxm_to_node(pxm); +} +EXPORT_SYMBOL(acpi_get_node); |
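
For reference, the conversion performed by hmat_normalize() in hmat.c above works as follows: multiply the raw table entry by the locality's entry base unit, then divide by 10 for revision 1 tables, or by 1000 (picoseconds to nanoseconds) for latency entries in revision 2 tables; entries of 0 or 0xffff mean "no data". Below is a minimal userspace sketch of that arithmetic, assuming nothing beyond the logic shown in the patch; the helper name normalize_latency() and the sample values are illustrative only, not part of the kernel code.

/*
 * Standalone illustration of the latency normalization done by
 * hmat_normalize(); not part of the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static uint32_t normalize_latency(uint16_t entry, uint64_t base, int revision)
{
	uint64_t value;

	/* 0xffff and 0 mean "no data"; also reject values that would overflow u32 */
	if (entry == 0xffff || entry == 0 || base > UINT32_MAX / entry)
		return 0;

	value = (uint64_t)entry * base;
	if (revision == 1) {
		if (value < 10)
			return 0;
		value = DIV_ROUND_UP(value, 10);
	} else if (revision == 2) {
		/* revision 2 encodes latency in picoseconds */
		value = DIV_ROUND_UP(value, 1000);
	}
	return (uint32_t)value;
}

int main(void)
{
	/* e.g. entry 150 with a base unit of 100 under revision 2 -> 15 nsec */
	printf("%u nsec\n", normalize_latency(150, 100, 2));
	return 0;
}

Built with any C compiler, the example prints "15 nsec", matching what the kernel would report for that Initiator-Target pair in its "Locality:" log lines.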